Page MenuHomeFreeBSD

No OneTemporary

This file is larger than 256 KB, so syntax highlighting was skipped.
Index: vendor/llvm/dist-release_90/include/llvm/Analysis/AliasAnalysis.h
===================================================================
--- vendor/llvm/dist-release_90/include/llvm/Analysis/AliasAnalysis.h (revision 351302)
+++ vendor/llvm/dist-release_90/include/llvm/Analysis/AliasAnalysis.h (revision 351303)
@@ -1,1223 +1,1223 @@
//===- llvm/Analysis/AliasAnalysis.h - Alias Analysis Interface -*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the generic AliasAnalysis interface, which is used as the
// common interface used by all clients of alias analysis information, and
// implemented by all alias analysis implementations. Mod/Ref information is
// also captured by this interface.
//
// Implementations of this interface must implement the various virtual methods,
// which automatically provides functionality for the entire suite of client
// APIs.
//
// This API identifies memory regions with the MemoryLocation class. The pointer
// component specifies the base memory address of the region. The Size specifies
// the maximum size (in address units) of the memory region, or
// MemoryLocation::UnknownSize if the size is not known. The TBAA tag
// identifies the "type" of the memory reference; see the
// TypeBasedAliasAnalysis class for details.
//
// Some non-obvious details include:
// - Pointers that point to two completely different objects in memory never
// alias, regardless of the value of the Size component.
// - NoAlias doesn't imply inequal pointers. The most obvious example of this
// is two pointers to constant memory. Even if they are equal, constant
// memory is never stored to, so there will never be any dependencies.
// In this and other situations, the pointers may be both NoAlias and
// MustAlias at the same time. The current API can only return one result,
// though this is rarely a problem in practice.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_ANALYSIS_ALIASANALYSIS_H
#define LLVM_ANALYSIS_ALIASANALYSIS_H
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
#include <cstdint>
#include <functional>
#include <memory>
#include <vector>
namespace llvm {
class AnalysisUsage;
class BasicAAResult;
class BasicBlock;
class DominatorTree;
class OrderedBasicBlock;
class Value;
/// The possible results of an alias query.
///
/// These results are always computed between two MemoryLocation objects as
/// a query to some alias analysis.
///
/// Note that these are unscoped enumerations because we would like to support
/// implicitly testing a result for the existence of any possible aliasing with
/// a conversion to bool, but an "enum class" doesn't support this. The
/// canonical names from the literature are suffixed and unique anyways, and so
/// they serve as global constants in LLVM for these results.
///
/// See docs/AliasAnalysis.html for more information on the specific meanings
/// of these values.
enum AliasResult : uint8_t {
/// The two locations do not alias at all.
///
/// This value is arranged to convert to false, while all other values
/// convert to true. This allows a boolean context to convert the result to
/// a binary flag indicating whether there is the possibility of aliasing.
NoAlias = 0,
/// The two locations may or may not alias. This is the least precise result.
/// Implicitly numbered 1, so it converts to true.
MayAlias,
/// The two locations alias, but only due to a partial overlap.
/// Implicitly numbered 2, so it converts to true.
PartialAlias,
/// The two locations precisely alias each other.
/// Implicitly numbered 3, so it converts to true.
MustAlias,
};
/// << operator for AliasResult.
raw_ostream &operator<<(raw_ostream &OS, AliasResult AR);
/// Flags indicating whether a memory access modifies or references memory.
///
/// This is no access at all, a modification, a reference, or both
/// a modification and a reference. These are specifically structured such that
/// they form a three bit matrix and bit-tests for 'mod' or 'ref' or 'must'
/// work with any of the possible values.
enum class ModRefInfo : uint8_t {
// Bit layout used by the enumerators below:
//   bit 0 (value 1): the access may reference (Ref).
//   bit 1 (value 2): the access may modify (Mod).
//   bit 2 (value 4): Must information is *absent* when this bit is 1.
/// Must is provided for completeness, but no routines will return only
/// Must today. See definition of Must below.
Must = 0, // 0b000
/// The access may reference the value stored in memory,
/// a mustAlias relation was found, and no mayAlias or partialAlias found.
MustRef = 1, // 0b001
/// The access may modify the value stored in memory,
/// a mustAlias relation was found, and no mayAlias or partialAlias found.
MustMod = 2, // 0b010
/// The access may reference, modify or both the value stored in memory,
/// a mustAlias relation was found, and no mayAlias or partialAlias found.
MustModRef = MustRef | MustMod, // 0b011
/// The access neither references nor modifies the value stored in memory.
NoModRef = 4, // 0b100
/// The access may reference the value stored in memory.
Ref = NoModRef | MustRef, // 0b101
/// The access may modify the value stored in memory.
Mod = NoModRef | MustMod, // 0b110
/// The access may reference and may modify the value stored in memory.
ModRef = Ref | Mod, // 0b111
/// About Must:
/// Must is set in a best effort manner.
/// We usually do not try our best to infer Must, instead it is merely
/// another piece of "free" information that is presented when available.
/// Must set means there was certainly a MustAlias found. For calls,
/// where multiple arguments are checked (argmemonly), this translates to
/// only MustAlias or NoAlias was found.
/// Must is not set for RAR accesses, even if the two locations must
/// alias. The reason is that two read accesses translate to an early return
/// of NoModRef. An additional alias check to set Must may be
/// expensive. Other cases may also not set Must(e.g. callCapturesBefore).
/// We refer to Must being *set* when the most significant bit is *cleared*.
/// Conversely we *clear* Must information by *setting* the Must bit to 1.
};
/// Returns true when neither the Mod nor the Ref bit is set in \p MRI.
/// The Must bit is ignored.
LLVM_NODISCARD inline bool isNoModRef(const ModRefInfo MRI) {
  const int ModRefMask = static_cast<int>(ModRefInfo::MustModRef);
  const int AccessBits = static_cast<int>(MRI) & ModRefMask;
  return AccessBits == static_cast<int>(ModRefInfo::Must);
}
/// Returns true when at least one of the Mod and Ref bits is set in \p MRI.
LLVM_NODISCARD inline bool isModOrRefSet(const ModRefInfo MRI) {
  const int ModRefMask = static_cast<int>(ModRefInfo::MustModRef);
  return (static_cast<int>(MRI) & ModRefMask) != 0;
}
/// Returns true when both the Mod and the Ref bits are set in \p MRI.
LLVM_NODISCARD inline bool isModAndRefSet(const ModRefInfo MRI) {
  const int ModRefMask = static_cast<int>(ModRefInfo::MustModRef);
  const int AccessBits = static_cast<int>(MRI) & ModRefMask;
  return AccessBits == ModRefMask;
}
/// Returns true when the Mod bit is set in \p MRI.
LLVM_NODISCARD inline bool isModSet(const ModRefInfo MRI) {
  const int ModBit = static_cast<int>(ModRefInfo::MustMod);
  return (static_cast<int>(MRI) & ModBit) != 0;
}
/// Returns true when the Ref bit is set in \p MRI.
LLVM_NODISCARD inline bool isRefSet(const ModRefInfo MRI) {
  const int RefBit = static_cast<int>(ModRefInfo::MustRef);
  return (static_cast<int>(MRI) & RefBit) != 0;
}
/// Returns true when Must information is present in \p MRI, i.e. when the
/// NoModRef bit (value 4) is cleared.
LLVM_NODISCARD inline bool isMustSet(const ModRefInfo MRI) {
  const int NoMustBit = static_cast<int>(ModRefInfo::NoModRef);
  return (static_cast<int>(MRI) & NoMustBit) == 0;
}
/// Returns \p MRI with the Mod bit additionally set.
LLVM_NODISCARD inline ModRefInfo setMod(const ModRefInfo MRI) {
  const int ModBit = static_cast<int>(ModRefInfo::MustMod);
  return static_cast<ModRefInfo>(static_cast<int>(MRI) | ModBit);
}
/// Returns \p MRI with the Ref bit additionally set.
LLVM_NODISCARD inline ModRefInfo setRef(const ModRefInfo MRI) {
  const int RefBit = static_cast<int>(ModRefInfo::MustRef);
  return static_cast<ModRefInfo>(static_cast<int>(MRI) | RefBit);
}
/// Returns \p MRI with Must information set, which is done by keeping only
/// the Mod and Ref bits (and thereby clearing the NoModRef bit).
LLVM_NODISCARD inline ModRefInfo setMust(const ModRefInfo MRI) {
  const int ModRefMask = static_cast<int>(ModRefInfo::MustModRef);
  return static_cast<ModRefInfo>(static_cast<int>(MRI) & ModRefMask);
}
/// Returns \p MRI with both the Mod and the Ref bits additionally set.
LLVM_NODISCARD inline ModRefInfo setModAndRef(const ModRefInfo MRI) {
  const int ModRefMask = static_cast<int>(ModRefInfo::MustModRef);
  return static_cast<ModRefInfo>(static_cast<int>(MRI) | ModRefMask);
}
/// Returns \p MRI with the Mod bit cleared. The Ref bit and the Must bit are
/// kept as they were.
LLVM_NODISCARD inline ModRefInfo clearMod(const ModRefInfo MRI) {
  // ModRefInfo::Ref has every bit set except the Mod bit.
  const int KeepMask = static_cast<int>(ModRefInfo::Ref);
  return static_cast<ModRefInfo>(static_cast<int>(MRI) & KeepMask);
}
/// Returns \p MRI with the Ref bit cleared. The Mod bit and the Must bit are
/// kept as they were.
LLVM_NODISCARD inline ModRefInfo clearRef(const ModRefInfo MRI) {
  // ModRefInfo::Mod has every bit set except the Ref bit.
  const int KeepMask = static_cast<int>(ModRefInfo::Mod);
  return static_cast<ModRefInfo>(static_cast<int>(MRI) & KeepMask);
}
/// Returns \p MRI with Must information cleared, which is done by setting
/// the NoModRef bit.
LLVM_NODISCARD inline ModRefInfo clearMust(const ModRefInfo MRI) {
  const int NoMustBit = static_cast<int>(ModRefInfo::NoModRef);
  return static_cast<ModRefInfo>(static_cast<int>(MRI) | NoMustBit);
}
/// Returns the bitwise OR of \p MRI1 and \p MRI2.
LLVM_NODISCARD inline ModRefInfo unionModRef(const ModRefInfo MRI1,
                                             const ModRefInfo MRI2) {
  const int Lhs = static_cast<int>(MRI1);
  const int Rhs = static_cast<int>(MRI2);
  return static_cast<ModRefInfo>(Lhs | Rhs);
}
/// Returns the bitwise AND of \p MRI1 and \p MRI2.
LLVM_NODISCARD inline ModRefInfo intersectModRef(const ModRefInfo MRI1,
                                                 const ModRefInfo MRI2) {
  const int Lhs = static_cast<int>(MRI1);
  const int Rhs = static_cast<int>(MRI2);
  return static_cast<ModRefInfo>(Lhs & Rhs);
}
/// The locations at which a function might access memory.
///
/// These are primarily used in conjunction with the \c AccessKind bits to
/// describe both the nature of access and the locations of access for a
/// function call.
enum FunctionModRefLocation {
// The location flags start at 8, so they do not overlap the three low bits
// (values 1, 2 and 4) that ModRefInfo occupies; FunctionModRefBehavior ORs
// both kinds of bits into a single value.
/// Base case is no access to memory.
FMRL_Nowhere = 0,
/// Access to memory via argument pointers.
FMRL_ArgumentPointees = 8,
/// Memory that is inaccessible via LLVM IR.
FMRL_InaccessibleMem = 16,
/// Access to any memory.
/// This is a superset of the two location flags above plus one extra bit (32).
FMRL_Anywhere = 32 | FMRL_InaccessibleMem | FMRL_ArgumentPointees
};
/// Summary of how a function affects memory in the program.
///
/// Loads from constant globals are not considered memory accesses for this
/// interface. Also, functions may freely modify stack space local to their
/// invocation without having to report it through these interfaces.
enum FunctionModRefBehavior {
// Every enumerator is a FunctionModRefLocation mask ORed with a ModRefInfo
// value converted to int.
/// This function does not perform any non-local loads or stores to memory.
///
/// This property corresponds to the GCC 'const' attribute.
/// This property corresponds to the LLVM IR 'readnone' attribute.
/// This property corresponds to the IntrNoMem LLVM intrinsic flag.
FMRB_DoesNotAccessMemory =
FMRL_Nowhere | static_cast<int>(ModRefInfo::NoModRef),
/// The only memory references in this function (if it has any) are
/// non-volatile loads from objects pointed to by its pointer-typed
/// arguments, with arbitrary offsets.
///
/// This property corresponds to the IntrReadArgMem LLVM intrinsic flag.
FMRB_OnlyReadsArgumentPointees =
FMRL_ArgumentPointees | static_cast<int>(ModRefInfo::Ref),
/// The only memory references in this function (if it has any) are
/// non-volatile loads and stores from objects pointed to by its
/// pointer-typed arguments, with arbitrary offsets.
///
/// This property corresponds to the IntrArgMemOnly LLVM intrinsic flag.
FMRB_OnlyAccessesArgumentPointees =
FMRL_ArgumentPointees | static_cast<int>(ModRefInfo::ModRef),
/// The only memory references in this function (if it has any) are
/// references of memory that is otherwise inaccessible via LLVM IR.
///
/// This property corresponds to the LLVM IR inaccessiblememonly attribute.
FMRB_OnlyAccessesInaccessibleMem =
FMRL_InaccessibleMem | static_cast<int>(ModRefInfo::ModRef),
/// The function may perform non-volatile loads and stores of objects
/// pointed to by its pointer-typed arguments, with arbitrary offsets, and
/// it may also perform loads and stores of memory that is otherwise
/// inaccessible via LLVM IR.
///
/// This property corresponds to the LLVM IR
/// inaccessiblemem_or_argmemonly attribute.
FMRB_OnlyAccessesInaccessibleOrArgMem = FMRL_InaccessibleMem |
FMRL_ArgumentPointees |
static_cast<int>(ModRefInfo::ModRef),
/// This function does not perform any non-local stores or volatile loads,
/// but may read from any memory location.
///
/// This property corresponds to the GCC 'pure' attribute.
/// This property corresponds to the LLVM IR 'readonly' attribute.
/// This property corresponds to the IntrReadMem LLVM intrinsic flag.
FMRB_OnlyReadsMemory = FMRL_Anywhere | static_cast<int>(ModRefInfo::Ref),
/// This function does not read from memory anywhere, but may write to any
/// memory location.
///
/// This property corresponds to the LLVM IR 'writeonly' attribute.
/// This property corresponds to the IntrWriteMem LLVM intrinsic flag.
FMRB_DoesNotReadMemory = FMRL_Anywhere | static_cast<int>(ModRefInfo::Mod),
/// This indicates that the function could not be classified into one of the
/// behaviors above.
FMRB_UnknownModRefBehavior =
FMRL_Anywhere | static_cast<int>(ModRefInfo::ModRef)
};
// Wrapper method strips bits significant only in FunctionModRefBehavior,
// to obtain a valid ModRefInfo. The benefit of using the wrapper is that if
// ModRefInfo enum changes, the wrapper can be updated to & with the new enum
// entry with all bits set to 1.
LLVM_NODISCARD inline ModRefInfo
createModRefInfo(const FunctionModRefBehavior FMRB) {
  // ModRefInfo::ModRef has all ModRefInfo bits set, so masking with it drops
  // the FunctionModRefLocation bits and keeps only the mod/ref part.
  const int AllModRefBits = static_cast<int>(ModRefInfo::ModRef);
  const int ModRefPart = static_cast<int>(FMRB) & AllModRefBits;
  return static_cast<ModRefInfo>(ModRefPart);
}
/// This class stores info we want to provide to or retain within an alias
/// query. By default, the root query is stateless and starts with a freshly
/// constructed info object. Specific alias analyses can use this query info to
/// store per-query state that is important for recursive or nested queries to
/// avoid recomputing. To enable preserving this state across multiple queries
/// where safe (due to the IR not changing), use a `BatchAAResults` wrapper.
/// The information stored in an `AAQueryInfo` is currently limited to the
/// caches used by BasicAA, but can be extended further to fit other AA needs.
class AAQueryInfo {
public:
  /// A pair of memory locations, used as the key of the alias cache.
  using LocPair = std::pair<MemoryLocation, MemoryLocation>;
  /// Maps a pair of locations to an alias result.
  using AliasCacheT = SmallDenseMap<LocPair, AliasResult, 8>;
  /// Maps a value to a boolean "is captured" flag.
  using IsCapturedCacheT = SmallDenseMap<const Value *, bool, 8>;

  // Both caches start out empty.
  AAQueryInfo() : AliasCache(), IsCapturedCache() {}

  AliasCacheT AliasCache;
  IsCapturedCacheT IsCapturedCache;
};
class BatchAAResults;
class AAResults {
public:
  // Make these results constructible from a TargetLibraryInfo and movable. We
  // have to spell these out because MSVC won't synthesize them.
  AAResults(const TargetLibraryInfo &TLI) : TLI(TLI) {}
  AAResults(AAResults &&Arg);
  ~AAResults();

  /// Register a specific AA result.
  template <typename AAResultT> void addAAResult(AAResultT &AAResult) {
    // FIXME: We should use a much lighter weight system than the usual
    // polymorphic pattern because we don't own AAResult. It should
    // ideally involve two pointers and no separate allocation.
    AAs.emplace_back(new Model<AAResultT>(AAResult, *this));
  }

  /// Register a function analysis ID that the results aggregation depends on.
  ///
  /// This is used in the new pass manager to implement the invalidation logic
  /// where we must invalidate the results aggregation if any of our component
  /// analyses become invalid.
  void addAADependencyID(AnalysisKey *ID) { AADeps.push_back(ID); }

  /// Handle invalidation events in the new pass manager.
  ///
  /// The aggregation is invalidated if any of the underlying analyses is
  /// invalidated.
  bool invalidate(Function &F, const PreservedAnalyses &PA,
                  FunctionAnalysisManager::Invalidator &Inv);

  //===--------------------------------------------------------------------===//
  /// \name Alias Queries
  /// @{

  /// The main low level interface to the alias analysis implementation.
  /// Returns an AliasResult indicating whether the two pointers are aliased to
  /// each other. This is the interface that must be implemented by specific
  /// alias analysis implementations.
  AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB);

  /// A convenience wrapper around the primary \c alias interface.
  AliasResult alias(const Value *V1, LocationSize V1Size, const Value *V2,
                    LocationSize V2Size) {
    return alias(MemoryLocation(V1, V1Size), MemoryLocation(V2, V2Size));
  }

  /// A convenience wrapper around the primary \c alias interface.
  AliasResult alias(const Value *V1, const Value *V2) {
    return alias(V1, LocationSize::unknown(), V2, LocationSize::unknown());
  }

  /// A trivial helper function to check to see if the specified pointers are
  /// no-alias.
  bool isNoAlias(const MemoryLocation &LocA, const MemoryLocation &LocB) {
    return alias(LocA, LocB) == NoAlias;
  }

  /// A convenience wrapper around the \c isNoAlias helper interface.
  bool isNoAlias(const Value *V1, LocationSize V1Size, const Value *V2,
                 LocationSize V2Size) {
    return isNoAlias(MemoryLocation(V1, V1Size), MemoryLocation(V2, V2Size));
  }

  /// A convenience wrapper around the \c isNoAlias helper interface.
  bool isNoAlias(const Value *V1, const Value *V2) {
    return isNoAlias(MemoryLocation(V1), MemoryLocation(V2));
  }

  /// A trivial helper function to check to see if the specified pointers are
  /// must-alias.
  bool isMustAlias(const MemoryLocation &LocA, const MemoryLocation &LocB) {
    return alias(LocA, LocB) == MustAlias;
  }

  /// A convenience wrapper around the \c isMustAlias helper interface.
  /// Both locations are queried with a precise size of 1.
  bool isMustAlias(const Value *V1, const Value *V2) {
    return alias(V1, LocationSize::precise(1), V2, LocationSize::precise(1)) ==
           MustAlias;
  }

  /// Checks whether the given location points to constant memory, or if
  /// \p OrLocal is true whether it points to a local alloca.
  bool pointsToConstantMemory(const MemoryLocation &Loc, bool OrLocal = false);

  /// A convenience wrapper around the primary \c pointsToConstantMemory
  /// interface.
  bool pointsToConstantMemory(const Value *P, bool OrLocal = false) {
    return pointsToConstantMemory(MemoryLocation(P), OrLocal);
  }

  /// @}
  //===--------------------------------------------------------------------===//
  /// \name Simple mod/ref information
  /// @{

  /// Get the ModRef info associated with a pointer argument of a call. The
  /// result's bits are set to indicate the allowed aliasing ModRef kinds. Note
  /// that these bits do not necessarily account for the overall behavior of
  /// the function, but rather only provide additional per-argument
  /// information. This never sets ModRefInfo::Must.
  ModRefInfo getArgModRefInfo(const CallBase *Call, unsigned ArgIdx);

  /// Return the behavior of the given call site.
  FunctionModRefBehavior getModRefBehavior(const CallBase *Call);

  /// Return the behavior when calling the given function.
  FunctionModRefBehavior getModRefBehavior(const Function *F);

  /// Checks if the specified call is known to never read or write memory.
  ///
  /// Note that if the call only reads from known-constant memory, it is also
  /// legal to return true. Also, calls that unwind the stack are legal for
  /// this predicate.
  ///
  /// Many optimizations (such as CSE and LICM) can be performed on such calls
  /// without worrying about aliasing properties, and many calls have this
  /// property (e.g. calls to 'sin' and 'cos').
  ///
  /// This property corresponds to the GCC 'const' attribute.
  bool doesNotAccessMemory(const CallBase *Call) {
    return getModRefBehavior(Call) == FMRB_DoesNotAccessMemory;
  }

  /// Checks if the specified function is known to never read or write memory.
  ///
  /// Note that if the function only reads from known-constant memory, it is
  /// also legal to return true. Also, functions that unwind the stack are
  /// legal for this predicate.
  ///
  /// Many optimizations (such as CSE and LICM) can be performed on calls
  /// to such functions without worrying about aliasing properties, and many
  /// functions have this property (e.g. 'sin' and 'cos').
  ///
  /// This property corresponds to the GCC 'const' attribute.
  bool doesNotAccessMemory(const Function *F) {
    return getModRefBehavior(F) == FMRB_DoesNotAccessMemory;
  }

  /// Checks if the specified call is known to only read from non-volatile
  /// memory (or not access memory at all).
  ///
  /// Calls that unwind the stack are legal for this predicate.
  ///
  /// This property allows many common optimizations to be performed in the
  /// absence of interfering store instructions, such as CSE of strlen calls.
  ///
  /// This property corresponds to the GCC 'pure' attribute.
  bool onlyReadsMemory(const CallBase *Call) {
    return onlyReadsMemory(getModRefBehavior(Call));
  }

  /// Checks if the specified function is known to only read from non-volatile
  /// memory (or not access memory at all).
  ///
  /// Functions that unwind the stack are legal for this predicate.
  ///
  /// This property allows many common optimizations to be performed in the
  /// absence of interfering store instructions, such as CSE of strlen calls.
  ///
  /// This property corresponds to the GCC 'pure' attribute.
  bool onlyReadsMemory(const Function *F) {
    return onlyReadsMemory(getModRefBehavior(F));
  }

  /// Checks if functions with the specified behavior are known to only read
  /// from non-volatile memory (or not access memory at all).
  static bool onlyReadsMemory(FunctionModRefBehavior MRB) {
    return !isModSet(createModRefInfo(MRB));
  }

  /// Checks if functions with the specified behavior are known to only write
  /// memory (or not access memory at all).
  static bool doesNotReadMemory(FunctionModRefBehavior MRB) {
    return !isRefSet(createModRefInfo(MRB));
  }

  /// Checks if functions with the specified behavior are known to read and
  /// write at most from objects pointed to by their pointer-typed arguments
  /// (with arbitrary offsets).
  static bool onlyAccessesArgPointees(FunctionModRefBehavior MRB) {
    return !(MRB & FMRL_Anywhere & ~FMRL_ArgumentPointees);
  }

  /// Checks if functions with the specified behavior are known to potentially
  /// read or write from objects pointed to by their pointer-typed arguments
  /// (with arbitrary offsets).
  static bool doesAccessArgPointees(FunctionModRefBehavior MRB) {
    return isModOrRefSet(createModRefInfo(MRB)) &&
           (MRB & FMRL_ArgumentPointees);
  }

  /// Checks if functions with the specified behavior are known to read and
  /// write at most from memory that is inaccessible from LLVM IR.
  static bool onlyAccessesInaccessibleMem(FunctionModRefBehavior MRB) {
    return !(MRB & FMRL_Anywhere & ~FMRL_InaccessibleMem);
  }

  /// Checks if functions with the specified behavior are known to potentially
  /// read or write from memory that is inaccessible from LLVM IR.
  static bool doesAccessInaccessibleMem(FunctionModRefBehavior MRB) {
    return isModOrRefSet(createModRefInfo(MRB)) && (MRB & FMRL_InaccessibleMem);
  }

  /// Checks if functions with the specified behavior are known to read and
  /// write at most from memory that is inaccessible from LLVM IR or objects
  /// pointed to by their pointer-typed arguments (with arbitrary offsets).
  static bool onlyAccessesInaccessibleOrArgMem(FunctionModRefBehavior MRB) {
    return !(MRB & FMRL_Anywhere &
             ~(FMRL_InaccessibleMem | FMRL_ArgumentPointees));
  }

  /// getModRefInfo (for call sites) - Return information about whether
  /// a particular call site modifies or reads the specified memory location.
  ModRefInfo getModRefInfo(const CallBase *Call, const MemoryLocation &Loc);

  /// getModRefInfo (for call sites) - A convenience wrapper.
  ModRefInfo getModRefInfo(const CallBase *Call, const Value *P,
                           LocationSize Size) {
    return getModRefInfo(Call, MemoryLocation(P, Size));
  }

  /// getModRefInfo (for loads) - Return information about whether
  /// a particular load modifies or reads the specified memory location.
  ModRefInfo getModRefInfo(const LoadInst *L, const MemoryLocation &Loc);

  /// getModRefInfo (for loads) - A convenience wrapper.
  ModRefInfo getModRefInfo(const LoadInst *L, const Value *P,
                           LocationSize Size) {
    return getModRefInfo(L, MemoryLocation(P, Size));
  }

  /// getModRefInfo (for stores) - Return information about whether
  /// a particular store modifies or reads the specified memory location.
  ModRefInfo getModRefInfo(const StoreInst *S, const MemoryLocation &Loc);

  /// getModRefInfo (for stores) - A convenience wrapper.
  ModRefInfo getModRefInfo(const StoreInst *S, const Value *P,
                           LocationSize Size) {
    return getModRefInfo(S, MemoryLocation(P, Size));
  }

  /// getModRefInfo (for fences) - Return information about whether
  /// a particular fence modifies or reads the specified memory location.
  ModRefInfo getModRefInfo(const FenceInst *S, const MemoryLocation &Loc);

  /// getModRefInfo (for fences) - A convenience wrapper.
  ModRefInfo getModRefInfo(const FenceInst *S, const Value *P,
                           LocationSize Size) {
    return getModRefInfo(S, MemoryLocation(P, Size));
  }

  /// getModRefInfo (for cmpxchges) - Return information about whether
  /// a particular cmpxchg modifies or reads the specified memory location.
  ModRefInfo getModRefInfo(const AtomicCmpXchgInst *CX,
                           const MemoryLocation &Loc);

  /// getModRefInfo (for cmpxchges) - A convenience wrapper.
  ModRefInfo getModRefInfo(const AtomicCmpXchgInst *CX, const Value *P,
                           LocationSize Size) {
    return getModRefInfo(CX, MemoryLocation(P, Size));
  }

  /// getModRefInfo (for atomicrmws) - Return information about whether
  /// a particular atomicrmw modifies or reads the specified memory location.
  ModRefInfo getModRefInfo(const AtomicRMWInst *RMW, const MemoryLocation &Loc);

  /// getModRefInfo (for atomicrmws) - A convenience wrapper.
  ModRefInfo getModRefInfo(const AtomicRMWInst *RMW, const Value *P,
                           LocationSize Size) {
    return getModRefInfo(RMW, MemoryLocation(P, Size));
  }

  /// getModRefInfo (for va_args) - Return information about whether
  /// a particular va_arg modifies or reads the specified memory location.
  ModRefInfo getModRefInfo(const VAArgInst *I, const MemoryLocation &Loc);

  /// getModRefInfo (for va_args) - A convenience wrapper.
  ModRefInfo getModRefInfo(const VAArgInst *I, const Value *P,
                           LocationSize Size) {
    return getModRefInfo(I, MemoryLocation(P, Size));
  }

  /// getModRefInfo (for catchpads) - Return information about whether
  /// a particular catchpad modifies or reads the specified memory location.
  ModRefInfo getModRefInfo(const CatchPadInst *I, const MemoryLocation &Loc);

  /// getModRefInfo (for catchpads) - A convenience wrapper.
  ModRefInfo getModRefInfo(const CatchPadInst *I, const Value *P,
                           LocationSize Size) {
    return getModRefInfo(I, MemoryLocation(P, Size));
  }

  /// getModRefInfo (for catchrets) - Return information about whether
  /// a particular catchret modifies or reads the specified memory location.
  ModRefInfo getModRefInfo(const CatchReturnInst *I, const MemoryLocation &Loc);

  /// getModRefInfo (for catchrets) - A convenience wrapper.
  ModRefInfo getModRefInfo(const CatchReturnInst *I, const Value *P,
                           LocationSize Size) {
    return getModRefInfo(I, MemoryLocation(P, Size));
  }

  /// Check whether or not an instruction may read or write the optionally
  /// specified memory location.
  ///
  /// An instruction that doesn't read or write memory may be trivially LICM'd
  /// for example.
  ///
  /// For function calls, this delegates to the alias-analysis specific
  /// call-site mod-ref behavior queries. Otherwise it delegates to the specific
  /// helpers above.
  ///
  /// Each call uses a freshly constructed AAQueryInfo, so no state is shared
  /// between queries made through this overload.
  ModRefInfo getModRefInfo(const Instruction *I,
                           const Optional<MemoryLocation> &OptLoc) {
    AAQueryInfo AAQIP;
    return getModRefInfo(I, OptLoc, AAQIP);
  }

  /// A convenience wrapper for constructing the memory location.
  ModRefInfo getModRefInfo(const Instruction *I, const Value *P,
                           LocationSize Size) {
    return getModRefInfo(I, MemoryLocation(P, Size));
  }

  /// Return information about whether a call and an instruction may refer to
  /// the same memory locations.
  ModRefInfo getModRefInfo(Instruction *I, const CallBase *Call);

  /// Return information about whether two call sites may refer to the same set
  /// of memory locations. See the AA documentation for details:
  /// http://llvm.org/docs/AliasAnalysis.html#ModRefInfo
  ModRefInfo getModRefInfo(const CallBase *Call1, const CallBase *Call2);

  /// Return information about whether a particular call site modifies
  /// or reads the specified memory location \p MemLoc before instruction \p I
  /// in a BasicBlock. An ordered basic block \p OBB can be used to speed up
  /// instruction ordering queries inside the BasicBlock containing \p I.
  /// Early exits in callCapturesBefore may lead to ModRefInfo::Must not being
  /// set.
  ModRefInfo callCapturesBefore(const Instruction *I,
                                const MemoryLocation &MemLoc, DominatorTree *DT,
                                OrderedBasicBlock *OBB = nullptr);

  /// A convenience wrapper to synthesize a memory location.
  ModRefInfo callCapturesBefore(const Instruction *I, const Value *P,
                                LocationSize Size, DominatorTree *DT,
                                OrderedBasicBlock *OBB = nullptr) {
    return callCapturesBefore(I, MemoryLocation(P, Size), DT, OBB);
  }

  /// @}
  //===--------------------------------------------------------------------===//
  /// \name Higher level methods for querying mod/ref information.
  /// @{

  /// Check if it is possible for execution of the specified basic block to
  /// modify the location Loc.
  bool canBasicBlockModify(const BasicBlock &BB, const MemoryLocation &Loc);

  /// A convenience wrapper synthesizing a memory location.
  bool canBasicBlockModify(const BasicBlock &BB, const Value *P,
                           LocationSize Size) {
    return canBasicBlockModify(BB, MemoryLocation(P, Size));
  }

  /// Check if it is possible for the execution of the specified instructions
  /// to mod/ref (according to the mode) the location Loc.
  ///
  /// The instructions to consider are all of the instructions in the range of
  /// [I1,I2] INCLUSIVE. I1 and I2 must be in the same basic block.
  bool canInstructionRangeModRef(const Instruction &I1, const Instruction &I2,
                                 const MemoryLocation &Loc,
                                 const ModRefInfo Mode);

  /// A convenience wrapper synthesizing a memory location.
  bool canInstructionRangeModRef(const Instruction &I1, const Instruction &I2,
                                 const Value *Ptr, LocationSize Size,
                                 const ModRefInfo Mode) {
    return canInstructionRangeModRef(I1, I2, MemoryLocation(Ptr, Size), Mode);
  }

private:
  // The overloads below mirror the public queries but take an explicit
  // AAQueryInfo; BatchAAResults (a friend) passes its own AAQueryInfo here.
  AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB,
                    AAQueryInfo &AAQI);
  bool pointsToConstantMemory(const MemoryLocation &Loc, AAQueryInfo &AAQI,
                              bool OrLocal = false);
  ModRefInfo getModRefInfo(Instruction *I, const CallBase *Call2,
                           AAQueryInfo &AAQIP);
  ModRefInfo getModRefInfo(const CallBase *Call, const MemoryLocation &Loc,
                           AAQueryInfo &AAQI);
  ModRefInfo getModRefInfo(const CallBase *Call1, const CallBase *Call2,
                           AAQueryInfo &AAQI);
  ModRefInfo getModRefInfo(const VAArgInst *V, const MemoryLocation &Loc,
                           AAQueryInfo &AAQI);
  ModRefInfo getModRefInfo(const LoadInst *L, const MemoryLocation &Loc,
                           AAQueryInfo &AAQI);
  ModRefInfo getModRefInfo(const StoreInst *S, const MemoryLocation &Loc,
                           AAQueryInfo &AAQI);
  ModRefInfo getModRefInfo(const FenceInst *S, const MemoryLocation &Loc,
                           AAQueryInfo &AAQI);
  ModRefInfo getModRefInfo(const AtomicCmpXchgInst *CX,
                           const MemoryLocation &Loc, AAQueryInfo &AAQI);
  ModRefInfo getModRefInfo(const AtomicRMWInst *RMW, const MemoryLocation &Loc,
                           AAQueryInfo &AAQI);
  ModRefInfo getModRefInfo(const CatchPadInst *I, const MemoryLocation &Loc,
                           AAQueryInfo &AAQI);
  ModRefInfo getModRefInfo(const CatchReturnInst *I, const MemoryLocation &Loc,
                           AAQueryInfo &AAQI);

  /// Dispatch a mod/ref query for an arbitrary instruction to the overload
  /// matching its opcode.
  ///
  /// When \p OptLoc is None and \p I is a call, the result is derived from the
  /// call's overall FunctionModRefBehavior. Otherwise the query is made
  /// against \p OptLoc, or against a default-constructed MemoryLocation when
  /// \p OptLoc is None. Opcodes without a dedicated overload yield NoModRef.
  ModRefInfo getModRefInfo(const Instruction *I,
                           const Optional<MemoryLocation> &OptLoc,
                           AAQueryInfo &AAQIP) {
    if (OptLoc == None) {
      if (const auto *Call = dyn_cast<CallBase>(I)) {
        return createModRefInfo(getModRefBehavior(Call));
      }
    }

    const MemoryLocation &Loc = OptLoc.getValueOr(MemoryLocation());

    switch (I->getOpcode()) {
    case Instruction::VAArg:
      return getModRefInfo(cast<VAArgInst>(I), Loc, AAQIP);
    case Instruction::Load:
      return getModRefInfo(cast<LoadInst>(I), Loc, AAQIP);
    case Instruction::Store:
      return getModRefInfo(cast<StoreInst>(I), Loc, AAQIP);
    case Instruction::Fence:
      return getModRefInfo(cast<FenceInst>(I), Loc, AAQIP);
    case Instruction::AtomicCmpXchg:
      return getModRefInfo(cast<AtomicCmpXchgInst>(I), Loc, AAQIP);
    case Instruction::AtomicRMW:
      return getModRefInfo(cast<AtomicRMWInst>(I), Loc, AAQIP);
    // Every CallBase opcode goes through the call-site query. CallBr must be
    // listed here as well, otherwise a callbr queried with a location would
    // fall through to the default case and be reported as NoModRef.
    case Instruction::Call:
    case Instruction::CallBr:
    case Instruction::Invoke:
      return getModRefInfo(cast<CallBase>(I), Loc, AAQIP);
    case Instruction::CatchPad:
      return getModRefInfo(cast<CatchPadInst>(I), Loc, AAQIP);
    case Instruction::CatchRet:
      return getModRefInfo(cast<CatchReturnInst>(I), Loc, AAQIP);
    default:
      return ModRefInfo::NoModRef;
    }
  }

  class Concept;
  template <typename T> class Model;
  template <typename T> friend class AAResultBase;

  const TargetLibraryInfo &TLI;
  std::vector<std::unique_ptr<Concept>> AAs;
  std::vector<AnalysisKey *> AADeps;

  friend class BatchAAResults;
};
/// This class is a wrapper over an AAResults, and it is intended to be used
/// only when there are no IR changes in between queries. BatchAAResults is
/// reusing the same `AAQueryInfo` to preserve the state across queries,
/// essentially making AA work in "batch mode". The internal state cannot be
/// cleared, so to go "out-of-batch-mode", the user must either use AAResults,
/// or create a new BatchAAResults.
class BatchAAResults {
AAResults &AA;
AAQueryInfo AAQI;
public:
BatchAAResults(AAResults &AAR) : AA(AAR), AAQI() {}
AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB) {
return AA.alias(LocA, LocB, AAQI);
}
bool pointsToConstantMemory(const MemoryLocation &Loc, bool OrLocal = false) {
return AA.pointsToConstantMemory(Loc, AAQI, OrLocal);
}
ModRefInfo getModRefInfo(const CallBase *Call, const MemoryLocation &Loc) {
return AA.getModRefInfo(Call, Loc, AAQI);
}
ModRefInfo getModRefInfo(const CallBase *Call1, const CallBase *Call2) {
return AA.getModRefInfo(Call1, Call2, AAQI);
}
ModRefInfo getModRefInfo(const Instruction *I,
const Optional<MemoryLocation> &OptLoc) {
return AA.getModRefInfo(I, OptLoc, AAQI);
}
ModRefInfo getModRefInfo(Instruction *I, const CallBase *Call2) {
return AA.getModRefInfo(I, Call2, AAQI);
}
ModRefInfo getArgModRefInfo(const CallBase *Call, unsigned ArgIdx) {
return AA.getArgModRefInfo(Call, ArgIdx);
}
FunctionModRefBehavior getModRefBehavior(const CallBase *Call) {
return AA.getModRefBehavior(Call);
}
};
/// Temporary typedef for legacy code that uses a generic \c AliasAnalysis
/// pointer or reference.
using AliasAnalysis = AAResults;
/// A private abstract base class describing the concept of an individual alias
/// analysis implementation.
///
/// This interface is implemented by any \c Model instantiation. It is also the
/// interface which a type used to instantiate the model must provide.
///
/// All of these methods model methods by the same name in the \c
/// AAResults class. Only differences and specifics to how the
/// implementations are called are documented here.
class AAResults::Concept {
public:
/// Pure virtual destructor. Only declared here; since derived Model objects
/// are destroyed through it, a definition has to exist elsewhere (not visible
/// in this chunk).
virtual ~Concept() = 0;
/// An update API used internally by the AAResults to provide
/// a handle back to the top level aggregation.
virtual void setAAResults(AAResults *NewAAR) = 0;
//===--------------------------------------------------------------------===//
/// \name Alias Queries
/// @{
/// The main low level interface to the alias analysis implementation.
/// Returns an AliasResult indicating whether the two pointers are aliased to
/// each other. This is the interface that must be implemented by specific
/// alias analysis implementations.
virtual AliasResult alias(const MemoryLocation &LocA,
const MemoryLocation &LocB, AAQueryInfo &AAQI) = 0;
/// Checks whether the given location points to constant memory, or if
/// \p OrLocal is true whether it points to a local alloca.
virtual bool pointsToConstantMemory(const MemoryLocation &Loc,
AAQueryInfo &AAQI, bool OrLocal) = 0;
/// @}
//===--------------------------------------------------------------------===//
/// \name Simple mod/ref information
/// @{
/// Get the ModRef info associated with a pointer argument of a callsite. The
/// result's bits are set to indicate the allowed aliasing ModRef kinds. Note
/// that these bits do not necessarily account for the overall behavior of
/// the function, but rather only provide additional per-argument
/// information.
virtual ModRefInfo getArgModRefInfo(const CallBase *Call,
unsigned ArgIdx) = 0;
/// Return the behavior of the given call site.
virtual FunctionModRefBehavior getModRefBehavior(const CallBase *Call) = 0;
/// Return the behavior when calling the given function.
virtual FunctionModRefBehavior getModRefBehavior(const Function *F) = 0;
/// getModRefInfo (for call sites) - Return information about whether
/// a particular call site modifies or reads the specified memory location.
virtual ModRefInfo getModRefInfo(const CallBase *Call,
const MemoryLocation &Loc,
AAQueryInfo &AAQI) = 0;
/// Return information about whether two call sites may refer to the same set
/// of memory locations. See the AA documentation for details:
/// http://llvm.org/docs/AliasAnalysis.html#ModRefInfo
virtual ModRefInfo getModRefInfo(const CallBase *Call1, const CallBase *Call2,
AAQueryInfo &AAQI) = 0;
/// @}
};
/// A private class template which derives from \c Concept and wraps some other
/// type.
///
/// This models the concept by directly forwarding each interface point to the
/// wrapped type which must implement a compatible interface. This provides
/// a type erased binding.
template <typename AAResultT> class AAResults::Model final : public Concept {
// The wrapped analysis result; held by reference, so this object does not
// own it.
AAResultT &Result;
public:
/// Bind to \p Result and immediately hand it a pointer to the aggregation
/// \p AAR it is being registered with.
explicit Model(AAResultT &Result, AAResults &AAR) : Result(Result) {
Result.setAAResults(&AAR);
}
~Model() override = default;
// Every override below forwards its arguments unchanged to the wrapped
// result's member of the same name.
void setAAResults(AAResults *NewAAR) override { Result.setAAResults(NewAAR); }
AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB,
AAQueryInfo &AAQI) override {
return Result.alias(LocA, LocB, AAQI);
}
bool pointsToConstantMemory(const MemoryLocation &Loc, AAQueryInfo &AAQI,
bool OrLocal) override {
return Result.pointsToConstantMemory(Loc, AAQI, OrLocal);
}
ModRefInfo getArgModRefInfo(const CallBase *Call, unsigned ArgIdx) override {
return Result.getArgModRefInfo(Call, ArgIdx);
}
FunctionModRefBehavior getModRefBehavior(const CallBase *Call) override {
return Result.getModRefBehavior(Call);
}
FunctionModRefBehavior getModRefBehavior(const Function *F) override {
return Result.getModRefBehavior(F);
}
ModRefInfo getModRefInfo(const CallBase *Call, const MemoryLocation &Loc,
AAQueryInfo &AAQI) override {
return Result.getModRefInfo(Call, Loc, AAQI);
}
ModRefInfo getModRefInfo(const CallBase *Call1, const CallBase *Call2,
AAQueryInfo &AAQI) override {
return Result.getModRefInfo(Call1, Call2, AAQI);
}
};
/// A CRTP-driven "mixin" base class to help implement the function alias
/// analysis results concept.
///
/// Because of the nature of many alias analysis implementations, they often
/// only implement a subset of the interface. This base class will attempt to
/// implement the remaining portions of the interface in terms of simpler forms
/// of the interface where possible, and otherwise provide conservatively
/// correct fallback implementations.
///
/// Implementors of an alias analysis should derive from this CRTP, and then
/// override specific methods that they wish to customize. There is no need to
/// use virtual anywhere, the CRTP base class does static dispatch to the
/// derived type passed into it.
template <typename DerivedT> class AAResultBase {
// Expose some parts of the interface only to the AAResults::Model
// for wrapping. Specifically, this allows the model to call our
// setAAResults method without exposing it as a fully public API.
friend class AAResults::Model<DerivedT>;
/// A pointer to the AAResults object that this AAResult is
/// aggregated within. May be null if not aggregated.
- AAResults *AAR;
+ AAResults *AAR = nullptr;
/// Helper to dispatch calls back through the derived type.
DerivedT &derived() { return static_cast<DerivedT &>(*this); }
/// A setter for the AAResults pointer, which is used to satisfy the
/// AAResults::Model contract.
void setAAResults(AAResults *NewAAR) { AAR = NewAAR; }
protected:
/// This proxy class models a common pattern where we delegate to either the
/// top-level \c AAResults aggregation if one is registered, or to the
/// current result if none are registered.
class AAResultsProxy {
AAResults *AAR;
DerivedT &CurrentResult;
public:
AAResultsProxy(AAResults *AAR, DerivedT &CurrentResult)
: AAR(AAR), CurrentResult(CurrentResult) {}
// Every query below has the same shape: use the aggregation when AAR is
// non-null, otherwise ask CurrentResult directly.
AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB,
AAQueryInfo &AAQI) {
return AAR ? AAR->alias(LocA, LocB, AAQI)
: CurrentResult.alias(LocA, LocB, AAQI);
}
bool pointsToConstantMemory(const MemoryLocation &Loc, AAQueryInfo &AAQI,
bool OrLocal) {
return AAR ? AAR->pointsToConstantMemory(Loc, AAQI, OrLocal)
: CurrentResult.pointsToConstantMemory(Loc, AAQI, OrLocal);
}
ModRefInfo getArgModRefInfo(const CallBase *Call, unsigned ArgIdx) {
return AAR ? AAR->getArgModRefInfo(Call, ArgIdx)
: CurrentResult.getArgModRefInfo(Call, ArgIdx);
}
FunctionModRefBehavior getModRefBehavior(const CallBase *Call) {
return AAR ? AAR->getModRefBehavior(Call)
: CurrentResult.getModRefBehavior(Call);
}
FunctionModRefBehavior getModRefBehavior(const Function *F) {
return AAR ? AAR->getModRefBehavior(F) : CurrentResult.getModRefBehavior(F);
}
ModRefInfo getModRefInfo(const CallBase *Call, const MemoryLocation &Loc,
AAQueryInfo &AAQI) {
return AAR ? AAR->getModRefInfo(Call, Loc, AAQI)
: CurrentResult.getModRefInfo(Call, Loc, AAQI);
}
ModRefInfo getModRefInfo(const CallBase *Call1, const CallBase *Call2,
AAQueryInfo &AAQI) {
return AAR ? AAR->getModRefInfo(Call1, Call2, AAQI)
: CurrentResult.getModRefInfo(Call1, Call2, AAQI);
}
};
explicit AAResultBase() = default;
// Provide all the copy and move constructors so that derived types aren't
// constrained.
// Note that neither constructor copies Arg's AAR: the bodies are empty and
// there is no member initializer list, so AAR takes whatever its declaration
// provides (nullptr with the in-class initializer on the '+' line above,
// uninitialized with the '-' line).
AAResultBase(const AAResultBase &Arg) {}
AAResultBase(AAResultBase &&Arg) {}
/// Get a proxy for the best AA result set to query at this time.
///
/// When this result is part of a larger aggregation, this will proxy to that
/// aggregation. When this result is used in isolation, it will just delegate
/// back to the derived class's implementation.
///
/// Note that callers of this need to take considerable care to not cause
/// performance problems when they use this routine, in the case of a large
/// number of alias analyses being aggregated, it can be expensive to walk
/// back across the chain.
AAResultsProxy getBestAAResults() { return AAResultsProxy(AAR, derived()); }
public:
// Fallback implementations used when the derived class does not provide its
// own: each one ignores its arguments and returns a fixed answer (MayAlias,
// false, ModRef, or FMRB_UnknownModRefBehavior).
AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB,
AAQueryInfo &AAQI) {
return MayAlias;
}
bool pointsToConstantMemory(const MemoryLocation &Loc, AAQueryInfo &AAQI,
bool OrLocal) {
return false;
}
ModRefInfo getArgModRefInfo(const CallBase *Call, unsigned ArgIdx) {
return ModRefInfo::ModRef;
}
FunctionModRefBehavior getModRefBehavior(const CallBase *Call) {
return FMRB_UnknownModRefBehavior;
}
FunctionModRefBehavior getModRefBehavior(const Function *F) {
return FMRB_UnknownModRefBehavior;
}
ModRefInfo getModRefInfo(const CallBase *Call, const MemoryLocation &Loc,
AAQueryInfo &AAQI) {
return ModRefInfo::ModRef;
}
ModRefInfo getModRefInfo(const CallBase *Call1, const CallBase *Call2,
AAQueryInfo &AAQI) {
return ModRefInfo::ModRef;
}
};
/// Return true if this pointer is returned by a noalias function.
bool isNoAliasCall(const Value *V);
/// Return true if this is an argument with the noalias attribute.
bool isNoAliasArgument(const Value *V);
/// Return true if this pointer refers to a distinct and identifiable object.
/// This returns true for:
/// Global Variables and Functions (but not Global Aliases)
/// Allocas
/// ByVal and NoAlias Arguments
/// NoAlias returns (e.g. calls to malloc)
///
bool isIdentifiedObject(const Value *V);
/// Return true if V is unambiguously identified at the function-level.
/// Different IdentifiedFunctionLocals can't alias.
/// Further, an IdentifiedFunctionLocal can not alias with any function
/// arguments other than itself, which is not necessarily true for
/// IdentifiedObjects.
bool isIdentifiedFunctionLocal(const Value *V);
/// Aggregating manager for alias analyses.
///
/// Analyses are registered with this class ahead of time. When it is run, it
/// invokes each registered analysis in turn and collects their results into a
/// single AAResults object that dispatches across all of them.
///
/// Registration order is significant: it is the order in which the individual
/// results are aggregated and later queried.
///
/// The manager effectively wraps the AnalysisManager for registering alias
/// analyses; registering an analysis here ensures the analysis itself is
/// registered with its AnalysisManager.
///
/// The aggregated result is only invalidated when one of the individual AA
/// results it contains is invalidated, so there is no need to explicitly
/// preserve the result of `AAManager`. Analyses should no longer be registered
/// once the `AAManager` has been run.
class AAManager : public AnalysisInfoMixin<AAManager> {
public:
  using Result = AAResults;

  /// Register a function-level analysis whose result should be aggregated.
  template <typename AnalysisT> void registerFunctionAnalysis() {
    ResultGetters.push_back(&getFunctionAAResultImpl<AnalysisT>);
  }

  /// Register a module-level analysis whose cached result, if any, should be
  /// aggregated.
  template <typename AnalysisT> void registerModuleAnalysis() {
    ResultGetters.push_back(&getModuleAAResultImpl<AnalysisT>);
  }

  /// Build the aggregated result for \p F by invoking every registered getter
  /// in registration order.
  Result run(Function &F, FunctionAnalysisManager &AM) {
    Result Aggregate(AM.getResult<TargetLibraryAnalysis>(F));
    for (auto *Populate : ResultGetters)
      Populate(F, AM, Aggregate);
    return Aggregate;
  }

private:
  friend AnalysisInfoMixin<AAManager>;

  static AnalysisKey Key;

  /// Signature of a callback that adds one analysis result to an aggregation.
  using ResultGetterFn = void (*)(Function &, FunctionAnalysisManager &,
                                  AAResults &);

  SmallVector<ResultGetterFn, 4> ResultGetters;

  /// Compute (or fetch) the function analysis result and add it, recording
  /// the analysis ID as a dependency of the aggregation.
  template <typename AnalysisT>
  static void getFunctionAAResultImpl(Function &F,
                                      FunctionAnalysisManager &AM,
                                      AAResults &AAR) {
    AAR.addAAResult(AM.template getResult<AnalysisT>(F));
    AAR.addAADependencyID(AnalysisT::ID());
  }

  /// Add the module analysis result only when one is already cached for the
  /// parent module, and register the outer-analysis invalidation for it.
  template <typename AnalysisT>
  static void getModuleAAResultImpl(Function &F, FunctionAnalysisManager &AM,
                                    AAResults &AAR) {
    auto &Proxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
    auto &ModuleAM = Proxy.getManager();
    auto *Cached =
        ModuleAM.template getCachedResult<AnalysisT>(*F.getParent());
    if (!Cached)
      return;
    AAR.addAAResult(*Cached);
    Proxy.template registerOuterAnalysisInvalidation<AnalysisT, AAManager>();
  }
};
/// A wrapper pass to provide the legacy pass manager access to a suitably
/// prepared AAResults object.
class AAResultsWrapperPass : public FunctionPass {
// The aggregation owned by this pass. It is not initialized in this header
// (presumably runOnFunction creates it -- confirm in the implementation).
std::unique_ptr<AAResults> AAR;
public:
static char ID;
AAResultsWrapperPass();
/// Access the owned aggregation. Both overloads dereference AAR without a
/// null check, so they must only be called once AAR has been populated.
AAResults &getAAResults() { return *AAR; }
const AAResults &getAAResults() const { return *AAR; }
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override;
};
/// A wrapper pass for external alias analyses. This just squirrels away the
/// callback used to run any analyses and register their results.
struct ExternalAAWrapperPass : ImmutablePass {
using CallbackT = std::function<void(Pass &, Function &, AAResults &)>;
CallbackT CB;
static char ID;
ExternalAAWrapperPass() : ImmutablePass(ID) {
initializeExternalAAWrapperPassPass(*PassRegistry::getPassRegistry());
}
explicit ExternalAAWrapperPass(CallbackT CB)
: ImmutablePass(ID), CB(std::move(CB)) {
initializeExternalAAWrapperPassPass(*PassRegistry::getPassRegistry());
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
}
};
FunctionPass *createAAResultsWrapperPass();
/// A wrapper pass around a callback which can be used to populate the
/// AAResults in the AAResultsWrapperPass from an external AA.
///
/// The callback provided here will be used each time we prepare an AAResults
/// object, and will receive a reference to the function wrapper pass, the
/// function, and the AAResults object to populate. This should be used when
/// setting up a custom pass pipeline to inject a hook into the AA results.
ImmutablePass *createExternalAAWrapperPass(
std::function<void(Pass &, Function &, AAResults &)> Callback);
/// A helper for the legacy pass manager to create a \c AAResults
/// object populated to the best of our ability for a particular function when
/// inside of a \c ModulePass or a \c CallGraphSCCPass.
///
/// If a \c ModulePass or a \c CallGraphSCCPass calls \p
/// createLegacyPMAAResults, it also needs to call \p getAAResultsAnalysisUsage
/// (declared below) in \p getAnalysisUsage.
// NOTE(review): this comment previously named `addUsedAAAnalyses`; no such
// function is declared in the visible part of this header, and the helper
// documented below matches the described purpose -- confirm.
AAResults createLegacyPMAAResults(Pass &P, Function &F, BasicAAResult &BAR);
/// A helper for the legacy pass manager to populate \p AU to add uses to make
/// sure the analyses required by \p createLegacyPMAAResults are available.
void getAAResultsAnalysisUsage(AnalysisUsage &AU);
} // end namespace llvm
#endif // LLVM_ANALYSIS_ALIASANALYSIS_H
Index: vendor/llvm/dist-release_90/include/llvm/CodeGen/SelectionDAG.h
===================================================================
--- vendor/llvm/dist-release_90/include/llvm/CodeGen/SelectionDAG.h (revision 351302)
+++ vendor/llvm/dist-release_90/include/llvm/CodeGen/SelectionDAG.h (revision 351303)
@@ -1,1768 +1,1786 @@
//===- llvm/CodeGen/SelectionDAG.h - InstSelection DAG ----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file declares the SelectionDAG class, and transitively defines the
// SDNode class and subclasses.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_CODEGEN_SELECTIONDAG_H
#define LLVM_CODEGEN_SELECTIONDAG_H
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/FoldingSet.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/ilist.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/CodeGen/DAGCombine.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/ArrayRecycler.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/RecyclingAllocator.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <map>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
namespace llvm {
class BlockAddress;
class Constant;
class ConstantFP;
class ConstantInt;
class DataLayout;
struct fltSemantics;
class GlobalValue;
struct KnownBits;
class LLVMContext;
class MachineBasicBlock;
class MachineConstantPoolValue;
class MCSymbol;
class OptimizationRemarkEmitter;
class SDDbgValue;
class SDDbgLabel;
class SelectionDAG;
class SelectionDAGTargetInfo;
class TargetLibraryInfo;
class TargetLowering;
class TargetMachine;
class TargetSubtargetInfo;
class Value;
/// Folding-set node holding one interned list of value types together with
/// its precomputed hash.
class SDVTListNode : public FoldingSetNode {
  friend struct FoldingSetTrait<SDVTListNode>;

  /// A reference to an interned FoldingSetNodeID for this node; the backing
  /// data is held by the Allocator in SelectionDAG.
  /// SDVTList contains all types which are frequently accessed in
  /// SelectionDAG. The list is not expected to grow large, so keeping it
  /// does not introduce a memory penalty.
  FoldingSetNodeIDRef FastID;
  const EVT *VTs;
  unsigned int NumVTs;
  /// The hash of an SDVTList never changes, so it is computed once at
  /// construction and cached here.
  unsigned HashValue;

public:
  SDVTListNode(const FoldingSetNodeIDRef ID, const EVT *VT, unsigned int Num)
      : FastID(ID), VTs(VT), NumVTs(Num), HashValue(ID.ComputeHash()) {}

  /// Return the stored type list as an SDVTList value.
  SDVTList getSDVTList() { return {VTs, NumVTs}; }
};
/// FoldingSetTrait specialization for SDVTListNode that reuses the node's
/// stored ID and cached hash instead of building a temporary
/// FoldingSetNodeID and rehashing.
template <>
struct FoldingSetTrait<SDVTListNode> : DefaultFoldingSetTrait<SDVTListNode> {
  /// Profile by copying the node's stored ID.
  static void Profile(const SDVTListNode &X, FoldingSetNodeID &ID) {
    ID = X.FastID;
  }

  /// Compare hashes first; only matching hashes go on to compare the IDs.
  static bool Equals(const SDVTListNode &X, const FoldingSetNodeID &ID,
                     unsigned IDHash, FoldingSetNodeID &TempID) {
    return X.HashValue == IDHash && ID == X.FastID;
  }

  /// Return the hash cached in the node.
  static unsigned ComputeHash(const SDVTListNode &X,
                              FoldingSetNodeID &TempID) {
    return X.HashValue;
  }
};
/// Allocation traits for ilist<SDNode>: the list must never be asked to
/// delete a node, so deleteNode is marked unreachable.
template <> struct ilist_alloc_traits<SDNode> {
static void deleteNode(SDNode *) {
llvm_unreachable("ilist_traits<SDNode> shouldn't see a deleteNode call!");
}
};
/// Side table carrying dbg_value (and dbg_label) information through SDISel.
/// No SDNodes are built for these so that the generated code is not
/// perturbed; the information lives here instead. An SDNode may have one or
/// more dbg_value entries associated with it, recorded in DbgValMap.
/// Byval parameters are kept in a separate list because they don't use
/// allocas, which defeats the normal mechanism. There is good reason to treat
/// all parameters separately: they may not have code generated for them, they
/// should always go at the beginning of the function regardless of other code
/// motion, and debug info for them is potentially useful even if the
/// parameter is unused. For now only byval parameters are handled separately.
class SDDbgInfo {
  BumpPtrAllocator Alloc;
  SmallVector<SDDbgValue *, 32> DbgValues;
  SmallVector<SDDbgValue *, 32> ByvalParmDbgValues;
  SmallVector<SDDbgLabel *, 4> DbgLabels;
  using DbgValMapType = DenseMap<const SDNode *, SmallVector<SDDbgValue *, 2>>;
  DbgValMapType DbgValMap;

public:
  SDDbgInfo() = default;
  SDDbgInfo(const SDDbgInfo &) = delete;
  SDDbgInfo &operator=(const SDDbgInfo &) = delete;

  /// Record \p V in the parameter or regular list, and additionally index it
  /// under \p Node when a node is given.
  void add(SDDbgValue *V, const SDNode *Node, bool isParameter) {
    auto &Target = isParameter ? ByvalParmDbgValues : DbgValues;
    Target.push_back(V);
    if (!Node)
      return;
    DbgValMap[Node].push_back(V);
  }

  /// Record a debug label.
  void add(SDDbgLabel *L) { DbgLabels.push_back(L); }

  /// Invalidate all DbgValues attached to the node and remove
  /// it from the Node-to-DbgValues map.
  void erase(const SDNode *Node);

  /// Drop every recorded entry and reset the allocator.
  void clear() {
    DbgValMap.clear();
    DbgValues.clear();
    ByvalParmDbgValues.clear();
    DbgLabels.clear();
    Alloc.Reset();
  }

  BumpPtrAllocator &getAlloc() { return Alloc; }

  /// True when no values, byval parameter values or labels are recorded.
  bool empty() const {
    return DbgValues.empty() && ByvalParmDbgValues.empty() &&
           DbgLabels.empty();
  }

  /// Return the values indexed under \p Node, or an empty range when the node
  /// has no entry.
  ArrayRef<SDDbgValue *> getSDDbgValues(const SDNode *Node) const {
    const auto Found = DbgValMap.find(Node);
    if (Found == DbgValMap.end())
      return ArrayRef<SDDbgValue *>();
    return Found->second;
  }

  using DbgIterator = SmallVectorImpl<SDDbgValue *>::iterator;
  using DbgLabelIterator = SmallVectorImpl<SDDbgLabel *>::iterator;

  DbgIterator DbgBegin() { return DbgValues.begin(); }
  DbgIterator DbgEnd() { return DbgValues.end(); }
  DbgIterator ByvalParmDbgBegin() { return ByvalParmDbgValues.begin(); }
  DbgIterator ByvalParmDbgEnd() { return ByvalParmDbgValues.end(); }
  DbgLabelIterator DbgLabelBegin() { return DbgLabels.begin(); }
  DbgLabelIterator DbgLabelEnd() { return DbgLabels.end(); }
};
/// Declared ahead of SelectionDAG because SelectionDAG::setRoot calls it
/// inline. Judging by the name it checks \p DAG for cycles; the meaning of
/// \p force is not visible here -- see the definition.
void checkForCycles(const SelectionDAG *DAG, bool force = false);
/// This is used to represent a portion of an LLVM function in a low-level
/// Data Dependence DAG representation suitable for instruction selection.
/// This DAG is constructed as the first step of instruction selection in order
/// to allow implementation of machine specific optimizations
/// and code simplifications.
///
/// The representation used by the SelectionDAG is a target-independent
/// representation, which has some similarities to the GCC RTL representation,
/// but is significantly more simple, powerful, and is a graph form instead of a
/// linear form.
///
class SelectionDAG {
const TargetMachine &TM;
const SelectionDAGTargetInfo *TSI = nullptr;
const TargetLowering *TLI = nullptr;
const TargetLibraryInfo *LibInfo = nullptr;
MachineFunction *MF;
Pass *SDAGISelPass = nullptr;
LLVMContext *Context;
CodeGenOpt::Level OptLevel;
LegacyDivergenceAnalysis * DA = nullptr;
FunctionLoweringInfo * FLI = nullptr;
/// The function-level optimization remark emitter. Used to emit remarks
/// whenever manipulating the DAG.
OptimizationRemarkEmitter *ORE;
/// The starting token.
SDNode EntryNode;
/// The root of the entire DAG.
SDValue Root;
/// A linked list of nodes in the current DAG.
ilist<SDNode> AllNodes;
/// The AllocatorType for allocating SDNodes. We use
/// pool allocation with recycling.
using NodeAllocatorType = RecyclingAllocator<BumpPtrAllocator, SDNode,
sizeof(LargestSDNode),
alignof(MostAlignedSDNode)>;
/// Pool allocation for nodes.
NodeAllocatorType NodeAllocator;
/// This structure is used to memoize nodes, automatically performing
/// CSE with existing nodes when a duplicate is requested.
FoldingSet<SDNode> CSEMap;
/// Pool allocation for machine-opcode SDNode operands.
BumpPtrAllocator OperandAllocator;
ArrayRecycler<SDUse> OperandRecycler;
/// Pool allocation for misc. objects that are created once per SelectionDAG.
BumpPtrAllocator Allocator;
/// Tracks dbg_value and dbg_label information through SDISel.
SDDbgInfo *DbgInfo;
using CallSiteInfo = MachineFunction::CallSiteInfo;
using CallSiteInfoImpl = MachineFunction::CallSiteInfoImpl;
- DenseMap<const SDNode *, CallSiteInfo> SDCallSiteInfo;
+ struct CallSiteDbgInfo {
+ CallSiteInfo CSInfo;
+ MDNode *HeapAllocSite = nullptr;
+ };
+
+ DenseMap<const SDNode *, CallSiteDbgInfo> SDCallSiteDbgInfo;
+
uint16_t NextPersistentId = 0;
public:
/// Clients of various APIs that cause global effects on
/// the DAG can optionally implement this interface. This allows the clients
/// to handle the various sorts of updates that happen.
///
/// A DAGUpdateListener automatically registers itself with DAG when it is
/// constructed, and removes itself when destroyed in RAII fashion.
struct DAGUpdateListener {
// The listener that was at the head of the DAG's list when this one was
// constructed; restored as the head on destruction.
DAGUpdateListener *const Next;
SelectionDAG &DAG;
/// Push this listener onto the front of \p D's listener list.
explicit DAGUpdateListener(SelectionDAG &D)
: Next(D.UpdateListeners), DAG(D) {
DAG.UpdateListeners = this;
}
/// Pop this listener. It must still be the head of the list, i.e. listeners
/// have to be destroyed in the reverse order of their construction.
virtual ~DAGUpdateListener() {
assert(DAG.UpdateListeners == this &&
"DAGUpdateListeners must be destroyed in LIFO order");
DAG.UpdateListeners = Next;
}
/// The node N that was deleted and, if E is not null, an
/// equivalent node E that replaced it.
virtual void NodeDeleted(SDNode *N, SDNode *E);
/// The node N that was updated.
virtual void NodeUpdated(SDNode *N);
/// The node N that was inserted.
virtual void NodeInserted(SDNode *N);
};
/// A DAGUpdateListener that forwards NodeDeleted notifications to a
/// caller-supplied callback. NodeUpdated and NodeInserted are not overridden.
struct DAGNodeDeletedListener : public DAGUpdateListener {
// Invoked as Callback(N, E) for every NodeDeleted notification.
std::function<void(SDNode *, SDNode *)> Callback;
DAGNodeDeletedListener(SelectionDAG &DAG,
std::function<void(SDNode *, SDNode *)> Callback)
: DAGUpdateListener(DAG), Callback(std::move(Callback)) {}
void NodeDeleted(SDNode *N, SDNode *E) override { Callback(N, E); }
private:
// Defined out of line; presumably the usual vtable anchor -- confirm in the
// implementation file.
virtual void anchor();
};
/// When true, additional steps are taken to
/// ensure that getConstant() and similar functions return DAG nodes that
/// have legal types. This is important after type legalization since
/// any illegally typed nodes generated after this point will not experience
/// type legalization.
bool NewNodesMustHaveLegalTypes = false;
private:
/// DAGUpdateListener is a friend so it can manipulate the listener stack.
friend struct DAGUpdateListener;
/// Linked list of registered DAGUpdateListener instances.
/// This stack is maintained by DAGUpdateListener RAII.
DAGUpdateListener *UpdateListeners = nullptr;
/// Implementation of setSubgraphColor.
/// Return whether we had to truncate the search.
bool setSubgraphColorHelper(SDNode *N, const char *Color,
DenseSet<SDNode *> &visited,
int level, bool &printed);
/// Allocate storage for an SDNodeT from NodeAllocator and construct the node
/// in place, perfectly forwarding \p Args to its constructor. Nothing else is
/// done with the node here; the caller receives the raw pointer.
template <typename SDNodeT, typename... ArgTypes>
SDNodeT *newSDNode(ArgTypes &&... Args) {
return new (NodeAllocator.template Allocate<SDNodeT>())
SDNodeT(std::forward<ArgTypes>(Args)...);
}
/// Build a synthetic SDNodeT with the given args and extract its subclass
/// data as an integer (e.g. for use in a folding set).
///
/// The args to this function are the same as the args to SDNodeT's
/// constructor, except the second arg (assumed to be a const DebugLoc&) is
/// omitted.
template <typename SDNodeT, typename... ArgTypes>
static uint16_t getSyntheticNodeSubclassData(unsigned IROrder,
ArgTypes &&... Args) {
// The compiler can reduce this expression to a constant iff we pass an
// empty DebugLoc. Thankfully, the debug location doesn't have any bearing
// on the subclass data.
return SDNodeT(IROrder, DebugLoc(), std::forward<ArgTypes>(Args)...)
.getRawSubclassData();
}
/// Variant of the above for node types whose constructor takes an opcode,
/// order, DebugLoc, value-type list, memory type and memory operand: builds a
/// temporary SDNodeTy with an empty DebugLoc and returns its raw subclass
/// data.
template <typename SDNodeTy>
static uint16_t getSyntheticNodeSubclassData(unsigned Opc, unsigned Order,
SDVTList VTs, EVT MemoryVT,
MachineMemOperand *MMO) {
return SDNodeTy(Opc, Order, DebugLoc(), VTs, MemoryVT, MMO)
.getRawSubclassData();
}
void createOperands(SDNode *Node, ArrayRef<SDValue> Vals);
/// Return \p Node's operand array to OperandRecycler and leave the node with
/// no operands. Does nothing when the node has no operand array.
void removeOperands(SDNode *Node) {
  if (SDUse *Ops = Node->OperandList) {
    // The recycler needs the capacity class the array was sized for.
    OperandRecycler.deallocate(
        ArrayRecycler<SDUse>::Capacity::get(Node->NumOperands), Ops);
    Node->NumOperands = 0;
    Node->OperandList = nullptr;
  }
}
void CreateTopologicalOrder(std::vector<SDNode*>& Order);
public:
explicit SelectionDAG(const TargetMachine &TM, CodeGenOpt::Level);
SelectionDAG(const SelectionDAG &) = delete;
SelectionDAG &operator=(const SelectionDAG &) = delete;
~SelectionDAG();
/// Prepare this SelectionDAG to process code in the given MachineFunction.
void init(MachineFunction &NewMF, OptimizationRemarkEmitter &NewORE,
Pass *PassPtr, const TargetLibraryInfo *LibraryInfo,
LegacyDivergenceAnalysis * Divergence);
/// Store \p FuncInfo in FLI. The pointer is only recorded here, not
/// dereferenced.
void setFunctionLoweringInfo(FunctionLoweringInfo * FuncInfo) {
FLI = FuncInfo;
}
/// Clear state and free memory necessary to make this
/// SelectionDAG ready to process a new block.
void clear();
// Accessors for the objects this DAG is configured with. The ones returning
// a reference dereference the stored pointer without a null check, and
// getDataLayout/getSubtarget go through MF, so they must not be called before
// those pointers are set (TSI, TLI and LibInfo start out as nullptr;
// presumably init() fills them in -- TODO confirm). getPass,
// getDivergenceAnalysis and getContext return the stored pointer as is.
MachineFunction &getMachineFunction() const { return *MF; }
const Pass *getPass() const { return SDAGISelPass; }
const DataLayout &getDataLayout() const { return MF->getDataLayout(); }
const TargetMachine &getTarget() const { return TM; }
const TargetSubtargetInfo &getSubtarget() const { return MF->getSubtarget(); }
const TargetLowering &getTargetLoweringInfo() const { return *TLI; }
const TargetLibraryInfo &getLibInfo() const { return *LibInfo; }
const SelectionDAGTargetInfo &getSelectionDAGInfo() const { return *TSI; }
const LegacyDivergenceAnalysis *getDivergenceAnalysis() const { return DA; }
LLVMContext *getContext() const {return Context; }
OptimizationRemarkEmitter &getORE() const { return *ORE; }
/// Pop up a GraphViz/gv window with the DAG rendered using 'dot'.
void viewGraph(const std::string &Title);
void viewGraph();
#ifndef NDEBUG
std::map<const SDNode *, std::string> NodeGraphAttrs;
#endif
/// Clear all previously defined node graph attributes.
/// Intended to be used from a debugging tool (eg. gdb).
void clearGraphAttrs();
/// Set graph attributes for a node. (eg. "color=red".)
void setGraphAttrs(const SDNode *N, const char *Attrs);
/// Get graph attributes for a node. (eg. "color=red".)
/// Used from getNodeAttributes.
const std::string getGraphAttrs(const SDNode *N) const;
/// Convenience for setting node color attribute.
void setGraphColor(const SDNode *N, const char *Color);
/// Convenience for setting subgraph color attribute.
void setSubgraphColor(SDNode *N, const char *Color);
// Iteration over AllNodes, the list of nodes in the current DAG. Const and
// non-const begin/end pairs are provided, plus allnodes() wrappers that
// package each pair as a range.
using allnodes_const_iterator = ilist<SDNode>::const_iterator;
allnodes_const_iterator allnodes_begin() const { return AllNodes.begin(); }
allnodes_const_iterator allnodes_end() const { return AllNodes.end(); }
using allnodes_iterator = ilist<SDNode>::iterator;
allnodes_iterator allnodes_begin() { return AllNodes.begin(); }
allnodes_iterator allnodes_end() { return AllNodes.end(); }
/// Number of nodes in AllNodes.
ilist<SDNode>::size_type allnodes_size() const {
return AllNodes.size();
}
iterator_range<allnodes_iterator> allnodes() {
return make_range(allnodes_begin(), allnodes_end());
}
iterator_range<allnodes_const_iterator> allnodes() const {
return make_range(allnodes_begin(), allnodes_end());
}
/// Return the root tag of the SelectionDAG.
const SDValue &getRoot() const { return Root; }
/// Return the token chain corresponding to the entry of the function.
SDValue getEntryNode() const {
return SDValue(const_cast<SDNode *>(&EntryNode), 0);
}
/// Set the current root tag of the SelectionDAG.
///
/// \p N must either have no node or be a chain value (type MVT::Other); this
/// is asserted. Returns a reference to the stored root.
const SDValue &setRoot(SDValue N) {
assert((!N.getNode() || N.getValueType() == MVT::Other) &&
"DAG root value is not a chain!");
// For a non-null root, run the per-node cycle check before installing it...
if (N.getNode())
checkForCycles(N.getNode(), this);
Root = N;
// ...and the whole-DAG check once it is installed.
if (N.getNode())
checkForCycles(this);
return Root;
}
#ifndef NDEBUG
/// Declared only when NDEBUG is not defined; the definition is out of line.
/// NOTE(review): the name is spelled "Diverence" (sic). Presumably this
/// verifies the divergence information of the DAG's nodes -- confirm against
/// the definition before relying on it.
void VerifyDAGDiverence();
#endif
/// This iterates over the nodes in the SelectionDAG, folding
/// certain types of nodes together, or eliminating superfluous nodes. The
/// Level argument controls whether Combine is allowed to produce nodes and
/// types that are illegal on the target.
void Combine(CombineLevel Level, AliasAnalysis *AA,
CodeGenOpt::Level OptLevel);
/// This transforms the SelectionDAG into a SelectionDAG that
/// only uses types natively supported by the target.
/// Returns "true" if it made any changes.
///
/// Note that this is an involved process that may invalidate pointers into
/// the graph.
bool LegalizeTypes();
/// This transforms the SelectionDAG into a SelectionDAG that is
/// compatible with the target instruction selector, as indicated by the
/// TargetLowering object.
///
/// Note that this is an involved process that may invalidate pointers into
/// the graph.
void Legalize();
/// Transforms a SelectionDAG node and any operands to it into a node
/// that is compatible with the target instruction selector, as indicated by
/// the TargetLowering object.
///
/// \returns true if \c N is a valid, legal node after calling this.
///
/// This essentially runs a single recursive walk of the \c Legalize process
/// over the given node (and its operands). This can be used to incrementally
/// legalize the DAG. All of the nodes which are directly replaced,
/// potentially including N, are added to the output parameter \c
/// UpdatedNodes so that the delta to the DAG can be understood by the
/// caller.
///
/// When this returns false, N has been legalized in a way that makes the
/// pointer passed in no longer valid. It may have even been deleted from the
/// DAG, and so it shouldn't be used further. When this returns true, the
/// N passed in is a legal node, and can be immediately processed as such.
/// This may still have done some work on the DAG, and will still populate
/// UpdatedNodes with any new nodes replacing those originally in the DAG.
bool LegalizeOp(SDNode *N, SmallSetVector<SDNode *, 16> &UpdatedNodes);
/// This transforms the SelectionDAG into a SelectionDAG
/// that only uses vector math operations supported by the target. This is
/// necessary as a separate step from Legalize because unrolling a vector
/// operation can introduce illegal types, which requires running
/// LegalizeTypes again.
///
/// This returns true if it made any changes; in that case, LegalizeTypes
/// is called again before Legalize.
///
/// Note that this is an involved process that may invalidate pointers into
/// the graph.
bool LegalizeVectors();
/// This method deletes all unreachable nodes in the SelectionDAG.
void RemoveDeadNodes();
/// Remove the specified node from the system. This node must
/// have no referrers.
void DeleteNode(SDNode *N);
/// Return an SDVTList that represents the list of values specified.
SDVTList getVTList(EVT VT);
SDVTList getVTList(EVT VT1, EVT VT2);
SDVTList getVTList(EVT VT1, EVT VT2, EVT VT3);
SDVTList getVTList(EVT VT1, EVT VT2, EVT VT3, EVT VT4);
SDVTList getVTList(ArrayRef<EVT> VTs);
//===--------------------------------------------------------------------===//
// Node creation methods.
/// Create a ConstantSDNode wrapping a constant value.
/// If VT is a vector type, the constant is splatted into a BUILD_VECTOR.
///
/// If only legal types can be produced, this does the necessary
/// transformations (e.g., if the vector element type is illegal).
/// @{
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT,
bool isTarget = false, bool isOpaque = false);
SDValue getConstant(const APInt &Val, const SDLoc &DL, EVT VT,
bool isTarget = false, bool isOpaque = false);
/// Create a constant of type \p VT with every bit set. The all-ones APInt is
/// sized by VT.getScalarSizeInBits() and handed to the APInt overload of
/// getConstant() together with \p IsTarget and \p IsOpaque.
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget = false,
                           bool IsOpaque = false) {
  const APInt AllOnes = APInt::getAllOnesValue(VT.getScalarSizeInBits());
  return getConstant(AllOnes, DL, VT, IsTarget, IsOpaque);
}
SDValue getConstant(const ConstantInt &Val, const SDLoc &DL, EVT VT,
bool isTarget = false, bool isOpaque = false);
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL,
bool isTarget = false);
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL,
bool LegalTypes = true);
/// Target flavour of getConstant() for a uint64_t value: forwards every
/// argument and passes true for getConstant()'s isTarget parameter.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT,
                          bool isOpaque = false) {
  const bool IsTarget = true;
  return getConstant(Val, DL, VT, IsTarget, isOpaque);
}
/// Target flavour of getConstant() for an APInt value: forwards every
/// argument and passes true for getConstant()'s isTarget parameter.
SDValue getTargetConstant(const APInt &Val, const SDLoc &DL, EVT VT,
                          bool isOpaque = false) {
  const bool IsTarget = true;
  return getConstant(Val, DL, VT, IsTarget, isOpaque);
}
/// Target flavour of getConstant() for a ConstantInt value: forwards every
/// argument and passes true for getConstant()'s isTarget parameter.
SDValue getTargetConstant(const ConstantInt &Val, const SDLoc &DL, EVT VT,
                          bool isOpaque = false) {
  const bool IsTarget = true;
  return getConstant(Val, DL, VT, IsTarget, isOpaque);
}
/// Create a true or false constant of type \p VT using the target's
/// BooleanContent for type \p OpVT.
SDValue getBoolConstant(bool V, const SDLoc &DL, EVT VT, EVT OpVT);
/// @}
/// Create a ConstantFPSDNode wrapping a constant value.
/// If VT is a vector type, the constant is splatted into a BUILD_VECTOR.
///
/// If only legal types can be produced, this does the necessary
/// transformations (e.g., if the vector element type is illegal).
/// The forms that take a double should only be used for simple constants
/// that can be exactly represented in VT. No checks are made.
/// @{
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT,
bool isTarget = false);
SDValue getConstantFP(const APFloat &Val, const SDLoc &DL, EVT VT,
bool isTarget = false);
SDValue getConstantFP(const ConstantFP &V, const SDLoc &DL, EVT VT,
bool isTarget = false);
/// Target flavour of getConstantFP() for a double value: forwards every
/// argument and passes true for getConstantFP()'s isTarget parameter.
SDValue getTargetConstantFP(double Val, const SDLoc &DL, EVT VT) {
  const bool IsTarget = true;
  return getConstantFP(Val, DL, VT, IsTarget);
}
/// Target flavour of getConstantFP() for an APFloat value: forwards every
/// argument and passes true for getConstantFP()'s isTarget parameter.
SDValue getTargetConstantFP(const APFloat &Val, const SDLoc &DL, EVT VT) {
  const bool IsTarget = true;
  return getConstantFP(Val, DL, VT, IsTarget);
}
/// Target flavour of getConstantFP() for a ConstantFP value: forwards every
/// argument and passes true for getConstantFP()'s isTarget parameter.
SDValue getTargetConstantFP(const ConstantFP &Val, const SDLoc &DL, EVT VT) {
  const bool IsTarget = true;
  return getConstantFP(Val, DL, VT, IsTarget);
}
/// @}
/// Declaration only; the definition is out of line. getTargetGlobalAddress()
/// forwards here with \p isTargetGA set to true.
/// NOTE(review): presumably returns a node referring to \p GV plus
/// \p offset; the meaning of \p TargetFlags is not visible here -- confirm
/// against the definition.
SDValue getGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT,
int64_t offset = 0, bool isTargetGA = false,
unsigned char TargetFlags = 0);
/// Target flavour of getGlobalAddress(): forwards every argument and passes
/// true for getGlobalAddress()'s isTargetGA parameter.
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT,
                               int64_t offset = 0,
                               unsigned char TargetFlags = 0) {
  const bool IsTargetGA = true;
  return getGlobalAddress(GV, DL, VT, offset, IsTargetGA, TargetFlags);
}
/// Declaration only; the definition is out of line. getTargetFrameIndex()
/// forwards here with \p isTarget set to true.
/// NOTE(review): presumably returns a node for frame index \p FI of type
/// \p VT -- confirm against the definition.
SDValue getFrameIndex(int FI, EVT VT, bool isTarget = false);
/// Target flavour of getFrameIndex(): forwards \p FI and \p VT and passes
/// true for getFrameIndex()'s isTarget parameter.
SDValue getTargetFrameIndex(int FI, EVT VT) {
  const bool IsTarget = true;
  return getFrameIndex(FI, VT, IsTarget);
}
/// Declaration only; the definition is out of line. getTargetJumpTable()
/// forwards here with \p isTarget set to true.
/// NOTE(review): presumably returns a node for jump table index \p JTI; the
/// meaning of \p TargetFlags is not visible here -- confirm against the
/// definition.
SDValue getJumpTable(int JTI, EVT VT, bool isTarget = false,
unsigned char TargetFlags = 0);
/// Target flavour of getJumpTable(): forwards every argument and passes true
/// for getJumpTable()'s isTarget parameter.
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned char TargetFlags = 0) {
  const bool IsTarget = true;
  return getJumpTable(JTI, VT, IsTarget, TargetFlags);
}
/// Declaration only; the definition is out of line. The Constant overload of
/// getTargetConstantPool() forwards here with \p isT set to true.
/// NOTE(review): presumably returns a constant-pool node for \p C; the units
/// of \p Align and \p Offs and the meaning of the default 0 are not visible
/// here -- confirm against the definition.
SDValue getConstantPool(const Constant *C, EVT VT,
unsigned Align = 0, int Offs = 0, bool isT=false,
unsigned char TargetFlags = 0);
/// Target flavour of getConstantPool() for a Constant: forwards every
/// argument and passes true for getConstantPool()'s isT parameter.
SDValue getTargetConstantPool(const Constant *C, EVT VT,
                              unsigned Align = 0, int Offset = 0,
                              unsigned char TargetFlags = 0) {
  const bool IsTarget = true;
  return getConstantPool(C, VT, Align, Offset, IsTarget, TargetFlags);
}
/// Declaration only; the definition is out of line. The
/// MachineConstantPoolValue overload of getTargetConstantPool() forwards here
/// with \p isT set to true.
/// NOTE(review): presumably returns a constant-pool node for \p C; the units
/// of \p Align and \p Offs and the meaning of the default 0 are not visible
/// here -- confirm against the definition.
SDValue getConstantPool(MachineConstantPoolValue *C, EVT VT,
unsigned Align = 0, int Offs = 0, bool isT=false,
unsigned char TargetFlags = 0);
/// Target flavour of getConstantPool() for a MachineConstantPoolValue:
/// forwards every argument and passes true for getConstantPool()'s isT
/// parameter.
SDValue getTargetConstantPool(MachineConstantPoolValue *C,
                              EVT VT, unsigned Align = 0,
                              int Offset = 0, unsigned char TargetFlags=0) {
  const bool IsTarget = true;
  return getConstantPool(C, VT, Align, Offset, IsTarget, TargetFlags);
}
SDValue getTargetIndex(int Index, EVT VT, int64_t Offset = 0,
unsigned char TargetFlags = 0);
// When generating a branch to a BB, we don't in general know enough
// to provide debug info for the BB at that time, so keep this one around.
SDValue getBasicBlock(MachineBasicBlock *MBB);
SDValue getBasicBlock(MachineBasicBlock *MBB, SDLoc dl);
SDValue getExternalSymbol(const char *Sym, EVT VT);
SDValue getExternalSymbol(const char *Sym, const SDLoc &dl, EVT VT);
SDValue getTargetExternalSymbol(const char *Sym, EVT VT,
unsigned char TargetFlags = 0);
SDValue getMCSymbol(MCSymbol *Sym, EVT VT);
SDValue getValueType(EVT);
SDValue getRegister(unsigned Reg, EVT VT);
SDValue getRegisterMask(const uint32_t *RegMask);
SDValue getEHLabel(const SDLoc &dl, SDValue Root, MCSymbol *Label);
SDValue getLabelNode(unsigned Opcode, const SDLoc &dl, SDValue Root,
MCSymbol *Label);
/// Declaration only; the definition is out of line. getTargetBlockAddress()
/// forwards here with \p isTarget set to true.
/// NOTE(review): presumably returns a node for block address \p BA plus
/// \p Offset; the meaning of \p TargetFlags is not visible here -- confirm
/// against the definition.
SDValue getBlockAddress(const BlockAddress *BA, EVT VT,
int64_t Offset = 0, bool isTarget = false,
unsigned char TargetFlags = 0);
/// Target flavour of getBlockAddress(): forwards every argument and passes
/// true for getBlockAddress()'s isTarget parameter.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT,
                              int64_t Offset = 0,
                              unsigned char TargetFlags = 0) {
  const bool IsTarget = true;
  return getBlockAddress(BA, VT, Offset, IsTarget, TargetFlags);
}
/// Build an ISD::CopyToReg node with a single MVT::Other result. Its operands
/// are \p Chain, a register node for \p Reg that has the same value type as
/// \p N, and \p N itself.
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg,
                     SDValue N) {
  SDValue RegNode = getRegister(Reg, N.getValueType());
  return getNode(ISD::CopyToReg, dl, MVT::Other, Chain, RegNode, N);
}
// Glue-aware variant of getCopyToReg. The node always has two results
// (MVT::Other and MVT::Glue). Its operands are Chain, a register node for Reg
// typed like N, and N; Glue is appended as a fourth operand only when it is
// non-null.
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N,
                     SDValue Glue) {
  SDVTList ResultTys = getVTList(MVT::Other, MVT::Glue);
  SDValue RegNode = getRegister(Reg, N.getValueType());
  SDValue AllOps[] = {Chain, RegNode, N, Glue};
  // Leave the trailing glue operand off when no glue value came in.
  const unsigned NumOps = Glue.getNode() ? 4 : 3;
  return getNode(ISD::CopyToReg, dl, ResultTys, makeArrayRef(AllOps, NumOps));
}
// Glue-aware getCopyToReg variant that takes the register operand as an
// SDValue (Reg) instead of a register number. The node always has two results
// (MVT::Other and MVT::Glue); Glue is appended as a fourth operand only when
// it is non-null.
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, SDValue Reg, SDValue N,
                     SDValue Glue) {
  SDVTList ResultTys = getVTList(MVT::Other, MVT::Glue);
  SDValue AllOps[] = {Chain, Reg, N, Glue};
  // Leave the trailing glue operand off when no glue value came in.
  const unsigned NumOps = Glue.getNode() ? 4 : 3;
  return getNode(ISD::CopyToReg, dl, ResultTys, makeArrayRef(AllOps, NumOps));
}
/// Build an ISD::CopyFromReg node with two results, a value of type \p VT and
/// an MVT::Other chain. Its operands are \p Chain and a register node for
/// \p Reg of type \p VT.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT) {
  SDVTList ResultTys = getVTList(VT, MVT::Other);
  SDValue RegNode = getRegister(Reg, VT);
  SDValue Ops[] = {Chain, RegNode};
  return getNode(ISD::CopyFromReg, dl, ResultTys, Ops);
}
// Glue-aware variant of getCopyFromReg. The node always has three results
// (a value of type VT, MVT::Other and MVT::Glue). Its operands are Chain and a
// register node for Reg of type VT; Glue is appended as a third operand only
// when it is non-null.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT,
                       SDValue Glue) {
  SDVTList ResultTys = getVTList(VT, MVT::Other, MVT::Glue);
  SDValue RegNode = getRegister(Reg, VT);
  SDValue AllOps[] = {Chain, RegNode, Glue};
  // Leave the trailing glue operand off when no glue value came in.
  const unsigned NumOps = Glue.getNode() ? 3 : 2;
  return getNode(ISD::CopyFromReg, dl, ResultTys,
                 makeArrayRef(AllOps, NumOps));
}
SDValue getCondCode(ISD::CondCode Cond);
/// Return an ISD::VECTOR_SHUFFLE node. The number of elements in VT,
/// which must be a vector type, must match the number of mask elements
/// NumElts. An integer mask element equal to -1 is treated as undefined.
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2,
ArrayRef<int> Mask);
/// Build an ISD::BUILD_VECTOR node of type \p VT from the SDValue operands in
/// \p Ops. \p VT must be a vector type with as many elements as \p Ops has
/// operands, and each operand must have VT's element type or, for integers, a
/// wider type. None of this is checked here.
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef<SDValue> Ops) {
  // Validation is deferred: VerifySDNode (via InsertNode) checks BUILD_VECTOR
  // later.
  SDValue Vec = getNode(ISD::BUILD_VECTOR, DL, VT, Ops);
  return Vec;
}
/// Build an ISD::BUILD_VECTOR node of type \p VT from the SDUse operands in
/// \p Ops. \p VT must be a vector type with as many elements as \p Ops has
/// operands, and each operand must have VT's element type or, for integers, a
/// wider type. None of this is checked here.
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef<SDUse> Ops) {
  // Validation is deferred: VerifySDNode (via InsertNode) checks BUILD_VECTOR
  // later.
  SDValue Vec = getNode(ISD::BUILD_VECTOR, DL, VT, Ops);
  return Vec;
}
/// Return an ISD::BUILD_VECTOR of type \p VT whose elements are all \p Op.
/// \p VT must be a vector type, and Op's type must equal VT's element type
/// or, for integers, be wider than it. If \p Op is itself UNDEF, an UNDEF node
/// of type \p VT (with a default-constructed SDLoc) is returned instead of a
/// BUILD_VECTOR.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op) {
  if (Op.getOpcode() != ISD::UNDEF) {
    // VerifySDNode (via InsertNode) checks BUILD_VECTOR later.
    SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Op);
    return getNode(ISD::BUILD_VECTOR, DL, VT, Ops);
  }

  // No BUILD_VECTOR is created on this path, so the element type is asserted
  // here.
  assert((VT.getVectorElementType() == Op.getValueType() ||
          (VT.isInteger() &&
           VT.getVectorElementType().bitsLE(Op.getValueType()))) &&
         "A splatted value must have a width equal or (for integers) "
         "greater than the vector element type!");
  return getNode(ISD::UNDEF, SDLoc(), VT);
}
/// Returns an ISD::VECTOR_SHUFFLE node semantically equivalent to
/// the shuffle node in input but with swapped operands.
///
/// Example: shuffle A, B, <0,5,2,7> -> shuffle B, A, <4,1,6,3>
SDValue getCommutedVectorShuffle(const ShuffleVectorSDNode &SV);
/// Convert Op, which must be of float type, to the
/// float type VT, by either extending or rounding (by truncation).
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT);
/// Convert Op, which must be of integer type, to the
/// integer type VT, by either any-extending or truncating it.
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT);
/// Convert Op, which must be of integer type, to the
/// integer type VT, by either sign-extending or truncating it.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT);
/// Convert Op, which must be of integer type, to the
/// integer type VT, by either zero-extending or truncating it.
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT);
/// Return the expression required to zero extend the Op
/// value assuming it was the smaller SrcTy value.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT);
/// Convert Op, which must be of integer type, to the integer type VT, by
/// either truncating it or performing either zero or sign extension, as
/// appropriate for the pointer's semantics.
SDValue getPtrExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT);
/// Return the expression required to extend the Op as a pointer value
/// assuming it was the smaller SrcTy value. This may be either a zero extend
/// or a sign extend.
SDValue getPtrExtendInReg(SDValue Op, const SDLoc &DL, EVT VT);
/// Convert Op, which must be of integer type, to the integer type VT,
/// by using an extension appropriate for the target's
/// BooleanContent for type OpVT or truncating it.
SDValue getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT, EVT OpVT);
/// Create a bitwise NOT operation as (XOR Val, -1).
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT);
/// Create a logical NOT operation as (XOR Val, BooleanOne).
SDValue getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT);
/// Create an add with appropriate flags for addressing an offset within an
/// object, e.g. when a load is split into multiple components and each part
/// is addressed as base pointer plus offset (an add nuw).
///
/// This overload turns the integer \p Offset into a constant of Op's value
/// type and defers to the SDValue overload.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Op, int64_t Offset) {
  const EVT PtrVT = Op.getValueType();
  SDValue OffsetNode = getConstant(Offset, SL, PtrVT);
  return getObjectPtrOffset(SL, Op, OffsetNode);
}
/// Create an ISD::ADD of \p Op and \p Offset, typed like \p Op, with the
/// no-unsigned-wrap flag set. Used for addressing an offset within an object.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Op, SDValue Offset) {
  // The object itself can't wrap around the address space, so it shouldn't be
  // possible for the adds of the offsets to the split parts to overflow.
  SDNodeFlags NoWrapFlags;
  NoWrapFlags.setNoUnsignedWrap(true);

  const EVT PtrVT = Op.getValueType();
  return getNode(ISD::ADD, SL, PtrVT, Op, Offset, NoWrapFlags);
}
/// Return a new CALLSEQ_START node, which starts a new call frame. \p InSize
/// is the number of bytes set up inside the CALLSEQ_START..CALLSEQ_END
/// sequence, and \p OutSize specifies the part of the frame set up prior to
/// the sequence.
///
/// The node has an MVT::Other and an MVT::Glue result; its operands are
/// \p Chain followed by the two sizes as target pointer-sized constants.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize,
                         const SDLoc &DL) {
  SDVTList ResultTys = getVTList(MVT::Other, MVT::Glue);
  SDValue InSizeOp = getIntPtrConstant(InSize, DL, true);
  SDValue OutSizeOp = getIntPtrConstant(OutSize, DL, true);
  SDValue Ops[] = {Chain, InSizeOp, OutSizeOp};
  return getNode(ISD::CALLSEQ_START, DL, ResultTys, Ops);
}
/// Return a new CALLSEQ_END node with an MVT::Other and an MVT::Glue result.
/// The glue result is always present, to ensure the node is not CSE'd. The
/// operands are \p Chain, \p Op1, \p Op2 and, only when it is non-null,
/// \p InGlue.
/// CALLSEQ_END does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2,
                       SDValue InGlue, const SDLoc &DL) {
  SmallVector<SDValue, 4> Operands;
  Operands.push_back(Chain);
  Operands.push_back(Op1);
  Operands.push_back(Op2);
  // The incoming glue is optional and goes last.
  if (InGlue.getNode())
    Operands.push_back(InGlue);

  SDVTList ResultTys = getVTList(MVT::Other, MVT::Glue);
  return getNode(ISD::CALLSEQ_END, DL, ResultTys, Operands);
}
/// Return true if the result of this operation is always undefined.
bool isUndef(unsigned Opcode, ArrayRef<SDValue> Ops);
/// Return an UNDEF node of type \p VT. UNDEF does not have a useful SDLoc, so
/// a default-constructed one is used.
SDValue getUNDEF(EVT VT) {
  SDValue Undef = getNode(ISD::UNDEF, SDLoc(), VT);
  return Undef;
}
/// Return a GLOBAL_OFFSET_TABLE node of type \p VT. This node does not have a
/// useful SDLoc, so a default-constructed one is used.
SDValue getGLOBAL_OFFSET_TABLE(EVT VT) {
  SDValue GOT = getNode(ISD::GLOBAL_OFFSET_TABLE, SDLoc(), VT);
  return GOT;
}
/// Gets or creates the specified node.
///
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
ArrayRef<SDUse> Ops);
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
ArrayRef<SDValue> Ops, const SDNodeFlags Flags = SDNodeFlags());
SDValue getNode(unsigned Opcode, const SDLoc &DL, ArrayRef<EVT> ResultTys,
ArrayRef<SDValue> Ops);
SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
ArrayRef<SDValue> Ops);
// Specialize based on number of operands.
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT);
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue Operand,
const SDNodeFlags Flags = SDNodeFlags());
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
SDValue N2, const SDNodeFlags Flags = SDNodeFlags());
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
SDValue N2, SDValue N3,
const SDNodeFlags Flags = SDNodeFlags());
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
SDValue N2, SDValue N3, SDValue N4);
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
SDValue N2, SDValue N3, SDValue N4, SDValue N5);
// Specialize again based on number of operands for nodes with a VTList
// rather than a single VT.
SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList);
SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, SDValue N);
SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, SDValue N1,
SDValue N2);
SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, SDValue N1,
SDValue N2, SDValue N3);
SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, SDValue N1,
SDValue N2, SDValue N3, SDValue N4);
SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, SDValue N1,
SDValue N2, SDValue N3, SDValue N4, SDValue N5);
/// Compute a TokenFactor to force all the incoming stack arguments to be
/// loaded from the stack. This is used in tail call lowering to protect
/// stack arguments from being clobbered.
SDValue getStackArgumentTokenFactor(SDValue Chain);
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src,
SDValue Size, unsigned Align, bool isVol, bool AlwaysInline,
bool isTailCall, MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo);
SDValue getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src,
SDValue Size, unsigned Align, bool isVol, bool isTailCall,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo);
SDValue getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src,
SDValue Size, unsigned Align, bool isVol, bool isTailCall,
MachinePointerInfo DstPtrInfo);
SDValue getAtomicMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst,
unsigned DstAlign, SDValue Src, unsigned SrcAlign,
SDValue Size, Type *SizeTy, unsigned ElemSz,
bool isTailCall, MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo);
SDValue getAtomicMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst,
unsigned DstAlign, SDValue Src, unsigned SrcAlign,
SDValue Size, Type *SizeTy, unsigned ElemSz,
bool isTailCall, MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo);
SDValue getAtomicMemset(SDValue Chain, const SDLoc &dl, SDValue Dst,
unsigned DstAlign, SDValue Value, SDValue Size,
Type *SizeTy, unsigned ElemSz, bool isTailCall,
MachinePointerInfo DstPtrInfo);
/// Convenience wrapper for building an ISD::SETCC when the condition is
/// available as an ISD::CondCode rather than an SDValue; the condition-code
/// operand is created with getCondCode().
///
/// Asserted preconditions: \p LHS, \p RHS and \p VT are either all vectors or
/// all scalars, and \p Cond is not ISD::SETCC_INVALID.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS,
                 ISD::CondCode Cond) {
  assert(LHS.getValueType().isVector() == RHS.getValueType().isVector() &&
         "Cannot compare scalars to vectors");
  assert(LHS.getValueType().isVector() == VT.isVector() &&
         "Cannot compare scalars to vectors");
  assert(Cond != ISD::SETCC_INVALID &&
         "Cannot create a setCC of an invalid node.");

  SDValue CondCodeOp = getCondCode(Cond);
  return getNode(ISD::SETCC, DL, VT, LHS, RHS, CondCodeOp);
}
/// Convenience wrapper for building a select without checking for vectors at
/// the call site: the node is an ISD::VSELECT when \p Cond has a vector type
/// and an ISD::SELECT otherwise.
///
/// Asserted preconditions: \p LHS and \p RHS have the same type, and \p VT is
/// a vector exactly when that type is.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS,
                  SDValue RHS) {
  assert(LHS.getValueType() == RHS.getValueType() &&
         "Cannot use select on differing types");
  assert(VT.isVector() == LHS.getValueType().isVector() &&
         "Cannot mix vectors and scalars");

  unsigned Opcode = ISD::SELECT;
  if (Cond.getValueType().isVector())
    Opcode = ISD::VSELECT;
  return getNode(Opcode, DL, VT, Cond, LHS, RHS);
}
/// Convenience wrapper for building an ISD::SELECT_CC when the condition is
/// available as an ISD::CondCode rather than an SDValue. The result type is
/// the type of \p True; the operands are \p LHS, \p RHS, \p True, \p False
/// and the condition-code node created with getCondCode().
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True,
                    SDValue False, ISD::CondCode Cond) {
  const EVT ResultVT = True.getValueType();
  SDValue CondCodeOp = getCondCode(Cond);
  return getNode(ISD::SELECT_CC, DL, ResultVT, LHS, RHS, True, False,
                 CondCodeOp);
}
/// Try to simplify a select/vselect into 1 of its operands or a constant.
SDValue simplifySelect(SDValue Cond, SDValue TVal, SDValue FVal);
/// Try to simplify a shift into 1 of its operands or a constant.
SDValue simplifyShift(SDValue X, SDValue Y);
/// Try to simplify a floating-point binary operation into 1 of its operands
/// or a constant.
SDValue simplifyFPBinop(unsigned Opcode, SDValue X, SDValue Y);
/// VAArg produces a result and token chain, and takes a pointer
/// and a source value as input.
SDValue getVAArg(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr,
SDValue SV, unsigned Align);
/// Gets a node for an atomic cmpxchg op. There are two
/// valid Opcodes. ISD::ATOMIC_CMP_SWAP produces the value loaded and a
/// chain result. ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS produces the value loaded,
/// a success flag (initially i1), and a chain.
SDValue getAtomicCmpSwap(unsigned Opcode, const SDLoc &dl, EVT MemVT,
SDVTList VTs, SDValue Chain, SDValue Ptr,
SDValue Cmp, SDValue Swp, MachineMemOperand *MMO);
/// Gets a node for an atomic op, produces result (if relevant)
/// and chain and takes 2 operands.
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain,
SDValue Ptr, SDValue Val, MachineMemOperand *MMO);
/// Gets a node for an atomic op, produces result and chain and
/// takes 1 operand.
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, EVT VT,
SDValue Chain, SDValue Ptr, MachineMemOperand *MMO);
/// Gets a node for an atomic op, produces result and chain and takes N
/// operands.
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
SDVTList VTList, ArrayRef<SDValue> Ops,
MachineMemOperand *MMO);
/// Creates a MemIntrinsicNode that may produce a
/// result and takes a list of operands. Opcode may be INTRINSIC_VOID,
/// INTRINSIC_W_CHAIN, or a target-specific opcode with a value not
/// less than FIRST_TARGET_MEMORY_OPCODE.
SDValue getMemIntrinsicNode(
unsigned Opcode, const SDLoc &dl, SDVTList VTList,
ArrayRef<SDValue> Ops, EVT MemVT,
MachinePointerInfo PtrInfo,
unsigned Align = 0,
MachineMemOperand::Flags Flags
= MachineMemOperand::MOLoad | MachineMemOperand::MOStore,
unsigned Size = 0,
const AAMDNodes &AAInfo = AAMDNodes());
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList,
ArrayRef<SDValue> Ops, EVT MemVT,
MachineMemOperand *MMO);
/// Creates a LifetimeSDNode that starts (`IsStart==true`) or ends
/// (`IsStart==false`) the lifetime of the portion of `FrameIndex` between
/// offsets `Offset` and `Offset + Size`.
SDValue getLifetimeNode(bool IsStart, const SDLoc &dl, SDValue Chain,
int FrameIndex, int64_t Size, int64_t Offset = -1);
/// Create a MERGE_VALUES node from the given operands.
SDValue getMergeValues(ArrayRef<SDValue> Ops, const SDLoc &dl);
/// Loads are not normal binary operators: their result type is not
/// determined by their operands, and they produce a value AND a token chain.
///
/// This function will set the MOLoad flag on MMOFlags, but you can set it if
/// you want. The MOStore flag must not be set.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr,
MachinePointerInfo PtrInfo, unsigned Alignment = 0,
MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone,
const AAMDNodes &AAInfo = AAMDNodes(),
const MDNode *Ranges = nullptr);
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr,
MachineMemOperand *MMO);
SDValue
getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain,
SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT,
unsigned Alignment = 0,
MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone,
const AAMDNodes &AAInfo = AAMDNodes());
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT,
SDValue Chain, SDValue Ptr, EVT MemVT,
MachineMemOperand *MMO);
SDValue getIndexedLoad(SDValue OrigLoad, const SDLoc &dl, SDValue Base,
SDValue Offset, ISD::MemIndexedMode AM);
SDValue getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT,
const SDLoc &dl, SDValue Chain, SDValue Ptr, SDValue Offset,
MachinePointerInfo PtrInfo, EVT MemVT, unsigned Alignment = 0,
MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone,
const AAMDNodes &AAInfo = AAMDNodes(),
const MDNode *Ranges = nullptr);
SDValue getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT,
const SDLoc &dl, SDValue Chain, SDValue Ptr, SDValue Offset,
EVT MemVT, MachineMemOperand *MMO);
/// Helper function to build ISD::STORE nodes.
///
/// This function will set the MOStore flag on MMOFlags, but you can set it if
/// you want. The MOLoad and MOInvariant flags must not be set.
SDValue
getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr,
MachinePointerInfo PtrInfo, unsigned Alignment = 0,
MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone,
const AAMDNodes &AAInfo = AAMDNodes());
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr,
MachineMemOperand *MMO);
SDValue
getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr,
MachinePointerInfo PtrInfo, EVT SVT, unsigned Alignment = 0,
MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone,
const AAMDNodes &AAInfo = AAMDNodes());
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val,
SDValue Ptr, EVT SVT, MachineMemOperand *MMO);
SDValue getIndexedStore(SDValue OrigStore, const SDLoc &dl, SDValue Base,
SDValue Offset, ISD::MemIndexedMode AM);
/// Returns sum of the base pointer and offset.
SDValue getMemBasePlusOffset(SDValue Base, unsigned Offset, const SDLoc &DL);
SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr,
SDValue Mask, SDValue Src0, EVT MemVT,
MachineMemOperand *MMO, ISD::LoadExtType,
bool IsExpanding = false);
SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val,
SDValue Ptr, SDValue Mask, EVT MemVT,
MachineMemOperand *MMO, bool IsTruncating = false,
bool IsCompressing = false);
SDValue getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl,
ArrayRef<SDValue> Ops, MachineMemOperand *MMO);
SDValue getMaskedScatter(SDVTList VTs, EVT VT, const SDLoc &dl,
ArrayRef<SDValue> Ops, MachineMemOperand *MMO);
/// Return (create a new or find existing) a target-specific node.
/// TargetMemSDNode should be derived class from MemSDNode.
template <class TargetMemSDNode>
SDValue getTargetMemSDNode(SDVTList VTs, ArrayRef<SDValue> Ops,
const SDLoc &dl, EVT MemVT,
MachineMemOperand *MMO);
/// Construct a node to track a Value* through the backend.
SDValue getSrcValue(const Value *v);
/// Return an MDNodeSDNode which holds an MDNode.
SDValue getMDNode(const MDNode *MD);
/// Return a bitcast using the SDLoc of the value operand, and casting to the
/// provided type. Use getNode to set a custom SDLoc.
SDValue getBitcast(EVT VT, SDValue V);
/// Return an AddrSpaceCastSDNode.
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS,
unsigned DestAS);
/// Return the specified value casted to
/// the target's desired shift amount type.
SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op);
/// Expand the specified \c ISD::VAARG node as the Legalize pass would.
SDValue expandVAArg(SDNode *Node);
/// Expand the specified \c ISD::VACOPY node as the Legalize pass would.
SDValue expandVACopy(SDNode *Node);
/// Returns a GlobalAddress of the function from the current module with
/// name matching the given ExternalSymbol. Additionally can provide the
/// matched function.
/// Panics if the function doesn't exist.
SDValue getSymbolFunctionGlobalAddress(SDValue Op,
Function **TargetFunction = nullptr);
/// *Mutate* the specified node in-place to have the
/// specified operands. If the resultant node already exists in the DAG,
/// this does not modify the specified node, instead it returns the node that
/// already exists. If the resultant node does not exist in the DAG, the
/// input node is returned. As a degenerate case, if you specify the same
/// input operands as the node already has, the input node is returned.
SDNode *UpdateNodeOperands(SDNode *N, SDValue Op);
SDNode *UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2);
SDNode *UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2,
SDValue Op3);
SDNode *UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2,
SDValue Op3, SDValue Op4);
SDNode *UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2,
SDValue Op3, SDValue Op4, SDValue Op5);
SDNode *UpdateNodeOperands(SDNode *N, ArrayRef<SDValue> Ops);
/// Creates a new TokenFactor containing \p Vals. If \p Vals contains 64k
/// values or more, move values into new TokenFactors in 64k-1 blocks, until
/// the final TokenFactor has less than 64k operands.
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl<SDValue> &Vals);
/// *Mutate* the specified machine node's memory references to the provided
/// list.
void setNodeMemRefs(MachineSDNode *N,
ArrayRef<MachineMemOperand *> NewMemRefs);
/// Propagates the change in divergence to users.
void updateDivergence(SDNode * N);
/// These are used for target selectors to *mutate* the
/// specified node to have the specified return type, Target opcode, and
/// operands. Note that target opcodes are stored as
/// ~TargetOpcode in the node opcode field. The resultant node is returned.
SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT);
SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT, SDValue Op1);
SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT,
SDValue Op1, SDValue Op2);
SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT,
SDValue Op1, SDValue Op2, SDValue Op3);
SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT,
ArrayRef<SDValue> Ops);
SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1, EVT VT2);
SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1,
EVT VT2, ArrayRef<SDValue> Ops);
SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1,
EVT VT2, EVT VT3, ArrayRef<SDValue> Ops);
SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT1,
EVT VT2, SDValue Op1);
SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1,
EVT VT2, SDValue Op1, SDValue Op2);
SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, SDVTList VTs,
ArrayRef<SDValue> Ops);
/// This *mutates* the specified node to have the specified
/// return type, opcode, and operands.
SDNode *MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs,
ArrayRef<SDValue> Ops);
/// Mutate the specified strict FP node to its non-strict equivalent,
/// unlinking the node from its chain and dropping the metadata arguments.
/// The node must be a strict FP node.
SDNode *mutateStrictFPToFP(SDNode *Node);
/// These are used for target selectors to create a new node
/// with specified return type(s), MachineInstr opcode, and operands.
///
/// Note that getMachineNode returns the resultant node. If there is already
/// a node of the specified opcode and operands, it returns that node instead
/// of the current one.
MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT);
MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT,
SDValue Op1);
MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT,
SDValue Op1, SDValue Op2);
MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT,
SDValue Op1, SDValue Op2, SDValue Op3);
MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT,
ArrayRef<SDValue> Ops);
MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT1,
EVT VT2, SDValue Op1, SDValue Op2);
MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT1,
EVT VT2, SDValue Op1, SDValue Op2, SDValue Op3);
MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT1,
EVT VT2, ArrayRef<SDValue> Ops);
MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT1,
EVT VT2, EVT VT3, SDValue Op1, SDValue Op2);
MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT1,
EVT VT2, EVT VT3, SDValue Op1, SDValue Op2,
SDValue Op3);
MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT1,
EVT VT2, EVT VT3, ArrayRef<SDValue> Ops);
MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl,
ArrayRef<EVT> ResultTys, ArrayRef<SDValue> Ops);
MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, SDVTList VTs,
ArrayRef<SDValue> Ops);
/// A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT,
SDValue Operand);
/// A convenience function for creating TargetInstrInfo::INSERT_SUBREG nodes.
SDValue getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT,
SDValue Operand, SDValue Subreg);
/// Get the specified node if it's already available, or else return NULL.
SDNode *getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef<SDValue> Ops,
const SDNodeFlags Flags = SDNodeFlags());
/// Creates a SDDbgValue node.
SDDbgValue *getDbgValue(DIVariable *Var, DIExpression *Expr, SDNode *N,
unsigned R, bool IsIndirect, const DebugLoc &DL,
unsigned O);
/// Creates a constant SDDbgValue node.
SDDbgValue *getConstantDbgValue(DIVariable *Var, DIExpression *Expr,
const Value *C, const DebugLoc &DL,
unsigned O);
/// Creates a FrameIndex SDDbgValue node.
SDDbgValue *getFrameIndexDbgValue(DIVariable *Var, DIExpression *Expr,
unsigned FI, bool IsIndirect,
const DebugLoc &DL, unsigned O);
/// Creates a VReg SDDbgValue node.
SDDbgValue *getVRegDbgValue(DIVariable *Var, DIExpression *Expr,
unsigned VReg, bool IsIndirect,
const DebugLoc &DL, unsigned O);
/// Creates a SDDbgLabel node.
SDDbgLabel *getDbgLabel(DILabel *Label, const DebugLoc &DL, unsigned O);
/// Transfer debug values from one node to another, while optionally
/// generating fragment expressions for split-up values. If \p InvalidateDbg
/// is set, debug values are invalidated after they are transferred.
void transferDbgValues(SDValue From, SDValue To, unsigned OffsetInBits = 0,
unsigned SizeInBits = 0, bool InvalidateDbg = true);
/// Remove the specified node from the system. If any of its
/// operands then becomes dead, remove them as well. Inform UpdateListener
/// for each node deleted.
void RemoveDeadNode(SDNode *N);
/// This method deletes the unreachable nodes in the
/// given list, and any nodes that become unreachable as a result.
void RemoveDeadNodes(SmallVectorImpl<SDNode *> &DeadNodes);
/// Modify anything using 'From' to use 'To' instead.
/// This can cause recursive merging of nodes in the DAG. Use the first
/// version if 'From' is known to have a single result, use the second
/// if you have two nodes with identical results (or if 'To' has a superset
/// of the results of 'From'), use the third otherwise.
///
/// NOTE(review): this comment used to say that these methods "take an
/// optional UpdateListener", but none of the overloads declared here has such
/// a parameter. Presumably nodes that are deleted or modified due to
/// recursive changes in the DAG are reported through listeners registered on
/// the DAG itself - confirm against the implementation.
///
/// These functions only replace all existing uses. It's possible that as
/// these replacements are being performed, CSE may cause the From node
/// to be given new uses. These new uses of From are left in place, and
/// not automatically transferred to To.
///
void ReplaceAllUsesWith(SDValue From, SDValue To);
void ReplaceAllUsesWith(SDNode *From, SDNode *To);
void ReplaceAllUsesWith(SDNode *From, const SDValue *To);
/// Replace any uses of From with To, leaving
/// uses of other values produced by From.getNode() alone.
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To);
/// Like ReplaceAllUsesOfValueWith, but for multiple values at once.
/// This correctly handles the case where
/// there is an overlap between the From values and the To values.
void ReplaceAllUsesOfValuesWith(const SDValue *From, const SDValue *To,
unsigned Num);
/// If an existing load has uses of its chain, create a token factor node with
/// that chain and the new memory node's chain and update users of the old
/// chain to the token factor. This ensures that the new memory node will have
/// the same relative memory dependency position as the old load. Returns the
/// new merged load chain.
SDValue makeEquivalentMemoryOrdering(LoadSDNode *Old, SDValue New);
/// Topological-sort the AllNodes list and
/// assign a unique node id for each node in the DAG based on their
/// topological order. Returns the number of nodes.
unsigned AssignTopologicalOrder();
/// Relocate node N inside the AllNodes list so that it sits immediately
/// before the iterator Position. This may be used to keep the topological
/// ordering up to date when the list of nodes is modified.
void RepositionNode(allnodes_iterator Position, SDNode *N) {
  // Take N out of the list, then put it back in ahead of Position.
  SDNode *Detached = AllNodes.remove(N);
  AllNodes.insert(Position, Detached);
}
/// Map a floating-point value type to the matching APFloat semantics tag.
/// The type is reduced to its scalar type first, so for a vector type the
/// semantics of its elements are returned. Any type not listed below hits
/// llvm_unreachable.
static const fltSemantics &EVTToAPFloatSemantics(EVT VT) {
  const auto ScalarTy = VT.getScalarType().getSimpleVT().SimpleTy;
  switch (ScalarTy) {
  case MVT::f16:
    return APFloat::IEEEhalf();
  case MVT::f32:
    return APFloat::IEEEsingle();
  case MVT::f64:
    return APFloat::IEEEdouble();
  case MVT::f80:
    return APFloat::x87DoubleExtended();
  case MVT::f128:
    return APFloat::IEEEquad();
  case MVT::ppcf128:
    return APFloat::PPCDoubleDouble();
  default:
    llvm_unreachable("Unknown FP format");
  }
}
/// Add a dbg_value SDNode. If SD is non-null that means the
/// value is produced by SD.
void AddDbgValue(SDDbgValue *DB, SDNode *SD, bool isParameter);
/// Add a dbg_label SDNode.
void AddDbgLabel(SDDbgLabel *DB);
/// Get the debug values which reference the given SDNode.
ArrayRef<SDDbgValue*> GetDbgValues(const SDNode* SD) const {
return DbgInfo->getSDDbgValues(SD);
}
public:
/// Return true if there are any SDDbgValue nodes associated
/// with this SelectionDAG.
bool hasDebugValues() const { return !DbgInfo->empty(); }
/// Begin/end iterators over the debug values; these forward to the
/// DbgBegin()/DbgEnd() accessors of DbgInfo.
SDDbgInfo::DbgIterator DbgBegin() const { return DbgInfo->DbgBegin(); }
SDDbgInfo::DbgIterator DbgEnd() const { return DbgInfo->DbgEnd(); }
/// Begin iterator forwarded from DbgInfo->ByvalParmDbgBegin().
SDDbgInfo::DbgIterator ByvalParmDbgBegin() const {
return DbgInfo->ByvalParmDbgBegin();
}
/// End iterator forwarded from DbgInfo->ByvalParmDbgEnd().
SDDbgInfo::DbgIterator ByvalParmDbgEnd() const {
return DbgInfo->ByvalParmDbgEnd();
}
/// Begin iterator over the debug labels, forwarded from
/// DbgInfo->DbgLabelBegin().
SDDbgInfo::DbgLabelIterator DbgLabelBegin() const {
return DbgInfo->DbgLabelBegin();
}
/// End iterator over the debug labels, forwarded from
/// DbgInfo->DbgLabelEnd().
SDDbgInfo::DbgLabelIterator DbgLabelEnd() const {
return DbgInfo->DbgLabelEnd();
}
/// To be invoked on an SDNode that is slated to be erased. This
/// function mirrors \c llvm::salvageDebugInfo.
void salvageDebugInfo(SDNode &N);
void dump() const;
/// Create a stack temporary, suitable for holding the specified value type.
/// If minAlign is specified, the slot size will have at least that alignment.
SDValue CreateStackTemporary(EVT VT, unsigned minAlign = 1);
/// Create a stack temporary suitable for holding either of the specified
/// value types.
SDValue CreateStackTemporary(EVT VT1, EVT VT2);
SDValue FoldSymbolOffset(unsigned Opcode, EVT VT,
const GlobalAddressSDNode *GA,
const SDNode *N2);
SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT,
SDNode *N1, SDNode *N2);
SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT,
const ConstantSDNode *C1,
const ConstantSDNode *C2);
SDValue FoldConstantVectorArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT,
ArrayRef<SDValue> Ops,
const SDNodeFlags Flags = SDNodeFlags());
/// Fold floating-point operations with 2 operands when both operands are
/// constants and/or undefined.
SDValue foldConstantFPMath(unsigned Opcode, const SDLoc &DL, EVT VT,
SDValue N1, SDValue N2);
/// Constant fold a setcc to true or false.
SDValue FoldSetCC(EVT VT, SDValue N1, SDValue N2, ISD::CondCode Cond,
const SDLoc &dl);
/// See if the specified operand can be simplified with the knowledge that
/// only the bits specified by DemandedBits are used. If so, return the
/// simpler operand, otherwise return a null SDValue.
///
/// (This exists alongside SimplifyDemandedBits because GetDemandedBits can
/// simplify nodes with multiple uses more aggressively.)
SDValue GetDemandedBits(SDValue V, const APInt &DemandedBits);
/// See if the specified operand can be simplified with the knowledge that
/// only the bits specified by DemandedBits are used in the elements specified
/// by DemandedElts. If so, return the simpler operand, otherwise return a
/// null SDValue.
///
/// (This exists alongside SimplifyDemandedBits because GetDemandedBits can
/// simplify nodes with multiple uses more aggressively.)
SDValue GetDemandedBits(SDValue V, const APInt &DemandedBits,
const APInt &DemandedElts);
/// Return true if the sign bit of Op is known to be zero.
/// We use this predicate to simplify operations downstream.
bool SignBitIsZero(SDValue Op, unsigned Depth = 0) const;
/// Return true if 'Op & Mask' is known to be zero. We
/// use this predicate to simplify operations downstream. Op and Mask are
/// known to be the same type.
bool MaskedValueIsZero(SDValue Op, const APInt &Mask,
unsigned Depth = 0) const;
/// Return true if 'Op & Mask' is known to be zero in DemandedElts. We
/// use this predicate to simplify operations downstream. Op and Mask are
/// known to be the same type.
bool MaskedValueIsZero(SDValue Op, const APInt &Mask,
const APInt &DemandedElts, unsigned Depth = 0) const;
/// Return true if '(Op & Mask) == Mask'.
/// Op and Mask are known to be the same type.
bool MaskedValueIsAllOnes(SDValue Op, const APInt &Mask,
unsigned Depth = 0) const;
/// Determine which bits of Op are known to be either zero or one and return
/// them in Known. For vectors, the known bits are those that are shared by
/// every vector element.
/// Targets can implement the computeKnownBitsForTargetNode method in the
/// TargetLowering class to allow target nodes to be understood.
KnownBits computeKnownBits(SDValue Op, unsigned Depth = 0) const;
/// Determine which bits of Op are known to be either zero or one and return
/// them in Known. The DemandedElts argument allows us to only collect the
/// known bits that are shared by the requested vector elements.
/// Targets can implement the computeKnownBitsForTargetNode method in the
/// TargetLowering class to allow target nodes to be understood.
KnownBits computeKnownBits(SDValue Op, const APInt &DemandedElts,
unsigned Depth = 0) const;
/// Used to represent the possible overflow behavior of an operation.
/// Never: the operation cannot overflow.
/// Sometime: the operation may or may not overflow.
/// Always: the operation will always overflow.
enum OverflowKind {
OFK_Never,
OFK_Sometime,
OFK_Always,
};
/// Determine if the result of the addition of two nodes can overflow.
OverflowKind computeOverflowKind(SDValue N0, SDValue N1) const;
/// Test if the given value is known to have exactly one bit set. This differs
/// from computeKnownBits in that it doesn't necessarily determine which bit
/// is set.
bool isKnownToBeAPowerOfTwo(SDValue Val) const;
/// Return the number of times the sign bit of the register is replicated into
/// the other bits. We know that at least 1 bit is always equal to the sign
/// bit (itself), but other cases can give us information. For example,
/// immediately after an "SRA X, 2", we know that the top 3 bits are all equal
/// to each other, so we return 3. Targets can implement the
/// ComputeNumSignBitsForTargetNode method in the TargetLowering class to allow
/// target nodes to be understood.
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth = 0) const;
/// Return the number of times the sign bit of the register is replicated into
/// the other bits. We know that at least 1 bit is always equal to the sign
/// bit (itself), but other cases can give us information. For example,
/// immediately after an "SRA X, 2", we know that the top 3 bits are all equal
/// to each other, so we return 3. The DemandedElts argument allows
/// us to only collect the minimum sign bits of the requested vector elements.
/// Targets can implement the ComputeNumSignBitsForTargetNode method in the
/// TargetLowering class to allow target nodes to be understood.
unsigned ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
unsigned Depth = 0) const;
/// Return true if the specified operand is an ISD::ADD with a ConstantSDNode
/// on the right-hand side, or if it is an ISD::OR with a ConstantSDNode that
/// is guaranteed to have the same semantics as an ADD. This handles the
/// equivalence:
/// X|Cst == X+Cst iff X&Cst = 0.
bool isBaseWithConstantOffset(SDValue Op) const;
/// Test whether the given SDValue is known to never be NaN. If \p SNaN is
/// true, returns if \p Op is known to never be a signaling NaN (it may still
/// be a qNaN).
bool isKnownNeverNaN(SDValue Op, bool SNaN = false, unsigned Depth = 0) const;
/// Convenience wrapper around isKnownNeverNaN() with the SNaN flag set.
/// \returns true if \p Op is known to never be a signaling NaN.
bool isKnownNeverSNaN(SDValue Op, unsigned Depth = 0) const {
  const bool OnlySignaling = true;
  return isKnownNeverNaN(Op, OnlySignaling, Depth);
}
/// Test whether the given floating point SDValue is known to never be
/// positive or negative zero.
bool isKnownNeverZeroFloat(SDValue Op) const;
/// Test whether the given SDValue is known to contain non-zero value(s).
bool isKnownNeverZero(SDValue Op) const;
/// Test whether two SDValues are known to compare equal. This
/// is true if they are the same value, or if one is negative zero and the
/// other positive zero.
bool isEqualTo(SDValue A, SDValue B) const;
/// Return true if A and B have no common bits set. As an example, this can
/// allow an 'add' to be transformed into an 'or'.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const;
/// Test whether \p V has a splatted value for all the demanded elements.
///
/// On success \p UndefElts will indicate the elements that have UNDEF
/// values instead of the splat value, this is only guaranteed to be correct
/// for \p DemandedElts.
///
/// NOTE: The function will return true for a demanded splat of UNDEF values.
bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts);
/// Test whether \p V has a splatted value.
bool isSplatValue(SDValue V, bool AllowUndefs = false);
/// If V is a splatted value, return the source vector and its splat index.
SDValue getSplatSourceVector(SDValue V, int &SplatIndex);
/// If V is a splat vector, return its scalar source operand by extracting
/// that element from the source vector.
SDValue getSplatValue(SDValue V);
/// Match a binop + shuffle pyramid that represents a horizontal reduction
/// over the elements of a vector starting from the EXTRACT_VECTOR_ELT node \p
/// Extract. The reduction must use one of the opcodes listed in \p
/// CandidateBinOps and on success \p BinOp will contain the matching opcode.
/// Returns the vector that is being reduced on, or SDValue() if a reduction
/// was not matched.
SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp,
ArrayRef<ISD::NodeType> CandidateBinOps);
/// Utility function used by legalize and lowering to
/// "unroll" a vector operation by splitting out the scalars and operating
/// on each element individually. If the ResNE is 0, fully unroll the vector
/// op. If ResNE is less than the width of the vector op, unroll up to ResNE.
/// If the ResNE is greater than the width of the vector op, unroll the
/// vector op and fill the end of the resulting vector with UNDEFS.
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE = 0);
/// Like UnrollVectorOp(), but for the [US](ADD|SUB|MUL)O family of opcodes.
/// This is a separate function because those opcodes have two results.
std::pair<SDValue, SDValue> UnrollVectorOverflowOp(SDNode *N,
unsigned ResNE = 0);
/// Return true if loads are next to each other and can be
/// merged. Check that both are nonvolatile and if LD is loading
/// 'Bytes' bytes from a location that is 'Dist' units away from the
/// location that the 'Base' load is loading from.
bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base,
unsigned Bytes, int Dist) const;
/// Infer alignment of a load / store address. Return 0 if
/// it cannot be inferred.
unsigned InferPtrAlignment(SDValue Ptr) const;
/// Compute the VTs needed for the low/hi parts of a type
/// which is split (or expanded) into two not necessarily identical pieces.
std::pair<EVT, EVT> GetSplitDestVTs(const EVT &VT) const;
/// Split the vector with EXTRACT_SUBVECTOR using the provided
/// VTs and return the low/high part.
std::pair<SDValue, SDValue> SplitVector(const SDValue &N, const SDLoc &DL,
const EVT &LoVT, const EVT &HiVT);
/// Split the vector with EXTRACT_SUBVECTOR and return the low/high part.
std::pair<SDValue, SDValue> SplitVector(const SDValue &N, const SDLoc &DL) {
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = GetSplitDestVTs(N.getValueType());
return SplitVector(N, DL, LoVT, HiVT);
}
/// Split the node's operand with EXTRACT_SUBVECTOR and
/// return the low/high part.
std::pair<SDValue, SDValue> SplitVectorOperand(const SDNode *N, unsigned OpNo)
{
return SplitVector(N->getOperand(OpNo), SDLoc(N));
}
/// Widen the vector up to the next power of two using INSERT_SUBVECTOR.
SDValue WidenVector(const SDValue &N, const SDLoc &DL);
/// Append the extracted elements from Start to Count out of the vector Op
/// in Args. If Count is 0, all of the elements will be extracted.
void ExtractVectorElements(SDValue Op, SmallVectorImpl<SDValue> &Args,
unsigned Start = 0, unsigned Count = 0);
/// Compute the default alignment value for the given type.
unsigned getEVTAlignment(EVT MemoryVT) const;
/// Test whether the given value is a constant int or similar node.
SDNode *isConstantIntBuildVectorOrConstantInt(SDValue N);
/// Test whether the given value is a constant FP or similar node.
SDNode *isConstantFPBuildVectorOrConstantFP(SDValue N);
/// \returns true if \p N is any kind of constant or build_vector of
/// constants, int or float. If a vector, it may not necessarily be a splat.
inline bool isConstantValueOfAnyType(SDValue N) {
  // Try the integer query first; fall back to the FP query only if it fails.
  if (isConstantIntBuildVectorOrConstantInt(N))
    return true;
  return isConstantFPBuildVectorOrConstantFP(N) != nullptr;
}
void addCallSiteInfo(const SDNode *CallNode, CallSiteInfoImpl &&CallInfo) {
- SDCallSiteInfo[CallNode] = std::move(CallInfo);
+ SDCallSiteDbgInfo[CallNode].CSInfo = std::move(CallInfo);
}
CallSiteInfo getSDCallSiteInfo(const SDNode *CallNode) {
- auto I = SDCallSiteInfo.find(CallNode);
- if (I != SDCallSiteInfo.end())
- return std::move(I->second);
+ auto I = SDCallSiteDbgInfo.find(CallNode);
+ if (I != SDCallSiteDbgInfo.end())
+ return std::move(I->second).CSInfo;
return CallSiteInfo();
+ }
+
+ void addHeapAllocSite(const SDNode *Node, MDNode *MD) {
+ SDCallSiteDbgInfo[Node].HeapAllocSite = MD;
+ }
+
+ /// Return the HeapAllocSite type associated with the SDNode, if it exists.
+ MDNode *getHeapAllocSite(const SDNode *Node) {
+ auto It = SDCallSiteDbgInfo.find(Node);
+ if (It == SDCallSiteDbgInfo.end())
+ return nullptr;
+ return It->second.HeapAllocSite;
}
private:
void InsertNode(SDNode *N);
bool RemoveNodeFromCSEMaps(SDNode *N);
void AddModifiedNodeToCSEMaps(SDNode *N);
SDNode *FindModifiedNodeSlot(SDNode *N, SDValue Op, void *&InsertPos);
SDNode *FindModifiedNodeSlot(SDNode *N, SDValue Op1, SDValue Op2,
void *&InsertPos);
SDNode *FindModifiedNodeSlot(SDNode *N, ArrayRef<SDValue> Ops,
void *&InsertPos);
SDNode *UpdateSDLocOnMergeSDNode(SDNode *N, const SDLoc &loc);
void DeleteNodeNotInCSEMaps(SDNode *N);
void DeallocateNode(SDNode *N);
void allnodes_clear();
/// Look up the node specified by ID in CSEMap. If it exists, return it. If
/// not, return the insertion token that will make insertion faster. This
/// overload is for nodes other than Constant or ConstantFP, use the other one
/// for those.
SDNode *FindNodeOrInsertPos(const FoldingSetNodeID &ID, void *&InsertPos);
/// Look up the node specified by ID in CSEMap. If it exists, return it. If
/// not, return the insertion token that will make insertion faster. Performs
/// additional processing for constant nodes.
SDNode *FindNodeOrInsertPos(const FoldingSetNodeID &ID, const SDLoc &DL,
void *&InsertPos);
/// List of non-single value types.
FoldingSet<SDVTListNode> VTListMap;
/// Maps to auto-CSE operations.
std::vector<CondCodeSDNode*> CondCodeNodes;
std::vector<SDNode*> ValueTypeNodes;
std::map<EVT, SDNode*, EVT::compareRawBits> ExtendedValueTypeNodes;
StringMap<SDNode*> ExternalSymbols;
std::map<std::pair<std::string, unsigned char>,SDNode*> TargetExternalSymbols;
DenseMap<MCSymbol *, SDNode *> MCSymbols;
};
/// GraphTraits specialization for SelectionDAG. It inherits everything from
/// GraphTraits<SDNode*> and adds node iteration over the range
/// [allnodes_begin(), allnodes_end()) of the DAG, wrapped in pointer_iterator.
template <> struct GraphTraits<SelectionDAG*> : public GraphTraits<SDNode*> {
  using nodes_iterator = pointer_iterator<SelectionDAG::allnodes_iterator>;

  /// Iterator to the first node of \p DAG.
  static nodes_iterator nodes_begin(SelectionDAG *DAG) {
    nodes_iterator First(DAG->allnodes_begin());
    return First;
  }

  /// Past-the-end iterator for the nodes of \p DAG.
  static nodes_iterator nodes_end(SelectionDAG *DAG) {
    nodes_iterator Last(DAG->allnodes_end());
    return Last;
  }
};
/// Return a node of type TargetMemSDNode for the given value types, operands,
/// memory VT and memory operand. If FindNodeOrInsertPos() already knows a
/// node with the same ID, its alignment is refined with \p MMO and that node
/// is returned; otherwise a new node is created, registered in CSEMap and
/// inserted into the DAG. The returned SDValue always refers to result 0.
template <class TargetMemSDNode>
SDValue SelectionDAG::getTargetMemSDNode(SDVTList VTs,
                                         ArrayRef<SDValue> Ops,
                                         const SDLoc &dl, EVT MemVT,
                                         MachineMemOperand *MMO) {
  const unsigned IROrder = dl.getIROrder();

  // A temporary node is constructed purely to ask it for its opcode.
  unsigned Opcode =
      TargetMemSDNode(IROrder, DebugLoc(), VTs, MemVT, MMO).getOpcode();

  // Compose the node ID. The sequence of Add* calls below is kept exactly as
  // it was: opcode, VT list, (node, result number) per operand, memory VT,
  // address space, synthetic subclass data.
  FoldingSetNodeID NodeID;
  NodeID.AddInteger(Opcode);
  NodeID.AddPointer(VTs.VTs);
  for (const SDValue &Operand : Ops) {
    NodeID.AddPointer(Operand.getNode());
    NodeID.AddInteger(Operand.getResNo());
  }
  NodeID.AddInteger(MemVT.getRawBits());
  NodeID.AddInteger(MMO->getPointerInfo().getAddrSpace());
  NodeID.AddInteger(getSyntheticNodeSubclassData<TargetMemSDNode>(
      IROrder, VTs, MemVT, MMO));

  // Try to find an existing node first.
  void *InsertPos = nullptr;
  SDNode *Existing = FindNodeOrInsertPos(NodeID, dl, InsertPos);
  if (Existing) {
    cast<TargetMemSDNode>(Existing)->refineAlignment(MMO);
    return SDValue(Existing, 0);
  }

  // Existing node was not found. Create a new one.
  auto *Fresh = newSDNode<TargetMemSDNode>(IROrder, dl.getDebugLoc(), VTs,
                                           MemVT, MMO);
  createOperands(Fresh, Ops);
  CSEMap.InsertNode(Fresh, InsertPos);
  InsertNode(Fresh);
  return SDValue(Fresh, 0);
}
} // end namespace llvm
#endif // LLVM_CODEGEN_SELECTIONDAG_H
Index: vendor/llvm/dist-release_90/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- vendor/llvm/dist-release_90/include/llvm/CodeGen/TargetLowering.h (revision 351302)
+++ vendor/llvm/dist-release_90/include/llvm/CodeGen/TargetLowering.h (revision 351303)
@@ -1,4091 +1,4092 @@
//===- llvm/CodeGen/TargetLowering.h - Target Lowering Info -----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file describes how to lower LLVM code to machine code. This has three
/// main components:
///
/// 1. Which ValueTypes are natively supported by the target.
/// 2. Which operations are supported for supported ValueTypes.
/// 3. Cost thresholds for alternative implementations of certain operations.
///
/// In addition it has a few other components, like information about FP
/// immediates.
///
//===----------------------------------------------------------------------===//
#ifndef LLVM_CODEGEN_TARGETLOWERING_H
#define LLVM_CODEGEN_TARGETLOWERING_H
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/CodeGen/DAGCombine.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
#include <climits>
#include <cstdint>
#include <iterator>
#include <map>
#include <string>
#include <utility>
#include <vector>
namespace llvm {
class BranchProbability;
class CCState;
class CCValAssign;
class Constant;
class FastISel;
class FunctionLoweringInfo;
class GlobalValue;
class IntrinsicInst;
struct KnownBits;
class LLVMContext;
class MachineBasicBlock;
class MachineFunction;
class MachineInstr;
class MachineJumpTableInfo;
class MachineLoop;
class MachineRegisterInfo;
class MCContext;
class MCExpr;
class Module;
class TargetRegisterClass;
class TargetLibraryInfo;
class TargetRegisterInfo;
class Value;
namespace Sched {
/// Scheduling preference values; each enumerator's comment states what the
/// scheduling is optimized for.
enum Preference {
None, // No preference
Source, // Follow source order.
RegPressure, // Scheduling for lowest register pressure.
Hybrid, // Scheduling for both latency and register pressure.
ILP, // Scheduling for ILP in low register pressure mode.
VLIW // Scheduling for VLIW targets.
};
} // end namespace Sched
/// This base class for TargetLowering contains the SelectionDAG-independent
/// parts that can be used from the rest of CodeGen.
class TargetLoweringBase {
public:
/// This enum indicates whether operations are valid for a target, and if not,
/// what action should be used to make them valid.
enum LegalizeAction : uint8_t {
Legal, // The target natively supports this operation.
Promote, // This operation should be executed in a larger type.
Expand, // Try to expand this to other ops, otherwise use a libcall.
LibCall, // Don't try to expand this to other ops, always use a libcall.
Custom // Use the LowerOperation hook to implement custom lowering.
};
/// This enum indicates whether types are legal for a target, and if not,
/// what action should be used to make them valid.
enum LegalizeTypeAction : uint8_t {
TypeLegal, // The target natively supports this type.
TypePromoteInteger, // Replace this integer with a larger one.
TypeExpandInteger, // Split this integer into two of half the size.
TypeSoftenFloat, // Convert this float to a same size integer type,
// if an operation is not supported in target HW.
TypeExpandFloat, // Split this float into two of half the size.
TypeScalarizeVector, // Replace this one-element vector with its element.
TypeSplitVector, // Split this vector into two of half the size.
TypeWidenVector, // This vector should be widened into a larger vector.
TypePromoteFloat // Replace this float with a larger one.
};
/// LegalizeKind holds the legalization kind that needs to happen to EVT
/// in order to type-legalize it.
using LegalizeKind = std::pair<LegalizeTypeAction, EVT>;
/// Enum that describes how the target represents true/false values.
enum BooleanContent {
UndefinedBooleanContent, // Only bit 0 counts, the rest can hold garbage.
ZeroOrOneBooleanContent, // All bits zero except for bit 0.
ZeroOrNegativeOneBooleanContent // All bits equal to bit 0.
};
/// Enum that describes what type of support for selects the target has.
enum SelectSupportKind {
ScalarValSelect, // The target supports scalar selects (ex: cmov).
ScalarCondVectorVal, // The target supports selects with a scalar condition
// and vector values (ex: cmov).
VectorMaskSelect // The target supports vector selects with a vector
// mask (ex: x86 blends).
};
/// Enum that specifies what an atomic load/AtomicRMWInst is expanded
/// to, if at all. Exists because different targets have different levels of
/// support for these atomic instructions, and also have different options
/// w.r.t. what they should expand to.
enum class AtomicExpansionKind {
None, // Don't expand the instruction.
LLSC, // Expand the instruction into loadlinked/storeconditional; used
// by ARM/AArch64.
LLOnly, // Expand the (load) instruction into just a load-linked, which has
// greater atomic guarantees than a normal load.
CmpXChg, // Expand the instruction into cmpxchg; used by at least X86.
MaskedIntrinsic, // Use a target-specific intrinsic for the LL/SC loop.
};
/// Enum that specifies when a multiplication should be expanded.
enum class MulExpansionKind {
Always, // Always expand the instruction.
OnlyLegalOrCustom, // Only expand when the resulting instructions are legal
// or custom.
};
/// One element of an ArgListTy: a value (as IR Value and/or SDValue), its
/// type, a set of one-bit attribute flags, an alignment and a by-value type.
class ArgListEntry {
public:
  Value *Val = nullptr;
  SDValue Node = SDValue();
  Type *Ty = nullptr;

  // One-bit attribute flags. Member order is left untouched so the object
  // layout stays the same.
  bool IsSExt : 1;
  bool IsZExt : 1;
  bool IsInReg : 1;
  bool IsSRet : 1;
  bool IsNest : 1;
  bool IsByVal : 1;
  bool IsInAlloca : 1;
  bool IsReturned : 1;
  bool IsSwiftSelf : 1;
  bool IsSwiftError : 1;

  uint16_t Alignment = 0;
  Type *ByValType = nullptr;

  /// Start with every flag cleared. Bit-fields cannot carry default member
  /// initializers before C++20, hence the explicit initializer list.
  ArgListEntry()
      : IsSExt(false), IsZExt(false), IsInReg(false), IsSRet(false),
        IsNest(false), IsByVal(false), IsInAlloca(false), IsReturned(false),
        IsSwiftSelf(false), IsSwiftError(false) {}

  void setAttributes(const CallBase *Call, unsigned ArgIdx);

  /// Overload for call sites: forwards the call site's instruction, cast to
  /// CallBase, to the CallBase overload.
  void setAttributes(ImmutableCallSite *CS, unsigned ArgIdx) {
    const auto *Call = cast<CallBase>(CS->getInstruction());
    setAttributes(Call, ArgIdx);
  }
};
using ArgListTy = std::vector<ArgListEntry>;
/// Overridable hook that receives the machine function, a calling convention
/// value and the argument list of a library call. The default implementation
/// does nothing.
virtual void markLibCallAttributes(MachineFunction *MF, unsigned CC,
                                   ArgListTy &Args) const {}
/// Pick the extension opcode that matches how booleans of the given content
/// kind are represented. The switch deliberately has no default label so that
/// compilers can still warn about an unhandled enumerator.
static ISD::NodeType getExtendForContent(BooleanContent Content) {
  switch (Content) {
  // The upper bits may hold anything.
  case UndefinedBooleanContent:         return ISD::ANY_EXTEND;
  // The upper bits are filled with zeros.
  case ZeroOrOneBooleanContent:         return ISD::ZERO_EXTEND;
  // The upper bits are copies of the sign bit.
  case ZeroOrNegativeOneBooleanContent: return ISD::SIGN_EXTEND;
  }
  llvm_unreachable("Invalid content kind");
}
/// NOTE: The TargetMachine owns TLOF.
explicit TargetLoweringBase(const TargetMachine &TM);
TargetLoweringBase(const TargetLoweringBase &) = delete;
TargetLoweringBase &operator=(const TargetLoweringBase &) = delete;
virtual ~TargetLoweringBase() = default;
protected:
/// Initialize all of the actions to default values.
void initActions();
public:
const TargetMachine &getTargetMachine() const { return TM; }
virtual bool useSoftFloat() const { return false; }
/// Return the pointer type for the given address space. By default this is
/// the integer type whose width is the data layout's pointer size in bits
/// for that address space.
/// FIXME: The default needs to be removed once all the code is updated.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS = 0) const {
  const unsigned PtrBits = DL.getPointerSizeInBits(AS);
  return MVT::getIntegerVT(PtrBits);
}
/// Return the in-memory pointer type for the given address space. By default
/// this is the integer type whose width is the data layout's pointer size in
/// bits for that address space.
/// FIXME: The default needs to be removed once all the code is updated.
MVT getPointerMemTy(const DataLayout &DL, uint32_t AS = 0) const {
  const unsigned PtrBits = DL.getPointerSizeInBits(AS);
  return MVT::getIntegerVT(PtrBits);
}
/// Return the type for frame index: the pointer type of the alloca address
/// space specified through the data layout.
MVT getFrameIndexTy(const DataLayout &DL) const {
  const unsigned AllocaAS = DL.getAllocaAddrSpace();
  return getPointerTy(DL, AllocaAS);
}
/// Return the type for operands of fence. The default is the pointer type of
/// the default address space.
/// TODO: Let fence operands be of i32 type and remove this.
virtual MVT getFenceOperandTy(const DataLayout &DL) const {
  const MVT PtrTy = getPointerTy(DL);
  return PtrTy;
}
/// The EVT parameter is not used in-tree, but is used by out-of-tree targets.
/// Documentation for this function would be nice...
virtual MVT getScalarShiftAmountTy(const DataLayout &, EVT) const;
EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL,
bool LegalTypes = true) const;
/// Return the type of the index operand of ISD::INSERT_VECTOR_ELT,
/// ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_SUBVECTOR and
/// ISD::EXTRACT_SUBVECTOR. The default is the pointer type of the default
/// address space.
virtual MVT getVectorIdxTy(const DataLayout &DL) const {
  const MVT PtrTy = getPointerTy(DL);
  return PtrTy;
}
/// Overridable query taking a SelectSupportKind; the default implementation
/// ignores the kind and returns true.
virtual bool isSelectSupported(SelectSupportKind /*kind*/) const { return true; }
/// Return true if converting a select of FP constants into a constant pool
/// load, whose address depends on the select condition, is profitable.
/// \p IsFPSetCC may be used to tell a select with an FP compare apart from
/// one with an integer compare. The default implementation returns true.
virtual bool reduceSelectOfFPConstantLoads(bool IsFPSetCC) const { return true; }
/// Return true if multiple condition registers are available (value of the
/// HasMultipleConditionRegisters member).
bool hasMultipleConditionRegisters() const { return HasMultipleConditionRegisters; }
/// Return true if the target has BitExtract instructions (value of the
/// HasExtractBitsInsn member).
bool hasExtractBitsInsn() const {
  return HasExtractBitsInsn;
}
/// Return the preferred legalization action for the vector type \p VT.
virtual TargetLoweringBase::LegalizeTypeAction
getPreferredVectorAction(MVT VT) const {
  // One-element vectors are scalarized by default.
  const bool IsSingleElement = VT.getVectorNumElements() == 1;
  if (IsSingleElement)
    return TypeScalarizeVector;
  // Power-of-2 vectors are promoted by default; odd-width ones are widened.
  return VT.isPow2VectorType() ? TypePromoteInteger : TypeWidenVector;
}
// A BUILD_VECTOR node can be expanded in two general ways:
// 1. Use SCALAR_TO_VECTOR on the defined scalar values and then shuffle
// them together.
// 2. Build the vector on the stack and then load it.
// Returning true selects method (1), provided all of the necessary shuffles
// are legal (as determined by isShuffleMaskLegal). Returning false means
// method (2) is always used. The vector type and the number of defined
// values are provided; the default picks method (1) for fewer than three
// defined values.
virtual bool shouldExpandBuildVectorWithShuffles(EVT /* VT */,
                                                 unsigned DefinedValues) const {
  return DefinedValues <= 2;
}
/// Return true if, on this target, an integer divide is usually cheaper than
/// a sequence of several shifts, adds, and multiplies. What counts as
/// "cheaper" may depend on whether we are optimizing for speed or for size.
/// The default implementation returns false.
virtual bool isIntDivCheap(EVT VT, AttributeList Attr) const {
  return false;
}
/// Return true if the target can handle a standalone remainder operation.
/// The default implementation returns true.
virtual bool hasStandaloneRem(EVT VT) const { return true; }
/// Return true if SQRT(X) shouldn't be replaced with X*RSQRT(X). The default
/// implementation returns false, i.e. the replacement is performed.
virtual bool isFsqrtCheap(SDValue X, SelectionDAG &DAG) const { return false; }
/// Reciprocal estimate status values used by the functions below.
enum ReciprocalEstimate : int {
// Reported when the function's attributes do not override the operation;
// target defaults are then expected to be used.
Unspecified = -1,
// NOTE(review): presumably "estimate explicitly turned off" / "turned on";
// confirm against the implementation of the query functions.
Disabled = 0,
Enabled = 1
};
/// Return a ReciprocalEstimate enum value for a square root of the given type
/// based on the function's attributes. If the operation is not overridden by
/// the function's attributes, "Unspecified" is returned and target defaults
/// are expected to be used for instruction selection.
int getRecipEstimateSqrtEnabled(EVT VT, MachineFunction &MF) const;
/// Return a ReciprocalEstimate enum value for a division of the given type
/// based on the function's attributes. If the operation is not overridden by
/// the function's attributes, "Unspecified" is returned and target defaults
/// are expected to be used for instruction selection.
int getRecipEstimateDivEnabled(EVT VT, MachineFunction &MF) const;
/// Return the refinement step count for a square root of the given type based
/// on the function's attributes. If the operation is not overridden by
/// the function's attributes, "Unspecified" is returned and target defaults
/// are expected to be used for instruction selection.
int getSqrtRefinementSteps(EVT VT, MachineFunction &MF) const;
/// Return the refinement step count for a division of the given type based
/// on the function's attributes. If the operation is not overridden by
/// the function's attributes, "Unspecified" is returned and target defaults
/// are expected to be used for instruction selection.
int getDivRefinementSteps(EVT VT, MachineFunction &MF) const;
/// Return true if the target has indicated at least one type should be
/// bypassed, i.e. BypassSlowDivWidths is non-empty.
bool isSlowDivBypassed() const {
  return !BypassSlowDivWidths.empty();
}
/// Return the map from slow types for division or remainder to their
/// corresponding fast types.
const DenseMap<unsigned int, unsigned int> &getBypassSlowDivWidths() const { return BypassSlowDivWidths; }
/// Return true if flow control is an expensive operation that should be
/// avoided (value of the JumpIsExpensive member).
bool isJumpExpensive() const {
  return JumpIsExpensive;
}
/// Return true if selects are only cheaper than branches when the branch is
/// unlikely to be predicted right (value of PredictableSelectIsExpensive).
bool isPredictableSelectExpensive() const { return PredictableSelectIsExpensive; }
/// If a branch or a select condition is skewed in one direction by more than
/// this factor, it is very likely to be predicted correctly.
virtual BranchProbability getPredictableBranchThreshold() const;
/// Return true if the following transform is beneficial:
/// fold (conv (load x)) -> (load (conv*)x)
/// On architectures that don't natively support some vector loads
/// efficiently, casting the load to a smaller vector of larger types and
/// loading is more efficient; however, this can be undone by optimizations in
/// the dag combiner.
virtual bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
                                     const SelectionDAG &DAG,
                                     const MachineMemOperand &MMO) const {
  // If either type is not a simple value type, report the fold as beneficial.
  if (!(LoadVT.isSimple() && BitcastVT.isSimple()))
    return true;

  // Don't bother doing this if it's just going to be promoted again later, as
  // doing so might interfere with other combines.
  const MVT SimpleLoadVT = LoadVT.getSimpleVT();
  const bool PromotedBackToBitcastVT =
      getOperationAction(ISD::LOAD, SimpleLoadVT) == Promote &&
      getTypeToPromoteTo(ISD::LOAD, SimpleLoadVT) == BitcastVT.getSimpleVT();
  if (PromotedBackToBitcastVT)
    return false;

  // Otherwise the fold pays off only when the access with the new type is
  // both allowed and reported as fast.
  bool IsFast = false;
  if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), BitcastVT,
                          MMO, &IsFast))
    return false;
  return IsFast;
}
/// Return true if the following transform is beneficial:
/// (store (y (conv x)), y*)) -> (store x, (x*))
/// The default applies the same logic as isLoadBitCastBeneficial.
virtual bool isStoreBitCastBeneficial(EVT StoreVT, EVT BitcastVT,
                                      const SelectionDAG &DAG,
                                      const MachineMemOperand &MMO) const {
  const bool SameAsLoad = isLoadBitCastBeneficial(StoreVT, BitcastVT, DAG, MMO);
  return SameAsLoad;
}
/// Return true if storing a non-zero vector constant with the given size and
/// type for the address space is expected to be cheaper than storing the
/// individual scalar element constants. The default implementation returns
/// false.
virtual bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem,
                                          unsigned AddrSpace) const { return false; }
/// Allow store merging for the specified type after legalization, in addition
/// to before legalization. This may transform stores that do not exist
/// earlier (for example, stores created from intrinsics). The default
/// implementation returns true.
virtual bool mergeStoresAfterLegalization(EVT MemVT) const { return true; }
/// Return true if it's reasonable to merge stores to MemVT size. The default
/// implementation returns true.
virtual bool canMergeStoresTo(unsigned AS, EVT MemVT,
                              const SelectionDAG &DAG) const { return true; }
/// Return true if it is cheap to speculate a call to intrinsic cttz. The
/// default implementation returns false.
virtual bool isCheapToSpeculateCttz() const { return false; }
/// Return true if it is cheap to speculate a call to intrinsic ctlz. The
/// default implementation returns false.
virtual bool isCheapToSpeculateCtlz() const { return false; }
/// Return true if the ctlz instruction is fast. The default implementation
/// returns false.
virtual bool isCtlzFast() const { return false; }
/// Return true if it is safe to transform an integer-domain bitwise operation
/// into the equivalent floating-point operation. This should be set to true
/// if the target has IEEE-754-compliant fabs/fneg operations for the input
/// type. The default implementation returns false.
virtual bool hasBitPreservingFPLogic(EVT VT) const { return false; }
/// Return true if it is cheaper to split the store of an integer value merged
/// from a pair of smaller values into multiple stores. The default
/// implementation returns false.
virtual bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const { return false; }
/// Return true if the target supports combining a chain like:
/// \code
/// %andResult = and %val1, #mask
/// %icmpResult = icmp %andResult, 0
/// \endcode
/// into a single machine instruction of a form like:
/// \code
/// cc = test %register, #mask
/// \endcode
/// The default implementation returns false.
virtual bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const { return false; }
/// Use bitwise logic to make pairs of compares more efficient. For example:
/// and (seteq A, B), (seteq C, D) --> seteq (or (xor A, B), (xor C, D)), 0
/// This should be true when it takes more than one instruction to lower
/// setcc (cmp+set on x86 scalar), when bitwise ops are faster than logic on
/// condition bits (crand on PowerPC), and/or when reducing cmp+br is a win.
/// The default implementation returns false.
virtual bool convertSetCCLogicToBitwiseLogic(EVT VT) const { return false; }
/// Return the preferred operand type if the target has a quick way to compare
/// integer values of the given size. Any legal integer type is assumed to be
/// efficiently comparable, so the default returns the NumBits-wide integer
/// type when it is legal and MVT::INVALID_SIMPLE_VALUE_TYPE otherwise.
/// Targets may override this to return a vector type for illegal wide types
/// if there is support to compare that type.
virtual MVT hasFastEqualityCompare(unsigned NumBits) const {
  const MVT IntVT = MVT::getIntegerVT(NumBits);
  if (isTypeLegal(IntVT))
    return IntVT;
  return MVT::INVALID_SIMPLE_VALUE_TYPE;
}
/// Return true if the target should transform:
/// (X & Y) == Y ---> (~X & Y) == 0
/// (X & Y) != Y ---> (~X & Y) != 0
///
/// This may be profitable if the target has a bitwise and-not operation that
/// sets comparison flags. A target may want to limit the transformation based
/// on the type of Y or if Y is a constant.
///
/// Note that the transform will not occur if Y is known to be a power-of-2
/// because a mask and compare of a single bit can be handled by inverting the
/// predicate, for example:
/// (X & 8) == 8 ---> (X & 8) != 0
///
/// The default implementation returns false.
virtual bool hasAndNotCompare(SDValue Y) const { return false; }
/// Return true if the target has a bitwise and-not operation:
/// X = ~A & B
/// This can be used to simplify select or other instructions.
virtual bool hasAndNot(SDValue X) const {
  // A target that has the more complex (comparing) version of this operation
  // is assumed to have the plain one as well.
  const bool HasCompareForm = hasAndNotCompare(X);
  return HasCompareForm;
}
/// There are two ways to clear extreme bits (either low or high):
/// Mask: x & (-1 << y) (the instcombine canonical form)
/// Shifts: x >> y << y
/// Return true if the variant with 2 variable shifts is preferred.
/// Return false if there is no preference.
/// By default it is assumed that no one prefers shifts, so false is returned.
virtual bool shouldFoldMaskToVariableShiftPair(SDValue X) const { return false; }
/// Return true if it is profitable to fold a pair of shifts into a mask.
/// This is true on most targets, but some targets, like Thumb1, have
/// immediate shift instructions and no immediate "and" instruction, which
/// makes the fold unprofitable. The default implementation returns true.
virtual bool shouldFoldConstantShiftPairToMask(const SDNode *N,
                                               CombineLevel Level) const { return true; }
/// Should we transform the IR-optimal check for whether a given truncation
/// down into KeptBits would be truncating or not:
/// (add %x, (1 << (KeptBits-1))) srccond (1 << KeptBits)
/// Into its more traditional form:
/// ((%x << C) a>> C) dstcond %x
/// Return true if we should transform.
/// Return false if there is no preference.
/// By default it is assumed that no one prefers shifts, so false is returned.
virtual bool shouldTransformSignedTruncationCheck(EVT XVT,
                                                  unsigned KeptBits) const { return false; }
/// These two forms are equivalent:
/// sub %y, (xor %x, -1)
/// add (add %x, 1), %y
/// The variant with two add's is IR-canonical.
/// Some targets may prefer one to the other.
/// By default it is assumed that everyone prefers the form with two add's,
/// so true is returned.
virtual bool preferIncOfAddToSubOfNot(EVT VT) const { return true; }
/// Return true if the target wants to use the optimization that turns
/// ext(promotableInst1(...(promotableInstN(load)))) into
/// promotedInst1(...(promotedInstN(ext(load)))). Reports the value of the
/// EnableExtLdPromotion member.
bool enableExtLdPromotion() const {
  return EnableExtLdPromotion;
}
/// Return true if the target can combine store(extractelement VectorTy,
/// Idx).
/// \p Cost[out] gives the cost of that transformation when this is true.
/// The default implementation returns false and leaves \p Cost untouched.
virtual bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
                                       unsigned &Cost) const { return false; }
/// Return true if inserting a scalar into a variable element of an undef
/// vector is more efficiently handled by splatting the scalar instead. The
/// default implementation returns false.
virtual bool shouldSplatInsEltVarIndex(EVT) const { return false; }
/// Return true if the target always benefits from combining into FMA for a
/// given value type. This must typically return false on targets where FMA
/// takes more cycles to execute than FADD. The default implementation
/// returns false.
virtual bool enableAggressiveFMAFusion(EVT VT) const { return false; }
/// Return the ValueType of the result of SETCC operations.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const;
/// Return the ValueType for comparison libcalls. Comparison libcalls include
/// floating point comparison calls, and Ordered/Unordered check calls on
/// floating point numbers.
virtual
MVT::SimpleValueType getCmpLibcallReturnType() const;
/// For targets without i1 registers, this gives the nature of the high-bits
/// of boolean values held in types wider than i1.
///
/// "Boolean values" are special true/false values produced by nodes like
/// SETCC and consumed (as the condition) by nodes like SELECT and BRCOND.
/// Not to be confused with general values promoted from i1. Some cpus
/// distinguish between vectors of boolean and scalars; the isVec parameter
/// selects between the two kinds. For example on X86 a scalar boolean should
/// be zero extended from i1, while the elements of a vector of booleans
/// should be sign extended from i1.
///
/// Some cpus also treat floating point types the same way as they treat
/// vectors instead of the way they treat scalars; the isFloat parameter
/// selects the floating point setting for non-vector types.
BooleanContent getBooleanContents(bool isVec, bool isFloat) const {
  if (!isVec)
    return isFloat ? BooleanFloatContents : BooleanContents;
  return BooleanVectorContents;
}
/// Convenience overload: derive the vector and floating point flags from
/// \p Type and forward to the two-flag overload.
BooleanContent getBooleanContents(EVT Type) const {
  const bool IsVector = Type.isVector();
  const bool IsFP = Type.isFloatingPoint();
  return getBooleanContents(IsVector, IsFP);
}
/// Return the target scheduling preference (value of SchedPreferenceInfo).
Sched::Preference getSchedulingPreference() const { return SchedPreferenceInfo; }
/// Some schedulers, e.g. hybrid, can switch to different scheduling
/// heuristics for different nodes. This function returns the preference (or
/// none) for the given node; the default implementation returns Sched::None.
virtual Sched::Preference getSchedulingPreference(SDNode *) const { return Sched::None; }
/// Return the register class that should be used for the specified value
/// type. The default implementation ignores \p isDivergent and looks the
/// class up in RegClassForVT, asserting that an entry exists.
virtual const TargetRegisterClass *getRegClassFor(MVT VT,
                                                  bool isDivergent = false) const {
  (void)isDivergent;
  const TargetRegisterClass *RC = RegClassForVT[VT.SimpleTy];
  assert(RC && "This value type is not natively supported!");
  return RC;
}
/// Allows the target to decide about the register class of a specific value
/// that is live outside the defining block.
/// Returns true if the value needs a uniform register class; the default
/// implementation returns false.
virtual bool requiresUniformRegister(MachineFunction &MF,
                                     const Value *) const { return false; }
/// Return the 'representative' register class for the specified value
/// type.
///
/// The 'representative' register class is the largest legal super-reg
/// register class for the register class of the value type. For example, on
/// i386 the rep register class for i8, i16, and i32 are GR32; while the rep
/// register class is GR64 on x86_64.
virtual const TargetRegisterClass *getRepRegClassFor(MVT VT) const {
  return RepRegClassForVT[VT.SimpleTy];
}
/// Return the cost of the 'representative' register class for the specified
/// value type, as recorded in RepRegClassCostForVT.
virtual uint8_t getRepRegClassCostFor(MVT VT) const { return RepRegClassCostForVT[VT.SimpleTy]; }
/// Return true if SHIFT instructions should be expanded to SHIFT_PARTS
/// instructions, and false if a library call is preferred (e.g. for
/// code-size reasons). The default implementation returns true.
virtual bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const { return true; }
/// Return true if the target has native support for the specified value type.
/// This means that it has a register that directly holds it without
/// promotions or expansions.
bool isTypeLegal(EVT VT) const {
  assert(!VT.isSimple() ||
         (unsigned)VT.getSimpleVT().SimpleTy < array_lengthof(RegClassForVT));
  // Only simple value types can have a register class entry.
  if (!VT.isSimple())
    return false;
  return RegClassForVT[VT.getSimpleVT().SimpleTy] != nullptr;
}
class ValueTypeActionImpl {
/// ValueTypeActions - For each value type, keep a LegalizeTypeAction enum
/// that indicates how instruction selection should deal with the type.
LegalizeTypeAction ValueTypeActions[MVT::LAST_VALUETYPE];
public:
ValueTypeActionImpl() {
std::fill(std::begin(ValueTypeActions), std::end(ValueTypeActions),
TypeLegal);
}
LegalizeTypeAction getTypeAction(MVT VT) const {
return ValueTypeActions[VT.SimpleTy];
}
void setTypeAction(MVT VT, LegalizeTypeAction Action) {
ValueTypeActions[VT.SimpleTy] = Action;
}
};
/// Return the table of per-value-type legalization actions.
const ValueTypeActionImpl &getValueTypeActions() const { return ValueTypeActions; }
/// Return how we should legalize values of this type: either it is already
/// legal (return 'Legal'), or we need to promote it to a larger type (return
/// 'Promote'), or we need to expand it into multiple registers of smaller
/// integer type (return 'Expand'). 'Custom' is not an option.
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const {
  // The action is the first component of the type conversion.
  const auto Conversion = getTypeConversion(Context, VT);
  return Conversion.first;
}
/// Return the legalization action recorded in ValueTypeActions for the simple
/// value type \p VT.
LegalizeTypeAction getTypeAction(MVT VT) const {
  const LegalizeTypeAction Recorded = ValueTypeActions.getTypeAction(VT);
  return Recorded;
}
/// For types supported by the target, this is an identity function. For
/// types that must be promoted to larger types, this returns the larger type
/// to promote to. For integer types that are larger than the largest integer
/// register, this contains one step in the expansion to get to the smaller
/// register. For illegal floating point types, this returns the integer type
/// to transform to.
EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const {
  // The destination type is the second component of the type conversion.
  const auto Conversion = getTypeConversion(Context, VT);
  return Conversion.second;
}
/// For types supported by the target, this is an identity function. For
/// types that must be expanded (i.e. integer types that are larger than the
/// largest integer register or illegal floating point types), this returns
/// the largest legal type it will be expanded to. \p VT must not be a vector.
EVT getTypeToExpandTo(LLVMContext &Context, EVT VT) const {
  assert(!VT.isVector());
  // Apply one expansion step at a time until a legal type is reached.
  for (;;) {
    const LegalizeTypeAction Action = getTypeAction(Context, VT);
    if (Action == TypeLegal)
      return VT;
    if (Action != TypeExpandInteger)
      llvm_unreachable("Type is not legal nor is it to be expanded!");
    VT = getTypeToTransformTo(Context, VT);
  }
}
/// Vector types are broken down into some number of legal first class types.
/// For example, EVT::v8f32 maps to 2 EVT::v4f32 with Altivec or SSE1, or 8
/// promoted EVT::f64 values with the X86 FP stack. Similarly, EVT::v2i64
/// turns into 4 EVT::i32 values with both PPC and X86.
///
/// This method returns the number of registers needed, and the VT for each
/// register. It also returns the VT and quantity of the intermediate values
/// before they are promoted/expanded.
unsigned getVectorTypeBreakdown(LLVMContext &Context, EVT VT,
EVT &IntermediateVT,
unsigned &NumIntermediates,
MVT &RegisterVT) const;
/// Certain targets such as MIPS require that some types such as vectors are
/// always broken down into scalars in some contexts. This occurs even if the
/// vector type is legal. The default implementation ignores \p CC and
/// forwards to getVectorTypeBreakdown.
virtual unsigned getVectorTypeBreakdownForCallingConv(
    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {
  const unsigned NumRegs = getVectorTypeBreakdown(
      Context, VT, IntermediateVT, NumIntermediates, RegisterVT);
  return NumRegs;
}
/// Description of a memory-touching intrinsic; getTgtMemIntrinsic receives
/// one of these by reference to store the intrinsic information into. All
/// members carry in-class defaults, so a default-constructed object starts
/// with opcode 0, offset 0, size 0, alignment 1 and no memory-operand flags.
struct IntrinsicInfo {
unsigned opc = 0; // target opcode
EVT memVT; // memory VT
// value representing memory location
PointerUnion<const Value *, const PseudoSourceValue *> ptrVal;
int offset = 0; // offset off of ptrVal
unsigned size = 0; // the size of the memory location
// (taken from memVT if zero)
unsigned align = 1; // alignment
MachineMemOperand::Flags flags = MachineMemOperand::MONone;
IntrinsicInfo() = default;
};
/// Given an intrinsic, check whether on this target the intrinsic will need
/// to map to a MemIntrinsicNode (touches memory). If so, return true and
/// store the intrinsic information into the IntrinsicInfo that was passed to
/// the function. The default implementation returns false and leaves the
/// IntrinsicInfo untouched.
virtual bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &,
                                MachineFunction &,
                                unsigned /*Intrinsic*/) const { return false; }
/// Return true if the target can instruction select the specified FP
/// immediate natively. If false, the legalizer will materialize the FP
/// immediate as a load from a constant pool. The default implementation
/// returns false.
virtual bool isFPImmLegal(const APFloat & /*Imm*/, EVT /*VT*/,
                          bool ForCodeSize = false) const { return false; }
/// Targets can use this to indicate that they only support *some*
/// VECTOR_SHUFFLE operations, those with specific masks. By default, if a
/// target supports the VECTOR_SHUFFLE node, all mask values are assumed to be
/// legal, so the default implementation returns true.
virtual bool isShuffleMaskLegal(ArrayRef<int> /*Mask*/, EVT /*VT*/) const { return true; }
/// Returns true if the operation can trap for the value type.
///
/// VT must be a legal type. By default, we optimistically assume most
/// operations don't trap except for integer divide and remainder.
virtual bool canOpTrap(unsigned Op, EVT VT) const;
/// Similar to isShuffleMaskLegal. Targets can use this to indicate if there
/// is a suitable VECTOR_SHUFFLE that can be used to replace a VAND with a
/// constant pool entry. The default implementation returns false.
virtual bool isVectorClearMaskLegal(ArrayRef<int> /*Mask*/,
                                    EVT /*VT*/) const { return false; }
/// Return how this operation should be treated: either it is legal, needs to
/// be promoted to a larger size, needs to be expanded to some other code
/// sequence, or the target has a custom expander for it.
LegalizeAction getOperationAction(unsigned Op, EVT VT) const {
  // Extended value types are always expanded.
  if (VT.isExtended())
    return Expand;

  // If a target-specific SDNode requires legalization, require the target
  // to provide custom legalization for it. Such opcodes lie beyond the end
  // of the action table.
  const bool BeyondTable = Op >= array_lengthof(OpActions[0]);
  if (BeyondTable)
    return Custom;

  const unsigned TypeIdx = (unsigned)VT.getSimpleVT().SimpleTy;
  return OpActions[TypeIdx][Op];
}
/// Custom method defined by each target to indicate whether an operation
/// which may require a scale is supported natively by the target.
/// If not, the operation is illegal. The default implementation returns
/// false.
virtual bool isSupportedFixedPointOperation(unsigned Op, EVT VT,
                                            unsigned Scale) const { return false; }
/// Some fixed point operations may be natively supported by the target but
/// only for specific scales. This method allows for checking
/// if the width is supported by the target for a given operation that may
/// depend on scale.
LegalizeAction getFixedPointOperationAction(unsigned Op, EVT VT,
                                            unsigned Scale) const {
  const LegalizeAction Action = getOperationAction(Op, VT);
  if (Action != Legal)
    return Action;

  // Only the fixed point multiplies are expected here.
  switch (Op) {
  case ISD::SMULFIX:
  case ISD::SMULFIXSAT:
  case ISD::UMULFIX:
    break;
  default:
    llvm_unreachable("Unexpected fixed point operation.");
  }

  // This operation is supported in this type but may only work on specific
  // scales; fall back to Expand when the scale is unsupported.
  return isSupportedFixedPointOperation(Op, VT, Scale) ? Action : Expand;
}
/// Return the legalize action for the strict FP pseudo-opcode \p Op on type
/// \p VT, derived from the action of the equivalent non-strict opcode.
/// Anything other than Legal is reported as Expand.
LegalizeAction getStrictFPOperationAction(unsigned Op, EVT VT) const {
  // Translate the STRICT_* opcode into its non-strict counterpart.
  unsigned BaseOpc;
  switch (Op) {
  case ISD::STRICT_FADD:       BaseOpc = ISD::FADD;       break;
  case ISD::STRICT_FSUB:       BaseOpc = ISD::FSUB;       break;
  case ISD::STRICT_FMUL:       BaseOpc = ISD::FMUL;       break;
  case ISD::STRICT_FDIV:       BaseOpc = ISD::FDIV;       break;
  case ISD::STRICT_FREM:       BaseOpc = ISD::FREM;       break;
  case ISD::STRICT_FSQRT:      BaseOpc = ISD::FSQRT;      break;
  case ISD::STRICT_FPOW:       BaseOpc = ISD::FPOW;       break;
  case ISD::STRICT_FPOWI:      BaseOpc = ISD::FPOWI;      break;
  case ISD::STRICT_FMA:        BaseOpc = ISD::FMA;        break;
  case ISD::STRICT_FSIN:       BaseOpc = ISD::FSIN;       break;
  case ISD::STRICT_FCOS:       BaseOpc = ISD::FCOS;       break;
  case ISD::STRICT_FEXP:       BaseOpc = ISD::FEXP;       break;
  case ISD::STRICT_FEXP2:      BaseOpc = ISD::FEXP2;      break;
  case ISD::STRICT_FLOG:       BaseOpc = ISD::FLOG;       break;
  case ISD::STRICT_FLOG10:     BaseOpc = ISD::FLOG10;     break;
  case ISD::STRICT_FLOG2:      BaseOpc = ISD::FLOG2;      break;
  case ISD::STRICT_FRINT:      BaseOpc = ISD::FRINT;      break;
  case ISD::STRICT_FNEARBYINT: BaseOpc = ISD::FNEARBYINT; break;
  case ISD::STRICT_FMAXNUM:    BaseOpc = ISD::FMAXNUM;    break;
  case ISD::STRICT_FMINNUM:    BaseOpc = ISD::FMINNUM;    break;
  case ISD::STRICT_FCEIL:      BaseOpc = ISD::FCEIL;      break;
  case ISD::STRICT_FFLOOR:     BaseOpc = ISD::FFLOOR;     break;
  case ISD::STRICT_FROUND:     BaseOpc = ISD::FROUND;     break;
  case ISD::STRICT_FTRUNC:     BaseOpc = ISD::FTRUNC;     break;
  case ISD::STRICT_FP_ROUND:   BaseOpc = ISD::FP_ROUND;   break;
  case ISD::STRICT_FP_EXTEND:  BaseOpc = ISD::FP_EXTEND;  break;
  default:
    llvm_unreachable("Unexpected FP pseudo-opcode");
  }

  // We don't currently handle Custom or Promote for strict FP pseudo-ops.
  // For now, we just expand for those cases.
  return getOperationAction(BaseOpc, VT) == Legal ? Legal : Expand;
}
/// Return true if the specified operation is legal on this target or can be
/// made legal with custom lowering. This is used to help guide high-level
/// lowering decisions.
bool isOperationLegalOrCustom(unsigned Op, EVT VT) const {
return (VT == MVT::Other || isTypeLegal(VT)) &&
(getOperationAction(Op, VT) == Legal ||
getOperationAction(Op, VT) == Custom);
}
/// Return true if the specified operation is legal on this target or can be
/// made legal using promotion. This is used to help guide high-level lowering
/// decisions.
bool isOperationLegalOrPromote(unsigned Op, EVT VT) const {
return (VT == MVT::Other || isTypeLegal(VT)) &&
(getOperationAction(Op, VT) == Legal ||
getOperationAction(Op, VT) == Promote);
}
/// Return true if the specified operation is legal on this target or can be
/// made legal with custom lowering or using promotion. This is used to help
/// guide high-level lowering decisions.
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT) const {
return (VT == MVT::Other || isTypeLegal(VT)) &&
(getOperationAction(Op, VT) == Legal ||
getOperationAction(Op, VT) == Custom ||
getOperationAction(Op, VT) == Promote);
}
/// Return true if the operation uses custom lowering, regardless of whether
/// the type is legal or not.
bool isOperationCustom(unsigned Op, EVT VT) const {
return getOperationAction(Op, VT) == Custom;
}
/// Return true if lowering to a jump table is allowed for \p Fn. Jump tables
/// are rejected when the function carries "no-jump-tables"="true"; otherwise
/// they are allowed if BR_JT or BRIND on MVT::Other is legal or custom.
virtual bool areJTsAllowed(const Function *Fn) const {
  const bool DisabledByAttr =
      Fn->getFnAttribute("no-jump-tables").getValueAsString() == "true";
  if (DisabledByAttr)
    return false;

  if (isOperationLegalOrCustom(ISD::BR_JT, MVT::Other))
    return true;
  return isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
}
/// Check whether the range [Low,High] fits in a machine word.
bool rangeFitsInWord(const APInt &Low, const APInt &High,
const DataLayout &DL) const {
// FIXME: Using the pointer type doesn't seem ideal.
uint64_t BW = DL.getIndexSizeInBits(0u);
uint64_t Range = (High - Low).getLimitedValue(UINT64_MAX - 1) + 1;
return Range <= BW;
}
/// Return true if lowering to a jump table is suitable for a set of case
/// clusters which may contain \p NumCases cases, \p Range range of values.
virtual bool isSuitableForJumpTable(const SwitchInst *SI, uint64_t NumCases,
                                    uint64_t Range) const {
  // FIXME: This function checks the maximum table size and density, but the
  // minimum size is not checked. It would be nice if the minimum size were
  // also combined within this function. Currently, the minimum size check is
  // performed in findJumpTable() in SelectionDAGBuilder and
  // getEstimatedNumberOfCaseClusters() in BasicTTIImpl.
  const bool OptForSize = SI->getParent()->getParent()->hasOptSize();
  const unsigned MinDensity = getMinimumJumpTableDensity(OptForSize);
  const unsigned MaxJumpTableSize = getMaximumJumpTableSize();

  // The table must be small enough (not enforced when optimizing for size)
  // and the range must be dense enough for a jump table.
  const bool SmallEnough = OptForSize || Range <= MaxJumpTableSize;
  return SmallEnough && (NumCases * 100 >= Range * MinDensity);
}
/// Return true if lowering to a bit test is suitable for a set of case
/// clusters which contains \p NumDests unique destinations, \p Low and
/// \p High as its lowest and highest case values, and expects \p NumCmps
/// case value comparisons. Check if the number of destinations, comparison
/// metric, and range are all suitable.
bool isSuitableForBitTests(unsigned NumDests, unsigned NumCmps,
                           const APInt &Low, const APInt &High,
                           const DataLayout &DL) const {
  // FIXME: I don't think NumCmps is the correct metric: a single case and a
  // range of cases both require only one branch to lower. Just looking at the
  // number of clusters and destinations should be enough to decide whether to
  // build bit tests.

  // To lower a range with bit tests, the range must fit the bitwidth of a
  // machine word.
  if (!rangeFitsInWord(Low, High, DL))
    return false;

  // Decide whether it's profitable to lower this range with bit tests. Each
  // destination requires a bit test and branch, and there is an overall range
  // check branch. For a small number of clusters, separate comparisons might
  // be cheaper, and for many destinations, splitting the range might be
  // better.
  switch (NumDests) {
  case 1:
    return NumCmps >= 3;
  case 2:
    return NumCmps >= 5;
  case 3:
    return NumCmps >= 6;
  default:
    return false;
  }
}
/// Return true if the specified operation is illegal on this target or
/// unlikely to be made legal with custom lowering. This is used to help guide
/// high-level lowering decisions.
bool isOperationExpand(unsigned Op, EVT VT) const {
  // An illegal type always counts as "expand".
  if (!isTypeLegal(VT))
    return true;
  return getOperationAction(Op, VT) == Expand;
}
/// Return true if the specified operation is legal on this target.
bool isOperationLegal(unsigned Op, EVT VT) const {
  // The type must be MVT::Other or a legal type.
  if (!(VT == MVT::Other || isTypeLegal(VT)))
    return false;
  return getOperationAction(Op, VT) == Legal;
}
/// Return how this load with extension should be treated: either it is legal,
/// needs to be promoted to a larger size, needs to be expanded to some other
/// code sequence, or the target has a custom expander for it.
LegalizeAction getLoadExtAction(unsigned ExtType, EVT ValVT,
                                EVT MemVT) const {
  // Extended value types are always expanded.
  if (ValVT.isExtended() || MemVT.isExtended())
    return Expand;

  const unsigned ValI = (unsigned) ValVT.getSimpleVT().SimpleTy;
  const unsigned MemI = (unsigned) MemVT.getSimpleVT().SimpleTy;
  assert(ExtType < ISD::LAST_LOADEXT_TYPE && ValI < MVT::LAST_VALUETYPE &&
         MemI < MVT::LAST_VALUETYPE && "Table isn't big enough!");

  // Each table entry packs one 4-bit action per extension type.
  const auto Packed = LoadExtActions[ValI][MemI];
  return static_cast<LegalizeAction>((Packed >> (4 * ExtType)) & 0xf);
}
/// Return true if the specified load with extension is legal on this target.
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const {
return getLoadExtAction(ExtType, ValVT, MemVT) == Legal;
}
/// Return true if the specified load with extension is legal or custom
/// on this target.
bool isLoadExtLegalOrCustom(unsigned ExtType, EVT ValVT, EVT MemVT) const {
return getLoadExtAction(ExtType, ValVT, MemVT) == Legal ||
getLoadExtAction(ExtType, ValVT, MemVT) == Custom;
}
/// Return how this store with truncation should be treated: either it is
/// legal, needs to be promoted to a larger size, needs to be expanded to some
/// other code sequence, or the target has a custom expander for it.
LegalizeAction getTruncStoreAction(EVT ValVT, EVT MemVT) const {
  // Extended value types are always expanded.
  const bool AnyExtended = ValVT.isExtended() || MemVT.isExtended();
  if (AnyExtended)
    return Expand;

  const unsigned ValI = (unsigned) ValVT.getSimpleVT().SimpleTy;
  const unsigned MemI = (unsigned) MemVT.getSimpleVT().SimpleTy;
  assert(ValI < MVT::LAST_VALUETYPE && MemI < MVT::LAST_VALUETYPE &&
         "Table isn't big enough!");
  return TruncStoreActions[ValI][MemI];
}
/// Report whether the given truncating store is legal on this target: the
/// value type must be legal and the recorded truncating-store action Legal.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const {
  if (!isTypeLegal(ValVT))
    return false;
  return getTruncStoreAction(ValVT, MemVT) == Legal;
}
/// Report whether the given truncating store has a solution on this target:
/// the value type must be legal and the recorded truncating-store action must
/// be either Legal or Custom.
bool isTruncStoreLegalOrCustom(EVT ValVT, EVT MemVT) const {
  if (!isTypeLegal(ValVT))
    return false;
  const LegalizeAction Action = getTruncStoreAction(ValVT, MemVT);
  return Action == Legal || Action == Custom;
}
/// Look up how an indexed load of \p VT using \p IdxMode should be treated:
/// legal, promoted to a larger size, expanded to some other code sequence, or
/// custom-lowered by the target.
LegalizeAction getIndexedLoadAction(unsigned IdxMode, MVT VT) const {
  assert(IdxMode < ISD::LAST_INDEXED_MODE && VT.isValid() &&
         "Table isn't big enough!");
  const unsigned TypeIdx = static_cast<unsigned>(VT.SimpleTy);
  // The load action is read from the upper four bits of the table entry.
  const auto LoadBits = IndexedModeActions[TypeIdx][IdxMode] & 0xf0;
  return static_cast<LegalizeAction>(LoadBits >> 4);
}
/// Return true if the specified indexed load is legal on this target.
bool isIndexedLoadLegal(unsigned IdxMode, EVT VT) const {
return VT.isSimple() &&
(getIndexedLoadAction(IdxMode, VT.getSimpleVT()) == Legal ||
getIndexedLoadAction(IdxMode, VT.getSimpleVT()) == Custom);
}
/// Look up how an indexed store of \p VT using \p IdxMode should be treated:
/// legal, promoted to a larger size, expanded to some other code sequence, or
/// custom-lowered by the target.
LegalizeAction getIndexedStoreAction(unsigned IdxMode, MVT VT) const {
  assert(IdxMode < ISD::LAST_INDEXED_MODE && VT.isValid() &&
         "Table isn't big enough!");
  const unsigned TypeIdx = static_cast<unsigned>(VT.SimpleTy);
  // The store action is read from the lower four bits of the table entry.
  const auto StoreBits = IndexedModeActions[TypeIdx][IdxMode] & 0x0f;
  return static_cast<LegalizeAction>(StoreBits);
}
/// Return true if the specified indexed load is legal on this target.
bool isIndexedStoreLegal(unsigned IdxMode, EVT VT) const {
return VT.isSimple() &&
(getIndexedStoreAction(IdxMode, VT.getSimpleVT()) == Legal ||
getIndexedStoreAction(IdxMode, VT.getSimpleVT()) == Custom);
}
/// Look up how condition code \p CC should be treated for \p VT: legal,
/// expanded to some other code sequence, or custom-lowered by the target.
/// Promote is never a valid answer for a condition code (asserted).
LegalizeAction getCondCodeAction(ISD::CondCode CC, MVT VT) const {
  assert((unsigned)CC < array_lengthof(CondCodeActions) &&
         ((unsigned)VT.SimpleTy >> 3) < array_lengthof(CondCodeActions[0]) &&
         "Table isn't big enough!");
  // See setCondCodeAction for how this is encoded: eight value types share
  // one 32-bit word, four bits each.
  const uint32_t Word = CondCodeActions[CC][VT.SimpleTy >> 3];
  const uint32_t BitOffset = 4 * (VT.SimpleTy & 0x7);
  const LegalizeAction Result =
      static_cast<LegalizeAction>((Word >> BitOffset) & 0xF);
  assert(Result != Promote && "Can't promote condition code!");
  return Result;
}
/// Report whether condition code \p CC is Legal for \p VT on this target.
bool isCondCodeLegal(ISD::CondCode CC, MVT VT) const {
  const LegalizeAction Action = getCondCodeAction(CC, VT);
  return Action == Legal;
}
/// Report whether condition code \p CC is either Legal or Custom for \p VT on
/// this target.
bool isCondCodeLegalOrCustom(ISD::CondCode CC, MVT VT) const {
  const LegalizeAction Action = getCondCodeAction(CC, VT);
  return Action == Legal || Action == Custom;
}
/// For an operation whose action is Promote, return the value type it should
/// be promoted to. An explicitly registered target type wins; otherwise the
/// next larger type of the same kind that is legal and not itself promoted is
/// chosen.
MVT getTypeToPromoteTo(unsigned Op, MVT VT) const {
  assert(getOperationAction(Op, VT) == Promote &&
         "This operation isn't promoted!");

  // See if this has an explicit type specified.
  const auto Explicit = PromoteToType.find(std::make_pair(Op, VT.SimpleTy));
  if (Explicit != PromoteToType.end())
    return Explicit->second;

  assert((VT.isInteger() || VT.isFloatingPoint()) &&
         "Cannot autopromote this type, add it with AddPromotedToType.");

  // Step through successive simple value types until one is legal and does
  // not itself request promotion for this operation.
  MVT Candidate = VT;
  for (;;) {
    Candidate = static_cast<MVT::SimpleValueType>(Candidate.SimpleTy + 1);
    assert(Candidate.isInteger() == VT.isInteger() &&
           Candidate != MVT::isVoid && "Didn't find type to promote to!");
    if (isTypeLegal(Candidate) && getOperationAction(Op, Candidate) != Promote)
      break;
  }
  return Candidate;
}
/// Return the EVT corresponding to this LLVM type. This is fixed by the LLVM
/// operations except for the pointer size. If \p AllowUnknown is true, this
/// will return MVT::Other for types with no EVT counterpart (e.g. structs),
/// otherwise it will assert.
EVT getValueType(const DataLayout &DL, Type *Ty,
                 bool AllowUnknown = false) const {
  // Lower scalar pointers to native pointer types.
  if (auto *ScalarPtrTy = dyn_cast<PointerType>(Ty))
    return getPointerTy(DL, ScalarPtrTy->getAddressSpace());

  auto *VecTy = dyn_cast<VectorType>(Ty);
  if (!VecTy)
    return EVT::getEVT(Ty, AllowUnknown);

  // Lower vectors of pointers to native pointer types.
  Type *ElemTy = VecTy->getElementType();
  if (auto *ElemPtrTy = dyn_cast<PointerType>(ElemTy)) {
    EVT NativePtrVT(getPointerTy(DL, ElemPtrTy->getAddressSpace()));
    ElemTy = NativePtrVT.getTypeForEVT(Ty->getContext());
  }
  return EVT::getVectorVT(Ty->getContext(), EVT::getEVT(ElemTy, false),
                          VecTy->getNumElements());
}
/// Variant of getValueType that maps pointers through getPointerMemTy instead
/// of getPointerTy. Scalar pointers and vectors (including vectors of
/// pointers) are handled here; every other type is forwarded to getValueType
/// together with \p AllowUnknown.
EVT getMemValueType(const DataLayout &DL, Type *Ty,
                    bool AllowUnknown = false) const {
  // Lower scalar pointers to native pointer types.
  if (auto *ScalarPtrTy = dyn_cast<PointerType>(Ty))
    return getPointerMemTy(DL, ScalarPtrTy->getAddressSpace());

  auto *VecTy = dyn_cast<VectorType>(Ty);
  if (!VecTy)
    return getValueType(DL, Ty, AllowUnknown);

  // Pointer elements are replaced by the IR type of the pointer memory type.
  Type *ElemTy = VecTy->getElementType();
  if (auto *ElemPtrTy = dyn_cast<PointerType>(ElemTy)) {
    EVT NativePtrVT(getPointerMemTy(DL, ElemPtrTy->getAddressSpace()));
    ElemTy = NativePtrVT.getTypeForEVT(Ty->getContext());
  }
  return EVT::getVectorVT(Ty->getContext(), EVT::getEVT(ElemTy, false),
                          VecTy->getNumElements());
}
/// Return the MVT corresponding to this LLVM type: the result of getValueType
/// converted with getSimpleVT(). See getValueType.
MVT getSimpleValueType(const DataLayout &DL, Type *Ty,
                       bool AllowUnknown = false) const {
  const EVT VT = getValueType(DL, Ty, AllowUnknown);
  return VT.getSimpleVT();
}
/// Return the desired alignment for ByVal or InAlloca aggregate function
/// arguments in the caller parameter area. This is the actual alignment, not
/// its logarithm.
///
/// Virtual; only declared here, the default implementation is out of line.
virtual unsigned getByValTypeAlignment(Type *Ty, const DataLayout &DL) const;
/// Return the register type that simple value type \p VT will eventually
/// require, as recorded in RegisterTypeForVT.
MVT getRegisterType(MVT VT) const {
  const MVT::SimpleValueType Simple = VT.SimpleTy;
  assert((unsigned)Simple < array_lengthof(RegisterTypeForVT));
  return RegisterTypeForVT[Simple];
}
/// Return the register type that \p VT will eventually require. Simple types
/// are read from RegisterTypeForVT, extended vectors use the register type
/// reported by getVectorTypeBreakdown, and extended integers recurse on the
/// type they are transformed to. Any other extended type is unsupported.
MVT getRegisterType(LLVMContext &Context, EVT VT) const {
  if (VT.isSimple()) {
    const MVT::SimpleValueType Simple = VT.getSimpleVT().SimpleTy;
    assert((unsigned)Simple < array_lengthof(RegisterTypeForVT));
    return RegisterTypeForVT[Simple];
  }

  if (VT.isVector()) {
    // Only the register type out-parameter is of interest here.
    EVT IntermediateVT;
    MVT RegVT;
    unsigned NumIntermediates;
    (void)getVectorTypeBreakdown(Context, VT, IntermediateVT, NumIntermediates,
                                 RegVT);
    return RegVT;
  }

  if (VT.isInteger())
    return getRegisterType(Context, getTypeToTransformTo(Context, VT));

  llvm_unreachable("Unsupported extended type!");
}
/// Return the number of registers that this ValueType will eventually
/// require.
///
/// This is one for any types promoted to live in larger registers, but may be
/// more than one for types (like i64) that are split into pieces. For types
/// like i140, which are first promoted then expanded, it is the number of
/// registers needed to hold all the bits of the original type. For an i140
/// on a 32 bit machine this means 5 registers.
unsigned getNumRegisters(LLVMContext &Context, EVT VT) const {
  if (VT.isSimple()) {
    const MVT::SimpleValueType Simple = VT.getSimpleVT().SimpleTy;
    assert((unsigned)Simple < array_lengthof(NumRegistersForVT));
    return NumRegistersForVT[Simple];
  }

  if (VT.isVector()) {
    // The breakdown's return value is the register count; the out-parameters
    // are not needed.
    EVT IntermediateVT;
    MVT RegVT;
    unsigned NumIntermediates;
    return getVectorTypeBreakdown(Context, VT, IntermediateVT,
                                  NumIntermediates, RegVT);
  }

  if (VT.isInteger()) {
    // Round up: enough registers to hold every bit of the original type.
    const unsigned TotalBits = VT.getSizeInBits();
    const unsigned BitsPerReg = getRegisterType(Context, VT).getSizeInBits();
    return (TotalBits + BitsPerReg - 1) / BitsPerReg;
  }

  llvm_unreachable("Unsupported extended type!");
}
/// Certain combinations of ABIs, Targets and features require that types
/// are legal for some operations and not for other operations.
/// For MIPS all vector types must be passed through the integer register set.
///
/// The default ignores \p CC and forwards to getRegisterType.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context,
                                          CallingConv::ID CC, EVT VT) const {
  // Default: the calling convention does not affect the register type.
  return getRegisterType(Context, VT);
}
/// Certain targets require unusual breakdowns of certain types. For MIPS,
/// this occurs when a vector type is used, as vectors are passed through the
/// integer register set.
///
/// The default ignores \p CC and forwards to getNumRegisters.
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context,
                                               CallingConv::ID CC,
                                               EVT VT) const {
  // Default: the calling convention does not affect the register count.
  return getNumRegisters(Context, VT);
}
/// Certain targets have context-sensitive alignment requirements, where one
/// type has the alignment requirement of another type.
///
/// The default returns the ABI type alignment of \p ArgTy from \p DL.
/// NOTE(review): \p DL is taken by value, so the DataLayout is copied per
/// call; left unchanged because altering the parameter type would change the
/// virtual signature that overriders must match.
virtual unsigned getABIAlignmentForCallingConv(Type *ArgTy,
                                               DataLayout DL) const {
  const unsigned ABIAlign = DL.getABITypeAlignment(ArgTy);
  return ABIAlign;
}
/// If true, then instruction selection should seek to shrink the FP constant
/// of the specified type to a smaller type in order to save space and / or
/// reduce runtime. The default answers true for every type.
virtual bool ShouldShrinkFPConstant(EVT) const {
  return true;
}
/// Return true if it is profitable to reduce a load to a smaller type.
/// Example: (i16 (trunc (i32 (load x))) -> i16 load x
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
                                   EVT NewVT) const {
  // By default, assume that it is cheaper to extract a subvector from a wide
  // vector load rather than creating multiple narrow vector loads, so only a
  // vector result from a load with more than one use is rejected.
  const bool MultiUseVectorLoad = NewVT.isVector() && !Load->hasOneUse();
  return !MultiUseVectorLoad;
}
/// When splitting a value of the specified type into parts, does the Lo
/// or Hi part come first? This usually follows the endianness, except
/// for ppcf128, where the Hi part always comes first.
bool hasBigEndianPartOrdering(EVT VT, const DataLayout &DL) const {
  if (DL.isBigEndian())
    return true;
  // ppcf128 uses Hi-first ordering regardless of the data layout.
  return VT == MVT::ppcf128;
}
/// If true, the target has custom DAG combine transformations that it can
/// perform for the specified node.
bool hasTargetDAGCombine(ISD::NodeType NT) const {
  assert(unsigned(NT >> 3) < array_lengthof(TargetDAGCombineArray));
  // One bit per node type: NT >> 3 selects the array element, NT & 7 the bit.
  const int BitMask = 1 << (NT & 7);
  return TargetDAGCombineArray[NT >> 3] & BitMask;
}
/// Return the stored GatherAllAliasesMaxDepth setting.
unsigned getGatherAllAliasesMaxDepth() const { return GatherAllAliasesMaxDepth; }
/// Returns the size of the platform's va_list object. The default is the bit
/// size of the pointer type returned by getPointerTy for \p DL.
virtual unsigned getVaListSizeInBits(const DataLayout &DL) const {
  const auto PtrVT = getPointerTy(DL);
  return PtrVT.getSizeInBits();
}
/// Get maximum # of store operations permitted for llvm.memset
///
/// Returns the upper bound on the number of store operations that may replace
/// a call to llvm.memset. The target sets the value at the performance
/// threshold for such a replacement. When \p OptSize is true the limit for
/// functions carrying the OptSize attribute is returned instead.
unsigned getMaxStoresPerMemset(bool OptSize) const {
  if (OptSize)
    return MaxStoresPerMemsetOptSize;
  return MaxStoresPerMemset;
}
/// Get maximum # of store operations permitted for llvm.memcpy
///
/// Returns the upper bound on the number of store operations that may replace
/// a call to llvm.memcpy. The target sets the value at the performance
/// threshold for such a replacement. When \p OptSize is true the limit for
/// functions carrying the OptSize attribute is returned instead.
unsigned getMaxStoresPerMemcpy(bool OptSize) const {
  if (OptSize)
    return MaxStoresPerMemcpyOptSize;
  return MaxStoresPerMemcpy;
}
/// \brief Get maximum # of store operations to be glued together
///
/// Returns the upper bound on the number of store operations that may be
/// glued together during lowering of llvm.memcpy. The target sets the value
/// at the performance threshold for such a replacement.
virtual unsigned getMaxGluedStoresPerMemcpy() const {
  // Default: report the stored setting unchanged.
  return MaxGluedStoresPerMemcpy;
}
/// Get maximum # of load operations permitted for memcmp
///
/// Returns the upper bound on the number of load operations that may replace
/// a call to memcmp. The target sets the value at the performance threshold
/// for such a replacement. When \p OptSize is true the limit for functions
/// carrying the OptSize attribute is returned instead.
unsigned getMaxExpandSizeMemcmp(bool OptSize) const {
  if (OptSize)
    return MaxLoadsPerMemcmpOptSize;
  return MaxLoadsPerMemcmp;
}
/// Get maximum # of store operations permitted for llvm.memmove
///
/// Returns the upper bound on the number of store operations that may replace
/// a call to llvm.memmove. The target sets the value at the performance
/// threshold for such a replacement. When \p OptSize is true the limit for
/// functions carrying the OptSize attribute is returned instead.
unsigned getMaxStoresPerMemmove(bool OptSize) const {
  if (OptSize)
    return MaxStoresPerMemmoveOptSize;
  return MaxStoresPerMemmove;
}
/// Determine if the target supports unaligned memory accesses.
///
/// Returns true if the target allows unaligned memory accesses of the
/// specified type in the given address space. When it does, the last argument
/// additionally reports (by pointer) whether such an access is "fast". This
/// is used, for example, where an array copy/move/set is converted to a
/// sequence of store operations, to ensure such replacements don't generate
/// code that causes an alignment error (trap) on the target machine.
///
/// The default rejects every misaligned access and leaves the final argument
/// untouched.
virtual bool allowsMisalignedMemoryAccesses(
    EVT, unsigned AddrSpace = 0, unsigned Align = 1,
    MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
    bool * /*Fast*/ = nullptr) const {
  return false;
}
/// Return true if the target supports a memory access of this type for the
/// given address space and alignment. If the access is allowed, the optional
/// final parameter returns if the access is also fast (as defined by the
/// target).
///
/// Only declared here; the definition is out of line. \p Fast may be left as
/// nullptr when the caller does not need the speed hint.
bool
allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT,
unsigned AddrSpace = 0, unsigned Alignment = 1,
MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
bool *Fast = nullptr) const;
/// Return true if the target supports a memory access of this type for the
/// given MachineMemOperand. If the access is allowed, the optional
/// final parameter returns if the access is also fast (as defined by the
/// target).
///
/// Only declared here; the definition is out of line. \p Fast may be left as
/// nullptr when the caller does not need the speed hint.
bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT,
const MachineMemOperand &MMO,
bool *Fast = nullptr) const;
/// Returns the target specific optimal type for load and store operations as
/// a result of memset, memcpy, and memmove lowering.
///
/// If DstAlign is zero, the destination alignment can satisfy any constraint.
/// Similarly, if SrcAlign is zero there is no need to check it against the
/// alignment requirement, probably because the source does not need to be
/// loaded. If 'IsMemset' is true, a memset is being expanded. If 'ZeroMemset'
/// is true, it is a memset of zero. 'MemcpyStrSrc' indicates whether the
/// memcpy source is constant so it does not need to be loaded. MVT::Other is
/// returned when the type should be determined using generic
/// target-independent logic, which is what the default always does.
virtual EVT getOptimalMemOpType(
    uint64_t /*Size*/, unsigned /*DstAlign*/, unsigned /*SrcAlign*/,
    bool /*IsMemset*/, bool /*ZeroMemset*/, bool /*MemcpyStrSrc*/,
    const AttributeList & /*FuncAttributes*/) const {
  return MVT::Other;
}
/// Returns true if it's safe to use load / store of the specified type to
/// expand memcpy / memset inline.
///
/// This is mostly true for all types except for some special cases. For
/// example, on X86 targets without SSE2 f64 load / store are done with fldl /
/// fstpl which also does type conversion. Note the specified type doesn't
/// have to be legal as the hook is used before type legalization. The default
/// treats every type as safe.
virtual bool isSafeMemOpType(MVT /*VT*/) const {
  return true;
}
/// Report whether llvm.setjmp should be implemented with _setjmp (true) or
/// setjmp (false).
bool usesUnderscoreSetJmp() const { return UseUnderscoreSetJmp; }
/// Report whether llvm.longjmp should be implemented with _longjmp (true) or
/// longjmp (false).
bool usesUnderscoreLongJmp() const { return UseUnderscoreLongJmp; }
/// Return lower limit for number of blocks in a jump table.
/// Virtual; only declared here, the definition is out of line.
virtual unsigned getMinimumJumpTableEntries() const;
/// Return lower limit of the density in a jump table.
/// \p OptForSize selects which limit is reported (presumably the one used
/// when optimizing for size — confirm against the out-of-line definition).
unsigned getMinimumJumpTableDensity(bool OptForSize) const;
/// Return upper limit for number of entries in a jump table.
/// Zero if no limit. Only declared here; the definition is out of line.
unsigned getMaximumJumpTableSize() const;
/// Report whether jump tables should be relative (reading taken from the
/// hook's name — confirm against callers). The default mirrors whether the
/// target machine is position independent.
virtual bool isJumpTableRelative() const {
  const bool IsPIC = TM.isPositionIndependent();
  return IsPIC;
}
/// If a physical register, this specifies the register that
/// llvm.savestack/llvm.restorestack should save and restore. Returns the
/// stored StackPointerRegisterToSaveRestore value.
unsigned getStackPointerRegisterToSaveRestore() const {
  return StackPointerRegisterToSaveRestore;
}
/// If a physical register, this returns the register that receives the
/// exception address on entry to an EH pad. The default reports no register.
virtual unsigned
getExceptionPointerRegister(const Constant *PersonalityFn) const {
  // 0 is guaranteed to be the NoRegister value on all targets
  const unsigned NoRegister = 0;
  return NoRegister;
}
/// If a physical register, this returns the register that receives the
/// exception typeid on entry to a landing pad. The default reports no
/// register.
virtual unsigned
getExceptionSelectorRegister(const Constant *PersonalityFn) const {
  // 0 is guaranteed to be the NoRegister value on all targets
  const unsigned NoRegister = 0;
  return NoRegister;
}
/// Hook for targets that implement funclet-based EH. The default does not
/// return a value: it reports a fatal error because funclet EH is not
/// implemented for the target.
virtual bool needsFixedCatchObjects() const {
  // Targets supporting funclet EH are expected to override this.
  report_fatal_error("Funclet EH is not implemented for this target");
}
/// Returns the target's jmp_buf size in bytes (if never set, the default is
/// 200).
unsigned getJumpBufSize() const { return JumpBufSize; }
/// Returns the target's jmp_buf alignment in bytes (if never set, the default
/// is 0).
unsigned getJumpBufAlignment() const { return JumpBufAlignment; }
/// Return the minimum stack alignment of an argument, as stored in
/// MinStackArgumentAlignment.
unsigned getMinStackArgumentAlignment() const { return MinStackArgumentAlignment; }
/// Return the minimum function alignment, as stored in MinFunctionAlignment.
unsigned getMinFunctionAlignment() const { return MinFunctionAlignment; }
/// Return the preferred function alignment, as stored in
/// PrefFunctionAlignment.
unsigned getPrefFunctionAlignment() const { return PrefFunctionAlignment; }
/// Return the preferred loop alignment. The default ignores \p ML and returns
/// the stored PrefLoopAlignment.
virtual unsigned getPrefLoopAlignment(MachineLoop *ML = nullptr) const {
  // The loop argument exists for overriders; the default does not use it.
  return PrefLoopAlignment;
}
/// Should loops be aligned even when the function is marked OptSize (but not
/// MinSize). The default answers no.
virtual bool alignLoopsWithOptSize() const { return false; }
/// If the target has a standard location for the stack protector guard,
/// returns the address of that location. Otherwise, returns nullptr.
/// DEPRECATED: please override useLoadStackGuardNode and customize
/// LOAD_STACK_GUARD, or customize \@llvm.stackguard().
///
/// Virtual; only declared here, the default implementation is out of line.
virtual Value *getIRStackGuard(IRBuilder<> &IRB) const;
/// Inserts necessary declarations for SSP (stack protection) purpose.
/// Should be used only when getIRStackGuard returns nullptr.
/// The declarations are added to module \p M; the definition is out of line.
virtual void insertSSPDeclarations(Module &M) const;
/// Return the variable that's previously inserted by insertSSPDeclarations,
/// if any, otherwise return nullptr. Should be used only when
/// getIRStackGuard returns nullptr.
/// Virtual; only declared here, the default implementation is out of line.
virtual Value *getSDagStackGuard(const Module &M) const;
/// If this function returns true, stack protection checks should XOR the
/// frame pointer (or whichever pointer is used to address locals) into the
/// stack guard value before checking it. getIRStackGuard must return nullptr
/// if this returns true. The default answers false.
virtual bool useStackGuardXorFP() const {
  return false;
}
/// If the target has a standard stack protection check function that
/// performs validation and error handling, returns the function. Otherwise,
/// returns nullptr. Must be previously inserted by insertSSPDeclarations.
/// Should be used only when getIRStackGuard returns nullptr.
/// Virtual; only declared here, the default implementation is out of line.
virtual Function *getSSPStackGuardCheck(const Module &M) const;
protected:
/// Protected helper, defined out of line.
/// NOTE(review): presumably produces the default location of the unsafe stack
/// pointer, with \p UseTLS choosing a thread-local variant — confirm against
/// the definition.
Value *getDefaultSafeStackPointerLocation(IRBuilder<> &IRB,
bool UseTLS) const;
public:
/// Returns the target-specific address of the unsafe stack pointer.
/// Virtual; only declared here, the default implementation is out of line.
virtual Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const;
/// Returns the name of the symbol used to emit stack probes or the empty
/// string if not applicable. The default always yields the empty string.
virtual StringRef getStackProbeSymbolName(MachineFunction &MF) const { return ""; }
/// Returns true if a cast between SrcAS and DestAS is a noop. The default
/// treats no address-space cast as a noop.
virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { return false; }
/// Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g. we
/// are happy to sink it into basic blocks. A cast may be free, but not
/// necessarily a no-op. e.g. a free truncate from a 64-bit to 32-bit pointer.
/// By default a cast is considered free exactly when it is a noop.
virtual bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const {
  const bool IsNoop = isNoopAddrSpaceCast(SrcAS, DestAS);
  return IsNoop;
}
/// Return true if the pointer arguments to CI should be aligned by aligning
/// the object whose address is being passed. If so then MinSize is set to the
/// minimum size the object must be to be aligned and PrefAlign is set to the
/// preferred alignment. The default returns false and writes to neither
/// output.
virtual bool shouldAlignPointerArgs(CallInst * /*CI*/,
                                    unsigned & /*MinSize*/,
                                    unsigned & /*PrefAlign*/) const {
  return false;
}
//===--------------------------------------------------------------------===//
/// \name Helpers for TargetTransformInfo implementations
/// @{
/// Get the ISD node that corresponds to the Instruction class opcode.
/// Only declared here; the definition is out of line.
int InstructionOpcodeToISD(unsigned Opcode) const;
/// Estimate the cost of type-legalization and the legalized type.
/// The result pairs the cost estimate (first) with the legalized MVT
/// (second). Only declared here; the definition is out of line.
std::pair<int, MVT> getTypeLegalizationCost(const DataLayout &DL,
Type *Ty) const;
/// @}
//===--------------------------------------------------------------------===//
/// \name Helpers for atomic expansion.
/// @{
/// Returns the maximum atomic operation size (in bits) supported by
/// the backend. Atomic operations greater than this size (as well
/// as ones that are not naturally aligned), will be expanded by
/// AtomicExpandPass into an __atomic_* library call.
unsigned getMaxAtomicSizeInBitsSupported() const { return MaxAtomicSizeInBitsSupported; }
/// Returns the size of the smallest cmpxchg or ll/sc instruction
/// the backend supports. Any smaller operations are widened in
/// AtomicExpandPass.
///
/// Note that *unlike* operations above the maximum size, atomic ops
/// are still natively supported below the minimum; they just
/// require a more complex expansion.
unsigned getMinCmpXchgSizeInBits() const {
  return MinCmpXchgSizeInBits;
}
/// Whether the target supports unaligned atomic operations, as stored in
/// SupportsUnalignedAtomics.
bool supportsUnalignedAtomics() const {
  return SupportsUnalignedAtomics;
}
/// Whether AtomicExpandPass should automatically insert fences and reduce
/// ordering for this atomic. This should be true for most architectures with
/// weak memory ordering. Defaults to false.
virtual bool shouldInsertFencesForAtomic(const Instruction *I) const { return false; }
/// Perform a load-linked operation on Addr, returning a "Value *" with the
/// corresponding pointee type. This may entail some non-trivial operations to
/// truncate or reconstruct types that will be illegal in the backend. See
/// ARMISelLowering for an example implementation.
///
/// The default is unimplemented and must not be reached.
virtual Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                              AtomicOrdering Ord) const {
  // Targets using LL/SC expansion are expected to override this hook.
  llvm_unreachable("Load linked unimplemented on this target");
}
/// Perform a store-conditional operation to Addr. Return the status of the
/// store. This should be 0 if the store succeeded, non-zero otherwise.
///
/// The default is unimplemented and must not be reached.
virtual Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
                                    Value *Addr, AtomicOrdering Ord) const {
  // Targets using LL/SC expansion are expected to override this hook.
  llvm_unreachable("Store conditional unimplemented on this target");
}
/// Perform a masked atomicrmw using a target-specific intrinsic. This
/// represents the core LL/SC loop which will be lowered at a late stage by
/// the backend.
///
/// The default is unimplemented and must not be reached.
virtual Value *emitMaskedAtomicRMWIntrinsic(IRBuilder<> &Builder,
                                            AtomicRMWInst *AI,
                                            Value *AlignedAddr, Value *Incr,
                                            Value *Mask, Value *ShiftAmt,
                                            AtomicOrdering Ord) const {
  // Targets requesting this expansion are expected to override this hook.
  llvm_unreachable("Masked atomicrmw expansion unimplemented on this target");
}
/// Perform a masked cmpxchg using a target-specific intrinsic. This
/// represents the core LL/SC loop which will be lowered at a late stage by
/// the backend.
///
/// The default is unimplemented and must not be reached.
virtual Value *emitMaskedAtomicCmpXchgIntrinsic(IRBuilder<> &Builder,
                                                AtomicCmpXchgInst *CI,
                                                Value *AlignedAddr,
                                                Value *CmpVal, Value *NewVal,
                                                Value *Mask,
                                                AtomicOrdering Ord) const {
  // Targets requesting this expansion are expected to override this hook.
  llvm_unreachable("Masked cmpxchg expansion unimplemented on this target");
}
/// Inserts in the IR a target-specific intrinsic specifying a fence.
/// It is called by AtomicExpandPass before expanding an
/// AtomicRMW/AtomicCmpXchg/AtomicStore/AtomicLoad
/// if shouldInsertFencesForAtomic returns true.
///
/// Inst is the original atomic instruction, prior to other expansions that
/// may be performed.
///
/// This function should either return a nullptr, or a pointer to an IR-level
/// Instruction*. Even complex fence sequences can be represented by a
/// single Instruction* through an intrinsic to be lowered later.
/// Backends should override this method to produce target-specific intrinsic
/// for their fences.
/// FIXME: Please note that the default implementation here in terms of
/// IR-level fences exists for historical/compatibility reasons and is
/// *unsound* ! Fences cannot, in general, be used to restore sequential
/// consistency. Consider the following example:
/// atomic<int> x = y = 0;
/// int r1, r2, r3, r4;
/// Thread 0:
/// x.store(1);
/// Thread 1:
/// y.store(1);
/// Thread 2:
/// r1 = x.load();
/// r2 = y.load();
/// Thread 3:
/// r3 = y.load();
/// r4 = x.load();
/// r1 = r3 = 1 and r2 = r4 = 0 is impossible as long as the accesses are all
/// seq_cst. But if they are lowered to monotonic accesses, no amount of
/// IR-level fences can prevent it.
/// @{
virtual Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst,
                                      AtomicOrdering Ord) const {
  // Default: an IR fence of the same ordering is created only for orderings
  // that are release or stronger on instructions with an atomic store.
  if (!isReleaseOrStronger(Ord) || !Inst->hasAtomicStore())
    return nullptr;
  return Builder.CreateFence(Ord);
}
virtual Instruction *emitTrailingFence(IRBuilder<> &Builder,
                                       Instruction *Inst,
                                       AtomicOrdering Ord) const {
  // Default: an IR fence of the same ordering is created only for orderings
  // that are acquire or stronger; Inst is not consulted.
  if (!isAcquireOrStronger(Ord))
    return nullptr;
  return Builder.CreateFence(Ord);
}
/// @}
// Emits code that executes when the comparison result in the ll/sc
// expansion of a cmpxchg instruction is such that the store-conditional will
// not execute. This makes it possible to balance out the load-linked with
// a dedicated instruction, if desired.
// E.g., on ARM, if ldrex isn't followed by strex, the exclusive monitor would
// be unnecessarily held, except if clrex, inserted by this hook, is executed.
// The default emits nothing.
virtual void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const {
}
/// Returns true if the given (atomic) store should be expanded by the
/// IR-level AtomicExpand pass into an "atomic xchg" which ignores its input.
/// The default answers false.
virtual bool shouldExpandAtomicStoreInIR(StoreInst *SI) const { return false; }
/// Returns true if arguments should be sign-extended in lib calls. The
/// default ignores \p Type and simply echoes \p IsSigned.
virtual bool shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const { return IsSigned; }
/// Returns how the given (atomic) load should be expanded by the
/// IR-level AtomicExpand pass. The default requests no expansion.
virtual AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const {
  const AtomicExpansionKind Kind = AtomicExpansionKind::None;
  return Kind;
}
/// Returns how the given atomic cmpxchg should be expanded by the IR-level
/// AtomicExpand pass. The default requests no expansion.
virtual AtomicExpansionKind
shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
  const AtomicExpansionKind Kind = AtomicExpansionKind::None;
  return Kind;
}
/// Returns how the IR-level AtomicExpand pass should expand the given
/// AtomicRMW, if at all. By default floating-point operations are expanded
/// via CmpXChg and every other operation is left unexpanded.
virtual AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
  if (RMW->isFloatingPointOperation())
    return AtomicExpansionKind::CmpXChg;
  return AtomicExpansionKind::None;
}
/// On some platforms, an AtomicRMW that never actually modifies the value
/// (such as fetch_add of 0) can be turned into a fence followed by an
/// atomic load. This may sound useless, but it makes it possible for the
/// processor to keep the cacheline shared, dramatically improving
/// performance. And such idempotent RMWs are useful for implementing some
/// kinds of locks, see for example (justification + benchmarks):
/// http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf
/// This method tries doing that transformation, returning the atomic load if
/// it succeeds, and nullptr otherwise.
/// If shouldExpandAtomicLoadInIR requests an expansion for that load, it will
/// undergo another round of expansion.
///
/// The default never performs the transformation.
virtual LoadInst *lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *RMWI) const {
  return nullptr;
}
/// Returns how the platform's atomic operations are extended (ZERO_EXTEND,
/// SIGN_EXTEND, or ANY_EXTEND). The default is ZERO_EXTEND.
virtual ISD::NodeType getExtendForAtomicOps() const { return ISD::ZERO_EXTEND; }
/// @}
/// Returns true if we should normalize
/// select(N0&N1, X, Y) => select(N0, select(N1, X, Y), Y) and
/// select(N0|N1, X, Y) => select(N0, X, select(N1, X, Y)) if it is likely
/// that it saves us from materializing N0 and N1 in an integer register.
/// Targets that are able to perform and/or on flags should return false here.
virtual bool shouldNormalizeToSelectSequence(LLVMContext &Context,
                                             EVT VT) const {
  // If a target has multiple condition registers, then it likely has logical
  // operations on those registers.
  if (hasMultipleConditionRegisters())
    return false;

  // Only do the transform if the value won't be split into multiple
  // registers.
  switch (getTypeAction(Context, VT)) {
  case TypeExpandInteger:
  case TypeExpandFloat:
  case TypeSplitVector:
    return false;
  default:
    return true;
  }
}
/// Target hook whose default answers true for every \p VT.
/// NOTE(review): presumably gates combining into min-num / max-num nodes —
/// confirm against the DAG combiner before relying on this description.
virtual bool isProfitableToCombineMinNumMaxNum(EVT VT) const {
  return true;
}
/// Return true if a select of constants (select Cond, C1, C2) should be
/// transformed into simple math ops with the condition value. For example:
/// select Cond, C1, C1-1 --> add (zext Cond), C1-1
/// The default answers false.
virtual bool convertSelectOfConstantsToMath(EVT VT) const { return false; }
/// Return true if it is profitable to transform an integer
/// multiplication-by-constant into simpler operations like shifts and adds.
/// This may be true if the target does not directly support the
/// multiplication operation for the specified type or the sequence of simpler
/// ops is faster than the multiply. The default answers false.
virtual bool decomposeMulByConstant(EVT VT, SDValue C) const { return false; }
/// Return true if it is more correct/profitable to use strict FP_TO_INT
/// conversion operations - canonicalizing the FP source value instead of
/// converting all cases and then selecting based on value.
/// This may be true if the target throws exceptions for out of bounds
/// conversions or has fast FP CMOV. The default answers false.
virtual bool shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT, bool IsSigned) const {
  return false;
}
//===--------------------------------------------------------------------===//
// TargetLowering Configuration Methods - These methods should be invoked by
// the derived class constructor to configure this object for the target.
//
protected:
/// Specify how the target extends the result of integer and floating point
/// boolean values from i1 to a wider type. See getBooleanContents. The same
/// \p Ty is applied to both the integer and the floating point setting.
void setBooleanContents(BooleanContent Ty) {
  BooleanContents = BooleanFloatContents = Ty;
}
/// Specify how the target extends the result of integer and floating point
/// boolean values from i1 to a wider type. See getBooleanContents. \p IntTy
/// is stored for the integer setting and \p FloatTy for the floating point
/// setting.
void setBooleanContents(BooleanContent IntTy, BooleanContent FloatTy) {
  BooleanFloatContents = FloatTy;
  BooleanContents = IntTy;
}
/// Specify how the target extends the result of a vector boolean value from a
/// vector of i1 to a wider type. See getBooleanContents.
void setBooleanVectorContents(BooleanContent Ty) { BooleanVectorContents = Ty; }
/// Specify the target scheduling preference; stored in SchedPreferenceInfo.
void setSchedulingPreference(Sched::Preference Pref) { SchedPreferenceInfo = Pref; }
/// Choose the routine used to implement llvm.setjmp: true selects _setjmp,
/// false the version without the underscore. Defaults to false.
void setUseUnderscoreSetJmp(bool Val) { UseUnderscoreSetJmp = Val; }
/// Choose the routine used to implement llvm.longjmp: true selects _longjmp,
/// false the version without the underscore. Defaults to false.
void setUseUnderscoreLongJmp(bool Val) { UseUnderscoreLongJmp = Val; }
/// Indicate the minimum number of blocks to generate jump tables.
void setMinimumJumpTableEntries(unsigned Val);
/// Indicate the maximum number of entries in jump tables.
/// Set to zero to generate unlimited jump tables.
void setMaximumJumpTableSize(unsigned);
/// If set to a physical register, this specifies the register that
/// llvm.savestack/llvm.restorestack should save and restore.
/// \param R Register number recorded in StackPointerRegisterToSaveRestore.
void setStackPointerRegisterToSaveRestore(unsigned R) {
StackPointerRegisterToSaveRestore = R;
}
/// Tells the code generator that the target has multiple (allocatable)
/// condition registers that can be used to store the results of comparisons
/// for use by selects and conditional branches. With multiple condition
/// registers, the code generator will not aggressively sink comparisons into
/// the blocks of their users.
/// \param hasManyRegs Value stored in HasMultipleConditionRegisters; defaults
/// to true, so a bare call enables the property.
void setHasMultipleConditionRegisters(bool hasManyRegs = true) {
HasMultipleConditionRegisters = hasManyRegs;
}
/// Tells the code generator that the target has BitExtract instructions.
/// The code generator will aggressively sink "shift"s into the blocks of
/// their users if the users will generate "and" instructions which can be
/// combined with "shift" to BitExtract instructions.
/// \param hasExtractInsn Value stored in HasExtractBitsInsn; defaults to true,
/// so a bare call enables the property.
void setHasExtractBitsInsn(bool hasExtractInsn = true) {
HasExtractBitsInsn = hasExtractInsn;
}
/// Tells the code generator not to expand logic operations on comparison
/// predicates into separate sequences that increase the amount of flow
/// control.
void setJumpIsExpensive(bool isExpensive = true);
/// Tells the code generator which bitwidths to bypass: the entry for
/// \p SlowBitWidth in BypassSlowDivWidths is set to \p FastBitWidth, replacing
/// any value previously stored under the same key.
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth) {
  unsigned int &Entry = BypassSlowDivWidths[SlowBitWidth];
  Entry = FastBitWidth;
}
/// Make \p RC an available register class for values of type \p VT. This
/// indicates the selector can handle values of that class natively.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC) {
  assert((unsigned)VT.SimpleTy < array_lengthof(RegClassForVT));
  const TargetRegisterClass *&Slot = RegClassForVT[VT.SimpleTy];
  Slot = RC;
}
/// Return the largest legal super-reg register class of the register class
/// for the specified type and its associated "cost".
virtual std::pair<const TargetRegisterClass *, uint8_t>
findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const;
/// Once all of the register classes are added, this allows us to compute
/// derived properties we expose.
void computeRegisterProperties(const TargetRegisterInfo *TRI);
/// Indicate that the specified operation does not work with the specified
/// type and indicate what to do about it. Note that VT may refer to either
/// the type of a result or that of an operand of Op.
///
/// \param Op     Opcode; indexes the second dimension of OpActions.
/// \param VT     Simple value type; indexes the first dimension of OpActions.
/// \param Action Legalize action recorded for the (VT, Op) pair.
void setOperationAction(unsigned Op, MVT VT,
                        LegalizeAction Action) {
  assert(Op < array_lengthof(OpActions[0]) && "Table isn't big enough!");
  // Bounds-check the row index as well (addRegisterClass does the same for
  // its table): an out-of-range SimpleTy would otherwise write past the end
  // of OpActions.
  assert((unsigned)VT.SimpleTy < array_lengthof(OpActions) &&
         "Table isn't big enough!");
  OpActions[(unsigned)VT.SimpleTy][Op] = Action;
}
/// Record that a load with extension kind \p ExtType, producing \p ValVT from
/// memory type \p MemVT, does not work as-is and what to do about it.
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT,
                      LegalizeAction Action) {
  assert(ExtType < ISD::LAST_LOADEXT_TYPE && ValVT.isValid() &&
         MemVT.isValid() && "Table isn't big enough!");
  assert((unsigned)Action < 0x10 && "too many bits for bitfield array");
  // Each extension type owns one 4-bit field inside the 16-bit table entry.
  const unsigned FieldShift = 4 * ExtType;
  uint16_t &Packed = LoadExtActions[ValVT.SimpleTy][MemVT.SimpleTy];
  Packed &= ~((uint16_t)0xF << FieldShift); // Clear the old action.
  Packed |= (uint16_t)Action << FieldShift; // Store the new one.
}
/// Record that a truncating store of a \p ValVT value to memory type
/// \p MemVT does not work as-is and what to do about it.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action) {
  assert(ValVT.isValid() && MemVT.isValid() && "Table isn't big enough!");
  auto &Slot = TruncStoreActions[(unsigned)ValVT.SimpleTy][MemVT.SimpleTy];
  Slot = Action;
}
/// Record whether the indexed load with mode \p IdxMode works with type
/// \p VT, and what to do about it if it does not.
///
/// NOTE: All indexed mode loads are initialized to Expand in
/// TargetLowering.cpp
void setIndexedLoadAction(unsigned IdxMode, MVT VT, LegalizeAction Action) {
  assert(VT.isValid() && IdxMode < ISD::LAST_INDEXED_MODE &&
         (unsigned)Action < 0xf && "Table isn't big enough!");
  // One byte per (type, mode) pair; the load action lives in the upper half.
  uint8_t &Packed = IndexedModeActions[(unsigned)VT.SimpleTy][IdxMode];
  Packed &= ~0xf0;
  Packed |= ((uint8_t)Action) << 4;
}
/// Record whether the indexed store with mode \p IdxMode works with type
/// \p VT, and what to do about it if it does not.
///
/// NOTE: All indexed mode stores are initialized to Expand in
/// TargetLowering.cpp
void setIndexedStoreAction(unsigned IdxMode, MVT VT, LegalizeAction Action) {
  assert(VT.isValid() && IdxMode < ISD::LAST_INDEXED_MODE &&
         (unsigned)Action < 0xf && "Table isn't big enough!");
  // One byte per (type, mode) pair; the store action lives in the lower half.
  uint8_t &Packed = IndexedModeActions[(unsigned)VT.SimpleTy][IdxMode];
  Packed &= ~0x0f;
  Packed |= ((uint8_t)Action);
}
/// Record whether condition code \p CC is supported on the target for type
/// \p VT, and what to do about it if it is not.
void setCondCodeAction(ISD::CondCode CC, MVT VT, LegalizeAction Action) {
  assert(VT.isValid() && (unsigned)CC < array_lengthof(CondCodeActions) &&
         "Table isn't big enough!");
  assert((unsigned)Action < 0x10 && "too many bits for bitfield array");
  // Eight 4-bit actions are packed into every 32-bit word: the low 3 bits of
  // SimpleTy pick the nibble inside the word, the remaining upper bits pick
  // the word in the second dimension of the array.
  const uint32_t NibbleShift = 4 * (VT.SimpleTy & 0x7);
  uint32_t &Word = CondCodeActions[CC][VT.SimpleTy >> 3];
  Word &= ~((uint32_t)0xF << NibbleShift);
  Word |= (uint32_t)Action << NibbleShift;
}
/// Override the promotion target for \p Opc on \p OrigVT. When Opc/OrigVT is
/// marked as promoted, the promotion code by default tries larger integer/fp
/// types until it finds one that works; a target for which that default is
/// insufficient calls this method to name \p DestVT explicitly.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT) {
  const auto Key = std::make_pair(Opc, OrigVT.SimpleTy);
  PromoteToType[Key] = DestVT.SimpleTy;
}
/// Convenience method to set an operation to Promote and specify the type
/// in a single call.
/// \param Opc    Operation to promote.
/// \param OrigVT Type on which \p Opc is marked Promote.
/// \param DestVT Type recorded as the promotion destination.
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT) {
// First mark the operation as Promote for OrigVT...
setOperationAction(Opc, OrigVT, Promote);
// ...then record which type it is promoted to.
AddPromotedToType(Opc, OrigVT, DestVT);
}
/// Request a custom DAG combine for node type \p NT. Targets call this for
/// every target independent node they want to handle in their implementation
/// of the PerformDAGCombine virtual method.
void setTargetDAGCombine(ISD::NodeType NT) {
  assert(unsigned(NT >> 3) < array_lengthof(TargetDAGCombineArray));
  // One bit per node type: NT >> 3 selects the byte, NT & 7 the bit in it.
  unsigned char &Byte = TargetDAGCombineArray[NT >> 3];
  Byte |= 1 << (NT&7);
}
/// Set the jmp_buf buffer size the target requires, in bytes; default is 200.
void setJumpBufSize(unsigned Size) { JumpBufSize = Size; }
/// Set the jmp_buf buffer alignment the target requires, in bytes; default
/// is 0.
void setJumpBufAlignment(unsigned Align) { JumpBufAlignment = Align; }
/// Set the minimum function alignment of the target, in log2(bytes).
void setMinFunctionAlignment(unsigned Align) { MinFunctionAlignment = Align; }
/// Set the preferred function alignment of the target, in log2(bytes). Use
/// this when higher-than-minimum alignment brings a performance benefit.
void setPrefFunctionAlignment(unsigned Align) { PrefFunctionAlignment = Align; }
/// Set the preferred loop alignment of the target, in log2(bytes). The
/// default alignment is zero, meaning the target does not care about loop
/// alignment. Per-loop values can be supplied instead by overriding
/// getPrefLoopAlignment.
void setPrefLoopAlignment(unsigned Align) { PrefLoopAlignment = Align; }
/// Set the minimum stack alignment of an argument (in log2(bytes)).
/// \param Align Value recorded in MinStackArgumentAlignment.
void setMinStackArgumentAlignment(unsigned Align) {
MinStackArgumentAlignment = Align;
}
/// Set the maximum atomic operation size supported by the
/// backend. Atomic operations greater than this size (as well as
/// ones that are not naturally aligned), will be expanded by
/// AtomicExpandPass into an __atomic_* library call.
/// \param SizeInBits Limit, in bits, recorded in MaxAtomicSizeInBitsSupported.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits) {
MaxAtomicSizeInBitsSupported = SizeInBits;
}
/// Sets the minimum cmpxchg or ll/sc size supported by the backend.
/// \param SizeInBits Size, in bits, recorded in MinCmpXchgSizeInBits.
void setMinCmpXchgSizeInBits(unsigned SizeInBits) {
MinCmpXchgSizeInBits = SizeInBits;
}
/// Sets whether unaligned atomic operations are supported.
/// \param UnalignedSupported Value recorded in SupportsUnalignedAtomics.
void setSupportsUnalignedAtomics(bool UnalignedSupported) {
SupportsUnalignedAtomics = UnalignedSupported;
}
public:
//===--------------------------------------------------------------------===//
// Addressing mode description hooks (used by LSR etc).
//
/// CodeGenPrepare sinks address calculations into the same BB as Load/Store
/// instructions reading the address. This allows as much computation as
/// possible to be done in the address mode for that operand. This hook lets
/// targets also pass back when this should be done on intrinsics which
/// load/store.
///
/// The base implementation ignores all three parameters and returns false.
virtual bool getAddrModeArguments(IntrinsicInst * /*I*/,
SmallVectorImpl<Value*> &/*Ops*/,
Type *&/*AccessTy*/) const {
return false;
}
/// Description of an addressing mode of the form:
///    BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
/// A default-constructed AddrMode has none of the components present.
struct AddrMode {
  /// Global value component; null means there is no BaseGV.
  GlobalValue *BaseGV{nullptr};
  /// Constant offset component; zero means there is no base offset.
  int64_t BaseOffs{0};
  /// Whether a base register is present.
  bool HasBaseReg{false};
  /// Scale applied to ScaleReg; zero means there is no ScaleReg, and a scale
  /// of 1 indicates a reg with no scale.
  int64_t Scale{0};
  AddrMode() = default;
};
/// Return true if the addressing mode represented by AM is legal for this
/// target, for a load/store of the specified type.
///
/// The type may be VoidTy, in which case only return true if the addressing
/// mode is legal for a load/store of any legal type. TODO: Handle
/// pre/postinc as well.
///
/// If the address space cannot be determined, it will be -1.
///
/// TODO: Remove default argument
virtual bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
Type *Ty, unsigned AddrSpace,
Instruction *I = nullptr) const;
/// Compute the cost of the scaling factor used by addressing mode \p AM on
/// this target, for a load/store of type \p Ty.
///
/// A supported AM yields a value >= 0; an unsupported AM yields a negative
/// value.
/// TODO: Handle pre/postinc as well.
/// TODO: Remove default argument
virtual int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM,
                                 Type *Ty, unsigned AS = 0) const {
  // Default: assume that any scaling factor used in a legal AM is free.
  return isLegalAddressingMode(DL, AM, Ty, AS) ? 0 : -1;
}
/// Report whether the given immediate is a legal icmp immediate, i.e. the
/// target has icmp instructions that compare a register against it without
/// first materializing the immediate into a register. The base implementation
/// accepts every value.
virtual bool isLegalICmpImmediate(int64_t) const { return true; }
/// Report whether the given immediate is a legal add immediate, i.e. the
/// target has add instructions that add it to a register without first
/// materializing the immediate into a register. The base implementation
/// accepts every value.
virtual bool isLegalAddImmediate(int64_t) const { return true; }
/// Report whether \p Value is legal as the immediate value input of a store
/// instruction.
virtual bool isLegalStoreImmediate(int64_t Value) const {
  // By default only zero is accepted: it is likely that a zero register
  // exists or that a zero immediate is allowed.
  const bool IsZero = (Value == 0);
  return IsZero;
}
/// Report whether shifting a vector by a uniform scalar is significantly
/// cheaper than shifting by an amount that varies across lanes. On x86, for
/// example, there is a "psllw" instruction for the former case, but no simple
/// instruction for a general "a << b" operation on vectors. The base
/// implementation answers false.
virtual bool isVectorShiftByScalarCheap(Type *Ty) const { return false; }
/// Returns true if \p Opcode is one of the commutative binary operations
/// listed below.
virtual bool isCommutativeBinOp(unsigned Opcode) const {
  // FIXME: This should get its info from the td file.
  switch (Opcode) {
  // Integer addition and its carry/overflow/saturating forms.
  case ISD::ADD:
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SADDO:
  case ISD::UADDO:
  case ISD::SADDSAT:
  case ISD::UADDSAT:
  // Integer multiplication forms.
  case ISD::MUL:
  case ISD::MULHU:
  case ISD::MULHS:
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
  // Integer min/max.
  case ISD::SMIN:
  case ISD::SMAX:
  case ISD::UMIN:
  case ISD::UMAX:
  // Bitwise logic.
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  // Floating point arithmetic.
  case ISD::FADD:
  case ISD::FMUL:
  // Floating point min/max.
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
    return true;
  default:
    break;
  }
  return false;
}
/// Return true if \p Opcode denotes a math/logic binary operator.
virtual bool isBinOp(unsigned Opcode) const {
  // Every commutative binop is a binop, so ask that query first.
  if (isCommutativeBinOp(Opcode))
    return true;
  // Otherwise accept only the non-commutative binops.
  switch (Opcode) {
  case ISD::SUB:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::SDIV:
  case ISD::UDIV:
  case ISD::SREM:
  case ISD::UREM:
  case ISD::FSUB:
  case ISD::FDIV:
  case ISD::FREM:
    return true;
  }
  return false;
}
/// Report whether truncating a value of type \p FromTy to type \p ToTy is
/// free. e.g. On x86 it's free to truncate a i32 value in register EAX to i16
/// by referencing its sub-register AX.
/// Targets must return false when FromTy <= ToTy. The base implementation
/// always answers false.
virtual bool isTruncateFree(Type *FromTy, Type *ToTy) const { return false; }
/// Return true if a truncation from FromTy to ToTy is permitted when deciding
/// whether a call is in tail position. Typically this means that both results
/// would be assigned to the same register or stack slot, but it could mean
/// the target performs adequate checks of its own before proceeding with the
/// tail call. Targets must return false when FromTy <= ToTy.
///
/// The base implementation never permits the truncation.
virtual bool allowTruncateForTailCall(Type *FromTy, Type *ToTy) const {
return false;
}
virtual bool isTruncateFree(EVT FromVT, EVT ToVT) const { return false; }
virtual bool isProfitableToHoist(Instruction *I) const {
  // Base implementation: answers true for every instruction.
  return true;
}
/// Return true if the extension represented by \p I is free.
/// Unlike the is[Z|FP]ExtFree family, which is based on types, this method
/// can use the context provided by \p I to decide whether or not \p I is
/// free. It extends the behavior of that family: whenever is[Z|FP]ExtFree
/// returns true, this method returns true as well. The converse is not true.
/// The target can perform the adequate checks by overriding isExtFreeImpl.
/// \pre \p I must be a sign, zero, or fp extension.
bool isExtFree(const Instruction *I) const {
  const unsigned Opc = I->getOpcode();
  if (Opc == Instruction::FPExt) {
    // Type-based query first: destination type, then source type.
    if (isFPExtFree(EVT::getEVT(I->getType()),
                    EVT::getEVT(I->getOperand(0)->getType())))
      return true;
  } else if (Opc == Instruction::ZExt) {
    // Type-based query first: source type, then destination type.
    if (isZExtFree(I->getOperand(0)->getType(), I->getType()))
      return true;
  } else if (Opc != Instruction::SExt) {
    llvm_unreachable("Instruction is not an extension");
  }
  // Sign extensions, and anything the type-based queries did not accept, are
  // decided by the target hook.
  return isExtFreeImpl(I);
}
/// Return true if \p Load and \p Ext can form an ExtLoad.
/// For example, in AArch64
///   %L = load i8, i8* %ptr
///   %E = zext i8 %L to i32
/// can be lowered into one load instruction
///   ldrb w0, [x0]
bool isExtLoad(const LoadInst *Load, const Instruction *Ext,
               const DataLayout &DL) const {
  EVT VT = getValueType(DL, Ext->getType());
  EVT LoadVT = getValueType(DL, Load->getType());
  // If the load has other users and the truncate is not free, the ext
  // probably isn't free.
  if (!Load->hasOneUse()) {
    const bool LegalityCheck = isTypeLegal(LoadVT) || !isTypeLegal(VT);
    if (LegalityCheck && !isTruncateFree(Ext->getType(), Load->getType()))
      return false;
  }
  // Check whether the target supports casts folded into loads: start from a
  // zero-extending load and switch to sign-extending for anything else.
  unsigned LType = ISD::ZEXTLOAD;
  if (!isa<ZExtInst>(Ext)) {
    assert(isa<SExtInst>(Ext) && "Unexpected ext type!");
    LType = ISD::SEXTLOAD;
  }
  return isLoadExtLegal(LType, VT, LoadVT);
}
/// Return true if any actual instruction that defines a value of type FromTy
/// implicitly zero-extends the value to ToTy in the result register.
///
/// The function should return true when it is likely that the truncate can
/// be freely folded with an instruction defining a value of FromTy. If
/// the defining instruction is unknown (because you're looking at a
/// function argument, PHI, etc.) then the target may require an
/// explicit truncate, which is not necessarily free, but this function
/// does not deal with those cases.
/// Targets must return false when FromTy >= ToTy.
virtual bool isZExtFree(Type *FromTy, Type *ToTy) const { return false; }
virtual bool isZExtFree(EVT FromTy, EVT ToTy) const { return false; }
/// Report whether sign-extension from \p FromTy to \p ToTy is cheaper than
/// zero-extension. The base implementation answers false.
virtual bool isSExtCheaperThanZExt(EVT FromTy, EVT ToTy) const { return false; }
/// Return true if sinking I's operands to the same basic block as I is
/// profitable, e.g. because the operands can be folded into a target
/// instruction during instruction selection. After calling the function
/// \p Ops contains the Uses to sink ordered by dominance (dominating users
/// come first).
///
/// The base implementation returns false and leaves \p Ops untouched.
virtual bool shouldSinkOperands(Instruction *I,
SmallVectorImpl<Use *> &Ops) const {
return false;
}
/// Return true if the target supplies and combines to a paired load
/// two loaded values of type LoadedType next to each other in memory.
/// RequiredAlignment gives the minimal alignment constraints that must be met
/// to be able to select this paired load.
///
/// This information is *not* used to generate actual paired loads, but it is
/// used to generate a sequence of loads that is easier to combine into a
/// paired load.
/// For instance, something like this:
/// a = load i64* addr
/// b = trunc i64 a to i32
/// c = lshr i64 a, 32
/// d = trunc i64 c to i32
/// will be optimized into:
/// b = load i32* addr1
/// d = load i32* addr2
/// Where addr1 = addr2 +/- sizeof(i32).
///
/// In other words, unless the target performs a post-isel load combining,
/// this information should not be provided because it will generate more
/// loads.
virtual bool hasPairedLoad(EVT /*LoadedType*/,
unsigned & /*RequiredAlignment*/) const {
// Base implementation: both parameters are ignored (RequiredAlignment is
// not written) and no paired-load support is reported.
return false;
}
/// Report whether the target has a vector blend instruction. The base
/// implementation answers false.
virtual bool hasVectorBlend() const {
  return false;
}
/// Get the maximum supported factor for interleaved memory accesses.
/// The base implementation reports the minimum interleave factor: 2.
virtual unsigned getMaxSupportedInterleaveFactor() const {
  return 2;
}
/// Lower an interleaved load to target specific intrinsics. Return
/// true on success.
///
/// \p LI is the vector load instruction.
/// \p Shuffles is the shufflevector list to DE-interleave the loaded vector.
/// \p Indices is the corresponding indices for each shufflevector.
/// \p Factor is the interleave factor.
///
/// The base implementation performs no lowering and returns false.
virtual bool lowerInterleavedLoad(LoadInst *LI,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const {
return false;
}
/// Lower an interleaved store to target specific intrinsics. Return
/// true on success.
///
/// \p SI is the vector store instruction.
/// \p SVI is the shufflevector to RE-interleave the stored vector.
/// \p Factor is the interleave factor.
///
/// The base implementation performs no lowering and returns false.
virtual bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const {
return false;
}
/// Report whether zero-extending the specific node \p Val to type \p VT2 is
/// free (either because it's implicitly zero-extended such as ARM ldrb / ldrh
/// or because it's folded such as X86 zero-extending loads).
virtual bool isZExtFree(SDValue Val, EVT VT2) const {
  // Base implementation: ignore the node itself and ask the type-based query
  // using the node's value type.
  const EVT SrcVT = Val.getValueType();
  return isZExtFree(SrcVT, VT2);
}
/// Return true if an fpext operation is free (for instance, because
/// single-precision floating-point numbers are implicitly extended to
/// double-precision).
///
/// Both \p DestVT and \p SrcVT must be floating-point types (asserted). The
/// base implementation returns false.
virtual bool isFPExtFree(EVT DestVT, EVT SrcVT) const {
assert(SrcVT.isFloatingPoint() && DestVT.isFloatingPoint() &&
"invalid fpext types");
return false;
}
/// Return true if an fpext operation that feeds an \p Opcode operation is
/// free, for instance because half-precision floating-point numbers are
/// implicitly extended to float-precision for an FMA instruction.
///
/// Both \p DestVT and \p SrcVT must be floating-point types (asserted). The
/// base implementation ignores \p Opcode and defers to isFPExtFree.
virtual bool isFPExtFoldable(unsigned Opcode, EVT DestVT, EVT SrcVT) const {
assert(DestVT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
"invalid fpext types");
return isFPExtFree(DestVT, SrcVT);
}
/// Report whether folding a vector load into \p ExtVal (a sign, zero, or any
/// extend node) is profitable. The base implementation answers false.
virtual bool isVectorLoadExtDesirable(SDValue ExtVal) const {
  return false;
}
/// Return true if an fneg operation is free to the point where it is never
/// worthwhile to replace it with a bitwise operation.
///
/// \p VT must be a floating-point type (asserted). The base implementation
/// returns false.
virtual bool isFNegFree(EVT VT) const {
assert(VT.isFloatingPoint());
return false;
}
/// Return true if an fabs operation is free to the point where it is never
/// worthwhile to replace it with a bitwise operation.
///
/// \p VT must be a floating-point type (asserted). The base implementation
/// returns false.
virtual bool isFAbsFree(EVT VT) const {
assert(VT.isFloatingPoint());
return false;
}
/// Return true if an FMA operation is faster than a pair of fmul and fadd
/// instructions. fmuladd intrinsics will be expanded to FMAs when this method
/// returns true, otherwise fmuladd is expanded to fmul + fadd.
///
/// NOTE: This may be called before legalization on types for which FMAs are
/// not legal, but should return true if those types will eventually legalize
/// to types that support FMAs. After legalization, it will only be called on
/// types that support FMAs (via Legal or Custom actions)
virtual bool isFMAFasterThanFMulAndFAdd(EVT) const { return false; }
/// Return true if it's profitable to narrow operations of type VT1 to
/// VT2. e.g. on x86, it's profitable to narrow from i32 to i8 but not from
/// i32 to i16.
///
/// The base implementation ignores both types and returns false.
virtual bool isNarrowingProfitable(EVT /*VT1*/, EVT /*VT2*/) const {
return false;
}
/// Return true if it is beneficial to convert a load of a constant to
/// just the constant itself.
/// On some targets it might be more efficient to use a combination of
/// arithmetic instructions to materialize the constant instead of loading it
/// from a constant pool.
///
/// The base implementation returns false for every \p Imm and \p Ty.
virtual bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const {
return false;
}
/// Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type
/// from this source type with this index. This is needed because
/// EXTRACT_SUBVECTOR usually has custom lowering that depends on the index of
/// the first element, and only the target knows which lowering is cheap.
///
/// The base implementation returns false for every combination.
virtual bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
unsigned Index) const {
return false;
}
/// Report whether an extract element of the vector binary operation \p VecOp
/// should be converted into an extract element followed by a scalar
/// operation. The base implementation answers false.
virtual bool shouldScalarizeBinop(SDValue VecOp) const { return false; }
/// Return true if extraction of a scalar element from the given vector type
/// at the given index is cheap. For example, if scalar operations occur on
/// the same register file as vector operations, then an extract element may
/// be a sub-register rename rather than an actual instruction.
///
/// The base implementation returns false for every type and index.
virtual bool isExtractVecEltCheap(EVT VT, unsigned Index) const {
return false;
}
/// Decide whether math with an overflow comparison should be converted into
/// the corresponding DAG node operation. Targets may want to override this
/// independently of whether the operation is legal/custom for the given type
/// because it may obscure matching of other patterns.
virtual bool shouldFormOverflowOp(unsigned Opcode, EVT VT) const {
  // TODO: The default logic is inherited from code in CodeGenPrepare.
  // The opcode should not make a difference by default?
  // Only UADDO is considered, and never on vector types.
  if (Opcode != ISD::UADDO || VT.isVector())
    return false;
  // Allow the transform as long as we have an integer type that is not
  // obviously illegal and unsupported.
  if (VT.isSimple())
    return true;
  return !isOperationExpand(Opcode, VT);
}
/// Return true if it is profitable to use a scalar input to a BUILD_VECTOR
/// even if the vector itself has multiple uses.
///
/// The base implementation returns false for every \p VecVT.
virtual bool aggressivelyPreferBuildVectorSources(EVT VecVT) const {
return false;
}
/// Report whether CodeGenPrepare should consider splitting a large GEP
/// offset so that the GEP fits into the addressing mode and can be sunk into
/// the same blocks as its users. The base implementation answers false.
virtual bool shouldConsiderGEPOffsetSplit() const {
  return false;
}
//===--------------------------------------------------------------------===//
// Runtime Library hooks
//
/// Rename the default libcall routine name for the specified libcall.
/// \param Call Libcall whose entry in LibcallRoutineNames is replaced.
/// \param Name New routine name; only the pointer is stored, the string is
/// not copied.
void setLibcallName(RTLIB::Libcall Call, const char *Name) {
LibcallRoutineNames[Call] = Name;
}
/// Get the libcall routine name for the specified libcall.
/// \returns The pointer currently stored in LibcallRoutineNames for \p Call.
const char *getLibcallName(RTLIB::Libcall Call) const {
return LibcallRoutineNames[Call];
}
/// Override the default CondCode to be used to test the result of the
/// comparison libcall against zero.
/// \param Call Comparison libcall whose entry in CmpLibcallCCs is replaced.
/// \param CC   Condition code to store for it.
void setCmpLibcallCC(RTLIB::Libcall Call, ISD::CondCode CC) {
CmpLibcallCCs[Call] = CC;
}
/// Get the CondCode that's to be used to test the result of the comparison
/// libcall against zero.
/// \returns The entry stored in CmpLibcallCCs for \p Call.
ISD::CondCode getCmpLibcallCC(RTLIB::Libcall Call) const {
return CmpLibcallCCs[Call];
}
/// Set the CallingConv that should be used for the specified libcall.
/// \param Call Libcall whose entry in LibcallCallingConvs is replaced.
/// \param CC   Calling convention to store for it.
void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC) {
LibcallCallingConvs[Call] = CC;
}
/// Get the CallingConv that should be used for the specified libcall.
/// \returns The entry stored in LibcallCallingConvs for \p Call.
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const {
return LibcallCallingConvs[Call];
}
/// Execute target specific actions to finalize target lowering.
/// This is used to set extra flags in MachineFrameInformation and freezing
/// the set of reserved registers.
/// The default implementation just freezes the set of reserved registers.
virtual void finalizeLowering(MachineFunction &MF) const;
private:
const TargetMachine &TM;
/// Tells the code generator that the target has multiple (allocatable)
/// condition registers that can be used to store the results of comparisons
/// for use by selects and conditional branches. With multiple condition
/// registers, the code generator will not aggressively sink comparisons into
/// the blocks of their users.
bool HasMultipleConditionRegisters;
/// Tells the code generator that the target has BitExtract instructions.
/// The code generator will aggressively sink "shift"s into the blocks of
/// their users if the users will generate "and" instructions which can be
/// combined with "shift" to BitExtract instructions.
bool HasExtractBitsInsn;
/// Tells the code generator to bypass slow divide or remainder
/// instructions. For example, BypassSlowDivWidths[32] = 8 tells the code
/// generator to bypass 32-bit integer div/rem with an 8-bit unsigned integer
/// div/rem when the operands are positive and less than 256. Entries are
/// added with addBypassSlowDiv(SlowBitWidth, FastBitWidth).
DenseMap <unsigned int, unsigned int> BypassSlowDivWidths;
/// Tells the code generator that it shouldn't generate extra flow control
/// instructions and should attempt to combine flow control instructions via
/// predication.
bool JumpIsExpensive;
/// This target prefers to use _setjmp to implement llvm.setjmp.
///
/// Defaults to false.
bool UseUnderscoreSetJmp;
/// This target prefers to use _longjmp to implement llvm.longjmp.
///
/// Defaults to false.
bool UseUnderscoreLongJmp;
/// Information about the contents of the high-bits in integer boolean values
/// held in a type wider than i1. See getBooleanContents.
BooleanContent BooleanContents;
/// Information about the contents of the high-bits in floating point boolean
/// values held in a type wider than i1. See getBooleanContents.
BooleanContent BooleanFloatContents;
/// Information about the contents of the high-bits in boolean vector values
/// when the element type is wider than i1. See getBooleanContents.
BooleanContent BooleanVectorContents;
/// The target scheduling preference: shortest possible total cycles or lowest
/// register usage.
Sched::Preference SchedPreferenceInfo;
/// The size, in bytes, of the target's jmp_buf buffers
unsigned JumpBufSize;
/// The alignment, in bytes, of the target's jmp_buf buffers
unsigned JumpBufAlignment;
/// The minimum alignment that any argument on the stack needs to have, in
/// log2(bytes) (see setMinStackArgumentAlignment).
unsigned MinStackArgumentAlignment;
/// The minimum function alignment (used when optimizing for size, and to
/// prevent explicitly provided alignment from leading to incorrect code), in
/// log2(bytes) (see setMinFunctionAlignment).
unsigned MinFunctionAlignment;
/// The preferred function alignment (used when alignment unspecified and
/// optimizing for speed), in log2(bytes) (see setPrefFunctionAlignment).
unsigned PrefFunctionAlignment;
/// The preferred loop alignment, in log2(bytes) (see setPrefLoopAlignment).
unsigned PrefLoopAlignment;
/// Size in bits of the maximum atomics size the backend supports.
/// Accesses larger than this will be expanded by AtomicExpandPass.
unsigned MaxAtomicSizeInBitsSupported;
/// Size in bits of the minimum cmpxchg or ll/sc operation the
/// backend supports.
unsigned MinCmpXchgSizeInBits;
/// This indicates if the target supports unaligned atomic operations.
bool SupportsUnalignedAtomics;
/// If set to a physical register, this specifies the register that
/// llvm.savestack/llvm.restorestack should save and restore.
unsigned StackPointerRegisterToSaveRestore;
/// This indicates the default register class to use for each ValueType the
/// target supports natively. Entries are written by addRegisterClass.
const TargetRegisterClass *RegClassForVT[MVT::LAST_VALUETYPE];
// NOTE(review): the next two tables carry no documentation here; judging by
// their names they hold a register count and a register type per ValueType —
// confirm against computeRegisterProperties.
unsigned char NumRegistersForVT[MVT::LAST_VALUETYPE];
MVT RegisterTypeForVT[MVT::LAST_VALUETYPE];
/// This indicates the "representative" register class to use for each
/// ValueType the target supports natively. This information is used by the
/// scheduler to track register pressure. By default, the representative
/// register class is the largest legal super-reg register class of the
/// register class of the specified type. e.g. On x86, i8, i16, and i32's
/// representative class would be GR32.
const TargetRegisterClass *RepRegClassForVT[MVT::LAST_VALUETYPE];
/// This indicates the "cost" of the "representative" register class for each
/// ValueType. The cost is used by the scheduler to approximate register
/// pressure.
uint8_t RepRegClassCostForVT[MVT::LAST_VALUETYPE];
/// For any value types we are promoting or expanding, this contains the value
/// type that we are changing to. For Expanded types, this contains one step
/// of the expand (e.g. i64 -> i32), even if there are multiple steps required
/// (e.g. i64 -> i16). For types natively supported by the system, this holds
/// the same type (e.g. i32 -> i32).
MVT TransformToType[MVT::LAST_VALUETYPE];
/// For each operation and each value type, keep a LegalizeAction that
/// indicates how instruction selection should deal with the operation. Most
/// operations are Legal (aka, supported natively by the target), but
/// operations that are not should be described. Note that operations on
/// non-legal value types are not described here.
/// Indexed as OpActions[value type][opcode]; written by setOperationAction.
LegalizeAction OpActions[MVT::LAST_VALUETYPE][ISD::BUILTIN_OP_END];
/// For each load extension type and each value type, keep a LegalizeAction
/// that indicates how instruction selection should deal with a load of a
/// specific value type and extension type. Uses 4-bits to store the action
/// for each of the 4 load ext types; extension type E occupies the 4 bits
/// starting at bit 4 * E (see setLoadExtAction).
uint16_t LoadExtActions[MVT::LAST_VALUETYPE][MVT::LAST_VALUETYPE];
/// For each value type pair keep a LegalizeAction that indicates whether a
/// truncating store of a specific value type and truncating type is legal.
LegalizeAction TruncStoreActions[MVT::LAST_VALUETYPE][MVT::LAST_VALUETYPE];
/// For each indexed mode and each value type, keep a pair of LegalizeAction
/// that indicates how instruction selection should deal with the load /
/// store.
///
/// The first dimension is the value_type for the reference. The second
/// dimension represents the various modes for load store. Within each byte
/// the load action is kept in the upper 4 bits and the store action in the
/// lower 4 bits (see setIndexedLoadAction / setIndexedStoreAction).
uint8_t IndexedModeActions[MVT::LAST_VALUETYPE][ISD::LAST_INDEXED_MODE];
/// For each condition code (ISD::CondCode) keep a LegalizeAction that
/// indicates how instruction selection should deal with the condition code.
///
/// Because each CC action takes up 4 bits, we need to have the array size be
/// large enough to fit all of the value types. This can be done by rounding
/// up the MVT::LAST_VALUETYPE value to the next multiple of 8.
uint32_t CondCodeActions[ISD::SETCC_INVALID][(MVT::LAST_VALUETYPE + 7) / 8];
protected:
ValueTypeActionImpl ValueTypeActions;
private:
LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const;
/// Targets can specify ISD nodes that they would like PerformDAGCombine
/// callbacks for by calling setTargetDAGCombine(), which sets a bit in this
/// array.
unsigned char
TargetDAGCombineArray[(ISD::BUILTIN_OP_END+CHAR_BIT-1)/CHAR_BIT];
/// For operations that must be promoted to a specific type, this holds the
/// destination type. This map should be sparse, so don't hold it as an
/// array.
///
/// Targets add entries to this map with AddPromotedToType(..), clients access
/// this with getTypeToPromoteTo(..).
std::map<std::pair<unsigned, MVT::SimpleValueType>, MVT::SimpleValueType>
PromoteToType;
/// Stores the name of each libcall.
const char *LibcallRoutineNames[RTLIB::UNKNOWN_LIBCALL + 1];
/// The ISD::CondCode that should be used to test the result of each of the
/// comparison libcall against zero.
ISD::CondCode CmpLibcallCCs[RTLIB::UNKNOWN_LIBCALL];
/// Stores the CallingConv that should be used for each libcall.
CallingConv::ID LibcallCallingConvs[RTLIB::UNKNOWN_LIBCALL];
/// Set default libcall names and calling conventions.
void InitLibcalls(const Triple &TT);
protected:
/// Hook reporting whether the extension represented by \p I is free.
/// \pre \p I is a sign, zero, or fp extension and
/// is[Z|FP]ExtFree of the related types is not true.
///
/// The default implementation reports that the extension is not free.
virtual bool isExtFreeImpl(const Instruction *I) const {
  return false;
}
/// Depth that GatherAllAliases should continue looking for chain
/// dependencies when trying to find a more preferable chain. As an
/// approximation, this should be more than the number of consecutive stores
/// expected to be merged.
unsigned GatherAllAliasesMaxDepth;
/// Specify maximum number of store instructions per memset call.
///
/// When lowering \@llvm.memset this field specifies the maximum number of
/// store operations that may be substituted for the call to memset. Targets
/// must set this value based on the cost threshold for that target. Targets
/// should assume that the memset will be done using as many of the largest
/// store operations first, followed by smaller ones, if necessary, per
/// alignment restrictions. For example, storing 9 bytes on a 32-bit machine
/// with 16-bit alignment would result in four 2-byte stores and one 1-byte
/// store. This only applies to setting a constant array of a constant size.
unsigned MaxStoresPerMemset;
/// Maximum number of store operations that may be substituted for the call
/// to memset, used for functions with OptSize attribute.
unsigned MaxStoresPerMemsetOptSize;
/// Specify maximum number of store instructions per memcpy call.
///
/// When lowering \@llvm.memcpy this field specifies the maximum number of
/// store operations that may be substituted for a call to memcpy. Targets
/// must set this value based on the cost threshold for that target. Targets
/// should assume that the memcpy will be done using as many of the largest
/// store operations first, followed by smaller ones, if necessary, per
/// alignment restrictions. For example, storing 7 bytes on a 32-bit machine
/// with 32-bit alignment would result in one 4-byte store, one 2-byte store
/// and one 1-byte store. This only applies to copying a constant array of
/// constant size.
unsigned MaxStoresPerMemcpy;
/// \brief Specify max number of store instructions to glue in inlined memcpy.
///
/// When memcpy is inlined based on MaxStoresPerMemcpy, specify maximum number
/// of store instructions to keep together. This helps in pairing and
/// vectorization later on.
unsigned MaxGluedStoresPerMemcpy = 0;
/// Maximum number of store operations that may be substituted for a call to
/// memcpy, used for functions with OptSize attribute.
unsigned MaxStoresPerMemcpyOptSize;
unsigned MaxLoadsPerMemcmp;
unsigned MaxLoadsPerMemcmpOptSize;
/// Specify maximum number of store instructions per memmove call.
///
/// When lowering \@llvm.memmove this field specifies the maximum number of
/// store instructions that may be substituted for a call to memmove. Targets
/// must set this value based on the cost threshold for that target. Targets
/// should assume that the memmove will be done using as many of the largest
/// store operations first, followed by smaller ones, if necessary, per
/// alignment restrictions. For example, moving 9 bytes on a 32-bit machine
/// with 8-bit alignment would result in nine 1-byte stores. This only
/// applies to copying a constant array of constant size.
unsigned MaxStoresPerMemmove;
/// Maximum number of store instructions that may be substituted for a call to
/// memmove, used for functions with OptSize attribute.
unsigned MaxStoresPerMemmoveOptSize;
/// Tells the code generator that select is more expensive than a branch if
/// the branch is usually predicted right.
bool PredictableSelectIsExpensive;
/// \see enableExtLdPromotion.
bool EnableExtLdPromotion;
/// Return true if the value types that can be represented by the specified
/// register class are all legal.
bool isLegalRC(const TargetRegisterInfo &TRI,
const TargetRegisterClass &RC) const;
/// Replace/modify any TargetFrameIndex operands with a target-dependent
/// sequence of memory operands that is recognized by PrologEpilogInserter.
MachineBasicBlock *emitPatchPoint(MachineInstr &MI,
MachineBasicBlock *MBB) const;
/// Replace/modify the XRay custom event operands with target-dependent
/// details.
MachineBasicBlock *emitXRayCustomEvent(MachineInstr &MI,
MachineBasicBlock *MBB) const;
/// Replace/modify the XRay typed event operands with target-dependent
/// details.
MachineBasicBlock *emitXRayTypedEvent(MachineInstr &MI,
MachineBasicBlock *MBB) const;
};
/// This class defines information used to lower LLVM code to legal SelectionDAG
/// operators that the target instruction selector can accept natively.
///
/// This class also defines callbacks that targets must implement to lower
/// target-specific constructs to SelectionDAG operators.
class TargetLowering : public TargetLoweringBase {
public:
struct DAGCombinerInfo;
TargetLowering(const TargetLowering &) = delete;
TargetLowering &operator=(const TargetLowering &) = delete;
/// NOTE: The TargetMachine owns TLOF.
explicit TargetLowering(const TargetMachine &TM);
bool isPositionIndependent() const;
/// Divergence query for the node \p N. The default implementation ignores
/// \p N, \p FLI and \p DA and answers false; the method is virtual so that
/// a target can supply its own answer.
virtual bool
isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI,
                           LegacyDivergenceAnalysis *DA) const {
  return false;
}
/// Uniformity query for the node \p N. The default implementation answers
/// false regardless of \p N; the method is virtual so that a target can
/// supply its own answer.
virtual bool isSDNodeAlwaysUniform(const SDNode *N) const { return false; }
/// Pre-indexed addressing hook. An implementation returns true when the
/// node's address can be legally represented as a pre-indexed load / store
/// address, and hands back the base pointer, offset pointer and addressing
/// mode through its reference parameters.
///
/// The default implementation returns false and touches none of its
/// arguments.
virtual bool getPreIndexedAddressParts(SDNode * /*N*/, SDValue & /*Base*/,
                                       SDValue & /*Offset*/,
                                       ISD::MemIndexedMode & /*AM*/,
                                       SelectionDAG & /*DAG*/) const {
  return false;
}
/// Post-indexed addressing hook. An implementation returns true when this
/// node can be combined with a load / store to form a post-indexed load /
/// store, and hands back the base pointer, offset pointer and addressing
/// mode through its reference parameters.
///
/// The default implementation returns false and touches none of its
/// arguments.
virtual bool getPostIndexedAddressParts(SDNode * /*N*/, SDNode * /*Op*/,
                                        SDValue & /*Base*/,
                                        SDValue & /*Offset*/,
                                        ISD::MemIndexedMode & /*AM*/,
                                        SelectionDAG & /*DAG*/) const {
  return false;
}
/// Return the entry encoding for a jump table in the current function. The
/// returned value is a member of the MachineJumpTableInfo::JTEntryKind enum.
virtual unsigned getJumpTableEncoding() const;
/// Hook producing the MCExpr for a custom jump table entry. The default
/// implementation must never be reached: a target that has custom JTIs is
/// required to override it.
virtual const MCExpr *
LowerCustomJumpTableEntry(const MachineJumpTableInfo * /*MJTI*/,
                          const MachineBasicBlock * /*MBB*/,
                          unsigned /*uid*/, MCContext & /*Ctx*/) const {
  llvm_unreachable("Need to implement this hook if target has custom JTIs");
}
/// Returns relocation base for the given PIC jumptable.
virtual SDValue getPICJumpTableRelocBase(SDValue Table,
SelectionDAG &DAG) const;
/// This returns the relocation base for the given PIC jumptable, the same as
/// getPICJumpTableRelocBase, but as an MCExpr.
virtual const MCExpr *
getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
unsigned JTI, MCContext &Ctx) const;
/// Return true if folding a constant offset with the given GlobalAddress is
/// legal. It is frequently not legal in PIC relocation models.
virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
bool isInTailCallPosition(SelectionDAG &DAG, SDNode *Node,
SDValue &Chain) const;
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS,
SDValue &NewRHS, ISD::CondCode &CCCode,
const SDLoc &DL) const;
/// Returns a pair of (return value, chain).
/// It is an error to pass RTLIB::UNKNOWN_LIBCALL as \p LC.
std::pair<SDValue, SDValue> makeLibCall(
SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef<SDValue> Ops,
bool isSigned, const SDLoc &dl, bool doesNotReturn = false,
bool isReturnValueUsed = true, bool isPostTypeLegalization = false) const;
/// Check whether parameters to a call that are passed in callee saved
/// registers are the same as from the calling function. This needs to be
/// checked for tail call eligibility.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI,
const uint32_t *CallerPreservedMask,
const SmallVectorImpl<CCValAssign> &ArgLocs,
const SmallVectorImpl<SDValue> &OutVals) const;
//===--------------------------------------------------------------------===//
// TargetLowering Optimization Methods
//
/// Small bundle passed between TargetLowering and clients that want to
/// combine: it carries the DAG, two legality flags, and a pair of SDValues
/// (the node being replaced and its replacement).
struct TargetLoweringOpt {
  SelectionDAG &DAG;
  bool LegalTys;
  bool LegalOps;
  SDValue Old; ///< Recorded by CombineTo: the value to be replaced.
  SDValue New; ///< Recorded by CombineTo: the replacement value.

  /// Binds the DAG and stores the two legality flags. Old and New are left
  /// default-constructed until CombineTo is called.
  explicit TargetLoweringOpt(SelectionDAG &TheDAG, bool TypesLegal,
                             bool OpsLegal)
      : DAG(TheDAG), LegalTys(TypesLegal), LegalOps(OpsLegal) {}

  /// Accessor for the LegalTys flag.
  bool LegalTypes() const {
    return LegalTys;
  }

  /// Accessor for the LegalOps flag.
  bool LegalOperations() const {
    return LegalOps;
  }

  /// Remember that \p From should be replaced by \p To. Always returns true
  /// so that callers can write `return TLO.CombineTo(...)`.
  bool CombineTo(SDValue From, SDValue To) {
    Old = From;
    New = To;
    return true;
  }
};
/// Determines the optimal series of memory ops to replace the memset / memcpy.
/// Return true if the number of memory ops is below the threshold (Limit).
/// It returns the types of the sequence of memory ops to perform
/// memset / memcpy by reference.
bool findOptimalMemOpLowering(std::vector<EVT> &MemOps,
unsigned Limit, uint64_t Size,
unsigned DstAlign, unsigned SrcAlign,
bool IsMemset,
bool ZeroMemset,
bool MemcpyStrSrc,
bool AllowOverlap,
unsigned DstAS, unsigned SrcAS,
const AttributeList &FuncAttributes) const;
/// Check to see if the specified operand of the specified instruction is a
/// constant integer. If so, check to see if there are any bits set in the
/// constant that are not demanded. If so, shrink the constant and return
/// true.
bool ShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
TargetLoweringOpt &TLO) const;
/// Target hook for target-specific constant optimization, called by
/// ShrinkDemandedConstant. An override should return true when the target
/// does not want ShrinkDemandedConstant to optimize the constant any
/// further.
///
/// The default implementation returns false.
virtual bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
                                          TargetLoweringOpt &TLO) const {
  return false;
}
/// Convert x+y to (VT)((SmallVT)x+(SmallVT)y) if the casts are free. This
/// uses isZExtFree and ZERO_EXTEND for the widening cast, but it could be
/// generalized for targets with other types of implicit widening casts.
bool ShrinkDemandedOp(SDValue Op, unsigned BitWidth, const APInt &Demanded,
TargetLoweringOpt &TLO) const;
/// Look at Op. At this point, we know that only the DemandedBits bits of the
/// result of Op are ever used downstream. If we can use this information to
/// simplify Op, create a new simplified DAG node and return true, returning
/// the original and new nodes in Old and New. Otherwise, analyze the
/// expression and return a mask of KnownOne and KnownZero bits for the
/// expression (used to simplify the caller). The KnownZero/One bits may only
/// be accurate for those bits in the Demanded masks.
/// \p AssumeSingleUse When this parameter is true, this function will
/// attempt to simplify \p Op even if there are multiple uses.
/// Callers are responsible for correctly updating the DAG based on the
/// results of this function, because simply replacing TLO.Old
/// with TLO.New will be incorrect when this parameter is true and TLO.Old
/// has multiple uses.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
const APInt &DemandedElts, KnownBits &Known,
TargetLoweringOpt &TLO, unsigned Depth = 0,
bool AssumeSingleUse = false) const;
/// Helper wrapper around SimplifyDemandedBits, demanding all elements.
/// Adds Op back to the worklist upon success.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
KnownBits &Known, TargetLoweringOpt &TLO,
unsigned Depth = 0,
bool AssumeSingleUse = false) const;
/// Helper wrapper around SimplifyDemandedBits.
/// Adds Op back to the worklist upon success.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedMask,
DAGCombinerInfo &DCI) const;
/// Look at Vector Op. At this point, we know that only the DemandedElts
/// elements of the result of Op are ever used downstream. If we can use
/// this information to simplify Op, create a new simplified DAG node and
/// return true, storing the original and new nodes in TLO.
/// Otherwise, analyze the expression and return a mask of KnownUndef and
/// KnownZero elements for the expression (used to simplify the caller).
/// The KnownUndef/Zero elements may only be accurate for those bits
/// in the DemandedMask.
/// \p AssumeSingleUse When this parameter is true, this function will
/// attempt to simplify \p Op even if there are multiple uses.
/// Callers are responsible for correctly updating the DAG based on the
/// results of this function, because simply replacing TLO.Old
/// with TLO.New will be incorrect when this parameter is true and TLO.Old
/// has multiple uses.
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask,
APInt &KnownUndef, APInt &KnownZero,
TargetLoweringOpt &TLO, unsigned Depth = 0,
bool AssumeSingleUse = false) const;
/// Helper wrapper around SimplifyDemandedVectorElts.
/// Adds Op back to the worklist upon success.
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedElts,
APInt &KnownUndef, APInt &KnownZero,
DAGCombinerInfo &DCI) const;
/// Determine which of the bits specified in Mask are known to be either zero
/// or one and return them in the KnownZero/KnownOne bitsets. The DemandedElts
/// argument allows us to only collect the known bits that are shared by the
/// requested vector elements.
virtual void computeKnownBitsForTargetNode(const SDValue Op,
KnownBits &Known,
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth = 0) const;
/// Determine which of the bits of FrameIndex \p FIOp are known to be 0.
/// Default implementation computes low bits based on alignment
/// information. This should preserve known bits passed into it.
virtual void computeKnownBitsForFrameIndex(const SDValue FIOp,
KnownBits &Known,
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth = 0) const;
/// This method can be implemented by targets that want to expose additional
/// information about sign bits to the DAG Combiner. The DemandedElts
/// argument allows us to only collect the minimum sign bits that are shared
/// by the requested vector elements.
virtual unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth = 0) const;
/// Attempt to simplify any target nodes based on the demanded vector
/// elements, returning true on success. Otherwise, analyze the expression and
/// return a mask of KnownUndef and KnownZero elements for the expression
/// (used to simplify the caller). The KnownUndef/Zero elements may only be
/// accurate for those bits in the DemandedMask.
virtual bool SimplifyDemandedVectorEltsForTargetNode(
SDValue Op, const APInt &DemandedElts, APInt &KnownUndef,
APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth = 0) const;
/// Attempt to simplify any target nodes based on the demanded bits/elts,
/// returning true on success. Otherwise, analyze the
/// expression and return a mask of KnownOne and KnownZero bits for the
/// expression (used to simplify the caller). The KnownZero/One bits may only
/// be accurate for those bits in the Demanded masks.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op,
const APInt &DemandedBits,
const APInt &DemandedElts,
KnownBits &Known,
TargetLoweringOpt &TLO,
unsigned Depth = 0) const;
/// This method returns the constant pool value that will be loaded by LD.
/// NOTE: You must check for implicit extensions of the constant by LD.
virtual const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const;
/// If \p SNaN is false, \returns true if \p Op is known to never be any
/// NaN. If \p SNaN is true, \returns true if \p Op is known to never be a
/// signaling NaN.
virtual bool isKnownNeverNaNForTargetNode(SDValue Op,
const SelectionDAG &DAG,
bool SNaN = false,
unsigned Depth = 0) const;
/// State bundle passed to target combine hooks. It records an opaque pointer
/// to the DAG combiner, the current combine level, whether the call comes
/// from the legalizer, and a reference to the SelectionDAG being combined.
struct DAGCombinerInfo {
  void *DC; // The DAG Combiner object.
  CombineLevel Level;
  bool CalledByLegalizer;

  // NOTE: this is a struct, so the members above are public as well; the
  // access specifier below does not change accessibility.
public:
  SelectionDAG &DAG;

  DAGCombinerInfo(SelectionDAG &dag, CombineLevel level, bool cl, void *dc)
      : DC(dc), Level(level), CalledByLegalizer(cl), DAG(dag) {}

  /// True only when the level is exactly BeforeLegalizeTypes.
  bool isBeforeLegalize() const { return Level == BeforeLegalizeTypes; }

  /// True for every level ordered before AfterLegalizeVectorOps.
  bool isBeforeLegalizeOps() const { return Level < AfterLegalizeVectorOps; }

  /// True only when the level is exactly AfterLegalizeDAG.
  bool isAfterLegalizeDAG() const { return Level == AfterLegalizeDAG; }

  /// Current combine level. Const-qualified like the other accessors so it
  /// can be queried through a const DAGCombinerInfo.
  CombineLevel getDAGCombineLevel() const { return Level; }

  /// Value of the CalledByLegalizer flag supplied at construction.
  bool isCalledByLegalizer() const { return CalledByLegalizer; }

  // The members below are only declared here; their definitions live
  // outside this header.
  void AddToWorklist(SDNode *N);
  SDValue CombineTo(SDNode *N, ArrayRef<SDValue> To, bool AddTo = true);
  SDValue CombineTo(SDNode *N, SDValue Res, bool AddTo = true);
  SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo = true);

  void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO);
};
/// Return true if N is a constant or constant vector equal to the true value
/// from getBooleanContents().
bool isConstTrueVal(const SDNode *N) const;
/// Return true if N is a constant or constant vector equal to the false value
/// from getBooleanContents().
bool isConstFalseVal(const SDNode *N) const;
/// Return true if \p N is a True value when extended to \p VT.
bool isExtendedTrueVal(const ConstantSDNode *N, EVT VT, bool SExt) const;
/// Try to simplify a setcc built with the specified operands and cc. If it is
/// unable to simplify it, return a null SDValue.
SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
bool foldBooleans, DAGCombinerInfo &DCI,
const SDLoc &dl) const;
/// Hook for targets that wrap addresses: strip the wrapper so the address
/// can be analyzed. The default implementation hands \p N back unchanged.
virtual SDValue unwrapAddress(SDValue N) const {
  return N;
}
/// Returns true (and the GlobalValue and the offset) if the node is a
/// GlobalAddress + offset.
virtual bool
isGAPlusOffset(SDNode *N, const GlobalValue* &GA, int64_t &Offset) const;
/// This method will be invoked for all target nodes and for any
/// target-independent nodes that the target has registered to invoke it
/// for.
///
/// The semantics are as follows:
/// Return Value:
/// SDValue.Val == 0 - No change was made
/// SDValue.Val == N - N was replaced, is dead, and is already handled.
/// otherwise - N should be replaced by the returned Operand.
///
/// In addition, methods provided by DAGCombinerInfo may be used to perform
/// more complex transformations.
///
virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
/// Profitability query: should this shift by a constant amount be moved
/// through its operand, with any immediate operands adjusted as needed to
/// preserve semantics? A target may decline when the transformation would
/// disrupt a particularly auspicious target-specific tree (e.g. bitfield
/// extraction in AArch64).
///
/// The default implementation returns true.
///
/// @param N the shift node
/// @param Level the current DAGCombine legalization level.
virtual bool isDesirableToCommuteWithShift(const SDNode *N,
                                           CombineLevel Level) const {
  return true;
}
/// Profitability query: should a BUILD_VECTOR with a stride-pattern be
/// combined into a shuffle followed by a truncate?
///
/// Example of such a combine:
///   v4i32 build_vector((extract_elt V, 1),
///                      (extract_elt V, 3),
///                      (extract_elt V, 5),
///                      (extract_elt V, 7))
///   -->
///   v4i32 truncate (bitcast (shuffle<1,u,3,u,5,u,7,u> V, u) to v4i64)
///
/// The default implementation returns false.
virtual bool isDesirableToCombineBuildVectorToShuffleTruncate(
    ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {
  return false;
}
/// Reports whether the target natively supports \p VT and whether using it
/// for the given node type is 'desirable'. For example, on x86 i16 is legal
/// but undesirable, since i16 instruction encodings are longer and some i16
/// instructions are slow.
virtual bool isTypeDesirableForOp(unsigned /*Opc*/, EVT VT) const {
  // Default policy: a type is desirable exactly when it is legal.
  const bool Legal = isTypeLegal(VT);
  return Legal;
}
/// Profitability query for the dag combiner: should a floating point op of
/// the specified opcode be turned into an equivalent op of an integer type?
/// For example, f32 load -> i32 load can be profitable on ARM.
///
/// The default implementation returns false.
virtual bool isDesirableToTransformToIntegerOp(unsigned /*Opc*/,
                                               EVT /*VT*/) const {
  return false;
}
/// Queries the target on whether it is beneficial for the dag combiner to
/// promote the specified node. An override that returns true should also
/// hand back the desired promotion type through the reference parameter.
///
/// The default implementation returns false and does not write the type.
virtual bool IsDesirableToPromoteOp(SDValue /*Op*/, EVT & /*PVT*/) const {
  return false;
}
/// Reports whether the target supports the swifterror attribute, which
/// optimizes loads and stores to reading and writing a specific register.
///
/// The default implementation returns false.
virtual bool supportSwiftError() const { return false; }
/// Reports whether the target supports handling a subset of CSRs for the
/// given machine function explicitly via copies.
///
/// The default implementation returns false.
virtual bool supportSplitCSR(MachineFunction *MF) const { return false; }
/// Performs the initialization needed to handle a subset of CSRs explicitly
/// via copies. Called at the beginning of instruction selection.
///
/// The default implementation is unreachable.
virtual void initializeSplitCSR(MachineBasicBlock *Entry) const {
  llvm_unreachable("Not Implemented");
}
/// Inserts explicit copies in the entry and exit blocks: a subset of CSRs is
/// copied to virtual registers in the entry block and copied back to
/// physical registers in the exit blocks. Called at the end of instruction
/// selection.
///
/// The default implementation is unreachable.
virtual void
insertCopiesSplitCSR(MachineBasicBlock *Entry,
                     const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  llvm_unreachable("Not Implemented");
}
//===--------------------------------------------------------------------===//
// Lowering methods - These methods must be implemented by targets so that
// the SelectionDAGBuilder code knows how to lower these.
//
/// Targets must implement this hook to lower the incoming (formal)
/// arguments, described by the Ins array, into the specified DAG. An
/// implementation should fill in the InVals array with legal-type argument
/// values and return the resulting token chain value.
///
/// The default implementation is unreachable.
virtual SDValue
LowerFormalArguments(SDValue /*Chain*/, CallingConv::ID /*CallConv*/,
                     bool /*isVarArg*/,
                     const SmallVectorImpl<ISD::InputArg> & /*Ins*/,
                     const SDLoc & /*dl*/, SelectionDAG & /*DAG*/,
                     SmallVectorImpl<SDValue> & /*InVals*/) const {
  llvm_unreachable("Not Implemented");
}
/// This structure contains all information that is necessary for lowering
/// calls. It is passed to TLI::LowerCallTo when the SelectionDAG builder
/// needs to lower a call, and targets will see this struct in their LowerCall
/// implementation.
///
/// The set* members each store one piece of state and return *this, so
/// calls can be chained.
struct CallLoweringInfo {
SDValue Chain;
Type *RetTy = nullptr;
// One-bit flags. They have no in-class initializers; the constructor's
// mem-initializer list sets every one of them.
bool RetSExt : 1;
bool RetZExt : 1;
bool IsVarArg : 1;
bool IsInReg : 1;
bool DoesNotReturn : 1;
bool IsReturnValueUsed : 1;
bool IsConvergent : 1;
bool IsPatchPoint : 1;
// IsTailCall should be modified by implementations of
// TargetLowering::LowerCall that perform tail call conversions.
bool IsTailCall = false;
// Is Call lowering done post SelectionDAG type legalization.
bool IsPostTypeLegalization = false;
// -1 converts to the largest unsigned value; every setCallee/setLibCallee
// overload below overwrites it.
unsigned NumFixedArgs = -1;
CallingConv::ID CallConv = CallingConv::C;
SDValue Callee;
ArgListTy Args;
SelectionDAG &DAG;
SDLoc DL;
ImmutableCallSite CS;
SmallVector<ISD::OutputArg, 32> Outs;
SmallVector<SDValue, 32> OutVals;
SmallVector<ISD::InputArg, 32> Ins;
SmallVector<SDValue, 4> InVals;
// Binds the DAG reference and clears every bit-field flag except
// IsReturnValueUsed, which starts out true.
CallLoweringInfo(SelectionDAG &DAG)
: RetSExt(false), RetZExt(false), IsVarArg(false), IsInReg(false),
DoesNotReturn(false), IsReturnValueUsed(true), IsConvergent(false),
IsPatchPoint(false), DAG(DAG) {}
CallLoweringInfo &setDebugLoc(const SDLoc &dl) {
DL = dl;
return *this;
}
CallLoweringInfo &setChain(SDValue InChain) {
Chain = InChain;
return *this;
}
// setCallee with target/module-specific attributes
// Stores the same fields as the four-argument setCallee below, then calls
// markLibCallAttributes on the DAG's target lowering info with the machine
// function, CC and the stored Args.
CallLoweringInfo &setLibCallee(CallingConv::ID CC, Type *ResultType,
SDValue Target, ArgListTy &&ArgsList) {
RetTy = ResultType;
Callee = Target;
CallConv = CC;
NumFixedArgs = ArgsList.size();
Args = std::move(ArgsList);
DAG.getTargetLoweringInfo().markLibCallAttributes(
&(DAG.getMachineFunction()), CC, Args);
return *this;
}
// Records return type, callee and calling convention, takes ownership of
// ArgsList, and sets NumFixedArgs to the argument count (read before the
// list is moved from).
CallLoweringInfo &setCallee(CallingConv::ID CC, Type *ResultType,
SDValue Target, ArgListTy &&ArgsList) {
RetTy = ResultType;
Callee = Target;
CallConv = CC;
NumFixedArgs = ArgsList.size();
Args = std::move(ArgsList);
return *this;
}
// Call-site overload: derives the flags from Call and FTy instead of
// taking them as arguments, and remembers the call site in CS.
CallLoweringInfo &setCallee(Type *ResultType, FunctionType *FTy,
SDValue Target, ArgListTy &&ArgsList,
ImmutableCallSite Call) {
RetTy = ResultType;
IsInReg = Call.hasRetAttr(Attribute::InReg);
// Treated as non-returning when the call site says so, or when a
// non-invoke call is immediately followed by an UnreachableInst.
DoesNotReturn =
Call.doesNotReturn() ||
(!Call.isInvoke() &&
isa<UnreachableInst>(Call.getInstruction()->getNextNode()));
IsVarArg = FTy->isVarArg();
IsReturnValueUsed = !Call.getInstruction()->use_empty();
RetSExt = Call.hasRetAttr(Attribute::SExt);
RetZExt = Call.hasRetAttr(Attribute::ZExt);
Callee = Target;
CallConv = Call.getCallingConv();
NumFixedArgs = FTy->getNumParams();
Args = std::move(ArgsList);
CS = Call;
return *this;
}
CallLoweringInfo &setInRegister(bool Value = true) {
IsInReg = Value;
return *this;
}
CallLoweringInfo &setNoReturn(bool Value = true) {
DoesNotReturn = Value;
return *this;
}
CallLoweringInfo &setVarArg(bool Value = true) {
IsVarArg = Value;
return *this;
}
CallLoweringInfo &setTailCall(bool Value = true) {
IsTailCall = Value;
return *this;
}
// Note the inversion: discarding the result clears IsReturnValueUsed.
CallLoweringInfo &setDiscardResult(bool Value = true) {
IsReturnValueUsed = !Value;
return *this;
}
CallLoweringInfo &setConvergent(bool Value = true) {
IsConvergent = Value;
return *this;
}
CallLoweringInfo &setSExtResult(bool Value = true) {
RetSExt = Value;
return *this;
}
CallLoweringInfo &setZExtResult(bool Value = true) {
RetZExt = Value;
return *this;
}
CallLoweringInfo &setIsPatchPoint(bool Value = true) {
IsPatchPoint = Value;
return *this;
}
CallLoweringInfo &setIsPostTypeLegalization(bool Value=true) {
IsPostTypeLegalization = Value;
return *this;
}
// Mutable access to the stored argument list.
ArgListTy &getArgs() {
return Args;
}
};
/// This function lowers an abstract call to a function into an actual call.
/// This returns a pair of operands. The first element is the return value
/// for the function (if RetTy is not VoidTy). The second element is the
/// outgoing token chain. It calls LowerCall to do the actual lowering.
std::pair<SDValue, SDValue> LowerCallTo(CallLoweringInfo &CLI) const;
/// Targets must implement this hook to lower calls into the specified DAG.
/// The outgoing arguments to the call are described by the Outs array, and
/// the values to be returned by the call are described by the Ins array. An
/// implementation should fill in the InVals array with legal-type return
/// values from the call and return the resulting token chain value.
///
/// The default implementation is unreachable.
virtual SDValue LowerCall(CallLoweringInfo & /*CLI*/,
                          SmallVectorImpl<SDValue> & /*InVals*/) const {
  llvm_unreachable("Not Implemented");
}
/// Target-specific cleanup hook for formal ByVal parameters. The default
/// implementation does nothing.
virtual void HandleByVal(CCState *, unsigned &, unsigned) const {
  // Intentionally empty.
}
/// Targets should implement this hook to check whether the return values
/// described by the Outs array fit into the return registers. When it
/// returns false, an sret-demotion is performed.
virtual bool
CanLowerReturn(CallingConv::ID /*CallConv*/, MachineFunction & /*MF*/,
               bool /*isVarArg*/,
               const SmallVectorImpl<ISD::OutputArg> & /*Outs*/,
               LLVMContext & /*Context*/) const {
  // Answering true by default keeps the preexisting behavior.
  return true;
}
/// Targets must implement this hook to lower outgoing return values,
/// described by the Outs array, into the specified DAG. An implementation
/// should return the resulting token chain value.
///
/// The default implementation is unreachable.
virtual SDValue
LowerReturn(SDValue /*Chain*/, CallingConv::ID /*CallConv*/,
            bool /*isVarArg*/,
            const SmallVectorImpl<ISD::OutputArg> & /*Outs*/,
            const SmallVectorImpl<SDValue> & /*OutVals*/,
            const SDLoc & /*dl*/, SelectionDAG & /*DAG*/) const {
  llvm_unreachable("Not Implemented");
}
/// Reports whether the result of the specified node is used by a return
/// node only. An override also computes and returns the input chain for the
/// tail call through the reference parameter.
///
/// This is used to determine whether it is possible to codegen a libcall as
/// tail call at legalization time. The default implementation returns false
/// and does not write the chain.
virtual bool isUsedByReturnOnly(SDNode *, SDValue & /*Chain*/) const {
  return false;
}
/// Reports whether the target may be able to emit the call instruction as a
/// tail call. Optimization passes use this to decide whether it is
/// profitable to duplicate return instructions to enable tailcall
/// optimization.
///
/// The default implementation returns false.
virtual bool mayBeEmittedAsTailCall(const CallInst *) const { return false; }
/// Name of the builtin used for the __builtin___clear_cache intrinsic.
/// The default is to invoke the clear cache library call.
virtual const char *getClearCacheBuiltinName() const {
  return "__clear_cache";
}
/// Maps a register name to its register ID. Used by the named register
/// global variables extension. There is no target-independent behaviour, so
/// the default implementation bails out with a fatal error and never
/// returns a value.
virtual unsigned getRegisterByName(const char *RegName, EVT VT,
                                   SelectionDAG &DAG) const {
  report_fatal_error("Named registers not implemented for this target");
}
/// Chooses the type used to zero or sign extend a zeroext/signext integer
/// return value.
///
/// FIXME: Some C calling conventions require the return type to be promoted,
/// but this is not true all the time, e.g. i1/i8/i16 on x86/x86_64. It is
/// also not necessary for non-C calling conventions. The frontend should
/// handle this and include all of the necessary information.
virtual EVT getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                ISD::NodeType /*ExtendKind*/) const {
  // The default widens anything narrower than the register type for i32 up
  // to that register type and leaves every other type alone.
  EVT Floor = getRegisterType(Context, MVT::i32);
  if (VT.bitsLT(Floor))
    return Floor;
  return VT;
}
/// On some targets an LLVM struct type must be broken down into multiple
/// simple types while the calling convention still requires the entire
/// struct to be passed in a block of consecutive registers. An override
/// returns true for such arguments.
///
/// The default implementation returns false.
virtual bool functionArgumentNeedsConsecutiveRegisters(
    Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
  return false;
}
/// For most targets, an LLVM type must be broken down into multiple
/// smaller types. Usually the halves are ordered according to the endianness
/// but for some platform that would break. So this method will default to
/// matching the endianness but can be overridden.
virtual bool
shouldSplitFunctionArgumentsAsLittleEndian(const DataLayout &DL) const {
  // By default the split order simply follows the data layout's endianness.
  const bool LayoutIsLittleEndian = DL.isLittleEndian();
  return LayoutIsLittleEndian;
}
/// Returns a 0 terminated array of registers that can be safely used as
/// scratch registers.
virtual const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const {
  // The base implementation provides no register list for any convention.
  return nullptr;
}
/// This callback is used to prepare for a volatile or atomic load.
/// It takes a chain node as input and returns the chain for the load itself.
///
/// Having a callback like this is necessary for targets like SystemZ,
/// which allows a CPU to reuse the result of a previous load indefinitely,
/// even if a cache-coherent store is performed by another CPU. The default
/// implementation does nothing.
virtual SDValue prepareVolatileOrAtomicLoad(SDValue Chain, const SDLoc &DL,
                                            SelectionDAG &DAG) const {
  // Nothing to prepare by default: the incoming chain is handed back as is.
  return Chain;
}
/// This callback is used to inspect load/store instructions and add
/// target-specific MachineMemOperand flags to them. The default
/// implementation does nothing.
virtual MachineMemOperand::Flags getMMOFlags(const Instruction &I) const {
  // The base implementation contributes no extra flags for any instruction.
  return MachineMemOperand::MONone;
}
/// This callback is invoked by the type legalizer to legalize nodes with an
/// illegal operand type but legal result types. It replaces the
/// LowerOperation callback in the type Legalizer. The reason we can not do
/// away with LowerOperation entirely is that LegalizeDAG isn't yet ready to
/// use this callback.
///
/// TODO: Consider merging with ReplaceNodeResults.
///
/// The target places new result values for the node in Results (their number
/// and types must exactly match those of the original return values of
/// the node), or leaves Results empty, which indicates that the node is not
/// to be custom lowered after all.
/// The default implementation calls LowerOperation.
virtual void LowerOperationWrapper(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const;
/// This callback is invoked for operations that are unsupported by the
/// target, which are registered to use 'custom' lowering, and whose defined
/// values are all legal. If the target has no operations that require custom
/// lowering, it need not implement this. The default implementation of this
/// aborts.
virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
/// This callback is invoked when a node result type is illegal for the
/// target, and the operation was registered to use 'custom' lowering for that
/// result type. The target places new result values for the node in Results
/// (their number and types must exactly match those of the original return
/// values of the node), or leaves Results empty, which indicates that the
/// node is not to be custom lowered after all.
///
/// If the target has no operations that require custom lowering, it need not
/// implement this. The default implementation aborts.
virtual void ReplaceNodeResults(SDNode * /*N*/,
                                SmallVectorImpl<SDValue> & /*Results*/,
                                SelectionDAG & /*DAG*/) const {
  // The base implementation must not be reached; all arguments are ignored.
  llvm_unreachable("ReplaceNodeResults not implemented for this target!");
}
/// This method returns the name of a target specific DAG node.
virtual const char *getTargetNodeName(unsigned Opcode) const;
/// This method returns a target specific FastISel object, or null if the
/// target does not support "fast" ISel.
virtual FastISel *createFastISel(FunctionLoweringInfo & /*FuncInfo*/,
                                 const TargetLibraryInfo * /*LibInfo*/) const {
  // The base implementation creates nothing and returns null.
  return nullptr;
}
bool verifyReturnAddressArgumentIsConstant(SDValue Op,
SelectionDAG &DAG) const;
//===--------------------------------------------------------------------===//
// Inline Asm Support hooks
//
/// This hook allows the target to expand an inline asm call to be explicit
/// llvm code if it wants to. This is useful for turning simple inline asms
/// into LLVM intrinsics, which gives the compiler more information about the
/// behavior of the code.
virtual bool ExpandInlineAsm(CallInst * /*CI*/) const {
  // The base implementation performs no expansion and reports false.
  return false;
}
/// Classification of an inline asm constraint. This is the result type of
/// getConstraintType() and the type of AsmOperandInfo::ConstraintType.
enum ConstraintType {
C_Register, // Constraint represents specific register(s).
C_RegisterClass, // Constraint represents any of register(s) in class.
C_Memory, // Memory constraint.
+ C_Immediate, // Requires an immediate.
C_Other, // Something else.
C_Unknown // Unsupported constraint.
};
/// Weight assigned to a constraint/operand pairing. This is the result type
/// of getMultipleConstraintMatchWeight() and getSingleConstraintMatchWeight().
/// The generic weights run from CW_Invalid (-1, no match) to CW_Best (3); the
/// "well-known" weights are aliases for those generic values.
enum ConstraintWeight {
// Generic weights.
CW_Invalid = -1, // No match.
CW_Okay = 0, // Acceptable.
CW_Good = 1, // Good weight.
CW_Better = 2, // Better weight.
CW_Best = 3, // Best weight.
// Well-known weights.
CW_SpecificReg = CW_Okay, // Specific register operands.
CW_Register = CW_Good, // Register operands.
CW_Memory = CW_Better, // Memory operands.
CW_Constant = CW_Best, // Constant operand.
CW_Default = CW_Okay // Default or don't know type.
};
/// This contains information for each constraint that we are lowering.
struct AsmOperandInfo : public InlineAsm::ConstraintInfo {
/// This contains the actual string for the code, like "m". TargetLowering
/// picks the 'best' code from ConstraintInfo::Codes that most closely
/// matches the operand.
std::string ConstraintCode;
/// Information about the constraint code, e.g. Register, RegisterClass,
/// Memory, Immediate, Other, Unknown. Starts out as C_Unknown.
TargetLowering::ConstraintType ConstraintType = TargetLowering::C_Unknown;
/// If this is the result output operand or a clobber, this is null,
/// otherwise it is the incoming operand to the CallInst. This gets
/// modified as the asm is processed.
Value *CallOperandVal = nullptr;
/// The ValueType for the operand value. Starts out as MVT::Other.
MVT ConstraintVT = MVT::Other;
/// Converting constructor: builds an AsmOperandInfo from a ConstraintInfo.
/// The argument is taken by value and moved into the base class; the
/// remaining members keep their default initializers.
AsmOperandInfo(InlineAsm::ConstraintInfo Info)
: InlineAsm::ConstraintInfo(std::move(Info)) {}
/// Return true if this is an input operand that is a matching constraint
/// like "4".
bool isMatchingInputConstraint() const;
/// If this is an input matching constraint, this method returns the output
/// operand it matches.
unsigned getMatchedOperand() const;
};
using AsmOperandInfoVector = std::vector<AsmOperandInfo>;
/// Split up the constraint string from the inline assembly value into the
/// specific constraints and their prefixes, and also tie in the associated
/// operand values. If this returns an empty vector, and if the constraint
/// string itself isn't empty, there was an error parsing.
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL,
const TargetRegisterInfo *TRI,
ImmutableCallSite CS) const;
/// Examine constraint type and operand type and determine a weight value.
/// The operand object must already have been set up with the operand type.
virtual ConstraintWeight getMultipleConstraintMatchWeight(
AsmOperandInfo &info, int maIndex) const;
/// Examine constraint string and operand type and determine a weight value.
/// The operand object must already have been set up with the operand type.
virtual ConstraintWeight getSingleConstraintMatchWeight(
AsmOperandInfo &info, const char *constraint) const;
/// Determines the constraint code and constraint type to use for the specific
/// AsmOperandInfo, setting OpInfo.ConstraintCode and OpInfo.ConstraintType.
/// If the actual operand being passed in is available, it can be passed in as
/// Op, otherwise an empty SDValue can be passed.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo,
SDValue Op,
SelectionDAG *DAG = nullptr) const;
/// Given a constraint, return the type of constraint it is for this target.
virtual ConstraintType getConstraintType(StringRef Constraint) const;
/// Given a physical register constraint (e.g. {edx}), return the register
/// number and the register class for the register.
///
/// Given a register class constraint, like 'r', if this corresponds directly
/// to an LLVM register class, return a register of 0 and the register class
/// pointer.
///
/// This should only be used for C_Register constraints. On error, this
/// returns a register number of 0 and a null register class pointer.
virtual std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint, MVT VT) const;
virtual unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const {
  // Translate a constraint code string into its InlineAsm::Constraint_*
  // identifier. Only "i" and "m" are recognised here; every other code maps
  // to Constraint_Unknown.
  if (ConstraintCode == "i")
    return InlineAsm::Constraint_i;
  if (ConstraintCode == "m")
    return InlineAsm::Constraint_m;
  return InlineAsm::Constraint_Unknown;
}
/// Try to replace an X constraint, which matches anything, with another that
/// has more specific requirements based on the type of the corresponding
/// operand. This returns null if there is no replacement to make.
virtual const char *LowerXConstraint(EVT ConstraintVT) const;
/// Lower the specified operand into the Ops vector. If it is invalid, don't
/// add anything to Ops.
virtual void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
std::vector<SDValue> &Ops,
SelectionDAG &DAG) const;
// Lower custom output constraints. If invalid, return SDValue().
virtual SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag,
SDLoc DL,
const AsmOperandInfo &OpInfo,
SelectionDAG &DAG) const;
//===--------------------------------------------------------------------===//
// Div utility functions
//
SDValue BuildSDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization,
SmallVectorImpl<SDNode *> &Created) const;
SDValue BuildUDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization,
SmallVectorImpl<SDNode *> &Created) const;
/// Targets may override this function to provide custom SDIV lowering for
/// power-of-2 denominators. If the target returns an empty SDValue, LLVM
/// assumes SDIV is expensive and replaces it with a series of other integer
/// operations.
virtual SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor,
SelectionDAG &DAG,
SmallVectorImpl<SDNode *> &Created) const;
/// Indicate whether this target prefers to combine FDIVs with the same
/// divisor. If the transform should never be done, return zero. If the
/// transform should be done, return the minimum number of divisor uses
/// that must exist.
virtual unsigned combineRepeatedFPDivisors() const {
  // Zero is the base implementation's answer.
  return 0;
}
/// Hooks for building estimates in place of slower divisions and square
/// roots.
/// Return either a square root or its reciprocal estimate value for the input
/// operand.
/// \p Enabled is a ReciprocalEstimate enum with value either 'Unspecified' or
/// 'Enabled' as set by a potential default override attribute.
/// If \p RefinementSteps is 'Unspecified', the number of Newton-Raphson
/// refinement iterations required to generate a sufficient (though not
/// necessarily IEEE-754 compliant) estimate is returned in that parameter.
/// The boolean UseOneConstNR output is used to select a Newton-Raphson
/// algorithm implementation that uses either one or two constants.
/// The boolean Reciprocal is used to select whether the estimate is for the
/// square root of the input operand or the reciprocal of its square root.
/// A target may choose to implement its own refinement within this function.
/// If that's true, then return '0' as the number of RefinementSteps to avoid
/// any further refinement of the estimate.
/// An empty SDValue return means no estimate sequence can be created.
virtual SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
                                int &RefinementSteps, bool &UseOneConstNR,
                                bool Reciprocal) const {
  // The base implementation builds no estimate: it returns an empty SDValue
  // and leaves both reference parameters untouched.
  return SDValue();
}
/// Return a reciprocal estimate value for the input operand.
/// \p Enabled is a ReciprocalEstimate enum with value either 'Unspecified' or
/// 'Enabled' as set by a potential default override attribute.
/// If \p RefinementSteps is 'Unspecified', the number of Newton-Raphson
/// refinement iterations required to generate a sufficient (though not
/// necessarily IEEE-754 compliant) estimate is returned in that parameter.
/// A target may choose to implement its own refinement within this function.
/// If that's true, then return '0' as the number of RefinementSteps to avoid
/// any further refinement of the estimate.
/// An empty SDValue return means no estimate sequence can be created.
virtual SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
                                 int &RefinementSteps) const {
  // The base implementation builds no estimate: it returns an empty SDValue
  // and leaves RefinementSteps untouched.
  return SDValue();
}
//===--------------------------------------------------------------------===//
// Legalization utility functions
//
/// Expand a MUL or [US]MUL_LOHI of n-bit values into two or four nodes,
/// respectively, each computing an n/2-bit part of the result.
/// \param Result A vector that will be filled with the parts of the result
/// in little-endian order.
/// \param LL Low bits of the LHS of the MUL. You can use this parameter
/// if you want to control how low bits are extracted from the LHS.
/// \param LH High bits of the LHS of the MUL. See LL for meaning.
/// \param RL Low bits of the RHS of the MUL. See LL for meaning
/// \param RH High bits of the RHS of the MUL. See LL for meaning.
/// \returns true if the node has been expanded, false if it has not
bool expandMUL_LOHI(unsigned Opcode, EVT VT, SDLoc dl, SDValue LHS,
SDValue RHS, SmallVectorImpl<SDValue> &Result, EVT HiLoVT,
SelectionDAG &DAG, MulExpansionKind Kind,
SDValue LL = SDValue(), SDValue LH = SDValue(),
SDValue RL = SDValue(), SDValue RH = SDValue()) const;
/// Expand a MUL into two nodes. One that computes the high bits of
/// the result and one that computes the low bits.
/// \param HiLoVT The value type to use for the Lo and Hi nodes.
/// \param LL Low bits of the LHS of the MUL. You can use this parameter
/// if you want to control how low bits are extracted from the LHS.
/// \param LH High bits of the LHS of the MUL. See LL for meaning.
/// \param RL Low bits of the RHS of the MUL. See LL for meaning
/// \param RH High bits of the RHS of the MUL. See LL for meaning.
/// \returns true if the node has been expanded. false if it has not
bool expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
SelectionDAG &DAG, MulExpansionKind Kind,
SDValue LL = SDValue(), SDValue LH = SDValue(),
SDValue RL = SDValue(), SDValue RH = SDValue()) const;
/// Expand funnel shift.
/// \param N Node to expand
/// \param Result output after conversion
/// \returns True, if the expansion was successful, false otherwise
bool expandFunnelShift(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
/// Expand rotations.
/// \param N Node to expand
/// \param Result output after conversion
/// \returns True, if the expansion was successful, false otherwise
bool expandROT(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
/// Expand float(f32) to SINT(i64) conversion
/// \param N Node to expand
/// \param Result output after conversion
/// \returns True, if the expansion was successful, false otherwise
bool expandFP_TO_SINT(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
/// Expand float to UINT conversion
/// \param N Node to expand
/// \param Result output after conversion
/// \returns True, if the expansion was successful, false otherwise
bool expandFP_TO_UINT(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
/// Expand UINT(i64) to double(f64) conversion
/// \param N Node to expand
/// \param Result output after conversion
/// \returns True, if the expansion was successful, false otherwise
bool expandUINT_TO_FP(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
/// Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const;
/// Expand CTPOP nodes. Expands vector/scalar CTPOP nodes,
/// vector nodes can only succeed if all operations are legal/custom.
/// \param N Node to expand
/// \param Result output after conversion
/// \returns True, if the expansion was successful, false otherwise
bool expandCTPOP(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
/// Expand CTLZ/CTLZ_ZERO_UNDEF nodes. Expands vector/scalar CTLZ nodes,
/// vector nodes can only succeed if all operations are legal/custom.
/// \param N Node to expand
/// \param Result output after conversion
/// \returns True, if the expansion was successful, false otherwise
bool expandCTLZ(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
/// Expand CTTZ/CTTZ_ZERO_UNDEF nodes. Expands vector/scalar CTTZ nodes,
/// vector nodes can only succeed if all operations are legal/custom.
/// \param N Node to expand
/// \param Result output after conversion
/// \returns True, if the expansion was successful, false otherwise
bool expandCTTZ(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
/// Expand ABS nodes. Expands vector/scalar ABS nodes,
/// vector nodes can only succeed if all operations are legal/custom.
/// (ABS x) -> (XOR (ADD x, (SRA x, type_size)), (SRA x, type_size))
/// \param N Node to expand
/// \param Result output after conversion
/// \returns True, if the expansion was successful, false otherwise
bool expandABS(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
/// Turn load of vector type into a load of the individual elements.
/// \param LD load to expand
/// \returns MERGE_VALUEs of the scalar loads with their chains.
SDValue scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const;
/// Turn a store of a vector type into stores of the individual elements.
/// \param ST Store with a vector value type
/// \returns MERGE_VALUEs of the individual store chains.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const;
/// Expands an unaligned load to 2 half-size loads for an integer, and
/// possibly more for vectors.
std::pair<SDValue, SDValue> expandUnalignedLoad(LoadSDNode *LD,
SelectionDAG &DAG) const;
/// Expands an unaligned store to 2 half-size stores for integer values, and
/// possibly more for vectors.
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const;
/// Increments memory address \p Addr according to the type of the value
/// \p DataVT that should be stored. If the data is stored in compressed
/// form, the memory address should be incremented according to the number of
/// the stored elements. This number is equal to the number of '1's bits
/// in the \p Mask.
/// \p DataVT is a vector type. \p Mask is a vector value.
/// \p DataVT and \p Mask have the same number of vector elements.
SDValue IncrementMemoryAddress(SDValue Addr, SDValue Mask, const SDLoc &DL,
EVT DataVT, SelectionDAG &DAG,
bool IsCompressedMemory) const;
/// Get a pointer to vector element \p Idx located in memory for a vector of
/// type \p VecVT starting at a base address of \p VecPtr. If \p Idx is out of
/// bounds the returned pointer is unspecified, but will be within the vector
/// bounds.
SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT,
SDValue Index) const;
/// Method for building the DAG expansion of ISD::[US][ADD|SUB]SAT. This
/// method accepts integers as its arguments.
SDValue expandAddSubSat(SDNode *Node, SelectionDAG &DAG) const;
/// Method for building the DAG expansion of ISD::SMULFIX. This method accepts
/// integers as its arguments.
SDValue expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const;
/// Method for building the DAG expansion of ISD::U(ADD|SUB)O. Expansion
/// always succeeds and populates the Result and Overflow arguments.
void expandUADDSUBO(SDNode *Node, SDValue &Result, SDValue &Overflow,
SelectionDAG &DAG) const;
/// Method for building the DAG expansion of ISD::S(ADD|SUB)O. Expansion
/// always succeeds and populates the Result and Overflow arguments.
void expandSADDSUBO(SDNode *Node, SDValue &Result, SDValue &Overflow,
SelectionDAG &DAG) const;
/// Method for building the DAG expansion of ISD::[US]MULO. Returns whether
/// expansion was successful and populates the Result and Overflow arguments.
bool expandMULO(SDNode *Node, SDValue &Result, SDValue &Overflow,
SelectionDAG &DAG) const;
/// Expand a VECREDUCE_* into an explicit calculation. If Count is specified,
/// only the first Count elements of the vector are used.
SDValue expandVecReduce(SDNode *Node, SelectionDAG &DAG) const;
//===--------------------------------------------------------------------===//
// Instruction Emitting Hooks
//
/// This method should be implemented by targets that mark instructions with
/// the 'usesCustomInserter' flag. These instructions are special in various
/// ways, which require special support to insert. The specified MachineInstr
/// is created but not inserted into any basic blocks, and this method is
/// called to expand it into a sequence of instructions, potentially also
/// creating new basic blocks and control flow.
/// As long as the returned basic block is different (i.e., we created a new
/// one), the custom inserter is free to modify the rest of \p MBB.
virtual MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const;
/// This method should be implemented by targets that mark instructions with
/// the 'hasPostISelHook' flag. These instructions must be adjusted after
/// instruction selection by target hooks. e.g. To fill in optional defs for
/// ARM 's' setting instructions.
virtual void AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const;
/// If this function returns true, SelectionDAGBuilder emits a
/// LOAD_STACK_GUARD node when it is lowering Intrinsic::stackprotector.
virtual bool useLoadStackGuardNode() const {
  // The base implementation answers "no".
  return false;
}
virtual SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
                                    const SDLoc &DL) const {
  // The base implementation must not be reached; all arguments are ignored.
  llvm_unreachable("not implemented for this target");
}
/// Lower TLS global address SDNode for target independent emulated TLS model.
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA,
SelectionDAG &DAG) const;
/// Expands target specific indirect branch for the case of JumpTable
/// expansion.
virtual SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value,
                                       SDValue Addr,
                                       SelectionDAG &DAG) const {
  // Default expansion: a single ISD::BRIND node of type MVT::Other with
  // Value and Addr as its operands.
  SDValue IndirectBranch =
      DAG.getNode(ISD::BRIND, dl, MVT::Other, Value, Addr);
  return IndirectBranch;
}
// seteq(x, 0) -> truncate(srl(ctlz(zext(x)), log2(#bits)))
// If we're comparing for equality to zero and isCtlzFast is true, expose the
// fact that this can be implemented as a ctlz/srl pair, so that the dag
// combiner can fold the new nodes.
SDValue lowerCmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) const;
private:
SDValue foldSetCCWithAnd(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
const SDLoc &DL, DAGCombinerInfo &DCI) const;
SDValue foldSetCCWithBinOp(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
const SDLoc &DL, DAGCombinerInfo &DCI) const;
SDValue optimizeSetCCOfSignedTruncationCheck(EVT SCCVT, SDValue N0,
SDValue N1, ISD::CondCode Cond,
DAGCombinerInfo &DCI,
const SDLoc &DL) const;
SDValue prepareUREMEqFold(EVT SETCCVT, SDValue REMNode,
SDValue CompTargetNode, ISD::CondCode Cond,
DAGCombinerInfo &DCI, const SDLoc &DL,
SmallVectorImpl<SDNode *> &Created) const;
SDValue buildUREMEqFold(EVT SETCCVT, SDValue REMNode, SDValue CompTargetNode,
ISD::CondCode Cond, DAGCombinerInfo &DCI,
const SDLoc &DL) const;
};
/// Given an LLVM IR type and return type attributes, compute the return value
/// EVTs and flags, and optionally also the offsets, if the return value is
/// being lowered to memory.
void GetReturnInfo(CallingConv::ID CC, Type *ReturnType, AttributeList attr,
SmallVectorImpl<ISD::OutputArg> &Outs,
const TargetLowering &TLI, const DataLayout &DL);
} // end namespace llvm
#endif // LLVM_CODEGEN_TARGETLOWERING_H
Index: vendor/llvm/dist-release_90/include/llvm/ExecutionEngine/Orc/LambdaResolver.h
===================================================================
--- vendor/llvm/dist-release_90/include/llvm/ExecutionEngine/Orc/LambdaResolver.h (revision 351302)
+++ vendor/llvm/dist-release_90/include/llvm/ExecutionEngine/Orc/LambdaResolver.h (revision 351303)
@@ -1,83 +1,84 @@
//===- LambdaResolverMM - Redirect symbol lookup via a functor --*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Defines a RuntimeDyld::SymbolResolver subclass that uses a user-supplied
// functor for symbol resolution.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_EXECUTIONENGINE_ORC_LAMBDARESOLVER_H
#define LLVM_EXECUTIONENGINE_ORC_LAMBDARESOLVER_H
#include "llvm/ADT/STLExtras.h"
#include "llvm/ExecutionEngine/JITSymbol.h"
+#include "llvm/ExecutionEngine/OrcV1Deprecation.h"
#include <memory>
namespace llvm {
namespace orc {
/// A LegacyJITSymbolResolver whose two lookup entry points forward to
/// user-supplied functors.
///
/// \tparam DylibLookupFtorT    Callable invoked by findSymbolInLogicalDylib
///                             with the symbol name (const std::string &);
///                             its result is returned as a JITSymbol.
/// \tparam ExternalLookupFtorT Callable invoked by findSymbol with the symbol
///                             name; its result is returned as a JITSymbol.
template <typename DylibLookupFtorT, typename ExternalLookupFtorT>
class LambdaResolver : public LegacyJITSymbolResolver {
public:
  /// Deprecated constructor (declared here, defined out of line). Using it
  /// triggers the deprecation diagnostic below.
  LLVM_ATTRIBUTE_DEPRECATED(
      LambdaResolver(DylibLookupFtorT DylibLookupFtor,
                     ExternalLookupFtorT ExternalLookupFtor),
      "ORCv1 utilities (including resolvers) are deprecated and will be "
      "removed "
      "in the next release. Please use ORCv2 (see docs/ORCv2.rst)");

  /// Constructor for callers that pass an ORCv1DeprecationAcknowledgement
  /// tag; it is not marked deprecated.
  ///
  /// Both functors are taken by value and moved into the members, so no
  /// second copy is made and move-only functors are accepted.
  LambdaResolver(ORCv1DeprecationAcknowledgement,
                 DylibLookupFtorT DylibLookupFtor,
                 ExternalLookupFtorT ExternalLookupFtor)
      : DylibLookupFtor(std::move(DylibLookupFtor)),
        ExternalLookupFtor(std::move(ExternalLookupFtor)) {}

  /// Forward the lookup of \p Name to the dylib lookup functor.
  JITSymbol findSymbolInLogicalDylib(const std::string &Name) final {
    return DylibLookupFtor(Name);
  }

  /// Forward the lookup of \p Name to the external lookup functor.
  JITSymbol findSymbol(const std::string &Name) final {
    return ExternalLookupFtor(Name);
  }

private:
  DylibLookupFtorT DylibLookupFtor;       // Backs findSymbolInLogicalDylib.
  ExternalLookupFtorT ExternalLookupFtor; // Backs findSymbol.
};
/// Out-of-line definition of the deprecated LambdaResolver constructor.
///
/// Both functors arrive by value and are moved into the members, which avoids
/// a second copy and allows move-only functors.
template <typename DylibLookupFtorT, typename ExternalLookupFtorT>
LambdaResolver<DylibLookupFtorT, ExternalLookupFtorT>::LambdaResolver(
    DylibLookupFtorT DylibLookupFtor, ExternalLookupFtorT ExternalLookupFtor)
    : DylibLookupFtor(std::move(DylibLookupFtor)),
      ExternalLookupFtor(std::move(ExternalLookupFtor)) {}
/// Create a shared LambdaResolver from the two lookup functors, using the
/// two-argument (deprecated) LambdaResolver constructor.
template <typename DylibLookupFtorT, typename ExternalLookupFtorT>
std::shared_ptr<LambdaResolver<DylibLookupFtorT, ExternalLookupFtorT>>
createLambdaResolver(DylibLookupFtorT DylibLookupFtor,
                     ExternalLookupFtorT ExternalLookupFtor) {
  using ResolverT = LambdaResolver<DylibLookupFtorT, ExternalLookupFtorT>;
  // The resolver is built as a unique_ptr and then converted to the
  // shared_ptr return type.
  return std::shared_ptr<ResolverT>(make_unique<ResolverT>(
      std::move(DylibLookupFtor), std::move(ExternalLookupFtor)));
}
/// Create a shared LambdaResolver from the two lookup functors for callers
/// that pass an ORCv1DeprecationAcknowledgement tag; the tag is forwarded to
/// the matching LambdaResolver constructor.
template <typename DylibLookupFtorT, typename ExternalLookupFtorT>
std::shared_ptr<LambdaResolver<DylibLookupFtorT, ExternalLookupFtorT>>
createLambdaResolver(ORCv1DeprecationAcknowledgement,
                     DylibLookupFtorT DylibLookupFtor,
                     ExternalLookupFtorT ExternalLookupFtor) {
  using ResolverT = LambdaResolver<DylibLookupFtorT, ExternalLookupFtorT>;
  // The resolver is built as a unique_ptr and then converted to the
  // shared_ptr return type.
  return std::shared_ptr<ResolverT>(make_unique<ResolverT>(
      AcknowledgeORCv1Deprecation, std::move(DylibLookupFtor),
      std::move(ExternalLookupFtor)));
}
} // end namespace orc
} // end namespace llvm
#endif // LLVM_EXECUTIONENGINE_ORC_LAMBDARESOLVER_H
Index: vendor/llvm/dist-release_90/include/llvm/MC/MCContext.h
===================================================================
--- vendor/llvm/dist-release_90/include/llvm/MC/MCContext.h (revision 351302)
+++ vendor/llvm/dist-release_90/include/llvm/MC/MCContext.h (revision 351303)
@@ -1,754 +1,767 @@
//===- MCContext.h - Machine Code Context -----------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_MC_MCCONTEXT_H
#define LLVM_MC_MCCONTEXT_H
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/BinaryFormat/XCOFF.h"
#include "llvm/MC/MCAsmMacro.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/SectionKind.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/MD5.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>
namespace llvm {
class CodeViewContext;
class MCAsmInfo;
class MCLabel;
class MCObjectFileInfo;
class MCRegisterInfo;
class MCSection;
class MCSectionCOFF;
class MCSectionELF;
class MCSectionMachO;
class MCSectionWasm;
class MCSectionXCOFF;
class MCStreamer;
class MCSymbol;
class MCSymbolELF;
class MCSymbolWasm;
class SMLoc;
class SourceMgr;
/// Context object for machine code objects. This class owns all of the
/// sections that it creates.
///
class MCContext {
public:
using SymbolTable = StringMap<MCSymbol *, BumpPtrAllocator &>;
private:
/// The SourceMgr for this object, if any.
const SourceMgr *SrcMgr;
/// The SourceMgr for inline assembly, if any.
SourceMgr *InlineSrcMgr;
/// The MCAsmInfo for this target.
const MCAsmInfo *MAI;
/// The MCRegisterInfo for this target.
const MCRegisterInfo *MRI;
/// The MCObjectFileInfo for this target.
const MCObjectFileInfo *MOFI;
std::unique_ptr<CodeViewContext> CVContext;
/// Allocator object used for creating machine code objects.
///
/// We use a bump pointer allocator to avoid the need to track all allocated
/// objects.
BumpPtrAllocator Allocator;
SpecificBumpPtrAllocator<MCSectionCOFF> COFFAllocator;
SpecificBumpPtrAllocator<MCSectionELF> ELFAllocator;
SpecificBumpPtrAllocator<MCSectionMachO> MachOAllocator;
SpecificBumpPtrAllocator<MCSectionWasm> WasmAllocator;
SpecificBumpPtrAllocator<MCSectionXCOFF> XCOFFAllocator;
/// Bindings of names to symbols.
SymbolTable Symbols;
/// A mapping from a local label number and an instance count to a symbol.
/// For example, in the assembly
/// 1:
/// 2:
/// 1:
/// We have three labels represented by the pairs (1, 0), (2, 0) and (1, 1)
DenseMap<std::pair<unsigned, unsigned>, MCSymbol *> LocalSymbols;
/// Keeps track of names that were used both for user declared and
/// artificial symbols. The value is "true" if the name has been used for a
/// non-section symbol (there can be at most one of those, plus an unlimited
/// number of section symbols with the same name).
StringMap<bool, BumpPtrAllocator &> UsedNames;
+ /// Keeps track of labels that are used in inline assembly.
+ SymbolTable InlineAsmUsedLabelNames;
+
/// The next ID to dole out to an unnamed assembler temporary symbol with
/// a given prefix.
StringMap<unsigned> NextID;
/// Instances of directional local labels.
DenseMap<unsigned, MCLabel *> Instances;
/// NextInstance() creates the next instance of the directional local label
/// for the LocalLabelVal and adds it to the map if needed.
unsigned NextInstance(unsigned LocalLabelVal);
/// GetInstance() gets the current instance of the directional local label
/// for the LocalLabelVal and adds it to the map if needed.
unsigned GetInstance(unsigned LocalLabelVal);
/// The file name of the log file from the environment variable
/// AS_SECURE_LOG_FILE. Which must be set before the .secure_log_unique
/// directive is used or it is an error.
char *SecureLogFile;
/// The stream that gets written to for the .secure_log_unique directive.
std::unique_ptr<raw_fd_ostream> SecureLog;
/// Boolean toggled when .secure_log_unique / .secure_log_reset is seen to
/// catch errors if .secure_log_unique appears twice without
/// .secure_log_reset appearing between them.
bool SecureLogUsed = false;
/// The compilation directory to use for DW_AT_comp_dir.
SmallString<128> CompilationDir;
/// Prefix replacement map for source file information.
std::map<const std::string, const std::string> DebugPrefixMap;
/// The main file name if passed in explicitly.
std::string MainFileName;
/// The dwarf file and directory tables from the dwarf .file directive.
/// We now emit a line table for each compile unit. To reduce the prologue
/// size of each line table, the files and directories used by each compile
/// unit are separated.
std::map<unsigned, MCDwarfLineTable> MCDwarfLineTablesCUMap;
/// The current dwarf line information from the last dwarf .loc directive.
MCDwarfLoc CurrentDwarfLoc;
bool DwarfLocSeen = false;
/// Generate dwarf debugging info for assembly source files.
bool GenDwarfForAssembly = false;
/// The current dwarf file number when generate dwarf debugging info for
/// assembly source files.
unsigned GenDwarfFileNumber = 0;
/// Sections for generating the .debug_ranges and .debug_aranges sections.
SetVector<MCSection *> SectionsForRanges;
/// The information gathered from labels that will have dwarf label
/// entries when generating dwarf assembly source files.
std::vector<MCGenDwarfLabelEntry> MCGenDwarfLabelEntries;
/// The string to embed in the debug information for the compile unit, if
/// non-empty.
StringRef DwarfDebugFlags;
/// The string to embed in as the dwarf AT_producer for the compile unit, if
/// non-empty.
StringRef DwarfDebugProducer;
/// The maximum version of dwarf that we should emit.
uint16_t DwarfVersion = 4;
/// Honor temporary labels, this is useful for debugging semantic
/// differences between temporary and non-temporary labels (primarily on
/// Darwin).
bool AllowTemporaryLabels = true;
bool UseNamesOnTempLabels = true;
/// The Compile Unit ID that we are currently processing.
unsigned DwarfCompileUnitID = 0;
/// Key type of ELFUniquingMap. Keys are ordered by section name, then by
/// group name, then by unique ID.
struct ELFSectionKey {
  std::string SectionName;
  StringRef GroupName;
  unsigned UniqueID;

  ELFSectionKey(StringRef SectionName, StringRef GroupName, unsigned UniqueID)
      : SectionName(SectionName), GroupName(GroupName), UniqueID(UniqueID) {}

  /// Lexicographic comparison over (SectionName, GroupName, UniqueID).
  bool operator<(const ELFSectionKey &Other) const {
    return std::tie(SectionName, GroupName, UniqueID) <
           std::tie(Other.SectionName, Other.GroupName, Other.UniqueID);
  }
};
/// Key type of COFFUniquingMap. Keys are ordered by section name, then by
/// group name, then by selection key, then by unique ID.
struct COFFSectionKey {
  std::string SectionName;
  StringRef GroupName;
  int SelectionKey;
  unsigned UniqueID;

  COFFSectionKey(StringRef SectionName, StringRef GroupName, int SelectionKey,
                 unsigned UniqueID)
      : SectionName(SectionName), GroupName(GroupName),
        SelectionKey(SelectionKey), UniqueID(UniqueID) {}

  /// Lexicographic comparison over
  /// (SectionName, GroupName, SelectionKey, UniqueID).
  bool operator<(const COFFSectionKey &Other) const {
    return std::tie(SectionName, GroupName, SelectionKey, UniqueID) <
           std::tie(Other.SectionName, Other.GroupName, Other.SelectionKey,
                    Other.UniqueID);
  }
};
/// Key type of WasmUniquingMap. Keys are ordered by section name, then by
/// group name, then by unique ID.
struct WasmSectionKey {
  std::string SectionName;
  StringRef GroupName;
  unsigned UniqueID;

  WasmSectionKey(StringRef SectionName, StringRef GroupName, unsigned UniqueID)
      : SectionName(SectionName), GroupName(GroupName), UniqueID(UniqueID) {}

  /// Lexicographic comparison over (SectionName, GroupName, UniqueID).
  bool operator<(const WasmSectionKey &Other) const {
    return std::tie(SectionName, GroupName, UniqueID) <
           std::tie(Other.SectionName, Other.GroupName, Other.UniqueID);
  }
};
/// Key type of XCOFFUniquingMap. Keys are ordered by section name, then by
/// storage mapping class.
struct XCOFFSectionKey {
  std::string SectionName;
  XCOFF::StorageMappingClass MappingClass;

  XCOFFSectionKey(StringRef SectionName,
                  XCOFF::StorageMappingClass MappingClass)
      : SectionName(SectionName), MappingClass(MappingClass) {}

  /// Lexicographic comparison over (SectionName, MappingClass).
  bool operator<(const XCOFFSectionKey &Other) const {
    const auto Mine = std::tie(SectionName, MappingClass);
    const auto Theirs = std::tie(Other.SectionName, Other.MappingClass);
    return Mine < Theirs;
  }
};
StringMap<MCSectionMachO *> MachOUniquingMap;
std::map<ELFSectionKey, MCSectionELF *> ELFUniquingMap;
std::map<COFFSectionKey, MCSectionCOFF *> COFFUniquingMap;
std::map<WasmSectionKey, MCSectionWasm *> WasmUniquingMap;
std::map<XCOFFSectionKey, MCSectionXCOFF *> XCOFFUniquingMap;
StringMap<bool> RelSecNames;
SpecificBumpPtrAllocator<MCSubtargetInfo> MCSubtargetAllocator;
/// Do automatic reset in destructor
bool AutoReset;
bool HadError = false;
MCSymbol *createSymbolImpl(const StringMapEntry<bool> *Name,
bool CanBeUnnamed);
MCSymbol *createSymbol(StringRef Name, bool AlwaysAddSuffix,
bool IsTemporary);
MCSymbol *getOrCreateDirectionalLocalSymbol(unsigned LocalLabelVal,
unsigned Instance);
MCSectionELF *createELFSectionImpl(StringRef Section, unsigned Type,
unsigned Flags, SectionKind K,
unsigned EntrySize,
const MCSymbolELF *Group,
unsigned UniqueID,
const MCSymbolELF *Associated);
/// Map of currently defined macros.
StringMap<MCAsmMacro> MacroMap;
public:
explicit MCContext(const MCAsmInfo *MAI, const MCRegisterInfo *MRI,
const MCObjectFileInfo *MOFI,
const SourceMgr *Mgr = nullptr, bool DoAutoReset = true);
MCContext(const MCContext &) = delete;
MCContext &operator=(const MCContext &) = delete;
~MCContext();
const SourceMgr *getSourceManager() const { return SrcMgr; }
void setInlineSourceManager(SourceMgr *SM) { InlineSrcMgr = SM; }
const MCAsmInfo *getAsmInfo() const { return MAI; }
const MCRegisterInfo *getRegisterInfo() const { return MRI; }
const MCObjectFileInfo *getObjectFileInfo() const { return MOFI; }
CodeViewContext &getCVContext();
void setAllowTemporaryLabels(bool Value) { AllowTemporaryLabels = Value; }
void setUseNamesOnTempLabels(bool Value) { UseNamesOnTempLabels = Value; }
/// \name Module Lifetime Management
/// @{
/// reset - return object to right after construction state to prepare
/// to process a new module
void reset();
/// @}
/// \name Symbol Management
/// @{
/// Create and return a new linker temporary symbol with a unique but
/// unspecified name.
MCSymbol *createLinkerPrivateTempSymbol();
/// Create and return a new assembler temporary symbol with a unique but
/// unspecified name.
MCSymbol *createTempSymbol(bool CanBeUnnamed = true);
MCSymbol *createTempSymbol(const Twine &Name, bool AlwaysAddSuffix,
bool CanBeUnnamed = true);
/// Create the definition of a directional local symbol for numbered label
/// (used for "1:" definitions).
MCSymbol *createDirectionalLocalSymbol(unsigned LocalLabelVal);
/// Create and return a directional local symbol for numbered label (used
/// for "1b" or "1f" references).
MCSymbol *getDirectionalLocalSymbol(unsigned LocalLabelVal, bool Before);
/// Lookup the symbol inside with the specified \p Name. If it exists,
/// return it. If not, create a forward reference and return it.
///
/// \param Name - The symbol name, which must be unique across all symbols.
MCSymbol *getOrCreateSymbol(const Twine &Name);
/// Gets a symbol that will be defined to the final stack offset of a local
/// variable after codegen.
///
/// \param Idx - The index of a local variable passed to \@llvm.localescape.
MCSymbol *getOrCreateFrameAllocSymbol(StringRef FuncName, unsigned Idx);
MCSymbol *getOrCreateParentFrameOffsetSymbol(StringRef FuncName);
MCSymbol *getOrCreateLSDASymbol(StringRef FuncName);
/// Get the symbol for \p Name, or null.
MCSymbol *lookupSymbol(const Twine &Name) const;
/// Set value for a symbol.
void setSymbolValue(MCStreamer &Streamer, StringRef Sym, uint64_t Val);
/// getSymbols - Get a reference for the symbol table for clients that
/// want to, for example, iterate over all symbols. 'const' because we
/// still want any modifications to the table itself to use the MCContext
/// APIs.
const SymbolTable &getSymbols() const { return Symbols; }
+
+ /// getInlineAsmLabel - Return the symbol found by looking \p Name up in
+ /// InlineAsmUsedLabelNames, the table of labels used in inline assembly.
+ /// NOTE(review): presumably null when no such label has been registered;
+ /// confirm against the SymbolTable lookup semantics.
+ MCSymbol *getInlineAsmLabel(StringRef Name) const {
+ return InlineAsmUsedLabelNames.lookup(Name);
+ }
+
+ /// registerInlineAsmLabel - Records that the name is a label referenced in
+ /// inline assembly.
+ void registerInlineAsmLabel(MCSymbol *Sym);
/// @}
/// \name Section Management
/// @{
enum : unsigned {
/// Pass this value as the UniqueID during section creation to get the
/// generic section with the given name and characteristics. The usual
/// sections such as .text use this ID.
GenericSectionID = ~0U
};
/// Return the MCSection for the specified mach-o section. This requires
/// the operands to be valid.
MCSectionMachO *getMachOSection(StringRef Segment, StringRef Section,
unsigned TypeAndAttributes,
unsigned Reserved2, SectionKind K,
const char *BeginSymName = nullptr);
MCSectionMachO *getMachOSection(StringRef Segment, StringRef Section,
unsigned TypeAndAttributes, SectionKind K,
const char *BeginSymName = nullptr) {
return getMachOSection(Segment, Section, TypeAndAttributes, 0, K,
BeginSymName);
}
MCSectionELF *getELFSection(const Twine &Section, unsigned Type,
unsigned Flags) {
return getELFSection(Section, Type, Flags, 0, "");
}
/// Convenience overload: forwards to the overload that also takes a
/// UniqueID, passing the generic section ID.
MCSectionELF *getELFSection(const Twine &Section, unsigned Type,
                            unsigned Flags, unsigned EntrySize,
                            const Twine &Group) {
  return getELFSection(Section, Type, Flags, EntrySize, Group,
                       GenericSectionID);
}
MCSectionELF *getELFSection(const Twine &Section, unsigned Type,
unsigned Flags, unsigned EntrySize,
const Twine &Group, unsigned UniqueID) {
return getELFSection(Section, Type, Flags, EntrySize, Group, UniqueID,
nullptr);
}
MCSectionELF *getELFSection(const Twine &Section, unsigned Type,
unsigned Flags, unsigned EntrySize,
const Twine &Group, unsigned UniqueID,
const MCSymbolELF *Associated);
MCSectionELF *getELFSection(const Twine &Section, unsigned Type,
unsigned Flags, unsigned EntrySize,
const MCSymbolELF *Group, unsigned UniqueID,
const MCSymbolELF *Associated);
/// Get a section with the provided group identifier. This section is
/// named by concatenating \p Prefix with '.' then \p Suffix. The \p Type
/// describes the type of the section and \p Flags are used to further
/// configure this named section.
MCSectionELF *getELFNamedSection(const Twine &Prefix, const Twine &Suffix,
unsigned Type, unsigned Flags,
unsigned EntrySize = 0);
MCSectionELF *createELFRelSection(const Twine &Name, unsigned Type,
unsigned Flags, unsigned EntrySize,
const MCSymbolELF *Group,
const MCSectionELF *RelInfoSection);
void renameELFSection(MCSectionELF *Section, StringRef Name);
MCSectionELF *createELFGroupSection(const MCSymbolELF *Group);
MCSectionCOFF *getCOFFSection(StringRef Section, unsigned Characteristics,
SectionKind Kind, StringRef COMDATSymName,
int Selection,
unsigned UniqueID = GenericSectionID,
const char *BeginSymName = nullptr);
MCSectionCOFF *getCOFFSection(StringRef Section, unsigned Characteristics,
SectionKind Kind,
const char *BeginSymName = nullptr);
/// Gets or creates a section equivalent to Sec that is associated with the
/// section containing KeySym. For example, to create a debug info section
/// associated with an inline function, pass the normal debug info section
/// as Sec and the function symbol as KeySym.
MCSectionCOFF *
getAssociativeCOFFSection(MCSectionCOFF *Sec, const MCSymbol *KeySym,
unsigned UniqueID = GenericSectionID);
MCSectionWasm *getWasmSection(const Twine &Section, SectionKind K) {
return getWasmSection(Section, K, nullptr);
}
/// Convenience overload: forwards to the group-name overload with an empty
/// group name and the generic section ID.
MCSectionWasm *getWasmSection(const Twine &Section, SectionKind K,
                              const char *BeginSymName) {
  return getWasmSection(Section, K, "", GenericSectionID, BeginSymName);
}
MCSectionWasm *getWasmSection(const Twine &Section, SectionKind K,
const Twine &Group, unsigned UniqueID) {
return getWasmSection(Section, K, Group, UniqueID, nullptr);
}
MCSectionWasm *getWasmSection(const Twine &Section, SectionKind K,
const Twine &Group, unsigned UniqueID,
const char *BeginSymName);
MCSectionWasm *getWasmSection(const Twine &Section, SectionKind K,
const MCSymbolWasm *Group, unsigned UniqueID,
const char *BeginSymName);
MCSectionXCOFF *getXCOFFSection(StringRef Section,
XCOFF::StorageMappingClass MappingClass,
SectionKind K,
const char *BeginSymName = nullptr);
// Create and save a copy of STI and return a reference to the copy.
MCSubtargetInfo &getSubtargetCopy(const MCSubtargetInfo &STI);
/// @}
/// \name Dwarf Management
/// @{
/// Get the compilation directory for DW_AT_comp_dir
/// The compilation directory should be set with \c setCompilationDir before
/// calling this function. If it is unset, an empty string will be returned.
StringRef getCompilationDir() const { return CompilationDir; }
/// Set the compilation directory for DW_AT_comp_dir
void setCompilationDir(StringRef S) { CompilationDir = S.str(); }
/// Add an entry to the debug prefix map.
void addDebugPrefixMapEntry(const std::string &From, const std::string &To);
// Remaps all debug directory paths in-place as per the debug prefix map.
void RemapDebugPaths();
/// Get the main file name for use in error messages and debug
/// info. This can be set to ensure we've got the correct file name
/// after preprocessing or for -save-temps.
const std::string &getMainFileName() const { return MainFileName; }
/// Set the main file name and override the default.
void setMainFileName(StringRef S) { MainFileName = S; }
/// Creates an entry in the dwarf file and directory tables.
Expected<unsigned> getDwarfFile(StringRef Directory, StringRef FileName,
unsigned FileNumber,
Optional<MD5::MD5Result> Checksum,
Optional<StringRef> Source, unsigned CUID);
bool isValidDwarfFileNumber(unsigned FileNumber, unsigned CUID = 0);
const std::map<unsigned, MCDwarfLineTable> &getMCDwarfLineTables() const {
return MCDwarfLineTablesCUMap;
}
MCDwarfLineTable &getMCDwarfLineTable(unsigned CUID) {
return MCDwarfLineTablesCUMap[CUID];
}
/// Read-only access to the line table stored for \p CUID. Unlike the
/// non-const overload, this never creates an entry; it asserts that one
/// already exists.
const MCDwarfLineTable &getMCDwarfLineTable(unsigned CUID) const {
  const auto Entry = MCDwarfLineTablesCUMap.find(CUID);
  assert(Entry != MCDwarfLineTablesCUMap.end());
  return Entry->second;
}
const SmallVectorImpl<MCDwarfFile> &getMCDwarfFiles(unsigned CUID = 0) {
return getMCDwarfLineTable(CUID).getMCDwarfFiles();
}
const SmallVectorImpl<std::string> &getMCDwarfDirs(unsigned CUID = 0) {
return getMCDwarfLineTable(CUID).getMCDwarfDirs();
}
unsigned getDwarfCompileUnitID() { return DwarfCompileUnitID; }
void setDwarfCompileUnitID(unsigned CUIndex) {
DwarfCompileUnitID = CUIndex;
}
/// Specifies the "root" file and directory of the compilation unit.
/// These are "file 0" and "directory 0" in DWARF v5.
void setMCLineTableRootFile(unsigned CUID, StringRef CompilationDir,
StringRef Filename,
Optional<MD5::MD5Result> Checksum,
Optional<StringRef> Source) {
getMCDwarfLineTable(CUID).setRootFile(CompilationDir, Filename, Checksum,
Source);
}
/// Reports whether MD5 checksum usage is consistent (all-or-none).
bool isDwarfMD5UsageConsistent(unsigned CUID) const {
return getMCDwarfLineTable(CUID).isMD5UsageConsistent();
}
/// Saves the information from the currently parsed dwarf .loc directive
/// and sets DwarfLocSeen. When the next instruction is assembled an entry
/// in the line number table with this information and the address of the
/// instruction will be created.
void setCurrentDwarfLoc(unsigned FileNum, unsigned Line, unsigned Column,
unsigned Flags, unsigned Isa,
unsigned Discriminator) {
CurrentDwarfLoc.setFileNum(FileNum);
CurrentDwarfLoc.setLine(Line);
CurrentDwarfLoc.setColumn(Column);
CurrentDwarfLoc.setFlags(Flags);
CurrentDwarfLoc.setIsa(Isa);
CurrentDwarfLoc.setDiscriminator(Discriminator);
DwarfLocSeen = true;
}
void clearDwarfLocSeen() { DwarfLocSeen = false; }
bool getDwarfLocSeen() { return DwarfLocSeen; }
const MCDwarfLoc &getCurrentDwarfLoc() { return CurrentDwarfLoc; }
bool getGenDwarfForAssembly() { return GenDwarfForAssembly; }
void setGenDwarfForAssembly(bool Value) { GenDwarfForAssembly = Value; }
unsigned getGenDwarfFileNumber() { return GenDwarfFileNumber; }
void setGenDwarfFileNumber(unsigned FileNumber) {
GenDwarfFileNumber = FileNumber;
}
/// Specifies information about the "root file" for assembler clients
/// (e.g., llvm-mc). Assumes compilation dir etc. have been set up.
void setGenDwarfRootFile(StringRef FileName, StringRef Buffer);
const SetVector<MCSection *> &getGenDwarfSectionSyms() {
return SectionsForRanges;
}
bool addGenDwarfSection(MCSection *Sec) {
return SectionsForRanges.insert(Sec);
}
void finalizeDwarfSections(MCStreamer &MCOS);
const std::vector<MCGenDwarfLabelEntry> &getMCGenDwarfLabelEntries() const {
return MCGenDwarfLabelEntries;
}
void addMCGenDwarfLabelEntry(const MCGenDwarfLabelEntry &E) {
MCGenDwarfLabelEntries.push_back(E);
}
void setDwarfDebugFlags(StringRef S) { DwarfDebugFlags = S; }
StringRef getDwarfDebugFlags() { return DwarfDebugFlags; }
void setDwarfDebugProducer(StringRef S) { DwarfDebugProducer = S; }
StringRef getDwarfDebugProducer() { return DwarfDebugProducer; }
dwarf::DwarfFormat getDwarfFormat() const {
// TODO: Support DWARF64
return dwarf::DWARF32;
}
void setDwarfVersion(uint16_t v) { DwarfVersion = v; }
uint16_t getDwarfVersion() const { return DwarfVersion; }
/// @}
char *getSecureLogFile() { return SecureLogFile; }
raw_fd_ostream *getSecureLog() { return SecureLog.get(); }
void setSecureLog(std::unique_ptr<raw_fd_ostream> Value) {
SecureLog = std::move(Value);
}
bool getSecureLogUsed() { return SecureLogUsed; }
void setSecureLogUsed(bool Value) { SecureLogUsed = Value; }
/// Allocate \p Size bytes with alignment \p Align (8 by default) by
/// forwarding to Allocator.Allocate, and return the resulting pointer.
void *allocate(unsigned Size, unsigned Align = 8) {
return Allocator.Allocate(Size, Align);
}
/// Does nothing; \p Ptr is not released by this call.
void deallocate(void *Ptr) {}
bool hadError() { return HadError; }
void reportError(SMLoc L, const Twine &Msg);
// Unrecoverable error has occurred. Display the best diagnostic we can
// and bail via exit(1). For now, most MC backend errors are unrecoverable.
// FIXME: We should really do something about that.
LLVM_ATTRIBUTE_NORETURN void reportFatalError(SMLoc L,
const Twine &Msg);
/// Find the macro stored under \p Name in MacroMap.
/// \returns a pointer to the stored macro, or nullptr if there is none.
const MCAsmMacro *lookupMacro(StringRef Name) {
  auto Found = MacroMap.find(Name);
  if (Found == MacroMap.end())
    return nullptr;
  return &Found->getValue();
}
void defineMacro(StringRef Name, MCAsmMacro Macro) {
MacroMap.insert(std::make_pair(Name, std::move(Macro)));
}
void undefineMacro(StringRef Name) { MacroMap.erase(Name); }
};
} // end namespace llvm
// operator new and delete aren't allowed inside namespaces.
// The throw specifications are mandated by the standard.
/// Placement new for using the MCContext's allocator.
///
/// This placement form of operator new uses the MCContext's allocator for
/// obtaining memory. It is a non-throwing new, which means that it returns
/// null on error. (If that is what the allocator does. The current does, so if
/// this ever changes, this operator will have to be changed, too.)
/// Usage looks like this (assuming there's an MCContext 'Context' in scope):
/// \code
/// // Default alignment (8)
/// IntegerLiteral *Ex = new (Context) IntegerLiteral(arguments);
/// // Specific alignment
/// IntegerLiteral *Ex2 = new (Context, 4) IntegerLiteral(arguments);
/// \endcode
/// Please note that you cannot use delete on the pointer; it must be
/// deallocated using an explicit destructor call followed by
/// \c Context.deallocate(Ptr).
///
/// \param Bytes The number of bytes to allocate. Calculated by the compiler.
/// \param C The MCContext that provides the allocator.
/// \param Alignment The alignment of the allocated memory (if the underlying
/// allocator supports it).
/// \return The allocated memory. Could be NULL.
inline void *operator new(size_t Bytes, llvm::MCContext &C,
size_t Alignment = 8) noexcept {
return C.allocate(Bytes, Alignment);
}
/// Placement delete companion to the new above.
///
/// This operator is just a companion to the new above. There is no way of
/// invoking it directly; see the new operator for more details. This operator
/// is called implicitly by the compiler if a placement new expression using
/// the MCContext throws in the object constructor.
inline void operator delete(void *Ptr, llvm::MCContext &C, size_t) noexcept {
C.deallocate(Ptr);
}
/// This placement form of operator new[] uses the MCContext's allocator for
/// obtaining memory. It is a non-throwing new[], which means that it returns
/// null on error.
/// Usage looks like this (assuming there's an MCContext 'Context' in scope):
/// \code
/// // Default alignment (8)
/// char *data = new (Context) char[10];
/// // Specific alignment
/// char *data = new (Context, 4) char[10];
/// \endcode
/// Please note that you cannot use delete on the pointer; it must be
/// deallocated using an explicit destructor call followed by
/// \c Context.deallocate(Ptr).
///
/// \param Bytes The number of bytes to allocate. Calculated by the compiler.
/// \param C The MCContext that provides the allocator.
/// \param Alignment The alignment of the allocated memory (if the underlying
/// allocator supports it).
/// \return The allocated memory. Could be NULL.
inline void *operator new[](size_t Bytes, llvm::MCContext &C,
size_t Alignment = 8) noexcept {
return C.allocate(Bytes, Alignment);
}
/// Placement delete[] companion to the new[] above.
///
/// This operator is just a companion to the new[] above. There is no way of
/// invoking it directly; see the new[] operator for more details. This operator
/// is called implicitly by the compiler if a placement new[] expression using
/// the MCContext throws in the object constructor.
inline void operator delete[](void *Ptr, llvm::MCContext &C) noexcept {
C.deallocate(Ptr);
}
#endif // LLVM_MC_MCCONTEXT_H
Index: vendor/llvm/dist-release_90/include/llvm/Support/AArch64TargetParser.def
===================================================================
--- vendor/llvm/dist-release_90/include/llvm/Support/AArch64TargetParser.def (revision 351302)
+++ vendor/llvm/dist-release_90/include/llvm/Support/AArch64TargetParser.def (revision 351303)
@@ -1,141 +1,141 @@
//===- AARCH64TargetParser.def - AARCH64 target parsing defines ---------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides defines to build up the AARCH64 target parser's logic.
//
//===----------------------------------------------------------------------===//
// NOTE: NO INCLUDE GUARD DESIRED!
#ifndef AARCH64_ARCH
#define AARCH64_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU, ARCH_BASE_EXT)
#endif
AARCH64_ARCH("invalid", INVALID, "", "",
ARMBuildAttrs::CPUArch::v8_A, FK_NONE, AArch64::AEK_NONE)
AARCH64_ARCH("armv8-a", ARMV8A, "8-A", "v8", ARMBuildAttrs::CPUArch::v8_A,
FK_CRYPTO_NEON_FP_ARMV8,
(AArch64::AEK_CRYPTO | AArch64::AEK_FP | AArch64::AEK_SIMD))
AARCH64_ARCH("armv8.1-a", ARMV8_1A, "8.1-A", "v8.1a",
ARMBuildAttrs::CPUArch::v8_A, FK_CRYPTO_NEON_FP_ARMV8,
(AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_FP |
AArch64::AEK_SIMD | AArch64::AEK_LSE | AArch64::AEK_RDM))
AARCH64_ARCH("armv8.2-a", ARMV8_2A, "8.2-A", "v8.2a",
ARMBuildAttrs::CPUArch::v8_A, FK_CRYPTO_NEON_FP_ARMV8,
(AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_FP |
AArch64::AEK_SIMD | AArch64::AEK_RAS | AArch64::AEK_LSE |
AArch64::AEK_RDM))
AARCH64_ARCH("armv8.3-a", ARMV8_3A, "8.3-A", "v8.3a",
ARMBuildAttrs::CPUArch::v8_A, FK_CRYPTO_NEON_FP_ARMV8,
(AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_FP |
AArch64::AEK_SIMD | AArch64::AEK_RAS | AArch64::AEK_LSE |
AArch64::AEK_RDM | AArch64::AEK_RCPC))
AARCH64_ARCH("armv8.4-a", ARMV8_4A, "8.4-A", "v8.4a",
ARMBuildAttrs::CPUArch::v8_A, FK_CRYPTO_NEON_FP_ARMV8,
(AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_FP |
AArch64::AEK_SIMD | AArch64::AEK_RAS | AArch64::AEK_LSE |
AArch64::AEK_RDM | AArch64::AEK_RCPC | AArch64::AEK_DOTPROD))
AARCH64_ARCH("armv8.5-a", ARMV8_5A, "8.5-A", "v8.5a",
ARMBuildAttrs::CPUArch::v8_A, FK_CRYPTO_NEON_FP_ARMV8,
(AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_FP |
AArch64::AEK_SIMD | AArch64::AEK_RAS | AArch64::AEK_LSE |
AArch64::AEK_RDM | AArch64::AEK_RCPC | AArch64::AEK_DOTPROD))
#undef AARCH64_ARCH
#ifndef AARCH64_ARCH_EXT_NAME
#define AARCH64_ARCH_EXT_NAME(NAME, ID, FEATURE, NEGFEATURE)
#endif
// FIXME: This would be nicer were it tablegen
-AARCH64_ARCH_EXT_NAME("invalid", AArch64::AEK_INVALID, nullptr, nullptr)
-AARCH64_ARCH_EXT_NAME("none", AArch64::AEK_NONE, nullptr, nullptr)
-AARCH64_ARCH_EXT_NAME("crc", AArch64::AEK_CRC, "+crc", "-crc")
-AARCH64_ARCH_EXT_NAME("lse", AArch64::AEK_LSE, "+lse", "-lse")
-AARCH64_ARCH_EXT_NAME("rdm", AArch64::AEK_RDM, "+rdm", "-rdm")
-AARCH64_ARCH_EXT_NAME("crypto", AArch64::AEK_CRYPTO, "+crypto","-crypto")
-AARCH64_ARCH_EXT_NAME("sm4", AArch64::AEK_SM4, "+sm4", "-sm4")
-AARCH64_ARCH_EXT_NAME("sha3", AArch64::AEK_SHA3, "+sha3", "-sha3")
-AARCH64_ARCH_EXT_NAME("sha2", AArch64::AEK_SHA2, "+sha2", "-sha2")
-AARCH64_ARCH_EXT_NAME("aes", AArch64::AEK_AES, "+aes", "-aes")
-AARCH64_ARCH_EXT_NAME("dotprod", AArch64::AEK_DOTPROD, "+dotprod","-dotprod")
-AARCH64_ARCH_EXT_NAME("fp", AArch64::AEK_FP, "+fp-armv8", "-fp-armv8")
-AARCH64_ARCH_EXT_NAME("simd", AArch64::AEK_SIMD, "+neon", "-neon")
-AARCH64_ARCH_EXT_NAME("fp16", AArch64::AEK_FP16, "+fullfp16", "-fullfp16")
-AARCH64_ARCH_EXT_NAME("fp16fml", AArch64::AEK_FP16FML, "+fp16fml", "-fp16fml")
-AARCH64_ARCH_EXT_NAME("profile", AArch64::AEK_PROFILE, "+spe", "-spe")
-AARCH64_ARCH_EXT_NAME("ras", AArch64::AEK_RAS, "+ras", "-ras")
-AARCH64_ARCH_EXT_NAME("sve", AArch64::AEK_SVE, "+sve", "-sve")
-AARCH64_ARCH_EXT_NAME("sve2", AArch64::AEK_SVE2, "+sve2", "-sve2")
-AARCH64_ARCH_EXT_NAME("sve2-aes", AArch64::AEK_SVE2AES, "+sve2-aes", "-sve2-aes")
-AARCH64_ARCH_EXT_NAME("sve2-sm4", AArch64::AEK_SVE2SM4, "+sve2-sm4", "-sve2-sm4")
-AARCH64_ARCH_EXT_NAME("sve2-sha3", AArch64::AEK_SVE2SHA3, "+sve2-sha3", "-sve2-sha3")
-AARCH64_ARCH_EXT_NAME("bitperm", AArch64::AEK_BITPERM, "+bitperm", "-bitperm")
-AARCH64_ARCH_EXT_NAME("rcpc", AArch64::AEK_RCPC, "+rcpc", "-rcpc")
-AARCH64_ARCH_EXT_NAME("rng", AArch64::AEK_RAND, "+rand", "-rand")
-AARCH64_ARCH_EXT_NAME("memtag", AArch64::AEK_MTE, "+mte", "-mte")
-AARCH64_ARCH_EXT_NAME("ssbs", AArch64::AEK_SSBS, "+ssbs", "-ssbs")
-AARCH64_ARCH_EXT_NAME("sb", AArch64::AEK_SB, "+sb", "-sb")
-AARCH64_ARCH_EXT_NAME("predres", AArch64::AEK_PREDRES, "+predres", "-predres")
+AARCH64_ARCH_EXT_NAME("invalid", AArch64::AEK_INVALID, nullptr, nullptr)
+AARCH64_ARCH_EXT_NAME("none", AArch64::AEK_NONE, nullptr, nullptr)
+AARCH64_ARCH_EXT_NAME("crc", AArch64::AEK_CRC, "+crc", "-crc")
+AARCH64_ARCH_EXT_NAME("lse", AArch64::AEK_LSE, "+lse", "-lse")
+AARCH64_ARCH_EXT_NAME("rdm", AArch64::AEK_RDM, "+rdm", "-rdm")
+AARCH64_ARCH_EXT_NAME("crypto", AArch64::AEK_CRYPTO, "+crypto","-crypto")
+AARCH64_ARCH_EXT_NAME("sm4", AArch64::AEK_SM4, "+sm4", "-sm4")
+AARCH64_ARCH_EXT_NAME("sha3", AArch64::AEK_SHA3, "+sha3", "-sha3")
+AARCH64_ARCH_EXT_NAME("sha2", AArch64::AEK_SHA2, "+sha2", "-sha2")
+AARCH64_ARCH_EXT_NAME("aes", AArch64::AEK_AES, "+aes", "-aes")
+AARCH64_ARCH_EXT_NAME("dotprod", AArch64::AEK_DOTPROD, "+dotprod","-dotprod")
+AARCH64_ARCH_EXT_NAME("fp", AArch64::AEK_FP, "+fp-armv8", "-fp-armv8")
+AARCH64_ARCH_EXT_NAME("simd", AArch64::AEK_SIMD, "+neon", "-neon")
+AARCH64_ARCH_EXT_NAME("fp16", AArch64::AEK_FP16, "+fullfp16", "-fullfp16")
+AARCH64_ARCH_EXT_NAME("fp16fml", AArch64::AEK_FP16FML, "+fp16fml", "-fp16fml")
+AARCH64_ARCH_EXT_NAME("profile", AArch64::AEK_PROFILE, "+spe", "-spe")
+AARCH64_ARCH_EXT_NAME("ras", AArch64::AEK_RAS, "+ras", "-ras")
+AARCH64_ARCH_EXT_NAME("sve", AArch64::AEK_SVE, "+sve", "-sve")
+AARCH64_ARCH_EXT_NAME("sve2", AArch64::AEK_SVE2, "+sve2", "-sve2")
+AARCH64_ARCH_EXT_NAME("sve2-aes", AArch64::AEK_SVE2AES, "+sve2-aes", "-sve2-aes")
+AARCH64_ARCH_EXT_NAME("sve2-sm4", AArch64::AEK_SVE2SM4, "+sve2-sm4", "-sve2-sm4")
+AARCH64_ARCH_EXT_NAME("sve2-sha3", AArch64::AEK_SVE2SHA3, "+sve2-sha3", "-sve2-sha3")
+AARCH64_ARCH_EXT_NAME("sve2-bitperm", AArch64::AEK_SVE2BITPERM, "+sve2-bitperm", "-sve2-bitperm")
+AARCH64_ARCH_EXT_NAME("rcpc", AArch64::AEK_RCPC, "+rcpc", "-rcpc")
+AARCH64_ARCH_EXT_NAME("rng", AArch64::AEK_RAND, "+rand", "-rand")
+AARCH64_ARCH_EXT_NAME("memtag", AArch64::AEK_MTE, "+mte", "-mte")
+AARCH64_ARCH_EXT_NAME("ssbs", AArch64::AEK_SSBS, "+ssbs", "-ssbs")
+AARCH64_ARCH_EXT_NAME("sb", AArch64::AEK_SB, "+sb", "-sb")
+AARCH64_ARCH_EXT_NAME("predres", AArch64::AEK_PREDRES, "+predres", "-predres")
#undef AARCH64_ARCH_EXT_NAME
#ifndef AARCH64_CPU_NAME
#define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT)
#endif
AARCH64_CPU_NAME("cortex-a35", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
(AArch64::AEK_CRC))
AARCH64_CPU_NAME("cortex-a53", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, true,
(AArch64::AEK_CRC))
AARCH64_CPU_NAME("cortex-a55", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false,
(AArch64::AEK_FP16 | AArch64::AEK_DOTPROD | AArch64::AEK_RCPC))
AARCH64_CPU_NAME("cortex-a57", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
(AArch64::AEK_CRC))
AARCH64_CPU_NAME("cortex-a72", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
(AArch64::AEK_CRC))
AARCH64_CPU_NAME("cortex-a73", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
(AArch64::AEK_CRC))
AARCH64_CPU_NAME("cortex-a75", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false,
(AArch64::AEK_FP16 | AArch64::AEK_DOTPROD | AArch64::AEK_RCPC))
AARCH64_CPU_NAME("cortex-a76", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false,
(AArch64::AEK_FP16 | AArch64::AEK_DOTPROD | AArch64::AEK_RCPC |
AArch64::AEK_SSBS))
AARCH64_CPU_NAME("cortex-a76ae", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false,
(AArch64::AEK_FP16 | AArch64::AEK_DOTPROD | AArch64::AEK_RCPC |
AArch64::AEK_SSBS))
AARCH64_CPU_NAME("cyclone", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
(AArch64::AEK_NONE))
AARCH64_CPU_NAME("exynos-m1", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
(AArch64::AEK_CRC))
AARCH64_CPU_NAME("exynos-m2", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
(AArch64::AEK_CRC))
AARCH64_CPU_NAME("exynos-m3", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
(AArch64::AEK_CRC))
AARCH64_CPU_NAME("exynos-m4", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false,
(AArch64::AEK_DOTPROD | AArch64::AEK_FP16))
AARCH64_CPU_NAME("exynos-m5", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false,
(AArch64::AEK_DOTPROD | AArch64::AEK_FP16))
AARCH64_CPU_NAME("falkor", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
(AArch64::AEK_CRC | AArch64::AEK_RDM))
AARCH64_CPU_NAME("saphira", ARMV8_3A, FK_CRYPTO_NEON_FP_ARMV8, false,
(AArch64::AEK_PROFILE))
AARCH64_CPU_NAME("kryo", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
(AArch64::AEK_CRC))
AARCH64_CPU_NAME("thunderx2t99", ARMV8_1A, FK_CRYPTO_NEON_FP_ARMV8, false,
(AArch64::AEK_NONE))
AARCH64_CPU_NAME("thunderx", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
(AArch64::AEK_CRC | AArch64::AEK_PROFILE))
AARCH64_CPU_NAME("thunderxt88", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
(AArch64::AEK_CRC | AArch64::AEK_PROFILE))
AARCH64_CPU_NAME("thunderxt81", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
(AArch64::AEK_CRC | AArch64::AEK_PROFILE))
AARCH64_CPU_NAME("thunderxt83", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
(AArch64::AEK_CRC | AArch64::AEK_PROFILE))
AARCH64_CPU_NAME("tsv110", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false,
(AArch64::AEK_DOTPROD |
AArch64::AEK_FP16 | AArch64::AEK_FP16FML |
AArch64::AEK_PROFILE))
// Invalid CPU
AARCH64_CPU_NAME("invalid", INVALID, FK_INVALID, true, AArch64::AEK_INVALID)
#undef AARCH64_CPU_NAME
Index: vendor/llvm/dist-release_90/include/llvm/Support/AArch64TargetParser.h
===================================================================
--- vendor/llvm/dist-release_90/include/llvm/Support/AArch64TargetParser.h (revision 351302)
+++ vendor/llvm/dist-release_90/include/llvm/Support/AArch64TargetParser.h (revision 351303)
@@ -1,128 +1,128 @@
//===-- AArch64TargetParser - Parser for AArch64 features -------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a target parser to recognise AArch64 hardware features
// such as FPU/CPU/ARCH and extension names.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_SUPPORT_AARCH64TARGETPARSERCOMMON_H
#define LLVM_SUPPORT_AARCH64TARGETPARSERCOMMON_H
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Support/ARMTargetParser.h"
#include <vector>
// FIXME: This should be made into a class design, to avoid duplication.
namespace llvm {
namespace AArch64 {
// Arch extension modifiers for CPUs.
enum ArchExtKind : unsigned {
AEK_INVALID = 0,
AEK_NONE = 1,
AEK_CRC = 1 << 1,
AEK_CRYPTO = 1 << 2,
AEK_FP = 1 << 3,
AEK_SIMD = 1 << 4,
AEK_FP16 = 1 << 5,
AEK_PROFILE = 1 << 6,
AEK_RAS = 1 << 7,
AEK_LSE = 1 << 8,
AEK_SVE = 1 << 9,
AEK_DOTPROD = 1 << 10,
AEK_RCPC = 1 << 11,
AEK_RDM = 1 << 12,
AEK_SM4 = 1 << 13,
AEK_SHA3 = 1 << 14,
AEK_SHA2 = 1 << 15,
AEK_AES = 1 << 16,
AEK_FP16FML = 1 << 17,
AEK_RAND = 1 << 18,
AEK_MTE = 1 << 19,
AEK_SSBS = 1 << 20,
AEK_SB = 1 << 21,
AEK_PREDRES = 1 << 22,
AEK_SVE2 = 1 << 23,
AEK_SVE2AES = 1 << 24,
AEK_SVE2SM4 = 1 << 25,
AEK_SVE2SHA3 = 1 << 26,
- AEK_BITPERM = 1 << 27,
+ AEK_SVE2BITPERM = 1 << 27,
};
// Architecture identifiers: one enumerator per AARCH64_ARCH entry of
// AArch64TargetParser.def. Only the ID column of each entry is used here, so
// the enumerators appear in the same order as the entries in that file.
enum class ArchKind {
#define AARCH64_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU, ARCH_BASE_EXT) ID,
#include "AArch64TargetParser.def"
};
// Architecture descriptor table: one ARM::ArchNames<ArchKind> row per
// AARCH64_ARCH entry of AArch64TargetParser.def.
// Every length is written as sizeof(X) - 1, which is the string length only
// when X is a string literal (it drops the terminating NUL).
// ARCH_FPU is given as a bare enumerator name and qualified with
// ARM::FPUKind:: here; ID is qualified with AArch64::ArchKind::.
const ARM::ArchNames<ArchKind> AArch64ARCHNames[] = {
#define AARCH64_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU, \
ARCH_BASE_EXT) \
{NAME, \
sizeof(NAME) - 1, \
CPU_ATTR, \
sizeof(CPU_ATTR) - 1, \
SUB_ARCH, \
sizeof(SUB_ARCH) - 1, \
ARM::FPUKind::ARCH_FPU, \
ARCH_BASE_EXT, \
AArch64::ArchKind::ID, \
ARCH_ATTR},
#include "AArch64TargetParser.def"
};
// Extension-name table: one ARM::ExtName row per AARCH64_ARCH_EXT_NAME entry
// of AArch64TargetParser.def, in the order {name, name length, ID, FEATURE,
// NEGFEATURE}. The length is sizeof(NAME) - 1, so NAME is expected to be a
// string literal.
const ARM::ExtName AArch64ARCHExtNames[] = {
#define AARCH64_ARCH_EXT_NAME(NAME, ID, FEATURE, NEGFEATURE) \
{NAME, sizeof(NAME) - 1, ID, FEATURE, NEGFEATURE},
#include "AArch64TargetParser.def"
};
// CPU table: one ARM::CpuNames<ArchKind> row per AARCH64_CPU_NAME entry of
// AArch64TargetParser.def, in the order {name, name length, arch, is-default,
// default extensions}. The DEFAULT_FPU column is accepted by the macro but not
// stored in this table.
const ARM::CpuNames<ArchKind> AArch64CPUNames[] = {
#define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \
{NAME, sizeof(NAME) - 1, AArch64::ArchKind::ID, IS_DEFAULT, DEFAULT_EXT},
#include "AArch64TargetParser.def"
};
// Flat list of every ArchKind value, one per AARCH64_ARCH entry of
// AArch64TargetParser.def and in the same order as those entries.
const ArchKind ArchKinds[] = {
#define AARCH64_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU, ARCH_BASE_EXT) \
ArchKind::ID,
#include "AArch64TargetParser.def"
};
// FIXME: These should be moved to TargetTuple once it exists
bool getExtensionFeatures(unsigned Extensions,
std::vector<StringRef> &Features);
bool getArchFeatures(ArchKind AK, std::vector<StringRef> &Features);
StringRef getArchName(ArchKind AK);
unsigned getArchAttr(ArchKind AK);
StringRef getCPUAttr(ArchKind AK);
StringRef getSubArch(ArchKind AK);
StringRef getArchExtName(unsigned ArchExtKind);
StringRef getArchExtFeature(StringRef ArchExt);
// Information by Name
unsigned getDefaultFPU(StringRef CPU, ArchKind AK);
unsigned getDefaultExtensions(StringRef CPU, ArchKind AK);
StringRef getDefaultCPU(StringRef Arch);
ArchKind getCPUArchKind(StringRef CPU);
// Parser
ArchKind parseArch(StringRef Arch);
ArchExtKind parseArchExt(StringRef ArchExt);
ArchKind parseCPUArch(StringRef CPU);
// Used by target parser tests
void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values);
bool isX18ReservedByDefault(const Triple &TT);
} // namespace AArch64
} // namespace llvm
#endif
Index: vendor/llvm/dist-release_90/include/llvm/Support/ARMTargetParser.h
===================================================================
--- vendor/llvm/dist-release_90/include/llvm/Support/ARMTargetParser.h (revision 351302)
+++ vendor/llvm/dist-release_90/include/llvm/Support/ARMTargetParser.h (revision 351303)
@@ -1,273 +1,267 @@
//===-- ARMTargetParser - Parser for ARM target features --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a target parser to recognise ARM hardware features
// such as FPU/CPU/ARCH/extensions and specific support such as HWDIV.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_SUPPORT_ARMTARGETPARSER_H
#define LLVM_SUPPORT_ARMTARGETPARSER_H
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Support/ARMBuildAttributes.h"
#include <vector>
namespace llvm {
namespace ARM {
// Arch extension modifiers for CPUs.
// Note that this is not the same as the AArch64 list
// Apart from AEK_INVALID (0), every enumerator is a distinct single bit
// (AEK_NONE occupies bit 0), so several extensions can be OR-ed together into
// one unsigned value. The "unsupported" group below uses the top bits
// (0x8000000 .. 0x80000000), away from the low, densely numbered range.
enum ArchExtKind : unsigned {
AEK_INVALID = 0,
AEK_NONE = 1,
AEK_CRC = 1 << 1,
AEK_CRYPTO = 1 << 2,
AEK_FP = 1 << 3,
AEK_HWDIVTHUMB = 1 << 4,
AEK_HWDIVARM = 1 << 5,
AEK_MP = 1 << 6,
AEK_SIMD = 1 << 7,
AEK_SEC = 1 << 8,
AEK_VIRT = 1 << 9,
AEK_DSP = 1 << 10,
AEK_FP16 = 1 << 11,
AEK_RAS = 1 << 12,
- AEK_SVE = 1 << 13,
- AEK_DOTPROD = 1 << 14,
- AEK_SHA2 = 1 << 15,
- AEK_AES = 1 << 16,
- AEK_FP16FML = 1 << 17,
- AEK_SB = 1 << 18,
- AEK_SVE2 = 1 << 19,
- AEK_SVE2AES = 1 << 20,
- AEK_SVE2SM4 = 1 << 21,
- AEK_SVE2SHA3 = 1 << 22,
- AEK_BITPERM = 1 << 23,
- AEK_FP_DP = 1 << 24,
- AEK_LOB = 1 << 25,
+ AEK_DOTPROD = 1 << 13,
+ AEK_SHA2 = 1 << 14,
+ AEK_AES = 1 << 15,
+ AEK_FP16FML = 1 << 16,
+ AEK_SB = 1 << 17,
+ AEK_FP_DP = 1 << 18,
+ AEK_LOB = 1 << 19,
// Unsupported extensions.
AEK_OS = 0x8000000,
AEK_IWMMXT = 0x10000000,
AEK_IWMMXT2 = 0x20000000,
AEK_MAVERICK = 0x40000000,
AEK_XSCALE = 0x80000000,
};
// List of Arch Extension names.
// FIXME: TableGen this.
// Plain aggregate filled by brace-initialisation from the .def tables, so the
// member order below is part of its contract.
struct ExtName {
// Extension name and its length, stored separately so getName() can build a
// StringRef without calling strlen.
const char *NameCStr;
size_t NameLength;
// Extension identifier; presumably an ArchExtKind bit value -- confirm
// against the .def entries.
unsigned ID;
// Strings taken from the FEATURE / NEGFEATURE columns of the .def entry.
const char *Feature;
const char *NegFeature;
// Returns the extension name as a StringRef over NameCStr/NameLength.
StringRef getName() const { return StringRef(NameCStr, NameLength); }
};
// Extension-name table: one ExtName row per ARM_ARCH_EXT_NAME entry of
// ARMTargetParser.def. The length is sizeof(NAME) - 1, so NAME is expected to
// be a string literal.
const ExtName ARCHExtNames[] = {
#define ARM_ARCH_EXT_NAME(NAME, ID, FEATURE, NEGFEATURE) \
{NAME, sizeof(NAME) - 1, ID, FEATURE, NEGFEATURE},
#include "ARMTargetParser.def"
};
// List of HWDiv names (use getHWDivSynonym) and which architectural
// features they correspond to (use getHWDivFeatures).
// FIXME: TableGen this.
// The element type is an unnamed struct, so rows can only be reached through
// this array. One row is produced per ARM_HW_DIV_NAME entry of
// ARMTargetParser.def.
const struct {
// Name and its length (sizeof(NAME) - 1 of the literal in the .def entry).
const char *NameCStr;
size_t NameLength;
// Identifier taken from the ID column of the .def entry.
unsigned ID;
// Returns the name as a StringRef over NameCStr/NameLength.
StringRef getName() const { return StringRef(NameCStr, NameLength); }
} HWDivNames[] = {
#define ARM_HW_DIV_NAME(NAME, ID) {NAME, sizeof(NAME) - 1, ID},
#include "ARMTargetParser.def"
};
// Arch names.
// One enumerator per ARM_ARCH entry of ARMTargetParser.def; only the ID column
// is used here, so enumerator order follows the order of entries in that file.
enum class ArchKind {
#define ARM_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU, ARCH_BASE_EXT) ID,
#include "ARMTargetParser.def"
};
// List of CPU names and their arches.
// The same CPU can have multiple arches and can be default on multiple arches.
// When finding the Arch for a CPU, first-found prevails. Sort them accordingly.
// When this becomes table-generated, we'd probably need two tables.
// FIXME: TableGen this.
// T is the architecture enum of the target using the table (this header
// instantiates it with ARM's ArchKind). Plain aggregate: member order is what
// the brace-initialised tables rely on.
template <typename T> struct CpuNames {
// CPU name and its length, kept together so getName() needs no strlen.
const char *NameCStr;
size_t NameLength;
// Architecture this row associates with the CPU.
T ArchID;
bool Default; // is $Name the default CPU for $ArchID ?
// Value from the DEFAULT_EXT column; presumably a mask of ArchExtKind bits --
// confirm against the .def entries.
unsigned DefaultExtensions;
// Returns the CPU name as a StringRef over NameCStr/NameLength.
StringRef getName() const { return StringRef(NameCStr, NameLength); }
};
// CPU table: one CpuNames<ArchKind> row per ARM_CPU_NAME entry of
// ARMTargetParser.def. The DEFAULT_FPU column is accepted by the macro but not
// stored in this table.
const CpuNames<ArchKind> CPUNames[] = {
#define ARM_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \
{NAME, sizeof(NAME) - 1, ARM::ArchKind::ID, IS_DEFAULT, DEFAULT_EXT},
#include "ARMTargetParser.def"
};
// FPU names.
// One enumerator per ARM_FPU entry of ARMTargetParser.def (the KIND column),
// followed by FK_LAST, which therefore equals the number of entries.
// This is an unscoped enum, so the enumerators are visible directly in
// namespace ARM.
enum FPUKind {
#define ARM_FPU(NAME, KIND, VERSION, NEON_SUPPORT, RESTRICTION) KIND,
#include "ARMTargetParser.def"
FK_LAST
};
// FPU Version
// NONE is the first enumerator and therefore has value 0.
enum class FPUVersion {
NONE,
VFPV2,
VFPV3,
VFPV3_FP16,
VFPV4,
VFPV5,
VFPV5_FULLFP16,
};
// An FPU name restricts the FPU in one of three ways:
enum class FPURestriction {
None = 0, ///< No restriction
D16, ///< Only 16 D registers
SP_D16 ///< Only single-precision instructions, with 16 D registers
};
// An FPU name implies one of three levels of Neon support:
enum class NeonSupportLevel {
None = 0, ///< No Neon
Neon, ///< Neon
Crypto ///< Neon with Crypto
};
// ISA kinds.
// INVALID is explicitly 0 here and in the two enums below, so a
// value-initialised variable of any of these types reads as INVALID.
enum class ISAKind { INVALID = 0, ARM, THUMB, AARCH64 };
// Endianness
// FIXME: BE8 vs. BE32?
enum class EndianKind { INVALID = 0, LITTLE, BIG };
// v6/v7/v8 Profile
enum class ProfileKind { INVALID = 0, A, R, M };
// List of canonical FPU names (use getFPUSynonym) and which architectural
// features they correspond to (use getFPUFeatures).
// FIXME: TableGen this.
// The entries must appear in the order listed in ARM::FPUKind for correct
// indexing
// Plain aggregate: member order is what the brace-initialised table relies on.
struct FPUName {
// FPU name and its length, kept together so getName() needs no strlen.
const char *NameCStr;
size_t NameLength;
// The FPUKind this row describes.
FPUKind ID;
// VERSION, NEON_SUPPORT and RESTRICTION columns of the .def entry.
FPUVersion FPUVer;
NeonSupportLevel NeonSupport;
FPURestriction Restriction;
// Returns the FPU name as a StringRef over NameCStr/NameLength.
StringRef getName() const { return StringRef(NameCStr, NameLength); }
};
// FPU table: one FPUName row per ARM_FPU entry of ARMTargetParser.def, i.e.
// generated from the same entries (and in the same order) as the FPUKind
// enumerators. Declared static, unlike the CPU and extension tables above.
static const FPUName FPUNames[] = {
#define ARM_FPU(NAME, KIND, VERSION, NEON_SUPPORT, RESTRICTION) \
{NAME, sizeof(NAME) - 1, KIND, VERSION, NEON_SUPPORT, RESTRICTION},
#include "llvm/Support/ARMTargetParser.def"
};
// List of canonical arch names (use getArchSynonym).
// This table also provides the build attribute fields for CPU arch
// and Arch ID, according to the Addenda to the ARM ABI, chapters
// 2.4 and 2.3.5.2 respectively.
// FIXME: SubArch values were simplified to fit into the expectations
// of the triples and are not conforming with their official names.
// Check to see if the expectation should be changed.
// FIXME: TableGen this.
// T is the architecture enum of the target using the table. Plain aggregate:
// member order is what the brace-initialised tables rely on. Each string is
// stored as pointer + length so the accessors need no strlen.
template <typename T> struct ArchNames {
const char *NameCStr;
size_t NameLength;
const char *CPUAttrCStr;
size_t CPUAttrLength;
const char *SubArchCStr;
size_t SubArchLength;
// Value from the ARCH_FPU column; stored as a plain unsigned.
unsigned DefaultFPU;
// Value from the ARCH_BASE_EXT column; presumably a mask of ArchExtKind
// bits -- confirm against the .def entries.
unsigned ArchBaseExtensions;
T ID;
ARMBuildAttrs::CPUArch ArchAttr; // Arch ID in build attributes.
// Canonical architecture name.
StringRef getName() const { return StringRef(NameCStr, NameLength); }
// CPU class in build attributes.
StringRef getCPUAttr() const { return StringRef(CPUAttrCStr, CPUAttrLength); }
// Sub-Arch name.
StringRef getSubArch() const { return StringRef(SubArchCStr, SubArchLength); }
};
// Architecture descriptor table: one ArchNames<ArchKind> row per ARM_ARCH
// entry of ARMTargetParser.def. Lengths are sizeof(X) - 1, which is the string
// length only when X is a string literal. ARCH_FPU is used exactly as written
// in the .def entry (no qualification is added here).
static const ArchNames<ArchKind> ARCHNames[] = {
#define ARM_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU, \
ARCH_BASE_EXT) \
{NAME, sizeof(NAME) - 1, \
CPU_ATTR, sizeof(CPU_ATTR) - 1, \
SUB_ARCH, sizeof(SUB_ARCH) - 1, \
ARCH_FPU, ARCH_BASE_EXT, \
ArchKind::ID, ARCH_ATTR},
#include "llvm/Support/ARMTargetParser.def"
};
// Information by ID
StringRef getFPUName(unsigned FPUKind);
FPUVersion getFPUVersion(unsigned FPUKind);
NeonSupportLevel getFPUNeonSupportLevel(unsigned FPUKind);
FPURestriction getFPURestriction(unsigned FPUKind);
// FIXME: These should be moved to TargetTuple once it exists
bool getFPUFeatures(unsigned FPUKind, std::vector<StringRef> &Features);
bool getHWDivFeatures(unsigned HWDivKind, std::vector<StringRef> &Features);
bool getExtensionFeatures(unsigned Extensions,
std::vector<StringRef> &Features);
StringRef getArchName(ArchKind AK);
unsigned getArchAttr(ArchKind AK);
StringRef getCPUAttr(ArchKind AK);
StringRef getSubArch(ArchKind AK);
StringRef getArchExtName(unsigned ArchExtKind);
StringRef getArchExtFeature(StringRef ArchExt);
bool appendArchExtFeatures(StringRef CPU, ARM::ArchKind AK, StringRef ArchExt,
std::vector<StringRef> &Features);
StringRef getHWDivName(unsigned HWDivKind);
// Information by Name
unsigned getDefaultFPU(StringRef CPU, ArchKind AK);
unsigned getDefaultExtensions(StringRef CPU, ArchKind AK);
StringRef getDefaultCPU(StringRef Arch);
StringRef getCanonicalArchName(StringRef Arch);
StringRef getFPUSynonym(StringRef FPU);
StringRef getArchSynonym(StringRef Arch);
// Parser
unsigned parseHWDiv(StringRef HWDiv);
unsigned parseFPU(StringRef FPU);
ArchKind parseArch(StringRef Arch);
unsigned parseArchExt(StringRef ArchExt);
ArchKind parseCPUArch(StringRef CPU);
ISAKind parseArchISA(StringRef Arch);
EndianKind parseArchEndian(StringRef Arch);
ProfileKind parseArchProfile(StringRef Arch);
unsigned parseArchVersion(StringRef Arch);
void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values);
StringRef computeDefaultTargetABI(const Triple &TT, StringRef CPU);
} // namespace ARM
} // namespace llvm
#endif
Index: vendor/llvm/dist-release_90/include/llvm/Transforms/Utils/BypassSlowDivision.h
===================================================================
--- vendor/llvm/dist-release_90/include/llvm/Transforms/Utils/BypassSlowDivision.h (revision 351302)
+++ vendor/llvm/dist-release_90/include/llvm/Transforms/Utils/BypassSlowDivision.h (revision 351303)
@@ -1,69 +1,72 @@
//===- llvm/Transforms/Utils/BypassSlowDivision.h ---------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains an optimization for div and rem on architectures that
// execute short instructions significantly faster than longer instructions.
// For example, on Intel Atom 32-bit divides are slow enough that during
// runtime it is profitable to check the value of the operands, and if they are
// positive and less than 256 use an unsigned 8-bit divide.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TRANSFORMS_UTILS_BYPASSSLOWDIVISION_H
#define LLVM_TRANSFORMS_UTILS_BYPASSSLOWDIVISION_H
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/IR/ValueHandle.h"
#include <cstdint>
namespace llvm {
class BasicBlock;
class Value;
// Key identifying one division/remainder computation: the signedness of the
// operation plus its two operands. Used as the key type of a DenseMap through
// the DenseMapInfo specialisation that follows.
struct DivRemMapKey {
// True for a signed operation, false for an unsigned one.
bool SignedOp;
- Value *Dividend;
- Value *Divisor;
+ AssertingVH<Value> Dividend;
+ AssertingVH<Value> Divisor;
// Stores the three components unchanged.
DivRemMapKey(bool InSignedOp, Value *InDividend, Value *InDivisor)
: SignedOp(InSignedOp), Dividend(InDividend), Divisor(InDivisor) {}
};
// DenseMap traits for DivRemMapKey.
// The two reserved keys both carry null operands and differ only in SignedOp:
// empty is (false, nullptr, nullptr), tombstone is (true, nullptr, nullptr).
// Real keys are therefore expected to have non-null operands.
template <> struct DenseMapInfo<DivRemMapKey> {
// Two keys are equal when signedness, dividend and divisor all match.
static bool isEqual(const DivRemMapKey &Val1, const DivRemMapKey &Val2) {
return Val1.SignedOp == Val2.SignedOp && Val1.Dividend == Val2.Dividend &&
Val1.Divisor == Val2.Divisor;
}
static DivRemMapKey getEmptyKey() {
return DivRemMapKey(false, nullptr, nullptr);
}
static DivRemMapKey getTombstoneKey() {
return DivRemMapKey(true, nullptr, nullptr);
}
// Hash: XOR of the two operand addresses, truncated to unsigned, then XOR-ed
// with the signedness flag. XOR is symmetric, so swapping dividend and divisor
// gives the same hash; isEqual still tells such keys apart.
static unsigned getHashValue(const DivRemMapKey &Val) {
- return (unsigned)(reinterpret_cast<uintptr_t>(Val.Dividend) ^
- reinterpret_cast<uintptr_t>(Val.Divisor)) ^
+ return (unsigned)(reinterpret_cast<uintptr_t>(
+ static_cast<Value *>(Val.Dividend)) ^
+ reinterpret_cast<uintptr_t>(
+ static_cast<Value *>(Val.Divisor))) ^
(unsigned)Val.SignedOp;
}
};
/// This optimization identifies DIV instructions in a BB that can be
/// profitably bypassed and carried out with a shorter, faster divide.
///
/// This optimization may add basic blocks immediately after BB; for obvious
/// reasons, you shouldn't pass those blocks to bypassSlowDivision.
bool bypassSlowDivision(
BasicBlock *BB, const DenseMap<unsigned int, unsigned int> &BypassWidth);
} // end namespace llvm
#endif // LLVM_TRANSFORMS_UTILS_BYPASSSLOWDIVISION_H
Index: vendor/llvm/dist-release_90/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
===================================================================
--- vendor/llvm/dist-release_90/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp (revision 351302)
+++ vendor/llvm/dist-release_90/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp (revision 351303)
@@ -1,663 +1,664 @@
//===-- AsmPrinterInlineAsm.cpp - AsmPrinter Inline Asm Handling ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the inline assembler pieces of the AsmPrinter class.
//
//===----------------------------------------------------------------------===//
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
#define DEBUG_TYPE "asm-printer"
/// srcMgrDiagHandler - This callback is invoked when the SourceMgr for an
/// inline asm has an error in it. diagInfo is a pointer to the SrcMgrDiagInfo
/// struct above.
static void srcMgrDiagHandler(const SMDiagnostic &Diag, void *diagInfo) {
AsmPrinter::SrcMgrDiagInfo *DiagInfo =
static_cast<AsmPrinter::SrcMgrDiagInfo *>(diagInfo);
assert(DiagInfo && "Diagnostic context not passed down?");
// Look up a LocInfo for the buffer this diagnostic is coming from.
unsigned BufNum = DiagInfo->SrcMgr.FindBufferContainingLoc(Diag.getLoc());
const MDNode *LocInfo = nullptr;
if (BufNum > 0 && BufNum <= DiagInfo->LocInfos.size())
LocInfo = DiagInfo->LocInfos[BufNum-1];
// If the inline asm had metadata associated with it, pull out a location
// cookie corresponding to which line the error occurred on.
unsigned LocCookie = 0;
if (LocInfo) {
unsigned ErrorLine = Diag.getLineNo()-1;
if (ErrorLine >= LocInfo->getNumOperands())
ErrorLine = 0;
if (LocInfo->getNumOperands() != 0)
if (const ConstantInt *CI =
mdconst::dyn_extract<ConstantInt>(LocInfo->getOperand(ErrorLine)))
LocCookie = CI->getZExtValue();
}
DiagInfo->DiagHandler(Diag, DiagInfo->DiagContext, LocCookie);
}
/// Hands a private copy of \p AsmStr to the inline-asm source manager and
/// returns the buffer number the source manager assigned to it.
///
/// The diagnostic state is created on first use. When \p LocMDNode is
/// non-null it is remembered under the returned buffer number so that
/// srcMgrDiagHandler can later recover a location cookie from it.
unsigned AsmPrinter::addInlineAsmDiagBuffer(StringRef AsmStr,
                                            const MDNode *LocMDNode) const {
  // First call: build the shared state and hook it into the MC and LLVM
  // contexts.
  if (!DiagInfo) {
    DiagInfo = make_unique<SrcMgrDiagInfo>();
    MMI->getContext().setInlineSourceManager(&DiagInfo->SrcMgr);

    LLVMContext &Ctx = MMI->getModule()->getContext();
    if (Ctx.getInlineAsmDiagnosticHandler()) {
      DiagInfo->DiagHandler = Ctx.getInlineAsmDiagnosticHandler();
      DiagInfo->DiagContext = Ctx.getInlineAsmDiagnosticContext();
      DiagInfo->SrcMgr.setDiagHandler(srcMgrDiagHandler, DiagInfo.get());
    }
  }

  // The source manager outlives AsmStr, so it gets (and owns) its own copy.
  std::unique_ptr<MemoryBuffer> AsmCopy =
      MemoryBuffer::getMemBufferCopy(AsmStr, "<inline asm>");
  const unsigned BufferId =
      DiagInfo->SrcMgr.AddNewSourceBuffer(std::move(AsmCopy), SMLoc());

  if (!LocMDNode)
    return BufferId;

  // Slot BufferId-1 holds the metadata belonging to buffer BufferId.
  auto &Locs = DiagInfo->LocInfos;
  Locs.resize(BufferId);
  Locs[BufferId - 1] = LocMDNode;
  return BufferId;
}
/// EmitInlineAsm - Emit a blob of inline asm to the output streamer.
///
/// Depending on the target's MCAsmInfo and the streamer, the text is either
/// written out verbatim or parsed with the target's asm parser and emitted
/// through MC.
///
/// \param Str        Assembly text; must be non-empty. One trailing NUL, if
///                   present, is dropped before the text is used.
/// \param STI        Subtarget used to create the target asm parser; also
///                   passed to emitInlineAsmEnd.
/// \param MCOptions  Supplies IASSearchPaths for the source manager and is
///                   forwarded to the target asm parser.
/// \param LocMDNode  Optional location metadata, recorded for diagnostics.
/// \param Dialect    Assembler dialect handed to the parser.
void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI,
const MCTargetOptions &MCOptions,
const MDNode *LocMDNode,
InlineAsm::AsmDialect Dialect) const {
assert(!Str.empty() && "Can't emit empty inline asm block");
// Drop a single trailing NUL so it is not treated as part of the asm text.
// NOTE(review): the flag name and the old "avoid a copy" remark look stale --
// addInlineAsmDiagBuffer below always copies the string.
bool isNullTerminated = Str.back() == 0;
if (isNullTerminated)
Str = Str.substr(0, Str.size()-1);
// If the output streamer does not have mature MC support or the integrated
// assembler has been disabled, just emit the blob textually.
// Otherwise parse the asm and emit it via MC support.
// This is useful in case the asm parser doesn't handle something but the
// system assembler does.
const MCAsmInfo *MCAI = TM.getMCAsmInfo();
assert(MCAI && "No MCAsmInfo");
if (!MCAI->useIntegratedAssembler() &&
!OutStreamer->isIntegratedAssemblerRequired()) {
emitInlineAsmStart();
OutStreamer->EmitRawText(Str);
// No target asm parser exists on this path, hence the null second argument.
emitInlineAsmEnd(STI, nullptr);
return;
}
// Register the text with the diagnostic source manager; the returned buffer
// number tells the parser which buffer to read.
unsigned BufNum = addInlineAsmDiagBuffer(Str, LocMDNode);
DiagInfo->SrcMgr.setIncludeDirs(MCOptions.IASSearchPaths);
std::unique_ptr<MCAsmParser> Parser(createMCAsmParser(
DiagInfo->SrcMgr, OutContext, *OutStreamer, *MAI, BufNum));
// Do not use assembler-level information for parsing inline assembly.
OutStreamer->setUseAssemblerInfoForParsing(false);
// We create a new MCInstrInfo here since we might be at the module level
// and not have a MachineFunction to initialize the TargetInstrInfo from and
// we only need MCInstrInfo for asm parsing. We create one unconditionally
// because it's not subtarget dependent.
std::unique_ptr<MCInstrInfo> MII(TM.getTarget().createMCInstrInfo());
std::unique_ptr<MCTargetAsmParser> TAP(TM.getTarget().createMCAsmParser(
STI, *Parser, *MII, MCOptions));
if (!TAP)
report_fatal_error("Inline asm not supported by this streamer because"
" we don't have an asm parser for this target\n");
Parser->setAssemblerDialect(Dialect);
Parser->setTargetParser(*TAP.get());
// Enable lexing Masm binary and hex integer literals in intel inline
// assembly.
if (Dialect == InlineAsm::AD_Intel)
Parser->getLexer().setLexMasmIntegers(true);
emitInlineAsmStart();
// Don't implicitly switch to the text section before the asm.
int Res = Parser->Run(/*NoInitialTextSection*/ true,
/*NoFinalize*/ true);
// Pass the parser's subtarget info so the end hook can compare it with STI.
emitInlineAsmEnd(STI, &TAP->getSTI());
// A parse failure is fatal only when no diagnostic handler was installed;
// otherwise the failure is left to the installed handler.
if (Res && !DiagInfo->DiagHandler)
report_fatal_error("Error parsing inline asm\n");
}
/// Expands the asm template of an Intel-dialect inline asm into \p OS.
///
/// The output is wrapped in ".intel_syntax" / ".att_syntax" directives and is
/// terminated with a NUL character. Template handling:
///  - literal text is copied through;
///  - "${:foo}" is passed to AsmPrinter::PrintSpecial;
///  - "$N" prints operand N through PrintAsmOperand or PrintAsmMemoryOperand.
/// An operand that cannot be printed is reported through the LLVMContext with
/// \p LocCookie; a malformed template is a fatal error.
///
/// \param AsmStr     NUL-terminated asm template.
/// \param MI         The inline asm machine instruction owning the operands.
/// \param MMI        Used to reach the LLVMContext for error reporting.
/// \param AP         Printer used for operands and special codes.
/// \param LocCookie  Source location cookie attached to reported errors.
/// \param OS         Destination stream.
static void EmitMSInlineAsmStr(const char *AsmStr, const MachineInstr *MI,
MachineModuleInfo *MMI, AsmPrinter *AP,
unsigned LocCookie, raw_ostream &OS) {
// Switch to the inline assembly variant.
OS << "\t.intel_syntax\n\t";
const char *LastEmitted = AsmStr; // One past the last character emitted.
unsigned NumOperands = MI->getNumOperands();
while (*LastEmitted) {
switch (*LastEmitted) {
default: {
// Not a special case, emit the string section literally.
// The scan starts one past the current character, so the current character
// is always emitted, even if it is one of the stop characters below.
const char *LiteralEnd = LastEmitted+1;
while (*LiteralEnd && *LiteralEnd != '{' && *LiteralEnd != '|' &&
*LiteralEnd != '}' && *LiteralEnd != '$' && *LiteralEnd != '\n')
++LiteralEnd;
OS.write(LastEmitted, LiteralEnd-LastEmitted);
LastEmitted = LiteralEnd;
break;
}
case '\n':
++LastEmitted; // Consume newline character.
OS << '\n'; // Emit the newline itself; nothing else is written here.
break;
case '$': {
++LastEmitted; // Consume '$' character.
bool Done = true;
// Handle escapes.
switch (*LastEmitted) {
default: Done = false; break;
case '$':
// NOTE(review): "$$" is consumed here without writing anything to OS.
++LastEmitted; // Consume second '$' character.
break;
}
if (Done) break;
// If we have ${:foo}, then this is not a real operand reference, it is a
// "magic" string reference, just like in .td files. Arrange to call
// PrintSpecial.
if (LastEmitted[0] == '{' && LastEmitted[1] == ':') {
LastEmitted += 2;
const char *StrStart = LastEmitted;
const char *StrEnd = strchr(StrStart, '}');
if (!StrEnd)
report_fatal_error("Unterminated ${:foo} operand in inline asm"
" string: '" + Twine(AsmStr) + "'");
std::string Val(StrStart, StrEnd);
AP->PrintSpecial(MI, OS, Val.c_str());
LastEmitted = StrEnd+1;
break;
}
// Otherwise a decimal operand number must follow the '$'.
const char *IDStart = LastEmitted;
const char *IDEnd = IDStart;
while (*IDEnd >= '0' && *IDEnd <= '9') ++IDEnd;
unsigned Val;
if (StringRef(IDStart, IDEnd-IDStart).getAsInteger(10, Val))
report_fatal_error("Bad $ operand number in inline asm string: '" +
Twine(AsmStr) + "'");
LastEmitted = IDEnd;
// Cheap upper bound on the operand number before walking the operand list.
if (Val >= NumOperands-1)
report_fatal_error("Invalid $ operand number in inline asm string: '" +
Twine(AsmStr) + "'");
// Okay, we finally have a value number. Ask the target to print this
// operand!
unsigned OpNo = InlineAsm::MIOp_FirstOperand;
bool Error = false;
// Scan to find the machine operand number for the operand.
// Each asm operand is a flag word followed by the number of register
// operands encoded in that word; skip Val such groups.
for (; Val; --Val) {
if (OpNo >= MI->getNumOperands()) break;
unsigned OpFlags = MI->getOperand(OpNo).getImm();
OpNo += InlineAsm::getNumOperandRegisters(OpFlags) + 1;
}
// We may have a location metadata attached to the end of the
// instruction, and at no point should see metadata at any
// other point while processing. It's an error if so.
if (OpNo >= MI->getNumOperands() ||
MI->getOperand(OpNo).isMetadata()) {
Error = true;
} else {
unsigned OpFlags = MI->getOperand(OpNo).getImm();
++OpNo; // Skip over the ID number.
// The printer hooks return true on failure, which feeds the error path.
if (InlineAsm::isMemKind(OpFlags)) {
Error = AP->PrintAsmMemoryOperand(MI, OpNo, /*Modifier*/ nullptr, OS);
} else {
Error = AP->PrintAsmOperand(MI, OpNo, /*Modifier*/ nullptr, OS);
}
}
if (Error) {
std::string msg;
raw_string_ostream Msg(msg);
Msg << "invalid operand in inline asm: '" << AsmStr << "'";
MMI->getModule()->getContext().emitError(LocCookie, Msg.str());
}
break;
}
}
}
OS << "\n\t.att_syntax\n" << (char)0; // null terminate string.
}
/// Expands the asm template of an AT&T/GCC-dialect inline asm into \p OS.
///
/// The output starts with a tab and ends with a newline plus a NUL character.
/// Template handling:
///  - literal text is copied through;
///  - "$$" prints '$';
///  - "$(" ... "$|" ... "$)" delimit assembler-variant alternatives; only text
///    belonging to variant \p AsmPrinterVariant (or outside any variant) is
///    written;
///  - "${:foo}" is passed to AsmPrinter::PrintSpecial;
///  - "$N", "${N}" and "${N:m}" print operand N, with optional one-character
///    modifier m. Modifier 'l' prints a label and is handled here directly.
/// An operand that cannot be printed is reported through the LLVMContext with
/// \p LocCookie; a malformed template is a fatal error.
///
/// \param AsmStr             NUL-terminated asm template.
/// \param MI                 The inline asm machine instruction.
/// \param MMI                Used for error reporting and label registration.
/// \param AsmPrinterVariant  Index of the variant alternative to keep.
/// \param AP                 Printer used for operands and special codes.
/// \param LocCookie          Source location cookie attached to errors.
/// \param OS                 Destination stream.
static void EmitGCCInlineAsmStr(const char *AsmStr, const MachineInstr *MI,
MachineModuleInfo *MMI, int AsmPrinterVariant,
AsmPrinter *AP, unsigned LocCookie,
raw_ostream &OS) {
int CurVariant = -1; // The number of the {.|.|.} region we are in.
const char *LastEmitted = AsmStr; // One past the last character emitted.
unsigned NumOperands = MI->getNumOperands();
OS << '\t';
while (*LastEmitted) {
switch (*LastEmitted) {
default: {
// Not a special case, emit the string section literally.
// The scan starts one past the current character, so the current character
// is always part of the literal, even if it is one of the stop characters.
const char *LiteralEnd = LastEmitted+1;
while (*LiteralEnd && *LiteralEnd != '{' && *LiteralEnd != '|' &&
*LiteralEnd != '}' && *LiteralEnd != '$' && *LiteralEnd != '\n')
++LiteralEnd;
// Text inside a non-selected variant is skipped but still consumed.
if (CurVariant == -1 || CurVariant == AsmPrinterVariant)
OS.write(LastEmitted, LiteralEnd-LastEmitted);
LastEmitted = LiteralEnd;
break;
}
case '\n':
++LastEmitted; // Consume newline character.
OS << '\n'; // Emit the newline itself, regardless of the current variant.
break;
case '$': {
++LastEmitted; // Consume '$' character.
bool Done = true;
// Handle escapes.
switch (*LastEmitted) {
default: Done = false; break;
case '$': // $$ -> $
if (CurVariant == -1 || CurVariant == AsmPrinterVariant)
OS << '$';
++LastEmitted; // Consume second '$' character.
break;
case '(': // $( -> same as GCC's { character.
++LastEmitted; // Consume '(' character.
if (CurVariant != -1)
report_fatal_error("Nested variants found in inline asm string: '" +
Twine(AsmStr) + "'");
CurVariant = 0; // We're in the first variant now.
break;
case '|':
++LastEmitted; // consume '|' character.
if (CurVariant == -1)
OS << '|'; // this is gcc's behavior for | outside a variant
else
++CurVariant; // We're in the next variant.
break;
case ')': // $) -> same as GCC's } char.
++LastEmitted; // consume ')' character.
if (CurVariant == -1)
OS << '}'; // this is gcc's behavior for } outside a variant
else
CurVariant = -1;
break;
}
if (Done) break;
bool HasCurlyBraces = false;
if (*LastEmitted == '{') { // ${variable}
++LastEmitted; // Consume '{' character.
HasCurlyBraces = true;
}
// If we have ${:foo}, then this is not a real operand reference, it is a
// "magic" string reference, just like in .td files. Arrange to call
// PrintSpecial.
// NOTE(review): unlike literals and operands, this path does not consult
// CurVariant before writing to OS.
if (HasCurlyBraces && *LastEmitted == ':') {
++LastEmitted;
const char *StrStart = LastEmitted;
const char *StrEnd = strchr(StrStart, '}');
if (!StrEnd)
report_fatal_error("Unterminated ${:foo} operand in inline asm"
" string: '" + Twine(AsmStr) + "'");
std::string Val(StrStart, StrEnd);
AP->PrintSpecial(MI, OS, Val.c_str());
LastEmitted = StrEnd+1;
break;
}
// Otherwise a decimal operand number must follow.
const char *IDStart = LastEmitted;
const char *IDEnd = IDStart;
while (*IDEnd >= '0' && *IDEnd <= '9') ++IDEnd;
unsigned Val;
if (StringRef(IDStart, IDEnd-IDStart).getAsInteger(10, Val))
report_fatal_error("Bad $ operand number in inline asm string: '" +
Twine(AsmStr) + "'");
LastEmitted = IDEnd;
// Two-element buffer so the modifier can be passed on as a C string.
char Modifier[2] = { 0, 0 };
if (HasCurlyBraces) {
// If we have curly braces, check for a modifier character. This
// supports syntax like ${0:u}, which correspond to "%u0" in GCC asm.
if (*LastEmitted == ':') {
++LastEmitted; // Consume ':' character.
if (*LastEmitted == 0)
report_fatal_error("Bad ${:} expression in inline asm string: '" +
Twine(AsmStr) + "'");
Modifier[0] = *LastEmitted;
++LastEmitted; // Consume modifier character.
}
if (*LastEmitted != '}')
report_fatal_error("Bad ${} expression in inline asm string: '" +
Twine(AsmStr) + "'");
++LastEmitted; // Consume '}' character.
}
// Cheap upper bound on the operand number; checked even when the operand
// belongs to a variant that will not be printed.
if (Val >= NumOperands-1)
report_fatal_error("Invalid $ operand number in inline asm string: '" +
Twine(AsmStr) + "'");
// Okay, we finally have a value number. Ask the target to print this
// operand!
if (CurVariant == -1 || CurVariant == AsmPrinterVariant) {
unsigned OpNo = InlineAsm::MIOp_FirstOperand;
bool Error = false;
// Scan to find the machine operand number for the operand.
// Each asm operand is a flag word followed by the number of register
// operands encoded in that word; skip Val such groups.
for (; Val; --Val) {
if (OpNo >= MI->getNumOperands()) break;
unsigned OpFlags = MI->getOperand(OpNo).getImm();
OpNo += InlineAsm::getNumOperandRegisters(OpFlags) + 1;
}
// We may have a location metadata attached to the end of the
// instruction, and at no point should see metadata at any
// other point while processing. It's an error if so.
if (OpNo >= MI->getNumOperands() ||
MI->getOperand(OpNo).isMetadata()) {
Error = true;
} else {
unsigned OpFlags = MI->getOperand(OpNo).getImm();
++OpNo; // Skip over the ID number.
// FIXME: Shouldn't arch-independent output template handling go into
// PrintAsmOperand?
if (Modifier[0] == 'l') { // Labels are target independent.
if (MI->getOperand(OpNo).isBlockAddress()) {
const BlockAddress *BA = MI->getOperand(OpNo).getBlockAddress();
MCSymbol *Sym = AP->GetBlockAddressSymbol(BA);
Sym->print(OS, AP->MAI);
+ MMI->getContext().registerInlineAsmLabel(Sym);
} else if (MI->getOperand(OpNo).isMBB()) {
const MCSymbol *Sym = MI->getOperand(OpNo).getMBB()->getSymbol();
Sym->print(OS, AP->MAI);
} else {
Error = true;
}
} else {
// The printer hooks return true on failure, which feeds the error path.
// A null modifier is passed when the template gave none.
if (InlineAsm::isMemKind(OpFlags)) {
Error = AP->PrintAsmMemoryOperand(
MI, OpNo, Modifier[0] ? Modifier : nullptr, OS);
} else {
Error = AP->PrintAsmOperand(MI, OpNo,
Modifier[0] ? Modifier : nullptr, OS);
}
}
}
if (Error) {
std::string msg;
raw_string_ostream Msg(msg);
Msg << "invalid operand in inline asm: '" << AsmStr << "'";
MMI->getModule()->getContext().emitError(LocCookie, Msg.str());
}
}
break;
}
}
}
OS << '\n' << (char)0; // null terminate string.
}
/// EmitInlineAsm - This method formats and emits the specified machine
/// instruction that is an inline asm.
///
/// Steps: locate the asm template operand, emit the start marker, decode the
/// location cookie from trailing metadata, expand the template into a
/// temporary buffer with the dialect-specific helper, warn about reserved
/// registers on the clobber list, emit the expanded text through the
/// StringRef overload of EmitInlineAsm, and emit the end marker. An empty
/// template produces only the two markers.
void AsmPrinter::EmitInlineAsm(const MachineInstr *MI) const {
assert(MI->isInlineAsm() && "printInlineAsm only works on inline asms");
// Count the number of register definitions to find the asm string.
// Note that the first assert below is the body of the for loop; the second
// one runs once, after the loop.
unsigned NumDefs = 0;
for (; MI->getOperand(NumDefs).isReg() && MI->getOperand(NumDefs).isDef();
++NumDefs)
assert(NumDefs != MI->getNumOperands()-2 && "No asm string?");
assert(MI->getOperand(NumDefs).isSymbol() && "No asm string?");
// Disassemble the AsmStr, printing out the literal pieces, the operands, etc.
const char *AsmStr = MI->getOperand(NumDefs).getSymbolName();
// If this asmstr is empty, just print the #APP/#NOAPP markers.
// These are useful to see where empty asm's wound up.
if (AsmStr[0] == 0) {
OutStreamer->emitRawComment(MAI->getInlineAsmStart());
OutStreamer->emitRawComment(MAI->getInlineAsmEnd());
return;
}
// Emit the #APP start marker. This has to happen even if verbose-asm isn't
// enabled, so we use emitRawComment.
OutStreamer->emitRawComment(MAI->getInlineAsmStart());
// Get the !srcloc metadata node if we have it, and decode the loc cookie from
// it.
// Operands are scanned from last to first. The scan stops at the first
// metadata operand whose operand 0 is a ConstantInt; LocMD keeps the last
// metadata node examined even if no cookie was extracted from it.
unsigned LocCookie = 0;
const MDNode *LocMD = nullptr;
for (unsigned i = MI->getNumOperands(); i != 0; --i) {
if (MI->getOperand(i-1).isMetadata() &&
(LocMD = MI->getOperand(i-1).getMetadata()) &&
LocMD->getNumOperands() != 0) {
if (const ConstantInt *CI =
mdconst::dyn_extract<ConstantInt>(LocMD->getOperand(0))) {
LocCookie = CI->getZExtValue();
break;
}
}
}
// Emit the inline asm to a temporary string so we can emit it through
// EmitInlineAsm.
SmallString<256> StringData;
raw_svector_ostream OS(StringData);
// The variant of the current asmprinter.
int AsmPrinterVariant = MAI->getAssemblerDialect();
// The expansion helpers take a non-const AsmPrinter, hence the const_cast.
AsmPrinter *AP = const_cast<AsmPrinter*>(this);
if (MI->getInlineAsmDialect() == InlineAsm::AD_ATT)
EmitGCCInlineAsmStr(AsmStr, MI, MMI, AsmPrinterVariant, AP, LocCookie, OS);
else
EmitMSInlineAsmStr(AsmStr, MI, MMI, AP, LocCookie, OS);
// Emit warnings if we use reserved registers on the clobber list, as
// that might give surprising results.
std::vector<std::string> RestrRegs;
// Start with the first operand descriptor, and iterate over them.
for (unsigned I = InlineAsm::MIOp_FirstOperand, NumOps = MI->getNumOperands();
I < NumOps; ++I) {
const MachineOperand &MO = MI->getOperand(I);
if (MO.isImm()) {
unsigned Flags = MO.getImm();
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
// For a clobber descriptor, the operand right after it is the clobbered
// register.
if (InlineAsm::getKind(Flags) == InlineAsm::Kind_Clobber &&
!TRI->isAsmClobberable(*MF, MI->getOperand(I + 1).getReg())) {
RestrRegs.push_back(TRI->getName(MI->getOperand(I + 1).getReg()));
}
// Skip to one before the next operand descriptor, if it exists.
I += InlineAsm::getNumOperandRegisters(Flags);
}
}
if (!RestrRegs.empty()) {
// Register the expanded text so the warning can point at its first
// character. The EmitInlineAsm call further down may register the same
// text again as a separate buffer.
unsigned BufNum = addInlineAsmDiagBuffer(OS.str(), LocMD);
auto &SrcMgr = DiagInfo->SrcMgr;
SMLoc Loc = SMLoc::getFromPointer(
SrcMgr.getMemoryBuffer(BufNum)->getBuffer().begin());
// Build a comma-separated list of the offending register names.
std::string Msg = "inline asm clobber list contains reserved registers: ";
for (auto I = RestrRegs.begin(), E = RestrRegs.end(); I != E; I++) {
if(I != RestrRegs.begin())
Msg += ", ";
Msg += *I;
}
std::string Note = "Reserved registers on the clobber list may not be "
"preserved across the asm statement, and clobbering them may "
"lead to undefined behaviour.";
SrcMgr.PrintMessage(Loc, SourceMgr::DK_Warning, Msg);
SrcMgr.PrintMessage(Loc, SourceMgr::DK_Note, Note);
}
EmitInlineAsm(OS.str(), getSubtargetInfo(), TM.Options.MCOptions, LocMD,
MI->getInlineAsmDialect());
// Emit the #NOAPP end marker. This has to happen even if verbose-asm isn't
// enabled, so we use emitRawComment.
OutStreamer->emitRawComment(MAI->getInlineAsmEnd());
}
/// PrintSpecial - Print information related to the specified machine instr
/// that is independent of the operand, and may be independent of the instr
/// itself. This can be useful for portably encoding the comment character
/// or other bits of target-specific knowledge into the asmstrings. The
/// syntax used is ${:comment}. Targets can override this to add support
/// for their own strange codes.
void AsmPrinter::PrintSpecial(const MachineInstr *MI, raw_ostream &OS,
const char *Code) const {
if (!strcmp(Code, "private")) {
const DataLayout &DL = MF->getDataLayout();
OS << DL.getPrivateGlobalPrefix();
} else if (!strcmp(Code, "comment")) {
OS << MAI->getCommentString();
} else if (!strcmp(Code, "uid")) {
// Comparing the address of MI isn't sufficient, because machineinstrs may
// be allocated to the same address across functions.
// If this is a new LastFn instruction, bump the counter.
if (LastMI != MI || LastFn != getFunctionNumber()) {
++Counter;
LastMI = MI;
LastFn = getFunctionNumber();
}
OS << Counter;
} else {
std::string msg;
raw_string_ostream Msg(msg);
Msg << "Unknown special formatter '" << Code
<< "' for machine instr: " << *MI;
report_fatal_error(Msg.str());
}
}
/// Print the symbol of a global-address operand followed by its offset.
/// The caller must have verified that MO is a global operand.
void AsmPrinter::PrintSymbolOperand(const MachineOperand &MO, raw_ostream &OS) {
  assert(MO.isGlobal() && "caller should check MO.isGlobal");
  const auto *GlobalSym = getSymbol(MO.getGlobal());
  GlobalSym->print(OS, MAI);
  printOffset(MO.getOffset(), OS);
}
/// PrintAsmOperand - Print operand OpNo of the INLINEASM instruction MI to O,
/// honouring the single-letter operand modifier in ExtraCode. Returns false
/// when the operand was printed and true when it was not (no modifier, an
/// unknown modifier, or an operand kind the modifier cannot handle). Targets
/// should override this for machine-specific ExtraCodes or when the
/// arch-independent handling below would be too complex.
///
/// Modifier semantics follow
/// https://gcc.gnu.org/onlinedocs/gccint/Output-Template.html
bool AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                                 const char *ExtraCode, raw_ostream &O) {
  // Nothing generic can be printed without a modifier.
  if (!ExtraCode || !ExtraCode[0])
    return true;

  // Only single-letter modifiers are understood here.
  if (ExtraCode[1] != 0)
    return true;

  const MachineOperand &MO = MI->getOperand(OpNo);
  switch (ExtraCode[0]) {
  case 'a': // Print as memory address.
    if (MO.isReg()) {
      PrintAsmMemoryOperand(MI, OpNo, nullptr, O);
      return false;
    }
    LLVM_FALLTHROUGH; // GCC allows '%a' to behave like '%c' with immediates.
  case 'c': // Substitute immediate value without immediate syntax.
    if (MO.isImm()) {
      O << MO.getImm();
      return false;
    }
    if (MO.isGlobal()) {
      PrintSymbolOperand(MO, O);
      return false;
    }
    return true;
  case 'n': // Negate the immediate constant.
    if (MO.isImm()) {
      O << -MO.getImm();
      return false;
    }
    return true;
  case 's': // The GCC deprecated s modifier.
    if (MO.isImm()) {
      O << ((32 - MO.getImm()) & 31);
      return false;
    }
    return true;
  default: // Unknown modifier.
    break;
  }
  return true;
}
/// Generic fallback for printing an inline asm memory operand: it prints
/// nothing and unconditionally returns true. None of the parameters are used.
/// NOTE(review): true appears to be the "not printed" value, matching the
/// unknown-modifier returns in PrintAsmOperand — confirm against the header.
bool AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
const char *ExtraCode, raw_ostream &O) {
// Target doesn't support this yet!
return true;
}
/// Intentionally empty: the generic printer does nothing at the start of an
/// inline asm block. Presumably a customization point for targets — verify
/// against the declaration in the header.
void AsmPrinter::emitInlineAsmStart() const {}
/// Intentionally empty: the generic printer does nothing at the end of an
/// inline asm block and ignores both subtarget-info arguments. Presumably a
/// customization point for targets — verify against the header declaration.
void AsmPrinter::emitInlineAsmEnd(const MCSubtargetInfo &StartInfo,
const MCSubtargetInfo *EndInfo) const {}
Index: vendor/llvm/dist-release_90/lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- vendor/llvm/dist-release_90/lib/CodeGen/CodeGenPrepare.cpp (revision 351302)
+++ vendor/llvm/dist-release_90/lib/CodeGen/CodeGenPrepare.cpp (revision 351303)
@@ -1,7350 +1,7351 @@
//===- CodeGenPrepare.cpp - Prepare a function for code generation --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass munges the code in the input function to better prepare it for
// SelectionDAG-based code generation. This works around limitations in its
// basic-block-at-a-time approach. It should eventually be removed.
//
//===----------------------------------------------------------------------===//
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Statepoint.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/ValueMap.h"
#include "llvm/Pass.h"
#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/BypassSlowDivision.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <limits>
#include <memory>
#include <utility>
#include <vector>
using namespace llvm;
using namespace llvm::PatternMatch;
// Debug/statistics category name for this file; also passed as the pass
// argument to the INITIALIZE_PASS_* macros further down.
#define DEBUG_TYPE "codegenprepare"
// Counters for the individual transformations performed by this pass. Each
// description string states what the counter measures.
STATISTIC(NumBlocksElim, "Number of blocks eliminated");
STATISTIC(NumPHIsElim, "Number of trivial PHIs eliminated");
STATISTIC(NumGEPsElim, "Number of GEPs converted to casts");
STATISTIC(NumCmpUses, "Number of uses of Cmp expressions replaced with uses of "
"sunken Cmps");
STATISTIC(NumCastUses, "Number of uses of Cast expressions replaced with uses "
"of sunken Casts");
// Address-sinking counters.
STATISTIC(NumMemoryInsts, "Number of memory instructions whose address "
"computations were sunk");
STATISTIC(NumMemoryInstsPhiCreated,
"Number of phis created when address "
"computations were sunk to memory instructions");
STATISTIC(NumMemoryInstsSelectCreated,
"Number of select created when address "
"computations were sunk to memory instructions");
// Extension / mask related counters.
STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads");
STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized");
STATISTIC(NumAndsAdded,
"Number of and mask instructions added to form ext loads");
STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized");
// Miscellaneous counters.
STATISTIC(NumRetsDup, "Number of return instructions duplicated");
STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");
// Hidden command-line switches controlling individual CodeGenPrepare
// transformations. All are file-local.

// When set, runOnFunction skips splitBranchCondition and the terminator
// folding / dead-block cleanup / fall-through merging phase. Default: false.
static cl::opt<bool> DisableBranchOpts(
"disable-cgp-branch-opts", cl::Hidden, cl::init(false),
cl::desc("Disable branch optimizations in CodeGenPrepare"));
// When set, runOnFunction skips the statepoint relocate simplification.
// Default: false.
static cl::opt<bool>
DisableGCOpts("disable-cgp-gc-opts", cl::Hidden, cl::init(false),
cl::desc("Disable GC optimizations in CodeGenPrepare"));
// Default: false.
static cl::opt<bool> DisableSelectToBranch(
"disable-cgp-select2branch", cl::Hidden, cl::init(false),
cl::desc("Disable select to branch conversion."));
// Default: true.
static cl::opt<bool> AddrSinkUsingGEPs(
"addr-sink-using-gep", cl::Hidden, cl::init(true),
cl::desc("Address sinking in CGP using GEPs."));
// Hidden boolean switch, on by default. The description is user-visible help
// text; its spelling ("sinking") is corrected here.
static cl::opt<bool> EnableAndCmpSinking(
    "enable-andcmp-sinking", cl::Hidden, cl::init(true),
    cl::desc("Enable sinking and/cmp into branches."));
// --- store(extractelement) optimization: disable / stress-test switches ---
static cl::opt<bool> DisableStoreExtract(
"disable-cgp-store-extract", cl::Hidden, cl::init(false),
cl::desc("Disable store(extract) optimizations in CodeGenPrepare"));
static cl::opt<bool> StressStoreExtract(
"stress-cgp-store-extract", cl::Hidden, cl::init(false),
cl::desc("Stress test store(extract) optimizations in CodeGenPrepare"));
// --- ext(load) promotion: disable / stress-test switches ---
static cl::opt<bool> DisableExtLdPromotion(
"disable-cgp-ext-ld-promotion", cl::Hidden, cl::init(false),
cl::desc("Disable ext(promotable(ld)) -> promoted(ext(ld)) optimization in "
"CodeGenPrepare"));
static cl::opt<bool> StressExtLdPromotion(
"stress-cgp-ext-ld-promotion", cl::Hidden, cl::init(false),
cl::desc("Stress test ext(promotable(ld)) -> promoted(ext(ld)) "
"optimization in CodeGenPrepare"));
// Read by isMergingEmptyBlockProfitable: when false (the default), a loop
// preheader is only merged if it has a single predecessor that itself has a
// single successor.
static cl::opt<bool> DisablePreheaderProtect(
"disable-preheader-prot", cl::Hidden, cl::init(false),
cl::desc("Disable protection against removing loop preheaders"));
// Read by runOnFunction: when true (the default), functions get a ".hot" or
// ".unlikely" section prefix based on profile summary queries.
static cl::opt<bool> ProfileGuidedSectionPrefix(
"profile-guided-section-prefix", cl::Hidden, cl::init(true), cl::ZeroOrMore,
cl::desc("Use profile info to add section prefix for hot/cold functions"));
// Multiplier used in the final frequency comparison of
// isMergingEmptyBlockProfitable. Default: 2.
static cl::opt<unsigned> FreqRatioToSkipMerge(
"cgp-freq-ratio-to-skip-merge", cl::Hidden, cl::init(2),
cl::desc("Skip merging empty blocks if (frequency of empty block) / "
"(frequency of destination block) is greater than this ratio"));
static cl::opt<bool> ForceSplitStore(
"force-split-store", cl::Hidden, cl::init(false),
cl::desc("Force store splitting no matter what the target query says."));
// Read by runOnFunction: gates the call to mergeSExts. Default: true.
static cl::opt<bool>
EnableTypePromotionMerge("cgp-type-promotion-merge", cl::Hidden,
cl::desc("Enable merging of redundant sexts when one is dominating"
" the other."), cl::init(true));
static cl::opt<bool> DisableComplexAddrModes(
"disable-complex-addr-modes", cl::Hidden, cl::init(false),
cl::desc("Disables combining addressing modes with different parts "
"in optimizeMemoryInst."));
// --- Address sinking: what may be created / combined ---
// Note that creating new phis is off by default while new selects are on.
static cl::opt<bool>
AddrSinkNewPhis("addr-sink-new-phis", cl::Hidden, cl::init(false),
cl::desc("Allow creation of Phis in Address sinking."));
static cl::opt<bool>
AddrSinkNewSelects("addr-sink-new-select", cl::Hidden, cl::init(true),
cl::desc("Allow creation of selects in Address sinking."));
static cl::opt<bool> AddrSinkCombineBaseReg(
"addr-sink-combine-base-reg", cl::Hidden, cl::init(true),
cl::desc("Allow combining of BaseReg field in Address sinking."));
static cl::opt<bool> AddrSinkCombineBaseGV(
"addr-sink-combine-base-gv", cl::Hidden, cl::init(true),
cl::desc("Allow combining of BaseGV field in Address sinking."));
static cl::opt<bool> AddrSinkCombineBaseOffs(
"addr-sink-combine-base-offs", cl::Hidden, cl::init(true),
cl::desc("Allow combining of BaseOffs field in Address sinking."));
static cl::opt<bool> AddrSinkCombineScaledReg(
"addr-sink-combine-scaled-reg", cl::Hidden, cl::init(true),
cl::desc("Allow combining of ScaledReg field in Address sinking."));
// Default: true.
static cl::opt<bool>
EnableGEPOffsetSplit("cgp-split-large-offset-gep", cl::Hidden,
cl::init(true),
cl::desc("Enable splitting large offset of GEP."));
namespace {
/// Which kind(s) of extension have been observed for a promoted instruction.
/// The value is stored in the 2-bit integer field of the TypeIsSExt
/// PointerIntPair declared just below.
enum ExtType {
ZeroExtension, // Zero extension has been seen.
SignExtension, // Sign extension has been seen.
BothExtension // This extension type is used if we saw sext after
// ZeroExtension had been set, or if we saw zext after
// SignExtension had been set. It makes the type
// information of a promoted instruction invalid.
};
// Container aliases used by CodeGenPrepare's bookkeeping members.
// Set of instructions (used for inserted / removed instruction tracking).
using SetOfInstrs = SmallPtrSet<Instruction *, 16>;
// A type together with the ExtType observed for it.
using TypeIsSExt = PointerIntPair<Type *, 2, ExtType>;
// Maps an instruction to its (type, extension kind) record.
using InstrToOrigTy = DenseMap<Instruction *, TypeIsSExt>;
// A list of instructions, and a map from a value to such a list.
using SExts = SmallVector<Instruction *, 16>;
using ValueToSExts = DenseMap<Value *, SExts>;
// Defined later in the file; only referenced by reference in the class below.
class TypePromotionTransaction;
/// Function pass that rewrites IR so that it is better suited for
/// SelectionDAG-based code generation. Per-function state is kept in the
/// members below and (re)initialised by runOnFunction.
class CodeGenPrepare : public FunctionPass {
  // Target and analysis handles. runOnFunction assigns TM, SubtargetInfo, TLI
  // and TRI only when a TargetPassConfig is available, so every pointer is
  // given a null default instead of being left indeterminate.
  const TargetMachine *TM = nullptr;
  const TargetSubtargetInfo *SubtargetInfo = nullptr;
  const TargetLowering *TLI = nullptr;
  const TargetRegisterInfo *TRI = nullptr;
  const TargetTransformInfo *TTI = nullptr;
  const TargetLibraryInfo *TLInfo = nullptr;
  const LoopInfo *LI = nullptr;
  std::unique_ptr<BlockFrequencyInfo> BFI;
  std::unique_ptr<BranchProbabilityInfo> BPI;

  /// As we scan instructions optimizing them, this is the next instruction
  /// to optimize. Transforms that can invalidate this should update it.
  BasicBlock::iterator CurInstIterator;

  /// Keeps track of non-local addresses that have been sunk into a block.
  /// This allows us to avoid inserting duplicate code for blocks with
  /// multiple load/stores of the same address. The usage of WeakTrackingVH
  /// enables SunkAddrs to be treated as a cache whose entries can be
  /// invalidated if a sunken address computation has been erased.
  ValueMap<Value*, WeakTrackingVH> SunkAddrs;

  /// Keeps track of all instructions inserted for the current function.
  SetOfInstrs InsertedInsts;

  /// Keeps track of the type of the related instruction before their
  /// promotion for the current function.
  InstrToOrigTy PromotedInsts;

  /// Keep track of instructions removed during promotion.
  SetOfInstrs RemovedInsts;

  /// Keep track of sext chains based on their initial value.
  DenseMap<Value *, Instruction *> SeenChainsForSExt;

  /// Keep track of GEPs accessing the same data structures such as structs or
  /// arrays that are candidates to be split later because of their large
  /// size.
  MapVector<
      AssertingVH<Value>,
      SmallVector<std::pair<AssertingVH<GetElementPtrInst>, int64_t>, 32>>
      LargeOffsetGEPMap;

  /// Keep track of new GEP base after splitting the GEPs having large offset.
  SmallSet<AssertingVH<Value>, 2> NewGEPBases;

  /// Map serial numbers to Large offset GEPs.
  DenseMap<AssertingVH<GetElementPtrInst>, int> LargeOffsetGEPID;

  /// Keep track of SExt promoted.
  ValueToSExts ValToSExtendedUses;

  /// True if optimizing for size. Overwritten by runOnFunction for every
  /// function; defaults to false so it never holds an indeterminate value.
  bool OptSize = false;

  /// DataLayout for the Function being processed.
  const DataLayout *DL = nullptr;

  /// Building the dominator tree can be expensive, so we only build it
  /// lazily and update it when required.
  std::unique_ptr<DominatorTree> DT;

public:
  static char ID; // Pass identification, replacement for typeid

  CodeGenPrepare() : FunctionPass(ID) {
    initializeCodeGenPreparePass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "CodeGen Prepare"; }

  /// Declares the analyses runOnFunction fetches with getAnalysis<>.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // FIXME: When we can selectively preserve passes, preserve the domtree.
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
    AU.addRequired<TargetLibraryInfoWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
  }

private:
  /// Invoke \p f, then check whether the instruction CurInstIterator pointed
  /// at survived. If it did not, restart iteration at the beginning of \p BB
  /// and drop the SunkAddrs cache.
  template <typename F>
  void resetIteratorIfInvalidatedWhileCalling(BasicBlock *BB, F f) {
    // Substituting can cause recursive simplifications, which can invalidate
    // our iterator.  Use a WeakTrackingVH to hold onto it in case this
    // happens.
    Value *CurValue = &*CurInstIterator;
    WeakTrackingVH IterHandle(CurValue);

    f();

    // If the iterator instruction was recursively deleted, start over at the
    // start of the block.
    if (IterHandle != CurValue) {
      CurInstIterator = BB->begin();
      SunkAddrs.clear();
    }
  }

  // Get the DominatorTree, building if necessary.
  DominatorTree &getDT(Function &F) {
    if (!DT)
      DT = llvm::make_unique<DominatorTree>(F);
    return *DT;
  }

  bool eliminateFallThrough(Function &F);
  bool eliminateMostlyEmptyBlocks(Function &F);
  BasicBlock *findDestBlockOfMergeableEmptyBlock(BasicBlock *BB);
  bool canMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const;
  void eliminateMostlyEmptyBlock(BasicBlock *BB);
  bool isMergingEmptyBlockProfitable(BasicBlock *BB, BasicBlock *DestBB,
                                     bool isPreheader);
  bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT);
  bool optimizeInst(Instruction *I, bool &ModifiedDT);
  bool optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
                          Type *AccessTy, unsigned AddrSpace);
  bool optimizeInlineAsmInst(CallInst *CS);
  bool optimizeCallInst(CallInst *CI, bool &ModifiedDT);
  bool optimizeExt(Instruction *&I);
  bool optimizeExtUses(Instruction *I);
  bool optimizeLoadExt(LoadInst *Load);
  bool optimizeShiftInst(BinaryOperator *BO);
  bool optimizeSelectInst(SelectInst *SI);
  bool optimizeShuffleVectorInst(ShuffleVectorInst *SVI);
  bool optimizeSwitchInst(SwitchInst *SI);
  bool optimizeExtractElementInst(Instruction *Inst);
  bool dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT);
  bool placeDbgValues(Function &F);
  bool canFormExtLd(const SmallVectorImpl<Instruction *> &MovedExts,
                    LoadInst *&LI, Instruction *&Inst, bool HasPromoted);
  bool tryToPromoteExts(TypePromotionTransaction &TPT,
                        const SmallVectorImpl<Instruction *> &Exts,
                        SmallVectorImpl<Instruction *> &ProfitablyMovedExts,
                        unsigned CreatedInstsCost = 0);
  bool mergeSExts(Function &F);
  bool splitLargeGEPOffsets();
  bool performAddressTypePromotion(
      Instruction *&Inst,
      bool AllowPromotionWithoutCommonHeader,
      bool HasPromoted, TypePromotionTransaction &TPT,
      SmallVectorImpl<Instruction *> &SpeculativelyMovedExts);
  bool splitBranchCondition(Function &F, bool &ModifiedDT);
  bool simplifyOffsetableRelocate(Instruction &I);

  bool tryToSinkFreeOperands(Instruction *I);
  bool replaceMathCmpWithIntrinsic(BinaryOperator *BO, CmpInst *Cmp,
                                   Intrinsic::ID IID);
  bool optimizeCmp(CmpInst *Cmp, bool &ModifiedDT);
  bool combineToUSubWithOverflow(CmpInst *Cmp, bool &ModifiedDT);
  bool combineToUAddWithOverflow(CmpInst *Cmp, bool &ModifiedDT);
};
} // end anonymous namespace
// Pass identification; the address of ID is what identifies the pass.
char CodeGenPrepare::ID = 0;

// Register the pass. Every analysis that getAnalysisUsage() marks as required
// is listed as an initialization dependency, so the registry initializes
// those passes before this one.
INITIALIZE_PASS_BEGIN(CodeGenPrepare, DEBUG_TYPE,
                      "Optimize for code generation", false, false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(CodeGenPrepare, DEBUG_TYPE,
                    "Optimize for code generation", false, false)
/// Factory for the pass: returns a newly allocated CodeGenPrepare instance.
/// The caller receives a raw pointer and is responsible for its lifetime
/// (presumably by handing it to a pass manager — confirm at call sites).
FunctionPass *llvm::createCodeGenPreparePass() { return new CodeGenPrepare(); }
/// Pass entry point. Refreshes the per-function state, then runs, in order:
/// profile-guided section prefixing, slow-division bypassing, mostly-empty
/// block elimination, branch-condition splitting, indirectbr critical edge
/// splitting, the per-block optimization fixpoint, terminator folding with
/// dead block removal and fall-through merging, statepoint relocate
/// simplification, and finally debug value placement.
/// Returns true if any step changed the function.
bool CodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  DL = &F.getParent()->getDataLayout();

  // Drop bookkeeping that belongs to a previously processed function.
  InsertedInsts.clear();
  PromotedInsts.clear();

  // Target hooks are only reachable when a TargetPassConfig is present.
  if (auto *PassConfig = getAnalysisIfAvailable<TargetPassConfig>()) {
    TM = &PassConfig->getTM<TargetMachine>();
    SubtargetInfo = TM->getSubtargetImpl(F);
    TLI = SubtargetInfo->getTargetLowering();
    TRI = SubtargetInfo->getRegisterInfo();
  }
  TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  BPI.reset(new BranchProbabilityInfo(F, *LI));
  BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI));
  OptSize = F.hasOptSize();

  ProfileSummaryInfo *PSI =
      &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
  if (ProfileGuidedSectionPrefix) {
    if (PSI->isFunctionHotInCallGraph(&F, *BFI))
      F.setSectionPrefix(".hot");
    else if (PSI->isFunctionColdInCallGraph(&F, *BFI))
      F.setSectionPrefix(".unlikely");
  }

  bool EverMadeChange = false;

  /// This optimization identifies DIV instructions that can be
  /// profitably bypassed and carried out with a shorter, faster divide.
  if (!OptSize && !PSI->hasHugeWorkingSetSize() && TLI &&
      TLI->isSlowDivBypassed()) {
    const DenseMap<unsigned int, unsigned int> &Widths =
        TLI->getBypassSlowDivWidths();
    // bypassSlowDivision may create new blocks. Fetch the successor before
    // transforming so that those new blocks are not visited again.
    for (BasicBlock *Cur = &*F.begin(), *Following = nullptr; Cur != nullptr;
         Cur = Following) {
      Following = Cur->getNextNode();
      EverMadeChange |= bypassSlowDivision(Cur, Widths);
    }
  }

  // Remove blocks that hold nothing but PHI nodes and an unconditional
  // branch.
  EverMadeChange |= eliminateMostlyEmptyBlocks(F);

  bool ModifiedDT = false;
  if (!DisableBranchOpts)
    EverMadeChange |= splitBranchCondition(F, ModifiedDT);

  // Split some critical edges where one of the sources is an indirect branch,
  // to help generate sane code for PHIs involving such edges.
  EverMadeChange |= SplitIndirectBrCriticalEdges(F);

  // Iterate the per-block optimizations until nothing changes any more.
  bool MadeChange;
  do {
    MadeChange = false;
    DT.reset();

    Function::iterator BlockIt = F.begin();
    while (BlockIt != F.end()) {
      BasicBlock &CurBlock = *BlockIt++;
      bool DomTreeChanged = false;
      MadeChange |= optimizeBlock(CurBlock, DomTreeChanged);

      // A dominator tree change forces a fresh walk over the blocks.
      if (DomTreeChanged)
        break;
    }

    if (EnableTypePromotionMerge && !ValToSExtendedUses.empty())
      MadeChange |= mergeSExts(F);
    if (!LargeOffsetGEPMap.empty())
      MadeChange |= splitLargeGEPOffsets();

    // Instructions removed during promotion are only really freed here.
    for (Instruction *Dead : RemovedInsts)
      Dead->deleteValue();

    EverMadeChange |= MadeChange;
    SeenChainsForSExt.clear();
    ValToSExtendedUses.clear();
    RemovedInsts.clear();
    LargeOffsetGEPMap.clear();
    LargeOffsetGEPID.clear();
  } while (MadeChange);

  SunkAddrs.clear();

  if (!DisableBranchOpts) {
    MadeChange = false;

    // A set vector gives a deterministic iteration order. The order in which
    // blocks are removed may affect whether PHI nodes in successors are
    // removed.
    SmallSetVector<BasicBlock*, 8> DeadBlocks;
    for (BasicBlock &BB : F) {
      SmallVector<BasicBlock *, 2> Succs(succ_begin(&BB), succ_end(&BB));
      MadeChange |= ConstantFoldTerminator(&BB, true);
      if (MadeChange)
        for (BasicBlock *Succ : Succs)
          if (pred_begin(Succ) == pred_end(Succ))
            DeadBlocks.insert(Succ);
    }

    // Delete the dead blocks and any successors that become dead as a result.
    MadeChange |= !DeadBlocks.empty();
    while (!DeadBlocks.empty()) {
      BasicBlock *Victim = DeadBlocks.pop_back_val();
      SmallVector<BasicBlock*, 2> Succs(succ_begin(Victim), succ_end(Victim));

      DeleteDeadBlock(Victim);

      for (BasicBlock *Succ : Succs)
        if (pred_begin(Succ) == pred_end(Succ))
          DeadBlocks.insert(Succ);
    }

    // Merge pairs of basic blocks with unconditional branches, connected by
    // a single edge.
    if (EverMadeChange || MadeChange)
      MadeChange |= eliminateFallThrough(F);

    EverMadeChange |= MadeChange;
  }

  if (!DisableGCOpts) {
    // Collect the statepoints first, then simplify each one.
    SmallVector<Instruction *, 2> Statepoints;
    for (BasicBlock &BB : F)
      for (Instruction &Inst : BB)
        if (isStatepoint(Inst))
          Statepoints.push_back(&Inst);
    for (Instruction *SP : Statepoints)
      EverMadeChange |= simplifyOffsetableRelocate(*SP);
  }

  // Do this last to clean up use-before-def scenarios introduced by other
  // preparatory transforms.
  EverMadeChange |= placeDbgValues(F);

  return EverMadeChange;
}
/// Fold a block into its predecessor whenever the two are linked by a single
/// edge: the block has exactly one predecessor, and that predecessor ends in
/// an unconditional branch. The entry block, self-loops and blocks whose
/// address is taken are left alone. Returns true if anything was merged.
bool CodeGenPrepare::eliminateFallThrough(Function &F) {
  // Snapshot every block except the entry block. Weak handles are used so
  // that blocks deleted by an earlier merge show up as null afterwards.
  SmallVector<WeakTrackingVH, 16> Candidates;
  for (auto BBI = std::next(F.begin()), BBE = F.end(); BBI != BBE; ++BBI)
    Candidates.push_back(&*BBI);

  bool Merged = false;
  for (auto &Handle : Candidates) {
    auto *BB = cast_or_null<BasicBlock>(Handle);
    if (!BB)
      continue;

    // Only a block with exactly one predecessor forms a trivial edge. Skip
    // self-loops and blocks whose address is taken.
    BasicBlock *OnlyPred = BB->getSinglePredecessor();
    if (!OnlyPred || OnlyPred == BB || BB->hasAddressTaken())
      continue;

    // The predecessor has to end in an unconditional branch.
    auto *PredBr = dyn_cast<BranchInst>(OnlyPred->getTerminator());
    if (!PredBr || PredBr->isConditional())
      continue;

    Merged = true;
    LLVM_DEBUG(dbgs() << "To merge:\n" << *BB << "\n\n\n");

    // Merge BB into its predecessor and delete it.
    MergeBlockIntoPredecessor(BB);
  }
  return Merged;
}
/// If BB is a mergeable empty block, return the block it branches to;
/// otherwise return null. "Mergeable empty" means: BB ends in an
/// unconditional branch, everything before the branch is a PHI node or debug
/// info, BB does not branch to itself, and canMergeBlocks accepts the pair.
BasicBlock *CodeGenPrepare::findDestBlockOfMergeableEmptyBlock(BasicBlock *BB) {
  // Only blocks terminated by an unconditional branch qualify.
  auto *Br = dyn_cast<BranchInst>(BB->getTerminator());
  if (!Br || !Br->isUnconditional())
    return nullptr;

  // Walk backwards from the branch over debug info. Whatever is found there
  // must be a PHI node (or debug info at the very start of the block),
  // otherwise the block does real work.
  BasicBlock::iterator Cursor = Br->getIterator();
  if (Cursor != BB->begin()) {
    --Cursor;
    while (isa<DbgInfoIntrinsic>(Cursor) && Cursor != BB->begin())
      --Cursor;
    if (!isa<DbgInfoIntrinsic>(Cursor) && !isa<PHINode>(Cursor))
      return nullptr;
  }

  // Do not break infinite loops.
  BasicBlock *Target = Br->getSuccessor(0);
  if (Target == BB)
    return nullptr;

  return canMergeBlocks(BB, Target) ? Target : nullptr;
}
/// Remove blocks consisting solely of PHI nodes, debug info directives and an
/// unconditional branch. Passes that run before isel (e.g. LSR/loopsimplify)
/// often split edges in ways that are non-optimal for isel; getting rid of
/// those blocks first lets this pass split edges the way it wants them.
/// Returns true if at least one block was eliminated.
bool CodeGenPrepare::eliminateMostlyEmptyBlocks(Function &F) {
  // Gather the preheader of every loop, at any nesting depth.
  SmallPtrSet<BasicBlock *, 16> Preheaders;
  SmallVector<Loop *, 16> PendingLoops(LI->begin(), LI->end());
  while (!PendingLoops.empty()) {
    Loop *Current = PendingLoops.pop_back_val();
    PendingLoops.append(Current->begin(), Current->end());
    if (BasicBlock *PH = Current->getLoopPreheader())
      Preheaders.insert(PH);
  }

  // Work on a snapshot of the blocks so that deleting some of them does not
  // invalidate the iteration. The entry block is intentionally left out.
  SmallVector<WeakTrackingVH, 16> Snapshot;
  for (auto BBI = std::next(F.begin()), BBE = F.end(); BBI != BBE; ++BBI)
    Snapshot.push_back(&*BBI);

  bool Eliminated = false;
  for (auto &Handle : Snapshot) {
    BasicBlock *BB = cast_or_null<BasicBlock>(Handle);
    if (!BB)
      continue;

    BasicBlock *Target = findDestBlockOfMergeableEmptyBlock(BB);
    if (!Target)
      continue;
    if (!isMergingEmptyBlockProfitable(BB, Target, Preheaders.count(BB)))
      continue;

    eliminateMostlyEmptyBlock(BB);
    Eliminated = true;
  }
  return Eliminated;
}
/// Decide whether the empty block BB should be merged into DestBB.
/// \p isPreheader tells whether BB is a loop preheader. Returns true when
/// merging should go ahead.
bool CodeGenPrepare::isMergingEmptyBlockProfitable(BasicBlock *BB,
                                                   BasicBlock *DestBB,
                                                   bool isPreheader) {
  // Do not delete loop preheaders if doing so would create a critical edge.
  // Loop preheaders can be good locations to spill registers. If the
  // preheader is deleted and we create a critical edge, registers may be
  // spilled in the loop body instead.
  if (isPreheader && !DisablePreheaderProtect) {
    BasicBlock *OnlyPred = BB->getSinglePredecessor();
    if (!OnlyPred || !OnlyPred->getSingleSuccessor())
      return false;
  }

  // Skip merging if the block's successor is also a successor to any callbr
  // that leads to this block.
  // FIXME: Is this really needed? Is this a correctness issue?
  for (pred_iterator It = pred_begin(BB), End = pred_end(BB); It != End;
       ++It) {
    auto *CallBr = dyn_cast<CallBrInst>((*It)->getTerminator());
    if (!CallBr)
      continue;
    for (unsigned Idx = 0, NumSuccs = CallBr->getNumSuccessors();
         Idx != NumSuccs; ++Idx)
      if (CallBr->getSuccessor(Idx) == DestBB)
        return false;
  }

  // Try to skip merging if the unique predecessor of BB is terminated by a
  // switch or indirect branch instruction, and BB is used as an incoming
  // block of PHIs in DestBB. In that case merging would make ISel add COPY
  // instructions in the predecessor of BB instead of in BB. The critical edge
  // created by merging such blocks won't be split in MachineSink because the
  // jump table is not analyzable. Keeping the empty block lets ISel place the
  // COPY instructions in BB.
  BasicBlock *Pred = BB->getUniquePredecessor();
  if (!Pred)
    return true;
  auto *PredTerm = Pred->getTerminator();
  if (!isa<SwitchInst>(PredTerm) && !isa<IndirectBrInst>(PredTerm))
    return true;

  if (BB->getTerminator() != BB->getFirstNonPHIOrDbg())
    return true;

  // Cost heuristic: skipping the merge is profitable when
  //   Cost(skipping merging) < Cost(merging BB), where
  //   Cost(skipping merging) = Freq(BB) * (Cost(Copy) + Cost(Branch)) and
  //   Cost(merging BB)       = Freq(Pred) * Cost(Copy).
  // Assuming Cost(Copy) == Cost(Branch) this simplifies to
  //   Freq(Pred) / Freq(BB) > 2.
  // If several empty blocks share the same incoming values for the PHIs in
  // DestBB they are considered together, i.e. their frequencies are summed.
  if (!isa<PHINode>(DestBB->begin()))
    return true;

  // Find all other incoming blocks from which the incoming values of all
  // PHIs in DestBB are the same as the ones coming from BB.
  SmallPtrSet<BasicBlock *, 16> SameIncomingValueBBs;
  for (pred_iterator It = pred_begin(DestBB), End = pred_end(DestBB);
       It != End; ++It) {
    BasicBlock *OtherPred = *It;
    if (OtherPred == BB)
      continue;

    const bool AllPhisAgree =
        llvm::all_of(DestBB->phis(), [&](const PHINode &DestPN) {
          return DestPN.getIncomingValueForBlock(BB) ==
                 DestPN.getIncomingValueForBlock(OtherPred);
        });
    if (AllPhisAgree)
      SameIncomingValueBBs.insert(OtherPred);
  }

  // If all of BB's incoming values match the ones from Pred, there is no
  // reason to skip merging: the COPYs are expected to be placed in Pred
  // already.
  if (SameIncomingValueBBs.count(Pred))
    return true;

  BlockFrequency PredFreq = BFI->getBlockFreq(Pred);
  BlockFrequency BBFreq = BFI->getBlockFreq(BB);

  // Add the frequency of sibling empty blocks that hang off the same
  // predecessor and would merge into the same destination.
  for (auto *Sibling : SameIncomingValueBBs) {
    if (Sibling->getUniquePredecessor() != Pred)
      continue;
    if (DestBB != findDestBlockOfMergeableEmptyBlock(Sibling))
      continue;
    BBFreq += BFI->getBlockFreq(Sibling);
  }

  return PredFreq.getFrequency() <=
         BBFreq.getFrequency() * FreqRatioToSkipMerge;
}
/// Return true if we can merge BB into DestBB if there is a single
/// unconditional branch between them, and BB contains no other non-phi
/// instructions.
bool CodeGenPrepare::canMergeBlocks(const BasicBlock *BB,
const BasicBlock *DestBB) const {
// We only want to eliminate blocks whose phi nodes are used by phi nodes in
// the successor. If there are more complex condition (e.g. preheaders),
// don't mess around with them.
for (const PHINode &PN : BB->phis()) {
for (const User *U : PN.users()) {
const Instruction *UI = cast<Instruction>(U);
if (UI->getParent() != DestBB || !isa<PHINode>(UI))
return false;
// If User is inside DestBB block and it is a PHINode then check
// incoming value. If incoming value is not from BB then this is
// a complex condition (e.g. preheaders) we want to avoid here.
if (UI->getParent() == DestBB) {
if (const PHINode *UPN = dyn_cast<PHINode>(UI))
for (unsigned I = 0, E = UPN->getNumIncomingValues(); I != E; ++I) {
Instruction *Insn = dyn_cast<Instruction>(UPN->getIncomingValue(I));
if (Insn && Insn->getParent() == BB &&
Insn->getParent() != UPN->getIncomingBlock(I))
return false;
}
}
}
}
// If BB and DestBB contain any common predecessors, then the phi nodes in BB
// and DestBB may have conflicting incoming values for the block. If so, we
// can't merge the block.
const PHINode *DestBBPN = dyn_cast<PHINode>(DestBB->begin());
if (!DestBBPN) return true; // no conflict.
// Collect the preds of BB.
SmallPtrSet<const BasicBlock*, 16> BBPreds;
if (const PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
// It is faster to get preds from a PHI than with pred_iterator.
for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
BBPreds.insert(BBPN->getIncomingBlock(i));
} else {
BBPreds.insert(pred_begin(BB), pred_end(BB));
}
// Walk the preds of DestBB.
for (unsigned i = 0, e = DestBBPN->getNumIncomingValues(); i != e; ++i) {
BasicBlock *Pred = DestBBPN->getIncomingBlock(i);
if (BBPreds.count(Pred)) { // Common predecessor?
for (const PHINode &PN : DestBB->phis()) {
const Value *V1 = PN.getIncomingValueForBlock(Pred);
const Value *V2 = PN.getIncomingValueForBlock(BB);
// If V2 is a phi node in BB, look up what the mapped value will be.
if (const PHINode *V2PN = dyn_cast<PHINode>(V2))
if (V2PN->getParent() == BB)
V2 = V2PN->getIncomingValueForBlock(Pred);
// If there is a conflict, bail out.
if (V1 != V2) return false;
}
}
}
return true;
}
/// Remove \p BB, a block that consists solely of PHI nodes followed by an
/// unconditional branch, by redirecting everything to its successor.
void CodeGenPrepare::eliminateMostlyEmptyBlock(BasicBlock *BB) {
  BranchInst *BI = cast<BranchInst>(BB->getTerminator());
  BasicBlock *DestBB = BI->getSuccessor(0);

  LLVM_DEBUG(dbgs() << "MERGING MOSTLY EMPTY BLOCKS - BEFORE:\n"
                    << *BB << *DestBB);

  // A destination with exactly one predecessor makes this a trivial edge
  // that can simply be collapsed.
  BasicBlock *SinglePred = DestBB->getSinglePredecessor();
  if (SinglePred && SinglePred != DestBB) {
    assert(SinglePred == BB &&
           "Single predecessor not the same as predecessor");
    // Fold DestBB into SinglePred (which is BB) and delete DestBB.
    MergeBlockIntoPredecessor(DestBB);
    // Note: on this path BB (== SinglePred) survives; it is DestBB, its
    // single successor, that has been deleted.
    LLVM_DEBUG(dbgs() << "AFTER:\n" << *SinglePred << "\n\n\n");
    return;
  }

  // DestBB is about to receive one new edge per predecessor of BB, so its
  // PHIs have to be rewritten accordingly. BB itself is not modified by the
  // loop below, so its leading PHI (if any) can be looked up once.
  PHINode *FirstBBPhi = dyn_cast<PHINode>(BB->begin());
  for (PHINode &DestPhi : DestBB->phis()) {
    // Drop the entry for BB but hold on to the value it carried.
    Value *Removed = DestPhi.removeIncomingValue(BB, false);

    // Case 1: the value is a PHI defined in BB. Forward each of its inputs
    // directly to the PHI in DestBB.
    PHINode *RemovedPhi = dyn_cast<PHINode>(Removed);
    if (RemovedPhi && RemovedPhi->getParent() == BB) {
      for (unsigned Idx = 0, NumIn = RemovedPhi->getNumIncomingValues();
           Idx != NumIn; ++Idx)
        DestPhi.addIncoming(RemovedPhi->getIncomingValue(Idx),
                            RemovedPhi->getIncomingBlock(Idx));
      continue;
    }

    // Case 2: the value dominates BB. Add one copy of it for every edge
    // that is going to be redirected.
    if (FirstBBPhi) {
      for (unsigned Idx = 0, NumIn = FirstBBPhi->getNumIncomingValues();
           Idx != NumIn; ++Idx)
        DestPhi.addIncoming(Removed, FirstBBPhi->getIncomingBlock(Idx));
    } else {
      for (pred_iterator PredIt = pred_begin(BB), PredEnd = pred_end(BB);
           PredIt != PredEnd; ++PredIt)
        DestPhi.addIncoming(Removed, *PredIt);
    }
  }

  // With the PHIs adjusted, point every reference to BB at DestBB and get
  // rid of BB.
  BB->replaceAllUsesWith(DestBB);
  BB->eraseFromParent();
  ++NumBlocksElim;

  LLVM_DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");
}
// Computes a map of base pointer relocation instructions to corresponding
// derived pointer relocation instructions given a vector of all relocate calls
static void computeBaseDerivedRelocateMap(
const SmallVectorImpl<GCRelocateInst *> &AllRelocateCalls,
DenseMap<GCRelocateInst *, SmallVector<GCRelocateInst *, 2>>
&RelocateInstMap) {
// Collect information in two maps: one primarily for locating the base object
// while filling the second map; the second map is the final structure holding
// a mapping between Base and corresponding Derived relocate calls
DenseMap<std::pair<unsigned, unsigned>, GCRelocateInst *> RelocateIdxMap;
for (auto *ThisRelocate : AllRelocateCalls) {
auto K = std::make_pair(ThisRelocate->getBasePtrIndex(),
ThisRelocate->getDerivedPtrIndex());
RelocateIdxMap.insert(std::make_pair(K, ThisRelocate));
}
for (auto &Item : RelocateIdxMap) {
std::pair<unsigned, unsigned> Key = Item.first;
if (Key.first == Key.second)
// Base relocation: nothing to insert
continue;
GCRelocateInst *I = Item.second;
auto BaseKey = std::make_pair(Key.first, Key.first);
// We're iterating over RelocateIdxMap so we cannot modify it.
auto MaybeBase = RelocateIdxMap.find(BaseKey);
if (MaybeBase == RelocateIdxMap.end())
// TODO: We might want to insert a new base object relocate and gep off
// that, if there are enough derived object relocates.
continue;
RelocateInstMap[MaybeBase->second].push_back(I);
}
}
// Accepts a GEP and extracts its index operands into OffsetV, provided every
// one of them is a small integer constant (unsigned value of at most 20).
//
// Returns true and appends all index operands to OffsetV on success. Returns
// false and leaves OffsetV untouched as soon as one index is not a
// ConstantInt or exceeds the limit.
static bool getGEPSmallConstantIntOffsetV(GetElementPtrInst *GEP,
                                          SmallVectorImpl<Value *> &OffsetV) {
  const unsigned NumOps = GEP->getNumOperands();

  // Operand 0 is the pointer; the indices start at operand 1.
  for (unsigned i = 1; i < NumOps; i++) {
    // Only accept small constant integer operands. The comparison is done on
    // the APInt itself: getZExtValue() asserts when the constant needs more
    // than 64 bits, which a wide index type (e.g. i128) can trigger.
    auto *Op = dyn_cast<ConstantInt>(GEP->getOperand(i));
    if (!Op || Op->getValue().ugt(20))
      return false;
  }

  // All indices qualified; only now is the output vector modified.
  for (unsigned i = 1; i < NumOps; i++)
    OffsetV.push_back(GEP->getOperand(i));
  return true;
}
// Takes a RelocatedBase (base pointer relocation instruction) and Targets to
// replace, computes a replacement, and effects it.
//
// Each target that is a relocate of a GEP off the original base with small
// constant indices is replaced by an equivalent GEP off RelocatedBase and
// then erased. Returns true if the IR was modified in any way, including the
// case where only RelocatedBase itself was moved.
static bool
simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase,
                          const SmallVectorImpl<GCRelocateInst *> &Targets) {
  bool MadeChange = false;
  // We must ensure the relocation of derived pointer is defined after
  // relocation of base pointer. If we find a relocation corresponding to base
  // defined earlier than relocation of base then we move relocation of base
  // right before found relocation. We consider only relocation in the same
  // basic block as relocation of base. Relocations from other basic block will
  // be skipped by optimization and we do not care about them.
  for (auto R = RelocatedBase->getParent()->getFirstInsertionPt();
       &*R != RelocatedBase; ++R)
    if (auto RI = dyn_cast<GCRelocateInst>(R))
      if (RI->getStatepoint() == RelocatedBase->getStatepoint())
        if (RI->getBasePtrIndex() == RelocatedBase->getBasePtrIndex()) {
          RelocatedBase->moveBefore(RI);
          // Moving an instruction is an IR modification and has to be
          // reported even when no target ends up being rewritten below.
          MadeChange = true;
          break;
        }

  for (GCRelocateInst *ToReplace : Targets) {
    assert(ToReplace->getBasePtrIndex() == RelocatedBase->getBasePtrIndex() &&
           "Not relocating a derived object of the original base object");
    if (ToReplace->getBasePtrIndex() == ToReplace->getDerivedPtrIndex()) {
      // A duplicate relocate call. TODO: coalesce duplicates.
      continue;
    }

    if (RelocatedBase->getParent() != ToReplace->getParent()) {
      // Base and derived relocates are in different basic blocks.
      // In this case transform is only valid when base dominates derived
      // relocate. However it would be too expensive to check dominance
      // for each such relocate, so we skip the whole transformation.
      continue;
    }

    Value *Base = ToReplace->getBasePtr();
    auto Derived = dyn_cast<GetElementPtrInst>(ToReplace->getDerivedPtr());
    if (!Derived || Derived->getPointerOperand() != Base)
      continue;

    SmallVector<Value *, 2> OffsetV;
    if (!getGEPSmallConstantIntOffsetV(Derived, OffsetV))
      continue;

    // Create a Builder and replace the target callsite with a gep
    assert(RelocatedBase->getNextNode() &&
           "Should always have one since it's not a terminator");

    // Insert after RelocatedBase
    IRBuilder<> Builder(RelocatedBase->getNextNode());
    Builder.SetCurrentDebugLocation(ToReplace->getDebugLoc());

    // If gc_relocate does not match the actual type, cast it to the right type.
    // In theory, there must be a bitcast after gc_relocate if the type does not
    // match, and we should reuse it to get the derived pointer. But it could be
    // cases like this:
    // bb1:
    //  ...
    //  %g1 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(...)
    //  br label %merge
    //
    // bb2:
    //  ...
    //  %g2 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(...)
    //  br label %merge
    //
    // merge:
    //  %p1 = phi i8 addrspace(1)* [ %g1, %bb1 ], [ %g2, %bb2 ]
    //  %cast = bitcast i8 addrspace(1)* %p1 to i32 addrspace(1)*
    //
    // In this case, we can not find the bitcast any more. So we insert a new
    // bitcast whether or not one already exists. In this way, we can handle
    // all cases, and the extra bitcast should be optimized away in later
    // passes.
    Value *ActualRelocatedBase = RelocatedBase;
    if (RelocatedBase->getType() != Base->getType()) {
      ActualRelocatedBase =
          Builder.CreateBitCast(RelocatedBase, Base->getType());
    }
    Value *Replacement = Builder.CreateGEP(
        Derived->getSourceElementType(), ActualRelocatedBase, makeArrayRef(OffsetV));
    Replacement->takeName(ToReplace);
    // If the newly generated derived pointer's type does not match the original
    // derived pointer's type, cast the new derived pointer to match it. Same
    // reasoning as above.
    Value *ActualReplacement = Replacement;
    if (Replacement->getType() != ToReplace->getType()) {
      ActualReplacement =
          Builder.CreateBitCast(Replacement, ToReplace->getType());
    }
    ToReplace->replaceAllUsesWith(ActualReplacement);
    ToReplace->eraseFromParent();

    MadeChange = true;
  }
  return MadeChange;
}
// Turns this:
//
// %base = ...
// %ptr = gep %base + 15
// %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)
// %base' = relocate(%tok, i32 4, i32 4)
// %ptr' = relocate(%tok, i32 4, i32 5)
// %val = load %ptr'
//
// into this:
//
// %base = ...
// %ptr = gep %base + 15
// %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)
// %base' = gc.relocate(%tok, i32 4, i32 4)
// %ptr' = gep %base' + 15
// %val = load %ptr'
//
// I is the instruction whose users are scanned for gc.relocate calls.
// Returns true if any of the base/derived groups was modified.
bool CodeGenPrepare::simplifyOffsetableRelocate(Instruction &I) {
  bool MadeChange = false;
  SmallVector<GCRelocateInst *, 2> AllRelocateCalls;

  for (auto *U : I.users())
    if (GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U))
      // Collect all the relocate calls associated with a statepoint
      AllRelocateCalls.push_back(Relocate);

  // We need at least one base pointer relocation + one derived pointer
  // relocation to mangle
  if (AllRelocateCalls.size() < 2)
    return false;

  // RelocateInstMap is a mapping from the base relocate instruction to the
  // corresponding derived relocate instructions
  DenseMap<GCRelocateInst *, SmallVector<GCRelocateInst *, 2>> RelocateInstMap;
  computeBaseDerivedRelocateMap(AllRelocateCalls, RelocateInstMap);
  if (RelocateInstMap.empty())
    return false;

  for (auto &Item : RelocateInstMap)
    // Item.first is the RelocatedBase to offset against
    // Item.second is the vector of Targets to replace
    //
    // Accumulate with |= so that a group that changed nothing cannot erase
    // the fact that an earlier group did modify the IR.
    MadeChange |= simplifyRelocatesOffABase(Item.first, Item.second);
  return MadeChange;
}
/// Push copies of the cast \p CI down into the blocks of its users, erasing
/// the original once it has no uses left. Returns true if anything changed.
static bool SinkCast(CastInst *CI) {
  BasicBlock *DefBB = CI->getParent();

  /// At most one sunk cast is created per destination block.
  DenseMap<BasicBlock *, CastInst *> InsertedCasts;

  bool MadeChange = false;
  Value::user_iterator UseIt = CI->user_begin();
  const Value::user_iterator UseEnd = CI->user_end();
  while (UseIt != UseEnd) {
    Use &CurUse = UseIt.getUse();
    Instruction *UserInst = cast<Instruction>(*UseIt);

    // Determine the block in which the cast is consumed. A PHI consumes its
    // operand in the matching predecessor block.
    BasicBlock *UserBB = UserInst->getParent();
    if (PHINode *Phi = dyn_cast<PHINode>(UserInst))
      UserBB = Phi->getIncomingBlock(CurUse);

    // Step to the next use now; rewriting CurUse below would otherwise
    // invalidate the iterator.
    ++UseIt;

    // A block with an EH pad has its first insertion point after the pad, so
    // the cast cannot be sunk when the pad itself is the user.
    if (UserInst->isEHPad())
      continue;

    // Likewise give up when the receiving block is an EH pad that permits no
    // non-PHI instructions ahead of its terminator.
    if (UserBB->getTerminator()->isEHPad())
      continue;

    // Users in the defining block keep using the original cast.
    if (UserBB == DefBB)
      continue;

    // Reuse the cast already sunk into this block, or create it on demand.
    CastInst *&SunkCast = InsertedCasts[UserBB];
    if (!SunkCast) {
      BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
      assert(InsertPt != UserBB->end());
      SunkCast = CastInst::Create(CI->getOpcode(), CI->getOperand(0),
                                  CI->getType(), "", &*InsertPt);
      SunkCast->setDebugLoc(CI->getDebugLoc());
    }

    // Redirect this use to the sunk cast.
    CurUse = SunkCast;
    MadeChange = true;
    ++NumCastUses;
  }

  // Once no uses remain, the original cast can go.
  if (CI->use_empty()) {
    salvageDebugInfo(*CI);
    CI->eraseFromParent();
    MadeChange = true;
  }

  return MadeChange;
}
/// Sink \p CI into the blocks of its users when it is a noop copy (e.g. a
/// cast between pointer types, or i32->i8 on PPC), which reduces the number
/// of virtual registers that must be created and coalesced.
///
/// Return true if any changes are made.
static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI,
                                       const DataLayout &DL) {
  // Address-space casts are sunk only when they are "cheap" (or nop). This is
  // a weaker condition than requiring a nop cast, but helps on some
  // platforms.
  if (auto *ASC = dyn_cast<AddrSpaceCastInst>(CI))
    if (!TLI.isFreeAddrSpaceCast(ASC->getSrcAddressSpace(),
                                 ASC->getDestAddressSpace()))
      return false;

  // Classify the cast by the value types of its source and result.
  EVT SrcVT = TLI.getValueType(DL, CI->getOperand(0)->getType());
  EVT DstVT = TLI.getValueType(DL, CI->getType());

  // An fp<->int conversion is never a noop.
  if (SrcVT.isInteger() != DstVT.isInteger())
    return false;

  // An extension is a zero or sign extension, which is not a noop either.
  if (SrcVT.bitsLT(DstVT))
    return false;

  // Types that get promoted are compared by what they are promoted to. This
  // lets truncates on PPC count as noop copies when they are.
  LLVMContext &Ctx = CI->getContext();
  auto PromotedType = [&](EVT VT) {
    if (TLI.getTypeAction(Ctx, VT) == TargetLowering::TypePromoteInteger)
      return TLI.getTypeToTransformTo(Ctx, VT);
    return VT;
  };
  SrcVT = PromotedType(SrcVT);
  DstVT = PromotedType(DstVT);

  // Only identical types after promotion make this a noop copy.
  if (SrcVT != DstVT)
    return false;

  return SinkCast(CI);
}
/// Replace the pair (\p BO, \p Cmp) with a single call to the overflow
/// intrinsic \p IID: the math result takes over the uses of \p BO and the
/// overflow bit takes over the uses of \p Cmp. Both original instructions are
/// erased. Returns false, changing nothing, when they live in different
/// blocks.
bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
                                                 CmpInst *Cmp,
                                                 Intrinsic::ID IID) {
  BasicBlock *CmpBB = Cmp->getParent();

  // We used to use a dominator tree here to allow multi-block optimization.
  // But that was problematic because:
  // 1. It could cause a perf regression by hoisting the math op into the
  //    critical path.
  // 2. It could cause a perf regression by creating a value that was live
  //    across multiple blocks and increasing register pressure.
  // 3. Use of a dominator tree could cause large compile-time regression.
  //    This is because we recompute the DT on every change in the main CGP
  //    run-loop. The recomputing is probably unnecessary in many cases, so if
  //    that was fixed, using a DT here would be ok.
  if (BO->getParent() != CmpBB)
    return false;

  // The canonical IR (add X, C) is allowed to match back to (usubo X, -C).
  Value *LHS = BO->getOperand(0);
  Value *RHS = BO->getOperand(1);
  const bool IsAddAsUSub = BO->getOpcode() == Instruction::Add &&
                           IID == Intrinsic::usub_with_overflow;
  if (IsAddAsUSub) {
    assert(isa<Constant>(RHS) && "Unexpected input for usubo");
    RHS = ConstantExpr::getNeg(cast<Constant>(RHS));
  }

  // The intrinsic goes wherever the first instruction of the pair sits.
  Instruction *InsertPt = nullptr;
  for (Instruction &Candidate : *CmpBB) {
    if (&Candidate != BO && &Candidate != Cmp)
      continue;
    InsertPt = &Candidate;
    break;
  }
  assert(InsertPt != nullptr && "Parent block did not contain cmp or binop");

  IRBuilder<> Builder(InsertPt);
  Value *MathOV = Builder.CreateBinaryIntrinsic(IID, LHS, RHS);
  Value *Math = Builder.CreateExtractValue(MathOV, 0, "math");
  Value *OV = Builder.CreateExtractValue(MathOV, 1, "ov");

  // Hand the uses over before deleting the originals.
  BO->replaceAllUsesWith(Math);
  Cmp->replaceAllUsesWith(OV);
  BO->eraseFromParent();
  Cmp->eraseFromParent();
  return true;
}
/// Recognize the constant special cases of an unsigned-add overflow check.
/// On success \p Add is set to the matching add and true is returned.
static bool matchUAddWithOverflowConstantEdgeCases(CmpInst *Cmp,
                                                   BinaryOperator *&Add) {
  // Add = add A, 1; Cmp = icmp eq A,-1 (overflow if A is max val)
  // Add = add A,-1; Cmp = icmp ne A, 0 (overflow if A is non-zero)
  Value *A = Cmp->getOperand(0);
  Value *B = Cmp->getOperand(1);

  // Non-canonical/degenerate code is not expected here; simply give up.
  if (isa<Constant>(A))
    return false;

  // Translate the compare constant into the constant the add must use.
  const ICmpInst::Predicate Pred = Cmp->getPredicate();
  const bool IsEqAllOnes = Pred == ICmpInst::ICMP_EQ && match(B, m_AllOnes());
  const bool IsNeZero =
      !IsEqAllOnes && Pred == ICmpInst::ICMP_NE && match(B, m_ZeroInt());
  if (IsEqAllOnes)
    B = ConstantInt::get(B->getType(), 1);
  else if (IsNeZero)
    B = ConstantInt::get(B->getType(), -1);
  else
    return false;

  // Search the users of the compare's variable operand for an add that uses
  // the adjusted constant.
  for (User *U : A->users()) {
    if (!match(U, m_Add(m_Specific(A), m_Specific(B))))
      continue;
    Add = cast<BinaryOperator>(U);
    return true;
  }
  return false;
}
/// Attempt to fold the compare \p Cmp and its add into one call to the
/// llvm.uadd.with.overflow intrinsic. Return true if any changes were made,
/// in which case \p ModifiedDT is set as well.
bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp,
                                               bool &ModifiedDT) {
  Value *A, *B;
  BinaryOperator *Add;

  // Try the general overflow idiom first, then the constant edge cases.
  const bool Matched =
      match(Cmp, m_UAddWithOverflow(m_Value(A), m_Value(B), m_BinOp(Add))) ||
      matchUAddWithOverflowConstantEdgeCases(Cmp, Add);
  if (!Matched)
    return false;

  // Let the target decide whether an overflow op is wanted for this type.
  EVT AddVT = TLI->getValueType(*DL, Add->getType());
  if (!TLI->shouldFormOverflowOp(ISD::UADDO, AddVT))
    return false;

  // We don't want to move around uses of condition values this late, so we
  // check if it is legal to create the call to the intrinsic in the basic
  // block containing the icmp.
  const bool SameBlock = Add->getParent() == Cmp->getParent();
  if (!SameBlock && !Add->hasOneUse())
    return false;

  if (!replaceMathCmpWithIntrinsic(Add, Cmp, Intrinsic::uadd_with_overflow))
    return false;

  // Reset callers - do not crash by iterating over a dead instruction.
  ModifiedDT = true;
  return true;
}
/// Attempt to fold the compare \p Cmp and a matching subtract (or its
/// canonical add-of-negated-constant form) into one call to the
/// llvm.usub.with.overflow intrinsic. Returns true and sets \p ModifiedDT
/// when the replacement was performed.
bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
                                               bool &ModifiedDT) {
  // Non-canonical/degenerate code is not expected here; simply give up.
  Value *A = Cmp->getOperand(0);
  Value *B = Cmp->getOperand(1);
  if (isa<Constant>(A) && isa<Constant>(B))
    return false;

  // Normalize every supported compare to the form (A u< B). The three cases
  // start from different predicates, so at most one of them applies.
  ICmpInst::Predicate Pred = Cmp->getPredicate();
  if (Pred == ICmpInst::ICMP_UGT) {
    // (A u> B) is (B u< A).
    std::swap(A, B);
    Pred = ICmpInst::ICMP_ULT;
  } else if (Pred == ICmpInst::ICMP_EQ && match(B, m_ZeroInt())) {
    // (A == 0) is the same as (A u< 1).
    B = ConstantInt::get(B->getType(), 1);
    Pred = ICmpInst::ICMP_ULT;
  } else if (Pred == ICmpInst::ICMP_NE && match(B, m_ZeroInt())) {
    // (A != 0) is the same as (0 u< A).
    std::swap(A, B);
    Pred = ICmpInst::ICMP_ULT;
  }
  if (Pred != ICmpInst::ICMP_ULT)
    return false;

  // Decides whether a user of the variable operand is the math we want.
  auto IsMatchingMath = [&](User *U) {
    // A - B, A u< B --> usubo(A, B)
    if (match(U, m_Sub(m_Specific(A), m_Specific(B))))
      return true;

    // A + (-C), A u< C (canonicalized form of (sub A, C))
    const APInt *CmpC, *AddC;
    return match(U, m_Add(m_Specific(A), m_APInt(AddC))) &&
           match(B, m_APInt(CmpC)) && *AddC == -(*CmpC);
  };

  // Walk the users of a variable operand of the compare looking for a
  // subtract or add with that same operand. The 2nd operand of the compare
  // is matched too, but it may be a negated constant operand of an add.
  Value *CmpVariableOperand = isa<Constant>(A) ? B : A;
  BinaryOperator *Sub = nullptr;
  for (User *U : CmpVariableOperand->users()) {
    if (!IsMatchingMath(U))
      continue;
    Sub = cast<BinaryOperator>(U);
    break;
  }
  if (!Sub)
    return false;

  // Let the target decide whether an overflow op is wanted for this type.
  EVT SubVT = TLI->getValueType(*DL, Sub->getType());
  if (!TLI->shouldFormOverflowOp(ISD::USUBO, SubVT))
    return false;

  if (!replaceMathCmpWithIntrinsic(Sub, Cmp, Intrinsic::usub_with_overflow))
    return false;

  // Reset callers - do not crash by iterating over a dead instruction.
  ModifiedDT = true;
  return true;
}
/// Push copies of \p Cmp down into the blocks of its users so that fewer
/// virtual registers must be created and coalesced. This is a clear win
/// except on targets with multiple condition code registers (PowerPC), where
/// it might lose; some adjustment may be wanted there.
///
/// Return true if any changes are made.
static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) {
  if (TLI.hasMultipleConditionRegisters())
    return false;

  // Soft-FP comparisons stay put, since sinking can move them into a loop.
  if (TLI.useSoftFloat() && isa<FCmpInst>(Cmp))
    return false;

  BasicBlock *DefBB = Cmp->getParent();

  // At most one sunk cmp is created per destination block.
  DenseMap<BasicBlock *, CmpInst *> InsertedCmps;

  bool MadeChange = false;
  Value::user_iterator UseIt = Cmp->user_begin();
  const Value::user_iterator UseEnd = Cmp->user_end();
  while (UseIt != UseEnd) {
    Use &CurUse = UseIt.getUse();
    Instruction *UserInst = cast<Instruction>(*UseIt);

    // Step to the next use now; rewriting CurUse below would otherwise
    // invalidate the iterator.
    ++UseIt;

    // PHI nodes are not worth the trouble.
    if (isa<PHINode>(UserInst))
      continue;

    // Users in the defining block keep using the original cmp.
    BasicBlock *UserBB = UserInst->getParent();
    if (UserBB == DefBB)
      continue;

    // Reuse the cmp already sunk into this block, or create it on demand.
    CmpInst *&SunkCmp = InsertedCmps[UserBB];
    if (!SunkCmp) {
      BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
      assert(InsertPt != UserBB->end());
      SunkCmp = CmpInst::Create(Cmp->getOpcode(), Cmp->getPredicate(),
                                Cmp->getOperand(0), Cmp->getOperand(1), "",
                                &*InsertPt);
      // Propagate the debug info.
      SunkCmp->setDebugLoc(Cmp->getDebugLoc());
    }

    // Redirect this use to the sunk cmp.
    CurUse = SunkCmp;
    MadeChange = true;
    ++NumCmpUses;
  }

  // Once no uses remain, the original cmp can go.
  if (Cmp->use_empty()) {
    Cmp->eraseFromParent();
    MadeChange = true;
  }

  return MadeChange;
}
/// Run the compare optimizations on \p Cmp in order (sinking, then forming
/// uadd.with.overflow, then usub.with.overflow) and stop at the first one
/// that reports a change. Returns true if one of them changed something.
bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, bool &ModifiedDT) {
  // Short-circuit evaluation keeps the later transforms from running once an
  // earlier one has succeeded.
  return sinkCmpExpression(Cmp, *TLI) ||
         combineToUAddWithOverflow(Cmp, ModifiedDT) ||
         combineToUSubWithOverflow(Cmp, ModifiedDT);
}
/// Clone the 'and' instruction \p AndI next to each compare that uses it, so
/// that isel can generate better code on targets where the two operations
/// can be combined.
///
/// Return true if any changes are made.
static bool sinkAndCmp0Expression(Instruction *AndI,
                                  const TargetLowering &TLI,
                                  SetOfInstrs &InsertedInsts) {
  // Sanity check: this instruction must not have been produced by another
  // part of this pass already.
  assert(!InsertedInsts.count(AndI) &&
         "Attempting to optimize already optimized and instruction");
  (void) InsertedInsts;

  // A lone use inside the defining block leaves nothing to do.
  if (AndI->hasOneUse()) {
    Instruction *OnlyUser = cast<Instruction>(*AndI->user_begin());
    if (OnlyUser->getParent() == AndI->getParent())
      return false;
  }

  // Stay away from cases where sinking/duplicating is likely to increase
  // register pressure.
  Value *Op0 = AndI->getOperand(0);
  Value *Op1 = AndI->getOperand(1);
  if (!isa<ConstantInt>(Op0) && !isa<ConstantInt>(Op1) && Op0->hasOneUse() &&
      Op1->hasOneUse())
    return false;

  // Every user has to be an icmp against the constant zero.
  for (auto *U : AndI->users()) {
    Instruction *UserInst = cast<Instruction>(U);
    if (!isa<ICmpInst>(UserInst))
      return false;

    auto *CmpRHS = dyn_cast<ConstantInt>(UserInst->getOperand(1));
    if (!CmpRHS || !CmpRHS->isZero())
      return false;
  }

  if (!TLI.isMaskAndCmp0FoldingBeneficial(*AndI))
    return false;

  LLVM_DEBUG(dbgs() << "found 'and' feeding only icmp 0;\n");
  LLVM_DEBUG(AndI->getParent()->dump());

  // Push the 'and' into the same block as the icmp 0. There should only be
  // one (icmp (and, 0)) in each block, since CSE/GVN should have removed any
  // others, so we don't need to keep track of which BBs we insert into.
  Value::user_iterator UseIt = AndI->user_begin();
  const Value::user_iterator UseEnd = AndI->user_end();
  while (UseIt != UseEnd) {
    Use &CurUse = UseIt.getUse();
    Instruction *UserInst = cast<Instruction>(*UseIt);

    // Step to the next use now; rewriting CurUse below would otherwise
    // invalidate the iterator.
    ++UseIt;

    LLVM_DEBUG(dbgs() << "sinking 'and' use: " << *UserInst << "\n");

    // A user in the defining block gets its clone right where the original
    // 'and' is; any other user gets it directly in front of itself.
    Instruction *InsertPt = UserInst;
    if (UserInst->getParent() == AndI->getParent())
      InsertPt = AndI;
    Instruction *ClonedAnd = BinaryOperator::Create(
        Instruction::And, AndI->getOperand(0), AndI->getOperand(1), "",
        InsertPt);
    // Propagate the debug info.
    ClonedAnd->setDebugLoc(AndI->getDebugLoc());

    // Redirect this use to the clone.
    CurUse = ClonedAnd;
    ++NumAndUses;
    LLVM_DEBUG(UserInst->getParent()->dump());
  }

  // Every use has been redirected, so the original 'and' is dead.
  AndI->eraseFromParent();
  return true;
}
/// Tell whether \p User is an instruction that could be combined with a
/// shift, which is the case for:
/// 1. A truncate instruction.
/// 2. An 'and' instruction whose immediate is a mask of the low bits:
///    imm & (imm+1) == 0
static bool isExtractBitsCandidateUse(Instruction *User) {
  if (isa<TruncInst>(User))
    return true;

  if (User->getOpcode() != Instruction::And)
    return false;

  auto *MaskC = dyn_cast<ConstantInt>(User->getOperand(1));
  if (!MaskC)
    return false;

  // A low-bit mask shares no set bit with its own successor.
  const APInt &Cimm = MaskC->getValue();
  return !(Cimm & (Cimm + 1)).getBoolValue();
}
/// Sink both shift and truncate instruction to the use of truncate's BB.
///
/// \p ShiftI is the shift, \p User is its user and must be a TruncInst,
/// \p CI is the shift amount. \p InsertedShifts records, per block, the shift
/// that has already been sunk there; it is shared with the caller and updated
/// here. Returns true if a shift/trunc pair was inserted into at least one
/// block.
static bool
SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI,
                     DenseMap<BasicBlock *, BinaryOperator *> &InsertedShifts,
                     const TargetLowering &TLI, const DataLayout &DL) {
  BasicBlock *UserBB = User->getParent();
  DenseMap<BasicBlock *, CastInst *> InsertedTruncs;
  // The result is dereferenced unconditionally below, so state the
  // requirement with cast<> (which asserts) rather than an unchecked
  // dyn_cast<> that would silently yield a null pointer.
  TruncInst *TruncI = cast<TruncInst>(User);
  bool MadeChange = false;

  for (Value::user_iterator TruncUI = TruncI->user_begin(),
                            TruncE = TruncI->user_end();
       TruncUI != TruncE;) {

    Use &TruncTheUse = TruncUI.getUse();
    Instruction *TruncUser = cast<Instruction>(*TruncUI);
    // Preincrement use iterator so we don't invalidate it.

    ++TruncUI;

    int ISDOpcode = TLI.InstructionOpcodeToISD(TruncUser->getOpcode());
    if (!ISDOpcode)
      continue;

    // If the use is actually a legal node, there will not be an
    // implicit truncate.
    // FIXME: always querying the result type is just an
    // approximation; some nodes' legality is determined by the
    // operand or other means. There's no good way to find out though.
    if (TLI.isOperationLegalOrCustom(
            ISDOpcode, TLI.getValueType(DL, TruncUser->getType(), true)))
      continue;

    // Don't bother for PHI nodes.
    if (isa<PHINode>(TruncUser))
      continue;

    BasicBlock *TruncUserBB = TruncUser->getParent();

    if (UserBB == TruncUserBB)
      continue;

    BinaryOperator *&InsertedShift = InsertedShifts[TruncUserBB];
    CastInst *&InsertedTrunc = InsertedTruncs[TruncUserBB];

    // Only blocks that have received neither a shift nor a trunc so far are
    // handled; other users keep referring to the original trunc.
    if (!InsertedShift && !InsertedTrunc) {
      BasicBlock::iterator InsertPt = TruncUserBB->getFirstInsertionPt();
      assert(InsertPt != TruncUserBB->end());
      // Sink the shift
      if (ShiftI->getOpcode() == Instruction::AShr)
        InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI,
                                                   "", &*InsertPt);
      else
        InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI,
                                                   "", &*InsertPt);
      InsertedShift->setDebugLoc(ShiftI->getDebugLoc());

      // Sink the trunc. The insertion point is re-queried and advanced by
      // one so that the trunc lands after the shift created just above.
      BasicBlock::iterator TruncInsertPt = TruncUserBB->getFirstInsertionPt();
      TruncInsertPt++;
      assert(TruncInsertPt != TruncUserBB->end());

      InsertedTrunc = CastInst::Create(TruncI->getOpcode(), InsertedShift,
                                       TruncI->getType(), "", &*TruncInsertPt);
      InsertedTrunc->setDebugLoc(TruncI->getDebugLoc());

      MadeChange = true;

      TruncTheUse = InsertedTrunc;
    }
  }
  return MadeChange;
}
/// Sink the shift *right* instruction into user blocks if the uses could
/// potentially be combined with this shift instruction and generate BitExtract
/// instruction. It will only be applied if the architecture supports BitExtract
/// instruction. Here is an example:
/// BB1:
///   %x.extract.shift = lshr i64 %arg1, 32
/// BB2:
///   %x.extract.trunc = trunc i64 %x.extract.shift to i16
/// ==>
///
/// BB2:
///   %x.extract.shift.1 = lshr i64 %arg1, 32
///   %x.extract.trunc = trunc i64 %x.extract.shift.1 to i16
///
/// CodeGen will recognize the pattern in BB2 and generate BitExtract
/// instruction.
/// Return true if any changes are made.
static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,
                                const TargetLowering &TLI,
                                const DataLayout &DL) {
  BasicBlock *DefBB = ShiftI->getParent();

  /// Only insert instructions in each block once.
  DenseMap<BasicBlock *, BinaryOperator *> InsertedShifts;

  bool shiftIsLegal = TLI.isTypeLegal(TLI.getValueType(DL, ShiftI->getType()));

  bool MadeChange = false;
  for (Value::user_iterator UI = ShiftI->user_begin(), E = ShiftI->user_end();
       UI != E;) {
    Use &TheUse = UI.getUse();
    Instruction *User = cast<Instruction>(*UI);
    // Preincrement use iterator so we don't invalidate it.
    ++UI;

    // Don't bother for PHI nodes.
    if (isa<PHINode>(User))
      continue;

    if (!isExtractBitsCandidateUse(User))
      continue;

    BasicBlock *UserBB = User->getParent();

    if (UserBB == DefBB) {
      // If the shift and truncate instruction are in the same BB. The use of
      // the truncate(TruncUse) may still introduce another truncate if not
      // legal. In this case, we would like to sink both shift and truncate
      // instruction to the BB of TruncUse.
      // for example:
      // BB1:
      //   i64 shift.result = lshr i64 opnd, imm
      //   trunc.result = trunc shift.result to i16
      //
      // BB2:
      //   ----> We will have an implicit truncate here if the architecture does
      //   not have i16 compare.
      //   cmp i16 trunc.result, opnd2
      //
      if (isa<TruncInst>(User) && shiftIsLegal
          // If the type of the truncate is legal, no truncate will be
          // introduced in other basic blocks.
          &&
          (!TLI.isTypeLegal(TLI.getValueType(DL, User->getType()))))
        // Accumulate: a later user for which nothing is sunk must not wipe
        // out a change recorded for an earlier user.
        MadeChange |=
            SinkShiftAndTruncate(ShiftI, User, CI, InsertedShifts, TLI, DL);

      continue;
    }
    // If we have already inserted a shift into this block, use it.
    BinaryOperator *&InsertedShift = InsertedShifts[UserBB];

    if (!InsertedShift) {
      BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
      assert(InsertPt != UserBB->end());

      if (ShiftI->getOpcode() == Instruction::AShr)
        InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI,
                                                   "", &*InsertPt);
      else
        InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI,
                                                   "", &*InsertPt);
      InsertedShift->setDebugLoc(ShiftI->getDebugLoc());
    }

    // Replace a use of the shift with a use of the new shift. Rewriting a use
    // is a modification whether or not the shift had to be created just now.
    TheUse = InsertedShift;
    MadeChange = true;
  }

  // If we removed all uses, or there are none, nuke the shift.
  if (ShiftI->use_empty()) {
    salvageDebugInfo(*ShiftI);
    ShiftI->eraseFromParent();
    MadeChange = true;
  }

  return MadeChange;
}
/// If counting leading or trailing zeros is an expensive operation and a zero
/// input is defined, add a check for zero to avoid calling the intrinsic.
///
/// We want to transform:
/// %z = call i64 @llvm.cttz.i64(i64 %A, i1 false)
///
/// into:
/// entry:
/// %cmpz = icmp eq i64 %A, 0
/// br i1 %cmpz, label %cond.end, label %cond.false
/// cond.false:
/// %z = call i64 @llvm.cttz.i64(i64 %A, i1 true)
/// br label %cond.end
/// cond.end:
/// %ctz = phi i64 [ 64, %entry ], [ %z, %cond.false ]
///
/// \p CountZeros is the cttz/ctlz intrinsic call to guard. \p TLI and \p DL
/// may be null, in which case nothing is done.
///
/// If the transform is performed, return true and set ModifiedDT to true.
static bool despeculateCountZeros(IntrinsicInst *CountZeros,
const TargetLowering *TLI,
const DataLayout *DL,
bool &ModifiedDT) {
// Both the target lowering info and the data layout are queried below.
if (!TLI || !DL)
return false;
// If a zero input is undefined, it doesn't make sense to despeculate that.
if (match(CountZeros->getOperand(1), m_One()))
return false;
// If it's cheap to speculate, there's nothing to do.
auto IntrinsicID = CountZeros->getIntrinsicID();
if ((IntrinsicID == Intrinsic::cttz && TLI->isCheapToSpeculateCttz()) ||
(IntrinsicID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz()))
return false;
// Only handle legal scalar cases. Anything else requires too much work.
Type *Ty = CountZeros->getType();
unsigned SizeInBits = Ty->getPrimitiveSizeInBits();
if (Ty->isVectorTy() || SizeInBits > DL->getLargestLegalIntTypeSizeInBits())
return false;
// The intrinsic will be sunk behind a compare against zero and branch.
// After this split the intrinsic is the first instruction of CallBlock.
BasicBlock *StartBlock = CountZeros->getParent();
BasicBlock *CallBlock = StartBlock->splitBasicBlock(CountZeros, "cond.false");
// Create another block after the count zero intrinsic. A PHI will be added
// in this block to select the result of the intrinsic or the bit-width
// constant if the input to the intrinsic is zero.
BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(CountZeros));
BasicBlock *EndBlock = CallBlock->splitBasicBlock(SplitPt, "cond.end");
// Set up a builder to create a compare, conditional branch, and PHI.
IRBuilder<> Builder(CountZeros->getContext());
Builder.SetInsertPoint(StartBlock->getTerminator());
Builder.SetCurrentDebugLocation(CountZeros->getDebugLoc());
// Replace the unconditional branch that was created by the first split with
// a compare against zero and a conditional branch.
Value *Zero = Constant::getNullValue(Ty);
Value *Cmp = Builder.CreateICmpEQ(CountZeros->getOperand(0), Zero, "cmpz");
Builder.CreateCondBr(Cmp, EndBlock, CallBlock);
// The new conditional branch was inserted in front of the old terminator,
// so the terminator queried here is still the old unconditional branch.
StartBlock->getTerminator()->eraseFromParent();
// Create a PHI in the end block to select either the output of the intrinsic
// or the bit width of the operand.
Builder.SetInsertPoint(&EndBlock->front());
PHINode *PN = Builder.CreatePHI(Ty, 2, "ctz");
// The uses are redirected before the intrinsic is added as an incoming
// value, so the PHI's own operand is not rewritten to refer to the PHI.
CountZeros->replaceAllUsesWith(PN);
Value *BitWidth = Builder.getInt(APInt(SizeInBits, SizeInBits));
PN->addIncoming(BitWidth, StartBlock);
PN->addIncoming(CountZeros, CallBlock);
// We are explicitly handling the zero case, so we can set the intrinsic's
// undefined zero argument to 'true'. This will also prevent reprocessing the
// intrinsic; we only despeculate when a zero input is defined.
CountZeros->setArgOperand(1, Builder.getTrue());
ModifiedDT = true;
return true;
}
/// Apply the call-specific CodeGenPrepare rewrites to \p CI.
///
/// In order, this tries: expanding / sinking addresses for inline asm,
/// raising the alignment of pointer arguments (and of mem-intrinsic
/// operands), sinking address computations into cold call sites, lowering or
/// simplifying a handful of intrinsics, and finally simplifying fortified
/// (_chk) library calls.
///
/// \param CI         The call to process.
/// \param ModifiedDT Set to true by the count-zeros despeculation when it
///                   splits blocks.
/// \returns true when one of the rewrites reports a change. Note that the
/// alignment adjustments alone do not cause a true return.
bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
  BasicBlock *BB = CI->getParent();

  // Lower inline assembly if we can.
  // If we found an inline asm expression, and if the target knows how to
  // lower it to normal LLVM code, do so now.
  if (TLI && isa<InlineAsm>(CI->getCalledValue())) {
    if (TLI->ExpandInlineAsm(CI)) {
      // Avoid invalidating the iterator.
      CurInstIterator = BB->begin();
      // Avoid processing instructions out of order, which could cause
      // reuse before a value is defined.
      SunkAddrs.clear();
      return true;
    }
    // Sink address computing for memory operands into the block.
    if (optimizeInlineAsmInst(CI))
      return true;
  }

  // Align the pointer arguments to this call if the target thinks it's a good
  // idea
  unsigned MinSize, PrefAlign;
  if (TLI && TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) {
    for (auto &Arg : CI->arg_operands()) {
      // We want to align both objects whose address is used directly and
      // objects whose address is used in casts and GEPs, though it only makes
      // sense for GEPs if the offset is a multiple of the desired alignment and
      // if size - offset meets the size threshold.
      if (!Arg->getType()->isPointerTy())
        continue;
      APInt Offset(DL->getIndexSizeInBits(
                       cast<PointerType>(Arg->getType())->getAddressSpace()),
                   0);
      Value *Val = Arg->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);
      uint64_t Offset2 = Offset.getLimitedValue();
      if ((Offset2 & (PrefAlign-1)) != 0)
        continue;
      AllocaInst *AI;
      if ((AI = dyn_cast<AllocaInst>(Val)) && AI->getAlignment() < PrefAlign &&
          DL->getTypeAllocSize(AI->getAllocatedType()) >= MinSize + Offset2)
        AI->setAlignment(PrefAlign);
      // Global variables can only be aligned if they are defined in this
      // object (i.e. they are uniquely initialized in this object), and
      // over-aligning global variables that have an explicit section is
      // forbidden.
      GlobalVariable *GV;
      if ((GV = dyn_cast<GlobalVariable>(Val)) && GV->canIncreaseAlignment() &&
          GV->getPointerAlignment(*DL) < PrefAlign &&
          DL->getTypeAllocSize(GV->getValueType()) >=
              MinSize + Offset2)
        GV->setAlignment(PrefAlign);
    }
    // If this is a memcpy (or similar) then we may be able to improve the
    // alignment
    if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(CI)) {
      unsigned DestAlign = getKnownAlignment(MI->getDest(), *DL);
      if (DestAlign > MI->getDestAlignment())
        MI->setDestAlignment(DestAlign);
      if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
        unsigned SrcAlign = getKnownAlignment(MTI->getSource(), *DL);
        if (SrcAlign > MTI->getSourceAlignment())
          MTI->setSourceAlignment(SrcAlign);
      }
    }
  }

  // If we have a cold call site, try to sink addressing computation into the
  // cold block.  This interacts with our handling for loads and stores to
  // ensure that we can fold all uses of a potential addressing computation
  // into their uses.  TODO: generalize this to work over profiling data
  if (!OptSize && CI->hasFnAttr(Attribute::Cold))
    for (auto &Arg : CI->arg_operands()) {
      if (!Arg->getType()->isPointerTy())
        continue;
      unsigned AS = Arg->getType()->getPointerAddressSpace();
      // Only stop when something was actually sunk. Bailing out on the first
      // pointer argument regardless of the result would skip the remaining
      // pointer arguments as well as the intrinsic lowering and the _chk
      // simplification below.
      if (optimizeMemoryInst(CI, Arg, Arg->getType(), AS))
        return true;
    }

  IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
  if (II) {
    switch (II->getIntrinsicID()) {
    default: break;
    case Intrinsic::experimental_widenable_condition: {
      // Give up on future widening opportunities so that we can fold away dead
      // paths and merge blocks before going into block-local instruction
      // selection.
      if (II->use_empty()) {
        II->eraseFromParent();
        return true;
      }
      Constant *RetVal = ConstantInt::getTrue(II->getContext());
      resetIteratorIfInvalidatedWhileCalling(BB, [&]() {
        replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr);
      });
      return true;
    }
    case Intrinsic::objectsize: {
      // Lower all uses of llvm.objectsize.*
      Value *RetVal =
          lowerObjectSizeCall(II, *DL, TLInfo, /*MustSucceed=*/true);

      resetIteratorIfInvalidatedWhileCalling(BB, [&]() {
        replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr);
      });
      return true;
    }
    case Intrinsic::is_constant: {
      // If is_constant hasn't folded away yet, lower it to false now.
      Constant *RetVal = ConstantInt::get(II->getType(), 0);
      resetIteratorIfInvalidatedWhileCalling(BB, [&]() {
        replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr);
      });
      return true;
    }
    case Intrinsic::aarch64_stlxr:
    case Intrinsic::aarch64_stxr: {
      ZExtInst *ExtVal = dyn_cast<ZExtInst>(CI->getArgOperand(0));
      if (!ExtVal || !ExtVal->hasOneUse() ||
          ExtVal->getParent() == CI->getParent())
        return false;
      // Sink a zext feeding stlxr/stxr before it, so it can be folded into it.
      ExtVal->moveBefore(CI);
      // Mark this instruction as "inserted by CGP", so that other
      // optimizations don't touch it.
      InsertedInsts.insert(ExtVal);
      return true;
    }

    case Intrinsic::launder_invariant_group:
    case Intrinsic::strip_invariant_group: {
      Value *ArgVal = II->getArgOperand(0);
      auto it = LargeOffsetGEPMap.find(II);
      if (it != LargeOffsetGEPMap.end()) {
        // Merge entries in LargeOffsetGEPMap to reflect the RAUW.
        // Make sure not to have to deal with iterator invalidation
        // after possibly adding ArgVal to LargeOffsetGEPMap.
        auto GEPs = std::move(it->second);
        LargeOffsetGEPMap[ArgVal].append(GEPs.begin(), GEPs.end());
        LargeOffsetGEPMap.erase(II);
      }

      II->replaceAllUsesWith(ArgVal);
      II->eraseFromParent();
      return true;
    }
    case Intrinsic::cttz:
    case Intrinsic::ctlz:
      // If counting zeros is expensive, try to avoid it.
      return despeculateCountZeros(II, TLI, DL, ModifiedDT);
    }

    if (TLI) {
      SmallVector<Value*, 2> PtrOps;
      Type *AccessTy;
      if (TLI->getAddrModeArguments(II, PtrOps, AccessTy))
        while (!PtrOps.empty()) {
          Value *PtrVal = PtrOps.pop_back_val();
          unsigned AS = PtrVal->getType()->getPointerAddressSpace();
          if (optimizeMemoryInst(II, PtrVal, AccessTy, AS))
            return true;
        }
    }
  }

  // From here on out we're working with named functions.
  if (!CI->getCalledFunction()) return false;

  // Lower all default uses of _chk calls.  This is very similar
  // to what InstCombineCalls does, but here we are only lowering calls
  // to fortified library functions (e.g. __memcpy_chk) that have the default
  // "don't know" as the objectsize.  Anything else should be left alone.
  FortifiedLibCallSimplifier Simplifier(TLInfo, true);
  if (Value *V = Simplifier.optimizeCall(CI)) {
    CI->replaceAllUsesWith(V);
    CI->eraseFromParent();
    return true;
  }

  return false;
}
/// Look for opportunities to duplicate return instructions to the predecessor
/// to enable tail call optimizations. The case it is currently looking for is:
/// @code
/// bb0:
///   %tmp0 = tail call i32 @f0()
///   br label %return
/// bb1:
///   %tmp1 = tail call i32 @f1()
///   br label %return
/// bb2:
///   %tmp2 = tail call i32 @f2()
///   br label %return
/// return:
///   %retval = phi i32 [ %tmp0, %bb0 ], [ %tmp1, %bb1 ], [ %tmp2, %bb2 ]
///   ret i32 %retval
/// @endcode
///
/// =>
///
/// @code
/// bb0:
///   %tmp0 = tail call i32 @f0()
///   ret i32 %tmp0
/// bb1:
///   %tmp1 = tail call i32 @f1()
///   ret i32 %tmp1
/// bb2:
///   %tmp2 = tail call i32 @f2()
///   ret i32 %tmp2
/// @endcode
///
/// \param BB         Candidate return block.
/// \param ModifiedDT Set to true when at least one return was duplicated.
/// \returns true when at least one return was duplicated into a predecessor.
bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT) {
  if (!TLI)
    return false;

  ReturnInst *RetI = dyn_cast<ReturnInst>(BB->getTerminator());
  if (!RetI)
    return false;

  // A returned value must be a PHI, optionally seen through one bitcast.
  PHINode *PN = nullptr;
  BitCastInst *BCI = nullptr;
  Value *V = RetI->getReturnValue();
  if (V) {
    BCI = dyn_cast<BitCastInst>(V);
    if (BCI)
      V = BCI->getOperand(0);

    PN = dyn_cast<PHINode>(V);
    if (!PN)
      return false;
  }

  if (PN && PN->getParent() != BB)
    return false;

  // Make sure there are no instructions between the PHI and return, or that the
  // return is the first instruction in the block.
  if (PN) {
    BasicBlock::iterator BI = BB->begin();
    // Skip over debug and the bitcast.
    do { ++BI; } while (isa<DbgInfoIntrinsic>(BI) || &*BI == BCI);
    if (&*BI != RetI)
      return false;
  } else {
    BasicBlock::iterator BI = BB->begin();
    while (isa<DbgInfoIntrinsic>(BI)) ++BI;
    if (&*BI != RetI)
      return false;
  }

  /// Only dup the ReturnInst if the CallInst is likely to be emitted as a tail
  /// call.
  const Function *F = BB->getParent();
  SmallVector<CallInst*, 4> TailCalls;
  if (PN) {
    for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) {
      // Look through bitcasts.
      Value *IncomingVal = PN->getIncomingValue(I)->stripPointerCasts();
      CallInst *CI = dyn_cast<CallInst>(IncomingVal);
      // Make sure the phi value is indeed produced by the tail call.
      if (CI && CI->hasOneUse() && CI->getParent() == PN->getIncomingBlock(I) &&
          TLI->mayBeEmittedAsTailCall(CI) &&
          attributesPermitTailCall(F, CI, RetI, *TLI))
        TailCalls.push_back(CI);
    }
  } else {
    // Void return: look at the last non-debug instruction before each
    // predecessor's terminator.
    SmallPtrSet<BasicBlock*, 4> VisitedBBs;
    for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
      if (!VisitedBBs.insert(*PI).second)
        continue;

      BasicBlock::InstListType &InstList = (*PI)->getInstList();
      BasicBlock::InstListType::reverse_iterator RI = InstList.rbegin();
      BasicBlock::InstListType::reverse_iterator RE = InstList.rend();
      do { ++RI; } while (RI != RE && isa<DbgInfoIntrinsic>(&*RI));
      if (RI == RE)
        continue;

      CallInst *CI = dyn_cast<CallInst>(&*RI);
      if (CI && CI->use_empty() && TLI->mayBeEmittedAsTailCall(CI) &&
          attributesPermitTailCall(F, CI, RetI, *TLI))
        TailCalls.push_back(CI);
    }
  }

  bool Changed = false;
  for (CallInst *CI : TailCalls) {
    // Make sure the call instruction is followed by an unconditional branch to
    // the return block.
    BasicBlock *CallBB = CI->getParent();
    BranchInst *BI = dyn_cast<BranchInst>(CallBB->getTerminator());
    if (!BI || !BI->isUnconditional() || BI->getSuccessor(0) != BB)
      continue;

    // Duplicate the return into CallBB.
    (void)FoldReturnIntoUncondBranch(RetI, BB, CallBB);
    ModifiedDT = Changed = true;
    ++NumRetsDup;
  }

  // If we eliminated all predecessors of the block, delete the block now.
  if (Changed && !BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB))
    BB->eraseFromParent();

  return Changed;
}
//===----------------------------------------------------------------------===//
// Memory Optimization
//===----------------------------------------------------------------------===//
namespace {
/// This is an extended version of TargetLowering::AddrMode
/// which holds actual Value*'s for register values.
struct ExtAddrMode : public TargetLowering::AddrMode {
Value *BaseReg = nullptr;
Value *ScaledReg = nullptr;
Value *OriginalValue = nullptr;
bool InBounds = true;
enum FieldName {
NoField = 0x00,
BaseRegField = 0x01,
BaseGVField = 0x02,
BaseOffsField = 0x04,
ScaledRegField = 0x08,
ScaleField = 0x10,
MultipleFields = 0xff
};
ExtAddrMode() = default;
void print(raw_ostream &OS) const;
void dump() const;
FieldName compare(const ExtAddrMode &other) {
// First check that the types are the same on each field, as differing types
// is something we can't cope with later on.
if (BaseReg && other.BaseReg &&
BaseReg->getType() != other.BaseReg->getType())
return MultipleFields;
if (BaseGV && other.BaseGV &&
BaseGV->getType() != other.BaseGV->getType())
return MultipleFields;
if (ScaledReg && other.ScaledReg &&
ScaledReg->getType() != other.ScaledReg->getType())
return MultipleFields;
// Conservatively reject 'inbounds' mismatches.
if (InBounds != other.InBounds)
return MultipleFields;
// Check each field to see if it differs.
unsigned Result = NoField;
if (BaseReg != other.BaseReg)
Result |= BaseRegField;
if (BaseGV != other.BaseGV)
Result |= BaseGVField;
if (BaseOffs != other.BaseOffs)
Result |= BaseOffsField;
if (ScaledReg != other.ScaledReg)
Result |= ScaledRegField;
// Don't count 0 as being a different scale, because that actually means
// unscaled (which will already be counted by having no ScaledReg).
if (Scale && other.Scale && Scale != other.Scale)
Result |= ScaleField;
if (countPopulation(Result) > 1)
return MultipleFields;
else
return static_cast<FieldName>(Result);
}
// An AddrMode is trivial if it involves no calculation i.e. it is just a base
// with no offset.
bool isTrivial() {
// An AddrMode is (BaseGV + BaseReg + BaseOffs + ScaleReg * Scale) so it is
// trivial if at most one of these terms is nonzero, except that BaseGV and
// BaseReg both being zero actually means a null pointer value, which we
// consider to be 'non-zero' here.
return !BaseOffs && !Scale && !(BaseGV && BaseReg);
}
Value *GetFieldAsValue(FieldName Field, Type *IntPtrTy) {
switch (Field) {
default:
return nullptr;
case BaseRegField:
return BaseReg;
case BaseGVField:
return BaseGV;
case ScaledRegField:
return ScaledReg;
case BaseOffsField:
return ConstantInt::get(IntPtrTy, BaseOffs);
}
}
void SetCombinedField(FieldName Field, Value *V,
const SmallVectorImpl<ExtAddrMode> &AddrModes) {
switch (Field) {
default:
llvm_unreachable("Unhandled fields are expected to be rejected earlier");
break;
case ExtAddrMode::BaseRegField:
BaseReg = V;
break;
case ExtAddrMode::BaseGVField:
// A combined BaseGV is an Instruction, not a GlobalValue, so it goes
// in the BaseReg field.
assert(BaseReg == nullptr);
BaseReg = V;
BaseGV = nullptr;
break;
case ExtAddrMode::ScaledRegField:
ScaledReg = V;
// If we have a mix of scaled and unscaled addrmodes then we want scale
// to be the scale and not zero.
if (!Scale)
for (const ExtAddrMode &AM : AddrModes)
if (AM.Scale) {
Scale = AM.Scale;
break;
}
break;
case ExtAddrMode::BaseOffsField:
// The offset is no longer a constant, so it goes in ScaledReg with a
// scale of 1.
assert(ScaledReg == nullptr);
ScaledReg = V;
Scale = 1;
BaseOffs = 0;
break;
}
}
};
} // end anonymous namespace
#ifndef NDEBUG
/// Stream insertion for ExtAddrMode; forwards to ExtAddrMode::print and
/// hands the stream back so insertions can be chained.
static inline raw_ostream &operator<<(raw_ostream &OS, const ExtAddrMode &AM) {
  AM.print(OS);
  return OS;
}
#endif
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Write a bracketed, human-readable rendering of this addressing mode to
/// \p OS. Only the components that are set are printed, joined by " + ".
void ExtAddrMode::print(raw_ostream &OS) const {
  bool EmittedTerm = false;
  // Emits the " + " joiner in front of every term except the first one and
  // returns the stream so the term itself can be appended.
  auto BeginTerm = [&]() -> raw_ostream & {
    if (EmittedTerm)
      OS << " + ";
    EmittedTerm = true;
    return OS;
  };

  OS << "[";
  if (InBounds)
    OS << "inbounds ";

  if (BaseGV) {
    BeginTerm() << "GV:";
    BaseGV->printAsOperand(OS, /*PrintType=*/false);
  }

  if (BaseOffs)
    BeginTerm() << BaseOffs;

  if (BaseReg) {
    BeginTerm() << "Base:";
    BaseReg->printAsOperand(OS, /*PrintType=*/false);
  }

  if (Scale) {
    BeginTerm() << Scale << "*";
    ScaledReg->printAsOperand(OS, /*PrintType=*/false);
  }

  OS << ']';
}

/// Print this addressing mode, followed by a newline, to the debug stream.
LLVM_DUMP_METHOD void ExtAddrMode::dump() const {
  raw_ostream &Out = dbgs();
  print(Out);
  Out << '\n';
}
#endif
namespace {

/// This class provides transaction based operation on the IR.
/// Every change made through this class is recorded in the internal state and
/// can be undone (rollback) until commit is called.
class TypePromotionTransaction {
  /// This represents the common interface of the individual transaction.
  /// Each class implements the logic for doing one specific modification on
  /// the IR via the TypePromotionTransaction.
  class TypePromotionAction {
  protected:
    /// The Instruction modified.
    Instruction *Inst;

  public:
    /// Constructor of the action.
    /// The constructor performs the related action on the IR.
    TypePromotionAction(Instruction *Inst) : Inst(Inst) {}

    virtual ~TypePromotionAction() = default;

    /// Undo the modification done by this action, bringing the IR back to
    /// the state it had before this action was applied.
    /// \pre Undoing the action works if and only if the IR is in the exact same
    /// state as it was directly after this action was applied.
    virtual void undo() = 0;

    /// Advocate every change made by this action.
    /// When the results on the IR of the action are to be kept, it is important
    /// to call this function, otherwise hidden information may be kept forever.
    virtual void commit() {
      // Nothing to be done, this action is not doing anything.
    }
  };

  /// Utility to remember the position of an instruction.
  class InsertionHandler {
    /// Position of an instruction.
    /// Either an instruction:
    /// - Is the first in a basic block: BB is used.
    /// - Has a previous instruction: PrevInst is used.
    union {
      Instruction *PrevInst;
      BasicBlock *BB;
    } Point;

    /// Remember whether or not the instruction had a previous instruction.
    bool HasPrevInstruction;

  public:
    /// Record the position of \p Inst.
    InsertionHandler(Instruction *Inst) {
      BasicBlock::iterator It = Inst->getIterator();
      HasPrevInstruction = (It != (Inst->getParent()->begin()));
      if (HasPrevInstruction)
        Point.PrevInst = &*--It;
      else
        Point.BB = Inst->getParent();
    }

    /// Insert \p Inst at the recorded position.
    void insert(Instruction *Inst) {
      if (HasPrevInstruction) {
        if (Inst->getParent())
          Inst->removeFromParent();
        Inst->insertAfter(Point.PrevInst);
      } else {
        Instruction *Position = &*Point.BB->getFirstInsertionPt();
        if (Inst->getParent())
          Inst->moveBefore(Position);
        else
          Inst->insertBefore(Position);
      }
    }
  };

  /// Move an instruction before another.
  class InstructionMoveBefore : public TypePromotionAction {
    /// Original position of the instruction.
    InsertionHandler Position;

  public:
    /// Move \p Inst before \p Before.
    InstructionMoveBefore(Instruction *Inst, Instruction *Before)
        : TypePromotionAction(Inst), Position(Inst) {
      LLVM_DEBUG(dbgs() << "Do: move: " << *Inst << "\nbefore: " << *Before
                        << "\n");
      Inst->moveBefore(Before);
    }

    /// Move the instruction back to its original position.
    void undo() override {
      LLVM_DEBUG(dbgs() << "Undo: moveBefore: " << *Inst << "\n");
      Position.insert(Inst);
    }
  };

  /// Set the operand of an instruction with a new value.
  class OperandSetter : public TypePromotionAction {
    /// Original operand of the instruction.
    Value *Origin;

    /// Index of the modified instruction.
    unsigned Idx;

  public:
    /// Set \p Idx operand of \p Inst with \p NewVal.
    OperandSetter(Instruction *Inst, unsigned Idx, Value *NewVal)
        : TypePromotionAction(Inst), Idx(Idx) {
      LLVM_DEBUG(dbgs() << "Do: setOperand: " << Idx << "\n"
                        << "for:" << *Inst << "\n"
                        << "with:" << *NewVal << "\n");
      Origin = Inst->getOperand(Idx);
      Inst->setOperand(Idx, NewVal);
    }

    /// Restore the original value of the instruction.
    void undo() override {
      LLVM_DEBUG(dbgs() << "Undo: setOperand:" << Idx << "\n"
                        << "for: " << *Inst << "\n"
                        << "with: " << *Origin << "\n");
      Inst->setOperand(Idx, Origin);
    }
  };

  /// Hide the operands of an instruction.
  /// Do as if this instruction was not using any of its operands.
  class OperandsHider : public TypePromotionAction {
    /// The list of original operands.
    SmallVector<Value *, 4> OriginalValues;

  public:
    /// Remove \p Inst from the uses of the operands of \p Inst.
    OperandsHider(Instruction *Inst) : TypePromotionAction(Inst) {
      LLVM_DEBUG(dbgs() << "Do: OperandsHider: " << *Inst << "\n");
      unsigned NumOpnds = Inst->getNumOperands();
      OriginalValues.reserve(NumOpnds);
      for (unsigned It = 0; It < NumOpnds; ++It) {
        // Save the current operand.
        Value *Val = Inst->getOperand(It);
        OriginalValues.push_back(Val);
        // Set a dummy one.
        // We could use OperandSetter here, but that would imply an overhead
        // that we are not willing to pay.
        Inst->setOperand(It, UndefValue::get(Val->getType()));
      }
    }

    /// Restore the original list of uses.
    void undo() override {
      LLVM_DEBUG(dbgs() << "Undo: OperandsHider: " << *Inst << "\n");
      for (unsigned It = 0, EndIt = OriginalValues.size(); It != EndIt; ++It)
        Inst->setOperand(It, OriginalValues[It]);
    }
  };

  /// Build a truncate instruction.
  class TruncBuilder : public TypePromotionAction {
    Value *Val;

  public:
    /// Build a truncate instruction of \p Opnd producing a \p Ty
    /// result.
    /// trunc Opnd to Ty.
    TruncBuilder(Instruction *Opnd, Type *Ty) : TypePromotionAction(Opnd) {
      IRBuilder<> Builder(Opnd);
      Val = Builder.CreateTrunc(Opnd, Ty, "promoted");
      LLVM_DEBUG(dbgs() << "Do: TruncBuilder: " << *Val << "\n");
    }

    /// Get the built value.
    Value *getBuiltValue() { return Val; }

    /// Remove the built instruction.
    void undo() override {
      LLVM_DEBUG(dbgs() << "Undo: TruncBuilder: " << *Val << "\n");
      if (Instruction *IVal = dyn_cast<Instruction>(Val))
        IVal->eraseFromParent();
    }
  };

  /// Build a sign extension instruction.
  class SExtBuilder : public TypePromotionAction {
    Value *Val;

  public:
    /// Build a sign extension instruction of \p Opnd producing a \p Ty
    /// result.
    /// sext Opnd to Ty.
    SExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty)
        : TypePromotionAction(InsertPt) {
      IRBuilder<> Builder(InsertPt);
      Val = Builder.CreateSExt(Opnd, Ty, "promoted");
      LLVM_DEBUG(dbgs() << "Do: SExtBuilder: " << *Val << "\n");
    }

    /// Get the built value.
    Value *getBuiltValue() { return Val; }

    /// Remove the built instruction.
    void undo() override {
      LLVM_DEBUG(dbgs() << "Undo: SExtBuilder: " << *Val << "\n");
      if (Instruction *IVal = dyn_cast<Instruction>(Val))
        IVal->eraseFromParent();
    }
  };

  /// Build a zero extension instruction.
  class ZExtBuilder : public TypePromotionAction {
    Value *Val;

  public:
    /// Build a zero extension instruction of \p Opnd producing a \p Ty
    /// result.
    /// zext Opnd to Ty.
    ZExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty)
        : TypePromotionAction(InsertPt) {
      IRBuilder<> Builder(InsertPt);
      Val = Builder.CreateZExt(Opnd, Ty, "promoted");
      LLVM_DEBUG(dbgs() << "Do: ZExtBuilder: " << *Val << "\n");
    }

    /// Get the built value.
    Value *getBuiltValue() { return Val; }

    /// Remove the built instruction.
    void undo() override {
      LLVM_DEBUG(dbgs() << "Undo: ZExtBuilder: " << *Val << "\n");
      if (Instruction *IVal = dyn_cast<Instruction>(Val))
        IVal->eraseFromParent();
    }
  };

  /// Mutate an instruction to another type.
  class TypeMutator : public TypePromotionAction {
    /// Record the original type.
    Type *OrigTy;

  public:
    /// Mutate the type of \p Inst into \p NewTy.
    TypeMutator(Instruction *Inst, Type *NewTy)
        : TypePromotionAction(Inst), OrigTy(Inst->getType()) {
      LLVM_DEBUG(dbgs() << "Do: MutateType: " << *Inst << " with " << *NewTy
                        << "\n");
      Inst->mutateType(NewTy);
    }

    /// Mutate the instruction back to its original type.
    void undo() override {
      LLVM_DEBUG(dbgs() << "Undo: MutateType: " << *Inst << " with " << *OrigTy
                        << "\n");
      Inst->mutateType(OrigTy);
    }
  };

  /// Replace the uses of an instruction by another instruction.
  class UsesReplacer : public TypePromotionAction {
    /// Helper structure to keep track of the replaced uses.
    struct InstructionAndIdx {
      /// The instruction using the instruction.
      Instruction *Inst;

      /// The index where this instruction is used for Inst.
      unsigned Idx;

      InstructionAndIdx(Instruction *Inst, unsigned Idx)
          : Inst(Inst), Idx(Idx) {}
    };

    /// Keep track of the original uses (pair Instruction, Index).
    SmallVector<InstructionAndIdx, 4> OriginalUses;
    /// Keep track of the debug users.
    SmallVector<DbgValueInst *, 1> DbgValues;

  public:
    /// Replace all the use of \p Inst by \p New.
    UsesReplacer(Instruction *Inst, Value *New) : TypePromotionAction(Inst) {
      LLVM_DEBUG(dbgs() << "Do: UsersReplacer: " << *Inst << " with " << *New
                        << "\n");
      // Record the original uses.
      for (Use &U : Inst->uses()) {
        Instruction *UserI = cast<Instruction>(U.getUser());
        OriginalUses.push_back(InstructionAndIdx(UserI, U.getOperandNo()));
      }
      // Record the debug uses separately. They are not in the instruction's
      // use list, but they are replaced by RAUW.
      findDbgValues(DbgValues, Inst);

      // Now, we can replace the uses.
      Inst->replaceAllUsesWith(New);
    }

    /// Reassign the original uses of Inst to Inst.
    void undo() override {
      LLVM_DEBUG(dbgs() << "Undo: UsersReplacer: " << *Inst << "\n");
      for (InstructionAndIdx &OrigUse : OriginalUses)
        OrigUse.Inst->setOperand(OrigUse.Idx, Inst);
      // RAUW has replaced all original uses with references to the new value,
      // including the debug uses. Since we are undoing the replacements,
      // the original debug uses must also be reinstated to maintain the
      // correctness and utility of debug value instructions.
      for (auto *DVI: DbgValues) {
        LLVMContext &Ctx = Inst->getType()->getContext();
        auto *MV = MetadataAsValue::get(Ctx, ValueAsMetadata::get(Inst));
        DVI->setOperand(0, MV);
      }
    }
  };

  /// Remove an instruction from the IR.
  class InstructionRemover : public TypePromotionAction {
    /// Original position of the instruction.
    InsertionHandler Inserter;

    /// Helper structure to hide all the link to the instruction. In other
    /// words, this helps to do as if the instruction was removed.
    OperandsHider Hider;

    /// Keep track of the uses replaced, if any. Owned by this action and
    /// released automatically together with it.
    std::unique_ptr<UsesReplacer> Replacer;

    /// Keep track of instructions removed.
    SetOfInstrs &RemovedInsts;

  public:
    /// Remove all reference of \p Inst and optionally replace all its
    /// uses with New.
    /// \p RemovedInsts Keep track of the instructions removed by this Action.
    /// \pre If !Inst->use_empty(), then New != nullptr
    InstructionRemover(Instruction *Inst, SetOfInstrs &RemovedInsts,
                       Value *New = nullptr)
        : TypePromotionAction(Inst), Inserter(Inst), Hider(Inst),
          RemovedInsts(RemovedInsts) {
      if (New)
        Replacer = llvm::make_unique<UsesReplacer>(Inst, New);
      LLVM_DEBUG(dbgs() << "Do: InstructionRemover: " << *Inst << "\n");
      RemovedInsts.insert(Inst);
      /// The instructions removed here will be freed after completing
      /// optimizeBlock() for all blocks as we need to keep track of the
      /// removed instructions during promotion.
      Inst->removeFromParent();
    }

    /// Resurrect the instruction and reassign it to the proper uses if
    /// new value was provided when build this action.
    void undo() override {
      LLVM_DEBUG(dbgs() << "Undo: InstructionRemover: " << *Inst << "\n");
      Inserter.insert(Inst);
      if (Replacer)
        Replacer->undo();
      Hider.undo();
      RemovedInsts.erase(Inst);
    }
  };

public:
  /// Restoration point.
  /// The restoration point is a pointer to an action instead of an iterator
  /// because the iterator may be invalidated but not the pointer.
  using ConstRestorationPt = const TypePromotionAction *;

  TypePromotionTransaction(SetOfInstrs &RemovedInsts)
      : RemovedInsts(RemovedInsts) {}

  /// Advocate every changes made in that transaction.
  void commit();

  /// Undo all the changes made after the given point.
  void rollback(ConstRestorationPt Point);

  /// Get the current restoration point.
  ConstRestorationPt getRestorationPoint() const;

  /// \name API for IR modification with state keeping to support rollback.
  /// @{
  /// Same as Instruction::setOperand.
  void setOperand(Instruction *Inst, unsigned Idx, Value *NewVal);

  /// Same as Instruction::eraseFromParent.
  void eraseInstruction(Instruction *Inst, Value *NewVal = nullptr);

  /// Same as Value::replaceAllUsesWith.
  void replaceAllUsesWith(Instruction *Inst, Value *New);

  /// Same as Value::mutateType.
  void mutateType(Instruction *Inst, Type *NewTy);

  /// Same as IRBuilder::createTrunc.
  Value *createTrunc(Instruction *Opnd, Type *Ty);

  /// Same as IRBuilder::createSExt.
  Value *createSExt(Instruction *Inst, Value *Opnd, Type *Ty);

  /// Same as IRBuilder::createZExt.
  Value *createZExt(Instruction *Inst, Value *Opnd, Type *Ty);

  /// Same as Instruction::moveBefore.
  void moveBefore(Instruction *Inst, Instruction *Before);
  /// @}

private:
  /// The ordered list of actions made so far.
  SmallVector<std::unique_ptr<TypePromotionAction>, 16> Actions;

  using CommitPt = SmallVectorImpl<std::unique_ptr<TypePromotionAction>>::iterator;

  SetOfInstrs &RemovedInsts;
};

} // end anonymous namespace
/// Replace operand \p Idx of \p Inst with \p NewVal and record the change so
/// that it can be rolled back.
void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx,
                                          Value *NewVal) {
  // Constructing the action performs the IR change; storing it keeps the
  // change undoable.
  auto Action = llvm::make_unique<OperandSetter>(Inst, Idx, NewVal);
  Actions.push_back(std::move(Action));
}
/// Detach \p Inst from the IR, replacing its uses with \p NewVal when one is
/// given, and record the removal so that it can be rolled back.
void TypePromotionTransaction::eraseInstruction(Instruction *Inst,
                                                Value *NewVal) {
  auto Action =
      llvm::make_unique<InstructionRemover>(Inst, RemovedInsts, NewVal);
  Actions.push_back(std::move(Action));
}
/// Redirect every use of \p Inst to \p New and record the replacement so
/// that it can be rolled back.
void TypePromotionTransaction::replaceAllUsesWith(Instruction *Inst,
                                                  Value *New) {
  auto Action = llvm::make_unique<UsesReplacer>(Inst, New);
  Actions.push_back(std::move(Action));
}
/// Change the type of \p Inst to \p NewTy and record the mutation so that it
/// can be rolled back.
void TypePromotionTransaction::mutateType(Instruction *Inst, Type *NewTy) {
  auto Action = llvm::make_unique<TypeMutator>(Inst, NewTy);
  Actions.push_back(std::move(Action));
}
/// Build "trunc Opnd to Ty" and record the creation so that it can be rolled
/// back.
/// \returns the value produced by the builder.
Value *TypePromotionTransaction::createTrunc(Instruction *Opnd,
                                             Type *Ty) {
  auto Action = llvm::make_unique<TruncBuilder>(Opnd, Ty);
  // Fetch the result before ownership of the action moves into the list.
  Value *Val = Action->getBuiltValue();
  Actions.push_back(std::move(Action));
  return Val;
}
/// Build "sext Opnd to Ty" in front of \p Inst and record the creation so
/// that it can be rolled back.
/// \returns the value produced by the builder.
Value *TypePromotionTransaction::createSExt(Instruction *Inst,
                                            Value *Opnd, Type *Ty) {
  auto Action = llvm::make_unique<SExtBuilder>(Inst, Opnd, Ty);
  // Fetch the result before ownership of the action moves into the list.
  Value *Val = Action->getBuiltValue();
  Actions.push_back(std::move(Action));
  return Val;
}
/// Build "zext Opnd to Ty" in front of \p Inst and record the creation so
/// that it can be rolled back.
/// \returns the value produced by the builder.
Value *TypePromotionTransaction::createZExt(Instruction *Inst,
                                            Value *Opnd, Type *Ty) {
  auto Action = llvm::make_unique<ZExtBuilder>(Inst, Opnd, Ty);
  // Fetch the result before ownership of the action moves into the list.
  Value *Val = Action->getBuiltValue();
  Actions.push_back(std::move(Action));
  return Val;
}
/// Move \p Inst in front of \p Before and record the move so that it can be
/// rolled back.
void TypePromotionTransaction::moveBefore(Instruction *Inst,
                                          Instruction *Before) {
  auto Action = llvm::make_unique<InstructionMoveBefore>(Inst, Before);
  Actions.push_back(std::move(Action));
}
/// \returns the most recently recorded action, or nullptr when no action has
/// been recorded; the result can later be handed to rollback().
TypePromotionTransaction::ConstRestorationPt
TypePromotionTransaction::getRestorationPoint() const {
  if (Actions.empty())
    return nullptr;
  return Actions.back().get();
}
void TypePromotionTransaction::commit() {
for (CommitPt It = Actions.begin(), EndIt = Actions.end(); It != EndIt;
++It)
(*It)->commit();
Actions.clear();
}
/// Undo recorded actions, newest first, until the action identified by
/// \p Point is on top of the list (or the list is empty). The action at
/// \p Point itself is kept.
void TypePromotionTransaction::rollback(
    TypePromotionTransaction::ConstRestorationPt Point) {
  while (!Actions.empty()) {
    if (Actions.back().get() == Point)
      break;
    // Take the action off the list first, then revert it.
    std::unique_ptr<TypePromotionAction> Last = Actions.pop_back_val();
    Last->undo();
  }
}
namespace {
/// Helper that matches an address computation against addressing modes.
///
/// It encapsulates the logic for matching the target-legal addressing modes.
/// Instances are created only by the static Match() entry point; the
/// constructor is private.
class AddressingModeMatcher {
  /// Receives the instructions involved in the matched addressing mode.
  SmallVectorImpl<Instruction *> &AddrModeInsts;
  const TargetLowering &TLI;
  const TargetRegisterInfo &TRI;
  const DataLayout &DL;

  /// The type of the memory access (e.g. double).
  Type *AccessTy;
  /// The address space passed along with the access type.
  unsigned AddrSpace;
  /// The memory instruction whose address is being computed.
  Instruction *MemoryInst;

  /// The addressing mode being built up. It is part of what the matching
  /// returns to the caller.
  ExtAddrMode &AddrMode;

  /// The instructions inserted by other CodeGenPrepare optimizations.
  const SetOfInstrs &InsertedInsts;

  /// Maps instructions to the type they had before promotion.
  InstrToOrigTy &PromotedInsts;

  /// The ongoing transaction in which every action should be registered.
  TypePromotionTransaction &TPT;

  /// A GEP whose offset is too large to be folded into the addressing mode.
  std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP;

  /// Set to true when profitability checks should be skipped; in that case
  /// isProfitableToFoldIntoAddressingMode always returns true.
  bool IgnoreProfitability = false;

  AddressingModeMatcher(
      SmallVectorImpl<Instruction *> &AMI, const TargetLowering &TLI,
      const TargetRegisterInfo &TRI, Type *AT, unsigned AS, Instruction *MI,
      ExtAddrMode &AM, const SetOfInstrs &InsertedInsts,
      InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT,
      std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP)
      : AddrModeInsts(AMI), TLI(TLI), TRI(TRI),
        DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS),
        MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts),
        PromotedInsts(PromotedInsts), TPT(TPT),
        LargeOffsetGEP(LargeOffsetGEP) {}

public:
  /// Find the maximal addressing mode that a load/store of \p V can fold,
  /// given an access type of \p AccessTy. The instructions involved are
  /// returned in \p AddrModeInsts.
  /// \p InsertedInsts The instructions inserted by other CodeGenPrepare
  /// optimizations.
  /// \p PromotedInsts maps the instructions to their type before promotion.
  /// \p TPT The ongoing transaction where every action should be registered.
  static ExtAddrMode
  Match(Value *V, Type *AccessTy, unsigned AS, Instruction *MemoryInst,
        SmallVectorImpl<Instruction *> &AddrModeInsts,
        const TargetLowering &TLI, const TargetRegisterInfo &TRI,
        const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts,
        TypePromotionTransaction &TPT,
        std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP) {
    ExtAddrMode Result;
    AddressingModeMatcher Matcher(AddrModeInsts, TLI, TRI, AccessTy, AS,
                                  MemoryInst, Result, InsertedInsts,
                                  PromotedInsts, TPT, LargeOffsetGEP);
    bool Success = Matcher.matchAddr(V, 0);
    (void)Success;
    assert(Success && "Couldn't select *anything*?");
    return Result;
  }

private:
  bool matchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth);
  bool matchAddr(Value *Addr, unsigned Depth);
  bool matchOperationAddr(User *AddrInst, unsigned Opcode, unsigned Depth,
                          bool *MovedAway = nullptr);
  bool isProfitableToFoldIntoAddressingMode(Instruction *I,
                                            ExtAddrMode &AMBefore,
                                            ExtAddrMode &AMAfter);
  bool valueAlreadyLiveAtInst(Value *Val, Value *KnownLive1,
                              Value *KnownLive2);
  bool isPromotionProfitable(unsigned NewCost, unsigned OldCost,
                             Value *PromotedOperand) const;
};
class PhiNodeSet;
/// Forward iterator over the elements of a PhiNodeSet.
///
/// The iterator is a (set, index) pair; the index refers to a position in the
/// vector that backs the set.
class PhiNodeSetIterator {
  PhiNodeSet *const Set;
  size_t CurrentIndex = 0;

public:
  /// \p Start must either point to a valid element or be equal to the size of
  /// the underlying SmallVector of the PhiNodeSet.
  PhiNodeSetIterator(PhiNodeSet *const Set, size_t Start);

  /// \returns the element at the current position.
  PHINode *operator*() const;

  /// Moves to the next valid element, or to the end position.
  PhiNodeSetIterator &operator++();

  bool operator==(const PhiNodeSetIterator &RHS) const;
  bool operator!=(const PhiNodeSetIterator &RHS) const;
};
/// Keeps a set of PHINodes.
///
/// This is a minimal set implementation for a specific use case:
/// It is very fast when there are very few elements, but also provides good
/// performance when there are many. It is similar to SmallPtrSet, but also
/// provides iteration by insertion order, which is deterministic and stable
/// across runs. It is also similar to SmallSetVector, but provides removing
/// elements in O(1) time. This is achieved by not actually removing the element
/// from the underlying vector, so comes at the cost of using more memory, but
/// that is fine, since PhiNodeSets are used as short lived objects.
class PhiNodeSet {
  friend class PhiNodeSetIterator;

  using MapType = SmallDenseMap<PHINode *, size_t, 32>;
  using iterator = PhiNodeSetIterator;

  /// Keeps the elements in the order of their insertion in the underlying
  /// vector. To achieve constant time removal, it never deletes any element.
  SmallVector<PHINode *, 32> NodeList;

  /// Maps each element to its index in NodeList. This (and not the NodeList
  /// defined above) is the source of truth on whether an element is actually
  /// in the collection.
  MapType NodeMap;

  /// Points to the first valid (not deleted) element when the set is not empty
  /// and the value is not zero. Equals to the size of the underlying vector
  /// when the set is empty. When the value is 0, as in the beginning, the
  /// first element may or may not be valid.
  size_t FirstValidElement = 0;

public:
  /// Inserts a new element to the collection.
  /// \returns true if the element is actually added, i.e. was not in the
  /// collection before the operation.
  bool insert(PHINode *Ptr) {
    if (NodeMap.insert(std::make_pair(Ptr, NodeList.size())).second) {
      NodeList.push_back(Ptr);
      return true;
    }
    return false;
  }

  /// Removes the element from the collection.
  /// \returns whether the element is actually removed, i.e. was in the
  /// collection before the operation.
  bool erase(PHINode *Ptr) {
    // erase(Key) already reports whether the key was present, so a separate
    // find() beforehand would only hash the key a second time.
    if (NodeMap.erase(Ptr)) {
      // The removed element may have been the first valid one.
      SkipRemovedElements(FirstValidElement);
      return true;
    }
    return false;
  }

  /// Removes all elements and clears the collection.
  void clear() {
    NodeMap.clear();
    NodeList.clear();
    FirstValidElement = 0;
  }

  /// \returns an iterator that will iterate the elements in the order of
  /// insertion.
  iterator begin() {
    // Index 0 is ambiguous (see FirstValidElement), so re-validate it.
    if (FirstValidElement == 0)
      SkipRemovedElements(FirstValidElement);
    return PhiNodeSetIterator(this, FirstValidElement);
  }

  /// \returns an iterator that points to the end of the collection.
  iterator end() { return PhiNodeSetIterator(this, NodeList.size()); }

  /// Returns the number of elements in the collection.
  size_t size() const { return NodeMap.size(); }

  /// \returns 1 if the given element is in the collection, and 0 if otherwise.
  size_t count(PHINode *Ptr) const { return NodeMap.count(Ptr); }

private:
  /// Updates the CurrentIndex so that it will point to a valid element.
  ///
  /// If the element of NodeList at CurrentIndex is valid, it does not
  /// change it. If there are no more valid elements, it updates CurrentIndex
  /// to point to the end of the NodeList.
  void SkipRemovedElements(size_t &CurrentIndex) {
    while (CurrentIndex < NodeList.size()) {
      auto it = NodeMap.find(NodeList[CurrentIndex]);
      // If the element has been deleted and added again later, NodeMap will
      // point to a different index, so CurrentIndex will still be invalid.
      if (it != NodeMap.end() && it->second == CurrentIndex)
        break;
      ++CurrentIndex;
    }
  }
};
PhiNodeSetIterator::PhiNodeSetIterator(PhiNodeSet *const Set, size_t Start)
: Set(Set), CurrentIndex(Start) {}
/// \returns the PHI node stored at the current index of the backing vector.
PHINode *PhiNodeSetIterator::operator*() const {
  const auto &Nodes = Set->NodeList;
  assert(CurrentIndex < Nodes.size() &&
         "PhiNodeSet access out of range");
  return Nodes[CurrentIndex];
}
/// Steps past the current element and then past any removed elements that
/// follow it.
PhiNodeSetIterator &PhiNodeSetIterator::operator++() {
  assert(CurrentIndex < Set->NodeList.size() &&
         "PhiNodeSet access out of range");
  Set->SkipRemovedElements(++CurrentIndex);
  return *this;
}
/// Two iterators compare equal when they are at the same index; the set they
/// belong to is not compared.
bool PhiNodeSetIterator::operator==(const PhiNodeSetIterator &RHS) const {
  return RHS.CurrentIndex == CurrentIndex;
}
/// Negation of operator==: true when the two iterators are at different
/// indices.
bool PhiNodeSetIterator::operator!=(const PhiNodeSetIterator &RHS) const {
  return CurrentIndex != RHS.CurrentIndex;
}
/// Keep track of simplification of Phi nodes.
/// Accept the set of all phi nodes and erase phi node from this set
/// if it is simplified.
class SimplificationTracker {
DenseMap<Value *, Value *> Storage;
const SimplifyQuery &SQ;
// Tracks newly created Phi nodes. The elements are iterated by insertion
// order.
PhiNodeSet AllPhiNodes;
// Tracks newly created Select nodes.
SmallPtrSet<SelectInst *, 32> AllSelectNodes;
public:
SimplificationTracker(const SimplifyQuery &sq)
: SQ(sq) {}
Value *Get(Value *V) {
do {
auto SV = Storage.find(V);
if (SV == Storage.end())
return V;
V = SV->second;
} while (true);
}
Value *Simplify(Value *Val) {
SmallVector<Value *, 32> WorkList;
SmallPtrSet<Value *, 32> Visited;
WorkList.push_back(Val);
while (!WorkList.empty()) {
auto P = WorkList.pop_back_val();
if (!Visited.insert(P).second)
continue;
if (auto *PI = dyn_cast<Instruction>(P))
if (Value *V = SimplifyInstruction(cast<Instruction>(PI), SQ)) {
for (auto *U : PI->users())
WorkList.push_back(cast<Value>(U));
Put(PI, V);
PI->replaceAllUsesWith(V);
if (auto *PHI = dyn_cast<PHINode>(PI))
AllPhiNodes.erase(PHI);
if (auto *Select = dyn_cast<SelectInst>(PI))
AllSelectNodes.erase(Select);
PI->eraseFromParent();
}
}
return Get(Val);
}
void Put(Value *From, Value *To) {
Storage.insert({ From, To });
}
void ReplacePhi(PHINode *From, PHINode *To) {
Value* OldReplacement = Get(From);
while (OldReplacement != From) {
From = To;
To = dyn_cast<PHINode>(OldReplacement);
OldReplacement = Get(From);
}
assert(Get(To) == To && "Replacement PHI node is already replaced.");
Put(From, To);
From->replaceAllUsesWith(To);
AllPhiNodes.erase(From);
From->eraseFromParent();
}
PhiNodeSet& newPhiNodes() { return AllPhiNodes; }
void insertNewPhi(PHINode *PN) { AllPhiNodes.insert(PN); }
void insertNewSelect(SelectInst *SI) { AllSelectNodes.insert(SI); }
unsigned countNewPhiNodes() const { return AllPhiNodes.size(); }
unsigned countNewSelectNodes() const { return AllSelectNodes.size(); }
void destroyNewNodes(Type *CommonType) {
// For safe erasing, replace the uses with dummy value first.
auto Dummy = UndefValue::get(CommonType);
for (auto I : AllPhiNodes) {
I->replaceAllUsesWith(Dummy);
I->eraseFromParent();
}
AllPhiNodes.clear();
for (auto I : AllSelectNodes) {
I->replaceAllUsesWith(Dummy);
I->eraseFromParent();
}
AllSelectNodes.clear();
}
};
/// A helper class for combining addressing modes.
class AddressingModeCombiner {
typedef DenseMap<Value *, Value *> FoldAddrToValueMapping;
typedef std::pair<PHINode *, PHINode *> PHIPair;
private:
/// The addressing modes we've collected.
SmallVector<ExtAddrMode, 16> AddrModes;
/// The field in which the AddrModes differ, when we have more than one.
ExtAddrMode::FieldName DifferentField = ExtAddrMode::NoField;
/// Are the AddrModes that we have all just equal to their original values?
bool AllAddrModesTrivial = true;
/// Common Type for all different fields in addressing modes.
Type *CommonType;
/// SimplifyQuery for simplifyInstruction utility.
const SimplifyQuery &SQ;
/// Original Address.
Value *Original;
public:
/// \p Query is used to simplify newly created instructions;
/// \p OriginalValue is the original address the addressing modes are
/// collected for.
// The parameter is not called `_SQ`: identifiers that start with an
// underscore followed by an uppercase letter are reserved.
AddressingModeCombiner(const SimplifyQuery &Query, Value *OriginalValue)
    : CommonType(nullptr), SQ(Query), Original(OriginalValue) {}
/// \returns the first collected AddrMode, which holds the combined result
/// once combineAddrModes() has succeeded.
const ExtAddrMode &getAddrMode() const { return AddrModes.front(); }
/// Add \p NewAddrMode to the collection if it is compatible with the
/// AddrModes collected so far. On incompatibility the whole collection is
/// cleared.
/// \return True iff the AddrMode was added.
bool addNewAddrMode(ExtAddrMode &NewAddrMode) {
  // Remember whether every AddrMode seen so far is trivial: combining only
  // trivial AddrModes would introduce a phi or select which just duplicates
  // what's already there.
  AllAddrModesTrivial = AllAddrModesTrivial && NewAddrMode.isTrivial();

  // The very first AddrMode has nothing to be compared against.
  if (AddrModes.empty()) {
    AddrModes.emplace_back(NewAddrMode);
    return true;
  }

  // Comparing against the first AddrMode is enough because only the
  // cumulative difference matters.
  const ExtAddrMode::FieldName FieldOfThisMode =
      AddrModes[0].compare(NewAddrMode);
  if (DifferentField == ExtAddrMode::NoField)
    DifferentField = FieldOfThisMode;
  else if (DifferentField != FieldOfThisMode)
    DifferentField = ExtAddrMode::MultipleFields;

  // Unsupported situations:
  //  - the AddrModes differ in more than one field;
  //  - the scale differs;
  //  - the base offset differs while a scaled register is present, because
  //    the merge of the different offsets would be used as ScaleReg;
  //  - the GV differs while a base register is installed, because the base
  //    register is wanted for the merge of the GV values.
  const bool Rejected =
      DifferentField == ExtAddrMode::MultipleFields ||
      DifferentField == ExtAddrMode::ScaleField ||
      (DifferentField == ExtAddrMode::BaseOffsField &&
       NewAddrMode.ScaledReg) ||
      (DifferentField == ExtAddrMode::BaseGVField && NewAddrMode.HasBaseReg);
  if (Rejected) {
    AddrModes.clear();
    return false;
  }

  // An AddrMode equal to the first one is still collected: its original
  // value differs, and all original values are needed later as anchors when
  // looking for the common Phi node.
  AddrModes.emplace_back(NewAddrMode);
  return true;
}
/// Combine the collected addressing modes into a single addressing mode,
/// which is stored in the first collected AddrMode.
/// \return True iff they were combined successfully, or there was nothing to
/// combine because only one AddrMode was collected or none of them differ.
bool combineAddrModes() {
  // Nothing was collected (or the collection was rejected and cleared).
  if (AddrModes.empty())
    return false;

  // One AddrMode, or AddrModes without any difference, need no work.
  if (AddrModes.size() == 1 || DifferentField == ExtAddrMode::NoField)
    return true;

  // If all AddrModes are just equal to the value they are derived from,
  // combining them wouldn't do anything useful. Combining may also be
  // disabled for the field in question.
  if (AllAddrModesTrivial || !addrModeCombiningAllowed())
    return false;

  // Map every original value to the value of its differing field. This
  // fails when the fields have no common type.
  FoldAddrToValueMapping Map;
  if (!initializeMap(Map))
    return false;

  Value *CommonValue = findCommon(Map);
  if (!CommonValue)
    return false;
  AddrModes[0].SetCombinedField(DifferentField, CommonValue, AddrModes);
  return true;
}
private:
/// Fill \p Map with the anchor values: every original address is mapped to
/// the value of the differing field of its addressing mode.
/// Along the way determine the common type of that field, stored in
/// CommonType, which is later used to create new Phi/Select nodes.
/// \return false if the fields do not share a common type.
bool initializeMap(FoldAddrToValueMapping &Map) {
  // Originals whose field value is null. They get a null constant of the
  // common type once that type is known.
  SmallVector<Value *, 2> NullValue;
  Type *IntPtrTy = SQ.DL.getIntPtrType(AddrModes[0].OriginalValue->getType());
  for (auto &AM : AddrModes) {
    Value *DV = AM.GetFieldAsValue(DifferentField, IntPtrTy);
    if (!DV) {
      NullValue.push_back(AM.OriginalValue);
      continue;
    }
    auto *FieldTy = DV->getType();
    if (CommonType && CommonType != FieldTy)
      return false;
    CommonType = FieldTy;
    Map[AM.OriginalValue] = DV;
  }
  assert(CommonType && "At least one non-null value must be!");
  for (auto *V : NullValue)
    Map[V] = Constant::getNullValue(CommonType);
  return true;
}
/// \p Map holds pairs (A, B) where B is the differing field of the
/// addressing mode represented by address A. Original is the address C the
/// traversal started from; walking from C through phis and selects ends up
/// at the A's of the map. This function tries to find a value V that plays
/// the role of the same field for C, i.e. walking from V through phis and
/// selects ends up at the corresponding B's.
/// New Phi/Select nodes are created if needed.
/// \return the found value, or nullptr if the new nodes had to be discarded.
// The simple example looks as follows:
// BB1:
//   p1 = b1 + 40
//   br cond BB2, BB3
// BB2:
//   p2 = b2 + 40
//   br BB3
// BB3:
//   p = phi [p1, BB1], [p2, BB2]
//   v = load p
// Map is
//   p1 -> b1
//   p2 -> b2
// Request is
//   p -> ?
// The function tries to find or build phi [b1, BB1], [b2, BB2] in BB3.
Value *findCommon(FoldAddrToValueMapping &Map) {
  // Newly created nodes are added to Map, but simplification is recursive,
  // so a node may get simplified after it was added. The tracker remembers
  // the replacements, so the current value for a Map entry can always be
  // found through it.
  SimplificationTracker ST(SQ);

  // Step one: DFS that creates placeholder nodes for all intermediate
  // values and records the traverse order for step two.
  SmallVector<Value *, 32> TraverseOrder;
  InsertPlaceholders(Map, TraverseOrder, ST);

  // Step two: fill the new nodes with merged values and simplify them if
  // possible.
  FillPlaceholders(Map, TraverseOrder, ST);

  // Give up if new selects remain but are not allowed, or if the new Phi
  // nodes could not be matched to existing ones and new Phi nodes are not
  // allowed either. The matching is only attempted in the former case's
  // absence.
  unsigned PhiNotMatchedCount = 0;
  const bool SelectsRejected =
      !AddrSinkNewSelects && ST.countNewSelectNodes() > 0;
  if (SelectsRejected ||
      !MatchPhiSet(ST, AddrSinkNewPhis, PhiNotMatchedCount)) {
    ST.destroyNewNodes(CommonType);
    return nullptr;
  }

  Value *Result = ST.Get(Map.find(Original)->second);
  if (Result) {
    NumMemoryInstsPhiCreated += ST.countNewPhiNodes() + PhiNotMatchedCount;
    NumMemoryInstsSelectCreated += ST.countNewSelectNodes();
  }
  return Result;
}
/// Try to match the new node \p PHI to the existing node \p Candidate.
///
/// Two PHI nodes match when, for every incoming block, their incoming values
/// are either identical or are themselves a matching pair consisting of a
/// PHI from \p PhiNodesToMatch and a PHI of the same basic block.
/// \p Matcher collects the matched pairs: (\p PHI, \p Candidate) plus the
/// first pair seen for every nested PHI. On failure \p Matcher still holds
/// the pairs registered so far.
/// \return true iff \p PHI matches \p Candidate.
bool MatchPhiNode(PHINode *PHI, PHINode *Candidate,
                  SmallSetVector<PHIPair, 8> &Matcher,
                  PhiNodeSet &PhiNodesToMatch) {
  SmallVector<PHIPair, 8> WorkList;
  Matcher.insert({PHI, Candidate});
  // PHIs that already have a pair registered in Matcher by this call.
  SmallSet<PHINode *, 8> MatchedPHIs;
  MatchedPHIs.insert(PHI);
  WorkList.push_back({PHI, Candidate});
  SmallSet<PHIPair, 8> Visited;
  while (!WorkList.empty()) {
    auto Item = WorkList.pop_back_val();
    if (!Visited.insert(Item).second)
      continue;
    // We iterate over all incoming values to Phi to compare them.
    // If values are different and both of them Phi and the first one is a
    // Phi we added (subject to match) and both of them is in the same basic
    // block then we can match our pair if values match. So we state that
    // these values match and add it to work list to verify that.
    for (auto B : Item.first->blocks()) {
      Value *FirstValue = Item.first->getIncomingValueForBlock(B);
      Value *SecondValue = Item.second->getIncomingValueForBlock(B);
      if (FirstValue == SecondValue)
        continue;

      PHINode *FirstPhi = dyn_cast<PHINode>(FirstValue);
      PHINode *SecondPhi = dyn_cast<PHINode>(SecondValue);

      // One of them is not Phi or
      // The first one is not Phi node from the set we'd like to match or
      // Phi nodes from different basic blocks then
      // we will not be able to match.
      if (!FirstPhi || !SecondPhi || !PhiNodesToMatch.count(FirstPhi) ||
          FirstPhi->getParent() != SecondPhi->getParent())
        return false;

      // If we already matched them then continue.
      if (Matcher.count({FirstPhi, SecondPhi}))
        continue;
      // So the values are different and does not match. So we need them to
      // match. (But we register no more than one match per PHI node, so that
      // we won't later try to replace them twice.)
      // insert() yields true only the first time FirstPhi is seen, so the
      // pair is registered exactly then.
      if (MatchedPHIs.insert(FirstPhi).second)
        Matcher.insert({FirstPhi, SecondPhi});
      // But we must check it.
      WorkList.push_back({FirstPhi, SecondPhi});
    }
  }
  return true;
}
/// Try to find an equivalent existing PHI node for every new PHI node
/// tracked by \p ST. Matched nodes are replaced by their equivalents.
/// Unmatched nodes are removed from the set and counted in
/// \p PhiNotMatchedCount.
/// \return false if a node cannot be matched and \p AllowNewPhiNodes is
/// false.
bool MatchPhiSet(SimplificationTracker &ST, bool AllowNewPhiNodes,
                 unsigned &PhiNotMatchedCount) {
  // Matched and PhiNodesToMatch iterate their elements in a deterministic
  // order, so the replacements (ReplacePhi) are also done in a deterministic
  // order.
  SmallSetVector<PHIPair, 8> Matched;
  SmallPtrSet<PHINode *, 8> WillNotMatch;
  PhiNodeSet &PhiNodesToMatch = ST.newPhiNodes();
  while (PhiNodesToMatch.size() != 0) {
    PHINode *PHI = *PhiNodesToMatch.begin();

    // PHI itself is the first candidate for "will not match", which covers
    // the case of a basic block without any other Phi node.
    WillNotMatch.clear();
    WillNotMatch.insert(PHI);

    // Try the Phi nodes of the block until an equivalent one is found.
    bool IsMatched = false;
    for (auto &P : PHI->getParent()->phis()) {
      if (&P == PHI)
        continue;
      IsMatched = MatchPhiNode(PHI, &P, Matched, PhiNodesToMatch);
      if (IsMatched)
        break;
      // The attempt failed: remember every Phi node the matcher collected.
      // If no candidate matches at all, none of them will match later.
      for (auto M : Matched)
        WillNotMatch.insert(M.first);
      Matched.clear();
    }

    if (!IsMatched) {
      // If we are not allowed to create new nodes then bail out.
      if (!AllowNewPhiNodes)
        return false;
      // Drop everything seen by the matcher; it will not match anything.
      PhiNotMatchedCount += WillNotMatch.size();
      for (auto *P : WillNotMatch)
        PhiNodesToMatch.erase(P);
      continue;
    }

    // Replace all matched values and erase them.
    for (auto MV : Matched)
      ST.ReplacePhi(MV.first, MV.second);
    Matched.clear();
  }
  return true;
}
/// Fill the placeholders with values from predecessors and simplify them.
///
/// \p TraverseOrder is consumed from the back. For every traversed value
/// the placeholder stored in \p Map gets its operands filled in (resolved
/// through \p ST), and the \p Map entry is then updated with the result of
/// simplifying the placeholder.
void FillPlaceholders(FoldAddrToValueMapping &Map,
                      SmallVectorImpl<Value *> &TraverseOrder,
                      SimplificationTracker &ST) {
  while (!TraverseOrder.empty()) {
    Value *Current = TraverseOrder.pop_back_val();
    assert(Map.find(Current) != Map.end() && "No node to fill!!!");
    Value *V = Map[Current];

    if (SelectInst *Select = dyn_cast<SelectInst>(V)) {
      // CurrentValue also must be Select.
      auto *CurrentSelect = cast<SelectInst>(Current);
      auto *TrueValue = CurrentSelect->getTrueValue();
      assert(Map.find(TrueValue) != Map.end() && "No True Value!");
      Select->setTrueValue(ST.Get(Map[TrueValue]));
      auto *FalseValue = CurrentSelect->getFalseValue();
      assert(Map.find(FalseValue) != Map.end() && "No False Value!");
      Select->setFalseValue(ST.Get(Map[FalseValue]));
    } else {
      // Must be a Phi node then.
      PHINode *PHI = cast<PHINode>(V);
      // The traversed value is dereferenced unconditionally below, so use
      // cast<> (which asserts on a wrong type) rather than an unchecked
      // dyn_cast<> that could silently produce a null pointer.
      auto *CurrentPhi = cast<PHINode>(Current);
      // Fill the Phi node with values from predecessors.
      for (auto B : predecessors(PHI->getParent())) {
        Value *PV = CurrentPhi->getIncomingValueForBlock(B);
        assert(Map.find(PV) != Map.end() && "No predecessor Value!");
        PHI->addIncoming(ST.Get(Map[PV]), B);
      }
    }
    Map[Current] = ST.Simplify(V);
  }
}
/// Walk from the original value over its phi/select operands until the
/// known ending values present in \p Map are reached. For each traversed
/// phi/select a placeholder Phi or Select is created, registered in \p ST
/// and recorded in \p Map.
/// \p TraverseOrder receives the traversed values in visiting order.
void InsertPlaceholders(FoldAddrToValueMapping &Map,
                        SmallVectorImpl<Value *> &TraverseOrder,
                        SimplificationTracker &ST) {
  assert((isa<PHINode>(Original) || isa<SelectInst>(Original)) &&
         "Address must be a Phi or Select node");
  auto *Dummy = UndefValue::get(CommonType);
  SmallVector<Value *, 32> Worklist;
  Worklist.push_back(Original);
  while (!Worklist.empty()) {
    Value *Current = Worklist.pop_back_val();
    // Skip values that were visited already or that are ending values.
    if (Map.count(Current))
      continue;
    TraverseOrder.push_back(Current);

    // Current must be a Phi node or a select. All others must be covered by
    // anchors.
    if (auto *CurrentSelect = dyn_cast<SelectInst>(Current)) {
      // Is it OK to get metadata from OrigSelect?!
      // The placeholder reuses the condition; both arms are dummies for now.
      SelectInst *Select = SelectInst::Create(
          CurrentSelect->getCondition(), Dummy, Dummy,
          CurrentSelect->getName(), CurrentSelect, CurrentSelect);
      Map[Current] = Select;
      ST.insertNewSelect(Select);
      // Continue with the True and False values.
      Worklist.push_back(CurrentSelect->getTrueValue());
      Worklist.push_back(CurrentSelect->getFalseValue());
      continue;
    }

    // It must be a Phi node then. The placeholder has no incoming values
    // yet.
    PHINode *CurrentPhi = cast<PHINode>(Current);
    PHINode *PHI =
        PHINode::Create(CommonType, CurrentPhi->getNumIncomingValues(),
                        "sunk_phi", CurrentPhi);
    Map[Current] = PHI;
    ST.insertNewPhi(PHI);
    for (Value *Incoming : CurrentPhi->incoming_values())
      Worklist.push_back(Incoming);
  }
}
/// \returns whether combining is enabled for the field in which the
/// collected AddrModes differ. Always false when DisableComplexAddrModes is
/// set or when the field is none of the four handled below.
bool addrModeCombiningAllowed() {
  if (DisableComplexAddrModes)
    return false;
  bool Allowed = false;
  switch (DifferentField) {
  case ExtAddrMode::BaseRegField:
    Allowed = AddrSinkCombineBaseReg;
    break;
  case ExtAddrMode::BaseGVField:
    Allowed = AddrSinkCombineBaseGV;
    break;
  case ExtAddrMode::BaseOffsField:
    Allowed = AddrSinkCombineBaseOffs;
    break;
  case ExtAddrMode::ScaledRegField:
    Allowed = AddrSinkCombineScaledReg;
    break;
  default:
    break;
  }
  return Allowed;
}
};
} // end anonymous namespace
/// Try adding ScaleReg*Scale to the current addressing mode.
/// Return true and update AddrMode if this addr mode is legal for the target,
/// false if not.
///
/// If \p ScaleReg is itself an add of a value and a constant, additionally
/// try to fold that add: the value becomes the scaled register and
/// constant*scale is added to the base offset. The add is then recorded in
/// AddrModeInsts.
bool AddressingModeMatcher::matchScaledValue(Value *ScaleReg, int64_t Scale,
                                             unsigned Depth) {
  // If Scale is 1, then this is the same as adding ScaleReg to the addressing
  // mode. Just process that directly.
  if (Scale == 1)
    return matchAddr(ScaleReg, Depth);

  // If the scale is 0, it takes nothing to add this.
  if (Scale == 0)
    return true;

  // If we already have a scale of this value, we can add to it, otherwise, we
  // need an available scale field.
  if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg)
    return false;

  ExtAddrMode TestAddrMode = AddrMode;

  // Add scale to turn X*4+X*3 -> X*7. This could also do things like
  // [A+B + A*7] -> [B+A*8].
  TestAddrMode.Scale += Scale;
  TestAddrMode.ScaledReg = ScaleReg;

  // If the new address isn't legal, bail out.
  if (!TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace))
    return false;

  // It was legal, so commit it.
  AddrMode = TestAddrMode;

  // Okay, we decided that we can add ScaleReg+Scale to AddrMode. Check now
  // to see if ScaleReg is actually X+C. If so, we can turn this into adding
  // X*Scale + C*Scale to addr mode.
  // The constant is read with getSExtValue(), which is only valid for values
  // representable in 64 bits, so wider constants are not folded.
  ConstantInt *CI = nullptr;
  Value *AddLHS = nullptr;
  if (isa<Instruction>(ScaleReg) && // not a constant expr.
      match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI))) &&
      CI->getValue().isSignedIntN(64)) {
    TestAddrMode.InBounds = false;
    TestAddrMode.ScaledReg = AddLHS;
    TestAddrMode.BaseOffs += CI->getSExtValue() * TestAddrMode.Scale;

    // If this addressing mode is legal, commit it and remember that we folded
    // this instruction.
    if (TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace)) {
      AddrModeInsts.push_back(cast<Instruction>(ScaleReg));
      AddrMode = TestAddrMode;
      return true;
    }
  }

  // Otherwise, not (x+c)*scale, just return what we have.
  return true;
}
/// Cheap filter that tells whether an addressing computation involving \p I
/// might be folded into a load/store accessing it.
/// It does not have to be perfect, but it must accept at least the set of
/// instructions that matchOperationAddr can handle.
static bool MightBeFoldableInst(Instruction *I) {
  switch (I->getOpcode()) {
  case Instruction::BitCast:
  case Instruction::AddrSpaceCast: {
    // Identity bitcasts are left alone; other casts qualify when the result
    // is an integer or a pointer.
    Type *ResultTy = I->getType();
    return ResultTy != I->getOperand(0)->getType() &&
           ResultTy->isIntOrPtrTy();
  }
  // PtrToInt is always a noop, as we know that the int type is pointer
  // sized. For IntToPtr we know the input is intptr_t, so it is foldable.
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::Add:
  case Instruction::GetElementPtr:
    return true;
  case Instruction::Mul:
  case Instruction::Shl:
    // Can only handle X*C and X << C.
    return isa<ConstantInt>(I->getOperand(1));
  default:
    return false;
  }
}
/// Check whether or not \p Val is a legal instruction for \p TLI.
/// \note \p Val is assumed to be the product of some type promotion.
/// Therefore if \p Val has an undefined state in \p TLI, this is assumed
/// to be legal, as the non-promoted value would have had the same state.
/// \returns false when \p Val is not an instruction at all.
static bool isPromotedInstructionLegal(const TargetLowering &TLI,
                                       const DataLayout &DL, Value *Val) {
  auto *PromotedInst = dyn_cast<Instruction>(Val);
  if (!PromotedInst)
    return false;

  // An undefined ISD opcode was undefined before the promotion as well.
  const int ISDOpcode = TLI.InstructionOpcodeToISD(PromotedInst->getOpcode());
  if (ISDOpcode == 0)
    return true;

  // Otherwise the answer depends on the legality of the operation for the
  // promoted type.
  const auto PromotedVT = TLI.getValueType(DL, PromotedInst->getType());
  return TLI.isOperationLegalOrCustom(ISDOpcode, PromotedVT);
}
namespace {
/// Hepler class to perform type promotion.
class TypePromotionHelper {
/// Utility function to add a promoted instruction \p ExtOpnd to
/// \p PromotedInsts and record the type of extension we have seen.
static void addPromotedInst(InstrToOrigTy &PromotedInsts,
Instruction *ExtOpnd,
bool IsSExt) {
ExtType ExtTy = IsSExt ? SignExtension : ZeroExtension;
InstrToOrigTy::iterator It = PromotedInsts.find(ExtOpnd);
if (It != PromotedInsts.end()) {
// If the new extension is same as original, the information in
// PromotedInsts[ExtOpnd] is still correct.
if (It->second.getInt() == ExtTy)
return;
// Now the new extension is different from old extension, we make
// the type information invalid by setting extension type to
// BothExtension.
ExtTy = BothExtension;
}
PromotedInsts[ExtOpnd] = TypeIsSExt(ExtOpnd->getType(), ExtTy);
}
/// Utility function to query the original type of instruction \p Opnd
/// with a matched extension type. If the extension doesn't match, we
/// cannot use the information we had on the original type.
/// BothExtension doesn't match any extension type.
static const Type *getOrigType(const InstrToOrigTy &PromotedInsts,
Instruction *Opnd,
bool IsSExt) {
ExtType ExtTy = IsSExt ? SignExtension : ZeroExtension;
InstrToOrigTy::const_iterator It = PromotedInsts.find(Opnd);
if (It != PromotedInsts.end() && It->second.getInt() == ExtTy)
return It->second.getPointer();
return nullptr;
}
/// Utility function to check whether or not a sign or zero extension
/// of \p Inst with \p ConsideredExtType can be moved through \p Inst by
/// either using the operands of \p Inst or promoting \p Inst.
/// The type of the extension is defined by \p IsSExt.
/// In other words, check if:
/// ext (Ty Inst opnd1 opnd2 ... opndN) to ConsideredExtType.
/// #1 Promotion applies:
/// ConsideredExtType Inst (ext opnd1 to ConsideredExtType, ...).
/// #2 Operand reuses:
/// ext opnd1 to ConsideredExtType.
/// \p PromotedInsts maps the instructions to their type before promotion.
static bool canGetThrough(const Instruction *Inst, Type *ConsideredExtType,
const InstrToOrigTy &PromotedInsts, bool IsSExt);
/// Utility function to determine if \p OpIdx should be promoted when
/// promoting \p Inst.
static bool shouldExtOperand(const Instruction *Inst, int OpIdx) {
return !(isa<SelectInst>(Inst) && OpIdx == 0);
}
/// Utility function to promote the operand of \p Ext when this
/// operand is a promotable trunc or sext or zext.
/// \p PromotedInsts maps the instructions to their type before promotion.
/// \p CreatedInstsCost[out] contains the cost of all instructions
/// created to promote the operand of Ext.
/// Newly added extensions are inserted in \p Exts.
/// Newly added truncates are inserted in \p Truncs.
/// Should never be called directly.
/// \return The promoted value which is used instead of Ext.
static Value *promoteOperandForTruncAndAnyExt(
Instruction *Ext, TypePromotionTransaction &TPT,
InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
SmallVectorImpl<Instruction *> *Exts,
SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI);
/// Utility function to promote the operand of \p Ext when this
/// operand is promotable and is not a supported trunc or sext.
/// \p PromotedInsts maps the instructions to their type before promotion.
/// \p CreatedInstsCost[out] contains the cost of all the instructions
/// created to promote the operand of Ext.
/// Newly added extensions are inserted in \p Exts.
/// Newly added truncates are inserted in \p Truncs.
/// Should never be called directly.
/// \return The promoted value which is used instead of Ext.
static Value *promoteOperandForOther(Instruction *Ext,
TypePromotionTransaction &TPT,
InstrToOrigTy &PromotedInsts,
unsigned &CreatedInstsCost,
SmallVectorImpl<Instruction *> *Exts,
SmallVectorImpl<Instruction *> *Truncs,
const TargetLowering &TLI, bool IsSExt);
/// \see promoteOperandForOther.
static Value *signExtendOperandForOther(
Instruction *Ext, TypePromotionTransaction &TPT,
InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
SmallVectorImpl<Instruction *> *Exts,
SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost,
Exts, Truncs, TLI, true);
}
/// \see promoteOperandForOther.
static Value *zeroExtendOperandForOther(
Instruction *Ext, TypePromotionTransaction &TPT,
InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
SmallVectorImpl<Instruction *> *Exts,
SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost,
Exts, Truncs, TLI, false);
}
public:
/// Type for the utility function that promotes the operand of Ext.
using Action = Value *(*)(Instruction *Ext, TypePromotionTransaction &TPT,
InstrToOrigTy &PromotedInsts,
unsigned &CreatedInstsCost,
SmallVectorImpl<Instruction *> *Exts,
SmallVectorImpl<Instruction *> *Truncs,
const TargetLowering &TLI);
/// Given a sign/zero extend instruction \p Ext, return the appropriate
/// action to promote the operand of \p Ext instead of using Ext.
/// \return NULL if no promotable action is possible with the current
/// sign extension.
/// \p InsertedInsts keeps track of all the instructions inserted by the
/// other CodeGenPrepare optimizations. This information is important
/// because we do not want to promote these instructions as CodeGenPrepare
/// will reinsert them later. Thus creating an infinite loop: create/remove.
/// \p PromotedInsts maps the instructions to their type before promotion.
static Action getAction(Instruction *Ext, const SetOfInstrs &InsertedInsts,
const TargetLowering &TLI,
const InstrToOrigTy &PromotedInsts);
};
} // end anonymous namespace
/// Decide whether an extension to \p ConsideredExtType can be moved through
/// \p Inst, the instruction feeding that extension.
/// \p PromotedInsts is queried (through getOrigType) when \p Inst is a
/// truncate, to find the type its source had before promotion.
/// \p IsSExt is true for a sign extension and false for a zero extension.
/// \return true if \p Inst matches one of the supported patterns below.
bool TypePromotionHelper::canGetThrough(const Instruction *Inst,
                                        Type *ConsideredExtType,
                                        const InstrToOrigTy &PromotedInsts,
                                        bool IsSExt) {
  // The promotion helper does not know how to deal with vector types yet.
  // To be able to fix that, we would need to fix the places where we
  // statically extend, e.g., constants and such.
  if (Inst->getType()->isVectorTy())
    return false;

  // We can always get through zext.
  if (isa<ZExtInst>(Inst))
    return true;

  // sext(sext) is ok too.
  if (IsSExt && isa<SExtInst>(Inst))
    return true;

  // We can get through binary operator, if it is legal. In other words, the
  // binary operator must have a nuw or nsw flag.
  const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
  if (BinOp && isa<OverflowingBinaryOperator>(BinOp) &&
      ((!IsSExt && BinOp->hasNoUnsignedWrap()) ||
       (IsSExt && BinOp->hasNoSignedWrap())))
    return true;

  // ext(and(opnd, cst)) --> and(ext(opnd), ext(cst))
  // The same holds for 'or'.
  if ((Inst->getOpcode() == Instruction::And ||
       Inst->getOpcode() == Instruction::Or))
    return true;

  // ext(xor(opnd, cst)) --> xor(ext(opnd), ext(cst))
  if (Inst->getOpcode() == Instruction::Xor) {
    const ConstantInt *Cst = dyn_cast<ConstantInt>(Inst->getOperand(1));
    // Make sure it is not a NOT.
    if (Cst && !Cst->getValue().isAllOnesValue())
      return true;
  }

  // zext(lshr(opnd, cst)) --> lshr(zext(opnd), zext(cst))
  // It may change a poisoned value into a regular value, like
  //     zext i32 (lshr i8 %val, 12)  -->  lshr i32 (zext i8 %val), 12
  //          poisoned value                    regular value
  // It should be OK since undef covers valid value.
  if (Inst->getOpcode() == Instruction::LShr && !IsSExt)
    return true;

  // and(ext(shl(opnd, cst)), cst) --> and(shl(ext(opnd), ext(cst)), cst)
  // It may change a poisoned value into a regular value, like
  //     zext i32 (shl i8 %val, 12)  -->  shl i32 (zext i8 %val), 12
  //          poisoned value                    regular value
  // It should be OK since undef covers valid value.
  if (Inst->getOpcode() == Instruction::Shl && Inst->hasOneUse()) {
    const Instruction *ExtInst =
        dyn_cast<const Instruction>(*Inst->user_begin());
    // dyn_cast yields null when the user is not an instruction; never
    // dereference the result without checking it first.
    if (ExtInst && ExtInst->hasOneUse()) {
      const Instruction *AndInst =
          dyn_cast<const Instruction>(*ExtInst->user_begin());
      if (AndInst && AndInst->getOpcode() == Instruction::And) {
        const ConstantInt *Cst = dyn_cast<ConstantInt>(AndInst->getOperand(1));
        // The mask must fit in the (narrow) type of the shift.
        if (Cst &&
            Cst->getValue().isIntN(Inst->getType()->getIntegerBitWidth()))
          return true;
      }
    }
  }

  // Check if we can do the following simplification.
  // ext(trunc(opnd)) --> ext(opnd)
  if (!isa<TruncInst>(Inst))
    return false;

  Value *OpndVal = Inst->getOperand(0);
  // Check if we can use this operand in the extension.
  // If the type is larger than the result type of the extension, we cannot.
  if (!OpndVal->getType()->isIntegerTy() ||
      OpndVal->getType()->getIntegerBitWidth() >
          ConsideredExtType->getIntegerBitWidth())
    return false;

  // If the operand of the truncate is not an instruction, we will not have
  // any information on the dropped bits.
  // (Actually we could for constant but it is not worth the extra logic).
  Instruction *Opnd = dyn_cast<Instruction>(OpndVal);
  if (!Opnd)
    return false;

  // Check if the source of the type is narrow enough.
  // I.e., check that trunc just drops extended bits of the same kind of
  // the extension.
  // #1 get the type of the operand and check the kind of the extended bits.
  const Type *OpndType = getOrigType(PromotedInsts, Opnd, IsSExt);
  if (!OpndType) {
    // No recorded original type: the operand must itself be an extension of
    // the same kind, in which case its source type is the one to compare.
    if ((IsSExt && isa<SExtInst>(Opnd)) || (!IsSExt && isa<ZExtInst>(Opnd)))
      OpndType = Opnd->getOperand(0)->getType();
    else
      return false;
  }

  // #2 check that the truncate just drops extended bits.
  return Inst->getType()->getIntegerBitWidth() >=
         OpndType->getIntegerBitWidth();
}
/// Select the promotion routine to apply to the operand of the sign/zero
/// extension \p Ext, or return nullptr when the operand cannot (or should
/// not) be promoted.
TypePromotionHelper::Action TypePromotionHelper::getAction(
    Instruction *Ext, const SetOfInstrs &InsertedInsts,
    const TargetLowering &TLI, const InstrToOrigTy &PromotedInsts) {
  assert((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
         "Unexpected instruction type");

  // Only an instruction operand can be looked through.
  auto *Operand = dyn_cast<Instruction>(Ext->getOperand(0));
  if (!Operand)
    return nullptr;

  Type *const WideTy = Ext->getType();
  const bool Signed = isa<SExtInst>(Ext);
  if (!canGetThrough(Operand, WideTy, PromotedInsts, Signed))
    return nullptr;

  if (isa<TruncInst>(Operand)) {
    // A truncate that CodeGenPrepare itself inserted must stay: removing it
    // would undo an optimization that is likely to be redone, which could
    // loop forever.
    if (InsertedInsts.count(Operand))
      return nullptr;
    return promoteOperandForTruncAndAnyExt;
  }

  // Nested extensions share the handler used for truncates.
  if (isa<SExtInst>(Operand) || isa<ZExtInst>(Operand))
    return promoteOperandForTruncAndAnyExt;

  // Any other instruction: give up early when promoting it would require a
  // truncate (operand has other users) that the target does not get for free.
  const bool NeedsCostlyTrunc =
      !Operand->hasOneUse() && !TLI.isTruncateFree(WideTy, Operand->getType());
  if (NeedsCostlyTrunc)
    return nullptr;

  if (Signed)
    return signExtendOperandForOther;
  return zeroExtendOperandForOther;
}
/// Promote the operand of \p SExt when that operand is itself a zext, sext or
/// trunc, by making the extension act directly on the inner operand.
/// \p CreatedInstsCost receives the cost (0 or 1) of the extension that
/// remains. A surviving extension is appended to \p Exts when it is non-null.
/// \return the value that now stands for the original extension.
Value *TypePromotionHelper::promoteOperandForTruncAndAnyExt(
    Instruction *SExt, TypePromotionTransaction &TPT,
    InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
    SmallVectorImpl<Instruction *> *Exts,
    SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
  // getAction only hands out this routine when the operand is an instruction,
  // so the cast cannot fail.
  auto *Inner = cast<Instruction>(SExt->getOperand(0));
  Value *const Source = Inner->getOperand(0);

  Value *Result = SExt;
  bool MergedNonFreeExt = false;
  if (!isa<ZExtInst>(Inner)) {
    // z|sext(trunc(opnd)) or sext(sext(opnd))  =>  z|sext(opnd).
    TPT.setOperand(SExt, 0, Source);
  } else {
    // s|zext(zext(opnd))  =>  zext(opnd).
    MergedNonFreeExt = !TLI.isExtFree(Inner);
    Value *Merged = TPT.createZExt(SExt, Source, SExt->getType());
    TPT.replaceAllUsesWith(SExt, Merged);
    TPT.eraseInstruction(SExt);
    Result = Merged;
  }
  CreatedInstsCost = 0;

  // The inner cast may have become dead.
  if (Inner->use_empty())
    TPT.eraseInstruction(Inner);

  // Nothing more to do when the result is not an instruction.
  auto *NewExt = dyn_cast<Instruction>(Result);
  if (!NewExt)
    return Result;

  // An extension from a type to the very same type is a no-op: forward its
  // operand to the users and drop it.
  if (NewExt->getType() == NewExt->getOperand(0)->getType()) {
    Value *Forwarded = NewExt->getOperand(0);
    TPT.eraseInstruction(NewExt, Forwarded);
    return Forwarded;
  }

  // The extension is still needed: report it and account for its cost.
  if (Exts)
    Exts->push_back(NewExt);
  const bool ExtIsFree = TLI.isExtFree(NewExt);
  CreatedInstsCost = (ExtIsFree || MergedNonFreeExt) ? 0 : 1;
  return Result;
}
/// Promote the instruction feeding \p Ext to the type of \p Ext and push the
/// extension onto that instruction's operands instead.
/// All IR changes go through \p TPT. \p PromotedInsts records the promoted
/// instruction via addPromotedInst. \p CreatedInstsCost is reset to 0 and then
/// incremented for every non-free extension placed on an operand.
/// Extensions placed on operands are appended to \p Exts and a created
/// truncate to \p Truncs, when those pointers are non-null.
/// \p IsSExt selects sign (true) or zero (false) extension.
/// \return the promoted instruction, which replaces the uses of \p Ext.
Value *TypePromotionHelper::promoteOperandForOther(
Instruction *Ext, TypePromotionTransaction &TPT,
InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
SmallVectorImpl<Instruction *> *Exts,
SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI,
bool IsSExt) {
// By construction, the operand of Ext is an instruction. Otherwise we cannot
// get through it and this method should not be called.
Instruction *ExtOpnd = cast<Instruction>(Ext->getOperand(0));
CreatedInstsCost = 0;
if (!ExtOpnd->hasOneUse()) {
// ExtOpnd will be promoted.
// All its uses, but Ext, will need to use a truncated value of the
// promoted version.
// Create the truncate now.
Value *Trunc = TPT.createTrunc(Ext, ExtOpnd->getType());
if (Instruction *ITrunc = dyn_cast<Instruction>(Trunc)) {
// Insert it just after the definition.
ITrunc->moveAfter(ExtOpnd);
if (Truncs)
Truncs->push_back(ITrunc);
}
TPT.replaceAllUsesWith(ExtOpnd, Trunc);
// Restore the operand of Ext (which has been replaced by the previous call
// to replaceAllUsesWith) to avoid creating a cycle trunc <-> sext.
TPT.setOperand(Ext, 0, ExtOpnd);
}
// Get through the Instruction:
// 1. Update its type.
// 2. Replace the uses of Ext by Inst.
// 3. Extend each operand that needs to be extended.
// Remember the original type of the instruction before promotion.
// This is useful to know that the high bits are sign extended bits.
addPromotedInst(PromotedInsts, ExtOpnd, IsSExt);
// Step #1.
TPT.mutateType(ExtOpnd, Ext->getType());
// Step #2.
TPT.replaceAllUsesWith(Ext, ExtOpnd);
// Step #3.
// ExtForOpnd starts out as Ext so that the original extension is reused for
// the first operand needing an explicit extension; it is reset to nullptr
// after each use so later operands get freshly created extensions.
Instruction *ExtForOpnd = Ext;
LLVM_DEBUG(dbgs() << "Propagate Ext to operands\n");
for (int OpIdx = 0, EndOpIdx = ExtOpnd->getNumOperands(); OpIdx != EndOpIdx;
++OpIdx) {
LLVM_DEBUG(dbgs() << "Operand:\n" << *(ExtOpnd->getOperand(OpIdx)) << '\n');
// Skip operands that already have the promoted type or that
// shouldExtOperand says must not be extended.
if (ExtOpnd->getOperand(OpIdx)->getType() == Ext->getType() ||
!shouldExtOperand(ExtOpnd, OpIdx)) {
LLVM_DEBUG(dbgs() << "No need to propagate\n");
continue;
}
// Check if we can statically extend the operand.
Value *Opnd = ExtOpnd->getOperand(OpIdx);
if (const ConstantInt *Cst = dyn_cast<ConstantInt>(Opnd)) {
LLVM_DEBUG(dbgs() << "Statically extend\n");
unsigned BitWidth = Ext->getType()->getIntegerBitWidth();
APInt CstVal = IsSExt ? Cst->getValue().sext(BitWidth)
: Cst->getValue().zext(BitWidth);
TPT.setOperand(ExtOpnd, OpIdx, ConstantInt::get(Ext->getType(), CstVal));
continue;
}
// UndefValue are typed, so we have to statically extend them.
if (isa<UndefValue>(Opnd)) {
LLVM_DEBUG(dbgs() << "Statically extend\n");
TPT.setOperand(ExtOpnd, OpIdx, UndefValue::get(Ext->getType()));
continue;
}
// Otherwise we have to explicitly extend the operand.
// Check if Ext was reused to extend an operand.
if (!ExtForOpnd) {
// If yes, create a new one.
LLVM_DEBUG(dbgs() << "More operands to ext\n");
Value *ValForExtOpnd = IsSExt ? TPT.createSExt(Ext, Opnd, Ext->getType())
: TPT.createZExt(Ext, Opnd, Ext->getType());
// The created value is not an instruction: use it directly as the operand;
// there is nothing to move or to account for.
if (!isa<Instruction>(ValForExtOpnd)) {
TPT.setOperand(ExtOpnd, OpIdx, ValForExtOpnd);
continue;
}
ExtForOpnd = cast<Instruction>(ValForExtOpnd);
}
if (Exts)
Exts->push_back(ExtForOpnd);
TPT.setOperand(ExtForOpnd, 0, Opnd);
// Move the extension before the promoted instruction.
TPT.moveBefore(ExtForOpnd, ExtOpnd);
TPT.setOperand(ExtOpnd, OpIdx, ExtForOpnd);
CreatedInstsCost += !TLI.isExtFree(ExtForOpnd);
// If more extensions are required, new instructions will have to be created.
ExtForOpnd = nullptr;
}
// Ext was never reused for an operand, so nothing refers to it any more.
if (ExtForOpnd == Ext) {
LLVM_DEBUG(dbgs() << "Extension is useless now\n");
TPT.eraseInstruction(Ext);
}
return ExtOpnd;
}
/// Decide whether promoting an instruction to a wider type pays off.
/// \p NewCost is the cost of the extension instructions created by the
/// promotion.
/// \p OldCost is the cost of the extension instructions before the promotion
/// plus the number of instructions that were matched in the addressing mode
/// thanks to the promotion.
/// \p PromotedOperand is the value that has been promoted.
/// \return True if the promotion is profitable, false otherwise.
bool AddressingModeMatcher::isPromotionProfitable(
    unsigned NewCost, unsigned OldCost, Value *PromotedOperand) const {
  LLVM_DEBUG(dbgs() << "OldCost: " << OldCost << "\tNewCost: " << NewCost
                    << '\n');
  // A strict difference settles the question: cheaper is profitable, more
  // expensive is not.
  if (NewCost != OldCost)
    return NewCost < OldCost;

  // Cost-neutral promotions may still help, e.g. by letting the extension be
  // folded into a load, provided the promoted instruction is legal.
  return isPromotedInstructionLegal(TLI, DL, PromotedOperand);
}
/// Given an instruction or constant expr, see if we can fold the operation
/// into the addressing mode. If so, update the addressing mode and return
/// true, otherwise return false without modifying AddrMode.
/// \p Opcode is the opcode of \p AddrInst and \p Depth the current recursion
/// depth; matching is abandoned once \p Depth reaches 5.
/// If \p MovedAway is not NULL, it contains the information of whether or
/// not AddrInst has to be folded into the addressing mode on success.
/// If \p MovedAway == true, \p AddrInst will not be part of the addressing
/// because it has been moved away.
/// Thus AddrInst must not be added in the matched instructions.
/// This state can happen when AddrInst is a sext, since it may be moved away.
/// Therefore, AddrInst may not be valid when MovedAway is true and it must
/// not be referenced anymore.
bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode,
                                               unsigned Depth,
                                               bool *MovedAway) {
  // Avoid exponential behavior on extremely deep expression trees.
  if (Depth >= 5) return false;

  // By default, all matched instructions stay in place.
  if (MovedAway)
    *MovedAway = false;

  switch (Opcode) {
  case Instruction::PtrToInt:
    // PtrToInt is always a noop, as we know that the int type is pointer sized.
    return matchAddr(AddrInst->getOperand(0), Depth);
  case Instruction::IntToPtr: {
    auto AS = AddrInst->getType()->getPointerAddressSpace();
    auto PtrTy = MVT::getIntegerVT(DL.getPointerSizeInBits(AS));
    // This inttoptr is a no-op if the integer type is pointer sized.
    if (TLI.getValueType(DL, AddrInst->getOperand(0)->getType()) == PtrTy)
      return matchAddr(AddrInst->getOperand(0), Depth);
    return false;
  }
  case Instruction::BitCast:
    // BitCast is always a noop, and we can handle it as long as it is
    // int->int or pointer->pointer (we don't want int<->fp or something).
    if (AddrInst->getOperand(0)->getType()->isIntOrPtrTy() &&
        // Don't touch identity bitcasts.  These were probably put here by LSR,
        // and we don't want to mess around with them.  Assume it knows what it
        // is doing.
        AddrInst->getOperand(0)->getType() != AddrInst->getType())
      return matchAddr(AddrInst->getOperand(0), Depth);
    return false;
  case Instruction::AddrSpaceCast: {
    unsigned SrcAS
      = AddrInst->getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DestAS = AddrInst->getType()->getPointerAddressSpace();
    if (TLI.isNoopAddrSpaceCast(SrcAS, DestAS))
      return matchAddr(AddrInst->getOperand(0), Depth);
    return false;
  }
  case Instruction::Add: {
    // Check to see if we can merge in the RHS then the LHS.  If so, we win.
    ExtAddrMode BackupAddrMode = AddrMode;
    unsigned OldSize = AddrModeInsts.size();
    // Start a transaction at this point.
    // The LHS may match but not the RHS.
    // Therefore, we need a higher level restoration point to undo partially
    // matched operation.
    TypePromotionTransaction::ConstRestorationPt LastKnownGood =
        TPT.getRestorationPoint();

    AddrMode.InBounds = false;
    if (matchAddr(AddrInst->getOperand(1), Depth+1) &&
        matchAddr(AddrInst->getOperand(0), Depth+1))
      return true;

    // Restore the old addr mode info.
    AddrMode = BackupAddrMode;
    AddrModeInsts.resize(OldSize);
    TPT.rollback(LastKnownGood);

    // Otherwise this was over-aggressive.  Try merging in the LHS then the RHS.
    if (matchAddr(AddrInst->getOperand(0), Depth+1) &&
        matchAddr(AddrInst->getOperand(1), Depth+1))
      return true;

    // Otherwise we definitely can't merge the ADD in.
    AddrMode = BackupAddrMode;
    AddrModeInsts.resize(OldSize);
    TPT.rollback(LastKnownGood);
    break;
  }
  //case Instruction::Or:
  // TODO: We can handle "Or Val, Imm" iff this OR is equivalent to an ADD.
  //break;
  case Instruction::Mul:
  case Instruction::Shl: {
    // Can only handle X*C and X << C.
    AddrMode.InBounds = false;
    ConstantInt *RHS = dyn_cast<ConstantInt>(AddrInst->getOperand(1));
    if (!RHS || RHS->getBitWidth() > 64)
      return false;
    int64_t Scale;
    if (Opcode == Instruction::Shl) {
      // Turn the shift amount into a multiplier. The amount is clamped to
      // BitWidth - 1 (at most 63) and the shift is done on an unsigned value,
      // so the host-side shift is well defined even when the IR constant is
      // out of range or would be negative once sign extended.
      const uint64_t ShAmt = RHS->getLimitedValue(RHS->getBitWidth() - 1);
      Scale = static_cast<int64_t>(uint64_t(1) << ShAmt);
    } else {
      Scale = RHS->getSExtValue();
    }

    return matchScaledValue(AddrInst->getOperand(0), Scale, Depth);
  }
  case Instruction::GetElementPtr: {
    // Scan the GEP.  We check it if it contains constant offsets and at most
    // one variable offset.
    int VariableOperand = -1;
    unsigned VariableScale = 0;

    int64_t ConstantOffset = 0;
    gep_type_iterator GTI = gep_type_begin(AddrInst);
    for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) {
      if (StructType *STy = GTI.getStructTypeOrNull()) {
        const StructLayout *SL = DL.getStructLayout(STy);
        unsigned Idx =
          cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue();
        ConstantOffset += SL->getElementOffset(Idx);
      } else {
        uint64_t TypeSize = DL.getTypeAllocSize(GTI.getIndexedType());
        if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) {
          const APInt &CVal = CI->getValue();
          if (CVal.getMinSignedBits() <= 64) {
            ConstantOffset += CVal.getSExtValue() * TypeSize;
            continue;
          }
        }
        if (TypeSize) {  // Scales of zero don't do anything.
          // We only allow one variable index at the moment.
          if (VariableOperand != -1)
            return false;

          // Remember the variable index.
          VariableOperand = i;
          VariableScale = TypeSize;
        }
      }
    }

    // A common case is for the GEP to only do a constant offset.  In this case,
    // just add it to the disp field and check validity.
    if (VariableOperand == -1) {
      AddrMode.BaseOffs += ConstantOffset;
      if (ConstantOffset == 0 ||
          TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) {
        // Check to see if we can fold the base pointer in too.
        if (matchAddr(AddrInst->getOperand(0), Depth+1)) {
          if (!cast<GEPOperator>(AddrInst)->isInBounds())
            AddrMode.InBounds = false;
          return true;
        }
      } else if (EnableGEPOffsetSplit && isa<GetElementPtrInst>(AddrInst) &&
                 TLI.shouldConsiderGEPOffsetSplit() && Depth == 0 &&
                 ConstantOffset > 0) {
        // Record GEPs with non-zero offsets as candidates for splitting in the
        // event that the offset cannot fit into the r+i addressing mode.
        // Simple and common case that only one GEP is used in calculating the
        // address for the memory access.
        Value *Base = AddrInst->getOperand(0);
        auto *BaseI = dyn_cast<Instruction>(Base);
        auto *GEP = cast<GetElementPtrInst>(AddrInst);
        if (isa<Argument>(Base) || isa<GlobalValue>(Base) ||
            (BaseI && !isa<CastInst>(BaseI) &&
             !isa<GetElementPtrInst>(BaseI))) {
          // Make sure the parent block allows inserting non-PHI instructions
          // before the terminator.
          BasicBlock *Parent =
              BaseI ? BaseI->getParent() : &GEP->getFunction()->getEntryBlock();
          if (!Parent->getTerminator()->isEHPad())
            LargeOffsetGEP = std::make_pair(GEP, ConstantOffset);
        }
      }
      // Matching failed: undo the speculative offset update.
      AddrMode.BaseOffs -= ConstantOffset;
      return false;
    }

    // Save the valid addressing mode in case we can't match.
    ExtAddrMode BackupAddrMode = AddrMode;
    unsigned OldSize = AddrModeInsts.size();

    // See if the scale and offset amount is valid for this target.
    AddrMode.BaseOffs += ConstantOffset;
    if (!cast<GEPOperator>(AddrInst)->isInBounds())
      AddrMode.InBounds = false;

    // Match the base operand of the GEP.
    if (!matchAddr(AddrInst->getOperand(0), Depth+1)) {
      // If it couldn't be matched, just stuff the value in a register.
      if (AddrMode.HasBaseReg) {
        AddrMode = BackupAddrMode;
        AddrModeInsts.resize(OldSize);
        return false;
      }
      AddrMode.HasBaseReg = true;
      AddrMode.BaseReg = AddrInst->getOperand(0);
    }

    // Match the remaining variable portion of the GEP.
    if (!matchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale,
                          Depth)) {
      // If it couldn't be matched, try stuffing the base into a register
      // instead of matching it, and retrying the match of the scale.
      AddrMode = BackupAddrMode;
      AddrModeInsts.resize(OldSize);
      if (AddrMode.HasBaseReg)
        return false;
      AddrMode.HasBaseReg = true;
      AddrMode.BaseReg = AddrInst->getOperand(0);
      AddrMode.BaseOffs += ConstantOffset;
      if (!matchScaledValue(AddrInst->getOperand(VariableOperand),
                            VariableScale, Depth)) {
        // If even that didn't work, bail.
        AddrMode = BackupAddrMode;
        AddrModeInsts.resize(OldSize);
        return false;
      }
    }

    return true;
  }
  case Instruction::SExt:
  case Instruction::ZExt: {
    Instruction *Ext = dyn_cast<Instruction>(AddrInst);
    if (!Ext)
      return false;

    // Try to move this ext out of the way of the addressing mode.
    // Ask for a method for doing so.
    TypePromotionHelper::Action TPH =
        TypePromotionHelper::getAction(Ext, InsertedInsts, TLI, PromotedInsts);
    if (!TPH)
      return false;

    TypePromotionTransaction::ConstRestorationPt LastKnownGood =
        TPT.getRestorationPoint();
    unsigned CreatedInstsCost = 0;
    unsigned ExtCost = !TLI.isExtFree(Ext);
    Value *PromotedOperand =
        TPH(Ext, TPT, PromotedInsts, CreatedInstsCost, nullptr, nullptr, TLI);
    // SExt has been moved away.
    // Thus either it will be rematched later in the recursive calls or it is
    // gone. Anyway, we must not fold it into the addressing mode at this point.
    // E.g.,
    // op = add opnd, 1
    // idx = ext op
    // addr = gep base, idx
    // is now:
    // promotedOpnd = ext opnd            <- no match here
    // op = promoted_add promotedOpnd, 1  <- match (later in recursive calls)
    // addr = gep base, op                <- match
    if (MovedAway)
      *MovedAway = true;

    assert(PromotedOperand &&
           "TypePromotionHelper should have filtered out those cases");

    ExtAddrMode BackupAddrMode = AddrMode;
    unsigned OldSize = AddrModeInsts.size();

    if (!matchAddr(PromotedOperand, Depth) ||
        // The total of the new cost is equal to the cost of the created
        // instructions.
        // The total of the old cost is equal to the cost of the extension plus
        // what we have saved in the addressing mode.
        !isPromotionProfitable(CreatedInstsCost,
                               ExtCost + (AddrModeInsts.size() - OldSize),
                               PromotedOperand)) {
      AddrMode = BackupAddrMode;
      AddrModeInsts.resize(OldSize);
      LLVM_DEBUG(dbgs() << "Sign extension does not pay off: rollback\n");
      TPT.rollback(LastKnownGood);
      return false;
    }
    return true;
  }
  }
  return false;
}
/// If we can, try to add the value of 'Addr' into the current addressing mode.
/// If Addr can't be added to AddrMode this returns false and leaves AddrMode
/// unmodified.  This assumes that Addr is either a pointer type or intptr_t
/// for the target.
/// \p Depth is the current recursion depth, forwarded to matchOperationAddr.
///
bool AddressingModeMatcher::matchAddr(Value *Addr, unsigned Depth) {
  // Start a transaction at this point that we will rollback if the matching
  // fails.
  TypePromotionTransaction::ConstRestorationPt LastKnownGood =
      TPT.getRestorationPoint();
  if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) {
    // BaseOffs is a 64-bit quantity and getSExtValue() asserts on wider
    // values, so only constants representable as a signed 64-bit integer are
    // folded as immediates. Others fall through to the register cases below.
    if (CI->getValue().isSignedIntN(64)) {
      // Fold in immediates if legal for the target.
      AddrMode.BaseOffs += CI->getSExtValue();
      if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
        return true;
      AddrMode.BaseOffs -= CI->getSExtValue();
    }
  } else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) {
    // If this is a global variable, try to fold it into the addressing mode.
    if (!AddrMode.BaseGV) {
      AddrMode.BaseGV = GV;
      if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
        return true;
      AddrMode.BaseGV = nullptr;
    }
  } else if (Instruction *I = dyn_cast<Instruction>(Addr)) {
    ExtAddrMode BackupAddrMode = AddrMode;
    unsigned OldSize = AddrModeInsts.size();

    // Check to see if it is possible to fold this operation.
    bool MovedAway = false;
    if (matchOperationAddr(I, I->getOpcode(), Depth, &MovedAway)) {
      // This instruction may have been moved away. If so, there is nothing
      // to check here.
      if (MovedAway)
        return true;
      // Okay, it's possible to fold this.  Check to see if it is actually
      // *profitable* to do so.  We use a simple cost model to avoid increasing
      // register pressure too much.
      if (I->hasOneUse() ||
          isProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) {
        AddrModeInsts.push_back(I);
        return true;
      }

      // It isn't profitable to do this, roll back.
      //cerr << "NOT FOLDING: " << *I;
      AddrMode = BackupAddrMode;
      AddrModeInsts.resize(OldSize);
      TPT.rollback(LastKnownGood);
    }
  } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) {
    if (matchOperationAddr(CE, CE->getOpcode(), Depth))
      return true;
    TPT.rollback(LastKnownGood);
  } else if (isa<ConstantPointerNull>(Addr)) {
    // Null pointer gets folded without affecting the addressing mode.
    return true;
  }

  // Worse case, the target should support [reg] addressing modes. :)
  if (!AddrMode.HasBaseReg) {
    AddrMode.HasBaseReg = true;
    AddrMode.BaseReg = Addr;
    // Still check for legality in case the target supports [imm] but not [i+r].
    if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
      return true;
    AddrMode.HasBaseReg = false;
    AddrMode.BaseReg = nullptr;
  }

  // If the base register is already taken, see if we can do [r+r].
  if (AddrMode.Scale == 0) {
    AddrMode.Scale = 1;
    AddrMode.ScaledReg = Addr;
    if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
      return true;
    AddrMode.Scale = 0;
    AddrMode.ScaledReg = nullptr;
  }
  // Couldn't match.
  TPT.rollback(LastKnownGood);
  return false;
}
/// Return true when every use of \p OpVal by the inline asm call \p CI is an
/// indirect memory operand, and false as soon as one use is anything else.
static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,
                                    const TargetLowering &TLI,
                                    const TargetRegisterInfo &TRI) {
  const Function *Caller = CI->getFunction();
  const DataLayout &Layout = Caller->getParent()->getDataLayout();
  TargetLowering::AsmOperandInfoVector Constraints =
      TLI.ParseConstraints(Layout, &TRI, ImmutableCallSite(CI));

  for (TargetLowering::AsmOperandInfo &Info : Constraints) {
    // Compute the constraint code and ConstraintType to use.
    TLI.ComputeConstraintToUse(Info, SDValue());

    // Operands bound to some other value are irrelevant.
    if (Info.CallOperandVal != OpVal)
      continue;

    // Our value is only foldable when used as an indirect memory operand.
    const bool IsIndirectMemory =
        Info.ConstraintType == TargetLowering::C_Memory && Info.isIndirect;
    if (!IsIndirectMemory)
      return false;
  }
  return true;
}
// Upper bound on the number of uses examined while searching for memory uses.
// Once this many have been seen the search is aborted to conserve compile
// time.
static constexpr int MaxMemoryUsesToScan = 20;
/// Walk the uses of \p I, recursing through non-memory users, and collect in
/// \p MemoryUses every (instruction, operand index) pair where the value is
/// used as the address of a load, store, atomicrmw or cmpxchg.
/// \p ConsideredInsts holds the instructions already visited.
/// \p SeenInsts counts the uses examined so far on the current path.
/// \return true if an obviously non-foldable use was found (or the scan
/// budget was exhausted), false otherwise.
static bool FindAllMemoryUses(
    Instruction *I,
    SmallVectorImpl<std::pair<Instruction *, unsigned>> &MemoryUses,
    SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetLowering &TLI,
    const TargetRegisterInfo &TRI, int SeenInsts = 0) {
  // Each instruction is processed at most once.
  const bool FirstVisit = ConsideredInsts.insert(I).second;
  if (!FirstVisit)
    return false;

  // An obviously unfoldable instruction ends the search right away.
  if (!MightBeFoldableInst(I))
    return true;

  const bool OptSize = I->getFunction()->hasOptSize();

  for (Use &TheUse : I->uses()) {
    // Be conservative once a large number or a deep chain of users has been
    // seen; this bounds compile time in pathological cases.
    if (SeenInsts >= MaxMemoryUsesToScan)
      return true;
    ++SeenInsts;

    auto *UserI = cast<Instruction>(TheUse.getUser());
    const unsigned OperandNo = TheUse.getOperandNo();

    // Records the use when it is the pointer operand of MemI. A false result
    // means the value itself is being stored/exchanged, not used as address.
    auto RecordIfPointerOperand = [&](Instruction *MemI, unsigned PointerIdx) {
      if (OperandNo != PointerIdx)
        return false;
      MemoryUses.push_back(std::make_pair(MemI, OperandNo));
      return true;
    };

    if (auto *Load = dyn_cast<LoadInst>(UserI)) {
      MemoryUses.push_back(std::make_pair(Load, OperandNo));
      continue;
    }

    if (auto *Store = dyn_cast<StoreInst>(UserI)) {
      if (!RecordIfPointerOperand(Store, StoreInst::getPointerOperandIndex()))
        return true; // Storing addr, not into addr.
      continue;
    }

    if (auto *RMW = dyn_cast<AtomicRMWInst>(UserI)) {
      if (!RecordIfPointerOperand(RMW, AtomicRMWInst::getPointerOperandIndex()))
        return true; // Storing addr, not into addr.
      continue;
    }

    if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(UserI)) {
      if (!RecordIfPointerOperand(CmpX,
                                  AtomicCmpXchgInst::getPointerOperandIndex()))
        return true; // Storing addr, not into addr.
      continue;
    }

    if (auto *Call = dyn_cast<CallInst>(UserI)) {
      // If this is a cold call, we can sink the addressing calculation into
      // the cold path.  See optimizeCallInst
      if (!OptSize && Call->hasFnAttr(Attribute::Cold))
        continue;

      // Only inline asm using the value as a memory operand is acceptable.
      auto *Asm = dyn_cast<InlineAsm>(Call->getCalledValue());
      if (!Asm || !IsOperandAMemoryOperand(Call, Asm, I, TLI, TRI))
        return true;
      continue;
    }

    // Any other user: look at its own uses.
    const bool FoundUnfoldable = FindAllMemoryUses(
        UserI, MemoryUses, ConsideredInsts, TLI, TRI, SeenInsts);
    if (FoundUnfoldable)
      return true;
  }

  return false;
}
/// Tell whether \p Val is already known to be live at the memory instruction
/// being folded into, in which case including it in the addressing mode is
/// free. \p KnownLive1 and \p KnownLive2 are two values already known to be
/// live at that instruction.
bool AddressingModeMatcher::valueAlreadyLiveAtInst(Value *Val,Value *KnownLive1,
                                                   Value *KnownLive2) {
  // A null value, or one of the two known-live values, is trivially live.
  const bool IsKnownLive =
      !Val || Val == KnownLive1 || Val == KnownLive2;
  if (IsKnownLive)
    return true;

  // Anything that is neither an instruction nor an argument (e.g. a constant)
  // is always live.
  if (!(isa<Instruction>(Val) || isa<Argument>(Val)))
    return true;

  // A static alloca is just a reference to the stack/frame pointer, which is
  // live for the whole function.
  const auto *Alloca = dyn_cast<AllocaInst>(Val);
  if (Alloca && Alloca->isStaticAlloca())
    return true;

  // Otherwise the value counts as live if it is already used in the block of
  // the memory instruction: it is then at least live into that block, so
  // folding it is reasonable.
  BasicBlock *MemBlock = MemoryInst->getParent();
  return Val->isUsedInBasicBlock(MemBlock);
}
/// It is possible for the addressing mode of the machine to fold the specified
/// instruction into a load or store that ultimately uses it.
/// However, the specified instruction has multiple uses.
/// Given this, it may actually increase register pressure to fold it
/// into the load. For example, consider this code:
///
/// X = ...
/// Y = X+1
/// use(Y) -> nonload/store
/// Z = Y+1
/// load Z
///
/// In this case, Y has multiple uses, and can be folded into the load of Z
/// (yielding load [X+2]). However, doing this will cause both "X" and "X+1" to
/// be live at the use(Y) line. If we don't fold Y into load Z, we use one
/// fewer register. Since Y can't be folded into "use(Y)" we don't increase the
/// number of computations either.
///
/// Note that this (like most of CodeGenPrepare) is just a rough heuristic. If
/// X was live across 'load Z' for other reasons, we actually *would* want to
/// fold the addressing mode in the Z case. This would make Y die earlier.
bool AddressingModeMatcher::
isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
                                     ExtAddrMode &AMAfter) {
  // When profitability is being ignored, every fold is accepted.
  if (IgnoreProfitability)
    return true;

  // AMBefore describes the addressing mode prior to folding I, AMAfter the
  // mode once I has been folded. The registers that AMAfter references but
  // AMBefore does not are the values whose lifetime this fold extends. Only
  // the base and the scaled register can be such values (global addresses and
  // folded immediates are always available).
  Value *NewBase = AMAfter.BaseReg;
  Value *NewScaled = AMAfter.ScaledReg;

  // Anything the previous addressing mode already referenced did not get its
  // lifetime extended by this instruction.
  if (valueAlreadyLiveAtInst(NewBase, AMBefore.BaseReg, AMBefore.ScaledReg))
    NewBase = nullptr;
  if (valueAlreadyLiveAtInst(NewScaled, AMBefore.BaseReg, AMBefore.ScaledReg))
    NewScaled = nullptr;

  // No live range was extended by folding this instruction (and its
  // subexpressions), so the fold is fine.
  const bool ExtendsLiveRange = NewBase || NewScaled;
  if (!ExtendsLiveRange)
    return true;

  // If every use of I can absorb the addressing mode, we at worst trade one
  // live register for another. Folding into the use is simply a nice way of
  // sinking the computation.
  SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;
  SmallPtrSet<Instruction*, 16> ConsideredInsts;
  if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI))
    return false;  // Has a non-memory, non-foldable use!

  // All uses are part of computation chains that could theoretically be
  // folded into a memory use. Now check, per memory use, whether the fold
  // would *actually* happen. Addressing modes are assumed to be cheap enough
  // that duplicating the computation is worthwhile, even on a fast path. For
  // sinking candidates (i.e. cold call sites) this also limits code growth,
  // since most architectures have a small and fast way to compute an
  // effective address (e.g. LEA on x86).
  SmallVector<Instruction*, 32> MatchedAddrModeInsts;
  for (const std::pair<Instruction *, unsigned> &MemUse : MemoryUses) {
    // Determine the access type of this use. Without a pointer operand we
    // cannot tell what is being accessed.
    Value *Address = MemUse.first->getOperand(MemUse.second);
    auto *AddrTy = dyn_cast<PointerType>(Address->getType());
    if (!AddrTy)
      return false;
    Type *AddressAccessTy = AddrTy->getElementType();
    unsigned AS = AddrTy->getAddressSpace();

    // Match against the root of this address with profitability disabled to
    // learn whether the memory operation's addressing mode really covers the
    // shared instruction.
    ExtAddrMode Result;
    std::pair<AssertingVH<GetElementPtrInst>, int64_t> LargeOffsetGEP(nullptr,
                                                                      0);
    TypePromotionTransaction::ConstRestorationPt LastKnownGood =
        TPT.getRestorationPoint();
    AddressingModeMatcher Matcher(
        MatchedAddrModeInsts, TLI, TRI, AddressAccessTy, AS, MemoryInst, Result,
        InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP);
    Matcher.IgnoreProfitability = true;
    bool Success = Matcher.matchAddr(Address, 0);
    (void)Success;
    assert(Success && "Couldn't select *anything*?");

    // This match was only a profitability probe; its changes do not belong to
    // the original matcher and must be undone so that matcher keeps a
    // consistent state.
    TPT.rollback(LastKnownGood);

    // A match that did not cover I means I would not be shared by this use.
    if (!is_contained(MatchedAddrModeInsts, I))
      return false;

    MatchedAddrModeInsts.clear();
  }

  return true;
}
/// Return true if \p V is an instruction whose parent basic block is not
/// \p BB. Values that are not instructions are never reported as non-local.
static bool IsNonLocalValue(Value *V, BasicBlock *BB) {
  auto *Inst = dyn_cast<Instruction>(V);
  return Inst && Inst->getParent() != BB;
}
/// Sink addressing mode computation immediately before MemoryInst if doing so
/// can be done without increasing register pressure. The need for the
/// register pressure constraint means this can end up being an all or nothing
/// decision for all uses of the same addressing computation.
///
/// Load and Store Instructions often have addressing modes that can do
/// significant amounts of computation. As such, instruction selection will try
/// to get the load or store to do as much computation as possible for the
/// program. The problem is that isel can only see within a single block. As
/// such, we sink as much legal addressing mode work into the block as possible.
///
/// This method is used to optimize both load/store and inline asms with memory
/// operands. It's also used to sink addressing computations feeding into cold
/// call sites into their (cold) basic block.
///
/// The motivation for handling sinking into cold blocks is that doing so can
/// both enable other address mode sinking (by satisfying the register pressure
/// constraint above), and reduce register pressure globally (by removing the
/// addressing mode computation from the fast path entirely.).
///
/// \param MemoryInst the instruction whose use of \p Addr is rewritten.
/// \param Addr the address value currently used by \p MemoryInst.
/// \param AccessTy the access type handed to the addressing mode matcher.
/// \param AddrSpace the address space handed to the addressing mode matcher.
/// \return true if the use of \p Addr in \p MemoryInst was replaced by a sunk
/// address; false if the function bailed out before replacing it.
bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
Type *AccessTy, unsigned AddrSpace) {
// Repl is the value that gets replaced in MemoryInst at the end, and that is
// recursively deleted afterwards if it has no uses left.
Value *Repl = Addr;
// Try to collapse single-value PHI nodes. This is necessary to undo
// unprofitable PRE transformations.
SmallVector<Value*, 8> worklist;
SmallPtrSet<Value*, 16> Visited;
worklist.push_back(Addr);
// Use a worklist to iteratively look through PHI and select nodes, and
// ensure that the addressing mode obtained from the non-PHI/select roots of
// the graph are compatible.
bool PhiOrSelectSeen = false;
SmallVector<Instruction*, 16> AddrModeInsts;
const SimplifyQuery SQ(*DL, TLInfo);
AddressingModeCombiner AddrModes(SQ, Addr);
// Changes made while matching are recorded in TPT so that they can be rolled
// back to LastKnownGood if the collected addressing modes cannot be combined.
TypePromotionTransaction TPT(RemovedInsts);
TypePromotionTransaction::ConstRestorationPt LastKnownGood =
TPT.getRestorationPoint();
while (!worklist.empty()) {
Value *V = worklist.back();
worklist.pop_back();
// We allow traversing cyclic Phi nodes.
// In case of success after this loop we ensure that traversing through
// Phi nodes ends up with all cases to compute address of the form
// BaseGV + Base + Scale * Index + Offset
// where Scale and Offset are constants and BaseGV, Base and Index
// are exactly the same Values in all cases.
// It means that BaseGV, Scale and Offset dominate our memory instruction
// and have the same value as they had in address computation represented
// as Phi. So we can safely sink address computation to memory instruction.
if (!Visited.insert(V).second)
continue;
// For a PHI node, push all of its incoming values.
if (PHINode *P = dyn_cast<PHINode>(V)) {
for (Value *IncValue : P->incoming_values())
worklist.push_back(IncValue);
PhiOrSelectSeen = true;
continue;
}
// Similar for select.
if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
worklist.push_back(SI->getFalseValue());
worklist.push_back(SI->getTrueValue());
PhiOrSelectSeen = true;
continue;
}
// For non-PHIs, determine the addressing mode being computed. Note that
// the result may differ depending on what other uses our candidate
// addressing instructions might have.
AddrModeInsts.clear();
std::pair<AssertingVH<GetElementPtrInst>, int64_t> LargeOffsetGEP(nullptr,
0);
ExtAddrMode NewAddrMode = AddressingModeMatcher::Match(
V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *TRI,
InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP);
GetElementPtrInst *GEP = LargeOffsetGEP.first;
if (GEP && !NewGEPBases.count(GEP)) {
// If splitting the underlying data structure can reduce the offset of a
// GEP, collect the GEP. Skip the GEPs that are the new bases of
// previously split data structures.
LargeOffsetGEPMap[GEP->getPointerOperand()].push_back(LargeOffsetGEP);
// Give each collected GEP an ID the first time it is seen.
if (LargeOffsetGEPID.find(GEP) == LargeOffsetGEPID.end())
LargeOffsetGEPID[GEP] = LargeOffsetGEPID.size();
}
NewAddrMode.OriginalValue = V;
// Stop traversing as soon as the combiner rejects an addressing mode.
if (!AddrModes.addNewAddrMode(NewAddrMode))
break;
}
// Try to combine the AddrModes we've collected. If we couldn't collect any,
// or we have multiple but either couldn't combine them or combining them
// wouldn't do anything useful, bail out now.
if (!AddrModes.combineAddrModes()) {
TPT.rollback(LastKnownGood);
return false;
}
TPT.commit();
// Get the combined AddrMode (or the only AddrMode, if we only had one).
ExtAddrMode AddrMode = AddrModes.getAddrMode();
// If all the instructions matched are already in this BB, don't do anything.
// If we saw a Phi node then it is not local definitely, and if we saw a select
// then we want to push the address calculation past it even if it's already
// in this BB.
if (!PhiOrSelectSeen && none_of(AddrModeInsts, [&](Value *V) {
return IsNonLocalValue(V, MemoryInst->getParent());
})) {
LLVM_DEBUG(dbgs() << "CGP: Found local addrmode: " << AddrMode
<< "\n");
return false;
}
// Insert this computation right after this user. Since our caller is
// scanning from the top of the BB to the bottom, reuse of the expr are
// guaranteed to happen later.
IRBuilder<> Builder(MemoryInst);
// Now that we determined the addressing expression we want to use and know
// that we have to sink it into this block. Check to see if we have already
// done this for some other load/store instr in this block. If so, reuse
// the computation. Before attempting reuse, check if the address is valid
// as it may have been erased.
WeakTrackingVH SunkAddrVH = SunkAddrs[Addr];
Value * SunkAddr = SunkAddrVH.pointsToAliveValue() ? SunkAddrVH : nullptr;
if (SunkAddr) {
LLVM_DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode
<< " for " << *MemoryInst << "\n");
if (SunkAddr->getType() != Addr->getType())
SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());
} else if (AddrSinkUsingGEPs ||
(!AddrSinkUsingGEPs.getNumOccurrences() && TM && TTI->useAA())) {
// By default, we use the GEP-based method when AA is used later. This
// prevents new inttoptr/ptrtoint pairs from degrading AA capabilities.
LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode
<< " for " << *MemoryInst << "\n");
Type *IntPtrTy = DL->getIntPtrType(Addr->getType());
// ResultPtr is the pointer the GEP is based on; ResultIndex accumulates the
// integer byte offset applied to it.
Value *ResultPtr = nullptr, *ResultIndex = nullptr;
// First, find the pointer.
if (AddrMode.BaseReg && AddrMode.BaseReg->getType()->isPointerTy()) {
ResultPtr = AddrMode.BaseReg;
AddrMode.BaseReg = nullptr;
}
if (AddrMode.Scale && AddrMode.ScaledReg->getType()->isPointerTy()) {
// We can't add more than one pointer together, nor can we scale a
// pointer (both of which seem meaningless).
if (ResultPtr || AddrMode.Scale != 1)
return false;
ResultPtr = AddrMode.ScaledReg;
AddrMode.Scale = 0;
}
// It is only safe to sign extend the BaseReg if we know that the math
// required to create it did not overflow before we extend it. Since
// the original IR value was tossed in favor of a constant back when
// the AddrMode was created we need to bail out gracefully if widths
// do not match instead of extending it.
//
// (See below for code to add the scale.)
// NOTE(review): the comment above talks about BaseReg, but the check below
// compares the width of ScaledReg against IntPtrTy - confirm the wording.
if (AddrMode.Scale) {
Type *ScaledRegTy = AddrMode.ScaledReg->getType();
if (cast<IntegerType>(IntPtrTy)->getBitWidth() >
cast<IntegerType>(ScaledRegTy)->getBitWidth())
return false;
}
if (AddrMode.BaseGV) {
// A global base cannot be combined with another pointer base.
if (ResultPtr)
return false;
ResultPtr = AddrMode.BaseGV;
}
// If the real base value actually came from an inttoptr, then the matcher
// will look through it and provide only the integer value. In that case,
// use it here.
if (!DL->isNonIntegralPointerType(Addr->getType())) {
if (!ResultPtr && AddrMode.BaseReg) {
ResultPtr = Builder.CreateIntToPtr(AddrMode.BaseReg, Addr->getType(),
"sunkaddr");
AddrMode.BaseReg = nullptr;
} else if (!ResultPtr && AddrMode.Scale == 1) {
ResultPtr = Builder.CreateIntToPtr(AddrMode.ScaledReg, Addr->getType(),
"sunkaddr");
AddrMode.Scale = 0;
}
}
if (!ResultPtr &&
!AddrMode.BaseReg && !AddrMode.Scale && !AddrMode.BaseOffs) {
// Nothing is left in the addressing mode: the address is a null pointer.
SunkAddr = Constant::getNullValue(Addr->getType());
} else if (!ResultPtr) {
// There is no pointer to base a GEP on.
return false;
} else {
Type *I8PtrTy =
Builder.getInt8PtrTy(Addr->getType()->getPointerAddressSpace());
Type *I8Ty = Builder.getInt8Ty();
// Start with the base register. Do this first so that subsequent address
// matching finds it last, which will prevent it from trying to match it
// as the scaled value in case it happens to be a mul. That would be
// problematic if we've sunk a different mul for the scale, because then
// we'd end up sinking both muls.
if (AddrMode.BaseReg) {
Value *V = AddrMode.BaseReg;
if (V->getType() != IntPtrTy)
V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr");
ResultIndex = V;
}
// Add the scale value.
if (AddrMode.Scale) {
Value *V = AddrMode.ScaledReg;
if (V->getType() == IntPtrTy) {
// done.
} else {
// Narrower scaled registers were rejected by the width check above, so
// only truncation remains.
assert(cast<IntegerType>(IntPtrTy)->getBitWidth() <
cast<IntegerType>(V->getType())->getBitWidth() &&
"We can't transform if ScaledReg is too narrow");
V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr");
}
if (AddrMode.Scale != 1)
V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale),
"sunkaddr");
if (ResultIndex)
ResultIndex = Builder.CreateAdd(ResultIndex, V, "sunkaddr");
else
ResultIndex = V;
}
// Add in the Base Offset if present.
if (AddrMode.BaseOffs) {
Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);
if (ResultIndex) {
// We need to add this separately from the scale above to help with
// SDAG consecutive load/store merging.
if (ResultPtr->getType() != I8PtrTy)
ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy);
ResultPtr =
AddrMode.InBounds
? Builder.CreateInBoundsGEP(I8Ty, ResultPtr, ResultIndex,
"sunkaddr")
: Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr");
}
ResultIndex = V;
}
if (!ResultIndex) {
SunkAddr = ResultPtr;
} else {
if (ResultPtr->getType() != I8PtrTy)
ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy);
SunkAddr =
AddrMode.InBounds
? Builder.CreateInBoundsGEP(I8Ty, ResultPtr, ResultIndex,
"sunkaddr")
: Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr");
}
// Cast back to the type of the address being replaced.
if (SunkAddr->getType() != Addr->getType())
SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());
}
} else {
// We'd require a ptrtoint/inttoptr down the line, which we can't do for
// non-integral pointers, so in that case bail out now.
Type *BaseTy = AddrMode.BaseReg ? AddrMode.BaseReg->getType() : nullptr;
Type *ScaleTy = AddrMode.Scale ? AddrMode.ScaledReg->getType() : nullptr;
PointerType *BasePtrTy = dyn_cast_or_null<PointerType>(BaseTy);
PointerType *ScalePtrTy = dyn_cast_or_null<PointerType>(ScaleTy);
if (DL->isNonIntegralPointerType(Addr->getType()) ||
(BasePtrTy && DL->isNonIntegralPointerType(BasePtrTy)) ||
(ScalePtrTy && DL->isNonIntegralPointerType(ScalePtrTy)) ||
(AddrMode.BaseGV &&
DL->isNonIntegralPointerType(AddrMode.BaseGV->getType())))
return false;
LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode
<< " for " << *MemoryInst << "\n");
Type *IntPtrTy = DL->getIntPtrType(Addr->getType());
// Result accumulates the address as an integer of type IntPtrTy.
Value *Result = nullptr;
// Start with the base register. Do this first so that subsequent address
// matching finds it last, which will prevent it from trying to match it
// as the scaled value in case it happens to be a mul. That would be
// problematic if we've sunk a different mul for the scale, because then
// we'd end up sinking both muls.
if (AddrMode.BaseReg) {
Value *V = AddrMode.BaseReg;
if (V->getType()->isPointerTy())
V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr");
if (V->getType() != IntPtrTy)
V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr");
Result = V;
}
// Add the scale value.
if (AddrMode.Scale) {
Value *V = AddrMode.ScaledReg;
if (V->getType() == IntPtrTy) {
// done.
} else if (V->getType()->isPointerTy()) {
V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr");
} else if (cast<IntegerType>(IntPtrTy)->getBitWidth() <
cast<IntegerType>(V->getType())->getBitWidth()) {
V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr");
} else {
// It is only safe to sign extend the BaseReg if we know that the math
// required to create it did not overflow before we extend it. Since
// the original IR value was tossed in favor of a constant back when
// the AddrMode was created we need to bail out gracefully if widths
// do not match instead of extending it.
// Before bailing out, erase the cast instruction created for the base
// register above (if one was created).
Instruction *I = dyn_cast_or_null<Instruction>(Result);
if (I && (Result != AddrMode.BaseReg))
I->eraseFromParent();
return false;
}
if (AddrMode.Scale != 1)
V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale),
"sunkaddr");
if (Result)
Result = Builder.CreateAdd(Result, V, "sunkaddr");
else
Result = V;
}
// Add in the BaseGV if present.
if (AddrMode.BaseGV) {
Value *V = Builder.CreatePtrToInt(AddrMode.BaseGV, IntPtrTy, "sunkaddr");
if (Result)
Result = Builder.CreateAdd(Result, V, "sunkaddr");
else
Result = V;
}
// Add in the Base Offset if present.
if (AddrMode.BaseOffs) {
Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);
if (Result)
Result = Builder.CreateAdd(Result, V, "sunkaddr");
else
Result = V;
}
if (!Result)
SunkAddr = Constant::getNullValue(Addr->getType());
else
SunkAddr = Builder.CreateIntToPtr(Result, Addr->getType(), "sunkaddr");
}
// Make MemoryInst use the sunk address instead of the original one.
MemoryInst->replaceUsesOfWith(Repl, SunkAddr);
// Store the newly computed address into the cache. In the case we reused a
// value, this should be idempotent.
SunkAddrs[Addr] = WeakTrackingVH(SunkAddr);
// If we have no uses, recursively delete the value and all dead instructions
// using it.
if (Repl->use_empty()) {
// This can cause recursive deletion, which can invalidate our iterator.
// Use a WeakTrackingVH to hold onto it in case this happens.
Value *CurValue = &*CurInstIterator;
WeakTrackingVH IterHandle(CurValue);
BasicBlock *BB = CurInstIterator->getParent();
RecursivelyDeleteTriviallyDeadInstructions(Repl, TLInfo);
if (IterHandle != CurValue) {
// If the iterator instruction was recursively deleted, start over at the
// start of the block.
CurInstIterator = BB->begin();
SunkAddrs.clear();
}
}
++NumMemoryInsts;
return true;
}
/// Walk the constraints of the inline asm call \p CS and, for every indirect
/// memory operand, run optimizeMemoryInst to sink its address computation
/// into the block when possible / profitable.
///
/// \return true if any operand was changed.
bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) {
  const TargetRegisterInfo *TRI =
      TM->getSubtargetImpl(*CS->getFunction())->getRegisterInfo();
  TargetLowering::AsmOperandInfoVector TargetConstraints =
      TLI->ParseConstraints(*DL, TRI, CS);

  bool MadeChange = false;
  // Index of the call argument that corresponds to the current constraint.
  unsigned ArgNo = 0;
  for (TargetLowering::AsmOperandInfo &OpInfo : TargetConstraints) {
    // Compute the constraint code and ConstraintType to use.
    TLI->ComputeConstraintToUse(OpInfo, SDValue());

    const bool IsIndirectMemory =
        OpInfo.ConstraintType == TargetLowering::C_Memory && OpInfo.isIndirect;
    if (IsIndirectMemory) {
      Value *OpVal = CS->getArgOperand(ArgNo);
      ++ArgNo;
      if (optimizeMemoryInst(CS, OpVal, OpVal->getType(), ~0u))
        MadeChange = true;
    } else if (OpInfo.Type == InlineAsm::isInput) {
      // Plain inputs consume an argument slot but are not rewritten.
      ++ArgNo;
    }
  }

  return MadeChange;
}
/// Check whether every user of \p Val is an equivalent (or free) zero or sign
/// extension. \p Val must have at least one use.
static bool hasSameExtUse(Value *Val, const TargetLowering &TLI) {
  assert(!Val->use_empty() && "Input must have at least one use");

  // The first user fixes the extension kind and the reference result type.
  const Instruction *FirstUser = cast<Instruction>(*Val->user_begin());
  const bool IsSExt = isa<SExtInst>(FirstUser);
  Type *const ExtTy = FirstUser->getType();

  for (const User *U : Val->users()) {
    const Instruction *UI = cast<Instruction>(U);

    // Every user has to be the same kind of extension as the first one.
    const bool SameKind = IsSExt ? isa<SExtInst>(UI) : isa<ZExtInst>(UI);
    if (!SameKind)
      return false;

    // Same input and output types: Same instruction after CSE.
    Type *CurTy = UI->getType();
    if (CurTy == ExtTy)
      continue;

    // For sign extensions we would be in this situation:
    //   a = Val
    //   b = sext ty1 a to ty2
    //   c = sext ty1 a to ty3
    // Assuming ty2 is shorter than ty3, this could be turned into:
    //   a = Val
    //   b = sext ty1 a to ty2
    //   c = sext ty2 b to ty3
    // However, the last sext is not free.
    if (IsSExt)
      return false;

    // This is a ZExt; extending from one type to the other may be free, in
    // which case it does not count as a different use.
    const bool ExtTyIsWider = ExtTy->getScalarType()->getIntegerBitWidth() >
                              CurTy->getScalarType()->getIntegerBitWidth();
    Type *NarrowTy = ExtTyIsWider ? CurTy : ExtTy;
    Type *LargeTy = ExtTyIsWider ? ExtTy : CurTy;
    if (!TLI.isZExtFree(NarrowTy, LargeTy))
      return false;
  }

  // All uses are the same or can be derived from one another for free.
  return true;
}
/// Try to speculatively promote extensions in \p Exts and continue
/// promoting through newly promoted operands recursively as far as doing so is
/// profitable. Save extensions profitably moved up, in \p ProfitablyMovedExts.
/// When some promotion happened, \p TPT contains the proper state to revert
/// them.
///
/// \param TPT transaction recording the promotions; unprofitable ones are
/// rolled back through it.
/// \param Exts the extensions to try to promote.
/// \param ProfitablyMovedExts output list; every extension of \p Exts that is
/// processed ends up represented here, either by itself or by the extensions
/// that resulted from promoting it.
/// \param CreatedInstsCost cost of the instructions created so far along this
/// promotion chain (presumably defaulted to 0 in the declaration, since one
/// caller omits it - TODO confirm).
///
/// \return true if some promotion happened, false otherwise.
bool CodeGenPrepare::tryToPromoteExts(
TypePromotionTransaction &TPT, const SmallVectorImpl<Instruction *> &Exts,
SmallVectorImpl<Instruction *> &ProfitablyMovedExts,
unsigned CreatedInstsCost) {
bool Promoted = false;
// Iterate over all the extensions to try to promote them.
for (auto I : Exts) {
// Early check if we directly have ext(load).
if (isa<LoadInst>(I->getOperand(0))) {
ProfitablyMovedExts.push_back(I);
continue;
}
// Check whether or not we want to do any promotion. The reason we have
// this check inside the for loop is to catch the case where an extension
// is directly fed by a load because in such case the extension can be moved
// up without any promotion on its operands.
if (!TLI || !TLI->enableExtLdPromotion() || DisableExtLdPromotion)
return false;
// Get the action to perform the promotion.
TypePromotionHelper::Action TPH =
TypePromotionHelper::getAction(I, InsertedInsts, *TLI, PromotedInsts);
// Check if we can promote.
if (!TPH) {
// Save the current extension as we cannot move up through its operand.
ProfitablyMovedExts.push_back(I);
continue;
}
// Save the current state.
TypePromotionTransaction::ConstRestorationPt LastKnownGood =
TPT.getRestorationPoint();
SmallVector<Instruction *, 4> NewExts;
unsigned NewCreatedInstsCost = 0;
// The extension itself counts as a cost of 1 unless the target reports it
// as free.
unsigned ExtCost = !TLI->isExtFree(I);
// Promote.
Value *PromotedVal = TPH(I, TPT, PromotedInsts, NewCreatedInstsCost,
&NewExts, nullptr, *TLI);
assert(PromotedVal &&
"TypePromotionHelper should have filtered out those cases");
// We would be able to merge only one extension in a load.
// Therefore, if we have more than 1 new extension we heuristically
// cut this search path, because it means we degrade the code quality.
// With exactly 2, the transformation is neutral, because we will merge
// one extension but leave one. However, we optimistically keep going,
// because the new extension may be removed too.
long long TotalCreatedInstsCost = CreatedInstsCost + NewCreatedInstsCost;
// FIXME: It would be possible to propagate a negative value instead of
// conservatively ceiling it to 0.
TotalCreatedInstsCost =
std::max((long long)0, (TotalCreatedInstsCost - ExtCost));
if (!StressExtLdPromotion &&
(TotalCreatedInstsCost > 1 ||
!isPromotedInstructionLegal(*TLI, *DL, PromotedVal))) {
// This promotion is not profitable, rollback to the previous state, and
// save the current extension in ProfitablyMovedExts as the latest
// speculative promotion turned out to be unprofitable.
TPT.rollback(LastKnownGood);
ProfitablyMovedExts.push_back(I);
continue;
}
// Continue promoting NewExts as far as doing so is profitable.
// The recursive call's return value is ignored; profitability is decided
// below from the extensions it reports in NewlyMovedExts.
SmallVector<Instruction *, 2> NewlyMovedExts;
(void)tryToPromoteExts(TPT, NewExts, NewlyMovedExts, TotalCreatedInstsCost);
bool NewPromoted = false;
for (auto ExtInst : NewlyMovedExts) {
Instruction *MovedExt = cast<Instruction>(ExtInst);
Value *ExtOperand = MovedExt->getOperand(0);
// If we have reached to a load, we need this extra profitability check
// as it could potentially be merged into an ext(load).
if (isa<LoadInst>(ExtOperand) &&
!(StressExtLdPromotion || NewCreatedInstsCost <= ExtCost ||
(ExtOperand->hasOneUse() || hasSameExtUse(ExtOperand, *TLI))))
continue;
ProfitablyMovedExts.push_back(MovedExt);
NewPromoted = true;
}
// If none of speculative promotions for NewExts is profitable, rollback
// and save the current extension (I) as the last profitable extension.
if (!NewPromoted) {
TPT.rollback(LastKnownGood);
ProfitablyMovedExts.push_back(I);
continue;
}
// The promotion is profitable.
Promoted = true;
}
return Promoted;
}
/// Merge redundant sexts of the same value when one dominates the other: the
/// dominated sext has its uses redirected to the dominating one and is
/// unlinked from its block.
///
/// \return true if at least one sext was merged.
bool CodeGenPrepare::mergeSExts(Function &F) {
  bool Changed = false;

  for (auto &Entry : ValToSExtendedUses) {
    SExts &Candidates = Entry.second;
    // Surviving sexts seen so far for this value; none dominates another.
    SExts CurPts;

    for (Instruction *Inst : Candidates) {
      // Only live sexts that still extend the recorded value are considered.
      const bool Eligible = !RemovedInsts.count(Inst) && isa<SExtInst>(Inst) &&
                            Inst->getOperand(0) == Entry.first;
      if (!Eligible)
        continue;

      bool inserted = false;
      for (auto &Pt : CurPts) {
        if (getDT(F).dominates(Inst, Pt)) {
          // Inst dominates the recorded sext: Inst takes its place.
          Pt->replaceAllUsesWith(Inst);
          RemovedInsts.insert(Pt);
          Pt->removeFromParent();
          Pt = Inst;
        } else if (getDT(F).dominates(Pt, Inst)) {
          // The recorded sext dominates Inst: Inst becomes redundant.
          Inst->replaceAllUsesWith(Pt);
          RemovedInsts.insert(Inst);
          Inst->removeFromParent();
        } else {
          // Give up if we need to merge in a common dominator as the
          // experiments show it is not profitable.
          continue;
        }
        inserted = true;
        Changed = true;
        break;
      }

      // Not related by dominance to any recorded sext: record it.
      if (!inserted)
        CurPts.push_back(Inst);
    }
  }

  return Changed;
}
// Splitting large data structures so that the GEPs accessing them can have
// smaller offsets so that they can be sunk to the same blocks as their users.
// For example, a large struct starting from %base is split into two parts
// where the second part starts from %new_base.
//
// Before:
// BB0:
// %base =
//
// BB1:
// %gep0 = gep %base, off0
// %gep1 = gep %base, off1
// %gep2 = gep %base, off2
//
// BB2:
// %load1 = load %gep0
// %load2 = load %gep1
// %load3 = load %gep2
//
// After:
// BB0:
// %base =
// %new_base = gep %base, off0
//
// BB1:
// %new_gep0 = %new_base
// %new_gep1 = gep %new_base, off1 - off0
// %new_gep2 = gep %new_base, off2 - off0
//
// BB2:
// %load1 = load i32, i32* %new_gep0
// %load2 = load i32, i32* %new_gep1
// %load3 = load i32, i32* %new_gep2
//
// %new_gep1 and %new_gep2 can be sunk to BB2 now after the splitting because
// their offsets are small enough to fit into the addressing mode.
//
// Returns true if at least one GEP was replaced.
bool CodeGenPrepare::splitLargeGEPOffsets() {
bool Changed = false;
for (auto &Entry : LargeOffsetGEPMap) {
Value *OldBase = Entry.first;
SmallVectorImpl<std::pair<AssertingVH<GetElementPtrInst>, int64_t>>
&LargeOffsetGEPs = Entry.second;
// Order by offset; GEPs with equal offsets are ordered by the ID recorded
// for them in LargeOffsetGEPID.
auto compareGEPOffset =
[&](const std::pair<GetElementPtrInst *, int64_t> &LHS,
const std::pair<GetElementPtrInst *, int64_t> &RHS) {
if (LHS.first == RHS.first)
return false;
if (LHS.second != RHS.second)
return LHS.second < RHS.second;
return LargeOffsetGEPID[LHS.first] < LargeOffsetGEPID[RHS.first];
};
// Sorting all the GEPs of the same data structures based on the offsets.
llvm::sort(LargeOffsetGEPs, compareGEPOffset);
// Drop adjacent duplicate entries left after sorting.
LargeOffsetGEPs.erase(
std::unique(LargeOffsetGEPs.begin(), LargeOffsetGEPs.end()),
LargeOffsetGEPs.end());
// Skip if all the GEPs have the same offsets.
if (LargeOffsetGEPs.front().second == LargeOffsetGEPs.back().second)
continue;
// The GEP with the smallest offset provides the first base.
GetElementPtrInst *BaseGEP = LargeOffsetGEPs.begin()->first;
int64_t BaseOffset = LargeOffsetGEPs.begin()->second;
Value *NewBaseGEP = nullptr;
auto LargeOffsetGEP = LargeOffsetGEPs.begin();
while (LargeOffsetGEP != LargeOffsetGEPs.end()) {
GetElementPtrInst *GEP = LargeOffsetGEP->first;
int64_t Offset = LargeOffsetGEP->second;
if (Offset != BaseOffset) {
TargetLowering::AddrMode AddrMode;
AddrMode.BaseOffs = Offset - BaseOffset;
// The result type of the GEP might not be the type of the memory
// access.
if (!TLI->isLegalAddressingMode(*DL, AddrMode,
GEP->getResultElementType(),
GEP->getAddressSpace())) {
// We need to create a new base if the offset to the current base is
// too large to fit into the addressing mode. So, a very large struct
// may be split into several parts.
BaseGEP = GEP;
BaseOffset = Offset;
NewBaseGEP = nullptr;
}
}
// Generate a new GEP to replace the current one.
LLVMContext &Ctx = GEP->getContext();
Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
Type *I8PtrTy =
Type::getInt8PtrTy(Ctx, GEP->getType()->getPointerAddressSpace());
Type *I8Ty = Type::getInt8Ty(Ctx);
if (!NewBaseGEP) {
// Create a new base if we don't have one yet. Find the insertion
// pointer for the new base first.
BasicBlock::iterator NewBaseInsertPt;
BasicBlock *NewBaseInsertBB;
if (auto *BaseI = dyn_cast<Instruction>(OldBase)) {
// If the base of the struct is an instruction, the new base will be
// inserted close to it.
NewBaseInsertBB = BaseI->getParent();
if (isa<PHINode>(BaseI))
NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
else if (InvokeInst *Invoke = dyn_cast<InvokeInst>(BaseI)) {
// For an invoke, split the edge to its normal destination and insert
// the new base into the block created on that edge.
NewBaseInsertBB =
SplitEdge(NewBaseInsertBB, Invoke->getNormalDest());
NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
} else
NewBaseInsertPt = std::next(BaseI->getIterator());
} else {
// If the current base is an argument or global value, the new base
// will be inserted to the entry block.
NewBaseInsertBB = &BaseGEP->getFunction()->getEntryBlock();
NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
}
IRBuilder<> NewBaseBuilder(NewBaseInsertBB, NewBaseInsertPt);
// Create a new base.
Value *BaseIndex = ConstantInt::get(IntPtrTy, BaseOffset);
NewBaseGEP = OldBase;
if (NewBaseGEP->getType() != I8PtrTy)
NewBaseGEP = NewBaseBuilder.CreatePointerCast(NewBaseGEP, I8PtrTy);
NewBaseGEP =
NewBaseBuilder.CreateGEP(I8Ty, NewBaseGEP, BaseIndex, "splitgep");
// Remember the new base in NewGEPBases.
NewGEPBases.insert(NewBaseGEP);
}
IRBuilder<> Builder(GEP);
Value *NewGEP = NewBaseGEP;
if (Offset == BaseOffset) {
// This GEP addresses the new base itself; only a cast may be needed.
if (GEP->getType() != I8PtrTy)
NewGEP = Builder.CreatePointerCast(NewGEP, GEP->getType());
} else {
// Calculate the new offset for the new GEP.
Value *Index = ConstantInt::get(IntPtrTy, Offset - BaseOffset);
NewGEP = Builder.CreateGEP(I8Ty, NewBaseGEP, Index);
if (GEP->getType() != I8PtrTy)
NewGEP = Builder.CreatePointerCast(NewGEP, GEP->getType());
}
GEP->replaceAllUsesWith(NewGEP);
// Drop the bookkeeping entries for the old GEP before erasing it; the
// iterator returned by erase() is the next element to process.
LargeOffsetGEPID.erase(GEP);
LargeOffsetGEP = LargeOffsetGEPs.erase(LargeOffsetGEP);
GEP->eraseFromParent();
Changed = true;
}
}
return Changed;
}
/// Return true, if an ext(load) can be formed from an extension in
/// \p MovedExts.
bool CodeGenPrepare::canFormExtLd(
const SmallVectorImpl<Instruction *> &MovedExts, LoadInst *&LI,
Instruction *&Inst, bool HasPromoted) {
for (auto *MovedExtInst : MovedExts) {
if (isa<LoadInst>(MovedExtInst->getOperand(0))) {
LI = cast<LoadInst>(MovedExtInst->getOperand(0));
Inst = MovedExtInst;
break;
}
}
if (!LI)
return false;
// If they're already in the same block, there's nothing to do.
// Make the cheap checks first if we did not promote.
// If we promoted, we need to check if it is indeed profitable.
if (!HasPromoted && LI->getParent() == Inst->getParent())
return false;
return TLI->isExtLoad(LI, Inst, *DL);
}
/// Move a zext or sext fed by a load into the same basic block as the load,
/// unless conditions are unfavorable. This allows SelectionDAG to fold the
/// extend into the load.
///
/// E.g.,
/// \code
/// %ld = load i32* %addr
/// %add = add nuw i32 %ld, 4
/// %zext = zext i32 %add to i64
/// \endcode
/// =>
/// \code
/// %ld = load i32* %addr
/// %zext = zext i32 %ld to i64
/// %add = add nuw i64 %zext, 4
/// \endcode
/// Note that the promotion in %add to i64 is done in tryToPromoteExts(), which
/// allow us to match zext(load i32*) to i64.
///
/// Also, try to promote the computations used to obtain a sign extended
/// value used into memory accesses.
/// E.g.,
/// \code
/// a = add nsw i32 b, 3
/// d = sext i32 a to i64
/// e = getelementptr ..., i64 d
/// \endcode
/// =>
/// \code
/// f = sext i32 b to i64
/// a = add nsw i64 f, 3
/// e = getelementptr ..., i64 a
/// \endcode
///
/// \p Inst[in/out] the extension may be modified during the process if some
/// promotions apply.
///
/// \return true if an extension was moved next to its load or address type
/// promotion succeeded; false otherwise, in which case all speculative
/// promotions are rolled back.
bool CodeGenPrepare::optimizeExt(Instruction *&Inst) {
  // ExtLoad formation and address type promotion infrastructure requires TLI to
  // be effective.
  if (!TLI)
    return false;

  bool AllowPromotionWithoutCommonHeader = false;
  // See if it is an interesting sext operations for the address type
  // promotion before trying to promote it, e.g., the ones with the right
  // type and used in memory accesses.
  bool ATPConsiderable = TTI->shouldConsiderAddressTypePromotion(
      *Inst, AllowPromotionWithoutCommonHeader);

  // Promotions are speculative: they are recorded in TPT and either committed
  // or rolled back to LastKnownGood below.
  TypePromotionTransaction TPT(RemovedInsts);
  TypePromotionTransaction::ConstRestorationPt LastKnownGood =
      TPT.getRestorationPoint();
  SmallVector<Instruction *, 1> Exts;
  SmallVector<Instruction *, 2> SpeculativelyMovedExts;
  Exts.push_back(Inst);

  bool HasPromoted = tryToPromoteExts(TPT, Exts, SpeculativelyMovedExts);

  // Look for a load being extended. Both out-parameters start out null so
  // that neither is ever read uninitialized.
  LoadInst *LI = nullptr;
  Instruction *ExtFedByLoad = nullptr;

  // Try to promote a chain of computation if it allows to form an extended
  // load.
  if (canFormExtLd(SpeculativelyMovedExts, LI, ExtFedByLoad, HasPromoted)) {
    assert(LI && ExtFedByLoad && "Expect a valid load and extension");
    TPT.commit();
    // Move the extend into the same block as the load
    ExtFedByLoad->moveAfter(LI);
    // CGP does not check if the zext would be speculatively executed when moved
    // to the same basic block as the load. Preserving its original location
    // would pessimize the debugging experience, as well as negatively impact
    // the quality of sample pgo. We don't want to use "line 0" as that has a
    // size cost in the line-table section and logically the zext can be seen as
    // part of the load. Therefore we conservatively reuse the same debug
    // location for the load and the zext.
    ExtFedByLoad->setDebugLoc(LI->getDebugLoc());
    ++NumExtsMoved;
    Inst = ExtFedByLoad;
    return true;
  }

  // Continue promoting SExts if known as considerable depending on targets.
  if (ATPConsiderable &&
      performAddressTypePromotion(Inst, AllowPromotionWithoutCommonHeader,
                                  HasPromoted, TPT, SpeculativelyMovedExts))
    return true;

  // Nothing profitable was found: undo every speculative promotion.
  TPT.rollback(LastKnownGood);
  return false;
}
// Commit the speculative promotion recorded in \p TPT when it is considered
// profitable, otherwise defer it.
//
// The promotion is committed when another sext chain with the same head value
// has been seen before, or when AllowPromotionWithoutCommonHeader is set and
// exactly one extension was speculatively moved. Otherwise the current chain is
// recorded as "seen but unhandled" and the function returns false without
// committing. On commit, \p Inst is updated to the last speculatively moved
// extension, and previously deferred chains sharing a head are promoted too.
bool CodeGenPrepare::performAddressTypePromotion(
    Instruction *&Inst, bool AllowPromotionWithoutCommonHeader,
    bool HasPromoted, TypePromotionTransaction &TPT,
    SmallVectorImpl<Instruction *> &SpeculativelyMovedExts) {
  // Collect deferred extensions whose head of chain matches one of ours.
  SmallPtrSet<Instruction *, 1> UnhandledExts;
  bool SharesSeenHeader = false;
  for (Instruction *Ext : SpeculativelyMovedExts) {
    auto Seen = SeenChainsForSExt.find(Ext->getOperand(0));
    if (Seen == SeenChainsForSExt.end())
      continue;
    SharesSeenHeader = true;
    // A null entry means the chain was already handled.
    if (Seen->second)
      UnhandledExts.insert(Seen->second);
  }

  const bool PromoteNow =
      SharesSeenHeader || (AllowPromotionWithoutCommonHeader &&
                           SpeculativelyMovedExts.size() == 1);
  if (!PromoteNow) {
    // First chain visited from this header: remember it as unhandled and wait
    // for another sext chain derived from the same header.
    for (Instruction *Ext : SpeculativelyMovedExts)
      SeenChainsForSExt[Ext->getOperand(0)] = Inst;
    return false;
  }

  TPT.commit();
  bool Promoted = HasPromoted;
  for (Instruction *Ext : SpeculativelyMovedExts) {
    Value *Head = Ext->getOperand(0);
    SeenChainsForSExt[Head] = nullptr;
    ValToSExtendedUses[Head].push_back(Ext);
  }
  // Promotion happened, so hand the caller the updated instruction.
  Inst = SpeculativelyMovedExts.pop_back_val();

  // Now promote the chains that were deferred earlier. The set is only ever
  // populated when a shared header was found, so no extra flag check is
  // needed.
  for (Instruction *VisitedSExt : UnhandledExts) {
    if (RemovedInsts.count(VisitedSExt))
      continue;
    TypePromotionTransaction DeferredTPT(RemovedInsts);
    SmallVector<Instruction *, 2> Chains;
    SmallVector<Instruction *, 1> Exts;
    Exts.push_back(VisitedSExt);
    if (tryToPromoteExts(DeferredTPT, Exts, Chains))
      Promoted = true;
    DeferredTPT.commit();
    for (Instruction *Ext : Chains) {
      Value *Head = Ext->getOperand(0);
      // Mark this chain as handled.
      SeenChainsForSExt[Head] = nullptr;
      ValToSExtendedUses[Head].push_back(Ext);
    }
  }
  return Promoted;
}
/// If the result of the {s|z}ext \p I and its source are both live out of the
/// defining block, rewrite the out-of-block uses of the source to use a
/// truncate of the extension result instead.
///
/// At most one truncate is inserted per user block.
/// \return true if any use was rewritten.
bool CodeGenPrepare::optimizeExtUses(Instruction *I) {
  BasicBlock *DefBB = I->getParent();

  // If the result of a {s|z}ext and its source are both live out, rewrite all
  // other uses of the source with result of extension.
  Value *Src = I->getOperand(0);
  if (Src->hasOneUse())
    return false;

  // Only do this xform if truncating is free.
  if (TLI && !TLI->isTruncateFree(I->getType(), Src->getType()))
    return false;

  // Only safe to perform the optimization if the source is also defined in
  // this block.
  if (!isa<Instruction>(Src) || DefBB != cast<Instruction>(Src)->getParent())
    return false;

  // The extension result must be used outside of its defining block.
  bool DefIsLiveOut = false;
  for (User *U : I->users()) {
    if (cast<Instruction>(U)->getParent() != DefBB) {
      DefIsLiveOut = true;
      break;
    }
  }
  if (!DefIsLiveOut)
    return false;

  // Make sure none of the out-of-block uses of the source are PHI nodes.
  for (User *U : Src->users()) {
    Instruction *UI = cast<Instruction>(U);
    if (UI->getParent() == DefBB)
      continue;
    // Be conservative. We don't want this xform to end up introducing
    // reloads just before load / store instructions.
    if (isa<PHINode>(UI) || isa<LoadInst>(UI) || isa<StoreInst>(UI))
      return false;
  }

  // InsertedTruncs - Only insert one trunc in each block once.
  DenseMap<BasicBlock *, Instruction *> InsertedTruncs;

  bool MadeChange = false;
  // Assigning to a Use unlinks it from Src's use list and links it into the
  // new value's list, so the iterator must be advanced *before* the use is
  // rewritten. Otherwise the walk would leave Src's use list after the first
  // rewrite and the remaining uses would be skipped.
  for (Value::use_iterator UI = Src->use_begin(), UE = Src->use_end();
       UI != UE;) {
    Use &U = *UI++;
    Instruction *User = cast<Instruction>(U.getUser());

    // Figure out which BB this ext is used in.
    BasicBlock *UserBB = User->getParent();
    if (UserBB == DefBB)
      continue;

    // Both src and def are live in this block. Rewrite the use.
    Instruction *&InsertedTrunc = InsertedTruncs[UserBB];
    if (!InsertedTrunc) {
      BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
      assert(InsertPt != UserBB->end());
      InsertedTrunc = new TruncInst(I, Src->getType(), "", &*InsertPt);
      InsertedInsts.insert(InsertedTrunc);
    }

    // Replace a use of the {s|z}ext source with a use of the result.
    U = InsertedTrunc;
    ++NumExtUses;
    MadeChange = true;
  }

  return MadeChange;
}
// Find loads whose uses only use some of the loaded value's bits. Add an "and"
// just after the load if the target can fold this into one extload instruction,
// with the hope of eliminating some of the other later "and" instructions using
// the loaded value. "and"s that are made trivially redundant by the insertion
// of the new "and" are removed by this function, while others (e.g. those whose
// path from the load goes through a phi) are left for isel to potentially
// remove.
//
// For example:
//
// b0:
// x = load i32
// ...
// b1:
// y = and x, 0xff
// z = use y
//
// becomes:
//
// b0:
// x = load i32
// x' = and x, 0xff
// ...
// b1:
// z = use x'
//
// whereas:
//
// b0:
// x1 = load i32
// ...
// b1:
// x2 = load i32
// ...
// b2:
// x = phi x1, x2
// y = and x, 0xff
//
// becomes (after a call to optimizeLoadExt for each load):
//
// b0:
// x1 = load i32
// x1' = and x1, 0xff
// ...
// b1:
// x2 = load i32
// x2' = and x2, 0xff
// ...
// b2:
// x = phi x1', x2'
// y = and x, 0xff
bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
  // NOTE(review): TLI and DL are dereferenced unconditionally below; callers
  // are presumably expected to guard against a null TLI - confirm.
  if (!Load->isSimple() || !Load->getType()->isIntOrPtrTy())
    return false;

  // Skip loads we've already transformed.
  if (Load->hasOneUse() &&
      InsertedInsts.count(cast<Instruction>(*Load->user_begin())))
    return false;

  // Look at all uses of Load, looking through phis, to determine how many bits
  // of the loaded value are needed.
  SmallVector<Instruction *, 8> WorkList;
  SmallPtrSet<Instruction *, 16> Visited;
  SmallVector<Instruction *, 8> AndsToMaybeRemove;
  for (auto *U : Load->users())
    WorkList.push_back(cast<Instruction>(U));

  EVT LoadResultVT = TLI->getValueType(*DL, Load->getType());
  unsigned BitWidth = LoadResultVT.getSizeInBits();
  APInt DemandBits(BitWidth, 0);
  APInt WidestAndBits(BitWidth, 0);

  while (!WorkList.empty()) {
    Instruction *I = WorkList.pop_back_val();

    // Break use-def graph loops.
    if (!Visited.insert(I).second)
      continue;

    // For a PHI node, push all of its users.
    if (auto *Phi = dyn_cast<PHINode>(I)) {
      for (auto *U : Phi->users())
        WorkList.push_back(cast<Instruction>(U));
      continue;
    }

    // Any user other than and/shl/trunc with the expected shape makes the
    // whole transformation bail out.
    switch (I->getOpcode()) {
    case Instruction::And: {
      auto *AndC = dyn_cast<ConstantInt>(I->getOperand(1));
      if (!AndC)
        return false;
      APInt AndBits = AndC->getValue();
      DemandBits |= AndBits;
      // Keep track of the widest and mask we see.
      if (AndBits.ugt(WidestAndBits))
        WidestAndBits = AndBits;
      // Only "and"s applied directly to the load are removal candidates.
      if (AndBits == WidestAndBits && I->getOperand(0) == Load)
        AndsToMaybeRemove.push_back(I);
      break;
    }

    case Instruction::Shl: {
      auto *ShlC = dyn_cast<ConstantInt>(I->getOperand(1));
      if (!ShlC)
        return false;
      uint64_t ShiftAmt = ShlC->getLimitedValue(BitWidth - 1);
      DemandBits.setLowBits(BitWidth - ShiftAmt);
      break;
    }

    case Instruction::Trunc: {
      EVT TruncVT = TLI->getValueType(*DL, I->getType());
      unsigned TruncBitWidth = TruncVT.getSizeInBits();
      DemandBits.setLowBits(TruncBitWidth);
      break;
    }

    default:
      return false;
    }
  }

  uint32_t ActiveBits = DemandBits.getActiveBits();
  // Avoid hoisting (and (load x) 1) since it is unlikely to be folded by the
  // target even if isLoadExtLegal says an i1 EXTLOAD is valid. For example,
  // for the AArch64 target isLoadExtLegal(ZEXTLOAD, i32, i1) returns true, but
  // (and (load x) 1) is not matched as a single instruction, rather as a LDR
  // followed by an AND.
  // TODO: Look into removing this restriction by fixing backends to either
  // return false for isLoadExtLegal for i1 or have them select this pattern to
  // a single instruction.
  //
  // Also avoid hoisting if we didn't see any ands with the exact DemandBits
  // mask, since these are the only ands that will be removed by isel.
  if (ActiveBits <= 1 || !DemandBits.isMask(ActiveBits) ||
      WidestAndBits != DemandBits)
    return false;

  LLVMContext &Ctx = Load->getType()->getContext();
  Type *TruncTy = Type::getIntNTy(Ctx, ActiveBits);
  EVT TruncVT = TLI->getValueType(*DL, TruncTy);

  // Reject cases that won't be matched as extloads.
  if (!LoadResultVT.bitsGT(TruncVT) || !TruncVT.isRound() ||
      !TLI->isLoadExtLegal(ISD::ZEXTLOAD, LoadResultVT, TruncVT))
    return false;

  IRBuilder<> Builder(Load->getNextNode());
  // The first operand is the (non-constant) load, so the builder is expected
  // to produce an instruction here. NewAnd is dereferenced unconditionally
  // below, so use cast<> to state that invariant instead of an unchecked
  // dyn_cast<>.
  auto *NewAnd = cast<Instruction>(
      Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits)));
  // Mark this instruction as "inserted by CGP", so that other
  // optimizations don't touch it.
  InsertedInsts.insert(NewAnd);

  // Replace all uses of load with new and (except for the use of load in the
  // new and itself).
  Load->replaceAllUsesWith(NewAnd);
  NewAnd->setOperand(0, Load);

  // Remove any and instructions that are now redundant.
  for (auto *And : AndsToMaybeRemove) {
    // Check that the and mask is the same as the one we decided to put on the
    // new and.
    if (cast<ConstantInt>(And->getOperand(1))->getValue() == DemandBits) {
      And->replaceAllUsesWith(NewAnd);
      // Keep the block walk's current position valid if it points at the
      // instruction about to be erased.
      if (&*CurInstIterator == And)
        CurInstIterator = std::next(And->getIterator());
      And->eraseFromParent();
      ++NumAndUses;
    }
  }

  ++NumAndsAdded;
  return true;
}
/// Return true if \p V, an operand of a select instruction, is an instruction
/// with a single use that the target's cost model rates as at least
/// TCC_Expensive and that is safe to speculatively execute.
static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I || !I->hasOneUse())
    return false;
  // An instruction that is safe to speculatively execute should not have side
  // effects, so it is also safe to sink it and possibly *not* execute it.
  if (!isSafeToSpeculativelyExecute(I))
    return false;
  return TTI->getUserCost(I) >= TargetTransformInfo::TCC_Expensive;
}
/// Decide whether the select \p SI is better expressed as an explicit branch.
static bool isFormingBranchFromSelectProfitable(const TargetTransformInfo *TTI,
                                                const TargetLowering *TLI,
                                                SelectInst *SI) {
  // When even a predictable select is cheap, a branch cannot beat it.
  if (!TLI->isPredictableSelectExpensive())
    return false;

  // FIXME: This should use the same heuristics as IfConversion to determine
  // whether a select is better represented as a branch.

  // Profile metadata saying the condition is obviously predictable is enough
  // to prefer a branch.
  uint64_t TrueWeight, FalseWeight;
  if (SI->extractProfMetadata(TrueWeight, FalseWeight)) {
    uint64_t Sum = TrueWeight + FalseWeight;
    if (Sum != 0) {
      uint64_t Max = std::max(TrueWeight, FalseWeight);
      auto Probability = BranchProbability::getBranchProbability(Max, Sum);
      if (Probability > TLI->getPredictableBranchThreshold())
        return true;
    }
  }

  // If a branch is predictable, an out-of-order CPU can avoid blocking on its
  // comparison condition. A compare with more than one use probably feeds
  // another cmov or setcc, in which case emitting a branch is not worth it.
  auto *Cmp = dyn_cast<CmpInst>(SI->getCondition());
  if (!Cmp || !Cmp->hasOneUse())
    return false;

  // Form a branch when either select operand is expensive and only needed on
  // one side of the select.
  return sinkSelectOperand(TTI, SI->getTrueValue()) ||
         sinkSelectOperand(TTI, SI->getFalseValue());
}
/// Return the true value of \p SI when \p isTrue is set, otherwise its false
/// value. While the chosen value is itself a select contained in \p Selects,
/// keep looking through it on the same side, so the returned value is not
/// defined by any select in \p Selects.
static Value *getTrueOrFalseValue(
    SelectInst *SI, bool isTrue,
    const SmallPtrSet<const Instruction *, 2> &Selects) {
  Value *V = nullptr;

  SelectInst *DefSI = SI;
  while (DefSI && Selects.count(DefSI)) {
    assert(DefSI->getCondition() == SI->getCondition() &&
           "The condition of DefSI does not match with SI");
    if (isTrue)
      V = DefSI->getTrueValue();
    else
      V = DefSI->getFalseValue();
    DefSI = dyn_cast<SelectInst>(V);
  }

  assert(V && "Failed to get select true/false value");
  return V;
}
/// Hoist a vector shift above a select of splatted shift amounts when the
/// target reports that shifting a vector by a scalar is cheap.
/// \return true if \p Shift was replaced (and erased).
bool CodeGenPrepare::optimizeShiftInst(BinaryOperator *Shift) {
  assert(Shift->isShift() && "Expected a shift");

  // If this is (1) a vector shift, (2) shifts by scalars are cheaper than
  // general vector shifts, and (3) the shift amount is a select-of-splatted
  // values, hoist the shifts before the select:
  //   shift Op0, (select Cond, TVal, FVal) -->
  //   select Cond, (shift Op0, TVal), (shift Op0, FVal)
  //
  // This is inverting a generic IR transform when we know that the cost of a
  // general vector shift is more than the cost of 2 shift-by-scalars.
  // We can't do this effectively in SDAG because we may not be able to
  // determine if the select operands are splats from within a basic block.
  //
  // TLI may be null; the sibling optimizations in this file all guard against
  // that, so do the same here instead of dereferencing a null pointer.
  Type *Ty = Shift->getType();
  if (!TLI || !Ty->isVectorTy() || !TLI->isVectorShiftByScalarCheap(Ty))
    return false;

  Value *Cond, *TVal, *FVal;
  if (!match(Shift->getOperand(1),
             m_OneUse(m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal)))))
    return false;
  if (!isSplatValue(TVal) || !isSplatValue(FVal))
    return false;

  IRBuilder<> Builder(Shift);
  BinaryOperator::BinaryOps Opcode = Shift->getOpcode();
  Value *NewTVal = Builder.CreateBinOp(Opcode, Shift->getOperand(0), TVal);
  Value *NewFVal = Builder.CreateBinOp(Opcode, Shift->getOperand(0), FVal);
  Value *NewSel = Builder.CreateSelect(Cond, NewTVal, NewFVal);
  Shift->replaceAllUsesWith(NewSel);
  Shift->eraseFromParent();
  return true;
}
/// If we have a SelectInst that will likely profit from branch prediction,
/// turn it into a branch.
///
/// All consecutive selects following \p SI that share its condition are
/// lowered together. CurInstIterator is advanced past that group whether or
/// not the transformation happens.
/// \return true if the selects were replaced by control flow and PHI nodes.
bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {
  // If branch conversion isn't desirable, exit early.
  if (DisableSelectToBranch || OptSize || !TLI)
    return false;

  // Find all consecutive select instructions that share the same condition.
  SmallVector<SelectInst *, 2> ASI;
  ASI.push_back(SI);
  for (BasicBlock::iterator It = ++BasicBlock::iterator(SI);
       It != SI->getParent()->end(); ++It) {
    SelectInst *I = dyn_cast<SelectInst>(&*It);
    if (!I || SI->getCondition() != I->getCondition())
      break;
    ASI.push_back(I);
  }

  SelectInst *LastSI = ASI.back();
  // Increment the current iterator to skip all the rest of select instructions
  // because they will be either "not lowered" or "all lowered" to branch.
  CurInstIterator = std::next(LastSI->getIterator());

  bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1);

  // Can we convert the 'select' to CF ?
  if (VectorCond || SI->getMetadata(LLVMContext::MD_unpredictable))
    return false;

  // The condition is known to be a scalar i1 from here on, so only the two
  // scalar-condition select kinds are possible.
  TargetLowering::SelectSupportKind SelectKind =
      SI->getType()->isVectorTy() ? TargetLowering::ScalarCondVectorVal
                                  : TargetLowering::ScalarValSelect;

  if (TLI->isSelectSupported(SelectKind) &&
      !isFormingBranchFromSelectProfitable(TTI, TLI, SI))
    return false;

  // The DominatorTree needs to be rebuilt by any consumers after this
  // transformation. We simply reset here rather than setting the ModifiedDT
  // flag to avoid restarting the function walk in runOnFunction for each
  // select optimized.
  DT.reset();

  // Transform a sequence like this:
  //    start:
  //       %cmp = cmp uge i32 %a, %b
  //       %sel = select i1 %cmp, i32 %c, i32 %d
  //
  // Into:
  //    start:
  //       %cmp = cmp uge i32 %a, %b
  //       br i1 %cmp, label %select.true, label %select.false
  //    select.true:
  //       br label %select.end
  //    select.false:
  //       br label %select.end
  //    select.end:
  //       %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
  //
  // In addition, we may sink instructions that produce %c or %d from
  // the entry block into the destination(s) of the new branch.
  // If the true or false blocks do not contain a sunken instruction, that
  // block and its branch may be optimized away. In that case, one side of the
  // first branch will point directly to select.end, and the corresponding PHI
  // predecessor block will be the start block.

  // First, we split the block containing the select into 2 blocks.
  BasicBlock *StartBlock = SI->getParent();
  BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI));
  BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");

  // Delete the unconditional branch that was just created by the split.
  StartBlock->getTerminator()->eraseFromParent();

  // These are the new basic blocks for the conditional branch.
  // At least one will become an actual new basic block.
  BasicBlock *TrueBlock = nullptr;
  BasicBlock *FalseBlock = nullptr;
  BranchInst *TrueBranch = nullptr;
  BranchInst *FalseBranch = nullptr;

  // Sink expensive instructions into the conditional blocks to avoid executing
  // them speculatively.
  for (SelectInst *Sel : ASI) {
    if (sinkSelectOperand(TTI, Sel->getTrueValue())) {
      if (TrueBlock == nullptr) {
        TrueBlock = BasicBlock::Create(Sel->getContext(), "select.true.sink",
                                       EndBlock->getParent(), EndBlock);
        TrueBranch = BranchInst::Create(EndBlock, TrueBlock);
        TrueBranch->setDebugLoc(Sel->getDebugLoc());
      }
      auto *TrueInst = cast<Instruction>(Sel->getTrueValue());
      TrueInst->moveBefore(TrueBranch);
    }
    if (sinkSelectOperand(TTI, Sel->getFalseValue())) {
      if (FalseBlock == nullptr) {
        FalseBlock = BasicBlock::Create(Sel->getContext(), "select.false.sink",
                                        EndBlock->getParent(), EndBlock);
        FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
        FalseBranch->setDebugLoc(Sel->getDebugLoc());
      }
      auto *FalseInst = cast<Instruction>(Sel->getFalseValue());
      FalseInst->moveBefore(FalseBranch);
    }
  }

  // If there was nothing to sink, then arbitrarily choose the 'false' side
  // for a new input value to the PHI.
  if (TrueBlock == FalseBlock) {
    assert(TrueBlock == nullptr &&
           "Unexpected basic block transform while optimizing select");

    FalseBlock = BasicBlock::Create(SI->getContext(), "select.false",
                                    EndBlock->getParent(), EndBlock);
    FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
    FalseBranch->setDebugLoc(SI->getDebugLoc());
  }

  // Insert the real conditional branch based on the original condition.
  // If we did not create a new block for one of the 'true' or 'false' paths
  // of the condition, it means that side of the branch goes to the end block
  // directly and the path originates from the start block from the point of
  // view of the new PHI.
  BasicBlock *TT, *FT;
  if (TrueBlock == nullptr) {
    TT = EndBlock;
    FT = FalseBlock;
    TrueBlock = StartBlock;
  } else if (FalseBlock == nullptr) {
    TT = TrueBlock;
    FT = EndBlock;
    FalseBlock = StartBlock;
  } else {
    TT = TrueBlock;
    FT = FalseBlock;
  }
  IRBuilder<>(SI).CreateCondBr(SI->getCondition(), TT, FT, SI);

  SmallPtrSet<const Instruction *, 2> INS;
  INS.insert(ASI.begin(), ASI.end());
  // Use reverse iterator because later select may use the value of the
  // earlier select, and we need to propagate value through earlier select
  // to get the PHI operand.
  for (auto It = ASI.rbegin(); It != ASI.rend(); ++It) {
    SelectInst *Sel = *It;
    // The select itself is replaced with a PHI Node.
    PHINode *PN = PHINode::Create(Sel->getType(), 2, "", &EndBlock->front());
    PN->takeName(Sel);
    PN->addIncoming(getTrueOrFalseValue(Sel, true, INS), TrueBlock);
    PN->addIncoming(getTrueOrFalseValue(Sel, false, INS), FalseBlock);
    PN->setDebugLoc(Sel->getDebugLoc());

    Sel->replaceAllUsesWith(PN);
    Sel->eraseFromParent();
    INS.erase(Sel);
    ++NumSelectsExpanded;
  }

  // Instruct OptimizeBlock to skip to the next block.
  CurInstIterator = StartBlock->end();
  return true;
}
/// Return true if every defined lane of \p SVI's shuffle mask selects the same
/// source element. Lanes whose mask value is -1 are treated as wildcards and
/// ignored; a mask consisting only of such lanes is accepted.
static bool isBroadcastShuffle(ShuffleVectorInst *SVI) {
  SmallVector<int, 16> Mask(SVI->getShuffleMask());
  int SplatElem = -1;
  for (int Elt : Mask) {
    // A wildcard lane must not reset the element seen so far; otherwise a
    // mask such as <0, -1, 1> would be misclassified as a broadcast.
    if (Elt == -1)
      continue;
    if (SplatElem != -1 && Elt != SplatElem)
      return false;
    SplatElem = Elt;
  }
  return true;
}
/// Some targets have expensive vector shifts if the lanes aren't all the same
/// (e.g. x86 only introduced "vpsllvd" and friends with AVX2). In these cases
/// it's often worth sinking a shufflevector splat down to its use so that
/// codegen can spot all lanes are identical.
///
/// A copy of \p SVI is placed in each block that contains a shift using it,
/// and \p SVI itself is erased if it ends up without uses.
/// \return true if the IR was changed.
bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) {
  BasicBlock *DefBB = SVI->getParent();

  // Only do this xform if variable vector shifts are particularly expensive.
  if (!TLI || !TLI->isVectorShiftByScalarCheap(SVI->getType()))
    return false;

  // We only expect better codegen by sinking a shuffle if we can recognise a
  // constant splat.
  if (!isBroadcastShuffle(SVI))
    return false;

  // InsertedShuffles - Only insert a shuffle in each block once.
  DenseMap<BasicBlock *, Instruction *> InsertedShuffles;

  bool MadeChange = false;
  // replaceUsesOfWith() below edits SVI's use list, so walk a snapshot of the
  // users instead of the live list; iterating the live list while mutating it
  // would skip the users that follow the first rewritten one. A user listed
  // more than once is harmless: the second visit finds nothing to replace.
  SmallVector<User *, 8> Users(SVI->user_begin(), SVI->user_end());
  for (User *U : Users) {
    Instruction *UI = cast<Instruction>(U);

    // Figure out which BB this shuffle is used in.
    BasicBlock *UserBB = UI->getParent();
    if (UserBB == DefBB)
      continue;

    // For now only apply this when the splat is used by a shift instruction.
    if (!UI->isShift())
      continue;

    // Everything checks out, sink the shuffle if the user's block doesn't
    // already have a copy.
    Instruction *&InsertedShuffle = InsertedShuffles[UserBB];
    if (!InsertedShuffle) {
      BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
      assert(InsertPt != UserBB->end());
      InsertedShuffle =
          new ShuffleVectorInst(SVI->getOperand(0), SVI->getOperand(1),
                                SVI->getOperand(2), "", &*InsertPt);
      InsertedShuffle->setDebugLoc(SVI->getDebugLoc());
    }

    UI->replaceUsesOfWith(SVI, InsertedShuffle);
    MadeChange = true;
  }

  // If we removed all uses, nuke the shuffle.
  if (SVI->use_empty()) {
    SVI->eraseFromParent();
    MadeChange = true;
  }

  return MadeChange;
}
/// Duplicate operands of \p I next to \p I when the target reports, through
/// shouldSinkOperands, that they can be folded into the instruction selected
/// for \p I. Originals left without uses are erased.
/// \return true if at least one operand was sunk.
bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) {
  SmallVector<Use *, 4> OpsToSink;
  if (!TLI || !TLI->shouldSinkOperands(I, OpsToSink))
    return false;

  // OpsToSink can contain multiple uses in a use chain (e.g.
  // (%u1 with %u1 = shufflevector), (%u2 with %u2 = zext %u1)). The dominating
  // uses must come first, which means they are sunk first, temporarily creating
  // invalid IR. This will be fixed once their dominated users are sunk and
  // updated.
  BasicBlock *TargetBB = I->getParent();

  // Keep only operands defined in another block that are not PHI nodes.
  SmallVector<Use *, 4> ToReplace;
  for (Use *Op : OpsToSink) {
    auto *Def = cast<Instruction>(Op->get());
    if (Def->getParent() != TargetBB && !isa<PHINode>(Def))
      ToReplace.push_back(Op);
  }

  bool Changed = false;
  SmallPtrSet<Instruction *, 4> MaybeDead;
  for (Use *Op : ToReplace) {
    auto *Def = cast<Instruction>(Op->get());
    Instruction *Clone = Def->clone();
    MaybeDead.insert(Def);
    LLVM_DEBUG(dbgs() << "Sinking " << *Def << " to user " << *I << "\n");
    Clone->insertBefore(I);
    InsertedInsts.insert(Clone);
    Op->set(Clone);
    Changed = true;
  }

  // Remove instructions that are dead after sinking.
  for (Instruction *Dead : MaybeDead) {
    if (Dead->hasNUsesOrMore(1))
      continue;
    Dead->eraseFromParent();
  }

  return Changed;
}
/// Widen the condition and case constants of \p SI to the width of the
/// register type the target reports for the condition type, when that register
/// is wider than the condition.
/// \return true if the switch was rewritten.
bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {
  if (!TLI || !DL)
    return false;

  Value *Cond = SI->getCondition();
  Type *OldType = Cond->getType();
  LLVMContext &Context = Cond->getContext();
  MVT RegType = TLI->getRegisterType(Context, TLI->getValueType(*DL, OldType));
  unsigned RegWidth = RegType.getSizeInBits();

  // Nothing to gain unless the register is strictly wider than the condition.
  if (RegWidth <= cast<IntegerType>(OldType)->getBitWidth())
    return false;

  // If the register width is greater than the type width, expand the condition
  // of the switch instruction and each case constant to the width of the
  // register. By widening the type of the switch condition, subsequent
  // comparisons (for case comparisons) will not need to be extended to the
  // preferred register width, so we will potentially eliminate N-1 extends,
  // where N is the number of cases in the switch.
  auto *NewType = Type::getIntNTy(Context, RegWidth);

  // Zero-extend by default. If the condition is a function argument that is
  // already being sign-extended, sign-extend everything instead to avoid an
  // unnecessary mask/extension.
  Instruction::CastOps ExtType = Instruction::ZExt;
  auto *Arg = dyn_cast<Argument>(Cond);
  if (Arg && Arg->hasSExtAttr())
    ExtType = Instruction::SExt;

  auto *ExtInst = CastInst::Create(ExtType, Cond, NewType);
  ExtInst->insertBefore(SI);
  ExtInst->setDebugLoc(SI->getDebugLoc());
  SI->setCondition(ExtInst);

  // Extend every case constant the same way as the condition.
  const bool IsZExt = ExtType == Instruction::ZExt;
  for (auto Case : SI->cases()) {
    APInt NarrowConst = Case.getCaseValue()->getValue();
    APInt WideConst =
        IsZExt ? NarrowConst.zext(RegWidth) : NarrowConst.sext(RegWidth);
    Case.setValue(ConstantInt::get(Context, WideConst));
  }

  return true;
}
namespace {
/// Helper class to promote a scalar operation to a vector one.
/// This class is used to move downward extractelement transition.
/// E.g.,
/// a = vector_op <2 x i32>
/// b = extractelement <2 x i32> a, i32 0
/// c = scalar_op b
/// store c
///
/// =>
/// a = vector_op <2 x i32>
/// c = vector_op a (equivalent to scalar_op on the related lane)
/// * d = extractelement <2 x i32> c, i32 0
/// * store d
/// Assuming both extractelement and store can be combine, we get rid of the
/// transition.
class VectorPromoteHelper {
/// DataLayout associated with the current module.
const DataLayout &DL;
/// Used to perform some checks on the legality of vector operations.
const TargetLowering &TLI;
/// Used to estimated the cost of the promoted chain.
const TargetTransformInfo &TTI;
/// The transition being moved downwards.
Instruction *Transition;
/// The sequence of instructions to be promoted.
SmallVector<Instruction *, 4> InstsToBePromoted;
/// Cost of combining a store and an extract.
unsigned StoreExtractCombineCost;
/// Instruction that will be combined with the transition.
Instruction *CombineInst = nullptr;
/// The instruction that represents the current end of the transition.
/// Since we are faking the promotion until we reach the end of the chain
/// of computation, we need a way to get the current end of the transition.
Instruction *getEndOfTransition() const {
if (InstsToBePromoted.empty())
return Transition;
return InstsToBePromoted.back();
}
/// Return the index of the original value in the transition.
/// E.g., for "extractelement <2 x i32> c, i32 1" the original value,
/// c, is at index 0.
unsigned getTransitionOriginalValueIdx() const {
assert(isa<ExtractElementInst>(Transition) &&
"Other kind of transitions are not supported yet");
return 0;
}
/// Return the index of the index in the transition.
/// E.g., for "extractelement <2 x i32> c, i32 0" the index
/// is at index 1.
unsigned getTransitionIdx() const {
assert(isa<ExtractElementInst>(Transition) &&
"Other kind of transitions are not supported yet");
return 1;
}
/// Get the type of the transition.
/// This is the type of the original value.
/// E.g., for "extractelement <2 x i32> c, i32 1" the type of the
/// transition is <2 x i32>.
Type *getTransitionType() const {
return Transition->getOperand(getTransitionOriginalValueIdx())->getType();
}
/// Promote \p ToBePromoted by moving \p Def downward through.
/// I.e., we have the following sequence:
/// Def = Transition <ty1> a to <ty2>
/// b = ToBePromoted <ty2> Def, ...
/// =>
/// b = ToBePromoted <ty1> a, ...
/// Def = Transition <ty1> ToBePromoted to <ty2>
void promoteImpl(Instruction *ToBePromoted);
/// Check whether or not it is profitable to promote all the
/// instructions enqueued to be promoted.
bool isProfitableToPromote() {
Value *ValIdx = Transition->getOperand(getTransitionOriginalValueIdx());
unsigned Index = isa<ConstantInt>(ValIdx)
? cast<ConstantInt>(ValIdx)->getZExtValue()
: -1;
Type *PromotedType = getTransitionType();
StoreInst *ST = cast<StoreInst>(CombineInst);
unsigned AS = ST->getPointerAddressSpace();
unsigned Align = ST->getAlignment();
// Check if this store is supported.
if (!TLI.allowsMisalignedMemoryAccesses(
TLI.getValueType(DL, ST->getValueOperand()->getType()), AS,
Align)) {
// If this is not supported, there is no way we can combine
// the extract with the store.
return false;
}
// The scalar chain of computation has to pay for the transition
// scalar to vector.
// The vector chain has to account for the combining cost.
uint64_t ScalarCost =
TTI.getVectorInstrCost(Transition->getOpcode(), PromotedType, Index);
uint64_t VectorCost = StoreExtractCombineCost;
for (const auto &Inst : InstsToBePromoted) {
// Compute the cost.
// By construction, all instructions being promoted are arithmetic ones.
// Moreover, one argument is a constant that can be viewed as a splat
// constant.
Value *Arg0 = Inst->getOperand(0);
bool IsArg0Constant = isa<UndefValue>(Arg0) || isa<ConstantInt>(Arg0) ||
isa<ConstantFP>(Arg0);
TargetTransformInfo::OperandValueKind Arg0OVK =
IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue
: TargetTransformInfo::OK_AnyValue;
TargetTransformInfo::OperandValueKind Arg1OVK =
!IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue
: TargetTransformInfo::OK_AnyValue;
ScalarCost += TTI.getArithmeticInstrCost(
Inst->getOpcode(), Inst->getType(), Arg0OVK, Arg1OVK);
VectorCost += TTI.getArithmeticInstrCost(Inst->getOpcode(), PromotedType,
Arg0OVK, Arg1OVK);
}
LLVM_DEBUG(
dbgs() << "Estimated cost of computation to be promoted:\nScalar: "
<< ScalarCost << "\nVector: " << VectorCost << '\n');
return ScalarCost > VectorCost;
}
/// Generate a constant vector with \p Val with the same
/// number of elements as the transition.
/// \p UseSplat defines whether or not \p Val should be replicated
/// across the whole vector.
/// In other words, if UseSplat == true, we generate <Val, Val, ..., Val>,
/// otherwise we generate a vector with as many undef as possible:
/// <undef, ..., undef, Val, undef, ..., undef> where \p Val is only
/// used at the index of the extract.
Value *getConstantVector(Constant *Val, bool UseSplat) const {
unsigned ExtractIdx = std::numeric_limits<unsigned>::max();
if (!UseSplat) {
// If we cannot determine where the constant must be, we have to
// use a splat constant.
Value *ValExtractIdx = Transition->getOperand(getTransitionIdx());
if (ConstantInt *CstVal = dyn_cast<ConstantInt>(ValExtractIdx))
ExtractIdx = CstVal->getSExtValue();
else
UseSplat = true;
}
unsigned End = getTransitionType()->getVectorNumElements();
if (UseSplat)
return ConstantVector::getSplat(End, Val);
SmallVector<Constant *, 4> ConstVec;
UndefValue *UndefVal = UndefValue::get(Val->getType());
for (unsigned Idx = 0; Idx != End; ++Idx) {
if (Idx == ExtractIdx)
ConstVec.push_back(Val);
else
ConstVec.push_back(UndefVal);
}
return ConstantVector::get(ConstVec);
}
/// Check if promoting to a vector type an operand at \p OperandIdx
/// in \p Use can trigger undefined behavior.
static bool canCauseUndefinedBehavior(const Instruction *Use,
unsigned OperandIdx) {
// This is not safe to introduce undef when the operand is on
// the right hand side of a division-like instruction.
if (OperandIdx != 1)
return false;
switch (Use->getOpcode()) {
default:
return false;
case Instruction::SDiv:
case Instruction::UDiv:
case Instruction::SRem:
case Instruction::URem:
return true;
case Instruction::FDiv:
case Instruction::FRem:
return !Use->hasNoNaNs();
}
llvm_unreachable(nullptr);
}
public:
VectorPromoteHelper(const DataLayout &DL, const TargetLowering &TLI,
const TargetTransformInfo &TTI, Instruction *Transition,
unsigned CombineCost)
: DL(DL), TLI(TLI), TTI(TTI), Transition(Transition),
StoreExtractCombineCost(CombineCost) {
assert(Transition && "Do not know how to promote null");
}
/// Check if we can promote \p ToBePromoted to \p Type.
bool canPromote(const Instruction *ToBePromoted) const {
// We could support CastInst too.
return isa<BinaryOperator>(ToBePromoted);
}
/// Check if it is profitable to promote \p ToBePromoted
/// by moving downward the transition through.
bool shouldPromote(const Instruction *ToBePromoted) const {
// Promote only if all the operands can be statically expanded.
// Indeed, we do not want to introduce any new kind of transitions.
for (const Use &U : ToBePromoted->operands()) {
const Value *Val = U.get();
if (Val == getEndOfTransition()) {
// If the use is a division and the transition is on the rhs,
// we cannot promote the operation, otherwise we may create a
// division by zero.
if (canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo()))
return false;
continue;
}
if (!isa<ConstantInt>(Val) && !isa<UndefValue>(Val) &&
!isa<ConstantFP>(Val))
return false;
}
// Check that the resulting operation is legal.
int ISDOpcode = TLI.InstructionOpcodeToISD(ToBePromoted->getOpcode());
if (!ISDOpcode)
return false;
return StressStoreExtract ||
TLI.isOperationLegalOrCustom(
ISDOpcode, TLI.getValueType(DL, getTransitionType(), true));
}
/// Check whether \p Use can be combined with the transition, i.e. whether
/// Use(Transition) can be turned into AnotherUse. Only store instructions
/// are accepted.
bool canCombine(const Instruction *Use) {
  const bool IsStore = isa<StoreInst>(Use);
  return IsStore;
}
/// Record \p ToBePromoted as part of the chain to be promoted.
/// The instruction is only appended to InstsToBePromoted here; the actual
/// rewrite is performed later by promote().
void enqueueForPromotion(Instruction *ToBePromoted) {
InstsToBePromoted.push_back(ToBePromoted);
}
/// Set the instruction that will be combined with the transition.
/// \p ToBeCombined must satisfy canCombine(); this is checked by an
/// assertion only. promote() does nothing until a combine instruction has
/// been recorded.
void recordCombineInstruction(Instruction *ToBeCombined) {
assert(canCombine(ToBeCombined) && "Unsupported instruction to combine");
CombineInst = ToBeCombined;
}
/// Promote all the instructions enqueued for promotion if it is profitable.
///
/// Nothing happens when the queue is empty or when no combine instruction
/// has been recorded. The cost check is skipped when StressStoreExtract is
/// set. The queue is emptied after a successful promotion.
/// \return True if the promotion happened, false otherwise.
bool promote() {
  // Without anything to combine with, the promotion is assumed not to be
  // profitable.
  const bool HasWork = !InstsToBePromoted.empty() && CombineInst;
  if (!HasWork)
    return false;

  // Check cost.
  if (!StressStoreExtract) {
    if (!isProfitableToPromote())
      return false;
  }

  // Promote.
  for (auto &Candidate : InstsToBePromoted)
    promoteImpl(Candidate);
  InstsToBePromoted.clear();
  return true;
}
};
} // end anonymous namespace
void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) {
  // At this point every operand of ToBePromoted other than the transition
  // (Def) is known to be statically promotable. Starting from
  //   Def = Transition a
  //   b   = ToBePromoted Def, cst
  // the transition is moved below the promoted operation:
  //   b   = ToBePromoted a, promoted(cst)
  //   Def = Transition b
  assert(ToBePromoted->getType() == Transition->getType() &&
         "The type of the result of the transition does not match "
         "the final type");

  // 1. Users of the promoted operation now consume the transition instead.
  ToBePromoted->replaceAllUsesWith(Transition);

  // 2. The promoted operation now produces the type returned by
  //    getTransitionType().
  ToBePromoted->mutateType(getTransitionType());

  // 3. Replace each operand of the promoted operation by its promoted form.
  for (Use &Operand : ToBePromoted->operands()) {
    Value *OldVal = Operand.get();
    Value *PromotedVal = nullptr;

    if (OldVal == Transition) {
      // The transition itself is replaced by the value it was applied to.
      PromotedVal = Transition->getOperand(getTransitionOriginalValueIdx());
    } else if (isa<UndefValue>(OldVal) || isa<ConstantInt>(OldVal) ||
               isa<ConstantFP>(OldVal)) {
      // Use a splat constant if it is not safe to use undef.
      const bool UseSplat =
          isa<UndefValue>(OldVal) ||
          canCauseUndefinedBehavior(ToBePromoted, Operand.getOperandNo());
      PromotedVal = getConstantVector(cast<Constant>(OldVal), UseSplat);
    } else {
      llvm_unreachable("Did you modified shouldPromote and forgot to update "
                       "this?");
    }

    ToBePromoted->setOperand(Operand.getOperandNo(), PromotedVal);
  }

  // 4. Sink the transition right after the promoted operation and make it
  //    consume that operation.
  Transition->moveAfter(ToBePromoted);
  Transition->setOperand(getTransitionOriginalValueIdx(), ToBePromoted);
}
/// Some targets can perform store(extractelement) with a single instruction.
/// When the target has this feature and it looks profitable, try to push the
/// extractelement \p Inst down its def-use chain towards a store.
/// \return True if the IR was changed.
bool CodeGenPrepare::optimizeExtractElementInst(Instruction *Inst) {
  if (DisableStoreExtract || !TLI)
    return false;

  // The target fills in CombineCost when it reports that the combine is
  // possible; the query is skipped entirely in stress mode.
  unsigned CombineCost = std::numeric_limits<unsigned>::max();
  if (!StressStoreExtract &&
      !TLI->canCombineStoreAndExtract(Inst->getOperand(0)->getType(),
                                      Inst->getOperand(1), CombineCost))
    return false;

  // At this point we know that Inst is a vector to scalar transition.
  // Try to move it down the def-use chain, until:
  // - We can combine the transition with its single use
  //   => we got rid of the transition.
  // - We escape the current basic block
  //   => we would need to check that we are moving it at a cheaper place and
  //      we do not do that for now.
  BasicBlock *Parent = Inst->getParent();
  LLVM_DEBUG(dbgs() << "Found an interesting transition: " << *Inst << '\n');
  VectorPromoteHelper VPH(*DL, *TLI, *TTI, Inst, CombineCost);

  // A transition with more than one use is assumed not to be beneficial.
  while (Inst->hasOneUse()) {
    Instruction *SoleUser = cast<Instruction>(*Inst->user_begin());
    LLVM_DEBUG(dbgs() << "Use: " << *SoleUser << '\n');

    if (SoleUser->getParent() != Parent) {
      LLVM_DEBUG(dbgs() << "Instruction to promote is in a different block ("
                        << SoleUser->getParent()->getName()
                        << ") than the transition (" << Parent->getName()
                        << ").\n");
      return false;
    }

    if (VPH.canCombine(SoleUser)) {
      LLVM_DEBUG(dbgs() << "Assume " << *Inst << '\n'
                        << "will be combined with: " << *SoleUser << '\n');
      VPH.recordCombineInstruction(SoleUser);
      const bool Changed = VPH.promote();
      NumStoreExtractExposed += Changed;
      return Changed;
    }

    LLVM_DEBUG(dbgs() << "Try promoting.\n");
    if (!(VPH.canPromote(SoleUser) && VPH.shouldPromote(SoleUser)))
      return false;

    LLVM_DEBUG(dbgs() << "Promoting is possible... Enqueue for promotion!\n");
    VPH.enqueueForPromotion(SoleUser);
    // Continue the walk from the instruction just enqueued.
    Inst = SoleUser;
  }
  return false;
}
/// For the instruction sequence of store below, F and I values
/// are bundled together as an i64 value before being stored into memory.
/// Sometimes it is more efficient to generate separate stores for F and I,
/// which can remove the bitwise instructions or sink them to colder places.
///
/// (store (or (zext (bitcast F to i32) to i64),
/// (shl (zext I to i64), 32)), addr) -->
/// (store F, addr) and (store I, addr+4)
///
/// Similarly, splitting for other merged store can also be beneficial, like:
/// For pair of {i32, i32}, i64 store --> two i32 stores.
/// For pair of {i32, i16}, i64 store --> two i32 stores.
/// For pair of {i16, i16}, i32 store --> two i16 stores.
/// For pair of {i16, i8}, i32 store --> two i16 stores.
/// For pair of {i8, i8}, i16 store --> two i8 stores.
///
/// We allow each target to determine specifically which kind of splitting is
/// supported.
///
/// The store patterns are commonly seen from the simple code snippet below
/// if only std::make_pair(...) is sroa transformed before inlined into hoo.
/// void goo(const std::pair<int, float> &);
/// hoo() {
/// ...
/// goo(std::make_pair(tmp, ftmp));
/// ...
/// }
///
/// Although we already have similar splitting in DAG Combine, we duplicate
/// it in CodeGenPrepare to catch the case in which pattern is across
/// multiple BBs. The logic in DAG Combine is kept to catch case generated
/// during code expansion.
static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL,
                                const TargetLowering &TLI) {
  // Handle simple but common cases only.
  Type *StoreType = SI.getValueOperand()->getType();
  if (!DL.typeSizeEqualsStoreSize(StoreType) ||
      DL.getTypeSizeInBits(StoreType) == 0)
    return false;

  unsigned HalfValBitSize = DL.getTypeSizeInBits(StoreType) / 2;
  Type *SplitStoreType = Type::getIntNTy(SI.getContext(), HalfValBitSize);
  if (!DL.typeSizeEqualsStoreSize(SplitStoreType))
    return false;

  // Don't split the store if it is volatile.
  if (SI.isVolatile())
    return false;

  // Match the following patterns (shown for an i64 store, where the shift
  // amount 32 is HalfValBitSize):
  //   (store (or (zext LValue to i64),
  //              (shl (zext HValue to i64), 32)), addr)
  // or
  //   (store (or (shl (zext HValue to i64), 32),
  //              (zext LValue to i64)), addr)
  // Expect both operands of OR and the first operand of SHL have only
  // one use.
  Value *LValue, *HValue;
  if (!match(SI.getValueOperand(),
             m_c_Or(m_OneUse(m_ZExt(m_Value(LValue))),
                    m_OneUse(m_Shl(m_OneUse(m_ZExt(m_Value(HValue))),
                                   m_SpecificInt(HalfValBitSize))))))
    return false;

  // Check LValue and HValue are integers no wider than HalfValBitSize.
  if (!LValue->getType()->isIntegerTy() ||
      DL.getTypeSizeInBits(LValue->getType()) > HalfValBitSize ||
      !HValue->getType()->isIntegerTy() ||
      DL.getTypeSizeInBits(HValue->getType()) > HalfValBitSize)
    return false;

  // If LValue/HValue is a bitcast instruction, use the EVT before bitcast
  // as the input of target query.
  auto *LBC = dyn_cast<BitCastInst>(LValue);
  auto *HBC = dyn_cast<BitCastInst>(HValue);
  EVT LowTy = LBC ? EVT::getEVT(LBC->getOperand(0)->getType())
                  : EVT::getEVT(LValue->getType());
  EVT HighTy = HBC ? EVT::getEVT(HBC->getOperand(0)->getType())
                   : EVT::getEVT(HValue->getType());
  if (!ForceSplitStore && !TLI.isMultiStoresCheaperThanBitsMerge(LowTy, HighTy))
    return false;

  // Start to split store.
  IRBuilder<> Builder(SI.getContext());
  Builder.SetInsertPoint(&SI);

  // If LValue/HValue is a bitcast in another BB, create a new one in current
  // BB so it may be merged with the split stores by dag combiner.
  if (LBC && LBC->getParent() != SI.getParent())
    LValue = Builder.CreateBitCast(LBC->getOperand(0), LBC->getType());
  if (HBC && HBC->getParent() != SI.getParent())
    HValue = Builder.CreateBitCast(HBC->getOperand(0), HBC->getType());

  bool IsLE = SI.getModule()->getDataLayout().isLittleEndian();
  // Emit the store of one half. \p Upper selects the high half of the
  // original value.
  auto CreateSplitStore = [&](Value *V, bool Upper) {
    V = Builder.CreateZExtOrBitCast(V, SplitStoreType);
    Value *Addr = Builder.CreateBitCast(
        SI.getOperand(1),
        SplitStoreType->getPointerTo(SI.getPointerAddressSpace()));

    // The half stored at the original address keeps the original alignment.
    // Which half that is depends on endianness, not on Upper alone.
    unsigned Alignment = SI.getAlignment();
    const bool IsOffsetStore = (IsLE && Upper) || (!IsLE && !Upper);
    if (IsOffsetStore) {
      Addr = Builder.CreateGEP(
          SplitStoreType, Addr,
          ConstantInt::get(Type::getInt32Ty(SI.getContext()), 1));
      // The other half lives HalfValBitSize / 8 bytes further, so it is only
      // aligned to the largest power of two dividing both the original
      // alignment and that offset. An alignment of 0 (left to the
      // DataLayout default) is passed through unchanged; plain halving would
      // also wrongly turn an alignment of 1 into 0.
      if (Alignment != 0) {
        const unsigned Bits = Alignment | (HalfValBitSize / 8);
        Alignment = Bits & (~Bits + 1u);
      }
    }
    Builder.CreateAlignedStore(V, Addr, Alignment);
  };

  CreateSplitStore(LValue, false);
  CreateSplitStore(HValue, true);

  // Delete the old store.
  SI.eraseFromParent();
  return true;
}
// Return true if the GEP has exactly two operands (pointer plus one index),
// the first indexed type is sequential, and the index is a constant integer.
static bool GEPSequentialConstIndexed(GetElementPtrInst *GEP) {
  if (GEP->getNumOperands() != 2)
    return false;
  gep_type_iterator FirstIdx = gep_type_begin(*GEP);
  if (!FirstIdx.isSequential())
    return false;
  return isa<ConstantInt>(GEP->getOperand(1));
}
// Try unmerging GEPs to reduce liveness interference (register pressure) across
// IndirectBr edges. Since IndirectBr edges tend to touch on many blocks,
// reducing liveness interference across those edges benefits global register
// allocation. Currently handles only certain cases.
//
// For example, unmerge %GEPI and %UGEPI as below.
//
// ---------- BEFORE ----------
// SrcBlock:
// ...
// %GEPIOp = ...
// ...
// %GEPI = gep %GEPIOp, Idx
// ...
// indirectbr ... [ label %DstB0, label %DstB1, ... label %DstBi ... ]
// (* %GEPI is alive on the indirectbr edges due to other uses ahead)
// (* %GEPIOp is alive on the indirectbr edges only because it's used by
// %UGEPI)
//
// DstB0: ... (there may be a gep similar to %UGEPI to be unmerged)
// DstB1: ... (there may be a gep similar to %UGEPI to be unmerged)
// ...
//
// DstBi:
// ...
// %UGEPI = gep %GEPIOp, UIdx
// ...
// ---------------------------
//
// ---------- AFTER ----------
// SrcBlock:
// ... (same as above)
// (* %GEPI is still alive on the indirectbr edges)
// (* %GEPIOp is no longer alive on the indirectbr edges as a result of the
// unmerging)
// ...
//
// DstBi:
// ...
// %UGEPI = gep %GEPI, (UIdx-Idx)
// ...
// ---------------------------
//
// The register pressure on the IndirectBr edges is reduced because %GEPIOp is
// no longer alive on them.
//
// We try to unmerge GEPs here in CodeGenPrepare, as opposed to limiting merging
// of GEPs in the first place in InstCombiner::visitGetElementPtrInst() so as
// not to disable further simplifications and optimizations as a result of GEP
// merging.
//
// Note this unmerging may increase the length of the data flow critical path
// (the path from %GEPIOp to %UGEPI would go through %GEPI), which is a tradeoff
// between the register pressure and the length of data-flow critical
// path. Restricting this to the uncommon IndirectBr case would minimize the
// impact of potentially longer critical path, if any, and the impact on compile
// time.
static bool tryUnmergingGEPsAcrossIndirectBr(GetElementPtrInst *GEPI,
                                             const TargetTransformInfo *TTI) {
  BasicBlock *SrcBlock = GEPI->getParent();

  // Only blocks ending with an IndirectBr are handled. The common
  // (non-IndirectBr) cases exit early here.
  if (!isa<IndirectBrInst>(SrcBlock->getTerminator()))
    return false;

  // GEPI must be a simple gep with a single constant index...
  if (!GEPSequentialConstIndexed(GEPI))
    return false;
  auto *GEPIIdx = cast<ConstantInt>(GEPI->getOperand(1));

  // ...and a cheap one.
  if (TTI->getIntImmCost(GEPIIdx->getValue(), GEPIIdx->getType()) >
      TargetTransformInfo::TCC_Basic)
    return false;

  // The base pointer must be an instruction that is also defined in SrcBlock.
  Value *GEPIOp = GEPI->getOperand(0);
  auto *GEPIOpI = dyn_cast<Instruction>(GEPIOp);
  if (!GEPIOpI || GEPIOpI->getParent() != SrcBlock)
    return false;

  // GEPI must be used outside the block, meaning it's alive on the
  // IndirectBr edge(s).
  auto IsInstOutsideSrcBlock = [&](User *Usr) {
    auto *UserInst = dyn_cast<Instruction>(Usr);
    return UserInst && UserInst->getParent() != SrcBlock;
  };
  if (find_if(GEPI->users(), IsInstOutsideSrcBlock) == GEPI->users().end())
    return false;

  // The second elements of the GEP chains to be unmerged.
  std::vector<GetElementPtrInst *> UGEPIs;

  // Inspect each user of GEPIOp to see whether unmerging would make GEPIOp
  // not alive on the IndirectBr edges.
  for (User *Usr : GEPIOp->users()) {
    if (Usr == GEPI)
      continue;

    // A non-instruction user: give up.
    auto *UI = dyn_cast<Instruction>(Usr);
    if (!UI)
      return false;

    // Users in the same block as GEPIOp are fine, skip them.
    if (UI->getParent() == SrcBlock)
      continue;

    // Any user outside the block must be a GEP, otherwise give up.
    auto *UGEPI = dyn_cast<GetElementPtrInst>(Usr);
    if (!UGEPI)
      return false;

    // UGEPI must be a simple gep with a single constant index, with GEPIOp
    // as its pointer operand and an index of the same type as GEPI's.
    if (!GEPSequentialConstIndexed(UGEPI) || UGEPI->getOperand(0) != GEPIOp)
      return false;
    auto *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
    if (UGEPIIdx->getType() != GEPIIdx->getType())
      return false;
    if (TTI->getIntImmCost(UGEPIIdx->getValue(), UGEPIIdx->getType()) >
        TargetTransformInfo::TCC_Basic)
      return false;

    UGEPIs.push_back(UGEPI);
  }
  if (UGEPIs.empty())
    return false;

  // Check the materializing cost of (Uidx-Idx) for every candidate before
  // touching anything.
  for (GetElementPtrInst *UGEPI : UGEPIs) {
    auto *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
    APInt Delta = UGEPIIdx->getValue() - GEPIIdx->getValue();
    if (TTI->getIntImmCost(Delta, GEPIIdx->getType()) >
        TargetTransformInfo::TCC_Basic)
      return false;
  }

  // Now unmerge between GEPI and UGEPIs: rebase each UGEPI on GEPI and
  // adjust its index by GEPI's index.
  for (GetElementPtrInst *UGEPI : UGEPIs) {
    UGEPI->setOperand(0, GEPI);
    auto *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
    Constant *RebasedIdx = ConstantInt::get(
        GEPIIdx->getType(), UGEPIIdx->getValue() - GEPIIdx->getValue());
    UGEPI->setOperand(1, RebasedIdx);

    // If GEPI is not inbounds but UGEPI is inbounds, change UGEPI to not
    // inbounds to avoid UB.
    if (!GEPI->isInBounds())
      UGEPI->setIsInBounds(false);
  }

  // After unmerging, verify that GEPIOp is actually only used in SrcBlock (not
  // alive on IndirectBr edges).
  assert(find_if(GEPIOp->users(), [&](User *Usr) {
           return cast<Instruction>(Usr)->getParent() != SrcBlock;
         }) == GEPIOp->users().end() && "GEPIOp is used outside SrcBlock");
  return true;
}
/// Dispatch \p I to the optimization matching its kind.
/// \p ModifiedDT is forwarded to the sub-optimizations that take it.
/// \return True if the IR was changed.
bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {
  // Bail out if we inserted the instruction to prevent optimizations from
  // stepping on each other's toes.
  if (InsertedInsts.count(I))
    return false;

  // TODO: Move into the switch on opcode below here.
  if (auto *Phi = dyn_cast<PHINode>(I)) {
    // It is possible for very late stage optimizations (such as SimplifyCFG)
    // to introduce PHI nodes too late to be cleaned up. If we detect such a
    // trivial PHI, go ahead and zap it here.
    Value *Simplified = SimplifyInstruction(Phi, {*DL, TLInfo});
    if (!Simplified)
      return false;
    LargeOffsetGEPMap.erase(Phi);
    Phi->replaceAllUsesWith(Simplified);
    Phi->eraseFromParent();
    ++NumPHIsElim;
    return true;
  }

  if (auto *Cast = dyn_cast<CastInst>(I)) {
    // If the source of the cast is a constant, then this should have
    // already been constant folded. The only reason NOT to constant fold
    // it is if something (e.g. LSR) was careful to place the constant
    // evaluation in a block other than the one that uses it (e.g. to hoist
    // the address of globals out of a loop). If this is the case, we don't
    // want to forward-subst the cast.
    if (isa<Constant>(Cast->getOperand(0)))
      return false;

    if (TLI && OptimizeNoopCopyExpression(Cast, *TLI, *DL))
      return true;

    // Only zext/sext get further treatment.
    if (!isa<ZExtInst>(I) && !isa<SExtInst>(I))
      return false;

    // Sink a zext or sext into its user blocks if the target type doesn't
    // fit in one register.
    if (TLI && TLI->getTypeAction(Cast->getContext(),
                                  TLI->getValueType(*DL, Cast->getType())) ==
                   TargetLowering::TypeExpandInteger)
      return SinkCast(Cast);

    bool MadeChange = optimizeExt(I);
    return MadeChange | optimizeExtUses(I);
  }

  auto *Cmp = dyn_cast<CmpInst>(I);
  if (Cmp && TLI && optimizeCmp(Cmp, ModifiedDT))
    return true;

  if (auto *Load = dyn_cast<LoadInst>(I)) {
    Load->setMetadata(LLVMContext::MD_invariant_group, nullptr);
    if (!TLI)
      return false;
    bool Modified = optimizeLoadExt(Load);
    unsigned AS = Load->getPointerAddressSpace();
    Modified |= optimizeMemoryInst(I, I->getOperand(0), Load->getType(), AS);
    return Modified;
  }

  if (auto *Store = dyn_cast<StoreInst>(I)) {
    if (TLI && splitMergedValStore(*Store, *DL, *TLI))
      return true;
    Store->setMetadata(LLVMContext::MD_invariant_group, nullptr);
    if (!TLI)
      return false;
    unsigned AS = Store->getPointerAddressSpace();
    return optimizeMemoryInst(I, Store->getOperand(1),
                              Store->getOperand(0)->getType(), AS);
  }

  if (auto *RMW = dyn_cast<AtomicRMWInst>(I)) {
    unsigned AS = RMW->getPointerAddressSpace();
    return optimizeMemoryInst(I, RMW->getPointerOperand(), RMW->getType(), AS);
  }

  if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(I)) {
    unsigned AS = CmpX->getPointerAddressSpace();
    return optimizeMemoryInst(I, CmpX->getPointerOperand(),
                              CmpX->getCompareOperand()->getType(), AS);
  }

  auto *BinOp = dyn_cast<BinaryOperator>(I);

  if (BinOp && BinOp->getOpcode() == Instruction::And &&
      EnableAndCmpSinking && TLI)
    return sinkAndCmp0Expression(BinOp, *TLI, InsertedInsts);

  // TODO: Move this into the switch on opcode - it handles shifts already.
  if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
                BinOp->getOpcode() == Instruction::LShr)) {
    auto *ShiftAmt = dyn_cast<ConstantInt>(BinOp->getOperand(1));
    if (TLI && ShiftAmt && TLI->hasExtractBitsInsn() &&
        OptimizeExtractBits(BinOp, ShiftAmt, *TLI, *DL))
      return true;
  }

  if (auto *GEPI = dyn_cast<GetElementPtrInst>(I)) {
    if (GEPI->hasAllZeroIndices()) {
      // The GEP operand must be a pointer, so must its result -> BitCast
      Instruction *NC = new BitCastInst(GEPI->getOperand(0), GEPI->getType(),
                                        GEPI->getName(), GEPI);
      NC->setDebugLoc(GEPI->getDebugLoc());
      GEPI->replaceAllUsesWith(NC);
      GEPI->eraseFromParent();
      ++NumGEPsElim;
      // Give the replacement cast a chance to be optimized as well.
      optimizeInst(NC, ModifiedDT);
      return true;
    }
    return tryUnmergingGEPsAcrossIndirectBr(GEPI, TTI);
  }

  if (tryToSinkFreeOperands(I))
    return true;

  switch (I->getOpcode()) {
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    return optimizeShiftInst(cast<BinaryOperator>(I));
  case Instruction::Call:
    return optimizeCallInst(cast<CallInst>(I), ModifiedDT);
  case Instruction::Select:
    return optimizeSelectInst(cast<SelectInst>(I));
  case Instruction::ShuffleVector:
    return optimizeShuffleVectorInst(cast<ShuffleVectorInst>(I));
  case Instruction::Switch:
    return optimizeSwitchInst(cast<SwitchInst>(I));
  case Instruction::ExtractElement:
    return optimizeExtractElementInst(cast<ExtractElementInst>(I));
  default:
    break;
  }

  return false;
}
/// Given an OR instruction, check to see if this is a bitreverse idiom.
/// If so, replace its uses with the recognized replacement and return true.
///
/// Nothing is done unless \p I has an integer type for which
/// ISD::BITREVERSE is legal or custom on the target.
static bool makeBitReverse(Instruction &I, const DataLayout &DL,
                           const TargetLowering &TLI) {
  if (!I.getType()->isIntegerTy())
    return false;
  if (!TLI.isOperationLegalOrCustom(ISD::BITREVERSE,
                                    TLI.getValueType(DL, I.getType(), true)))
    return false;

  // Only the bitreverse form of the idiom is requested (bswap disabled).
  SmallVector<Instruction *, 4> NewInsts;
  if (!recognizeBSwapOrBitReverseIdiom(&I, false, true, NewInsts))
    return false;

  // The last inserted instruction produces the final value.
  I.replaceAllUsesWith(NewInsts.back());
  RecursivelyDeleteTriviallyDeadInstructions(&I);
  return true;
}
// In this pass we look for GEP and cast instructions that are used
// across basic blocks and rewrite them to improve basic-block-at-a-time
// selection.
//
// Runs optimizeInst on every instruction of BB, then (when TLI is available)
// repeatedly looks for bitreverse idioms, and finally tries
// dupRetToEnableTailCallOpts. Returns true as soon as ModifiedDT gets set
// while visiting the instructions.
bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool &ModifiedDT) {
  SunkAddrs.clear();
  bool MadeChange = false;

  CurInstIterator = BB.begin();
  while (CurInstIterator != BB.end()) {
    // Step the shared iterator before the call, as the original
    // post-increment did.
    Instruction *Cur = &*CurInstIterator;
    ++CurInstIterator;
    MadeChange |= optimizeInst(Cur, ModifiedDT);
    if (ModifiedDT)
      return true;
  }

  if (TLI) {
    // Restart the backward scan after every successful rewrite, until a full
    // scan finds nothing.
    bool MadeBitReverse;
    do {
      MadeBitReverse = false;
      for (auto &I : reverse(BB)) {
        if (!makeBitReverse(I, *DL, *TLI))
          continue;
        MadeBitReverse = true;
        MadeChange = true;
        ModifiedDT = true;
        break;
      }
    } while (MadeBitReverse);
  }

  MadeChange |= dupRetToEnableTailCallOpts(&BB, ModifiedDT);
  return MadeChange;
}
// If an llvm.dbg.value is far away from the value it describes, iSel may not
// be able to handle it properly: iSel drops an llvm.dbg.value when it cannot
// find a node corresponding to the value. Move such intrinsics right after
// the instruction defining the value (or, for a PHI, to the first insertion
// point of the PHI's block).
bool CodeGenPrepare::placeDbgValues(Function &F) {
  bool MadeChange = false;
  for (BasicBlock &BB : F) {
    Instruction *PrevNonDbgInst = nullptr;
    BasicBlock::iterator It = BB.begin();
    const BasicBlock::iterator End = BB.end();
    while (It != End) {
      // Advance first: the current instruction may be moved below.
      Instruction *Cur = &*It;
      ++It;

      auto *DVI = dyn_cast<DbgValueInst>(Cur);
      if (!DVI) {
        PrevNonDbgInst = Cur;
        continue;
      }

      // Leave dbg.values that refer to an alloca alone. These
      // intrinsics describe the address of a variable (= the alloca)
      // being taken. They should not be moved next to the alloca
      // (and to the beginning of the scope), but rather stay close to
      // where said address is used.
      Value *Described = DVI->getValue();
      if (Described && isa<AllocaInst>(Described)) {
        PrevNonDbgInst = Cur;
        continue;
      }

      // Nothing to do unless the value is defined by a non-terminator
      // instruction other than the last non-debug instruction seen.
      auto *VI = dyn_cast_or_null<Instruction>(Described);
      if (!VI || VI == PrevNonDbgInst || VI->isTerminator())
        continue;

      // If VI is a phi in a block with an EHPad terminator, we can't insert
      // after it.
      const bool DefIsPHI = isa<PHINode>(VI);
      if (DefIsPHI && VI->getParent()->getTerminator()->isEHPad())
        continue;

      LLVM_DEBUG(dbgs() << "Moving Debug Value before :\n"
                        << *DVI << ' ' << *VI);
      DVI->removeFromParent();
      if (DefIsPHI)
        DVI->insertBefore(&*VI->getParent()->getFirstInsertionPt());
      else
        DVI->insertAfter(VI);
      MadeChange = true;
      ++NumDbgValueMoved;
    }
  }
  return MadeChange;
}
/// Scale down both weights so that each of them fits into uint32_t.
///
/// Both weights are divided by the same factor, derived from the larger of
/// the two. The factor is kept in 64 bits: for very large inputs it exceeds
/// the uint32_t range, and truncating it would leave weights that still do
/// not fit.
static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) {
  const uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse;
  const uint64_t Scale = (NewMax / std::numeric_limits<uint32_t>::max()) + 1;
  NewTrue = NewTrue / Scale;
  NewFalse = NewFalse / Scale;
}
/// Some targets prefer to split a conditional branch like:
/// \code
///   %0 = icmp ne i32 %a, 0
///   %1 = icmp ne i32 %b, 0
///   %or.cond = or i1 %0, %1
///   br i1 %or.cond, label %TrueBB, label %FalseBB
/// \endcode
/// into multiple branch instructions like:
/// \code
///   bb1:
///     %0 = icmp ne i32 %a, 0
///     br i1 %0, label %TrueBB, label %bb2
///   bb2:
///     %1 = icmp ne i32 %b, 0
///     br i1 %1, label %TrueBB, label %FalseBB
/// \endcode
/// This usually allows instruction selection to do even further optimizations
/// and combine the compare with the branch instruction. Currently this is
/// applied for targets which have "cheap" jump instructions.
///
/// When the original branch carries profile weights, both resulting branches
/// receive the rescaled weights derived below.
///
/// \p ModifiedDT is set to true whenever a branch is split.
/// \return True if at least one branch was split.
///
/// FIXME: Remove the (equivalent?) implementation in SelectionDAG.
///
bool CodeGenPrepare::splitBranchCondition(Function &F, bool &ModifiedDT) {
  if (!TM || !TM->Options.EnableFastISel || !TLI || TLI->isJumpExpensive())
    return false;

  bool MadeChange = false;
  for (auto &BB : F) {
    // Does this BB end with the following?
    //   %cond1 = icmp|fcmp|binary instruction ...
    //   %cond2 = icmp|fcmp|binary instruction ...
    //   %cond.or = or|and i1 %cond1, cond2
    //   br i1 %cond.or label %dest1, label %dest2"
    BinaryOperator *LogicOp;
    BasicBlock *TBB, *FBB;
    if (!match(BB.getTerminator(), m_Br(m_OneUse(m_BinOp(LogicOp)), TBB, FBB)))
      continue;

    auto *Br1 = cast<BranchInst>(BB.getTerminator());
    if (Br1->getMetadata(LLVMContext::MD_unpredictable))
      continue;

    unsigned Opc;
    Value *Cond1, *Cond2;
    if (match(LogicOp, m_And(m_OneUse(m_Value(Cond1)),
                             m_OneUse(m_Value(Cond2)))))
      Opc = Instruction::And;
    else if (match(LogicOp, m_Or(m_OneUse(m_Value(Cond1)),
                                 m_OneUse(m_Value(Cond2)))))
      Opc = Instruction::Or;
    else
      continue;

    if (!match(Cond1, m_CombineOr(m_Cmp(), m_BinOp())) ||
        !match(Cond2, m_CombineOr(m_Cmp(), m_BinOp())))
      continue;

    LLVM_DEBUG(dbgs() << "Before branch condition splitting\n"; BB.dump());

    // Create a new BB.
    auto TmpBB =
        BasicBlock::Create(BB.getContext(), BB.getName() + ".cond.split",
                           BB.getParent(), BB.getNextNode());

    // Update original basic block by using the first condition directly by the
    // branch instruction and removing the no longer needed and/or instruction.
    Br1->setCondition(Cond1);
    LogicOp->eraseFromParent();

    // Depending on the condition we have to either replace the true or the
    // false successor of the original branch instruction.
    if (Opc == Instruction::And)
      Br1->setSuccessor(0, TmpBB);
    else
      Br1->setSuccessor(1, TmpBB);

    // Fill in the new basic block.
    auto *Br2 = IRBuilder<>(TmpBB).CreateCondBr(Cond2, TBB, FBB);
    if (auto *I = dyn_cast<Instruction>(Cond2)) {
      I->removeFromParent();
      I->insertBefore(Br2);
    }

    // Update PHI nodes in both successors. The original BB needs to be
    // replaced in one successor's PHI nodes, because the branch comes now from
    // the newly generated BB (TmpBB). In the other successor we need to add one
    // incoming edge to the PHI nodes, because both branch instructions target
    // now the same successor. Depending on the original branch condition
    // (and/or) we have to swap the successors (TrueDest, FalseDest), so that
    // we perform the correct update for the PHI nodes.
    // This doesn't change the successor order of the just created branch
    // instruction (or any other instruction).
    if (Opc == Instruction::Or)
      std::swap(TBB, FBB);

    // Replace the old BB with the new BB.
    TBB->replacePhiUsesWith(&BB, TmpBB);

    // Add another incoming edge from the new BB.
    for (PHINode &PN : FBB->phis()) {
      auto *Val = PN.getIncomingValueForBlock(&BB);
      PN.addIncoming(Val, TmpBB);
    }

    // Update the branch weights (from SelectionDAGBuilder::
    // FindMergedConditions).
    if (Opc == Instruction::Or) {
      // Codegen X | Y as:
      // BB1:
      //   jmp_if_X TBB
      //   jmp TmpBB
      // TmpBB:
      //   jmp_if_Y TBB
      //   jmp FBB
      //
      // We have flexibility in setting Prob for BB1 and Prob for TmpBB.
      // The requirement is that
      //   TrueProb for BB1 + (FalseProb for BB1 * TrueProb for TmpBB)
      //     = TrueProb for original BB.
      // Assuming the original weights are A and B, one choice is to set BB1's
      // weights to A and A+2B, and set TmpBB's weights to A and 2B. This choice
      // assumes that
      //   TrueProb for BB1 == FalseProb for BB1 * TrueProb for TmpBB.
      // Another choice is to assume TrueProb for BB1 equals to TrueProb for
      // TmpBB, but the math is more complicated.
      uint64_t TrueWeight, FalseWeight;
      if (Br1->extractProfMetadata(TrueWeight, FalseWeight)) {
        // BB1: A and A+2B, scaled down to fit the 32-bit metadata operands.
        uint64_t NewTrueWeight = TrueWeight;
        uint64_t NewFalseWeight = TrueWeight + 2 * FalseWeight;
        scaleWeights(NewTrueWeight, NewFalseWeight);
        Br1->setMetadata(LLVMContext::MD_prof,
                         MDBuilder(Br1->getContext())
                             .createBranchWeights(NewTrueWeight,
                                                  NewFalseWeight));

        // TmpBB: A and 2B.
        NewTrueWeight = TrueWeight;
        NewFalseWeight = 2 * FalseWeight;
        scaleWeights(NewTrueWeight, NewFalseWeight);
        Br2->setMetadata(LLVMContext::MD_prof,
                         MDBuilder(Br2->getContext())
                             .createBranchWeights(NewTrueWeight,
                                                  NewFalseWeight));
      }
    } else {
      // Codegen X & Y as:
      // BB1:
      //   jmp_if_X TmpBB
      //   jmp FBB
      // TmpBB:
      //   jmp_if_Y TBB
      //   jmp FBB
      //
      //  This requires creation of TmpBB after CurBB.
      //
      // We have flexibility in setting Prob for BB1 and Prob for TmpBB.
      // The requirement is that
      //   FalseProb for BB1 + (TrueProb for BB1 * FalseProb for TmpBB)
      //     = FalseProb for original BB.
      // Assuming the original weights are A and B, one choice is to set BB1's
      // weights to 2A+B and B, and set TmpBB's weights to 2A and B. This choice
      // assumes that
      //   FalseProb for BB1 == TrueProb for BB1 * FalseProb for TmpBB.
      uint64_t TrueWeight, FalseWeight;
      if (Br1->extractProfMetadata(TrueWeight, FalseWeight)) {
        // BB1: 2A+B and B, scaled down to fit the 32-bit metadata operands.
        uint64_t NewTrueWeight = 2 * TrueWeight + FalseWeight;
        uint64_t NewFalseWeight = FalseWeight;
        scaleWeights(NewTrueWeight, NewFalseWeight);
        Br1->setMetadata(LLVMContext::MD_prof,
                         MDBuilder(Br1->getContext())
                             .createBranchWeights(NewTrueWeight,
                                                  NewFalseWeight));

        // TmpBB: 2A and B.
        NewTrueWeight = 2 * TrueWeight;
        NewFalseWeight = FalseWeight;
        scaleWeights(NewTrueWeight, NewFalseWeight);
        Br2->setMetadata(LLVMContext::MD_prof,
                         MDBuilder(Br2->getContext())
                             .createBranchWeights(NewTrueWeight,
                                                  NewFalseWeight));
      }
    }

    ModifiedDT = true;
    MadeChange = true;

    LLVM_DEBUG(dbgs() << "After branch condition splitting\n"; BB.dump();
               TmpBB->dump());
  }
  return MadeChange;
}
Index: vendor/llvm/dist-release_90/lib/CodeGen/LiveDebugValues.cpp
===================================================================
--- vendor/llvm/dist-release_90/lib/CodeGen/LiveDebugValues.cpp (revision 351302)
+++ vendor/llvm/dist-release_90/lib/CodeGen/LiveDebugValues.cpp (revision 351303)
@@ -1,1316 +1,1329 @@
//===- LiveDebugValues.cpp - Tracking Debug Value MIs ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// This pass implements a data flow analysis that propagates debug location
/// information by inserting additional DBG_VALUE instructions into the machine
/// instruction stream. The pass internally builds debug location liveness
/// ranges to determine the points where additional DBG_VALUEs need to be
/// inserted.
///
/// This is a separate pass from DbgValueHistoryCalculator to facilitate
/// testing and improve modularity.
///
//===----------------------------------------------------------------------===//
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SparseBitVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/UniqueVector.h"
#include "llvm/CodeGen/LexicalScopes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <queue>
#include <tuple>
#include <utility>
#include <vector>
using namespace llvm;
#define DEBUG_TYPE "livedebugvalues"
STATISTIC(NumInserted, "Number of DBG_VALUE instructions inserted");
/// Return the register that describes the location of the DBG_VALUE \p MI.
///
/// When a variable location is described by a register (directly or
/// indirectly), that register is always operand 0. If operand 0 is not a
/// register operand, a default-constructed Register is returned instead.
static Register isDbgValueDescribedByReg(const MachineInstr &MI) {
  assert(MI.isDebugValue() && "expected a DBG_VALUE");
  assert(MI.getNumOperands() == 4 && "malformed DBG_VALUE");
  const MachineOperand &LocOp = MI.getOperand(0);
  if (!LocOp.isReg())
    return Register();
  return LocOp.getReg();
}
namespace {
class LiveDebugValues : public MachineFunctionPass {
private:
const TargetRegisterInfo *TRI;
const TargetInstrInfo *TII;
const TargetFrameLowering *TFI;
BitVector CalleeSavedRegs;
LexicalScopes LS;
enum struct TransferKind { TransferCopy, TransferSpill, TransferRestore };
/// Tracks the lexical scopes that belong to the source location of a user
/// value, so callers can ask whether that scope dominates a block.
class UserValueScopes {
  DebugLoc DL;
  LexicalScopes &LS;
  /// Block cache, filled by dominates() through
  /// LexicalScopes::getMachineBasicBlocks while it is still empty.
  SmallPtrSet<const MachineBasicBlock *, 4> LBlocks;

public:
  UserValueScopes(DebugLoc D, LexicalScopes &L) : DL(std::move(D)), LS(L) {}

  /// Return true if the current scope dominates at least one machine
  /// instruction in the given machine basic block.
  bool dominates(MachineBasicBlock *MBB) {
    // Populate the cache on demand.
    if (LBlocks.empty())
      LS.getMachineBasicBlocks(DL, LBlocks);
    // A cached block answers the query directly; otherwise defer to the
    // LexicalScopes dominance query.
    if (LBlocks.count(MBB) != 0)
      return true;
    return LS.dominates(DL, MBB);
  }
};
using FragmentInfo = DIExpression::FragmentInfo;
using OptFragmentInfo = Optional<DIExpression::FragmentInfo>;
/// Storage for identifying a potentially inlined instance of a variable,
/// or a fragment thereof.
class DebugVariable {
const DILocalVariable *Variable;
OptFragmentInfo Fragment;
const DILocation *InlinedAt;
/// Fragment that will overlap all other fragments. Used as default when
/// caller demands a fragment.
static const FragmentInfo DefaultFragment;
public:
DebugVariable(const DILocalVariable *Var, OptFragmentInfo &&FragmentInfo,
const DILocation *InlinedAt)
: Variable(Var), Fragment(FragmentInfo), InlinedAt(InlinedAt) {}
DebugVariable(const DILocalVariable *Var, OptFragmentInfo &FragmentInfo,
const DILocation *InlinedAt)
: Variable(Var), Fragment(FragmentInfo), InlinedAt(InlinedAt) {}
DebugVariable(const DILocalVariable *Var, const DIExpression *DIExpr,
const DILocation *InlinedAt)
: DebugVariable(Var, DIExpr->getFragmentInfo(), InlinedAt) {}
DebugVariable(const MachineInstr &MI)
: DebugVariable(MI.getDebugVariable(),
MI.getDebugExpression()->getFragmentInfo(),
MI.getDebugLoc()->getInlinedAt()) {}
const DILocalVariable *getVar() const { return Variable; }
const OptFragmentInfo &getFragment() const { return Fragment; }
const DILocation *getInlinedAt() const { return InlinedAt; }
const FragmentInfo getFragmentDefault() const {
return Fragment.getValueOr(DefaultFragment);
}
static bool isFragmentDefault(FragmentInfo &F) {
return F == DefaultFragment;
}
bool operator==(const DebugVariable &Other) const {
return std::tie(Variable, Fragment, InlinedAt) ==
std::tie(Other.Variable, Other.Fragment, Other.InlinedAt);
}
bool operator<(const DebugVariable &Other) const {
return std::tie(Variable, Fragment, InlinedAt) <
std::tie(Other.Variable, Other.Fragment, Other.InlinedAt);
}
};
friend struct llvm::DenseMapInfo<DebugVariable>;
/// A pair of debug variable and value location.
///
/// Equality and ordering are defined over (Var, Kind, Loc.Hash), so every
/// constructor must leave all 64 bits of Loc in a defined state.
struct VarLoc {
  // The location at which a spilled variable resides. It consists of a
  // register and an offset.
  struct SpillLoc {
    unsigned SpillBase;
    int SpillOffset;
    bool operator==(const SpillLoc &Other) const {
      return SpillBase == Other.SpillBase && SpillOffset == Other.SpillOffset;
    }
  };

  const DebugVariable Var;
  const MachineInstr &MI; ///< Only used for cloning a new DBG_VALUE.
  mutable UserValueScopes UVS;
  enum VarLocKind {
    InvalidKind = 0,
    RegisterKind,
    SpillLocKind,
    ImmediateKind,
    EntryValueKind
  } Kind = InvalidKind;

  /// The value location. Stored separately to avoid repeatedly
  /// extracting it from MI.
  union {
    uint64_t RegNo;
    SpillLoc SpillLocation;
    uint64_t Hash;
    int64_t Immediate;
    const ConstantFP *FPImm;
    const ConstantInt *CImm;
  } Loc;

  /// Construct a register, entry-value or immediate location from the
  /// DBG_VALUE \p MI. If operand 0 is none of those, Kind stays InvalidKind.
  /// \p K is accepted for interface compatibility and is not read.
  VarLoc(const MachineInstr &MI, LexicalScopes &LS,
         VarLocKind K = InvalidKind)
      : Var(MI), MI(MI), UVS(MI.getDebugLoc(), LS) {
    static_assert((sizeof(Loc) == sizeof(uint64_t)),
                  "hash does not cover all members of Loc");
    assert(MI.isDebugValue() && "not a DBG_VALUE");
    assert(MI.getNumOperands() == 4 && "malformed DBG_VALUE");
    // Zero the whole union up front. The comparison operators read Loc.Hash,
    // and without this the bits would be indeterminate whenever no branch
    // below fires (Kind stays InvalidKind) or a pointer member narrower than
    // 64 bits is the one written.
    Loc.Hash = 0;
    // Use an unsigned local so the register number is zero-extended, not
    // sign-extended, into the 64-bit RegNo field.
    if (unsigned Reg = isDbgValueDescribedByReg(MI)) {
      Kind = MI.isDebugEntryValue() ? EntryValueKind : RegisterKind;
      Loc.RegNo = Reg;
    } else if (MI.getOperand(0).isImm()) {
      Kind = ImmediateKind;
      Loc.Immediate = MI.getOperand(0).getImm();
    } else if (MI.getOperand(0).isFPImm()) {
      Kind = ImmediateKind;
      Loc.FPImm = MI.getOperand(0).getFPImm();
    } else if (MI.getOperand(0).isCImm()) {
      Kind = ImmediateKind;
      Loc.CImm = MI.getOperand(0).getCImm();
    }
    assert((Kind != ImmediateKind || !MI.isDebugEntryValue()) &&
           "entry values must be register locations");
  }

  /// The constructor for spill locations.
  VarLoc(const MachineInstr &MI, unsigned SpillBase, int SpillOffset,
         LexicalScopes &LS)
      : Var(MI), MI(MI), UVS(MI.getDebugLoc(), LS) {
    assert(MI.isDebugValue() && "not a DBG_VALUE");
    assert(MI.getNumOperands() == 4 && "malformed DBG_VALUE");
    Kind = SpillLocKind;
    Loc.SpillLocation = {SpillBase, SpillOffset};
  }

  // Is the Loc field a constant or constant object?
  bool isConstant() const { return Kind == ImmediateKind; }

  /// If this variable is described by a register, return it,
  /// otherwise return 0. Only RegisterKind counts; entry values and spill
  /// locations yield 0.
  unsigned isDescribedByReg() const {
    if (Kind == RegisterKind)
      return Loc.RegNo;
    return 0;
  }

  /// Determine whether the lexical scope of this value's debug location
  /// dominates MBB.
  bool dominates(MachineBasicBlock &MBB) const { return UVS.dominates(&MBB); }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  LLVM_DUMP_METHOD void dump() const { MI.dump(); }
#endif

  bool operator==(const VarLoc &Other) const {
    return Kind == Other.Kind && Var == Other.Var &&
           Loc.Hash == Other.Loc.Hash;
  }

  /// This operator guarantees that VarLocs are sorted by Variable first.
  bool operator<(const VarLoc &Other) const {
    return std::tie(Var, Kind, Loc.Hash) <
           std::tie(Other.Var, Other.Kind, Other.Loc.Hash);
  }
};
using DebugParamMap = SmallDenseMap<const DILocalVariable *, MachineInstr *>;
using VarLocMap = UniqueVector<VarLoc>;
using VarLocSet = SparseBitVector<>;
using VarLocInMBB = SmallDenseMap<const MachineBasicBlock *, VarLocSet>;
struct TransferDebugPair {
MachineInstr *TransferInst;
MachineInstr *DebugInst;
};
using TransferMap = SmallVector<TransferDebugPair, 4>;
// Types for recording sets of variable fragments that overlap. For a given
// local variable, we record all other fragments of that variable that could
// overlap it, to reduce search time.
using FragmentOfVar =
std::pair<const DILocalVariable *, DIExpression::FragmentInfo>;
using OverlapMap =
DenseMap<FragmentOfVar, SmallVector<DIExpression::FragmentInfo, 1>>;
// Helper while building OverlapMap, a map of all fragments seen for a given
// DILocalVariable.
using VarToFragments =
DenseMap<const DILocalVariable *, SmallSet<FragmentInfo, 4>>;
/// This holds the working set of currently open ranges. For fast
/// access, this is done both as a set of VarLocIDs, and a map of
/// DebugVariable to recent VarLocID. Note that a DBG_VALUE ends all
/// previous open ranges for the same variable.
class OpenRangesSet {
  VarLocSet VarLocs;
  SmallDenseMap<DebugVariable, unsigned, 8> Vars;
  OverlapMap &OverlappingFragments;

public:
  // The parameter is not spelled with a leading underscore followed by an
  // uppercase letter: such identifiers are reserved for the implementation.
  OpenRangesSet(OverlapMap &OLapMap) : OverlappingFragments(OLapMap) {}

  const VarLocSet &getVarLocs() const { return VarLocs; }

  /// Terminate all open ranges for Var by removing it from the set.
  void erase(DebugVariable Var);

  /// Terminate all open ranges listed in \c KillSet by removing
  /// them from the set. \p VarLocIDs maps each killed ID back to its
  /// variable so the matching entry in the variable map is dropped too.
  void erase(const VarLocSet &KillSet, const VarLocMap &VarLocIDs) {
    VarLocs.intersectWithComplement(KillSet);
    for (unsigned ID : KillSet)
      Vars.erase(VarLocIDs[ID].Var);
  }

  /// Insert a new range into the set.
  void insert(unsigned VarLocID, DebugVariable Var) {
    VarLocs.set(VarLocID);
    Vars.insert({Var, VarLocID});
  }

  /// Empty the set.
  void clear() {
    VarLocs.clear();
    Vars.clear();
  }

  /// Return whether the set is empty or not.
  bool empty() const {
    assert(Vars.empty() == VarLocs.empty() && "open ranges are inconsistent");
    return VarLocs.empty();
  }
};
bool isSpillInstruction(const MachineInstr &MI, MachineFunction *MF,
unsigned &Reg);
/// If a given instruction is identified as a restore, return the spill
/// location it reloads from and set \p Reg to the restored register.
Optional<VarLoc::SpillLoc> isRestoreInstruction(const MachineInstr &MI,
MachineFunction *MF,
unsigned &Reg);
/// Given a spill instruction, extract the register and offset used to
/// address the spill location in a target independent way.
VarLoc::SpillLoc extractSpillBaseRegAndOffset(const MachineInstr &MI);
void insertTransferDebugPair(MachineInstr &MI, OpenRangesSet &OpenRanges,
TransferMap &Transfers, VarLocMap &VarLocIDs,
unsigned OldVarID, TransferKind Kind,
unsigned NewReg = 0);
void transferDebugValue(const MachineInstr &MI, OpenRangesSet &OpenRanges,
VarLocMap &VarLocIDs);
void transferSpillOrRestoreInst(MachineInstr &MI, OpenRangesSet &OpenRanges,
VarLocMap &VarLocIDs, TransferMap &Transfers);
void emitEntryValues(MachineInstr &MI, OpenRangesSet &OpenRanges,
VarLocMap &VarLocIDs, TransferMap &Transfers,
DebugParamMap &DebugEntryVals,
SparseBitVector<> &KillSet);
void transferRegisterCopy(MachineInstr &MI, OpenRangesSet &OpenRanges,
VarLocMap &VarLocIDs, TransferMap &Transfers);
void transferRegisterDef(MachineInstr &MI, OpenRangesSet &OpenRanges,
VarLocMap &VarLocIDs, TransferMap &Transfers,
DebugParamMap &DebugEntryVals);
bool transferTerminatorInst(MachineInstr &MI, OpenRangesSet &OpenRanges,
VarLocInMBB &OutLocs, const VarLocMap &VarLocIDs);
bool process(MachineInstr &MI, OpenRangesSet &OpenRanges,
VarLocInMBB &OutLocs, VarLocMap &VarLocIDs,
TransferMap &Transfers, DebugParamMap &DebugEntryVals,
bool transferChanges, OverlapMap &OverlapFragments,
VarToFragments &SeenFragments);
void accumulateFragmentMap(MachineInstr &MI, VarToFragments &SeenFragments,
OverlapMap &OLapMap);
bool join(MachineBasicBlock &MBB, VarLocInMBB &OutLocs, VarLocInMBB &InLocs,
const VarLocMap &VarLocIDs,
SmallPtrSet<const MachineBasicBlock *, 16> &Visited,
SmallPtrSetImpl<const MachineBasicBlock *> &ArtificialBlocks);
bool ExtendRanges(MachineFunction &MF);
public:
static char ID;
/// Default construct and initialize the pass.
LiveDebugValues();
/// Tell the pass manager which passes we depend on and what
/// information we preserve.
void getAnalysisUsage(AnalysisUsage &AU) const override;
/// Report the machine-function properties this pass requires: only NoVRegs
/// is set.
MachineFunctionProperties getRequiredProperties() const override {
  MachineFunctionProperties Required;
  Required.set(MachineFunctionProperties::Property::NoVRegs);
  return Required;
}
/// Print to ostream with a message.
void printVarLocInMBB(const MachineFunction &MF, const VarLocInMBB &V,
const VarLocMap &VarLocIDs, const char *msg,
raw_ostream &Out) const;
/// Calculate the liveness information for the given machine function.
bool runOnMachineFunction(MachineFunction &MF) override;
};
} // end anonymous namespace
namespace llvm {
template <> struct DenseMapInfo<LiveDebugValues::DebugVariable> {
using DV = LiveDebugValues::DebugVariable;
using OptFragmentInfo = LiveDebugValues::OptFragmentInfo;
using FragmentInfo = LiveDebugValues::FragmentInfo;
// Empty key: no key should be generated that has no DILocalVariable.
static inline DV getEmptyKey() {
return DV(nullptr, OptFragmentInfo(), nullptr);
}
// Difference in tombstone is that the Optional is meaningful
static inline DV getTombstoneKey() {
return DV(nullptr, OptFragmentInfo({0, 0}), nullptr);
}
static unsigned getHashValue(const DV &D) {
unsigned HV = 0;
const OptFragmentInfo &Fragment = D.getFragment();
if (Fragment)
HV = DenseMapInfo<FragmentInfo>::getHashValue(*Fragment);
return hash_combine(D.getVar(), HV, D.getInlinedAt());
}
static bool isEqual(const DV &A, const DV &B) { return A == B; }
};
} // namespace llvm
//===----------------------------------------------------------------------===//
// Implementation
//===----------------------------------------------------------------------===//
// Out-of-line definition of the fragment that DebugVariable substitutes when
// no explicit fragment is set. It is aggregate-initialized with the largest
// and then the smallest uint64_t value.
// NOTE(review): which FragmentInfo fields these two values populate depends on
// the struct's declaration, which is not visible here (presumably size, then
// offset) -- confirm against DIExpression::FragmentInfo.
const DIExpression::FragmentInfo
LiveDebugValues::DebugVariable::DefaultFragment = {
std::numeric_limits<uint64_t>::max(),
std::numeric_limits<uint64_t>::min()};
// Storage for the pass ID, plus the llvm-namespace reference to it.
char LiveDebugValues::ID = 0;
char &llvm::LiveDebugValuesID = LiveDebugValues::ID;
// Register the pass under DEBUG_TYPE ("livedebugvalues"); both trailing flags
// are passed as false.
INITIALIZE_PASS(LiveDebugValues, DEBUG_TYPE, "Live DEBUG_VALUE analysis",
false, false)
/// Default construct and initialize the pass.
///
/// Forwards the pass ID to MachineFunctionPass and runs the pass initializer
/// against the global PassRegistry.
LiveDebugValues::LiveDebugValues() : MachineFunctionPass(ID) {
initializeLiveDebugValuesPass(*PassRegistry::getPassRegistry());
}
/// Tell the pass manager which passes we depend on and what information we
/// preserve.
///
/// \param AU Usage record to fill in. This pass marks the CFG as preserved
/// and then defers to MachineFunctionPass for the remaining entries.
void LiveDebugValues::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
/// Close the open range of \p Var and of every fragment of the same variable
/// that the pre-computed overlap map records as overlapping it.
void LiveDebugValues::OpenRangesSet::erase(DebugVariable Var) {
  // Close one variable/fragment entry if it is currently open.
  auto CloseRange = [this](DebugVariable Key) {
    auto Found = Vars.find(Key);
    if (Found == Vars.end())
      return;
    VarLocs.reset(Found->second);
    Vars.erase(Found);
  };

  // First the variable/fragment that ends here.
  CloseRange(Var);

  // An absent fragment is interpreted as one covering all possible bits.
  FragmentInfo Covered = Var.getFragmentDefault();

  // Then every fragment known to overlap it.
  auto Overlaps = OverlappingFragments.find({Var.getVar(), Covered});
  if (Overlaps == OverlappingFragments.end())
    return;

  for (auto Overlapped : Overlaps->second) {
    // The default fragment is stored in the map, but the corresponding
    // DebugVariable key carries no fragment at all.
    LiveDebugValues::OptFragmentInfo Holder;
    if (!DebugVariable::isFragmentDefault(Overlapped))
      Holder = LiveDebugValues::OptFragmentInfo(Overlapped);
    CloseRange({Var.getVar(), Holder, Var.getInlinedAt()});
  }
}
//===----------------------------------------------------------------------===//
// Debug Range Extension Implementation
//===----------------------------------------------------------------------===//
#ifndef NDEBUG
/// Print \p msg followed by, for every block of \p MF with a non-empty entry
/// in \p V, the block number and each variable location recorded for it.
void LiveDebugValues::printVarLocInMBB(const MachineFunction &MF,
                                       const VarLocInMBB &V,
                                       const VarLocMap &VarLocIDs,
                                       const char *msg,
                                       raw_ostream &Out) const {
  Out << '\n' << msg << '\n';
  for (const MachineBasicBlock &BB : MF) {
    const VarLocSet &Locs = V.lookup(&BB);
    if (!Locs.empty()) {
      Out << "MBB: " << BB.getNumber() << ":\n";
      for (unsigned LocID : Locs) {
        const VarLoc &VL = VarLocIDs[LocID];
        Out << " Var: " << VL.Var.getVar()->getName() << " MI: ";
        VL.dump();
      }
    }
  }
  Out << "\n";
}
#endif
/// Translate the single fixed-stack memory operand of \p MI into a base
/// register and offset, as reported by the target's frame lowering.
LiveDebugValues::VarLoc::SpillLoc
LiveDebugValues::extractSpillBaseRegAndOffset(const MachineInstr &MI) {
  assert(MI.hasOneMemOperand() &&
         "Spill instruction does not have exactly one memory operand?");
  const MachineMemOperand *MemOp = *MI.memoperands_begin();
  const PseudoSourceValue *PVal = MemOp->getPseudoValue();
  assert(PVal->kind() == PseudoSourceValue::FixedStack &&
         "Inconsistent memory operand in spill instruction");
  const int FrameIdx = cast<FixedStackPseudoSourceValue>(PVal)->getFrameIndex();

  // The base register is an out-parameter of getFrameIndexReference.
  unsigned BaseReg;
  const int Offset = TFI->getFrameIndexReference(*MI.getParent()->getParent(),
                                                 FrameIdx, BaseReg);
  return {BaseReg, Offset};
}
/// End all previous ranges related to @MI and start a new range from @MI
/// if it is a DBG_VALUE instr.
void LiveDebugValues::transferDebugValue(const MachineInstr &MI,
OpenRangesSet &OpenRanges,
VarLocMap &VarLocIDs) {
if (!MI.isDebugValue())
return;
const DILocalVariable *Var = MI.getDebugVariable();
const DIExpression *Expr = MI.getDebugExpression();
const DILocation *DebugLoc = MI.getDebugLoc();
const DILocation *InlinedAt = DebugLoc->getInlinedAt();
assert(Var->isValidLocationForIntrinsic(DebugLoc) &&
"Expected inlined-at fields to agree");
// End all previous ranges of Var.
DebugVariable V(Var, Expr, InlinedAt);
OpenRanges.erase(V);
// Add the VarLoc to OpenRanges from this DBG_VALUE.
unsigned ID;
if (isDbgValueDescribedByReg(MI) || MI.getOperand(0).isImm() ||
MI.getOperand(0).isFPImm() || MI.getOperand(0).isCImm()) {
// Use normal VarLoc constructor for registers and immediates.
VarLoc VL(MI, LS);
ID = VarLocIDs.insert(VL);
OpenRanges.insert(ID, VL.Var);
} else if (MI.hasOneMemOperand()) {
// It's a stack spill -- fetch spill base and offset.
VarLoc::SpillLoc SpillLocation = extractSpillBaseRegAndOffset(MI);
VarLoc VL(MI, SpillLocation.SpillBase, SpillLocation.SpillOffset, LS);
ID = VarLocIDs.insert(VL);
OpenRanges.insert(ID, VL.Var);
} else {
// This must be an undefined location. We should leave OpenRanges closed.
assert(MI.getOperand(0).isReg() && MI.getOperand(0).getReg() == 0 &&
"Unexpected non-undef DBG_VALUE encountered");
}
}
/// For every killed location in \p KillSet that belongs to a parameter with a
/// recorded DBG_VALUE in \p DebugEntryVals, build a new DBG_VALUE whose
/// expression is that parameter's expression with an entry-value operation
/// prepended, queue it in \p Transfers for insertion after \p MI, and open a
/// range for it.
///
/// \param MI The instruction the new DBG_VALUEs are paired with.
/// \param OpenRanges Working set that receives the new entry-value ranges.
/// \param VarLocIDs ID map; new locations are inserted into it.
/// \param Transfers Receives one (MI, new DBG_VALUE) pair per emitted value.
/// \param DebugEntryVals Map from parameter to its recorded DBG_VALUE.
/// \param KillSet IDs of the locations that were just terminated.
void LiveDebugValues::emitEntryValues(MachineInstr &MI,
                                      OpenRangesSet &OpenRanges,
                                      VarLocMap &VarLocIDs,
                                      TransferMap &Transfers,
                                      DebugParamMap &DebugEntryVals,
                                      SparseBitVector<> &KillSet) {
  MachineFunction *MF = MI.getParent()->getParent();
  for (unsigned ID : KillSet) {
    if (!VarLocIDs[ID].Var.getVar()->isParameter())
      continue;

    // VarLoc::MI is a reference member, so this pointer stays valid even if
    // the insert below grows VarLocIDs.
    const MachineInstr *CurrDebugInstr = &VarLocIDs[ID].MI;

    // If parameter's DBG_VALUE is not in the map that means we can't
    // generate parameter's entry value. A single find() replaces the earlier
    // count() + operator[] pair: one hash lookup, and no risk of operator[]
    // inserting a default entry.
    auto ParamIt = DebugEntryVals.find(CurrDebugInstr->getDebugVariable());
    if (ParamIt == DebugEntryVals.end())
      continue;
    MachineInstr *ParamDebugInstr = ParamIt->second;

    DIExpression *NewExpr = DIExpression::prepend(
        ParamDebugInstr->getDebugExpression(), DIExpression::EntryValue);
    MachineInstr *EntryValDbgMI =
        BuildMI(*MF, ParamDebugInstr->getDebugLoc(), ParamDebugInstr->getDesc(),
                ParamDebugInstr->isIndirectDebugValue(),
                ParamDebugInstr->getOperand(0).getReg(),
                ParamDebugInstr->getDebugVariable(), NewExpr);

    // Carry the immediate in operand 1 over for indirect values.
    if (ParamDebugInstr->isIndirectDebugValue())
      EntryValDbgMI->getOperand(1).setImm(
          ParamDebugInstr->getOperand(1).getImm());

    Transfers.push_back({&MI, EntryValDbgMI});
    VarLoc VL(*EntryValDbgMI, LS);
    unsigned EntryValLocID = VarLocIDs.insert(VL);
    OpenRanges.insert(EntryValLocID, VL.Var);
  }
}
/// Create new TransferDebugPair and insert it in \p Transfers. The VarLoc
/// with \p OldVarID should be deleted from \p OpenRanges and replaced with
/// new VarLoc. If \p NewReg is different than default zero value then the
/// new location will be register location created by the copy like instruction,
/// otherwise it is variable's location on the stack.
///
/// \param MI The copy/spill/restore instruction; the new DBG_VALUE is paired
/// with it so it can be inserted after it later.
/// \param OpenRanges Working set whose entry for the variable is replaced.
/// \param Transfers Receives the (MI, new DBG_VALUE) pair.
/// \param VarLocIDs ID map; the new location is inserted into it.
/// \param OldVarID ID of the location being superseded.
/// \param Kind Which of the three transfer flavours to build.
/// \param NewReg Destination register; must be non-zero for copy and restore.
void LiveDebugValues::insertTransferDebugPair(
MachineInstr &MI, OpenRangesSet &OpenRanges, TransferMap &Transfers,
VarLocMap &VarLocIDs, unsigned OldVarID, TransferKind Kind,
unsigned NewReg) {
// VarLoc::MI is a reference member, so this is the address of the original
// DBG_VALUE instruction, not of storage inside VarLocIDs.
const MachineInstr *DebugInstr = &VarLocIDs[OldVarID].MI;
MachineFunction *MF = MI.getParent()->getParent();
MachineInstr *NewDebugInstr;
// Shared tail of all three cases: register the new location, swap it into
// OpenRanges and record the transfer pair.
auto ProcessVarLoc = [&MI, &OpenRanges, &Transfers, &DebugInstr,
&VarLocIDs](VarLoc &VL, MachineInstr *NewDebugInstr) {
unsigned LocId = VarLocIDs.insert(VL);
// Close this variable's previous location range.
DebugVariable V(*DebugInstr);
OpenRanges.erase(V);
OpenRanges.insert(LocId, VL.Var);
// The newly created DBG_VALUE instruction NewDebugInstr must be inserted
// after MI. Keep track of the pairing.
TransferDebugPair MIP = {&MI, NewDebugInstr};
Transfers.push_back(MIP);
};
// End all previous ranges of Var.
OpenRanges.erase(VarLocIDs[OldVarID].Var);
switch (Kind) {
case TransferKind::TransferCopy: {
assert(NewReg &&
"No register supplied when handling a copy of a debug value");
// Create a DBG_VALUE instruction to describe the Var in its new
// register location.
NewDebugInstr = BuildMI(
*MF, DebugInstr->getDebugLoc(), DebugInstr->getDesc(),
DebugInstr->isIndirectDebugValue(), NewReg,
DebugInstr->getDebugVariable(), DebugInstr->getDebugExpression());
// Carry the immediate in operand 1 over for indirect values.
if (DebugInstr->isIndirectDebugValue())
NewDebugInstr->getOperand(1).setImm(DebugInstr->getOperand(1).getImm());
VarLoc VL(*NewDebugInstr, LS);
ProcessVarLoc(VL, NewDebugInstr);
LLVM_DEBUG(dbgs() << "Creating DBG_VALUE inst for register copy: ";
NewDebugInstr->print(dbgs(), /*IsStandalone*/false,
/*SkipOpers*/false, /*SkipDebugLoc*/false,
/*AddNewLine*/true, TII));
return;
}
case TransferKind::TransferSpill: {
// Create a DBG_VALUE instruction to describe the Var in its spilled
// location.
VarLoc::SpillLoc SpillLocation = extractSpillBaseRegAndOffset(MI);
auto *SpillExpr = DIExpression::prepend(DebugInstr->getDebugExpression(),
DIExpression::ApplyOffset,
SpillLocation.SpillOffset);
NewDebugInstr = BuildMI(
*MF, DebugInstr->getDebugLoc(), DebugInstr->getDesc(), true,
SpillLocation.SpillBase, DebugInstr->getDebugVariable(), SpillExpr);
VarLoc VL(*NewDebugInstr, SpillLocation.SpillBase,
SpillLocation.SpillOffset, LS);
ProcessVarLoc(VL, NewDebugInstr);
LLVM_DEBUG(dbgs() << "Creating DBG_VALUE inst for spill: ";
NewDebugInstr->print(dbgs(), /*IsStandalone*/false,
/*SkipOpers*/false, /*SkipDebugLoc*/false,
/*AddNewLine*/true, TII));
return;
}
// Restore: describe the variable in NewReg again. The expression is rebuilt
// from scratch; only the fragment of the old expression, if any, is kept.
case TransferKind::TransferRestore: {
assert(NewReg &&
"No register supplied when handling a restore of a debug value");
MachineFunction *MF = MI.getMF();
DIBuilder DIB(*const_cast<Function &>(MF->getFunction()).getParent());
+
+ const DIExpression *NewExpr;
+ if (auto Fragment = DebugInstr->getDebugExpression()->getFragmentInfo())
+ NewExpr = *DIExpression::createFragmentExpression(DIB.createExpression(),
+ Fragment->OffsetInBits, Fragment->SizeInBits);
+ else
+ NewExpr = DIB.createExpression();
+
NewDebugInstr =
BuildMI(*MF, DebugInstr->getDebugLoc(), DebugInstr->getDesc(), false,
- NewReg, DebugInstr->getDebugVariable(), DIB.createExpression());
+ NewReg, DebugInstr->getDebugVariable(), NewExpr);
VarLoc VL(*NewDebugInstr, LS);
ProcessVarLoc(VL, NewDebugInstr);
LLVM_DEBUG(dbgs() << "Creating DBG_VALUE inst for register restore: ";
NewDebugInstr->print(dbgs(), /*IsStandalone*/false,
/*SkipOpers*/false, /*SkipDebugLoc*/false,
/*AddNewLine*/true, TII));
return;
}
}
llvm_unreachable("Invalid transfer kind");
}
/// A register definition (or a register-mask clobber) ends the open range of
/// every variable currently described by an affected register. When debug
/// entry values are enabled, the killed locations are then handed to
/// emitEntryValues.
void LiveDebugValues::transferRegisterDef(
    MachineInstr &MI, OpenRangesSet &OpenRanges, VarLocMap &VarLocIDs,
    TransferMap &Transfers, DebugParamMap &DebugEntryVals) {
  MachineFunction *MF = MI.getMF();
  const TargetLowering *TLI = MF->getSubtarget().getTargetLowering();
  unsigned SP = TLI->getStackPointerRegisterToSaveRestore();

  SparseBitVector<> KillSet;
  for (const MachineOperand &MO : MI.operands()) {
    // Determine whether the operand is a register def. Assume that call
    // instructions never clobber SP, because some backends (e.g., AArch64)
    // never list SP in the regmask.
    const bool IsTrackedDef = MO.isReg() && MO.isDef() && MO.getReg() &&
                              TRI->isPhysicalRegister(MO.getReg()) &&
                              !(MI.isCall() && MO.getReg() == SP);
    if (IsTrackedDef) {
      // Remove ranges of all aliased registers.
      for (MCRegAliasIterator RAI(MO.getReg(), TRI, true); RAI.isValid();
           ++RAI) {
        for (unsigned ID : OpenRanges.getVarLocs()) {
          if (VarLocIDs[ID].isDescribedByReg() == *RAI)
            KillSet.set(ID);
        }
      }
    } else if (MO.isRegMask()) {
      // Remove ranges of all clobbered registers. Register masks don't usually
      // list SP as preserved. While the debug info may be off for an
      // instruction or two around callee-cleanup calls, transferring the
      // DEBUG_VALUE across the call is still a better user experience.
      for (unsigned ID : OpenRanges.getVarLocs()) {
        unsigned DescribingReg = VarLocIDs[ID].isDescribedByReg();
        if (!DescribingReg || DescribingReg == SP)
          continue;
        if (MO.clobbersPhysReg(DescribingReg))
          KillSet.set(ID);
      }
    }
  }
  OpenRanges.erase(KillSet, VarLocIDs);

  // Entry values are only emitted when a TargetPassConfig is available and
  // the target machine has the option switched on.
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return;
  auto &TM = TPC->getTM<TargetMachine>();
  if (TM.Options.EnableDebugEntryValues)
    emitEntryValues(MI, OpenRanges, VarLocIDs, Transfers, DebugEntryVals,
                    KillSet);
}
/// Decide if @MI is a spill instruction and return true if it is. We use 2
/// criteria to make this decision:
/// - Is this instruction a store to a spill slot?
/// - Is there a register operand that is both used and killed?
/// TODO: Store optimization can fold spills into other stores (including
/// other spills). We do not handle this yet (more than one memory operand).
///
/// \param MI Candidate instruction.
/// \param MF Unused here; kept for interface compatibility.
/// \param Reg Out-parameter. Overwritten for every operand examined: a use
/// register sets it to that register, any other operand resets it to 0. On a
/// true return it holds the spilled register.
/// \returns true if \p MI is recognized as a spill.
bool LiveDebugValues::isSpillInstruction(const MachineInstr &MI,
                                         MachineFunction *MF, unsigned &Reg) {
  // TODO: Handle multiple stores folded into one.
  if (!MI.hasOneMemOperand())
    return false;

  if (!MI.getSpillSize(TII) && !MI.getFoldedSpillSize(TII))
    return false; // This is not a spill instruction, since no valid size was
                  // returned from either function.

  // Reports whether MO is a killed register use, writing the register (or 0
  // for non-use operands) to Reg. The operand is taken by const reference to
  // avoid copying it, and nothing is captured because nothing is needed.
  auto isKilledReg = [](const MachineOperand &MO, unsigned &Reg) {
    if (!MO.isReg() || !MO.isUse()) {
      Reg = 0;
      return false;
    }
    Reg = MO.getReg();
    return MO.isKill();
  };

  for (const MachineOperand &MO : MI.operands()) {
    // In a spill instruction generated by the InlineSpiller the spilled
    // register has its kill flag set.
    if (isKilledReg(MO, Reg))
      return true;
    if (Reg != 0) {
      // Check whether next instruction kills the spilled register.
      // FIXME: Current solution does not cover search for killed register in
      // bundles and instructions further down the chain.
      auto NextI = std::next(MI.getIterator());
      // Skip next instruction that points to basic block end iterator.
      if (MI.getParent()->end() == NextI)
        continue;
      unsigned RegNext;
      for (const MachineOperand &MONext : NextI->operands()) {
        // Return true if we came across the register from the
        // previous spill instruction that is killed in NextI.
        if (isKilledReg(MONext, RegNext) && RegNext == Reg)
          return true;
      }
    }
  }
  // Return false if we didn't find spilled register.
  return false;
}
/// If \p MI has exactly one memory operand and reports a restore size, set
/// \p Reg to the register in operand 0 and return the stack location it is
/// reloaded from; otherwise return None. \p MF is not used.
Optional<LiveDebugValues::VarLoc::SpillLoc>
LiveDebugValues::isRestoreInstruction(const MachineInstr &MI,
                                      MachineFunction *MF, unsigned &Reg) {
  // FIXME: Handle folded restore instructions with more than one memory
  // operand.
  if (MI.hasOneMemOperand() && MI.getRestoreSize(TII)) {
    Reg = MI.getOperand(0).getReg();
    return extractSpillBaseRegAndOffset(MI);
  }
  return None;
}
/// A spilled register may indicate that we have to end the current range of
/// a variable and create a new one for the spill location.
/// A restored register may indicate the reverse situation.
/// We don't want to insert any instructions in process(), so we just create
/// the DBG_VALUE without inserting it and keep track of it in \p Transfers.
/// It will be inserted into the BB when we're done iterating over the
/// instructions.
///
/// \param MI Instruction to classify as spill, restore or neither.
/// \param OpenRanges Working set of open ranges; searched and updated.
/// \param VarLocIDs ID map for the locations in \p OpenRanges.
/// \param Transfers Receives the pending (MI, DBG_VALUE) pair, if any.
void LiveDebugValues::transferSpillOrRestoreInst(MachineInstr &MI,
OpenRangesSet &OpenRanges,
VarLocMap &VarLocIDs,
TransferMap &Transfers) {
MachineFunction *MF = MI.getMF();
TransferKind TKind;
unsigned Reg;
// Only assigned on the restore path below; it stays empty for spills.
Optional<VarLoc::SpillLoc> Loc;
LLVM_DEBUG(dbgs() << "Examining instruction: "; MI.dump(););
if (isSpillInstruction(MI, MF, Reg)) {
TKind = TransferKind::TransferSpill;
LLVM_DEBUG(dbgs() << "Recognized as spill: "; MI.dump(););
LLVM_DEBUG(dbgs() << "Register: " << Reg << " " << printReg(Reg, TRI)
<< "\n");
} else {
// Neither a spill nor a restore: nothing to transfer.
if (!(Loc = isRestoreInstruction(MI, MF, Reg)))
return;
TKind = TransferKind::TransferRestore;
LLVM_DEBUG(dbgs() << "Recognized as restore: "; MI.dump(););
LLVM_DEBUG(dbgs() << "Register: " << Reg << " " << printReg(Reg, TRI)
<< "\n");
}
// Check if the register or spill location is the location of a debug value.
+ // FIXME: Don't create a spill transfer if there is a complex expression,
+ // because we currently cannot recover the original expression on restore.
// Only the first matching open range is transferred; the loop returns
// right after it.
// NOTE(review): the restore branch compares Loc.SpillLocation without first
// checking that the VarLoc's Kind is SpillLocKind. Loc is a union, so a
// location of another kind could presumably compare equal by coincidence --
// confirm whether a Kind check is needed here.
for (unsigned ID : OpenRanges.getVarLocs()) {
+ const MachineInstr *DebugInstr = &VarLocIDs[ID].MI;
+
if (TKind == TransferKind::TransferSpill &&
- VarLocIDs[ID].isDescribedByReg() == Reg) {
+ VarLocIDs[ID].isDescribedByReg() == Reg &&
+ !DebugInstr->getDebugExpression()->isComplex()) {
LLVM_DEBUG(dbgs() << "Spilling Register " << printReg(Reg, TRI) << '('
<< VarLocIDs[ID].Var.getVar()->getName() << ")\n");
} else if (TKind == TransferKind::TransferRestore &&
VarLocIDs[ID].Loc.SpillLocation == *Loc) {
LLVM_DEBUG(dbgs() << "Restoring Register " << printReg(Reg, TRI) << '('
<< VarLocIDs[ID].Var.getVar()->getName() << ")\n");
} else
continue;
insertTransferDebugPair(MI, OpenRanges, Transfers, VarLocIDs, ID, TKind,
Reg);
return;
}
}
/// If \p MI copies a killed, tracked source register into a destination that
/// is (or aliases) a callee-saved register, queue a DBG_VALUE that describes
/// the variable in the destination register.
void LiveDebugValues::transferRegisterCopy(MachineInstr &MI,
                                           OpenRangesSet &OpenRanges,
                                           VarLocMap &VarLocIDs,
                                           TransferMap &Transfers) {
  const MachineOperand *SrcRegOp, *DestRegOp;
  if (!TII->isCopyInstr(MI, SrcRegOp, DestRegOp))
    return;
  if (!SrcRegOp->isKill() || !DestRegOp->isDef())
    return;

  unsigned SrcReg = SrcRegOp->getReg();
  unsigned DestReg = DestRegOp->getReg();

  // We want to recognize instructions where destination register is callee
  // saved register. If register that could be clobbered by the call is
  // included, there would be a great chance that it is going to be clobbered
  // soon. It is more likely that previous register location, which is callee
  // saved, is going to stay unclobbered longer, even if it is killed.
  bool DestIsCalleeSaved = false;
  for (MCRegAliasIterator RAI(DestReg, TRI, true); RAI.isValid(); ++RAI) {
    if (CalleeSavedRegs.test(*RAI)) {
      DestIsCalleeSaved = true;
      break;
    }
  }
  if (!DestIsCalleeSaved)
    return;

  // Transfer the first open range that is described by the source register.
  for (unsigned ID : OpenRanges.getVarLocs()) {
    if (VarLocIDs[ID].isDescribedByReg() != SrcReg)
      continue;
    insertTransferDebugPair(MI, OpenRanges, Transfers, VarLocIDs, ID,
                            TransferKind::TransferCopy, DestReg);
    return;
  }
}
/// At a terminator, or at the last instruction of its block, fold the open
/// ranges into the block's out-locations and clear the open set.
///
/// \returns true if the union with the open ranges changed the block's
/// out-locations.
bool LiveDebugValues::transferTerminatorInst(MachineInstr &MI,
                                             OpenRangesSet &OpenRanges,
                                             VarLocInMBB &OutLocs,
                                             const VarLocMap &VarLocIDs) {
  const MachineBasicBlock *CurMBB = MI.getParent();
  const bool EndsBlock = MI.isTerminator() || (&MI == &CurMBB->back());
  if (!EndsBlock)
    return false;

  if (OpenRanges.empty())
    return false;

  LLVM_DEBUG(for (unsigned ID
                  : OpenRanges.getVarLocs()) {
    // Copy OpenRanges to OutLocs, if not already present.
    dbgs() << "Add to OutLocs in MBB #" << CurMBB->getNumber() << ": ";
    VarLocIDs[ID].dump();
  });

  VarLocSet &BlockOut = OutLocs[CurMBB];
  const bool Changed = (BlockOut |= OpenRanges.getVarLocs());
  // New OutLocs set may be different due to spill, restore or register
  // copy instruction processing, so on a change it is replaced by the open
  // set rather than left as the union.
  if (Changed)
    BlockOut = OpenRanges.getVarLocs();
  OpenRanges.clear();
  return Changed;
}
/// Record, for the variable fragment used by \p MI, which other fragments of
/// the same DILocalVariable overlap it. Doing this up front turns the later
/// data-flow work from "find any overlapping fragments" into "check whether
/// the known-to-overlap fragments are present".
/// \param MI A previously unprocessed DEBUG_VALUE instruction to analyze for
/// fragment usage.
/// \param SeenFragments Map from DILocalVariable to all fragments of that
/// Variable which are known to exist.
/// \param OverlappingFragments The overlap map being constructed, from one
/// Var/Fragment pair to a vector of fragments known to overlap.
void LiveDebugValues::accumulateFragmentMap(MachineInstr &MI,
                                            VarToFragments &SeenFragments,
                                            OverlapMap &OverlappingFragments) {
  DebugVariable MIVar(MI);
  FragmentInfo ThisFragment = MIVar.getFragmentDefault();

  auto SeenIt = SeenFragments.find(MIVar.getVar());
  if (SeenIt == SeenFragments.end()) {
    // First sighting of this variable: nothing can overlap yet. Start its
    // set of seen fragments and give the fragment an empty overlap list.
    SmallSet<FragmentInfo, 4> Initial;
    Initial.insert(ThisFragment);
    SeenFragments.insert({MIVar.getVar(), Initial});
    OverlappingFragments.insert({{MIVar.getVar(), ThisFragment}, {}});
    return;
  }

  // A Variable/Fragment pair that is already in the overlap map has been
  // accounted for; the insert only succeeds for a new pair.
  auto Inserted =
      OverlappingFragments.insert({{MIVar.getVar(), ThisFragment}, {}});
  if (!Inserted.second)
    return;

  auto &OverlapsOfThis = Inserted.first->second;
  auto &KnownFragments = SeenIt->second;

  // Compare the new fragment against every fragment seen so far and record
  // each overlap in both directions.
  for (auto &Known : KnownFragments) {
    if (!DIExpression::fragmentsOverlap(ThisFragment, Known))
      continue;

    // The new fragment is overlapped by the known one...
    OverlapsOfThis.push_back(Known);

    // ...and the known one by the new fragment.
    auto OverlapsOfKnown =
        OverlappingFragments.find({MIVar.getVar(), Known});
    assert(OverlapsOfKnown != OverlappingFragments.end() &&
           "Previously seen var fragment has no vector of overlaps");
    OverlapsOfKnown->second.push_back(ThisFragment);
  }

  KnownFragments.insert(ThisFragment);
}
/// Apply the per-instruction transfer functions to \p MI, building up
/// OpenRanges and OutLocs.
/// \param transferChanges When true, register copies and spill/restore
/// instructions are examined as well. When false, DBG_VALUE instructions are
/// fed to accumulateFragmentMap() to build the fragment-overlap map instead.
/// \returns the result of transferTerminatorInst() for \p MI.
bool LiveDebugValues::process(MachineInstr &MI, OpenRangesSet &OpenRanges,
                              VarLocInMBB &OutLocs, VarLocMap &VarLocIDs,
                              TransferMap &Transfers,
                              DebugParamMap &DebugEntryVals,
                              bool transferChanges,
                              OverlapMap &OverlapFragments,
                              VarToFragments &SeenFragments) {
  transferDebugValue(MI, OpenRanges, VarLocIDs);
  transferRegisterDef(MI, OpenRanges, VarLocIDs, Transfers, DebugEntryVals);

  if (!transferChanges) {
    // First run through: collect the map of overlapping fragments.
    if (MI.isDebugValue())
      accumulateFragmentMap(MI, SeenFragments, OverlapFragments);
  } else {
    transferRegisterCopy(MI, OpenRanges, VarLocIDs, Transfers);
    transferSpillOrRestoreInst(MI, OpenRanges, VarLocIDs, Transfers);
  }

  return transferTerminatorInst(MI, OpenRanges, OutLocs, VarLocIDs);
}
/// Join the analysis results of all incoming edges of \p MBB: intersect the
/// OutLocs of every already-visited predecessor, drop the locations for which
/// VarLoc::dominates(MBB) is false (this filter is skipped for artificial
/// blocks), and insert a new DBG_VALUE instruction at the start of \p MBB for
/// each surviving location that is not yet recorded in InLocs[&MBB].
/// \returns true if at least one DBG_VALUE was inserted.
bool LiveDebugValues::join(
MachineBasicBlock &MBB, VarLocInMBB &OutLocs, VarLocInMBB &InLocs,
const VarLocMap &VarLocIDs,
SmallPtrSet<const MachineBasicBlock *, 16> &Visited,
SmallPtrSetImpl<const MachineBasicBlock *> &ArtificialBlocks) {
LLVM_DEBUG(dbgs() << "join MBB: " << MBB.getNumber() << "\n");
bool Changed = false;
VarLocSet InLocsT; // Temporary incoming locations.
// For all predecessors of this MBB, find the set of VarLocs that
// can be joined.
int NumVisited = 0;
for (auto p : MBB.predecessors()) {
// Ignore unvisited predecessor blocks. As we are processing
// the blocks in reverse post-order any unvisited block can
// be considered to not remove any incoming values.
if (!Visited.count(p)) {
LLVM_DEBUG(dbgs() << " ignoring unvisited pred MBB: " << p->getNumber()
<< "\n");
continue;
}
auto OL = OutLocs.find(p);
// Join is null in case of empty OutLocs from any of the pred.
// (Only visited predecessors reach this point; one without an OutLocs
// entry makes the whole intersection empty, so give up immediately.)
if (OL == OutLocs.end())
return false;
// Just copy over the Out locs to incoming locs for the first visited
// predecessor, and for all other predecessors join the Out locs.
if (!NumVisited)
InLocsT = OL->second;
else
InLocsT &= OL->second;
LLVM_DEBUG({
if (!InLocsT.empty()) {
for (auto ID : InLocsT)
dbgs() << " gathered candidate incoming var: "
<< VarLocIDs[ID].Var.getVar()->getName() << "\n";
}
});
NumVisited++;
}
// Filter out DBG_VALUES that are out of scope.
// Candidates are collected in KillSet first and removed in one step below,
// so InLocsT is not modified while it is being iterated.
VarLocSet KillSet;
bool IsArtificial = ArtificialBlocks.count(&MBB);
if (!IsArtificial) {
for (auto ID : InLocsT) {
if (!VarLocIDs[ID].dominates(MBB)) {
KillSet.set(ID);
LLVM_DEBUG({
auto Name = VarLocIDs[ID].Var.getVar()->getName();
dbgs() << " killing " << Name << ", it doesn't dominate MBB\n";
});
}
}
}
InLocsT.intersectWithComplement(KillSet);
// As we are processing blocks in reverse post-order we
// should have processed at least one predecessor, unless it
// is the entry block which has no predecessor.
assert((NumVisited || MBB.pred_empty()) &&
"Should have processed at least one predecessor");
if (InLocsT.empty())
return false;
VarLocSet &ILS = InLocs[&MBB];
// Insert DBG_VALUE instructions, if not already inserted.
// Diff = locations that survived the join but are not yet in InLocs[&MBB].
VarLocSet Diff = InLocsT;
Diff.intersectWithComplement(ILS);
for (auto ID : Diff) {
// This VarLoc is not found in InLocs i.e. it is not yet inserted. So, a
// new range is started for the var from the mbb's beginning by inserting
// a new DBG_VALUE. process() will end this range however appropriate.
const VarLoc &DiffIt = VarLocIDs[ID];
const MachineInstr *DebugInstr = &DiffIt.MI;
MachineInstr *MI = nullptr;
if (DiffIt.isConstant()) {
// Constant location: reuse operand 0 of the original instruction as-is.
MachineOperand MO(DebugInstr->getOperand(0));
MI = BuildMI(MBB, MBB.instr_begin(), DebugInstr->getDebugLoc(),
DebugInstr->getDesc(), false, MO,
DebugInstr->getDebugVariable(),
DebugInstr->getDebugExpression());
} else {
// Register location: rebuild from the register in operand 0, keeping the
// indirectness of the original instruction.
MI = BuildMI(MBB, MBB.instr_begin(), DebugInstr->getDebugLoc(),
DebugInstr->getDesc(), DebugInstr->isIndirectDebugValue(),
DebugInstr->getOperand(0).getReg(),
DebugInstr->getDebugVariable(),
DebugInstr->getDebugExpression());
// For indirect values, carry over the immediate held in operand 1.
if (DebugInstr->isIndirectDebugValue())
MI->getOperand(1).setImm(DebugInstr->getOperand(1).getImm());
}
LLVM_DEBUG(dbgs() << "Inserted: "; MI->dump(););
ILS.set(ID);
++NumInserted;
Changed = true;
}
return Changed;
}
/// Calculate the liveness information for the given machine function and
/// extend ranges across basic blocks.
///
/// Outline: (1) collect entry-value candidates from the first block, (2) run
/// process() over every instruction once to initialize OutLocs and the
/// fragment-overlap map, (3) record artificial blocks, (4) iterate join() and
/// process() over the blocks in reverse post-order with two priority-queue
/// worklists until both are empty.
/// \returns true if any join() call inserted a DBG_VALUE.
bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "\nDebug Range Extension\n");
bool Changed = false;
bool OLChanged = false;
bool MBBJoined = false;
VarLocMap VarLocIDs; // Map VarLoc<>unique ID for use in bitvectors.
OverlapMap OverlapFragments; // Map of overlapping variable fragments
OpenRangesSet OpenRanges(OverlapFragments);
// Ranges that are open until end of bb.
VarLocInMBB OutLocs; // Ranges that exist beyond bb.
VarLocInMBB InLocs; // Ranges that are incoming after joining.
TransferMap Transfers; // DBG_VALUEs associated with spills.
VarToFragments SeenFragments;
// Blocks which are artificial, i.e. blocks which exclusively contain
// instructions without locations, or with line 0 locations.
SmallPtrSet<const MachineBasicBlock *, 16> ArtificialBlocks;
// Bidirectional mapping between a block and its reverse post-order number.
DenseMap<unsigned int, MachineBasicBlock *> OrderToBB;
DenseMap<MachineBasicBlock *, unsigned int> BBToOrder;
// Min-heaps of RPO numbers: blocks are always popped in RPO order.
std::priority_queue<unsigned int, std::vector<unsigned int>,
std::greater<unsigned int>>
Worklist;
std::priority_queue<unsigned int, std::vector<unsigned int>,
std::greater<unsigned int>>
Pending;
// Named values for the transferChanges argument of process().
enum : bool { dontTransferChanges = false, transferChanges = true };
// Besides parameter's modification, check whether a DBG_VALUE is inlined
// in order to deduce whether the variable that it tracks comes from
// a different function. If that is the case we can't track its entry value.
auto IsUnmodifiedFuncParam = [&](const MachineInstr &MI) {
auto *DIVar = MI.getDebugVariable();
return DIVar->isParameter() && DIVar->isNotModified() &&
!MI.getDebugLoc()->getInlinedAt();
};
const TargetLowering *TLI = MF.getSubtarget().getTargetLowering();
unsigned SP = TLI->getStackPointerRegisterToSaveRestore();
unsigned FP = TRI->getFrameRegister(MF);
// True for a register operand that is neither the stack pointer nor the
// frame register.
auto IsRegOtherThanSPAndFP = [&](const MachineOperand &Op) -> bool {
return Op.isReg() && Op.getReg() != SP && Op.getReg() != FP;
};
// Working set of currently collected debug variables mapped to DBG_VALUEs
// representing candidates for production of debug entry values.
DebugParamMap DebugEntryVals;
MachineBasicBlock &First_MBB = *(MF.begin());
// Only in the case of entry MBB collect DBG_VALUEs representing
// function parameters in order to generate debug entry values for them.
// Currently, we generate debug entry values only for parameters that are
// unmodified throughout the function and located in a register.
// TODO: Add support for parameters that are described as fragments.
// TODO: Add support for modified arguments that can be expressed
// by using its entry value.
// TODO: Add support for local variables that are expressed in terms of
// parameters entry values.
// Only the first qualifying DBG_VALUE per variable is recorded (see the
// DebugEntryVals.count() check).
for (auto &MI : First_MBB)
if (MI.isDebugValue() && IsUnmodifiedFuncParam(MI) &&
!MI.isIndirectDebugValue() && IsRegOtherThanSPAndFP(MI.getOperand(0)) &&
!DebugEntryVals.count(MI.getDebugVariable()) &&
!MI.getDebugExpression()->isFragment())
DebugEntryVals[MI.getDebugVariable()] = &MI;
// Initialize every mbb with OutLocs.
// We are not looking at any spill instructions during the initial pass
// over the BBs. The LiveDebugVariables pass has already created DBG_VALUE
// instructions for spills of registers that are known to be user variables
// within the BB in which the spill occurs.
for (auto &MBB : MF) {
for (auto &MI : MBB) {
process(MI, OpenRanges, OutLocs, VarLocIDs, Transfers, DebugEntryVals,
dontTransferChanges, OverlapFragments, SeenFragments);
}
// Add any entry DBG_VALUE instructions necessitated by parameter
// clobbering.
// Insertion is deferred until the instruction walk above has finished.
for (auto &TR : Transfers) {
MBB.insertAfter(MachineBasicBlock::iterator(*TR.TransferInst),
TR.DebugInst);
}
Transfers.clear();
}
// An instruction has a non-artificial location when it carries a DebugLoc
// whose line is not 0.
auto hasNonArtificialLocation = [](const MachineInstr &MI) -> bool {
if (const DebugLoc &DL = MI.getDebugLoc())
return DL.getLine() != 0;
return false;
};
for (auto &MBB : MF)
if (none_of(MBB.instrs(), hasNonArtificialLocation))
ArtificialBlocks.insert(&MBB);
LLVM_DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs,
"OutLocs after initialization", dbgs()));
// Number the blocks in reverse post-order and seed the worklist with all of
// them.
ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
unsigned int RPONumber = 0;
for (auto RI = RPOT.begin(), RE = RPOT.end(); RI != RE; ++RI) {
OrderToBB[RPONumber] = *RI;
BBToOrder[*RI] = RPONumber;
Worklist.push(RPONumber);
++RPONumber;
}
// This is a standard "union of predecessor outs" dataflow problem.
// NOTE(review): join() combines the predecessors' OutLocs with set
// intersection (InLocsT &= ...), so "union" above looks inaccurate; confirm.
// To solve it, we perform join() and process() using the two worklist method
// until the ranges converge.
// Ranges have converged when both worklists are empty.
SmallPtrSet<const MachineBasicBlock *, 16> Visited;
while (!Worklist.empty() || !Pending.empty()) {
// We track what is on the pending worklist to avoid inserting the same
// thing twice. We could avoid this with a custom priority queue, but this
// is probably not worth it.
SmallPtrSet<MachineBasicBlock *, 16> OnPending;
LLVM_DEBUG(dbgs() << "Processing Worklist\n");
while (!Worklist.empty()) {
MachineBasicBlock *MBB = OrderToBB[Worklist.top()];
Worklist.pop();
MBBJoined =
join(*MBB, OutLocs, InLocs, VarLocIDs, Visited, ArtificialBlocks);
// Mark the block visited only after join(), which consults Visited for
// the block's predecessors.
Visited.insert(MBB);
if (MBBJoined) {
MBBJoined = false;
Changed = true;
// Now that we have started to extend ranges across BBs we need to
// examine spill instructions to see whether they spill registers that
// correspond to user variables.
for (auto &MI : *MBB)
OLChanged |=
process(MI, OpenRanges, OutLocs, VarLocIDs, Transfers,
DebugEntryVals, transferChanges, OverlapFragments,
SeenFragments);
// Add any DBG_VALUE instructions necessitated by spills.
for (auto &TR : Transfers)
MBB->insertAfter(MachineBasicBlock::iterator(*TR.TransferInst),
TR.DebugInst);
Transfers.clear();
LLVM_DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs,
"OutLocs after propagating", dbgs()));
LLVM_DEBUG(printVarLocInMBB(MF, InLocs, VarLocIDs,
"InLocs after propagating", dbgs()));
// If this block's OutLocs changed, its successors must be re-joined in
// the next round.
if (OLChanged) {
OLChanged = false;
for (auto s : MBB->successors())
if (OnPending.insert(s).second) {
Pending.push(BBToOrder[s]);
}
}
}
}
Worklist.swap(Pending);
// At this point, pending must be empty, since it was just the empty
// worklist
assert(Pending.empty() && "Pending should be empty");
}
LLVM_DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs, "Final OutLocs", dbgs()));
LLVM_DEBUG(printVarLocInMBB(MF, InLocs, VarLocIDs, "Final InLocs", dbgs()));
return Changed;
}
/// Pass entry point. Returns false without touching \p MF when the function
/// has no subprogram or its compile unit is marked NoDebug. Otherwise caches
/// the subtarget's register, instruction and frame-lowering info, computes
/// CalleeSavedRegs, initializes LS and runs ExtendRanges().
/// \returns the result of ExtendRanges(MF).
bool LiveDebugValues::runOnMachineFunction(MachineFunction &MF) {
  auto *Subprogram = MF.getFunction().getSubprogram();

  // No subprogram: per the original note, all DBG_VALUEs have already been
  // removed. NOTE(review): the original comment names "LiveDebugValues" (this
  // pass) as the remover; presumably LiveDebugVariables was meant - confirm.
  if (!Subprogram)
    return false;

  // Skip functions from NoDebug compilation units.
  if (Subprogram->getUnit()->getEmissionKind() == DICompileUnit::NoDebug)
    return false;

  const auto &Subtarget = MF.getSubtarget();
  TRI = Subtarget.getRegisterInfo();
  TII = Subtarget.getInstrInfo();
  TFI = Subtarget.getFrameLowering();

  // The scavenger is a temporary that lives until the end of this statement,
  // i.e. for the whole determineCalleeSaves() call.
  TFI->determineCalleeSaves(MF, CalleeSavedRegs,
                            make_unique<RegScavenger>().get());
  LS.initialize(MF);

  return ExtendRanges(MF);
}
Index: vendor/llvm/dist-release_90/lib/CodeGen/MachineCSE.cpp
===================================================================
--- vendor/llvm/dist-release_90/lib/CodeGen/MachineCSE.cpp (revision 351302)
+++ vendor/llvm/dist-release_90/lib/CodeGen/MachineCSE.cpp (revision 351303)
@@ -1,871 +1,896 @@
//===- MachineCSE.cpp - Machine Common Subexpression Elimination Pass -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass performs global common subexpression elimination on machine
// instructions using a scoped hash table based value numbering scheme. It
// must be run while the machine function is still in SSA form.
//
//===----------------------------------------------------------------------===//
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/ScopedHashTable.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/RecyclingAllocator.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <iterator>
#include <utility>
#include <vector>
using namespace llvm;
#define DEBUG_TYPE "machine-cse"
// Counters for this pass, declared with the STATISTIC macro. Of these, the
// code visible nearby increments NumCoalesces in
// MachineCSE::PerformTrivialCopyPropagation each time a single-use COPY is
// erased.
STATISTIC(NumCoalesces, "Number of copies coalesced");
STATISTIC(NumCSEs, "Number of common subexpression eliminated");
STATISTIC(NumPREs, "Number of partial redundant expression"
" transformed to fully redundant");
STATISTIC(NumPhysCSEs,
"Number of physreg referencing common subexpr eliminated");
STATISTIC(NumCrossBBCSEs,
"Number of cross-MBB physreg referencing CS eliminated");
STATISTIC(NumCommutes, "Number of copies coalesced after commuting");
namespace {
/// Machine-level common subexpression elimination pass (see the file header:
/// global CSE on machine instructions using scoped-hash-table value
/// numbering, run while the function is still in SSA form).
class MachineCSE : public MachineFunctionPass {
// Target hooks and analyses used by the member functions below. They are not
// assigned anywhere in this class body.
const TargetInstrInfo *TII;
const TargetRegisterInfo *TRI;
AliasAnalysis *AA;
MachineDominatorTree *DT;
MachineRegisterInfo *MRI;
+ MachineBlockFrequencyInfo *MBFI;
public:
static char ID; // Pass identification
MachineCSE() : MachineFunctionPass(ID) {
initializeMachineCSEPass(*PassRegistry::getPassRegistry());
}
bool runOnMachineFunction(MachineFunction &MF) override;
/// Declare the analyses this pass requires and preserves. The CFG is
/// preserved; alias analysis, the machine dominator tree and machine block
/// frequency info are required; the dominator tree, block frequency info and
/// MachineLoopInfo are preserved.
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
AU.addRequired<AAResultsWrapperPass>();
AU.addPreservedID(MachineLoopInfoID);
AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addPreserved<MachineBlockFrequencyInfo>();
}
/// Drop the contents of ScopeMap, PREMap and Exps.
void releaseMemory() override {
ScopeMap.clear();
PREMap.clear();
Exps.clear();
}
private:
using AllocatorTy = RecyclingAllocator<BumpPtrAllocator,
ScopedHashTableVal<MachineInstr *, unsigned>>;
// Scoped hash table mapping an instruction (hashed/compared through
// MachineInstrExpressionTrait) to its value number.
using ScopedHTType =
ScopedHashTable<MachineInstr *, unsigned, MachineInstrExpressionTrait,
AllocatorTy>;
using ScopeType = ScopedHTType::ScopeTy;
// (operand index, physical register) pairs; see hasLivePhysRegDefUses().
using PhysDefVector = SmallVector<std::pair<unsigned, unsigned>, 2>;
// Upper bound on the number of instructions examined by the forward scans in
// isPhysDefTriviallyDead() and PhysRegDefsReach().
unsigned LookAheadLimit = 0;
// Scope object per basic block; allocated in EnterScope() and deleted in
// ExitScope().
DenseMap<MachineBasicBlock *, ScopeType *> ScopeMap;
DenseMap<MachineInstr *, MachineBasicBlock *, MachineInstrExpressionTrait>
PREMap;
// Value-number table.
ScopedHTType VNT;
// Instructions indexed by value number: ProcessBlockCSE() inserts
// (MI, CurrVN++) into VNT and appends MI here in the same step.
SmallVector<MachineInstr *, 64> Exps;
// Next value number to hand out.
unsigned CurrVN = 0;
bool PerformTrivialCopyPropagation(MachineInstr *MI,
MachineBasicBlock *MBB);
bool isPhysDefTriviallyDead(unsigned Reg,
MachineBasicBlock::const_iterator I,
MachineBasicBlock::const_iterator E) const;
bool hasLivePhysRegDefUses(const MachineInstr *MI,
const MachineBasicBlock *MBB,
SmallSet<unsigned, 8> &PhysRefs,
PhysDefVector &PhysDefs, bool &PhysUseDef) const;
bool PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI,
SmallSet<unsigned, 8> &PhysRefs,
PhysDefVector &PhysDefs, bool &NonLocal) const;
bool isCSECandidate(MachineInstr *MI);
bool isProfitableToCSE(unsigned CSReg, unsigned Reg,
MachineBasicBlock *CSBB, MachineInstr *MI);
void EnterScope(MachineBasicBlock *MBB);
void ExitScope(MachineBasicBlock *MBB);
bool ProcessBlockCSE(MachineBasicBlock *MBB);
void ExitScopeIfDone(MachineDomTreeNode *Node,
DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren);
bool PerformCSE(MachineDomTreeNode *Node);
bool isPRECandidate(MachineInstr *MI);
bool ProcessBlockPRE(MachineDominatorTree *MDT, MachineBasicBlock *MBB);
bool PerformSimplePRE(MachineDominatorTree *DT);
+ /// Heuristics to see if it's beneficial to move common computations of MBB
+ /// and MBB1 to CandidateBB.
+ bool isBeneficalToHoistInto(MachineBasicBlock *CandidateBB,
+ MachineBasicBlock *MBB,
+ MachineBasicBlock *MBB1);
};
} // end anonymous namespace
// Pass identification and registration. MachineCSEID exposes the pass ID to
// the rest of LLVM.
char MachineCSE::ID = 0;
char &llvm::MachineCSEID = MachineCSE::ID;
// NOTE(review): getAnalysisUsage() also requires MachineBlockFrequencyInfo,
// but no INITIALIZE_PASS_DEPENDENCY entry is listed for it below; confirm
// whether one is needed.
INITIALIZE_PASS_BEGIN(MachineCSE, DEBUG_TYPE,
"Machine Common Subexpression Elimination", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(MachineCSE, DEBUG_TYPE,
"Machine Common Subexpression Elimination", false, false)
/// Forward the source register of a COPY into \p MI wherever \p MI uses the
/// COPY's result. Doing so makes it more likely that \p MI matches an
/// existing expression. When \p MI was the only non-debug user of the COPY's
/// result, the COPY itself is erased.
/// \returns true if at least one operand of \p MI was rewritten.
bool MachineCSE::PerformTrivialCopyPropagation(MachineInstr *MI,
                                               MachineBasicBlock *MBB) {
  bool AnyPropagated = false;

  for (MachineOperand &Use : MI->operands()) {
    // Only virtual-register uses are of interest.
    if (!(Use.isReg() && Use.isUse()))
      continue;
    unsigned CopyDstReg = Use.getReg();
    if (!TargetRegisterInfo::isVirtualRegister(CopyDstReg))
      continue;

    // Sample the use count now, before the operand is rewritten below.
    bool SoleUser = MRI->hasOneNonDBGUse(CopyDstReg);

    MachineInstr *CopyMI = MRI->getVRegDef(CopyDstReg);
    if (!CopyMI->isCopy())
      continue;

    unsigned CopySrcReg = CopyMI->getOperand(1).getReg();
    if (!TargetRegisterInfo::isVirtualRegister(CopySrcReg))
      continue;

    // Sub-register definitions are not handled.
    if (CopyMI->getOperand(0).getSubReg())
      continue;

    // FIXME: We should trivially coalesce subregister copies to expose CSE
    // opportunities on instructions with truncated operands (see
    // cse-add-with-overflow.ll). This can be done here as follows:
    // if (SrcSubReg)
    //   RC = TRI->getMatchingSuperRegClass(MRI->getRegClass(SrcReg), RC,
    //                                      SrcSubReg);
    // MO.substVirtReg(SrcReg, SrcSubReg, *TRI);
    //
    // The 2-addr pass has been updated to handle coalesced subregs. However,
    // some machine-specific code still can't handle it.
    // To handle it properly we also need a way find a constrained subregister
    // class given a super-reg class and subreg index.
    if (CopyMI->getOperand(1).getSubReg())
      continue;

    // The source must be able to take on the destination's register
    // attributes; otherwise leave this operand alone.
    if (!MRI->constrainRegAttrs(CopySrcReg, CopyDstReg))
      continue;

    LLVM_DEBUG(dbgs() << "Coalescing: " << *CopyMI);
    LLVM_DEBUG(dbgs() << "*** to: " << *MI);

    // Keep debug values that referred to the COPY's result in sync.
    CopyMI->changeDebugValuesDefReg(CopySrcReg);

    // Rewrite the use and drop kill flags on the source register.
    Use.setReg(CopySrcReg);
    MRI->clearKillFlags(CopySrcReg);

    // A COPY whose only user was MI is now dead; remove it.
    if (SoleUser) {
      CopyMI->eraseFromParent();
      ++NumCoalesces;
    }
    AnyPropagated = true;
  }

  return AnyPropagated;
}
bool
MachineCSE::isPhysDefTriviallyDead(unsigned Reg,
MachineBasicBlock::const_iterator I,
MachineBasicBlock::const_iterator E) const {
unsigned LookAheadLeft = LookAheadLimit;
while (LookAheadLeft) {
// Skip over dbg_value's.
I = skipDebugInstructionsForward(I, E);
if (I == E)
// Reached end of block, we don't know if register is dead or not.
return false;
bool SeenDef = false;
for (const MachineOperand &MO : I->operands()) {
if (MO.isRegMask() && MO.clobbersPhysReg(Reg))
SeenDef = true;
if (!MO.isReg() || !MO.getReg())
continue;
if (!TRI->regsOverlap(MO.getReg(), Reg))
continue;
if (MO.isUse())
// Found a use!
return false;
SeenDef = true;
}
if (SeenDef)
// See a def of Reg (or an alias) before encountering any use, it's
// trivially dead.
return true;
--LookAheadLeft;
++I;
}
return false;
}
static bool isCallerPreservedOrConstPhysReg(unsigned Reg,
const MachineFunction &MF,
const TargetRegisterInfo &TRI) {
// MachineRegisterInfo::isConstantPhysReg directly called by
// MachineRegisterInfo::isCallerPreservedOrConstPhysReg expects the
// reserved registers to be frozen. That doesn't cause a problem post-ISel as
// most (if not all) targets freeze reserved registers right after ISel.
//
// It does cause issues mid-GlobalISel, however, hence the additional
// reservedRegsFrozen check.
const MachineRegisterInfo &MRI = MF.getRegInfo();
return TRI.isCallerPreservedPhysReg(Reg, MF) ||
(MRI.reservedRegsFrozen() && MRI.isConstantPhysReg(Reg));
}
/// hasLivePhysRegDefUses - Return true if the specified instruction read/write
/// physical registers (except for dead defs of physical registers). It also
/// returns the physical register def by reference if it's the only one and the
/// instruction does not uses a physical register.
bool MachineCSE::hasLivePhysRegDefUses(const MachineInstr *MI,
const MachineBasicBlock *MBB,
SmallSet<unsigned, 8> &PhysRefs,
PhysDefVector &PhysDefs,
bool &PhysUseDef) const {
// First, add all uses to PhysRefs.
for (const MachineOperand &MO : MI->operands()) {
if (!MO.isReg() || MO.isDef())
continue;
unsigned Reg = MO.getReg();
if (!Reg)
continue;
if (TargetRegisterInfo::isVirtualRegister(Reg))
continue;
// Reading either caller preserved or constant physregs is ok.
if (!isCallerPreservedOrConstPhysReg(Reg, *MI->getMF(), *TRI))
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
PhysRefs.insert(*AI);
}
// Next, collect all defs into PhysDefs. If any is already in PhysRefs
// (which currently contains only uses), set the PhysUseDef flag.
PhysUseDef = false;
MachineBasicBlock::const_iterator I = MI; I = std::next(I);
for (const auto &MOP : llvm::enumerate(MI->operands())) {
const MachineOperand &MO = MOP.value();
if (!MO.isReg() || !MO.isDef())
continue;
unsigned Reg = MO.getReg();
if (!Reg)
continue;
if (TargetRegisterInfo::isVirtualRegister(Reg))
continue;
// Check against PhysRefs even if the def is "dead".
if (PhysRefs.count(Reg))
PhysUseDef = true;
// If the def is dead, it's ok. But the def may not marked "dead". That's
// common since this pass is run before livevariables. We can scan
// forward a few instructions and check if it is obviously dead.
if (!MO.isDead() && !isPhysDefTriviallyDead(Reg, I, MBB->end()))
PhysDefs.push_back(std::make_pair(MOP.index(), Reg));
}
// Finally, add all defs to PhysRefs as well.
for (unsigned i = 0, e = PhysDefs.size(); i != e; ++i)
for (MCRegAliasIterator AI(PhysDefs[i].second, TRI, true); AI.isValid();
++AI)
PhysRefs.insert(*AI);
return !PhysRefs.empty();
}
/// Decide whether the physical registers referenced by \p CSMI are still
/// intact at \p MI. Scans forward from the instruction after \p CSMI and
/// returns true once \p MI is reached, provided no instruction in between
/// carries a regmask operand or defines a physical register in \p PhysRefs,
/// and the scan stays within LookAheadLimit non-debug instructions.
/// \param PhysDefs Physical register defs of \p MI; in the cross-block case
/// any allocatable or reserved register here makes the function return false.
/// \param NonLocal Set to true when the scan crosses from \p CSMI's block
/// into \p MI's block; left untouched otherwise.
bool MachineCSE::PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI,
SmallSet<unsigned, 8> &PhysRefs,
PhysDefVector &PhysDefs,
bool &NonLocal) const {
// For now conservatively returns false if the common subexpression is
// not in the same basic block as the given instruction. The only exception
// is if the common subexpression is in the sole predecessor block.
const MachineBasicBlock *MBB = MI->getParent();
const MachineBasicBlock *CSMBB = CSMI->getParent();
bool CrossMBB = false;
if (CSMBB != MBB) {
if (MBB->pred_size() != 1 || *MBB->pred_begin() != CSMBB)
return false;
for (unsigned i = 0, e = PhysDefs.size(); i != e; ++i) {
if (MRI->isAllocatable(PhysDefs[i].second) ||
MRI->isReserved(PhysDefs[i].second))
// Avoid extending live range of physical registers if they are
// allocatable or reserved.
return false;
}
CrossMBB = true;
}
// I walks forward from just after CSMI; E is the target (MI); EE is the end
// of the block currently being scanned.
MachineBasicBlock::const_iterator I = CSMI; I = std::next(I);
MachineBasicBlock::const_iterator E = MI;
MachineBasicBlock::const_iterator EE = CSMBB->end();
unsigned LookAheadLeft = LookAheadLimit;
while (LookAheadLeft) {
// Skip over dbg_value's.
while (I != E && I != EE && I->isDebugInstr())
++I;
if (I == EE) {
// End of CSMI's block: continue the scan at the start of MI's block. This
// hand-off does not consume any look-ahead budget.
assert(CrossMBB && "Reaching end-of-MBB without finding MI?");
(void)CrossMBB;
CrossMBB = false;
NonLocal = true;
I = MBB->begin();
EE = MBB->end();
continue;
}
if (I == E)
return true;
for (const MachineOperand &MO : I->operands()) {
// RegMasks go on instructions like calls that clobber lots of physregs.
// Don't attempt to CSE across such an instruction.
if (MO.isRegMask())
return false;
if (!MO.isReg() || !MO.isDef())
continue;
unsigned MOReg = MO.getReg();
if (TargetRegisterInfo::isVirtualRegister(MOReg))
continue;
// A def of any referenced physical register invalidates the reuse.
if (PhysRefs.count(MOReg))
return false;
}
--LookAheadLeft;
++I;
}
// Look-ahead budget exhausted before MI was reached.
return false;
}
/// Return true if \p MI is an instruction this pass may try to eliminate as a
/// common subexpression.
bool MachineCSE::isCSECandidate(MachineInstr *MI) {
  // Pseudo / bookkeeping instructions are never candidates.
  const bool IsBookkeeping = MI->isPosition() || MI->isPHI() ||
                             MI->isImplicitDef() || MI->isKill() ||
                             MI->isInlineAsm() || MI->isDebugInstr();
  if (IsBookkeeping)
    return false;

  // Copies are ignored.
  if (MI->isCopyLike())
    return false;

  // Anything that obviously cannot be moved is out.
  const bool IsImmovable = MI->mayStore() || MI->isCall() ||
                           MI->isTerminator() || MI->mayRaiseFPException() ||
                           MI->hasUnmodeledSideEffects();
  if (IsImmovable)
    return false;

  // A load is only acceptable when the target reports it as a dereferenceable
  // invariant load, i.e. the loaded value is effectively a constant.
  // FIXME: we should be able to hoist loads with no other side effects if
  // there are no other instructions which can change memory in this loop.
  // This is a trivial form of alias analysis.
  if (MI->mayLoad() && !MI->isDereferenceableInvariantLoad(AA))
    return false;

  // Ignore stack guard loads, otherwise the register that holds CSEed value
  // may be spilled and get loaded back with corrupted data.
  return MI->getOpcode() != TargetOpcode::LOAD_STACK_GUARD;
}
/// isProfitableToCSE - Return true if it's profitable to eliminate MI with a
/// common expression that defines Reg. CSBB is basic block where CSReg is
/// defined.
bool MachineCSE::isProfitableToCSE(unsigned CSReg, unsigned Reg,
MachineBasicBlock *CSBB, MachineInstr *MI) {
// FIXME: Heuristics that works around the lack the live range splitting.
// If CSReg is used at all uses of Reg, CSE should not increase register
// pressure of CSReg.
bool MayIncreasePressure = true;
if (TargetRegisterInfo::isVirtualRegister(CSReg) &&
TargetRegisterInfo::isVirtualRegister(Reg)) {
MayIncreasePressure = false;
SmallPtrSet<MachineInstr*, 8> CSUses;
for (MachineInstr &MI : MRI->use_nodbg_instructions(CSReg)) {
CSUses.insert(&MI);
}
for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg)) {
if (!CSUses.count(&MI)) {
MayIncreasePressure = true;
break;
}
}
}
if (!MayIncreasePressure) return true;
// Heuristics #1: Don't CSE "cheap" computation if the def is not local or in
// an immediate predecessor. We don't want to increase register pressure and
// end up causing other computation to be spilled.
if (TII->isAsCheapAsAMove(*MI)) {
MachineBasicBlock *BB = MI->getParent();
if (CSBB != BB && !CSBB->isSuccessor(BB))
return false;
}
// Heuristics #2: If the expression doesn't not use a vr and the only use
// of the redundant computation are copies, do not cse.
bool HasVRegUse = false;
for (const MachineOperand &MO : MI->operands()) {
if (MO.isReg() && MO.isUse() &&
TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
HasVRegUse = true;
break;
}
}
if (!HasVRegUse) {
bool HasNonCopyUse = false;
for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg)) {
// Ignore copies.
if (!MI.isCopyLike()) {
HasNonCopyUse = true;
break;
}
}
if (!HasNonCopyUse)
return false;
}
// Heuristics #3: If the common subexpression is used by PHIs, do not reuse
// it unless the defined value is already used in the BB of the new use.
bool HasPHI = false;
for (MachineInstr &UseMI : MRI->use_nodbg_instructions(CSReg)) {
HasPHI |= UseMI.isPHI();
if (UseMI.getParent() == MI->getParent())
return true;
}
return !HasPHI;
}
/// Open a new value-numbering scope on VNT for \p MBB and remember it in
/// ScopeMap. The scope is heap-allocated here and freed by ExitScope().
void MachineCSE::EnterScope(MachineBasicBlock *MBB) {
  LLVM_DEBUG(dbgs() << "Entering: " << MBB->getName() << '\n');
  ScopeType *NewScope = new ScopeType(VNT);
  ScopeMap[MBB] = NewScope;
}
/// Close the scope that EnterScope() opened for \p MBB: delete the scope
/// object and remove its ScopeMap entry. \p MBB must have an entry.
void MachineCSE::ExitScope(MachineBasicBlock *MBB) {
  LLVM_DEBUG(dbgs() << "Exiting: " << MBB->getName() << '\n');
  auto Entry = ScopeMap.find(MBB);
  assert(Entry != ScopeMap.end());
  ScopeType *Scope = Entry->second;
  delete Scope;
  ScopeMap.erase(Entry);
}
bool MachineCSE::ProcessBlockCSE(MachineBasicBlock *MBB) {
bool Changed = false;
SmallVector<std::pair<unsigned, unsigned>, 8> CSEPairs;
SmallVector<unsigned, 2> ImplicitDefsToUpdate;
SmallVector<unsigned, 2> ImplicitDefs;
for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; ) {
MachineInstr *MI = &*I;
++I;
if (!isCSECandidate(MI))
continue;
bool FoundCSE = VNT.count(MI);
if (!FoundCSE) {
// Using trivial copy propagation to find more CSE opportunities.
if (PerformTrivialCopyPropagation(MI, MBB)) {
Changed = true;
// After coalescing MI itself may become a copy.
if (MI->isCopyLike())
continue;
// Try again to see if CSE is possible.
FoundCSE = VNT.count(MI);
}
}
// Commute commutable instructions.
bool Commuted = false;
if (!FoundCSE && MI->isCommutable()) {
if (MachineInstr *NewMI = TII->commuteInstruction(*MI)) {
Commuted = true;
FoundCSE = VNT.count(NewMI);
if (NewMI != MI) {
// New instruction. It doesn't need to be kept.
NewMI->eraseFromParent();
Changed = true;
} else if (!FoundCSE)
// MI was changed but it didn't help, commute it back!
(void)TII->commuteInstruction(*MI);
}
}
// If the instruction defines physical registers and the values *may* be
// used, then it's not safe to replace it with a common subexpression.
// It's also not safe if the instruction uses physical registers.
bool CrossMBBPhysDef = false;
SmallSet<unsigned, 8> PhysRefs;
PhysDefVector PhysDefs;
bool PhysUseDef = false;
if (FoundCSE && hasLivePhysRegDefUses(MI, MBB, PhysRefs,
PhysDefs, PhysUseDef)) {
FoundCSE = false;
// ... Unless the CS is local or is in the sole predecessor block
// and it also defines the physical register which is not clobbered
// in between and the physical register uses were not clobbered.
// This can never be the case if the instruction both uses and
// defines the same physical register, which was detected above.
if (!PhysUseDef) {
unsigned CSVN = VNT.lookup(MI);
MachineInstr *CSMI = Exps[CSVN];
if (PhysRegDefsReach(CSMI, MI, PhysRefs, PhysDefs, CrossMBBPhysDef))
FoundCSE = true;
}
}
if (!FoundCSE) {
VNT.insert(MI, CurrVN++);
Exps.push_back(MI);
continue;
}
// Found a common subexpression, eliminate it.
unsigned CSVN = VNT.lookup(MI);
MachineInstr *CSMI = Exps[CSVN];
LLVM_DEBUG(dbgs() << "Examining: " << *MI);
LLVM_DEBUG(dbgs() << "*** Found a common subexpression: " << *CSMI);
// Check if it's profitable to perform this CSE.
bool DoCSE = true;
unsigned NumDefs = MI->getNumDefs();
for (unsigned i = 0, e = MI->getNumOperands(); NumDefs && i != e; ++i) {
MachineOperand &MO = MI->getOperand(i);
if (!MO.isReg() || !MO.isDef())
continue;
unsigned OldReg = MO.getReg();
unsigned NewReg = CSMI->getOperand(i).getReg();
// Go through implicit defs of CSMI and MI, if a def is not dead at MI,
// we should make sure it is not dead at CSMI.
if (MO.isImplicit() && !MO.isDead() && CSMI->getOperand(i).isDead())
ImplicitDefsToUpdate.push_back(i);
// Keep track of implicit defs of CSMI and MI, to clear possibly
// made-redundant kill flags.
if (MO.isImplicit() && !MO.isDead() && OldReg == NewReg)
ImplicitDefs.push_back(OldReg);
if (OldReg == NewReg) {
--NumDefs;
continue;
}
assert(TargetRegisterInfo::isVirtualRegister(OldReg) &&
TargetRegisterInfo::isVirtualRegister(NewReg) &&
"Do not CSE physical register defs!");
if (!isProfitableToCSE(NewReg, OldReg, CSMI->getParent(), MI)) {
LLVM_DEBUG(dbgs() << "*** Not profitable, avoid CSE!\n");
DoCSE = false;
break;
}
// Don't perform CSE if the result of the new instruction cannot exist
// within the constraints (register class, bank, or low-level type) of
// the old instruction.
if (!MRI->constrainRegAttrs(NewReg, OldReg)) {
LLVM_DEBUG(
dbgs() << "*** Not the same register constraints, avoid CSE!\n");
DoCSE = false;
break;
}
CSEPairs.push_back(std::make_pair(OldReg, NewReg));
--NumDefs;
}
// Actually perform the elimination.
if (DoCSE) {
for (std::pair<unsigned, unsigned> &CSEPair : CSEPairs) {
unsigned OldReg = CSEPair.first;
unsigned NewReg = CSEPair.second;
// OldReg may have been unused but is used now, clear the Dead flag
MachineInstr *Def = MRI->getUniqueVRegDef(NewReg);
assert(Def != nullptr && "CSEd register has no unique definition?");
Def->clearRegisterDeads(NewReg);
// Replace with NewReg and clear kill flags which may be wrong now.
MRI->replaceRegWith(OldReg, NewReg);
MRI->clearKillFlags(NewReg);
}
// Go through implicit defs of CSMI and MI, if a def is not dead at MI,
// we should make sure it is not dead at CSMI.
for (unsigned ImplicitDefToUpdate : ImplicitDefsToUpdate)
CSMI->getOperand(ImplicitDefToUpdate).setIsDead(false);
for (auto PhysDef : PhysDefs)
if (!MI->getOperand(PhysDef.first).isDead())
CSMI->getOperand(PhysDef.first).setIsDead(false);
// Go through implicit defs of CSMI and MI, and clear the kill flags on
// their uses in all the instructions between CSMI and MI.
// We might have made some of the kill flags redundant, consider:
// subs ... implicit-def %nzcv <- CSMI
// csinc ... implicit killed %nzcv <- this kill flag isn't valid anymore
// subs ... implicit-def %nzcv <- MI, to be eliminated
// csinc ... implicit killed %nzcv
// Since we eliminated MI, and reused a register imp-def'd by CSMI
// (here %nzcv), that register, if it was killed before MI, should have
// that kill flag removed, because it's lifetime was extended.
if (CSMI->getParent() == MI->getParent()) {
for (MachineBasicBlock::iterator II = CSMI, IE = MI; II != IE; ++II)
for (auto ImplicitDef : ImplicitDefs)
if (MachineOperand *MO = II->findRegisterUseOperand(
ImplicitDef, /*isKill=*/true, TRI))
MO->setIsKill(false);
} else {
// If the instructions aren't in the same BB, bail out and clear the
// kill flag on all uses of the imp-def'd register.
for (auto ImplicitDef : ImplicitDefs)
MRI->clearKillFlags(ImplicitDef);
}
if (CrossMBBPhysDef) {
// Add physical register defs now coming in from a predecessor to MBB
// livein list.
while (!PhysDefs.empty()) {
auto LiveIn = PhysDefs.pop_back_val();
if (!MBB->isLiveIn(LiveIn.second))
MBB->addLiveIn(LiveIn.second);
}
++NumCrossBBCSEs;
}
MI->eraseFromParent();
++NumCSEs;
if (!PhysRefs.empty())
++NumPhysCSEs;
if (Commuted)
++NumCommutes;
Changed = true;
} else {
VNT.insert(MI, CurrVN++);
Exps.push_back(MI);
}
CSEPairs.clear();
ImplicitDefsToUpdate.clear();
ImplicitDefs.clear();
}
return Changed;
}
/// ExitScopeIfDone - Close the scope of the block belonging to \p Node once
/// none of its dominator-tree children remain open, then keep closing
/// ancestor scopes for as long as each ancestor's open-child count drops to
/// zero.
void MachineCSE::ExitScopeIfDone(
    MachineDomTreeNode *Node,
    DenseMap<MachineDomTreeNode *, unsigned> &OpenChildren) {
  // A node that still has pending children cannot be closed yet.
  if (OpenChildren[Node] != 0)
    return;

  ExitScope(Node->getBlock());

  // Each finished node accounts for one child of its immediate dominator.
  // Stop at the first ancestor that still has other children outstanding.
  for (MachineDomTreeNode *Ancestor = Node->getIDom(); Ancestor;
       Ancestor = Ancestor->getIDom()) {
    if (--OpenChildren[Ancestor] != 0)
      return;
    ExitScope(Ancestor->getBlock());
  }
}
// Runs CSE over the dominator subtree rooted at Node. The visiting order is
// computed first with an explicit worklist; each block is then processed
// inside its own scope, and scopes are closed as soon as all dominated blocks
// have been handled. Returns true if any block was changed.
bool MachineCSE::PerformCSE(MachineDomTreeNode *Node) {
  SmallVector<MachineDomTreeNode *, 32> Scopes;
  SmallVector<MachineDomTreeNode *, 8> WorkList;
  DenseMap<MachineDomTreeNode *, unsigned> OpenChildren;

  CurrVN = 0;

  // Depth-first walk recording the visit order and per-node child counts.
  WorkList.push_back(Node);
  while (!WorkList.empty()) {
    MachineDomTreeNode *Current = WorkList.pop_back_val();
    Scopes.push_back(Current);
    const std::vector<MachineDomTreeNode *> &Kids = Current->getChildren();
    OpenChildren[Current] = Kids.size();
    WorkList.append(Kids.begin(), Kids.end());
  }

  // Process the blocks in the recorded order.
  bool Changed = false;
  for (MachineDomTreeNode *Scope : Scopes) {
    MachineBasicBlock *MBB = Scope->getBlock();
    EnterScope(MBB);
    Changed |= ProcessBlockCSE(MBB);
    // Leaves are done immediately; this also pops any finished ancestors.
    ExitScopeIfDone(Scope, OpenChildren);
  }
  return Changed;
}
// PRE candidates are filtered more strictly than CSE candidates: besides
// isCSECandidate(), the extra conditions below are required so that PRE does
// not create instructions that the later CSE step would refuse to eliminate.
bool MachineCSE::isPRECandidate(MachineInstr *MI) {
  if (!isCSECandidate(MI))
    return false;
  if (MI->isNotDuplicable() || MI->mayLoad() || MI->isAsCheapAsAMove())
    return false;
  // Exactly one def in total, and that def must be explicit.
  if (MI->getNumDefs() != 1 || MI->getNumExplicitDefs() != 1)
    return false;

  // Every def and every register use must be a virtual register.
  for (const MachineOperand &Def : MI->defs())
    if (!TRI->isVirtualRegister(Def.getReg()))
      return false;

  for (const MachineOperand &Use : MI->uses())
    if (Use.isReg() && !TRI->isVirtualRegister(Use.getReg()))
      return false;

  return true;
}
// Scans MBB for PRE candidates. The first time a candidate is found in PREMap
// to be new, the block it lives in is recorded. When a later candidate matches
// an existing PREMap entry, the instruction may be duplicated into the nearest
// common dominator of the two blocks so that a subsequent CSE run sees a full
// redundancy. Returns true if at least one instruction was duplicated.
// NOTE(review): PREMap is keyed by MachineInstr*; matching presumably uses an
// expression-equality trait rather than pointer identity - confirm against
// the PREMap declaration.
bool MachineCSE::ProcessBlockPRE(MachineDominatorTree *DT,
MachineBasicBlock *MBB) {
bool Changed = false;
for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) {
MachineInstr *MI = &*I;
// Advance up front so that every 'continue' below moves to the next instr.
++I;
if (!isPRECandidate(MI))
continue;
// First occurrence: just remember which block it was seen in.
if (!PREMap.count(MI)) {
PREMap[MI] = MBB;
continue;
}
auto MBB1 = PREMap[MI];
assert(
!DT->properlyDominates(MBB, MBB1) &&
"MBB cannot properly dominate MBB1 while DFS through dominators tree!");
// CMBB is where the duplicated instruction would be placed.
auto CMBB = DT->findNearestCommonDominator(MBB, MBB1);
if (!CMBB->isLegalToHoistInto())
continue;
+ if (!isBeneficalToHoistInto(CMBB, MBB, MBB1))
+ continue;
+
// Two instrs are partial redundant if their basic blocks are reachable
// from one to another but one doesn't dominate another.
if (CMBB != MBB1) {
auto BB = MBB->getBasicBlock(), BB1 = MBB1->getBasicBlock();
if (BB != nullptr && BB1 != nullptr &&
(isPotentiallyReachable(BB1, BB) ||
isPotentiallyReachable(BB, BB1))) {
assert(MI->getOperand(0).isDef() &&
"First operand of instr with one explicit def must be this def");
unsigned VReg = MI->getOperand(0).getReg();
unsigned NewReg = MRI->cloneVirtualRegister(VReg);
// If CSE would not be profitable, skip; NewReg has already been created at
// this point and is simply left unused.
if (!isProfitableToCSE(NewReg, VReg, CMBB, MI))
continue;
// Duplicate MI before CMBB's first terminator and give the copy its own def.
MachineInstr &NewMI =
TII->duplicate(*CMBB, CMBB->getFirstTerminator(), *MI);
NewMI.getOperand(0).setReg(NewReg);
// From now on the expression is considered to live in CMBB.
PREMap[MI] = CMBB;
++NumPREs;
Changed = true;
}
}
}
return Changed;
}
// This simple PRE (partial redundancy elimination) step does not remove any
// partial redundancy by itself. Instead it turns partial redundancy into full
// redundancy, anticipating that the next CSE step will eliminate it. If CSE
// does not eliminate it, then the created instruction will remain dead and be
// removed later by the Remove Dead Machine Instructions pass.
bool MachineCSE::PerformSimplePRE(MachineDominatorTree *DT) {
  SmallVector<MachineDomTreeNode *, 32> Pending;

  PREMap.clear();
  bool Changed = false;

  // Walk the dominator tree with an explicit stack, starting at the root.
  Pending.push_back(DT->getRootNode());
  while (!Pending.empty()) {
    MachineDomTreeNode *Current = Pending.pop_back_val();
    const std::vector<MachineDomTreeNode *> &Kids = Current->getChildren();
    Pending.append(Kids.begin(), Kids.end());

    Changed |= ProcessBlockPRE(DT, Current->getBlock());
  }
  return Changed;
}
// Decides whether placing an instruction in CandidateBB is worthwhile given
// that it currently occurs in MBB and MBB1. Always returns true for functions
// marked minsize. Otherwise it returns true only when the block frequency of
// CandidateBB does not exceed the sum of the frequencies of MBB and MBB1.
// CandidateBB is asserted to dominate both blocks.
+bool MachineCSE::isBeneficalToHoistInto(MachineBasicBlock *CandidateBB,
+ MachineBasicBlock *MBB,
+ MachineBasicBlock *MBB1) {
+ if (CandidateBB->getParent()->getFunction().hasMinSize())
+ return true;
+ assert(DT->dominates(CandidateBB, MBB) && "CandidateBB should dominate MBB");
+ assert(DT->dominates(CandidateBB, MBB1) &&
+ "CandidateBB should dominate MBB1");
+ return MBFI->getBlockFreq(CandidateBB) <=
+ MBFI->getBlockFreq(MBB) + MBFI->getBlockFreq(MBB1);
+}
+
// Pass entry point. Returns false without doing anything when skipFunction()
// says so. Otherwise it caches the target hooks and analyses used by the
// pass, runs the simple PRE step followed by CSE over the whole dominator
// tree, and returns true if either step changed the function.
bool MachineCSE::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
TII = MF.getSubtarget().getInstrInfo();
TRI = MF.getSubtarget().getRegisterInfo();
MRI = &MF.getRegInfo();
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
DT = &getAnalysis<MachineDominatorTree>();
+ MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
LookAheadLimit = TII->getMachineCSELookAheadLimit();
bool ChangedPRE, ChangedCSE;
// PRE runs first so that the redundancy it creates is visible to CSE.
ChangedPRE = PerformSimplePRE(DT);
ChangedCSE = PerformCSE(DT->getRootNode());
return ChangedPRE || ChangedCSE;
}
Index: vendor/llvm/dist-release_90/lib/CodeGen/MachineModuleInfo.cpp
===================================================================
--- vendor/llvm/dist-release_90/lib/CodeGen/MachineModuleInfo.cpp (revision 351302)
+++ vendor/llvm/dist-release_90/lib/CodeGen/MachineModuleInfo.cpp (revision 351303)
@@ -1,329 +1,329 @@
//===-- llvm/CodeGen/MachineModuleInfo.cpp ----------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/TinyPtrVector.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
#include <memory>
#include <utility>
#include <vector>
using namespace llvm;
using namespace llvm::dwarf;
// Handle the Pass registration stuff necessary to use DataLayout's.
INITIALIZE_PASS(MachineModuleInfo, "machinemoduleinfo",
"Machine Module Information", false, false)
char MachineModuleInfo::ID = 0;
// Out of line virtual method.
MachineModuleInfoImpl::~MachineModuleInfoImpl() = default;
namespace llvm {
/// Value handle that forwards deletion and replace-all-uses notifications for
/// a tracked value to the MMIAddrLabelMap registered through setMap().
class MMIAddrLabelMapCallbackPtr final : CallbackVH {
// Map that receives the notifications; null until setMap() is called.
MMIAddrLabelMap *Map = nullptr;
public:
MMIAddrLabelMapCallbackPtr() = default;
MMIAddrLabelMapCallbackPtr(Value *V) : CallbackVH(V) {}
/// Retarget this handle at \p BB.
void setPtr(BasicBlock *BB) {
ValueHandleBase::operator=(BB);
}
/// Set the map to notify from deleted() / allUsesReplacedWith().
void setMap(MMIAddrLabelMap *map) { Map = map; }
void deleted() override;
void allUsesReplacedWith(Value *V2) override;
};
/// Maps address-taken BasicBlocks to the MCSymbols used to label them, and
/// keeps those symbols associated with the right function when a block is
/// deleted or replaced.
class MMIAddrLabelMap {
// Context used to create the temporary label symbols.
MCContext &Context;
struct AddrLabelSymEntry {
/// The symbols for the label.
TinyPtrVector<MCSymbol *> Symbols;
Function *Fn; // The containing function of the BasicBlock.
unsigned Index; // The index in BBCallbacks for the BasicBlock.
};
// Per-block symbol entries.
DenseMap<AssertingVH<BasicBlock>, AddrLabelSymEntry> AddrLabelSymbols;
/// Callbacks for the BasicBlock's that we have entries for. We use this so
/// we get notified if a block is deleted or RAUWd.
std::vector<MMIAddrLabelMapCallbackPtr> BBCallbacks;
/// This is a per-function list of symbols whose corresponding BasicBlock got
/// deleted. These symbols need to be emitted at some point in the file, so
/// AsmPrinter emits them after the function body.
DenseMap<AssertingVH<Function>, std::vector<MCSymbol*>>
DeletedAddrLabelsNeedingEmission;
public:
MMIAddrLabelMap(MCContext &context) : Context(context) {}
// Destruction asserts that every queued symbol of a deleted block was taken
// via takeDeletedSymbolsForFunction().
~MMIAddrLabelMap() {
assert(DeletedAddrLabelsNeedingEmission.empty() &&
"Some labels for deleted blocks never got emitted");
}
/// Return the symbols labelling \p BB, creating one on first request.
ArrayRef<MCSymbol *> getAddrLabelSymbolToEmit(BasicBlock *BB);
/// Move the queued deleted-block symbols of \p F (if any) into \p Result.
void takeDeletedSymbolsForFunction(Function *F,
std::vector<MCSymbol*> &Result);
/// Notification hooks invoked by MMIAddrLabelMapCallbackPtr.
void UpdateForDeletedBlock(BasicBlock *BB);
void UpdateForRAUWBlock(BasicBlock *Old, BasicBlock *New);
};
} // end namespace llvm
// Returns the label symbols for BB. BB is asserted to have its address taken.
// An existing non-empty entry is returned as is. Otherwise a callback handle
// is registered for BB, a fresh temporary symbol is created and recorded, and
// the new one-element list is returned.
ArrayRef<MCSymbol *> MMIAddrLabelMap::getAddrLabelSymbolToEmit(BasicBlock *BB) {
assert(BB->hasAddressTaken() &&
"Shouldn't get label for block without address taken");
// operator[] default-constructs the entry when BB has not been seen yet.
AddrLabelSymEntry &Entry = AddrLabelSymbols[BB];
// If we already had an entry for this block, just return it.
if (!Entry.Symbols.empty()) {
assert(BB->getParent() == Entry.Fn && "Parent changed");
return Entry.Symbols;
}
// Otherwise, this is a new entry, create a new symbol for it and add an
// entry to BBCallbacks so we can be notified if the BB is deleted or RAUWd.
BBCallbacks.emplace_back(BB);
BBCallbacks.back().setMap(this);
Entry.Index = BBCallbacks.size() - 1;
Entry.Fn = BB->getParent();
- Entry.Symbols.push_back(Context.createTempSymbol());
+ Entry.Symbols.push_back(Context.createTempSymbol(!BB->hasAddressTaken()));
return Entry.Symbols;
}
/// Hand over the symbols queued for deleted blocks of \p F, if there are any.
/// On a hit the queued list is swapped into \p Result and the map entry for
/// \p F is removed; on a miss \p Result is left untouched.
void MMIAddrLabelMap::takeDeletedSymbolsForFunction(
    Function *F, std::vector<MCSymbol *> &Result) {
  auto Found = DeletedAddrLabelsNeedingEmission.find(F);

  // Nothing queued for this function.
  if (Found == DeletedAddrLabelsNeedingEmission.end())
    return;

  // Take ownership of the list and drop the now-stale entry.
  std::swap(Result, Found->second);
  DeletedAddrLabelsNeedingEmission.erase(Found);
}
// Called when a tracked block is deleted. Removes the block's entry and its
// callback, then queues each not-yet-defined symbol on the owning function's
// DeletedAddrLabelsNeedingEmission list.
void MMIAddrLabelMap::UpdateForDeletedBlock(BasicBlock *BB) {
// If the block got deleted, there is no need for the symbol. If the symbol
// was already emitted, we can just forget about it, otherwise we need to
// queue it up for later emission when the function is output.
// Move the entry out before erasing it from the map.
AddrLabelSymEntry Entry = std::move(AddrLabelSymbols[BB]);
AddrLabelSymbols.erase(BB);
assert(!Entry.Symbols.empty() && "Didn't have a symbol, why a callback?");
BBCallbacks[Entry.Index] = nullptr; // Clear the callback.
assert((BB->getParent() == nullptr || BB->getParent() == Entry.Fn) &&
"Block/parent mismatch");
for (MCSymbol *Sym : Entry.Symbols) {
// NOTE(review): the first defined symbol ends the whole function, so any
// later symbols in the list are not queued - presumably all symbols of one
// block are defined together; confirm.
if (Sym->isDefined())
return;
// If the block is not yet defined, we need to emit it at the end of the
// function. Add the symbol to the DeletedAddrLabelsNeedingEmission list
// for the containing Function. Since the block is being deleted, its
// parent may already be removed, we have to get the function from 'Entry'.
DeletedAddrLabelsNeedingEmission[Entry.Fn].push_back(Sym);
}
}
// Called when all uses of block Old are replaced with block New. Old's symbols
// are transferred to New: if New has no symbols yet it takes over Old's entry
// and callback, otherwise Old's symbols are appended to New's list and Old's
// callback is cleared.
void MMIAddrLabelMap::UpdateForRAUWBlock(BasicBlock *Old, BasicBlock *New) {
// Get the entry for the RAUW'd block and remove it from our map.
AddrLabelSymEntry OldEntry = std::move(AddrLabelSymbols[Old]);
AddrLabelSymbols.erase(Old);
assert(!OldEntry.Symbols.empty() && "Didn't have a symbol, why a callback?");
// Looked up only after Old has been erased; creates New's entry if missing.
AddrLabelSymEntry &NewEntry = AddrLabelSymbols[New];
// If New is not address taken, just move our symbol over to it.
if (NewEntry.Symbols.empty()) {
BBCallbacks[OldEntry.Index].setPtr(New); // Update the callback.
NewEntry = std::move(OldEntry); // Set New's entry.
return;
}
BBCallbacks[OldEntry.Index] = nullptr; // Clear Old's callback.
// Otherwise, we need to add the old symbols to the new block's set.
NewEntry.Symbols.insert(NewEntry.Symbols.end(), OldEntry.Symbols.begin(),
OldEntry.Symbols.end());
}
void MMIAddrLabelMapCallbackPtr::deleted() {
Map->UpdateForDeletedBlock(cast<BasicBlock>(getValPtr()));
}
// All uses of the tracked value were replaced by V2: forward both blocks to
// the owning map.
void MMIAddrLabelMapCallbackPtr::allUsesReplacedWith(Value *V2) {
  BasicBlock *OldBlock = cast<BasicBlock>(getValPtr());
  BasicBlock *NewBlock = cast<BasicBlock>(V2);
  Map->UpdateForRAUWBlock(OldBlock, NewBlock);
}
// Builds the pass around target machine TM. The MCContext member is created
// from TM's asm info, register info and object-file lowering, and the pass is
// registered with the global PassRegistry.
MachineModuleInfo::MachineModuleInfo(const LLVMTargetMachine *TM)
: ImmutablePass(ID), TM(*TM),
Context(TM->getMCAsmInfo(), TM->getMCRegisterInfo(),
TM->getObjFileLowering(), nullptr, false) {
initializeMachineModuleInfoPass(*PassRegistry::getPassRegistry());
}
MachineModuleInfo::~MachineModuleInfo() = default;
// Resets the per-module state for M and records whether M has any debug
// compile units. Always returns false.
bool MachineModuleInfo::doInitialization(Module &M) {
  ObjFileMMI = nullptr;
  CurCallSite = 0;

  UsesMSVCFloatingPoint = false;
  UsesMorestackAddr = false;
  HasSplitStack = false;
  HasNosplitStack = false;

  AddrLabelSymbols = nullptr;
  TheModule = &M;

  const bool NoDebugCUs = llvm::empty(M.debug_compile_units());
  DbgInfoAvailable = !NoDebugCUs;
  return false;
}
// Releases the per-module state: clears the personality list, destroys the
// address-label map, resets the MCContext and destroys the object-file
// specific info. Always returns false.
bool MachineModuleInfo::doFinalization(Module &M) {
Personalities.clear();
// The label map is destroyed before the context it refers to is reset.
delete AddrLabelSymbols;
AddrLabelSymbols = nullptr;
Context.reset();
delete ObjFileMMI;
ObjFileMMI = nullptr;
return false;
}
//===- Address of Block Management ----------------------------------------===//
// Returns the label symbols for BB, creating the address-label map on first
// use.
ArrayRef<MCSymbol *>
MachineModuleInfo::getAddrLabelSymbolToEmit(const BasicBlock *BB) {
  if (AddrLabelSymbols == nullptr)
    AddrLabelSymbols = new MMIAddrLabelMap(Context);

  // The map's interface takes a non-const block pointer.
  BasicBlock *MutableBB = const_cast<BasicBlock *>(BB);
  return AddrLabelSymbols->getAddrLabelSymbolToEmit(MutableBB);
}
void MachineModuleInfo::
takeDeletedSymbolsForFunction(const Function *F,
std::vector<MCSymbol*> &Result) {
// If no blocks have had their addresses taken, we're done.
if (!AddrLabelSymbols) return;
return AddrLabelSymbols->
takeDeletedSymbolsForFunction(const_cast<Function*>(F), Result);
}
/// \name Exception Handling
/// \{
void MachineModuleInfo::addPersonality(const Function *Personality) {
for (unsigned i = 0; i < Personalities.size(); ++i)
if (Personalities[i] == Personality)
return;
Personalities.push_back(Personality);
}
/// \}
// Returns the MachineFunction registered for F, or nullptr if there is none.
MachineFunction *
MachineModuleInfo::getMachineFunction(const Function &F) const {
  auto Found = MachineFunctions.find(&F);
  if (Found == MachineFunctions.end())
    return nullptr;
  return Found->second.get();
}
// Returns the MachineFunction for F, creating and registering it on first
// request. The most recent request is cached to serve repeated queries for the
// same function.
MachineFunction &
MachineModuleInfo::getOrCreateMachineFunction(const Function &F) {
  // Shortcut for the common case where a sequence of MachineFunctionPasses
  // all query for the same Function.
  if (LastRequest == &F)
    return *LastResult;

  auto Inserted = MachineFunctions.insert(
      std::make_pair(&F, std::unique_ptr<MachineFunction>()));

  MachineFunction *MF = nullptr;
  if (!Inserted.second) {
    // A machine function already exists for F.
    MF = Inserted.first->second.get();
  } else {
    // No pre-existing machine function, create a new one and store it in the
    // freshly inserted slot.
    const TargetSubtargetInfo &STI = *TM.getSubtargetImpl(F);
    MF = new MachineFunction(F, TM, STI, NextFnNum++, *this);
    Inserted.first->second.reset(MF);
  }

  LastRequest = &F;
  LastResult = MF;
  return *MF;
}
// Removes the MachineFunction registered for F and invalidates the
// last-request cache used by getOrCreateMachineFunction().
void MachineModuleInfo::deleteMachineFunctionFor(Function &F) {
MachineFunctions.erase(&F);
// The cache is cleared unconditionally, whichever function it referred to.
LastRequest = nullptr;
LastResult = nullptr;
}
namespace {
/// This pass frees the MachineFunction object associated with a Function.
class FreeMachineFunction : public FunctionPass {
public:
static char ID;
FreeMachineFunction() : FunctionPass(ID) {}
// Requires MachineModuleInfo and declares it preserved.
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineModuleInfo>();
AU.addPreserved<MachineModuleInfo>();
}
// Drops F's MachineFunction from MachineModuleInfo; always returns true.
bool runOnFunction(Function &F) override {
MachineModuleInfo &MMI = getAnalysis<MachineModuleInfo>();
MMI.deleteMachineFunctionFor(F);
return true;
}
StringRef getPassName() const override {
return "Free MachineFunction";
}
};
} // end anonymous namespace
char FreeMachineFunction::ID;
// Factory for the FreeMachineFunction pass. Returns a newly allocated pass
// object.
FunctionPass *llvm::createFreeMachineFunctionPass() {
return new FreeMachineFunction();
}
Index: vendor/llvm/dist-release_90/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
===================================================================
--- vendor/llvm/dist-release_90/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp (revision 351302)
+++ vendor/llvm/dist-release_90/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp (revision 351303)
@@ -1,1023 +1,1033 @@
//===--- ScheduleDAGSDNodes.cpp - Implement the ScheduleDAGSDNodes class --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This implements the ScheduleDAG class, which is a base class used by
// scheduling implementation classes.
//
//===----------------------------------------------------------------------===//
#include "ScheduleDAGSDNodes.h"
#include "InstrEmitter.h"
#include "SDNodeDbgValue.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
#define DEBUG_TYPE "pre-RA-sched"
STATISTIC(LoadsClustered, "Number of loads clustered together");
// This allows the latency-based scheduler to notice high latency instructions
// without a target itinerary. The choice of number here has more to do with
// balancing scheduler heuristics than with the actual machine latency.
// Note: the description is built from adjacent string literals, so the first
// one must end with a space to keep the words apart in the help text.
static cl::opt<int> HighLatencyCycles(
    "sched-high-latency-cycles", cl::Hidden, cl::init(10),
    cl::desc("Roughly estimate the number of cycles that 'long latency' "
             "instructions take for targets with no itinerary"));
// Constructs the scheduler for machine function mf. BB and DAG start out null
// and are set by Run(); the instruction itinerary data is taken from mf's
// subtarget.
ScheduleDAGSDNodes::ScheduleDAGSDNodes(MachineFunction &mf)
: ScheduleDAG(mf), BB(nullptr), DAG(nullptr),
InstrItins(mf.getSubtarget().getInstrItineraryData()) {}
/// Run - perform scheduling.
/// Records \p dag and \p bb as the current DAG and block, discards any state
/// left from a previous run, and then calls Schedule().
void ScheduleDAGSDNodes::Run(SelectionDAG *dag, MachineBasicBlock *bb) {
BB = bb;
DAG = dag;
// Clear the scheduler's SUnit DAG.
ScheduleDAG::clearDAG();
Sequence.clear();
// Invoke the target's selection of scheduler.
Schedule();
}
/// newSUnit - Append a new SUnit for \p N to SUnits and return a pointer to
/// it. The unit's scheduling preference is Sched::None when \p N is null or an
/// IMPLICIT_DEF machine node, and is otherwise taken from the target lowering.
SUnit *ScheduleDAGSDNodes::newSUnit(SDNode *N) {
#ifndef NDEBUG
  // Remember where the storage lives so reallocation can be detected below.
  const SUnit *Addr = SUnits.empty() ? nullptr : &SUnits[0];
#endif
  SUnits.emplace_back(N, (unsigned)SUnits.size());
  assert((Addr == nullptr || Addr == &SUnits[0]) &&
         "SUnits std::vector reallocated on the fly!");

  SUnit *SU = &SUnits.back();
  SU->OrigNode = SU;

  const TargetLowering &TLI = DAG->getTargetLoweringInfo();
  const bool NoPreference =
      !N || (N->isMachineOpcode() &&
             N->getMachineOpcode() == TargetOpcode::IMPLICIT_DEF);
  if (NoPreference)
    SU->SchedulingPref = Sched::None;
  else
    SU->SchedulingPref = TLI.getSchedulingPreference(N);
  return SU;
}
// Creates a new SUnit for the same node as Old, copies the scheduling-relevant
// fields listed below from Old, and marks Old as cloned. Returns the new unit.
SUnit *ScheduleDAGSDNodes::Clone(SUnit *Old) {
SUnit *SU = newSUnit(Old->getNode());
// The clone shares Old's OrigNode rather than pointing at itself.
SU->OrigNode = Old->OrigNode;
SU->Latency = Old->Latency;
SU->isVRegCycle = Old->isVRegCycle;
SU->isCall = Old->isCall;
SU->isCallOp = Old->isCallOp;
SU->isTwoAddress = Old->isTwoAddress;
SU->isCommutable = Old->isCommutable;
SU->hasPhysRegDefs = Old->hasPhysRegDefs;
SU->hasPhysRegClobbers = Old->hasPhysRegClobbers;
SU->isScheduleHigh = Old->isScheduleHigh;
SU->isScheduleLow = Old->isScheduleLow;
SU->SchedulingPref = Old->SchedulingPref;
Old->isCloned = true;
return SU;
}
/// CheckForPhysRegDependency - Check if the dependency between def and use of
/// a specified operand is a physical register dependency. If so, the register
/// is stored in \p PhysReg and the cost of copying it in \p Cost; otherwise
/// both out-parameters are left unmodified.
/// Only operand 2 of a CopyToReg user whose destination is not a virtual
/// register is considered.
static void CheckForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op,
const TargetRegisterInfo *TRI,
const TargetInstrInfo *TII,
unsigned &PhysReg, int &Cost) {
if (Op != 2 || User->getOpcode() != ISD::CopyToReg)
return;
unsigned Reg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
if (TargetRegisterInfo::isVirtualRegister(Reg))
return;
// Which result of Def feeds the copy.
unsigned ResNo = User->getOperand(2).getResNo();
if (Def->getOpcode() == ISD::CopyFromReg &&
cast<RegisterSDNode>(Def->getOperand(1))->getReg() == Reg) {
PhysReg = Reg;
} else if (Def->isMachineOpcode()) {
// Results numbered past the explicit defs are looked up in the implicit-def
// list of the instruction description.
const MCInstrDesc &II = TII->get(Def->getMachineOpcode());
if (ResNo >= II.getNumDefs() &&
II.ImplicitDefs[ResNo - II.getNumDefs()] == Reg)
PhysReg = Reg;
}
// NOTE(review): this tests the out-parameter, so a non-zero value passed in
// by the caller also triggers the cost computation - callers presumably
// initialize PhysReg to 0; confirm.
if (PhysReg != 0) {
const TargetRegisterClass *RC =
TRI->getMinimalPhysRegClass(Reg, Def->getSimpleValueType(ResNo));
Cost = RC->getCopyCost();
}
}
// Helper for AddGlue to clone node operands.
// Morphs N in place into a node with the same opcode and operands (plus
// ExtraOper, when it is non-null) but with the result types given in VTs.
// Memory references of machine nodes are carried over across the morph.
static void CloneNodeWithValues(SDNode *N, SelectionDAG *DAG, ArrayRef<EVT> VTs,
SDValue ExtraOper = SDValue()) {
SmallVector<SDValue, 8> Ops(N->op_begin(), N->op_end());
if (ExtraOper.getNode())
Ops.push_back(ExtraOper);
SDVTList VTList = DAG->getVTList(VTs);
MachineSDNode *MN = dyn_cast<MachineSDNode>(N);
// Store memory references.
// They are copied out before MorphNodeTo and re-applied afterwards.
SmallVector<MachineMemOperand *, 2> MMOs;
if (MN)
MMOs.assign(MN->memoperands_begin(), MN->memoperands_end());
DAG->MorphNodeTo(N, N->getOpcode(), VTList, Ops);
// Reset the memory references
if (MN)
DAG->setNodeMemRefs(MN, MMOs);
}
// Tries to attach glue to N: Glue (when non-null) becomes an extra operand,
// and when the AddGlue flag is set an MVT::Glue result is appended as well.
// Returns false and leaves N untouched if N is the glue's own producer, if N
// already has a glue operand while Glue is non-null, or if N already produces
// a glue value. Returns true after N has been morphed.
static bool AddGlue(SDNode *N, SDValue Glue, bool AddGlue, SelectionDAG *DAG) {
  SDNode *GlueDestNode = Glue.getNode();

  // Don't add glue from a node to itself.
  if (GlueDestNode == N)
    return false;

  // Don't add a glue operand to something that already uses glue.
  if (GlueDestNode) {
    const SDValue &LastOperand = N->getOperand(N->getNumOperands() - 1);
    if (LastOperand.getValueType() == MVT::Glue)
      return false;
  }

  // Don't add glue to something that already has a glue value.
  const unsigned LastResult = N->getNumValues() - 1;
  if (N->getValueType(LastResult) == MVT::Glue)
    return false;

  SmallVector<EVT, 4> ResultTypes(N->value_begin(), N->value_end());
  if (AddGlue)
    ResultTypes.push_back(MVT::Glue);

  CloneNodeWithValues(N, DAG, ResultTypes, Glue);
  return true;
}
// Undo a glue result that ended up without a consumer after an unsuccessful
// AddGlue. N must end in an MVT::Glue value that has no uses. The node is
// re-morphed with that last value dropped; shrinking the value list would be
// enough, but the standard morphing path is used.
static void RemoveUnusedGlue(SDNode *N, SelectionDAG *DAG) {
  const unsigned GlueIdx = N->getNumValues() - 1;
  assert((N->getValueType(GlueIdx) == MVT::Glue &&
          !N->hasAnyUseOfValue(GlueIdx)) &&
         "expected an unused glue value");
  CloneNodeWithValues(N, DAG, makeArrayRef(N->value_begin(), GlueIdx));
}
/// ClusterNeighboringLoads - Force nearby loads together by "gluing" them.
/// This function finds loads of the same base and different offsets. If the
/// offsets are not far apart (target specific), it adds MVT::Glue inputs and
/// outputs to ensure they are scheduled together and in order. This
/// optimization may benefit some targets by improving cache locality.
/// Does nothing when \p Node's last operand is not a chain, when \p Node has a
/// tied input, or when no suitable neighboring load is found.
void ScheduleDAGSDNodes::ClusterNeighboringLoads(SDNode *Node) {
// The chain, if any, is expected as the last operand.
SDNode *Chain = nullptr;
unsigned NumOps = Node->getNumOperands();
if (Node->getOperand(NumOps-1).getValueType() == MVT::Other)
Chain = Node->getOperand(NumOps-1).getNode();
if (!Chain)
return;
// Skip any load instruction that has a tied input. There may be an additional
// dependency requiring a different order than by increasing offsets, and the
// added glue may introduce a cycle.
auto hasTiedInput = [this](const SDNode *N) {
const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
for (unsigned I = 0; I != MCID.getNumOperands(); ++I) {
if (MCID.getOperandConstraint(I, MCOI::TIED_TO) != -1)
return true;
}
return false;
};
// Look for other loads of the same chain. Find loads that are loading from
// the same base pointer and different offsets.
SmallPtrSet<SDNode*, 16> Visited;
SmallVector<int64_t, 4> Offsets;
DenseMap<long long, SDNode*> O2SMap; // Map from offset to SDNode.
bool Cluster = false;
SDNode *Base = Node;
if (hasTiedInput(Base))
return;
// This algorithm requires a reasonably low use count before finding a match
// to avoid uselessly blowing up compile time in large blocks.
unsigned UseCount = 0;
for (SDNode::use_iterator I = Chain->use_begin(), E = Chain->use_end();
I != E && UseCount < 100; ++I, ++UseCount) {
SDNode *User = *I;
if (User == Node || !Visited.insert(User).second)
continue;
int64_t Offset1, Offset2;
if (!TII->areLoadsFromSameBasePtr(Base, User, Offset1, Offset2) ||
Offset1 == Offset2 ||
hasTiedInput(User)) {
// FIXME: Should be ok if the addresses are identical. But earlier
// optimizations really should have eliminated one of the loads.
continue;
}
// Record Base's offset only once; User's offset is always appended.
if (O2SMap.insert(std::make_pair(Offset1, Base)).second)
Offsets.push_back(Offset1);
O2SMap.insert(std::make_pair(Offset2, User));
Offsets.push_back(Offset2);
// Keep Base pointing at the load with the lower offset of the pair.
if (Offset2 < Offset1)
Base = User;
Cluster = true;
// Reset UseCount to allow more matches.
UseCount = 0;
}
if (!Cluster)
return;
// Sort them in increasing order.
llvm::sort(Offsets);
// Check if the loads are close enough.
SmallVector<SDNode*, 4> Loads;
// Number of loads accepted in addition to the first one.
unsigned NumLoads = 0;
int64_t BaseOff = Offsets[0];
SDNode *BaseLoad = O2SMap[BaseOff];
Loads.push_back(BaseLoad);
for (unsigned i = 1, e = Offsets.size(); i != e; ++i) {
int64_t Offset = Offsets[i];
SDNode *Load = O2SMap[Offset];
if (!TII->shouldScheduleLoadsNear(BaseLoad, Load, BaseOff, Offset,NumLoads))
break; // Stop right here. Ignore loads that are further away.
Loads.push_back(Load);
++NumLoads;
}
if (NumLoads == 0)
return;
// Cluster loads by adding MVT::Glue outputs and inputs. This also
// ensure they are scheduled in order of increasing addresses.
SDNode *Lead = Loads[0];
SDValue InGlue = SDValue(nullptr, 0);
if (AddGlue(Lead, InGlue, true, DAG))
InGlue = SDValue(Lead, Lead->getNumValues() - 1);
for (unsigned I = 1, E = Loads.size(); I != E; ++I) {
// Every load but the last one also gets a glue output.
bool OutGlue = I < E - 1;
SDNode *Load = Loads[I];
// If AddGlue fails, we could leave an unused glue value behind. When the
// failing load is the last one, the pending glue output of the previous
// node is removed again below.
if (AddGlue(Load, InGlue, OutGlue, DAG)) {
if (OutGlue)
InGlue = SDValue(Load, Load->getNumValues() - 1);
++LoadsClustered;
}
else if (!OutGlue && InGlue.getNode())
RemoveUnusedGlue(InGlue.getNode(), DAG);
}
}
/// ClusterNodes - Cluster certain nodes which should be scheduled together.
///
void ScheduleDAGSDNodes::ClusterNodes() {
for (SDNode &NI : DAG->allnodes()) {
SDNode *Node = &NI;
if (!Node || !Node->isMachineOpcode())
continue;
unsigned Opc = Node->getMachineOpcode();
const MCInstrDesc &MCID = TII->get(Opc);
if (MCID.mayLoad())
// Cluster loads from "near" addresses into combined SUnits.
ClusterNeighboringLoads(Node);
}
}
// Creates one SUnit for every group of glued nodes reachable from the DAG
// root (passive nodes excepted), initializes its register-def count and
// latency, and finally marks the units that feed call operands.
void ScheduleDAGSDNodes::BuildSchedUnits() {
// During scheduling, the NodeId field of SDNode is used to map SDNodes
// to their associated SUnits by holding SUnits table indices. A value
// of -1 means the SDNode does not yet have an associated SUnit.
unsigned NumNodes = 0;
for (SDNode &NI : DAG->allnodes()) {
NI.setNodeId(-1);
++NumNodes;
}
// Reserve entries in the vector for each of the SUnits we are creating. This
// ensure that reallocation of the vector won't happen, so SUnit*'s won't get
// invalidated.
// FIXME: Multiply by 2 because we may clone nodes during scheduling.
// This is a temporary workaround.
SUnits.reserve(NumNodes * 2);
// Add all nodes in depth first order.
SmallVector<SDNode*, 64> Worklist;
SmallPtrSet<SDNode*, 32> Visited;
Worklist.push_back(DAG->getRoot().getNode());
Visited.insert(DAG->getRoot().getNode());
// Units found to contain a call; their operands are marked after the walk.
SmallVector<SUnit*, 8> CallSUnits;
while (!Worklist.empty()) {
SDNode *NI = Worklist.pop_back_val();
// Add all operands to the worklist unless they've already been added.
for (const SDValue &Op : NI->op_values())
if (Visited.insert(Op.getNode()).second)
Worklist.push_back(Op.getNode());
if (isPassiveNode(NI)) // Leaf node, e.g. a TargetImmediate.
continue;
// If this node has already been processed, stop now.
if (NI->getNodeId() != -1) continue;
SUnit *NodeSUnit = newSUnit(NI);
// See if anything is glued to this node, if so, add them to glued
// nodes. Nodes can have at most one glue input and one glue output. Glue
// is required to be the last operand and result of a node.
// Scan up to find glued preds.
SDNode *N = NI;
while (N->getNumOperands() &&
N->getOperand(N->getNumOperands()-1).getValueType() == MVT::Glue) {
N = N->getOperand(N->getNumOperands()-1).getNode();
assert(N->getNodeId() == -1 && "Node already inserted!");
N->setNodeId(NodeSUnit->NodeNum);
if (N->isMachineOpcode() && TII->get(N->getMachineOpcode()).isCall())
NodeSUnit->isCall = true;
}
// Scan down to find any glued succs.
N = NI;
while (N->getValueType(N->getNumValues()-1) == MVT::Glue) {
SDValue GlueVal(N, N->getNumValues()-1);
// There are either zero or one users of the Glue result.
bool HasGlueUse = false;
for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end();
UI != E; ++UI)
if (GlueVal.isOperandOf(*UI)) {
HasGlueUse = true;
assert(N->getNodeId() == -1 && "Node already inserted!");
N->setNodeId(NodeSUnit->NodeNum);
// Continue the downward scan from the glue user.
N = *UI;
if (N->isMachineOpcode() && TII->get(N->getMachineOpcode()).isCall())
NodeSUnit->isCall = true;
break;
}
if (!HasGlueUse) break;
}
if (NodeSUnit->isCall)
CallSUnits.push_back(NodeSUnit);
// Schedule zero-latency TokenFactor below any nodes that may increase the
// schedule height. Otherwise, ancestors of the TokenFactor may appear to
// have false stalls.
if (NI->getOpcode() == ISD::TokenFactor)
NodeSUnit->isScheduleLow = true;
// If there are glue operands involved, N is now the bottom-most node
// of the sequence of nodes that are glued together.
// Update the SUnit.
NodeSUnit->setNode(N);
assert(N->getNodeId() == -1 && "Node already inserted!");
N->setNodeId(NodeSUnit->NodeNum);
// Compute NumRegDefsLeft. This must be done before AddSchedEdges.
InitNumRegDefsLeft(NodeSUnit);
// Assign the Latency field of NodeSUnit using target-provided information.
computeLatency(NodeSUnit);
}
// Find all call operands.
while (!CallSUnits.empty()) {
SUnit *SU = CallSUnits.pop_back_val();
// Walk the glued group of the call looking for CopyToReg nodes.
for (const SDNode *SUNode = SU->getNode(); SUNode;
SUNode = SUNode->getGluedNode()) {
if (SUNode->getOpcode() != ISD::CopyToReg)
continue;
// Operand 2 of the CopyToReg is the value being copied.
SDNode *SrcN = SUNode->getOperand(2).getNode();
if (isPassiveNode(SrcN)) continue; // Not scheduled.
SUnit *SrcSU = &SUnits[SrcN->getNodeId()];
SrcSU->isCallOp = true;
}
}
}
// Second pass of scheduling-graph construction. For every SUnit this records
// the isTwoAddress / isCommutable / hasPhysRegClobbers / hasPhysRegDefs flags
// from the target instruction descriptions, then adds one predecessor edge for
// each scheduled operand of every node glued into the unit.
void ScheduleDAGSDNodes::AddSchedEdges() {
const TargetSubtargetInfo &ST = MF.getSubtarget();
// Check to see if the scheduler cares about latencies.
bool UnitLatencies = forceUnitLatencies();
// Pass 2: add the preds, succs, etc.
for (unsigned su = 0, e = SUnits.size(); su != e; ++su) {
SUnit *SU = &SUnits[su];
SDNode *MainNode = SU->getNode();
if (MainNode->isMachineOpcode()) {
unsigned Opc = MainNode->getMachineOpcode();
const MCInstrDesc &MCID = TII->get(Opc);
// A single TIED_TO operand constraint is enough to mark the unit as
// two-address, so stop at the first one found.
for (unsigned i = 0; i != MCID.getNumOperands(); ++i) {
if (MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1) {
SU->isTwoAddress = true;
break;
}
}
if (MCID.isCommutable())
SU->isCommutable = true;
}
// Find all predecessors and successors of the group.
for (SDNode *N = SU->getNode(); N; N = N->getGluedNode()) {
if (N->isMachineOpcode() &&
TII->get(N->getMachineOpcode()).getImplicitDefs()) {
SU->hasPhysRegClobbers = true;
unsigned NumUsed = InstrEmitter::CountResults(N);
while (NumUsed != 0 && !N->hasAnyUseOfValue(NumUsed - 1))
--NumUsed; // Skip over unused values at the end.
// More used results than explicit defs means at least one implicit
// (physical register) def is actually consumed.
if (NumUsed > TII->get(N->getMachineOpcode()).getNumDefs())
SU->hasPhysRegDefs = true;
}
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
SDNode *OpN = N->getOperand(i).getNode();
if (isPassiveNode(OpN)) continue; // Not scheduled.
SUnit *OpSU = &SUnits[OpN->getNodeId()];
// NOTE(review): OpSU is the address of a vector element, so this assert
// cannot fire; a bad node id would already have mis-indexed SUnits above.
assert(OpSU && "Node has no SUnit!");
if (OpSU == SU) continue; // In the same group.
EVT OpVT = N->getOperand(i).getValueType();
assert(OpVT != MVT::Glue && "Glued nodes should be in same sunit!");
bool isChain = OpVT == MVT::Other;
unsigned PhysReg = 0;
int Cost = 1;
// Determine if this is a physical register dependency.
CheckForPhysRegDependency(OpN, N, i, TRI, TII, PhysReg, Cost);
assert((PhysReg == 0 || !isChain) &&
"Chain dependence via physreg data?");
// FIXME: See ScheduleDAGSDNodes::EmitCopyFromReg. For now, scheduler
// emits a copy from the physical register to a virtual register unless
// it requires a cross class copy (cost < 0). That means we are only
// treating "expensive to copy" register dependency as physical register
// dependency. This may change in the future though.
if (Cost >= 0 && !StressSched)
PhysReg = 0;
// If this is a ctrl dep, latency is 1.
unsigned OpLatency = isChain ? 1 : OpSU->Latency;
// Special-case TokenFactor chains as zero-latency.
if(isChain && OpN->getOpcode() == ISD::TokenFactor)
OpLatency = 0;
// Chain operands become Barrier edges; everything else is a Data edge that
// carries the (possibly zeroed) physical register.
SDep Dep = isChain ? SDep(OpSU, SDep::Barrier)
: SDep(OpSU, SDep::Data, PhysReg);
Dep.setLatency(OpLatency);
// Only data edges get a refined latency, and only when the scheduler is
// not forcing unit latencies.
if (!isChain && !UnitLatencies) {
computeOperandLatency(OpN, N, i, Dep);
ST.adjustSchedDependency(OpSU, SU, Dep);
}
if (!SU->addPred(Dep) && !Dep.isCtrl() && OpSU->NumRegDefsLeft > 1) {
// Multiple register uses are combined in the same SUnit. For example,
// we could have a set of glued nodes with all their defs consumed by
// another set of glued nodes. Register pressure tracking sees this as
// a single use, so to keep pressure balanced we reduce the defs.
//
// We can't tell (without more book-keeping) if this results from
// glued nodes or duplicate operands. As long as we don't reduce
// NumRegDefsLeft to zero, we handle the common cases well.
--OpSU->NumRegDefsLeft;
}
}
}
}
}
/// BuildSchedGraph - Build the SUnit graph from the selection dag that we
/// are input. This SUnit graph is similar to the SelectionDAG, but
/// excludes nodes that aren't interesting to scheduling, and represents
/// glued together nodes with a single SUnit.
///
/// The three phases run in a fixed order: clustering, then SUnit creation,
/// then edge construction. The \p AA argument is not referenced by this body.
void ScheduleDAGSDNodes::BuildSchedGraph(AliasAnalysis *AA) {
// Cluster certain nodes which should be scheduled together.
ClusterNodes();
// Populate the SUnits array.
BuildSchedUnits();
// Compute all the scheduling dependencies between nodes.
AddSchedEdges();
}
// Recompute NodeNumDefs for the node the iterator currently points at. For an
// ordinary machine node DefIdx is also rewound to zero.
void ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs() {
  // Check for phys reg copy: there is no node to inspect.
  if (!Node)
    return;

  if (!Node->isMachineOpcode()) {
    // Among non-machine nodes only CopyFromReg counts as a definition.
    // NOTE(review): DefIdx is deliberately left untouched on this path, exactly
    // as in the original; confirm that a stale index is intended here.
    NodeNumDefs = (Node->getOpcode() == ISD::CopyFromReg) ? 1 : 0;
    return;
  }

  const unsigned MachineOpc = Node->getMachineOpcode();

  // IMPLICIT_DEF: no register need be allocated for this.
  const bool IsImplicitDef = MachineOpc == TargetOpcode::IMPLICIT_DEF;
  // PATCHPOINT is defined to have one result, but it might really have none
  // if we're not using CallingConv::AnyReg. Don't mistake the chain for a
  // real definition.
  const bool IsChainOnlyPatchpoint =
      !IsImplicitDef && MachineOpc == TargetOpcode::PATCHPOINT &&
      Node->getValueType(0) == MVT::Other;
  if (IsImplicitDef || IsChainOnlyPatchpoint) {
    NodeNumDefs = 0;
    return;
  }

  // Some instructions define regs that are not represented in the selection DAG
  // (e.g. unused flags). See tMOVi8. Clamp so we never index past NumValues.
  unsigned NRegDefs = SchedDAG->TII->get(MachineOpc).getNumDefs();
  NodeNumDefs = std::min(Node->getNumValues(), NRegDefs);
  DefIdx = 0;
}
// Construct a RegDefIter for this SUnit and find the first valid value.
// The iterator starts at SU's main node with DefIdx and NodeNumDefs zeroed;
// InitNodeNumDefs() then sizes the first node and Advance() positions the
// iterator on the first used definition (if any).
ScheduleDAGSDNodes::RegDefIter::RegDefIter(const SUnit *SU,
const ScheduleDAGSDNodes *SD)
: SchedDAG(SD), Node(SU->getNode()), DefIdx(0), NodeNumDefs(0) {
InitNodeNumDefs();
Advance();
}
// Step to the next used value defined by the SUnit, walking the glue chain
// when the current node runs out of definitions. On return either ValueType
// holds the type of the value found, or Node is null and nothing is left.
void ScheduleDAGSDNodes::RegDefIter::Advance() {
  while (Node) {
    // Scan the remaining definitions of the current node.
    while (DefIdx < NodeNumDefs) {
      const unsigned Candidate = DefIdx++;
      if (!Node->hasAnyUseOfValue(Candidate))
        continue;
      // Found a normal regdef; DefIdx already points past it.
      ValueType = Node->getSimpleValueType(Candidate);
      return;
    }

    // Current node exhausted: move on to the next glued node, if there is one.
    Node = Node->getGluedNode();
    if (Node)
      InitNodeNumDefs();
  }
}
// Count the register definitions reported by RegDefIter for SU and store the
// total in SU->NumRegDefsLeft. The counter is expected to be zero on entry.
void ScheduleDAGSDNodes::InitNumRegDefsLeft(SUnit *SU) {
  assert(SU->NumRegDefsLeft == 0 && "expect a new node");
  RegDefIter DefIt(SU, this);
  while (DefIt.IsValid()) {
    assert(SU->NumRegDefsLeft < USHRT_MAX && "overflow is ok but unexpected");
    ++SU->NumRegDefsLeft;
    DefIt.Advance();
  }
}
// Assign SU->Latency. TokenFactor units get 0, forced-unit-latency schedulers
// get 1, targets without itineraries get 1 or HighLatencyCycles, and otherwise
// the latency is the sum over all machine nodes glued into the unit.
void ScheduleDAGSDNodes::computeLatency(SUnit *SU) {
  SDNode *MainNode = SU->getNode();

  // TokenFactor operands are considered zero latency, and some schedulers
  // (e.g. Top-Down list) may rely on the fact that operand latency is nonzero
  // whenever node latency is nonzero.
  if (MainNode && MainNode->getOpcode() == ISD::TokenFactor) {
    SU->Latency = 0;
    return;
  }

  // The scheduler may not care about latencies at all.
  if (forceUnitLatencies()) {
    SU->Latency = 1;
    return;
  }

  // No itinerary data: fall back to a coarse high-latency / unit split.
  if (!InstrItins || InstrItins->isEmpty()) {
    const bool IsHighLatency =
        MainNode && MainNode->isMachineOpcode() &&
        TII->isHighLatencyDef(MainNode->getMachineOpcode());
    if (IsHighLatency)
      SU->Latency = HighLatencyCycles;
    else
      SU->Latency = 1;
    return;
  }

  // Sum the latencies of every machine node glued together into this SUnit.
  SU->Latency = 0;
  for (SDNode *Glued = MainNode; Glued; Glued = Glued->getGluedNode()) {
    if (!Glued->isMachineOpcode())
      continue;
    SU->Latency += TII->getInstrLatency(InstrItins, Glued);
  }
}
// Refine the latency stored in a data dependence using the target's operand
// latency query. Does nothing when unit latencies are forced or when dep is
// not a Data edge. A negative target answer leaves dep unchanged.
void ScheduleDAGSDNodes::computeOperandLatency(SDNode *Def, SDNode *Use,
                                               unsigned OpIdx, SDep& dep) const{
  if (forceUnitLatencies() || dep.getKind() != SDep::Data)
    return;

  const unsigned DefResNo = Use->getOperand(OpIdx).getResNo();

  // For machine nodes the use operand index is offset by the number of defs.
  unsigned UseOpIdx = OpIdx;
  if (Use->isMachineOpcode())
    UseOpIdx += TII->get(Use->getMachineOpcode()).getNumDefs();

  int Latency =
      TII->getOperandLatency(InstrItins, Def, DefResNo, Use, UseOpIdx);

  const bool IsCopyOutOfBlock = Latency > 1 &&
                                Use->getOpcode() == ISD::CopyToReg &&
                                !BB->succ_empty();
  if (IsCopyOutOfBlock) {
    unsigned Reg = cast<RegisterSDNode>(Use->getOperand(1))->getReg();
    // This copy is a liveout value. It is likely coalesced, so reduce the
    // latency so not to penalize the def. Latency is > 1 here, so the result
    // stays at least 1.
    // FIXME: need target specific adjustment here?
    if (TargetRegisterInfo::isVirtualRegister(Reg))
      Latency -= 1;
  }

  if (Latency >= 0)
    dep.setLatency(Latency);
}
void ScheduleDAGSDNodes::dumpNode(const SUnit &SU) const {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
dumpNodeName(SU);
dbgs() << ": ";
if (!SU.getNode()) {
dbgs() << "PHYS REG COPY\n";
return;
}
SU.getNode()->dump(DAG);
dbgs() << "\n";
SmallVector<SDNode *, 4> GluedNodes;
for (SDNode *N = SU.getNode()->getGluedNode(); N; N = N->getGluedNode())
GluedNodes.push_back(N);
while (!GluedNodes.empty()) {
dbgs() << " ";
GluedNodes.back()->dump(DAG);
dbgs() << "\n";
GluedNodes.pop_back();
}
#endif
}
void ScheduleDAGSDNodes::dump() const {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
if (EntrySU.getNode() != nullptr)
dumpNodeAll(EntrySU);
for (const SUnit &SU : SUnits)
dumpNodeAll(SU);
if (ExitSU.getNode() != nullptr)
dumpNodeAll(ExitSU);
#endif
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void ScheduleDAGSDNodes::dumpSchedule() const {
for (unsigned i = 0, e = Sequence.size(); i != e; i++) {
if (SUnit *SU = Sequence[i])
dumpNode(*SU);
else
dbgs() << "**** NOOP ****\n";
}
}
#endif
#ifndef NDEBUG
/// VerifyScheduledSequence - Verify that all SUnits were scheduled and that
/// their state is consistent with the nodes listed in Sequence.
///
void ScheduleDAGSDNodes::VerifyScheduledSequence(bool isBottomUp) {
unsigned ScheduledNodes = ScheduleDAG::VerifyScheduledDAG(isBottomUp);
unsigned Noops = 0;
for (unsigned i = 0, e = Sequence.size(); i != e; ++i)
if (!Sequence[i])
++Noops;
assert(Sequence.size() - Noops == ScheduledNodes &&
"The number of nodes scheduled doesn't match the expected number!");
}
#endif // NDEBUG
/// ProcessSDDbgValues - Emit the not-yet-emitted SDDbgValues attached to \p N.
/// When \p Order is zero every pending value is emitted; otherwise only those
/// whose own order equals \p Order. Each emitted instruction is appended to
/// \p Orders and inserted at the emitter's insert position as sampled on entry.
static void
ProcessSDDbgValues(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter,
                   SmallVectorImpl<std::pair<unsigned, MachineInstr*> > &Orders,
                   DenseMap<SDValue, unsigned> &VRBaseMap, unsigned Order) {
  if (!N->getHasDebugValue())
    return;

  // Opportunistically insert immediate dbg_value uses, i.e. those with the same
  // source order number as N.
  MachineBasicBlock *BB = Emitter.getBlock();
  MachineBasicBlock::iterator InsertPos = Emitter.getInsertPos();

  for (SDDbgValue *DV : DAG->GetDbgValues(N)) {
    if (DV->isEmitted())
      continue;

    const unsigned DVOrder = DV->getOrder();
    // A non-zero Order acts as a filter on the value's own order.
    if (Order && DVOrder != Order)
      continue;

    MachineInstr *DbgMI = Emitter.EmitDbgValue(DV, VRBaseMap);
    if (!DbgMI)
      continue;

    Orders.push_back({DVOrder, DbgMI});
    BB->insert(InsertPos, DbgMI);
  }
}
// ProcessSourceNode - Record the source order of a freshly emitted node in
// Orders (which EmitSchedule later uses to place dbg_value instructions) and
// emit any SDDbgValues that can be attached to it right away.
static void
ProcessSourceNode(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter,
                  DenseMap<SDValue, unsigned> &VRBaseMap,
                  SmallVectorImpl<std::pair<unsigned, MachineInstr *>> &Orders,
                  SmallSet<unsigned, 8> &Seen, MachineInstr *NewInsn) {
  const unsigned Order = N->getIROrder();

  // Nodes without an order, or whose order was already recorded, still get
  // their valid SDDbgValues processed, but unfiltered (Order == 0).
  const bool HasNoOrder = Order == 0;
  if (HasNoOrder || Seen.count(Order)) {
    ProcessSDDbgValues(N, DAG, Emitter, Orders, VRBaseMap, 0);
    return;
  }

  // Only mark this order number as seen when an instruction was actually
  // generated for it. Otherwise leave it unseen: a later node may produce an
  // instruction for it, or none ever will.
  if (NewInsn != nullptr) {
    Seen.insert(Order);
    Orders.push_back({Order, NewInsn});
  }

  // Even if no instruction was generated, a Value may have become defined via
  // earlier nodes. Try to process them now.
  ProcessSDDbgValues(N, DAG, Emitter, Orders, VRBaseMap, Order);
}
// Emit the COPY for a node-less SUnit. Only the first non-control predecessor
// is considered: if that predecessor has CopyDstRC set, the copy goes from its
// virtual register to a physical register taken from SU's successors;
// otherwise it goes from the predecessor edge's physical register into a new
// virtual register, which is recorded in VRBaseMap.
void ScheduleDAGSDNodes::
EmitPhysRegCopy(SUnit *SU, DenseMap<SUnit*, unsigned> &VRBaseMap,
                MachineBasicBlock::iterator InsertPos) {
  for (const SDep &Pred : SU->Preds) {
    if (Pred.isCtrl())
      continue; // ignore chain preds

    if (Pred.getSUnit()->CopyDstRC) {
      // Copy to physical register.
      DenseMap<SUnit*, unsigned>::iterator VRI =
          VRBaseMap.find(Pred.getSUnit());
      assert(VRI != VRBaseMap.end() && "Node emitted out of order - late");

      // The destination is the first physical register found on a
      // non-control successor edge (0 if there is none).
      unsigned Reg = 0;
      for (const SDep &Succ : SU->Succs) {
        if (Succ.isCtrl())
          continue; // ignore chain succs
        if (Succ.getReg()) {
          Reg = Succ.getReg();
          break;
        }
      }
      BuildMI(*BB, InsertPos, DebugLoc(), TII->get(TargetOpcode::COPY), Reg)
          .addReg(VRI->second);
    } else {
      // Copy from physical register.
      assert(Pred.getReg() && "Unknown physical register!");
      unsigned VRBase = MRI.createVirtualRegister(SU->CopyDstRC);
      bool isNew = VRBaseMap.insert(std::make_pair(SU, VRBase)).second;
      (void)isNew; // Silence compiler warning.
      assert(isNew && "Node emitted out of order - early");
      BuildMI(*BB, InsertPos, DebugLoc(), TII->get(TargetOpcode::COPY), VRBase)
          .addReg(Pred.getReg());
    }

    // Only the first data predecessor is handled.
    break;
  }
}
/// EmitSchedule - Emit the machine code in scheduled order. Return the new
/// InsertPos and MachineBasicBlock that contains this insertion
/// point. ScheduleDAGSDNodes holds a BB pointer for convenience, but this does
/// not necessarily refer to returned BB. The emitter may split blocks.
///
/// The body has three stages: byval-parameter dbg_values for the first block,
/// one emission step per entry of Sequence (noop, phys-reg copy, or a node
/// plus its glued nodes), and finally placement of the remaining dbg_values
/// and dbg_labels by source order.
MachineBasicBlock *ScheduleDAGSDNodes::
EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
InstrEmitter Emitter(BB, InsertPos);
DenseMap<SDValue, unsigned> VRBaseMap;
DenseMap<SUnit*, unsigned> CopyVRBaseMap;
// (source order, first instruction emitted for that order) pairs.
SmallVector<std::pair<unsigned, MachineInstr*>, 32> Orders;
SmallSet<unsigned, 8> Seen;
bool HasDbg = DAG->hasDebugValues();
// Emit a node, and determine where its first instruction is for debuginfo.
// Zero, one, or multiple instructions can be created when emitting a node.
auto EmitNode =
[&](SDNode *Node, bool IsClone, bool IsCloned,
DenseMap<SDValue, unsigned> &VRBaseMap) -> MachineInstr * {
// Fetch instruction prior to this, or end() if nonexistent.
// NOTE(review): the parameter I is only used for the begin() test; the
// returned iterator is derived from Emitter.getInsertPos() again. Both call
// sites pass Emitter.getInsertPos(), so the two coincide here.
auto GetPrevInsn = [&](MachineBasicBlock::iterator I) {
if (I == BB->begin())
return BB->end();
else
return std::prev(Emitter.getInsertPos());
};
MachineBasicBlock::iterator Before = GetPrevInsn(Emitter.getInsertPos());
Emitter.EmitNode(Node, IsClone, IsCloned, VRBaseMap);
MachineBasicBlock::iterator After = GetPrevInsn(Emitter.getInsertPos());
// If the iterator did not change, no instructions were inserted.
if (Before == After)
return nullptr;
MachineInstr *MI;
if (Before == BB->end()) {
// There were no prior instructions; the new ones must start at the
// beginning of the block.
MI = &Emitter.getBlock()->instr_front();
} else {
// Return first instruction after the pre-existing instructions.
MI = &*std::next(Before);
}
if (MI->isCall() && DAG->getTarget().Options.EnableDebugEntryValues)
MF.addCallArgsForwardingRegs(MI, DAG->getSDCallSiteInfo(Node));
return MI;
};
// If this is the first BB, emit byval parameter dbg_value's.
if (HasDbg && BB->getParent()->begin() == MachineFunction::iterator(BB)) {
SDDbgInfo::DbgIterator PDI = DAG->ByvalParmDbgBegin();
SDDbgInfo::DbgIterator PDE = DAG->ByvalParmDbgEnd();
for (; PDI != PDE; ++PDI) {
MachineInstr *DbgMI= Emitter.EmitDbgValue(*PDI, VRBaseMap);
if (DbgMI) {
BB->insert(InsertPos, DbgMI);
// We re-emit the dbg_value closer to its use, too, after instructions
// are emitted to the BB.
(*PDI)->clearIsEmitted();
}
}
}
for (unsigned i = 0, e = Sequence.size(); i != e; i++) {
SUnit *SU = Sequence[i];
if (!SU) {
// Null SUnit* is a noop.
TII->insertNoop(*Emitter.getBlock(), InsertPos);
continue;
}
// For pre-regalloc scheduling, create instructions corresponding to the
// SDNode and any glued SDNodes and append them to the block.
if (!SU->getNode()) {
// Emit a copy.
EmitPhysRegCopy(SU, CopyVRBaseMap, InsertPos);
continue;
}
// Collect the glue chain, then emit it back to front so the main node of
// the SUnit is emitted last.
SmallVector<SDNode *, 4> GluedNodes;
for (SDNode *N = SU->getNode()->getGluedNode(); N; N = N->getGluedNode())
GluedNodes.push_back(N);
while (!GluedNodes.empty()) {
SDNode *N = GluedNodes.back();
auto NewInsn = EmitNode(N, SU->OrigNode != SU, SU->isCloned, VRBaseMap);
// Remember the source order of the inserted instruction.
if (HasDbg)
ProcessSourceNode(N, DAG, Emitter, VRBaseMap, Orders, Seen, NewInsn);
+
+ if (MDNode *MD = DAG->getHeapAllocSite(N)) {
+ if (NewInsn && NewInsn->isCall())
+ MF.addCodeViewHeapAllocSite(NewInsn, MD);
+ }
+
GluedNodes.pop_back();
}
auto NewInsn =
EmitNode(SU->getNode(), SU->OrigNode != SU, SU->isCloned, VRBaseMap);
// Remember the source order of the inserted instruction.
if (HasDbg)
ProcessSourceNode(SU->getNode(), DAG, Emitter, VRBaseMap, Orders, Seen,
NewInsn);
+ if (MDNode *MD = DAG->getHeapAllocSite(SU->getNode())) {
+ if (NewInsn && NewInsn->isCall())
+ MF.addCodeViewHeapAllocSite(NewInsn, MD);
+ }
}
// Insert all the dbg_values which have not already been inserted in source
// order sequence.
if (HasDbg) {
MachineBasicBlock::iterator BBBegin = BB->getFirstNonPHI();
// Sort the source order instructions and use the order to insert debug
// values. Use stable_sort so that DBG_VALUEs are inserted in the same order
// regardless of the host's implementation of std::sort.
llvm::stable_sort(Orders, less_first());
std::stable_sort(DAG->DbgBegin(), DAG->DbgEnd(),
[](const SDDbgValue *LHS, const SDDbgValue *RHS) {
return LHS->getOrder() < RHS->getOrder();
});
SDDbgInfo::DbgIterator DI = DAG->DbgBegin();
SDDbgInfo::DbgIterator DE = DAG->DbgEnd();
// Now emit the rest according to source order.
unsigned LastOrder = 0;
for (unsigned i = 0, e = Orders.size(); i != e && DI != DE; ++i) {
unsigned Order = Orders[i].first;
MachineInstr *MI = Orders[i].second;
// Insert all SDDbgValue's whose order(s) are before "Order".
assert(MI);
for (; DI != DE; ++DI) {
if ((*DI)->getOrder() < LastOrder || (*DI)->getOrder() >= Order)
break;
if ((*DI)->isEmitted())
continue;
MachineInstr *DbgMI = Emitter.EmitDbgValue(*DI, VRBaseMap);
if (DbgMI) {
if (!LastOrder)
// Insert to start of the BB (after PHIs).
BB->insert(BBBegin, DbgMI);
else {
// Insert at the instruction, which may be in a different
// block, if the block was split by a custom inserter.
MachineBasicBlock::iterator Pos = MI;
MI->getParent()->insert(Pos, DbgMI);
}
}
}
LastOrder = Order;
}
// Add trailing DbgValue's before the terminator. FIXME: May want to add
// some of them before one or more conditional branches?
SmallVector<MachineInstr*, 8> DbgMIs;
for (; DI != DE; ++DI) {
if ((*DI)->isEmitted())
continue;
assert((*DI)->getOrder() >= LastOrder &&
"emitting DBG_VALUE out of order");
if (MachineInstr *DbgMI = Emitter.EmitDbgValue(*DI, VRBaseMap))
DbgMIs.push_back(DbgMI);
}
MachineBasicBlock *InsertBB = Emitter.getBlock();
MachineBasicBlock::iterator Pos = InsertBB->getFirstTerminator();
InsertBB->insert(Pos, DbgMIs.begin(), DbgMIs.end());
SDDbgInfo::DbgLabelIterator DLI = DAG->DbgLabelBegin();
SDDbgInfo::DbgLabelIterator DLE = DAG->DbgLabelEnd();
// Now emit the labels according to source order, using the same placement
// rule as for the dbg_values above.
LastOrder = 0;
for (const auto &InstrOrder : Orders) {
unsigned Order = InstrOrder.first;
MachineInstr *MI = InstrOrder.second;
if (!MI)
continue;
// Insert all SDDbgLabel's whose order(s) are before "Order".
for (; DLI != DLE &&
(*DLI)->getOrder() >= LastOrder && (*DLI)->getOrder() < Order;
++DLI) {
MachineInstr *DbgMI = Emitter.EmitDbgLabel(*DLI);
if (DbgMI) {
if (!LastOrder)
// Insert to start of the BB (after PHIs).
BB->insert(BBBegin, DbgMI);
else {
// Insert at the instruction, which may be in a different
// block, if the block was split by a custom inserter.
MachineBasicBlock::iterator Pos = MI;
MI->getParent()->insert(Pos, DbgMI);
}
}
}
if (DLI == DLE)
break;
LastOrder = Order;
}
}
// Hand the (possibly moved) insertion point and block back to the caller.
InsertPos = Emitter.getInsertPos();
return Emitter.getBlock();
}
/// Return the DAG's display name: the prefix "sunit-dag." followed by the
/// full name of the basic block being scheduled.
std::string ScheduleDAGSDNodes::getDAGName() const {
  std::string Name = "sunit-dag.";
  Name += BB->getFullName();
  return Name;
}
Index: vendor/llvm/dist-release_90/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- vendor/llvm/dist-release_90/lib/CodeGen/SelectionDAG/SelectionDAG.cpp (revision 351302)
+++ vendor/llvm/dist-release_90/lib/CodeGen/SelectionDAG/SelectionDAG.cpp (revision 351303)
@@ -1,9594 +1,9595 @@
//===- SelectionDAG.cpp - Implement the SelectionDAG data structures ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This implements the SelectionDAG class.
//
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/SelectionDAG.h"
#include "SDNodeDbgValue.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/FoldingSet.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Mutex.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <limits>
#include <set>
#include <string>
#include <utility>
#include <vector>
using namespace llvm;
/// makeVTList - Bundle a value-type array pointer and its element count into
/// an SDVTList. The array itself is not copied.
static SDVTList makeVTList(const EVT *VTs, unsigned NumVTs) {
  return {VTs, NumVTs};
}
// Default null implementations of the callbacks. Each one ignores its
// arguments and does nothing, so listeners only need to provide the
// notifications they care about.
void SelectionDAG::DAGUpdateListener::NodeDeleted(SDNode*, SDNode*) {}
void SelectionDAG::DAGUpdateListener::NodeUpdated(SDNode*) {}
void SelectionDAG::DAGUpdateListener::NodeInserted(SDNode *) {}
// Out-of-line definition with an empty body.
// NOTE(review): presumably the usual LLVM vtable anchor — confirm.
void SelectionDAG::DAGNodeDeletedListener::anchor() {}
// Debug category used by LLVM_DEBUG output in this file.
#define DEBUG_TYPE "selectiondag"
// Hidden flag, on by default: gang up the loads and stores produced when a
// memcpy is inlined.
static cl::opt<bool> EnableMemCpyDAGOpt("enable-memcpy-dag-opt",
cl::Hidden, cl::init(true),
cl::desc("Gang up loads and stores generated by inlining of memcpy"));
// Hidden flag, default 0: limit on the number of memcpy loads/stores that are
// glued together.
static cl::opt<int> MaxLdStGlue("ldstmemcpy-glue-max",
cl::desc("Number limit for gluing ld/st of memcpy."),
cl::Hidden, cl::init(0));
// Debug helper: under LLVM_DEBUG, print Msg followed by a dump of V's node
// (using G as the DAG context for the dump).
static void NewSDValueDbgMsg(SDValue V, StringRef Msg, SelectionDAG *G) {
LLVM_DEBUG(dbgs() << Msg; V.getNode()->dump(G););
}
//===----------------------------------------------------------------------===//
// ConstantFPSDNode Class
//===----------------------------------------------------------------------===//
/// isExactlyValue - We don't rely on operator== working on double values, as
/// it returns true for things that are clearly not equal, like -0.0 and 0.0.
/// As such, this method can be used to do an exact bit-for-bit comparison of
/// two floating point values.
bool ConstantFPSDNode::isExactlyValue(const APFloat& V) const {
return getValueAPF().bitwiseIsEqual(V);
}
// Return true when Val can be converted to the floating-point semantics of VT
// (rounding to nearest, ties to even) without losing information.
bool ConstantFPSDNode::isValueValidForType(EVT VT,
                                           const APFloat& Val) {
  assert(VT.isFloatingPoint() && "Can only convert between FP types");

  // convert() modifies its object in place, so work on a private copy.
  APFloat Scratch(Val);
  bool LosesInfo = false;
  (void)Scratch.convert(SelectionDAG::EVTToAPFloatSemantics(VT),
                        APFloat::rmNearestTiesToEven, &LosesInfo);
  return !LosesInfo;
}
//===----------------------------------------------------------------------===//
// ISD Namespace
//===----------------------------------------------------------------------===//
// Return true when N is a BUILD_VECTOR that is a constant splat whose splat
// width equals the vector's element width. The splat value is written to
// SplatVal by BuildVectorSDNode::isConstantSplat.
bool ISD::isConstantSplatVector(const SDNode *N, APInt &SplatVal) {
  auto *BuildVec = dyn_cast<BuildVectorSDNode>(N);
  if (BuildVec == nullptr)
    return false;

  APInt SplatUndef;
  unsigned SplatBitSize;
  bool HasUndefs;
  unsigned EltSize = N->getValueType(0).getVectorElementType().getSizeInBits();

  if (!BuildVec->isConstantSplat(SplatVal, SplatUndef, SplatBitSize, HasUndefs,
                                 EltSize))
    return false;
  return EltSize == SplatBitSize;
}
// FIXME: AllOnes and AllZeros duplicate a lot of code. Could these be
// specializations of the more general isConstantSplatVector()?
bool ISD::isBuildVectorAllOnes(const SDNode *N) {
// Look through a bit convert.
while (N->getOpcode() == ISD::BITCAST)
N = N->getOperand(0).getNode();
if (N->getOpcode() != ISD::BUILD_VECTOR) return false;
unsigned i = 0, e = N->getNumOperands();
// Skip over all of the undef values.
while (i != e && N->getOperand(i).isUndef())
++i;
// Do not accept an all-undef vector.
if (i == e) return false;
// Do not accept build_vectors that aren't all constants or which have non-~0
// elements. We have to be a bit careful here, as the type of the constant
// may not be the same as the type of the vector elements due to type
// legalization (the elements are promoted to a legal type for the target and
// a vector of a type may be legal when the base element type is not).
// We only want to check enough bits to cover the vector elements, because
// we care if the resultant vector is all ones, not whether the individual
// constants are.
SDValue NotZero = N->getOperand(i);
unsigned EltSize = N->getValueType(0).getScalarSizeInBits();
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(NotZero)) {
if (CN->getAPIntValue().countTrailingOnes() < EltSize)
return false;
} else if (ConstantFPSDNode *CFPN = dyn_cast<ConstantFPSDNode>(NotZero)) {
if (CFPN->getValueAPF().bitcastToAPInt().countTrailingOnes() < EltSize)
return false;
} else
return false;
// Okay, we have at least one ~0 value, check to see if the rest match or are
// undefs. Even with the above element type twiddling, this should be OK, as
// the same type legalization should have applied to all the elements.
for (++i; i != e; ++i)
if (N->getOperand(i) != NotZero && !N->getOperand(i).isUndef())
return false;
return true;
}
bool ISD::isBuildVectorAllZeros(const SDNode *N) {
// Look through a bit convert.
while (N->getOpcode() == ISD::BITCAST)
N = N->getOperand(0).getNode();
if (N->getOpcode() != ISD::BUILD_VECTOR) return false;
bool IsAllUndef = true;
for (const SDValue &Op : N->op_values()) {
if (Op.isUndef())
continue;
IsAllUndef = false;
// Do not accept build_vectors that aren't all constants or which have non-0
// elements. We have to be a bit careful here, as the type of the constant
// may not be the same as the type of the vector elements due to type
// legalization (the elements are promoted to a legal type for the target
// and a vector of a type may be legal when the base element type is not).
// We only want to check enough bits to cover the vector elements, because
// we care if the resultant vector is all zeros, not whether the individual
// constants are.
unsigned EltSize = N->getValueType(0).getScalarSizeInBits();
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op)) {
if (CN->getAPIntValue().countTrailingZeros() < EltSize)
return false;
} else if (ConstantFPSDNode *CFPN = dyn_cast<ConstantFPSDNode>(Op)) {
if (CFPN->getValueAPF().bitcastToAPInt().countTrailingZeros() < EltSize)
return false;
} else
return false;
}
// Do not accept an all-undef vector.
if (IsAllUndef)
return false;
return true;
}
bool ISD::isBuildVectorOfConstantSDNodes(const SDNode *N) {
if (N->getOpcode() != ISD::BUILD_VECTOR)
return false;
for (const SDValue &Op : N->op_values()) {
if (Op.isUndef())
continue;
if (!isa<ConstantSDNode>(Op))
return false;
}
return true;
}
bool ISD::isBuildVectorOfConstantFPSDNodes(const SDNode *N) {
if (N->getOpcode() != ISD::BUILD_VECTOR)
return false;
for (const SDValue &Op : N->op_values()) {
if (Op.isUndef())
continue;
if (!isa<ConstantFPSDNode>(Op))
return false;
}
return true;
}
bool ISD::allOperandsUndef(const SDNode *N) {
// Return false if the node has no operands.
// This is "logically inconsistent" with the definition of "all" but
// is probably the desired behavior.
if (N->getNumOperands() == 0)
return false;
return all_of(N->op_values(), [](SDValue Op) { return Op.isUndef(); });
}
// Apply Match to a scalar constant, or to every element of a BUILD_VECTOR of
// constants, and return true only if all calls succeed. With AllowUndefs set,
// undef vector elements are passed to Match as nullptr.
bool ISD::matchUnaryPredicate(SDValue Op,
                              std::function<bool(ConstantSDNode *)> Match,
                              bool AllowUndefs) {
  // FIXME: Add support for scalar UNDEF cases?
  if (auto *Scalar = dyn_cast<ConstantSDNode>(Op))
    return Match(Scalar);

  // FIXME: Add support for vector UNDEF cases?
  if (Op.getOpcode() != ISD::BUILD_VECTOR)
    return false;

  EVT SVT = Op.getValueType().getScalarType();
  const unsigned NumElts = Op.getNumOperands();
  for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
    SDValue Elt = Op.getOperand(Idx);

    if (AllowUndefs && Elt.isUndef()) {
      if (!Match(nullptr))
        return false;
      continue;
    }

    // Every other element must be a constant of exactly the scalar type that
    // also satisfies the predicate.
    auto *Cst = dyn_cast<ConstantSDNode>(Elt);
    if (!Cst)
      return false;
    if (Cst->getValueType(0) != SVT)
      return false;
    if (!Match(Cst))
      return false;
  }
  return true;
}
// Apply Match pairwise to two scalar constants, or to the corresponding
// elements of two BUILD_VECTORs of constants, and return true only if all
// calls succeed. With AllowUndefs set, an undef element is passed to Match as
// nullptr. Unless AllowTypeMismatch is set, operand and element types must
// agree.
bool ISD::matchBinaryPredicate(
    SDValue LHS, SDValue RHS,
    std::function<bool(ConstantSDNode *, ConstantSDNode *)> Match,
    bool AllowUndefs, bool AllowTypeMismatch) {
  const bool RequireSameTypes = !AllowTypeMismatch;
  if (RequireSameTypes && LHS.getValueType() != RHS.getValueType())
    return false;

  // TODO: Add support for scalar UNDEF cases?
  if (auto *LScalar = dyn_cast<ConstantSDNode>(LHS))
    if (auto *RScalar = dyn_cast<ConstantSDNode>(RHS))
      return Match(LScalar, RScalar);

  // TODO: Add support for vector UNDEF cases?
  if (LHS.getOpcode() != ISD::BUILD_VECTOR ||
      RHS.getOpcode() != ISD::BUILD_VECTOR)
    return false;

  EVT SVT = LHS.getValueType().getScalarType();
  const unsigned NumElts = LHS.getNumOperands();
  for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
    SDValue LElt = LHS.getOperand(Idx);
    SDValue RElt = RHS.getOperand(Idx);

    const bool LIsUndef = AllowUndefs && LElt.isUndef();
    const bool RIsUndef = AllowUndefs && RElt.isUndef();
    auto *LCst = dyn_cast<ConstantSDNode>(LElt);
    auto *RCst = dyn_cast<ConstantSDNode>(RElt);

    // Each side must be a constant or an accepted undef.
    const bool LUsable = LCst || LIsUndef;
    const bool RUsable = RCst || RIsUndef;
    if (!LUsable || !RUsable)
      return false;

    if (RequireSameTypes && (LElt.getValueType() != SVT ||
                             LElt.getValueType() != RElt.getValueType()))
      return false;

    if (!Match(LCst, RCst))
      return false;
  }
  return true;
}
// Map a load extension kind to the matching extend opcode. EXTLOAD becomes
// FP_EXTEND for floating point and ANY_EXTEND otherwise; any kind other than
// EXTLOAD / SEXTLOAD / ZEXTLOAD is unreachable.
ISD::NodeType ISD::getExtForLoadExtType(bool IsFP, ISD::LoadExtType ExtType) {
  if (ExtType == ISD::EXTLOAD)
    return IsFP ? ISD::FP_EXTEND : ISD::ANY_EXTEND;
  if (ExtType == ISD::SEXTLOAD)
    return ISD::SIGN_EXTEND;
  if (ExtType == ISD::ZEXTLOAD)
    return ISD::ZERO_EXTEND;

  llvm_unreachable("Invalid LoadExtType");
}
// Return the condition code that results from swapping the two comparison
// operands: the L bit (bit 2) and the G bit (bit 1) trade places, every other
// bit is kept.
ISD::CondCode ISD::getSetCCSwappedOperands(ISD::CondCode Operation) {
  const unsigned Bits = Operation;
  const unsigned OldL = (Bits >> 2) & 1;
  const unsigned OldG = (Bits >> 1) & 1;

  unsigned Swapped = Bits & ~6u; // Keep the N, U, E bits
  Swapped |= OldL << 1;          // New G bit
  Swapped |= OldG << 2;          // New L bit.
  return ISD::CondCode(Swapped);
}
// Return the logical inverse of a condition code. Integer comparisons flip
// only the L, G and E bits; floating-point comparisons flip all four
// condition bits.
ISD::CondCode ISD::getSetCCInverse(ISD::CondCode Op, bool isInteger) {
  // 7 flips L, G, E but not U; 15 flips all of the condition bits.
  const unsigned FlipMask = isInteger ? 7 : 15;
  unsigned Operation = Op;
  Operation ^= FlipMask;

  if (Operation > ISD::SETTRUE2)
    Operation &= ~8; // Don't let N and U bits get set.

  return ISD::CondCode(Operation);
}
/// Classify an integer comparison by signedness: 0 for SETEQ/SETNE (sign does
/// not matter), 1 for the signed orderings, 2 for the unsigned orderings.
/// Any other condition code is not a legal integer setcc and is treated as
/// unreachable.
static int isSignedOp(ISD::CondCode Opcode) {
  switch (Opcode) {
  case ISD::SETEQ:
  case ISD::SETNE:
    return 0;

  case ISD::SETLT:
  case ISD::SETLE:
  case ISD::SETGT:
  case ISD::SETGE:
    return 1;

  case ISD::SETULT:
  case ISD::SETULE:
  case ISD::SETUGT:
  case ISD::SETUGE:
    return 2;

  default:
    break;
  }
  llvm_unreachable("Illegal integer setcc operation!");
}
/// Compute the condition code equivalent to (Op1 || Op2) applied to the same
/// operands. Returns SETCC_INVALID when an integer signed comparison would
/// have to be merged with an unsigned one.
ISD::CondCode ISD::getSetCCOrOperation(ISD::CondCode Op1, ISD::CondCode Op2,
                                       bool IsInteger) {
  if (IsInteger) {
    // isSignedOp yields 1 for signed and 2 for unsigned, so 3 means the two
    // comparisons disagree and cannot be folded together.
    const int Signedness = isSignedOp(Op1) | isSignedOp(Op2);
    if (Signedness == 3)
      return ISD::SETCC_INVALID;
  }

  // Combine all of the condition bits.
  unsigned Merged = Op1 | Op2;

  // A merged value beyond SETTRUE2 is brought back into range by masking off
  // bit value 16.
  // NOTE(review): the original comment described this as clearing the U bit;
  // confirm against the CondCode encoding which bit 16 actually is.
  if (Merged > ISD::SETTRUE2)
    Merged &= ~16;

  // Canonicalize illegal integer setcc's, e.g. SETUGT | SETULT.
  if (IsInteger && Merged == ISD::SETUNE)
    Merged = ISD::SETNE;

  return ISD::CondCode(Merged);
}
/// Compute the condition code equivalent to (Op1 && Op2) applied to the same
/// operands. Returns SETCC_INVALID when an integer signed comparison would
/// have to be merged with an unsigned one.
ISD::CondCode ISD::getSetCCAndOperation(ISD::CondCode Op1, ISD::CondCode Op2,
                                        bool IsInteger) {
  if (IsInteger) {
    // Cannot fold a signed setcc with an unsigned setcc.
    const int Signedness = isSignedOp(Op1) | isSignedOp(Op2);
    if (Signedness == 3)
      return ISD::SETCC_INVALID;
  }

  // Intersect the condition bits.
  const ISD::CondCode Result = ISD::CondCode(Op1 & Op2);
  if (!IsInteger)
    return Result;

  // Canonicalize results that are not legal integer setcc's.
  switch (Result) {
  case ISD::SETUO: // SETUGT & SETULT
    return ISD::SETFALSE;
  case ISD::SETOEQ: // SETEQ & SETU[LG]E
  case ISD::SETUEQ: // SETUGE & SETULE
    return ISD::SETEQ;
  case ISD::SETOLT: // SETULT & SETNE
    return ISD::SETULT;
  case ISD::SETOGT: // SETUGT & SETNE
    return ISD::SETUGT;
  default:
    return Result;
  }
}
//===----------------------------------------------------------------------===//
// SDNode Profile Support
//===----------------------------------------------------------------------===//
/// Fold a node opcode into the NodeID data.
static void AddNodeIDOpcode(FoldingSetNodeID &ID, unsigned OpC) {
  ID.AddInteger(OpC);
}
/// Fold a value type list into the NodeID data. Only the list's VTs pointer
/// is added; this relies on value type lists being interned.
static void AddNodeIDValueTypes(FoldingSetNodeID &ID, SDVTList VTList) {
  ID.AddPointer(VTList.VTs);
}
/// Fold a list of SDValue operands into the NodeID data: for each operand the
/// defining node pointer followed by the result number.
static void AddNodeIDOperands(FoldingSetNodeID &ID,
                              ArrayRef<SDValue> Ops) {
  for (const SDValue &Operand : Ops) {
    ID.AddPointer(Operand.getNode());
    ID.AddInteger(Operand.getResNo());
  }
}
/// Fold a list of SDUse operands into the NodeID data: for each use the
/// defining node pointer followed by the result number.
static void AddNodeIDOperands(FoldingSetNodeID &ID,
                              ArrayRef<SDUse> Ops) {
  for (const SDUse &Use : Ops) {
    ID.AddPointer(Use.getNode());
    ID.AddInteger(Use.getResNo());
  }
}
/// Build the NodeID data for a node described by its parts, in the fixed
/// order opcode, value type list, operands.
static void AddNodeIDNode(FoldingSetNodeID &ID, unsigned short OpC,
                          SDVTList VTList, ArrayRef<SDValue> OpList) {
  AddNodeIDOpcode(ID, OpC);        // 1. opcode
  AddNodeIDValueTypes(ID, VTList); // 2. result types
  AddNodeIDOperands(ID, OpList);   // 3. operands
}
/// If this is an SDNode with special info, add this info to the NodeID data.
///
/// The switch below selects, per opcode, the extra fields that are folded
/// into \p ID in addition to whatever the caller has already added. Opcodes
/// not listed contribute nothing here. After the switch, any node reporting
/// isTargetMemoryOpcode() additionally contributes the address space of its
/// pointer info.
///
/// The order in which fields are added is part of the resulting ID, so the
/// statements inside each case must not be reordered.
static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
switch (N->getOpcode()) {
// Symbol nodes are rejected outright; this routine is not meant for them.
case ISD::TargetExternalSymbol:
case ISD::ExternalSymbol:
case ISD::MCSymbol:
llvm_unreachable("Should only be used on nodes with operands");
default: break; // Normal nodes don't need extra info.
// Integer constants: the ConstantInt pointer plus the opaque flag.
case ISD::TargetConstant:
case ISD::Constant: {
const ConstantSDNode *C = cast<ConstantSDNode>(N);
ID.AddPointer(C->getConstantIntValue());
ID.AddBoolean(C->isOpaque());
break;
}
// Floating-point constants: the ConstantFP pointer.
case ISD::TargetConstantFP:
case ISD::ConstantFP:
ID.AddPointer(cast<ConstantFPSDNode>(N)->getConstantFPValue());
break;
// Global addresses (plain and TLS): global, offset, target flags.
case ISD::TargetGlobalAddress:
case ISD::GlobalAddress:
case ISD::TargetGlobalTLSAddress:
case ISD::GlobalTLSAddress: {
const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
ID.AddPointer(GA->getGlobal());
ID.AddInteger(GA->getOffset());
ID.AddInteger(GA->getTargetFlags());
break;
}
case ISD::BasicBlock:
ID.AddPointer(cast<BasicBlockSDNode>(N)->getBasicBlock());
break;
case ISD::Register:
ID.AddInteger(cast<RegisterSDNode>(N)->getReg());
break;
case ISD::RegisterMask:
ID.AddPointer(cast<RegisterMaskSDNode>(N)->getRegMask());
break;
case ISD::SRCVALUE:
ID.AddPointer(cast<SrcValueSDNode>(N)->getValue());
break;
case ISD::FrameIndex:
case ISD::TargetFrameIndex:
ID.AddInteger(cast<FrameIndexSDNode>(N)->getIndex());
break;
// Lifetime markers only contribute size and offset when hasOffset() is true.
case ISD::LIFETIME_START:
case ISD::LIFETIME_END:
if (cast<LifetimeSDNode>(N)->hasOffset()) {
ID.AddInteger(cast<LifetimeSDNode>(N)->getSize());
ID.AddInteger(cast<LifetimeSDNode>(N)->getOffset());
}
break;
case ISD::JumpTable:
case ISD::TargetJumpTable:
ID.AddInteger(cast<JumpTableSDNode>(N)->getIndex());
ID.AddInteger(cast<JumpTableSDNode>(N)->getTargetFlags());
break;
// Constant pool entries: alignment, offset, then either the machine-specific
// value's own CSE id or the constant pointer, and finally the target flags.
case ISD::ConstantPool:
case ISD::TargetConstantPool: {
const ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(N);
ID.AddInteger(CP->getAlignment());
ID.AddInteger(CP->getOffset());
if (CP->isMachineConstantPoolEntry())
CP->getMachineCPVal()->addSelectionDAGCSEId(ID);
else
ID.AddPointer(CP->getConstVal());
ID.AddInteger(CP->getTargetFlags());
break;
}
case ISD::TargetIndex: {
const TargetIndexSDNode *TI = cast<TargetIndexSDNode>(N);
ID.AddInteger(TI->getIndex());
ID.AddInteger(TI->getOffset());
ID.AddInteger(TI->getTargetFlags());
break;
}
// The memory-access cases below all add the same three fields: raw bits of
// the memory VT, the raw subclass data, and the pointer's address space.
case ISD::LOAD: {
const LoadSDNode *LD = cast<LoadSDNode>(N);
ID.AddInteger(LD->getMemoryVT().getRawBits());
ID.AddInteger(LD->getRawSubclassData());
ID.AddInteger(LD->getPointerInfo().getAddrSpace());
break;
}
case ISD::STORE: {
const StoreSDNode *ST = cast<StoreSDNode>(N);
ID.AddInteger(ST->getMemoryVT().getRawBits());
ID.AddInteger(ST->getRawSubclassData());
ID.AddInteger(ST->getPointerInfo().getAddrSpace());
break;
}
case ISD::MLOAD: {
const MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N);
ID.AddInteger(MLD->getMemoryVT().getRawBits());
ID.AddInteger(MLD->getRawSubclassData());
ID.AddInteger(MLD->getPointerInfo().getAddrSpace());
break;
}
case ISD::MSTORE: {
const MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
ID.AddInteger(MST->getMemoryVT().getRawBits());
ID.AddInteger(MST->getRawSubclassData());
ID.AddInteger(MST->getPointerInfo().getAddrSpace());
break;
}
case ISD::MGATHER: {
const MaskedGatherSDNode *MG = cast<MaskedGatherSDNode>(N);
ID.AddInteger(MG->getMemoryVT().getRawBits());
ID.AddInteger(MG->getRawSubclassData());
ID.AddInteger(MG->getPointerInfo().getAddrSpace());
break;
}
case ISD::MSCATTER: {
const MaskedScatterSDNode *MS = cast<MaskedScatterSDNode>(N);
ID.AddInteger(MS->getMemoryVT().getRawBits());
ID.AddInteger(MS->getRawSubclassData());
ID.AddInteger(MS->getPointerInfo().getAddrSpace());
break;
}
// All atomic operations share one profile layout (same three fields).
case ISD::ATOMIC_CMP_SWAP:
case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
case ISD::ATOMIC_SWAP:
case ISD::ATOMIC_LOAD_ADD:
case ISD::ATOMIC_LOAD_SUB:
case ISD::ATOMIC_LOAD_AND:
case ISD::ATOMIC_LOAD_CLR:
case ISD::ATOMIC_LOAD_OR:
case ISD::ATOMIC_LOAD_XOR:
case ISD::ATOMIC_LOAD_NAND:
case ISD::ATOMIC_LOAD_MIN:
case ISD::ATOMIC_LOAD_MAX:
case ISD::ATOMIC_LOAD_UMIN:
case ISD::ATOMIC_LOAD_UMAX:
case ISD::ATOMIC_LOAD:
case ISD::ATOMIC_STORE: {
const AtomicSDNode *AT = cast<AtomicSDNode>(N);
ID.AddInteger(AT->getMemoryVT().getRawBits());
ID.AddInteger(AT->getRawSubclassData());
ID.AddInteger(AT->getPointerInfo().getAddrSpace());
break;
}
// Prefetch contributes only the address space.
case ISD::PREFETCH: {
const MemSDNode *PF = cast<MemSDNode>(N);
ID.AddInteger(PF->getPointerInfo().getAddrSpace());
break;
}
// Shuffles contribute one mask element per element of the result vector type.
case ISD::VECTOR_SHUFFLE: {
const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
for (unsigned i = 0, e = N->getValueType(0).getVectorNumElements();
i != e; ++i)
ID.AddInteger(SVN->getMaskElt(i));
break;
}
case ISD::TargetBlockAddress:
case ISD::BlockAddress: {
const BlockAddressSDNode *BA = cast<BlockAddressSDNode>(N);
ID.AddPointer(BA->getBlockAddress());
ID.AddInteger(BA->getOffset());
ID.AddInteger(BA->getTargetFlags());
break;
}
} // end switch (N->getOpcode())
// Target specific memory nodes could also have address spaces to check.
// This runs for every opcode that reaches the end of the switch, i.e. in
// addition to (not instead of) any case handled above.
if (N->isTargetMemoryOpcode())
ID.AddInteger(cast<MemSDNode>(N)->getPointerInfo().getAddrSpace());
}
/// Build the complete NodeID data for an existing node: opcode, result value
/// types, operands, and finally any opcode-specific extra information.
static void AddNodeIDNode(FoldingSetNodeID &ID, const SDNode *N) {
  AddNodeIDOpcode(ID, N->getOpcode());
  AddNodeIDValueTypes(ID, N->getVTList()); // Return value info.
  AddNodeIDOperands(ID, N->ops());         // Operand info.
  AddNodeIDCustom(ID, N);                  // SDNode leafs with special info.
}
//===----------------------------------------------------------------------===//
// SelectionDAG Class
//===----------------------------------------------------------------------===//
/// Return true if CSE must not be performed for \p N: the node produces a
/// Glue value in any of its results, or it is a HANDLENODE or EH_LABEL.
static bool doNotCSE(SDNode *N) {
  // Never CSE anything that produces a flag (first result).
  if (N->getValueType(0) == MVT::Glue)
    return true;

  // These node kinds are never CSE'd.
  const unsigned Opc = N->getOpcode();
  if (Opc == ISD::HANDLENODE || Opc == ISD::EH_LABEL)
    return true;

  // Check that the remaining results are not flags either.
  for (unsigned Idx = 1, NumVals = N->getNumValues(); Idx != NumVals; ++Idx) {
    if (N->getValueType(Idx) == MVT::Glue)
      return true;
  }
  return false;
}
/// RemoveDeadNodes - This method deletes all unreachable nodes in the
/// SelectionDAG.
void SelectionDAG::RemoveDeadNodes() {
// Create a dummy node (which is not added to allnodes), that adds a reference
// to the root node, preventing it from being deleted.
HandleSDNode Dummy(getRoot());
SmallVector<SDNode*, 128> DeadNodes;
// Add all obviously-dead nodes to the DeadNodes worklist.
for (SDNode &Node : allnodes())
if (Node.use_empty())
DeadNodes.push_back(&Node);
RemoveDeadNodes(DeadNodes);
// If the root changed (e.g. it was a dead load, update the root).
setRoot(Dummy.getValue());
}
/// Delete the nodes in \p DeadNodes and, transitively, every operand that is
/// left without uses once its user has been removed. \p DeadNodes is consumed
/// as a worklist and is empty on return.
void SelectionDAG::RemoveDeadNodes(SmallVectorImpl<SDNode *> &DeadNodes) {
  while (!DeadNodes.empty()) {
    SDNode *Dead = DeadNodes.pop_back_val();

    // A node may already have been deleted since it was queued (for example
    // as a side effect of a replacement); skip it in that case.
    if (Dead->getOpcode() == ISD::DELETED_NODE)
      continue;

    for (DAGUpdateListener *Listener = UpdateListeners; Listener;
         Listener = Listener->Next)
      Listener->NodeDeleted(Dead, nullptr);

    // Take the node out of the appropriate CSE map.
    RemoveNodeFromCSEMaps(Dead);

    // Clear each operand slot in turn. This is safe because the graph has no
    // cycles. An operand whose last use just went away is queued for deletion.
    for (SDNode::op_iterator I = Dead->op_begin(), E = Dead->op_end(); I != E;
         ++I) {
      SDUse &Use = *I;
      SDNode *Operand = Use.getNode();
      Use.set(SDValue());
      if (Operand->use_empty())
        DeadNodes.push_back(Operand);
    }

    DeallocateNode(Dead);
  }
}
/// Delete the single dead node \p N along with any operands that become
/// unused as a result.
void SelectionDAG::RemoveDeadNode(SDNode *N) {
  SmallVector<SDNode*, 16> Worklist(1, N);

  // Keep a use of the root alive for the duration of the removal; this
  // matters if the root is an operand of the dead node.
  HandleSDNode Dummy(getRoot());

  RemoveDeadNodes(Worklist);
}
/// Delete \p N: unregister it from the CSE maps, then drop its operands and
/// release it.
void SelectionDAG::DeleteNode(SDNode *N) {
  // Step 1: the node must no longer be findable through CSE.
  RemoveNodeFromCSEMaps(N);

  // Step 2: remove uses due to operands of this node, remove it from the
  // AllNodes list, and free it.
  DeleteNodeNotInCSEMaps(N);
}
/// Delete a node that has already been taken out of the CSE maps. The node
/// must be dead (no uses) and must not be the first node of AllNodes.
void SelectionDAG::DeleteNodeNotInCSEMaps(SDNode *N) {
  assert(N->getIterator() != AllNodes.begin() &&
         "Cannot delete the entry node!");
  assert(N->use_empty() && "Cannot delete a node that is not dead!");

  // Detach from all operands (decrementing their use counts), then release
  // the node itself.
  N->DropOperands();
  DeallocateNode(N);
}
/// Invalidate every debug value recorded for \p Node and drop the node's
/// entry from the map. Does nothing if the node has no entry.
void SDDbgInfo::erase(const SDNode *Node) {
  auto Entry = DbgValMap.find(Node);
  if (Entry != DbgValMap.end()) {
    for (auto &DbgVal : Entry->second)
      DbgVal->setIsInvalidated();
    DbgValMap.erase(Entry);
  }
}
/// Release \p N: free its operand storage, unlink it from AllNodes, hand its
/// memory back to NodeAllocator, mark it as DELETED_NODE and invalidate any
/// debug values that refer to it.
///
/// Statement order matters here: the opcode is overwritten *after* the node's
/// memory has been given back to the allocator (see the FIXME below).
void SelectionDAG::DeallocateNode(SDNode *N) {
// If we have operands, deallocate them.
removeOperands(N);
NodeAllocator.Deallocate(AllNodes.remove(N));
// Set the opcode to DELETED_NODE to help catch bugs when node
// memory is reallocated.
// FIXME: There are places in SDag that have grown a dependency on the opcode
// value in the released node.
// The write below targets memory that was just deallocated, hence the explicit
// unpoisoning of exactly the NodeType field first.
// NOTE(review): presumably the allocator poisons released memory under ASan —
// confirm.
__asan_unpoison_memory_region(&N->NodeType, sizeof(N->NodeType));
N->NodeType = ISD::DELETED_NODE;
// If any of the SDDbgValue nodes refer to this SDNode, invalidate
// them and forget about that node.
DbgInfo->erase(N);
}
#ifndef NDEBUG
/// Sanity check a freshly created SDNode; asserts if it is malformed. Only
/// BUILD_PAIR and BUILD_VECTOR are checked, every other opcode is accepted
/// as is.
static void VerifySDNode(SDNode *N) {
  const unsigned Opc = N->getOpcode();

  if (Opc == ISD::BUILD_PAIR) {
    EVT ResVT = N->getValueType(0);
    assert(N->getNumValues() == 1 && "Too many results!");
    assert(!ResVT.isVector() && (ResVT.isInteger() || ResVT.isFloatingPoint()) &&
           "Wrong return type!");
    assert(N->getNumOperands() == 2 && "Wrong number of operands!");
    assert(N->getOperand(0).getValueType() == N->getOperand(1).getValueType() &&
           "Mismatched operand types!");
    assert(N->getOperand(0).getValueType().isInteger() == ResVT.isInteger() &&
           "Wrong operand type!");
    // The result must be exactly twice as wide as one half.
    assert(ResVT.getSizeInBits() == 2 * N->getOperand(0).getValueSizeInBits() &&
           "Wrong return type size");
    return;
  }

  if (Opc == ISD::BUILD_VECTOR) {
    assert(N->getNumValues() == 1 && "Too many results!");
    assert(N->getValueType(0).isVector() && "Wrong return type!");
    assert(N->getNumOperands() == N->getValueType(0).getVectorNumElements() &&
           "Wrong number of operands!");
    EVT EltVT = N->getValueType(0).getVectorElementType();
    for (const SDUse &Use : N->ops()) {
      // An operand either has the element type, or (for integers) a type at
      // least as wide as the element type.
      assert((Use.getValueType() == EltVT ||
              (EltVT.isInteger() && Use.getValueType().isInteger() &&
               EltVT.bitsLE(Use.getValueType()))) &&
             "Wrong operand type!");
      assert(Use.getValueType() == N->getOperand(0).getValueType() &&
             "Operands must all have the same type");
    }
    return;
  }
}
#endif // NDEBUG
/// Register a newly allocated node with the DAG.
///
/// Appends the node to the all-nodes list, assigns a persistent id and
/// verifies the node in builds with assertions enabled, and notifies every
/// registered update listener of the insertion.
void SelectionDAG::InsertNode(SDNode *N) {
  AllNodes.push_back(N);

#ifndef NDEBUG
  N->PersistentId = NextPersistentId++;
  VerifySDNode(N);
#endif

  for (DAGUpdateListener *Listener = UpdateListeners; Listener;
       Listener = Listener->Next)
    Listener->NodeInserted(N);
}
/// Take \p N out of whichever uniquing table holds it, so that later requests
/// for a structurally identical node no longer return \p N. This is used
/// right before a node is deleted or repurposed.
///
/// \returns true if an entry was actually removed.
bool SelectionDAG::RemoveNodeFromCSEMaps(SDNode *N) {
  bool Erased = false;

  switch (N->getOpcode()) {
  case ISD::HANDLENODE:
    return false; // noop.

  case ISD::CONDCODE: {
    auto &Slot = CondCodeNodes[cast<CondCodeSDNode>(N)->get()];
    assert(Slot && "Cond code doesn't exist!");
    Erased = Slot != nullptr;
    Slot = nullptr;
    break;
  }

  case ISD::ExternalSymbol:
    Erased = ExternalSymbols.erase(cast<ExternalSymbolSDNode>(N)->getSymbol());
    break;

  case ISD::TargetExternalSymbol: {
    // Target external symbols are keyed on (symbol name, target flags).
    ExternalSymbolSDNode *Sym = cast<ExternalSymbolSDNode>(N);
    std::pair<std::string,unsigned char> Key(Sym->getSymbol(),
                                             Sym->getTargetFlags());
    Erased = TargetExternalSymbols.erase(Key);
    break;
  }

  case ISD::MCSymbol: {
    auto *Sym = cast<MCSymbolSDNode>(N);
    Erased = MCSymbols.erase(Sym->getMCSymbol());
    break;
  }

  case ISD::VALUETYPE: {
    EVT VT = cast<VTSDNode>(N)->getVT();
    if (VT.isExtended()) {
      Erased = ExtendedValueTypeNodes.erase(VT);
    } else {
      auto &Slot = ValueTypeNodes[VT.getSimpleVT().SimpleTy];
      Erased = Slot != nullptr;
      Slot = nullptr;
    }
    break;
  }

  default:
    // Everything else lives in the general CSE map.
    assert(N->getOpcode() != ISD::DELETED_NODE && "DELETED_NODE in CSEMap!");
    assert(N->getOpcode() != ISD::EntryToken && "EntryToken in CSEMap!");
    Erased = CSEMap.RemoveNode(N);
    break;
  }

#ifndef NDEBUG
  // A node that was not found must be one that is never CSE'd: its last
  // result is glue, it is a machine opcode, or doNotCSE() says so. Anything
  // else indicates an inconsistency.
  if (!Erased && N->getValueType(N->getNumValues()-1) != MVT::Glue &&
      !N->isMachineOpcode() && !doNotCSE(N)) {
    N->dump(this);
    dbgs() << "\n";
    llvm_unreachable("Node is not in map!");
  }
#endif

  return Erased;
}
/// Re-register a node that was removed from the CSE maps and then modified in
/// place. If an identical node already exists, all users of \p N are moved
/// over to it and \p N is deleted; this can trigger further recursive
/// merging. Otherwise \p N is (re)inserted and listeners are told it changed.
void
SelectionDAG::AddModifiedNodeToCSEMaps(SDNode *N) {
  // Node kinds that are never CSE'd behave as if no identical node exists.
  SDNode *Twin = doNotCSE(N) ? N : CSEMap.GetOrInsertNode(N);

  if (Twin == N) {
    // No pre-existing equivalent: the node was simply updated.
    for (DAGUpdateListener *Listener = UpdateListeners; Listener;
         Listener = Listener->Next)
      Listener->NodeUpdated(N);
    return;
  }

  // An equivalent node already exists. Redirect every use of N to it; this
  // can cause recursive merging of other unrelated nodes down the line.
  ReplaceAllUsesWith(N, Twin);

  // N is now dead. Inform the listeners and delete it.
  for (DAGUpdateListener *Listener = UpdateListeners; Listener;
       Listener = Listener->Next)
    Listener->NodeDeleted(N, Twin);
  DeleteNodeNotInCSEMaps(N);
}
/// Look up the node that \p N would become if its single operand were
/// replaced by \p Op.
///
/// Returns null without touching \p InsertPos if \p N is never CSE'd.
/// Otherwise returns the existing equivalent node, or null if none exists.
/// When a node is found, its flags are intersected with those of \p N.
SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, SDValue Op,
                                           void *&InsertPos) {
  if (doNotCSE(N))
    return nullptr;

  // Profile N as it would look with the replacement operand.
  SDValue Ops[] = { Op };
  FoldingSetNodeID ID;
  AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops);
  AddNodeIDCustom(ID, N);

  if (SDNode *Found = FindNodeOrInsertPos(ID, SDLoc(N), InsertPos)) {
    Found->intersectFlagsWith(N->getFlags());
    return Found;
  }
  return nullptr;
}
/// Look up the node that \p N would become if its two operands were replaced
/// by \p Op1 and \p Op2.
///
/// Returns null without touching \p InsertPos if \p N is never CSE'd.
/// Otherwise returns the existing equivalent node, or null if none exists.
/// When a node is found, its flags are intersected with those of \p N.
SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N,
                                           SDValue Op1, SDValue Op2,
                                           void *&InsertPos) {
  if (doNotCSE(N))
    return nullptr;

  // Profile N as it would look with the replacement operands.
  SDValue Ops[] = { Op1, Op2 };
  FoldingSetNodeID ID;
  AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops);
  AddNodeIDCustom(ID, N);

  if (SDNode *Found = FindNodeOrInsertPos(ID, SDLoc(N), InsertPos)) {
    Found->intersectFlagsWith(N->getFlags());
    return Found;
  }
  return nullptr;
}
/// Look up the node that \p N would become if its operand list were replaced
/// by \p Ops.
///
/// Returns null without touching \p InsertPos if \p N is never CSE'd.
/// Otherwise returns the existing equivalent node, or null if none exists.
/// When a node is found, its flags are intersected with those of \p N.
SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, ArrayRef<SDValue> Ops,
                                           void *&InsertPos) {
  if (doNotCSE(N))
    return nullptr;

  // Profile N as it would look with the replacement operands.
  FoldingSetNodeID ID;
  AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops);
  AddNodeIDCustom(ID, N);

  if (SDNode *Found = FindNodeOrInsertPos(ID, SDLoc(N), InsertPos)) {
    Found->intersectFlagsWith(N->getFlags());
    return Found;
  }
  return nullptr;
}
/// Return the ABI alignment, according to the data layout, of the IR type
/// corresponding to \p VT. MVT::iPTR is mapped to an i8 pointer in address
/// space 0.
unsigned SelectionDAG::getEVTAlignment(EVT VT) const {
  Type *Ty;
  if (VT == MVT::iPTR)
    Ty = PointerType::get(Type::getInt8Ty(*getContext()), 0);
  else
    Ty = VT.getTypeForEVT(*getContext());

  return getDataLayout().getABITypeAlignment(Ty);
}
/// Construct an empty DAG for the given target machine and optimization
/// level. The entry token node is created with order 0 and an empty debug
/// location, becomes the initial root, and is inserted as the first node.
// EntryNode could meaningfully have debug info if we can find it...
SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL)
    : TM(tm), OptLevel(OL),
      EntryNode(ISD::EntryToken, 0, DebugLoc(), getVTList(MVT::Other)),
      Root(getEntryNode()) {
  InsertNode(&EntryNode);
  // Owned by this object; released in the destructor.
  DbgInfo = new SDDbgInfo();
}
/// Bind the DAG to the machine function it is about to build and cache the
/// per-function helpers derived from it.
void SelectionDAG::init(MachineFunction &NewMF,
                        OptimizationRemarkEmitter &NewORE,
                        Pass *PassPtr, const TargetLibraryInfo *LibraryInfo,
                        LegacyDivergenceAnalysis * Divergence) {
  // The machine function has to be recorded first: the subtarget lookups
  // below are made after it is set.
  MF = &NewMF;
  SDAGISelPass = PassPtr;
  ORE = &NewORE;

  // Cached objects obtained from the subtarget.
  TLI = getSubtarget().getTargetLowering();
  TSI = getSubtarget().getSelectionDAGInfo();

  LibInfo = LibraryInfo;
  Context = &MF->getFunction().getContext();
  DA = Divergence;
}
/// Tear down the DAG: every listener must already be unregistered; all nodes
/// are released, the operand recycler is drained and the debug info table is
/// destroyed.
SelectionDAG::~SelectionDAG() {
  assert(!UpdateListeners && "Dangling registered DAGUpdateListeners");

  allnodes_clear();
  OperandRecycler.clear(OperandAllocator);

  delete DbgInfo;
}
/// Remove every node from the DAG. The entry node (expected to be first in
/// AllNodes) is only unlinked; all other nodes are deallocated.
void SelectionDAG::allnodes_clear() {
  assert(&*AllNodes.begin() == &EntryNode);

  // The entry node is a member of the DAG object, so just unlink it.
  AllNodes.remove(AllNodes.begin());

  // Everything that is left gets released one by one.
  for (; !AllNodes.empty();)
    DeallocateNode(&AllNodes.front());

#ifndef NDEBUG
  NextPersistentId = 0;
#endif
}
/// Look \p ID up in the CSE map. Returns the matching node, or null with
/// \p InsertPos set by the map lookup. This overload must not be used for
/// Constant / ConstantFP nodes, which require a debug location.
SDNode *SelectionDAG::FindNodeOrInsertPos(const FoldingSetNodeID &ID,
                                          void *&InsertPos) {
  SDNode *Found = CSEMap.FindNodeOrInsertPos(ID, InsertPos);
  if (!Found)
    return nullptr;

  const unsigned Opc = Found->getOpcode();
  if (Opc == ISD::Constant || Opc == ISD::ConstantFP)
    llvm_unreachable("Querying for Constant and ConstantFP nodes requires "
                     "debug location. Use another overload.");

  return Found;
}
/// Look \p ID up in the CSE map and, when a node is found, reconcile its
/// debug location with the location \p DL of the new use. Returns the
/// matching node, or null with \p InsertPos set by the map lookup.
SDNode *SelectionDAG::FindNodeOrInsertPos(const FoldingSetNodeID &ID,
                                          const SDLoc &DL, void *&InsertPos) {
  SDNode *Found = CSEMap.FindNodeOrInsertPos(ID, InsertPos);
  if (!Found)
    return nullptr;

  const unsigned Opc = Found->getOpcode();
  if (Opc == ISD::Constant || Opc == ISD::ConstantFP) {
    // A constant shared between places with different locations loses its
    // debug location altogether. Propagating one location to all uses would
    // make single stepping in a debugger worse.
    if (Found->getDebugLoc() != DL.getDebugLoc())
      Found->setDebugLoc(DebugLoc());
  } else if (DL.getIROrder() && DL.getIROrder() < Found->getIROrder()) {
    // The new point of use is earlier in the instruction sequence than the
    // recorded one: move the node's debug info to the earlier location.
    Found->setDebugLoc(DL.getDebugLoc());
  }

  return Found;
}
/// Reset the DAG to its freshly constructed state: release all nodes and
/// operand storage, empty every uniquing table, re-insert the entry node as
/// the only node and root, and clear the debug info.
void SelectionDAG::clear() {
// Release every node and recycle/reset the operand storage.
allnodes_clear();
OperandRecycler.clear(OperandAllocator);
OperandAllocator.Reset();
// Empty all uniquing tables.
CSEMap.clear();
ExtendedValueTypeNodes.clear();
ExternalSymbols.clear();
TargetExternalSymbols.clear();
MCSymbols.clear();
+ SDCallSiteDbgInfo.clear();
// The condition-code and simple-value-type tables keep their size; only their
// slots are nulled out.
std::fill(CondCodeNodes.begin(), CondCodeNodes.end(),
static_cast<CondCodeSDNode*>(nullptr));
std::fill(ValueTypeNodes.begin(), ValueTypeNodes.end(),
static_cast<SDNode*>(nullptr));
// The entry node is reused: drop its stale use list before re-inserting it.
EntryNode.UseList = nullptr;
InsertNode(&EntryNode);
Root = getEntryNode();
DbgInfo->clear();
}
/// Convert floating-point value \p Op to \p VT: FP_EXTEND if \p VT is wider
/// than the operand type, otherwise FP_ROUND (with a zero pointer-sized
/// constant as second operand).
SDValue SelectionDAG::getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT) {
  if (VT.bitsGT(Op.getValueType()))
    return getNode(ISD::FP_EXTEND, DL, VT, Op);

  return getNode(ISD::FP_ROUND, DL, VT, Op, getIntPtrConstant(0, DL));
}
/// Convert \p Op to \p VT: ANY_EXTEND if \p VT is wider than the operand
/// type, TRUNCATE otherwise.
SDValue SelectionDAG::getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) {
  if (VT.bitsGT(Op.getValueType()))
    return getNode(ISD::ANY_EXTEND, DL, VT, Op);

  return getNode(ISD::TRUNCATE, DL, VT, Op);
}
/// Convert \p Op to \p VT: SIGN_EXTEND if \p VT is wider than the operand
/// type, TRUNCATE otherwise.
SDValue SelectionDAG::getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) {
  if (VT.bitsGT(Op.getValueType()))
    return getNode(ISD::SIGN_EXTEND, DL, VT, Op);

  return getNode(ISD::TRUNCATE, DL, VT, Op);
}
/// Convert \p Op to \p VT: ZERO_EXTEND if \p VT is wider than the operand
/// type, TRUNCATE otherwise.
SDValue SelectionDAG::getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) {
  if (VT.bitsGT(Op.getValueType()))
    return getNode(ISD::ZERO_EXTEND, DL, VT, Op);

  return getNode(ISD::TRUNCATE, DL, VT, Op);
}
/// Convert boolean value \p Op to \p VT. If \p VT is not wider than the
/// operand type a TRUNCATE is emitted; otherwise the extension kind is the
/// one the target lowering associates with the boolean contents of \p OpVT.
SDValue SelectionDAG::getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT,
                                        EVT OpVT) {
  if (VT.bitsLE(Op.getValueType()))
    return getNode(ISD::TRUNCATE, SL, VT, Op);

  const TargetLowering::BooleanContent Contents =
      TLI->getBooleanContents(OpVT);
  return getNode(TLI->getExtendForContent(Contents), SL, VT, Op);
}
/// Return \p Op with every bit above the width of \p VT cleared, implemented
/// as an AND with a low-bits mask in the operand's own type. \p VT must be a
/// scalar type; if it already equals the operand's scalar type, \p Op is
/// returned unchanged.
SDValue SelectionDAG::getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT) {
  assert(!VT.isVector() &&
         "getZeroExtendInReg should use the vector element type instead of "
         "the vector type!");

  EVT OpVT = Op.getValueType();
  if (OpVT.getScalarType() == VT)
    return Op;

  // Mask with the low VT.getSizeInBits() bits set, as wide as one element of
  // the operand.
  APInt Mask = APInt::getLowBitsSet(Op.getScalarValueSizeInBits(),
                                    VT.getSizeInBits());
  return getNode(ISD::AND, DL, OpVT, Op, getConstant(Mask, DL, OpVT));
}
/// Convert pointer value \p Op to \p VT. Currently this is always a zero
/// extension or truncation.
SDValue SelectionDAG::getPtrExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) {
  // Pointers are treated as unsigned for now; a future version might ask TLI
  // about pointer signedness instead.
  return getZExtOrTrunc(Op, DL, VT);
}
/// Extend pointer value \p Op in-register from width \p VT. Currently this is
/// always a zero extension.
SDValue SelectionDAG::getPtrExtendInReg(SDValue Op, const SDLoc &DL, EVT VT) {
  // Pointers are treated as unsigned for now; a future version might ask TLI
  // about pointer signedness instead.
  return getZeroExtendInReg(Op, DL, VT);
}
/// Create a bitwise NOT of \p Val, expressed as (XOR Val, all-ones) in type
/// \p VT.
SDValue SelectionDAG::getNOT(const SDLoc &DL, SDValue Val, EVT VT) {
  // The all-ones constant is as wide as one scalar element of VT.
  const unsigned EltBits = VT.getScalarType().getSizeInBits();
  SDValue AllOnes = getConstant(APInt::getAllOnesValue(EltBits), DL, VT);
  return getNode(ISD::XOR, DL, VT, Val, AllOnes);
}
/// Create a logical NOT of \p Val, expressed as an XOR with the boolean
/// "true" constant for \p VT.
SDValue SelectionDAG::getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT) {
  SDValue True = getBoolConstant(true, DL, VT, VT);
  return getNode(ISD::XOR, DL, VT, Val, True);
}
/// Create a constant of type \p VT representing boolean \p V. False is always
/// 0; the encoding of true (1 or all-ones) follows the target's boolean
/// contents for \p OpVT.
SDValue SelectionDAG::getBoolConstant(bool V, const SDLoc &DL, EVT VT,
                                      EVT OpVT) {
  if (!V)
    return getConstant(0, DL, VT);

  switch (TLI->getBooleanContents(OpVT)) {
  case TargetLowering::ZeroOrNegativeOneBooleanContent:
    return getAllOnesConstant(DL, VT);
  case TargetLowering::ZeroOrOneBooleanContent:
  case TargetLowering::UndefinedBooleanContent:
    return getConstant(1, DL, VT);
  }
  llvm_unreachable("Unexpected boolean content enum!");
}
/// Create an integer constant (or splat, for vector \p VT) from a 64-bit
/// value. For element types narrower than 64 bits the assertion requires the
/// bits shifted out to be all zeros or all ones.
SDValue SelectionDAG::getConstant(uint64_t Val, const SDLoc &DL, EVT VT,
                                  bool isT, bool isO) {
  const unsigned EltBits = VT.getScalarType().getSizeInBits();
  assert((EltBits >= 64 ||
          (uint64_t)((int64_t)Val >> EltBits) + 1 < 2) &&
         "getConstant with a uint64_t value that doesn't fit in the type!");
  return getConstant(APInt(EltBits, Val), DL, VT, isT, isO);
}
/// Create an integer constant from an APInt by first obtaining the
/// ConstantInt for it in this DAG's context.
SDValue SelectionDAG::getConstant(const APInt &Val, const SDLoc &DL, EVT VT,
                                  bool isT, bool isO) {
  const ConstantInt *CI = ConstantInt::get(*Context, Val);
  return getConstant(*CI, DL, VT, isT, isO);
}
/// Create (or find) the constant node for \p Val. For a vector \p VT the
/// result is a splat of the scalar constant.
///
/// \param isT create a TargetConstant instead of a Constant.
/// \param isO mark the constant as opaque.
SDValue SelectionDAG::getConstant(const ConstantInt &Val, const SDLoc &DL,
                                  EVT VT, bool isT, bool isO) {
  assert(VT.isInteger() && "Cannot create FP integer constant!");

  EVT EltVT = VT.getScalarType();
  const ConstantInt *Elt = &Val;

  if (VT.isVector() && TLI->getTypeAction(*getContext(), EltVT) ==
                           TargetLowering::TypePromoteInteger) {
    // The vector type is legal but its element type must be promoted (for
    // example v8i8 on ARM). Promote the inserted value; it does not need to
    // match the vector element type, and the extra bits are truncated away.
    EltVT = TLI->getTypeToTransformTo(*getContext(), EltVT);
    APInt Promoted = Elt->getValue().zextOrTrunc(EltVT.getSizeInBits());
    Elt = ConstantInt::get(*getContext(), Promoted);
  } else if (NewNodesMustHaveLegalTypes && VT.isVector() &&
             TLI->getTypeAction(*getContext(), EltVT) ==
                 TargetLowering::TypeExpandInteger) {
    // The element type must be expanded (for example v2i64 on MIPS32). Find
    // the nearest legal type, split the value into n parts, build a vector
    // with n times the elements and bitcast it to the requested type.
    // Legalizing constants too early makes the DAGCombiner's job harder, so
    // this is only done when the DAG says legal types are mandatory.
    const APInt &Wide = Elt->getValue();
    EVT ViaEltVT = TLI->getTypeToTransformTo(*getContext(), EltVT);
    unsigned ViaEltSizeInBits = ViaEltVT.getSizeInBits();
    unsigned ViaVecNumElts = VT.getSizeInBits() / ViaEltSizeInBits;
    EVT ViaVecVT = EVT::getVectorVT(*getContext(), ViaEltVT, ViaVecNumElts);

    // Check the temporary vector is the correct size. If this fails then
    // getTypeToTransformTo() probably returned a type whose size (in bits)
    // isn't a power-of-2 factor of the requested type size.
    assert(ViaVecVT.getSizeInBits() == VT.getSizeInBits());

    // Slice the wide value into legal-sized pieces, lowest bits first.
    const unsigned PartsPerElt = ViaVecNumElts / VT.getVectorNumElements();
    SmallVector<SDValue, 2> EltParts;
    for (unsigned Part = 0; Part < PartsPerElt; ++Part) {
      APInt Piece =
          Wide.lshr(Part * ViaEltSizeInBits).zextOrTrunc(ViaEltSizeInBits);
      EltParts.push_back(getConstant(Piece, DL, ViaEltVT, isT, isO));
    }

    // EltParts is currently in little endian order. If we actually want
    // big-endian order then reverse it now.
    if (getDataLayout().isBigEndian())
      std::reverse(EltParts.begin(), EltParts.end());

    // The elements must be reversed when the element order is different
    // to the endianness of the elements (because the BITCAST is itself a
    // vector shuffle in this situation). However, we do not need any code to
    // perform this reversal because getConstant() is producing a vector
    // splat.
    // This situation occurs in MIPS MSA.

    // Repeat the pieces once per element of the requested vector type.
    SmallVector<SDValue, 8> Ops;
    for (unsigned Elem = 0, NumElems = VT.getVectorNumElements();
         Elem != NumElems; ++Elem)
      Ops.append(EltParts.begin(), EltParts.end());

    return getNode(ISD::BITCAST, DL, VT, getBuildVector(ViaVecVT, DL, Ops));
  }

  assert(Elt->getBitWidth() == EltVT.getSizeInBits() &&
         "APInt size does not match type size!");

  // Profile the scalar constant and look for an existing node.
  unsigned Opc = isT ? ISD::TargetConstant : ISD::Constant;
  FoldingSetNodeID ID;
  AddNodeIDNode(ID, Opc, getVTList(EltVT), None);
  ID.AddPointer(Elt);
  ID.AddBoolean(isO);

  void *IP = nullptr;
  SDNode *N = FindNodeOrInsertPos(ID, DL, IP);
  if (N && !VT.isVector())
    return SDValue(N, 0);

  if (!N) {
    N = newSDNode<ConstantSDNode>(isT, isO, Elt, EltVT);
    CSEMap.InsertNode(N, IP);
    InsertNode(N);
    NewSDValueDbgMsg(SDValue(N, 0), "Creating constant: ", this);
  }

  // A vector request is satisfied by splatting the scalar constant.
  SDValue Result(N, 0);
  if (VT.isVector())
    Result = getSplatBuildVector(VT, DL, Result);
  return Result;
}
/// Create a constant of the target's pointer type (as given by the target
/// lowering for the current data layout).
SDValue SelectionDAG::getIntPtrConstant(uint64_t Val, const SDLoc &DL,
                                        bool isTarget) {
  EVT PtrVT = TLI->getPointerTy(getDataLayout());
  return getConstant(Val, DL, PtrVT, isTarget);
}
/// Create a constant whose type is the shift-amount type the target lowering
/// reports for values of type \p VT.
SDValue SelectionDAG::getShiftAmountConstant(uint64_t Val, EVT VT,
                                             const SDLoc &DL, bool LegalTypes) {
  return getConstant(Val, DL,
                     TLI->getShiftAmountTy(VT, getDataLayout(), LegalTypes));
}
/// Create a floating-point constant from an APFloat by first obtaining the
/// ConstantFP for it in this DAG's context.
SDValue SelectionDAG::getConstantFP(const APFloat &V, const SDLoc &DL, EVT VT,
                                    bool isTarget) {
  const ConstantFP *CFP = ConstantFP::get(*getContext(), V);
  return getConstantFP(*CFP, DL, VT, isTarget);
}
/// Create (or find) the floating-point constant node for \p V. For a vector
/// \p VT the result is a splat of the scalar constant.
SDValue SelectionDAG::getConstantFP(const ConstantFP &V, const SDLoc &DL,
                                    EVT VT, bool isTarget) {
  assert(VT.isFloatingPoint() && "Cannot create integer FP constant!");

  EVT EltVT = VT.getScalarType();

  // The lookup is keyed on the ConstantFP object itself, i.e. on the actual
  // bit pattern, so that 0.0 and -0.0 stay distinct and SNaNs cause no
  // trouble.
  unsigned Opc = isTarget ? ISD::TargetConstantFP : ISD::ConstantFP;
  FoldingSetNodeID ID;
  AddNodeIDNode(ID, Opc, getVTList(EltVT), None);
  ID.AddPointer(&V);

  void *IP = nullptr;
  SDNode *N = FindNodeOrInsertPos(ID, DL, IP);
  if (N && !VT.isVector())
    return SDValue(N, 0);

  if (!N) {
    N = newSDNode<ConstantFPSDNode>(isTarget, &V, EltVT);
    CSEMap.InsertNode(N, IP);
    InsertNode(N);
  }

  // A vector request is satisfied by splatting the scalar constant.
  SDValue Result(N, 0);
  if (VT.isVector())
    Result = getSplatBuildVector(VT, DL, Result);
  NewSDValueDbgMsg(Result, "Creating fp constant: ", this);
  return Result;
}
/// Create a floating-point constant from a host double. f32 and f64 are built
/// directly; f16, f80, f128 and ppcf128 are reached by converting the double
/// with round-to-nearest-even. Any other element type is unsupported.
SDValue SelectionDAG::getConstantFP(double Val, const SDLoc &DL, EVT VT,
                                    bool isTarget) {
  EVT EltVT = VT.getScalarType();

  if (EltVT == MVT::f32)
    return getConstantFP(APFloat((float)Val), DL, VT, isTarget);

  if (EltVT == MVT::f64)
    return getConstantFP(APFloat(Val), DL, VT, isTarget);

  if (EltVT == MVT::f80 || EltVT == MVT::f128 || EltVT == MVT::ppcf128 ||
      EltVT == MVT::f16) {
    // Start from the double and convert to the semantics of the element
    // type; whether the conversion lost information is ignored.
    bool Ignored;
    APFloat APF = APFloat(Val);
    APF.convert(EVTToAPFloatSemantics(EltVT), APFloat::rmNearestTiesToEven,
                &Ignored);
    return getConstantFP(APF, DL, VT, isTarget);
  }

  llvm_unreachable("Unsupported type in getConstantFP");
}
/// Create (or reuse) a global-address node for \p GV plus \p Offset. The
/// opcode is the TLS flavour when \p GV is thread-local and the Target flavour
/// when \p isTargetGA is set. \p TargetFlags must be zero unless
/// \p isTargetGA is set.
SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, const SDLoc &DL,
                                       EVT VT, int64_t Offset, bool isTargetGA,
                                       unsigned char TargetFlags) {
  assert((TargetFlags == 0 || isTargetGA) &&
         "Cannot set target flags on target-independent globals");

  // Truncate (with sign-extension) the offset value to the pointer size.
  unsigned PtrBits = getDataLayout().getPointerTypeSizeInBits(GV->getType());
  if (PtrBits < 64)
    Offset = SignExtend64(Offset, PtrBits);

  bool IsTLS = GV->isThreadLocal();
  unsigned Opc;
  if (isTargetGA)
    Opc = IsTLS ? ISD::TargetGlobalTLSAddress : ISD::TargetGlobalAddress;
  else
    Opc = IsTLS ? ISD::GlobalTLSAddress : ISD::GlobalAddress;

  FoldingSetNodeID ID;
  AddNodeIDNode(ID, Opc, getVTList(VT), None);
  ID.AddPointer(GV);
  ID.AddInteger(Offset);
  ID.AddInteger(TargetFlags);

  void *InsertPos = nullptr;
  SDNode *Existing = FindNodeOrInsertPos(ID, DL, InsertPos);
  if (Existing)
    return SDValue(Existing, 0);

  auto *Node = newSDNode<GlobalAddressSDNode>(
      Opc, DL.getIROrder(), DL.getDebugLoc(), GV, VT, Offset, TargetFlags);
  CSEMap.InsertNode(Node, InsertPos);
  InsertNode(Node);
  return SDValue(Node, 0);
}
/// Create (or reuse) a FrameIndex / TargetFrameIndex node for frame index
/// \p FI with value type \p VT.
SDValue SelectionDAG::getFrameIndex(int FI, EVT VT, bool isTarget) {
  unsigned Opc = isTarget ? ISD::TargetFrameIndex : ISD::FrameIndex;

  FoldingSetNodeID ID;
  AddNodeIDNode(ID, Opc, getVTList(VT), None);
  ID.AddInteger(FI);

  void *InsertPos = nullptr;
  SDNode *Node = FindNodeOrInsertPos(ID, InsertPos);
  if (!Node) {
    auto *Created = newSDNode<FrameIndexSDNode>(FI, VT, isTarget);
    CSEMap.InsertNode(Created, InsertPos);
    InsertNode(Created);
    Node = Created;
  }
  return SDValue(Node, 0);
}
/// Create (or reuse) a JumpTable / TargetJumpTable node for jump table index
/// \p JTI. \p TargetFlags must be zero unless \p isTarget is set.
SDValue SelectionDAG::getJumpTable(int JTI, EVT VT, bool isTarget,
                                   unsigned char TargetFlags) {
  assert((TargetFlags == 0 || isTarget) &&
         "Cannot set target flags on target-independent jump tables");

  unsigned Opc = isTarget ? ISD::TargetJumpTable : ISD::JumpTable;

  FoldingSetNodeID ID;
  AddNodeIDNode(ID, Opc, getVTList(VT), None);
  ID.AddInteger(JTI);
  ID.AddInteger(TargetFlags);

  void *InsertPos = nullptr;
  SDNode *Node = FindNodeOrInsertPos(ID, InsertPos);
  if (!Node) {
    auto *Created = newSDNode<JumpTableSDNode>(JTI, VT, isTarget, TargetFlags);
    CSEMap.InsertNode(Created, InsertPos);
    InsertNode(Created);
    Node = Created;
  }
  return SDValue(Node, 0);
}
/// Create (or reuse) a ConstantPool / TargetConstantPool node for the IR
/// constant \p C. An \p Alignment of 0 is replaced by the ABI alignment of the
/// constant's type when the function has opt-size, otherwise by its preferred
/// alignment. \p TargetFlags must be zero unless \p isTarget is set.
SDValue SelectionDAG::getConstantPool(const Constant *C, EVT VT,
                                      unsigned Alignment, int Offset,
                                      bool isTarget,
                                      unsigned char TargetFlags) {
  assert((TargetFlags == 0 || isTarget) &&
         "Cannot set target flags on target-independent globals");

  if (Alignment == 0) {
    if (MF->getFunction().hasOptSize())
      Alignment = getDataLayout().getABITypeAlignment(C->getType());
    else
      Alignment = getDataLayout().getPrefTypeAlignment(C->getType());
  }

  unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;

  FoldingSetNodeID ID;
  AddNodeIDNode(ID, Opc, getVTList(VT), None);
  ID.AddInteger(Alignment);
  ID.AddInteger(Offset);
  ID.AddPointer(C);
  ID.AddInteger(TargetFlags);

  void *InsertPos = nullptr;
  SDNode *Existing = FindNodeOrInsertPos(ID, InsertPos);
  if (Existing)
    return SDValue(Existing, 0);

  auto *Node = newSDNode<ConstantPoolSDNode>(isTarget, C, VT, Offset,
                                             Alignment, TargetFlags);
  CSEMap.InsertNode(Node, InsertPos);
  InsertNode(Node);
  return SDValue(Node, 0);
}
/// Create (or reuse) a ConstantPool / TargetConstantPool node for the
/// machine-specific constant pool value \p C. An \p Alignment of 0 is replaced
/// by the preferred alignment of the value's type. \p TargetFlags must be
/// zero unless \p isTarget is set.
SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT,
                                      unsigned Alignment, int Offset,
                                      bool isTarget,
                                      unsigned char TargetFlags) {
  assert((TargetFlags == 0 || isTarget) &&
         "Cannot set target flags on target-independent globals");

  if (Alignment == 0)
    Alignment = getDataLayout().getPrefTypeAlignment(C->getType());

  unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;

  FoldingSetNodeID ID;
  AddNodeIDNode(ID, Opc, getVTList(VT), None);
  ID.AddInteger(Alignment);
  ID.AddInteger(Offset);
  // The pool value contributes its own identity to the CSE key.
  C->addSelectionDAGCSEId(ID);
  ID.AddInteger(TargetFlags);

  void *InsertPos = nullptr;
  SDNode *Existing = FindNodeOrInsertPos(ID, InsertPos);
  if (Existing)
    return SDValue(Existing, 0);

  auto *Node = newSDNode<ConstantPoolSDNode>(isTarget, C, VT, Offset,
                                             Alignment, TargetFlags);
  CSEMap.InsertNode(Node, InsertPos);
  InsertNode(Node);
  return SDValue(Node, 0);
}
/// Create (or reuse) a TargetIndex node identified by \p Index, \p Offset and
/// \p TargetFlags.
SDValue SelectionDAG::getTargetIndex(int Index, EVT VT, int64_t Offset,
                                     unsigned char TargetFlags) {
  FoldingSetNodeID ID;
  AddNodeIDNode(ID, ISD::TargetIndex, getVTList(VT), None);
  ID.AddInteger(Index);
  ID.AddInteger(Offset);
  ID.AddInteger(TargetFlags);

  void *InsertPos = nullptr;
  SDNode *Node = FindNodeOrInsertPos(ID, InsertPos);
  if (!Node) {
    auto *Created =
        newSDNode<TargetIndexSDNode>(Index, VT, Offset, TargetFlags);
    CSEMap.InsertNode(Created, InsertPos);
    InsertNode(Created);
    Node = Created;
  }
  return SDValue(Node, 0);
}
/// Create (or reuse) the BasicBlock node that refers to \p MBB.
SDValue SelectionDAG::getBasicBlock(MachineBasicBlock *MBB) {
  FoldingSetNodeID ID;
  AddNodeIDNode(ID, ISD::BasicBlock, getVTList(MVT::Other), None);
  ID.AddPointer(MBB);

  void *InsertPos = nullptr;
  SDNode *Node = FindNodeOrInsertPos(ID, InsertPos);
  if (!Node) {
    auto *Created = newSDNode<BasicBlockSDNode>(MBB);
    CSEMap.InsertNode(Created, InsertPos);
    InsertNode(Created);
    Node = Created;
  }
  return SDValue(Node, 0);
}
/// Return the VTSDNode for \p VT, creating it on first use. Simple types are
/// cached in the ValueTypeNodes vector (grown on demand, indexed by SimpleTy);
/// extended types are cached in the ExtendedValueTypeNodes map.
SDValue SelectionDAG::getValueType(EVT VT) {
  // Make sure the vector has a slot for this simple type.
  if (VT.isSimple()) {
    unsigned Idx = (unsigned)VT.getSimpleVT().SimpleTy;
    if (Idx >= ValueTypeNodes.size())
      ValueTypeNodes.resize(Idx + 1);
  }

  SDNode **Slot = nullptr;
  if (VT.isExtended())
    Slot = &ExtendedValueTypeNodes[VT];
  else
    Slot = &ValueTypeNodes[VT.getSimpleVT().SimpleTy];

  if (!*Slot) {
    *Slot = newSDNode<VTSDNode>(VT);
    InsertNode(*Slot);
  }
  return SDValue(*Slot, 0);
}
/// Return the ExternalSymbol node for \p Sym, creating and caching it in
/// ExternalSymbols on first use.
SDValue SelectionDAG::getExternalSymbol(const char *Sym, EVT VT) {
  SDNode *&Slot = ExternalSymbols[Sym];
  if (!Slot) {
    Slot = newSDNode<ExternalSymbolSDNode>(false, Sym, 0, VT);
    InsertNode(Slot);
  }
  return SDValue(Slot, 0);
}
/// Return the MCSymbol node for \p Sym, creating and caching it in MCSymbols
/// on first use.
SDValue SelectionDAG::getMCSymbol(MCSymbol *Sym, EVT VT) {
  SDNode *&Slot = MCSymbols[Sym];
  if (!Slot) {
    Slot = newSDNode<MCSymbolSDNode>(Sym, VT);
    InsertNode(Slot);
  }
  return SDValue(Slot, 0);
}
/// Return the target ExternalSymbol node for the (\p Sym, \p TargetFlags)
/// pair, creating and caching it in TargetExternalSymbols on first use.
SDValue SelectionDAG::getTargetExternalSymbol(const char *Sym, EVT VT,
                                              unsigned char TargetFlags) {
  std::pair<std::string, unsigned char> Key(Sym, TargetFlags);
  SDNode *&Slot = TargetExternalSymbols[Key];
  if (!Slot) {
    Slot = newSDNode<ExternalSymbolSDNode>(true, Sym, TargetFlags, VT);
    InsertNode(Slot);
  }
  return SDValue(Slot, 0);
}
/// Return the CondCode node for \p Cond, creating it on first use. Nodes are
/// cached in CondCodeNodes, which is grown on demand and indexed by \p Cond.
SDValue SelectionDAG::getCondCode(ISD::CondCode Cond) {
  if (static_cast<unsigned>(Cond) >= CondCodeNodes.size())
    CondCodeNodes.resize(Cond + 1);

  auto &Slot = CondCodeNodes[Cond];
  if (Slot)
    return SDValue(Slot, 0);

  auto *Created = newSDNode<CondCodeSDNode>(Cond);
  Slot = Created;
  InsertNode(Created);
  return SDValue(Created, 0);
}
/// Exchange the two shuffle operands \p N1 and \p N2 and commute the mask
/// \p M to match, so that indices that pointed at N1 point at N2 and vice
/// versa.
static void commuteShuffle(SDValue &N1, SDValue &N2, MutableArrayRef<int> M) {
  ShuffleVectorSDNode::commuteMask(M);
  std::swap(N1, N2);
}
/// Build a VECTOR_SHUFFLE of \p N1 and \p N2 with result type \p VT using
/// \p Mask, after canonicalizing the operands and a private copy of the mask.
///
/// \p Mask must have one entry per result element; each entry is either -1
/// or an index in [0, 2 * NElts). Both inputs must have type \p VT.
///
/// The function may return something other than a new shuffle node: UNDEF
/// when nothing defined is selected, \p N1 itself for an identity shuffle or
/// a shuffle of a fully-defined splat, a freshly built splat BUILD_VECTOR
/// (possibly bitcast to \p VT), or an existing CSE'd shuffle node.
SDValue SelectionDAG::getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1,
SDValue N2, ArrayRef<int> Mask) {
assert(VT.getVectorNumElements() == Mask.size() &&
"Must have the same number of vector elements as mask elements!");
assert(VT == N1.getValueType() && VT == N2.getValueType() &&
"Invalid VECTOR_SHUFFLE");
// Canonicalize shuffle undef, undef -> undef
if (N1.isUndef() && N2.isUndef())
return getUNDEF(VT);
// Validate that all indices in Mask are within the range of the elements
// input to the shuffle.
int NElts = Mask.size();
assert(llvm::all_of(Mask,
[&](int M) { return M < (NElts * 2) && M >= -1; }) &&
"Index out of range");
// Copy the mask so we can do any needed cleanup.
SmallVector<int, 8> MaskVec(Mask.begin(), Mask.end());
// Canonicalize shuffle v, v -> v, undef
if (N1 == N2) {
N2 = getUNDEF(VT);
// Both operands were the same value, so fold second-operand indices onto
// the first operand.
for (int i = 0; i != NElts; ++i)
if (MaskVec[i] >= NElts) MaskVec[i] -= NElts;
}
// Canonicalize shuffle undef, v -> v, undef. Commute the shuffle mask.
if (N1.isUndef())
commuteShuffle(N1, N2, MaskVec);
if (TLI->hasVectorBlend()) {
// If shuffling a splat, try to blend the splat instead. We do this here so
// that even when this arises during lowering we don't have to re-handle it.
// Offset is 0 for the first operand and NElts for the second, i.e. the
// start of that operand's index range within the mask.
auto BlendSplat = [&](BuildVectorSDNode *BV, int Offset) {
BitVector UndefElements;
SDValue Splat = BV->getSplatValue(&UndefElements);
if (!Splat)
return;
for (int i = 0; i < NElts; ++i) {
// Skip mask entries that do not select from this operand.
if (MaskVec[i] < Offset || MaskVec[i] >= (Offset + NElts))
continue;
// If this input comes from undef, mark it as such.
if (UndefElements[MaskVec[i] - Offset]) {
MaskVec[i] = -1;
continue;
}
// If we can blend a non-undef lane, use that instead.
if (!UndefElements[i])
MaskVec[i] = i + Offset;
}
};
if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
BlendSplat(N1BV, 0);
if (auto *N2BV = dyn_cast<BuildVectorSDNode>(N2))
BlendSplat(N2BV, NElts);
}
// Canonicalize all index into lhs, -> shuffle lhs, undef
// Canonicalize all index into rhs, -> shuffle rhs, undef
bool AllLHS = true, AllRHS = true;
bool N2Undef = N2.isUndef();
for (int i = 0; i != NElts; ++i) {
if (MaskVec[i] >= NElts) {
// An index into an undef second operand is itself undef.
if (N2Undef)
MaskVec[i] = -1;
else
AllLHS = false;
} else if (MaskVec[i] >= 0) {
AllRHS = false;
}
}
// Both flags still set means every mask entry is -1.
if (AllLHS && AllRHS)
return getUNDEF(VT);
if (AllLHS && !N2Undef)
N2 = getUNDEF(VT);
if (AllRHS) {
N1 = getUNDEF(VT);
commuteShuffle(N1, N2, MaskVec);
}
// Reset our undef status after accounting for the mask.
N2Undef = N2.isUndef();
// Re-check whether both sides ended up undef.
if (N1.isUndef() && N2Undef)
return getUNDEF(VT);
// If Identity shuffle return that node.
// AllSame records whether every mask entry equals the first one.
bool Identity = true, AllSame = true;
for (int i = 0; i != NElts; ++i) {
if (MaskVec[i] >= 0 && MaskVec[i] != i) Identity = false;
if (MaskVec[i] != MaskVec[0]) AllSame = false;
}
if (Identity && NElts)
return N1;
// Shuffling a constant splat doesn't change the result.
if (N2Undef) {
SDValue V = N1;
// Look through any bitcasts. We check that these don't change the number
// (and size) of elements and just changes their types.
while (V.getOpcode() == ISD::BITCAST)
V = V->getOperand(0);
// A splat should always show up as a build vector node.
if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
BitVector UndefElements;
SDValue Splat = BV->getSplatValue(&UndefElements);
// If this is a splat of an undef, shuffling it is also undef.
if (Splat && Splat.isUndef())
return getUNDEF(VT);
bool SameNumElts =
V.getValueType().getVectorNumElements() == VT.getVectorNumElements();
// We only have a splat which can skip shuffles if there is a splatted
// value and no undef lanes rearranged by the shuffle.
if (Splat && UndefElements.none()) {
// Splat of <x, x, ..., x>, return <x, x, ..., x>, provided that the
// number of elements match or the value splatted is a zero constant.
if (SameNumElts)
return N1;
if (auto *C = dyn_cast<ConstantSDNode>(Splat))
if (C->isNullValue())
return N1;
}
// If the shuffle itself creates a splat, build the vector directly.
if (AllSame && SameNumElts) {
EVT BuildVT = BV->getValueType(0);
const SDValue &Splatted = BV->getOperand(MaskVec[0]);
SDValue NewBV = getSplatBuildVector(BuildVT, dl, Splatted);
// We may have jumped through bitcasts, so the type of the
// BUILD_VECTOR may not match the type of the shuffle.
if (BuildVT != VT)
NewBV = getNode(ISD::BITCAST, dl, VT, NewBV);
return NewBV;
}
}
}
// No fold applied: look up or create the shuffle node. The CSE key is the
// opcode, type and operands followed by every (canonicalized) mask entry.
FoldingSetNodeID ID;
SDValue Ops[2] = { N1, N2 };
AddNodeIDNode(ID, ISD::VECTOR_SHUFFLE, getVTList(VT), Ops);
for (int i = 0; i != NElts; ++i)
ID.AddInteger(MaskVec[i]);
void* IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP))
return SDValue(E, 0);
// Allocate the mask array for the node out of the BumpPtrAllocator, since
// SDNode doesn't have access to it. This memory will be "leaked" when
// the node is deallocated, but recovered when the NodeAllocator is released.
int *MaskAlloc = OperandAllocator.Allocate<int>(NElts);
llvm::copy(MaskVec, MaskAlloc);
auto *N = newSDNode<ShuffleVectorSDNode>(VT, dl.getIROrder(),
dl.getDebugLoc(), MaskAlloc);
createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
InsertNode(N);
SDValue V = SDValue(N, 0);
NewSDValueDbgMsg(V, "Creating new node: ", this);
return V;
}
/// Build the shuffle equivalent to \p SV with its two operands swapped and
/// its mask commuted accordingly.
SDValue SelectionDAG::getCommutedVectorShuffle(const ShuffleVectorSDNode &SV) {
  EVT VT = SV.getValueType(0);

  ArrayRef<int> OrigMask = SV.getMask();
  SmallVector<int, 8> Commuted(OrigMask.begin(), OrigMask.end());
  ShuffleVectorSDNode::commuteMask(Commuted);

  SDValue First = SV.getOperand(0);
  SDValue Second = SV.getOperand(1);
  return getVectorShuffle(VT, SDLoc(&SV), Second, First, Commuted);
}
/// Create (or reuse) a Register node for \p RegNo with value type \p VT. A
/// newly created node has its IsDivergent bit set from
/// TLI->isSDNodeSourceOfDivergence().
SDValue SelectionDAG::getRegister(unsigned RegNo, EVT VT) {
  FoldingSetNodeID ID;
  AddNodeIDNode(ID, ISD::Register, getVTList(VT), None);
  ID.AddInteger(RegNo);

  void *InsertPos = nullptr;
  SDNode *Existing = FindNodeOrInsertPos(ID, InsertPos);
  if (Existing)
    return SDValue(Existing, 0);

  auto *Node = newSDNode<RegisterSDNode>(RegNo, VT);
  Node->SDNodeBits.IsDivergent =
      TLI->isSDNodeSourceOfDivergence(Node, FLI, DA);
  CSEMap.InsertNode(Node, InsertPos);
  InsertNode(Node);
  return SDValue(Node, 0);
}
/// Create (or reuse) a RegisterMask node wrapping the \p RegMask pointer. The
/// node is keyed on the pointer value itself.
SDValue SelectionDAG::getRegisterMask(const uint32_t *RegMask) {
  FoldingSetNodeID ID;
  AddNodeIDNode(ID, ISD::RegisterMask, getVTList(MVT::Untyped), None);
  ID.AddPointer(RegMask);

  void *InsertPos = nullptr;
  SDNode *Node = FindNodeOrInsertPos(ID, InsertPos);
  if (!Node) {
    auto *Created = newSDNode<RegisterMaskSDNode>(RegMask);
    CSEMap.InsertNode(Created, InsertPos);
    InsertNode(Created);
    Node = Created;
  }
  return SDValue(Node, 0);
}
/// Convenience wrapper: create an ISD::EH_LABEL label node chained on
/// \p Root.
SDValue SelectionDAG::getEHLabel(const SDLoc &dl, SDValue Root,
                                 MCSymbol *Label) {
  const unsigned Opcode = ISD::EH_LABEL;
  return getLabelNode(Opcode, dl, Root, Label);
}
/// Create (or reuse) a label node with the given \p Opcode whose single
/// operand is \p Root and which refers to \p Label.
SDValue SelectionDAG::getLabelNode(unsigned Opcode, const SDLoc &dl,
                                   SDValue Root, MCSymbol *Label) {
  SDValue Ops[] = { Root };

  FoldingSetNodeID ID;
  AddNodeIDNode(ID, Opcode, getVTList(MVT::Other), Ops);
  ID.AddPointer(Label);

  void *InsertPos = nullptr;
  SDNode *Existing = FindNodeOrInsertPos(ID, InsertPos);
  if (Existing)
    return SDValue(Existing, 0);

  auto *Node =
      newSDNode<LabelSDNode>(Opcode, dl.getIROrder(), dl.getDebugLoc(), Label);
  createOperands(Node, Ops);
  CSEMap.InsertNode(Node, InsertPos);
  InsertNode(Node);
  return SDValue(Node, 0);
}
/// Create (or reuse) a BlockAddress / TargetBlockAddress node for \p BA plus
/// \p Offset, tagged with \p TargetFlags.
SDValue SelectionDAG::getBlockAddress(const BlockAddress *BA, EVT VT,
                                      int64_t Offset,
                                      bool isTarget,
                                      unsigned char TargetFlags) {
  unsigned Opc = isTarget ? ISD::TargetBlockAddress : ISD::BlockAddress;

  FoldingSetNodeID ID;
  AddNodeIDNode(ID, Opc, getVTList(VT), None);
  ID.AddPointer(BA);
  ID.AddInteger(Offset);
  ID.AddInteger(TargetFlags);

  void *InsertPos = nullptr;
  SDNode *Node = FindNodeOrInsertPos(ID, InsertPos);
  if (!Node) {
    auto *Created =
        newSDNode<BlockAddressSDNode>(Opc, VT, BA, Offset, TargetFlags);
    CSEMap.InsertNode(Created, InsertPos);
    InsertNode(Created);
    Node = Created;
  }
  return SDValue(Node, 0);
}
/// Create (or reuse) a SRCVALUE node wrapping the IR value \p V, which must be
/// null or of pointer type.
SDValue SelectionDAG::getSrcValue(const Value *V) {
  assert((!V || V->getType()->isPointerTy()) &&
         "SrcValue is not a pointer?");

  FoldingSetNodeID ID;
  AddNodeIDNode(ID, ISD::SRCVALUE, getVTList(MVT::Other), None);
  ID.AddPointer(V);

  void *InsertPos = nullptr;
  SDNode *Node = FindNodeOrInsertPos(ID, InsertPos);
  if (!Node) {
    auto *Created = newSDNode<SrcValueSDNode>(V);
    CSEMap.InsertNode(Created, InsertPos);
    InsertNode(Created);
    Node = Created;
  }
  return SDValue(Node, 0);
}
/// Create (or reuse) an MDNODE_SDNODE node wrapping the metadata node \p MD.
SDValue SelectionDAG::getMDNode(const MDNode *MD) {
  FoldingSetNodeID ID;
  AddNodeIDNode(ID, ISD::MDNODE_SDNODE, getVTList(MVT::Other), None);
  ID.AddPointer(MD);

  void *InsertPos = nullptr;
  SDNode *Node = FindNodeOrInsertPos(ID, InsertPos);
  if (!Node) {
    auto *Created = newSDNode<MDNodeSDNode>(MD);
    CSEMap.InsertNode(Created, InsertPos);
    InsertNode(Created);
    Node = Created;
  }
  return SDValue(Node, 0);
}
/// Return \p V reinterpreted as type \p VT. No node is created when \p V
/// already has that type.
SDValue SelectionDAG::getBitcast(EVT VT, SDValue V) {
  const bool NeedsCast = !(VT == V.getValueType());
  if (NeedsCast)
    return getNode(ISD::BITCAST, SDLoc(V), VT, V);
  return V;
}
/// Create (or reuse) an ADDRSPACECAST node converting \p Ptr from address
/// space \p SrcAS to \p DestAS, with result type \p VT.
SDValue SelectionDAG::getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr,
                                       unsigned SrcAS, unsigned DestAS) {
  SDValue Ops[] = {Ptr};

  FoldingSetNodeID ID;
  AddNodeIDNode(ID, ISD::ADDRSPACECAST, getVTList(VT), Ops);
  ID.AddInteger(SrcAS);
  ID.AddInteger(DestAS);

  void *InsertPos = nullptr;
  SDNode *Existing = FindNodeOrInsertPos(ID, dl, InsertPos);
  if (Existing)
    return SDValue(Existing, 0);

  auto *Node = newSDNode<AddrSpaceCastSDNode>(dl.getIROrder(),
                                              dl.getDebugLoc(), VT, SrcAS,
                                              DestAS);
  createOperands(Node, Ops);
  CSEMap.InsertNode(Node, InsertPos);
  InsertNode(Node);
  return SDValue(Node, 0);
}
/// getShiftAmountOperand - Convert \p Op to the shift amount type the target
/// wants for shifting a value of type \p LHSTy. Operands that already have
/// that type, and all vector operands, are returned unchanged; everything
/// else is zero-extended or truncated.
SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) {
  EVT OpTy = Op.getValueType();
  EVT ShTy = TLI->getShiftAmountTy(LHSTy, getDataLayout());

  const bool NeedsResize = !(OpTy == ShTy) && !OpTy.isVector();
  if (NeedsResize)
    return getZExtOrTrunc(Op, SDLoc(Op), ShTy);
  return Op;
}
/// Expand a va_arg node into explicit DAG operations: load the current
/// va_list pointer, round it up when the requested alignment exceeds the
/// minimum stack argument alignment, store back the pointer advanced past the
/// argument, and finally load the argument itself.
///
/// Operands of \p Node as used here: 0 and 1 are passed as the chain and
/// pointer of the va_list load, 2 is the SrcValue for that memory, and 3 is
/// the constant alignment.
SDValue SelectionDAG::expandVAArg(SDNode *Node) {
  SDLoc dl(Node);
  const TargetLowering &TLI = getTargetLoweringInfo();
  const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
  EVT VT = Node->getValueType(0);
  SDValue InChain = Node->getOperand(0);
  SDValue VAListPtr = Node->getOperand(1);
  unsigned Align = Node->getConstantOperandVal(3);

  SDValue VAListLoad = getLoad(TLI.getPointerTy(getDataLayout()), dl, InChain,
                               VAListPtr, MachinePointerInfo(V));
  SDValue VAList = VAListLoad;

  if (Align > TLI.getMinStackArgumentAlignment()) {
    assert(((Align & (Align-1)) == 0) && "Expected Align to be a power of 2");

    // Round up: (VAList + Align - 1) & -Align.
    SDValue RoundBias = getConstant(Align - 1, dl, VAList.getValueType());
    VAList = getNode(ISD::ADD, dl, VAList.getValueType(), VAList, RoundBias);

    SDValue AlignMask =
        getConstant(-(int64_t)Align, dl, VAList.getValueType());
    VAList = getNode(ISD::AND, dl, VAList.getValueType(), VAList, AlignMask);
  }

  // Increment the pointer, VAList, to the next vaarg
  SDValue ArgSize = getConstant(
      getDataLayout().getTypeAllocSize(VT.getTypeForEVT(*getContext())), dl,
      VAList.getValueType());
  SDValue NextVAList =
      getNode(ISD::ADD, dl, VAList.getValueType(), VAList, ArgSize);

  // Store the incremented VAList to the legalized pointer
  SDValue StoreChain = getStore(VAListLoad.getValue(1), dl, NextVAList,
                                VAListPtr, MachinePointerInfo(V));

  // Load the actual argument out of the pointer VAList
  return getLoad(VT, dl, StoreChain, VAList, MachinePointerInfo());
}
/// Expand a va_copy node with the default strategy: load one pointer-sized
/// value through operand 2 and store it through operand 1. Returns the result
/// of the store.
SDValue SelectionDAG::expandVACopy(SDNode *Node) {
  SDLoc dl(Node);
  const TargetLowering &TLI = getTargetLoweringInfo();

  // Operands 3 and 4 carry the IR values describing the destination and
  // source memory, respectively.
  const Value *DestSV = cast<SrcValueSDNode>(Node->getOperand(3))->getValue();
  const Value *SrcSV = cast<SrcValueSDNode>(Node->getOperand(4))->getValue();

  SDValue Loaded =
      getLoad(TLI.getPointerTy(getDataLayout()), dl, Node->getOperand(0),
              Node->getOperand(2), MachinePointerInfo(SrcSV));

  // Chain the store on the load's output chain (result 1).
  return getStore(Loaded.getValue(1), dl, Loaded, Node->getOperand(1),
                  MachinePointerInfo(DestSV));
}
/// Create a stack object big enough to store a value of type \p VT and return
/// a frame index node for it. The object's alignment is the larger of the
/// type's preferred alignment and \p minAlign.
SDValue SelectionDAG::CreateStackTemporary(EVT VT, unsigned minAlign) {
  MachineFrameInfo &MFI = getMachineFunction().getFrameInfo();
  unsigned ByteSize = VT.getStoreSize();
  Type *Ty = VT.getTypeForEVT(*getContext());

  unsigned PrefAlign = (unsigned)getDataLayout().getPrefTypeAlignment(Ty);
  unsigned StackAlign = std::max(PrefAlign, minAlign);

  int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
  return getFrameIndex(FrameIdx, TLI->getFrameIndexTy(getDataLayout()));
}
/// Create a stack object suitable for holding a value of either \p VT1 or
/// \p VT2: its size is the larger of the two store sizes and its alignment
/// the larger of the two preferred alignments. Returns a frame index node.
SDValue SelectionDAG::CreateStackTemporary(EVT VT1, EVT VT2) {
  unsigned Bytes = std::max(VT1.getStoreSize(), VT2.getStoreSize());

  Type *Ty1 = VT1.getTypeForEVT(*getContext());
  Type *Ty2 = VT2.getTypeForEVT(*getContext());
  const DataLayout &DL = getDataLayout();
  unsigned Align =
      std::max(DL.getPrefTypeAlignment(Ty1), DL.getPrefTypeAlignment(Ty2));

  MachineFrameInfo &MFI = getMachineFunction().getFrameInfo();
  int FrameIdx = MFI.CreateStackObject(Bytes, Align, false);

  auto FrameIdxVT = TLI->getFrameIndexTy(getDataLayout());
  return getFrameIndex(FrameIdx, FrameIdxVT);
}
/// Try to fold a setcc of \p N1 and \p N2 under condition \p Cond into a
/// constant or UNDEF of type \p VT.
///
/// Returns the folded value, or an empty SDValue when no fold applies. When
/// only \p N1 is an FP constant the comparison is re-issued through getSetCC
/// with swapped operands and a swapped condition code, provided the target
/// reports the swapped condition as legal; otherwise an empty SDValue is
/// returned.
SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1, SDValue N2,
ISD::CondCode Cond, const SDLoc &dl) {
EVT OpVT = N1.getValueType();
// These setcc operations always fold.
switch (Cond) {
default: break;
case ISD::SETFALSE:
case ISD::SETFALSE2: return getBoolConstant(false, dl, VT, OpVT);
case ISD::SETTRUE:
case ISD::SETTRUE2: return getBoolConstant(true, dl, VT, OpVT);
// The ordered/unordered predicates below are only meaningful for FP
// operands; they are not folded here, merely sanity-checked.
case ISD::SETOEQ:
case ISD::SETOGT:
case ISD::SETOGE:
case ISD::SETOLT:
case ISD::SETOLE:
case ISD::SETONE:
case ISD::SETO:
case ISD::SETUO:
case ISD::SETUEQ:
case ISD::SETUNE:
assert(!OpVT.isInteger() && "Illegal setcc for integer!");
break;
}
if (OpVT.isInteger()) {
// For EQ and NE, we can always pick a value for the undef to make the
// predicate pass or fail, so we can return undef.
// Matches behavior in llvm::ConstantFoldCompareInstruction.
// icmp eq/ne X, undef -> undef.
if ((N1.isUndef() || N2.isUndef()) &&
(Cond == ISD::SETEQ || Cond == ISD::SETNE))
return getUNDEF(VT);
// If both operands are undef, we can return undef for int comparison.
// icmp undef, undef -> undef.
if (N1.isUndef() && N2.isUndef())
return getUNDEF(VT);
// icmp X, X -> true/false
// icmp X, undef -> true/false because undef could be X.
if (N1 == N2)
return getBoolConstant(ISD::isTrueWhenEqual(Cond), dl, VT, OpVT);
}
// Both operands are integer constants: evaluate the predicate directly.
if (ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2)) {
const APInt &C2 = N2C->getAPIntValue();
if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1)) {
const APInt &C1 = N1C->getAPIntValue();
switch (Cond) {
default: llvm_unreachable("Unknown integer setcc!");
case ISD::SETEQ: return getBoolConstant(C1 == C2, dl, VT, OpVT);
case ISD::SETNE: return getBoolConstant(C1 != C2, dl, VT, OpVT);
case ISD::SETULT: return getBoolConstant(C1.ult(C2), dl, VT, OpVT);
case ISD::SETUGT: return getBoolConstant(C1.ugt(C2), dl, VT, OpVT);
case ISD::SETULE: return getBoolConstant(C1.ule(C2), dl, VT, OpVT);
case ISD::SETUGE: return getBoolConstant(C1.uge(C2), dl, VT, OpVT);
case ISD::SETLT: return getBoolConstant(C1.slt(C2), dl, VT, OpVT);
case ISD::SETGT: return getBoolConstant(C1.sgt(C2), dl, VT, OpVT);
case ISD::SETLE: return getBoolConstant(C1.sle(C2), dl, VT, OpVT);
case ISD::SETGE: return getBoolConstant(C1.sge(C2), dl, VT, OpVT);
}
}
}
auto *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
auto *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
if (N1CFP && N2CFP) {
// Both operands are FP constants. The predicates without an explicit
// ordered/unordered flavor (SETEQ, SETNE, SETLT, ...) yield UNDEF on an
// unordered result and otherwise fall through to their ordered twin.
APFloat::cmpResult R = N1CFP->getValueAPF().compare(N2CFP->getValueAPF());
switch (Cond) {
default: break;
case ISD::SETEQ: if (R==APFloat::cmpUnordered)
return getUNDEF(VT);
LLVM_FALLTHROUGH;
case ISD::SETOEQ: return getBoolConstant(R==APFloat::cmpEqual, dl, VT,
OpVT);
case ISD::SETNE: if (R==APFloat::cmpUnordered)
return getUNDEF(VT);
LLVM_FALLTHROUGH;
case ISD::SETONE: return getBoolConstant(R==APFloat::cmpGreaterThan ||
R==APFloat::cmpLessThan, dl, VT,
OpVT);
case ISD::SETLT: if (R==APFloat::cmpUnordered)
return getUNDEF(VT);
LLVM_FALLTHROUGH;
case ISD::SETOLT: return getBoolConstant(R==APFloat::cmpLessThan, dl, VT,
OpVT);
case ISD::SETGT: if (R==APFloat::cmpUnordered)
return getUNDEF(VT);
LLVM_FALLTHROUGH;
case ISD::SETOGT: return getBoolConstant(R==APFloat::cmpGreaterThan, dl,
VT, OpVT);
case ISD::SETLE: if (R==APFloat::cmpUnordered)
return getUNDEF(VT);
LLVM_FALLTHROUGH;
case ISD::SETOLE: return getBoolConstant(R==APFloat::cmpLessThan ||
R==APFloat::cmpEqual, dl, VT,
OpVT);
case ISD::SETGE: if (R==APFloat::cmpUnordered)
return getUNDEF(VT);
LLVM_FALLTHROUGH;
case ISD::SETOGE: return getBoolConstant(R==APFloat::cmpGreaterThan ||
R==APFloat::cmpEqual, dl, VT, OpVT);
case ISD::SETO: return getBoolConstant(R!=APFloat::cmpUnordered, dl, VT,
OpVT);
case ISD::SETUO: return getBoolConstant(R==APFloat::cmpUnordered, dl, VT,
OpVT);
case ISD::SETUEQ: return getBoolConstant(R==APFloat::cmpUnordered ||
R==APFloat::cmpEqual, dl, VT,
OpVT);
case ISD::SETUNE: return getBoolConstant(R!=APFloat::cmpEqual, dl, VT,
OpVT);
case ISD::SETULT: return getBoolConstant(R==APFloat::cmpUnordered ||
R==APFloat::cmpLessThan, dl, VT,
OpVT);
case ISD::SETUGT: return getBoolConstant(R==APFloat::cmpGreaterThan ||
R==APFloat::cmpUnordered, dl, VT,
OpVT);
case ISD::SETULE: return getBoolConstant(R!=APFloat::cmpGreaterThan, dl,
VT, OpVT);
case ISD::SETUGE: return getBoolConstant(R!=APFloat::cmpLessThan, dl, VT,
OpVT);
}
} else if (N1CFP && OpVT.isSimple() && !N2.isUndef()) {
// Ensure that the constant occurs on the RHS.
ISD::CondCode SwappedCond = ISD::getSetCCSwappedOperands(Cond);
// Give up rather than introduce a condition code the target rejects.
if (!TLI->isCondCodeLegal(SwappedCond, OpVT.getSimpleVT()))
return SDValue();
return getSetCC(dl, VT, N2, N1, SwappedCond);
} else if ((N2CFP && N2CFP->getValueAPF().isNaN()) ||
(OpVT.isFloatingPoint() && (N1.isUndef() || N2.isUndef()))) {
// If an operand is known to be a nan (or undef that could be a nan), we can
// fold it.
// Choosing NaN for the undef will always make unordered comparison succeed
// and ordered comparison fails.
// Matches behavior in llvm::ConstantFoldCompareInstruction.
switch (ISD::getUnorderedFlavor(Cond)) {
default:
llvm_unreachable("Unknown flavor!");
case 0: // Known false.
return getBoolConstant(false, dl, VT, OpVT);
case 1: // Known true.
return getBoolConstant(true, dl, VT, OpVT);
case 2: // Undefined.
return getUNDEF(VT);
}
}
// Could not fold it.
return SDValue();
}
/// Try to simplify \p V given that only the bits in \p DemandedBits are used.
/// Forwards to the DemandedElts overload with every vector element demanded
/// (or a single-bit mask for scalars).
/// TODO: really we should be making this into the DAG equivalent of
/// SimplifyMultipleUseDemandedBits and not generate any new nodes.
SDValue SelectionDAG::GetDemandedBits(SDValue V, const APInt &DemandedBits) {
  EVT VT = V.getValueType();

  APInt DemandedElts(1, 1);
  if (VT.isVector())
    DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());

  return GetDemandedBits(V, DemandedBits, DemandedElts);
}
/// See if the specified operand can be simplified with the knowledge that only
/// the bits specified by DemandedBits are used in the elements specified by
/// DemandedElts.
///
/// Returns the simplified value, or an empty SDValue when no simplification
/// was found. Note that none of the cases below currently consults
/// \p DemandedElts; recursive calls go through the all-elements overload.
/// TODO: really we should be making this into the DAG equivalent of
/// SimplifyMultipleUseDemandedBits and not generate any new nodes.
SDValue SelectionDAG::GetDemandedBits(SDValue V, const APInt &DemandedBits,
                                      const APInt &DemandedElts) {
  switch (V.getOpcode()) {
  default:
    break;
  case ISD::Constant: {
    // Clear the bits of the constant that nobody looks at.
    auto *CV = cast<ConstantSDNode>(V.getNode());
    assert(CV && "Const value should be ConstSDNode.");
    const APInt &CVal = CV->getAPIntValue();
    APInt NewVal = CVal & DemandedBits;
    if (NewVal != CVal)
      return getConstant(NewVal, SDLoc(V), V.getValueType());
    break;
  }
  case ISD::OR:
  case ISD::XOR:
    // If the LHS or RHS don't contribute bits to the or, drop them.
    if (MaskedValueIsZero(V.getOperand(0), DemandedBits))
      return V.getOperand(1);
    if (MaskedValueIsZero(V.getOperand(1), DemandedBits))
      return V.getOperand(0);
    break;
  case ISD::SRL:
    // Only look at single-use SRLs.
    if (!V.getNode()->hasOneUse())
      break;
    if (auto *RHSC = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
      // Watch out for shift count overflow though. Do the range check on the
      // APInt itself: narrowing to 'unsigned' first could wrap an oversized
      // amount back into range, and getZExtValue() asserts on constants that
      // need more than 64 bits.
      const APInt &ShAmt = RHSC->getAPIntValue();
      if (ShAmt.uge(DemandedBits.getBitWidth()))
        break;
      // See if we can recursively simplify the LHS.
      unsigned Amt = ShAmt.getZExtValue();
      APInt SrcDemandedBits = DemandedBits << Amt;
      if (SDValue SimplifyLHS =
              GetDemandedBits(V.getOperand(0), SrcDemandedBits))
        return getNode(ISD::SRL, SDLoc(V), V.getValueType(), SimplifyLHS,
                       V.getOperand(1));
    }
    break;
  case ISD::AND: {
    // X & -1 -> X (ignoring bits which aren't demanded).
    // Also handle the case where masked out bits in X are known to be zero.
    if (ConstantSDNode *RHSC = isConstOrConstSplat(V.getOperand(1))) {
      const APInt &AndVal = RHSC->getAPIntValue();
      if (DemandedBits.isSubsetOf(AndVal) ||
          DemandedBits.isSubsetOf(computeKnownBits(V.getOperand(0)).Zero |
                                  AndVal))
        return V.getOperand(0);
    }
    break;
  }
  case ISD::ANY_EXTEND: {
    SDValue Src = V.getOperand(0);
    unsigned SrcBitWidth = Src.getScalarValueSizeInBits();
    // Being conservative here - only peek through if we only demand bits in the
    // non-extended source (even though the extended bits are technically
    // undef).
    if (DemandedBits.getActiveBits() > SrcBitWidth)
      break;
    APInt SrcDemandedBits = DemandedBits.trunc(SrcBitWidth);
    if (SDValue DemandedSrc = GetDemandedBits(Src, SrcDemandedBits))
      return getNode(ISD::ANY_EXTEND, SDLoc(V), V.getValueType(), DemandedSrc);
    break;
  }
  case ISD::SIGN_EXTEND_INREG: {
    EVT ExVT = cast<VTSDNode>(V.getOperand(1))->getVT();
    unsigned ExVTBits = ExVT.getScalarSizeInBits();
    // If none of the extended bits are demanded, eliminate the sextinreg.
    if (DemandedBits.getActiveBits() <= ExVTBits)
      return V.getOperand(0);
    break;
  }
  }
  return SDValue();
}
/// SignBitIsZero - Return true when the sign bit of \p Op (per scalar
/// element) is known to be zero. Implemented as a MaskedValueIsZero query
/// with the sign mask of the scalar bit width.
bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const {
  APInt SignMask = APInt::getSignMask(Op.getScalarValueSizeInBits());
  return MaskedValueIsZero(Op, SignMask, Depth);
}
/// MaskedValueIsZero - Return true if 'V & Mask' is known to be zero. We use
/// this predicate to simplify operations downstream. Forwards to the
/// DemandedElts overload with every vector element demanded (or a single-bit
/// mask for scalars).
bool SelectionDAG::MaskedValueIsZero(SDValue V, const APInt &Mask,
                                     unsigned Depth) const {
  EVT VT = V.getValueType();

  APInt DemandedElts(1, 1);
  if (VT.isVector())
    DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());

  return MaskedValueIsZero(V, Mask, DemandedElts, Depth);
}
/// MaskedValueIsZero - Return true if 'V & Mask' is known to be zero in the
/// elements selected by \p DemandedElts, i.e. every bit of \p Mask is among
/// the known-zero bits computed for \p V.
bool SelectionDAG::MaskedValueIsZero(SDValue V, const APInt &Mask,
                                     const APInt &DemandedElts,
                                     unsigned Depth) const {
  KnownBits Known = computeKnownBits(V, DemandedElts, Depth);
  return Mask.isSubsetOf(Known.Zero);
}
/// MaskedValueIsAllOnes - Return true if '(Op & Mask) == Mask', i.e. every
/// bit of \p Mask is among the known-one bits computed for \p V.
bool SelectionDAG::MaskedValueIsAllOnes(SDValue V, const APInt &Mask,
                                        unsigned Depth) const {
  KnownBits Known = computeKnownBits(V, Depth);
  return Mask.isSubsetOf(Known.One);
}
/// isSplatValue - Return true if the vector V has the same value
/// across all DemandedElts.
///
/// \p DemandedElts must have one bit per element of \p V. An all-zero
/// \p DemandedElts yields false before \p UndefElts is touched.
///
/// \p UndefElts is an output: it is reset to an all-zero mask of the vector's
/// element count and then populated by whichever case handles \p V. It may
/// hold partial results when the function returns false, so callers should
/// only rely on it after a true return.
///
/// Handled opcodes: BUILD_VECTOR, VECTOR_SHUFFLE, EXTRACT_SUBVECTOR with a
/// constant in-range index, and ADD/SUB/AND of two splats. Anything else
/// returns false.
bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts,
APInt &UndefElts) {
if (!DemandedElts)
return false; // No demanded elts, better to assume we don't know anything.
EVT VT = V.getValueType();
assert(VT.isVector() && "Vector type expected");
unsigned NumElts = VT.getVectorNumElements();
assert(NumElts == DemandedElts.getBitWidth() && "Vector size mismatch");
UndefElts = APInt::getNullValue(NumElts);
switch (V.getOpcode()) {
case ISD::BUILD_VECTOR: {
// Scl remembers the first demanded, non-undef operand; every later one
// must match it.
SDValue Scl;
for (unsigned i = 0; i != NumElts; ++i) {
SDValue Op = V.getOperand(i);
// Undef operands are recorded whether or not they are demanded.
if (Op.isUndef()) {
UndefElts.setBit(i);
continue;
}
if (!DemandedElts[i])
continue;
if (Scl && Scl != Op)
return false;
Scl = Op;
}
return true;
}
case ISD::VECTOR_SHUFFLE: {
// Check if this is a shuffle node doing a splat.
// TODO: Do we need to handle shuffle(splat, undef, mask)?
int SplatIndex = -1;
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(V)->getMask();
for (int i = 0; i != (int)NumElts; ++i) {
int M = Mask[i];
// Negative mask entries are undef lanes.
if (M < 0) {
UndefElts.setBit(i);
continue;
}
if (!DemandedElts[i])
continue;
// Every demanded, defined lane must pick the same source index.
if (0 <= SplatIndex && SplatIndex != M)
return false;
SplatIndex = M;
}
return true;
}
case ISD::EXTRACT_SUBVECTOR: {
SDValue Src = V.getOperand(0);
ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(V.getOperand(1));
unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
// Only handle constant indices for which the subvector lies entirely
// within the source vector.
if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) {
// Offset the demanded elts by the subvector index.
uint64_t Idx = SubIdx->getZExtValue();
APInt UndefSrcElts;
APInt DemandedSrc = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
if (isSplatValue(Src, DemandedSrc, UndefSrcElts)) {
// Map the source's undef lanes back into this subvector's numbering.
UndefElts = UndefSrcElts.extractBits(NumElts, Idx);
return true;
}
}
break;
}
case ISD::ADD:
case ISD::SUB:
case ISD::AND: {
// The result is treated as a splat when both operands are; a lane is
// reported undef if it is undef in either operand.
APInt UndefLHS, UndefRHS;
SDValue LHS = V.getOperand(0);
SDValue RHS = V.getOperand(1);
if (isSplatValue(LHS, DemandedElts, UndefLHS) &&
isSplatValue(RHS, DemandedElts, UndefRHS)) {
UndefElts = UndefLHS | UndefRHS;
return true;
}
break;
}
}
return false;
}
/// Convenience form of isSplatValue that demands every element of \p V. When
/// \p AllowUndefs is false, a splat containing any undef element is rejected.
bool SelectionDAG::isSplatValue(SDValue V, bool AllowUndefs) {
  EVT VT = V.getValueType();
  assert(VT.isVector() && "Vector type expected");
  unsigned NumElts = VT.getVectorNumElements();

  APInt UndefElts;
  APInt DemandedElts = APInt::getAllOnesValue(NumElts);
  if (!isSplatValue(V, DemandedElts, UndefElts))
    return false;

  return AllowUndefs || !UndefElts;
}
/// If \p V (after peeking through extract-subvector nodes) is a splat, return
/// the vector holding the splatted element and store that element's index in
/// \p SplatIdx. Returns an empty SDValue, leaving \p SplatIdx untouched, when
/// \p V is not recognised as a splat.
SDValue SelectionDAG::getSplatSourceVector(SDValue V, int &SplatIdx) {
  V = peekThroughExtractSubvectors(V);
  EVT VT = V.getValueType();

  if (V.getOpcode() == ISD::VECTOR_SHUFFLE) {
    // Check if this is a shuffle node doing a splat.
    // TODO - remove this and rely purely on SelectionDAG::isSplatValue,
    // getTargetVShiftNode currently struggles without the splat source.
    auto *Shuffle = cast<ShuffleVectorSDNode>(V);
    if (!Shuffle->isSplat())
      return SDValue();

    // The splat index addresses the concatenation of both shuffle operands;
    // split it into an operand number and an element within that operand.
    int Idx = Shuffle->getSplatIndex();
    int NumElts = V.getValueType().getVectorNumElements();
    SplatIdx = Idx % NumElts;
    return V.getOperand(Idx / NumElts);
  }

  APInt UndefElts;
  APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
  if (!isSplatValue(V, DemandedElts, UndefElts))
    return SDValue();

  // Handle case where all demanded elements are UNDEF.
  if (DemandedElts.isSubsetOf(UndefElts)) {
    SplatIdx = 0;
    return getUNDEF(VT);
  }

  // Otherwise use the first element that is not undef.
  SplatIdx = (UndefElts & DemandedElts).countTrailingOnes();
  return V;
}
/// Return the scalar that \p V splats, expressed as an EXTRACT_VECTOR_ELT of
/// the splat source vector, or an empty SDValue if getSplatSourceVector does
/// not find a splat source.
SDValue SelectionDAG::getSplatValue(SDValue V) {
  int Lane;
  SDValue Source = getSplatSourceVector(V, Lane);
  if (!Source)
    return SDValue();

  SDLoc DL(V);
  EVT ScalarVT = Source.getValueType().getScalarType();
  SDValue LaneIdx = getIntPtrConstant(Lane, DL);
  return getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Source, LaneIdx);
}
/// For a SHL/SRA/SRL node \p V whose shift amount (operand 1) is a constant or
/// a constant splat, return a pointer to that amount provided it is strictly
/// less than the scalar bit-width of \p V. Returns nullptr otherwise.
static const APInt *getValidShiftAmountConstant(SDValue V) {
  ConstantSDNode *AmtNode = isConstOrConstSplat(V.getOperand(1));
  if (!AmtNode)
    return nullptr;

  // Shifting more than the bitwidth is not valid.
  const APInt &Amt = AmtNode->getAPIntValue();
  return Amt.ult(V.getScalarValueSizeInBits()) ? &Amt : nullptr;
}
/// Compute the bits of \p Op that are known to be zero or one and return them
/// as a KnownBits value. For a vector this demands every element, so the
/// result holds only bits shared by all elements; for a scalar a single
/// one-bit "element" mask is used.
KnownBits SelectionDAG::computeKnownBits(SDValue Op, unsigned Depth) const {
  EVT OpVT = Op.getValueType();
  if (OpVT.isVector()) {
    APInt AllLanes = APInt::getAllOnesValue(OpVT.getVectorNumElements());
    return computeKnownBits(Op, AllLanes, Depth);
  }
  APInt ScalarLane(1, 1);
  return computeKnownBits(Op, ScalarLane, Depth);
}
/// Determine which bits of Op are known to be either zero or one and return
/// them as a KnownBits value. The DemandedElts argument allows us to only
/// collect the known bits that are shared by the requested vector elements.
///
/// \param Op           The value to analyse.
/// \param DemandedElts One bit per vector element of Op (asserted below to
///                     match the element count when Op is a vector); only the
///                     set elements contribute to the result.
/// \param Depth        Current recursion depth; every recursive query passes
///                     Depth + 1 and the search stops once Depth reaches 6.
/// \returns The known-zero / known-one masks, each as wide as one scalar
///          element of Op. An all-unknown result is returned when nothing can
///          be proven.
KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
unsigned Depth) const {
unsigned BitWidth = Op.getScalarValueSizeInBits();
KnownBits Known(BitWidth); // Don't know anything.
// Constants are handled before the depth check, so they are always resolved.
if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
// We know all of the bits for a constant!
Known.One = C->getAPIntValue();
Known.Zero = ~Known.One;
return Known;
}
if (auto *C = dyn_cast<ConstantFPSDNode>(Op)) {
// We know all of the bits for a constant fp!
Known.One = C->getValueAPF().bitcastToAPInt();
Known.Zero = ~Known.One;
return Known;
}
if (Depth == 6)
return Known; // Limit search depth.
// Scratch result used for the second operand / sub-query in most cases.
KnownBits Known2;
unsigned NumElts = DemandedElts.getBitWidth();
assert((!Op.getValueType().isVector() ||
NumElts == Op.getValueType().getVectorNumElements()) &&
"Unexpected vector size");
if (!DemandedElts)
return Known; // No demanded elts, better to assume we don't know anything.
unsigned Opcode = Op.getOpcode();
switch (Opcode) {
case ISD::BUILD_VECTOR:
// Collect the known bits that are shared by every demanded vector element.
// Start from "everything known" and intersect with each demanded operand.
Known.Zero.setAllBits(); Known.One.setAllBits();
for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
if (!DemandedElts[i])
continue;
SDValue SrcOp = Op.getOperand(i);
Known2 = computeKnownBits(SrcOp, Depth + 1);
// BUILD_VECTOR can implicitly truncate sources, we must handle this.
if (SrcOp.getValueSizeInBits() != BitWidth) {
assert(SrcOp.getValueSizeInBits() > BitWidth &&
"Expected BUILD_VECTOR implicit truncation");
Known2 = Known2.trunc(BitWidth);
}
// Known bits are the values that are shared by every demanded element.
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
// If we don't know any bits, early out.
if (Known.isUnknown())
break;
}
break;
case ISD::VECTOR_SHUFFLE: {
// Collect the known bits that are shared by every vector element referenced
// by the shuffle.
APInt DemandedLHS(NumElts, 0), DemandedRHS(NumElts, 0);
Known.Zero.setAllBits(); Known.One.setAllBits();
const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
assert(NumElts == SVN->getMask().size() && "Unexpected vector size");
for (unsigned i = 0; i != NumElts; ++i) {
if (!DemandedElts[i])
continue;
int M = SVN->getMaskElt(i);
if (M < 0) {
// For UNDEF elements, we don't know anything about the common state of
// the shuffle result.
Known.resetAll();
DemandedLHS.clearAllBits();
DemandedRHS.clearAllBits();
break;
}
// Mask values below NumElts select from operand 0, the rest from operand 1.
if ((unsigned)M < NumElts)
DemandedLHS.setBit((unsigned)M % NumElts);
else
DemandedRHS.setBit((unsigned)M % NumElts);
}
// Known bits are the values that are shared by every demanded element.
if (!!DemandedLHS) {
SDValue LHS = Op.getOperand(0);
Known2 = computeKnownBits(LHS, DemandedLHS, Depth + 1);
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
}
// If we don't know any bits, early out.
if (Known.isUnknown())
break;
if (!!DemandedRHS) {
SDValue RHS = Op.getOperand(1);
Known2 = computeKnownBits(RHS, DemandedRHS, Depth + 1);
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
}
break;
}
case ISD::CONCAT_VECTORS: {
// Split DemandedElts and test each of the demanded subvectors.
Known.Zero.setAllBits(); Known.One.setAllBits();
EVT SubVectorVT = Op.getOperand(0).getValueType();
unsigned NumSubVectorElts = SubVectorVT.getVectorNumElements();
unsigned NumSubVectors = Op.getNumOperands();
for (unsigned i = 0; i != NumSubVectors; ++i) {
// Slice out the demanded-element bits that belong to subvector i.
APInt DemandedSub = DemandedElts.lshr(i * NumSubVectorElts);
DemandedSub = DemandedSub.trunc(NumSubVectorElts);
if (!!DemandedSub) {
SDValue Sub = Op.getOperand(i);
Known2 = computeKnownBits(Sub, DemandedSub, Depth + 1);
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
}
// If we don't know any bits, early out.
if (Known.isUnknown())
break;
}
break;
}
case ISD::INSERT_SUBVECTOR: {
// If we know the element index, demand any elements from the subvector and
// the remainder from the src it's inserted into, otherwise demand them all.
SDValue Src = Op.getOperand(0);
SDValue Sub = Op.getOperand(1);
ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
if (SubIdx && SubIdx->getAPIntValue().ule(NumElts - NumSubElts)) {
Known.One.setAllBits();
Known.Zero.setAllBits();
uint64_t Idx = SubIdx->getZExtValue();
APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx);
if (!!DemandedSubElts) {
Known = computeKnownBits(Sub, DemandedSubElts, Depth + 1);
if (Known.isUnknown())
break; // early-out.
}
// Demand from Src only the elements not overwritten by the subvector.
APInt SubMask = APInt::getBitsSet(NumElts, Idx, Idx + NumSubElts);
APInt DemandedSrcElts = DemandedElts & ~SubMask;
if (!!DemandedSrcElts) {
Known2 = computeKnownBits(Src, DemandedSrcElts, Depth + 1);
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
}
} else {
// Index unknown or out of range: intersect all of Sub with all of Src.
Known = computeKnownBits(Sub, Depth + 1);
if (Known.isUnknown())
break; // early-out.
Known2 = computeKnownBits(Src, Depth + 1);
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
}
break;
}
case ISD::EXTRACT_SUBVECTOR: {
// If we know the element index, just demand that subvector's elements,
// otherwise demand them all.
SDValue Src = Op.getOperand(0);
ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) {
// Offset the demanded elts by the subvector index.
uint64_t Idx = SubIdx->getZExtValue();
APInt DemandedSrc = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
Known = computeKnownBits(Src, DemandedSrc, Depth + 1);
} else {
Known = computeKnownBits(Src, Depth + 1);
}
break;
}
case ISD::SCALAR_TO_VECTOR: {
// We know about scalar_to_vector as much as we know about its source,
// which becomes the first element of otherwise unknown vector.
// Only answer when element 0 is the sole demanded element.
if (DemandedElts != 1)
break;
SDValue N0 = Op.getOperand(0);
Known = computeKnownBits(N0, Depth + 1);
if (N0.getValueSizeInBits() != BitWidth)
Known = Known.trunc(BitWidth);
break;
}
case ISD::BITCAST: {
SDValue N0 = Op.getOperand(0);
EVT SubVT = N0.getValueType();
unsigned SubBitWidth = SubVT.getScalarSizeInBits();
// Ignore bitcasts from unsupported types.
if (!(SubVT.isInteger() || SubVT.isFloatingPoint()))
break;
// Fast handling of 'identity' bitcasts.
if (BitWidth == SubBitWidth) {
Known = computeKnownBits(N0, DemandedElts, Depth + 1);
break;
}
// Endianness decides which source sub-element lands in which bit range.
bool IsLE = getDataLayout().isLittleEndian();
// Bitcast 'small element' vector to 'large element' scalar/vector.
if ((BitWidth % SubBitWidth) == 0) {
assert(N0.getValueType().isVector() && "Expected bitcast from vector");
// Collect known bits for the (larger) output by collecting the known
// bits from each set of sub elements and shift these into place.
// We need to separately call computeKnownBits for each set of
// sub elements as the knownbits for each is likely to be different.
unsigned SubScale = BitWidth / SubBitWidth;
APInt SubDemandedElts(NumElts * SubScale, 0);
for (unsigned i = 0; i != NumElts; ++i)
if (DemandedElts[i])
SubDemandedElts.setBit(i * SubScale);
for (unsigned i = 0; i != SubScale; ++i) {
Known2 = computeKnownBits(N0, SubDemandedElts.shl(i),
Depth + 1);
unsigned Shifts = IsLE ? i : SubScale - 1 - i;
Known.One |= Known2.One.zext(BitWidth).shl(SubBitWidth * Shifts);
Known.Zero |= Known2.Zero.zext(BitWidth).shl(SubBitWidth * Shifts);
}
}
// Bitcast 'large element' scalar/vector to 'small element' vector.
if ((SubBitWidth % BitWidth) == 0) {
assert(Op.getValueType().isVector() && "Expected bitcast to vector");
// Collect known bits for the (smaller) output by collecting the known
// bits from the overlapping larger input elements and extracting the
// sub sections we actually care about.
unsigned SubScale = SubBitWidth / BitWidth;
APInt SubDemandedElts(NumElts / SubScale, 0);
for (unsigned i = 0; i != NumElts; ++i)
if (DemandedElts[i])
SubDemandedElts.setBit(i / SubScale);
Known2 = computeKnownBits(N0, SubDemandedElts, Depth + 1);
Known.Zero.setAllBits(); Known.One.setAllBits();
for (unsigned i = 0; i != NumElts; ++i)
if (DemandedElts[i]) {
unsigned Shifts = IsLE ? i : NumElts - 1 - i;
unsigned Offset = (Shifts % SubScale) * BitWidth;
Known.One &= Known2.One.lshr(Offset).trunc(BitWidth);
Known.Zero &= Known2.Zero.lshr(Offset).trunc(BitWidth);
// If we don't know any bits, early out.
if (Known.isUnknown())
break;
}
}
break;
}
case ISD::AND:
// If either the LHS or the RHS are Zero, the result is zero.
Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
// Output known-1 bits are only known if set in both the LHS & RHS.
Known.One &= Known2.One;
// Output known-0 are known to be clear if zero in either the LHS | RHS.
Known.Zero |= Known2.Zero;
break;
case ISD::OR:
Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
// Output known-0 bits are only known if clear in both the LHS & RHS.
Known.Zero &= Known2.Zero;
// Output known-1 are known to be set if set in either the LHS | RHS.
Known.One |= Known2.One;
break;
case ISD::XOR: {
Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
// Output known-0 bits are known if clear or set in both the LHS & RHS.
APInt KnownZeroOut = (Known.Zero & Known2.Zero) | (Known.One & Known2.One);
// Output known-1 are known to be set if set in only one of the LHS, RHS.
Known.One = (Known.Zero & Known2.One) | (Known.One & Known2.Zero);
Known.Zero = KnownZeroOut;
break;
}
case ISD::MUL: {
Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
// If low bits are zero in either operand, output low known-0 bits.
// Also compute a conservative estimate for high known-0 bits.
// More trickiness is possible, but this is sufficient for the
// interesting case of alignment computation.
unsigned TrailZ = Known.countMinTrailingZeros() +
Known2.countMinTrailingZeros();
// max(...) - BitWidth clamps the leading-zero estimate at zero.
unsigned LeadZ = std::max(Known.countMinLeadingZeros() +
Known2.countMinLeadingZeros(),
BitWidth) - BitWidth;
Known.resetAll();
Known.Zero.setLowBits(std::min(TrailZ, BitWidth));
Known.Zero.setHighBits(std::min(LeadZ, BitWidth));
break;
}
case ISD::UDIV: {
// For the purposes of computing leading zeros we can conservatively
// treat a udiv as a logical right shift by the power of 2 known to
// be less than the denominator.
Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
unsigned LeadZ = Known2.countMinLeadingZeros();
Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
unsigned RHSMaxLeadingZeros = Known2.countMaxLeadingZeros();
if (RHSMaxLeadingZeros != BitWidth)
LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSMaxLeadingZeros - 1);
Known.Zero.setHighBits(LeadZ);
break;
}
case ISD::SELECT:
case ISD::VSELECT:
// Operands 1 and 2 are the two selectable values; keep what both agree on.
Known = computeKnownBits(Op.getOperand(2), DemandedElts, Depth+1);
// If we don't know any bits, early out.
if (Known.isUnknown())
break;
Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth+1);
// Only known if known in both the LHS and RHS.
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
break;
case ISD::SELECT_CC:
// Same as SELECT, but the two selectable values are operands 2 and 3.
Known = computeKnownBits(Op.getOperand(3), DemandedElts, Depth+1);
// If we don't know any bits, early out.
if (Known.isUnknown())
break;
Known2 = computeKnownBits(Op.getOperand(2), DemandedElts, Depth+1);
// Only known if known in both the LHS and RHS.
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
break;
case ISD::SMULO:
case ISD::UMULO:
case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
// Only result #1 (the boolean) is analysed here.
if (Op.getResNo() != 1)
break;
// The boolean result conforms to getBooleanContents.
// If we know the result of a setcc has the top bits zero, use this info.
// We know that we have an integer-based boolean since these operations
// are only available for integer.
if (TLI->getBooleanContents(Op.getValueType().isVector(), false) ==
TargetLowering::ZeroOrOneBooleanContent &&
BitWidth > 1)
Known.Zero.setBitsFrom(1);
break;
case ISD::SETCC:
// If we know the result of a setcc has the top bits zero, use this info.
if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
TargetLowering::ZeroOrOneBooleanContent &&
BitWidth > 1)
Known.Zero.setBitsFrom(1);
break;
case ISD::SHL:
// Only handled when the shift amount is a valid constant / constant splat.
if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) {
Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
unsigned Shift = ShAmt->getZExtValue();
Known.Zero <<= Shift;
Known.One <<= Shift;
// Low bits are known zero.
Known.Zero.setLowBits(Shift);
}
break;
case ISD::SRL:
if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) {
Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
unsigned Shift = ShAmt->getZExtValue();
Known.Zero.lshrInPlace(Shift);
Known.One.lshrInPlace(Shift);
// High bits are known zero.
Known.Zero.setHighBits(Shift);
} else if (auto *BV = dyn_cast<BuildVectorSDNode>(Op.getOperand(1))) {
// If the shift amount is a vector of constants see if we can bound
// the number of upper zero bits.
// NOTE(review): this loop inspects every BUILD_VECTOR operand and does not
// consult DemandedElts.
unsigned ShiftAmountMin = BitWidth;
for (unsigned i = 0; i != BV->getNumOperands(); ++i) {
if (auto *C = dyn_cast<ConstantSDNode>(BV->getOperand(i))) {
const APInt &ShAmt = C->getAPIntValue();
if (ShAmt.ult(BitWidth)) {
ShiftAmountMin = std::min<unsigned>(ShiftAmountMin,
ShAmt.getZExtValue());
continue;
}
}
// Don't know anything.
ShiftAmountMin = 0;
break;
}
Known.Zero.setHighBits(ShiftAmountMin);
}
break;
case ISD::SRA:
if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) {
Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
unsigned Shift = ShAmt->getZExtValue();
// Sign extend known zero/one bit (else is unknown).
Known.Zero.ashrInPlace(Shift);
Known.One.ashrInPlace(Shift);
}
break;
case ISD::FSHL:
case ISD::FSHR:
if (ConstantSDNode *C = isConstOrConstSplat(Op.getOperand(2), DemandedElts)) {
// The effective funnel-shift amount is taken modulo the element width.
unsigned Amt = C->getAPIntValue().urem(BitWidth);
// For fshl, 0-shift returns the 1st arg.
// For fshr, 0-shift returns the 2nd arg.
if (Amt == 0) {
Known = computeKnownBits(Op.getOperand(Opcode == ISD::FSHL ? 0 : 1),
DemandedElts, Depth + 1);
break;
}
// fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
// fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
if (Opcode == ISD::FSHL) {
Known.One <<= Amt;
Known.Zero <<= Amt;
Known2.One.lshrInPlace(BitWidth - Amt);
Known2.Zero.lshrInPlace(BitWidth - Amt);
} else {
Known.One <<= BitWidth - Amt;
Known.Zero <<= BitWidth - Amt;
Known2.One.lshrInPlace(Amt);
Known2.Zero.lshrInPlace(Amt);
}
// The two shifted halves occupy disjoint bit ranges, so OR them together.
Known.One |= Known2.One;
Known.Zero |= Known2.Zero;
}
break;
case ISD::SIGN_EXTEND_INREG: {
EVT EVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
unsigned EBits = EVT.getScalarSizeInBits();
// Sign extension. Compute the demanded bits in the result that are not
// present in the input.
APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - EBits);
APInt InSignMask = APInt::getSignMask(EBits);
APInt InputDemandedBits = APInt::getLowBitsSet(BitWidth, EBits);
// If the sign extended bits are demanded, we know that the sign
// bit is demanded.
InSignMask = InSignMask.zext(BitWidth);
if (NewBits.getBoolValue())
InputDemandedBits |= InSignMask;
Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
Known.One &= InputDemandedBits;
Known.Zero &= InputDemandedBits;
// If the sign bit of the input is known set or clear, then we know the
// top bits of the result.
if (Known.Zero.intersects(InSignMask)) { // Input sign bit known clear
Known.Zero |= NewBits;
Known.One &= ~NewBits;
} else if (Known.One.intersects(InSignMask)) { // Input sign bit known set
Known.One |= NewBits;
Known.Zero &= ~NewBits;
} else { // Input sign bit unknown
Known.Zero &= ~NewBits;
Known.One &= ~NewBits;
}
break;
}
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF: {
Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
// If we have a known 1, its position is our upper bound.
unsigned PossibleTZ = Known2.countMaxTrailingZeros();
// Bits above the width needed to hold PossibleTZ must be zero.
unsigned LowBits = Log2_32(PossibleTZ) + 1;
Known.Zero.setBitsFrom(LowBits);
break;
}
case ISD::CTLZ:
case ISD::CTLZ_ZERO_UNDEF: {
Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
// If we have a known 1, its position is our upper bound.
unsigned PossibleLZ = Known2.countMaxLeadingZeros();
// Bits above the width needed to hold PossibleLZ must be zero.
unsigned LowBits = Log2_32(PossibleLZ) + 1;
Known.Zero.setBitsFrom(LowBits);
break;
}
case ISD::CTPOP: {
Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
// If we know some of the bits are zero, they can't be one.
unsigned PossibleOnes = Known2.countMaxPopulation();
Known.Zero.setBitsFrom(Log2_32(PossibleOnes) + 1);
break;
}
case ISD::LOAD: {
LoadSDNode *LD = cast<LoadSDNode>(Op);
// Cst is the constant the target reports for this load, if any.
const Constant *Cst = TLI->getTargetConstantFromLoad(LD);
if (ISD::isNON_EXTLoad(LD) && Cst) {
// Determine any common known bits from the loaded constant pool value.
Type *CstTy = Cst->getType();
if ((NumElts * BitWidth) == CstTy->getPrimitiveSizeInBits()) {
// If it's a vector splat, then we can (quickly) reuse the scalar path.
// NOTE: We assume all elements match and none are UNDEF.
if (CstTy->isVectorTy()) {
if (const Constant *Splat = Cst->getSplatValue()) {
Cst = Splat;
CstTy = Cst->getType();
}
}
// TODO - do we need to handle different bitwidths?
if (CstTy->isVectorTy() && BitWidth == CstTy->getScalarSizeInBits()) {
// Iterate across all vector elements finding common known bits.
Known.One.setAllBits();
Known.Zero.setAllBits();
for (unsigned i = 0; i != NumElts; ++i) {
if (!DemandedElts[i])
continue;
if (Constant *Elt = Cst->getAggregateElement(i)) {
if (auto *CInt = dyn_cast<ConstantInt>(Elt)) {
const APInt &Value = CInt->getValue();
Known.One &= Value;
Known.Zero &= ~Value;
continue;
}
if (auto *CFP = dyn_cast<ConstantFP>(Elt)) {
APInt Value = CFP->getValueAPF().bitcastToAPInt();
Known.One &= Value;
Known.Zero &= ~Value;
continue;
}
}
// Element is neither ConstantInt nor ConstantFP: give up on all bits.
Known.One.clearAllBits();
Known.Zero.clearAllBits();
break;
}
} else if (BitWidth == CstTy->getPrimitiveSizeInBits()) {
if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
const APInt &Value = CInt->getValue();
Known.One = Value;
Known.Zero = ~Value;
} else if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
APInt Value = CFP->getValueAPF().bitcastToAPInt();
Known.One = Value;
Known.Zero = ~Value;
}
}
}
} else if (ISD::isZEXTLoad(Op.getNode()) && Op.getResNo() == 0) {
// If this is a ZEXTLoad and we are looking at the loaded value.
EVT VT = LD->getMemoryVT();
unsigned MemBits = VT.getScalarSizeInBits();
Known.Zero.setBitsFrom(MemBits);
} else if (const MDNode *Ranges = LD->getRanges()) {
// Range metadata is only applied to non-extending loads.
if (LD->getExtensionType() == ISD::NON_EXTLOAD)
computeKnownBitsFromRangeMetadata(*Ranges, Known);
}
break;
}
case ISD::ZERO_EXTEND_VECTOR_INREG: {
EVT InVT = Op.getOperand(0).getValueType();
// Widen the demanded mask to the element count of the input vector.
APInt InDemandedElts = DemandedElts.zextOrSelf(InVT.getVectorNumElements());
Known = computeKnownBits(Op.getOperand(0), InDemandedElts, Depth + 1);
Known = Known.zext(BitWidth, true /* ExtendedBitsAreKnownZero */);
break;
}
case ISD::ZERO_EXTEND: {
Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
Known = Known.zext(BitWidth, true /* ExtendedBitsAreKnownZero */);
break;
}
case ISD::SIGN_EXTEND_VECTOR_INREG: {
EVT InVT = Op.getOperand(0).getValueType();
// Widen the demanded mask to the element count of the input vector.
APInt InDemandedElts = DemandedElts.zextOrSelf(InVT.getVectorNumElements());
Known = computeKnownBits(Op.getOperand(0), InDemandedElts, Depth + 1);
// If the sign bit is known to be zero or one, then sext will extend
// it to the top bits, else it will just zext.
Known = Known.sext(BitWidth);
break;
}
case ISD::SIGN_EXTEND: {
Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
// If the sign bit is known to be zero or one, then sext will extend
// it to the top bits, else it will just zext.
Known = Known.sext(BitWidth);
break;
}
case ISD::ANY_EXTEND: {
// NOTE(review): this uses the all-elements overload rather than passing
// DemandedElts; confirm whether that is intentional.
Known = computeKnownBits(Op.getOperand(0), Depth+1);
Known = Known.zext(BitWidth, false /* ExtendedBitsAreKnownZero */);
break;
}
case ISD::TRUNCATE: {
Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
Known = Known.trunc(BitWidth);
break;
}
case ISD::AssertZext: {
EVT VT = cast<VTSDNode>(Op.getOperand(1))->getVT();
// InMask covers the low bits of the asserted type; everything above is zero.
APInt InMask = APInt::getLowBitsSet(BitWidth, VT.getSizeInBits());
Known = computeKnownBits(Op.getOperand(0), Depth+1);
Known.Zero |= (~InMask);
// Drop any known-one claim that would contradict the asserted zero bits.
Known.One &= (~Known.Zero);
break;
}
case ISD::FGETSIGN:
// All bits are zero except the low bit.
Known.Zero.setBitsFrom(1);
break;
case ISD::USUBO:
case ISD::SSUBO:
// Result #1 is the overflow flag; result #0 falls through to the SUB logic.
if (Op.getResNo() == 1) {
// If we know the result of a setcc has the top bits zero, use this info.
if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
TargetLowering::ZeroOrOneBooleanContent &&
BitWidth > 1)
Known.Zero.setBitsFrom(1);
break;
}
LLVM_FALLTHROUGH;
case ISD::SUB:
case ISD::SUBC: {
Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
Known = KnownBits::computeForAddSub(/* Add */ false, /* NSW */ false,
Known, Known2);
break;
}
case ISD::UADDO:
case ISD::SADDO:
case ISD::ADDCARRY:
// Result #1 is the overflow/carry flag; result #0 falls through to ADD.
if (Op.getResNo() == 1) {
// If we know the result of a setcc has the top bits zero, use this info.
if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
TargetLowering::ZeroOrOneBooleanContent &&
BitWidth > 1)
Known.Zero.setBitsFrom(1);
break;
}
LLVM_FALLTHROUGH;
case ISD::ADD:
case ISD::ADDC:
case ISD::ADDE: {
assert(Op.getResNo() == 0 && "We only compute knownbits for the sum here.");
// With ADDE and ADDCARRY, a carry bit may be added in.
KnownBits Carry(1);
if (Opcode == ISD::ADDE)
// Can't track carry from glue, set carry to unknown.
Carry.resetAll();
else if (Opcode == ISD::ADDCARRY)
// TODO: Compute known bits for the carry operand. Not sure if it is worth
// the trouble (how often will we find a known carry bit). And I haven't
// tested this very much yet, but something like this might work:
// Carry = computeKnownBits(Op.getOperand(2), DemandedElts, Depth + 1);
// Carry = Carry.zextOrTrunc(1, false);
Carry.resetAll();
else
// Every other opcode reaching here has no carry-in: treat it as zero.
Carry.setAllZero();
Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
Known = KnownBits::computeForAddCarry(Known, Known2, Carry);
break;
}
case ISD::SREM:
// Only a constant (or constant-splat) power-of-two divisor is handled.
if (ConstantSDNode *Rem = isConstOrConstSplat(Op.getOperand(1))) {
const APInt &RA = Rem->getAPIntValue().abs();
if (RA.isPowerOf2()) {
APInt LowBits = RA - 1;
Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
// The low bits of the first operand are unchanged by the srem.
Known.Zero = Known2.Zero & LowBits;
Known.One = Known2.One & LowBits;
// If the first operand is non-negative or has all low bits zero, then
// the upper bits are all zero.
if (Known2.Zero[BitWidth-1] || ((Known2.Zero & LowBits) == LowBits))
Known.Zero |= ~LowBits;
// If the first operand is negative and not all low bits are zero, then
// the upper bits are all one.
if (Known2.One[BitWidth-1] && ((Known2.One & LowBits) != 0))
Known.One |= ~LowBits;
assert((Known.Zero & Known.One) == 0&&"Bits known to be one AND zero?");
}
}
break;
case ISD::UREM: {
if (ConstantSDNode *Rem = isConstOrConstSplat(Op.getOperand(1))) {
const APInt &RA = Rem->getAPIntValue();
if (RA.isPowerOf2()) {
APInt LowBits = (RA - 1);
Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
// The upper bits are all zero, the lower ones are unchanged.
Known.Zero = Known2.Zero | ~LowBits;
Known.One = Known2.One & LowBits;
break;
}
}
// Since the result is less than or equal to either operand, any leading
// zero bits in either operand must also exist in the result.
Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
uint32_t Leaders =
std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros());
Known.resetAll();
Known.Zero.setHighBits(Leaders);
break;
}
case ISD::EXTRACT_ELEMENT: {
Known = computeKnownBits(Op.getOperand(0), Depth+1);
const unsigned Index = Op.getConstantOperandVal(1);
const unsigned EltBitWidth = Op.getValueSizeInBits();
// Remove low part of known bits mask
Known.Zero = Known.Zero.getHiBits(Known.getBitWidth() - Index * EltBitWidth);
Known.One = Known.One.getHiBits(Known.getBitWidth() - Index * EltBitWidth);
// Remove high part of known bit mask
Known = Known.trunc(EltBitWidth);
break;
}
case ISD::EXTRACT_VECTOR_ELT: {
SDValue InVec = Op.getOperand(0);
SDValue EltNo = Op.getOperand(1);
EVT VecVT = InVec.getValueType();
const unsigned EltBitWidth = VecVT.getScalarSizeInBits();
const unsigned NumSrcElts = VecVT.getVectorNumElements();
// If BitWidth > EltBitWidth the value is anyext:ed. So we do not know
// anything about the extended bits.
// NOTE(review): Known is reassigned in both branches below, so this
// truncation does not affect the final result.
if (BitWidth > EltBitWidth)
Known = Known.trunc(EltBitWidth);
ConstantSDNode *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo);
if (ConstEltNo && ConstEltNo->getAPIntValue().ult(NumSrcElts)) {
// If we know the element index, just demand that vector element.
unsigned Idx = ConstEltNo->getZExtValue();
APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
Known = computeKnownBits(InVec, DemandedElt, Depth + 1);
} else {
// Unknown element index, so ignore DemandedElts and demand them all.
Known = computeKnownBits(InVec, Depth + 1);
}
if (BitWidth > EltBitWidth)
Known = Known.zext(BitWidth, false /* => any extend */);
break;
}
case ISD::INSERT_VECTOR_ELT: {
SDValue InVec = Op.getOperand(0);
SDValue InVal = Op.getOperand(1);
SDValue EltNo = Op.getOperand(2);
ConstantSDNode *CEltNo = dyn_cast<ConstantSDNode>(EltNo);
if (CEltNo && CEltNo->getAPIntValue().ult(NumElts)) {
// If we know the element index, split the demand between the
// source vector and the inserted element.
Known.Zero = Known.One = APInt::getAllOnesValue(BitWidth);
unsigned EltIdx = CEltNo->getZExtValue();
// If we demand the inserted element then add its common known bits.
if (DemandedElts[EltIdx]) {
Known2 = computeKnownBits(InVal, Depth + 1);
// zextOrTrunc resizes the scalar's masks to the vector element width.
Known.One &= Known2.One.zextOrTrunc(Known.One.getBitWidth());
Known.Zero &= Known2.Zero.zextOrTrunc(Known.Zero.getBitWidth());
}
// If we demand the source vector then add its common known bits, ensuring
// that we don't demand the inserted element.
APInt VectorElts = DemandedElts & ~(APInt::getOneBitSet(NumElts, EltIdx));
if (!!VectorElts) {
Known2 = computeKnownBits(InVec, VectorElts, Depth + 1);
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
}
} else {
// Unknown element index, so ignore DemandedElts and demand them all.
Known = computeKnownBits(InVec, Depth + 1);
Known2 = computeKnownBits(InVal, Depth + 1);
Known.One &= Known2.One.zextOrTrunc(Known.One.getBitWidth());
Known.Zero &= Known2.Zero.zextOrTrunc(Known.Zero.getBitWidth());
}
break;
}
case ISD::BITREVERSE: {
// Known bits move with the value: reverse both masks.
Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
Known.Zero = Known2.Zero.reverseBits();
Known.One = Known2.One.reverseBits();
break;
}
case ISD::BSWAP: {
// Known bits move with the value: byte-swap both masks.
Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
Known.Zero = Known2.Zero.byteSwap();
Known.One = Known2.One.byteSwap();
break;
}
case ISD::ABS: {
Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
// If the source's MSB is zero then we know the rest of the bits already.
if (Known2.isNonNegative()) {
Known.Zero = Known2.Zero;
Known.One = Known2.One;
break;
}
// We only know that the absolute value's MSB will be zero iff there is
// a set bit that isn't the sign bit (otherwise it could be INT_MIN).
Known2.One.clearSignBit();
if (Known2.One.getBoolValue()) {
Known.Zero = APInt::getSignMask(BitWidth);
break;
}
break;
}
case ISD::UMIN: {
Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
// UMIN - we know that the result will have the maximum of the
// known zero leading bits of the inputs.
unsigned LeadZero = Known.countMinLeadingZeros();
LeadZero = std::max(LeadZero, Known2.countMinLeadingZeros());
// Keep the bits both operands agree on, then add the leading zeros.
Known.Zero &= Known2.Zero;
Known.One &= Known2.One;
Known.Zero.setHighBits(LeadZero);
break;
}
case ISD::UMAX: {
Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
// UMAX - we know that the result will have the maximum of the
// known one leading bits of the inputs.
unsigned LeadOne = Known.countMinLeadingOnes();
LeadOne = std::max(LeadOne, Known2.countMinLeadingOnes());
// Keep the bits both operands agree on, then add the leading ones.
Known.Zero &= Known2.Zero;
Known.One &= Known2.One;
Known.One.setHighBits(LeadOne);
break;
}
case ISD::SMIN:
case ISD::SMAX: {
// If we have a clamp pattern, we know that the number of sign bits will be
// the minimum of the clamp min/max range.
bool IsMax = (Opcode == ISD::SMAX);
ConstantSDNode *CstLow = nullptr, *CstHigh = nullptr;
// Look for smax(smin(x, C1), C2) or smin(smax(x, C1), C2).
if ((CstLow = isConstOrConstSplat(Op.getOperand(1), DemandedElts)))
if (Op.getOperand(0).getOpcode() == (IsMax ? ISD::SMIN : ISD::SMAX))
CstHigh =
isConstOrConstSplat(Op.getOperand(0).getOperand(1), DemandedElts);
if (CstLow && CstHigh) {
// For SMIN the outer constant is the upper bound, so swap the roles.
if (!IsMax)
std::swap(CstLow, CstHigh);
const APInt &ValueLow = CstLow->getAPIntValue();
const APInt &ValueHigh = CstHigh->getAPIntValue();
if (ValueLow.sle(ValueHigh)) {
unsigned LowSignBits = ValueLow.getNumSignBits();
unsigned HighSignBits = ValueHigh.getNumSignBits();
unsigned MinSignBits = std::min(LowSignBits, HighSignBits);
if (ValueLow.isNegative() && ValueHigh.isNegative()) {
Known.One.setHighBits(MinSignBits);
break;
}
if (ValueLow.isNonNegative() && ValueHigh.isNonNegative()) {
Known.Zero.setHighBits(MinSignBits);
break;
}
}
}
// Fallback - just get the shared known bits of the operands.
Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
if (Known.isUnknown()) break; // Early-out
Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
Known.Zero &= Known2.Zero;
Known.One &= Known2.One;
break;
}
case ISD::FrameIndex:
case ISD::TargetFrameIndex:
// Delegated to the target lowering hook.
TLI->computeKnownBitsForFrameIndex(Op, Known, DemandedElts, *this, Depth);
break;
default:
// Opcodes below BUILTIN_OP_END that are not handled above stay unknown;
// anything at or above it falls through to the target hook.
if (Opcode < ISD::BUILTIN_OP_END)
break;
LLVM_FALLTHROUGH;
case ISD::INTRINSIC_WO_CHAIN:
case ISD::INTRINSIC_W_CHAIN:
case ISD::INTRINSIC_VOID:
// Allow the target to implement this method for its nodes.
TLI->computeKnownBitsForTargetNode(Op, Known, DemandedElts, *this, Depth);
break;
}
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
return Known;
}
/// Classify whether the unsigned addition N0 + N1 can overflow, using known
/// bits. Returns OFK_Never when overflow is provably impossible and
/// OFK_Sometime otherwise.
SelectionDAG::OverflowKind SelectionDAG::computeOverflowKind(SDValue N0,
                                                             SDValue N1) const {
  // X + 0 never overflow
  if (isNullConstant(N1))
    return OFK_Never;

  // True when the largest value permitted by the known-zero mask is 0 or 1.
  auto MaxFitsInOneBit = [](const KnownBits &KB) {
    return (~KB.Zero & 0x01) == ~KB.Zero;
  };

  KnownBits RHSKnown = computeKnownBits(N1);
  if (RHSKnown.Zero.getBoolValue()) {
    // Add the maximum possible values of both operands; if even that does
    // not wrap, the real addition cannot wrap either.
    KnownBits LHSKnown = computeKnownBits(N0);
    bool Wrapped;
    (void)(~LHSKnown.Zero).uadd_ov(~RHSKnown.Zero, Wrapped);
    if (!Wrapped)
      return OFK_Never;
  }

  // mulhi + 1 never overflow
  bool LHSIsMulHi = N0.getOpcode() == ISD::UMUL_LOHI && N0.getResNo() == 1;
  if (LHSIsMulHi && MaxFitsInOneBit(RHSKnown))
    return OFK_Never;

  // Same check with the operand roles swapped.
  bool RHSIsMulHi = N1.getOpcode() == ISD::UMUL_LOHI && N1.getResNo() == 1;
  if (RHSIsMulHi) {
    KnownBits LHSKnown = computeKnownBits(N0);
    if (MaxFitsInOneBit(LHSKnown))
      return OFK_Never;
  }

  return OFK_Sometime;
}
/// Return true if Val is known to have exactly one bit set in each scalar
/// element. A false result means "could not prove it", not "is not".
bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const {
  const unsigned BitWidth = Val.getValueType().getScalarSizeInBits();

  // Shared test: is this constant, resized to the scalar width, a power of 2?
  auto IsPow2Const = [BitWidth](const ConstantSDNode *C) {
    return C->getAPIntValue().zextOrTrunc(BitWidth).isPowerOf2();
  };

  // A plain constant gives a definitive answer either way.
  if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Val))
    return IsPow2Const(Const);

  switch (Val.getOpcode()) {
  default:
    break;
  case ISD::SHL: {
    // Shifting a constant one left keeps exactly one bit set, because
    // shifting the bit off the end is undefined.
    const ConstantSDNode *Shifted = isConstOrConstSplat(Val.getOperand(0));
    if (Shifted && Shifted->getAPIntValue() == 1)
      return true;
    break;
  }
  case ISD::SRL: {
    // Likewise, logically shifting a constant sign-bit right keeps exactly
    // one bit set.
    const ConstantSDNode *Shifted = isConstOrConstSplat(Val.getOperand(0));
    if (Shifted && Shifted->getAPIntValue().isSignMask())
      return true;
    break;
  }
  case ISD::BUILD_VECTOR:
    // Every operand must itself be a constant power of two.
    if (llvm::all_of(Val->ops(), [&](SDValue Elt) {
          const auto *EltC = dyn_cast<ConstantSDNode>(Elt);
          return EltC && IsPow2Const(EltC);
        }))
      return true;
    break;
  }

  // More could be done here, though the above checks are enough
  // to handle some common cases.

  // Last resort: known bits that pin the population count to exactly one.
  const KnownBits Known = computeKnownBits(Val);
  return (Known.countMaxPopulation() == 1) && (Known.countMinPopulation() == 1);
}
/// Convenience overload: compute the number of known sign bits of Op with
/// every element demanded, then defer to the DemandedElts overload.
unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
  const EVT OpVT = Op.getValueType();

  // A scalar is modelled as a single demanded element.
  if (!OpVT.isVector())
    return ComputeNumSignBits(Op, APInt(1, 1), Depth);

  // For a vector, demand every element.
  return ComputeNumSignBits(
      Op, APInt::getAllOnesValue(OpVT.getVectorNumElements()), Depth);
}
/// Compute how many of the high-order bits of Op (per scalar element) are
/// known to be identical to the sign bit.
///
/// \param Op           Value to analyse; its type must be integer or
///                     floating point (asserted below).
/// \param DemandedElts Bit mask selecting the elements the answer has to hold
///                     for; its bit width is used as the element count.
/// \param Depth        Current recursion depth; the search stops at depth 6.
/// \returns The number of known sign bits. 1 is the "nothing known" answer
///          used by the early-outs in this function.
///
/// Structure: an opcode switch handles the cases it can answer directly (most
/// return immediately). Cases that 'break' fall through to the generic tail,
/// which tries loads, target nodes and finally computeKnownBits, and returns
/// the best answer found.
unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
unsigned Depth) const {
EVT VT = Op.getValueType();
assert((VT.isInteger() || VT.isFloatingPoint()) && "Invalid VT!");
unsigned VTBits = VT.getScalarSizeInBits();
unsigned NumElts = DemandedElts.getBitWidth();
unsigned Tmp, Tmp2;
// Best answer found so far by a case that still wants the generic tail to
// try to improve on it.
unsigned FirstAnswer = 1;
// Constants can be answered exactly.
if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
const APInt &Val = C->getAPIntValue();
return Val.getNumSignBits();
}
if (Depth == 6)
return 1; // Limit search depth.
if (!DemandedElts)
return 1; // No demanded elts, better to assume we don't know anything.
unsigned Opcode = Op.getOpcode();
switch (Opcode) {
default: break;
case ISD::AssertSext:
// Tmp is the width of the asserted type carried in operand 1.
Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getSizeInBits();
return VTBits-Tmp+1;
case ISD::AssertZext:
Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getSizeInBits();
return VTBits-Tmp;
case ISD::BUILD_VECTOR:
// Take the minimum over all demanded operands, stopping early once the
// running minimum reaches 1.
Tmp = VTBits;
for (unsigned i = 0, e = Op.getNumOperands(); (i < e) && (Tmp > 1); ++i) {
if (!DemandedElts[i])
continue;
SDValue SrcOp = Op.getOperand(i);
Tmp2 = ComputeNumSignBits(Op.getOperand(i), Depth + 1);
// BUILD_VECTOR can implicitly truncate sources, we must handle this.
if (SrcOp.getValueSizeInBits() != VTBits) {
assert(SrcOp.getValueSizeInBits() > VTBits &&
"Expected BUILD_VECTOR implicit truncation");
unsigned ExtraBits = SrcOp.getValueSizeInBits() - VTBits;
Tmp2 = (Tmp2 > ExtraBits ? Tmp2 - ExtraBits : 1);
}
Tmp = std::min(Tmp, Tmp2);
}
return Tmp;
case ISD::VECTOR_SHUFFLE: {
// Collect the minimum number of sign bits that are shared by every vector
// element referenced by the shuffle.
APInt DemandedLHS(NumElts, 0), DemandedRHS(NumElts, 0);
const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
assert(NumElts == SVN->getMask().size() && "Unexpected vector size");
for (unsigned i = 0; i != NumElts; ++i) {
int M = SVN->getMaskElt(i);
if (!DemandedElts[i])
continue;
// For UNDEF elements, we don't know anything about the common state of
// the shuffle result.
if (M < 0)
return 1;
// Mask values below NumElts select from operand 0, the rest from operand 1.
if ((unsigned)M < NumElts)
DemandedLHS.setBit((unsigned)M % NumElts);
else
DemandedRHS.setBit((unsigned)M % NumElts);
}
// Start from "unbounded" so the first demanded operand sets the minimum.
Tmp = std::numeric_limits<unsigned>::max();
if (!!DemandedLHS)
Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
if (!!DemandedRHS) {
Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
Tmp = std::min(Tmp, Tmp2);
}
// If we don't know anything, early out and try computeKnownBits fall-back.
if (Tmp == 1)
break;
assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
return Tmp;
}
case ISD::BITCAST: {
SDValue N0 = Op.getOperand(0);
EVT SrcVT = N0.getValueType();
unsigned SrcBits = SrcVT.getScalarSizeInBits();
// Ignore bitcasts from unsupported types..
if (!(SrcVT.isInteger() || SrcVT.isFloatingPoint()))
break;
// Fast handling of 'identity' bitcasts.
if (VTBits == SrcBits)
return ComputeNumSignBits(N0, DemandedElts, Depth + 1);
bool IsLE = getDataLayout().isLittleEndian();
// Bitcast 'large element' scalar/vector to 'small element' vector.
if ((SrcBits % VTBits) == 0) {
assert(VT.isVector() && "Expected bitcast to vector");
unsigned Scale = SrcBits / VTBits;
// Each source element covers 'Scale' consecutive result elements.
APInt SrcDemandedElts(NumElts / Scale, 0);
for (unsigned i = 0; i != NumElts; ++i)
if (DemandedElts[i])
SrcDemandedElts.setBit(i / Scale);
// Fast case - sign splat can be simply split across the small elements.
Tmp = ComputeNumSignBits(N0, SrcDemandedElts, Depth + 1);
if (Tmp == SrcBits)
return VTBits;
// Slow case - determine how far the sign extends into each sub-element.
Tmp2 = VTBits;
for (unsigned i = 0; i != NumElts; ++i)
if (DemandedElts[i]) {
// SubOffset becomes the bit distance from the top of the source element
// to the top of this sub-element (endianness decides the ordering).
unsigned SubOffset = i % Scale;
SubOffset = (IsLE ? ((Scale - 1) - SubOffset) : SubOffset);
SubOffset = SubOffset * VTBits;
if (Tmp <= SubOffset)
return 1;
Tmp2 = std::min(Tmp2, Tmp - SubOffset);
}
return Tmp2;
}
break;
}
case ISD::SIGN_EXTEND:
// Every bit added by the extension is a sign bit.
Tmp = VTBits - Op.getOperand(0).getScalarValueSizeInBits();
return ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1) + Tmp;
case ISD::SIGN_EXTEND_INREG:
// Max of the input and what this extends.
Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getScalarSizeInBits();
Tmp = VTBits-Tmp+1;
Tmp2 = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1);
return std::max(Tmp, Tmp2);
case ISD::SIGN_EXTEND_VECTOR_INREG: {
SDValue Src = Op.getOperand(0);
EVT SrcVT = Src.getValueType();
// Widen the demanded mask to the source's element count.
APInt DemandedSrcElts = DemandedElts.zextOrSelf(SrcVT.getVectorNumElements());
Tmp = VTBits - SrcVT.getScalarSizeInBits();
return ComputeNumSignBits(Src, DemandedSrcElts, Depth+1) + Tmp;
}
case ISD::SRA:
Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1);
// SRA X, C -> adds C sign bits.
if (ConstantSDNode *C =
isConstOrConstSplat(Op.getOperand(1), DemandedElts)) {
APInt ShiftVal = C->getAPIntValue();
ShiftVal += Tmp;
// Clamp to the element width.
Tmp = ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
}
return Tmp;
case ISD::SHL:
if (ConstantSDNode *C =
isConstOrConstSplat(Op.getOperand(1), DemandedElts)) {
// shl destroys sign bits.
Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1);
if (C->getAPIntValue().uge(VTBits) || // Bad shift.
C->getAPIntValue().uge(Tmp)) break; // Shifted all sign bits out.
return Tmp - C->getZExtValue();
}
break;
case ISD::AND:
case ISD::OR:
case ISD::XOR: // NOT is handled here.
// Logical binary ops preserve the number of sign bits at the worst.
Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1);
if (Tmp != 1) {
Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth+1);
FirstAnswer = std::min(Tmp, Tmp2);
// We computed what we know about the sign bits as our first
// answer. Now proceed to the generic code that uses
// computeKnownBits, and pick whichever answer is better.
}
break;
case ISD::SELECT:
case ISD::VSELECT:
// The result is one of operands 1 and 2, so take the weaker guarantee.
Tmp = ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth+1);
if (Tmp == 1) return 1; // Early out.
Tmp2 = ComputeNumSignBits(Op.getOperand(2), DemandedElts, Depth+1);
return std::min(Tmp, Tmp2);
case ISD::SELECT_CC:
// Same as SELECT, but the selected values are operands 2 and 3.
Tmp = ComputeNumSignBits(Op.getOperand(2), DemandedElts, Depth+1);
if (Tmp == 1) return 1; // Early out.
Tmp2 = ComputeNumSignBits(Op.getOperand(3), DemandedElts, Depth+1);
return std::min(Tmp, Tmp2);
case ISD::SMIN:
case ISD::SMAX: {
// If we have a clamp pattern, we know that the number of sign bits will be
// the minimum of the clamp min/max range.
bool IsMax = (Opcode == ISD::SMAX);
ConstantSDNode *CstLow = nullptr, *CstHigh = nullptr;
// Look for smax(smin(X, C1), C2) or smin(smax(X, C1), C2).
if ((CstLow = isConstOrConstSplat(Op.getOperand(1), DemandedElts)))
if (Op.getOperand(0).getOpcode() == (IsMax ? ISD::SMIN : ISD::SMAX))
CstHigh =
isConstOrConstSplat(Op.getOperand(0).getOperand(1), DemandedElts);
if (CstLow && CstHigh) {
// For SMIN the outer constant is the upper bound, so swap the roles.
if (!IsMax)
std::swap(CstLow, CstHigh);
if (CstLow->getAPIntValue().sle(CstHigh->getAPIntValue())) {
Tmp = CstLow->getAPIntValue().getNumSignBits();
Tmp2 = CstHigh->getAPIntValue().getNumSignBits();
return std::min(Tmp, Tmp2);
}
}
// Fallback - just get the minimum number of sign bits of the operands.
// NOTE(review): these recursive calls use the overload without
// DemandedElts, so DemandedElts is not propagated to the operands.
Tmp = ComputeNumSignBits(Op.getOperand(0), Depth + 1);
if (Tmp == 1)
return 1; // Early out.
Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth + 1);
return std::min(Tmp, Tmp2);
}
case ISD::UMIN:
case ISD::UMAX:
// The result is one of the two operands, so take the weaker guarantee.
// NOTE(review): DemandedElts is not propagated here either.
Tmp = ComputeNumSignBits(Op.getOperand(0), Depth + 1);
if (Tmp == 1)
return 1; // Early out.
Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth + 1);
return std::min(Tmp, Tmp2);
case ISD::SADDO:
case ISD::UADDO:
case ISD::SSUBO:
case ISD::USUBO:
case ISD::SMULO:
case ISD::UMULO:
// Only result number 1 is handled specially here.
if (Op.getResNo() != 1)
break;
// The boolean result conforms to getBooleanContents. Fall through.
// If setcc returns 0/-1, all bits are sign bits.
// We know that we have an integer-based boolean since these operations
// are only available for integer.
if (TLI->getBooleanContents(VT.isVector(), false) ==
TargetLowering::ZeroOrNegativeOneBooleanContent)
return VTBits;
break;
case ISD::SETCC:
// If setcc returns 0/-1, all bits are sign bits.
if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
TargetLowering::ZeroOrNegativeOneBooleanContent)
return VTBits;
break;
case ISD::ROTL:
case ISD::ROTR:
// Only rotates by a plain ConstantSDNode amount are handled.
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
unsigned RotAmt = C->getAPIntValue().urem(VTBits);
// Handle rotate right by N like a rotate left by 32-N.
if (Opcode == ISD::ROTR)
RotAmt = (VTBits - RotAmt) % VTBits;
// If we aren't rotating out all of the known-in sign bits, return the
// number that are left. This handles rotl(sext(x), 1) for example.
Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
if (Tmp > (RotAmt + 1)) return (Tmp - RotAmt);
}
break;
case ISD::ADD:
case ISD::ADDC:
// Add can have at most one carry bit. Thus we know that the output
// is, at worst, one more bit than the inputs.
Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
if (Tmp == 1) return 1; // Early out.
// Special case decrementing a value (ADD X, -1):
if (ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
if (CRHS->isAllOnesValue()) {
KnownBits Known = computeKnownBits(Op.getOperand(0), Depth+1);
// If the input is known to be 0 or 1, the output is 0/-1, which is all
// sign bits set.
if ((Known.Zero | 1).isAllOnesValue())
return VTBits;
// If we are subtracting one from a positive number, there is no carry
// out of the result.
if (Known.isNonNegative())
return Tmp;
}
Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1);
if (Tmp2 == 1) return 1;
return std::min(Tmp, Tmp2)-1;
case ISD::SUB:
Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1);
if (Tmp2 == 1) return 1;
// Handle NEG.
if (ConstantSDNode *CLHS = isConstOrConstSplat(Op.getOperand(0)))
if (CLHS->isNullValue()) {
KnownBits Known = computeKnownBits(Op.getOperand(1), Depth+1);
// If the input is known to be 0 or 1, the output is 0/-1, which is all
// sign bits set.
if ((Known.Zero | 1).isAllOnesValue())
return VTBits;
// If the input is known to be positive (the sign bit is known clear),
// the output of the NEG has the same number of sign bits as the input.
if (Known.isNonNegative())
return Tmp2;
// Otherwise, we treat this like a SUB.
}
// Sub can have at most one carry bit. Thus we know that the output
// is, at worst, one more bit than the inputs.
Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
if (Tmp == 1) return 1; // Early out.
return std::min(Tmp, Tmp2)-1;
case ISD::TRUNCATE: {
// Check if the sign bits of source go down as far as the truncated value.
unsigned NumSrcBits = Op.getOperand(0).getScalarValueSizeInBits();
unsigned NumSrcSignBits = ComputeNumSignBits(Op.getOperand(0), Depth + 1);
// Only the sign bits that survive below the discarded top bits count.
if (NumSrcSignBits > (NumSrcBits - VTBits))
return NumSrcSignBits - (NumSrcBits - VTBits);
break;
}
case ISD::EXTRACT_ELEMENT: {
const int KnownSign = ComputeNumSignBits(Op.getOperand(0), Depth+1);
const int BitWidth = Op.getValueSizeInBits();
const int Items = Op.getOperand(0).getValueSizeInBits() / BitWidth;
// Get reverse index (starting from 1), Op1 value indexes elements from
// little end. Sign starts at big end.
const int rIndex = Items - 1 - Op.getConstantOperandVal(1);
// If the sign portion ends in our element the subtraction gives correct
// result. Otherwise it gives either negative or > bitwidth result
// NOTE(review): the clamp's lower bound is 0, so this case can return 0
// rather than the 1 used elsewhere for "nothing known".
return std::max(std::min(KnownSign - rIndex * BitWidth, BitWidth), 0);
}
case ISD::INSERT_VECTOR_ELT: {
SDValue InVec = Op.getOperand(0);
SDValue InVal = Op.getOperand(1);
SDValue EltNo = Op.getOperand(2);
ConstantSDNode *CEltNo = dyn_cast<ConstantSDNode>(EltNo);
if (CEltNo && CEltNo->getAPIntValue().ult(NumElts)) {
// If we know the element index, split the demand between the
// source vector and the inserted element.
unsigned EltIdx = CEltNo->getZExtValue();
// If we demand the inserted element then get its sign bits.
Tmp = std::numeric_limits<unsigned>::max();
if (DemandedElts[EltIdx]) {
// TODO - handle implicit truncation of inserted elements.
if (InVal.getScalarValueSizeInBits() != VTBits)
break;
Tmp = ComputeNumSignBits(InVal, Depth + 1);
}
// If we demand the source vector then get its sign bits, and determine
// the minimum.
APInt VectorElts = DemandedElts;
VectorElts.clearBit(EltIdx);
if (!!VectorElts) {
Tmp2 = ComputeNumSignBits(InVec, VectorElts, Depth + 1);
Tmp = std::min(Tmp, Tmp2);
}
} else {
// Unknown element index, so ignore DemandedElts and demand them all.
Tmp = ComputeNumSignBits(InVec, Depth + 1);
Tmp2 = ComputeNumSignBits(InVal, Depth + 1);
Tmp = std::min(Tmp, Tmp2);
}
assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
return Tmp;
}
case ISD::EXTRACT_VECTOR_ELT: {
SDValue InVec = Op.getOperand(0);
SDValue EltNo = Op.getOperand(1);
EVT VecVT = InVec.getValueType();
const unsigned BitWidth = Op.getValueSizeInBits();
const unsigned EltBitWidth = Op.getOperand(0).getScalarValueSizeInBits();
const unsigned NumSrcElts = VecVT.getVectorNumElements();
// If BitWidth > EltBitWidth the value is anyext:ed, and we do not know
// anything about sign bits. But if the sizes match we can derive knowledge
// about sign bits from the vector operand.
if (BitWidth != EltBitWidth)
break;
// If we know the element index, just demand that vector element, else for
// an unknown element index, ignore DemandedElts and demand them all.
APInt DemandedSrcElts = APInt::getAllOnesValue(NumSrcElts);
ConstantSDNode *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo);
if (ConstEltNo && ConstEltNo->getAPIntValue().ult(NumSrcElts))
DemandedSrcElts =
APInt::getOneBitSet(NumSrcElts, ConstEltNo->getZExtValue());
return ComputeNumSignBits(InVec, DemandedSrcElts, Depth + 1);
}
case ISD::EXTRACT_SUBVECTOR: {
// If we know the element index, just demand that subvector elements,
// otherwise demand them all.
SDValue Src = Op.getOperand(0);
ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) {
// Offset the demanded elts by the subvector index.
uint64_t Idx = SubIdx->getZExtValue();
APInt DemandedSrc = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
return ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
}
return ComputeNumSignBits(Src, Depth + 1);
}
case ISD::CONCAT_VECTORS: {
// Determine the minimum number of sign bits across all demanded
// elts of the input vectors. Early out if the result is already 1.
Tmp = std::numeric_limits<unsigned>::max();
EVT SubVectorVT = Op.getOperand(0).getValueType();
unsigned NumSubVectorElts = SubVectorVT.getVectorNumElements();
unsigned NumSubVectors = Op.getNumOperands();
for (unsigned i = 0; (i < NumSubVectors) && (Tmp > 1); ++i) {
// Slice out the demanded bits that belong to sub-vector i.
APInt DemandedSub = DemandedElts.lshr(i * NumSubVectorElts);
DemandedSub = DemandedSub.trunc(NumSubVectorElts);
if (!DemandedSub)
continue;
Tmp2 = ComputeNumSignBits(Op.getOperand(i), DemandedSub, Depth + 1);
Tmp = std::min(Tmp, Tmp2);
}
assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
return Tmp;
}
case ISD::INSERT_SUBVECTOR: {
// If we know the element index, demand any elements from the subvector and
// the remainder from the src its inserted into, otherwise demand them all.
SDValue Src = Op.getOperand(0);
SDValue Sub = Op.getOperand(1);
auto *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
if (SubIdx && SubIdx->getAPIntValue().ule(NumElts - NumSubElts)) {
Tmp = std::numeric_limits<unsigned>::max();
uint64_t Idx = SubIdx->getZExtValue();
// Demanded elements that come from the inserted sub-vector.
APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx);
if (!!DemandedSubElts) {
Tmp = ComputeNumSignBits(Sub, DemandedSubElts, Depth + 1);
if (Tmp == 1) return 1; // early-out
}
// Demanded elements that still come from the original source vector.
APInt SubMask = APInt::getBitsSet(NumElts, Idx, Idx + NumSubElts);
APInt DemandedSrcElts = DemandedElts & ~SubMask;
if (!!DemandedSrcElts) {
Tmp2 = ComputeNumSignBits(Src, DemandedSrcElts, Depth + 1);
Tmp = std::min(Tmp, Tmp2);
}
assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
return Tmp;
}
// Not able to determine the index so just assume worst case.
Tmp = ComputeNumSignBits(Sub, Depth + 1);
if (Tmp == 1) return 1; // early-out
Tmp2 = ComputeNumSignBits(Src, Depth + 1);
Tmp = std::min(Tmp, Tmp2);
assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
return Tmp;
}
}
// Generic tail: reached by every case above that did not return.
// If we are looking at the loaded value of the SDNode.
if (Op.getResNo() == 0) {
// Handle LOADX separately here. EXTLOAD case will fallthrough.
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op)) {
unsigned ExtType = LD->getExtensionType();
switch (ExtType) {
default: break;
case ISD::SEXTLOAD: // e.g. i16->i32 = '17' bits known.
Tmp = LD->getMemoryVT().getScalarSizeInBits();
return VTBits - Tmp + 1;
case ISD::ZEXTLOAD: // e.g. i16->i32 = '16' bits known.
Tmp = LD->getMemoryVT().getScalarSizeInBits();
return VTBits - Tmp;
case ISD::NON_EXTLOAD:
// A plain load of a constant the target can identify: inspect the
// constant's elements directly.
if (const Constant *Cst = TLI->getTargetConstantFromLoad(LD)) {
// We only need to handle vectors - computeKnownBits should handle
// scalar cases.
Type *CstTy = Cst->getType();
if (CstTy->isVectorTy() &&
(NumElts * VTBits) == CstTy->getPrimitiveSizeInBits()) {
Tmp = VTBits;
for (unsigned i = 0; i != NumElts; ++i) {
if (!DemandedElts[i])
continue;
if (Constant *Elt = Cst->getAggregateElement(i)) {
if (auto *CInt = dyn_cast<ConstantInt>(Elt)) {
const APInt &Value = CInt->getValue();
Tmp = std::min(Tmp, Value.getNumSignBits());
continue;
}
if (auto *CFP = dyn_cast<ConstantFP>(Elt)) {
APInt Value = CFP->getValueAPF().bitcastToAPInt();
Tmp = std::min(Tmp, Value.getNumSignBits());
continue;
}
}
// Unknown type. Conservatively assume no bits match sign bit.
return 1;
}
return Tmp;
}
}
break;
}
}
}
// Allow the target to implement this method for its nodes.
if (Opcode >= ISD::BUILTIN_OP_END ||
Opcode == ISD::INTRINSIC_WO_CHAIN ||
Opcode == ISD::INTRINSIC_W_CHAIN ||
Opcode == ISD::INTRINSIC_VOID) {
unsigned NumBits =
TLI->ComputeNumSignBitsForTargetNode(Op, DemandedElts, *this, Depth);
if (NumBits > 1)
FirstAnswer = std::max(FirstAnswer, NumBits);
}
// Finally, if we can prove that the top bits of the result are 0's or 1's,
// use this information.
KnownBits Known = computeKnownBits(Op, DemandedElts, Depth);
APInt Mask;
if (Known.isNonNegative()) { // sign bit is 0
Mask = Known.Zero;
} else if (Known.isNegative()) { // sign bit is 1;
Mask = Known.One;
} else {
// Nothing known.
return FirstAnswer;
}
// Okay, we know that the sign bit in Mask is set. Use CLZ to determine
// the number of identical bits in the top of the input value.
Mask = ~Mask;
Mask <<= Mask.getBitWidth()-VTBits;
// Return # leading zeros. We use 'min' here in case Val was zero before
// shifting. We don't want to return '64' as for an i32 "0".
return std::max(FirstAnswer, std::min(VTBits, Mask.countLeadingZeros()));
}
/// Return true if Op is an ADD, or an OR that passes the MaskedValueIsZero
/// check below, whose second operand is a ConstantSDNode.
bool SelectionDAG::isBaseWithConstantOffset(SDValue Op) const {
  const unsigned Opc = Op.getOpcode();

  // Only ADD and OR are candidates at all.
  if (Opc != ISD::ADD && Opc != ISD::OR)
    return false;

  // The offset must be a plain constant.
  if (!isa<ConstantSDNode>(Op.getOperand(1)))
    return false;

  // An ADD qualifies as is. An OR additionally requires that the bits of the
  // constant are known to be zero in the other operand.
  return Opc != ISD::OR ||
         MaskedValueIsZero(Op.getOperand(0), Op.getConstantOperandAPInt(1));
}
/// Return true if Op is known never to be a NaN. When SNaN is true the
/// question is narrowed to "never a signaling NaN".
///
/// \param Op    Value to analyse.
/// \param SNaN  If true, only signaling NaNs are of interest.
/// \param Depth Current recursion depth; the search stops at depth 6.
bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const {
  // A global or per-node no-NaNs promise settles the question immediately.
  if (getTarget().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs())
    return true;

  // Limit search depth.
  if (Depth == 6)
    return false;

  // TODO: Handle vectors.
  // A constant can be inspected directly: it qualifies if it is not a NaN, or
  // if only signaling NaNs matter and it is not signaling.
  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
    const auto &Val = C->getValueAPF();
    if (!Val.isNaN())
      return true;
    return SNaN && !Val.isSignaling();
  }

  // Ask the same question (same SNaN mode) about one of Op's operands.
  auto OperandNeverNaN = [&](unsigned Idx) {
    return isKnownNeverNaN(Op.getOperand(Idx), SNaN, Depth + 1);
  };

  const unsigned Opcode = Op.getOpcode();
  switch (Opcode) {
  // These are treated as never yielding a signaling NaN; nothing more is
  // claimed about their result.
  // TODO: Need isKnownNeverInfinity (arithmetic group).
  // TODO: Refine on operand (sqrt/log/pow group; sqrt needs known-positive).
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FSQRT:
  case ISD::FLOG:
  case ISD::FLOG2:
  case ISD::FLOG10:
  case ISD::FPOWI:
  case ISD::FPOW:
    return SNaN;

  // Treated as never yielding a signaling NaN; otherwise the answer is that
  // of the single input.
  case ISD::FCANONICALIZE:
  case ISD::FEXP:
  case ISD::FEXP2:
  case ISD::FTRUNC:
  case ISD::FFLOOR:
  case ISD::FCEIL:
  case ISD::FROUND:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FP_EXTEND:
  case ISD::FP_ROUND:
    return SNaN || OperandNeverNaN(0);

  // The answer is exactly that of operand 0.
  case ISD::FABS:
  case ISD::FNEG:
  case ISD::FCOPYSIGN:
  case ISD::EXTRACT_VECTOR_ELT:
    return OperandNeverNaN(0);

  // Either selected value may be the result.
  case ISD::SELECT:
    return OperandNeverNaN(1) && OperandNeverNaN(2);

  // Integer to floating point conversions are reported as never NaN.
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
    return true;

  case ISD::FMA:
  case ISD::FMAD:
    return SNaN ||
           (OperandNeverNaN(0) && OperandNeverNaN(1) && OperandNeverNaN(2));

  // Only one needs to be known not-nan, since it will be returned if the
  // other ends up being one.
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
    return OperandNeverNaN(0) || OperandNeverNaN(1);

  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE: {
    if (SNaN)
      return true;
    // This can return a NaN if either operand is an sNaN, or if both operands
    // are NaN. So one side must be fully NaN-free and the other sNaN-free.
    const SDValue LHS = Op.getOperand(0);
    const SDValue RHS = Op.getOperand(1);
    return (isKnownNeverNaN(LHS, false, Depth + 1) &&
            isKnownNeverSNaN(RHS, Depth + 1)) ||
           (isKnownNeverNaN(RHS, false, Depth + 1) &&
            isKnownNeverSNaN(LHS, Depth + 1));
  }

  // TODO: Does this quiet or return the original NaN as-is?
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
    return OperandNeverNaN(0) && OperandNeverNaN(1);

  default:
    // Target-specific nodes and intrinsics are delegated to the target.
    if (Opcode >= ISD::BUILTIN_OP_END ||
        Opcode == ISD::INTRINSIC_WO_CHAIN ||
        Opcode == ISD::INTRINSIC_W_CHAIN ||
        Opcode == ISD::INTRINSIC_VOID)
      return TLI->isKnownNeverNaNForTargetNode(Op, *this, SNaN, Depth);
    return false;
  }
}
/// Return true if the floating point value Op is known never to be zero.
/// Only scalar FP constants are recognised; anything else yields false.
bool SelectionDAG::isKnownNeverZeroFloat(SDValue Op) const {
  assert(Op.getValueType().isFloatingPoint() &&
         "Floating point type expected");

  // TODO: Add BuildVector support.
  const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op);
  return C && !C->isZero();
}
/// Return true if the non-floating-point value Op is known never to be zero.
/// A false result means "could not prove it".
bool SelectionDAG::isKnownNeverZero(SDValue Op) const {
  assert(!Op.getValueType().isFloatingPoint() &&
         "Floating point types unsupported - use isKnownNeverZeroFloat");

  // Constants: succeeds when matchUnaryPredicate accepts Op with a predicate
  // that rejects null constants.
  auto IsNonZero = [](ConstantSDNode *C) { return !C->isNullValue(); };
  if (ISD::matchUnaryPredicate(Op, IsNonZero))
    return true;

  // TODO: Recognize more cases here.
  // An OR is non-zero as soon as either input is known non-zero.
  if (Op.getOpcode() == ISD::OR)
    return isKnownNeverZero(Op.getOperand(1)) ||
           isKnownNeverZero(Op.getOperand(0));

  return false;
}
/// Return true if A and B are known to be equal. A false result means they
/// may or may not be equal.
bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const {
  // Identical values are trivially equal.
  if (A == B)
    return true;

  // Check for negative and positive zero: two FP constants that are both
  // zero are treated as equal.
  const ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A);
  if (!CA)
    return false;
  const ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B);
  return CB && CA->isZero() && CB->isZero();
}
// FIXME: unify with llvm::haveNoCommonBitsSet.
// FIXME: could also handle masked merge pattern (X & ~M) op (Y & M)
/// Return true if, for every bit position, at least one of A and B is known
/// to have that bit clear.
bool SelectionDAG::haveNoCommonBitsSet(SDValue A, SDValue B) const {
  assert(A.getValueType() == B.getValueType() &&
         "Values must have the same type");

  const KnownBits KnownA = computeKnownBits(A);
  const KnownBits KnownB = computeKnownBits(B);

  // Every position must be a known zero in A or in B.
  return (KnownA.Zero | KnownB.Zero).isAllOnesValue();
}
/// Try to simplify a BUILD_VECTOR of Ops with result type VT.
///
/// \returns UNDEF if all operands are undef, the source vector if the
/// operands are element-by-element extracts of a single vector of type VT in
/// order, and an empty SDValue when no fold applies.
static SDValue FoldBUILD_VECTOR(const SDLoc &DL, EVT VT,
                                ArrayRef<SDValue> Ops,
                                SelectionDAG &DAG) {
  const int NumOps = Ops.size();
  assert(NumOps != 0 && "Can't build an empty vector!");
  assert(VT.getVectorNumElements() == (unsigned)NumOps &&
         "Incorrect element count in BUILD_VECTOR!");

  // BUILD_VECTOR of UNDEFs is UNDEF.
  if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
    return DAG.getUNDEF(VT);

  // BUILD_VECTOR of seq extract/insert from the same vector + type is Identity.
  // Operand i must be (extract_vector_elt Src, i) with a constant index, and
  // Src must be the same VT-typed vector for every operand.
  SDValue Src;
  for (int Idx = 0; Idx != NumOps; ++Idx) {
    const SDValue Elt = Ops[Idx];
    if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    const SDValue Vec = Elt.getOperand(0);
    if (Vec.getValueType() != VT)
      return SDValue();
    if (Src && Vec != Src)
      return SDValue();

    const auto *CIdx = dyn_cast<ConstantSDNode>(Elt.getOperand(1));
    if (!CIdx || CIdx->getAPIntValue() != Idx)
      return SDValue();

    Src = Vec;
  }

  // Every operand matched, so the whole node is just the source vector.
  return Src;
}
/// Try to simplify vector concatenation to an input value, undef, or build
/// vector.
///
/// \returns The simplified value, or an empty SDValue when no fold applies.
static SDValue foldCONCAT_VECTORS(const SDLoc &DL, EVT VT,
                                  ArrayRef<SDValue> Ops,
                                  SelectionDAG &DAG) {
  assert(!Ops.empty() && "Can't concatenate an empty list of vectors!");
  assert(llvm::all_of(Ops,
                      [Ops](SDValue Op) {
                        return Ops[0].getValueType() == Op.getValueType();
                      }) &&
         "Concatenation of vectors with inconsistent value types!");
  assert((Ops.size() * Ops[0].getValueType().getVectorNumElements()) ==
             VT.getVectorNumElements() &&
         "Incorrect element count in vector concatenation!");

  // Concatenating a single vector is the vector itself.
  if (Ops.size() == 1)
    return Ops[0];

  // Concat of UNDEFs is UNDEF.
  if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
    return DAG.getUNDEF(VT);

  // Look for a concatenation that merely reassembles one source vector X:
  // concat (extract X, 0*subvec_elts), (extract X, 1*subvec_elts), ...
  SDValue CommonSrc;
  bool AllExtracts = true;
  for (unsigned Idx = 0, NumOps = Ops.size(); AllExtracts && Idx != NumOps;
       ++Idx) {
    SDValue Part = Ops[Idx];
    // Element offset at which this piece has to start inside the source.
    const unsigned ExpectedStart =
        Idx * Part.getValueType().getVectorNumElements();
    AllExtracts = Part.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
                  Part.getOperand(0).getValueType() == VT &&
                  (!CommonSrc || Part.getOperand(0) == CommonSrc) &&
                  isa<ConstantSDNode>(Part.getOperand(1)) &&
                  Part.getConstantOperandVal(1) == ExpectedStart;
    if (!AllExtracts)
      continue;
    assert((!CommonSrc || CommonSrc == Part.getOperand(0)) &&
           "Unexpected identity source vector for concat of extracts");
    CommonSrc = Part.getOperand(0);
  }
  if (AllExtracts) {
    assert(CommonSrc && "Failed to set source vector of extracts");
    return CommonSrc;
  }

  // A CONCAT_VECTOR with all UNDEF/BUILD_VECTOR operands can be
  // simplified to one big BUILD_VECTOR.
  // FIXME: Add support for SCALAR_TO_VECTOR as well.
  EVT SVT = VT.getScalarType();
  SmallVector<SDValue, 16> Elts;
  for (SDValue Op : Ops) {
    if (Op.isUndef()) {
      // An undef sub-vector contributes one undef scalar per element.
      Elts.append(Op.getValueType().getVectorNumElements(),
                  DAG.getUNDEF(SVT));
      continue;
    }
    if (Op.getOpcode() != ISD::BUILD_VECTOR)
      return SDValue();
    Elts.append(Op->op_begin(), Op->op_end());
  }

  // BUILD_VECTOR requires all inputs to be of the same type, find the
  // maximum type and extend them all.
  for (SDValue Elt : Elts) {
    const EVT EltVT = Elt.getValueType();
    if (SVT.bitsLT(EltVT))
      SVT = EltVT;
  }

  if (SVT.bitsGT(VT.getScalarType())) {
    const auto &TLI = DAG.getTargetLoweringInfo();
    for (SDValue &Elt : Elts) {
      // Zero-extend when the target reports that as free, else sign-extend.
      if (TLI.isZExtFree(Elt.getValueType(), SVT))
        Elt = DAG.getZExtOrTrunc(Elt, DL, SVT);
      else
        Elt = DAG.getSExtOrTrunc(Elt, DL, SVT);
    }
  }

  SDValue V = DAG.getBuildVector(VT, DL, Elts);
  NewSDValueDbgMsg(V, "New node fold concat vectors: ", &DAG);
  return V;
}
/// Gets or creates the specified node.
///
/// This overload builds an operand-less node with a single result type VT.
/// An existing identical node found via the CSE map is reused.
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT) {
  const auto VTs = getVTList(VT);

  // Build the folding-set key for (Opcode, VTs, no operands).
  FoldingSetNodeID ID;
  AddNodeIDNode(ID, Opcode, VTs, None);

  // Reuse a matching node if one already exists.
  void *InsertPos = nullptr;
  if (SDNode *Existing = FindNodeOrInsertPos(ID, DL, InsertPos))
    return SDValue(Existing, 0);

  // Otherwise create the node and register it with the CSE map and the DAG.
  auto *NewNode =
      newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
  CSEMap.InsertNode(NewNode, InsertPos);
  InsertNode(NewNode);

  SDValue Result = SDValue(NewNode, 0);
  NewSDValueDbgMsg(Result, "Creating new node: ", this);
  return Result;
}
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
SDValue Operand, const SDNodeFlags Flags) {
// Constant fold unary operations with an integer constant operand. Even
// opaque constant will be folded, because the folding of unary operations
// doesn't create new constants with different values. Nevertheless, the
// opaque flag is preserved during folding to prevent future folding with
// other constants.
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Operand)) {
const APInt &Val = C->getAPIntValue();
switch (Opcode) {
default: break;
case ISD::SIGN_EXTEND:
return getConstant(Val.sextOrTrunc(VT.getSizeInBits()), DL, VT,
C->isTargetOpcode(), C->isOpaque());
case ISD::TRUNCATE:
if (C->isOpaque())
break;
LLVM_FALLTHROUGH;
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND:
return getConstant(Val.zextOrTrunc(VT.getSizeInBits()), DL, VT,
C->isTargetOpcode(), C->isOpaque());
case ISD::UINT_TO_FP:
case ISD::SINT_TO_FP: {
APFloat apf(EVTToAPFloatSemantics(VT),
APInt::getNullValue(VT.getSizeInBits()));
(void)apf.convertFromAPInt(Val,
Opcode==ISD::SINT_TO_FP,
APFloat::rmNearestTiesToEven);
return getConstantFP(apf, DL, VT);
}
case ISD::BITCAST:
if (VT == MVT::f16 && C->getValueType(0) == MVT::i16)
return getConstantFP(APFloat(APFloat::IEEEhalf(), Val), DL, VT);
if (VT == MVT::f32 && C->getValueType(0) == MVT::i32)
return getConstantFP(APFloat(APFloat::IEEEsingle(), Val), DL, VT);
if (VT == MVT::f64 && C->getValueType(0) == MVT::i64)
return getConstantFP(APFloat(APFloat::IEEEdouble(), Val), DL, VT);
if (VT == MVT::f128 && C->getValueType(0) == MVT::i128)
return getConstantFP(APFloat(APFloat::IEEEquad(), Val), DL, VT);
break;
case ISD::ABS:
return getConstant(Val.abs(), DL, VT, C->isTargetOpcode(),
C->isOpaque());
case ISD::BITREVERSE:
return getConstant(Val.reverseBits(), DL, VT, C->isTargetOpcode(),
C->isOpaque());
case ISD::BSWAP:
return getConstant(Val.byteSwap(), DL, VT, C->isTargetOpcode(),
C->isOpaque());
case ISD::CTPOP:
return getConstant(Val.countPopulation(), DL, VT, C->isTargetOpcode(),
C->isOpaque());
case ISD::CTLZ:
case ISD::CTLZ_ZERO_UNDEF:
return getConstant(Val.countLeadingZeros(), DL, VT, C->isTargetOpcode(),
C->isOpaque());
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF:
return getConstant(Val.countTrailingZeros(), DL, VT, C->isTargetOpcode(),
C->isOpaque());
case ISD::FP16_TO_FP: {
bool Ignored;
APFloat FPV(APFloat::IEEEhalf(),
(Val.getBitWidth() == 16) ? Val : Val.trunc(16));
// This can return overflow, underflow, or inexact; we don't care.
// FIXME need to be more flexible about rounding mode.
(void)FPV.convert(EVTToAPFloatSemantics(VT),
APFloat::rmNearestTiesToEven, &Ignored);
return getConstantFP(FPV, DL, VT);
}
}
}
// Constant fold unary operations with a floating point constant operand.
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Operand)) {
APFloat V = C->getValueAPF(); // make copy
switch (Opcode) {
case ISD::FNEG:
V.changeSign();
return getConstantFP(V, DL, VT);
case ISD::FABS:
V.clearSign();
return getConstantFP(V, DL, VT);
case ISD::FCEIL: {
APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardPositive);
if (fs == APFloat::opOK || fs == APFloat::opInexact)
return getConstantFP(V, DL, VT);
break;
}
case ISD::FTRUNC: {
APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardZero);
if (fs == APFloat::opOK || fs == APFloat::opInexact)
return getConstantFP(V, DL, VT);
break;
}
case ISD::FFLOOR: {
APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardNegative);
if (fs == APFloat::opOK || fs == APFloat::opInexact)
return getConstantFP(V, DL, VT);
break;
}
case ISD::FP_EXTEND: {
bool ignored;
// This can return overflow, underflow, or inexact; we don't care.
// FIXME need to be more flexible about rounding mode.
(void)V.convert(EVTToAPFloatSemantics(VT),
APFloat::rmNearestTiesToEven, &ignored);
return getConstantFP(V, DL, VT);
}
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT: {
bool ignored;
APSInt IntVal(VT.getSizeInBits(), Opcode == ISD::FP_TO_UINT);
// FIXME need to be more flexible about rounding mode.
APFloat::opStatus s =
V.convertToInteger(IntVal, APFloat::rmTowardZero, &ignored);
if (s == APFloat::opInvalidOp) // inexact is OK, in fact usual
break;
return getConstant(IntVal, DL, VT);
}
case ISD::BITCAST:
if (VT == MVT::i16 && C->getValueType(0) == MVT::f16)
return getConstant((uint16_t)V.bitcastToAPInt().getZExtValue(), DL, VT);
else if (VT == MVT::i32 && C->getValueType(0) == MVT::f32)
return getConstant((uint32_t)V.bitcastToAPInt().getZExtValue(), DL, VT);
else if (VT == MVT::i64 && C->getValueType(0) == MVT::f64)
return getConstant(V.bitcastToAPInt().getZExtValue(), DL, VT);
break;
case ISD::FP_TO_FP16: {
bool Ignored;
// This can return overflow, underflow, or inexact; we don't care.
// FIXME need to be more flexible about rounding mode.
(void)V.convert(APFloat::IEEEhalf(),
APFloat::rmNearestTiesToEven, &Ignored);
return getConstant(V.bitcastToAPInt(), DL, VT);
}
}
}
// Constant fold unary operations with a vector integer or float operand.
if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Operand)) {
if (BV->isConstant()) {
switch (Opcode) {
default:
// FIXME: Entirely reasonable to perform folding of other unary
// operations here as the need arises.
break;
case ISD::FNEG:
case ISD::FABS:
case ISD::FCEIL:
case ISD::FTRUNC:
case ISD::FFLOOR:
case ISD::FP_EXTEND:
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
case ISD::TRUNCATE:
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::SIGN_EXTEND:
case ISD::UINT_TO_FP:
case ISD::SINT_TO_FP:
case ISD::ABS:
case ISD::BITREVERSE:
case ISD::BSWAP:
case ISD::CTLZ:
case ISD::CTLZ_ZERO_UNDEF:
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF:
case ISD::CTPOP: {
SDValue Ops = { Operand };
if (SDValue Fold = FoldConstantVectorArithmetic(Opcode, DL, VT, Ops))
return Fold;
}
}
}
}
unsigned OpOpcode = Operand.getNode()->getOpcode();
switch (Opcode) {
case ISD::TokenFactor:
case ISD::MERGE_VALUES:
case ISD::CONCAT_VECTORS:
return Operand; // Factor, merge or concat of one node? No need.
case ISD::BUILD_VECTOR: {
// Attempt to simplify BUILD_VECTOR.
SDValue Ops[] = {Operand};
if (SDValue V = FoldBUILD_VECTOR(DL, VT, Ops, *this))
return V;
break;
}
case ISD::FP_ROUND: llvm_unreachable("Invalid method to make FP_ROUND node");
case ISD::FP_EXTEND:
assert(VT.isFloatingPoint() &&
Operand.getValueType().isFloatingPoint() && "Invalid FP cast!");
if (Operand.getValueType() == VT) return Operand; // noop conversion.
assert((!VT.isVector() ||
VT.getVectorNumElements() ==
Operand.getValueType().getVectorNumElements()) &&
"Vector element count mismatch!");
assert(Operand.getValueType().bitsLT(VT) &&
"Invalid fpext node, dst < src!");
if (Operand.isUndef())
return getUNDEF(VT);
break;
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
if (Operand.isUndef())
return getUNDEF(VT);
break;
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
// [us]itofp(undef) = 0, because the result value is bounded.
if (Operand.isUndef())
return getConstantFP(0.0, DL, VT);
break;
case ISD::SIGN_EXTEND:
assert(VT.isInteger() && Operand.getValueType().isInteger() &&
"Invalid SIGN_EXTEND!");
assert(VT.isVector() == Operand.getValueType().isVector() &&
"SIGN_EXTEND result type type should be vector iff the operand "
"type is vector!");
if (Operand.getValueType() == VT) return Operand; // noop extension
assert((!VT.isVector() ||
VT.getVectorNumElements() ==
Operand.getValueType().getVectorNumElements()) &&
"Vector element count mismatch!");
assert(Operand.getValueType().bitsLT(VT) &&
"Invalid sext node, dst < src!");
if (OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ZERO_EXTEND)
return getNode(OpOpcode, DL, VT, Operand.getOperand(0));
else if (OpOpcode == ISD::UNDEF)
// sext(undef) = 0, because the top bits will all be the same.
return getConstant(0, DL, VT);
break;
case ISD::ZERO_EXTEND:
assert(VT.isInteger() && Operand.getValueType().isInteger() &&
"Invalid ZERO_EXTEND!");
assert(VT.isVector() == Operand.getValueType().isVector() &&
"ZERO_EXTEND result type type should be vector iff the operand "
"type is vector!");
if (Operand.getValueType() == VT) return Operand; // noop extension
assert((!VT.isVector() ||
VT.getVectorNumElements() ==
Operand.getValueType().getVectorNumElements()) &&
"Vector element count mismatch!");
assert(Operand.getValueType().bitsLT(VT) &&
"Invalid zext node, dst < src!");
if (OpOpcode == ISD::ZERO_EXTEND) // (zext (zext x)) -> (zext x)
return getNode(ISD::ZERO_EXTEND, DL, VT, Operand.getOperand(0));
else if (OpOpcode == ISD::UNDEF)
// zext(undef) = 0, because the top bits will be zero.
return getConstant(0, DL, VT);
break;
case ISD::ANY_EXTEND:
assert(VT.isInteger() && Operand.getValueType().isInteger() &&
"Invalid ANY_EXTEND!");
assert(VT.isVector() == Operand.getValueType().isVector() &&
"ANY_EXTEND result type type should be vector iff the operand "
"type is vector!");
if (Operand.getValueType() == VT) return Operand; // noop extension
assert((!VT.isVector() ||
VT.getVectorNumElements() ==
Operand.getValueType().getVectorNumElements()) &&
"Vector element count mismatch!");
assert(Operand.getValueType().bitsLT(VT) &&
"Invalid anyext node, dst < src!");
if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND ||
OpOpcode == ISD::ANY_EXTEND)
// (ext (zext x)) -> (zext x) and (ext (sext x)) -> (sext x)
return getNode(OpOpcode, DL, VT, Operand.getOperand(0));
else if (OpOpcode == ISD::UNDEF)
return getUNDEF(VT);
// (ext (trunc x)) -> x
if (OpOpcode == ISD::TRUNCATE) {
SDValue OpOp = Operand.getOperand(0);
if (OpOp.getValueType() == VT) {
transferDbgValues(Operand, OpOp);
return OpOp;
}
}
break;
case ISD::TRUNCATE:
assert(VT.isInteger() && Operand.getValueType().isInteger() &&
"Invalid TRUNCATE!");
assert(VT.isVector() == Operand.getValueType().isVector() &&
"TRUNCATE result type type should be vector iff the operand "
"type is vector!");
if (Operand.getValueType() == VT) return Operand; // noop truncate
assert((!VT.isVector() ||
VT.getVectorNumElements() ==
Operand.getValueType().getVectorNumElements()) &&
"Vector element count mismatch!");
assert(Operand.getValueType().bitsGT(VT) &&
"Invalid truncate node, src < dst!");
if (OpOpcode == ISD::TRUNCATE)
return getNode(ISD::TRUNCATE, DL, VT, Operand.getOperand(0));
if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND ||
OpOpcode == ISD::ANY_EXTEND) {
// If the source is smaller than the dest, we still need an extend.
if (Operand.getOperand(0).getValueType().getScalarType()
.bitsLT(VT.getScalarType()))
return getNode(OpOpcode, DL, VT, Operand.getOperand(0));
if (Operand.getOperand(0).getValueType().bitsGT(VT))
return getNode(ISD::TRUNCATE, DL, VT, Operand.getOperand(0));
return Operand.getOperand(0);
}
if (OpOpcode == ISD::UNDEF)
return getUNDEF(VT);
break;
case ISD::ANY_EXTEND_VECTOR_INREG:
case ISD::ZERO_EXTEND_VECTOR_INREG:
case ISD::SIGN_EXTEND_VECTOR_INREG:
assert(VT.isVector() && "This DAG node is restricted to vector types.");
assert(Operand.getValueType().bitsLE(VT) &&
"The input must be the same size or smaller than the result.");
assert(VT.getVectorNumElements() <
Operand.getValueType().getVectorNumElements() &&
"The destination vector type must have fewer lanes than the input.");
break;
case ISD::ABS:
assert(VT.isInteger() && VT == Operand.getValueType() &&
"Invalid ABS!");
if (OpOpcode == ISD::UNDEF)
return getUNDEF(VT);
break;
case ISD::BSWAP:
assert(VT.isInteger() && VT == Operand.getValueType() &&
"Invalid BSWAP!");
assert((VT.getScalarSizeInBits() % 16 == 0) &&
"BSWAP types must be a multiple of 16 bits!");
if (OpOpcode == ISD::UNDEF)
return getUNDEF(VT);
break;
case ISD::BITREVERSE:
assert(VT.isInteger() && VT == Operand.getValueType() &&
"Invalid BITREVERSE!");
if (OpOpcode == ISD::UNDEF)
return getUNDEF(VT);
break;
case ISD::BITCAST:
// Basic sanity checking.
assert(VT.getSizeInBits() == Operand.getValueSizeInBits() &&
"Cannot BITCAST between types of different sizes!");
if (VT == Operand.getValueType()) return Operand; // noop conversion.
if (OpOpcode == ISD::BITCAST) // bitconv(bitconv(x)) -> bitconv(x)
return getNode(ISD::BITCAST, DL, VT, Operand.getOperand(0));
if (OpOpcode == ISD::UNDEF)
return getUNDEF(VT);
break;
case ISD::SCALAR_TO_VECTOR:
assert(VT.isVector() && !Operand.getValueType().isVector() &&
(VT.getVectorElementType() == Operand.getValueType() ||
(VT.getVectorElementType().isInteger() &&
Operand.getValueType().isInteger() &&
VT.getVectorElementType().bitsLE(Operand.getValueType()))) &&
"Illegal SCALAR_TO_VECTOR node!");
if (OpOpcode == ISD::UNDEF)
return getUNDEF(VT);
// scalar_to_vector(extract_vector_elt V, 0) -> V, top bits are undefined.
if (OpOpcode == ISD::EXTRACT_VECTOR_ELT &&
isa<ConstantSDNode>(Operand.getOperand(1)) &&
Operand.getConstantOperandVal(1) == 0 &&
Operand.getOperand(0).getValueType() == VT)
return Operand.getOperand(0);
break;
case ISD::FNEG:
// Negation of an unknown bag of bits is still completely undefined.
if (OpOpcode == ISD::UNDEF)
return getUNDEF(VT);
// -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0
if ((getTarget().Options.UnsafeFPMath || Flags.hasNoSignedZeros()) &&
OpOpcode == ISD::FSUB)
return getNode(ISD::FSUB, DL, VT, Operand.getOperand(1),
Operand.getOperand(0), Flags);
if (OpOpcode == ISD::FNEG) // --X -> X
return Operand.getOperand(0);
break;
case ISD::FABS:
if (OpOpcode == ISD::FNEG) // abs(-X) -> abs(X)
return getNode(ISD::FABS, DL, VT, Operand.getOperand(0));
break;
}
SDNode *N;
SDVTList VTs = getVTList(VT);
SDValue Ops[] = {Operand};
if (VT != MVT::Glue) { // Don't CSE flag producing nodes
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opcode, VTs, Ops);
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) {
E->intersectFlagsWith(Flags);
return SDValue(E, 0);
}
N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
N->setFlags(Flags);
createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
} else {
N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
createOperands(N, Ops);
}
InsertNode(N);
SDValue V = SDValue(N, 0);
NewSDValueDbgMsg(V, "Creating new node: ", this);
return V;
}
static std::pair<APInt, bool> FoldValue(unsigned Opcode, const APInt &C1,
const APInt &C2) {
switch (Opcode) {
case ISD::ADD: return std::make_pair(C1 + C2, true);
case ISD::SUB: return std::make_pair(C1 - C2, true);
case ISD::MUL: return std::make_pair(C1 * C2, true);
case ISD::AND: return std::make_pair(C1 & C2, true);
case ISD::OR: return std::make_pair(C1 | C2, true);
case ISD::XOR: return std::make_pair(C1 ^ C2, true);
case ISD::SHL: return std::make_pair(C1 << C2, true);
case ISD::SRL: return std::make_pair(C1.lshr(C2), true);
case ISD::SRA: return std::make_pair(C1.ashr(C2), true);
case ISD::ROTL: return std::make_pair(C1.rotl(C2), true);
case ISD::ROTR: return std::make_pair(C1.rotr(C2), true);
case ISD::SMIN: return std::make_pair(C1.sle(C2) ? C1 : C2, true);
case ISD::SMAX: return std::make_pair(C1.sge(C2) ? C1 : C2, true);
case ISD::UMIN: return std::make_pair(C1.ule(C2) ? C1 : C2, true);
case ISD::UMAX: return std::make_pair(C1.uge(C2) ? C1 : C2, true);
case ISD::SADDSAT: return std::make_pair(C1.sadd_sat(C2), true);
case ISD::UADDSAT: return std::make_pair(C1.uadd_sat(C2), true);
case ISD::SSUBSAT: return std::make_pair(C1.ssub_sat(C2), true);
case ISD::USUBSAT: return std::make_pair(C1.usub_sat(C2), true);
case ISD::UDIV:
if (!C2.getBoolValue())
break;
return std::make_pair(C1.udiv(C2), true);
case ISD::UREM:
if (!C2.getBoolValue())
break;
return std::make_pair(C1.urem(C2), true);
case ISD::SDIV:
if (!C2.getBoolValue())
break;
return std::make_pair(C1.sdiv(C2), true);
case ISD::SREM:
if (!C2.getBoolValue())
break;
return std::make_pair(C1.srem(C2), true);
}
return std::make_pair(APInt(1, 0), false);
}
/// Fold the binary operation \p Opcode on the two scalar integer constants
/// \p C1 and \p C2.
///
/// \returns a constant of type \p VT holding the folded value, or an empty
/// SDValue when either operand is opaque or FoldValue does not fold
/// \p Opcode for these operands.
SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
                                             EVT VT, const ConstantSDNode *C1,
                                             const ConstantSDNode *C2) {
  // Opaque constants are left alone.
  const bool AnyOpaque = C1->isOpaque() || C2->isOpaque();
  if (!AnyOpaque) {
    const std::pair<APInt, bool> Result =
        FoldValue(Opcode, C1->getAPIntValue(), C2->getAPIntValue());
    // The flag reports whether FoldValue produced a value.
    if (Result.second)
      return getConstant(Result.first, DL, VT);
  }
  return SDValue();
}
/// Try to fold an ADD or SUB of the constant \p N2 into the offset of the
/// global address \p GA.
///
/// \returns a global address node of type \p VT carrying the adjusted
/// offset, or an empty SDValue when \p GA is not an ISD::GlobalAddress node,
/// the target lowering rejects offset folding for it, \p N2 is not a
/// ConstantSDNode, or \p Opcode is neither ISD::ADD nor ISD::SUB.
SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT,
                                       const GlobalAddressSDNode *GA,
                                       const SDNode *N2) {
  if (GA->getOpcode() != ISD::GlobalAddress || !TLI->isOffsetFoldingLegal(GA))
    return SDValue();

  const auto *Addend = dyn_cast<ConstantSDNode>(N2);
  if (!Addend)
    return SDValue();

  // Work in unsigned arithmetic so that negation and addition wrap around
  // instead of overflowing a signed integer.
  uint64_t Delta = static_cast<uint64_t>(Addend->getSExtValue());
  if (Opcode == ISD::SUB)
    Delta = 0 - Delta;
  else if (Opcode != ISD::ADD)
    return SDValue();

  return getGlobalAddress(GA->getGlobal(), SDLoc(Addend), VT,
                          GA->getOffset() + Delta);
}
bool SelectionDAG::isUndef(unsigned Opcode, ArrayRef<SDValue> Ops) {
switch (Opcode) {
case ISD::SDIV:
case ISD::UDIV:
case ISD::SREM:
case ISD::UREM: {
// If a divisor is zero/undef or any element of a divisor vector is
// zero/undef, the whole op is undef.
assert(Ops.size() == 2 && "Div/rem should have 2 operands");
SDValue Divisor = Ops[1];
if (Divisor.isUndef() || isNullConstant(Divisor))
return true;
return ISD::isBuildVectorOfConstantSDNodes(Divisor.getNode()) &&
llvm::any_of(Divisor->op_values(),
[](SDValue V) { return V.isUndef() ||
isNullConstant(V); });
// TODO: Handle signed overflow.
}
// TODO: Handle oversized shifts.
default:
return false;
}
}
/// Try to constant fold the binary operation \p Opcode applied to the nodes
/// \p N1 and \p N2.
///
/// The cases handled, in order, are: operations recognised by isUndef, two
/// scalar integer constants, a global address combined with a constant
/// offset, and build vectors (either of which may instead be undef) folded
/// one element at a time.
///
/// \returns the folded value of type \p VT, or an empty SDValue when nothing
/// could be folded.
SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
                                             EVT VT, SDNode *N1, SDNode *N2) {
  // Target-specific opcodes may not follow the operand rules assumed below,
  // so they are never folded here.
  if (Opcode >= ISD::BUILTIN_OP_END)
    return SDValue();

  if (isUndef(Opcode, {SDValue(N1, 0), SDValue(N2, 0)}))
    return getUNDEF(VT);

  // Scalar constant with scalar constant.
  ConstantSDNode *LHSC = dyn_cast<ConstantSDNode>(N1);
  ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(N2);
  if (LHSC && RHSC) {
    SDValue Folded = FoldConstantArithmetic(Opcode, DL, VT, LHSC, RHSC);
    assert((!Folded || !VT.isVector()) &&
           "Can't fold vectors ops with scalar operands");
    return Folded;
  }

  // fold (add Sym, c) -> Sym+c
  if (auto *Sym = dyn_cast<GlobalAddressSDNode>(N1))
    return FoldSymbolOffset(Opcode, VT, Sym, N2);
  if (TLI->isCommutativeBinOp(Opcode)) {
    if (auto *Sym = dyn_cast<GlobalAddressSDNode>(N2))
      return FoldSymbolOffset(Opcode, VT, Sym, N1);
  }

  // From here on only vectors are handled. Each operand has to be a build
  // vector or undef, and at least one of them has to be a build vector.
  auto *LHSVec = dyn_cast<BuildVectorSDNode>(N1);
  auto *RHSVec = dyn_cast<BuildVectorSDNode>(N2);
  const bool LHSUsable = LHSVec || N1->isUndef();
  const bool RHSUsable = RHSVec || N2->isUndef();
  if (!LHSUsable || !RHSUsable || (!LHSVec && !RHSVec))
    return SDValue();

  assert((!LHSVec || !RHSVec ||
          LHSVec->getNumOperands() == RHSVec->getNumOperands()) &&
         "Vector binop with different number of elements in operands?");

  // When new nodes must have legal types, integer element results are
  // widened to the type the target transforms the element type to. Give up
  // if that type is narrower than the element type.
  EVT EltVT = VT.getScalarType();
  EVT LegalEltVT = EltVT;
  if (NewNodesMustHaveLegalTypes && LegalEltVT.isInteger()) {
    LegalEltVT = TLI->getTypeToTransformTo(*getContext(), LegalEltVT);
    if (LegalEltVT.bitsLT(EltVT))
      return SDValue();
  }

  SmallVector<SDValue, 4> Outputs;
  const unsigned NumOps =
      LHSVec ? LHSVec->getNumOperands() : RHSVec->getNumOperands();
  for (unsigned Lane = 0; Lane != NumOps; ++Lane) {
    // An undef operand contributes an undef element to every lane.
    SDValue LHSElt = LHSVec ? LHSVec->getOperand(Lane) : getUNDEF(EltVT);
    SDValue RHSElt = RHSVec ? RHSVec->getOperand(Lane) : getUNDEF(EltVT);

    // Integer build vector operands may be wider than the element type;
    // truncate them before folding.
    if (EltVT.isInteger()) {
      if (LHSElt->getValueType(0).bitsGT(EltVT))
        LHSElt = getNode(ISD::TRUNCATE, DL, EltVT, LHSElt);
      if (RHSElt->getValueType(0).bitsGT(EltVT))
        RHSElt = getNode(ISD::TRUNCATE, DL, EltVT, RHSElt);
    }

    if (LHSElt->getValueType(0) != EltVT || RHSElt->getValueType(0) != EltVT)
      return SDValue();

    // Fold this lane, then widen the result when a legal type is required.
    SDValue LaneResult = getNode(Opcode, DL, EltVT, LHSElt, RHSElt);
    if (LegalEltVT != EltVT)
      LaneResult = getNode(ISD::SIGN_EXTEND, DL, LegalEltVT, LaneResult);

    // The lane only counts as folded if it became a constant or UNDEF.
    const unsigned ResultOpc = LaneResult.getOpcode();
    const bool IsFoldedLeaf = LaneResult.isUndef() ||
                              ResultOpc == ISD::Constant ||
                              ResultOpc == ISD::ConstantFP;
    if (!IsFoldedLeaf)
      return SDValue();

    Outputs.push_back(LaneResult);
  }

  assert(VT.getVectorNumElements() == Outputs.size() &&
         "Vector size mismatch!");

  // We may have a vector type but a scalar result. Create a splat.
  Outputs.resize(VT.getVectorNumElements(), Outputs.back());

  // Assemble the per-lane results into the final vector.
  return getBuildVector(VT, SDLoc(), Outputs);
}
// TODO: Merge with FoldConstantArithmetic
/// Try to constant fold the operation \p Opcode with operands \p Ops into a
/// build vector of type \p VT by folding each lane separately.
///
/// Every operand must be UNDEF, a CONDCODE node, or a constant build vector,
/// and any vector operand must have as many elements as \p VT. \p Flags is
/// passed on to the per-lane getNode call.
///
/// \returns the folded build vector (or UNDEF when isUndef reports the
/// operation as undefined), or an empty SDValue when folding is not possible.
SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode,
                                                   const SDLoc &DL, EVT VT,
                                                   ArrayRef<SDValue> Ops,
                                                   const SDNodeFlags Flags) {
  // Target-specific opcodes may not follow the operand rules assumed below,
  // so they are never folded here.
  if (Opcode >= ISD::BUILTIN_OP_END)
    return SDValue();

  if (isUndef(Opcode, Ops))
    return getUNDEF(VT);

  // We can only fold vectors - maybe merge with FoldConstantArithmetic someday?
  if (!VT.isVector())
    return SDValue();

  const unsigned NumLanes = VT.getVectorNumElements();

  // An operand is acceptable when it is UNDEF, a condition code or a
  // constant build vector, and - if it is a vector - has NumLanes elements.
  auto IsFoldableOperand = [NumLanes](const SDValue &Op) {
    BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op);
    const bool IsConstantLike = Op.isUndef() ||
                                Op.getOpcode() == ISD::CONDCODE ||
                                (BV && BV->isConstant());
    if (!IsConstantLike)
      return false;
    const EVT OpVT = Op.getValueType();
    return !OpVT.isVector() || OpVT.getVectorNumElements() == NumLanes;
  };
  if (!llvm::all_of(Ops, IsFoldableOperand))
    return SDValue();

  // A vector comparison is folded lane by lane to an i1, which is then
  // sign-extended back to the legal result type.
  EVT FoldEltVT = VT.getScalarType();
  if (Opcode == ISD::SETCC)
    FoldEltVT = MVT::i1;

  // Find the legal integer scalar type used for constant promotion and make
  // sure it is at least as wide as the result element type.
  EVT LegalEltVT = VT.getScalarType();
  if (NewNodesMustHaveLegalTypes && LegalEltVT.isInteger()) {
    LegalEltVT = TLI->getTypeToTransformTo(*getContext(), LegalEltVT);
    if (LegalEltVT.bitsLT(VT.getScalarType()))
      return SDValue();
  }

  // Fold one lane at a time.
  SmallVector<SDValue, 4> LaneResults;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    // Collect the scalar operands for this lane.
    SmallVector<SDValue, 4> LaneOps;
    for (SDValue Op : Ops) {
      EVT OpEltVT = Op.getValueType().getScalarType();
      if (BuildVectorSDNode *OpBV = dyn_cast<BuildVectorSDNode>(Op)) {
        SDValue Elt = OpBV->getOperand(Lane);
        EVT EltVT = Elt.getValueType();
        // Integer build vector operands may be implicitly truncated; make
        // the truncation explicit before folding.
        if (EltVT.isInteger() && EltVT.bitsGT(OpEltVT))
          Elt = getNode(ISD::TRUNCATE, DL, OpEltVT, Elt);
        LaneOps.push_back(Elt);
      } else if (Op.isUndef()) {
        // An UNDEF operand contributes a scalar UNDEF to every lane.
        LaneOps.push_back(getUNDEF(OpEltVT));
      } else {
        // Anything else that passed the operand check is used unchanged.
        LaneOps.push_back(Op);
      }
    }

    // Fold this lane, then legalize the scalar result if necessary.
    SDValue LaneResult = getNode(Opcode, DL, FoldEltVT, LaneOps, Flags);
    if (LegalEltVT != FoldEltVT)
      LaneResult = getNode(ISD::SIGN_EXTEND, DL, LegalEltVT, LaneResult);

    // The lane only counts as folded if it became a constant or UNDEF.
    const unsigned ResultOpc = LaneResult.getOpcode();
    const bool IsFoldedLeaf = LaneResult.isUndef() ||
                              ResultOpc == ISD::Constant ||
                              ResultOpc == ISD::ConstantFP;
    if (!IsFoldedLeaf)
      return SDValue();

    LaneResults.push_back(LaneResult);
  }

  SDValue V = getBuildVector(VT, DL, LaneResults);
  NewSDValueDbgMsg(V, "New node fold constant vector: ", this);
  return V;
}
/// Try to constant fold the floating-point operation \p Opcode on \p N1 and
/// \p N2.
///
/// Handles FADD, FSUB, FMUL, FDIV, FREM and FCOPYSIGN of two FP constants,
/// FP_ROUND of an FP constant, and FADD/FSUB/FMUL/FDIV/FREM with undef
/// operands.
///
/// \returns the folded value of type \p VT, or an empty SDValue when none of
/// these cases applies.
SDValue SelectionDAG::foldConstantFPMath(unsigned Opcode, const SDLoc &DL,
                                         EVT VT, SDValue N1, SDValue N2) {
  // TODO: We don't do any constant folding for strict FP opcodes here, but we
  //       should. That will require dealing with a potentially non-default
  //       rounding mode, checking the "opStatus" return value from the APFloat
  //       math calculations, and possibly other variations.
  auto *LHSC = dyn_cast<ConstantFPSDNode>(N1.getNode());
  auto *RHSC = dyn_cast<ConstantFPSDNode>(N2.getNode());

  // Both operands are FP constants: compute the result on a copy of the
  // left-hand value.
  if (LHSC && RHSC) {
    APFloat Result = LHSC->getValueAPF();
    const APFloat &RHS = RHSC->getValueAPF();
    bool Handled = true;
    switch (Opcode) {
    case ISD::FADD:
      Result.add(RHS, APFloat::rmNearestTiesToEven);
      break;
    case ISD::FSUB:
      Result.subtract(RHS, APFloat::rmNearestTiesToEven);
      break;
    case ISD::FMUL:
      Result.multiply(RHS, APFloat::rmNearestTiesToEven);
      break;
    case ISD::FDIV:
      Result.divide(RHS, APFloat::rmNearestTiesToEven);
      break;
    case ISD::FREM:
      Result.mod(RHS);
      break;
    case ISD::FCOPYSIGN:
      Result.copySign(RHS);
      break;
    default:
      Handled = false;
      break;
    }
    if (Handled)
      return getConstantFP(Result, DL, VT);
  }

  // Rounding an FP constant to the semantics of VT.
  if (LHSC && Opcode == ISD::FP_ROUND) {
    APFloat Rounded = LHSC->getValueAPF(); // work on a copy
    bool Ignored;
    // This can return overflow, underflow, or inexact; we don't care.
    // FIXME need to be more flexible about rounding mode.
    (void)Rounded.convert(EVTToAPFloatSemantics(VT),
                          APFloat::rmNearestTiesToEven, &Ignored);
    return getConstantFP(Rounded, DL, VT);
  }

  const bool IsFPArithmetic = Opcode == ISD::FADD || Opcode == ISD::FSUB ||
                              Opcode == ISD::FMUL || Opcode == ISD::FDIV ||
                              Opcode == ISD::FREM;
  if (IsFPArithmetic) {
    // Two undef operands give undef; exactly one undef operand gives NaN.
    // This should match the behavior of the IR optimizer.
    const bool LHSUndef = N1.isUndef();
    const bool RHSUndef = N2.isUndef();
    if (LHSUndef && RHSUndef)
      return getUNDEF(VT);
    if (LHSUndef || RHSUndef)
      return getConstantFP(APFloat::getNaN(EVTToAPFloatSemantics(VT)), DL, VT);
  }

  return SDValue();
}
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
SDValue N1, SDValue N2, const SDNodeFlags Flags) {
ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2);
ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
// Canonicalize constant to RHS if commutative.
if (TLI->isCommutativeBinOp(Opcode)) {
if (N1C && !N2C) {
std::swap(N1C, N2C);
std::swap(N1, N2);
} else if (N1CFP && !N2CFP) {
std::swap(N1CFP, N2CFP);
std::swap(N1, N2);
}
}
switch (Opcode) {
default: break;
case ISD::TokenFactor:
assert(VT == MVT::Other && N1.getValueType() == MVT::Other &&
N2.getValueType() == MVT::Other && "Invalid token factor!");
// Fold trivial token factors.
if (N1.getOpcode() == ISD::EntryToken) return N2;
if (N2.getOpcode() == ISD::EntryToken) return N1;
if (N1 == N2) return N1;
break;
case ISD::BUILD_VECTOR: {
// Attempt to simplify BUILD_VECTOR.
SDValue Ops[] = {N1, N2};
if (SDValue V = FoldBUILD_VECTOR(DL, VT, Ops, *this))
return V;
break;
}
case ISD::CONCAT_VECTORS: {
SDValue Ops[] = {N1, N2};
if (SDValue V = foldCONCAT_VECTORS(DL, VT, Ops, *this))
return V;
break;
}
case ISD::AND:
assert(VT.isInteger() && "This operator does not apply to FP types!");
assert(N1.getValueType() == N2.getValueType() &&
N1.getValueType() == VT && "Binary operator types must match!");
// (X & 0) -> 0. This commonly occurs when legalizing i64 values, so it's
// worth handling here.
if (N2C && N2C->isNullValue())
return N2;
if (N2C && N2C->isAllOnesValue()) // X & -1 -> X
return N1;
break;
case ISD::OR:
case ISD::XOR:
case ISD::ADD:
case ISD::SUB:
assert(VT.isInteger() && "This operator does not apply to FP types!");
assert(N1.getValueType() == N2.getValueType() &&
N1.getValueType() == VT && "Binary operator types must match!");
// (X ^|+- 0) -> X. This commonly occurs when legalizing i64 values, so
// it's worth handling here.
if (N2C && N2C->isNullValue())
return N1;
break;
case ISD::UDIV:
case ISD::UREM:
case ISD::MULHU:
case ISD::MULHS:
case ISD::MUL:
case ISD::SDIV:
case ISD::SREM:
case ISD::SMIN:
case ISD::SMAX:
case ISD::UMIN:
case ISD::UMAX:
case ISD::SADDSAT:
case ISD::SSUBSAT:
case ISD::UADDSAT:
case ISD::USUBSAT:
assert(VT.isInteger() && "This operator does not apply to FP types!");
assert(N1.getValueType() == N2.getValueType() &&
N1.getValueType() == VT && "Binary operator types must match!");
break;
case ISD::FADD:
case ISD::FSUB:
case ISD::FMUL:
case ISD::FDIV:
case ISD::FREM:
assert(VT.isFloatingPoint() && "This operator only applies to FP types!");
assert(N1.getValueType() == N2.getValueType() &&
N1.getValueType() == VT && "Binary operator types must match!");
if (SDValue V = simplifyFPBinop(Opcode, N1, N2))
return V;
break;
case ISD::FCOPYSIGN: // N1 and result must match. N1/N2 need not match.
assert(N1.getValueType() == VT &&
N1.getValueType().isFloatingPoint() &&
N2.getValueType().isFloatingPoint() &&
"Invalid FCOPYSIGN!");
break;
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
if (SDValue V = simplifyShift(N1, N2))
return V;
LLVM_FALLTHROUGH;
case ISD::ROTL:
case ISD::ROTR:
assert(VT == N1.getValueType() &&
"Shift operators return type must be the same as their first arg");
assert(VT.isInteger() && N2.getValueType().isInteger() &&
"Shifts only work on integers");
assert((!VT.isVector() || VT == N2.getValueType()) &&
"Vector shift amounts must be in the same as their first arg");
// Verify that the shift amount VT is big enough to hold valid shift
// amounts. This catches things like trying to shift an i1024 value by an
// i8, which is easy to fall into in generic code that uses
// TLI.getShiftAmount().
assert(N2.getValueSizeInBits() >= Log2_32_Ceil(N1.getValueSizeInBits()) &&
"Invalid use of small shift amount with oversized value!");
// Always fold shifts of i1 values so the code generator doesn't need to
// handle them. Since we know the size of the shift has to be less than the
// size of the value, the shift/rotate count is guaranteed to be zero.
if (VT == MVT::i1)
return N1;
if (N2C && N2C->isNullValue())
return N1;
break;
case ISD::FP_ROUND_INREG: {
EVT EVT = cast<VTSDNode>(N2)->getVT();
assert(VT == N1.getValueType() && "Not an inreg round!");
assert(VT.isFloatingPoint() && EVT.isFloatingPoint() &&
"Cannot FP_ROUND_INREG integer types");
assert(EVT.isVector() == VT.isVector() &&
"FP_ROUND_INREG type should be vector iff the operand "
"type is vector!");
assert((!EVT.isVector() ||
EVT.getVectorNumElements() == VT.getVectorNumElements()) &&
"Vector element counts must match in FP_ROUND_INREG");
assert(EVT.bitsLE(VT) && "Not rounding down!");
(void)EVT;
if (cast<VTSDNode>(N2)->getVT() == VT) return N1; // Not actually rounding.
break;
}
case ISD::FP_ROUND:
assert(VT.isFloatingPoint() &&
N1.getValueType().isFloatingPoint() &&
VT.bitsLE(N1.getValueType()) &&
N2C && (N2C->getZExtValue() == 0 || N2C->getZExtValue() == 1) &&
"Invalid FP_ROUND!");
if (N1.getValueType() == VT) return N1; // noop conversion.
break;
case ISD::AssertSext:
case ISD::AssertZext: {
EVT EVT = cast<VTSDNode>(N2)->getVT();
assert(VT == N1.getValueType() && "Not an inreg extend!");
assert(VT.isInteger() && EVT.isInteger() &&
"Cannot *_EXTEND_INREG FP types");
assert(!EVT.isVector() &&
"AssertSExt/AssertZExt type should be the vector element type "
"rather than the vector type!");
assert(EVT.bitsLE(VT.getScalarType()) && "Not extending!");
if (VT.getScalarType() == EVT) return N1; // noop assertion.
break;
}
case ISD::SIGN_EXTEND_INREG: {
EVT EVT = cast<VTSDNode>(N2)->getVT();
assert(VT == N1.getValueType() && "Not an inreg extend!");
assert(VT.isInteger() && EVT.isInteger() &&
"Cannot *_EXTEND_INREG FP types");
assert(EVT.isVector() == VT.isVector() &&
"SIGN_EXTEND_INREG type should be vector iff the operand "
"type is vector!");
assert((!EVT.isVector() ||
EVT.getVectorNumElements() == VT.getVectorNumElements()) &&
"Vector element counts must match in SIGN_EXTEND_INREG");
assert(EVT.bitsLE(VT) && "Not extending!");
if (EVT == VT) return N1; // Not actually extending
auto SignExtendInReg = [&](APInt Val, llvm::EVT ConstantVT) {
unsigned FromBits = EVT.getScalarSizeInBits();
Val <<= Val.getBitWidth() - FromBits;
Val.ashrInPlace(Val.getBitWidth() - FromBits);
return getConstant(Val, DL, ConstantVT);
};
if (N1C) {
const APInt &Val = N1C->getAPIntValue();
return SignExtendInReg(Val, VT);
}
if (ISD::isBuildVectorOfConstantSDNodes(N1.getNode())) {
SmallVector<SDValue, 8> Ops;
llvm::EVT OpVT = N1.getOperand(0).getValueType();
for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
SDValue Op = N1.getOperand(i);
if (Op.isUndef()) {
Ops.push_back(getUNDEF(OpVT));
continue;
}
ConstantSDNode *C = cast<ConstantSDNode>(Op);
APInt Val = C->getAPIntValue();
Ops.push_back(SignExtendInReg(Val, OpVT));
}
return getBuildVector(VT, DL, Ops);
}
break;
}
case ISD::EXTRACT_VECTOR_ELT:
assert(VT.getSizeInBits() >= N1.getValueType().getScalarSizeInBits() &&
"The result of EXTRACT_VECTOR_ELT must be at least as wide as the \
element type of the vector.");
// EXTRACT_VECTOR_ELT of an UNDEF is an UNDEF.
if (N1.isUndef())
return getUNDEF(VT);
// EXTRACT_VECTOR_ELT of out-of-bounds element is an UNDEF
if (N2C && N2C->getAPIntValue().uge(N1.getValueType().getVectorNumElements()))
return getUNDEF(VT);
// EXTRACT_VECTOR_ELT of CONCAT_VECTORS is often formed while lowering is
// expanding copies of large vectors from registers.
if (N2C &&
N1.getOpcode() == ISD::CONCAT_VECTORS &&
N1.getNumOperands() > 0) {
unsigned Factor =
N1.getOperand(0).getValueType().getVectorNumElements();
return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
N1.getOperand(N2C->getZExtValue() / Factor),
getConstant(N2C->getZExtValue() % Factor, DL,
N2.getValueType()));
}
// EXTRACT_VECTOR_ELT of BUILD_VECTOR is often formed while lowering is
// expanding large vector constants.
if (N2C && N1.getOpcode() == ISD::BUILD_VECTOR) {
SDValue Elt = N1.getOperand(N2C->getZExtValue());
if (VT != Elt.getValueType())
// If the vector element type is not legal, the BUILD_VECTOR operands
// are promoted and implicitly truncated, and the result implicitly
// extended. Make that explicit here.
Elt = getAnyExtOrTrunc(Elt, DL, VT);
return Elt;
}
// EXTRACT_VECTOR_ELT of INSERT_VECTOR_ELT is often formed when vector
// operations are lowered to scalars.
if (N1.getOpcode() == ISD::INSERT_VECTOR_ELT) {
// If the indices are the same, return the inserted element else
// if the indices are known different, extract the element from
// the original vector.
SDValue N1Op2 = N1.getOperand(2);
ConstantSDNode *N1Op2C = dyn_cast<ConstantSDNode>(N1Op2);
if (N1Op2C && N2C) {
if (N1Op2C->getZExtValue() == N2C->getZExtValue()) {
if (VT == N1.getOperand(1).getValueType())
return N1.getOperand(1);
else
return getSExtOrTrunc(N1.getOperand(1), DL, VT);
}
return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, N1.getOperand(0), N2);
}
}
// EXTRACT_VECTOR_ELT of v1iX EXTRACT_SUBVECTOR could be formed
// when vector types are scalarized and v1iX is legal.
// vextract (v1iX extract_subvector(vNiX, Idx)) -> vextract(vNiX,Idx)
if (N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
N1.getValueType().getVectorNumElements() == 1) {
return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, N1.getOperand(0),
N1.getOperand(1));
}
break;
case ISD::EXTRACT_ELEMENT:
assert(N2C && (unsigned)N2C->getZExtValue() < 2 && "Bad EXTRACT_ELEMENT!");
assert(!N1.getValueType().isVector() && !VT.isVector() &&
(N1.getValueType().isInteger() == VT.isInteger()) &&
N1.getValueType() != VT &&
"Wrong types for EXTRACT_ELEMENT!");
// EXTRACT_ELEMENT of BUILD_PAIR is often formed while legalize is expanding
// 64-bit integers into 32-bit parts. Instead of building the extract of
// the BUILD_PAIR, only to have legalize rip it apart, just do it now.
if (N1.getOpcode() == ISD::BUILD_PAIR)
return N1.getOperand(N2C->getZExtValue());
// EXTRACT_ELEMENT of a constant int is also very common.
if (N1C) {
unsigned ElementSize = VT.getSizeInBits();
unsigned Shift = ElementSize * N2C->getZExtValue();
APInt ShiftedVal = N1C->getAPIntValue().lshr(Shift);
return getConstant(ShiftedVal.trunc(ElementSize), DL, VT);
}
break;
case ISD::EXTRACT_SUBVECTOR:
if (VT.isSimple() && N1.getValueType().isSimple()) {
assert(VT.isVector() && N1.getValueType().isVector() &&
"Extract subvector VTs must be a vectors!");
assert(VT.getVectorElementType() ==
N1.getValueType().getVectorElementType() &&
"Extract subvector VTs must have the same element type!");
assert(VT.getSimpleVT() <= N1.getSimpleValueType() &&
"Extract subvector must be from larger vector to smaller vector!");
if (N2C) {
assert((VT.getVectorNumElements() + N2C->getZExtValue()
<= N1.getValueType().getVectorNumElements())
&& "Extract subvector overflow!");
}
// Trivial extraction.
if (VT.getSimpleVT() == N1.getSimpleValueType())
return N1;
// EXTRACT_SUBVECTOR of an UNDEF is an UNDEF.
if (N1.isUndef())
return getUNDEF(VT);
// EXTRACT_SUBVECTOR of CONCAT_VECTOR can be simplified if the pieces of
// the concat have the same type as the extract.
if (N2C && N1.getOpcode() == ISD::CONCAT_VECTORS &&
N1.getNumOperands() > 0 &&
VT == N1.getOperand(0).getValueType()) {
unsigned Factor = VT.getVectorNumElements();
return N1.getOperand(N2C->getZExtValue() / Factor);
}
// EXTRACT_SUBVECTOR of INSERT_SUBVECTOR is often created
// during shuffle legalization.
if (N1.getOpcode() == ISD::INSERT_SUBVECTOR && N2 == N1.getOperand(2) &&
VT == N1.getOperand(1).getValueType())
return N1.getOperand(1);
}
break;
}
// Perform trivial constant folding.
if (SDValue SV =
FoldConstantArithmetic(Opcode, DL, VT, N1.getNode(), N2.getNode()))
return SV;
if (SDValue V = foldConstantFPMath(Opcode, DL, VT, N1, N2))
return V;
// Canonicalize an UNDEF to the RHS, even over a constant.
if (N1.isUndef()) {
if (TLI->isCommutativeBinOp(Opcode)) {
std::swap(N1, N2);
} else {
switch (Opcode) {
case ISD::FP_ROUND_INREG:
case ISD::SIGN_EXTEND_INREG:
case ISD::SUB:
return getUNDEF(VT); // fold op(undef, arg2) -> undef
case ISD::UDIV:
case ISD::SDIV:
case ISD::UREM:
case ISD::SREM:
case ISD::SSUBSAT:
case ISD::USUBSAT:
return getConstant(0, DL, VT); // fold op(undef, arg2) -> 0
}
}
}
// Fold a bunch of operators when the RHS is undef.
if (N2.isUndef()) {
switch (Opcode) {
case ISD::XOR:
if (N1.isUndef())
// Handle undef ^ undef -> 0 special case. This is a common
// idiom (misuse).
return getConstant(0, DL, VT);
LLVM_FALLTHROUGH;
case ISD::ADD:
case ISD::SUB:
case ISD::UDIV:
case ISD::SDIV:
case ISD::UREM:
case ISD::SREM:
return getUNDEF(VT); // fold op(arg1, undef) -> undef
case ISD::MUL:
case ISD::AND:
case ISD::SSUBSAT:
case ISD::USUBSAT:
return getConstant(0, DL, VT); // fold op(arg1, undef) -> 0
case ISD::OR:
case ISD::SADDSAT:
case ISD::UADDSAT:
return getAllOnesConstant(DL, VT);
}
}
// Memoize this node if possible.
SDNode *N;
SDVTList VTs = getVTList(VT);
SDValue Ops[] = {N1, N2};
if (VT != MVT::Glue) {
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opcode, VTs, Ops);
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) {
E->intersectFlagsWith(Flags);
return SDValue(E, 0);
}
N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
N->setFlags(Flags);
createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
} else {
N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
createOperands(N, Ops);
}
InsertNode(N);
SDValue V = SDValue(N, 0);
NewSDValueDbgMsg(V, "Creating new node: ", this);
return V;
}
/// Build a node with exactly three operands.
///
/// Opcode-specific folds are tried first. When none of them produces a
/// result, the node is looked up in the CSE map (skipped for nodes whose
/// result type is glue) and a fresh node is only allocated on a miss.
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
                              SDValue N1, SDValue N2, SDValue N3,
                              const SDNodeFlags Flags) {
  // Opcode-specific simplifications.
  switch (Opcode) {
  case ISD::FMA: {
    assert(VT.isFloatingPoint() && "This operator only applies to FP types!");
    assert(N1.getValueType() == VT && N2.getValueType() == VT &&
           N3.getValueType() == VT && "FMA types must match!");
    auto *MulLHS = dyn_cast<ConstantFPSDNode>(N1);
    auto *MulRHS = dyn_cast<ConstantFPSDNode>(N2);
    auto *Addend = dyn_cast<ConstantFPSDNode>(N3);
    // Only an all-constant FMA can be folded here.
    if (!MulLHS || !MulRHS || !Addend)
      break;
    APFloat Folded = MulLHS->getValueAPF();
    Folded.fusedMultiplyAdd(MulRHS->getValueAPF(), Addend->getValueAPF(),
                            APFloat::rmNearestTiesToEven);
    return getConstantFP(Folded, DL, VT);
  }
  case ISD::BUILD_VECTOR: {
    // Give the BUILD_VECTOR folder a chance.
    SDValue Elts[] = {N1, N2, N3};
    if (SDValue Folded = FoldBUILD_VECTOR(DL, VT, Elts, *this))
      return Folded;
    break;
  }
  case ISD::CONCAT_VECTORS: {
    SDValue Parts[] = {N1, N2, N3};
    if (SDValue Folded = foldCONCAT_VECTORS(DL, VT, Parts, *this))
      return Folded;
    break;
  }
  case ISD::SETCC: {
    assert(VT.isInteger() && "SETCC result type must be an integer!");
    assert(N1.getValueType() == N2.getValueType() &&
           "SETCC operands must have the same type!");
    assert(VT.isVector() == N1.getValueType().isVector() &&
           "SETCC type should be vector iff the operand type is vector!");
    assert((!VT.isVector() ||
            VT.getVectorNumElements() == N1.getValueType().getVectorNumElements()) &&
           "SETCC vector element counts must match!");
    // First let FoldSetCC have a go at it.
    ISD::CondCode CC = cast<CondCodeSDNode>(N3)->get();
    if (SDValue Folded = FoldSetCC(VT, N1, N2, CC, DL))
      return Folded;
    // Then try folding it as a vector of constants.
    SDValue CmpOps[] = {N1, N2, N3};
    if (SDValue Folded = FoldConstantVectorArithmetic(Opcode, DL, VT, CmpOps)) {
      NewSDValueDbgMsg(Folded, "New node vector constant folding: ", this);
      return Folded;
    }
    break;
  }
  case ISD::SELECT:
  case ISD::VSELECT:
    if (SDValue Simplified = simplifySelect(N1, N2, N3))
      return Simplified;
    break;
  case ISD::VECTOR_SHUFFLE:
    llvm_unreachable("should use getVectorShuffle constructor!");
  case ISD::INSERT_VECTOR_ELT: {
    // Inserting at a constant out-of-bounds index yields UNDEF.
    if (auto *IdxC = dyn_cast<ConstantSDNode>(N3))
      if (IdxC->getZExtValue() >= N1.getValueType().getVectorNumElements())
        return getUNDEF(VT);
    break;
  }
  case ISD::INSERT_SUBVECTOR: {
    // undef inserted into undef stays undef.
    if (N1.isUndef() && N2.isUndef())
      return getUNDEF(VT);
    SDValue Index = N3;
    // The remaining checks and folds only apply to simple value types.
    if (!VT.isSimple() || !N1.getValueType().isSimple() ||
        !N2.getValueType().isSimple())
      break;
    assert(VT.isVector() && N1.getValueType().isVector() &&
           N2.getValueType().isVector() &&
           "Insert subvector VTs must be a vectors");
    assert(VT == N1.getValueType() &&
           "Dest and insert subvector source types must match!");
    assert(N2.getSimpleValueType() <= N1.getSimpleValueType() &&
           "Insert subvector must be from smaller vector to larger vector!");
    if (isa<ConstantSDNode>(Index)) {
      assert((N2.getValueType().getVectorNumElements() +
              cast<ConstantSDNode>(Index)->getZExtValue()
              <= VT.getVectorNumElements())
             && "Insert subvector overflow!");
    }
    // The inserted vector already has the full result type.
    if (VT.getSimpleVT() == N2.getSimpleValueType())
      return N2;
    // insert_subvector(undef, extract_subvector(X, Idx), Idx) -> X when X
    // already has the result type.
    if (N1.isUndef() && N2.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
      SDValue ExtractSrc = N2.getOperand(0);
      if (N2.getOperand(1) == N3 && ExtractSrc.getValueType() == VT)
        return ExtractSrc;
    }
    break;
  }
  case ISD::BITCAST:
    // A bitcast to the operand's own type is a no-op.
    if (N1.getValueType() == VT)
      return N1;
    break;
  }

  // No fold applied: create the node, going through the CSE map unless the
  // result type is glue.
  SDVTList ResultVTs = getVTList(VT);
  SDValue Operands[] = {N1, N2, N3};
  SDNode *NewNode;
  if (VT == MVT::Glue) {
    NewNode = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(),
                                ResultVTs);
    createOperands(NewNode, Operands);
  } else {
    FoldingSetNodeID Profile;
    AddNodeIDNode(Profile, Opcode, ResultVTs, Operands);
    void *InsertPos = nullptr;
    if (SDNode *Existing = FindNodeOrInsertPos(Profile, DL, InsertPos)) {
      // Reuse the existing node; its flags become the intersection of both.
      Existing->intersectFlagsWith(Flags);
      return SDValue(Existing, 0);
    }
    NewNode = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(),
                                ResultVTs);
    NewNode->setFlags(Flags);
    createOperands(NewNode, Operands);
    CSEMap.InsertNode(NewNode, InsertPos);
  }

  InsertNode(NewNode);
  SDValue Result(NewNode, 0);
  NewSDValueDbgMsg(Result, "Creating new node: ", this);
  return Result;
}
/// Four-operand convenience overload: packs the operands into an array and
/// forwards to the array-taking getNode.
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
                              SDValue N1, SDValue N2, SDValue N3, SDValue N4) {
  SDValue Operands[] = {N1, N2, N3, N4};
  return getNode(Opcode, DL, VT, Operands);
}
/// Five-operand convenience overload: packs the operands into an array and
/// forwards to the array-taking getNode.
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
                              SDValue N1, SDValue N2, SDValue N3, SDValue N4,
                              SDValue N5) {
  SDValue Operands[] = {N1, N2, N3, N4, N5};
  return getNode(Opcode, DL, VT, Operands);
}
/// getStackArgumentTokenFactor - Build a TokenFactor that joins \p Chain with
/// the output chain of every load that uses the entry node and reads from a
/// frame index with a negative index (the incoming stack arguments).
SDValue SelectionDAG::getStackArgumentTokenFactor(SDValue Chain) {
  SmallVector<SDValue, 8> ArgChains;

  // The original chain goes first. When this is used by target LowerCall
  // hooks, this helps legalize find the CALLSEQ_BEGIN node.
  ArgChains.push_back(Chain);

  // Collect the chain result of every stack-argument load.
  SDNode *Entry = getEntryNode().getNode();
  for (SDNode::use_iterator UI = Entry->use_begin(), UE = Entry->use_end();
       UI != UE; ++UI) {
    LoadSDNode *Ld = dyn_cast<LoadSDNode>(*UI);
    if (!Ld)
      continue;
    FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Ld->getBasePtr());
    if (!FI || FI->getIndex() >= 0)
      continue;
    ArgChains.push_back(SDValue(Ld, 1));
  }

  // Tie all collected chains together.
  return getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}
/// getMemsetValue - Produce a value of type \p VT in which every byte equals
/// the memset fill byte \p Value.
///
/// A constant fill byte is splatted at compile time. Otherwise the byte is
/// zero-extended, multiplied by 0x0101... to replicate it across the scalar,
/// and finally bitcast / splatted into a vector as \p VT requires.
static SDValue getMemsetValue(SDValue Value, EVT VT, SelectionDAG &DAG,
                              const SDLoc &dl) {
  assert(!Value.isUndef());

  const unsigned ScalarBits = VT.getScalarSizeInBits();

  // Constant fill byte: materialize the splatted constant directly.
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Value)) {
    assert(C->getAPIntValue().getBitWidth() == 8);
    APInt Splat = APInt::getSplat(ScalarBits, C->getAPIntValue());
    if (!VT.isInteger())
      return DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(VT), Splat),
                               dl, VT);
    // The constant is made opaque when it is wider than 64 bits or when the
    // target reports the byte is not a legal store immediate.
    bool IsOpaque =
        VT.getSizeInBits() > 64 ||
        !DAG.getTargetLoweringInfo().isLegalStoreImmediate(C->getSExtValue());
    return DAG.getConstant(Splat, dl, VT, false, IsOpaque);
  }

  assert(Value.getValueType() == MVT::i8 && "memset with non-byte fill value?");

  // Work in an integer type as wide as the scalar element of VT.
  const EVT ScalarVT = VT.getScalarType();
  const EVT IntVT =
      ScalarVT.isInteger()
          ? ScalarVT
          : EVT::getIntegerVT(*DAG.getContext(), ScalarVT.getSizeInBits());

  SDValue Fill = DAG.getNode(ISD::ZERO_EXTEND, dl, IntVT, Value);
  if (ScalarBits > 8) {
    // Multiplying by 0x010101... copies the low byte into every byte.
    APInt Magic = APInt::getSplat(ScalarBits, APInt(8, 0x01));
    SDValue MagicC = DAG.getConstant(Magic, dl, IntVT);
    Fill = DAG.getNode(ISD::MUL, dl, IntVT, Fill, MagicC);
  }

  // Non-integer result types need a bitcast of the scalar first.
  if (VT != Fill.getValueType() && !VT.isInteger())
    Fill = DAG.getBitcast(ScalarVT, Fill);
  // If the types still differ, splat the scalar into a vector of type VT.
  if (VT != Fill.getValueType())
    Fill = DAG.getSplatBuildVector(VT, dl, Fill);

  return Fill;
}
/// getMemsetStringVal - Similar to getMemsetValue. Except this is only
/// used when a memcpy is turned into a memset when the source is a constant
/// string ptr.
///
/// \param VT    Type of the value to materialize.
/// \param Slice Constant bytes being copied; a null \c Slice.Array denotes
///              all-zero data.
/// \returns A constant of type \p VT, or an empty SDValue when
///          TLI.shouldConvertConstantLoadToIntImm rejects the immediate.
static SDValue getMemsetStringVal(EVT VT, const SDLoc &dl, SelectionDAG &DAG,
                                  const TargetLowering &TLI,
                                  const ConstantDataArraySlice &Slice) {
  // Handle vector with all elements zero.
  if (Slice.Array == nullptr) {
    if (VT.isInteger())
      return DAG.getConstant(0, dl, VT);
    else if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f128)
      return DAG.getConstantFP(0.0, dl, VT);
    else if (VT.isVector()) {
      // Build an integer zero vector with the same element count and bitcast
      // it to the requested type.
      unsigned NumElts = VT.getVectorNumElements();
      MVT EltVT = (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64;
      return DAG.getNode(ISD::BITCAST, dl, VT,
                         DAG.getConstant(0, dl,
                                         EVT::getVectorVT(*DAG.getContext(),
                                                          EltVT, NumElts)));
    } else
      llvm_unreachable("Expected type!");
  }

  assert(!VT.isVector() && "Can't handle vector type here!");
  unsigned NumVTBits = VT.getSizeInBits();
  unsigned NumVTBytes = NumVTBits / 8;
  unsigned NumBytes = std::min(NumVTBytes, unsigned(Slice.Length));

  // Pack the source bytes into an integer of the full VT width. The shift is
  // done on an APInt of NumVTBits bits rather than on a uint64_t: for types
  // wider than 64 bits the bit position reaches 64 or more, and shifting a
  // 64-bit integer by that amount is undefined behaviour.
  const bool IsLittleEndian = DAG.getDataLayout().isLittleEndian();
  APInt Val(NumVTBits, 0);
  for (unsigned i = 0; i != NumBytes; ++i) {
    // Byte i lands at the low end on little-endian targets and at the high
    // end on big-endian targets.
    unsigned BytePos = IsLittleEndian ? i : NumVTBytes - i - 1;
    APInt Byte(NumVTBits, (uint64_t)(unsigned char)Slice[i]);
    Val |= Byte.shl(BytePos * 8);
  }

  // If the "cost" of materializing the integer immediate is less than the cost
  // of a load, then it is cost effective to turn the load into the immediate.
  Type *Ty = VT.getTypeForEVT(*DAG.getContext());
  if (TLI.shouldConvertConstantLoadToIntImm(Val, Ty))
    return DAG.getConstant(Val, dl, VT);
  return SDValue(nullptr, 0);
}
/// Return \p Base + \p Offset as an ISD::ADD in the value type of \p Base.
SDValue SelectionDAG::getMemBasePlusOffset(SDValue Base, unsigned Offset,
                                           const SDLoc &DL) {
  const EVT PtrVT = Base.getValueType();
  SDValue OffsetC = getConstant(Offset, DL, PtrVT);
  return getNode(ISD::ADD, DL, PtrVT, Base, OffsetC);
}
/// Returns true if memcpy source is constant data.
///
/// Recognizes a GlobalAddress, or an ADD of a GlobalAddress and a Constant,
/// and hands the global plus the combined offset to
/// getConstantDataArrayInfo, which fills in \p Slice.
static bool isMemSrcFromConstant(SDValue Src, ConstantDataArraySlice &Slice) {
  GlobalAddressSDNode *Global = nullptr;
  uint64_t ExtraOffset = 0;

  if (Src.getOpcode() == ISD::GlobalAddress) {
    Global = cast<GlobalAddressSDNode>(Src);
  } else if (Src.getOpcode() == ISD::ADD) {
    SDValue Base = Src.getOperand(0);
    SDValue Addend = Src.getOperand(1);
    if (Base.getOpcode() == ISD::GlobalAddress &&
        Addend.getOpcode() == ISD::Constant) {
      Global = cast<GlobalAddressSDNode>(Base);
      ExtraOffset = cast<ConstantSDNode>(Addend)->getZExtValue();
    }
  }

  // Anything else is not a recognized constant source.
  if (Global == nullptr)
    return false;

  return getConstantDataArrayInfo(Global->getGlobal(), Slice, 8,
                                  ExtraOffset + Global->getOffset());
}
static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
// On Darwin, -Os means optimize for size without hurting performance, so
// only really optimize for size when -Oz (MinSize) is used.
if (MF.getTarget().getTargetTriple().isOSDarwin())
return MF.getFunction().hasMinSize();
return MF.getFunction().hasOptSize();
}
/// Chain the loads and stores with indices in [\p From, \p To) so that every
/// store in the range depends on all loads in the range.
///
/// The load chains of the range are appended to \p OutChains and joined by a
/// TokenFactor; each store in the range is then re-created with that
/// TokenFactor as its chain and appended to \p OutChains.
///
/// \param OutLoadChains  Chain results of the loads, indexed like the stores.
/// \param OutStoreChains Store nodes; each entry must be a StoreSDNode.
static void chainLoadsAndStoresForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
                                         SmallVector<SDValue, 32> &OutChains,
                                         unsigned From, unsigned To,
                                         SmallVector<SDValue, 16> &OutLoadChains,
                                         SmallVector<SDValue, 16> &OutStoreChains) {
  assert(OutLoadChains.size() && "Missing loads in memcpy inlining");
  assert(OutStoreChains.size() && "Missing stores in memcpy inlining");
  SmallVector<SDValue, 16> GluedLoadChains;
  for (unsigned i = From; i < To; ++i) {
    OutChains.push_back(OutLoadChains[i]);
    GluedLoadChains.push_back(OutLoadChains[i]);
  }

  // Chain for all loads.
  SDValue LoadToken = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                                  GluedLoadChains);

  for (unsigned i = From; i < To; ++i) {
    // The entry is dereferenced unconditionally below, so use cast<> (which
    // asserts on a type mismatch) instead of an unchecked dyn_cast<>.
    StoreSDNode *ST = cast<StoreSDNode>(OutStoreChains[i]);
    SDValue NewStore = DAG.getTruncStore(LoadToken, dl, ST->getValue(),
                                         ST->getBasePtr(), ST->getMemoryVT(),
                                         ST->getMemOperand());
    OutChains.push_back(NewStore);
  }
}
/// Try to expand a memcpy of a known size into an explicit sequence of loads
/// and stores (or stores of immediates when the source is constant data).
///
/// \param Size         Number of bytes to copy; decremented as operations are
///                     emitted.
/// \param Align        Alignment passed to the emitted stores; may be raised
///                     when the destination is a non-fixed frame index.
/// \param isVol        Marks emitted memory operations volatile and disables
///                     overlapping accesses in findOptimalMemOpLowering.
/// \param AlwaysInline When set, the store-count limit is ~0U instead of the
///                     target's getMaxStoresPerMemcpy value.
/// \returns \p Chain when \p Src is undef, an empty SDValue when
///          findOptimalMemOpLowering fails, otherwise a TokenFactor over all
///          emitted chains.
static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Dst, SDValue Src,
uint64_t Size, unsigned Align,
bool isVol, bool AlwaysInline,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) {
// Turn a memcpy of undef to nop.
// FIXME: We need to honor volatile even if Src is undef.
if (Src.isUndef())
return Chain;
// Expand memcpy to a series of load and store ops if the size operand falls
// below a certain threshold.
// TODO: In the AlwaysInline case, if the size is big then generate a loop
// rather than maybe a humongous number of loads and stores.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
const DataLayout &DL = DAG.getDataLayout();
LLVMContext &C = *DAG.getContext();
std::vector<EVT> MemOps;
bool DstAlignCanChange = false;
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
bool OptSize = shouldLowerMemFuncForSize(MF);
// The destination alignment is only adjustable when Dst is a frame index
// that is not a fixed object.
FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
if (FI && !MFI.isFixedObjectIndex(FI->getIndex()))
DstAlignCanChange = true;
// Use the larger of the inferred source alignment and the given alignment.
unsigned SrcAlign = DAG.InferPtrAlignment(Src);
if (Align > SrcAlign)
SrcAlign = Align;
ConstantDataArraySlice Slice;
bool CopyFromConstant = isMemSrcFromConstant(Src, Slice);
// A constant source with a null Array is treated as all-zero data.
bool isZeroConstant = CopyFromConstant && Slice.Array == nullptr;
unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemcpy(OptSize);
// Ask the target for the sequence of value types to use; give up when it
// cannot provide one.
if (!TLI.findOptimalMemOpLowering(
MemOps, Limit, Size, (DstAlignCanChange ? 0 : Align),
(isZeroConstant ? 0 : SrcAlign), /*IsMemset=*/false,
/*ZeroMemset=*/false, /*MemcpyStrSrc=*/CopyFromConstant,
/*AllowOverlap=*/!isVol, DstPtrInfo.getAddrSpace(),
SrcPtrInfo.getAddrSpace(), MF.getFunction().getAttributes()))
return SDValue();
if (DstAlignCanChange) {
// Candidate alignment: the ABI alignment of the first chosen type.
Type *Ty = MemOps[0].getTypeForEVT(C);
unsigned NewAlign = (unsigned)DL.getABITypeAlignment(Ty);
// Don't promote to an alignment that would require dynamic stack
// realignment.
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
if (!TRI->needsStackRealignment(MF))
while (NewAlign > Align &&
DL.exceedsNaturalStackAlignment(NewAlign))
NewAlign /= 2;
if (NewAlign > Align) {
// Give the stack frame object a larger alignment if needed.
if (MFI.getObjectAlignment(FI->getIndex()) < NewAlign)
MFI.setObjectAlignment(FI->getIndex(), NewAlign);
Align = NewAlign;
}
}
MachineMemOperand::Flags MMOFlags =
isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone;
// OutLoadChains / OutStoreChains collect the load/store pairs (filled in
// lock step); OutChains collects the immediate stores and, later, the final
// ordering of everything.
SmallVector<SDValue, 16> OutLoadChains;
SmallVector<SDValue, 16> OutStoreChains;
SmallVector<SDValue, 32> OutChains;
unsigned NumMemOps = MemOps.size();
uint64_t SrcOff = 0, DstOff = 0;
for (unsigned i = 0; i != NumMemOps; ++i) {
EVT VT = MemOps[i];
unsigned VTSize = VT.getSizeInBits() / 8;
SDValue Value, Store;
if (VTSize > Size) {
// Issuing an unaligned load / store pair that overlaps with the previous
// pair. Adjust the offset accordingly.
assert(i == NumMemOps-1 && i != 0);
SrcOff -= VTSize - Size;
DstOff -= VTSize - Size;
}
if (CopyFromConstant &&
(isZeroConstant || (VT.isInteger() && !VT.isVector()))) {
// It's unlikely a store of a vector immediate can be done in a single
// instruction. It would require a load from a constantpool first.
// We only handle zero vectors here.
// FIXME: Handle other cases where store of vector immediate is done in
// a single instruction.
ConstantDataArraySlice SubSlice;
if (SrcOff < Slice.Length) {
SubSlice = Slice;
SubSlice.move(SrcOff);
} else {
// This is an out-of-bounds access and hence UB. Pretend we read zero.
SubSlice.Array = nullptr;
SubSlice.Offset = 0;
SubSlice.Length = VTSize;
}
Value = getMemsetStringVal(VT, dl, DAG, TLI, SubSlice);
// getMemsetStringVal may return an empty value; in that case Store stays
// empty and the load/store path below is taken.
if (Value.getNode()) {
Store = DAG.getStore(Chain, dl, Value,
DAG.getMemBasePlusOffset(Dst, DstOff, dl),
DstPtrInfo.getWithOffset(DstOff), Align,
MMOFlags);
OutChains.push_back(Store);
}
}
if (!Store.getNode()) {
// The type might not be legal for the target. This should only happen
// if the type is smaller than a legal type, as on PPC, so the right
// thing to do is generate a LoadExt/StoreTrunc pair. These simplify
// to Load/Store if NVT==VT.
// FIXME does the case above also need this?
EVT NVT = TLI.getTypeToTransformTo(C, VT);
assert(NVT.bitsGE(VT));
// Mark the load dereferenceable when the pointer info says the accessed
// range is.
bool isDereferenceable =
SrcPtrInfo.getWithOffset(SrcOff).isDereferenceable(VTSize, C, DL);
MachineMemOperand::Flags SrcMMOFlags = MMOFlags;
if (isDereferenceable)
SrcMMOFlags |= MachineMemOperand::MODereferenceable;
Value = DAG.getExtLoad(ISD::EXTLOAD, dl, NVT, Chain,
DAG.getMemBasePlusOffset(Src, SrcOff, dl),
SrcPtrInfo.getWithOffset(SrcOff), VT,
MinAlign(SrcAlign, SrcOff), SrcMMOFlags);
OutLoadChains.push_back(Value.getValue(1));
Store = DAG.getTruncStore(
Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl),
DstPtrInfo.getWithOffset(DstOff), VT, Align, MMOFlags);
OutStoreChains.push_back(Store);
}
// Advance past the bytes handled by this operation.
SrcOff += VTSize;
DstOff += VTSize;
Size -= VTSize;
}
// MaxLdStGlue and EnableMemCpyDAGOpt are defined outside this function
// (presumably command-line options -- confirm). A zero MaxLdStGlue selects
// the target's getMaxGluedStoresPerMemcpy value.
unsigned GluedLdStLimit = MaxLdStGlue == 0 ?
TLI.getMaxGluedStoresPerMemcpy() : MaxLdStGlue;
unsigned NumLdStInMemcpy = OutStoreChains.size();
if (NumLdStInMemcpy) {
// It may be that memcpy might be converted to memset if it's memcpy
// of constants. In such a case, we won't have loads and stores, but
// just stores. In the absence of loads, there is nothing to gang up.
if ((GluedLdStLimit <= 1) || !EnableMemCpyDAGOpt) {
// If target does not care, just leave as it.
for (unsigned i = 0; i < NumLdStInMemcpy; ++i) {
OutChains.push_back(OutLoadChains[i]);
OutChains.push_back(OutStoreChains[i]);
}
} else {
// Ld/St less than/equal limit set by target.
if (NumLdStInMemcpy <= GluedLdStLimit) {
chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, 0,
NumLdStInMemcpy, OutLoadChains,
OutStoreChains);
} else {
// Process full groups of GluedLdStLimit pairs, starting from the end of
// the list and moving towards the front.
unsigned NumberLdChain = NumLdStInMemcpy / GluedLdStLimit;
unsigned RemainingLdStInMemcpy = NumLdStInMemcpy % GluedLdStLimit;
unsigned GlueIter = 0;
for (unsigned cnt = 0; cnt < NumberLdChain; ++cnt) {
unsigned IndexFrom = NumLdStInMemcpy - GlueIter - GluedLdStLimit;
unsigned IndexTo = NumLdStInMemcpy - GlueIter;
chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, IndexFrom, IndexTo,
OutLoadChains, OutStoreChains);
GlueIter += GluedLdStLimit;
}
// Residual ld/st.
if (RemainingLdStInMemcpy) {
chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, 0,
RemainingLdStInMemcpy, OutLoadChains,
OutStoreChains);
}
}
}
}
// Join every emitted chain into the result.
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
}
/// Try to expand a memmove of a known size into loads followed by stores.
///
/// All loads are emitted first and joined with a TokenFactor; every store is
/// chained on that TokenFactor, so no store is ordered before any load.
///
/// \returns \p Chain when \p Src is undef, an empty SDValue when
///          findOptimalMemOpLowering fails, otherwise a TokenFactor over the
///          emitted stores.
static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
                                        SDValue Chain, SDValue Dst, SDValue Src,
                                        uint64_t Size, unsigned Align,
                                        bool isVol, bool AlwaysInline,
                                        MachinePointerInfo DstPtrInfo,
                                        MachinePointerInfo SrcPtrInfo) {
  // Turn a memmove of undef to nop.
  // FIXME: We need to honor volatile even if Src is undef.
  if (Src.isUndef())
    return Chain;

  // Expand memmove to a series of load and store ops if the size operand falls
  // below a certain threshold.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  const DataLayout &DL = DAG.getDataLayout();
  LLVMContext &C = *DAG.getContext();
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  const bool OptSize = shouldLowerMemFuncForSize(MF);

  // The destination alignment is only adjustable for non-fixed frame indices.
  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
  const bool DstAlignCanChange =
      FI && !MFI.isFixedObjectIndex(FI->getIndex());

  // Use the larger of the inferred source alignment and the given alignment.
  unsigned SrcAlign = DAG.InferPtrAlignment(Src);
  if (SrcAlign < Align)
    SrcAlign = Align;

  const unsigned Limit =
      AlwaysInline ? ~0U : TLI.getMaxStoresPerMemmove(OptSize);
  // FIXME: `AllowOverlap` should really be `!isVol` but there is a bug in
  // findOptimalMemOpLowering. Meanwhile, setting it to `false` produces the
  // correct code.
  const bool AllowOverlap = false;
  std::vector<EVT> MemOps;
  if (!TLI.findOptimalMemOpLowering(
          MemOps, Limit, Size, (DstAlignCanChange ? 0 : Align), SrcAlign,
          /*IsMemset=*/false, /*ZeroMemset=*/false, /*MemcpyStrSrc=*/false,
          AllowOverlap, DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
          MF.getFunction().getAttributes()))
    return SDValue();

  if (DstAlignCanChange) {
    Type *FirstTy = MemOps[0].getTypeForEVT(C);
    unsigned NewAlign = (unsigned)DL.getABITypeAlignment(FirstTy);
    if (NewAlign > Align) {
      // Give the stack frame object a larger alignment if needed.
      const int FrameIdx = FI->getIndex();
      if (MFI.getObjectAlignment(FrameIdx) < NewAlign)
        MFI.setObjectAlignment(FrameIdx, NewAlign);
      Align = NewAlign;
    }
  }

  const MachineMemOperand::Flags MMOFlags =
      isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone;
  const unsigned NumMemOps = MemOps.size();

  // Phase 1: emit every load.
  SmallVector<SDValue, 8> LoadValues;
  SmallVector<SDValue, 8> LoadChains;
  uint64_t SrcOff = 0;
  for (unsigned Idx = 0; Idx != NumMemOps; ++Idx) {
    const EVT VT = MemOps[Idx];
    const unsigned VTSize = VT.getSizeInBits() / 8;

    MachineMemOperand::Flags SrcMMOFlags = MMOFlags;
    if (SrcPtrInfo.getWithOffset(SrcOff).isDereferenceable(VTSize, C, DL))
      SrcMMOFlags |= MachineMemOperand::MODereferenceable;

    SDValue SrcPtr = DAG.getMemBasePlusOffset(Src, SrcOff, dl);
    SDValue Loaded = DAG.getLoad(VT, dl, Chain, SrcPtr,
                                 SrcPtrInfo.getWithOffset(SrcOff), SrcAlign,
                                 SrcMMOFlags);
    LoadValues.push_back(Loaded);
    LoadChains.push_back(Loaded.getValue(1));
    SrcOff += VTSize;
  }

  // Every store depends on all of the loads.
  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);

  // Phase 2: emit every store.
  SmallVector<SDValue, 8> OutChains;
  uint64_t DstOff = 0;
  for (unsigned Idx = 0; Idx != NumMemOps; ++Idx) {
    const unsigned VTSize = MemOps[Idx].getSizeInBits() / 8;
    SDValue DstPtr = DAG.getMemBasePlusOffset(Dst, DstOff, dl);
    SDValue Stored = DAG.getStore(Chain, dl, LoadValues[Idx], DstPtr,
                                  DstPtrInfo.getWithOffset(DstOff), Align,
                                  MMOFlags);
    OutChains.push_back(Stored);
    DstOff += VTSize;
  }

  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
}
/// Expand a 'memset' intrinsic of known size into a sequence of stores.
///
/// \param DAG Selection DAG that receives the new nodes.
/// \param dl Source location for the new nodes.
/// \param Chain Incoming chain that every store is attached to.
/// \param Dst Pointer to the memory being filled.
/// \param Src The byte value to fill with.
/// \param Size Number of bytes to fill.
/// \param Align Alignment of the destination in bytes.
/// \param isVol True if the stores must be volatile.
/// \param DstPtrInfo IR information on the destination pointer.
/// \returns \p Chain when \p Src is undef, an empty SDValue when
/// findOptimalMemOpLowering fails, otherwise a TokenFactor over the stores.
///
/// The fill pattern is computed once for the widest store type; narrower
/// stores reuse it through a truncate when the target reports that as free,
/// and otherwise get their own pattern.
static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl,
                               SDValue Chain, SDValue Dst, SDValue Src,
                               uint64_t Size, unsigned Align, bool isVol,
                               MachinePointerInfo DstPtrInfo) {
  // Turn a memset of undef to nop.
  // FIXME: We need to honor volatile even if Src is undef.
  if (Src.isUndef())
    return Chain;

  // Expand memset to a series of store ops if the size operand falls below a
  // certain threshold.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  const bool OptSize = shouldLowerMemFuncForSize(MF);

  // The destination alignment is only adjustable for non-fixed frame indices.
  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
  const bool DstAlignCanChange =
      FI && !MFI.isFixedObjectIndex(FI->getIndex());

  const bool IsZeroVal =
      isa<ConstantSDNode>(Src) && cast<ConstantSDNode>(Src)->isNullValue();

  std::vector<EVT> MemOps;
  if (!TLI.findOptimalMemOpLowering(
          MemOps, TLI.getMaxStoresPerMemset(OptSize), Size,
          (DstAlignCanChange ? 0 : Align), 0, /*IsMemset=*/true,
          /*ZeroMemset=*/IsZeroVal, /*MemcpyStrSrc=*/false,
          /*AllowOverlap=*/!isVol, DstPtrInfo.getAddrSpace(), ~0u,
          MF.getFunction().getAttributes()))
    return SDValue();

  if (DstAlignCanChange) {
    Type *FirstTy = MemOps[0].getTypeForEVT(*DAG.getContext());
    unsigned NewAlign =
        (unsigned)DAG.getDataLayout().getABITypeAlignment(FirstTy);
    if (NewAlign > Align) {
      // Give the stack frame object a larger alignment if needed.
      const int FrameIdx = FI->getIndex();
      if (MFI.getObjectAlignment(FrameIdx) < NewAlign)
        MFI.setObjectAlignment(FrameIdx, NewAlign);
      Align = NewAlign;
    }
  }

  // Find the largest store and generate the bit pattern for it.
  EVT LargestVT = MemOps[0];
  for (const EVT &Candidate : MemOps)
    if (Candidate.bitsGT(LargestVT))
      LargestVT = Candidate;
  SDValue MemSetValue = getMemsetValue(Src, LargestVT, DAG, dl);

  const MachineMemOperand::Flags StoreFlags =
      isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone;

  SmallVector<SDValue, 8> OutChains;
  uint64_t DstOff = 0;
  const unsigned NumMemOps = MemOps.size();
  for (unsigned i = 0; i < NumMemOps; i++) {
    const EVT VT = MemOps[i];
    const unsigned VTSize = VT.getSizeInBits() / 8;
    if (VTSize > Size) {
      // The last store is wider than what remains: move it back so that it
      // overlaps the previous store.
      assert(i == NumMemOps-1 && i != 0);
      DstOff -= VTSize - Size;
    }

    // If this store is smaller than the largest store see whether we can get
    // the smaller value for free with a truncate.
    SDValue Value = MemSetValue;
    if (VT.bitsLT(LargestVT)) {
      const bool BothScalar = !LargestVT.isVector() && !VT.isVector();
      if (BothScalar && TLI.isTruncateFree(LargestVT, VT))
        Value = DAG.getNode(ISD::TRUNCATE, dl, VT, MemSetValue);
      else
        Value = getMemsetValue(Src, VT, DAG, dl);
    }
    assert(Value.getValueType() == VT && "Value with wrong type.");

    SDValue DstPtr = DAG.getMemBasePlusOffset(Dst, DstOff, dl);
    SDValue Store = DAG.getStore(Chain, dl, Value, DstPtr,
                                 DstPtrInfo.getWithOffset(DstOff), Align,
                                 StoreFlags);
    OutChains.push_back(Store);

    DstOff += VTSize;
    Size -= VTSize;
  }

  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
}
/// Abort with a fatal error unless pointers in address space \p AS can be
/// used for a memory-intrinsic libcall.
static void checkAddrSpaceIsValidForLibcall(const TargetLowering *TLI,
                                            unsigned AS) {
  // Lowering memcpy / memset / memmove intrinsics to calls is only valid if all
  // pointer operands can be losslessly bitcasted to pointers of address space 0
  if (AS == 0 || TLI->isNoopAddrSpaceCast(AS, 0))
    return;
  report_fatal_error("cannot lower memory intrinsic in address space " +
                     Twine(AS));
}
/// Lower a memcpy. Strategies are tried in order: inline loads/stores within
/// the target's limits, target-specific code, forced inline loads/stores
/// (when \p AlwaysInline), and finally a call to the memcpy libcall.
///
/// \returns The chain produced by whichever strategy succeeded; \p Chain
///          itself for a constant zero size.
SDValue SelectionDAG::getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst,
                                SDValue Src, SDValue Size, unsigned Align,
                                bool isVol, bool AlwaysInline, bool isTailCall,
                                MachinePointerInfo DstPtrInfo,
                                MachinePointerInfo SrcPtrInfo) {
  assert(Align && "The SDAG layer expects explicit alignment and reserves 0");

  // Check to see if we should lower the memcpy to loads and stores first.
  // For cases within the target-specified limits, this is the best choice.
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
  if (ConstantSize) {
    // Nothing to copy: the chain is returned untouched.
    if (ConstantSize->isNullValue())
      return Chain;

    if (SDValue Inlined = getMemcpyLoadsAndStores(
            *this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(), Align,
            isVol, false, DstPtrInfo, SrcPtrInfo))
      return Inlined;
  }

  // Then check to see if we should lower the memcpy with target-specific
  // code. If the target chooses to do this, this is the next best.
  if (TSI) {
    if (SDValue Custom = TSI->EmitTargetCodeForMemcpy(
            *this, dl, Chain, Dst, Src, Size, Align, isVol, AlwaysInline,
            DstPtrInfo, SrcPtrInfo))
      return Custom;
  }

  // If we really need inline code and the target declined to provide it,
  // use a (potentially long) sequence of loads and stores.
  if (AlwaysInline) {
    assert(ConstantSize && "AlwaysInline requires a constant size!");
    return getMemcpyLoadsAndStores(*this, dl, Chain, Dst, Src,
                                   ConstantSize->getZExtValue(), Align, isVol,
                                   true, DstPtrInfo, SrcPtrInfo);
  }

  checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace());
  checkAddrSpaceIsValidForLibcall(TLI, SrcPtrInfo.getAddrSpace());

  // FIXME: If the memcpy is volatile (isVol), lowering it to a plain libc
  // memcpy is not guaranteed to be safe. libc memcpys aren't required to
  // respect volatile, so they may do things like read or write memory
  // beyond the given memory regions. But fixing this isn't easy, and most
  // people don't care.

  // Emit a library call: memcpy(Dst, Src, Size).
  TargetLowering::ArgListTy Args;
  auto AddArg = [&Args](SDValue Node, Type *Ty) {
    TargetLowering::ArgListEntry Entry;
    Entry.Ty = Ty;
    Entry.Node = Node;
    Args.push_back(Entry);
  };
  Type *BytePtrTy = Type::getInt8PtrTy(*getContext());
  AddArg(Dst, BytePtrTy);
  AddArg(Src, BytePtrTy);
  AddArg(Size, getDataLayout().getIntPtrType(*getContext()));

  Type *RetTy = Dst.getValueType().getTypeForEVT(*getContext());
  SDValue Callee = getExternalSymbol(TLI->getLibcallName(RTLIB::MEMCPY),
                                     TLI->getPointerTy(getDataLayout()));

  TargetLowering::CallLoweringInfo CLI(*this);
  CLI.setDebugLoc(dl)
      .setChain(Chain)
      .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY), RetTy, Callee,
                    std::move(Args))
      .setDiscardResult()
      .setTailCall(isTailCall);

  // Only the output chain of the call is of interest.
  return TLI->LowerCallTo(CLI).second;
}
/// Lower an element-wise unordered-atomic memcpy to the libcall selected by
/// \p ElemSz. Reports a fatal error when no libcall exists for that size.
///
/// \returns The output chain of the emitted call.
SDValue SelectionDAG::getAtomicMemcpy(SDValue Chain, const SDLoc &dl,
                                      SDValue Dst, unsigned DstAlign,
                                      SDValue Src, unsigned SrcAlign,
                                      SDValue Size, Type *SizeTy,
                                      unsigned ElemSz, bool isTailCall,
                                      MachinePointerInfo DstPtrInfo,
                                      MachinePointerInfo SrcPtrInfo) {
  // Emit a library call taking (Dst, Src, Size).
  TargetLowering::ArgListTy Args;
  auto AddArg = [&Args](SDValue Node, Type *Ty) {
    TargetLowering::ArgListEntry Entry;
    Entry.Ty = Ty;
    Entry.Node = Node;
    Args.push_back(Entry);
  };
  Type *IntPtrTy = getDataLayout().getIntPtrType(*getContext());
  AddArg(Dst, IntPtrTy);
  AddArg(Src, IntPtrTy);
  AddArg(Size, SizeTy);

  const RTLIB::Libcall LibraryCall =
      RTLIB::getMEMCPY_ELEMENT_UNORDERED_ATOMIC(ElemSz);
  if (LibraryCall == RTLIB::UNKNOWN_LIBCALL)
    report_fatal_error("Unsupported element size");

  SDValue Callee = getExternalSymbol(TLI->getLibcallName(LibraryCall),
                                     TLI->getPointerTy(getDataLayout()));

  TargetLowering::CallLoweringInfo CLI(*this);
  CLI.setDebugLoc(dl)
      .setChain(Chain)
      .setLibCallee(TLI->getLibcallCallingConv(LibraryCall),
                    Type::getVoidTy(*getContext()), Callee, std::move(Args))
      .setDiscardResult()
      .setTailCall(isTailCall);

  return TLI->LowerCallTo(CLI).second;
}
/// Lower a memmove of Size bytes from Src to Dst.
///
/// Three strategies are tried in order: an inline load/store expansion when
/// Size is a constant, target-specific code from TSI (when TSI is non-null),
/// and finally a call to the RTLIB::MEMMOVE library routine. A constant Size
/// of zero returns the incoming Chain untouched.
SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst,
                                 SDValue Src, SDValue Size, unsigned Align,
                                 bool isVol, bool isTailCall,
                                 MachinePointerInfo DstPtrInfo,
                                 MachinePointerInfo SrcPtrInfo) {
  assert(Align && "The SDAG layer expects explicit alignment and reserves 0");

  // Preferred: expand to loads and stores when the size is a known constant
  // and the expansion helper accepts it.
  if (auto *ConstantSize = dyn_cast<ConstantSDNode>(Size)) {
    // A zero-byte move has nothing to do.
    if (ConstantSize->isNullValue())
      return Chain;

    SDValue Expanded = getMemmoveLoadsAndStores(
        *this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(), Align, isVol,
        false, DstPtrInfo, SrcPtrInfo);
    if (Expanded.getNode())
      return Expanded;
  }

  // Next best: let the target emit its own sequence, if it wants to.
  if (TSI) {
    SDValue TargetCode = TSI->EmitTargetCodeForMemmove(
        *this, dl, Chain, Dst, Src, Size, Align, isVol, DstPtrInfo, SrcPtrInfo);
    if (TargetCode.getNode())
      return TargetCode;
  }

  checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace());
  checkAddrSpaceIsValidForLibcall(TLI, SrcPtrInfo.getAddrSpace());

  // FIXME: If the memmove is volatile, lowering it to plain libc memmove may
  // not be safe.

  // Last resort: call the library routine with (dst, src, size).
  TargetLowering::ArgListTy CallArgs;
  auto AppendArg = [&CallArgs](SDValue Node, Type *Ty) {
    TargetLowering::ArgListEntry Arg;
    Arg.Node = Node;
    Arg.Ty = Ty;
    CallArgs.push_back(Arg);
  };
  Type *BytePtrTy = Type::getInt8PtrTy(*getContext());
  AppendArg(Dst, BytePtrTy);
  AppendArg(Src, BytePtrTy);
  AppendArg(Size, getDataLayout().getIntPtrType(*getContext()));

  // FIXME: pass in SDLoc
  TargetLowering::CallLoweringInfo CLI(*this);
  SDValue Callee = getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE),
                                     TLI->getPointerTy(getDataLayout()));
  CLI.setDebugLoc(dl)
      .setChain(Chain)
      .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE),
                    Dst.getValueType().getTypeForEVT(*getContext()), Callee,
                    std::move(CallArgs))
      .setDiscardResult()
      .setTailCall(isTailCall);
  return TLI->LowerCallTo(CLI).second;
}
/// Lower an element-wise unordered-atomic memmove to a runtime library call.
///
/// The callee is selected by
/// RTLIB::getMEMMOVE_ELEMENT_UNORDERED_ATOMIC(ElemSz); an element size with no
/// matching libcall is reported as a fatal error. DstAlign, SrcAlign,
/// DstPtrInfo and SrcPtrInfo are accepted but not read by this function.
///
/// \returns the second member of the pair produced by LowerCallTo.
SDValue SelectionDAG::getAtomicMemmove(SDValue Chain, const SDLoc &dl,
                                       SDValue Dst, unsigned DstAlign,
                                       SDValue Src, unsigned SrcAlign,
                                       SDValue Size, Type *SizeTy,
                                       unsigned ElemSz, bool isTailCall,
                                       MachinePointerInfo DstPtrInfo,
                                       MachinePointerInfo SrcPtrInfo) {
  // Marshal the (dst, src, size) argument list for the library call.
  TargetLowering::ArgListTy CallArgs;
  auto AppendArg = [&CallArgs](SDValue Node, Type *Ty) {
    TargetLowering::ArgListEntry Arg;
    Arg.Node = Node;
    Arg.Ty = Ty;
    CallArgs.push_back(Arg);
  };
  Type *PtrSizedIntTy = getDataLayout().getIntPtrType(*getContext());
  AppendArg(Dst, PtrSizedIntTy);
  AppendArg(Src, PtrSizedIntTy);
  AppendArg(Size, SizeTy);

  // Pick the routine that matches the element size; bail out if none exists.
  RTLIB::Libcall LC = RTLIB::getMEMMOVE_ELEMENT_UNORDERED_ATOMIC(ElemSz);
  if (LC == RTLIB::UNKNOWN_LIBCALL)
    report_fatal_error("Unsupported element size");

  TargetLowering::CallLoweringInfo CLI(*this);
  SDValue Callee = getExternalSymbol(TLI->getLibcallName(LC),
                                     TLI->getPointerTy(getDataLayout()));
  CLI.setDebugLoc(dl)
      .setChain(Chain)
      .setLibCallee(TLI->getLibcallCallingConv(LC),
                    Type::getVoidTy(*getContext()), Callee,
                    std::move(CallArgs))
      .setDiscardResult()
      .setTailCall(isTailCall);
  return TLI->LowerCallTo(CLI).second;
}
/// Lower a memset that fills Size bytes at Dst using the operand Src.
///
/// Three strategies are tried in order: an inline store expansion when Size
/// is a constant, target-specific code from TSI (when TSI is non-null), and
/// finally a call to the RTLIB::MEMSET library routine. A constant Size of
/// zero returns the incoming Chain untouched.
SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst,
                                SDValue Src, SDValue Size, unsigned Align,
                                bool isVol, bool isTailCall,
                                MachinePointerInfo DstPtrInfo) {
  assert(Align && "The SDAG layer expects explicit alignment and reserves 0");

  // Preferred: expand to stores when the size is a known constant and the
  // expansion helper accepts it.
  if (auto *ConstantSize = dyn_cast<ConstantSDNode>(Size)) {
    // A zero-byte fill has nothing to do.
    if (ConstantSize->isNullValue())
      return Chain;

    SDValue Expanded =
        getMemsetStores(*this, dl, Chain, Dst, Src,
                        ConstantSize->getZExtValue(), Align, isVol, DstPtrInfo);
    if (Expanded.getNode())
      return Expanded;
  }

  // Next best: let the target emit its own sequence, if it wants to.
  if (TSI) {
    SDValue TargetCode = TSI->EmitTargetCodeForMemset(
        *this, dl, Chain, Dst, Src, Size, Align, isVol, DstPtrInfo);
    if (TargetCode.getNode())
      return TargetCode;
  }

  checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace());

  // Last resort: call the library routine with (dst, value, size). The value
  // argument keeps the IR type that corresponds to Src's own value type.
  TargetLowering::ArgListTy CallArgs;
  auto AppendArg = [&CallArgs](SDValue Node, Type *Ty) {
    TargetLowering::ArgListEntry Arg;
    Arg.Node = Node;
    Arg.Ty = Ty;
    CallArgs.push_back(Arg);
  };
  AppendArg(Dst, Type::getInt8PtrTy(*getContext()));
  AppendArg(Src, Src.getValueType().getTypeForEVT(*getContext()));
  AppendArg(Size, getDataLayout().getIntPtrType(*getContext()));

  // FIXME: pass in SDLoc
  TargetLowering::CallLoweringInfo CLI(*this);
  SDValue Callee = getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET),
                                     TLI->getPointerTy(getDataLayout()));
  CLI.setDebugLoc(dl)
      .setChain(Chain)
      .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET),
                    Dst.getValueType().getTypeForEVT(*getContext()), Callee,
                    std::move(CallArgs))
      .setDiscardResult()
      .setTailCall(isTailCall);
  return TLI->LowerCallTo(CLI).second;
}
/// Lower an element-wise unordered-atomic memset to a runtime library call.
///
/// The callee is selected by RTLIB::getMEMSET_ELEMENT_UNORDERED_ATOMIC(ElemSz);
/// an element size with no matching libcall is reported as a fatal error.
/// DstAlign and DstPtrInfo are accepted but not read by this function.
///
/// \returns the second member of the pair produced by LowerCallTo.
SDValue SelectionDAG::getAtomicMemset(SDValue Chain, const SDLoc &dl,
                                      SDValue Dst, unsigned DstAlign,
                                      SDValue Value, SDValue Size, Type *SizeTy,
                                      unsigned ElemSz, bool isTailCall,
                                      MachinePointerInfo DstPtrInfo) {
  // Marshal the (dst, value, size) argument list for the library call. The
  // value argument is typed as i8.
  TargetLowering::ArgListTy CallArgs;
  auto AppendArg = [&CallArgs](SDValue Node, Type *Ty) {
    TargetLowering::ArgListEntry Arg;
    Arg.Node = Node;
    Arg.Ty = Ty;
    CallArgs.push_back(Arg);
  };
  AppendArg(Dst, getDataLayout().getIntPtrType(*getContext()));
  AppendArg(Value, Type::getInt8Ty(*getContext()));
  AppendArg(Size, SizeTy);

  // Pick the routine that matches the element size; bail out if none exists.
  RTLIB::Libcall LC = RTLIB::getMEMSET_ELEMENT_UNORDERED_ATOMIC(ElemSz);
  if (LC == RTLIB::UNKNOWN_LIBCALL)
    report_fatal_error("Unsupported element size");

  TargetLowering::CallLoweringInfo CLI(*this);
  SDValue Callee = getExternalSymbol(TLI->getLibcallName(LC),
                                     TLI->getPointerTy(getDataLayout()));
  CLI.setDebugLoc(dl)
      .setChain(Chain)
      .setLibCallee(TLI->getLibcallCallingConv(LC),
                    Type::getVoidTy(*getContext()), Callee,
                    std::move(CallArgs))
      .setDiscardResult()
      .setTailCall(isTailCall);
  return TLI->LowerCallTo(CLI).second;
}
/// Create (or reuse) an AtomicSDNode with the given opcode, result types and
/// operands.
///
/// The CSE key is built from the memory VT, the opcode/VT list/operands, and
/// the address space of MMO's pointer info. When an equivalent node already
/// exists, its alignment is refined with MMO and that node is returned.
SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
                                SDVTList VTList, ArrayRef<SDValue> Ops,
                                MachineMemOperand *MMO) {
  FoldingSetNodeID Key;
  Key.AddInteger(MemVT.getRawBits());
  AddNodeIDNode(Key, Opcode, VTList, Ops);
  Key.AddInteger(MMO->getPointerInfo().getAddrSpace());

  void *InsertPos = nullptr;
  SDNode *Existing = FindNodeOrInsertPos(Key, dl, InsertPos);
  if (Existing) {
    cast<AtomicSDNode>(Existing)->refineAlignment(MMO);
    return SDValue(Existing, 0);
  }

  // No match: build a fresh node and register it for CSE.
  AtomicSDNode *Node = newSDNode<AtomicSDNode>(
      Opcode, dl.getIROrder(), dl.getDebugLoc(), VTList, MemVT, MMO);
  createOperands(Node, Ops);
  CSEMap.InsertNode(Node, InsertPos);
  InsertNode(Node);
  return SDValue(Node, 0);
}
/// Build an atomic compare-and-swap node.
///
/// Opcode must be ISD::ATOMIC_CMP_SWAP or ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
/// and Cmp and Swp must have the same value type. The operands are passed on
/// to getAtomic in the order {Chain, Ptr, Cmp, Swp}.
SDValue SelectionDAG::getAtomicCmpSwap(unsigned Opcode, const SDLoc &dl,
                                       EVT MemVT, SDVTList VTs, SDValue Chain,
                                       SDValue Ptr, SDValue Cmp, SDValue Swp,
                                       MachineMemOperand *MMO) {
  assert(Opcode == ISD::ATOMIC_CMP_SWAP ||
         Opcode == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
  assert(Cmp.getValueType() == Swp.getValueType() && "Invalid Atomic Op Types");

  SDValue CmpSwapOps[] = {Chain, Ptr, Cmp, Swp};
  return getAtomic(Opcode, dl, MemVT, VTs, CmpSwapOps, MMO);
}
/// Build an atomic read-modify-write, swap, or store node taking one value
/// operand.
///
/// For ISD::ATOMIC_STORE the node produces only a chain (MVT::Other); for
/// every other accepted opcode it produces Val's type followed by a chain.
/// The operands are passed on to getAtomic in the order {Chain, Ptr, Val}.
SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
                                SDValue Chain, SDValue Ptr, SDValue Val,
                                MachineMemOperand *MMO) {
  assert((Opcode == ISD::ATOMIC_LOAD_ADD ||
          Opcode == ISD::ATOMIC_LOAD_SUB ||
          Opcode == ISD::ATOMIC_LOAD_AND ||
          Opcode == ISD::ATOMIC_LOAD_CLR ||
          Opcode == ISD::ATOMIC_LOAD_OR ||
          Opcode == ISD::ATOMIC_LOAD_XOR ||
          Opcode == ISD::ATOMIC_LOAD_NAND ||
          Opcode == ISD::ATOMIC_LOAD_MIN ||
          Opcode == ISD::ATOMIC_LOAD_MAX ||
          Opcode == ISD::ATOMIC_LOAD_UMIN ||
          Opcode == ISD::ATOMIC_LOAD_UMAX ||
          Opcode == ISD::ATOMIC_LOAD_FADD ||
          Opcode == ISD::ATOMIC_LOAD_FSUB ||
          Opcode == ISD::ATOMIC_SWAP ||
          Opcode == ISD::ATOMIC_STORE) &&
         "Invalid Atomic Op");

  SDValue AtomicOps[] = {Chain, Ptr, Val};

  // A store yields no value, only a chain.
  if (Opcode == ISD::ATOMIC_STORE)
    return getAtomic(Opcode, dl, MemVT, getVTList(MVT::Other), AtomicOps, MMO);

  return getAtomic(Opcode, dl, MemVT,
                   getVTList(Val.getValueType(), MVT::Other), AtomicOps, MMO);
}
/// Build an atomic load node.
///
/// Opcode must be ISD::ATOMIC_LOAD. The node produces a value of type VT
/// followed by a chain, and takes {Chain, Ptr} as operands.
SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
                                EVT VT, SDValue Chain, SDValue Ptr,
                                MachineMemOperand *MMO) {
  assert(Opcode == ISD::ATOMIC_LOAD && "Invalid Atomic Op");

  SDVTList ResultVTs = getVTList(VT, MVT::Other);
  SDValue LoadOps[] = {Chain, Ptr};
  return getAtomic(Opcode, dl, MemVT, ResultVTs, LoadOps, MMO);
}
/// getMergeValues - Create a MERGE_VALUES node from the given operands.
///
/// A single operand is returned as-is without creating a node. Otherwise the
/// node's result types are the value types of the operands, in order.
SDValue SelectionDAG::getMergeValues(ArrayRef<SDValue> Ops, const SDLoc &dl) {
  if (Ops.size() == 1)
    return Ops[0];

  SmallVector<EVT, 4> ResultVTs;
  ResultVTs.reserve(Ops.size());
  for (const SDValue &Op : Ops)
    ResultVTs.push_back(Op.getValueType());

  return getNode(ISD::MERGE_VALUES, dl, getVTList(ResultVTs), Ops);
}
/// Create a memory-intrinsic node, first building its MachineMemOperand from
/// the pointer info, flags, size and alignment.
///
/// An Align of 0 is replaced by getEVTAlignment(MemVT) and a Size of 0 by
/// MemVT's store size before the memory operand is created.
SDValue SelectionDAG::getMemIntrinsicNode(
    unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef<SDValue> Ops,
    EVT MemVT, MachinePointerInfo PtrInfo, unsigned Align,
    MachineMemOperand::Flags Flags, unsigned Size, const AAMDNodes &AAInfo) {
  // Ensure that codegen never sees alignment 0.
  const unsigned EffectiveAlign = Align != 0 ? Align : getEVTAlignment(MemVT);
  const unsigned EffectiveSize = Size ? Size : MemVT.getStoreSize();

  MachineMemOperand *MemOp = getMachineFunction().getMachineMemOperand(
      PtrInfo, Flags, EffectiveSize, EffectiveAlign, AAInfo);
  return getMemIntrinsicNode(Opcode, dl, VTList, Ops, MemVT, MemOp);
}
/// Create (or reuse) a MemIntrinsicSDNode for a memory-accessing opcode.
///
/// Nodes whose last result type is MVT::Glue are never memoized. For all
/// other nodes an equivalent existing node is reused, after refining its
/// alignment with MMO.
SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl,
                                          SDVTList VTList,
                                          ArrayRef<SDValue> Ops, EVT MemVT,
                                          MachineMemOperand *MMO) {
  assert((Opcode == ISD::INTRINSIC_VOID ||
          Opcode == ISD::INTRINSIC_W_CHAIN ||
          Opcode == ISD::PREFETCH ||
          Opcode == ISD::LIFETIME_START ||
          Opcode == ISD::LIFETIME_END ||
          ((int)Opcode <= std::numeric_limits<int>::max() &&
           (int)Opcode >= ISD::FIRST_TARGET_MEMORY_OPCODE)) &&
         "Opcode is not a memory-accessing opcode!");

  // Memoize the node unless it returns a flag.
  const bool Memoize = VTList.VTs[VTList.NumVTs - 1] != MVT::Glue;
  void *InsertPos = nullptr;
  if (Memoize) {
    FoldingSetNodeID Key;
    AddNodeIDNode(Key, Opcode, VTList, Ops);
    Key.AddInteger(getSyntheticNodeSubclassData<MemIntrinsicSDNode>(
        Opcode, dl.getIROrder(), VTList, MemVT, MMO));
    Key.AddInteger(MMO->getPointerInfo().getAddrSpace());

    SDNode *Existing = FindNodeOrInsertPos(Key, dl, InsertPos);
    if (Existing) {
      cast<MemIntrinsicSDNode>(Existing)->refineAlignment(MMO);
      return SDValue(Existing, 0);
    }
  }

  MemIntrinsicSDNode *Node = newSDNode<MemIntrinsicSDNode>(
      Opcode, dl.getIROrder(), dl.getDebugLoc(), VTList, MemVT, MMO);
  createOperands(Node, Ops);
  if (Memoize)
    CSEMap.InsertNode(Node, InsertPos);

  InsertNode(Node);
  return SDValue(Node, 0);
}
/// Create (or reuse) a LIFETIME_START / LIFETIME_END node for a frame index.
///
/// IsStart selects the opcode. The node produces only a chain and takes the
/// incoming Chain plus a frame-index operand. The CSE key additionally
/// includes FrameIndex, Size and Offset.
SDValue SelectionDAG::getLifetimeNode(bool IsStart, const SDLoc &dl,
                                      SDValue Chain, int FrameIndex,
                                      int64_t Size, int64_t Offset) {
  const unsigned Opcode = IsStart ? ISD::LIFETIME_START : ISD::LIFETIME_END;
  const auto VTs = getVTList(MVT::Other);

  SDValue FrameAddr = getFrameIndex(
      FrameIndex, getTargetLoweringInfo().getFrameIndexTy(getDataLayout()),
      true);
  SDValue Ops[2] = {Chain, FrameAddr};

  FoldingSetNodeID Key;
  AddNodeIDNode(Key, Opcode, VTs, Ops);
  Key.AddInteger(FrameIndex);
  Key.AddInteger(Size);
  Key.AddInteger(Offset);

  void *InsertPos = nullptr;
  SDNode *Existing = FindNodeOrInsertPos(Key, dl, InsertPos);
  if (Existing)
    return SDValue(Existing, 0);

  LifetimeSDNode *Node = newSDNode<LifetimeSDNode>(
      Opcode, dl.getIROrder(), dl.getDebugLoc(), VTs, Size, Offset);
  createOperands(Node, Ops);
  CSEMap.InsertNode(Node, InsertPos);
  InsertNode(Node);

  SDValue Result(Node, 0);
  NewSDValueDbgMsg(Result, "Creating new node: ", this);
  return Result;
}
/// InferPointerInfo - Derive a fixed-stack MachinePointerInfo from a pointer
/// that is either a bare frame index or an ADD of a frame index and a
/// constant. This lets callers of getLoad/getStore skip passing pointer info
/// for the common "FI+Cst" shape.
///
/// \p Offset is added on top of any constant found in the pointer expression.
/// If the pointer has neither recognized shape, \p Info is returned unchanged.
static MachinePointerInfo InferPointerInfo(const MachinePointerInfo &Info,
                                           SelectionDAG &DAG, SDValue Ptr,
                                           int64_t Offset = 0) {
  // Shape 1: FI, addressed at the caller-supplied offset.
  if (const auto *DirectFI = dyn_cast<FrameIndexSDNode>(Ptr))
    return MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
                                             DirectFI->getIndex(), Offset);

  // Shape 2: (FI + Cst), addressed at Cst plus the caller-supplied offset.
  if (Ptr.getOpcode() == ISD::ADD) {
    const auto *AddedCst = dyn_cast<ConstantSDNode>(Ptr.getOperand(1));
    const auto *BaseFI = dyn_cast<FrameIndexSDNode>(Ptr.getOperand(0));
    if (AddedCst && BaseFI)
      return MachinePointerInfo::getFixedStack(
          DAG.getMachineFunction(), BaseFI->getIndex(),
          Offset + AddedCst->getSExtValue());
  }

  // Anything else cannot be modeled here.
  return Info;
}
/// InferPointerInfo - Variant taking the offset as a DAG operand.
///
/// A constant \p OffsetOp is forwarded as a signed integer offset and an undef
/// \p OffsetOp is treated as no offset. For any other offset operand \p Info
/// is returned unchanged.
static MachinePointerInfo InferPointerInfo(const MachinePointerInfo &Info,
                                           SelectionDAG &DAG, SDValue Ptr,
                                           SDValue OffsetOp) {
  ConstantSDNode *KnownOffset = dyn_cast<ConstantSDNode>(OffsetOp);
  if (KnownOffset)
    return InferPointerInfo(Info, DAG, Ptr, KnownOffset->getSExtValue());

  if (OffsetOp.isUndef())
    return InferPointerInfo(Info, DAG, Ptr);

  // A non-constant, non-undef offset cannot be handled.
  return Info;
}
/// Create a load node, first building its MachineMemOperand from the pointer
/// info, flags, alignment and metadata.
///
/// An Alignment of 0 is replaced by getEVTAlignment(MemVT). MOLoad is added
/// to the flags, and MOStore must not be set. When PtrInfo has no underlying
/// value, InferPointerInfo is asked to derive one from Ptr and Offset.
SDValue SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
                              EVT VT, const SDLoc &dl, SDValue Chain,
                              SDValue Ptr, SDValue Offset,
                              MachinePointerInfo PtrInfo, EVT MemVT,
                              unsigned Alignment,
                              MachineMemOperand::Flags MMOFlags,
                              const AAMDNodes &AAInfo, const MDNode *Ranges) {
  assert(Chain.getValueType() == MVT::Other &&
         "Invalid chain type");

  // Ensure that codegen never sees alignment 0.
  if (Alignment == 0)
    Alignment = getEVTAlignment(MemVT);

  MMOFlags |= MachineMemOperand::MOLoad;
  assert((MMOFlags & MachineMemOperand::MOStore) == 0);

  // Callers may omit PtrInfo; infer the trivial frame-index case for them.
  if (PtrInfo.V.isNull())
    PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr, Offset);

  MachineMemOperand *MemOp = getMachineFunction().getMachineMemOperand(
      PtrInfo, MMOFlags, MemVT.getStoreSize(), Alignment, AAInfo, Ranges);
  return getLoad(AM, ExtType, VT, dl, Chain, Ptr, Offset, MemVT, MemOp);
}
/// Create (or reuse) a LoadSDNode with an explicit MachineMemOperand.
///
/// When VT equals MemVT the extension type is forced to ISD::NON_EXTLOAD.
/// Indexed loads produce {VT, pointer type, chain}; unindexed loads produce
/// {VT, chain} and must have an undef Offset. On a CSE hit the existing
/// node's alignment is refined with MMO and that node is returned.
SDValue SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
                              EVT VT, const SDLoc &dl, SDValue Chain,
                              SDValue Ptr, SDValue Offset, EVT MemVT,
                              MachineMemOperand *MMO) {
  if (VT == MemVT) {
    // Same type in memory and in register: nothing to extend.
    ExtType = ISD::NON_EXTLOAD;
  } else if (ExtType != ISD::NON_EXTLOAD) {
    // Extending load.
    assert(MemVT.getScalarType().bitsLT(VT.getScalarType()) &&
           "Should only be an extending load, not truncating!");
    assert(VT.isInteger() == MemVT.isInteger() &&
           "Cannot convert from FP to Int or Int -> FP!");
    assert(VT.isVector() == MemVT.isVector() &&
           "Cannot use an ext load to convert to or from a vector!");
    assert((!VT.isVector() ||
            VT.getVectorNumElements() == MemVT.getVectorNumElements()) &&
           "Cannot use an ext load to change the number of vector elements!");
  } else {
    assert(VT == MemVT && "Non-extending load from different memory type!");
  }

  bool Indexed = AM != ISD::UNINDEXED;
  assert((Indexed || Offset.isUndef()) && "Unindexed load with an offset!");

  SDVTList ResultVTs = Indexed ? getVTList(VT, Ptr.getValueType(), MVT::Other)
                               : getVTList(VT, MVT::Other);
  SDValue LoadOps[] = {Chain, Ptr, Offset};

  FoldingSetNodeID Key;
  AddNodeIDNode(Key, ISD::LOAD, ResultVTs, LoadOps);
  Key.AddInteger(MemVT.getRawBits());
  Key.AddInteger(getSyntheticNodeSubclassData<LoadSDNode>(
      dl.getIROrder(), ResultVTs, AM, ExtType, MemVT, MMO));
  Key.AddInteger(MMO->getPointerInfo().getAddrSpace());

  void *InsertPos = nullptr;
  SDNode *Existing = FindNodeOrInsertPos(Key, dl, InsertPos);
  if (Existing) {
    cast<LoadSDNode>(Existing)->refineAlignment(MMO);
    return SDValue(Existing, 0);
  }

  LoadSDNode *Node = newSDNode<LoadSDNode>(
      dl.getIROrder(), dl.getDebugLoc(), ResultVTs, AM, ExtType, MemVT, MMO);
  createOperands(Node, LoadOps);
  CSEMap.InsertNode(Node, InsertPos);
  InsertNode(Node);

  SDValue Result(Node, 0);
  NewSDValueDbgMsg(Result, "Creating new node: ", this);
  return Result;
}
/// Convenience overload: an unindexed, non-extending load of VT described by
/// pointer info. The offset operand is an undef of Ptr's value type and the
/// memory type is VT itself.
SDValue SelectionDAG::getLoad(EVT VT, const SDLoc &dl, SDValue Chain,
                              SDValue Ptr, MachinePointerInfo PtrInfo,
                              unsigned Alignment,
                              MachineMemOperand::Flags MMOFlags,
                              const AAMDNodes &AAInfo, const MDNode *Ranges) {
  SDValue NoOffset = getUNDEF(Ptr.getValueType());
  return getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr,
                 NoOffset, PtrInfo, VT, Alignment, MMOFlags, AAInfo, Ranges);
}
/// Convenience overload: an unindexed, non-extending load of VT described by
/// an existing MachineMemOperand. The offset operand is an undef of Ptr's
/// value type and the memory type is VT itself.
SDValue SelectionDAG::getLoad(EVT VT, const SDLoc &dl, SDValue Chain,
                              SDValue Ptr, MachineMemOperand *MMO) {
  SDValue NoOffset = getUNDEF(Ptr.getValueType());
  return getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr,
                 NoOffset, VT, MMO);
}
/// Convenience overload: an unindexed load with the given extension type,
/// described by pointer info. The offset operand is an undef of Ptr's value
/// type.
SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl,
                                 EVT VT, SDValue Chain, SDValue Ptr,
                                 MachinePointerInfo PtrInfo, EVT MemVT,
                                 unsigned Alignment,
                                 MachineMemOperand::Flags MMOFlags,
                                 const AAMDNodes &AAInfo) {
  SDValue NoOffset = getUNDEF(Ptr.getValueType());
  return getLoad(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, NoOffset,
                 PtrInfo, MemVT, Alignment, MMOFlags, AAInfo);
}
/// Convenience overload: an unindexed load with the given extension type,
/// described by an existing MachineMemOperand. The offset operand is an undef
/// of Ptr's value type.
SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl,
                                 EVT VT, SDValue Chain, SDValue Ptr, EVT MemVT,
                                 MachineMemOperand *MMO) {
  SDValue NoOffset = getUNDEF(Ptr.getValueType());
  return getLoad(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, NoOffset, MemVT,
                 MMO);
}
/// Create an indexed variant of an existing (unindexed) load.
///
/// The new load reuses the original's extension type, chain, pointer info,
/// memory VT, alignment and AA info, but uses the supplied Base, Offset and
/// addressing mode. The invariant and dereferenceable memory-operand flags of
/// the original are not carried over.
SDValue SelectionDAG::getIndexedLoad(SDValue OrigLoad, const SDLoc &dl,
                                     SDValue Base, SDValue Offset,
                                     ISD::MemIndexedMode AM) {
  LoadSDNode *LD = cast<LoadSDNode>(OrigLoad);
  assert(LD->getOffset().isUndef() && "Load is already a indexed load!");

  // Don't propagate the invariant or dereferenceable flags.
  const MachineMemOperand::Flags KeptFlags =
      LD->getMemOperand()->getFlags() &
      ~(MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);

  return getLoad(AM, LD->getExtensionType(), OrigLoad.getValueType(), dl,
                 LD->getChain(), Base, Offset, LD->getPointerInfo(),
                 LD->getMemoryVT(), LD->getAlignment(), KeptFlags,
                 LD->getAAInfo());
}
/// Create a store node, first building its MachineMemOperand from the pointer
/// info, flags and alignment.
///
/// An Alignment of 0 is replaced by getEVTAlignment of Val's type. MOStore is
/// added to the flags, and MOLoad must not be set. When PtrInfo has no
/// underlying value, InferPointerInfo is asked to derive one from Ptr.
SDValue SelectionDAG::getStore(SDValue Chain, const SDLoc &dl, SDValue Val,
                               SDValue Ptr, MachinePointerInfo PtrInfo,
                               unsigned Alignment,
                               MachineMemOperand::Flags MMOFlags,
                               const AAMDNodes &AAInfo) {
  assert(Chain.getValueType() == MVT::Other && "Invalid chain type");

  const EVT StoredVT = Val.getValueType();

  // Ensure that codegen never sees alignment 0.
  if (Alignment == 0)
    Alignment = getEVTAlignment(StoredVT);

  MMOFlags |= MachineMemOperand::MOStore;
  assert((MMOFlags & MachineMemOperand::MOLoad) == 0);

  if (PtrInfo.V.isNull())
    PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr);

  MachineMemOperand *MemOp = getMachineFunction().getMachineMemOperand(
      PtrInfo, MMOFlags, StoredVT.getStoreSize(), Alignment, AAInfo);
  return getStore(Chain, dl, Val, Ptr, MemOp);
}
/// Create (or reuse) an unindexed, non-truncating StoreSDNode with an
/// explicit MachineMemOperand.
///
/// The node produces only a chain and takes {Chain, Val, Ptr, undef offset}
/// as operands. On a CSE hit the existing node's alignment is refined with
/// MMO and that node is returned.
SDValue SelectionDAG::getStore(SDValue Chain, const SDLoc &dl, SDValue Val,
                               SDValue Ptr, MachineMemOperand *MMO) {
  assert(Chain.getValueType() == MVT::Other &&
         "Invalid chain type");

  EVT StoredVT = Val.getValueType();
  SDVTList ChainOnlyVTs = getVTList(MVT::Other);
  SDValue NoOffset = getUNDEF(Ptr.getValueType());
  SDValue StoreOps[] = {Chain, Val, Ptr, NoOffset};

  FoldingSetNodeID Key;
  AddNodeIDNode(Key, ISD::STORE, ChainOnlyVTs, StoreOps);
  Key.AddInteger(StoredVT.getRawBits());
  Key.AddInteger(getSyntheticNodeSubclassData<StoreSDNode>(
      dl.getIROrder(), ChainOnlyVTs, ISD::UNINDEXED, false, StoredVT, MMO));
  Key.AddInteger(MMO->getPointerInfo().getAddrSpace());

  void *InsertPos = nullptr;
  SDNode *Existing = FindNodeOrInsertPos(Key, dl, InsertPos);
  if (Existing) {
    cast<StoreSDNode>(Existing)->refineAlignment(MMO);
    return SDValue(Existing, 0);
  }

  StoreSDNode *Node =
      newSDNode<StoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), ChainOnlyVTs,
                             ISD::UNINDEXED, false, StoredVT, MMO);
  createOperands(Node, StoreOps);
  CSEMap.InsertNode(Node, InsertPos);
  InsertNode(Node);

  SDValue Result(Node, 0);
  NewSDValueDbgMsg(Result, "Creating new node: ", this);
  return Result;
}
/// Create a truncating store, first building its MachineMemOperand from the
/// pointer info, flags and alignment.
///
/// SVT is the type stored to memory. An Alignment of 0 is replaced by
/// getEVTAlignment(SVT). MOStore is added to the flags, and MOLoad must not
/// be set. When PtrInfo has no underlying value, InferPointerInfo is asked to
/// derive one from Ptr.
SDValue SelectionDAG::getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val,
                                    SDValue Ptr, MachinePointerInfo PtrInfo,
                                    EVT SVT, unsigned Alignment,
                                    MachineMemOperand::Flags MMOFlags,
                                    const AAMDNodes &AAInfo) {
  assert(Chain.getValueType() == MVT::Other &&
         "Invalid chain type");

  // Ensure that codegen never sees alignment 0.
  if (Alignment == 0)
    Alignment = getEVTAlignment(SVT);

  MMOFlags |= MachineMemOperand::MOStore;
  assert((MMOFlags & MachineMemOperand::MOLoad) == 0);

  if (PtrInfo.V.isNull())
    PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr);

  MachineMemOperand *MemOp = getMachineFunction().getMachineMemOperand(
      PtrInfo, MMOFlags, SVT.getStoreSize(), Alignment, AAInfo);
  return getTruncStore(Chain, dl, Val, Ptr, SVT, MemOp);
}
/// Create (or reuse) an unindexed truncating StoreSDNode with an explicit
/// MachineMemOperand.
///
/// If Val's type already equals SVT this degenerates to a plain getStore.
/// Otherwise SVT must be a narrower type of the same kind (integer/FP,
/// scalar/vector, same element count). On a CSE hit the existing node's
/// alignment is refined with MMO and that node is returned.
SDValue SelectionDAG::getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val,
                                    SDValue Ptr, EVT SVT,
                                    MachineMemOperand *MMO) {
  EVT VT = Val.getValueType();

  assert(Chain.getValueType() == MVT::Other &&
         "Invalid chain type");

  // No truncation needed: emit an ordinary store.
  if (VT == SVT)
    return getStore(Chain, dl, Val, Ptr, MMO);

  assert(SVT.getScalarType().bitsLT(VT.getScalarType()) &&
         "Should only be a truncating store, not extending!");
  assert(VT.isInteger() == SVT.isInteger() &&
         "Can't do FP-INT conversion!");
  assert(VT.isVector() == SVT.isVector() &&
         "Cannot use trunc store to convert to or from a vector!");
  assert((!VT.isVector() ||
          VT.getVectorNumElements() == SVT.getVectorNumElements()) &&
         "Cannot use trunc store to change the number of vector elements!");

  SDVTList ChainOnlyVTs = getVTList(MVT::Other);
  SDValue NoOffset = getUNDEF(Ptr.getValueType());
  SDValue StoreOps[] = {Chain, Val, Ptr, NoOffset};

  FoldingSetNodeID Key;
  AddNodeIDNode(Key, ISD::STORE, ChainOnlyVTs, StoreOps);
  Key.AddInteger(SVT.getRawBits());
  Key.AddInteger(getSyntheticNodeSubclassData<StoreSDNode>(
      dl.getIROrder(), ChainOnlyVTs, ISD::UNINDEXED, true, SVT, MMO));
  Key.AddInteger(MMO->getPointerInfo().getAddrSpace());

  void *InsertPos = nullptr;
  SDNode *Existing = FindNodeOrInsertPos(Key, dl, InsertPos);
  if (Existing) {
    cast<StoreSDNode>(Existing)->refineAlignment(MMO);
    return SDValue(Existing, 0);
  }

  StoreSDNode *Node =
      newSDNode<StoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), ChainOnlyVTs,
                             ISD::UNINDEXED, true, SVT, MMO);
  createOperands(Node, StoreOps);
  CSEMap.InsertNode(Node, InsertPos);
  InsertNode(Node);

  SDValue Result(Node, 0);
  NewSDValueDbgMsg(Result, "Creating new node: ", this);
  return Result;
}
/// Create (or reuse) an indexed variant of an existing (unindexed) store.
///
/// The new store produces {Base's type, chain} and takes the original chain
/// and value plus the supplied Base and Offset. Its truncating flag, memory
/// VT and memory operand are copied from the original store. The CSE key is
/// built from the original store's memory VT, raw subclass data and address
/// space.
SDValue SelectionDAG::getIndexedStore(SDValue OrigStore, const SDLoc &dl,
                                      SDValue Base, SDValue Offset,
                                      ISD::MemIndexedMode AM) {
  StoreSDNode *ST = cast<StoreSDNode>(OrigStore);
  assert(ST->getOffset().isUndef() && "Store is already a indexed store!");

  SDVTList ResultVTs = getVTList(Base.getValueType(), MVT::Other);
  SDValue StoreOps[] = {ST->getChain(), ST->getValue(), Base, Offset};

  FoldingSetNodeID Key;
  AddNodeIDNode(Key, ISD::STORE, ResultVTs, StoreOps);
  Key.AddInteger(ST->getMemoryVT().getRawBits());
  Key.AddInteger(ST->getRawSubclassData());
  Key.AddInteger(ST->getPointerInfo().getAddrSpace());

  void *InsertPos = nullptr;
  SDNode *Existing = FindNodeOrInsertPos(Key, dl, InsertPos);
  if (Existing)
    return SDValue(Existing, 0);

  StoreSDNode *Node = newSDNode<StoreSDNode>(
      dl.getIROrder(), dl.getDebugLoc(), ResultVTs, AM,
      ST->isTruncatingStore(), ST->getMemoryVT(), ST->getMemOperand());
  createOperands(Node, StoreOps);
  CSEMap.InsertNode(Node, InsertPos);
  InsertNode(Node);

  SDValue Result(Node, 0);
  NewSDValueDbgMsg(Result, "Creating new node: ", this);
  return Result;
}
/// Create (or reuse) a masked load (ISD::MLOAD) node.
///
/// The node produces {VT, chain} and takes {Chain, Ptr, Mask, PassThru} as
/// operands. On a CSE hit the existing node's alignment is refined with MMO
/// and that node is returned.
SDValue SelectionDAG::getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain,
                                    SDValue Ptr, SDValue Mask, SDValue PassThru,
                                    EVT MemVT, MachineMemOperand *MMO,
                                    ISD::LoadExtType ExtTy, bool isExpanding) {
  SDVTList ResultVTs = getVTList(VT, MVT::Other);
  SDValue LoadOps[] = {Chain, Ptr, Mask, PassThru};

  FoldingSetNodeID Key;
  AddNodeIDNode(Key, ISD::MLOAD, ResultVTs, LoadOps);
  Key.AddInteger(MemVT.getRawBits());
  Key.AddInteger(getSyntheticNodeSubclassData<MaskedLoadSDNode>(
      dl.getIROrder(), ResultVTs, ExtTy, isExpanding, MemVT, MMO));
  Key.AddInteger(MMO->getPointerInfo().getAddrSpace());

  void *InsertPos = nullptr;
  SDNode *Existing = FindNodeOrInsertPos(Key, dl, InsertPos);
  if (Existing) {
    cast<MaskedLoadSDNode>(Existing)->refineAlignment(MMO);
    return SDValue(Existing, 0);
  }

  MaskedLoadSDNode *Node = newSDNode<MaskedLoadSDNode>(
      dl.getIROrder(), dl.getDebugLoc(), ResultVTs, ExtTy, isExpanding, MemVT,
      MMO);
  createOperands(Node, LoadOps);
  CSEMap.InsertNode(Node, InsertPos);
  InsertNode(Node);

  SDValue Result(Node, 0);
  NewSDValueDbgMsg(Result, "Creating new node: ", this);
  return Result;
}
/// Create (or reuse) a masked store (ISD::MSTORE) node.
///
/// The node produces only a chain and takes {Chain, Val, Ptr, Mask} as
/// operands. On a CSE hit the existing node's alignment is refined with MMO
/// and that node is returned.
SDValue SelectionDAG::getMaskedStore(SDValue Chain, const SDLoc &dl,
                                     SDValue Val, SDValue Ptr, SDValue Mask,
                                     EVT MemVT, MachineMemOperand *MMO,
                                     bool IsTruncating, bool IsCompressing) {
  assert(Chain.getValueType() == MVT::Other &&
         "Invalid chain type");

  SDVTList ChainOnlyVTs = getVTList(MVT::Other);
  SDValue StoreOps[] = {Chain, Val, Ptr, Mask};

  FoldingSetNodeID Key;
  AddNodeIDNode(Key, ISD::MSTORE, ChainOnlyVTs, StoreOps);
  Key.AddInteger(MemVT.getRawBits());
  Key.AddInteger(getSyntheticNodeSubclassData<MaskedStoreSDNode>(
      dl.getIROrder(), ChainOnlyVTs, IsTruncating, IsCompressing, MemVT, MMO));
  Key.AddInteger(MMO->getPointerInfo().getAddrSpace());

  void *InsertPos = nullptr;
  SDNode *Existing = FindNodeOrInsertPos(Key, dl, InsertPos);
  if (Existing) {
    cast<MaskedStoreSDNode>(Existing)->refineAlignment(MMO);
    return SDValue(Existing, 0);
  }

  MaskedStoreSDNode *Node = newSDNode<MaskedStoreSDNode>(
      dl.getIROrder(), dl.getDebugLoc(), ChainOnlyVTs, IsTruncating,
      IsCompressing, MemVT, MMO);
  createOperands(Node, StoreOps);
  CSEMap.InsertNode(Node, InsertPos);
  InsertNode(Node);

  SDValue Result(Node, 0);
  NewSDValueDbgMsg(Result, "Creating new node: ", this);
  return Result;
}
/// Create (or reuse) a masked gather (ISD::MGATHER) node.
///
/// Exactly six operands are required. On a CSE hit the existing node's
/// alignment is refined with MMO and that node is returned. A newly built
/// node is checked (in asserts builds) for a pass-through type equal to the
/// result type, matching mask/data element counts, an index vector at least
/// as wide as the data, and a constant power-of-2 scale.
SDValue SelectionDAG::getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl,
                                      ArrayRef<SDValue> Ops,
                                      MachineMemOperand *MMO) {
  assert(Ops.size() == 6 && "Incompatible number of operands");

  FoldingSetNodeID Key;
  AddNodeIDNode(Key, ISD::MGATHER, VTs, Ops);
  Key.AddInteger(VT.getRawBits());
  Key.AddInteger(getSyntheticNodeSubclassData<MaskedGatherSDNode>(
      dl.getIROrder(), VTs, VT, MMO));
  Key.AddInteger(MMO->getPointerInfo().getAddrSpace());

  void *InsertPos = nullptr;
  SDNode *Existing = FindNodeOrInsertPos(Key, dl, InsertPos);
  if (Existing) {
    cast<MaskedGatherSDNode>(Existing)->refineAlignment(MMO);
    return SDValue(Existing, 0);
  }

  MaskedGatherSDNode *N = newSDNode<MaskedGatherSDNode>(
      dl.getIROrder(), dl.getDebugLoc(), VTs, VT, MMO);
  createOperands(N, Ops);

  // Sanity-check the freshly attached operands.
  assert(N->getPassThru().getValueType() == N->getValueType(0) &&
         "Incompatible type of the PassThru value in MaskedGatherSDNode");
  assert(N->getMask().getValueType().getVectorNumElements() ==
             N->getValueType(0).getVectorNumElements() &&
         "Vector width mismatch between mask and data");
  assert(N->getIndex().getValueType().getVectorNumElements() >=
             N->getValueType(0).getVectorNumElements() &&
         "Vector width mismatch between index and data");
  assert(isa<ConstantSDNode>(N->getScale()) &&
         cast<ConstantSDNode>(N->getScale())->getAPIntValue().isPowerOf2() &&
         "Scale should be a constant power of 2");

  CSEMap.InsertNode(N, InsertPos);
  InsertNode(N);

  SDValue Result(N, 0);
  NewSDValueDbgMsg(Result, "Creating new node: ", this);
  return Result;
}
/// Create (or reuse) a masked scatter (ISD::MSCATTER) node.
///
/// Exactly six operands are required. On a CSE hit the existing node's
/// alignment is refined with MMO and that node is returned. A newly built
/// node is checked (in asserts builds) for matching mask/data element counts,
/// an index vector at least as wide as the data, and a constant power-of-2
/// scale.
SDValue SelectionDAG::getMaskedScatter(SDVTList VTs, EVT VT, const SDLoc &dl,
                                       ArrayRef<SDValue> Ops,
                                       MachineMemOperand *MMO) {
  assert(Ops.size() == 6 && "Incompatible number of operands");

  FoldingSetNodeID Key;
  AddNodeIDNode(Key, ISD::MSCATTER, VTs, Ops);
  Key.AddInteger(VT.getRawBits());
  Key.AddInteger(getSyntheticNodeSubclassData<MaskedScatterSDNode>(
      dl.getIROrder(), VTs, VT, MMO));
  Key.AddInteger(MMO->getPointerInfo().getAddrSpace());

  void *InsertPos = nullptr;
  SDNode *Existing = FindNodeOrInsertPos(Key, dl, InsertPos);
  if (Existing) {
    cast<MaskedScatterSDNode>(Existing)->refineAlignment(MMO);
    return SDValue(Existing, 0);
  }

  MaskedScatterSDNode *N = newSDNode<MaskedScatterSDNode>(
      dl.getIROrder(), dl.getDebugLoc(), VTs, VT, MMO);
  createOperands(N, Ops);

  // Sanity-check the freshly attached operands.
  assert(N->getMask().getValueType().getVectorNumElements() ==
             N->getValue().getValueType().getVectorNumElements() &&
         "Vector width mismatch between mask and data");
  assert(N->getIndex().getValueType().getVectorNumElements() >=
             N->getValue().getValueType().getVectorNumElements() &&
         "Vector width mismatch between index and data");
  assert(isa<ConstantSDNode>(N->getScale()) &&
         cast<ConstantSDNode>(N->getScale())->getAPIntValue().isPowerOf2() &&
         "Scale should be a constant power of 2");

  CSEMap.InsertNode(N, InsertPos);
  InsertNode(N);

  SDValue Result(N, 0);
  NewSDValueDbgMsg(Result, "Creating new node: ", this);
  return Result;
}
/// Try to fold a select of T/F on Cond to one of its arms.
///
/// \returns the chosen arm, or an empty SDValue when no fold applies.
SDValue SelectionDAG::simplifySelect(SDValue Cond, SDValue T, SDValue F) {
  // select undef, T, F --> T (if T is a constant), otherwise F
  if (Cond.isUndef()) {
    if (isConstantValueOfAnyType(T))
      return T;
    return F;
  }

  // select ?, undef, F --> F
  if (T.isUndef())
    return F;

  // select ?, T, undef --> T
  if (F.isUndef())
    return T;

  // select true, T, F --> T
  // select false, T, F --> F
  if (auto *KnownCond = dyn_cast<ConstantSDNode>(Cond)) {
    if (KnownCond->isNullValue())
      return F;
    return T;
  }

  // TODO: This should simplify VSELECT with constant condition using something
  // like this (but check boolean contents to be complete?):
  //  if (ISD::isBuildVectorAllOnes(Cond.getNode()))
  //    return T;
  //  if (ISD::isBuildVectorAllZeros(Cond.getNode()))
  //    return F;

  // select ?, T, T --> T
  return T == F ? T : SDValue();
}
/// Try to fold a shift of X by the amount Y.
///
/// \returns the folded value, or an empty SDValue when no fold applies.
SDValue SelectionDAG::simplifyShift(SDValue X, SDValue Y) {
  // shift undef, Y --> 0 (can always assume that the undef value is 0)
  if (X.isUndef())
    return getConstant(0, SDLoc(X.getNode()), X.getValueType());

  // shift X, undef --> undef (because it may shift by the bitwidth)
  if (Y.isUndef())
    return getUNDEF(X.getValueType());

  // shift 0, Y --> 0
  if (isNullOrNullSplat(X))
    return X;

  // shift X, 0 --> X
  if (isNullOrNullSplat(Y))
    return X;

  // shift X, C >= bitwidth(X) --> undef
  // All vector elements must be too big (or undef) to avoid partial undefs.
  auto AmountOutOfRange = [X](ConstantSDNode *Amt) {
    if (!Amt)
      return true;
    return Amt->getAPIntValue().uge(X.getScalarValueSizeInBits());
  };
  if (ISD::matchUnaryPredicate(Y, AmountOutOfRange, true))
    return getUNDEF(X.getValueType());

  return SDValue();
}
// TODO: Use fast-math-flags to enable more simplifications.
/// Try to fold a floating-point binary operation whose right-hand operand is
/// a constant or constant splat (undef elements allowed).
///
/// \returns X when the operation is an identity on X, or an empty SDValue
/// when no fold applies.
SDValue SelectionDAG::simplifyFPBinop(unsigned Opcode, SDValue X, SDValue Y) {
  ConstantFPSDNode *RHSConst =
      isConstOrConstSplatFP(Y, /* AllowUndefs */ true);
  if (!RHSConst)
    return SDValue();

  switch (Opcode) {
  case ISD::FADD:
    // X + -0.0 --> X
    if (RHSConst->getValueAPF().isNegZero())
      return X;
    break;
  case ISD::FSUB:
    // X - +0.0 --> X
    if (RHSConst->getValueAPF().isPosZero())
      return X;
    break;
  case ISD::FMUL:
  case ISD::FDIV:
    // X * 1.0 --> X
    // X / 1.0 --> X
    if (RHSConst->getValueAPF().isExactlyValue(1.0))
      return X;
    break;
  default:
    break;
  }

  return SDValue();
}
/// Create an ISD::VAARG node producing {VT, chain}.
///
/// The operands are {Chain, Ptr, SV, Align}, with Align materialized as an
/// i32 target constant.
SDValue SelectionDAG::getVAArg(EVT VT, const SDLoc &dl, SDValue Chain,
                               SDValue Ptr, SDValue SV, unsigned Align) {
  SDValue AlignOp = getTargetConstant(Align, dl, MVT::i32);
  SDValue VAArgOps[] = {Chain, Ptr, SV, AlignOp};
  return getNode(ISD::VAARG, dl, getVTList(VT, MVT::Other), VAArgOps);
}
/// Create a single-result node from an array of SDUse operands.
///
/// Up to three operands are dispatched to the fixed-arity getNode overloads.
/// Larger operand lists are copied into an SDValue array and handed to the
/// array-based getNode.
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
                              ArrayRef<SDUse> Ops) {
  const size_t OperandCount = Ops.size();
  if (OperandCount == 0)
    return getNode(Opcode, DL, VT);
  if (OperandCount == 1)
    return getNode(Opcode, DL, VT, static_cast<const SDValue>(Ops[0]));
  if (OperandCount == 2)
    return getNode(Opcode, DL, VT, Ops[0], Ops[1]);
  if (OperandCount == 3)
    return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]);

  // Copy from an SDUse array into an SDValue array for use with
  // the regular getNode logic.
  SmallVector<SDValue, 8> ConvertedOps(Ops.begin(), Ops.end());
  return getNode(Opcode, DL, VT, ConvertedOps);
}
/// Create (or reuse) a single-result node from an array of operands.
///
/// Up to three operands are dispatched to the fixed-arity getNode overloads,
/// which also receive Flags. For larger operand lists, BUILD_VECTOR and
/// CONCAT_VECTORS are first offered to their folding helpers, and SELECT_CC /
/// BR_CC operand shapes are asserted. The node is memoized unless VT is
/// MVT::Glue. Flags is not consulted on this path.
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
                              ArrayRef<SDValue> Ops, const SDNodeFlags Flags) {
  unsigned NumOps = Ops.size();
  if (NumOps == 0)
    return getNode(Opcode, DL, VT);
  if (NumOps == 1)
    return getNode(Opcode, DL, VT, Ops[0], Flags);
  if (NumOps == 2)
    return getNode(Opcode, DL, VT, Ops[0], Ops[1], Flags);
  if (NumOps == 3)
    return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2], Flags);

  switch (Opcode) {
  case ISD::BUILD_VECTOR:
    // Attempt to simplify BUILD_VECTOR.
    if (SDValue Folded = FoldBUILD_VECTOR(DL, VT, Ops, *this))
      return Folded;
    break;
  case ISD::CONCAT_VECTORS:
    if (SDValue Folded = foldCONCAT_VECTORS(DL, VT, Ops, *this))
      return Folded;
    break;
  case ISD::SELECT_CC:
    assert(NumOps == 5 && "SELECT_CC takes 5 operands!");
    assert(Ops[0].getValueType() == Ops[1].getValueType() &&
           "LHS and RHS of condition must have same type!");
    assert(Ops[2].getValueType() == Ops[3].getValueType() &&
           "True and False arms of SelectCC must have same type!");
    assert(Ops[2].getValueType() == VT &&
           "select_cc node must be of same type as true and false value!");
    break;
  case ISD::BR_CC:
    assert(NumOps == 5 && "BR_CC takes 5 operands!");
    assert(Ops[2].getValueType() == Ops[3].getValueType() &&
           "LHS/RHS of comparison should match types!");
    break;
  default:
    break;
  }

  // Memoize nodes, except those that produce glue.
  SDVTList ResultVTs = getVTList(VT);
  const bool Memoize = VT != MVT::Glue;
  void *InsertPos = nullptr;
  if (Memoize) {
    FoldingSetNodeID Key;
    AddNodeIDNode(Key, Opcode, ResultVTs, Ops);
    if (SDNode *Existing = FindNodeOrInsertPos(Key, DL, InsertPos))
      return SDValue(Existing, 0);
  }

  SDNode *Node =
      newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), ResultVTs);
  createOperands(Node, Ops);
  if (Memoize)
    CSEMap.InsertNode(Node, InsertPos);

  InsertNode(Node);
  SDValue Result(Node, 0);
  NewSDValueDbgMsg(Result, "Creating new node: ", this);
  return Result;
}
/// getNode overload taking the result types as an array of EVTs; the array is
/// converted to an SDVTList and forwarded to the SDVTList overload.
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL,
                              ArrayRef<EVT> ResultTys, ArrayRef<SDValue> Ops) {
  SDVTList ResultVTs = getVTList(ResultTys);
  return getNode(Opcode, DL, ResultVTs, Ops);
}
/// Build (or reuse) a node with the result types described by VTList and an
/// arbitrary operand list.
///
/// A single-entry VTList is forwarded to the single-result overload. Otherwise
/// the node is looked up in / inserted into the CSE map unless its last result
/// type is MVT::Glue, in which case a fresh node is always created.
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
ArrayRef<SDValue> Ops) {
if (VTList.NumVTs == 1)
return getNode(Opcode, DL, VTList.VTs[0], Ops);
// The region below is compiled out; it is kept only for the FIXME it carries.
#if 0
switch (Opcode) {
// FIXME: figure out how to safely handle things like
// int foo(int x) { return 1 << (x & 255); }
// int bar() { return foo(256); }
case ISD::SRA_PARTS:
case ISD::SRL_PARTS:
case ISD::SHL_PARTS:
if (N3.getOpcode() == ISD::SIGN_EXTEND_INREG &&
cast<VTSDNode>(N3.getOperand(1))->getVT() != MVT::i1)
return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0));
else if (N3.getOpcode() == ISD::AND)
if (ConstantSDNode *AndRHS = dyn_cast<ConstantSDNode>(N3.getOperand(1))) {
// If the and is only masking out bits that cannot effect the shift,
// eliminate the and.
unsigned NumBits = VT.getScalarSizeInBits()*2;
if ((AndRHS->getValue() & (NumBits-1)) == NumBits-1)
return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0));
}
break;
}
#endif
// Memoize the node unless it returns a flag.
SDNode *N;
if (VTList.VTs[VTList.NumVTs-1] != MVT::Glue) {
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opcode, VTList, Ops);
void *IP = nullptr;
// Reuse an existing identical node when there is one.
if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
return SDValue(E, 0);
N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList);
createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
} else {
// Glue-producing node: created without consulting the CSE map.
N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList);
createOperands(N, Ops);
}
InsertNode(N);
SDValue V(N, 0);
NewSDValueDbgMsg(V, "Creating new node: ", this);
return V;
}
/// getNode overload for a node with the given result list and no operands.
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL,
                              SDVTList VTList) {
  ArrayRef<SDValue> NoOperands;
  return getNode(Opcode, DL, VTList, NoOperands);
}
/// getNode overload for a node with the given result list and one operand.
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
                              SDValue N1) {
  // Pack the operand so the array-based overload can be reused.
  SDValue Operands[] = {N1};
  return getNode(Opcode, DL, VTList, Operands);
}
/// getNode overload for a node with the given result list and two operands.
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
                              SDValue N1, SDValue N2) {
  // Pack the operands so the array-based overload can be reused.
  SDValue Operands[] = {N1, N2};
  return getNode(Opcode, DL, VTList, Operands);
}
/// getNode overload for a node with the given result list and three operands.
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
                              SDValue N1, SDValue N2, SDValue N3) {
  // Pack the operands so the array-based overload can be reused.
  SDValue Operands[] = {N1, N2, N3};
  return getNode(Opcode, DL, VTList, Operands);
}
/// getNode overload for a node with the given result list and four operands.
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
                              SDValue N1, SDValue N2, SDValue N3, SDValue N4) {
  // Pack the operands so the array-based overload can be reused.
  SDValue Operands[] = {N1, N2, N3, N4};
  return getNode(Opcode, DL, VTList, Operands);
}
/// getNode overload for a node with the given result list and five operands.
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
                              SDValue N1, SDValue N2, SDValue N3, SDValue N4,
                              SDValue N5) {
  // Pack the operands so the array-based overload can be reused.
  SDValue Operands[] = {N1, N2, N3, N4, N5};
  return getNode(Opcode, DL, VTList, Operands);
}
/// Return a one-entry SDVTList for VT, backed by the storage that
/// SDNode::getValueTypeList hands out for that type.
SDVTList SelectionDAG::getVTList(EVT VT) {
  auto *SingleVT = SDNode::getValueTypeList(VT);
  return makeVTList(SingleVT, 1);
}
/// Return the uniqued two-entry SDVTList {VT1, VT2}, creating and caching it
/// in VTListMap on first use.
SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2) {
  // The lookup key is the element count followed by each type's raw bits.
  FoldingSetNodeID Key;
  Key.AddInteger(2U);
  Key.AddInteger(VT1.getRawBits());
  Key.AddInteger(VT2.getRawBits());

  void *InsertPos = nullptr;
  if (SDVTListNode *Cached = VTListMap.FindNodeOrInsertPos(Key, InsertPos))
    return Cached->getSDVTList();

  // Not cached yet: allocate the type array and the list node, then record it.
  EVT *Storage = Allocator.Allocate<EVT>(2);
  Storage[0] = VT1;
  Storage[1] = VT2;
  SDVTListNode *Created =
      new (Allocator) SDVTListNode(Key.Intern(Allocator), Storage, 2);
  VTListMap.InsertNode(Created, InsertPos);
  return Created->getSDVTList();
}
/// Return the uniqued three-entry SDVTList {VT1, VT2, VT3}, creating and
/// caching it in VTListMap on first use.
SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3) {
  // The lookup key is the element count followed by each type's raw bits.
  FoldingSetNodeID Key;
  Key.AddInteger(3U);
  Key.AddInteger(VT1.getRawBits());
  Key.AddInteger(VT2.getRawBits());
  Key.AddInteger(VT3.getRawBits());

  void *InsertPos = nullptr;
  if (SDVTListNode *Cached = VTListMap.FindNodeOrInsertPos(Key, InsertPos))
    return Cached->getSDVTList();

  // Not cached yet: allocate the type array and the list node, then record it.
  EVT *Storage = Allocator.Allocate<EVT>(3);
  Storage[0] = VT1;
  Storage[1] = VT2;
  Storage[2] = VT3;
  SDVTListNode *Created =
      new (Allocator) SDVTListNode(Key.Intern(Allocator), Storage, 3);
  VTListMap.InsertNode(Created, InsertPos);
  return Created->getSDVTList();
}
/// Return the uniqued four-entry SDVTList {VT1, VT2, VT3, VT4}, creating and
/// caching it in VTListMap on first use.
SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3, EVT VT4) {
  // The lookup key is the element count followed by each type's raw bits.
  FoldingSetNodeID Key;
  Key.AddInteger(4U);
  Key.AddInteger(VT1.getRawBits());
  Key.AddInteger(VT2.getRawBits());
  Key.AddInteger(VT3.getRawBits());
  Key.AddInteger(VT4.getRawBits());

  void *InsertPos = nullptr;
  if (SDVTListNode *Cached = VTListMap.FindNodeOrInsertPos(Key, InsertPos))
    return Cached->getSDVTList();

  // Not cached yet: allocate the type array and the list node, then record it.
  EVT *Storage = Allocator.Allocate<EVT>(4);
  Storage[0] = VT1;
  Storage[1] = VT2;
  Storage[2] = VT3;
  Storage[3] = VT4;
  SDVTListNode *Created =
      new (Allocator) SDVTListNode(Key.Intern(Allocator), Storage, 4);
  VTListMap.InsertNode(Created, InsertPos);
  return Created->getSDVTList();
}
/// Return the uniqued SDVTList holding the types in VTs, creating and caching
/// it in VTListMap on first use.
SDVTList SelectionDAG::getVTList(ArrayRef<EVT> VTs) {
  unsigned NumVTs = VTs.size();

  // The lookup key is the element count followed by each type's raw bits.
  FoldingSetNodeID Key;
  Key.AddInteger(NumVTs);
  for (unsigned Idx = 0; Idx != NumVTs; ++Idx)
    Key.AddInteger(VTs[Idx].getRawBits());

  void *InsertPos = nullptr;
  if (SDVTListNode *Cached = VTListMap.FindNodeOrInsertPos(Key, InsertPos))
    return Cached->getSDVTList();

  // Not cached yet: copy the types into allocator-owned storage and record
  // the new list node.
  EVT *Storage = Allocator.Allocate<EVT>(NumVTs);
  llvm::copy(VTs, Storage);
  SDVTListNode *Created =
      new (Allocator) SDVTListNode(Key.Intern(Allocator), Storage, NumVTs);
  VTListMap.InsertNode(Created, InsertPos);
  return Created->getSDVTList();
}
/// UpdateNodeOperands - *Mutate* the given single-operand node in place so
/// that it uses Op. When an equivalent node is already present in the DAG, N
/// is left untouched and the pre-existing node is returned; otherwise N itself
/// is returned. Passing the operand N already has is a no-op that returns N.
SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op) {
  assert(N->getNumOperands() == 1 && "Update with wrong number of operands");

  // Same operand as before: nothing to update.
  if (Op == N->getOperand(0))
    return N;

  // Prefer a node that already looks like the updated N.
  void *CSESlot = nullptr;
  SDNode *Duplicate = FindModifiedNodeSlot(N, Op, CSESlot);
  if (Duplicate)
    return Duplicate;

  // Pull N out of the CSE maps before touching it. If it was not in them,
  // forget the slot so it is not inserted afterwards.
  if (CSESlot && !RemoveNodeFromCSEMaps(N))
    CSESlot = nullptr;

  // Install the new operand.
  N->OperandList[0].set(Op);
  updateDivergence(N);

  // Re-register the mutated node if it belongs in the CSE map.
  if (CSESlot)
    CSEMap.InsertNode(N, CSESlot);
  return N;
}
/// Two-operand form of UpdateNodeOperands: mutate N in place to use Op1 and
/// Op2, or return an already existing equivalent node without modifying N.
SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2) {
  assert(N->getNumOperands() == 2 && "Update with wrong number of operands");

  // Both operands unchanged: return the input node as is.
  const bool SameOperands =
      Op1 == N->getOperand(0) && Op2 == N->getOperand(1);
  if (SameOperands)
    return N;

  // Prefer a node that already looks like the updated N.
  void *CSESlot = nullptr;
  SDNode *Duplicate = FindModifiedNodeSlot(N, Op1, Op2, CSESlot);
  if (Duplicate)
    return Duplicate;

  // Pull N out of the CSE maps before touching it. If it was not in them,
  // forget the slot so it is not inserted afterwards.
  if (CSESlot && !RemoveNodeFromCSEMaps(N))
    CSESlot = nullptr;

  // Rewrite only the operands that actually differ.
  if (N->OperandList[0] != Op1)
    N->OperandList[0].set(Op1);
  if (N->OperandList[1] != Op2)
    N->OperandList[1].set(Op2);
  updateDivergence(N);

  // Re-register the mutated node if it belongs in the CSE map.
  if (CSESlot)
    CSEMap.InsertNode(N, CSESlot);
  return N;
}
/// Three-operand convenience form; forwards to the array-based overload.
SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2,
                                         SDValue Op3) {
  SDValue NewOperands[] = {Op1, Op2, Op3};
  return UpdateNodeOperands(N, NewOperands);
}
/// Four-operand convenience form; forwards to the array-based overload.
SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2,
                                         SDValue Op3, SDValue Op4) {
  SDValue NewOperands[] = {Op1, Op2, Op3, Op4};
  return UpdateNodeOperands(N, NewOperands);
}
/// Five-operand convenience form; forwards to the array-based overload.
SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2,
                                         SDValue Op3, SDValue Op4,
                                         SDValue Op5) {
  SDValue NewOperands[] = {Op1, Op2, Op3, Op4, Op5};
  return UpdateNodeOperands(N, NewOperands);
}
/// Array form of UpdateNodeOperands: mutate N in place to use Ops, or return
/// an already existing equivalent node without modifying N. Ops must contain
/// exactly as many entries as N has operands.
SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, ArrayRef<SDValue> Ops) {
  unsigned NumOps = Ops.size();
  assert(N->getNumOperands() == NumOps &&
         "Update with wrong number of operands");

  // If no operands changed just return the input node.
  if (std::equal(Ops.begin(), Ops.end(), N->op_begin()))
    return N;

  // Prefer a node that already looks like the updated N.
  void *CSESlot = nullptr;
  SDNode *Duplicate = FindModifiedNodeSlot(N, Ops, CSESlot);
  if (Duplicate)
    return Duplicate;

  // Pull N out of the CSE maps before touching it. If it was not in them,
  // forget the slot so it is not inserted afterwards.
  if (CSESlot && !RemoveNodeFromCSEMaps(N))
    CSESlot = nullptr;

  // Rewrite only the operands that actually differ.
  for (unsigned Idx = 0; Idx < NumOps; ++Idx) {
    if (N->OperandList[Idx] != Ops[Idx])
      N->OperandList[Idx].set(Ops[Idx]);
  }
  updateDivergence(N);

  // Re-register the mutated node if it belongs in the CSE map.
  if (CSESlot)
    CSEMap.InsertNode(N, CSESlot);
  return N;
}
/// DropOperands - Release the operands and set this node to have
/// zero operands.
void SDNode::DropOperands() {
// Unlike the code in MorphNodeTo that does this, we don't need to
// watch for dead nodes here.
for (op_iterator I = op_begin(), E = op_end(); I != E; ) {
SDUse &Use = *I++;
Use.set(SDValue());
}
}
/// Replace the memory references recorded on machine node N with NewMemRefs.
/// An empty list clears the node's references, a single reference is stored
/// directly, and longer lists are copied into allocator-owned storage.
void SelectionDAG::setNodeMemRefs(MachineSDNode *N,
                                  ArrayRef<MachineMemOperand *> NewMemRefs) {
  const auto NumRefs = NewMemRefs.size();

  if (NumRefs == 0) {
    N->clearMemRefs();
    return;
  }

  // Check if we can avoid allocating by storing a single reference directly.
  if (NumRefs == 1) {
    N->MemRefs = NewMemRefs[0];
    N->NumMemRefs = 1;
    return;
  }

  // General case: copy the pointers into a buffer from the DAG's allocator.
  MachineMemOperand **Buffer =
      Allocator.Allocate<MachineMemOperand *>(NumRefs);
  llvm::copy(NewMemRefs, Buffer);
  N->MemRefs = Buffer;
  N->NumMemRefs = static_cast<int>(NumRefs);
}
/// SelectNodeTo - These are wrappers around MorphNodeTo that accept a
/// machine opcode.
///
/// This form: one result type, no operands.
SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
                                   EVT VT) {
  return SelectNodeTo(N, MachineOpc, getVTList(VT), None);
}
/// SelectNodeTo form: one result type, one operand.
SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
                                   EVT VT, SDValue Op1) {
  SDValue Operands[] = {Op1};
  return SelectNodeTo(N, MachineOpc, getVTList(VT), Operands);
}
/// SelectNodeTo form: one result type, two operands.
SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
                                   EVT VT, SDValue Op1,
                                   SDValue Op2) {
  SDValue Operands[] = {Op1, Op2};
  return SelectNodeTo(N, MachineOpc, getVTList(VT), Operands);
}
/// SelectNodeTo form: one result type, three operands.
SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
                                   EVT VT, SDValue Op1,
                                   SDValue Op2, SDValue Op3) {
  SDValue Operands[] = {Op1, Op2, Op3};
  return SelectNodeTo(N, MachineOpc, getVTList(VT), Operands);
}
/// SelectNodeTo form: one result type, operands given as an array.
SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
                                   EVT VT, ArrayRef<SDValue> Ops) {
  return SelectNodeTo(N, MachineOpc, getVTList(VT), Ops);
}
/// SelectNodeTo form: two result types, operands given as an array.
SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
                                   EVT VT1, EVT VT2, ArrayRef<SDValue> Ops) {
  return SelectNodeTo(N, MachineOpc, getVTList(VT1, VT2), Ops);
}
/// SelectNodeTo form: two result types, no operands.
SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
                                   EVT VT1, EVT VT2) {
  return SelectNodeTo(N, MachineOpc, getVTList(VT1, VT2), None);
}
/// SelectNodeTo form: three result types, operands given as an array.
SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
                                   EVT VT1, EVT VT2, EVT VT3,
                                   ArrayRef<SDValue> Ops) {
  return SelectNodeTo(N, MachineOpc, getVTList(VT1, VT2, VT3), Ops);
}
/// SelectNodeTo form: two result types, two operands.
SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
                                   EVT VT1, EVT VT2,
                                   SDValue Op1, SDValue Op2) {
  SDValue Operands[] = {Op1, Op2};
  return SelectNodeTo(N, MachineOpc, getVTList(VT1, VT2), Operands);
}
/// Core SelectNodeTo: morph N into a node with opcode ~MachineOpc, result
/// types VTs and operands Ops. When MorphNodeTo hands back a different,
/// already existing node, all uses of N are redirected to it and N is removed.
/// The returned node always has its node id reset to -1.
SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
                                   SDVTList VTs, ArrayRef<SDValue> Ops) {
  SDNode *Selected = MorphNodeTo(N, ~MachineOpc, VTs, Ops);
  // Reset the NodeID to -1.
  Selected->setNodeId(-1);

  // Morphed in place: nothing else to fix up.
  if (Selected == N)
    return Selected;

  // An equivalent node already existed; switch users over and drop N.
  ReplaceAllUsesWith(N, Selected);
  RemoveDeadNode(N);
  return Selected;
}
/// UpdateSDLocOnMergeSDNode - When compiling at -O0, discard the line number
/// information of a merged node whose debug location differs from OLoc's,
/// since a single location cannot describe an operation that belongs to
/// several lines. This makes the debugger work better at -O0, where there is
/// a higher probability of having other instructions associated with that
/// line.
///
/// For IROrder, we keep the smaller of the two.
SDNode *SelectionDAG::UpdateSDLocOnMergeSDNode(SDNode *N, const SDLoc &OLoc) {
  DebugLoc NodeLoc = N->getDebugLoc();
  const bool IsUnoptimized = OptLevel == CodeGenOpt::None;
  if (NodeLoc && IsUnoptimized && OLoc.getDebugLoc() != NodeLoc)
    N->setDebugLoc(DebugLoc());

  N->setIROrder(std::min(N->getIROrder(), OLoc.getIROrder()));
  return N;
}
/// MorphNodeTo - This *mutates* the specified node to have the specified
/// return type, opcode, and operands.
///
/// Note that MorphNodeTo returns the resultant node. If there is already a
/// node of the specified opcode and operands, it returns that node instead of
/// the current one. Note that the SDLoc need not be the same.
///
/// Using MorphNodeTo is faster than creating a new node and swapping it in
/// with ReplaceAllUsesWith both because it often avoids allocating a new
/// node, and because it doesn't require CSE recalculation for any of
/// the node's users.
///
/// However, note that MorphNodeTo recursively deletes dead nodes from the DAG.
/// As a consequence it isn't appropriate to use from within the DAG combiner or
/// the legalizer which maintain worklists that would need to be updated when
/// deleting things.
SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
SDVTList VTs, ArrayRef<SDValue> Ops) {
// If an identical node already exists, use it.
// The lookup is skipped when the last result type is glue.
void *IP = nullptr;
if (VTs.VTs[VTs.NumVTs-1] != MVT::Glue) {
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opc, VTs, Ops);
if (SDNode *ON = FindNodeOrInsertPos(ID, SDLoc(N), IP))
return UpdateSDLocOnMergeSDNode(ON, SDLoc(N));
}
// N was not in the CSE maps, so it must not be inserted at the end either.
if (!RemoveNodeFromCSEMaps(N))
IP = nullptr;
// Start the morphing.
N->NodeType = Opc;
N->ValueList = VTs.VTs;
N->NumValues = VTs.NumVTs;
// Clear the operands list, updating used nodes to remove this from their
// use list. Keep track of any operands that become dead as a result.
SmallPtrSet<SDNode*, 16> DeadNodeSet;
for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ) {
SDUse &Use = *I++;
SDNode *Used = Use.getNode();
Use.set(SDValue());
if (Used->use_empty())
DeadNodeSet.insert(Used);
}
// For MachineNode, initialize the memory references information.
if (MachineSDNode *MN = dyn_cast<MachineSDNode>(N))
MN->clearMemRefs();
// Swap for an appropriately sized array from the recycler.
removeOperands(N);
createOperands(N, Ops);
// Delete any nodes that are still dead after adding the uses for the
// new operands.
if (!DeadNodeSet.empty()) {
SmallVector<SDNode *, 16> DeadNodes;
// NOTE: the loop variable below shadows the parameter N within this loop.
for (SDNode *N : DeadNodeSet)
if (N->use_empty())
DeadNodes.push_back(N);
RemoveDeadNodes(DeadNodes);
}
if (IP)
CSEMap.InsertNode(N, IP); // Memoize the new node.
return N;
}
/// Turn a STRICT_* floating-point node into its non-strict counterpart.
///
/// The node is taken out of the chain (users of its chain result are pointed
/// at its input chain), and it is then morphed into the plain opcode with the
/// chain operand dropped and only its first result type kept. Returns the
/// resulting node, which may be a pre-existing equivalent node.
SDNode* SelectionDAG::mutateStrictFPToFP(SDNode *Node) {
  const unsigned StrictOpc = Node->getOpcode();
  unsigned PlainOpc;
  switch (StrictOpc) {
  case ISD::STRICT_FADD:       PlainOpc = ISD::FADD;       break;
  case ISD::STRICT_FSUB:       PlainOpc = ISD::FSUB;       break;
  case ISD::STRICT_FMUL:       PlainOpc = ISD::FMUL;       break;
  case ISD::STRICT_FDIV:       PlainOpc = ISD::FDIV;       break;
  case ISD::STRICT_FREM:       PlainOpc = ISD::FREM;       break;
  case ISD::STRICT_FMA:        PlainOpc = ISD::FMA;        break;
  case ISD::STRICT_FSQRT:      PlainOpc = ISD::FSQRT;      break;
  case ISD::STRICT_FPOW:       PlainOpc = ISD::FPOW;       break;
  case ISD::STRICT_FPOWI:      PlainOpc = ISD::FPOWI;      break;
  case ISD::STRICT_FSIN:       PlainOpc = ISD::FSIN;       break;
  case ISD::STRICT_FCOS:       PlainOpc = ISD::FCOS;       break;
  case ISD::STRICT_FEXP:       PlainOpc = ISD::FEXP;       break;
  case ISD::STRICT_FEXP2:      PlainOpc = ISD::FEXP2;      break;
  case ISD::STRICT_FLOG:       PlainOpc = ISD::FLOG;       break;
  case ISD::STRICT_FLOG10:     PlainOpc = ISD::FLOG10;     break;
  case ISD::STRICT_FLOG2:      PlainOpc = ISD::FLOG2;      break;
  case ISD::STRICT_FRINT:      PlainOpc = ISD::FRINT;      break;
  case ISD::STRICT_FNEARBYINT: PlainOpc = ISD::FNEARBYINT; break;
  case ISD::STRICT_FMAXNUM:    PlainOpc = ISD::FMAXNUM;    break;
  case ISD::STRICT_FMINNUM:    PlainOpc = ISD::FMINNUM;    break;
  case ISD::STRICT_FCEIL:      PlainOpc = ISD::FCEIL;      break;
  case ISD::STRICT_FFLOOR:     PlainOpc = ISD::FFLOOR;     break;
  case ISD::STRICT_FROUND:     PlainOpc = ISD::FROUND;     break;
  case ISD::STRICT_FTRUNC:     PlainOpc = ISD::FTRUNC;     break;
  case ISD::STRICT_FP_ROUND:   PlainOpc = ISD::FP_ROUND;   break;
  case ISD::STRICT_FP_EXTEND:  PlainOpc = ISD::FP_EXTEND;  break;
  default:
    llvm_unreachable("mutateStrictFPToFP called with unexpected opcode!");
  }

  assert(Node->getNumValues() == 2 && "Unexpected number of results!");

  // We're taking this node out of the chain, so we need to re-link things.
  SDValue ChainIn = Node->getOperand(0);
  SDValue ChainOut = SDValue(Node, 1);
  ReplaceAllUsesOfValueWith(ChainOut, ChainIn);

  // Gather every operand except the leading chain.
  SmallVector<SDValue, 3> ValueOps;
  for (unsigned Idx = 1, NumOps = Node->getNumOperands(); Idx != NumOps; ++Idx)
    ValueOps.push_back(Node->getOperand(Idx));

  SDVTList ResultVTs = getVTList(Node->getValueType(0));
  SDNode *Morphed = MorphNodeTo(Node, PlainOpc, ResultVTs, ValueOps);

  // MorphNodeTo can operate in two ways: if an existing node with the
  // specified operands exists, it can just return it. Otherwise, it
  // updates the node in place to have the requested operands.
  if (Morphed != Node) {
    ReplaceAllUsesWith(Node, Morphed);
    RemoveDeadNode(Node);
  } else {
    // If we updated the node in place, reset the node ID. To the isel,
    // this should be just like a newly allocated machine node.
    Morphed->setNodeId(-1);
  }
  return Morphed;
}
/// getMachineNode - These are used by target selectors to create a new node
/// with the specified return type(s), MachineInstr opcode, and operands.
///
/// Note that getMachineNode returns the resultant node. If there is already a
/// node of the specified opcode and operands, it returns that node instead of
/// the current one.
///
/// This form: one result type, no operands.
MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
                                            EVT VT) {
  return getMachineNode(Opcode, dl, getVTList(VT), None);
}
/// getMachineNode form: one result type, one operand.
MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
                                            EVT VT, SDValue Op1) {
  SDValue Operands[] = {Op1};
  return getMachineNode(Opcode, dl, getVTList(VT), Operands);
}
/// getMachineNode form: one result type, two operands.
MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
                                            EVT VT, SDValue Op1, SDValue Op2) {
  SDValue Operands[] = {Op1, Op2};
  return getMachineNode(Opcode, dl, getVTList(VT), Operands);
}
/// getMachineNode form: one result type, three operands.
MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
                                            EVT VT, SDValue Op1, SDValue Op2,
                                            SDValue Op3) {
  SDValue Operands[] = {Op1, Op2, Op3};
  return getMachineNode(Opcode, dl, getVTList(VT), Operands);
}
/// getMachineNode form: one result type, operands given as an array.
MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
                                            EVT VT, ArrayRef<SDValue> Ops) {
  return getMachineNode(Opcode, dl, getVTList(VT), Ops);
}
/// getMachineNode form: two result types, two operands.
MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
                                            EVT VT1, EVT VT2, SDValue Op1,
                                            SDValue Op2) {
  SDValue Operands[] = {Op1, Op2};
  return getMachineNode(Opcode, dl, getVTList(VT1, VT2), Operands);
}
/// getMachineNode form: two result types, three operands.
MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
                                            EVT VT1, EVT VT2, SDValue Op1,
                                            SDValue Op2, SDValue Op3) {
  SDValue Operands[] = {Op1, Op2, Op3};
  return getMachineNode(Opcode, dl, getVTList(VT1, VT2), Operands);
}
/// getMachineNode form: two result types, operands given as an array.
MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
                                            EVT VT1, EVT VT2,
                                            ArrayRef<SDValue> Ops) {
  return getMachineNode(Opcode, dl, getVTList(VT1, VT2), Ops);
}
/// getMachineNode form: three result types, two operands.
MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
                                            EVT VT1, EVT VT2, EVT VT3,
                                            SDValue Op1, SDValue Op2) {
  SDValue Operands[] = {Op1, Op2};
  return getMachineNode(Opcode, dl, getVTList(VT1, VT2, VT3), Operands);
}
/// getMachineNode form: three result types, three operands.
MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
                                            EVT VT1, EVT VT2, EVT VT3,
                                            SDValue Op1, SDValue Op2,
                                            SDValue Op3) {
  SDValue Operands[] = {Op1, Op2, Op3};
  return getMachineNode(Opcode, dl, getVTList(VT1, VT2, VT3), Operands);
}
/// getMachineNode form: three result types, operands given as an array.
MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
                                            EVT VT1, EVT VT2, EVT VT3,
                                            ArrayRef<SDValue> Ops) {
  return getMachineNode(Opcode, dl, getVTList(VT1, VT2, VT3), Ops);
}
/// getMachineNode form: result types and operands both given as arrays.
MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
                                            ArrayRef<EVT> ResultTys,
                                            ArrayRef<SDValue> Ops) {
  return getMachineNode(Opcode, dl, getVTList(ResultTys), Ops);
}
/// Core getMachineNode: return a MachineSDNode with opcode ~Opcode, result
/// types VTs and operands Ops. Unless the last result type is glue, an
/// existing identical node is reused (after merging DL into its location via
/// UpdateSDLocOnMergeSDNode) and newly created nodes are entered into the CSE
/// map.
MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &DL,
                                            SDVTList VTs,
                                            ArrayRef<SDValue> Ops) {
  const bool UseCSE = VTs.VTs[VTs.NumVTs - 1] != MVT::Glue;
  void *InsertPos = nullptr;

  if (UseCSE) {
    FoldingSetNodeID ID;
    AddNodeIDNode(ID, ~Opcode, VTs, Ops);
    if (SDNode *Existing = FindNodeOrInsertPos(ID, DL, InsertPos))
      return cast<MachineSDNode>(UpdateSDLocOnMergeSDNode(Existing, DL));
  }

  // Allocate a new MachineSDNode.
  MachineSDNode *Created = newSDNode<MachineSDNode>(
      ~Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
  createOperands(Created, Ops);

  if (UseCSE)
    CSEMap.InsertNode(Created, InsertPos);

  InsertNode(Created);
  return Created;
}
/// getTargetExtractSubreg - Convenience helper that builds a
/// TargetOpcode::EXTRACT_SUBREG machine node from Operand and the
/// subregister index SRIdx (passed as an i32 target constant), and returns
/// its first result.
SDValue SelectionDAG::getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT,
                                             SDValue Operand) {
  SDValue IndexOp = getTargetConstant(SRIdx, DL, MVT::i32);
  SDNode *ExtractNode =
      getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT, Operand, IndexOp);
  return SDValue(ExtractNode, 0);
}
/// getTargetInsertSubreg - Convenience helper that builds a
/// TargetOpcode::INSERT_SUBREG machine node from Operand, Subreg and the
/// subregister index SRIdx (passed as an i32 target constant), and returns
/// its first result.
SDValue SelectionDAG::getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT,
                                            SDValue Operand, SDValue Subreg) {
  SDValue IndexOp = getTargetConstant(SRIdx, DL, MVT::i32);
  SDNode *InsertNode = getMachineNode(TargetOpcode::INSERT_SUBREG, DL, VT,
                                      Operand, Subreg, IndexOp);
  return SDValue(InsertNode, 0);
}
/// getNodeIfExists - Look up a node with the given opcode, result types and
/// operands and return it when it is already available; return null
/// otherwise. A node that is found has its flags intersected with Flags. No
/// lookup is attempted when the last result type is glue.
SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList,
                                      ArrayRef<SDValue> Ops,
                                      const SDNodeFlags Flags) {
  if (VTList.VTs[VTList.NumVTs - 1] == MVT::Glue)
    return nullptr;

  FoldingSetNodeID ID;
  AddNodeIDNode(ID, Opcode, VTList, Ops);
  void *InsertPos = nullptr;
  SDNode *Existing = FindNodeOrInsertPos(ID, SDLoc(), InsertPos);
  if (!Existing)
    return nullptr;

  Existing->intersectFlagsWith(Flags);
  return Existing;
}
/// getDbgValue - Creates an SDDbgValue that refers to result R of SDNode N.
///
/// The object is allocated from DbgInfo's allocator. Asserts that Var, viewed
/// as a DILocalVariable, accepts DL as a valid location.
SDDbgValue *SelectionDAG::getDbgValue(DIVariable *Var, DIExpression *Expr,
                                      SDNode *N, unsigned R, bool IsIndirect,
                                      const DebugLoc &DL, unsigned O) {
  assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
         "Expected inlined-at fields to agree");
  auto &DbgAllocator = DbgInfo->getAlloc();
  return new (DbgAllocator) SDDbgValue(Var, Expr, N, R, IsIndirect, DL, O);
}
/// Creates an SDDbgValue that refers to the constant value C.
///
/// The object is allocated from DbgInfo's allocator. Asserts that Var, viewed
/// as a DILocalVariable, accepts DL as a valid location.
SDDbgValue *SelectionDAG::getConstantDbgValue(DIVariable *Var,
                                              DIExpression *Expr,
                                              const Value *C,
                                              const DebugLoc &DL, unsigned O) {
  assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
         "Expected inlined-at fields to agree");
  auto &DbgAllocator = DbgInfo->getAlloc();
  return new (DbgAllocator) SDDbgValue(Var, Expr, C, DL, O);
}
/// Creates an SDDbgValue of kind SDDbgValue::FRAMEIX for frame index FI.
///
/// The object is allocated from DbgInfo's allocator. Asserts that Var, viewed
/// as a DILocalVariable, accepts DL as a valid location.
SDDbgValue *SelectionDAG::getFrameIndexDbgValue(DIVariable *Var,
                                                DIExpression *Expr, unsigned FI,
                                                bool IsIndirect,
                                                const DebugLoc &DL,
                                                unsigned O) {
  assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
         "Expected inlined-at fields to agree");
  auto &DbgAllocator = DbgInfo->getAlloc();
  return new (DbgAllocator)
      SDDbgValue(Var, Expr, FI, IsIndirect, DL, O, SDDbgValue::FRAMEIX);
}
/// Creates an SDDbgValue of kind SDDbgValue::VREG for virtual register VReg.
///
/// The object is allocated from DbgInfo's allocator. Asserts that Var, viewed
/// as a DILocalVariable, accepts DL as a valid location.
SDDbgValue *SelectionDAG::getVRegDbgValue(DIVariable *Var,
                                          DIExpression *Expr,
                                          unsigned VReg, bool IsIndirect,
                                          const DebugLoc &DL, unsigned O) {
  assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
         "Expected inlined-at fields to agree");
  auto &DbgAllocator = DbgInfo->getAlloc();
  return new (DbgAllocator)
      SDDbgValue(Var, Expr, VReg, IsIndirect, DL, O, SDDbgValue::VREG);
}
/// Clone the debug values attached to the value From onto the value To.
///
/// Only debug values of kind SDDbgValue::SDNODE that are not invalidated and
/// that refer to From's result number are considered. When SizeInBits is
/// non-zero, each clone's expression is narrowed to the fragment
/// [OffsetInBits, OffsetInBits + SizeInBits); debug values for which that
/// fragment cannot be formed are skipped. When InvalidateDbg is set, each
/// original that was cloned is marked invalidated and emitted.
///
/// Does nothing when From and To are the same value or share a node, or when
/// From's node carries no debug values.
void SelectionDAG::transferDbgValues(SDValue From, SDValue To,
unsigned OffsetInBits, unsigned SizeInBits,
bool InvalidateDbg) {
SDNode *FromNode = From.getNode();
SDNode *ToNode = To.getNode();
assert(FromNode && ToNode && "Can't modify dbg values");
// PR35338
// TODO: assert(From != To && "Redundant dbg value transfer");
// TODO: assert(FromNode != ToNode && "Intranode dbg value transfer");
if (From == To || FromNode == ToNode)
return;
if (!FromNode->getHasDebugValue())
return;
// Clones are collected first and attached after the loop over FromNode's
// debug values has finished.
SmallVector<SDDbgValue *, 2> ClonedDVs;
for (SDDbgValue *Dbg : GetDbgValues(FromNode)) {
if (Dbg->getKind() != SDDbgValue::SDNODE || Dbg->isInvalidated())
continue;
// TODO: assert(!Dbg->isInvalidated() && "Transfer of invalid dbg value");
// Just transfer the dbg value attached to From.
if (Dbg->getResNo() != From.getResNo())
continue;
DIVariable *Var = Dbg->getVariable();
auto *Expr = Dbg->getExpression();
// If a fragment is requested, update the expression.
if (SizeInBits) {
// When splitting a larger (e.g., sign-extended) value whose
// lower bits are described with an SDDbgValue, do not attempt
// to transfer the SDDbgValue to the upper bits.
if (auto FI = Expr->getFragmentInfo())
if (OffsetInBits + SizeInBits > FI->SizeInBits)
continue;
auto Fragment = DIExpression::createFragmentExpression(Expr, OffsetInBits,
SizeInBits);
// Skip this debug value if no fragment expression could be created.
if (!Fragment)
continue;
Expr = *Fragment;
}
// Clone the SDDbgValue and move it to To.
SDDbgValue *Clone =
getDbgValue(Var, Expr, ToNode, To.getResNo(), Dbg->isIndirect(),
Dbg->getDebugLoc(), Dbg->getOrder());
ClonedDVs.push_back(Clone);
if (InvalidateDbg) {
// Invalidate value and indicate the SDDbgValue should not be emitted.
Dbg->setIsInvalidated();
Dbg->setIsEmitted();
}
}
// Attach every clone to the destination node.
for (SDDbgValue *Dbg : ClonedDVs)
AddDbgValue(Dbg, ToNode, false);
}
/// Try to keep the debug values attached to N alive by re-expressing them in
/// terms of one of N's operands.
///
/// Only ISD::ADD is handled: when the second operand is a constant integer
/// (or constant integer build vector) and the first operand is not, each
/// non-invalidated debug value of N is cloned onto the first operand with the
/// constant folded into its DIExpression, and the original is marked
/// invalidated and emitted. All other opcodes are left untouched.
void SelectionDAG::salvageDebugInfo(SDNode &N) {
if (!N.getHasDebugValue())
return;
// Clones are collected first and attached after the loop over N's debug
// values has finished.
SmallVector<SDDbgValue *, 2> ClonedDVs;
for (auto DV : GetDbgValues(&N)) {
if (DV->isInvalidated())
continue;
switch (N.getOpcode()) {
default:
break;
case ISD::ADD:
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
if (!isConstantIntBuildVectorOrConstantInt(N0) &&
isConstantIntBuildVectorOrConstantInt(N1)) {
uint64_t Offset = N.getConstantOperandVal(1);
// Rewrite an ADD constant node into a DIExpression. Since we are
// performing arithmetic to compute the variable's *value* in the
// DIExpression, we need to mark the expression with a
// DW_OP_stack_value.
auto *DIExpr = DV->getExpression();
DIExpr =
DIExpression::prepend(DIExpr, DIExpression::StackValue, Offset);
SDDbgValue *Clone =
getDbgValue(DV->getVariable(), DIExpr, N0.getNode(), N0.getResNo(),
DV->isIndirect(), DV->getDebugLoc(), DV->getOrder());
ClonedDVs.push_back(Clone);
// The original debug value is replaced by the clone.
DV->setIsInvalidated();
DV->setIsEmitted();
LLVM_DEBUG(dbgs() << "SALVAGE: Rewriting";
N0.getNode()->dumprFull(this);
dbgs() << " into " << *DIExpr << '\n');
}
}
}
// Attach each clone to the node it was created for.
for (SDDbgValue *Dbg : ClonedDVs)
AddDbgValue(Dbg, Dbg->getSDNode(), false);
}
/// Creates an SDDbgLabel for Label, allocated from DbgInfo's allocator.
/// Asserts that Label accepts DL as a valid location.
SDDbgLabel *SelectionDAG::getDbgLabel(DILabel *Label,
                                      const DebugLoc &DL, unsigned O) {
  assert(cast<DILabel>(Label)->isValidLocationForIntrinsic(DL) &&
         "Expected inlined-at fields to agree");
  auto &DbgAllocator = DbgInfo->getAlloc();
  return new (DbgAllocator) SDDbgLabel(Label, DL, O);
}
namespace {
/// RAUWUpdateListener - Helper for ReplaceAllUsesWith - When the node
/// pointed to by a use iterator is deleted, increment the use iterator
/// so that it doesn't dangle.
///
class RAUWUpdateListener : public SelectionDAG::DAGUpdateListener {
SDNode::use_iterator &UI;
SDNode::use_iterator &UE;
void NodeDeleted(SDNode *N, SDNode *E) override {
// Increment the iterator as needed.
while (UI != UE && N == *UI)
++UI;
}
public:
RAUWUpdateListener(SelectionDAG &d,
SDNode::use_iterator &ui,
SDNode::use_iterator &ue)
: SelectionDAG::DAGUpdateListener(d), UI(ui), UE(ue) {}
};
} // end anonymous namespace
/// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead.
/// This can cause recursive merging of nodes in the DAG.
///
/// This version assumes From has a single result value.
///
/// Debug values attached to FromN are transferred to To before any use is
/// rewritten, and the DAG root is updated if FromN was the root.
void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To) {
SDNode *From = FromN.getNode();
assert(From->getNumValues() == 1 && FromN.getResNo() == 0 &&
"Cannot replace with this method!");
assert(From != To.getNode() && "Cannot replace uses of with self");
// Preserve Debug Values
transferDbgValues(FromN, To);
// Iterate over all the existing uses of From. New uses will be added
// to the beginning of the use list, which we avoid visiting.
// This specifically avoids visiting uses of From that arise while the
// replacement is happening, because any such uses would be the result
// of CSE: If an existing node looks like From after one of its operands
// is replaced by To, we don't want to replace of all its users with To
// too. See PR3018 for more info.
SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
// The listener advances UI if the node it currently points at is deleted.
RAUWUpdateListener Listener(*this, UI, UE);
while (UI != UE) {
SDNode *User = *UI;
// This node is about to morph, remove its old self from the CSE maps.
RemoveNodeFromCSEMaps(User);
// A user can appear in a use list multiple times, and when this
// happens the uses are usually next to each other in the list.
// To help reduce the number of CSE recomputations, process all
// the uses of this user that we can find this way.
do {
SDUse &Use = UI.getUse();
// Advance before rewriting the use that UI currently refers to.
++UI;
Use.set(To);
// Recompute the user's divergence only when the replacement differs from
// From in that respect.
if (To->isDivergent() != From->isDivergent())
updateDivergence(User);
} while (UI != UE && *UI == User);
// Now that we have modified User, add it back to the CSE maps. If it
// already exists there, recursively merge the results together.
AddModifiedNodeToCSEMaps(User);
}
// If we just RAUW'd the root, take note.
if (FromN == getRoot())
setRoot(To);
}
/// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead.
/// This can cause recursive merging of nodes in the DAG.
///
/// This version assumes that for each value of From, there is a
/// corresponding value in To in the same position with the same type.
///
/// Debug values are transferred for every result of From that has a use, and
/// the DAG root is updated (keeping its result number) if From was the root
/// node. Calling this with From == To is a no-op.
void SelectionDAG::ReplaceAllUsesWith(SDNode *From, SDNode *To) {
#ifndef NDEBUG
// Every used result of From must have the same type as the matching result
// of To.
for (unsigned i = 0, e = From->getNumValues(); i != e; ++i)
assert((!From->hasAnyUseOfValue(i) ||
From->getValueType(i) == To->getValueType(i)) &&
"Cannot use this version of ReplaceAllUsesWith!");
#endif
// Handle the trivial case.
if (From == To)
return;
// Preserve Debug Info. Only do this if there's a use.
for (unsigned i = 0, e = From->getNumValues(); i != e; ++i)
if (From->hasAnyUseOfValue(i)) {
assert((i < To->getNumValues()) && "Invalid To location");
transferDbgValues(SDValue(From, i), SDValue(To, i));
}
// Iterate over just the existing users of From. See the comments in
// the ReplaceAllUsesWith above.
SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
// The listener advances UI if the node it currently points at is deleted.
RAUWUpdateListener Listener(*this, UI, UE);
while (UI != UE) {
SDNode *User = *UI;
// This node is about to morph, remove its old self from the CSE maps.
RemoveNodeFromCSEMaps(User);
// A user can appear in a use list multiple times, and when this
// happens the uses are usually next to each other in the list.
// To help reduce the number of CSE recomputations, process all
// the uses of this user that we can find this way.
do {
SDUse &Use = UI.getUse();
// Advance before rewriting the use that UI currently refers to.
++UI;
// Only the node is swapped here, via setNode.
Use.setNode(To);
if (To->isDivergent() != From->isDivergent())
updateDivergence(User);
} while (UI != UE && *UI == User);
// Now that we have modified User, add it back to the CSE maps. If it
// already exists there, recursively merge the results together.
AddModifiedNodeToCSEMaps(User);
}
// If we just RAUW'd the root, take note.
if (From == getRoot().getNode())
setRoot(SDValue(To, getRoot().getResNo()));
}
/// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead.
/// This can cause recursive merging of nodes in the DAG.
///
/// This version can replace From with any result values. To must match the
/// number and types of values returned by From.
///
/// To is treated as an array indexed by From's result numbers: a use of
/// result R of From is rewritten to To[R].
void SelectionDAG::ReplaceAllUsesWith(SDNode *From, const SDValue *To) {
if (From->getNumValues() == 1) // Handle the simple case efficiently.
return ReplaceAllUsesWith(SDValue(From, 0), To[0]);
// Preserve Debug Info.
for (unsigned i = 0, e = From->getNumValues(); i != e; ++i)
transferDbgValues(SDValue(From, i), To[i]);
// Iterate over just the existing users of From. See the comments in
// the ReplaceAllUsesWith above.
SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
// NOTE(review): RAUWUpdateListener is defined elsewhere; it receives UI/UE by
// reference, presumably to keep them valid across node deletions -- confirm.
RAUWUpdateListener Listener(*this, UI, UE);
while (UI != UE) {
SDNode *User = *UI;
// This node is about to morph, remove its old self from the CSE maps.
RemoveNodeFromCSEMaps(User);
// A user can appear in a use list multiple times, and when this happens the
// uses are usually next to each other in the list. To help reduce the
// number of CSE and divergence recomputations, process all the uses of this
// user that we can find this way.
// Accumulates whether any replacement value written into User is divergent.
bool To_IsDivergent = false;
do {
SDUse &Use = UI.getUse();
// Pick the replacement that corresponds to the result number being used.
const SDValue &ToOp = To[Use.getResNo()];
// Step the iterator past this use before the use itself is rewritten.
++UI;
Use.set(ToOp);
To_IsDivergent |= ToOp->isDivergent();
} while (UI != UE && *UI == User);
// Divergence is refreshed once per user, after its whole batch of uses.
if (To_IsDivergent != From->isDivergent())
updateDivergence(User);
// Now that we have modified User, add it back to the CSE maps. If it
// already exists there, recursively merge the results together.
AddModifiedNodeToCSEMaps(User);
}
// If we just RAUW'd the root, take note.
if (From == getRoot().getNode())
setRoot(SDValue(To[getRoot().getResNo()]));
}
/// ReplaceAllUsesOfValueWith - Replace any uses of From with To, leaving
/// uses of other values produced by From.getNode() alone. The Deleted
/// vector is handled the same way as for ReplaceAllUsesWith.
/// NOTE(review): this signature takes no Deleted vector; the sentence above
/// looks stale -- confirm and drop.
void SelectionDAG::ReplaceAllUsesOfValueWith(SDValue From, SDValue To){
// Handle the really simple, really trivial case efficiently.
if (From == To) return;
// Handle the simple, trivial, case efficiently.
// With a single result there are no "other values" to preserve, so the
// whole-value overload can be used.
if (From.getNode()->getNumValues() == 1) {
ReplaceAllUsesWith(From, To);
return;
}
// Preserve Debug Info.
transferDbgValues(From, To);
// Iterate over just the existing users of From. See the comments in
// the ReplaceAllUsesWith above.
SDNode::use_iterator UI = From.getNode()->use_begin(),
UE = From.getNode()->use_end();
// NOTE(review): RAUWUpdateListener is defined elsewhere; it receives UI/UE by
// reference, presumably to keep them valid across node deletions -- confirm.
RAUWUpdateListener Listener(*this, UI, UE);
while (UI != UE) {
SDNode *User = *UI;
// Tracks whether User has been pulled out of the CSE maps, i.e. whether at
// least one of its uses was actually rewritten.
bool UserRemovedFromCSEMaps = false;
// A user can appear in a use list multiple times, and when this
// happens the uses are usually next to each other in the list.
// To help reduce the number of CSE recomputations, process all
// the uses of this user that we can find this way.
do {
SDUse &Use = UI.getUse();
// Skip uses of different values from the same node.
// (The 'continue' jumps to the do-while condition below, not past it.)
if (Use.getResNo() != From.getResNo()) {
++UI;
continue;
}
// If this node hasn't been modified yet, it's still in the CSE maps,
// so remove its old self from the CSE maps.
if (!UserRemovedFromCSEMaps) {
RemoveNodeFromCSEMaps(User);
UserRemovedFromCSEMaps = true;
}
// Step the iterator past this use before the use itself is rewritten.
++UI;
Use.set(To);
if (To->isDivergent() != From->isDivergent())
updateDivergence(User);
} while (UI != UE && *UI == User);
// We are iterating over all uses of the From node, so if a use
// doesn't use the specific value, no changes are made.
if (!UserRemovedFromCSEMaps)
continue;
// Now that we have modified User, add it back to the CSE maps. If it
// already exists there, recursively merge the results together.
AddModifiedNodeToCSEMaps(User);
}
// If we just RAUW'd the root, take note.
if (From == getRoot())
setRoot(To);
}
namespace {

/// UseMemo - One recorded use, gathered by
/// SelectionDAG::ReplaceAllUsesOfValuesWith before it starts rewriting.
struct UseMemo {
  SDNode *User;   ///< The node that owns the use.
  unsigned Index; ///< Index into the From/To arrays this use belongs to.
  SDUse *Use;     ///< The use itself, i.e. the slot that gets rewritten.
};

/// operator< - Order memos by the address of their User, so that sorting
/// places all memos of one user next to each other.
bool operator<(const UseMemo &LHS, const UseMemo &RHS) {
  const intptr_t LHSKey = reinterpret_cast<intptr_t>(LHS.User);
  const intptr_t RHSKey = reinterpret_cast<intptr_t>(RHS.User);
  return LHSKey < RHSKey;
}

} // end anonymous namespace
/// Recompute the IsDivergent bit of \p N and, if it changed, recursively do
/// the same for every user of \p N.
///
/// The bit is the target's isSDNodeSourceOfDivergence answer OR'ed with the
/// divergence of every operand whose value type is not MVT::Other. Nodes the
/// target reports as always uniform are left untouched.
void SelectionDAG::updateDivergence(SDNode *N) {
  if (TLI->isSDNodeAlwaysUniform(N))
    return;

  bool NewDivergent = TLI->isSDNodeSourceOfDivergence(N, FLI, DA);
  for (auto &Operand : N->ops()) {
    // Operands typed MVT::Other do not contribute to divergence.
    if (Operand.Val.getValueType() == MVT::Other)
      continue;
    NewDivergent |= Operand.getNode()->isDivergent();
  }

  // Nothing to propagate when the bit is already up to date.
  if (N->SDNodeBits.IsDivergent == NewDivergent)
    return;

  N->SDNodeBits.IsDivergent = NewDivergent;
  for (auto User : N->uses())
    updateDivergence(User);
}
/// Append the nodes of the DAG to \p Order so that every node appears after
/// all of its operands.
///
/// Each node starts with a counter equal to its operand count; nodes with no
/// operands are emitted first, and a user is emitted once its counter has
/// been decremented to zero by its already-emitted operands.
void SelectionDAG::CreateTopologicalOrder(std::vector<SDNode *> &Order) {
  DenseMap<SDNode *, unsigned> PendingOperands;
  Order.reserve(AllNodes.size());

  // Seed the order with operand-free nodes and record everyone's count.
  for (auto &Node : allnodes()) {
    const unsigned NumOps = Node.getNumOperands();
    PendingOperands[&Node] = NumOps;
    if (NumOps == 0)
      Order.push_back(&Node);
  }

  // Order doubles as the work list: it grows while it is being walked.
  for (size_t Idx = 0; Idx != Order.size(); ++Idx) {
    SDNode *Current = Order[Idx];
    for (auto User : Current->uses()) {
      unsigned &Remaining = PendingOperands[User];
      --Remaining;
      if (Remaining == 0)
        Order.push_back(User);
    }
  }
}
#ifndef NDEBUG
void SelectionDAG::VerifyDAGDiverence() {
std::vector<SDNode *> TopoOrder;
CreateTopologicalOrder(TopoOrder);
const TargetLowering &TLI = getTargetLoweringInfo();
DenseMap<const SDNode *, bool> DivergenceMap;
for (auto &N : allnodes()) {
DivergenceMap[&N] = false;
}
for (auto N : TopoOrder) {
bool IsDivergent = DivergenceMap[N];
bool IsSDNodeDivergent = TLI.isSDNodeSourceOfDivergence(N, FLI, DA);
for (auto &Op : N->ops()) {
if (Op.Val.getValueType() != MVT::Other)
IsSDNodeDivergent |= DivergenceMap[Op.getNode()];
}
if (!IsDivergent && IsSDNodeDivergent && !TLI.isSDNodeAlwaysUniform(N)) {
DivergenceMap[N] = true;
}
}
for (auto &N : allnodes()) {
(void)N;
assert(DivergenceMap[&N] == N.isDivergent() &&
"Divergence bit inconsistency detected\n");
}
}
#endif
/// ReplaceAllUsesOfValuesWith - Replace any uses of From with To, leaving
/// uses of other values produced by From.getNode() alone. The same value
/// may appear in both the From and To list. The Deleted vector is
/// handled the same way as for ReplaceAllUsesWith.
/// NOTE(review): this signature takes no Deleted vector; the sentence above
/// looks stale -- confirm and drop.
///
/// From and To are parallel arrays of Num entries: uses of From[i] are
/// rewritten to To[i].
void SelectionDAG::ReplaceAllUsesOfValuesWith(const SDValue *From,
const SDValue *To,
unsigned Num){
// Handle the simple, trivial case efficiently.
if (Num == 1)
return ReplaceAllUsesOfValueWith(*From, *To);
// Only the first From/To pair has its debug values transferred here.
transferDbgValues(*From, *To);
// Read up all the uses and make records of them. This helps
// processing new uses that are introduced during the
// replacement process.
SmallVector<UseMemo, 4> Uses;
for (unsigned i = 0; i != Num; ++i) {
unsigned FromResNo = From[i].getResNo();
SDNode *FromNode = From[i].getNode();
for (SDNode::use_iterator UI = FromNode->use_begin(),
E = FromNode->use_end(); UI != E; ++UI) {
SDUse &Use = UI.getUse();
// Record only uses of the specific result being replaced.
if (Use.getResNo() == FromResNo) {
UseMemo Memo = { *UI, i, &Use };
Uses.push_back(Memo);
}
}
}
// Sort the uses, so that all the uses from a given User are together.
llvm::sort(Uses);
for (unsigned UseIndex = 0, UseIndexEnd = Uses.size();
UseIndex != UseIndexEnd; ) {
// We know that this user uses some value of From. If it is the right
// value, update it.
SDNode *User = Uses[UseIndex].User;
// This node is about to morph, remove its old self from the CSE maps.
RemoveNodeFromCSEMaps(User);
// The Uses array is sorted, so all the uses for a given User
// are next to each other in the list.
// To help reduce the number of CSE recomputations, process all
// the uses of this user that we can find this way.
// NOTE(review): unlike the single-value variants, no updateDivergence call is
// made for User here -- confirm this is intended.
do {
unsigned i = Uses[UseIndex].Index;
SDUse &Use = *Uses[UseIndex].Use;
++UseIndex;
Use.set(To[i]);
} while (UseIndex != UseIndexEnd && Uses[UseIndex].User == User);
// Now that we have modified User, add it back to the CSE maps. If it
// already exists there, recursively merge the results together.
AddModifiedNodeToCSEMaps(User);
}
}
/// AssignTopologicalOrder - Assign a unique node id for each node in the DAG
/// based on their topological order, and reorder the AllNodes list in place
/// so that it follows that order. Returns the number of ids assigned, which
/// is asserted to equal the number of nodes in the DAG.
unsigned SelectionDAG::AssignTopologicalOrder() {
unsigned DAGSize = 0;
// SortedPos tracks the progress of the algorithm. Nodes before it are
// sorted, nodes after it are unsorted. When the algorithm completes
// it is at the end of the list.
allnodes_iterator SortedPos = allnodes_begin();
// Visit all the nodes. Move nodes with no operands to the front of
// the list immediately. Annotate nodes that do have operands with their
// operand count. Before we do this, the Node Id fields of the nodes
// may contain arbitrary values. After, the Node Id fields for nodes
// before SortedPos will contain the topological sort index, and the
// Node Id fields for nodes At SortedPos and after will contain the
// count of outstanding operands.
for (allnodes_iterator I = allnodes_begin(),E = allnodes_end(); I != E; ) {
// Advance I before N is possibly moved within the list.
SDNode *N = &*I++;
checkForCycles(N, this);
unsigned Degree = N->getNumOperands();
if (Degree == 0) {
// A node with no operands, add it to the result array immediately.
N->setNodeId(DAGSize++);
allnodes_iterator Q(N);
if (Q != SortedPos)
SortedPos = AllNodes.insert(SortedPos, AllNodes.remove(Q));
assert(SortedPos != AllNodes.end() && "Overran node list");
++SortedPos;
} else {
// Temporarily use the Node Id as scratch space for the degree count.
N->setNodeId(Degree);
}
}
// Visit all the nodes. As we iterate, move nodes into sorted order,
// such that by the time the end is reached all nodes will be sorted.
for (SDNode &Node : allnodes()) {
SDNode *N = &Node;
checkForCycles(N, this);
// N is in sorted position, so all its uses have one less operand
// that needs to be sorted.
for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
UI != UE; ++UI) {
SDNode *P = *UI;
unsigned Degree = P->getNodeId();
assert(Degree != 0 && "Invalid node degree");
--Degree;
if (Degree == 0) {
// All of P's operands are sorted, so P may be sorted now.
P->setNodeId(DAGSize++);
if (P->getIterator() != SortedPos)
SortedPos = AllNodes.insert(SortedPos, AllNodes.remove(P));
assert(SortedPos != AllNodes.end() && "Overran node list");
++SortedPos;
} else {
// Update P's outstanding operand count.
P->setNodeId(Degree);
}
}
// Reaching SortedPos means the walk caught up with the unsorted region: the
// current node was never placed. Dump diagnostics in debug builds and abort.
if (Node.getIterator() == SortedPos) {
#ifndef NDEBUG
allnodes_iterator I(N);
SDNode *S = &*++I;
dbgs() << "Overran sorted position:\n";
S->dumprFull(this); dbgs() << "\n";
dbgs() << "Checking if this is due to cycles\n";
checkForCycles(this, true);
#endif
llvm_unreachable(nullptr);
}
}
// Post-conditions of a completed sort.
assert(SortedPos == AllNodes.end() &&
"Topological sort incomplete!");
assert(AllNodes.front().getOpcode() == ISD::EntryToken &&
"First node in topological sort is not the entry token!");
assert(AllNodes.front().getNodeId() == 0 &&
"First node in topological sort has non-zero id!");
assert(AllNodes.front().getNumOperands() == 0 &&
"First node in topological sort has operands!");
assert(AllNodes.back().getNodeId() == (int)DAGSize-1 &&
"Last node in topologic sort has unexpected id!");
assert(AllNodes.back().use_empty() &&
"Last node in topologic sort has users!");
assert(DAGSize == allnodes_size() && "Node count mismatch!");
return DAGSize;
}
/// AddDbgValue - Register the dbg_value \p DB with the DAG's debug info.
/// A non-null \p SD means the value is produced by that node; the node is
/// then flagged as having a debug value attached.
void SelectionDAG::AddDbgValue(SDDbgValue *DB, SDNode *SD, bool isParameter) {
  if (SD != nullptr) {
    // A node that already has recorded debug values must already carry the
    // flag.
    assert(DbgInfo->getSDDbgValues(SD).empty() || SD->getHasDebugValue());
    SD->setHasDebugValue(true);
  }

  DbgInfo->add(DB, SD, isParameter);
}
/// AddDbgLabel - Register the debug label \p DB with the DAG's debug info.
void SelectionDAG::AddDbgLabel(SDDbgLabel *DB) {
  DbgInfo->add(DB);
}
/// Give \p NewMemOp the same position as \p OldLoad in terms of memory
/// dependency.
///
/// If the old load's output chain (result 1) has no uses, the new memory
/// operation's chain (result 1) is returned unchanged. Otherwise a
/// TokenFactor of both chains is created, all uses of the old chain are
/// redirected to it, and the TokenFactor is returned.
SDValue SelectionDAG::makeEquivalentMemoryOrdering(LoadSDNode *OldLoad,
                                                   SDValue NewMemOp) {
  assert(isa<MemSDNode>(NewMemOp.getNode()) && "Expected a memop node");

  SDValue LoadChainOut(OldLoad, 1);
  SDValue MemOpChainOut(NewMemOp.getNode(), 1);

  // Nobody depends on the old chain, so there is nothing to re-route.
  if (!OldLoad->hasAnyUseOfValue(1))
    return MemOpChainOut;

  SDValue Merged = getNode(ISD::TokenFactor, SDLoc(OldLoad), MVT::Other,
                           LoadChainOut, MemOpChainOut);
  ReplaceAllUsesOfValueWith(LoadChainOut, Merged);
  // Re-establish the TokenFactor's own operands after the replacement above
  // (presumably because that replacement also rewrote the TokenFactor's use
  // of the old chain).
  UpdateNodeOperands(Merged.getNode(), LoadChainOut, MemOpChainOut);
  return Merged;
}
/// Look up the function named by the ExternalSymbol node \p Op in the
/// current module and return a global-address node for it.
///
/// If \p OutFunction is non-null it receives the lookup result (which may be
/// null). When no such function exists, a fatal error naming the symbol is
/// reported.
SDValue SelectionDAG::getSymbolFunctionGlobalAddress(SDValue Op,
                                                     Function **OutFunction) {
  assert(isa<ExternalSymbolSDNode>(Op) && "Node should be an ExternalSymbol");

  auto *SymName = cast<ExternalSymbolSDNode>(Op)->getSymbol();
  auto *M = MF->getFunction().getParent();
  auto *F = M->getFunction(SymName);

  if (OutFunction != nullptr)
    *OutFunction = F;

  if (F == nullptr) {
    std::string ErrorStr;
    raw_string_ostream ErrorFormatter(ErrorStr);
    ErrorFormatter << "Undefined external symbol " << '"' << SymName << '"';
    ErrorFormatter.flush();
    report_fatal_error(ErrorStr);
  }

  auto PtrTy = TLI->getPointerTy(getDataLayout(), F->getAddressSpace());
  return getGlobalAddress(F, SDLoc(Op), PtrTy);
}
//===----------------------------------------------------------------------===//
// SDNode Class
//===----------------------------------------------------------------------===//
/// Return true if \p V is a ConstantSDNode whose isNullValue() holds.
bool llvm::isNullConstant(SDValue V) {
  if (auto *C = dyn_cast<ConstantSDNode>(V))
    return C->isNullValue();
  return false;
}
/// Return true if \p V is a ConstantFPSDNode that is zero and not negative.
bool llvm::isNullFPConstant(SDValue V) {
  auto *C = dyn_cast<ConstantFPSDNode>(V);
  if (C == nullptr)
    return false;
  return C->isZero() && !C->isNegative();
}
/// Return true if \p V is a ConstantSDNode whose isAllOnesValue() holds.
bool llvm::isAllOnesConstant(SDValue V) {
  if (auto *C = dyn_cast<ConstantSDNode>(V))
    return C->isAllOnesValue();
  return false;
}
/// Return true if \p V is a ConstantSDNode whose isOne() holds.
bool llvm::isOneConstant(SDValue V) {
  if (auto *C = dyn_cast<ConstantSDNode>(V))
    return C->isOne();
  return false;
}
/// Strip any chain of ISD::BITCAST nodes from \p V and return the first
/// value that is not a bitcast.
SDValue llvm::peekThroughBitcasts(SDValue V) {
  for (;;) {
    if (V.getOpcode() != ISD::BITCAST)
      return V;
    V = V.getOperand(0);
  }
}
/// Like peekThroughBitcasts, but only steps through a bitcast when the
/// bitcast's operand has exactly one use.
SDValue llvm::peekThroughOneUseBitcasts(SDValue V) {
  for (;;) {
    const bool CanStep =
        V.getOpcode() == ISD::BITCAST && V.getOperand(0).hasOneUse();
    if (!CanStep)
      return V;
    V = V.getOperand(0);
  }
}
/// Strip any chain of ISD::EXTRACT_SUBVECTOR nodes from \p V and return the
/// first value that is not a subvector extract.
SDValue llvm::peekThroughExtractSubvectors(SDValue V) {
  for (;;) {
    if (V.getOpcode() != ISD::EXTRACT_SUBVECTOR)
      return V;
    V = V.getOperand(0);
  }
}
/// Return true if \p V is an XOR whose second operand (looking through
/// bitcasts) is a constant or constant splat with at least as many trailing
/// one bits as the operand's scalar size.
bool llvm::isBitwiseNot(SDValue V, bool AllowUndefs) {
  if (V.getOpcode() != ISD::XOR)
    return false;

  SDValue Mask = peekThroughBitcasts(V.getOperand(1));
  unsigned NumBits = Mask.getScalarValueSizeInBits();
  ConstantSDNode *MaskC =
      isConstOrConstSplat(Mask, AllowUndefs, /*AllowTruncation*/ true);
  if (!MaskC)
    return false;
  return MaskC->getAPIntValue().countTrailingOnes() >= NumBits;
}
/// Return \p N as a ConstantSDNode if it is one, or the splatted constant if
/// \p N is a build vector with a constant splat; otherwise nullptr.
///
/// A splat with undef elements is only accepted when \p AllowUndefs is set,
/// and a splat constant whose type differs from the vector's scalar type is
/// only accepted when \p AllowTruncation is set.
ConstantSDNode *llvm::isConstOrConstSplat(SDValue N, bool AllowUndefs,
                                          bool AllowTruncation) {
  if (auto *Scalar = dyn_cast<ConstantSDNode>(N))
    return Scalar;

  auto *BV = dyn_cast<BuildVectorSDNode>(N);
  if (!BV)
    return nullptr;

  BitVector UndefElements;
  ConstantSDNode *Splat = BV->getConstantSplatNode(&UndefElements);
  if (!Splat)
    return nullptr;
  if (!UndefElements.none() && !AllowUndefs)
    return nullptr;

  // BuildVectors can truncate their operands. Ignore that case here unless
  // AllowTruncation is set.
  EVT CVT = Splat->getValueType(0);
  EVT NSVT = N.getValueType().getScalarType();
  assert(CVT.bitsGE(NSVT) && "Illegal build vector element extension");
  return (AllowTruncation || (CVT == NSVT)) ? Splat : nullptr;
}
/// Variant of isConstOrConstSplat that passes \p DemandedElts to the build
/// vector's splat query.
///
/// Returns \p N as a ConstantSDNode if it is one, or the splatted constant of
/// a build vector; otherwise nullptr. Undef elements require \p AllowUndefs,
/// and a splat constant whose type differs from the vector's scalar type
/// requires \p AllowTruncation.
ConstantSDNode *llvm::isConstOrConstSplat(SDValue N, const APInt &DemandedElts,
                                          bool AllowUndefs,
                                          bool AllowTruncation) {
  if (auto *Scalar = dyn_cast<ConstantSDNode>(N))
    return Scalar;

  auto *BV = dyn_cast<BuildVectorSDNode>(N);
  if (!BV)
    return nullptr;

  BitVector UndefElements;
  ConstantSDNode *Splat = BV->getConstantSplatNode(DemandedElts, &UndefElements);
  if (!Splat)
    return nullptr;
  if (!UndefElements.none() && !AllowUndefs)
    return nullptr;

  // BuildVectors can truncate their operands. Ignore that case here unless
  // AllowTruncation is set.
  EVT CVT = Splat->getValueType(0);
  EVT NSVT = N.getValueType().getScalarType();
  assert(CVT.bitsGE(NSVT) && "Illegal build vector element extension");
  return (AllowTruncation || (CVT == NSVT)) ? Splat : nullptr;
}
/// Return \p N as a ConstantFPSDNode if it is one, or the splatted FP constant
/// if \p N is a build vector with a constant FP splat; otherwise nullptr.
/// A splat with undef elements is only accepted when \p AllowUndefs is set.
ConstantFPSDNode *llvm::isConstOrConstSplatFP(SDValue N, bool AllowUndefs) {
  if (auto *Scalar = dyn_cast<ConstantFPSDNode>(N))
    return Scalar;

  auto *BV = dyn_cast<BuildVectorSDNode>(N);
  if (!BV)
    return nullptr;

  BitVector UndefElements;
  ConstantFPSDNode *Splat = BV->getConstantFPSplatNode(&UndefElements);
  if (!Splat)
    return nullptr;
  return (UndefElements.none() || AllowUndefs) ? Splat : nullptr;
}
/// Variant of isConstOrConstSplatFP that passes \p DemandedElts to the build
/// vector's splat query. Returns the FP constant or splat constant, or
/// nullptr; undef elements are only accepted when \p AllowUndefs is set.
ConstantFPSDNode *llvm::isConstOrConstSplatFP(SDValue N,
                                              const APInt &DemandedElts,
                                              bool AllowUndefs) {
  if (auto *Scalar = dyn_cast<ConstantFPSDNode>(N))
    return Scalar;

  auto *BV = dyn_cast<BuildVectorSDNode>(N);
  if (!BV)
    return nullptr;

  BitVector UndefElements;
  ConstantFPSDNode *Splat =
      BV->getConstantFPSplatNode(DemandedElts, &UndefElements);
  if (!Splat)
    return nullptr;
  return (UndefElements.none() || AllowUndefs) ? Splat : nullptr;
}
/// Return true if \p N is a constant or constant splat (per
/// isConstOrConstSplat) whose isNullValue() holds.
bool llvm::isNullOrNullSplat(SDValue N, bool AllowUndefs) {
  // TODO: may want to use peekThroughBitcast() here.
  if (ConstantSDNode *C = isConstOrConstSplat(N, AllowUndefs))
    return C->isNullValue();
  return false;
}
/// Return true if \p N is a constant or constant splat equal to one whose
/// constant is exactly as wide as \p N's scalar type.
bool llvm::isOneOrOneSplat(SDValue N) {
  // TODO: may want to use peekThroughBitcast() here.
  unsigned ScalarBits = N.getScalarValueSizeInBits();
  ConstantSDNode *C = isConstOrConstSplat(N);
  if (!C || !C->isOne())
    return false;
  return C->getValueSizeInBits(0) == ScalarBits;
}
/// Return true if \p N, looking through bitcasts, is a constant or constant
/// splat of all ones whose constant is exactly as wide as the scalar type.
bool llvm::isAllOnesOrAllOnesSplat(SDValue N) {
  N = peekThroughBitcasts(N);
  unsigned ScalarBits = N.getScalarValueSizeInBits();
  ConstantSDNode *C = isConstOrConstSplat(N);
  if (!C || !C->isAllOnesValue())
    return false;
  return C->getValueSizeInBits(0) == ScalarBits;
}
/// Drop the handle's operands on destruction.
HandleSDNode::~HandleSDNode() {
  DropOperands();
}
/// Construct a global-address node with opcode \p Opc and single result type
/// \p VT that refers to \p GA, with offset \p o and target flags \p TF.
GlobalAddressSDNode::GlobalAddressSDNode(unsigned Opc, unsigned Order,
                                         const DebugLoc &DL,
                                         const GlobalValue *GA, EVT VT,
                                         int64_t o, unsigned char TF)
    : SDNode(Opc, Order, DL, getSDVTList(VT)), Offset(o), TargetFlags(TF) {
  TheGlobal = GA;
}
/// Construct an ISD::ADDRSPACECAST node with result type \p VT, recording the
/// source and destination address spaces.
AddrSpaceCastSDNode::AddrSpaceCastSDNode(unsigned Order, const DebugLoc &dl,
                                         EVT VT, unsigned SrcAS,
                                         unsigned DestAS)
    : SDNode(ISD::ADDRSPACECAST, Order, dl, getSDVTList(VT)),
      SrcAddrSpace(SrcAS), DestAddrSpace(DestAS) {
}
/// Construct a memory node with memory type \p memvt and memory operand
/// \p mmo, caching the operand's volatile / non-temporal / dereferenceable /
/// invariant flags in the node's bitfield.
MemSDNode::MemSDNode(unsigned Opc, unsigned Order, const DebugLoc &dl,
                     SDVTList VTs, EVT memvt, MachineMemOperand *mmo)
    : SDNode(Opc, Order, dl, VTs), MemoryVT(memvt), MMO(mmo) {
  MemSDNodeBits.IsVolatile = MMO->isVolatile();
  MemSDNodeBits.IsNonTemporal = MMO->isNonTemporal();
  MemSDNodeBits.IsDereferenceable = MMO->isDereferenceable();
  MemSDNodeBits.IsInvariant = MMO->isInvariant();

  // The store size of the memory type only has to fit within the MMO's size
  // (rather than match it), because the MMO might indicate only a possible
  // address range instead of specifying the affected addresses precisely.
  assert(memvt.getStoreSize() <= MMO->getSize() && "Size mismatch!");
}
/// Profile - Gather unique data for the node into \p ID by delegating to
/// AddNodeIDNode.
void SDNode::Profile(FoldingSetNodeID &ID) const {
  AddNodeIDNode(ID, this);
}
namespace {
struct EVTArray {
std::vector<EVT> VTs;
EVTArray() {
VTs.reserve(MVT::LAST_VALUETYPE);
for (unsigned i = 0; i < MVT::LAST_VALUETYPE; ++i)
VTs.push_back(MVT((MVT::SimpleValueType)i));
}
};
} // end anonymous namespace
// Set of extended value types; SDNode::getValueTypeList inserts into it while
// holding VTMutex and returns the address of the stored element.
static ManagedStatic<std::set<EVT, EVT::compareRawBits>> EVTs;
// Table of simple value types, indexed by SimpleTy in
// SDNode::getValueTypeList.
static ManagedStatic<EVTArray> SimpleVTArray;
// Mutex taken by SDNode::getValueTypeList around insertions into EVTs.
static ManagedStatic<sys::SmartMutex<true>> VTMutex;
/// getValueTypeList - Return a pointer to a stored EVT equal to \p VT.
///
/// Simple types are looked up in the SimpleVTArray table. Extended types are
/// inserted into the EVTs set under VTMutex, and the address of the set
/// element is returned.
const EVT *SDNode::getValueTypeList(EVT VT) {
  if (!VT.isExtended()) {
    assert(VT.getSimpleVT() < MVT::LAST_VALUETYPE &&
           "Value type out of range!");
    return &SimpleVTArray->VTs[VT.getSimpleVT().SimpleTy];
  }

  sys::SmartScopedLock<true> Lock(*VTMutex);
  return &(*EVTs->insert(VT).first);
}
/// hasNUsesOfValue - Return true if result number \p Value of this node has
/// exactly \p NUses uses. Uses of the node's other results are ignored.
bool SDNode::hasNUsesOfValue(unsigned NUses, unsigned Value) const {
  assert(Value < getNumValues() && "Bad value!");

  // TODO: Only iterate over uses of a given value of the node
  unsigned Remaining = NUses;
  for (SDNode::use_iterator UI = use_begin(), E = use_end(); UI != E; ++UI) {
    if (UI.getUse().getResNo() != Value)
      continue;
    // One use more than requested: bail out early.
    if (Remaining == 0)
      return false;
    --Remaining;
  }

  // True only if every requested use was found.
  return Remaining == 0;
}
/// hasAnyUseOfValue - Return true if result number \p Value of this node has
/// at least one use. Uses of the node's other results are ignored.
bool SDNode::hasAnyUseOfValue(unsigned Value) const {
  assert(Value < getNumValues() && "Bad value!");

  SDNode::use_iterator UI = use_begin(), E = use_end();
  while (UI != E) {
    if (UI.getUse().getResNo() == Value)
      return true;
    ++UI;
  }
  return false;
}
/// isOnlyUserOf - Return true if \p N has at least one use and every use of
/// \p N is by this node.
bool SDNode::isOnlyUserOf(const SDNode *N) const {
  bool FoundSelf = false;
  for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
    SDNode *User = *I;
    if (User != this)
      return false;
    FoundSelf = true;
  }
  return FoundSelf;
}
/// Return true if \p N has at least one use and every user of \p N is
/// contained in \p Nodes.
bool SDNode::areOnlyUsersOf(ArrayRef<const SDNode *> Nodes, const SDNode *N) {
  bool FoundAny = false;
  for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
    SDNode *User = *I;
    const bool IsListed = llvm::any_of(
        Nodes, [&User](const SDNode *Node) { return User == Node; });
    if (!IsListed)
      return false;
    FoundAny = true;
  }
  return FoundAny;
}
/// isOperand - Return true if this node is an operand of N.
bool SDValue::isOperandOf(const SDNode *N) const {
return any_of(N->op_values(), [this](SDValue Op) { return *this == Op; });
}
bool SDNode::isOperandOf(const SDNode *N) const {
return any_of(N->op_values(),
[this](SDValue Op) { return this == Op.getNode(); });
}
/// reachesChainWithoutSideEffects - Return true if this operand (which must
/// be a chain) reaches \p Dest without crossing any side-effecting
/// instruction on any chain path. The search looks through token factors and
/// non-volatile loads only, and gives up after \p Depth steps, so it is not
/// exhaustive.
///
/// Only chains need to be examined when searching for side effects:
/// SelectionDAG requires that all side effects are represented by chains,
/// even if another operand would force a specific ordering. This constraint
/// is necessary to allow transformations like splitting loads.
bool SDValue::reachesChainWithoutSideEffects(SDValue Dest,
                                             unsigned Depth) const {
  if (*this == Dest)
    return true;

  // Keep the search shallow; it only needs to see through TokenFactors etc.
  if (Depth == 0)
    return false;

  // All inputs of a token factor happen in parallel.
  if (getOpcode() == ISD::TokenFactor) {
    // Shallow search: Dest is a direct operand of this TokenFactor. That is
    // enough when Dest has a single use, since then nothing else can impose
    // an ordering between Dest and this node. With more uses, some other use
    // of Dest might force a side effect in between, so fall through to the
    // deep search instead.
    if (is_contained((*this)->ops(), Dest) && Dest.hasOneUse())
      return true;

    // Deep search: every operand of the TokenFactor must reach Dest.
    return llvm::all_of((*this)->ops(), [=](SDValue Op) {
      return Op.reachesChainWithoutSideEffects(Dest, Depth - 1);
    });
  }

  // Non-volatile loads have no side effects; continue from their chain.
  LoadSDNode *Ld = dyn_cast<LoadSDNode>(*this);
  if (Ld && !Ld->isVolatile())
    return Ld->getChain().reachesChainWithoutSideEffects(Dest, Depth-1);

  return false;
}
bool SDNode::hasPredecessor(const SDNode *N) const {
SmallPtrSet<const SDNode *, 32> Visited;
SmallVector<const SDNode *, 16> Worklist;
Worklist.push_back(this);
return hasPredecessorHelper(N, Visited, Worklist);
}
/// Intersect this node's flags with \p Flags, in place.
void SDNode::intersectFlagsWith(const SDNodeFlags Flags) {
  this->Flags.intersectWith(Flags);
}
/// Match a log2-depth "pyramid" reduction that ends in \p Extract.
///
/// \p Extract must be an EXTRACT_VECTOR_ELT of element 0 whose vector operand
/// is one of \p CandidateBinOps. On success the matched opcode is stored in
/// \p BinOp and the vector feeding the first stage is returned; on failure an
/// empty SDValue is returned and \p BinOp is left untouched.
SDValue
SelectionDAG::matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp,
                                  ArrayRef<ISD::NodeType> CandidateBinOps) {
  // The pattern must end in an extract from index 0.
  if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      !isNullConstant(Extract->getOperand(1)))
    return SDValue();

  SDValue Op = Extract->getOperand(0);
  unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements());

  // Match against one of the candidate binary ops.
  if (llvm::none_of(CandidateBinOps, [Op](ISD::NodeType Candidate) {
        return Op.getOpcode() == unsigned(Candidate);
      }))
    return SDValue();

  // At each stage, we're looking for something that looks like:
  // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
  //                    <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
  //                               i32 undef, i32 undef, i32 undef, i32 undef>
  // %a = binop <8 x i32> %op, %s
  // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
  // we expect something like:
  // <4,5,6,7,u,u,u,u>
  // <2,3,u,u,u,u,u,u>
  // <1,u,u,u,u,u,u,u>
  unsigned CandidateBinOp = Op.getOpcode();
  for (unsigned i = 0; i < Stages; ++i) {
    if (Op.getOpcode() != CandidateBinOp)
      return SDValue();

    SDValue LHS = Op.getOperand(0);
    SDValue RHS = Op.getOperand(1);

    // The shuffle may sit on either side of the binop; the other side is the
    // value carried into the next stage.
    ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(LHS);
    if (Shuffle) {
      Op = RHS;
    } else {
      Shuffle = dyn_cast<ShuffleVectorSDNode>(RHS);
      Op = LHS;
    }

    // The first operand of the shuffle should be the same as the other operand
    // of the binop.
    if (!Shuffle || Shuffle->getOperand(0) != Op)
      return SDValue();

    // Verify the shuffle has the expected (at this stage of the pyramid) mask.
    for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
      if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
        return SDValue();
  }

  BinOp = (ISD::NodeType)CandidateBinOp;
  return Op;
}
/// Unroll the single-result vector operation \p N into scalar operations and
/// return a BUILD_VECTOR of the scalar results.
///
/// \p ResNE is the number of elements in the result: 0 means "as many as
/// \p N has". If \p ResNE is smaller than \p N's element count only the
/// first \p ResNE elements are computed; if it is larger, the extra elements
/// are UNDEF.
SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) {
  assert(N->getNumValues() == 1 &&
         "Can't unroll a vector with multiple results!");

  EVT VT = N->getValueType(0);
  unsigned NumElts = VT.getVectorNumElements();
  EVT EltVT = VT.getVectorElementType();
  SDLoc dl(N);

  SmallVector<SDValue, 8> Scalars;
  SmallVector<SDValue, 4> Operands(N->getNumOperands());

  // If ResNE is 0, fully unroll the vector op.
  if (ResNE == 0)
    ResNE = NumElts;
  else if (NumElts > ResNE)
    NumElts = ResNE;

  unsigned Elt;
  for (Elt = 0; Elt != NumElts; ++Elt) {
    // Build the scalar operand list for this element.
    for (unsigned OpNo = 0, NumOps = N->getNumOperands(); OpNo != NumOps;
         ++OpNo) {
      SDValue Operand = N->getOperand(OpNo);
      EVT OperandVT = Operand.getValueType();
      if (!OperandVT.isVector()) {
        // A scalar operand; just use it as is.
        Operands[OpNo] = Operand;
        continue;
      }
      // A vector operand; extract a single element.
      EVT OperandEltVT = OperandVT.getVectorElementType();
      Operands[OpNo] =
          getNode(ISD::EXTRACT_VECTOR_ELT, dl, OperandEltVT, Operand,
                  getConstant(Elt, dl, TLI->getVectorIdxTy(getDataLayout())));
    }

    switch (N->getOpcode()) {
    case ISD::VSELECT:
      Scalars.push_back(getNode(ISD::SELECT, dl, EltVT, Operands));
      break;
    case ISD::SHL:
    case ISD::SRA:
    case ISD::SRL:
    case ISD::ROTL:
    case ISD::ROTR:
      // The shift amount goes through getShiftAmountOperand first.
      Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, Operands[0],
                                getShiftAmountOperand(Operands[0].getValueType(),
                                                      Operands[1])));
      break;
    case ISD::SIGN_EXTEND_INREG:
    case ISD::FP_ROUND_INREG: {
      // The type operand is replaced by its element type.
      EVT ExtVT = cast<VTSDNode>(Operands[1])->getVT().getVectorElementType();
      Scalars.push_back(getNode(N->getOpcode(), dl, EltVT,
                                Operands[0],
                                getValueType(ExtVT)));
      break;
    }
    default:
      Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, Operands,
                                N->getFlags()));
      break;
    }
  }

  // Pad up to the requested width.
  for (; Elt < ResNE; ++Elt)
    Scalars.push_back(getUNDEF(EltVT));

  EVT VecVT = EVT::getVectorVT(*getContext(), EltVT, ResNE);
  return getBuildVector(VecVT, dl, Scalars);
}
/// Unroll the vector overflow operation \p N ([US]ADDO, [US]SUBO, [US]MULO)
/// into scalar overflow operations.
///
/// Returns a pair of BUILD_VECTORs: the arithmetic results and the overflow
/// flags. \p ResNE is the number of result elements: 0 means "as many as
/// \p N has"; a smaller value truncates, a larger value pads with UNDEF.
std::pair<SDValue, SDValue> SelectionDAG::UnrollVectorOverflowOp(
    SDNode *N, unsigned ResNE) {
  unsigned Opcode = N->getOpcode();
  assert((Opcode == ISD::UADDO || Opcode == ISD::SADDO ||
          Opcode == ISD::USUBO || Opcode == ISD::SSUBO ||
          Opcode == ISD::UMULO || Opcode == ISD::SMULO) &&
         "Expected an overflow opcode");

  EVT ResVT = N->getValueType(0);
  EVT OvVT = N->getValueType(1);
  EVT ResEltVT = ResVT.getVectorElementType();
  EVT OvEltVT = OvVT.getVectorElementType();
  SDLoc dl(N);

  // If ResNE is 0, fully unroll the vector op.
  unsigned NumElts = ResVT.getVectorNumElements();
  if (ResNE == 0)
    ResNE = NumElts;
  else if (NumElts > ResNE)
    NumElts = ResNE;

  SmallVector<SDValue, 8> LHSScalars;
  SmallVector<SDValue, 8> RHSScalars;
  ExtractVectorElements(N->getOperand(0), LHSScalars, 0, NumElts);
  ExtractVectorElements(N->getOperand(1), RHSScalars, 0, NumElts);

  // Each scalar op produces the result plus a flag of the setcc result type.
  EVT SVT = TLI->getSetCCResultType(getDataLayout(), *getContext(), ResEltVT);
  SDVTList VTs = getVTList(ResEltVT, SVT);

  SmallVector<SDValue, 8> ResScalars;
  SmallVector<SDValue, 8> OvScalars;
  for (unsigned Elt = 0; Elt < NumElts; ++Elt) {
    SDValue Res = getNode(Opcode, dl, VTs, LHSScalars[Elt], RHSScalars[Elt]);
    // Convert the scalar flag into the overflow vector's element type.
    SDValue Ov =
        getSelect(dl, OvEltVT, Res.getValue(1),
                  getBoolConstant(true, dl, OvEltVT, ResVT),
                  getConstant(0, dl, OvEltVT));

    ResScalars.push_back(Res);
    OvScalars.push_back(Ov);
  }

  // Pad both vectors up to the requested width.
  ResScalars.append(ResNE - NumElts, getUNDEF(ResEltVT));
  OvScalars.append(ResNE - NumElts, getUNDEF(OvEltVT));

  EVT NewResVT = EVT::getVectorVT(*getContext(), ResEltVT, ResNE);
  EVT NewOvVT = EVT::getVectorVT(*getContext(), OvEltVT, ResNE);
  return std::make_pair(getBuildVector(NewResVT, dl, ResScalars),
                        getBuildVector(NewOvVT, dl, OvScalars));
}
/// Return true if \p LD and \p Base are both non-volatile, non-indexed loads
/// on the same chain, \p LD loads exactly \p Bytes bytes, and the constant
/// offset between the two addresses (as reported by
/// BaseIndexOffset::equalBaseIndex) equals \p Dist * \p Bytes.
///
/// \p Dist may be negative, in which case the expected offset is negative.
bool SelectionDAG::areNonVolatileConsecutiveLoads(LoadSDNode *LD,
                                                  LoadSDNode *Base,
                                                  unsigned Bytes,
                                                  int Dist) const {
  if (LD->isVolatile() || Base->isVolatile())
    return false;
  if (LD->isIndexed() || Base->isIndexed())
    return false;
  if (LD->getChain() != Base->getChain())
    return false;
  EVT VT = LD->getValueType(0);
  if (VT.getSizeInBits() / 8 != Bytes)
    return false;

  auto BaseLocDecomp = BaseIndexOffset::match(Base, *this);
  auto LocDecomp = BaseIndexOffset::match(LD, *this);

  int64_t Offset = 0;
  if (BaseLocDecomp.equalBaseIndex(LocDecomp, *this, Offset)) {
    // Multiply in int64_t. 'Dist * Bytes' would be evaluated as unsigned, so
    // a negative Dist would wrap to a large positive value and never compare
    // equal to a negative Offset.
    const int64_t ExpectedOffset =
        static_cast<int64_t>(Dist) * static_cast<int64_t>(Bytes);
    return ExpectedOffset == Offset;
  }
  return false;
}
/// InferPtrAlignment - Infer alignment of a load / store address. Return 0 if
/// it cannot be inferred.
///
/// Two address forms are handled: a global address plus constant offset
/// (using the known trailing zero bits of the global's address), and a frame
/// index optionally plus a constant offset (using the stack object's
/// alignment).
unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const {
  // If this is a GlobalAddress + cst, return the alignment.
  const GlobalValue *GV = nullptr;
  int64_t GVOffset = 0;
  if (TLI->isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) {
    unsigned IdxWidth = getDataLayout().getIndexTypeSizeInBits(GV->getType());
    KnownBits Known(IdxWidth);
    llvm::computeKnownBits(GV, Known, getDataLayout());
    unsigned AlignBits = Known.countMinTrailingZeros();
    // Shift an unsigned 1: with AlignBits >= 31 a signed '1 << 31' would
    // shift into the sign bit of int.
    unsigned Align = AlignBits ? 1U << std::min(31U, AlignBits) : 0;
    if (Align)
      return MinAlign(Align, GVOffset);
  }

  // If this is a direct reference to a stack slot, use information about the
  // stack slot's alignment.
  int FrameIdx = INT_MIN; // INT_MIN means "no frame index found".
  int64_t FrameOffset = 0;
  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Ptr)) {
    FrameIdx = FI->getIndex();
  } else if (isBaseWithConstantOffset(Ptr) &&
             isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
    // Handle FI+Cst
    FrameIdx = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
    FrameOffset = Ptr.getConstantOperandVal(1);
  }

  if (FrameIdx != INT_MIN) {
    const MachineFrameInfo &MFI = getMachineFunction().getFrameInfo();
    unsigned FIInfoAlign = MinAlign(MFI.getObjectAlignment(FrameIdx),
                                    FrameOffset);
    return FIInfoAlign;
  }

  return 0;
}
/// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type
/// which is split (or expanded) into two not necessarily identical pieces.
std::pair<EVT, EVT> SelectionDAG::GetSplitDestVTs(const EVT &VT) const {
// Currently all types are split in half.
EVT LoVT, HiVT;
if (!VT.isVector())
LoVT = HiVT = TLI->getTypeToTransformTo(*getContext(), VT);
else
LoVT = HiVT = VT.getHalfNumVectorElementsVT(*getContext());
return std::make_pair(LoVT, HiVT);
}
/// SplitVector - Split the vector with EXTRACT_SUBVECTOR and return the
/// low/high part.
std::pair<SDValue, SDValue>
SelectionDAG::SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT,
const EVT &HiVT) {
assert(LoVT.getVectorNumElements() + HiVT.getVectorNumElements() <=
N.getValueType().getVectorNumElements() &&
"More vector elements requested than available!");
SDValue Lo, Hi;
Lo = getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
getConstant(0, DL, TLI->getVectorIdxTy(getDataLayout())));
Hi = getNode(ISD::EXTRACT_SUBVECTOR, DL, HiVT, N,
getConstant(LoVT.getVectorNumElements(), DL,
TLI->getVectorIdxTy(getDataLayout())));
return std::make_pair(Lo, Hi);
}
/// Widen the vector \p N to the element count given by NextPowerOf2 by
/// inserting it at index 0 of an undef vector with INSERT_SUBVECTOR.
SDValue SelectionDAG::WidenVector(const SDValue &N, const SDLoc &DL) {
  EVT NarrowVT = N.getValueType();
  EVT EltVT = NarrowVT.getVectorElementType();
  unsigned WideNumElts = NextPowerOf2(NarrowVT.getVectorNumElements());
  EVT WideVT = EVT::getVectorVT(*getContext(), EltVT, WideNumElts);
  return getNode(ISD::INSERT_SUBVECTOR, DL, WideVT, getUNDEF(WideVT), N,
                 getConstant(0, DL, TLI->getVectorIdxTy(getDataLayout())));
}
/// Append to \p Args one EXTRACT_VECTOR_ELT node for each of \p Count
/// consecutive elements of \p Op, beginning at element \p Start. A \p Count
/// of zero is replaced by the number of elements of Op's vector type.
void SelectionDAG::ExtractVectorElements(SDValue Op,
                                         SmallVectorImpl<SDValue> &Args,
                                         unsigned Start, unsigned Count) {
  EVT VecVT = Op.getValueType();
  const unsigned NumToExtract = Count ? Count : VecVT.getVectorNumElements();

  EVT ScalarVT = VecVT.getVectorElementType();
  EVT IndexVT = TLI->getVectorIdxTy(getDataLayout());
  SDLoc Loc(Op);
  for (unsigned Elt = Start, End = Start + NumToExtract; Elt != End; ++Elt) {
    SDValue EltIdx = getConstant(Elt, Loc, IndexVT);
    Args.push_back(getNode(ISD::EXTRACT_VECTOR_ELT, Loc, ScalarVT, Op, EltIdx));
  }
}
// getAddressSpace - Return the address space this GlobalAddress belongs to.
unsigned GlobalAddressSDNode::getAddressSpace() const {
return getGlobal()->getType()->getAddressSpace();
}
/// Return the IR type of the constant-pool entry, taken from whichever of
/// the machine-specific value or the plain constant this node holds.
Type *ConstantPoolSDNode::getType() const {
  return isMachineConstantPoolEntry() ? Val.MachineCPVal->getType()
                                      : Val.ConstVal->getType();
}
/// Check whether this build_vector consists only of constants / undefs and,
/// if so, find the smallest element size (not below \p MinSplatBits) that
/// splats it.
///
/// On success returns true with \p SplatValue holding the splatted bits,
/// \p SplatUndef the bits that came from undef elements, \p SplatBitSize the
/// width found and \p HasAnyUndefs whether any undef bits were seen. Returns
/// false if \p MinSplatBits exceeds the vector width or an operand is neither
/// undef, ConstantSDNode nor ConstantFPSDNode.
bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue, APInt &SplatUndef,
                                        unsigned &SplatBitSize,
                                        bool &HasAnyUndefs,
                                        unsigned MinSplatBits,
                                        bool IsBigEndian) const {
  EVT VT = getValueType(0);
  assert(VT.isVector() && "Expected a vector type");
  unsigned Width = VT.getSizeInBits();
  if (MinSplatBits > Width)
    return false;

  // FIXME: The widths are based on this node's type, but build vectors can
  // truncate their operands.
  SplatValue = APInt(Width, 0);
  SplatUndef = APInt(Width, 0);

  // Collect the bits of every operand. Undef operands set their bit range in
  // SplatUndef (and stay clear in SplatValue); any non-constant operand makes
  // us give up.
  unsigned int NumElts = getNumOperands();
  assert(NumElts > 0 && "isConstantSplat has 0-size build vector");
  unsigned EltBits = VT.getScalarSizeInBits();

  for (unsigned Slot = 0; Slot < NumElts; ++Slot) {
    // For big-endian the operands are visited in reverse order.
    unsigned OpIdx = IsBigEndian ? NumElts - 1 - Slot : Slot;
    SDValue Elt = getOperand(OpIdx);
    unsigned LoBit = Slot * EltBits;

    if (Elt.isUndef()) {
      SplatUndef.setBits(LoBit, LoBit + EltBits);
      continue;
    }
    if (auto *IntElt = dyn_cast<ConstantSDNode>(Elt)) {
      SplatValue.insertBits(IntElt->getAPIntValue().zextOrTrunc(EltBits),
                            LoBit);
      continue;
    }
    if (auto *FPElt = dyn_cast<ConstantFPSDNode>(Elt)) {
      SplatValue.insertBits(FPElt->getValueAPF().bitcastToAPInt(), LoBit);
      continue;
    }
    return false;
  }

  // Everything is constant or undef. Now halve the width for as long as both
  // halves agree.
  HasAnyUndefs = (SplatUndef != 0);

  // FIXME: This does not work for vectors with elements less than 8 bits.
  while (Width > 8) {
    unsigned Half = Width / 2;
    APInt UpperValue = SplatValue.lshr(Half).trunc(Half);
    APInt LowerValue = SplatValue.trunc(Half);
    APInt UpperUndef = SplatUndef.lshr(Half).trunc(Half);
    APInt LowerUndef = SplatUndef.trunc(Half);

    // Stop when the halves differ (undef bits are ignored) or when halving
    // would go below the requested minimum.
    bool HalvesAgree =
        (UpperValue & ~LowerUndef) == (LowerValue & ~UpperUndef);
    if (!HalvesAgree || MinSplatBits > Half)
      break;

    SplatValue = UpperValue | LowerValue;
    SplatUndef = UpperUndef & LowerUndef;
    Width = Half;
  }

  SplatBitSize = Width;
  return true;
}
/// Return the single value shared by all demanded, non-undef operands, or an
/// empty SDValue if they differ or nothing is demanded. If every demanded
/// operand is undef, the first demanded (undef) operand is returned. When
/// \p UndefElements is non-null it is resized to the operand count and the
/// demanded undef operands that were visited are marked in it.
SDValue BuildVectorSDNode::getSplatValue(const APInt &DemandedElts,
                                         BitVector *UndefElements) const {
  if (UndefElements) {
    UndefElements->clear();
    UndefElements->resize(getNumOperands());
  }
  assert(getNumOperands() == DemandedElts.getBitWidth() &&
         "Unexpected vector size");
  if (!DemandedElts)
    return SDValue();

  SDValue Common;
  for (unsigned Idx = 0, NumOps = getNumOperands(); Idx != NumOps; ++Idx) {
    if (!DemandedElts[Idx])
      continue;

    SDValue Elt = getOperand(Idx);
    if (Elt.isUndef()) {
      if (UndefElements)
        (*UndefElements)[Idx] = true;
      continue;
    }
    if (!Common) {
      // First defined element seen: it becomes the splat candidate.
      Common = Elt;
      continue;
    }
    if (Common != Elt)
      return SDValue();
  }

  if (Common)
    return Common;

  // No defined element among the demanded ones: they are all undef.
  unsigned FirstDemandedIdx = DemandedElts.countTrailingZeros();
  assert(getOperand(FirstDemandedIdx).isUndef() &&
         "Can only have a splat without a constant for all undefs.");
  return getOperand(FirstDemandedIdx);
}
/// Convenience overload: compute the splat value with every operand
/// demanded.
SDValue BuildVectorSDNode::getSplatValue(BitVector *UndefElements) const {
  return getSplatValue(APInt::getAllOnesValue(getNumOperands()),
                       UndefElements);
}
/// Return the splat value over the demanded elements as a ConstantSDNode, or
/// null if there is no splat value or it is not a ConstantSDNode.
ConstantSDNode *
BuildVectorSDNode::getConstantSplatNode(const APInt &DemandedElts,
                                        BitVector *UndefElements) const {
  SDValue Splat = getSplatValue(DemandedElts, UndefElements);
  return dyn_cast_or_null<ConstantSDNode>(Splat);
}
/// Return the splat value over all elements as a ConstantSDNode, or null if
/// there is no splat value or it is not a ConstantSDNode.
ConstantSDNode *
BuildVectorSDNode::getConstantSplatNode(BitVector *UndefElements) const {
  SDValue Splat = getSplatValue(UndefElements);
  return dyn_cast_or_null<ConstantSDNode>(Splat);
}
/// Return the splat value over the demanded elements as a ConstantFPSDNode,
/// or null if there is no splat value or it is not a ConstantFPSDNode.
ConstantFPSDNode *
BuildVectorSDNode::getConstantFPSplatNode(const APInt &DemandedElts,
                                          BitVector *UndefElements) const {
  SDValue Splat = getSplatValue(DemandedElts, UndefElements);
  return dyn_cast_or_null<ConstantFPSDNode>(Splat);
}
/// Return the splat value over all elements as a ConstantFPSDNode, or null
/// if there is no splat value or it is not a ConstantFPSDNode.
ConstantFPSDNode *
BuildVectorSDNode::getConstantFPSplatNode(BitVector *UndefElements) const {
  SDValue Splat = getSplatValue(UndefElements);
  return dyn_cast_or_null<ConstantFPSDNode>(Splat);
}
/// If this build_vector splats a floating-point constant that converts
/// exactly to an integer of \p BitWidth bits, return exactLogBase2() of that
/// integer. Returns -1 when there is no FP constant splat or the conversion
/// is not exact.
int32_t
BuildVectorSDNode::getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements,
                                                   uint32_t BitWidth) const {
  ConstantFPSDNode *SplatFP =
      dyn_cast_or_null<ConstantFPSDNode>(getSplatValue(UndefElements));
  if (!SplatFP)
    return -1;

  bool IsExact;
  APSInt AsInt(BitWidth);
  const APFloat &FPVal = SplatFP->getValueAPF();
  const auto Status =
      FPVal.convertToInteger(AsInt, APFloat::rmTowardZero, &IsExact);
  if (Status != APFloat::opOK || !IsExact)
    return -1;

  return AsInt.exactLogBase2();
}
/// Return true if every operand is an UNDEF, Constant or ConstantFP node.
bool BuildVectorSDNode::isConstant() const {
  for (const SDValue &Elt : op_values()) {
    switch (Elt.getOpcode()) {
    case ISD::UNDEF:
    case ISD::Constant:
    case ISD::ConstantFP:
      continue;
    default:
      return false;
    }
  }
  return true;
}
/// Return true if \p Mask (with as many entries as \p VT has elements) is a
/// splat: every non-negative entry names the same index. Negative entries
/// are treated as undef and ignored.
bool ShuffleVectorSDNode::isSplatMask(const int *Mask, EVT VT) {
  const unsigned NumElts = VT.getVectorNumElements();

  // Skip the leading undef entries.
  unsigned First = 0;
  while (First != NumElts && Mask[First] < 0)
    ++First;

  // An all-undef mask counts as a splat (although it should eventually get
  // simplified away completely).
  if (First == NumElts)
    return true;

  // Every later defined entry has to repeat the first defined one.
  const int SplatIdx = Mask[First];
  for (unsigned Pos = First; Pos != NumElts; ++Pos) {
    const int Entry = Mask[Pos];
    if (Entry >= 0 && Entry != SplatIdx)
      return false;
  }
  return true;
}
// Return the node behind N if it is a constant integer, a build_vector of
// constant integers, or a GlobalAddress for which the target allows constant
// offset folding; otherwise return null.
SDNode *SelectionDAG::isConstantIntBuildVectorOrConstantInt(SDValue N) {
  SDNode *Node = N.getNode();
  if (isa<ConstantSDNode>(N) || ISD::isBuildVectorOfConstantSDNodes(Node))
    return Node;

  // A GlobalAddress that supports constant offset folding is treated like a
  // constant integer.
  GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N);
  if (GA && GA->getOpcode() == ISD::GlobalAddress &&
      TLI->isOffsetFoldingLegal(GA))
    return GA;

  return nullptr;
}
// Return the node behind N if it is a floating-point constant or a
// build_vector of floating-point constants; otherwise return null.
SDNode *SelectionDAG::isConstantFPBuildVectorOrConstantFP(SDValue N) {
  SDNode *Node = N.getNode();
  const bool IsFPConstant =
      isa<ConstantFPSDNode>(N) || ISD::isBuildVectorOfConstantFPSDNodes(Node);
  return IsFPConstant ? Node : nullptr;
}
/// Allocate the operand list of \p Node, fill it from \p Vals, and compute
/// the node's divergence bit from its non-chain operands and the target
/// hooks.
void SelectionDAG::createOperands(SDNode *Node, ArrayRef<SDValue> Vals) {
  assert(!Node->OperandList && "Node already has operands");
  assert(SDNode::getMaxNumOperands() >= Vals.size() &&
         "too many operands to fit into SDNode");
  SDUse *OperandArray = OperandRecycler.allocate(
      ArrayRecycler<SDUse>::Capacity::get(Vals.size()), OperandAllocator);

  bool HasDivergentInput = false;
  for (unsigned Idx = 0; Idx != Vals.size(); ++Idx) {
    SDUse &Slot = OperandArray[Idx];
    Slot.setUser(Node);
    Slot.setInitial(Vals[Idx]);
    // Skip Chain. It does not carry divergence.
    if (Slot.Val.getValueType() == MVT::Other)
      continue;
    if (!HasDivergentInput && Slot.getNode()->isDivergent())
      HasDivergentInput = true;
  }

  Node->NumOperands = Vals.size();
  Node->OperandList = OperandArray;

  // The node is divergent if an operand is, or if the target reports it as
  // a source of divergence. The bit is left untouched for nodes the target
  // says are always uniform.
  const bool IsSource = TLI->isSDNodeSourceOfDivergence(Node, FLI, DA);
  if (!TLI->isSDNodeAlwaysUniform(Node))
    Node->SDNodeBits.IsDivergent = HasDivergentInput || IsSource;
  checkForCycles(Node);
}
/// Build a TokenFactor over \p Vals. While there are more values than one
/// node can hold, the trailing getMaxNumOperands() values are folded into
/// their own TokenFactor, which replaces them in \p Vals (so \p Vals is
/// modified in that case).
SDValue SelectionDAG::getTokenFactor(const SDLoc &DL,
                                     SmallVectorImpl<SDValue> &Vals) {
  const size_t MaxOps = SDNode::getMaxNumOperands();
  while (Vals.size() > MaxOps) {
    // Fold the last MaxOps chains into a single token.
    unsigned TailStart = Vals.size() - MaxOps;
    auto Tail = ArrayRef<SDValue>(Vals).slice(TailStart, MaxOps);
    SDValue TailTF = getNode(ISD::TokenFactor, DL, MVT::Other, Tail);
    Vals.erase(Vals.begin() + TailStart, Vals.end());
    Vals.emplace_back(TailTF);
  }
  return getNode(ISD::TokenFactor, DL, MVT::Other, Vals);
}
#ifndef NDEBUG
static void checkForCyclesHelper(const SDNode *N,
SmallPtrSetImpl<const SDNode*> &Visited,
SmallPtrSetImpl<const SDNode*> &Checked,
const llvm::SelectionDAG *DAG) {
// If this node has already been checked, don't check it again.
if (Checked.count(N))
return;
// If a node has already been visited on this depth-first walk, reject it as
// a cycle.
if (!Visited.insert(N).second) {
errs() << "Detected cycle in SelectionDAG\n";
dbgs() << "Offending node:\n";
N->dumprFull(DAG); dbgs() << "\n";
abort();
}
for (const SDValue &Op : N->op_values())
checkForCyclesHelper(Op.getNode(), Visited, Checked, DAG);
Checked.insert(N);
Visited.erase(N);
}
#endif
/// Run the cycle check starting at \p N. This does nothing when NDEBUG is
/// defined; otherwise it runs when \p force is set or EXPENSIVE_CHECKS is
/// defined.
void llvm::checkForCycles(const llvm::SDNode *N,
                          const llvm::SelectionDAG *DAG,
                          bool force) {
#ifndef NDEBUG
  bool ShouldCheck = force;
#ifdef EXPENSIVE_CHECKS
  ShouldCheck = true;
#endif // EXPENSIVE_CHECKS
  if (!ShouldCheck)
    return;

  assert(N && "Checking nonexistent SDNode");
  SmallPtrSet<const SDNode*, 32> OnPath;
  SmallPtrSet<const SDNode*, 32> Done;
  checkForCyclesHelper(N, OnPath, Done, DAG);
#endif // !NDEBUG
}
/// Run the cycle check starting from the root node of \p DAG.
void llvm::checkForCycles(const llvm::SelectionDAG *DAG, bool force) {
  const llvm::SDNode *Root = DAG->getRoot().getNode();
  checkForCycles(Root, DAG, force);
}
Index: vendor/llvm/dist-release_90/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- vendor/llvm/dist-release_90/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp (revision 351302)
+++ vendor/llvm/dist-release_90/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp (revision 351303)
@@ -1,10458 +1,10479 @@
//===- SelectionDAGBuilder.cpp - Selection-DAG building -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This implements routines for translating from LLVM IR into SelectionDAG IR.
//
//===----------------------------------------------------------------------===//
#include "SelectionDAGBuilder.h"
#include "SDNodeDbgValue.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GCMetadata.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/SwiftErrorValueTracking.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Statepoint.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetIntrinsicInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iterator>
#include <limits>
#include <numeric>
#include <tuple>
#include <utility>
#include <vector>
using namespace llvm;
using namespace PatternMatch;
using namespace SwitchCG;
#define DEBUG_TYPE "isel"
/// LimitFloatPrecision - Generate low-precision inline sequences for
/// some float libcalls (6, 8 or 12 bits).
static unsigned LimitFloatPrecision;
// Hidden command-line option "limit-float-precision". Its value is stored
// directly in LimitFloatPrecision (via cl::location) and defaults to 0.
static cl::opt<unsigned, true>
LimitFPPrecision("limit-float-precision",
cl::desc("Generate low-precision inline sequences "
"for some float libcalls"),
cl::location(LimitFloatPrecision), cl::Hidden,
cl::init(0));
// Hidden command-line option "switch-peel-threshold" (default 66): the case
// probability threshold for peeling a case from a switch statement. Per the
// option description, a value greater than 100 voids the optimization.
static cl::opt<unsigned> SwitchPeelThreshold(
"switch-peel-threshold", cl::Hidden, cl::init(66),
cl::desc("Set the case probability threshold for peeling the case from a "
"switch statement. A value greater than 100 will void this "
"optimization"));
// Limit the width of DAG chains. This is important in general to prevent
// DAG-based analysis from blowing up. For example, alias analysis and
// load clustering may not complete in reasonable time. It is difficult to
// recognize and avoid this situation within each individual analysis, and
// future analyses are likely to have the same behavior. Limiting DAG width is
// the safe approach and will be especially important with global DAGs.
//
// MaxParallelChains default is arbitrarily high to avoid affecting
// optimization, but could be lowered to improve compile time. Any ld-ld-st-st
// sequence over this should have been converted to llvm.memcpy by the
// frontend. It is easy to induce this behavior with .ll code such as:
// %buffer = alloca [4096 x i8]
// %data = load [4096 x i8]* %argPtr
// store [4096 x i8] %data, [4096 x i8]* %buffer
static const unsigned MaxParallelChains = 64;
// Return the calling convention if the Value passed requires ABI mangling as
// it is a parameter to a function or a return value from a function which is
// not an intrinsic. Returns None for inline asm, for calls to intrinsics and
// for every other kind of value.
static Optional<CallingConv::ID> getABIRegCopyCC(const Value *V) {
  // A return uses the calling convention of the enclosing function.
  if (auto *Ret = dyn_cast<ReturnInst>(V))
    return Ret->getParent()->getParent()->getCallingConv();

  auto *Call = dyn_cast<CallInst>(V);
  if (!Call || Call->isInlineAsm())
    return None;

  // getCalledFunction() is null for an indirect call; such a call is never
  // an intrinsic call.
  const auto *Callee = Call->getCalledFunction();
  const bool IsIntrinsicCall =
      Callee && Callee->getIntrinsicID() != Intrinsic::not_intrinsic;
  if (IsIntrinsicCall)
    return None;

  return Call->getCallingConv();
}
// Forward declaration: getCopyFromParts hands vector value types to this
// helper, which is defined after it.
static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
const SDValue *Parts, unsigned NumParts,
MVT PartVT, EVT ValueVT, const Value *V,
Optional<CallingConv::ID> CC);
/// getCopyFromParts - Create a value that contains the specified legal parts
/// combined into the value they represent. If the parts combine to a type
/// larger than ValueVT then AssertOp can be used to specify whether the extra
/// bits are known to be zero (ISD::AssertZext) or sign extended from ValueVT
/// (ISD::AssertSext).
///
/// \param Parts    The NumParts parts to assemble, each of type PartVT.
/// \param ValueVT  The type of the value to produce.
/// \param V        IR value forwarded to the vector helper and to recursive
///                 calls; it is not otherwise inspected here.
/// \param CC       Forwarded to getCopyFromPartsVector and to some of the
///                 recursive calls.
/// \returns The assembled value of type ValueVT. Calls report_fatal_error if
/// no known conversion applies.
static SDValue getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL,
const SDValue *Parts, unsigned NumParts,
MVT PartVT, EVT ValueVT, const Value *V,
Optional<CallingConv::ID> CC = None,
Optional<ISD::NodeType> AssertOp = None) {
// Vector values have their own assembly routine.
if (ValueVT.isVector())
return getCopyFromPartsVector(DAG, DL, Parts, NumParts, PartVT, ValueVT, V,
CC);
assert(NumParts > 0 && "No parts to assemble!");
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue Val = Parts[0];
if (NumParts > 1) {
// Assemble the value from multiple parts.
if (ValueVT.isInteger()) {
unsigned PartBits = PartVT.getSizeInBits();
unsigned ValueBits = ValueVT.getSizeInBits();
// Assemble the power of 2 part.
// RoundParts is NumParts itself when that is a power of two, otherwise
// 1 << Log2_32(NumParts).
unsigned RoundParts =
(NumParts & (NumParts - 1)) ? 1 << Log2_32(NumParts) : NumParts;
unsigned RoundBits = PartBits * RoundParts;
EVT RoundVT = RoundBits == ValueBits ?
ValueVT : EVT::getIntegerVT(*DAG.getContext(), RoundBits);
SDValue Lo, Hi;
EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), RoundBits/2);
if (RoundParts > 2) {
// Recursively assemble each half. Note that these two calls do not pass
// CC or AssertOp, so both take their default of None.
Lo = getCopyFromParts(DAG, DL, Parts, RoundParts / 2,
PartVT, HalfVT, V);
Hi = getCopyFromParts(DAG, DL, Parts + RoundParts / 2,
RoundParts / 2, PartVT, HalfVT, V);
} else {
// Exactly two parts: each one is a half after a bitcast.
Lo = DAG.getNode(ISD::BITCAST, DL, HalfVT, Parts[0]);
Hi = DAG.getNode(ISD::BITCAST, DL, HalfVT, Parts[1]);
}
if (DAG.getDataLayout().isBigEndian())
std::swap(Lo, Hi);
Val = DAG.getNode(ISD::BUILD_PAIR, DL, RoundVT, Lo, Hi);
if (RoundParts < NumParts) {
// Assemble the trailing non-power-of-2 part.
unsigned OddParts = NumParts - RoundParts;
EVT OddVT = EVT::getIntegerVT(*DAG.getContext(), OddParts * PartBits);
Hi = getCopyFromParts(DAG, DL, Parts + RoundParts, OddParts, PartVT,
OddVT, V, CC);
// Combine the round and odd parts.
Lo = Val;
if (DAG.getDataLayout().isBigEndian())
std::swap(Lo, Hi);
EVT TotalVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits);
// Val = zext(Lo) | (anyext(Hi) << bitwidth(Lo)), all in TotalVT.
Hi = DAG.getNode(ISD::ANY_EXTEND, DL, TotalVT, Hi);
Hi =
DAG.getNode(ISD::SHL, DL, TotalVT, Hi,
DAG.getConstant(Lo.getValueSizeInBits(), DL,
TLI.getPointerTy(DAG.getDataLayout())));
Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, TotalVT, Lo);
Val = DAG.getNode(ISD::OR, DL, TotalVT, Lo, Hi);
}
} else if (PartVT.isFloatingPoint()) {
// FP split into multiple FP parts (for ppcf128)
assert(ValueVT == EVT(MVT::ppcf128) && PartVT == MVT::f64 &&
"Unexpected split");
SDValue Lo, Hi;
Lo = DAG.getNode(ISD::BITCAST, DL, EVT(MVT::f64), Parts[0]);
Hi = DAG.getNode(ISD::BITCAST, DL, EVT(MVT::f64), Parts[1]);
if (TLI.hasBigEndianPartOrdering(ValueVT, DAG.getDataLayout()))
std::swap(Lo, Hi);
Val = DAG.getNode(ISD::BUILD_PAIR, DL, ValueVT, Lo, Hi);
} else {
// FP split into integer parts (soft fp)
assert(ValueVT.isFloatingPoint() && PartVT.isInteger() &&
!PartVT.isVector() && "Unexpected split");
// Assemble an integer of the same width first; the conversion to ValueVT
// is done by the code below.
EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits());
Val = getCopyFromParts(DAG, DL, Parts, NumParts, PartVT, IntVT, V, CC);
}
}
// There is now one part, held in Val. Correct it to match ValueVT.
// PartEVT is the type of the register class that holds the value.
// ValueVT is the type of the inline asm operation.
EVT PartEVT = Val.getValueType();
if (PartEVT == ValueVT)
return Val;
if (PartEVT.isInteger() && ValueVT.isFloatingPoint() &&
ValueVT.bitsLT(PartEVT)) {
// For an FP value in an integer part, we need to truncate to the right
// width first.
PartEVT = EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits());
Val = DAG.getNode(ISD::TRUNCATE, DL, PartEVT, Val);
}
// Handle types that have the same size.
if (PartEVT.getSizeInBits() == ValueVT.getSizeInBits())
return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
// Handle types with different sizes.
if (PartEVT.isInteger() && ValueVT.isInteger()) {
if (ValueVT.bitsLT(PartEVT)) {
// For a truncate, see if we have any information to
// indicate whether the truncated bits will always be
// zero or sign-extension.
if (AssertOp.hasValue())
Val = DAG.getNode(*AssertOp, DL, PartEVT, Val,
DAG.getValueType(ValueVT));
return DAG.getNode(ISD::TRUNCATE, DL, ValueVT, Val);
}
return DAG.getNode(ISD::ANY_EXTEND, DL, ValueVT, Val);
}
if (PartEVT.isFloatingPoint() && ValueVT.isFloatingPoint()) {
// FP_ROUND's are always exact here.
if (ValueVT.bitsLT(Val.getValueType()))
return DAG.getNode(
ISD::FP_ROUND, DL, ValueVT, Val,
DAG.getTargetConstant(1, DL, TLI.getPointerTy(DAG.getDataLayout())));
return DAG.getNode(ISD::FP_EXTEND, DL, ValueVT, Val);
}
// Handle MMX to a narrower integer type by bitcasting MMX to integer and
// then truncating.
if (PartEVT == MVT::x86mmx && ValueVT.isInteger() &&
ValueVT.bitsLT(PartEVT)) {
Val = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Val);
return DAG.getNode(ISD::TRUNCATE, DL, ValueVT, Val);
}
// None of the known conversions applied.
report_fatal_error("Unknown mismatch in getCopyFromParts!");
}
/// Emit \p ErrMsg as an error on \p Ctx.
///
/// When \p V is an instruction the error is attached to it, and if that
/// instruction is a call to inline asm, a hint about a possibly invalid
/// vector constraint is appended to the message. When \p V is null or is not
/// an instruction, the error is emitted without an associated instruction.
static void diagnosePossiblyInvalidConstraint(LLVMContext &Ctx, const Value *V,
                                              const Twine &ErrMsg) {
  const Instruction *I = dyn_cast_or_null<Instruction>(V);
  // Test I rather than V: a non-null V that is not an Instruction also
  // leaves I null, and the dyn_cast<> / emitError(I, ...) calls below
  // require a non-null instruction.
  if (!I)
    return Ctx.emitError(ErrMsg);

  const char *AsmError = ", possible invalid constraint for vector type";
  if (const CallInst *CI = dyn_cast<CallInst>(I))
    if (isa<InlineAsm>(CI->getCalledValue()))
      return Ctx.emitError(I, ErrMsg + AsmError);

  return Ctx.emitError(I, ErrMsg);
}
/// getCopyFromPartsVector - Create a vector value that contains the specified
/// legal parts combined into the value they represent.
///
/// \param Parts    The NumParts parts to assemble, each of type PartVT.
/// \param ValueVT  The vector type of the value to produce.
/// \param V        IR value passed to recursive calls and used when
///                 diagnosing an unsupported conversion.
/// \param CallConv When set, the vector breakdown is computed with
///                 getVectorTypeBreakdownForCallingConv for that calling
///                 convention; otherwise getVectorTypeBreakdown is used.
/// \returns The assembled value of type ValueVT, or an UNDEF of that type
/// after an unsupported scalar-to-vector conversion has been diagnosed.
static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
const SDValue *Parts, unsigned NumParts,
MVT PartVT, EVT ValueVT, const Value *V,
Optional<CallingConv::ID> CallConv) {
assert(ValueVT.isVector() && "Not a vector value");
assert(NumParts > 0 && "No parts to assemble!");
const bool IsABIRegCopy = CallConv.hasValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue Val = Parts[0];
// Handle a multi-element vector.
if (NumParts > 1) {
EVT IntermediateVT;
MVT RegisterVT;
unsigned NumIntermediates;
unsigned NumRegs;
// Ask the target how ValueVT is broken down into registers.
if (IsABIRegCopy) {
NumRegs = TLI.getVectorTypeBreakdownForCallingConv(
*DAG.getContext(), CallConv.getValue(), ValueVT, IntermediateVT,
NumIntermediates, RegisterVT);
} else {
NumRegs =
TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT, IntermediateVT,
NumIntermediates, RegisterVT);
}
assert(NumRegs == NumParts && "Part count doesn't match vector breakdown!");
NumParts = NumRegs; // Silence a compiler warning.
assert(RegisterVT == PartVT && "Part type doesn't match vector breakdown!");
assert(RegisterVT.getSizeInBits() ==
Parts[0].getSimpleValueType().getSizeInBits() &&
"Part type sizes don't match!");
// Assemble the parts into intermediate operands.
SmallVector<SDValue, 8> Ops(NumIntermediates);
if (NumIntermediates == NumParts) {
// If the register was not expanded, truncate or copy the value,
// as appropriate.
for (unsigned i = 0; i != NumParts; ++i)
Ops[i] = getCopyFromParts(DAG, DL, &Parts[i], 1,
PartVT, IntermediateVT, V);
} else if (NumParts > 0) {
// If the intermediate type was expanded, build the intermediate
// operands from the parts.
assert(NumParts % NumIntermediates == 0 &&
"Must expand into a divisible number of parts!");
unsigned Factor = NumParts / NumIntermediates;
for (unsigned i = 0; i != NumIntermediates; ++i)
Ops[i] = getCopyFromParts(DAG, DL, &Parts[i * Factor], Factor,
PartVT, IntermediateVT, V);
}
// Build a vector with BUILD_VECTOR or CONCAT_VECTORS from the
// intermediate operands.
// NOTE(review): for a vector IntermediateVT the element count is scaled by
// NumParts, while Ops holds NumIntermediates operands; the two differ when
// the intermediate type was expanded - confirm this is intended.
EVT BuiltVectorTy =
EVT::getVectorVT(*DAG.getContext(), IntermediateVT.getScalarType(),
(IntermediateVT.isVector()
? IntermediateVT.getVectorNumElements() * NumParts
: NumIntermediates));
Val = DAG.getNode(IntermediateVT.isVector() ? ISD::CONCAT_VECTORS
: ISD::BUILD_VECTOR,
DL, BuiltVectorTy, Ops);
}
// There is now one part, held in Val. Correct it to match ValueVT.
EVT PartEVT = Val.getValueType();
if (PartEVT == ValueVT)
return Val;
if (PartEVT.isVector()) {
// If the element type of the source/dest vectors are the same, but the
// parts vector has more elements than the value vector, then we have a
// vector widening case (e.g. <2 x float> -> <4 x float>). Extract the
// elements we want.
if (PartEVT.getVectorElementType() == ValueVT.getVectorElementType()) {
assert(PartEVT.getVectorNumElements() > ValueVT.getVectorNumElements() &&
"Cannot narrow, it would be a lossy transformation");
return DAG.getNode(
ISD::EXTRACT_SUBVECTOR, DL, ValueVT, Val,
DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
}
// Vector/Vector bitcast.
if (ValueVT.getSizeInBits() == PartEVT.getSizeInBits())
return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
assert(PartEVT.getVectorNumElements() == ValueVT.getVectorNumElements() &&
"Cannot handle this kind of promotion");
// Promoted vector extract
return DAG.getAnyExtOrTrunc(Val, DL, ValueVT);
}
// From here on the part held in Val is not a vector.
// Trivial bitcast if the types are the same size and the destination
// vector type is legal.
if (PartEVT.getSizeInBits() == ValueVT.getSizeInBits() &&
TLI.isTypeLegal(ValueVT))
return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
if (ValueVT.getVectorNumElements() != 1) {
// Certain ABIs require that vectors are passed as integers. For vectors
// that are the same size, this is an obvious bitcast.
if (ValueVT.getSizeInBits() == PartEVT.getSizeInBits()) {
return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
} else if (ValueVT.getSizeInBits() < PartEVT.getSizeInBits()) {
// Bitcast Val back to the original type and extract the corresponding
// vector we want.
unsigned Elts = PartEVT.getSizeInBits() / ValueVT.getScalarSizeInBits();
EVT WiderVecType = EVT::getVectorVT(*DAG.getContext(),
ValueVT.getVectorElementType(), Elts);
Val = DAG.getBitcast(WiderVecType, Val);
return DAG.getNode(
ISD::EXTRACT_SUBVECTOR, DL, ValueVT, Val,
DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
}
// The part is narrower than the multi-element vector: report the problem
// and produce an undef value.
diagnosePossiblyInvalidConstraint(
*DAG.getContext(), V, "non-trivial scalar-to-vector conversion");
return DAG.getUNDEF(ValueVT);
}
// Handle cases such as i8 -> <1 x i1>
EVT ValueSVT = ValueVT.getVectorElementType();
if (ValueVT.getVectorNumElements() == 1 && ValueSVT != PartEVT)
Val = ValueVT.isFloatingPoint() ? DAG.getFPExtendOrRound(Val, DL, ValueSVT)
: DAG.getAnyExtOrTrunc(Val, DL, ValueSVT);
return DAG.getBuildVector(ValueVT, DL, Val);
}
// Forward declaration: getCopyToParts hands vector values to this helper.
static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &dl,
SDValue Val, SDValue *Parts, unsigned NumParts,
MVT PartVT, const Value *V,
Optional<CallingConv::ID> CallConv);
/// getCopyToParts - Create a series of nodes that contain the specified value
/// split into legal parts. If the parts contain more bits than Val, then, for
/// integers, ExtendKind can be used to specify how to generate the extra bits.
///
/// \param Val        The value to split.
/// \param Parts      Output array; receives NumParts values of type PartVT.
/// \param V          IR value used when diagnosing a failed conversion and
///                   passed on to recursive / vector calls.
/// \param CallConv   Forwarded to getCopyToPartsVector and to the recursive
///                   call that handles the odd tail.
/// \param ExtendKind Node type used to widen Val when the parts cover more
///                   bits than it has.
static void getCopyToParts(SelectionDAG &DAG, const SDLoc &DL, SDValue Val,
SDValue *Parts, unsigned NumParts, MVT PartVT,
const Value *V,
Optional<CallingConv::ID> CallConv = None,
ISD::NodeType ExtendKind = ISD::ANY_EXTEND) {
EVT ValueVT = Val.getValueType();
// Handle the vector case separately.
if (ValueVT.isVector())
return getCopyToPartsVector(DAG, DL, Val, Parts, NumParts, PartVT, V,
CallConv);
unsigned PartBits = PartVT.getSizeInBits();
// Remember the full part count; NumParts is reduced below when an odd tail
// is split off.
unsigned OrigNumParts = NumParts;
assert(DAG.getTargetLoweringInfo().isTypeLegal(PartVT) &&
"Copying to an illegal type!");
if (NumParts == 0)
return;
assert(!ValueVT.isVector() && "Vector case handled elsewhere");
EVT PartEVT = PartVT;
if (PartEVT == ValueVT) {
assert(NumParts == 1 && "No-op copy with multiple parts!");
Parts[0] = Val;
return;
}
// First make Val exactly NumParts * PartBits wide.
if (NumParts * PartBits > ValueVT.getSizeInBits()) {
// If the parts cover more bits than the value has, promote the value.
if (PartVT.isFloatingPoint() && ValueVT.isFloatingPoint()) {
assert(NumParts == 1 && "Do not know what to promote to!");
Val = DAG.getNode(ISD::FP_EXTEND, DL, PartVT, Val);
} else {
if (ValueVT.isFloatingPoint()) {
// FP values need to be bitcast, then extended if they are being put
// into a larger container.
ValueVT = EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits());
Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
}
assert((PartVT.isInteger() || PartVT == MVT::x86mmx) &&
ValueVT.isInteger() &&
"Unknown mismatch!");
ValueVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits);
Val = DAG.getNode(ExtendKind, DL, ValueVT, Val);
if (PartVT == MVT::x86mmx)
Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
}
} else if (PartBits == ValueVT.getSizeInBits()) {
// Different types of the same size.
assert(NumParts == 1 && PartEVT != ValueVT);
Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
} else if (NumParts * PartBits < ValueVT.getSizeInBits()) {
// If the parts cover less bits than value has, truncate the value.
assert((PartVT.isInteger() || PartVT == MVT::x86mmx) &&
ValueVT.isInteger() &&
"Unknown mismatch!");
ValueVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits);
Val = DAG.getNode(ISD::TRUNCATE, DL, ValueVT, Val);
if (PartVT == MVT::x86mmx)
Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
}
// The value may have changed - recompute ValueVT.
ValueVT = Val.getValueType();
assert(NumParts * PartBits == ValueVT.getSizeInBits() &&
"Failed to tile the value with PartVT!");
if (NumParts == 1) {
// A single part of the right width but a different type: report it, then
// bitcast to the part type.
if (PartEVT != ValueVT) {
diagnosePossiblyInvalidConstraint(*DAG.getContext(), V,
"scalar-to-vector conversion failed");
Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
}
Parts[0] = Val;
return;
}
// Expand the value into multiple parts.
if (NumParts & (NumParts - 1)) {
// The number of parts is not a power of 2. Split off and copy the tail.
assert(PartVT.isInteger() && ValueVT.isInteger() &&
"Do not know what to expand to!");
unsigned RoundParts = 1 << Log2_32(NumParts);
unsigned RoundBits = RoundParts * PartBits;
unsigned OddParts = NumParts - RoundParts;
// The tail is the value shifted right by the bits the round parts cover.
SDValue OddVal = DAG.getNode(ISD::SRL, DL, ValueVT, Val,
DAG.getShiftAmountConstant(RoundBits, ValueVT, DL, /*LegalTypes*/false));
getCopyToParts(DAG, DL, OddVal, Parts + RoundParts, OddParts, PartVT, V,
CallConv);
if (DAG.getDataLayout().isBigEndian())
// The odd parts were reversed by getCopyToParts - unreverse them.
std::reverse(Parts + RoundParts, Parts + NumParts);
NumParts = RoundParts;
ValueVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits);
Val = DAG.getNode(ISD::TRUNCATE, DL, ValueVT, Val);
}
// The number of parts is a power of 2. Repeatedly bisect the value using
// EXTRACT_ELEMENT.
Parts[0] = DAG.getNode(ISD::BITCAST, DL,
EVT::getIntegerVT(*DAG.getContext(),
ValueVT.getSizeInBits()),
Val);
// Each outer step halves the piece size: the piece at Parts[i] is replaced
// by its element 0, and its element 1 goes to Parts[i + StepSize / 2].
for (unsigned StepSize = NumParts; StepSize > 1; StepSize /= 2) {
for (unsigned i = 0; i < NumParts; i += StepSize) {
unsigned ThisBits = StepSize * PartBits / 2;
EVT ThisVT = EVT::getIntegerVT(*DAG.getContext(), ThisBits);
SDValue &Part0 = Parts[i];
SDValue &Part1 = Parts[i+StepSize/2];
Part1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL,
ThisVT, Part0, DAG.getIntPtrConstant(1, DL));
Part0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL,
ThisVT, Part0, DAG.getIntPtrConstant(0, DL));
// At the final size, convert the integer pieces to PartVT if it differs.
if (ThisBits == PartBits && ThisVT != PartVT) {
Part0 = DAG.getNode(ISD::BITCAST, DL, PartVT, Part0);
Part1 = DAG.getNode(ISD::BITCAST, DL, PartVT, Part1);
}
}
}
// On big-endian targets the parts are stored in reverse order; this covers
// all OrigNumParts entries, including any odd tail.
if (DAG.getDataLayout().isBigEndian())
std::reverse(Parts, Parts + OrigNumParts);
}
/// Try to widen the vector \p Val to the vector type \p PartVT by padding it
/// with undef elements.
///
/// \returns the widened BUILD_VECTOR, or a null SDValue when \p PartVT is not
/// a vector, does not have more elements than \p Val, or has a different
/// element type.
static SDValue widenVectorToPartType(SelectionDAG &DAG,
                                     SDValue Val, const SDLoc &DL, EVT PartVT) {
  if (!PartVT.isVector())
    return SDValue();

  const EVT SrcVT = Val.getValueType();
  const unsigned DstCount = PartVT.getVectorNumElements();
  const unsigned SrcCount = SrcVT.getVectorNumElements();
  const bool CanWiden =
      DstCount > SrcCount &&
      PartVT.getVectorElementType() == SrcVT.getVectorElementType();
  if (!CanWiden)
    return SDValue();

  // Vector widening case, e.g. <2 x float> -> <4 x float>. Keep the existing
  // elements and fill the tail with undef.
  const EVT EltVT = PartVT.getVectorElementType();
  SmallVector<SDValue, 16> Elts;
  DAG.ExtractVectorElements(Val, Elts);
  SDValue Padding = DAG.getUNDEF(EltVT);
  Elts.append(DstCount - SrcCount, Padding);

  // FIXME: Use CONCAT for 2x -> 4x.
  return DAG.getBuildVector(PartVT, DL, Elts);
}
/// getCopyToPartsVector - Create a series of nodes that contain the specified
/// value split into legal parts.
///
/// \param DAG       DAG in which the new nodes are created.
/// \param DL        Debug location attached to the new nodes.
/// \param Val       Value to split; asserted to have a vector type.
/// \param Parts     Output array; entries [0, NumParts) are written.
/// \param NumParts  Number of parts to produce.
/// \param PartVT    Type each produced part is expected to have.
/// \param V         IR value forwarded unchanged to getCopyToParts.
/// \param CallConv  When it holds a value, the calling-convention specific
///                  vector breakdown is used instead of the generic one.
static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL,
SDValue Val, SDValue *Parts, unsigned NumParts,
MVT PartVT, const Value *V,
Optional<CallingConv::ID> CallConv) {
EVT ValueVT = Val.getValueType();
assert(ValueVT.isVector() && "Not a vector");
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
const bool IsABIRegCopy = CallConv.hasValue();
// Single-part case: convert the whole vector to PartVT in one step, trying
// the cheapest conversion first.
if (NumParts == 1) {
EVT PartEVT = PartVT;
if (PartEVT == ValueVT) {
// Nothing to do.
} else if (PartVT.getSizeInBits() == ValueVT.getSizeInBits()) {
// Bitconvert vector->vector case.
Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
} else if (SDValue Widened = widenVectorToPartType(DAG, Val, DL, PartVT)) {
// Same element type, more elements: pad with undef.
Val = Widened;
} else if (PartVT.isVector() &&
PartEVT.getVectorElementType().bitsGE(
ValueVT.getVectorElementType()) &&
PartEVT.getVectorNumElements() == ValueVT.getVectorNumElements()) {
// Promoted vector extract
Val = DAG.getAnyExtOrTrunc(Val, DL, PartVT);
} else {
if (ValueVT.getVectorNumElements() == 1) {
// One-element vector: pull out element 0 directly as PartVT.
Val = DAG.getNode(
ISD::EXTRACT_VECTOR_ELT, DL, PartVT, Val,
DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
} else {
// Multi-element vector into a strictly wider part: reinterpret the vector
// as an integer of the same width, then extend/truncate it to PartVT.
assert(PartVT.getSizeInBits() > ValueVT.getSizeInBits() &&
"lossy conversion of vector to scalar type");
EVT IntermediateType =
EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits());
Val = DAG.getBitcast(IntermediateType, Val);
Val = DAG.getAnyExtOrTrunc(Val, DL, PartVT);
}
}
assert(Val.getValueType() == PartVT && "Unexpected vector part value type");
Parts[0] = Val;
return;
}
// Handle a multi-element vector.
// Ask the target how ValueVT breaks down: the intermediate type, how many
// intermediates there are, and the register type/count used to hold them.
EVT IntermediateVT;
MVT RegisterVT;
unsigned NumIntermediates;
unsigned NumRegs;
if (IsABIRegCopy) {
NumRegs = TLI.getVectorTypeBreakdownForCallingConv(
*DAG.getContext(), CallConv.getValue(), ValueVT, IntermediateVT,
NumIntermediates, RegisterVT);
} else {
NumRegs =
TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT, IntermediateVT,
NumIntermediates, RegisterVT);
}
assert(NumRegs == NumParts && "Part count doesn't match vector breakdown!");
NumParts = NumRegs; // Silence a compiler warning.
assert(RegisterVT == PartVT && "Part type doesn't match vector breakdown!");
// A scalar intermediate counts as a single element.
unsigned IntermediateNumElts = IntermediateVT.isVector() ?
IntermediateVT.getVectorNumElements() : 1;
// Convert the vector to the appropriate type if necessary.
unsigned DestVectorNoElts = NumIntermediates * IntermediateNumElts;
EVT BuiltVectorTy = EVT::getVectorVT(
*DAG.getContext(), IntermediateVT.getScalarType(), DestVectorNoElts);
MVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
if (ValueVT != BuiltVectorTy) {
// Widen when possible; the BITCAST below is emitted either way.
if (SDValue Widened = widenVectorToPartType(DAG, Val, DL, BuiltVectorTy))
Val = Widened;
Val = DAG.getNode(ISD::BITCAST, DL, BuiltVectorTy, Val);
}
// Split the vector into intermediate operands.
SmallVector<SDValue, 8> Ops(NumIntermediates);
for (unsigned i = 0; i != NumIntermediates; ++i) {
if (IntermediateVT.isVector()) {
// Sub-vector i starts at element i * IntermediateNumElts.
Ops[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, IntermediateVT, Val,
DAG.getConstant(i * IntermediateNumElts, DL, IdxVT));
} else {
Ops[i] = DAG.getNode(
ISD::EXTRACT_VECTOR_ELT, DL, IntermediateVT, Val,
DAG.getConstant(i, DL, IdxVT));
}
}
// Split the intermediate operands into legal parts.
if (NumParts == NumIntermediates) {
// If the register was not expanded, promote or copy the value,
// as appropriate.
for (unsigned i = 0; i != NumParts; ++i)
getCopyToParts(DAG, DL, Ops[i], &Parts[i], 1, PartVT, V, CallConv);
} else if (NumParts > 0) {
// If the intermediate type was expanded, split each value into
// legal parts.
assert(NumIntermediates != 0 && "division by zero");
assert(NumParts % NumIntermediates == 0 &&
"Must expand into a divisible number of parts!");
// Each intermediate occupies Factor consecutive entries of Parts.
unsigned Factor = NumParts / NumIntermediates;
for (unsigned i = 0; i != NumIntermediates; ++i)
getCopyToParts(DAG, DL, Ops[i], &Parts[i * Factor], Factor, PartVT, V,
CallConv);
}
}
/// Build a RegsForValue describing a single value.
///
/// \param regs     Registers holding the value; copied into Regs, and their
///                 count becomes the only RegCount entry.
/// \param regvt    Register type, stored as the only RegVTs entry.
/// \param valuevt  Value type, stored as the only ValueVTs entry.
/// \param CC       Optional calling convention, stored in CallConv.
RegsForValue::RegsForValue(const SmallVector<unsigned, 4> &regs, MVT regvt,
EVT valuevt, Optional<CallingConv::ID> CC)
: ValueVTs(1, valuevt), RegVTs(1, regvt), Regs(regs),
RegCount(1, regs.size()), CallConv(CC) {}
/// Build a RegsForValue for IR type \p Ty, assigning consecutive register
/// numbers starting at \p Reg.
///
/// \p Ty is decomposed into its value types with ComputeValueVTs. For each
/// value type the register count and register type are queried from \p TLI,
/// using the calling-convention aware queries when isABIMangled() is true.
RegsForValue::RegsForValue(LLVMContext &Context, const TargetLowering &TLI,
                           const DataLayout &DL, unsigned Reg, Type *Ty,
                           Optional<CallingConv::ID> CC) {
  ComputeValueVTs(TLI, DL, Ty, ValueVTs);
  CallConv = CC;

  for (EVT VT : ValueVTs) {
    unsigned Count;
    MVT PartVT;
    if (isABIMangled()) {
      Count = TLI.getNumRegistersForCallingConv(Context, CC.getValue(), VT);
      PartVT = TLI.getRegisterTypeForCallingConv(Context, CC.getValue(), VT);
    } else {
      Count = TLI.getNumRegisters(Context, VT);
      PartVT = TLI.getRegisterType(Context, VT);
    }

    // Hand out the next Count consecutive register numbers to this value.
    for (unsigned End = Reg + Count; Reg != End; ++Reg)
      Regs.push_back(Reg);

    RegVTs.push_back(PartVT);
    RegCount.push_back(Count);
  }
}
/// Emit CopyFromReg nodes for every register described by this object and
/// assemble them into the described value(s).
///
/// \param DAG       DAG in which nodes are created.
/// \param FuncInfo  Queried for live-out information about virtual registers.
/// \param dl        Debug location for the new nodes.
/// \param Chain     In/out chain; updated after every register copy.
/// \param Flag      Optional in/out glue; when non-null each copy consumes it
///                  and it is updated to the copy's result #2.
/// \param V         IR value forwarded to getCopyFromParts.
/// \returns a MERGE_VALUES node over all values, or a null SDValue when
///          ValueVTs is empty.
SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
FunctionLoweringInfo &FuncInfo,
const SDLoc &dl, SDValue &Chain,
SDValue *Flag, const Value *V) const {
// A Value with type {} or [0 x %t] needs no registers.
if (ValueVTs.empty())
return SDValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Assemble the legal parts into the final values.
SmallVector<SDValue, 4> Values(ValueVTs.size());
SmallVector<SDValue, 8> Parts;
// 'Part' is the index into Regs of the first register of the current value.
for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) {
// Copy the legal parts from the registers.
EVT ValueVT = ValueVTs[Value];
unsigned NumRegs = RegCount[Value];
MVT RegisterVT = isABIMangled() ? TLI.getRegisterTypeForCallingConv(
*DAG.getContext(),
CallConv.getValue(), RegVTs[Value])
: RegVTs[Value];
Parts.resize(NumRegs);
for (unsigned i = 0; i != NumRegs; ++i) {
SDValue P;
if (!Flag) {
P = DAG.getCopyFromReg(Chain, dl, Regs[Part+i], RegisterVT);
} else {
P = DAG.getCopyFromReg(Chain, dl, Regs[Part+i], RegisterVT, *Flag);
*Flag = P.getValue(2);
}
// Thread the chain through this copy so the next one is ordered after it.
Chain = P.getValue(1);
Parts[i] = P;
// If the source register was virtual and if we know something about it,
// add an assert node.
if (!TargetRegisterInfo::isVirtualRegister(Regs[Part+i]) ||
!RegisterVT.isInteger())
continue;
const FunctionLoweringInfo::LiveOutInfo *LOI =
FuncInfo.GetLiveOutRegInfo(Regs[Part+i]);
if (!LOI)
continue;
unsigned RegSize = RegisterVT.getScalarSizeInBits();
unsigned NumSignBits = LOI->NumSignBits;
unsigned NumZeroBits = LOI->Known.countMinLeadingZeros();
if (NumZeroBits == RegSize) {
// The current value is a zero.
// Explicitly express that as it would be easier for
// optimizations to kick in.
Parts[i] = DAG.getConstant(0, dl, RegisterVT);
continue;
}
// FIXME: We capture more information than the dag can represent. For
// now, just use the tightest assertzext/assertsext possible.
bool isSExt;
EVT FromVT(MVT::Other);
if (NumZeroBits) {
// Known leading zeros: the value fits in RegSize - NumZeroBits bits.
FromVT = EVT::getIntegerVT(*DAG.getContext(), RegSize - NumZeroBits);
isSExt = false;
} else if (NumSignBits > 1) {
// Known sign bits: the value fits in RegSize - NumSignBits + 1 bits.
FromVT =
EVT::getIntegerVT(*DAG.getContext(), RegSize - NumSignBits + 1);
isSExt = true;
} else {
continue;
}
// Add an assertion node.
assert(FromVT != MVT::Other);
Parts[i] = DAG.getNode(isSExt ? ISD::AssertSext : ISD::AssertZext, dl,
RegisterVT, P, DAG.getValueType(FromVT));
}
Values[Value] = getCopyFromParts(DAG, dl, Parts.begin(), NumRegs,
RegisterVT, ValueVT, V, CallConv);
Part += NumRegs;
Parts.clear();
}
return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(ValueVTs), Values);
}
/// Split \p Val into legal parts and emit a CopyToReg for each register
/// described by this object.
///
/// \param Chain  In/out chain. Every copy uses the incoming chain; on return
///               it is the last copy's chain (single register, or glue in
///               use) or a TokenFactor over all copy chains.
/// \param Flag   Optional in/out glue threaded through the copies.
/// \param PreferredExtendType  Extension kind passed to getCopyToParts;
///               ANY_EXTEND is upgraded to ZERO_EXTEND once the target
///               reports the zero extension as free.
void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG,
                                 const SDLoc &dl, SDValue &Chain, SDValue *Flag,
                                 const Value *V,
                                 ISD::NodeType PreferredExtendType) const {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  ISD::NodeType ExtKind = PreferredExtendType;

  // Break every value into its legal parts, laid out back to back.
  const unsigned TotalRegs = Regs.size();
  SmallVector<SDValue, 8> Pieces(TotalRegs);
  unsigned FirstPiece = 0;
  for (unsigned Idx = 0, NumValues = ValueVTs.size(); Idx != NumValues;
       ++Idx) {
    const unsigned Count = RegCount[Idx];
    MVT PartVT = RegVTs[Idx];
    if (isABIMangled())
      PartVT = TLI.getRegisterTypeForCallingConv(*DAG.getContext(),
                                                 CallConv.getValue(), PartVT);

    if (ExtKind == ISD::ANY_EXTEND && TLI.isZExtFree(Val, PartVT))
      ExtKind = ISD::ZERO_EXTEND;

    SDValue Piece = Val.getValue(Val.getResNo() + Idx);
    getCopyToParts(DAG, dl, Piece, &Pieces[FirstPiece], Count, PartVT, V,
                   CallConv, ExtKind);
    FirstPiece += Count;
  }

  // Copy the parts into the registers.
  SmallVector<SDValue, 8> Chains(TotalRegs);
  for (unsigned RegIdx = 0; RegIdx != TotalRegs; ++RegIdx) {
    SDValue Copy;
    if (Flag) {
      Copy = DAG.getCopyToReg(Chain, dl, Regs[RegIdx], Pieces[RegIdx], *Flag);
      *Flag = Copy.getValue(1);
    } else {
      Copy = DAG.getCopyToReg(Chain, dl, Regs[RegIdx], Pieces[RegIdx]);
    }
    Chains[RegIdx] = Copy.getValue(0);
  }

  if (TotalRegs != 1 && !Flag) {
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
    return;
  }

  // If NumRegs > 1 && Flag is used then the use of the last CopyToReg is
  // flagged to it. That is the CopyToReg nodes and the user are considered
  // a single scheduling unit. If we create a TokenFactor and return it as
  // chain, then the TokenFactor is both a predecessor (operand) of the
  // user as well as a successor (the TF operands are flagged to the user).
  // c1, f1 = CopyToReg
  // c2, f2 = CopyToReg
  // c3     = TokenFactor c1, c2
  // ...
  //        = op c3, ..., f2
  Chain = Chains[TotalRegs - 1];
}
void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching,
unsigned MatchingIdx, const SDLoc &dl,
SelectionDAG &DAG,
std::vector<SDValue> &Ops) const {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned Flag = InlineAsm::getFlagWord(Code, Regs.size());
if (HasMatching)
Flag = InlineAsm::getFlagWordForMatchingOp(Flag, MatchingIdx);
else if (!Regs.empty() &&
TargetRegisterInfo::isVirtualRegister(Regs.front())) {
// Put the register class of the virtual registers in the flag word. That
// way, later passes can recompute register class constraints for inline
// assembly as well as normal instructions.
// Don't do this for tied operands that can use the regclass information
// from the def.
const MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
const TargetRegisterClass *RC = MRI.getRegClass(Regs.front());
Flag = InlineAsm::getFlagWordForRegClass(Flag, RC->getID());
}
SDValue Res = DAG.getTargetConstant(Flag, dl, MVT::i32);
Ops.push_back(Res);
if (Code == InlineAsm::Kind_Clobber) {
// Clobbers should always have a 1:1 mapping with registers, and may
// reference registers that have illegal (e.g. vector) types. Hence, we
// shouldn't try to apply any sort of splitting logic to them.
assert(Regs.size() == RegVTs.size() && Regs.size() == ValueVTs.size() &&
"No 1:1 mapping from clobbers to regs?");
unsigned SP = TLI.getStackPointerRegisterToSaveRestore();
(void)SP;
for (unsigned I = 0, E = ValueVTs.size(); I != E; ++I) {
Ops.push_back(DAG.getRegister(Regs[I], RegVTs[I]));
assert(
(Regs[I] != SP ||
DAG.getMachineFunction().getFrameInfo().hasOpaqueSPAdjustment()) &&
"If we clobbered the stack pointer, MFI should know about it.");
}
return;
}
for (unsigned Value = 0, Reg = 0, e = ValueVTs.size(); Value != e; ++Value) {
unsigned NumRegs = TLI.getNumRegisters(*DAG.getContext(), ValueVTs[Value]);
MVT RegisterVT = RegVTs[Value];
for (unsigned i = 0; i != NumRegs; ++i) {
assert(Reg < Regs.size() && "Mismatch in # registers expected");
unsigned TheReg = Regs[Reg++];
Ops.push_back(DAG.getRegister(TheReg, RegisterVT));
}
}
}
/// Return one (register, register size in bits) pair per register, in the
/// order the registers appear in Regs. The size comes from the register type
/// of the value the register belongs to.
SmallVector<std::pair<unsigned, unsigned>, 4>
RegsForValue::getRegsAndSizes() const {
  SmallVector<std::pair<unsigned, unsigned>, 4> Result;
  unsigned NextReg = 0;
  for (auto Entry : zip_first(RegCount, RegVTs)) {
    const unsigned Count = std::get<0>(Entry);
    const MVT PartVT = std::get<1>(Entry);
    const unsigned SizeInBits = PartVT.getSizeInBits();

    // The next Count registers all belong to this value.
    const unsigned StopAt = NextReg + Count;
    while (NextReg != StopAt) {
      Result.push_back(std::make_pair(Regs[NextReg], SizeInBits));
      ++NextReg;
    }
  }
  return Result;
}
/// Record the per-function analysis handles and refresh the state derived
/// from the DAG (data layout, context), then initialize SL.
void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis *aa,
                               const TargetLibraryInfo *li) {
  // Handles supplied by the caller.
  GFI = gfi;
  AA = aa;
  LibInfo = li;

  // State cached from the DAG.
  Context = DAG.getContext();
  DL = &DAG.getDataLayout();

  LPadToCallSiteMap.clear();
  SL->init(DAG.getTargetLoweringInfo(), TM, DAG.getDataLayout());
}
/// Reset the builder's lowering state: drop the node maps and pending lists,
/// forget the current instruction and tail-call flag, rewind SDNodeOrder to
/// LowestSDNodeOrder, and clear the statepoint lowering state.
void SelectionDAGBuilder::clear() {
  // Scalar state.
  CurInst = nullptr;
  HasTailCall = false;
  SDNodeOrder = LowestSDNodeOrder;

  // Containers.
  NodeMap.clear();
  UnusedArgNodeMap.clear();
  PendingLoads.clear();
  PendingExports.clear();
  StatepointLowering.clear();
}
/// Discard every entry in DanglingDebugInfoMap without emitting anything.
void SelectionDAGBuilder::clearDanglingDebugInfo() {
  DanglingDebugInfoMap.clear();
}
/// Return the current DAG root after folding in all pending loads.
///
/// With no pending loads the existing root is returned untouched. Otherwise
/// the pending loads (a single one directly, several via a token factor)
/// become the new root and the pending list is emptied.
SDValue SelectionDAGBuilder::getRoot() {
  if (PendingLoads.empty())
    return DAG.getRoot();

  SDValue NewRoot;
  if (PendingLoads.size() == 1)
    NewRoot = PendingLoads[0];
  else
    // More than one pending load: join them with a token factor node.
    NewRoot = DAG.getTokenFactor(getCurSDLoc(), PendingLoads);

  PendingLoads.clear();
  DAG.setRoot(NewRoot);
  return NewRoot;
}
/// Return the DAG root after folding in all pending exports.
///
/// With no pending exports the current root is returned unchanged. Otherwise
/// the exports are combined into a TokenFactor that becomes the new root;
/// the old root is added as an operand unless it is the entry token or some
/// export already has it as operand 0.
SDValue SelectionDAGBuilder::getControlRoot() {
  SDValue Root = DAG.getRoot();
  if (PendingExports.empty())
    return Root;

  // Turn all of the CopyToReg chains into one factored node.
  if (Root.getOpcode() != ISD::EntryToken) {
    bool AlreadyChained = false;
    for (const SDValue &Export : PendingExports) {
      assert(Export.getNode()->getNumOperands() > 1);
      if (Export.getNode()->getOperand(0) == Root) {
        // Don't add the root if we already indirectly depend on it.
        AlreadyChained = true;
        break;
      }
    }
    if (!AlreadyChained)
      PendingExports.push_back(Root);
  }

  Root = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other,
                     PendingExports);
  PendingExports.clear();
  DAG.setRoot(Root);
  return Root;
}
void SelectionDAGBuilder::visit(const Instruction &I) {
// Set up outgoing PHI node register values before emitting the terminator.
if (I.isTerminator()) {
HandlePHINodesInSuccessorBlocks(I.getParent());
}
// Increase the SDNodeOrder if dealing with a non-debug instruction.
if (!isa<DbgInfoIntrinsic>(I))
++SDNodeOrder;
CurInst = &I;
visit(I.getOpcode(), I);
if (auto *FPMO = dyn_cast<FPMathOperator>(&I)) {
// Propagate the fast-math-flags of this IR instruction to the DAG node that
// maps to this instruction.
// TODO: We could handle all flags (nsw, etc) here.
// TODO: If an IR instruction maps to >1 node, only the final node will have
// flags set.
if (SDNode *Node = getNodeForIRValue(&I)) {
SDNodeFlags IncomingFlags;
IncomingFlags.copyFMF(*FPMO);
if (!Node->getFlags().isDefined())
Node->setFlags(IncomingFlags);
else
Node->intersectFlagsWith(IncomingFlags);
}
}
if (!I.isTerminator() && !HasTailCall &&
!isStatepoint(&I)) // statepoints handle their exports internally
CopyToExportRegsIfNeeded(&I);
CurInst = nullptr;
}
/// PHI nodes must never reach the per-opcode visitor; reaching this function
/// is treated as unreachable.
void SelectionDAGBuilder::visitPHI(const PHINode &) {
  llvm_unreachable("SelectionDAGBuilder shouldn't visit PHI nodes!");
}
/// Dispatch \p I to the visit<OPCODE> member matching \p Opcode.
///
/// \param Opcode  Instruction opcode selecting the handler.
/// \param I       The user to lower; it is cast to the class that
///                Instruction.def associates with the opcode.
void SelectionDAGBuilder::visit(unsigned Opcode, const User &I) {
// Note: this doesn't use InstVisitor, because it has to work with
// ConstantExpr's in addition to instructions.
switch (Opcode) {
default: llvm_unreachable("Unknown instruction type encountered!");
// Build the switch statement using the Instruction.def file.
// Each HANDLE_INST entry expands to one 'case' that calls visit##OPCODE.
#define HANDLE_INST(NUM, OPCODE, CLASS) \
case Instruction::OPCODE: visit##OPCODE((const CLASS&)I); break;
#include "llvm/IR/Instruction.def"
}
}
void SelectionDAGBuilder::dropDanglingDebugInfo(const DILocalVariable *Variable,
const DIExpression *Expr) {
auto isMatchingDbgValue = [&](DanglingDebugInfo &DDI) {
const DbgValueInst *DI = DDI.getDI();
DIVariable *DanglingVariable = DI->getVariable();
DIExpression *DanglingExpr = DI->getExpression();
if (DanglingVariable == Variable && Expr->fragmentsOverlap(DanglingExpr)) {
LLVM_DEBUG(dbgs() << "Dropping dangling debug info for " << *DI << "\n");
return true;
}
return false;
};
for (auto &DDIMI : DanglingDebugInfoMap) {
DanglingDebugInfoVector &DDIV = DDIMI.second;
// If debug info is to be dropped, run it through final checks to see
// whether it can be salvaged.
for (auto &DDI : DDIV)
if (isMatchingDbgValue(DDI))
salvageUnresolvedDbgValue(DDI);
DDIV.erase(remove_if(DDIV, isMatchingDbgValue), DDIV.end());
}
}
// resolveDanglingDebugInfo - if we saw an earlier dbg_value referring to V,
// generate the debug data structures now that we've seen its definition.
//
// V   - the IR value whose definition has just been lowered.
// Val - the SDValue produced for V. It may be null (no node); in that case an
//       undef constant debug value is emitted for every dangling entry.
//
// All dangling entries recorded for V are consumed.
void SelectionDAGBuilder::resolveDanglingDebugInfo(const Value *V,
                                                   SDValue Val) {
  auto DanglingDbgInfoIt = DanglingDebugInfoMap.find(V);
  if (DanglingDbgInfoIt == DanglingDebugInfoMap.end())
    return;

  DanglingDebugInfoVector &DDIV = DanglingDbgInfoIt->second;
  for (auto &DDI : DDIV) {
    const DbgValueInst *DI = DDI.getDI();
    assert(DI && "Ill-formed DanglingDebugInfo");
    DebugLoc dl = DDI.getdl();
    unsigned DbgSDNodeOrder = DDI.getSDNodeOrder();
    DILocalVariable *Variable = DI->getVariable();
    DIExpression *Expr = DI->getExpression();
    assert(Variable->isValidLocationForIntrinsic(dl) &&
           "Expected inlined-at fields to agree");

    if (Val.getNode()) {
      // Only query the IR order once Val is known to have a node; reading it
      // before this check would dereference a null node.
      unsigned ValSDNodeOrder = Val.getNode()->getIROrder();

      // FIXME: I doubt that it is correct to resolve a dangling DbgValue as a
      // FuncArgumentDbgValue (it would be hoisted to the function entry, and if
      // we couldn't resolve it directly when examining the DbgValue intrinsic
      // in the first place we should not be more successful here). Unless we
      // have some test case that prove this to be correct we should avoid
      // calling EmitFuncArgumentDbgValue here.
      if (!EmitFuncArgumentDbgValue(V, Variable, Expr, dl, false, Val)) {
        LLVM_DEBUG(dbgs() << "Resolve dangling debug info [order="
                          << DbgSDNodeOrder << "] for:\n " << *DI << "\n");
        LLVM_DEBUG(dbgs() << " By mapping to:\n "; Val.dump());
        // Increase the SDNodeOrder for the DbgValue here to make sure it is
        // inserted after the definition of Val when emitting the instructions
        // after ISel. An alternative could be to teach
        // ScheduleDAGSDNodes::EmitSchedule to delay the insertion properly.
        LLVM_DEBUG(if (ValSDNodeOrder > DbgSDNodeOrder) dbgs()
                   << "changing SDNodeOrder from " << DbgSDNodeOrder << " to "
                   << ValSDNodeOrder << "\n");
        SDDbgValue *SDV = getDbgValue(Val, Variable, Expr, dl,
                                      std::max(DbgSDNodeOrder, ValSDNodeOrder));
        DAG.AddDbgValue(SDV, Val.getNode(), false);
      } else
        LLVM_DEBUG(dbgs() << "Resolved dangling debug info for " << *DI
                          << "in EmitFuncArgumentDbgValue\n");
    } else {
      // No node was produced for V: describe the variable as undef.
      LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
      auto Undef =
          UndefValue::get(DDI.getDI()->getVariableLocation()->getType());
      auto SDV =
          DAG.getConstantDbgValue(Variable, Expr, Undef, dl, DbgSDNodeOrder);
      DAG.AddDbgValue(SDV, nullptr, false);
    }
  }
  DDIV.clear();
}
/// Last-chance handling for a dangling dbg.value.
///
/// First tries to describe the referenced value directly with
/// handleDebugValue. If that fails, repeatedly walks back through the
/// defining instruction's first operand, rewriting the expression with
/// salvageDebugInfoImpl, and retries. If nothing can be encoded, an undef
/// constant debug value is emitted for the variable.
void SelectionDAGBuilder::salvageUnresolvedDbgValue(DanglingDebugInfo &DDI) {
  Value *V = DDI.getDI()->getValue();
  DILocalVariable *Var = DDI.getDI()->getVariable();
  DIExpression *Expr = DDI.getDI()->getExpression();
  DebugLoc DL = DDI.getdl();
  DebugLoc InstDL = DDI.getDI()->getDebugLoc();
  unsigned SDOrder = DDI.getSDNodeOrder();

  // Currently we consider only dbg.value intrinsics -- we tell the salvager
  // that DW_OP_stack_value is desired.
  assert(isa<DbgValueInst>(DDI.getDI()));
  bool StackValue = true;

  // Can this Value can be encoded without any further work?
  if (handleDebugValue(V, Var, Expr, DL, InstDL, SDOrder))
    return;

  // Attempt to salvage back through as many instructions as possible. Bail if
  // a non-instruction is seen, such as a constant expression or global
  // variable. FIXME: Further work could recover those too.
  while (isa<Instruction>(V)) {
    Instruction &VAsInst = *cast<Instruction>(V);
    DIExpression *NewExpr = salvageDebugInfoImpl(VAsInst, Expr, StackValue);

    // If we cannot salvage any further, and haven't yet found a suitable debug
    // expression, bail out.
    if (!NewExpr)
      break;

    // New value and expr now represent this debuginfo.
    V = VAsInst.getOperand(0);
    Expr = NewExpr;

    // Some kind of simplification occurred: check whether the operand of the
    // salvaged debug expression can be encoded in this DAG.
    if (handleDebugValue(V, Var, Expr, DL, InstDL, SDOrder)) {
      // Stream the instruction and value themselves (not their addresses) so
      // the debug output is readable.
      LLVM_DEBUG(dbgs() << "Salvaged debug location info for:\n "
                        << *DDI.getDI() << "\nBy stripping back to:\n " << *V);
      return;
    }
  }

  // This was the final opportunity to salvage this debug information, and it
  // couldn't be done. Place an undef DBG_VALUE at this location to terminate
  // any earlier variable location.
  auto Undef = UndefValue::get(DDI.getDI()->getVariableLocation()->getType());
  auto SDV = DAG.getConstantDbgValue(Var, Expr, Undef, DL, SDNodeOrder);
  DAG.AddDbgValue(SDV, nullptr, false);

  LLVM_DEBUG(dbgs() << "Dropping debug value info for:\n " << *DDI.getDI()
                    << "\n");
  LLVM_DEBUG(dbgs() << " Last seen at:\n " << *DDI.getDI()->getOperand(0)
                    << "\n");
}
/// Try to emit a debug value describing \p Var / \p Expr in terms of \p V
/// without generating any new code for \p V.
///
/// \param V       IR value the variable refers to.
/// \param Var     Variable being described.
/// \param Expr    Expression applied to the location.
/// \param dl      Debug location attached to the emitted debug value.
/// \param InstDL  Debug location of the originating intrinsic; only its
///                inlined-at field is consulted here.
/// \param Order   NOTE(review): not referenced in this body; the SDNodeOrder
///                member is used for every emitted debug value instead --
///                confirm this is intentional.
/// \returns true if a debug value was emitted (or handled by
///          EmitFuncArgumentDbgValue), false if \p V cannot be described yet.
bool SelectionDAGBuilder::handleDebugValue(const Value *V, DILocalVariable *Var,
DIExpression *Expr, DebugLoc dl,
DebugLoc InstDL, unsigned Order) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDDbgValue *SDV;
// Simple constants can be described directly, with no SDNode attached.
if (isa<ConstantInt>(V) || isa<ConstantFP>(V) || isa<UndefValue>(V) ||
isa<ConstantPointerNull>(V)) {
SDV = DAG.getConstantDbgValue(Var, Expr, V, dl, SDNodeOrder);
DAG.AddDbgValue(SDV, nullptr, false);
return true;
}
// If the Value is a frame index, we can create a FrameIndex debug value
// without relying on the DAG at all.
if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
auto SI = FuncInfo.StaticAllocaMap.find(AI);
if (SI != FuncInfo.StaticAllocaMap.end()) {
auto SDV =
DAG.getFrameIndexDbgValue(Var, Expr, SI->second,
/*IsIndirect*/ false, dl, SDNodeOrder);
// Do not attach the SDNodeDbgValue to an SDNode: this variable location
// is still available even if the SDNode gets optimized out.
DAG.AddDbgValue(SDV, nullptr, false);
return true;
}
}
// Do not use getValue() in here; we don't want to generate code at
// this point if it hasn't been done yet.
SDValue N = NodeMap[V];
if (!N.getNode() && isa<Argument>(V)) // Check unused arguments map.
N = UnusedArgNodeMap[V];
if (N.getNode()) {
// A node already exists: prefer the function-argument form, otherwise
// attach an ordinary debug value to the node.
if (EmitFuncArgumentDbgValue(V, Var, Expr, dl, false, N))
return true;
SDV = getDbgValue(N, Var, Expr, dl, SDNodeOrder);
DAG.AddDbgValue(SDV, N.getNode(), false);
return true;
}
// Special rules apply for the first dbg.values of parameter variables in a
// function. Identify them by the fact they reference Argument Values, that
// they're parameters, and they are parameters of the current function. We
// need to let them dangle until they get an SDNode.
bool IsParamOfFunc = isa<Argument>(V) && Var->isParameter() &&
!InstDL.getInlinedAt();
if (!IsParamOfFunc) {
// The value is not used in this block yet (or it would have an SDNode).
// We still want the value to appear for the user if possible -- if it has
// an associated VReg, we can refer to that instead.
auto VMI = FuncInfo.ValueMap.find(V);
if (VMI != FuncInfo.ValueMap.end()) {
unsigned Reg = VMI->second;
// If this is a PHI node, it may be split up into several MI PHI nodes
// (in FunctionLoweringInfo::set).
RegsForValue RFV(V->getContext(), TLI, DAG.getDataLayout(), Reg,
V->getType(), None);
if (RFV.occupiesMultipleRegs()) {
// Describe the variable piecewise: one fragment per register, covering
// at most BitsToDescribe bits in total.
unsigned Offset = 0;
unsigned BitsToDescribe = 0;
if (auto VarSize = Var->getSizeInBits())
BitsToDescribe = *VarSize;
// An existing fragment on Expr overrides the variable's full size.
if (auto Fragment = Expr->getFragmentInfo())
BitsToDescribe = Fragment->SizeInBits;
for (auto RegAndSize : RFV.getRegsAndSizes()) {
unsigned RegisterSize = RegAndSize.second;
// Bail out if all bits are described already.
if (Offset >= BitsToDescribe)
break;
// Clamp the last fragment so it does not run past BitsToDescribe.
unsigned FragmentSize = (Offset + RegisterSize > BitsToDescribe)
? BitsToDescribe - Offset
: RegisterSize;
auto FragmentExpr = DIExpression::createFragmentExpression(
Expr, Offset, FragmentSize);
// NOTE(review): Offset is not advanced when no fragment expression could
// be created for this register -- confirm this is intended.
if (!FragmentExpr)
continue;
SDV = DAG.getVRegDbgValue(Var, *FragmentExpr, RegAndSize.first,
false, dl, SDNodeOrder);
DAG.AddDbgValue(SDV, nullptr, false);
Offset += RegisterSize;
}
} else {
SDV = DAG.getVRegDbgValue(Var, Expr, Reg, false, dl, SDNodeOrder);
DAG.AddDbgValue(SDV, nullptr, false);
}
return true;
}
}
return false;
}
/// Give every remaining dangling dbg.value one last salvage attempt, then
/// drop all dangling entries.
void SelectionDAGBuilder::resolveOrClearDbgInfo() {
  // Try to fixup any remaining dangling debug info -- and drop it if we can't.
  for (auto &Entry : DanglingDebugInfoMap) {
    auto &Pending = Entry.second;
    for (auto &DDI : Pending)
      salvageUnresolvedDbgValue(DDI);
  }
  clearDanglingDebugInfo();
}
/// getCopyFromRegs - Read the value \p V back from its virtual register(s).
///
/// If FuncInfo.ValueMap has a register for \p V, emit CopyFromReg nodes of
/// type \p Ty (chained to the entry node), resolve any dangling debug info
/// for \p V, and return the result. Otherwise return an empty SDValue().
SDValue SelectionDAGBuilder::getCopyFromRegs(const Value *V, Type *Ty) {
  DenseMap<const Value *, unsigned>::iterator Found =
      FuncInfo.ValueMap.find(V);
  if (Found == FuncInfo.ValueMap.end())
    return SDValue();

  const unsigned FirstReg = Found->second;
  // This is not an ABI copy, so no calling convention is supplied.
  RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(),
                   DAG.getDataLayout(), FirstReg, Ty, None);

  SDValue Chain = DAG.getEntryNode();
  SDValue Copied =
      RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, V);
  resolveDanglingDebugInfo(V, Copied);
  return Copied;
}
/// getValue - Return an SDValue for the given Value.
///
/// Lookup order: the NodeMap cache, then a copy from the value's virtual
/// register, and finally a freshly built node, which is cached in NodeMap and
/// used to resolve dangling debug info.
SDValue SelectionDAGBuilder::getValue(const Value *V) {
  // If we already have an SDValue for this value, use it. It's important
  // to do this first, so that we don't create a CopyFromReg if we already
  // have a regular SDValue.
  SDValue &Cached = NodeMap[V];
  if (Cached.getNode())
    return Cached;

  // If there's a virtual register allocated and initialized for this
  // value, use it.
  SDValue FromReg = getCopyFromRegs(V, V->getType());
  if (FromReg.getNode())
    return FromReg;

  // Otherwise create a new SDValue and remember it. The map is indexed again
  // here rather than reusing the reference taken above.
  SDValue Created = getValueImpl(V);
  NodeMap[V] = Created;
  resolveDanglingDebugInfo(V, Created);
  return Created;
}
// Return true if V has an entry in NodeMap or in FuncInfo.ValueMap.
bool SelectionDAGBuilder::findValue(const Value *V) const {
  if (NodeMap.find(V) != NodeMap.end())
    return true;
  return FuncInfo.ValueMap.find(V) != FuncInfo.ValueMap.end();
}
/// getNonRegisterValue - Return an SDValue for the given Value, but
/// don't look in FuncInfo.ValueMap for a virtual register.
///
/// A cached node is returned as-is, except that constant (integer or FP)
/// nodes first have their debug location cleared.
SDValue SelectionDAGBuilder::getNonRegisterValue(const Value *V) {
  SDValue &Cached = NodeMap[V];
  if (!Cached.getNode()) {
    // Nothing cached: create a new SDValue and remember it.
    SDValue Created = getValueImpl(V);
    NodeMap[V] = Created;
    resolveDanglingDebugInfo(V, Created);
    return Created;
  }

  const bool IsConstantNode =
      isa<ConstantSDNode>(Cached) || isa<ConstantFPSDNode>(Cached);
  if (IsConstantNode) {
    // Remove the debug location from the node as the node is about to be used
    // in a location which may differ from the original debug location. This
    // is relevant to Constant and ConstantFP nodes because they can appear
    // as constant expressions inside PHI nodes.
    Cached->setDebugLoc(DebugLoc());
  }
  return Cached;
}
/// getValueImpl - Helper function for getValue and getNonRegisterValue.
/// Create an SDValue for the given value.
///
/// Handles, in order: constants (by kind), static allocas (as frame indices),
/// and other instructions (read back from a freshly initialized register).
/// Any other kind of value is treated as unreachable. An empty struct/array
/// constant yields a null SDValue.
SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (const Constant *C = dyn_cast<Constant>(V)) {
EVT VT = TLI.getValueType(DAG.getDataLayout(), V->getType(), true);
if (const ConstantInt *CI = dyn_cast<ConstantInt>(C))
return DAG.getConstant(*CI, getCurSDLoc(), VT);
if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
return DAG.getGlobalAddress(GV, getCurSDLoc(), VT);
if (isa<ConstantPointerNull>(C)) {
// Null pointers become the constant 0 of the pointer type for V's address
// space.
unsigned AS = V->getType()->getPointerAddressSpace();
return DAG.getConstant(0, getCurSDLoc(),
TLI.getPointerTy(DAG.getDataLayout(), AS));
}
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
return DAG.getConstantFP(*CFP, getCurSDLoc(), VT);
// Aggregate undefs are handled further down, element by element.
if (isa<UndefValue>(C) && !V->getType()->isAggregateType())
return DAG.getUNDEF(VT);
if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
// Lower the constant expression through the opcode visitor, which is
// expected to record its result in NodeMap.
visit(CE->getOpcode(), *CE);
SDValue N1 = NodeMap[V];
assert(N1.getNode() && "visit didn't populate the NodeMap!");
return N1;
}
if (isa<ConstantStruct>(C) || isa<ConstantArray>(C)) {
SmallVector<SDValue, 4> Constants;
for (User::const_op_iterator OI = C->op_begin(), OE = C->op_end();
OI != OE; ++OI) {
SDNode *Val = getValue(*OI).getNode();
// If the operand is an empty aggregate, there are no values.
if (!Val) continue;
// Add each leaf value from the operand to the Constants list
// to form a flattened list of all the values.
for (unsigned i = 0, e = Val->getNumValues(); i != e; ++i)
Constants.push_back(SDValue(Val, i));
}
return DAG.getMergeValues(Constants, getCurSDLoc());
}
if (const ConstantDataSequential *CDS =
dyn_cast<ConstantDataSequential>(C)) {
SmallVector<SDValue, 4> Ops;
for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) {
SDNode *Val = getValue(CDS->getElementAsConstant(i)).getNode();
// Add each leaf value from the element to the Ops list
// to form a flattened list of all the values.
// (The inner i/e intentionally shadow the outer loop variables.)
for (unsigned i = 0, e = Val->getNumValues(); i != e; ++i)
Ops.push_back(SDValue(Val, i));
}
// Arrays are returned as merged values; anything else as a BUILD_VECTOR,
// which is also cached in NodeMap.
if (isa<ArrayType>(CDS->getType()))
return DAG.getMergeValues(Ops, getCurSDLoc());
return NodeMap[V] = DAG.getBuildVector(VT, getCurSDLoc(), Ops);
}
if (C->getType()->isStructTy() || C->getType()->isArrayTy()) {
// Remaining struct/array constants are all-zero or undef: build one zero
// or undef per flattened element type.
assert((isa<ConstantAggregateZero>(C) || isa<UndefValue>(C)) &&
"Unknown struct or array constant!");
SmallVector<EVT, 4> ValueVTs;
ComputeValueVTs(TLI, DAG.getDataLayout(), C->getType(), ValueVTs);
unsigned NumElts = ValueVTs.size();
if (NumElts == 0)
return SDValue(); // empty struct
SmallVector<SDValue, 4> Constants(NumElts);
for (unsigned i = 0; i != NumElts; ++i) {
EVT EltVT = ValueVTs[i];
if (isa<UndefValue>(C))
Constants[i] = DAG.getUNDEF(EltVT);
else if (EltVT.isFloatingPoint())
Constants[i] = DAG.getConstantFP(0, getCurSDLoc(), EltVT);
else
Constants[i] = DAG.getConstant(0, getCurSDLoc(), EltVT);
}
return DAG.getMergeValues(Constants, getCurSDLoc());
}
if (const BlockAddress *BA = dyn_cast<BlockAddress>(C))
return DAG.getBlockAddress(BA, VT);
// Every other constant kind has been handled above, so the type is cast to
// a vector type here (cast<> asserts on mismatch).
VectorType *VecTy = cast<VectorType>(V->getType());
unsigned NumElements = VecTy->getNumElements();
// Now that we know the number and type of the elements, get that number of
// elements into the Ops array based on what kind of constant it is.
SmallVector<SDValue, 16> Ops;
if (const ConstantVector *CV = dyn_cast<ConstantVector>(C)) {
for (unsigned i = 0; i != NumElements; ++i)
Ops.push_back(getValue(CV->getOperand(i)));
} else {
// All-zero vector: replicate a single zero element.
assert(isa<ConstantAggregateZero>(C) && "Unknown vector constant!");
EVT EltVT =
TLI.getValueType(DAG.getDataLayout(), VecTy->getElementType());
SDValue Op;
if (EltVT.isFloatingPoint())
Op = DAG.getConstantFP(0, getCurSDLoc(), EltVT);
else
Op = DAG.getConstant(0, getCurSDLoc(), EltVT);
Ops.assign(NumElements, Op);
}
// Create a BUILD_VECTOR node.
return NodeMap[V] = DAG.getBuildVector(VT, getCurSDLoc(), Ops);
}
// If this is a static alloca, generate it as the frameindex instead of
// computation.
if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
DenseMap<const AllocaInst*, int>::iterator SI =
FuncInfo.StaticAllocaMap.find(AI);
if (SI != FuncInfo.StaticAllocaMap.end())
return DAG.getFrameIndex(SI->second,
TLI.getFrameIndexTy(DAG.getDataLayout()));
}
// If this is an instruction which fast-isel has deferred, select it now.
if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
// Initialize a register for the instruction and read the value back from
// it, chained to the entry node.
unsigned InReg = FuncInfo.InitializeRegForValue(Inst);
RegsForValue RFV(*DAG.getContext(), TLI, DAG.getDataLayout(), InReg,
Inst->getType(), getABIRegCopyCC(V));
SDValue Chain = DAG.getEntryNode();
return RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, V);
}
llvm_unreachable("Can't get register for value!");
}
void SelectionDAGBuilder::visitCatchPad(const CatchPadInst &I) {
auto Pers = classifyEHPersonality(FuncInfo.Fn->getPersonalityFn());
bool IsMSVCCXX = Pers == EHPersonality::MSVC_CXX;
bool IsCoreCLR = Pers == EHPersonality::CoreCLR;
bool IsSEH = isAsynchronousEHPersonality(Pers);
bool IsWasmCXX = Pers == EHPersonality::Wasm_CXX;
MachineBasicBlock *CatchPadMBB = FuncInfo.MBB;
if (!IsSEH)
CatchPadMBB->setIsEHScopeEntry();
// In MSVC C++ and CoreCLR, catchblocks are funclets and need prologues.
if (IsMSVCCXX || IsCoreCLR)
CatchPadMBB->setIsEHFuncletEntry();
// Wasm does not need catchpads anymore
if (!IsWasmCXX)
DAG.setRoot(DAG.getNode(ISD::CATCHPAD, getCurSDLoc(), MVT::Other,
getControlRoot()));
}
void SelectionDAGBuilder::visitCatchRet(const CatchReturnInst &I) {
// Update machine-CFG edge.
MachineBasicBlock *TargetMBB = FuncInfo.MBBMap[I.getSuccessor()];
FuncInfo.MBB->addSuccessor(TargetMBB);
auto Pers = classifyEHPersonality(FuncInfo.Fn->getPersonalityFn());
bool IsSEH = isAsynchronousEHPersonality(Pers);
if (IsSEH) {
// If this is not a fall-through branch or optimizations are switched off,
// emit the branch.
if (TargetMBB != NextBlock(FuncInfo.MBB) ||
TM.getOptLevel() == CodeGenOpt::None)
DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other,
getControlRoot(), DAG.getBasicBlock(TargetMBB)));
return;
}
// Figure out the funclet membership for the catchret's successor.
// This will be used by the FuncletLayout pass to determine how to order the
// BB's.
// A 'catchret' returns to the outer scope's color.
Value *ParentPad = I.getCatchSwitchParentPad();
const BasicBlock *SuccessorColor;
if (isa<ConstantTokenNone>(ParentPad))
SuccessorColor = &FuncInfo.Fn->getEntryBlock();
else
SuccessorColor = cast<Instruction>(ParentPad)->getParent();
assert(SuccessorColor && "No parent funclet for catchret!");
MachineBasicBlock *SuccessorColorMBB = FuncInfo.MBBMap[SuccessorColor];
assert(SuccessorColorMBB && "No MBB for SuccessorColor!");
// Create the terminator node.
SDValue Ret = DAG.getNode(ISD::CATCHRET, getCurSDLoc(), MVT::Other,
getControlRoot(), DAG.getBasicBlock(TargetMBB),
DAG.getBasicBlock(SuccessorColorMBB));
DAG.setRoot(Ret);
}
void SelectionDAGBuilder::visitCleanupPad(const CleanupPadInst &CPI) {
// Don't emit any special code for the cleanuppad instruction. It just marks
// the start of an EH scope/funclet.
FuncInfo.MBB->setIsEHScopeEntry();
auto Pers = classifyEHPersonality(FuncInfo.Fn->getPersonalityFn());
if (Pers != EHPersonality::Wasm_CXX) {
FuncInfo.MBB->setIsEHFuncletEntry();
FuncInfo.MBB->setIsCleanupFuncletEntry();
}
}
// For wasm, there's always a single catch pad attached to a catchswitch, and
// the control flow always stops at the single catch pad, as it does for a
// cleanup pad. In case the exception caught is not of the types the catch pad
// catches, it will be rethrown by a rethrow.
//
// Appends to UnwindDests the machine basic blocks that an unwind edge into
// EHPadBB can reach, each paired with Prob, and marks every appended block as
// an EH scope entry. Nothing is appended when EHPadBB is null or when its
// first non-PHI instruction is neither a cleanuppad nor a catchswitch.
static void findWasmUnwindDestinations(
    FunctionLoweringInfo &FuncInfo, const BasicBlock *EHPadBB,
    BranchProbability Prob,
    SmallVectorImpl<std::pair<MachineBasicBlock *, BranchProbability>>
        &UnwindDests) {
  // The search never advances past the first pad for wasm, so a single pass
  // is sufficient; no loop is needed.
  if (!EHPadBB)
    return;

  const Instruction *Pad = EHPadBB->getFirstNonPHI();
  if (isa<CleanupPadInst>(Pad)) {
    // Stop on cleanup pads.
    UnwindDests.emplace_back(FuncInfo.MBBMap[EHPadBB], Prob);
    UnwindDests.back().first->setIsEHScopeEntry();
    return;
  }

  if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(Pad)) {
    // Add the catchpad handlers to the possible destinations. We don't
    // continue to the unwind destination of the catchswitch for wasm.
    for (const BasicBlock *CatchPadBB : CatchSwitch->handlers()) {
      UnwindDests.emplace_back(FuncInfo.MBBMap[CatchPadBB], Prob);
      UnwindDests.back().first->setIsEHScopeEntry();
    }
    return;
  }

  // Any other pad instruction contributes no destination. The previous
  // implementation executed `continue` here inside `while (EHPadBB)` without
  // ever changing EHPadBB, which spun forever.
}
/// When an invoke or a cleanupret unwinds to the next EH pad, there are
/// many places it could ultimately go. In the IR, we have a single unwind
/// destination, but in the machine CFG, we enumerate all the possible blocks.
/// This function skips over imaginary basic blocks that hold catchswitch
/// instructions, and finds all the "real" machine
/// basic block destinations. As those destinations may not be successors of
/// EHPadBB, here we also calculate the edge probability to those destinations.
/// The passed-in Prob is the edge probability to EHPadBB.
///
/// \param FuncInfo     Supplies the personality function, the IR-block to
///                     machine-block map and (optionally) branch
///                     probability info.
/// \param EHPadBB      First EH pad block on the unwind path; may be null, in
///                     which case nothing is appended.
/// \param Prob         Probability of reaching EHPadBB.
/// \param UnwindDests  Output list of (machine block, probability) pairs.
static void findUnwindDestinations(
    FunctionLoweringInfo &FuncInfo, const BasicBlock *EHPadBB,
    BranchProbability Prob,
    SmallVectorImpl<std::pair<MachineBasicBlock *, BranchProbability>>
        &UnwindDests) {
  EHPersonality Personality =
      classifyEHPersonality(FuncInfo.Fn->getPersonalityFn());
  bool IsMSVCCXX = Personality == EHPersonality::MSVC_CXX;
  bool IsCoreCLR = Personality == EHPersonality::CoreCLR;
  bool IsWasmCXX = Personality == EHPersonality::Wasm_CXX;
  bool IsSEH = isAsynchronousEHPersonality(Personality);

  // Wasm has its own, simpler walk.
  if (IsWasmCXX) {
    findWasmUnwindDestinations(FuncInfo, EHPadBB, Prob, UnwindDests);
    assert(UnwindDests.size() <= 1 &&
           "There should be at most one unwind destination for wasm");
    return;
  }

  while (EHPadBB) {
    const Instruction *Pad = EHPadBB->getFirstNonPHI();
    BasicBlock *NewEHPadBB = nullptr;
    if (isa<LandingPadInst>(Pad)) {
      // Stop on landingpads. They are not funclets.
      UnwindDests.emplace_back(FuncInfo.MBBMap[EHPadBB], Prob);
      break;
    } else if (isa<CleanupPadInst>(Pad)) {
      // Stop on cleanup pads. Cleanups are always funclet entries for all known
      // personalities.
      UnwindDests.emplace_back(FuncInfo.MBBMap[EHPadBB], Prob);
      UnwindDests.back().first->setIsEHScopeEntry();
      UnwindDests.back().first->setIsEHFuncletEntry();
      break;
    } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(Pad)) {
      // Add the catchpad handlers to the possible destinations.
      for (const BasicBlock *CatchPadBB : CatchSwitch->handlers()) {
        UnwindDests.emplace_back(FuncInfo.MBBMap[CatchPadBB], Prob);
        // For MSVC++ and the CLR, catchblocks are funclets and need prologues.
        if (IsMSVCCXX || IsCoreCLR)
          UnwindDests.back().first->setIsEHFuncletEntry();
        if (!IsSEH)
          UnwindDests.back().first->setIsEHScopeEntry();
      }
      // Keep walking along the catchswitch's own unwind edge (null ends the
      // walk).
      NewEHPadBB = CatchSwitch->getUnwindDest();
    } else {
      // Not a pad kind handled above: stop the walk. The previous code used
      // `continue` here, which skipped the EHPadBB update below and re-tested
      // the same block forever.
      break;
    }

    // Scale the probability by the edge we are about to follow.
    BranchProbabilityInfo *BPI = FuncInfo.BPI;
    if (BPI && NewEHPadBB)
      Prob *= BPI->getEdgeProbability(EHPadBB, NewEHPadBB);
    EHPadBB = NewEHPadBB;
  }
}
void SelectionDAGBuilder::visitCleanupRet(const CleanupReturnInst &I) {
// Update successor info.
SmallVector<std::pair<MachineBasicBlock *, BranchProbability>, 1> UnwindDests;
auto UnwindDest = I.getUnwindDest();
BranchProbabilityInfo *BPI = FuncInfo.BPI;
BranchProbability UnwindDestProb =
(BPI && UnwindDest)
? BPI->getEdgeProbability(FuncInfo.MBB->getBasicBlock(), UnwindDest)
: BranchProbability::getZero();
findUnwindDestinations(FuncInfo, UnwindDest, UnwindDestProb, UnwindDests);
for (auto &UnwindDest : UnwindDests) {
UnwindDest.first->setIsEHPad();
addSuccessorWithProb(FuncInfo.MBB, UnwindDest.first, UnwindDest.second);
}
FuncInfo.MBB->normalizeSuccProbs();
// Create the terminator node.
SDValue Ret =
DAG.getNode(ISD::CLEANUPRET, getCurSDLoc(), MVT::Other, getControlRoot());
DAG.setRoot(Ret);
}
/// Visitor entry point for catchswitch instructions.
///
/// No lowering is implemented here: reaching this function unconditionally
/// raises a fatal error.
// NOTE(review): presumably catchswitch blocks are never selected directly
// because unwind-destination discovery skips over them -- confirm.
void SelectionDAGBuilder::visitCatchSwitch(const CatchSwitchInst &CSI) {
report_fatal_error("visitCatchSwitch not yet implemented!");
}
/// Lower a return instruction.
///
/// Three paths exist:
///  * the block ends in a deoptimize call: delegate to
///    LowerDeoptimizingReturn();
///  * the return value cannot be lowered directly (!CanLowerReturn): store
///    each part of the value through the pointer held in the demote register
///    and leave the output list empty;
///  * otherwise: split every returned value into register-sized parts and
///    describe each part with an ISD::OutputArg.
/// Afterwards an optional swifterror output is appended and the target's
/// LowerReturn hook produces the final chain, which becomes the DAG root.
void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
auto &DL = DAG.getDataLayout();
SDValue Chain = getControlRoot();
// Outs describes each outgoing part; OutVals holds the matching values. The
// two lists are filled in lock-step.
SmallVector<ISD::OutputArg, 8> Outs;
SmallVector<SDValue, 8> OutVals;
// Calls to @llvm.experimental.deoptimize don't generate a return value, so
// lower
//
// %val = call <ty> @llvm.experimental.deoptimize()
// ret <ty> %val
//
// differently.
if (I.getParent()->getTerminatingDeoptimizeCall()) {
LowerDeoptimizingReturn();
return;
}
if (!FuncInfo.CanLowerReturn) {
unsigned DemoteReg = FuncInfo.DemoteRegister;
const Function *F = I.getParent()->getParent();
// Emit a store of the return value through the virtual register.
// Leave Outs empty so that LowerReturn won't try to load return
// registers the usual way.
// The destination pointer type is a pointer to the return type in the
// alloca address space; its first value type is used to read DemoteReg.
SmallVector<EVT, 1> PtrValueVTs;
ComputeValueVTs(TLI, DL,
F->getReturnType()->getPointerTo(
DAG.getDataLayout().getAllocaAddrSpace()),
PtrValueVTs);
SDValue RetPtr = DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(),
DemoteReg, PtrValueVTs[0]);
SDValue RetOp = getValue(I.getOperand(0));
// Break the returned value into its leaf value types, in-memory types and
// byte offsets.
SmallVector<EVT, 4> ValueVTs, MemVTs;
SmallVector<uint64_t, 4> Offsets;
ComputeValueVTs(TLI, DL, I.getOperand(0)->getType(), ValueVTs, &MemVTs,
&Offsets);
unsigned NumValues = ValueVTs.size();
// One store per leaf value; every store hangs off the incoming Chain and the
// results are joined with a TokenFactor below.
SmallVector<SDValue, 4> Chains(NumValues);
for (unsigned i = 0; i != NumValues; ++i) {
// An aggregate return value cannot wrap around the address space, so
// offsets to its parts don't wrap either.
SDValue Ptr = DAG.getObjectPtrOffset(getCurSDLoc(), RetPtr, Offsets[i]);
SDValue Val = RetOp.getValue(i);
// Convert the value to its in-memory type when that differs from its DAG
// type.
if (MemVTs[i] != ValueVTs[i])
Val = DAG.getPtrExtOrTrunc(Val, getCurSDLoc(), MemVTs[i]);
Chains[i] = DAG.getStore(Chain, getCurSDLoc(), Val,
// FIXME: better loc info would be nice.
Ptr, MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()));
}
Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(),
MVT::Other, Chains);
} else if (I.getNumOperands() != 0) {
SmallVector<EVT, 4> ValueVTs;
ComputeValueVTs(TLI, DL, I.getOperand(0)->getType(), ValueVTs);
unsigned NumValues = ValueVTs.size();
// Nothing to do when the operand's type yields no value types.
if (NumValues) {
SDValue RetOp = getValue(I.getOperand(0));
const Function *F = I.getParent()->getParent();
bool NeedsRegBlock = TLI.functionArgumentNeedsConsecutiveRegisters(
I.getOperand(0)->getType(), F->getCallingConv(),
/*IsVarArg*/ false);
// Pick the extension kind from the function's return attributes; signext
// takes precedence over zeroext, and any-extend is the default.
ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,
Attribute::SExt))
ExtendKind = ISD::SIGN_EXTEND;
else if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,
Attribute::ZExt))
ExtendKind = ISD::ZERO_EXTEND;
LLVMContext &Context = F->getContext();
bool RetInReg = F->getAttributes().hasAttribute(
AttributeList::ReturnIndex, Attribute::InReg);
// j walks the leaf values of the returned operand.
for (unsigned j = 0; j != NumValues; ++j) {
EVT VT = ValueVTs[j];
// Integer values with an explicit extension are widened to the type the
// target asks for.
if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger())
VT = TLI.getTypeForExtReturn(Context, VT, ExtendKind);
CallingConv::ID CC = F->getCallingConv();
unsigned NumParts = TLI.getNumRegistersForCallingConv(Context, CC, VT);
MVT PartVT = TLI.getRegisterTypeForCallingConv(Context, CC, VT);
SmallVector<SDValue, 4> Parts(NumParts);
// Split leaf value j into NumParts pieces of type PartVT.
getCopyToParts(DAG, getCurSDLoc(),
SDValue(RetOp.getNode(), RetOp.getResNo() + j),
&Parts[0], NumParts, PartVT, &I, CC, ExtendKind);
// 'inreg' on function refers to return value
ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
if (RetInReg)
Flags.setInReg();
if (I.getOperand(0)->getType()->isPointerTy()) {
Flags.setPointer();
Flags.setPointerAddrSpace(
cast<PointerType>(I.getOperand(0)->getType())->getAddressSpace());
}
if (NeedsRegBlock) {
Flags.setInConsecutiveRegs();
// The final leaf value additionally gets the "last" marker.
if (j == NumValues - 1)
Flags.setInConsecutiveRegsLast();
}
// Propagate extension type if any
if (ExtendKind == ISD::SIGN_EXTEND)
Flags.setSExt();
else if (ExtendKind == ISD::ZERO_EXTEND)
Flags.setZExt();
// Every part of this leaf value shares the same flags.
for (unsigned i = 0; i < NumParts; ++i) {
Outs.push_back(ISD::OutputArg(Flags, Parts[i].getValueType(),
VT, /*isfixed=*/true, 0, 0));
OutVals.push_back(Parts[i]);
}
}
}
}
// Push in swifterror virtual register as the last element of Outs. This makes
// sure swifterror virtual register will be returned in the swifterror
// physical register.
const Function *F = I.getParent()->getParent();
if (TLI.supportSwiftError() &&
F->getAttributes().hasAttrSomewhere(Attribute::SwiftError)) {
assert(SwiftError.getFunctionArg() && "Need a swift error argument");
ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
Flags.setSwiftError();
Outs.push_back(ISD::OutputArg(Flags, EVT(TLI.getPointerTy(DL)) /*vt*/,
EVT(TLI.getPointerTy(DL)) /*argvt*/,
true /*isfixed*/, 1 /*origidx*/,
0 /*partOffs*/));
// Create SDNode for the swifterror virtual register.
OutVals.push_back(
DAG.getRegister(SwiftError.getOrCreateVRegUseAt(
&I, FuncInfo.MBB, SwiftError.getFunctionArg()),
EVT(TLI.getPointerTy(DL))));
}
// Hand everything to the target, using the calling convention and vararg-ness
// of the function being compiled.
bool isVarArg = DAG.getMachineFunction().getFunction().isVarArg();
CallingConv::ID CallConv =
DAG.getMachineFunction().getFunction().getCallingConv();
Chain = DAG.getTargetLoweringInfo().LowerReturn(
Chain, CallConv, isVarArg, Outs, OutVals, getCurSDLoc(), DAG);
// Verify that the target's LowerReturn behaved as expected.
assert(Chain.getNode() && Chain.getValueType() == MVT::Other &&
"LowerReturn didn't return a valid chain!");
// Update the DAG with the new chain value resulting from return lowering.
DAG.setRoot(Chain);
}
/// CopyToExportRegsIfNeeded - When virtual registers have already been
/// assigned to \p V, emit the nodes that copy the value into them. Values of
/// empty type, and values without assigned registers, are left alone.
void SelectionDAGBuilder::CopyToExportRegsIfNeeded(const Value *V) {
  // Empty types carry nothing to copy.
  if (V->getType()->isEmptyTy())
    return;

  auto RegIt = FuncInfo.ValueMap.find(V);
  if (RegIt == FuncInfo.ValueMap.end())
    return;

  assert(!V->use_empty() && "Unused value assigned virtual registers!");
  CopyValueToVirtualRegister(V, RegIt->second);
}
/// ExportFromCurrentBlock - Make sure \p V is exported from the current basic
/// block: unless it is already exported, a register is initialized for it and
/// the value is copied there, so that a CopyTo/FromReg pair will be
/// generated. Only instructions and arguments are ever exported.
void SelectionDAGBuilder::ExportFromCurrentBlock(const Value *V) {
  // Anything that is neither an instruction nor an argument (i.e. a constant)
  // needs no export.
  const bool IsExportableKind = isa<Instruction>(V) || isa<Argument>(V);
  if (!IsExportableKind)
    return;

  // Nothing to do when the value has been exported before.
  if (FuncInfo.isExportedInst(V))
    return;

  unsigned ExportReg = FuncInfo.InitializeRegForValue(V);
  CopyValueToVirtualRegister(V, ExportReg);
}
bool SelectionDAGBuilder::isExportableFromCurrentBlock(const Value *V,
const BasicBlock *FromBB) {
// The operands of the setcc have to be in this block. We don't know
// how to export them from some other block.
if (const Instruction *VI = dyn_cast<Instruction>(V)) {
// Can export from current BB.
if (VI->getParent() == FromBB)
return true;
// Is already exported, noop.
return FuncInfo.isExportedInst(V);
}
// If this is an argument, we can export it if the BB is the entry block or
// if it is already exported.
if (isa<Argument>(V)) {
if (FromBB == &FromBB->getParent()->getEntryBlock())
return true;
// Otherwise, can only export this if it is already exported.
return FuncInfo.isExportedInst(V);
}
// Otherwise, constants can always be exported.
return true;
}
/// Return the probability of the edge \p Src -> \p Dst, looked up on the IR
/// blocks behind the two machine blocks. Without BranchProbabilityInfo the
/// result is the uniform 1 / N, where N is the successor count of the source
/// IR block (treated as 1 when the block has no successors).
BranchProbability
SelectionDAGBuilder::getEdgeProbability(const MachineBasicBlock *Src,
                                        const MachineBasicBlock *Dst) const {
  BranchProbabilityInfo *BPI = FuncInfo.BPI;
  const BasicBlock *SrcBB = Src->getBasicBlock();
  const BasicBlock *DstBB = Dst->getBasicBlock();

  if (BPI)
    return BPI->getEdgeProbability(SrcBB, DstBB);

  // No profile data: spread the probability evenly over the successors.
  const uint32_t NumSuccs = std::max<uint32_t>(succ_size(SrcBB), 1);
  return BranchProbability(1, NumSuccs);
}
/// Add \p Dst as a successor of \p Src.
///
/// Without BranchProbabilityInfo the edge is added with no probability.
/// Otherwise \p Prob is attached to the edge, after being replaced by the
/// computed edge probability when it is unknown.
void SelectionDAGBuilder::addSuccessorWithProb(MachineBasicBlock *Src,
                                               MachineBasicBlock *Dst,
                                               BranchProbability Prob) {
  if (!FuncInfo.BPI) {
    Src->addSuccessorWithoutProb(Dst);
    return;
  }

  if (Prob.isUnknown())
    Prob = getEdgeProbability(Src, Dst);
  Src->addSuccessor(Dst, Prob);
}
static bool InBlock(const Value *V, const BasicBlock *BB) {
if (const Instruction *I = dyn_cast<Instruction>(V))
return I->getParent() == BB;
return true;
}
/// EmitBranchForMergedCondition - Helper method for FindMergedConditions.
/// This function emits a branch and is used at the leaves of an OR or an
/// AND operator tree.
void
SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond,
MachineBasicBlock *TBB,
MachineBasicBlock *FBB,
MachineBasicBlock *CurBB,
MachineBasicBlock *SwitchBB,
BranchProbability TProb,
BranchProbability FProb,
bool InvertCond) {
const BasicBlock *BB = CurBB->getBasicBlock();
// If the leaf of the tree is a comparison, merge the condition into
// the caseblock.
if (const CmpInst *BOp = dyn_cast<CmpInst>(Cond)) {
// The operands of the cmp have to be in this block. We don't know
// how to export them from some other block. If this is the first block
// of the sequence, no exporting is needed.
if (CurBB == SwitchBB ||
(isExportableFromCurrentBlock(BOp->getOperand(0), BB) &&
isExportableFromCurrentBlock(BOp->getOperand(1), BB))) {
ISD::CondCode Condition;
if (const ICmpInst *IC = dyn_cast<ICmpInst>(Cond)) {
ICmpInst::Predicate Pred =
InvertCond ? IC->getInversePredicate() : IC->getPredicate();
Condition = getICmpCondCode(Pred);
} else {
const FCmpInst *FC = cast<FCmpInst>(Cond);
FCmpInst::Predicate Pred =
InvertCond ? FC->getInversePredicate() : FC->getPredicate();
Condition = getFCmpCondCode(Pred);
if (TM.Options.NoNaNsFPMath)
Condition = getFCmpCodeWithoutNaN(Condition);
}
CaseBlock CB(Condition, BOp->getOperand(0), BOp->getOperand(1), nullptr,
TBB, FBB, CurBB, getCurSDLoc(), TProb, FProb);
SL->SwitchCases.push_back(CB);
return;
}
}
// Create a CaseBlock record representing this branch.
ISD::CondCode Opc = InvertCond ? ISD::SETNE : ISD::SETEQ;
CaseBlock CB(Opc, Cond, ConstantInt::getTrue(*DAG.getContext()),
nullptr, TBB, FBB, CurBB, getCurSDLoc(), TProb, FProb);
SL->SwitchCases.push_back(CB);
}
/// Recursively split a tree of and/or operations feeding a conditional branch
/// into a chain of simpler conditional branches.
///
/// \param Cond        Root of the (sub)tree currently being examined.
/// \param TBB         Destination when the (sub)condition is true.
/// \param FBB         Destination when the (sub)condition is false.
/// \param CurBB       Machine block that will hold the branch for \p Cond.
/// \param SwitchBB    Machine block holding the original branch.
/// \param Opc         Operator (And / Or) of the tree being split.
/// \param TProb       Probability of reaching \p TBB.
/// \param FProb       Probability of reaching \p FBB.
/// \param InvertCond  True when an enclosing 'not' requires the condition,
///                    and the and/or structure below it, to be inverted.
///
/// Leaves are handed to EmitBranchForMergedCondition(); for every interior
/// node a new machine block is created right after \p CurBB to hold the
/// right-hand side.
void SelectionDAGBuilder::FindMergedConditions(const Value *Cond,
MachineBasicBlock *TBB,
MachineBasicBlock *FBB,
MachineBasicBlock *CurBB,
MachineBasicBlock *SwitchBB,
Instruction::BinaryOps Opc,
BranchProbability TProb,
BranchProbability FProb,
bool InvertCond) {
// Skip over not part of the tree and remember to invert op and operands at
// next level.
Value *NotCond;
// Only a single-use 'not' whose operand satisfies InBlock() for this block
// is looked through.
if (match(Cond, m_OneUse(m_Not(m_Value(NotCond)))) &&
InBlock(NotCond, CurBB->getBasicBlock())) {
FindMergedConditions(NotCond, TBB, FBB, CurBB, SwitchBB, Opc, TProb, FProb,
!InvertCond);
return;
}
const Instruction *BOp = dyn_cast<Instruction>(Cond);
// Compute the effective opcode for Cond, taking into account whether it needs
// to be inverted, e.g.
// and (not (or A, B)), C
// gets lowered as
// and (and (not A, not B), C)
unsigned BOpc = 0;
if (BOp) {
BOpc = BOp->getOpcode();
// De Morgan: under inversion an 'and' acts as an 'or' and vice versa.
if (InvertCond) {
if (BOpc == Instruction::And)
BOpc = Instruction::Or;
else if (BOpc == Instruction::Or)
BOpc = Instruction::And;
}
}
// If this node is not part of the or/and tree, emit it as a branch.
// That is the case when it is not an instruction, is not a binary operator or
// compare, has a different effective opcode than the tree, has more than one
// use, lives in another block, or has an operand failing InBlock().
if (!BOp || !(isa<BinaryOperator>(BOp) || isa<CmpInst>(BOp)) ||
BOpc != unsigned(Opc) || !BOp->hasOneUse() ||
BOp->getParent() != CurBB->getBasicBlock() ||
!InBlock(BOp->getOperand(0), CurBB->getBasicBlock()) ||
!InBlock(BOp->getOperand(1), CurBB->getBasicBlock())) {
EmitBranchForMergedCondition(Cond, TBB, FBB, CurBB, SwitchBB,
TProb, FProb, InvertCond);
return;
}
// Create TmpBB after CurBB.
MachineFunction::iterator BBI(CurBB);
MachineFunction &MF = DAG.getMachineFunction();
MachineBasicBlock *TmpBB = MF.CreateMachineBasicBlock(CurBB->getBasicBlock());
CurBB->getParent()->insert(++BBI, TmpBB);
if (Opc == Instruction::Or) {
// Codegen X | Y as:
// BB1:
// jmp_if_X TBB
// jmp TmpBB
// TmpBB:
// jmp_if_Y TBB
// jmp FBB
//
// We have flexibility in setting Prob for BB1 and Prob for TmpBB.
// The requirement is that
// TrueProb for BB1 + (FalseProb for BB1 * TrueProb for TmpBB)
// = TrueProb for original BB.
// Assuming the original probabilities are A and B, one choice is to set
// BB1's probabilities to A/2 and A/2+B, and set TmpBB's probabilities to
// A/(1+B) and 2B/(1+B). This choice assumes that
// TrueProb for BB1 == FalseProb for BB1 * TrueProb for TmpBB.
// Another choice is to assume TrueProb for BB1 equals to TrueProb for
// TmpBB, but the math is more complicated.
auto NewTrueProb = TProb / 2;
auto NewFalseProb = TProb / 2 + FProb;
// Emit the LHS condition.
// Its false edge goes to TmpBB, where the RHS is tested.
FindMergedConditions(BOp->getOperand(0), TBB, TmpBB, CurBB, SwitchBB, Opc,
NewTrueProb, NewFalseProb, InvertCond);
// Normalize A/2 and B to get A/(1+B) and 2B/(1+B).
SmallVector<BranchProbability, 2> Probs{TProb / 2, FProb};
BranchProbability::normalizeProbabilities(Probs.begin(), Probs.end());
// Emit the RHS condition into TmpBB.
FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc,
Probs[0], Probs[1], InvertCond);
} else {
assert(Opc == Instruction::And && "Unknown merge op!");
// Codegen X & Y as:
// BB1:
// jmp_if_X TmpBB
// jmp FBB
// TmpBB:
// jmp_if_Y TBB
// jmp FBB
//
// This requires creation of TmpBB after CurBB.
// We have flexibility in setting Prob for BB1 and Prob for TmpBB.
// The requirement is that
// FalseProb for BB1 + (TrueProb for BB1 * FalseProb for TmpBB)
// = FalseProb for original BB.
// Assuming the original probabilities are A and B, one choice is to set
// BB1's probabilities to A+B/2 and B/2, and set TmpBB's probabilities to
// 2A/(1+A) and B/(1+A). This choice assumes that FalseProb for BB1 ==
// TrueProb for BB1 * FalseProb for TmpBB.
auto NewTrueProb = TProb + FProb / 2;
auto NewFalseProb = FProb / 2;
// Emit the LHS condition.
// Its true edge goes to TmpBB, where the RHS is tested.
FindMergedConditions(BOp->getOperand(0), TmpBB, FBB, CurBB, SwitchBB, Opc,
NewTrueProb, NewFalseProb, InvertCond);
// Normalize A and B/2 to get 2A/(1+A) and B/(1+A).
SmallVector<BranchProbability, 2> Probs{TProb, FProb / 2};
BranchProbability::normalizeProbabilities(Probs.begin(), Probs.end());
// Emit the RHS condition into TmpBB.
FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc,
Probs[0], Probs[1], InvertCond);
}
}
/// If the set of cases should be emitted as a series of branches, return true.
/// If we should emit this as a bunch of and/or'd together conditions, return
/// false.
bool
SelectionDAGBuilder::ShouldEmitAsBranches(const std::vector<CaseBlock> &Cases) {
if (Cases.size() != 2) return true;
// If this is two comparisons of the same values or'd or and'd together, they
// will get folded into a single comparison, so don't emit two blocks.
if ((Cases[0].CmpLHS == Cases[1].CmpLHS &&
Cases[0].CmpRHS == Cases[1].CmpRHS) ||
(Cases[0].CmpRHS == Cases[1].CmpLHS &&
Cases[0].CmpLHS == Cases[1].CmpRHS)) {
return false;
}
// Handle: (X != null) | (Y != null) --> (X|Y) != 0
// Handle: (X == null) & (Y == null) --> (X|Y) == 0
if (Cases[0].CmpRHS == Cases[1].CmpRHS &&
Cases[0].CC == Cases[1].CC &&
isa<Constant>(Cases[0].CmpRHS) &&
cast<Constant>(Cases[0].CmpRHS)->isNullValue()) {
if (Cases[0].CC == ISD::SETEQ && Cases[0].TrueBB == Cases[1].ThisBB)
return false;
if (Cases[0].CC == ISD::SETNE && Cases[0].FalseBB == Cases[1].ThisBB)
return false;
}
return true;
}
void SelectionDAGBuilder::visitBr(const BranchInst &I) {
MachineBasicBlock *BrMBB = FuncInfo.MBB;
// Update machine-CFG edges.
MachineBasicBlock *Succ0MBB = FuncInfo.MBBMap[I.getSuccessor(0)];
if (I.isUnconditional()) {
// Update machine-CFG edges.
BrMBB->addSuccessor(Succ0MBB);
// If this is not a fall-through branch or optimizations are switched off,
// emit the branch.
if (Succ0MBB != NextBlock(BrMBB) || TM.getOptLevel() == CodeGenOpt::None)
DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(),
MVT::Other, getControlRoot(),
DAG.getBasicBlock(Succ0MBB)));
return;
}
// If this condition is one of the special cases we handle, do special stuff
// now.
const Value *CondVal = I.getCondition();
MachineBasicBlock *Succ1MBB = FuncInfo.MBBMap[I.getSuccessor(1)];
// If this is a series of conditions that are or'd or and'd together, emit
// this as a sequence of branches instead of setcc's with and/or operations.
// As long as jumps are not expensive, this should improve performance.
// For example, instead of something like:
// cmp A, B
// C = seteq
// cmp D, E
// F = setle
// or C, F
// jnz foo
// Emit:
// cmp A, B
// je foo
// cmp D, E
// jle foo
if (const BinaryOperator *BOp = dyn_cast<BinaryOperator>(CondVal)) {
Instruction::BinaryOps Opcode = BOp->getOpcode();
if (!DAG.getTargetLoweringInfo().isJumpExpensive() && BOp->hasOneUse() &&
!I.getMetadata(LLVMContext::MD_unpredictable) &&
(Opcode == Instruction::And || Opcode == Instruction::Or)) {
FindMergedConditions(BOp, Succ0MBB, Succ1MBB, BrMBB, BrMBB,
Opcode,
getEdgeProbability(BrMBB, Succ0MBB),
getEdgeProbability(BrMBB, Succ1MBB),
/*InvertCond=*/false);
// If the compares in later blocks need to use values not currently
// exported from this block, export them now. This block should always
// be the first entry.
assert(SL->SwitchCases[0].ThisBB == BrMBB && "Unexpected lowering!");
// Allow some cases to be rejected.
if (ShouldEmitAsBranches(SL->SwitchCases)) {
for (unsigned i = 1, e = SL->SwitchCases.size(); i != e; ++i) {
ExportFromCurrentBlock(SL->SwitchCases[i].CmpLHS);
ExportFromCurrentBlock(SL->SwitchCases[i].CmpRHS);
}
// Emit the branch for this block.
visitSwitchCase(SL->SwitchCases[0], BrMBB);
SL->SwitchCases.erase(SL->SwitchCases.begin());
return;
}
// Okay, we decided not to do this, remove any inserted MBB's and clear
// SwitchCases.
for (unsigned i = 1, e = SL->SwitchCases.size(); i != e; ++i)
FuncInfo.MF->erase(SL->SwitchCases[i].ThisBB);
SL->SwitchCases.clear();
}
}
// Create a CaseBlock record representing this branch.
CaseBlock CB(ISD::SETEQ, CondVal, ConstantInt::getTrue(*DAG.getContext()),
nullptr, Succ0MBB, Succ1MBB, BrMBB, getCurSDLoc());
// Use visitSwitchCase to actually insert the fast branch sequence for this
// cond branch.
visitSwitchCase(CB, BrMBB);
}
/// visitSwitchCase - Emit the code for one node of the binary search tree
/// produced when lowering a switch instruction (also used for ordinary
/// conditional branches).
///
/// Builds the condition described by \p CB, adds the successor edges of
/// \p SwitchBB and emits a BRCOND followed by a BR. \p CB may be modified:
/// its TrueBB and FalseBB are swapped when that lets the true block fall
/// through.
void SelectionDAGBuilder::visitSwitchCase(CaseBlock &CB,
                                          MachineBasicBlock *SwitchBB) {
  SDValue Cond;
  SDValue LHS = getValue(CB.CmpLHS);
  SDLoc dl = CB.DL;

  if (CB.CC == ISD::SETTRUE) {
    // Unconditional: branch or fall through to TrueBB.
    addSuccessorWithProb(SwitchBB, CB.TrueBB, CB.TrueProb);
    SwitchBB->normalizeSuccProbs();
    if (CB.TrueBB != NextBlock(SwitchBB)) {
      SDValue Br = DAG.getNode(ISD::BR, dl, MVT::Other, getControlRoot(),
                               DAG.getBasicBlock(CB.TrueBB));
      DAG.setRoot(Br);
    }
    return;
  }

  auto &TLI = DAG.getTargetLoweringInfo();
  EVT MemVT = TLI.getMemValueType(DAG.getDataLayout(), CB.CmpLHS->getType());

  // Build the setcc now.
  if (CB.CmpMHS) {
    // Range check: CmpLHS <= CmpMHS <= CmpRHS with constant bounds.
    assert(CB.CC == ISD::SETLE && "Can handle only LE ranges now");

    const APInt &Low = cast<ConstantInt>(CB.CmpLHS)->getValue();
    const APInt &High = cast<ConstantInt>(CB.CmpRHS)->getValue();

    SDValue Mid = getValue(CB.CmpMHS);
    EVT VT = Mid.getValueType();

    if (cast<ConstantInt>(CB.CmpLHS)->isMinValue(true)) {
      // The lower bound is the signed minimum, so one signed compare against
      // the upper bound is enough.
      Cond = DAG.getSetCC(dl, MVT::i1, Mid, DAG.getConstant(High, dl, VT),
                          ISD::SETLE);
    } else {
      // (Mid - Low) <=u (High - Low)
      SDValue Biased =
          DAG.getNode(ISD::SUB, dl, VT, Mid, DAG.getConstant(Low, dl, VT));
      Cond = DAG.getSetCC(dl, MVT::i1, Biased,
                          DAG.getConstant(High - Low, dl, VT), ISD::SETULE);
    }
  } else if (CB.CmpRHS == ConstantInt::getTrue(*DAG.getContext()) &&
             CB.CC == ISD::SETEQ) {
    // Fold "(X == true)" to X; branch lowering produces this commonly.
    Cond = LHS;
  } else if (CB.CmpRHS == ConstantInt::getFalse(*DAG.getContext()) &&
             CB.CC == ISD::SETEQ) {
    // Fold "(X == false)" to !X, expressed as X xor 1.
    SDValue One = DAG.getConstant(1, dl, LHS.getValueType());
    Cond = DAG.getNode(ISD::XOR, dl, LHS.getValueType(), LHS, One);
  } else {
    SDValue RHS = getValue(CB.CmpRHS);

    // If a pointer's DAG type is larger than its memory type then the DAG
    // values are zero-extended. This breaks signed comparisons so truncate
    // back to the underlying type before doing the compare.
    if (LHS.getValueType() != MemVT) {
      LHS = DAG.getPtrExtOrTrunc(LHS, getCurSDLoc(), MemVT);
      RHS = DAG.getPtrExtOrTrunc(RHS, getCurSDLoc(), MemVT);
    }
    Cond = DAG.getSetCC(dl, MVT::i1, LHS, RHS, CB.CC);
  }

  // Update successor info. TrueBB and FalseBB are always different unless the
  // incoming IR is degenerate, which only happens when running llc on weird
  // IR.
  addSuccessorWithProb(SwitchBB, CB.TrueBB, CB.TrueProb);
  if (CB.TrueBB != CB.FalseBB)
    addSuccessorWithProb(SwitchBB, CB.FalseBB, CB.FalseProb);
  SwitchBB->normalizeSuccProbs();

  // When the true block is the layout successor, invert the condition so that
  // we can fall through to it and branch to the false block instead.
  if (CB.TrueBB == NextBlock(SwitchBB)) {
    std::swap(CB.TrueBB, CB.FalseBB);
    SDValue One = DAG.getConstant(1, dl, Cond.getValueType());
    Cond = DAG.getNode(ISD::XOR, dl, Cond.getValueType(), Cond, One);
  }

  SDValue CondBr = DAG.getNode(ISD::BRCOND, dl, MVT::Other, getControlRoot(),
                               Cond, DAG.getBasicBlock(CB.TrueBB));

  // Insert the false branch. Do this even if it's a fall through branch; this
  // makes it easier to do DAG optimizations which require inverting the
  // branch condition.
  SDValue FalseBr = DAG.getNode(ISD::BR, dl, MVT::Other, CondBr,
                                DAG.getBasicBlock(CB.FalseBB));
  DAG.setRoot(FalseBr);
}
/// visitJumpTable - Emit the BR_JT node for \p JT in the current machine
/// block. The index is read back from the virtual register stored in JT.Reg,
/// which must already have been assigned (asserted below).
void SelectionDAGBuilder::visitJumpTable(SwitchCG::JumpTable &JT) {
  assert(JT.Reg != -1U && "Should lower JT Header first!");

  const EVT PtrVT =
      DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());

  // Reload the index, then build the indirect branch on the chain result of
  // that copy.
  SDValue JTIndex =
      DAG.getCopyFromReg(getControlRoot(), getCurSDLoc(), JT.Reg, PtrVT);
  SDValue JTAddr = DAG.getJumpTable(JT.JTI, PtrVT);
  SDValue Br = DAG.getNode(ISD::BR_JT, getCurSDLoc(), MVT::Other,
                           JTIndex.getValue(1), JTAddr, JTIndex);
  DAG.setRoot(Br);
}
/// visitJumpTableHeader - Emit the code that turns the switched-on value into
/// a jump-table index.
///
/// The index (value minus lowest case) is resized to pointer width and copied
/// into a fresh virtual register that is recorded in JT.Reg. Unless
/// JTH.OmitRangeCheck is set, a range check branching to the default block is
/// emitted as well.
void SelectionDAGBuilder::visitJumpTableHeader(SwitchCG::JumpTable &JT,
                                               JumpTableHeader &JTH,
                                               MachineBasicBlock *SwitchBB) {
  SDLoc dl = getCurSDLoc();

  // Subtract the lowest switch case value from the value being switched on.
  SDValue Switched = getValue(JTH.SValue);
  EVT VT = Switched.getValueType();
  SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Switched,
                            DAG.getConstant(JTH.First, dl, VT));

  // The difference has to travel to a later basic block through a virtual
  // register so it can index the jump table there. It may be narrower or
  // wider than a pointer, hence the zero-extension or truncation.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  const MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
  SDValue Index = DAG.getZExtOrTrunc(Sub, dl, PtrVT);
  unsigned IndexReg = FuncInfo.CreateReg(PtrVT);
  SDValue Root = DAG.getCopyToReg(getControlRoot(), dl, IndexReg, Index);
  JT.Reg = IndexReg;

  if (!JTH.OmitRangeCheck) {
    // Range check: branch to the default block of the switch when the value
    // being switched on exceeds the largest case.
    EVT CmpVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
                                       Sub.getValueType());
    SDValue OutOfRange =
        DAG.getSetCC(dl, CmpVT, Sub,
                     DAG.getConstant(JTH.Last - JTH.First, dl, VT),
                     ISD::SETUGT);
    Root = DAG.getNode(ISD::BRCOND, dl, MVT::Other, Root, OutOfRange,
                       DAG.getBasicBlock(JT.Default));
  }

  // Avoid emitting unnecessary branches to the next block.
  if (JT.MBB != NextBlock(SwitchBB))
    Root = DAG.getNode(ISD::BR, dl, MVT::Other, Root,
                       DAG.getBasicBlock(JT.MBB));

  DAG.setRoot(Root);
}
/// Build a LOAD_STACK_GUARD machine node chained on \p Chain. When the target
/// names a stack guard global, the node also carries a memory operand for it.
/// The result is converted to the pointer memory type if that differs from
/// the pointer type.
static SDValue getLoadStackGuard(SelectionDAG &DAG, const SDLoc &DL,
                                 SDValue &Chain) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout());
  EVT PtrMemTy = TLI.getPointerMemTy(DAG.getDataLayout());
  MachineFunction &MF = DAG.getMachineFunction();
  Value *GuardGlobal = TLI.getSDagStackGuard(*MF.getFunction().getParent());

  MachineSDNode *GuardNode =
      DAG.getMachineNode(TargetOpcode::LOAD_STACK_GUARD, DL, PtrTy, Chain);

  if (GuardGlobal) {
    // Describe the access: a pointer-sized, invariant, dereferenceable load
    // from the global.
    MachinePointerInfo PtrInfo(GuardGlobal);
    auto MemFlags = MachineMemOperand::MOLoad |
                    MachineMemOperand::MOInvariant |
                    MachineMemOperand::MODereferenceable;
    MachineMemOperand *GuardMMO = MF.getMachineMemOperand(
        PtrInfo, MemFlags, PtrTy.getSizeInBits() / 8,
        DAG.getEVTAlignment(PtrTy));
    DAG.setNodeMemRefs(GuardNode, {GuardMMO});
  }

  if (PtrTy == PtrMemTy)
    return SDValue(GuardNode, 0);
  return DAG.getPtrExtOrTrunc(SDValue(GuardNode, 0), DL, PtrMemTy);
}
/// Emit the new tail of a stack protector check block \p ParentBB whose
/// original tail has been spliced into the check-success block.
///
/// The value stored in the stack protector slot is loaded and then either
/// handed to a target-provided guard check function, or compared against the
/// reference guard value with a conditional branch to the failure block of
/// \p SPD followed by an unconditional branch to its success block.
///
/// For the high level picture of stack protector generation see the comment
/// on the declaration of class StackProtectorDescriptor.
void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD,
                                                  MachineBasicBlock *ParentBB) {
  const TargetLowering &Lowering = DAG.getTargetLoweringInfo();
  const EVT RegPtrVT = Lowering.getPointerTy(DAG.getDataLayout());
  const EVT MemPtrVT = Lowering.getPointerMemTy(DAG.getDataLayout());

  auto *ParentMF = ParentBB->getParent();
  const int GuardSlotFI = ParentMF->getFrameInfo().getStackProtectorIndex();

  SDLoc dl = getCurSDLoc();
  SDValue GuardSlotAddr = DAG.getFrameIndex(GuardSlotFI, RegPtrVT);
  const Module &Mod = *ParentMF->getFunction().getParent();
  const unsigned PtrAlign =
      DL->getPrefTypeAlignment(Type::getInt8PtrTy(Mod.getContext()));

  // Volatile load of whatever currently sits in the guard slot.
  SDValue SlotVal = DAG.getLoad(
      MemPtrVT, dl, DAG.getEntryNode(), GuardSlotAddr,
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), GuardSlotFI),
      PtrAlign, MachineMemOperand::MOVolatile);

  if (Lowering.useStackGuardXorFP())
    SlotVal = Lowering.emitStackGuardXorFP(DAG, SlotVal, dl);

  // A non-null guard check function means the target validates the value
  // itself; a null result means the check is emitted inline below.
  if (const Function *CheckFn = Lowering.getSSPStackGuardCheck(Mod)) {
    FunctionType *CheckFnTy = CheckFn->getFunctionType();
    assert(CheckFnTy->getNumParams() == 1 && "Invalid function signature");

    // The slot content is the single argument of the check call.
    TargetLowering::ArgListEntry SlotArg;
    SlotArg.Node = SlotVal;
    SlotArg.Ty = CheckFnTy->getParamType(0);
    if (CheckFn->hasAttribute(1, Attribute::AttrKind::InReg))
      SlotArg.IsInReg = true;

    TargetLowering::ArgListTy CallArgs;
    CallArgs.push_back(SlotArg);

    TargetLowering::CallLoweringInfo CLI(DAG);
    CLI.setDebugLoc(getCurSDLoc())
        .setChain(DAG.getEntryNode())
        .setCallee(CheckFn->getCallingConv(), CheckFnTy->getReturnType(),
                   getValue(CheckFn), std::move(CallArgs));

    std::pair<SDValue, SDValue> CallResult = Lowering.LowerCallTo(CLI);
    DAG.setRoot(CallResult.second);
    return;
  }

  // Obtain the reference guard: through LOAD_STACK_GUARD when the target asks
  // for it, otherwise through a volatile load of the IR-level guard variable.
  SDValue Chain = DAG.getEntryNode();
  SDValue RefGuard;
  if (Lowering.useLoadStackGuardNode()) {
    RefGuard = getLoadStackGuard(DAG, dl, Chain);
  } else {
    const Value *GuardVar = Lowering.getSDagStackGuard(Mod);
    SDValue GuardVarAddr = getValue(GuardVar);
    RefGuard = DAG.getLoad(MemPtrVT, dl, Chain, GuardVarAddr,
                           MachinePointerInfo(GuardVar, 0), PtrAlign,
                           MachineMemOperand::MOVolatile);
  }

  // Compare by subtracting and testing the difference against zero.
  EVT GuardVT = RefGuard.getValueType();
  SDValue Diff = DAG.getNode(ISD::SUB, dl, GuardVT, RefGuard, SlotVal);
  SDValue Mismatch = DAG.getSetCC(
      dl,
      Lowering.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
                                  Diff.getValueType()),
      Diff, DAG.getConstant(0, dl, GuardVT), ISD::SETNE);

  // A non-zero difference means guard and slot disagree: go to the failure
  // block.
  SDValue ToFailure = DAG.getNode(ISD::BRCOND, dl, MVT::Other,
                                  SlotVal.getOperand(0), Mismatch,
                                  DAG.getBasicBlock(SPD.getFailureMBB()));
  // Otherwise continue in the success block.
  SDValue ToSuccess = DAG.getNode(ISD::BR, dl, MVT::Other, ToFailure,
                                  DAG.getBasicBlock(SPD.getSuccessMBB()));
  DAG.setRoot(ToSuccess);
}
/// Emit the body of a stack protector failure block.
///
/// The block consists of the RTLIB::STACKPROTECTOR_CHECK_FAIL library call
/// (i.e. __stack_chk_fail()), followed by a TRAP on PS4 targets.
///
/// For the high level picture of stack protector generation see the comment
/// on the declaration of class StackProtectorDescriptor.
void
SelectionDAGBuilder::visitSPDescriptorFailure(StackProtectorDescriptor &SPD) {
  const TargetLowering &Lowering = DAG.getTargetLoweringInfo();

  // Only the output chain of the library call is of interest here.
  auto FailCall =
      Lowering.makeLibCall(DAG, RTLIB::STACKPROTECTOR_CHECK_FAIL, MVT::isVoid,
                           None, false, getCurSDLoc(), false, false);
  SDValue NewRoot = FailCall.second;

  // On PS4, the "return address" must still be within the calling function,
  // even if it's at the very end, so an explicit TRAP is appended. Passing
  // 'true' for doesNotReturn in the call above would not generate it for us.
  const bool NeedsExplicitTrap = TM.getTargetTriple().isPS4CPU();
  if (NeedsExplicitTrap)
    NewRoot = DAG.getNode(ISD::TRAP, getCurSDLoc(), MVT::Other, NewRoot);

  DAG.setRoot(NewRoot);
}
/// visitBitTestHeader - Emit the header of a "bit test" lowering: rebase the
/// switched value on B.First, range-check it against B.Range, copy it to a
/// fresh virtual register (recorded in B.Reg / B.RegVT) and branch either to
/// the default block or to the first bit-test case block.
void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B,
                                             MachineBasicBlock *SwitchBB) {
  SDLoc dl = getCurSDLoc();

  // Rebase the switched value so that the smallest case becomes zero.
  SDValue Switched = getValue(B.SValue);
  EVT VT = Switched.getValueType();
  SDValue Rebased = DAG.getNode(ISD::SUB, dl, VT, Switched,
                                DAG.getConstant(B.First, dl, VT));

  // Unsigned "greater than range" detects values outside the tested window.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDValue OutOfRange = DAG.getSetCC(
      dl,
      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
                             Rebased.getValueType()),
      Rebased, DAG.getConstant(B.Range, dl, VT), ISD::SETUGT);

  // Pick the type of the test operands. The pointer type is used when VT is
  // not legal, or when some case mask does not fit into VT (case ranges are
  // encoded as a series of masks; the original code treats the pointer type
  // as guaranteed to fit).
  bool UsePtrType = !TLI.isTypeLegal(VT);
  if (!UsePtrType) {
    for (const auto &Case : B.Cases) {
      if (isUIntN(VT.getSizeInBits(), Case.Mask))
        continue;
      UsePtrType = true;
      break;
    }
  }
  if (UsePtrType) {
    VT = TLI.getPointerTy(DAG.getDataLayout());
    Rebased = DAG.getZExtOrTrunc(Rebased, dl, VT);
  }

  // Publish the rebased value through a virtual register for the case blocks.
  B.RegVT = VT.getSimpleVT();
  B.Reg = FuncInfo.CreateReg(B.RegVT);
  SDValue RegCopy = DAG.getCopyToReg(getControlRoot(), dl, B.Reg, Rebased);

  MachineBasicBlock *FirstCaseMBB = B.Cases[0].ThisBB;
  addSuccessorWithProb(SwitchBB, B.Default, B.DefaultProb);
  addSuccessorWithProb(SwitchBB, FirstCaseMBB, B.Prob);
  SwitchBB->normalizeSuccProbs();

  SDValue NewRoot = DAG.getNode(ISD::BRCOND, dl, MVT::Other, RegCopy,
                                OutOfRange, DAG.getBasicBlock(B.Default));

  // An explicit branch is only needed when the first case block does not
  // immediately follow the switch block.
  if (FirstCaseMBB != NextBlock(SwitchBB))
    NewRoot = DAG.getNode(ISD::BR, dl, MVT::Other, NewRoot,
                          DAG.getBasicBlock(FirstCaseMBB));

  DAG.setRoot(NewRoot);
}
/// visitBitTestCase - Emit a single "bit test": read the rebased switch value
/// from \p Reg, test it against the mask of \p B, branch to B.TargetBB on a
/// hit and continue in \p NextMBB otherwise.
void SelectionDAGBuilder::visitBitTestCase(BitTestBlock &BB,
                                           MachineBasicBlock* NextMBB,
                                           BranchProbability BranchProbToNext,
                                           unsigned Reg,
                                           BitTestCase &B,
                                           MachineBasicBlock *SwitchBB) {
  SDLoc dl = getCurSDLoc();
  MVT VT = BB.RegVT;
  SDValue BitIndex = DAG.getCopyFromReg(getControlRoot(), dl, Reg, VT);

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  const EVT CmpVT =
      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  unsigned NumMaskBits = countPopulation(B.Mask);
  SDValue Hit;
  if (NumMaskBits == 1) {
    // One bit set: compare the index with the position of that bit instead
    // of shifting and masking.
    Hit = DAG.getSetCC(dl, CmpVT, BitIndex,
                       DAG.getConstant(countTrailingZeros(B.Mask), dl, VT),
                       ISD::SETEQ);
  } else if (NumMaskBits == BB.Range) {
    // Exactly one zero bit in the range: test for "not that position".
    Hit = DAG.getSetCC(dl, CmpVT, BitIndex,
                       DAG.getConstant(countTrailingOnes(B.Mask), dl, VT),
                       ISD::SETNE);
  } else {
    // General form: (1 << index) & mask != 0.
    SDValue OneHot = DAG.getNode(ISD::SHL, dl, VT,
                                 DAG.getConstant(1, dl, VT), BitIndex);
    SDValue Masked = DAG.getNode(ISD::AND, dl, VT, OneHot,
                                 DAG.getConstant(B.Mask, dl, VT));
    Hit = DAG.getSetCC(dl, CmpVT, Masked, DAG.getConstant(0, dl, VT),
                       ISD::SETNE);
  }

  // SwitchBB -> B.TargetBB carries B.ExtraProb and SwitchBB -> NextMBB
  // carries BranchProbToNext. Both are relative weights and need not sum to
  // one, hence the normalization afterwards.
  addSuccessorWithProb(SwitchBB, B.TargetBB, B.ExtraProb);
  addSuccessorWithProb(SwitchBB, NextMBB, BranchProbToNext);
  SwitchBB->normalizeSuccProbs();

  SDValue NewRoot = DAG.getNode(ISD::BRCOND, dl, MVT::Other, getControlRoot(),
                                Hit, DAG.getBasicBlock(B.TargetBB));

  // Skip the unconditional branch when NextMBB is the fall-through block.
  if (NextMBB != NextBlock(SwitchBB))
    NewRoot = DAG.getNode(ISD::BR, dl, MVT::Other, NewRoot,
                          DAG.getBasicBlock(NextMBB));

  DAG.setRoot(NewRoot);
}
void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) {
MachineBasicBlock *InvokeMBB = FuncInfo.MBB;
// Retrieve successors. Look through artificial IR level blocks like
// catchswitch for successors.
MachineBasicBlock *Return = FuncInfo.MBBMap[I.getSuccessor(0)];
const BasicBlock *EHPadBB = I.getSuccessor(1);
// Deopt bundles are lowered in LowerCallSiteWithDeoptBundle, and we don't
// have to do anything here to lower funclet bundles.
assert(!I.hasOperandBundlesOtherThan(
{LLVMContext::OB_deopt, LLVMContext::OB_funclet}) &&
"Cannot lower invokes with arbitrary operand bundles yet!");
const Value *Callee(I.getCalledValue());
const Function *Fn = dyn_cast<Function>(Callee);
if (isa<InlineAsm>(Callee))
visitInlineAsm(&I);
else if (Fn && Fn->isIntrinsic()) {
switch (Fn->getIntrinsicID()) {
default:
llvm_unreachable("Cannot invoke this intrinsic");
case Intrinsic::donothing:
// Ignore invokes to @llvm.donothing: jump directly to the next BB.
break;
case Intrinsic::experimental_patchpoint_void:
case Intrinsic::experimental_patchpoint_i64:
visitPatchpoint(&I, EHPadBB);
break;
case Intrinsic::experimental_gc_statepoint:
LowerStatepoint(ImmutableStatepoint(&I), EHPadBB);
break;
case Intrinsic::wasm_rethrow_in_catch: {
// This is usually done in visitTargetIntrinsic, but this intrinsic is
// special because it can be invoked, so we manually lower it to a DAG
// node here.
SmallVector<SDValue, 8> Ops;
Ops.push_back(getRoot()); // inchain
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
Ops.push_back(
DAG.getTargetConstant(Intrinsic::wasm_rethrow_in_catch, getCurSDLoc(),
TLI.getPointerTy(DAG.getDataLayout())));
SDVTList VTs = DAG.getVTList(ArrayRef<EVT>({MVT::Other})); // outchain
DAG.setRoot(DAG.getNode(ISD::INTRINSIC_VOID, getCurSDLoc(), VTs, Ops));
break;
}
}
} else if (I.countOperandBundlesOfType(LLVMContext::OB_deopt)) {
// Currently we do not lower any intrinsic calls with deopt operand bundles.
// Eventually we will support lowering the @llvm.experimental.deoptimize
// intrinsic, and right now there are no plans to support other intrinsics
// with deopt state.
LowerCallSiteWithDeoptBundle(&I, getValue(Callee), EHPadBB);
} else {
LowerCallTo(&I, getValue(Callee), false, EHPadBB);
}
// If the value of the invoke is used outside of its defining block, make it
// available as a virtual register.
// We already took care of the exported value for the statepoint instruction
// during call to the LowerStatepoint.
if (!isStatepoint(I)) {
CopyToExportRegsIfNeeded(&I);
}
SmallVector<std::pair<MachineBasicBlock *, BranchProbability>, 1> UnwindDests;
BranchProbabilityInfo *BPI = FuncInfo.BPI;
BranchProbability EHPadBBProb =
BPI ? BPI->getEdgeProbability(InvokeMBB->getBasicBlock(), EHPadBB)
: BranchProbability::getZero();
findUnwindDestinations(FuncInfo, EHPadBB, EHPadBBProb, UnwindDests);
// Update successor info.
addSuccessorWithProb(InvokeMBB, Return);
for (auto &UnwindDest : UnwindDests) {
UnwindDest.first->setIsEHPad();
addSuccessorWithProb(InvokeMBB, UnwindDest.first, UnwindDest.second);
}
InvokeMBB->normalizeSuccProbs();
// Drop into normal successor.
DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other, getControlRoot(),
DAG.getBasicBlock(Return)));
}
void SelectionDAGBuilder::visitCallBr(const CallBrInst &I) {
MachineBasicBlock *CallBrMBB = FuncInfo.MBB;
// Deopt bundles are lowered in LowerCallSiteWithDeoptBundle, and we don't
// have to do anything here to lower funclet bundles.
assert(!I.hasOperandBundlesOtherThan(
{LLVMContext::OB_deopt, LLVMContext::OB_funclet}) &&
"Cannot lower callbrs with arbitrary operand bundles yet!");
assert(isa<InlineAsm>(I.getCalledValue()) &&
"Only know how to handle inlineasm callbr");
visitInlineAsm(&I);
// Retrieve successors.
MachineBasicBlock *Return = FuncInfo.MBBMap[I.getDefaultDest()];
// Update successor info.
addSuccessorWithProb(CallBrMBB, Return);
for (unsigned i = 0, e = I.getNumIndirectDests(); i < e; ++i) {
MachineBasicBlock *Target = FuncInfo.MBBMap[I.getIndirectDest(i)];
addSuccessorWithProb(CallBrMBB, Target);
}
CallBrMBB->normalizeSuccProbs();
// Drop into default successor.
DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(),
MVT::Other, getControlRoot(),
DAG.getBasicBlock(Return)));
}
/// Resume instructions are never expected to reach the SelectionDAG builder;
/// visiting one is treated as an internal error.
void SelectionDAGBuilder::visitResume(const ResumeInst &RI) {
  llvm_unreachable("SelectionDAGBuilder shouldn't visit resume instructions!");
}
void SelectionDAGBuilder::visitLandingPad(const LandingPadInst &LP) {
assert(FuncInfo.MBB->isEHPad() &&
"Call to landingpad not in landing pad!");
// If there aren't registers to copy the values into (e.g., during SjLj
// exceptions), then don't bother to create these DAG nodes.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
const Constant *PersonalityFn = FuncInfo.Fn->getPersonalityFn();
if (TLI.getExceptionPointerRegister(PersonalityFn) == 0 &&
TLI.getExceptionSelectorRegister(PersonalityFn) == 0)
return;
// If landingpad's return type is token type, we don't create DAG nodes
// for its exception pointer and selector value. The extraction of exception
// pointer or selector value from token type landingpads is not currently
// supported.
if (LP.getType()->isTokenTy())
return;
SmallVector<EVT, 2> ValueVTs;
SDLoc dl = getCurSDLoc();
ComputeValueVTs(TLI, DAG.getDataLayout(), LP.getType(), ValueVTs);
assert(ValueVTs.size() == 2 && "Only two-valued landingpads are supported");
// Get the two live-in registers as SDValues. The physregs have already been
// copied into virtual registers.
SDValue Ops[2];
if (FuncInfo.ExceptionPointerVirtReg) {
Ops[0] = DAG.getZExtOrTrunc(
DAG.getCopyFromReg(DAG.getEntryNode(), dl,
FuncInfo.ExceptionPointerVirtReg,
TLI.getPointerTy(DAG.getDataLayout())),
dl, ValueVTs[0]);
} else {
Ops[0] = DAG.getConstant(0, dl, TLI.getPointerTy(DAG.getDataLayout()));
}
Ops[1] = DAG.getZExtOrTrunc(
DAG.getCopyFromReg(DAG.getEntryNode(), dl,
FuncInfo.ExceptionSelectorVirtReg,
TLI.getPointerTy(DAG.getDataLayout())),
dl, ValueVTs[1]);
// Merge into one.
SDValue Res = DAG.getNode(ISD::MERGE_VALUES, dl,
DAG.getVTList(ValueVTs), Ops);
setValue(&LP, Res);
}
/// Retarget pending switch lowering records from block \p First to block
/// \p Last: every jump table header whose HeaderBB is \p First and every bit
/// test block whose Parent is \p First is updated to refer to \p Last.
void SelectionDAGBuilder::UpdateSplitBlock(MachineBasicBlock *First,
                                           MachineBasicBlock *Last) {
  // Jump table cases: the header is the first member of each entry.
  for (auto &JTCase : SL->JTCases) {
    auto &Header = JTCase.first;
    if (Header.HeaderBB == First)
      Header.HeaderBB = Last;
  }

  // Bit test cases.
  for (auto &BitTest : SL->BitTestCases) {
    if (BitTest.Parent == First)
      BitTest.Parent = Last;
  }
}
void SelectionDAGBuilder::visitIndirectBr(const IndirectBrInst &I) {
MachineBasicBlock *IndirectBrMBB = FuncInfo.MBB;
// Update machine-CFG edges with unique successors.
SmallSet<BasicBlock*, 32> Done;
for (unsigned i = 0, e = I.getNumSuccessors(); i != e; ++i) {
BasicBlock *BB = I.getSuccessor(i);
bool Inserted = Done.insert(BB).second;
if (!Inserted)
continue;
MachineBasicBlock *Succ = FuncInfo.MBBMap[BB];
addSuccessorWithProb(IndirectBrMBB, Succ);
}
IndirectBrMBB->normalizeSuccProbs();
DAG.setRoot(DAG.getNode(ISD::BRIND, getCurSDLoc(),
MVT::Other, getControlRoot(),
getValue(I.getAddress())));
}
void SelectionDAGBuilder::visitUnreachable(const UnreachableInst &I) {
if (!DAG.getTarget().Options.TrapUnreachable)
return;
// We may be able to ignore unreachable behind a noreturn call.
if (DAG.getTarget().Options.NoTrapAfterNoreturn) {
const BasicBlock &BB = *I.getParent();
if (&I != &BB.front()) {
BasicBlock::const_iterator PredI =
std::prev(BasicBlock::const_iterator(&I));
if (const CallInst *Call = dyn_cast<CallInst>(&*PredI)) {
if (Call->doesNotReturn())
return;
}
}
}
DAG.setRoot(DAG.getNode(ISD::TRAP, getCurSDLoc(), MVT::Other, DAG.getRoot()));
}
void SelectionDAGBuilder::visitFSub(const User &I) {
// -0.0 - X --> fneg
Type *Ty = I.getType();
if (isa<Constant>(I.getOperand(0)) &&
I.getOperand(0) == ConstantFP::getZeroValueForNegation(Ty)) {
SDValue Op2 = getValue(I.getOperand(1));
setValue(&I, DAG.getNode(ISD::FNEG, getCurSDLoc(),
Op2.getValueType(), Op2));
return;
}
visitBinary(I, ISD::FSUB);
}
/// Checks if the given instruction performs a vector reduction, in which case
/// we have the freedom to alter the elements in the result as long as the
/// reduction of them stays unchanged.
///
/// Returns false when \p I is not an Instruction of vector type, when its
/// opcode is not one of the accepted reduction opcodes, when the element count
/// is not a power of two, or when the walk over its transitive users meets
/// anything outside the accepted patterns. Returns true only if the walk
/// completed and at least one extractelement of lane 0 was reached after the
/// vector had been fully reduced.
static bool isVectorReductionOp(const User *I) {
const Instruction *Inst = dyn_cast<Instruction>(I);
if (!Inst || !Inst->getType()->isVectorTy())
return false;
auto OpCode = Inst->getOpcode();
// Integer add/mul/and/or/xor always qualify; fadd/fmul qualify only when the
// operation carries the full set of fast-math flags (isFast()).
switch (OpCode) {
case Instruction::Add:
case Instruction::Mul:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
break;
case Instruction::FAdd:
case Instruction::FMul:
if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Inst))
if (FPOp->getFastMathFlags().isFast())
break;
LLVM_FALLTHROUGH;
default:
return false;
}
unsigned ElemNum = Inst->getType()->getVectorNumElements();
// Ensure the reduction size is a power of 2.
if (!isPowerOf2_32(ElemNum))
return false;
// Number of leading lanes that still have to be combined. It is halved each
// time a shuffle + reduction-op pair is matched below and must have reached 1
// by the time an extractelement is seen.
unsigned ElemNumToReduce = ElemNum;
// Do DFS search on the def-use chain from the given instruction. We only
// allow four kinds of operations during the search until we reach the
// instruction that extracts the first element from the vector:
//
// 1. The reduction operation of the same opcode as the given instruction.
//
// 2. PHI node.
//
// 3. ShuffleVector instruction together with a reduction operation that
// does a partial reduction.
//
// 4. ExtractElement that extracts the first element from the vector, and we
// stop searching the def-use chain here.
//
// 3 & 4 above perform a reduction on all elements of the vector. We push defs
// from 1-3 to the stack to continue the DFS. The given instruction is not
// a reduction operation if we meet any other instructions other than those
// listed above.
SmallVector<const User *, 16> UsersToVisit{Inst};
// Values already expanded, so that each one has its users examined only once.
SmallPtrSet<const User *, 16> Visited;
// Set once a lane-0 extract of the fully reduced vector has been found.
bool ReduxExtracted = false;
while (!UsersToVisit.empty()) {
auto User = UsersToVisit.back();
UsersToVisit.pop_back();
if (!Visited.insert(User).second)
continue;
for (const auto &U : User->users()) {
// NOTE: this local deliberately shadows the outer 'Inst'; inside the loop it
// refers to the user currently being classified.
auto Inst = dyn_cast<Instruction>(U);
if (!Inst)
return false;
if (Inst->getOpcode() == OpCode || isa<PHINode>(U)) {
// Cases 1 and 2. A floating-point user that is not a PHI must itself be fully
// fast-math, otherwise the chain is rejected.
if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Inst))
if (!isa<PHINode>(FPOp) && !FPOp->getFastMathFlags().isFast())
return false;
UsersToVisit.push_back(U);
} else if (const ShuffleVectorInst *ShufInst =
dyn_cast<ShuffleVectorInst>(U)) {
// Detect the following pattern: A ShuffleVector instruction together
// with a reduction that do partial reduction on the first and second
// ElemNumToReduce / 2 elements, and store the result in
// ElemNumToReduce / 2 elements in another vector.
unsigned ResultElements = ShufInst->getType()->getVectorNumElements();
// The mask is indexed up to ElemNum below, so the shuffle result must have at
// least that many elements.
if (ResultElements < ElemNum)
return false;
// Nothing is left to halve once a single lane remains.
if (ElemNumToReduce == 1)
return false;
// The second shuffle operand must be undef.
if (!isa<UndefValue>(U->getOperand(1)))
return false;
// The low half of the active lanes must select the matching lane of the high
// half (lane i reads lane i + ElemNumToReduce / 2).
for (unsigned i = 0; i < ElemNumToReduce / 2; ++i)
if (ShufInst->getMaskValue(i) != int(i + ElemNumToReduce / 2))
return false;
// Every remaining mask entry up to ElemNum must be -1 (presumably the undef
// mask element; confirm against ShuffleVectorInst::getMaskValue).
for (unsigned i = ElemNumToReduce / 2; i < ElemNum; ++i)
if (ShufInst->getMaskValue(i) != -1)
return false;
// There is only one user of this ShuffleVector instruction, which
// must be a reduction operation.
if (!U->hasOneUse())
return false;
auto U2 = dyn_cast<Instruction>(*U->user_begin());
if (!U2 || U2->getOpcode() != OpCode)
return false;
// Check operands of the reduction operation: it must combine the shuffle with
// the shuffle's own first operand, in either operand order.
if ((U2->getOperand(0) == U->getOperand(0) && U2->getOperand(1) == U) ||
(U2->getOperand(1) == U->getOperand(0) && U2->getOperand(0) == U)) {
UsersToVisit.push_back(U2);
ElemNumToReduce /= 2;
} else
return false;
} else if (isa<ExtractElementInst>(U)) {
// At this moment we should have reduced all elements in the vector.
if (ElemNumToReduce != 1)
return false;
// Only an extract with the constant index 0 is accepted.
const ConstantInt *Val = dyn_cast<ConstantInt>(U->getOperand(1));
if (!Val || !Val->isZero())
return false;
ReduxExtracted = true;
} else
return false;
}
}
return ReduxExtracted;
}
void SelectionDAGBuilder::visitUnary(const User &I, unsigned Opcode) {
SDNodeFlags Flags;
SDValue Op = getValue(I.getOperand(0));
SDValue UnNodeValue = DAG.getNode(Opcode, getCurSDLoc(), Op.getValueType(),
Op, Flags);
setValue(&I, UnNodeValue);
}
void SelectionDAGBuilder::visitBinary(const User &I, unsigned Opcode) {
SDNodeFlags Flags;
if (auto *OFBinOp = dyn_cast<OverflowingBinaryOperator>(&I)) {
Flags.setNoSignedWrap(OFBinOp->hasNoSignedWrap());
Flags.setNoUnsignedWrap(OFBinOp->hasNoUnsignedWrap());
}
if (auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I)) {
Flags.setExact(ExactOp->isExact());
}
if (isVectorReductionOp(&I)) {
Flags.setVectorReduction(true);
LLVM_DEBUG(dbgs() << "Detected a reduction operation:" << I << "\n");
}
SDValue Op1 = getValue(I.getOperand(0));
SDValue Op2 = getValue(I.getOperand(1));
SDValue BinNodeValue = DAG.getNode(Opcode, getCurSDLoc(), Op1.getValueType(),
Op1, Op2, Flags);
setValue(&I, BinNodeValue);
}
void SelectionDAGBuilder::visitShift(const User &I, unsigned Opcode) {
SDValue Op1 = getValue(I.getOperand(0));
SDValue Op2 = getValue(I.getOperand(1));
EVT ShiftTy = DAG.getTargetLoweringInfo().getShiftAmountTy(
Op1.getValueType(), DAG.getDataLayout());
// Coerce the shift amount to the right type if we can.
if (!I.getType()->isVectorTy() && Op2.getValueType() != ShiftTy) {
unsigned ShiftSize = ShiftTy.getSizeInBits();
unsigned Op2Size = Op2.getValueSizeInBits();
SDLoc DL = getCurSDLoc();
// If the operand is smaller than the shift count type, promote it.
if (ShiftSize > Op2Size)
Op2 = DAG.getNode(ISD::ZERO_EXTEND, DL, ShiftTy, Op2);
// If the operand is larger than the shift count type but the shift
// count type has enough bits to represent any shift value, truncate
// it now. This is a common case and it exposes the truncate to
// optimization early.
else if (ShiftSize >= Log2_32_Ceil(Op2.getValueSizeInBits()))
Op2 = DAG.getNode(ISD::TRUNCATE, DL, ShiftTy, Op2);
// Otherwise we'll need to temporarily settle for some other convenient
// type. Type legalization will make adjustments once the shiftee is split.
else
Op2 = DAG.getZExtOrTrunc(Op2, DL, MVT::i32);
}
bool nuw = false;
bool nsw = false;
bool exact = false;
if (Opcode == ISD::SRL || Opcode == ISD::SRA || Opcode == ISD::SHL) {
if (const OverflowingBinaryOperator *OFBinOp =
dyn_cast<const OverflowingBinaryOperator>(&I)) {
nuw = OFBinOp->hasNoUnsignedWrap();
nsw = OFBinOp->hasNoSignedWrap();
}
if (const PossiblyExactOperator *ExactOp =
dyn_cast<const PossiblyExactOperator>(&I))
exact = ExactOp->isExact();
}
SDNodeFlags Flags;
Flags.setExact(exact);
Flags.setNoSignedWrap(nsw);
Flags.setNoUnsignedWrap(nuw);
SDValue Res = DAG.getNode(Opcode, getCurSDLoc(), Op1.getValueType(), Op1, Op2,
Flags);
setValue(&I, Res);
}
void SelectionDAGBuilder::visitSDiv(const User &I) {
SDValue Op1 = getValue(I.getOperand(0));
SDValue Op2 = getValue(I.getOperand(1));
SDNodeFlags Flags;
Flags.setExact(isa<PossiblyExactOperator>(&I) &&
cast<PossiblyExactOperator>(&I)->isExact());
setValue(&I, DAG.getNode(ISD::SDIV, getCurSDLoc(), Op1.getValueType(), Op1,
Op2, Flags));
}
void SelectionDAGBuilder::visitICmp(const User &I) {
ICmpInst::Predicate predicate = ICmpInst::BAD_ICMP_PREDICATE;
if (const ICmpInst *IC = dyn_cast<ICmpInst>(&I))
predicate = IC->getPredicate();
else if (const ConstantExpr *IC = dyn_cast<ConstantExpr>(&I))
predicate = ICmpInst::Predicate(IC->getPredicate());
SDValue Op1 = getValue(I.getOperand(0));
SDValue Op2 = getValue(I.getOperand(1));
ISD::CondCode Opcode = getICmpCondCode(predicate);
auto &TLI = DAG.getTargetLoweringInfo();
EVT MemVT =
TLI.getMemValueType(DAG.getDataLayout(), I.getOperand(0)->getType());
// If a pointer's DAG type is larger than its memory type then the DAG values
// are zero-extended. This breaks signed comparisons so truncate back to the
// underlying type before doing the compare.
if (Op1.getValueType() != MemVT) {
Op1 = DAG.getPtrExtOrTrunc(Op1, getCurSDLoc(), MemVT);
Op2 = DAG.getPtrExtOrTrunc(Op2, getCurSDLoc(), MemVT);
}
EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
I.getType());
setValue(&I, DAG.getSetCC(getCurSDLoc(), DestVT, Op1, Op2, Opcode));
}
void SelectionDAGBuilder::visitFCmp(const User &I) {
FCmpInst::Predicate predicate = FCmpInst::BAD_FCMP_PREDICATE;
if (const FCmpInst *FC = dyn_cast<FCmpInst>(&I))
predicate = FC->getPredicate();
else if (const ConstantExpr *FC = dyn_cast<ConstantExpr>(&I))
predicate = FCmpInst::Predicate(FC->getPredicate());
SDValue Op1 = getValue(I.getOperand(0));
SDValue Op2 = getValue(I.getOperand(1));
ISD::CondCode Condition = getFCmpCondCode(predicate);
auto *FPMO = dyn_cast<FPMathOperator>(&I);
if ((FPMO && FPMO->hasNoNaNs()) || TM.Options.NoNaNsFPMath)
Condition = getFCmpCodeWithoutNaN(Condition);
EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
I.getType());
setValue(&I, DAG.getSetCC(getCurSDLoc(), DestVT, Op1, Op2, Condition));
}
// Check if the condition of the select has one use or two users that are both
// selects with the same condition.
static bool hasOnlySelectUsers(const Value *Cond) {
return llvm::all_of(Cond->users(), [](const Value *V) {
return isa<SelectInst>(V);
});
}
void SelectionDAGBuilder::visitSelect(const User &I) {
SmallVector<EVT, 4> ValueVTs;
ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), I.getType(),
ValueVTs);
unsigned NumValues = ValueVTs.size();
if (NumValues == 0) return;
SmallVector<SDValue, 4> Values(NumValues);
SDValue Cond = getValue(I.getOperand(0));
SDValue LHSVal = getValue(I.getOperand(1));
SDValue RHSVal = getValue(I.getOperand(2));
auto BaseOps = {Cond};
ISD::NodeType OpCode = Cond.getValueType().isVector() ?
ISD::VSELECT : ISD::SELECT;
bool IsUnaryAbs = false;
// Min/max matching is only viable if all output VTs are the same.
if (is_splat(ValueVTs)) {
EVT VT = ValueVTs[0];
LLVMContext &Ctx = *DAG.getContext();
auto &TLI = DAG.getTargetLoweringInfo();
// We care about the legality of the operation after it has been type
// legalized.
while (TLI.getTypeAction(Ctx, VT) != TargetLoweringBase::TypeLegal &&
VT != TLI.getTypeToTransformTo(Ctx, VT))
VT = TLI.getTypeToTransformTo(Ctx, VT);
// If the vselect is legal, assume we want to leave this as a vector setcc +
// vselect. Otherwise, if this is going to be scalarized, we want to see if
// min/max is legal on the scalar type.
bool UseScalarMinMax = VT.isVector() &&
!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT);
Value *LHS, *RHS;
auto SPR = matchSelectPattern(const_cast<User*>(&I), LHS, RHS);
ISD::NodeType Opc = ISD::DELETED_NODE;
switch (SPR.Flavor) {
case SPF_UMAX: Opc = ISD::UMAX; break;
case SPF_UMIN: Opc = ISD::UMIN; break;
case SPF_SMAX: Opc = ISD::SMAX; break;
case SPF_SMIN: Opc = ISD::SMIN; break;
case SPF_FMINNUM:
switch (SPR.NaNBehavior) {
case SPNB_NA: llvm_unreachable("No NaN behavior for FP op?");
case SPNB_RETURNS_NAN: Opc = ISD::FMINIMUM; break;
case SPNB_RETURNS_OTHER: Opc = ISD::FMINNUM; break;
case SPNB_RETURNS_ANY: {
if (TLI.isOperationLegalOrCustom(ISD::FMINNUM, VT))
Opc = ISD::FMINNUM;
else if (TLI.isOperationLegalOrCustom(ISD::FMINIMUM, VT))
Opc = ISD::FMINIMUM;
else if (UseScalarMinMax)
Opc = TLI.isOperationLegalOrCustom(ISD::FMINNUM, VT.getScalarType()) ?
ISD::FMINNUM : ISD::FMINIMUM;
break;
}
}
break;
case SPF_FMAXNUM:
switch (SPR.NaNBehavior) {
case SPNB_NA: llvm_unreachable("No NaN behavior for FP op?");
case SPNB_RETURNS_NAN: Opc = ISD::FMAXIMUM; break;
case SPNB_RETURNS_OTHER: Opc = ISD::FMAXNUM; break;
case SPNB_RETURNS_ANY:
if (TLI.isOperationLegalOrCustom(ISD::FMAXNUM, VT))
Opc = ISD::FMAXNUM;
else if (TLI.isOperationLegalOrCustom(ISD::FMAXIMUM, VT))
Opc = ISD::FMAXIMUM;
else if (UseScalarMinMax)
Opc = TLI.isOperationLegalOrCustom(ISD::FMAXNUM, VT.getScalarType()) ?
ISD::FMAXNUM : ISD::FMAXIMUM;
break;
}
break;
case SPF_ABS:
IsUnaryAbs = true;
Opc = ISD::ABS;
break;
case SPF_NABS:
// TODO: we need to produce sub(0, abs(X)).
default: break;
}
if (!IsUnaryAbs && Opc != ISD::DELETED_NODE &&
(TLI.isOperationLegalOrCustom(Opc, VT) ||
(UseScalarMinMax &&
TLI.isOperationLegalOrCustom(Opc, VT.getScalarType()))) &&
// If the underlying comparison instruction is used by any other
// instruction, the consumed instructions won't be destroyed, so it is
// not profitable to convert to a min/max.
hasOnlySelectUsers(cast<SelectInst>(I).getCondition())) {
OpCode = Opc;
LHSVal = getValue(LHS);
RHSVal = getValue(RHS);
BaseOps = {};
}
if (IsUnaryAbs) {
OpCode = Opc;
LHSVal = getValue(LHS);
BaseOps = {};
}
}
if (IsUnaryAbs) {
for (unsigned i = 0; i != NumValues; ++i) {
Values[i] =
DAG.getNode(OpCode, getCurSDLoc(),
LHSVal.getNode()->getValueType(LHSVal.getResNo() + i),
SDValue(LHSVal.getNode(), LHSVal.getResNo() + i));
}
} else {
for (unsigned i = 0; i != NumValues; ++i) {
SmallVector<SDValue, 3> Ops(BaseOps.begin(), BaseOps.end());
Ops.push_back(SDValue(LHSVal.getNode(), LHSVal.getResNo() + i));
Ops.push_back(SDValue(RHSVal.getNode(), RHSVal.getResNo() + i));
Values[i] = DAG.getNode(
OpCode, getCurSDLoc(),
LHSVal.getNode()->getValueType(LHSVal.getResNo() + i), Ops);
}
}
setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
DAG.getVTList(ValueVTs), Values));
}
void SelectionDAGBuilder::visitTrunc(const User &I) {
// TruncInst cannot be a no-op cast because sizeof(src) > sizeof(dest).
SDValue N = getValue(I.getOperand(0));
EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
I.getType());
setValue(&I, DAG.getNode(ISD::TRUNCATE, getCurSDLoc(), DestVT, N));
}
void SelectionDAGBuilder::visitZExt(const User &I) {
// ZExt cannot be a no-op cast because sizeof(src) < sizeof(dest).
// ZExt also can't be a cast to bool for same reason. So, nothing much to do
SDValue N = getValue(I.getOperand(0));
EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
I.getType());
setValue(&I, DAG.getNode(ISD::ZERO_EXTEND, getCurSDLoc(), DestVT, N));
}
void SelectionDAGBuilder::visitSExt(const User &I) {
// SExt cannot be a no-op cast because sizeof(src) < sizeof(dest).
// SExt also can't be a cast to bool for same reason. So, nothing much to do
SDValue N = getValue(I.getOperand(0));
EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
I.getType());
setValue(&I, DAG.getNode(ISD::SIGN_EXTEND, getCurSDLoc(), DestVT, N));
}
void SelectionDAGBuilder::visitFPTrunc(const User &I) {
// FPTrunc is never a no-op cast, no need to check
SDValue N = getValue(I.getOperand(0));
SDLoc dl = getCurSDLoc();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
setValue(&I, DAG.getNode(ISD::FP_ROUND, dl, DestVT, N,
DAG.getTargetConstant(
0, dl, TLI.getPointerTy(DAG.getDataLayout()))));
}
void SelectionDAGBuilder::visitFPExt(const User &I) {
// FPExt is never a no-op cast, no need to check
SDValue N = getValue(I.getOperand(0));
EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
I.getType());
setValue(&I, DAG.getNode(ISD::FP_EXTEND, getCurSDLoc(), DestVT, N));
}
void SelectionDAGBuilder::visitFPToUI(const User &I) {
// FPToUI is never a no-op cast, no need to check
SDValue N = getValue(I.getOperand(0));
EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
I.getType());
setValue(&I, DAG.getNode(ISD::FP_TO_UINT, getCurSDLoc(), DestVT, N));
}
void SelectionDAGBuilder::visitFPToSI(const User &I) {
// FPToSI is never a no-op cast, no need to check
SDValue N = getValue(I.getOperand(0));
EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
I.getType());
setValue(&I, DAG.getNode(ISD::FP_TO_SINT, getCurSDLoc(), DestVT, N));
}
void SelectionDAGBuilder::visitUIToFP(const User &I) {
// UIToFP is never a no-op cast, no need to check
SDValue N = getValue(I.getOperand(0));
EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
I.getType());
setValue(&I, DAG.getNode(ISD::UINT_TO_FP, getCurSDLoc(), DestVT, N));
}
void SelectionDAGBuilder::visitSIToFP(const User &I) {
// SIToFP is never a no-op cast, no need to check
SDValue N = getValue(I.getOperand(0));
EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
I.getType());
setValue(&I, DAG.getNode(ISD::SINT_TO_FP, getCurSDLoc(), DestVT, N));
}
void SelectionDAGBuilder::visitPtrToInt(const User &I) {
// What to do depends on the size of the integer and the size of the pointer.
// We can either truncate, zero extend, or no-op, accordingly.
SDValue N = getValue(I.getOperand(0));
auto &TLI = DAG.getTargetLoweringInfo();
EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
I.getType());
EVT PtrMemVT =
TLI.getMemValueType(DAG.getDataLayout(), I.getOperand(0)->getType());
N = DAG.getPtrExtOrTrunc(N, getCurSDLoc(), PtrMemVT);
N = DAG.getZExtOrTrunc(N, getCurSDLoc(), DestVT);
setValue(&I, N);
}
void SelectionDAGBuilder::visitIntToPtr(const User &I) {
// What to do depends on the size of the integer and the size of the pointer.
// We can either truncate, zero extend, or no-op, accordingly.
SDValue N = getValue(I.getOperand(0));
auto &TLI = DAG.getTargetLoweringInfo();
EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
EVT PtrMemVT = TLI.getMemValueType(DAG.getDataLayout(), I.getType());
N = DAG.getZExtOrTrunc(N, getCurSDLoc(), PtrMemVT);
N = DAG.getPtrExtOrTrunc(N, getCurSDLoc(), DestVT);
setValue(&I, N);
}
void SelectionDAGBuilder::visitBitCast(const User &I) {
SDValue N = getValue(I.getOperand(0));
SDLoc dl = getCurSDLoc();
EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
I.getType());
// BitCast assures us that source and destination are the same size so this is
// either a BITCAST or a no-op.
if (DestVT != N.getValueType())
setValue(&I, DAG.getNode(ISD::BITCAST, dl,
DestVT, N)); // convert types.
// Check if the original LLVM IR Operand was a ConstantInt, because getValue()
// might fold any kind of constant expression to an integer constant and that
// is not what we are looking for. Only recognize a bitcast of a genuine
// constant integer as an opaque constant.
else if(ConstantInt *C = dyn_cast<ConstantInt>(I.getOperand(0)))
setValue(&I, DAG.getConstant(C->getValue(), dl, DestVT, /*isTarget=*/false,
/*isOpaque*/true));
else
setValue(&I, N); // noop cast.
}
void SelectionDAGBuilder::visitAddrSpaceCast(const User &I) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
const Value *SV = I.getOperand(0);
SDValue N = getValue(SV);
EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
unsigned SrcAS = SV->getType()->getPointerAddressSpace();
unsigned DestAS = I.getType()->getPointerAddressSpace();
if (!TLI.isNoopAddrSpaceCast(SrcAS, DestAS))
N = DAG.getAddrSpaceCast(getCurSDLoc(), DestVT, N, SrcAS, DestAS);
setValue(&I, N);
}
void SelectionDAGBuilder::visitInsertElement(const User &I) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue InVec = getValue(I.getOperand(0));
SDValue InVal = getValue(I.getOperand(1));
SDValue InIdx = DAG.getSExtOrTrunc(getValue(I.getOperand(2)), getCurSDLoc(),
TLI.getVectorIdxTy(DAG.getDataLayout()));
setValue(&I, DAG.getNode(ISD::INSERT_VECTOR_ELT, getCurSDLoc(),
TLI.getValueType(DAG.getDataLayout(), I.getType()),
InVec, InVal, InIdx));
}
void SelectionDAGBuilder::visitExtractElement(const User &I) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue InVec = getValue(I.getOperand(0));
SDValue InIdx = DAG.getSExtOrTrunc(getValue(I.getOperand(1)), getCurSDLoc(),
TLI.getVectorIdxTy(DAG.getDataLayout()));
setValue(&I, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, getCurSDLoc(),
TLI.getValueType(DAG.getDataLayout(), I.getType()),
InVec, InIdx));
}
// Lower a shufflevector.
//
// When the mask has as many elements as the source vectors, a vector shuffle
// is emitted directly. Otherwise the operands are normalized first:
//  - mask longer than the sources: emit CONCAT_VECTORS when the shuffle is a
//    pure concatenation, else pad both sources with undef and shuffle;
//  - mask shorter than the sources: extract a subvector from each source when
//    all used indices of a source fall in one MaskNumElts-sized window.
// If no normalization applies, the result is assembled element by element
// with EXTRACT_VECTOR_ELT and a build vector.
void SelectionDAGBuilder::visitShuffleVector(const User &I) {
SDValue Src1 = getValue(I.getOperand(0));
SDValue Src2 = getValue(I.getOperand(1));
SDLoc DL = getCurSDLoc();
// Operand 2 is the constant shuffle mask; negative entries are treated as
// undef lanes below.
SmallVector<int, 8> Mask;
ShuffleVectorInst::getShuffleMask(cast<Constant>(I.getOperand(2)), Mask);
unsigned MaskNumElts = Mask.size();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
EVT SrcVT = Src1.getValueType();
unsigned SrcNumElts = SrcVT.getVectorNumElements();
// Mask and sources already have the same length: shuffle directly.
if (SrcNumElts == MaskNumElts) {
setValue(&I, DAG.getVectorShuffle(VT, DL, Src1, Src2, Mask));
return;
}
// Normalize the shuffle vector since mask and vector length don't match.
if (SrcNumElts < MaskNumElts) {
// Mask is longer than the source vectors. We can use concatenate vector to
// make the mask and vectors lengths match.
if (MaskNumElts % SrcNumElts == 0) {
// Mask length is a multiple of the source vector length.
// Check if the shuffle is some kind of concatenation of the input
// vectors.
unsigned NumConcat = MaskNumElts / SrcNumElts;
bool IsConcat = true;
// ConcatSrcs[p] records which source (Idx / SrcNumElts) feeds piece p of the
// result; -1 means no defined index has been seen for that piece.
SmallVector<int, 8> ConcatSrcs(NumConcat, -1);
for (unsigned i = 0; i != MaskNumElts; ++i) {
int Idx = Mask[i];
if (Idx < 0)
continue;
// Ensure the indices in each SrcVT sized piece are sequential and that
// the same source is used for the whole piece.
if ((Idx % SrcNumElts != (i % SrcNumElts)) ||
(ConcatSrcs[i / SrcNumElts] >= 0 &&
ConcatSrcs[i / SrcNumElts] != (int)(Idx / SrcNumElts))) {
IsConcat = false;
break;
}
// Remember which source this index came from.
ConcatSrcs[i / SrcNumElts] = Idx / SrcNumElts;
}
// The shuffle is concatenating multiple vectors together. Just emit
// a CONCAT_VECTORS operation.
if (IsConcat) {
SmallVector<SDValue, 8> ConcatOps;
// Pieces with no defined index become undef; source 0 maps to Src1 and any
// other source number maps to Src2.
for (auto Src : ConcatSrcs) {
if (Src < 0)
ConcatOps.push_back(DAG.getUNDEF(SrcVT));
else if (Src == 0)
ConcatOps.push_back(Src1);
else
ConcatOps.push_back(Src2);
}
setValue(&I, DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps));
return;
}
}
// Not a pure concatenation: widen both sources to PaddedMaskNumElts lanes
// (MaskNumElts passed through alignTo with SrcNumElts) and shuffle those.
unsigned PaddedMaskNumElts = alignTo(MaskNumElts, SrcNumElts);
unsigned NumConcat = PaddedMaskNumElts / SrcNumElts;
EVT PaddedVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
PaddedMaskNumElts);
// Pad both vectors with undefs to make them the same length as the mask.
SDValue UndefVal = DAG.getUNDEF(SrcVT);
SmallVector<SDValue, 8> MOps1(NumConcat, UndefVal);
SmallVector<SDValue, 8> MOps2(NumConcat, UndefVal);
MOps1[0] = Src1;
MOps2[0] = Src2;
Src1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, PaddedVT, MOps1);
Src2 = DAG.getNode(ISD::CONCAT_VECTORS, DL, PaddedVT, MOps2);
// Readjust mask for new input vector length.
// Lanes beyond MaskNumElts stay -1. Indices that referred to the second
// source are shifted up by (PaddedMaskNumElts - SrcNumElts), since the first
// source now spans PaddedMaskNumElts lanes.
SmallVector<int, 8> MappedOps(PaddedMaskNumElts, -1);
for (unsigned i = 0; i != MaskNumElts; ++i) {
int Idx = Mask[i];
if (Idx >= (int)SrcNumElts)
Idx -= SrcNumElts - PaddedMaskNumElts;
MappedOps[i] = Idx;
}
SDValue Result = DAG.getVectorShuffle(PaddedVT, DL, Src1, Src2, MappedOps);
// If the concatenated vector was padded, extract a subvector with the
// correct number of elements.
if (MaskNumElts != PaddedMaskNumElts)
Result = DAG.getNode(
ISD::EXTRACT_SUBVECTOR, DL, VT, Result,
DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
setValue(&I, Result);
return;
}
if (SrcNumElts > MaskNumElts) {
// Analyze the access pattern of the vector to see if we can extract
// two subvectors and do the shuffle.
int StartIdx[2] = { -1, -1 }; // StartIdx to extract from
bool CanExtract = true;
for (int Idx : Mask) {
// Input selects the source (0 = Src1, 1 = Src2); Idx is rebased so it is
// relative to that source.
unsigned Input = 0;
if (Idx < 0)
continue;
if (Idx >= (int)SrcNumElts) {
Input = 1;
Idx -= SrcNumElts;
}
// If all the indices come from the same MaskNumElts sized portion of
// the sources we can use extract. Also make sure the extract wouldn't
// extract past the end of the source.
int NewStartIdx = alignDown(Idx, MaskNumElts);
if (NewStartIdx + MaskNumElts > SrcNumElts ||
(StartIdx[Input] >= 0 && StartIdx[Input] != NewStartIdx))
CanExtract = false;
// Make sure we always update StartIdx as we use it to track if all
// elements are undef.
StartIdx[Input] = NewStartIdx;
}
if (StartIdx[0] < 0 && StartIdx[1] < 0) {
setValue(&I, DAG.getUNDEF(VT)); // Vectors are not used.
return;
}
if (CanExtract) {
// Extract appropriate subvector and generate a vector shuffle
for (unsigned Input = 0; Input < 2; ++Input) {
SDValue &Src = Input == 0 ? Src1 : Src2;
// A source none of whose elements is referenced is replaced by undef.
if (StartIdx[Input] < 0)
Src = DAG.getUNDEF(VT);
else {
Src = DAG.getNode(
ISD::EXTRACT_SUBVECTOR, DL, VT, Src,
DAG.getConstant(StartIdx[Input], DL,
TLI.getVectorIdxTy(DAG.getDataLayout())));
}
}
// Calculate new mask.
// First-source indices become relative to StartIdx[0]; second-source
// indices become relative to StartIdx[1] and are then offset by MaskNumElts,
// the length of the extracted first operand.
SmallVector<int, 8> MappedOps(Mask.begin(), Mask.end());
for (int &Idx : MappedOps) {
if (Idx >= (int)SrcNumElts)
Idx -= SrcNumElts + StartIdx[1] - MaskNumElts;
else if (Idx >= 0)
Idx -= StartIdx[0];
}
setValue(&I, DAG.getVectorShuffle(VT, DL, Src1, Src2, MappedOps));
return;
}
}
// We can't use either concat vectors or extract subvectors so fall back to
// replacing the shuffle with extract and build vector.
EVT EltVT = VT.getVectorElementType();
EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
SmallVector<SDValue,8> Ops;
for (int Idx : Mask) {
SDValue Res;
// Undef mask lanes yield an undef element; every other lane is extracted
// from whichever source the index selects.
if (Idx < 0) {
Res = DAG.getUNDEF(EltVT);
} else {
SDValue &Src = Idx < (int)SrcNumElts ? Src1 : Src2;
if (Idx >= (int)SrcNumElts) Idx -= SrcNumElts;
Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
EltVT, Src, DAG.getConstant(Idx, DL, IdxVT));
}
Ops.push_back(Res);
}
setValue(&I, DAG.getBuildVector(VT, DL, Ops));
}
void SelectionDAGBuilder::visitInsertValue(const User &I) {
ArrayRef<unsigned> Indices;
if (const InsertValueInst *IV = dyn_cast<InsertValueInst>(&I))
Indices = IV->getIndices();
else
Indices = cast<ConstantExpr>(&I)->getIndices();
const Value *Op0 = I.getOperand(0);
const Value *Op1 = I.getOperand(1);
Type *AggTy = I.getType();
Type *ValTy = Op1->getType();
bool IntoUndef = isa<UndefValue>(Op0);
bool FromUndef = isa<UndefValue>(Op1);
unsigned LinearIndex = ComputeLinearIndex(AggTy, Indices);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SmallVector<EVT, 4> AggValueVTs;
ComputeValueVTs(TLI, DAG.getDataLayout(), AggTy, AggValueVTs);
SmallVector<EVT, 4> ValValueVTs;
ComputeValueVTs(TLI, DAG.getDataLayout(), ValTy, ValValueVTs);
unsigned NumAggValues = AggValueVTs.size();
unsigned NumValValues = ValValueVTs.size();
SmallVector<SDValue, 4> Values(NumAggValues);
// Ignore an insertvalue that produces an empty object
if (!NumAggValues) {
setValue(&I, DAG.getUNDEF(MVT(MVT::Other)));
return;
}
SDValue Agg = getValue(Op0);
unsigned i = 0;
// Copy the beginning value(s) from the original aggregate.
for (; i != LinearIndex; ++i)
Values[i] = IntoUndef ? DAG.getUNDEF(AggValueVTs[i]) :
SDValue(Agg.getNode(), Agg.getResNo() + i);
// Copy values from the inserted value(s).
if (NumValValues) {
SDValue Val = getValue(Op1);
for (; i != LinearIndex + NumValValues; ++i)
Values[i] = FromUndef ? DAG.getUNDEF(AggValueVTs[i]) :
SDValue(Val.getNode(), Val.getResNo() + i - LinearIndex);
}
// Copy remaining value(s) from the original aggregate.
for (; i != NumAggValues; ++i)
Values[i] = IntoUndef ? DAG.getUNDEF(AggValueVTs[i]) :
SDValue(Agg.getNode(), Agg.getResNo() + i);
setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
DAG.getVTList(AggValueVTs), Values));
}
void SelectionDAGBuilder::visitExtractValue(const User &I) {
ArrayRef<unsigned> Indices;
if (const ExtractValueInst *EV = dyn_cast<ExtractValueInst>(&I))
Indices = EV->getIndices();
else
Indices = cast<ConstantExpr>(&I)->getIndices();
const Value *Op0 = I.getOperand(0);
Type *AggTy = Op0->getType();
Type *ValTy = I.getType();
bool OutOfUndef = isa<UndefValue>(Op0);
unsigned LinearIndex = ComputeLinearIndex(AggTy, Indices);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SmallVector<EVT, 4> ValValueVTs;
ComputeValueVTs(TLI, DAG.getDataLayout(), ValTy, ValValueVTs);
unsigned NumValValues = ValValueVTs.size();
// Ignore a extractvalue that produces an empty object
if (!NumValValues) {
setValue(&I, DAG.getUNDEF(MVT(MVT::Other)));
return;
}
SmallVector<SDValue, 4> Values(NumValValues);
SDValue Agg = getValue(Op0);
// Copy out the selected value(s).
for (unsigned i = LinearIndex; i != LinearIndex + NumValValues; ++i)
Values[i - LinearIndex] =
OutOfUndef ?
DAG.getUNDEF(Agg.getNode()->getValueType(Agg.getResNo() + i)) :
SDValue(Agg.getNode(), Agg.getResNo() + i);
setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
DAG.getVTList(ValValueVTs), Values));
}
// Lower a getelementptr into explicit address arithmetic. Starting from the
// lowered base pointer N, each index contributes an offset that is added to
// N: struct indices add the field's constant offset, constant array indices
// add a folded constant, and variable indices add Idx * ElementSize (using a
// shift when the element size is a power of two). Scalar operands of a vector
// GEP are splatted so the arithmetic is performed on vectors.
void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
Value *Op0 = I.getOperand(0);
// Note that the pointer operand may be a vector of pointers. Take the scalar
// element which holds a pointer.
unsigned AS = Op0->getType()->getScalarType()->getPointerAddressSpace();
SDValue N = getValue(Op0);
SDLoc dl = getCurSDLoc();
auto &TLI = DAG.getTargetLoweringInfo();
MVT PtrTy = TLI.getPointerTy(DAG.getDataLayout(), AS);
MVT PtrMemTy = TLI.getPointerMemTy(DAG.getDataLayout(), AS);
// Normalize Vector GEP - all scalar operands should be converted to the
// splat vector.
// VectorWidth is 0 for a scalar GEP and the element count for a vector GEP.
unsigned VectorWidth = I.getType()->isVectorTy() ?
cast<VectorType>(I.getType())->getVectorNumElements() : 0;
if (VectorWidth && !N.getValueType().isVector()) {
LLVMContext &Context = *DAG.getContext();
EVT VT = EVT::getVectorVT(Context, N.getValueType(), VectorWidth);
N = DAG.getSplatBuildVector(VT, dl, N);
}
for (gep_type_iterator GTI = gep_type_begin(&I), E = gep_type_end(&I);
GTI != E; ++GTI) {
const Value *Idx = GTI.getOperand();
if (StructType *StTy = GTI.getStructTypeOrNull()) {
// Struct index: a constant field number. Field 0 adds nothing.
unsigned Field = cast<Constant>(Idx)->getUniqueInteger().getZExtValue();
if (Field) {
// N = N + Offset
uint64_t Offset = DL->getStructLayout(StTy)->getElementOffset(Field);
// In an inbounds GEP with an offset that is nonnegative even when
// interpreted as signed, assume there is no unsigned overflow.
SDNodeFlags Flags;
if (int64_t(Offset) >= 0 && cast<GEPOperator>(I).isInBounds())
Flags.setNoUnsignedWrap(true);
N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N,
DAG.getConstant(Offset, dl, N.getValueType()), Flags);
}
} else {
// Non-struct index: scale by the alloc size of the indexed type, computed
// at the index width of this address space.
unsigned IdxSize = DAG.getDataLayout().getIndexSizeInBits(AS);
MVT IdxTy = MVT::getIntegerVT(IdxSize);
APInt ElementSize(IdxSize, DL->getTypeAllocSize(GTI.getIndexedType()));
// If this is a scalar constant or a splat vector of constants,
// handle it quickly.
const auto *CI = dyn_cast<ConstantInt>(Idx);
if (!CI && isa<ConstantDataVector>(Idx) &&
cast<ConstantDataVector>(Idx)->getSplatValue())
CI = cast<ConstantInt>(cast<ConstantDataVector>(Idx)->getSplatValue());
if (CI) {
// A zero index contributes no offset.
if (CI->isZero())
continue;
// Fold ElementSize * index into a single constant at index width.
APInt Offs = ElementSize * CI->getValue().sextOrTrunc(IdxSize);
LLVMContext &Context = *DAG.getContext();
SDValue OffsVal = VectorWidth ?
DAG.getConstant(Offs, dl, EVT::getVectorVT(Context, IdxTy, VectorWidth)) :
DAG.getConstant(Offs, dl, IdxTy);
// In an inbounds GEP with an offset that is nonnegative even when
// interpreted as signed, assume there is no unsigned overflow.
SDNodeFlags Flags;
if (Offs.isNonNegative() && cast<GEPOperator>(I).isInBounds())
Flags.setNoUnsignedWrap(true);
OffsVal = DAG.getSExtOrTrunc(OffsVal, dl, N.getValueType());
N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N, OffsVal, Flags);
continue;
}
// N = N + Idx * ElementSize;
SDValue IdxN = getValue(Idx);
// Splat a scalar index when this is a vector GEP.
if (!IdxN.getValueType().isVector() && VectorWidth) {
EVT VT = EVT::getVectorVT(*Context, IdxN.getValueType(), VectorWidth);
IdxN = DAG.getSplatBuildVector(VT, dl, IdxN);
}
// If the index is smaller or larger than intptr_t, truncate or extend
// it.
IdxN = DAG.getSExtOrTrunc(IdxN, dl, N.getValueType());
// If this is a multiply by a power of two, turn it into a shl
// immediately. This is a very common case.
if (ElementSize != 1) {
if (ElementSize.isPowerOf2()) {
unsigned Amt = ElementSize.logBase2();
IdxN = DAG.getNode(ISD::SHL, dl,
N.getValueType(), IdxN,
DAG.getConstant(Amt, dl, IdxN.getValueType()));
} else {
SDValue Scale = DAG.getConstant(ElementSize.getZExtValue(), dl,
IdxN.getValueType());
IdxN = DAG.getNode(ISD::MUL, dl,
N.getValueType(), IdxN, Scale);
}
}
N = DAG.getNode(ISD::ADD, dl,
N.getValueType(), N, IdxN);
}
}
// When the in-memory pointer type differs from the pointer value type and the
// GEP is not inbounds, pass the result through getPtrExtendInReg with the
// in-memory pointer type.
if (PtrMemTy != PtrTy && !cast<GEPOperator>(I).isInBounds())
N = DAG.getPtrExtendInReg(N, dl, PtrMemTy);
setValue(&I, N);
}
void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) {
// If this is a fixed sized alloca in the entry block of the function,
// allocate it statically on the stack.
if (FuncInfo.StaticAllocaMap.count(&I))
return; // getValue will auto-populate this.
SDLoc dl = getCurSDLoc();
Type *Ty = I.getAllocatedType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
auto &DL = DAG.getDataLayout();
uint64_t TySize = DL.getTypeAllocSize(Ty);
unsigned Align =
std::max((unsigned)DL.getPrefTypeAlignment(Ty), I.getAlignment());
SDValue AllocSize = getValue(I.getArraySize());
EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout(), DL.getAllocaAddrSpace());
if (AllocSize.getValueType() != IntPtr)
AllocSize = DAG.getZExtOrTrunc(AllocSize, dl, IntPtr);
AllocSize = DAG.getNode(ISD::MUL, dl, IntPtr,
AllocSize,
DAG.getConstant(TySize, dl, IntPtr));
// Handle alignment. If the requested alignment is less than or equal to
// the stack alignment, ignore it. If the size is greater than or equal to
// the stack alignment, we note this in the DYNAMIC_STACKALLOC node.
unsigned StackAlign =
DAG.getSubtarget().getFrameLowering()->getStackAlignment();
if (Align <= StackAlign)
Align = 0;
// Round the size of the allocation up to the stack alignment size
// by add SA-1 to the size. This doesn't overflow because we're computing
// an address inside an alloca.
SDNodeFlags Flags;
Flags.setNoUnsignedWrap(true);
AllocSize = DAG.getNode(ISD::ADD, dl, AllocSize.getValueType(), AllocSize,
DAG.getConstant(StackAlign - 1, dl, IntPtr), Flags);
// Mask out the low bits for alignment purposes.
AllocSize =
DAG.getNode(ISD::AND, dl, AllocSize.getValueType(), AllocSize,
DAG.getConstant(~(uint64_t)(StackAlign - 1), dl, IntPtr));
SDValue Ops[] = {getRoot(), AllocSize, DAG.getConstant(Align, dl, IntPtr)};
SDVTList VTs = DAG.getVTList(AllocSize.getValueType(), MVT::Other);
SDValue DSA = DAG.getNode(ISD::DYNAMIC_STACKALLOC, dl, VTs, Ops);
setValue(&I, DSA);
DAG.setRoot(DSA.getValue(1));
assert(FuncInfo.MF->getFrameInfo().hasVarSizedObjects());
}
// Lower a non-atomic load. Atomic loads and loads from swifterror values are
// dispatched to their own visitors. An aggregate load is split into one load
// per value computed by ComputeValueVTs; the per-value results are combined
// with MERGE_VALUES and the load chains are combined with TokenFactor nodes,
// at most MaxParallelChains at a time.
void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
if (I.isAtomic())
return visitAtomicLoad(I);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
const Value *SV = I.getOperand(0);
if (TLI.supportSwiftError()) {
// Swifterror values can come from either a function parameter with
// swifterror attribute or an alloca with swifterror attribute.
if (const Argument *Arg = dyn_cast<Argument>(SV)) {
if (Arg->hasSwiftErrorAttr())
return visitLoadFromSwiftError(I);
}
if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(SV)) {
if (Alloca->isSwiftError())
return visitLoadFromSwiftError(I);
}
}
SDValue Ptr = getValue(SV);
Type *Ty = I.getType();
// Collect the properties of the IR load that are turned into memory-operand
// flags or passed through to every emitted load below.
bool isVolatile = I.isVolatile();
bool isNonTemporal = I.getMetadata(LLVMContext::MD_nontemporal) != nullptr;
bool isInvariant = I.getMetadata(LLVMContext::MD_invariant_load) != nullptr;
bool isDereferenceable =
isDereferenceablePointer(SV, I.getType(), DAG.getDataLayout());
unsigned Alignment = I.getAlignment();
AAMDNodes AAInfo;
I.getAAMetadata(AAInfo);
const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range);
// One entry per value of the loaded type: its value type, its in-memory
// type, and its offset from the base pointer.
SmallVector<EVT, 4> ValueVTs, MemVTs;
SmallVector<uint64_t, 4> Offsets;
ComputeValueVTs(TLI, DAG.getDataLayout(), Ty, ValueVTs, &MemVTs, &Offsets);
unsigned NumValues = ValueVTs.size();
// A type with no values produces nothing to load.
if (NumValues == 0)
return;
// Choose the chain the loads hang off.
SDValue Root;
bool ConstantMemory = false;
if (isVolatile || NumValues > MaxParallelChains)
// Serialize volatile loads with other side effects.
Root = getRoot();
else if (AA &&
AA->pointsToConstantMemory(MemoryLocation(
SV,
LocationSize::precise(DAG.getDataLayout().getTypeStoreSize(Ty)),
AAInfo))) {
// Do not serialize (non-volatile) loads of constant memory with anything.
Root = DAG.getEntryNode();
ConstantMemory = true;
} else {
// Do not serialize non-volatile loads against each other.
Root = DAG.getRoot();
}
SDLoc dl = getCurSDLoc();
if (isVolatile)
Root = TLI.prepareVolatileOrAtomicLoad(Root, dl, DAG);
// An aggregate load cannot wrap around the address space, so offsets to its
// parts don't wrap either.
SDNodeFlags Flags;
Flags.setNoUnsignedWrap(true);
SmallVector<SDValue, 4> Values(NumValues);
SmallVector<SDValue, 4> Chains(std::min(MaxParallelChains, NumValues));
EVT PtrVT = Ptr.getValueType();
// ChainI indexes into Chains and is reset each time a batch of
// MaxParallelChains chains is folded into a TokenFactor.
unsigned ChainI = 0;
for (unsigned i = 0; i != NumValues; ++i, ++ChainI) {
// Serializing loads here may result in excessive register pressure, and
// TokenFactor places arbitrary choke points on the scheduler. SD scheduling
// could recover a bit by hoisting nodes upward in the chain by recognizing
// they are side-effect free or do not alias. The optimizer should really
// avoid this case by converting large object/array copies to llvm.memcpy
// (MaxParallelChains should always remain as failsafe).
if (ChainI == MaxParallelChains) {
assert(PendingLoads.empty() && "PendingLoads must be serialized first");
SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
makeArrayRef(Chains.data(), ChainI));
Root = Chain;
ChainI = 0;
}
// Address of the i-th value: base pointer plus its offset.
SDValue A = DAG.getNode(ISD::ADD, dl,
PtrVT, Ptr,
DAG.getConstant(Offsets[i], dl, PtrVT),
Flags);
auto MMOFlags = MachineMemOperand::MONone;
if (isVolatile)
MMOFlags |= MachineMemOperand::MOVolatile;
if (isNonTemporal)
MMOFlags |= MachineMemOperand::MONonTemporal;
if (isInvariant)
MMOFlags |= MachineMemOperand::MOInvariant;
if (isDereferenceable)
MMOFlags |= MachineMemOperand::MODereferenceable;
MMOFlags |= TLI.getMMOFlags(I);
SDValue L = DAG.getLoad(MemVTs[i], dl, Root, A,
MachinePointerInfo(SV, Offsets[i]), Alignment,
MMOFlags, AAInfo, Ranges);
// Result 1 of the load is recorded as its chain.
Chains[ChainI] = L.getValue(1);
// Convert from the in-memory type to the value type when they differ.
if (MemVTs[i] != ValueVTs[i])
L = DAG.getZExtOrTrunc(L, dl, ValueVTs[i]);
Values[i] = L;
}
// Loads of constant memory are not chained to anything. Otherwise the
// remaining chains are merged; a volatile load updates the root immediately,
// while other loads are queued in PendingLoads.
if (!ConstantMemory) {
SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
makeArrayRef(Chains.data(), ChainI));
if (isVolatile)
DAG.setRoot(Chain);
else
PendingLoads.push_back(Chain);
}
setValue(&I, DAG.getNode(ISD::MERGE_VALUES, dl,
DAG.getVTList(ValueVTs), Values));
}
void SelectionDAGBuilder::visitStoreToSwiftError(const StoreInst &I) {
assert(DAG.getTargetLoweringInfo().supportSwiftError() &&
"call visitStoreToSwiftError when backend supports swifterror");
SmallVector<EVT, 4> ValueVTs;
SmallVector<uint64_t, 4> Offsets;
const Value *SrcV = I.getOperand(0);
ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(),
SrcV->getType(), ValueVTs, &Offsets);
assert(ValueVTs.size() == 1 && Offsets[0] == 0 &&
"expect a single EVT for swifterror");
SDValue Src = getValue(SrcV);
// Create a virtual register, then update the virtual register.
unsigned VReg =
SwiftError.getOrCreateVRegDefAt(&I, FuncInfo.MBB, I.getPointerOperand());
// Chain, DL, Reg, N or Chain, DL, Reg, N, Glue
// Chain can be getRoot or getControlRoot.
SDValue CopyNode = DAG.getCopyToReg(getRoot(), getCurSDLoc(), VReg,
SDValue(Src.getNode(), Src.getResNo()));
DAG.setRoot(CopyNode);
}
void SelectionDAGBuilder::visitLoadFromSwiftError(const LoadInst &I) {
assert(DAG.getTargetLoweringInfo().supportSwiftError() &&
"call visitLoadFromSwiftError when backend supports swifterror");
assert(!I.isVolatile() &&
I.getMetadata(LLVMContext::MD_nontemporal) == nullptr &&
I.getMetadata(LLVMContext::MD_invariant_load) == nullptr &&
"Support volatile, non temporal, invariant for load_from_swift_error");
const Value *SV = I.getOperand(0);
Type *Ty = I.getType();
AAMDNodes AAInfo;
I.getAAMetadata(AAInfo);
assert(
(!AA ||
!AA->pointsToConstantMemory(MemoryLocation(
SV, LocationSize::precise(DAG.getDataLayout().getTypeStoreSize(Ty)),
AAInfo))) &&
"load_from_swift_error should not be constant memory");
SmallVector<EVT, 4> ValueVTs;
SmallVector<uint64_t, 4> Offsets;
ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), Ty,
ValueVTs, &Offsets);
assert(ValueVTs.size() == 1 && Offsets[0] == 0 &&
"expect a single EVT for swifterror");
// Chain, DL, Reg, VT, Glue or Chain, DL, Reg, VT
SDValue L = DAG.getCopyFromReg(
getRoot(), getCurSDLoc(),
SwiftError.getOrCreateVRegUseAt(&I, FuncInfo.MBB, SV), ValueVTs[0]);
setValue(&I, L);
}
// Lower a non-atomic store. Atomic stores and stores to swifterror values are
// dispatched to their own visitors. An aggregate store is split into one
// store per value computed by ComputeValueVTs; the store chains are combined
// with TokenFactor nodes, at most MaxParallelChains at a time, and the final
// TokenFactor becomes the new DAG root.
void SelectionDAGBuilder::visitStore(const StoreInst &I) {
if (I.isAtomic())
return visitAtomicStore(I);
// Operand 0 is the value being stored, operand 1 the destination pointer.
const Value *SrcV = I.getOperand(0);
const Value *PtrV = I.getOperand(1);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.supportSwiftError()) {
// Swifterror values can come from either a function parameter with
// swifterror attribute or an alloca with swifterror attribute.
if (const Argument *Arg = dyn_cast<Argument>(PtrV)) {
if (Arg->hasSwiftErrorAttr())
return visitStoreToSwiftError(I);
}
if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrV)) {
if (Alloca->isSwiftError())
return visitStoreToSwiftError(I);
}
}
// One entry per value of the stored type: its value type, its in-memory
// type, and its offset from the base pointer.
SmallVector<EVT, 4> ValueVTs, MemVTs;
SmallVector<uint64_t, 4> Offsets;
ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(),
SrcV->getType(), ValueVTs, &MemVTs, &Offsets);
unsigned NumValues = ValueVTs.size();
if (NumValues == 0)
return;
// Get the lowered operands. Note that we do this after
// checking if NumValues is zero, because with zero values
// the operands won't have values in the map.
SDValue Src = getValue(SrcV);
SDValue Ptr = getValue(PtrV);
SDValue Root = getRoot();
SmallVector<SDValue, 4> Chains(std::min(MaxParallelChains, NumValues));
SDLoc dl = getCurSDLoc();
EVT PtrVT = Ptr.getValueType();
unsigned Alignment = I.getAlignment();
AAMDNodes AAInfo;
I.getAAMetadata(AAInfo);
// Memory-operand flags shared by every emitted store.
auto MMOFlags = MachineMemOperand::MONone;
if (I.isVolatile())
MMOFlags |= MachineMemOperand::MOVolatile;
if (I.getMetadata(LLVMContext::MD_nontemporal) != nullptr)
MMOFlags |= MachineMemOperand::MONonTemporal;
MMOFlags |= TLI.getMMOFlags(I);
// An aggregate store cannot wrap around the address space, so offsets to its
// parts don't wrap either.
SDNodeFlags Flags;
Flags.setNoUnsignedWrap(true);
// ChainI indexes into Chains and is reset each time a batch of
// MaxParallelChains chains is folded into a TokenFactor.
unsigned ChainI = 0;
for (unsigned i = 0; i != NumValues; ++i, ++ChainI) {
// See visitLoad comments.
if (ChainI == MaxParallelChains) {
SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
makeArrayRef(Chains.data(), ChainI));
Root = Chain;
ChainI = 0;
}
// Address of the i-th value: base pointer plus its offset.
SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, Ptr,
DAG.getConstant(Offsets[i], dl, PtrVT), Flags);
SDValue Val = SDValue(Src.getNode(), Src.getResNo() + i);
// Convert from the value type to the in-memory type when they differ.
if (MemVTs[i] != ValueVTs[i])
Val = DAG.getPtrExtOrTrunc(Val, dl, MemVTs[i]);
SDValue St =
DAG.getStore(Root, dl, Val, Add, MachinePointerInfo(PtrV, Offsets[i]),
Alignment, MMOFlags, AAInfo);
Chains[ChainI] = St;
}
// Merge the outstanding store chains and make the result the new root.
SDValue StoreNode = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
makeArrayRef(Chains.data(), ChainI));
DAG.setRoot(StoreNode);
}
void SelectionDAGBuilder::visitMaskedStore(const CallInst &I,
bool IsCompressing) {
SDLoc sdl = getCurSDLoc();
auto getMaskedStoreOps = [&](Value* &Ptr, Value* &Mask, Value* &Src0,
unsigned& Alignment) {
// llvm.masked.store.*(Src0, Ptr, alignment, Mask)
Src0 = I.getArgOperand(0);
Ptr = I.getArgOperand(1);
Alignment = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
Mask = I.getArgOperand(3);
};
auto getCompressingStoreOps = [&](Value* &Ptr, Value* &Mask, Value* &Src0,
unsigned& Alignment) {
// llvm.masked.compressstore.*(Src0, Ptr, Mask)
Src0 = I.getArgOperand(0);
Ptr = I.getArgOperand(1);
Mask = I.getArgOperand(2);
Alignment = 0;
};
Value *PtrOperand, *MaskOperand, *Src0Operand;
unsigned Alignment;
if (IsCompressing)
getCompressingStoreOps(PtrOperand, MaskOperand, Src0Operand, Alignment);
else
getMaskedStoreOps(PtrOperand, MaskOperand, Src0Operand, Alignment);
SDValue Ptr = getValue(PtrOperand);
SDValue Src0 = getValue(Src0Operand);
SDValue Mask = getValue(MaskOperand);
EVT VT = Src0.getValueType();
if (!Alignment)
Alignment = DAG.getEVTAlignment(VT);
AAMDNodes AAInfo;
I.getAAMetadata(AAInfo);
MachineMemOperand *MMO =
DAG.getMachineFunction().
getMachineMemOperand(MachinePointerInfo(PtrOperand),
MachineMemOperand::MOStore, VT.getStoreSize(),
Alignment, AAInfo);
SDValue StoreNode = DAG.getMaskedStore(getRoot(), sdl, Src0, Ptr, Mask, VT,
MMO, false /* Truncating */,
IsCompressing);
DAG.setRoot(StoreNode);
setValue(&I, StoreNode);
}
// Get a uniform base for the Gather/Scatter intrinsic.
// The first argument of the Gather/Scatter intrinsic is a vector of pointers.
// We try to represent it as a base pointer + vector of indices.
// Usually, the vector of pointers comes from a 'getelementptr' instruction.
// The first operand of the GEP may be a single pointer or a vector of pointers
// Example:
// %gep.ptr = getelementptr i32, <8 x i32*> %vptr, <8 x i32> %ind
// or
// %gep.ptr = getelementptr i32, i32* %ptr, <8 x i32> %ind
// %res = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.ptr, ..
//
// When the first GEP operand is a single pointer - it is the uniform base we
// are looking for. If first operand of the GEP is a splat vector - we
// extract the splat value and use it as a uniform base.
// In all other cases the function returns 'false'.
static bool getUniformBase(const Value* &Ptr, SDValue& Base, SDValue& Index,
SDValue &Scale, SelectionDAGBuilder* SDB) {
SelectionDAG& DAG = SDB->DAG;
LLVMContext &Context = *DAG.getContext();
assert(Ptr->getType()->isVectorTy() && "Uexpected pointer type");
const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
if (!GEP)
return false;
const Value *GEPPtr = GEP->getPointerOperand();
if (!GEPPtr->getType()->isVectorTy())
Ptr = GEPPtr;
else if (!(Ptr = getSplatValue(GEPPtr)))
return false;
unsigned FinalIndex = GEP->getNumOperands() - 1;
Value *IndexVal = GEP->getOperand(FinalIndex);
// Ensure all the other indices are 0.
for (unsigned i = 1; i < FinalIndex; ++i) {
auto *C = dyn_cast<ConstantInt>(GEP->getOperand(i));
if (!C || !C->isZero())
return false;
}
// The operands of the GEP may be defined in another basic block.
// In this case we'll not find nodes for the operands.
if (!SDB->findValue(Ptr) || !SDB->findValue(IndexVal))
return false;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
const DataLayout &DL = DAG.getDataLayout();
Scale = DAG.getTargetConstant(DL.getTypeAllocSize(GEP->getResultElementType()),
SDB->getCurSDLoc(), TLI.getPointerTy(DL));
Base = SDB->getValue(Ptr);
Index = SDB->getValue(IndexVal);
if (!Index.getValueType().isVector()) {
unsigned GEPWidth = GEP->getType()->getVectorNumElements();
EVT VT = EVT::getVectorVT(Context, Index.getValueType(), GEPWidth);
Index = DAG.getSplatBuildVector(VT, SDLoc(Index), Index);
}
return true;
}
void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) {
SDLoc sdl = getCurSDLoc();
// llvm.masked.scatter.*(Src0, Ptrs, alignemt, Mask)
const Value *Ptr = I.getArgOperand(1);
SDValue Src0 = getValue(I.getArgOperand(0));
SDValue Mask = getValue(I.getArgOperand(3));
EVT VT = Src0.getValueType();
unsigned Alignment = (cast<ConstantInt>(I.getArgOperand(2)))->getZExtValue();
if (!Alignment)
Alignment = DAG.getEVTAlignment(VT);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
AAMDNodes AAInfo;
I.getAAMetadata(AAInfo);
SDValue Base;
SDValue Index;
SDValue Scale;
const Value *BasePtr = Ptr;
bool UniformBase = getUniformBase(BasePtr, Base, Index, Scale, this);
const Value *MemOpBasePtr = UniformBase ? BasePtr : nullptr;
MachineMemOperand *MMO = DAG.getMachineFunction().
getMachineMemOperand(MachinePointerInfo(MemOpBasePtr),
MachineMemOperand::MOStore, VT.getStoreSize(),
Alignment, AAInfo);
if (!UniformBase) {
Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
Index = getValue(Ptr);
Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
}
SDValue Ops[] = { getRoot(), Src0, Mask, Base, Index, Scale };
SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), VT, sdl,
Ops, MMO);
DAG.setRoot(Scatter);
setValue(&I, Scatter);
}
void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) {
SDLoc sdl = getCurSDLoc();
auto getMaskedLoadOps = [&](Value* &Ptr, Value* &Mask, Value* &Src0,
unsigned& Alignment) {
// @llvm.masked.load.*(Ptr, alignment, Mask, Src0)
Ptr = I.getArgOperand(0);
Alignment = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
Mask = I.getArgOperand(2);
Src0 = I.getArgOperand(3);
};
auto getExpandingLoadOps = [&](Value* &Ptr, Value* &Mask, Value* &Src0,
unsigned& Alignment) {
// @llvm.masked.expandload.*(Ptr, Mask, Src0)
Ptr = I.getArgOperand(0);
Alignment = 0;
Mask = I.getArgOperand(1);
Src0 = I.getArgOperand(2);
};
Value *PtrOperand, *MaskOperand, *Src0Operand;
unsigned Alignment;
if (IsExpanding)
getExpandingLoadOps(PtrOperand, MaskOperand, Src0Operand, Alignment);
else
getMaskedLoadOps(PtrOperand, MaskOperand, Src0Operand, Alignment);
SDValue Ptr = getValue(PtrOperand);
SDValue Src0 = getValue(Src0Operand);
SDValue Mask = getValue(MaskOperand);
EVT VT = Src0.getValueType();
if (!Alignment)
Alignment = DAG.getEVTAlignment(VT);
AAMDNodes AAInfo;
I.getAAMetadata(AAInfo);
const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range);
// Do not serialize masked loads of constant memory with anything.
bool AddToChain =
!AA || !AA->pointsToConstantMemory(MemoryLocation(
PtrOperand,
LocationSize::precise(
DAG.getDataLayout().getTypeStoreSize(I.getType())),
AAInfo));
SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode();
MachineMemOperand *MMO =
DAG.getMachineFunction().
getMachineMemOperand(MachinePointerInfo(PtrOperand),
MachineMemOperand::MOLoad, VT.getStoreSize(),
Alignment, AAInfo, Ranges);
SDValue Load = DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Mask, Src0, VT, MMO,
ISD::NON_EXTLOAD, IsExpanding);
if (AddToChain)
PendingLoads.push_back(Load.getValue(1));
setValue(&I, Load);
}
void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
SDLoc sdl = getCurSDLoc();
// @llvm.masked.gather.*(Ptrs, alignment, Mask, Src0)
const Value *Ptr = I.getArgOperand(0);
SDValue Src0 = getValue(I.getArgOperand(3));
SDValue Mask = getValue(I.getArgOperand(2));
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
unsigned Alignment = (cast<ConstantInt>(I.getArgOperand(1)))->getZExtValue();
if (!Alignment)
Alignment = DAG.getEVTAlignment(VT);
AAMDNodes AAInfo;
I.getAAMetadata(AAInfo);
const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range);
SDValue Root = DAG.getRoot();
SDValue Base;
SDValue Index;
SDValue Scale;
const Value *BasePtr = Ptr;
bool UniformBase = getUniformBase(BasePtr, Base, Index, Scale, this);
bool ConstantMemory = false;
if (UniformBase && AA &&
AA->pointsToConstantMemory(
MemoryLocation(BasePtr,
LocationSize::precise(
DAG.getDataLayout().getTypeStoreSize(I.getType())),
AAInfo))) {
// Do not serialize (non-volatile) loads of constant memory with anything.
Root = DAG.getEntryNode();
ConstantMemory = true;
}
MachineMemOperand *MMO =
DAG.getMachineFunction().
getMachineMemOperand(MachinePointerInfo(UniformBase ? BasePtr : nullptr),
MachineMemOperand::MOLoad, VT.getStoreSize(),
Alignment, AAInfo, Ranges);
if (!UniformBase) {
Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
Index = getValue(Ptr);
Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
}
SDValue Ops[] = { Root, Src0, Mask, Base, Index, Scale };
SDValue Gather = DAG.getMaskedGather(DAG.getVTList(VT, MVT::Other), VT, sdl,
Ops, MMO);
SDValue OutChain = Gather.getValue(1);
if (!ConstantMemory)
PendingLoads.push_back(OutChain);
setValue(&I, Gather);
}
void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) {
SDLoc dl = getCurSDLoc();
AtomicOrdering SuccessOrdering = I.getSuccessOrdering();
AtomicOrdering FailureOrdering = I.getFailureOrdering();
SyncScope::ID SSID = I.getSyncScopeID();
SDValue InChain = getRoot();
MVT MemVT = getValue(I.getCompareOperand()).getSimpleValueType();
SDVTList VTs = DAG.getVTList(MemVT, MVT::i1, MVT::Other);
auto Alignment = DAG.getEVTAlignment(MemVT);
auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
if (I.isVolatile())
Flags |= MachineMemOperand::MOVolatile;
Flags |= DAG.getTargetLoweringInfo().getMMOFlags(I);
MachineFunction &MF = DAG.getMachineFunction();
MachineMemOperand *MMO =
MF.getMachineMemOperand(MachinePointerInfo(I.getPointerOperand()),
Flags, MemVT.getStoreSize(), Alignment,
AAMDNodes(), nullptr, SSID, SuccessOrdering,
FailureOrdering);
SDValue L = DAG.getAtomicCmpSwap(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
dl, MemVT, VTs, InChain,
getValue(I.getPointerOperand()),
getValue(I.getCompareOperand()),
getValue(I.getNewValOperand()), MMO);
SDValue OutChain = L.getValue(2);
setValue(&I, L);
DAG.setRoot(OutChain);
}
void SelectionDAGBuilder::visitAtomicRMW(const AtomicRMWInst &I) {
SDLoc dl = getCurSDLoc();
ISD::NodeType NT;
switch (I.getOperation()) {
default: llvm_unreachable("Unknown atomicrmw operation");
case AtomicRMWInst::Xchg: NT = ISD::ATOMIC_SWAP; break;
case AtomicRMWInst::Add: NT = ISD::ATOMIC_LOAD_ADD; break;
case AtomicRMWInst::Sub: NT = ISD::ATOMIC_LOAD_SUB; break;
case AtomicRMWInst::And: NT = ISD::ATOMIC_LOAD_AND; break;
case AtomicRMWInst::Nand: NT = ISD::ATOMIC_LOAD_NAND; break;
case AtomicRMWInst::Or: NT = ISD::ATOMIC_LOAD_OR; break;
case AtomicRMWInst::Xor: NT = ISD::ATOMIC_LOAD_XOR; break;
case AtomicRMWInst::Max: NT = ISD::ATOMIC_LOAD_MAX; break;
case AtomicRMWInst::Min: NT = ISD::ATOMIC_LOAD_MIN; break;
case AtomicRMWInst::UMax: NT = ISD::ATOMIC_LOAD_UMAX; break;
case AtomicRMWInst::UMin: NT = ISD::ATOMIC_LOAD_UMIN; break;
case AtomicRMWInst::FAdd: NT = ISD::ATOMIC_LOAD_FADD; break;
case AtomicRMWInst::FSub: NT = ISD::ATOMIC_LOAD_FSUB; break;
}
AtomicOrdering Ordering = I.getOrdering();
SyncScope::ID SSID = I.getSyncScopeID();
SDValue InChain = getRoot();
auto MemVT = getValue(I.getValOperand()).getSimpleValueType();
auto Alignment = DAG.getEVTAlignment(MemVT);
auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
if (I.isVolatile())
Flags |= MachineMemOperand::MOVolatile;
Flags |= DAG.getTargetLoweringInfo().getMMOFlags(I);
MachineFunction &MF = DAG.getMachineFunction();
MachineMemOperand *MMO =
MF.getMachineMemOperand(MachinePointerInfo(I.getPointerOperand()), Flags,
MemVT.getStoreSize(), Alignment, AAMDNodes(),
nullptr, SSID, Ordering);
SDValue L =
DAG.getAtomic(NT, dl, MemVT, InChain,
getValue(I.getPointerOperand()), getValue(I.getValOperand()),
MMO);
SDValue OutChain = L.getValue(1);
setValue(&I, L);
DAG.setRoot(OutChain);
}
void SelectionDAGBuilder::visitFence(const FenceInst &I) {
SDLoc dl = getCurSDLoc();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue Ops[3];
Ops[0] = getRoot();
Ops[1] = DAG.getConstant((unsigned)I.getOrdering(), dl,
TLI.getFenceOperandTy(DAG.getDataLayout()));
Ops[2] = DAG.getConstant(I.getSyncScopeID(), dl,
TLI.getFenceOperandTy(DAG.getDataLayout()));
DAG.setRoot(DAG.getNode(ISD::ATOMIC_FENCE, dl, MVT::Other, Ops));
}
void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) {
SDLoc dl = getCurSDLoc();
AtomicOrdering Order = I.getOrdering();
SyncScope::ID SSID = I.getSyncScopeID();
SDValue InChain = getRoot();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
EVT MemVT = TLI.getMemValueType(DAG.getDataLayout(), I.getType());
if (!TLI.supportsUnalignedAtomics() &&
I.getAlignment() < MemVT.getSizeInBits() / 8)
report_fatal_error("Cannot generate unaligned atomic load");
auto Flags = MachineMemOperand::MOLoad;
if (I.isVolatile())
Flags |= MachineMemOperand::MOVolatile;
if (I.getMetadata(LLVMContext::MD_invariant_load) != nullptr)
Flags |= MachineMemOperand::MOInvariant;
if (isDereferenceablePointer(I.getPointerOperand(), I.getType(),
DAG.getDataLayout()))
Flags |= MachineMemOperand::MODereferenceable;
Flags |= TLI.getMMOFlags(I);
MachineMemOperand *MMO =
DAG.getMachineFunction().
getMachineMemOperand(MachinePointerInfo(I.getPointerOperand()),
Flags, MemVT.getStoreSize(),
I.getAlignment() ? I.getAlignment() :
DAG.getEVTAlignment(MemVT),
AAMDNodes(), nullptr, SSID, Order);
InChain = TLI.prepareVolatileOrAtomicLoad(InChain, dl, DAG);
SDValue L =
DAG.getAtomic(ISD::ATOMIC_LOAD, dl, MemVT, MemVT, InChain,
getValue(I.getPointerOperand()), MMO);
SDValue OutChain = L.getValue(1);
if (MemVT != VT)
L = DAG.getPtrExtOrTrunc(L, dl, VT);
setValue(&I, L);
DAG.setRoot(OutChain);
}
/// Lower an atomic store instruction into an ATOMIC_STORE node and make it
/// the new DAG root.
///
/// The stored value is converted to the target's memory type for the stored
/// IR type when the two differ. An under-aligned store is a fatal error
/// unless the target reports support for unaligned atomics, which matches the
/// check done for atomic loads.
void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) {
  SDLoc dl = getCurSDLoc();

  AtomicOrdering Ordering = I.getOrdering();
  SyncScope::ID SSID = I.getSyncScopeID();

  SDValue InChain = getRoot();

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT MemVT =
      TLI.getMemValueType(DAG.getDataLayout(), I.getValueOperand()->getType());

  // Only reject under-aligned stores on targets that cannot handle them.
  if (!TLI.supportsUnalignedAtomics() &&
      I.getAlignment() < MemVT.getSizeInBits() / 8)
    report_fatal_error("Cannot generate unaligned atomic store");

  auto Flags = MachineMemOperand::MOStore;
  if (I.isVolatile())
    Flags |= MachineMemOperand::MOVolatile;
  Flags |= TLI.getMMOFlags(I);

  // With the check above relaxed, a zero alignment can now reach this point
  // on targets with unaligned-atomic support. Fall back to the alignment the
  // DAG reports for the memory type, as the atomic load path does.
  unsigned Alignment =
      I.getAlignment() ? I.getAlignment() : DAG.getEVTAlignment(MemVT);

  MachineFunction &MF = DAG.getMachineFunction();
  MachineMemOperand *MMO =
      MF.getMachineMemOperand(MachinePointerInfo(I.getPointerOperand()), Flags,
                              MemVT.getStoreSize(), Alignment, AAMDNodes(),
                              nullptr, SSID, Ordering);

  // Convert the stored value to the memory type if the two differ.
  SDValue Val = getValue(I.getValueOperand());
  if (Val.getValueType() != MemVT)
    Val = DAG.getPtrExtOrTrunc(Val, dl, MemVT);

  SDValue OutChain = DAG.getAtomic(ISD::ATOMIC_STORE, dl, MemVT, InChain,
                                   getValue(I.getPointerOperand()), Val, MMO);

  DAG.setRoot(OutChain);
}
/// visitTargetIntrinsic - Lower a call of a target intrinsic to an INTRINSIC
/// node.
void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
unsigned Intrinsic) {
// Ignore the callsite's attributes. A specific call site may be marked with
// readnone, but the lowering code will expect the chain based on the
// definition.
const Function *F = I.getCalledFunction();
bool HasChain = !F->doesNotAccessMemory();
bool OnlyLoad = HasChain && F->onlyReadsMemory();
// Build the operand list.
SmallVector<SDValue, 8> Ops;
if (HasChain) { // If this intrinsic has side-effects, chainify it.
if (OnlyLoad) {
// We don't need to serialize loads against other loads.
Ops.push_back(DAG.getRoot());
} else {
Ops.push_back(getRoot());
}
}
// Info is set by getTgtMemInstrinsic
TargetLowering::IntrinsicInfo Info;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
bool IsTgtIntrinsic = TLI.getTgtMemIntrinsic(Info, I,
DAG.getMachineFunction(),
Intrinsic);
// Add the intrinsic ID as an integer operand if it's not a target intrinsic.
if (!IsTgtIntrinsic || Info.opc == ISD::INTRINSIC_VOID ||
Info.opc == ISD::INTRINSIC_W_CHAIN)
Ops.push_back(DAG.getTargetConstant(Intrinsic, getCurSDLoc(),
TLI.getPointerTy(DAG.getDataLayout())));
// Add all operands of the call to the operand list.
for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) {
SDValue Op = getValue(I.getArgOperand(i));
Ops.push_back(Op);
}
SmallVector<EVT, 4> ValueVTs;
ComputeValueVTs(TLI, DAG.getDataLayout(), I.getType(), ValueVTs);
if (HasChain)
ValueVTs.push_back(MVT::Other);
SDVTList VTs = DAG.getVTList(ValueVTs);
// Create the node.
SDValue Result;
if (IsTgtIntrinsic) {
// This is target intrinsic that touches memory
AAMDNodes AAInfo;
I.getAAMetadata(AAInfo);
Result =
DAG.getMemIntrinsicNode(Info.opc, getCurSDLoc(), VTs, Ops, Info.memVT,
MachinePointerInfo(Info.ptrVal, Info.offset),
Info.align, Info.flags, Info.size, AAInfo);
} else if (!HasChain) {
Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), VTs, Ops);
} else if (!I.getType()->isVoidTy()) {
Result = DAG.getNode(ISD::INTRINSIC_W_CHAIN, getCurSDLoc(), VTs, Ops);
} else {
Result = DAG.getNode(ISD::INTRINSIC_VOID, getCurSDLoc(), VTs, Ops);
}
if (HasChain) {
SDValue Chain = Result.getValue(Result.getNode()->getNumValues()-1);
if (OnlyLoad)
PendingLoads.push_back(Chain);
else
DAG.setRoot(Chain);
}
if (!I.getType()->isVoidTy()) {
if (VectorType *PTy = dyn_cast<VectorType>(I.getType())) {
EVT VT = TLI.getValueType(DAG.getDataLayout(), PTy);
Result = DAG.getNode(ISD::BITCAST, getCurSDLoc(), VT, Result);
} else
Result = lowerRangeToAssertZExt(DAG, I, Result);
setValue(&I, Result);
}
}
/// GetSignificand - Keep the low 23 bits of \p Op and OR them with
/// 0x3f800000, then reinterpret the result as an f32:
///
///   Op = (Op & 0x007fffff) | 0x3f800000;
///
/// where Op is the i32 bit pattern of a floating point value.
static SDValue GetSignificand(SelectionDAG &DAG, SDValue Op, const SDLoc &dl) {
  SDValue MantissaMask = DAG.getConstant(0x007fffff, dl, MVT::i32);
  SDValue MantissaBits = DAG.getNode(ISD::AND, dl, MVT::i32, Op, MantissaMask);

  SDValue ExponentBits = DAG.getConstant(0x3f800000, dl, MVT::i32);
  SDValue Combined =
      DAG.getNode(ISD::OR, dl, MVT::i32, MantissaBits, ExponentBits);

  return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Combined);
}
/// GetExponent - Extract the exponent field of \p Op, subtract 127 and
/// convert the result to f32:
///
///   (float)(int)(((Op & 0x7f800000) >> 23) - 127);
///
/// where Op is the i32 bit pattern of a floating point value. The shift
/// amount constant is created with the target's pointer type.
static SDValue GetExponent(SelectionDAG &DAG, SDValue Op,
                           const TargetLowering &TLI, const SDLoc &dl) {
  SDValue ExponentMask = DAG.getConstant(0x7f800000, dl, MVT::i32);
  SDValue Masked = DAG.getNode(ISD::AND, dl, MVT::i32, Op, ExponentMask);

  SDValue ShiftAmt =
      DAG.getConstant(23, dl, TLI.getPointerTy(DAG.getDataLayout()));
  SDValue Biased = DAG.getNode(ISD::SRL, dl, MVT::i32, Masked, ShiftAmt);

  SDValue Bias = DAG.getConstant(127, dl, MVT::i32);
  SDValue Unbiased = DAG.getNode(ISD::SUB, dl, MVT::i32, Biased, Bias);

  return DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Unbiased);
}
/// getF32Constant - Build an f32 constant node from \p Flt, which holds the
/// 32-bit IEEE single-precision bit pattern of the value.
static SDValue getF32Constant(SelectionDAG &DAG, unsigned Flt,
                              const SDLoc &dl) {
  const APInt Bits(32, Flt);
  const APFloat FltVal(APFloat::IEEEsingle(), Bits);
  return DAG.getConstantFP(FltVal, dl, MVT::f32);
}
/// Emit the limited-precision expansion of 2^t0 for an f32 operand.
///
/// t0 is split into an integer part (via FP_TO_SINT) and the remaining
/// fraction. 2^fraction is approximated by a polynomial whose degree is
/// selected by LimitFloatPrecision, and the integer part, shifted left by 23,
/// is then added to the bit pattern of that approximation in the integer
/// domain.
static SDValue getLimitedPrecisionExp2(SDValue t0, const SDLoc &dl,
                                       SelectionDAG &DAG) {
  // TODO: What fast-math-flags should be set on the floating-point nodes?

  // IntPart = (int32_t)t0;
  SDValue IntPart = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, t0);

  // Frac = t0 - (float)IntPart;
  SDValue IntAsFP = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, IntPart);
  SDValue Frac = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0, IntAsFP);

  // IntPart <<= 23;
  IntPart = DAG.getNode(
      ISD::SHL, dl, MVT::i32, IntPart,
      DAG.getConstant(23, dl, DAG.getTargetLoweringInfo().getPointerTy(
                                  DAG.getDataLayout())));

  // Building blocks for the Horner-style polynomial evaluation below. Bits is
  // always the f32 bit pattern of the coefficient.
  auto MulC = [&](SDValue V, unsigned Bits) {
    return DAG.getNode(ISD::FMUL, dl, MVT::f32, V,
                       getF32Constant(DAG, Bits, dl));
  };
  auto AddC = [&](SDValue V, unsigned Bits) {
    return DAG.getNode(ISD::FADD, dl, MVT::f32, V,
                       getF32Constant(DAG, Bits, dl));
  };
  auto MulX = [&](SDValue V) {
    return DAG.getNode(ISD::FMUL, dl, MVT::f32, V, Frac);
  };

  SDValue TwoToFrac;
  if (LimitFloatPrecision <= 6) {
    // For floating-point precision of 6:
    //
    //   TwoToFrac =
    //     0.997535578f +
    //       (0.735607626f + 0.252464424f * x) * x;
    //
    // error 0.0144103317, which is 6 bits
    SDValue P = MulC(Frac, 0x3e814304);
    P = AddC(P, 0x3f3c50c8);
    P = MulX(P);
    TwoToFrac = AddC(P, 0x3f7f5e7e);
  } else if (LimitFloatPrecision <= 12) {
    // For floating-point precision of 12:
    //
    //   TwoToFrac =
    //     0.999892986f +
    //       (0.696457318f +
    //         (0.224338339f + 0.792043434e-1f * x) * x) * x;
    //
    // error 0.000107046256, which is 13 to 14 bits
    SDValue P = MulC(Frac, 0x3da235e3);
    P = AddC(P, 0x3e65b8f3);
    P = MulX(P);
    P = AddC(P, 0x3f324b07);
    P = MulX(P);
    TwoToFrac = AddC(P, 0x3f7ff8fd);
  } else { // LimitFloatPrecision <= 18
    // For floating-point precision of 18:
    //
    //   TwoToFrac =
    //     0.999999982f +
    //       (0.693148872f +
    //         (0.240227044f +
    //           (0.554906021e-1f +
    //             (0.961591928e-2f +
    //               (0.136028312e-2f + 0.157059148e-3f * x) * x) * x) * x) *
    //          x) * x;
    //
    // error 2.47208000*10^(-7), which is better than 18 bits
    SDValue P = MulC(Frac, 0x3924b03e);
    P = AddC(P, 0x3ab24b87);
    P = MulX(P);
    P = AddC(P, 0x3c1d8c17);
    P = MulX(P);
    P = AddC(P, 0x3d634a1d);
    P = MulX(P);
    P = AddC(P, 0x3e75fe14);
    P = MulX(P);
    P = AddC(P, 0x3f317234);
    P = MulX(P);
    TwoToFrac = AddC(P, 0x3f800000);
  }

  // Add the exponent into the result in integer domain.
  SDValue FracBits = DAG.getNode(ISD::BITCAST, dl, MVT::i32, TwoToFrac);
  SDValue Sum = DAG.getNode(ISD::ADD, dl, MVT::i32, FracBits, IntPart);
  return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Sum);
}
/// expandExp - Lower an exp intrinsic. For f32 operands in limited-precision
/// mode (0 < LimitFloatPrecision <= 18) the operand is multiplied by log2(e)
/// and handed to the limited-precision exp2 expansion; otherwise a plain FEXP
/// node is emitted.
static SDValue expandExp(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
                         const TargetLowering &TLI) {
  const bool UseLimitedPrecision = Op.getValueType() == MVT::f32 &&
                                   LimitFloatPrecision > 0 &&
                                   LimitFloatPrecision <= 18;

  // No special expansion.
  if (!UseLimitedPrecision)
    return DAG.getNode(ISD::FEXP, dl, Op.getValueType(), Op);

  // Put the exponent in the right bit position for later addition to the
  // final result:
  //
  //   #define LOG2OFe 1.4426950f
  //   Scaled = Op * LOG2OFe
  //
  // TODO: What fast-math-flags should be set here?
  SDValue Log2OfE = getF32Constant(DAG, 0x3fb8aa3b, dl);
  SDValue Scaled = DAG.getNode(ISD::FMUL, dl, MVT::f32, Op, Log2OfE);
  return getLimitedPrecisionExp2(Scaled, dl, DAG);
}
/// expandLog - Lower a log intrinsic. For f32 operands in limited-precision
/// mode (0 < LimitFloatPrecision <= 18) the result is computed as
/// exponent * log(2) plus a polynomial approximation of the log of the
/// significand; otherwise a plain FLOG node is emitted.
static SDValue expandLog(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
                         const TargetLowering &TLI) {
  // TODO: What fast-math-flags should be set on the floating-point nodes?
  const bool UseLimitedPrecision = Op.getValueType() == MVT::f32 &&
                                   LimitFloatPrecision > 0 &&
                                   LimitFloatPrecision <= 18;

  // No special expansion.
  if (!UseLimitedPrecision)
    return DAG.getNode(ISD::FLOG, dl, Op.getValueType(), Op);

  // Building blocks for the polynomial evaluation. Bits is always the f32 bit
  // pattern of the constant operand.
  auto MulC = [&](SDValue V, unsigned Bits) {
    return DAG.getNode(ISD::FMUL, dl, MVT::f32, V,
                       getF32Constant(DAG, Bits, dl));
  };
  auto AddC = [&](SDValue V, unsigned Bits) {
    return DAG.getNode(ISD::FADD, dl, MVT::f32, V,
                       getF32Constant(DAG, Bits, dl));
  };
  auto SubC = [&](SDValue V, unsigned Bits) {
    return DAG.getNode(ISD::FSUB, dl, MVT::f32, V,
                       getF32Constant(DAG, Bits, dl));
  };

  SDValue OpBits = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);

  // Scale the exponent by log(2) [0.69314718f].
  SDValue Exp = GetExponent(DAG, OpBits, TLI, dl);
  SDValue LogOfExponent = MulC(Exp, 0x3f317218);

  // Get the significand and build it into a floating-point number with
  // exponent of 1.
  SDValue X = GetSignificand(DAG, OpBits, dl);
  auto MulX = [&](SDValue V) {
    return DAG.getNode(ISD::FMUL, dl, MVT::f32, V, X);
  };

  SDValue LogOfMantissa;
  if (LimitFloatPrecision <= 6) {
    // For floating-point precision of 6:
    //
    //   LogOfMantissa =
    //     -1.1609546f +
    //       (1.4034025f - 0.23903021f * x) * x;
    //
    // error 0.0034276066, which is better than 8 bits
    SDValue P = MulC(X, 0xbe74c456);
    P = AddC(P, 0x3fb3a2b1);
    P = MulX(P);
    LogOfMantissa = SubC(P, 0x3f949a29);
  } else if (LimitFloatPrecision <= 12) {
    // For floating-point precision of 12:
    //
    //   LogOfMantissa =
    //     -1.7417939f +
    //       (2.8212026f +
    //         (-1.4699568f +
    //           (0.44717955f - 0.56570851e-1f * x) * x) * x) * x;
    //
    // error 0.000061011436, which is 14 bits
    SDValue P = MulC(X, 0xbd67b6d6);
    P = AddC(P, 0x3ee4f4b8);
    P = MulX(P);
    P = SubC(P, 0x3fbc278b);
    P = MulX(P);
    P = AddC(P, 0x40348e95);
    P = MulX(P);
    LogOfMantissa = SubC(P, 0x3fdef31a);
  } else { // LimitFloatPrecision <= 18
    // For floating-point precision of 18:
    //
    //   LogOfMantissa =
    //     -2.1072184f +
    //       (4.2372794f +
    //         (-3.7029485f +
    //           (2.2781945f +
    //             (-0.87823314f +
    //               (0.19073739f - 0.17809712e-1f * x) * x) * x) * x) * x) * x;
    //
    // error 0.0000023660568, which is better than 18 bits
    SDValue P = MulC(X, 0xbc91e5ac);
    P = AddC(P, 0x3e4350aa);
    P = MulX(P);
    P = SubC(P, 0x3f60d3e3);
    P = MulX(P);
    P = AddC(P, 0x4011cdf0);
    P = MulX(P);
    P = SubC(P, 0x406cfd1c);
    P = MulX(P);
    P = AddC(P, 0x408797cb);
    P = MulX(P);
    LogOfMantissa = SubC(P, 0x4006dcab);
  }

  return DAG.getNode(ISD::FADD, dl, MVT::f32, LogOfExponent, LogOfMantissa);
}
/// expandLog2 - Lower a log2 intrinsic. For f32 operands in limited-precision
/// mode (0 < LimitFloatPrecision <= 18) the result is computed as the
/// exponent plus a polynomial approximation of log2 of the significand;
/// otherwise a plain FLOG2 node is emitted.
static SDValue expandLog2(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
                          const TargetLowering &TLI) {
  // TODO: What fast-math-flags should be set on the floating-point nodes?
  const bool UseLimitedPrecision = Op.getValueType() == MVT::f32 &&
                                   LimitFloatPrecision > 0 &&
                                   LimitFloatPrecision <= 18;

  // No special expansion.
  if (!UseLimitedPrecision)
    return DAG.getNode(ISD::FLOG2, dl, Op.getValueType(), Op);

  // Building blocks for the polynomial evaluation. Bits is always the f32 bit
  // pattern of the constant operand.
  auto MulC = [&](SDValue V, unsigned Bits) {
    return DAG.getNode(ISD::FMUL, dl, MVT::f32, V,
                       getF32Constant(DAG, Bits, dl));
  };
  auto AddC = [&](SDValue V, unsigned Bits) {
    return DAG.getNode(ISD::FADD, dl, MVT::f32, V,
                       getF32Constant(DAG, Bits, dl));
  };
  auto SubC = [&](SDValue V, unsigned Bits) {
    return DAG.getNode(ISD::FSUB, dl, MVT::f32, V,
                       getF32Constant(DAG, Bits, dl));
  };

  SDValue OpBits = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);

  // Get the exponent.
  SDValue LogOfExponent = GetExponent(DAG, OpBits, TLI, dl);

  // Get the significand and build it into a floating-point number with
  // exponent of 1.
  SDValue X = GetSignificand(DAG, OpBits, dl);
  auto MulX = [&](SDValue V) {
    return DAG.getNode(ISD::FMUL, dl, MVT::f32, V, X);
  };

  // Different possible minimax approximations of significand in
  // floating-point for various degrees of accuracy over [1,2].
  SDValue Log2ofMantissa;
  if (LimitFloatPrecision <= 6) {
    // For floating-point precision of 6:
    //
    //   Log2ofMantissa = -1.6749035f + (2.0246817f - .34484768f * x) * x;
    //
    // error 0.0049451742, which is more than 7 bits
    SDValue P = MulC(X, 0xbeb08fe0);
    P = AddC(P, 0x40019463);
    P = MulX(P);
    Log2ofMantissa = SubC(P, 0x3fd6633d);
  } else if (LimitFloatPrecision <= 12) {
    // For floating-point precision of 12:
    //
    //   Log2ofMantissa =
    //     -2.51285454f +
    //       (4.07009056f +
    //         (-2.12067489f +
    //           (.645142248f - 0.816157886e-1f * x) * x) * x) * x;
    //
    // error 0.0000876136000, which is better than 13 bits
    SDValue P = MulC(X, 0xbda7262e);
    P = AddC(P, 0x3f25280b);
    P = MulX(P);
    P = SubC(P, 0x4007b923);
    P = MulX(P);
    P = AddC(P, 0x40823e2f);
    P = MulX(P);
    Log2ofMantissa = SubC(P, 0x4020d29c);
  } else { // LimitFloatPrecision <= 18
    // For floating-point precision of 18:
    //
    //   Log2ofMantissa =
    //     -3.0400495f +
    //       (6.1129976f +
    //         (-5.3420409f +
    //           (3.2865683f +
    //             (-1.2669343f +
    //               (0.27515199f -
    //                 0.25691327e-1f * x) * x) * x) * x) * x) * x;
    //
    // error 0.0000018516, which is better than 18 bits
    SDValue P = MulC(X, 0xbcd2769e);
    P = AddC(P, 0x3e8ce0b9);
    P = MulX(P);
    P = SubC(P, 0x3fa22ae7);
    P = MulX(P);
    P = AddC(P, 0x40525723);
    P = MulX(P);
    P = SubC(P, 0x40aaf200);
    P = MulX(P);
    P = AddC(P, 0x40c39dad);
    P = MulX(P);
    Log2ofMantissa = SubC(P, 0x4042902c);
  }

  return DAG.getNode(ISD::FADD, dl, MVT::f32, LogOfExponent, Log2ofMantissa);
}
/// expandLog10 - Lower a log10 intrinsic. For f32 operands in
/// limited-precision mode (0 < LimitFloatPrecision <= 18) the result is
/// computed as exponent * log10(2) plus a polynomial approximation of log10 of
/// the significand; otherwise a plain FLOG10 node is emitted.
static SDValue expandLog10(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
                           const TargetLowering &TLI) {
  // TODO: What fast-math-flags should be set on the floating-point nodes?
  const bool UseLimitedPrecision = Op.getValueType() == MVT::f32 &&
                                   LimitFloatPrecision > 0 &&
                                   LimitFloatPrecision <= 18;

  // No special expansion.
  if (!UseLimitedPrecision)
    return DAG.getNode(ISD::FLOG10, dl, Op.getValueType(), Op);

  // Building blocks for the polynomial evaluation. Bits is always the f32 bit
  // pattern of the constant operand.
  auto MulC = [&](SDValue V, unsigned Bits) {
    return DAG.getNode(ISD::FMUL, dl, MVT::f32, V,
                       getF32Constant(DAG, Bits, dl));
  };
  auto AddC = [&](SDValue V, unsigned Bits) {
    return DAG.getNode(ISD::FADD, dl, MVT::f32, V,
                       getF32Constant(DAG, Bits, dl));
  };
  auto SubC = [&](SDValue V, unsigned Bits) {
    return DAG.getNode(ISD::FSUB, dl, MVT::f32, V,
                       getF32Constant(DAG, Bits, dl));
  };

  SDValue OpBits = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);

  // Scale the exponent by log10(2) [0.30102999f].
  SDValue Exp = GetExponent(DAG, OpBits, TLI, dl);
  SDValue LogOfExponent = MulC(Exp, 0x3e9a209a);

  // Get the significand and build it into a floating-point number with
  // exponent of 1.
  SDValue X = GetSignificand(DAG, OpBits, dl);
  auto MulX = [&](SDValue V) {
    return DAG.getNode(ISD::FMUL, dl, MVT::f32, V, X);
  };

  SDValue Log10ofMantissa;
  if (LimitFloatPrecision <= 6) {
    // For floating-point precision of 6:
    //
    //   Log10ofMantissa =
    //     -0.50419619f +
    //       (0.60948995f - 0.10380950f * x) * x;
    //
    // error 0.0014886165, which is 6 bits
    SDValue P = MulC(X, 0xbdd49a13);
    P = AddC(P, 0x3f1c0789);
    P = MulX(P);
    Log10ofMantissa = SubC(P, 0x3f011300);
  } else if (LimitFloatPrecision <= 12) {
    // For floating-point precision of 12:
    //
    //   Log10ofMantissa =
    //     -0.64831180f +
    //       (0.91751397f +
    //         (-0.31664806f + 0.47637168e-1f * x) * x) * x;
    //
    // error 0.00019228036, which is better than 12 bits
    SDValue P = MulC(X, 0x3d431f31);
    P = SubC(P, 0x3ea21fb2);
    P = MulX(P);
    P = AddC(P, 0x3f6ae232);
    P = MulX(P);
    Log10ofMantissa = SubC(P, 0x3f25f7c3);
  } else { // LimitFloatPrecision <= 18
    // For floating-point precision of 18:
    //
    //   Log10ofMantissa =
    //     -0.84299375f +
    //       (1.5327582f +
    //         (-1.0688956f +
    //           (0.49102474f +
    //             (-0.12539807f + 0.13508273e-1f * x) * x) * x) * x) * x;
    //
    // error 0.0000037995730, which is better than 18 bits
    SDValue P = MulC(X, 0x3c5d51ce);
    P = SubC(P, 0x3e00685a);
    P = MulX(P);
    P = AddC(P, 0x3efb6798);
    P = MulX(P);
    P = SubC(P, 0x3f88d192);
    P = MulX(P);
    P = AddC(P, 0x3fc4316c);
    P = MulX(P);
    Log10ofMantissa = SubC(P, 0x3f57ce70);
  }

  return DAG.getNode(ISD::FADD, dl, MVT::f32, LogOfExponent, Log10ofMantissa);
}
/// expandExp2 - Lower an exp2 intrinsic. f32 operands in limited-precision
/// mode (0 < LimitFloatPrecision <= 18) use the limited-precision expansion;
/// everything else becomes a plain FEXP2 node.
static SDValue expandExp2(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
                          const TargetLowering &TLI) {
  const bool UseLimitedPrecision = Op.getValueType() == MVT::f32 &&
                                   LimitFloatPrecision > 0 &&
                                   LimitFloatPrecision <= 18;

  // No special expansion.
  if (!UseLimitedPrecision)
    return DAG.getNode(ISD::FEXP2, dl, Op.getValueType(), Op);

  return getLimitedPrecisionExp2(Op, dl, DAG);
}
/// expandPow - Lower a pow intrinsic. In limited-precision mode
/// (0 < LimitFloatPrecision <= 18), an f32 pow whose base is the constant
/// 10.0f is rewritten as exp2(RHS * log2(10)) using the limited-precision
/// exp2 expansion; every other case becomes a plain FPOW node.
static SDValue expandPow(const SDLoc &dl, SDValue LHS, SDValue RHS,
                         SelectionDAG &DAG, const TargetLowering &TLI) {
  // Decide whether this is the special 10^x case.
  bool IsExp10 = false;
  const bool UseLimitedPrecision =
      LHS.getValueType() == MVT::f32 && RHS.getValueType() == MVT::f32 &&
      LimitFloatPrecision > 0 && LimitFloatPrecision <= 18;
  if (UseLimitedPrecision)
    if (ConstantFPSDNode *BaseC = dyn_cast<ConstantFPSDNode>(LHS))
      IsExp10 = BaseC->isExactlyValue(APFloat(10.0f));

  // No special expansion.
  if (!IsExp10)
    return DAG.getNode(ISD::FPOW, dl, LHS.getValueType(), LHS, RHS);

  // Put the exponent in the right bit position for later addition to the
  // final result:
  //
  //   #define LOG2OF10 3.3219281f
  //   Scaled = RHS * LOG2OF10;
  //
  // TODO: What fast-math-flags should be set on the FMUL node?
  SDValue Log2Of10 = getF32Constant(DAG, 0x40549a78, dl);
  SDValue Scaled = DAG.getNode(ISD::FMUL, dl, MVT::f32, RHS, Log2Of10);
  return getLimitedPrecisionExp2(Scaled, dl, DAG);
}
/// ExpandPowI - Expand a llvm.powi intrinsic.
///
/// When the exponent \p RHS is a constant, the power is expanded into a tree
/// of multiplies (followed by a reciprocal for negative exponents). When
/// optimizing for size this is only done if the expansion stays small.
/// Otherwise an FPOWI node is emitted, which ends up lowered to a call such
/// as __powidf2.
static SDValue ExpandPowI(const SDLoc &DL, SDValue LHS, SDValue RHS,
                          SelectionDAG &DAG) {
  // Fallback used whenever the multiply expansion does not apply.
  auto EmitFPowI = [&]() {
    return DAG.getNode(ISD::FPOWI, DL, LHS.getValueType(), LHS, RHS);
  };

  ConstantSDNode *ExpC = dyn_cast<ConstantSDNode>(RHS);
  if (!ExpC)
    return EmitFPowI();

  // Get the exponent as a positive value.
  unsigned Magnitude = ExpC->getSExtValue();
  if (static_cast<int>(Magnitude) < 0)
    Magnitude = -Magnitude;

  // powi(x, 0) -> 1.0
  if (Magnitude == 0)
    return DAG.getConstantFP(1.0, DL, LHS.getValueType());

  // If optimizing for size, don't insert too many multiplies.
  // This inserts up to 5 multiplies.
  const Function &F = DAG.getMachineFunction().getFunction();
  const bool ShouldExpand =
      !F.hasOptSize() || countPopulation(Magnitude) + Log2_32(Magnitude) < 7;
  if (!ShouldExpand)
    return EmitFPowI();

  // We use the simple binary decomposition method to generate the multiply
  // sequence. There are more optimal ways to do this (for example,
  // powi(x,15) generates one more multiply than it should), but this has
  // the benefit of being both really simple and much better than a libcall.
  //
  // TODO: Intrinsics should have fast-math-flags that propagate to these
  // nodes.
  SDValue Product; // Logically starts equal to 1.0
  SDValue Square = LHS;
  for (; Magnitude; Magnitude >>= 1) {
    if (Magnitude & 1) {
      // The first factor is taken as-is (1.0 * Square); later ones multiply.
      Product = Product.getNode()
                    ? DAG.getNode(ISD::FMUL, DL, Product.getValueType(),
                                  Product, Square)
                    : Square;
    }
    Square =
        DAG.getNode(ISD::FMUL, DL, Square.getValueType(), Square, Square);
  }

  // If the original was negative, invert the result, producing 1/(x*x*x).
  if (ExpC->getSExtValue() < 0)
    Product = DAG.getNode(ISD::FDIV, DL, LHS.getValueType(),
                          DAG.getConstantFP(1.0, DL, LHS.getValueType()),
                          Product);
  return Product;
}
// getUnderlyingArgReg - Find underlying register used for a truncated or
// bitcasted argument.
static unsigned getUnderlyingArgReg(const SDValue &N) {
switch (N.getOpcode()) {
case ISD::CopyFromReg:
return cast<RegisterSDNode>(N.getOperand(1))->getReg();
case ISD::BITCAST:
case ISD::AssertZext:
case ISD::AssertSext:
case ISD::TRUNCATE:
return getUnderlyingArgReg(N.getOperand(0));
default:
return 0;
}
}
/// If the DbgValueInst is a dbg_value of a function argument, create the
/// corresponding DBG_VALUE machine instruction for it now. At the end of
/// instruction selection, they will be inserted to the entry BB.
///
/// The location of the argument is looked up in this order:
///   1. a frame index recorded for the argument in FuncInfo,
///   2. the register underlying \p N (replaced by its live-in physical
///      register when it is a virtual register that has one),
///   3. a frame index that \p N loads from (looking through bitcasts),
///   4. the register(s) recorded for \p V in FuncInfo.ValueMap; a value that
///      occupies several registers gets one fragment DBG_VALUE per register.
///
/// \param V            The IR value being described; must be an Argument for
///                     anything to be emitted.
/// \param Variable     The source variable being described.
/// \param Expr         The expression applied to the location.
/// \param DL           Debug location of the intrinsic.
/// \param IsDbgDeclare True when lowering dbg.declare/dbg.addr; skips the
///                     entry-block/prologue filtering and makes register
///                     locations indirect.
/// \param N            The SDValue for \p V, possibly empty.
/// \returns true if the intrinsic was handled here (DBG_VALUEs were queued in
///          FuncInfo.ArgDbgValues), false if the caller must handle it.
bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
    const Value *V, DILocalVariable *Variable, DIExpression *Expr,
    DILocation *DL, bool IsDbgDeclare, const SDValue &N) {
  const Argument *Arg = dyn_cast<Argument>(V);
  if (!Arg)
    return false;

  if (!IsDbgDeclare) {
    // ArgDbgValues are hoisted to the beginning of the entry block. So we
    // should only emit as ArgDbgValue if the dbg.value intrinsic is found in
    // the entry block.
    bool IsInEntryBlock = FuncInfo.MBB == &FuncInfo.MF->front();
    if (!IsInEntryBlock)
      return false;

    // ArgDbgValues are hoisted to the beginning of the entry block. So we
    // should only emit as ArgDbgValue if the dbg.value intrinsic describes a
    // variable that also is a param.
    //
    // Although, if we are at the top of the entry block already, we can still
    // emit using ArgDbgValue. This might catch some situations when the
    // dbg.value refers to an argument that isn't used in the entry block, so
    // any CopyToReg node would be optimized out and the only way to express
    // this DBG_VALUE is by using the physical reg (or FI) as done in this
    // method. ArgDbgValues are hoisted to the beginning of the entry block. So
    // we should only emit as ArgDbgValue if the Variable is an argument to the
    // current function, and the dbg.value intrinsic is found in the entry
    // block.
    bool VariableIsFunctionInputArg = Variable->isParameter() &&
                                      !DL->getInlinedAt();
    bool IsInPrologue = SDNodeOrder == LowestSDNodeOrder;
    if (!IsInPrologue && !VariableIsFunctionInputArg)
      return false;

    // Here we assume that a function argument on IR level only can be used to
    // describe one input parameter on source level. If we for example have
    // source code like this
    //
    //    struct A { long x, y; };
    //    void foo(struct A a, long b) {
    //      ...
    //      b = a.x;
    //      ...
    //    }
    //
    // and IR like this
    //
    //  define void @foo(i32 %a1, i32 %a2, i32 %b)  {
    //  entry:
    //    call void @llvm.dbg.value(metadata i32 %a1, "a", DW_OP_LLVM_fragment
    //    call void @llvm.dbg.value(metadata i32 %a2, "a", DW_OP_LLVM_fragment
    //    call void @llvm.dbg.value(metadata i32 %b, "b",
    //    ...
    //    call void @llvm.dbg.value(metadata i32 %a1, "b"
    //    ...
    //
    // then the last dbg.value is describing a parameter "b" using a value that
    // is an argument. But since we already have used %a1 to describe a
    // parameter we should not handle that last dbg.value here (that would
    // result in an incorrect hoisting of the DBG_VALUE to the function entry).
    // Notice that we allow one dbg.value per IR level argument, to accommodate
    // for the situation with fragments above.
    if (VariableIsFunctionInputArg) {
      unsigned ArgNo = Arg->getArgNo();
      if (ArgNo >= FuncInfo.DescribedArgs.size())
        FuncInfo.DescribedArgs.resize(ArgNo + 1, false);
      else if (!IsInPrologue && FuncInfo.DescribedArgs.test(ArgNo))
        return false;
      FuncInfo.DescribedArgs.set(ArgNo);
    }
  }

  MachineFunction &MF = DAG.getMachineFunction();
  const TargetInstrInfo *TII = DAG.getSubtarget().getInstrInfo();

  bool IsIndirect = false;
  Optional<MachineOperand> Op;
  // Some arguments' frame index is recorded during argument lowering.
  // std::numeric_limits<int>::max() is the "no frame index" marker here.
  int FI = FuncInfo.getArgumentFrameIndex(Arg);
  if (FI != std::numeric_limits<int>::max())
    Op = MachineOperand::CreateFI(FI);

  if (!Op && N.getNode()) {
    unsigned Reg = getUnderlyingArgReg(N);
    if (Reg && TargetRegisterInfo::isVirtualRegister(Reg)) {
      // Prefer the live-in physical register when the virtual register has
      // one.
      MachineRegisterInfo &RegInfo = MF.getRegInfo();
      unsigned PR = RegInfo.getLiveInPhysReg(Reg);
      if (PR)
        Reg = PR;
    }
    if (Reg) {
      Op = MachineOperand::CreateReg(Reg, false);
      IsIndirect = IsDbgDeclare;
    }
  }

  if (!Op && N.getNode()) {
    // Check if frame index is available.
    SDValue LCandidate = peekThroughBitcasts(N);
    if (LoadSDNode *LNode = dyn_cast<LoadSDNode>(LCandidate.getNode()))
      if (FrameIndexSDNode *FINode =
              dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode()))
        Op = MachineOperand::CreateFI(FINode->getIndex());
  }

  if (!Op) {
    // Check if ValueMap has reg number.
    DenseMap<const Value *, unsigned>::iterator VMI = FuncInfo.ValueMap.find(V);
    if (VMI != FuncInfo.ValueMap.end()) {
      const auto &TLI = DAG.getTargetLoweringInfo();
      RegsForValue RFV(V->getContext(), TLI, DAG.getDataLayout(), VMI->second,
                       V->getType(), getABIRegCopyCC(V));
      if (RFV.occupiesMultipleRegs()) {
        // Describe each register as its own fragment of the variable.
        unsigned Offset = 0;
        for (auto RegAndSize : RFV.getRegsAndSizes()) {
          Op = MachineOperand::CreateReg(RegAndSize.first, false);
          auto FragmentExpr = DIExpression::createFragmentExpression(
              Expr, Offset, RegAndSize.second);
          // Offset tracks the position of the register inside the value, so it
          // must advance even when no fragment expression could be created for
          // this register; otherwise any register described after it would be
          // given an offset that does not match its position.
          Offset += RegAndSize.second;
          if (!FragmentExpr)
            continue;
          FuncInfo.ArgDbgValues.push_back(
              BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE), IsDbgDeclare,
                      Op->getReg(), Variable, *FragmentExpr));
        }
        return true;
      }
      Op = MachineOperand::CreateReg(VMI->second, false);
      IsIndirect = IsDbgDeclare;
    }
  }

  if (!Op)
    return false;

  assert(Variable->isValidLocationForIntrinsic(DL) &&
         "Expected inlined-at fields to agree");
  // A non-register operand (a frame index) is always emitted as indirect.
  IsIndirect = (Op->isReg()) ? IsIndirect : true;
  FuncInfo.ArgDbgValues.push_back(
      BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE), IsIndirect,
              *Op, Variable, Expr));

  return true;
}
/// Build the SDDbgValue that describes \p Variable by means of \p N.
///
/// A FrameIndexSDNode yields a frame-index debug value so that stack slot
/// locations can be described; any other node yields an ordinary debug value
/// referring to the node and result number of \p N. Both are created with
/// IsIndirect set to false.
SDDbgValue *SelectionDAGBuilder::getDbgValue(SDValue N,
                                             DILocalVariable *Variable,
                                             DIExpression *Expr,
                                             const DebugLoc &dl,
                                             unsigned DbgSDNodeOrder) {
  SDNode *Node = N.getNode();
  FrameIndexSDNode *FrameIdx = dyn_cast<FrameIndexSDNode>(Node);
  if (!FrameIdx)
    return DAG.getDbgValue(Variable, Expr, Node, N.getResNo(),
                           /*IsIndirect*/ false, dl, DbgSDNodeOrder);

  // Construct a FrameIndexDbgValue for FrameIndexSDNodes so we can describe
  // stack slot locations.
  //
  // Consider "int x = 0; int *px = &x;". There are two kinds of interesting
  // debug values here after optimization:
  //
  //   dbg.value(i32* %px, !"int *px", !DIExpression()), and
  //   dbg.value(i32* %px, !"int x", !DIExpression(DW_OP_deref))
  //
  // Both describe the direct values of their associated variables.
  return DAG.getFrameIndexDbgValue(Variable, Expr, FrameIdx->getIndex(),
                                   /*IsIndirect*/ false, dl, DbgSDNodeOrder);
}
// Visual Studio defines setjmp as a macro for _setjmp. Remove that macro here
// so the Intrinsic::setjmp enumerator used further down is not rewritten by
// the preprocessor. The previous definition is saved with push_macro, and
// setjmp_undefined_for_msvc records that this has been done.
// NOTE(review): a matching pop_macro("setjmp") is expected later in the file;
// confirm.
#if defined(_MSC_VER) && defined(setjmp) && \
!defined(setjmp_undefined_for_msvc)
# pragma push_macro("setjmp")
# undef setjmp
# define setjmp_undefined_for_msvc
#endif
/// Map a fixed point multiplication intrinsic ID to its ISD opcode:
/// smul_fix -> ISD::SMULFIX and umul_fix -> ISD::UMULFIX. Any other ID is
/// treated as unreachable.
static unsigned FixedPointIntrinsicToOpcode(unsigned Intrinsic) {
  if (Intrinsic == Intrinsic::smul_fix)
    return ISD::SMULFIX;
  if (Intrinsic == Intrinsic::umul_fix)
    return ISD::UMULFIX;
  llvm_unreachable("Unhandled fixed point intrinsic");
}
void SelectionDAGBuilder::lowerCallToExternalSymbol(const CallInst &I,
const char *FunctionName) {
assert(FunctionName && "FunctionName must not be nullptr");
SDValue Callee = DAG.getExternalSymbol(
FunctionName,
DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()));
LowerCallTo(&I, Callee, I.isTailCall());
}
/// Lower the call to the specified intrinsic function.
void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
unsigned Intrinsic) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDLoc sdl = getCurSDLoc();
DebugLoc dl = getCurDebugLoc();
SDValue Res;
switch (Intrinsic) {
default:
// By default, turn this into a target intrinsic node.
visitTargetIntrinsic(I, Intrinsic);
return;
case Intrinsic::vastart: visitVAStart(I); return;
case Intrinsic::vaend: visitVAEnd(I); return;
case Intrinsic::vacopy: visitVACopy(I); return;
case Intrinsic::returnaddress:
setValue(&I, DAG.getNode(ISD::RETURNADDR, sdl,
TLI.getPointerTy(DAG.getDataLayout()),
getValue(I.getArgOperand(0))));
return;
case Intrinsic::addressofreturnaddress:
setValue(&I, DAG.getNode(ISD::ADDROFRETURNADDR, sdl,
TLI.getPointerTy(DAG.getDataLayout())));
return;
case Intrinsic::sponentry:
setValue(&I, DAG.getNode(ISD::SPONENTRY, sdl,
TLI.getPointerTy(DAG.getDataLayout())));
return;
case Intrinsic::frameaddress:
setValue(&I, DAG.getNode(ISD::FRAMEADDR, sdl,
TLI.getPointerTy(DAG.getDataLayout()),
getValue(I.getArgOperand(0))));
return;
case Intrinsic::read_register: {
Value *Reg = I.getArgOperand(0);
SDValue Chain = getRoot();
SDValue RegName =
DAG.getMDNode(cast<MDNode>(cast<MetadataAsValue>(Reg)->getMetadata()));
EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
Res = DAG.getNode(ISD::READ_REGISTER, sdl,
DAG.getVTList(VT, MVT::Other), Chain, RegName);
setValue(&I, Res);
DAG.setRoot(Res.getValue(1));
return;
}
case Intrinsic::write_register: {
Value *Reg = I.getArgOperand(0);
Value *RegValue = I.getArgOperand(1);
SDValue Chain = getRoot();
SDValue RegName =
DAG.getMDNode(cast<MDNode>(cast<MetadataAsValue>(Reg)->getMetadata()));
DAG.setRoot(DAG.getNode(ISD::WRITE_REGISTER, sdl, MVT::Other, Chain,
RegName, getValue(RegValue)));
return;
}
case Intrinsic::setjmp:
lowerCallToExternalSymbol(I, &"_setjmp"[!TLI.usesUnderscoreSetJmp()]);
return;
case Intrinsic::longjmp:
lowerCallToExternalSymbol(I, &"_longjmp"[!TLI.usesUnderscoreLongJmp()]);
return;
case Intrinsic::memcpy: {
const auto &MCI = cast<MemCpyInst>(I);
SDValue Op1 = getValue(I.getArgOperand(0));
SDValue Op2 = getValue(I.getArgOperand(1));
SDValue Op3 = getValue(I.getArgOperand(2));
// @llvm.memcpy defines 0 and 1 to both mean no alignment.
unsigned DstAlign = std::max<unsigned>(MCI.getDestAlignment(), 1);
unsigned SrcAlign = std::max<unsigned>(MCI.getSourceAlignment(), 1);
unsigned Align = MinAlign(DstAlign, SrcAlign);
bool isVol = MCI.isVolatile();
bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
// FIXME: Support passing different dest/src alignments to the memcpy DAG
// node.
SDValue MC = DAG.getMemcpy(getRoot(), sdl, Op1, Op2, Op3, Align, isVol,
false, isTC,
MachinePointerInfo(I.getArgOperand(0)),
MachinePointerInfo(I.getArgOperand(1)));
updateDAGForMaybeTailCall(MC);
return;
}
case Intrinsic::memset: {
const auto &MSI = cast<MemSetInst>(I);
SDValue Op1 = getValue(I.getArgOperand(0));
SDValue Op2 = getValue(I.getArgOperand(1));
SDValue Op3 = getValue(I.getArgOperand(2));
// @llvm.memset defines 0 and 1 to both mean no alignment.
unsigned Align = std::max<unsigned>(MSI.getDestAlignment(), 1);
bool isVol = MSI.isVolatile();
bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
SDValue MS = DAG.getMemset(getRoot(), sdl, Op1, Op2, Op3, Align, isVol,
isTC, MachinePointerInfo(I.getArgOperand(0)));
updateDAGForMaybeTailCall(MS);
return;
}
case Intrinsic::memmove: {
const auto &MMI = cast<MemMoveInst>(I);
SDValue Op1 = getValue(I.getArgOperand(0));
SDValue Op2 = getValue(I.getArgOperand(1));
SDValue Op3 = getValue(I.getArgOperand(2));
// @llvm.memmove defines 0 and 1 to both mean no alignment.
unsigned DstAlign = std::max<unsigned>(MMI.getDestAlignment(), 1);
unsigned SrcAlign = std::max<unsigned>(MMI.getSourceAlignment(), 1);
unsigned Align = MinAlign(DstAlign, SrcAlign);
bool isVol = MMI.isVolatile();
bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
// FIXME: Support passing different dest/src alignments to the memmove DAG
// node.
SDValue MM = DAG.getMemmove(getRoot(), sdl, Op1, Op2, Op3, Align, isVol,
isTC, MachinePointerInfo(I.getArgOperand(0)),
MachinePointerInfo(I.getArgOperand(1)));
updateDAGForMaybeTailCall(MM);
return;
}
case Intrinsic::memcpy_element_unordered_atomic: {
const AtomicMemCpyInst &MI = cast<AtomicMemCpyInst>(I);
SDValue Dst = getValue(MI.getRawDest());
SDValue Src = getValue(MI.getRawSource());
SDValue Length = getValue(MI.getLength());
unsigned DstAlign = MI.getDestAlignment();
unsigned SrcAlign = MI.getSourceAlignment();
Type *LengthTy = MI.getLength()->getType();
unsigned ElemSz = MI.getElementSizeInBytes();
bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
SDValue MC = DAG.getAtomicMemcpy(getRoot(), sdl, Dst, DstAlign, Src,
SrcAlign, Length, LengthTy, ElemSz, isTC,
MachinePointerInfo(MI.getRawDest()),
MachinePointerInfo(MI.getRawSource()));
updateDAGForMaybeTailCall(MC);
return;
}
case Intrinsic::memmove_element_unordered_atomic: {
auto &MI = cast<AtomicMemMoveInst>(I);
SDValue Dst = getValue(MI.getRawDest());
SDValue Src = getValue(MI.getRawSource());
SDValue Length = getValue(MI.getLength());
unsigned DstAlign = MI.getDestAlignment();
unsigned SrcAlign = MI.getSourceAlignment();
Type *LengthTy = MI.getLength()->getType();
unsigned ElemSz = MI.getElementSizeInBytes();
bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
SDValue MC = DAG.getAtomicMemmove(getRoot(), sdl, Dst, DstAlign, Src,
SrcAlign, Length, LengthTy, ElemSz, isTC,
MachinePointerInfo(MI.getRawDest()),
MachinePointerInfo(MI.getRawSource()));
updateDAGForMaybeTailCall(MC);
return;
}
case Intrinsic::memset_element_unordered_atomic: {
auto &MI = cast<AtomicMemSetInst>(I);
SDValue Dst = getValue(MI.getRawDest());
SDValue Val = getValue(MI.getValue());
SDValue Length = getValue(MI.getLength());
unsigned DstAlign = MI.getDestAlignment();
Type *LengthTy = MI.getLength()->getType();
unsigned ElemSz = MI.getElementSizeInBytes();
bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
SDValue MC = DAG.getAtomicMemset(getRoot(), sdl, Dst, DstAlign, Val, Length,
LengthTy, ElemSz, isTC,
MachinePointerInfo(MI.getRawDest()));
updateDAGForMaybeTailCall(MC);
return;
}
case Intrinsic::dbg_addr:
case Intrinsic::dbg_declare: {
const auto &DI = cast<DbgVariableIntrinsic>(I);
DILocalVariable *Variable = DI.getVariable();
DIExpression *Expression = DI.getExpression();
dropDanglingDebugInfo(Variable, Expression);
assert(Variable && "Missing variable");
// Check if address has undef value.
const Value *Address = DI.getVariableLocation();
if (!Address || isa<UndefValue>(Address) ||
(Address->use_empty() && !isa<Argument>(Address))) {
LLVM_DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
return;
}
bool isParameter = Variable->isParameter() || isa<Argument>(Address);
// Check if this variable can be described by a frame index, typically
// either as a static alloca or a byval parameter.
int FI = std::numeric_limits<int>::max();
if (const auto *AI =
dyn_cast<AllocaInst>(Address->stripInBoundsConstantOffsets())) {
if (AI->isStaticAlloca()) {
auto I = FuncInfo.StaticAllocaMap.find(AI);
if (I != FuncInfo.StaticAllocaMap.end())
FI = I->second;
}
} else if (const auto *Arg = dyn_cast<Argument>(
Address->stripInBoundsConstantOffsets())) {
FI = FuncInfo.getArgumentFrameIndex(Arg);
}
// llvm.dbg.addr is control dependent and always generates indirect
// DBG_VALUE instructions. llvm.dbg.declare is handled as a frame index in
// the MachineFunction variable table.
if (FI != std::numeric_limits<int>::max()) {
if (Intrinsic == Intrinsic::dbg_addr) {
SDDbgValue *SDV = DAG.getFrameIndexDbgValue(
Variable, Expression, FI, /*IsIndirect*/ true, dl, SDNodeOrder);
DAG.AddDbgValue(SDV, getRoot().getNode(), isParameter);
}
return;
}
SDValue &N = NodeMap[Address];
if (!N.getNode() && isa<Argument>(Address))
// Check unused arguments map.
N = UnusedArgNodeMap[Address];
SDDbgValue *SDV;
if (N.getNode()) {
if (const BitCastInst *BCI = dyn_cast<BitCastInst>(Address))
Address = BCI->getOperand(0);
// Parameters are handled specially.
auto FINode = dyn_cast<FrameIndexSDNode>(N.getNode());
if (isParameter && FINode) {
// Byval parameter. We have a frame index at this point.
SDV =
DAG.getFrameIndexDbgValue(Variable, Expression, FINode->getIndex(),
/*IsIndirect*/ true, dl, SDNodeOrder);
} else if (isa<Argument>(Address)) {
// Address is an argument, so try to emit its dbg value using
// virtual register info from the FuncInfo.ValueMap.
EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, true, N);
return;
} else {
SDV = DAG.getDbgValue(Variable, Expression, N.getNode(), N.getResNo(),
true, dl, SDNodeOrder);
}
DAG.AddDbgValue(SDV, N.getNode(), isParameter);
} else {
// If Address is an argument then try to emit its dbg value using
// virtual register info from the FuncInfo.ValueMap.
if (!EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, true,
N)) {
LLVM_DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
}
}
return;
}
case Intrinsic::dbg_label: {
const DbgLabelInst &DI = cast<DbgLabelInst>(I);
DILabel *Label = DI.getLabel();
assert(Label && "Missing label");
SDDbgLabel *SDV;
SDV = DAG.getDbgLabel(Label, dl, SDNodeOrder);
DAG.AddDbgLabel(SDV);
return;
}
case Intrinsic::dbg_value: {
const DbgValueInst &DI = cast<DbgValueInst>(I);
assert(DI.getVariable() && "Missing variable");
DILocalVariable *Variable = DI.getVariable();
DIExpression *Expression = DI.getExpression();
dropDanglingDebugInfo(Variable, Expression);
const Value *V = DI.getValue();
if (!V)
return;
if (handleDebugValue(V, Variable, Expression, dl, DI.getDebugLoc(),
SDNodeOrder))
return;
// TODO: Dangling debug info will eventually either be resolved or produce
// an Undef DBG_VALUE. However in the resolution case, a gap may appear
// between the original dbg.value location and its resolved DBG_VALUE, which
// we should ideally fill with an extra Undef DBG_VALUE.
DanglingDebugInfoMap[V].emplace_back(&DI, dl, SDNodeOrder);
return;
}
case Intrinsic::eh_typeid_for: {
// Find the type id for the given typeinfo.
GlobalValue *GV = ExtractTypeInfo(I.getArgOperand(0));
unsigned TypeID = DAG.getMachineFunction().getTypeIDFor(GV);
Res = DAG.getConstant(TypeID, sdl, MVT::i32);
setValue(&I, Res);
return;
}
case Intrinsic::eh_return_i32:
case Intrinsic::eh_return_i64:
DAG.getMachineFunction().setCallsEHReturn(true);
DAG.setRoot(DAG.getNode(ISD::EH_RETURN, sdl,
MVT::Other,
getControlRoot(),
getValue(I.getArgOperand(0)),
getValue(I.getArgOperand(1))));
return;
case Intrinsic::eh_unwind_init:
DAG.getMachineFunction().setCallsUnwindInit(true);
return;
case Intrinsic::eh_dwarf_cfa:
setValue(&I, DAG.getNode(ISD::EH_DWARF_CFA, sdl,
TLI.getPointerTy(DAG.getDataLayout()),
getValue(I.getArgOperand(0))));
return;
case Intrinsic::eh_sjlj_callsite: {
MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI();
ConstantInt *CI = dyn_cast<ConstantInt>(I.getArgOperand(0));
assert(CI && "Non-constant call site value in eh.sjlj.callsite!");
assert(MMI.getCurrentCallSite() == 0 && "Overlapping call sites!");
MMI.setCurrentCallSite(CI->getZExtValue());
return;
}
case Intrinsic::eh_sjlj_functioncontext: {
// Get and store the index of the function context.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
AllocaInst *FnCtx =
cast<AllocaInst>(I.getArgOperand(0)->stripPointerCasts());
int FI = FuncInfo.StaticAllocaMap[FnCtx];
MFI.setFunctionContextIndex(FI);
return;
}
case Intrinsic::eh_sjlj_setjmp: {
SDValue Ops[2];
Ops[0] = getRoot();
Ops[1] = getValue(I.getArgOperand(0));
SDValue Op = DAG.getNode(ISD::EH_SJLJ_SETJMP, sdl,
DAG.getVTList(MVT::i32, MVT::Other), Ops);
setValue(&I, Op.getValue(0));
DAG.setRoot(Op.getValue(1));
return;
}
case Intrinsic::eh_sjlj_longjmp:
DAG.setRoot(DAG.getNode(ISD::EH_SJLJ_LONGJMP, sdl, MVT::Other,
getRoot(), getValue(I.getArgOperand(0))));
return;
case Intrinsic::eh_sjlj_setup_dispatch:
DAG.setRoot(DAG.getNode(ISD::EH_SJLJ_SETUP_DISPATCH, sdl, MVT::Other,
getRoot()));
return;
case Intrinsic::masked_gather:
visitMaskedGather(I);
return;
case Intrinsic::masked_load:
visitMaskedLoad(I);
return;
case Intrinsic::masked_scatter:
visitMaskedScatter(I);
return;
case Intrinsic::masked_store:
visitMaskedStore(I);
return;
case Intrinsic::masked_expandload:
visitMaskedLoad(I, true /* IsExpanding */);
return;
case Intrinsic::masked_compressstore:
visitMaskedStore(I, true /* IsCompressing */);
return;
case Intrinsic::x86_mmx_pslli_w:
case Intrinsic::x86_mmx_pslli_d:
case Intrinsic::x86_mmx_pslli_q:
case Intrinsic::x86_mmx_psrli_w:
case Intrinsic::x86_mmx_psrli_d:
case Intrinsic::x86_mmx_psrli_q:
case Intrinsic::x86_mmx_psrai_w:
case Intrinsic::x86_mmx_psrai_d: {
SDValue ShAmt = getValue(I.getArgOperand(1));
if (isa<ConstantSDNode>(ShAmt)) {
visitTargetIntrinsic(I, Intrinsic);
return;
}
unsigned NewIntrinsic = 0;
EVT ShAmtVT = MVT::v2i32;
switch (Intrinsic) {
case Intrinsic::x86_mmx_pslli_w:
NewIntrinsic = Intrinsic::x86_mmx_psll_w;
break;
case Intrinsic::x86_mmx_pslli_d:
NewIntrinsic = Intrinsic::x86_mmx_psll_d;
break;
case Intrinsic::x86_mmx_pslli_q:
NewIntrinsic = Intrinsic::x86_mmx_psll_q;
break;
case Intrinsic::x86_mmx_psrli_w:
NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
break;
case Intrinsic::x86_mmx_psrli_d:
NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
break;
case Intrinsic::x86_mmx_psrli_q:
NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
break;
case Intrinsic::x86_mmx_psrai_w:
NewIntrinsic = Intrinsic::x86_mmx_psra_w;
break;
case Intrinsic::x86_mmx_psrai_d:
NewIntrinsic = Intrinsic::x86_mmx_psra_d;
break;
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
}
// The vector shift intrinsics with scalars uses 32b shift amounts but
// the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits
// to be zero.
// We must do this early because v2i32 is not a legal type.
SDValue ShOps[2];
ShOps[0] = ShAmt;
ShOps[1] = DAG.getConstant(0, sdl, MVT::i32);
ShAmt = DAG.getBuildVector(ShAmtVT, sdl, ShOps);
EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
ShAmt = DAG.getNode(ISD::BITCAST, sdl, DestVT, ShAmt);
Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, sdl, DestVT,
DAG.getConstant(NewIntrinsic, sdl, MVT::i32),
getValue(I.getArgOperand(0)), ShAmt);
setValue(&I, Res);
return;
}
case Intrinsic::powi:
setValue(&I, ExpandPowI(sdl, getValue(I.getArgOperand(0)),
getValue(I.getArgOperand(1)), DAG));
return;
case Intrinsic::log:
setValue(&I, expandLog(sdl, getValue(I.getArgOperand(0)), DAG, TLI));
return;
case Intrinsic::log2:
setValue(&I, expandLog2(sdl, getValue(I.getArgOperand(0)), DAG, TLI));
return;
case Intrinsic::log10:
setValue(&I, expandLog10(sdl, getValue(I.getArgOperand(0)), DAG, TLI));
return;
case Intrinsic::exp:
setValue(&I, expandExp(sdl, getValue(I.getArgOperand(0)), DAG, TLI));
return;
case Intrinsic::exp2:
setValue(&I, expandExp2(sdl, getValue(I.getArgOperand(0)), DAG, TLI));
return;
case Intrinsic::pow:
setValue(&I, expandPow(sdl, getValue(I.getArgOperand(0)),
getValue(I.getArgOperand(1)), DAG, TLI));
return;
case Intrinsic::sqrt:
case Intrinsic::fabs:
case Intrinsic::sin:
case Intrinsic::cos:
case Intrinsic::floor:
case Intrinsic::ceil:
case Intrinsic::trunc:
case Intrinsic::rint:
case Intrinsic::nearbyint:
case Intrinsic::round:
case Intrinsic::canonicalize: {
unsigned Opcode;
switch (Intrinsic) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::sqrt: Opcode = ISD::FSQRT; break;
case Intrinsic::fabs: Opcode = ISD::FABS; break;
case Intrinsic::sin: Opcode = ISD::FSIN; break;
case Intrinsic::cos: Opcode = ISD::FCOS; break;
case Intrinsic::floor: Opcode = ISD::FFLOOR; break;
case Intrinsic::ceil: Opcode = ISD::FCEIL; break;
case Intrinsic::trunc: Opcode = ISD::FTRUNC; break;
case Intrinsic::rint: Opcode = ISD::FRINT; break;
case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
case Intrinsic::round: Opcode = ISD::FROUND; break;
case Intrinsic::canonicalize: Opcode = ISD::FCANONICALIZE; break;
}
setValue(&I, DAG.getNode(Opcode, sdl,
getValue(I.getArgOperand(0)).getValueType(),
getValue(I.getArgOperand(0))));
return;
}
case Intrinsic::lround:
case Intrinsic::llround:
case Intrinsic::lrint:
case Intrinsic::llrint: {
unsigned Opcode;
switch (Intrinsic) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::lround: Opcode = ISD::LROUND; break;
case Intrinsic::llround: Opcode = ISD::LLROUND; break;
case Intrinsic::lrint: Opcode = ISD::LRINT; break;
case Intrinsic::llrint: Opcode = ISD::LLRINT; break;
}
EVT RetVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
setValue(&I, DAG.getNode(Opcode, sdl, RetVT,
getValue(I.getArgOperand(0))));
return;
}
case Intrinsic::minnum:
setValue(&I, DAG.getNode(ISD::FMINNUM, sdl,
getValue(I.getArgOperand(0)).getValueType(),
getValue(I.getArgOperand(0)),
getValue(I.getArgOperand(1))));
return;
case Intrinsic::maxnum:
setValue(&I, DAG.getNode(ISD::FMAXNUM, sdl,
getValue(I.getArgOperand(0)).getValueType(),
getValue(I.getArgOperand(0)),
getValue(I.getArgOperand(1))));
return;
case Intrinsic::minimum:
setValue(&I, DAG.getNode(ISD::FMINIMUM, sdl,
getValue(I.getArgOperand(0)).getValueType(),
getValue(I.getArgOperand(0)),
getValue(I.getArgOperand(1))));
return;
case Intrinsic::maximum:
setValue(&I, DAG.getNode(ISD::FMAXIMUM, sdl,
getValue(I.getArgOperand(0)).getValueType(),
getValue(I.getArgOperand(0)),
getValue(I.getArgOperand(1))));
return;
case Intrinsic::copysign:
setValue(&I, DAG.getNode(ISD::FCOPYSIGN, sdl,
getValue(I.getArgOperand(0)).getValueType(),
getValue(I.getArgOperand(0)),
getValue(I.getArgOperand(1))));
return;
case Intrinsic::fma:
setValue(&I, DAG.getNode(ISD::FMA, sdl,
getValue(I.getArgOperand(0)).getValueType(),
getValue(I.getArgOperand(0)),
getValue(I.getArgOperand(1)),
getValue(I.getArgOperand(2))));
return;
case Intrinsic::experimental_constrained_fadd:
case Intrinsic::experimental_constrained_fsub:
case Intrinsic::experimental_constrained_fmul:
case Intrinsic::experimental_constrained_fdiv:
case Intrinsic::experimental_constrained_frem:
case Intrinsic::experimental_constrained_fma:
case Intrinsic::experimental_constrained_fptrunc:
case Intrinsic::experimental_constrained_fpext:
case Intrinsic::experimental_constrained_sqrt:
case Intrinsic::experimental_constrained_pow:
case Intrinsic::experimental_constrained_powi:
case Intrinsic::experimental_constrained_sin:
case Intrinsic::experimental_constrained_cos:
case Intrinsic::experimental_constrained_exp:
case Intrinsic::experimental_constrained_exp2:
case Intrinsic::experimental_constrained_log:
case Intrinsic::experimental_constrained_log10:
case Intrinsic::experimental_constrained_log2:
case Intrinsic::experimental_constrained_rint:
case Intrinsic::experimental_constrained_nearbyint:
case Intrinsic::experimental_constrained_maxnum:
case Intrinsic::experimental_constrained_minnum:
case Intrinsic::experimental_constrained_ceil:
case Intrinsic::experimental_constrained_floor:
case Intrinsic::experimental_constrained_round:
case Intrinsic::experimental_constrained_trunc:
visitConstrainedFPIntrinsic(cast<ConstrainedFPIntrinsic>(I));
return;
case Intrinsic::fmuladd: {
EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict &&
TLI.isFMAFasterThanFMulAndFAdd(VT)) {
setValue(&I, DAG.getNode(ISD::FMA, sdl,
getValue(I.getArgOperand(0)).getValueType(),
getValue(I.getArgOperand(0)),
getValue(I.getArgOperand(1)),
getValue(I.getArgOperand(2))));
} else {
// TODO: Intrinsic calls should have fast-math-flags.
SDValue Mul = DAG.getNode(ISD::FMUL, sdl,
getValue(I.getArgOperand(0)).getValueType(),
getValue(I.getArgOperand(0)),
getValue(I.getArgOperand(1)));
SDValue Add = DAG.getNode(ISD::FADD, sdl,
getValue(I.getArgOperand(0)).getValueType(),
Mul,
getValue(I.getArgOperand(2)));
setValue(&I, Add);
}
return;
}
case Intrinsic::convert_to_fp16:
setValue(&I, DAG.getNode(ISD::BITCAST, sdl, MVT::i16,
DAG.getNode(ISD::FP_ROUND, sdl, MVT::f16,
getValue(I.getArgOperand(0)),
DAG.getTargetConstant(0, sdl,
MVT::i32))));
return;
case Intrinsic::convert_from_fp16:
setValue(&I, DAG.getNode(ISD::FP_EXTEND, sdl,
TLI.getValueType(DAG.getDataLayout(), I.getType()),
DAG.getNode(ISD::BITCAST, sdl, MVT::f16,
getValue(I.getArgOperand(0)))));
return;
case Intrinsic::pcmarker: {
SDValue Tmp = getValue(I.getArgOperand(0));
DAG.setRoot(DAG.getNode(ISD::PCMARKER, sdl, MVT::Other, getRoot(), Tmp));
return;
}
case Intrinsic::readcyclecounter: {
SDValue Op = getRoot();
Res = DAG.getNode(ISD::READCYCLECOUNTER, sdl,
DAG.getVTList(MVT::i64, MVT::Other), Op);
setValue(&I, Res);
DAG.setRoot(Res.getValue(1));
return;
}
case Intrinsic::bitreverse:
setValue(&I, DAG.getNode(ISD::BITREVERSE, sdl,
getValue(I.getArgOperand(0)).getValueType(),
getValue(I.getArgOperand(0))));
return;
case Intrinsic::bswap:
setValue(&I, DAG.getNode(ISD::BSWAP, sdl,
getValue(I.getArgOperand(0)).getValueType(),
getValue(I.getArgOperand(0))));
return;
case Intrinsic::cttz: {
SDValue Arg = getValue(I.getArgOperand(0));
ConstantInt *CI = cast<ConstantInt>(I.getArgOperand(1));
EVT Ty = Arg.getValueType();
setValue(&I, DAG.getNode(CI->isZero() ? ISD::CTTZ : ISD::CTTZ_ZERO_UNDEF,
sdl, Ty, Arg));
return;
}
case Intrinsic::ctlz: {
SDValue Arg = getValue(I.getArgOperand(0));
ConstantInt *CI = cast<ConstantInt>(I.getArgOperand(1));
EVT Ty = Arg.getValueType();
setValue(&I, DAG.getNode(CI->isZero() ? ISD::CTLZ : ISD::CTLZ_ZERO_UNDEF,
sdl, Ty, Arg));
return;
}
case Intrinsic::ctpop: {
SDValue Arg = getValue(I.getArgOperand(0));
EVT Ty = Arg.getValueType();
setValue(&I, DAG.getNode(ISD::CTPOP, sdl, Ty, Arg));
return;
}
case Intrinsic::fshl:
case Intrinsic::fshr: {
bool IsFSHL = Intrinsic == Intrinsic::fshl;
SDValue X = getValue(I.getArgOperand(0));
SDValue Y = getValue(I.getArgOperand(1));
SDValue Z = getValue(I.getArgOperand(2));
EVT VT = X.getValueType();
SDValue BitWidthC = DAG.getConstant(VT.getScalarSizeInBits(), sdl, VT);
SDValue Zero = DAG.getConstant(0, sdl, VT);
SDValue ShAmt = DAG.getNode(ISD::UREM, sdl, VT, Z, BitWidthC);
auto FunnelOpcode = IsFSHL ? ISD::FSHL : ISD::FSHR;
if (TLI.isOperationLegalOrCustom(FunnelOpcode, VT)) {
setValue(&I, DAG.getNode(FunnelOpcode, sdl, VT, X, Y, Z));
return;
}
// When X == Y, this is rotate. If the data type has a power-of-2 size, we
// avoid the select that is necessary in the general case to filter out
// the 0-shift possibility that leads to UB.
if (X == Y && isPowerOf2_32(VT.getScalarSizeInBits())) {
auto RotateOpcode = IsFSHL ? ISD::ROTL : ISD::ROTR;
if (TLI.isOperationLegalOrCustom(RotateOpcode, VT)) {
setValue(&I, DAG.getNode(RotateOpcode, sdl, VT, X, Z));
return;
}
// Some targets only rotate one way. Try the opposite direction.
RotateOpcode = IsFSHL ? ISD::ROTR : ISD::ROTL;
if (TLI.isOperationLegalOrCustom(RotateOpcode, VT)) {
// Negate the shift amount because it is safe to ignore the high bits.
SDValue NegShAmt = DAG.getNode(ISD::SUB, sdl, VT, Zero, Z);
setValue(&I, DAG.getNode(RotateOpcode, sdl, VT, X, NegShAmt));
return;
}
// fshl (rotl): (X << (Z % BW)) | (X >> ((0 - Z) % BW))
// fshr (rotr): (X << ((0 - Z) % BW)) | (X >> (Z % BW))
SDValue NegZ = DAG.getNode(ISD::SUB, sdl, VT, Zero, Z);
SDValue NShAmt = DAG.getNode(ISD::UREM, sdl, VT, NegZ, BitWidthC);
SDValue ShX = DAG.getNode(ISD::SHL, sdl, VT, X, IsFSHL ? ShAmt : NShAmt);
SDValue ShY = DAG.getNode(ISD::SRL, sdl, VT, X, IsFSHL ? NShAmt : ShAmt);
setValue(&I, DAG.getNode(ISD::OR, sdl, VT, ShX, ShY));
return;
}
// fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
// fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
SDValue InvShAmt = DAG.getNode(ISD::SUB, sdl, VT, BitWidthC, ShAmt);
SDValue ShX = DAG.getNode(ISD::SHL, sdl, VT, X, IsFSHL ? ShAmt : InvShAmt);
SDValue ShY = DAG.getNode(ISD::SRL, sdl, VT, Y, IsFSHL ? InvShAmt : ShAmt);
SDValue Or = DAG.getNode(ISD::OR, sdl, VT, ShX, ShY);
// If (Z % BW == 0), then the opposite direction shift is shift-by-bitwidth,
// and that is undefined. We must compare and select to avoid UB.
EVT CCVT = MVT::i1;
if (VT.isVector())
CCVT = EVT::getVectorVT(*Context, CCVT, VT.getVectorNumElements());
// For fshl, 0-shift returns the 1st arg (X).
// For fshr, 0-shift returns the 2nd arg (Y).
SDValue IsZeroShift = DAG.getSetCC(sdl, CCVT, ShAmt, Zero, ISD::SETEQ);
setValue(&I, DAG.getSelect(sdl, VT, IsZeroShift, IsFSHL ? X : Y, Or));
return;
}
case Intrinsic::sadd_sat: {
SDValue Op1 = getValue(I.getArgOperand(0));
SDValue Op2 = getValue(I.getArgOperand(1));
setValue(&I, DAG.getNode(ISD::SADDSAT, sdl, Op1.getValueType(), Op1, Op2));
return;
}
case Intrinsic::uadd_sat: {
SDValue Op1 = getValue(I.getArgOperand(0));
SDValue Op2 = getValue(I.getArgOperand(1));
setValue(&I, DAG.getNode(ISD::UADDSAT, sdl, Op1.getValueType(), Op1, Op2));
return;
}
case Intrinsic::ssub_sat: {
SDValue Op1 = getValue(I.getArgOperand(0));
SDValue Op2 = getValue(I.getArgOperand(1));
setValue(&I, DAG.getNode(ISD::SSUBSAT, sdl, Op1.getValueType(), Op1, Op2));
return;
}
case Intrinsic::usub_sat: {
SDValue Op1 = getValue(I.getArgOperand(0));
SDValue Op2 = getValue(I.getArgOperand(1));
setValue(&I, DAG.getNode(ISD::USUBSAT, sdl, Op1.getValueType(), Op1, Op2));
return;
}
case Intrinsic::smul_fix:
case Intrinsic::umul_fix: {
SDValue Op1 = getValue(I.getArgOperand(0));
SDValue Op2 = getValue(I.getArgOperand(1));
SDValue Op3 = getValue(I.getArgOperand(2));
setValue(&I, DAG.getNode(FixedPointIntrinsicToOpcode(Intrinsic), sdl,
Op1.getValueType(), Op1, Op2, Op3));
return;
}
case Intrinsic::smul_fix_sat: {
SDValue Op1 = getValue(I.getArgOperand(0));
SDValue Op2 = getValue(I.getArgOperand(1));
SDValue Op3 = getValue(I.getArgOperand(2));
setValue(&I, DAG.getNode(ISD::SMULFIXSAT, sdl, Op1.getValueType(), Op1, Op2,
Op3));
return;
}
case Intrinsic::stacksave: {
SDValue Op = getRoot();
Res = DAG.getNode(
ISD::STACKSAVE, sdl,
DAG.getVTList(TLI.getPointerTy(DAG.getDataLayout()), MVT::Other), Op);
setValue(&I, Res);
DAG.setRoot(Res.getValue(1));
return;
}
case Intrinsic::stackrestore:
Res = getValue(I.getArgOperand(0));
DAG.setRoot(DAG.getNode(ISD::STACKRESTORE, sdl, MVT::Other, getRoot(), Res));
return;
case Intrinsic::get_dynamic_area_offset: {
SDValue Op = getRoot();
EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout());
EVT ResTy = TLI.getValueType(DAG.getDataLayout(), I.getType());
// Result type for @llvm.get.dynamic.area.offset should match PtrTy for
// target.
if (PtrTy.getSizeInBits() < ResTy.getSizeInBits())
report_fatal_error("Wrong result type for @llvm.get.dynamic.area.offset"
" intrinsic!");
Res = DAG.getNode(ISD::GET_DYNAMIC_AREA_OFFSET, sdl, DAG.getVTList(ResTy),
Op);
DAG.setRoot(Op);
setValue(&I, Res);
return;
}
case Intrinsic::stackguard: {
EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout());
MachineFunction &MF = DAG.getMachineFunction();
const Module &M = *MF.getFunction().getParent();
SDValue Chain = getRoot();
if (TLI.useLoadStackGuardNode()) {
Res = getLoadStackGuard(DAG, sdl, Chain);
} else {
const Value *Global = TLI.getSDagStackGuard(M);
unsigned Align = DL->getPrefTypeAlignment(Global->getType());
Res = DAG.getLoad(PtrTy, sdl, Chain, getValue(Global),
MachinePointerInfo(Global, 0), Align,
MachineMemOperand::MOVolatile);
}
if (TLI.useStackGuardXorFP())
Res = TLI.emitStackGuardXorFP(DAG, Res, sdl);
DAG.setRoot(Chain);
setValue(&I, Res);
return;
}
case Intrinsic::stackprotector: {
// Emit code into the DAG to store the stack guard onto the stack.
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout());
SDValue Src, Chain = getRoot();
if (TLI.useLoadStackGuardNode())
Src = getLoadStackGuard(DAG, sdl, Chain);
else
Src = getValue(I.getArgOperand(0)); // The guard's value.
AllocaInst *Slot = cast<AllocaInst>(I.getArgOperand(1));
int FI = FuncInfo.StaticAllocaMap[Slot];
MFI.setStackProtectorIndex(FI);
SDValue FIN = DAG.getFrameIndex(FI, PtrTy);
// Store the stack protector onto the stack.
Res = DAG.getStore(Chain, sdl, Src, FIN, MachinePointerInfo::getFixedStack(
DAG.getMachineFunction(), FI),
/* Alignment = */ 0, MachineMemOperand::MOVolatile);
setValue(&I, Res);
DAG.setRoot(Res);
return;
}
case Intrinsic::objectsize: {
// If we don't know by now, we're never going to know.
ConstantInt *CI = dyn_cast<ConstantInt>(I.getArgOperand(1));
assert(CI && "Non-constant type in __builtin_object_size?");
SDValue Arg = getValue(I.getCalledValue());
EVT Ty = Arg.getValueType();
if (CI->isZero())
Res = DAG.getConstant(-1ULL, sdl, Ty);
else
Res = DAG.getConstant(0, sdl, Ty);
setValue(&I, Res);
return;
}
case Intrinsic::is_constant:
// If this wasn't constant-folded away by now, then it's not a
// constant.
setValue(&I, DAG.getConstant(0, sdl, MVT::i1));
return;
case Intrinsic::annotation:
case Intrinsic::ptr_annotation:
case Intrinsic::launder_invariant_group:
case Intrinsic::strip_invariant_group:
// Drop the intrinsic, but forward the value
setValue(&I, getValue(I.getOperand(0)));
return;
case Intrinsic::assume:
case Intrinsic::var_annotation:
case Intrinsic::sideeffect:
// Discard annotate attributes, assumptions, and artificial side-effects.
return;
case Intrinsic::codeview_annotation: {
// Emit a label associated with this metadata.
MachineFunction &MF = DAG.getMachineFunction();
MCSymbol *Label =
MF.getMMI().getContext().createTempSymbol("annotation", true);
Metadata *MD = cast<MetadataAsValue>(I.getArgOperand(0))->getMetadata();
MF.addCodeViewAnnotation(Label, cast<MDNode>(MD));
Res = DAG.getLabelNode(ISD::ANNOTATION_LABEL, sdl, getRoot(), Label);
DAG.setRoot(Res);
return;
}
case Intrinsic::init_trampoline: {
const Function *F = cast<Function>(I.getArgOperand(1)->stripPointerCasts());
SDValue Ops[6];
Ops[0] = getRoot();
Ops[1] = getValue(I.getArgOperand(0));
Ops[2] = getValue(I.getArgOperand(1));
Ops[3] = getValue(I.getArgOperand(2));
Ops[4] = DAG.getSrcValue(I.getArgOperand(0));
Ops[5] = DAG.getSrcValue(F);
Res = DAG.getNode(ISD::INIT_TRAMPOLINE, sdl, MVT::Other, Ops);
DAG.setRoot(Res);
return;
}
case Intrinsic::adjust_trampoline:
setValue(&I, DAG.getNode(ISD::ADJUST_TRAMPOLINE, sdl,
TLI.getPointerTy(DAG.getDataLayout()),
getValue(I.getArgOperand(0))));
return;
case Intrinsic::gcroot: {
assert(DAG.getMachineFunction().getFunction().hasGC() &&
"only valid in functions with gc specified, enforced by Verifier");
assert(GFI && "implied by previous");
const Value *Alloca = I.getArgOperand(0)->stripPointerCasts();
const Constant *TypeMap = cast<Constant>(I.getArgOperand(1));
FrameIndexSDNode *FI = cast<FrameIndexSDNode>(getValue(Alloca).getNode());
GFI->addStackRoot(FI->getIndex(), TypeMap);
return;
}
case Intrinsic::gcread:
case Intrinsic::gcwrite:
llvm_unreachable("GC failed to lower gcread/gcwrite intrinsics!");
case Intrinsic::flt_rounds:
setValue(&I, DAG.getNode(ISD::FLT_ROUNDS_, sdl, MVT::i32));
return;
case Intrinsic::expect:
// Just replace __builtin_expect(exp, c) with EXP.
setValue(&I, getValue(I.getArgOperand(0)));
return;
case Intrinsic::debugtrap:
case Intrinsic::trap: {
StringRef TrapFuncName =
I.getAttributes()
.getAttribute(AttributeList::FunctionIndex, "trap-func-name")
.getValueAsString();
if (TrapFuncName.empty()) {
ISD::NodeType Op = (Intrinsic == Intrinsic::trap) ?
ISD::TRAP : ISD::DEBUGTRAP;
DAG.setRoot(DAG.getNode(Op, sdl,MVT::Other, getRoot()));
return;
}
TargetLowering::ArgListTy Args;
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(sdl).setChain(getRoot()).setLibCallee(
CallingConv::C, I.getType(),
DAG.getExternalSymbol(TrapFuncName.data(),
TLI.getPointerTy(DAG.getDataLayout())),
std::move(Args));
std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
DAG.setRoot(Result.second);
return;
}
case Intrinsic::uadd_with_overflow:
case Intrinsic::sadd_with_overflow:
case Intrinsic::usub_with_overflow:
case Intrinsic::ssub_with_overflow:
case Intrinsic::umul_with_overflow:
case Intrinsic::smul_with_overflow: {
ISD::NodeType Op;
switch (Intrinsic) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::uadd_with_overflow: Op = ISD::UADDO; break;
case Intrinsic::sadd_with_overflow: Op = ISD::SADDO; break;
case Intrinsic::usub_with_overflow: Op = ISD::USUBO; break;
case Intrinsic::ssub_with_overflow: Op = ISD::SSUBO; break;
case Intrinsic::umul_with_overflow: Op = ISD::UMULO; break;
case Intrinsic::smul_with_overflow: Op = ISD::SMULO; break;
}
SDValue Op1 = getValue(I.getArgOperand(0));
SDValue Op2 = getValue(I.getArgOperand(1));
EVT ResultVT = Op1.getValueType();
EVT OverflowVT = MVT::i1;
if (ResultVT.isVector())
OverflowVT = EVT::getVectorVT(
*Context, OverflowVT, ResultVT.getVectorNumElements());
SDVTList VTs = DAG.getVTList(ResultVT, OverflowVT);
setValue(&I, DAG.getNode(Op, sdl, VTs, Op1, Op2));
return;
}
case Intrinsic::prefetch: {
SDValue Ops[5];
unsigned rw = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
auto Flags = rw == 0 ? MachineMemOperand::MOLoad :MachineMemOperand::MOStore;
Ops[0] = DAG.getRoot();
Ops[1] = getValue(I.getArgOperand(0));
Ops[2] = getValue(I.getArgOperand(1));
Ops[3] = getValue(I.getArgOperand(2));
Ops[4] = getValue(I.getArgOperand(3));
SDValue Result = DAG.getMemIntrinsicNode(ISD::PREFETCH, sdl,
DAG.getVTList(MVT::Other), Ops,
EVT::getIntegerVT(*Context, 8),
MachinePointerInfo(I.getArgOperand(0)),
0, /* align */
Flags);
// Chain the prefetch in parallel with any pending loads, to stay out of
// the way of later optimizations.
PendingLoads.push_back(Result);
Result = getRoot();
DAG.setRoot(Result);
return;
}
case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end: {
bool IsStart = (Intrinsic == Intrinsic::lifetime_start);
// Stack coloring is not enabled in O0, discard region information.
if (TM.getOptLevel() == CodeGenOpt::None)
return;
const int64_t ObjectSize =
cast<ConstantInt>(I.getArgOperand(0))->getSExtValue();
Value *const ObjectPtr = I.getArgOperand(1);
SmallVector<const Value *, 4> Allocas;
GetUnderlyingObjects(ObjectPtr, Allocas, *DL);
for (SmallVectorImpl<const Value*>::iterator Object = Allocas.begin(),
E = Allocas.end(); Object != E; ++Object) {
const AllocaInst *LifetimeObject = dyn_cast_or_null<AllocaInst>(*Object);
// Could not find an Alloca.
if (!LifetimeObject)
continue;
// First check that the Alloca is static, otherwise it won't have a
// valid frame index.
auto SI = FuncInfo.StaticAllocaMap.find(LifetimeObject);
if (SI == FuncInfo.StaticAllocaMap.end())
return;
const int FrameIndex = SI->second;
int64_t Offset;
if (GetPointerBaseWithConstantOffset(
ObjectPtr, Offset, DAG.getDataLayout()) != LifetimeObject)
Offset = -1; // Cannot determine offset from alloca to lifetime object.
Res = DAG.getLifetimeNode(IsStart, sdl, getRoot(), FrameIndex, ObjectSize,
Offset);
DAG.setRoot(Res);
}
return;
}
case Intrinsic::invariant_start:
// Discard region information.
setValue(&I, DAG.getUNDEF(TLI.getPointerTy(DAG.getDataLayout())));
return;
case Intrinsic::invariant_end:
// Discard region information.
return;
case Intrinsic::clear_cache:
/// FunctionName may be null.
if (const char *FunctionName = TLI.getClearCacheBuiltinName())
lowerCallToExternalSymbol(I, FunctionName);
return;
case Intrinsic::donothing:
// ignore
return;
case Intrinsic::experimental_stackmap:
visitStackmap(I);
return;
case Intrinsic::experimental_patchpoint_void:
case Intrinsic::experimental_patchpoint_i64:
visitPatchpoint(&I);
return;
case Intrinsic::experimental_gc_statepoint:
LowerStatepoint(ImmutableStatepoint(&I));
return;
case Intrinsic::experimental_gc_result:
visitGCResult(cast<GCResultInst>(I));
return;
case Intrinsic::experimental_gc_relocate:
visitGCRelocate(cast<GCRelocateInst>(I));
return;
case Intrinsic::instrprof_increment:
llvm_unreachable("instrprof failed to lower an increment");
case Intrinsic::instrprof_value_profile:
llvm_unreachable("instrprof failed to lower a value profiling call");
case Intrinsic::localescape: {
MachineFunction &MF = DAG.getMachineFunction();
const TargetInstrInfo *TII = DAG.getSubtarget().getInstrInfo();
// Directly emit some LOCAL_ESCAPE machine instrs. Label assignment emission
// is the same on all targets.
for (unsigned Idx = 0, E = I.getNumArgOperands(); Idx < E; ++Idx) {
Value *Arg = I.getArgOperand(Idx)->stripPointerCasts();
if (isa<ConstantPointerNull>(Arg))
continue; // Skip null pointers. They represent a hole in index space.
AllocaInst *Slot = cast<AllocaInst>(Arg);
assert(FuncInfo.StaticAllocaMap.count(Slot) &&
"can only escape static allocas");
int FI = FuncInfo.StaticAllocaMap[Slot];
MCSymbol *FrameAllocSym =
MF.getMMI().getContext().getOrCreateFrameAllocSymbol(
GlobalValue::dropLLVMManglingEscape(MF.getName()), Idx);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, dl,
TII->get(TargetOpcode::LOCAL_ESCAPE))
.addSym(FrameAllocSym)
.addFrameIndex(FI);
}
return;
}
case Intrinsic::localrecover: {
// i8* @llvm.localrecover(i8* %fn, i8* %fp, i32 %idx)
MachineFunction &MF = DAG.getMachineFunction();
MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout(), 0);
// Get the symbol that defines the frame offset.
auto *Fn = cast<Function>(I.getArgOperand(0)->stripPointerCasts());
auto *Idx = cast<ConstantInt>(I.getArgOperand(2));
unsigned IdxVal =
unsigned(Idx->getLimitedValue(std::numeric_limits<int>::max()));
MCSymbol *FrameAllocSym =
MF.getMMI().getContext().getOrCreateFrameAllocSymbol(
GlobalValue::dropLLVMManglingEscape(Fn->getName()), IdxVal);
// Create a MCSymbol for the label to avoid any target lowering
// that would make this PC relative.
SDValue OffsetSym = DAG.getMCSymbol(FrameAllocSym, PtrVT);
SDValue OffsetVal =
DAG.getNode(ISD::LOCAL_RECOVER, sdl, PtrVT, OffsetSym);
// Add the offset to the FP.
Value *FP = I.getArgOperand(1);
SDValue FPVal = getValue(FP);
SDValue Add = DAG.getNode(ISD::ADD, sdl, PtrVT, FPVal, OffsetVal);
setValue(&I, Add);
return;
}
case Intrinsic::eh_exceptionpointer:
case Intrinsic::eh_exceptioncode: {
// Get the exception pointer vreg, copy from it, and resize it to fit.
const auto *CPI = cast<CatchPadInst>(I.getArgOperand(0));
MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
const TargetRegisterClass *PtrRC = TLI.getRegClassFor(PtrVT);
unsigned VReg = FuncInfo.getCatchPadExceptionPointerVReg(CPI, PtrRC);
SDValue N =
DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(), VReg, PtrVT);
if (Intrinsic == Intrinsic::eh_exceptioncode)
N = DAG.getZExtOrTrunc(N, getCurSDLoc(), MVT::i32);
setValue(&I, N);
return;
}
case Intrinsic::xray_customevent: {
// Here we want to make sure that the intrinsic behaves as if it has a
// specific calling convention, and only for x86_64.
// FIXME: Support other platforms later.
const auto &Triple = DAG.getTarget().getTargetTriple();
if (Triple.getArch() != Triple::x86_64 || !Triple.isOSLinux())
return;
SDLoc DL = getCurSDLoc();
SmallVector<SDValue, 8> Ops;
// We want to say that we always want the arguments in registers.
SDValue LogEntryVal = getValue(I.getArgOperand(0));
SDValue StrSizeVal = getValue(I.getArgOperand(1));
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue Chain = getRoot();
Ops.push_back(LogEntryVal);
Ops.push_back(StrSizeVal);
Ops.push_back(Chain);
// We need to enforce the calling convention for the callsite, so that
// argument ordering is enforced correctly, and that register allocation can
// see that some registers may be assumed clobbered and have to preserve
// them across calls to the intrinsic.
MachineSDNode *MN = DAG.getMachineNode(TargetOpcode::PATCHABLE_EVENT_CALL,
DL, NodeTys, Ops);
SDValue patchableNode = SDValue(MN, 0);
DAG.setRoot(patchableNode);
setValue(&I, patchableNode);
return;
}
case Intrinsic::xray_typedevent: {
// Here we want to make sure that the intrinsic behaves as if it has a
// specific calling convention, and only for x86_64.
// FIXME: Support other platforms later.
const auto &Triple = DAG.getTarget().getTargetTriple();
if (Triple.getArch() != Triple::x86_64 || !Triple.isOSLinux())
return;
SDLoc DL = getCurSDLoc();
SmallVector<SDValue, 8> Ops;
// We want to say that we always want the arguments in registers.
// It's unclear to me how manipulating the selection DAG here forces callers
// to provide arguments in registers instead of on the stack.
SDValue LogTypeId = getValue(I.getArgOperand(0));
SDValue LogEntryVal = getValue(I.getArgOperand(1));
SDValue StrSizeVal = getValue(I.getArgOperand(2));
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue Chain = getRoot();
Ops.push_back(LogTypeId);
Ops.push_back(LogEntryVal);
Ops.push_back(StrSizeVal);
Ops.push_back(Chain);
// We need to enforce the calling convention for the callsite, so that
// argument ordering is enforced correctly, and that register allocation can
// see that some registers may be assumed clobbered and have to preserve
// them across calls to the intrinsic.
MachineSDNode *MN = DAG.getMachineNode(
TargetOpcode::PATCHABLE_TYPED_EVENT_CALL, DL, NodeTys, Ops);
SDValue patchableNode = SDValue(MN, 0);
DAG.setRoot(patchableNode);
setValue(&I, patchableNode);
return;
}
case Intrinsic::experimental_deoptimize:
LowerDeoptimizeCall(&I);
return;
case Intrinsic::experimental_vector_reduce_v2_fadd:
case Intrinsic::experimental_vector_reduce_v2_fmul:
case Intrinsic::experimental_vector_reduce_add:
case Intrinsic::experimental_vector_reduce_mul:
case Intrinsic::experimental_vector_reduce_and:
case Intrinsic::experimental_vector_reduce_or:
case Intrinsic::experimental_vector_reduce_xor:
case Intrinsic::experimental_vector_reduce_smax:
case Intrinsic::experimental_vector_reduce_smin:
case Intrinsic::experimental_vector_reduce_umax:
case Intrinsic::experimental_vector_reduce_umin:
case Intrinsic::experimental_vector_reduce_fmax:
case Intrinsic::experimental_vector_reduce_fmin:
visitVectorReduce(I, Intrinsic);
return;
case Intrinsic::icall_branch_funnel: {
SmallVector<SDValue, 16> Ops;
Ops.push_back(getValue(I.getArgOperand(0)));
int64_t Offset;
auto *Base = dyn_cast<GlobalObject>(GetPointerBaseWithConstantOffset(
I.getArgOperand(1), Offset, DAG.getDataLayout()));
if (!Base)
report_fatal_error(
"llvm.icall.branch.funnel operand must be a GlobalValue");
Ops.push_back(DAG.getTargetGlobalAddress(Base, getCurSDLoc(), MVT::i64, 0));
struct BranchFunnelTarget {
int64_t Offset;
SDValue Target;
};
SmallVector<BranchFunnelTarget, 8> Targets;
for (unsigned Op = 1, N = I.getNumArgOperands(); Op != N; Op += 2) {
auto *ElemBase = dyn_cast<GlobalObject>(GetPointerBaseWithConstantOffset(
I.getArgOperand(Op), Offset, DAG.getDataLayout()));
if (ElemBase != Base)
report_fatal_error("all llvm.icall.branch.funnel operands must refer "
"to the same GlobalValue");
SDValue Val = getValue(I.getArgOperand(Op + 1));
auto *GA = dyn_cast<GlobalAddressSDNode>(Val);
if (!GA)
report_fatal_error(
"llvm.icall.branch.funnel operand must be a GlobalValue");
Targets.push_back({Offset, DAG.getTargetGlobalAddress(
GA->getGlobal(), getCurSDLoc(),
Val.getValueType(), GA->getOffset())});
}
llvm::sort(Targets,
[](const BranchFunnelTarget &T1, const BranchFunnelTarget &T2) {
return T1.Offset < T2.Offset;
});
for (auto &T : Targets) {
Ops.push_back(DAG.getTargetConstant(T.Offset, getCurSDLoc(), MVT::i32));
Ops.push_back(T.Target);
}
Ops.push_back(DAG.getRoot()); // Chain
SDValue N(DAG.getMachineNode(TargetOpcode::ICALL_BRANCH_FUNNEL,
getCurSDLoc(), MVT::Other, Ops),
0);
DAG.setRoot(N);
setValue(&I, N);
HasTailCall = true;
return;
}
case Intrinsic::wasm_landingpad_index:
// Information this intrinsic contained has been transferred to
// MachineFunction in SelectionDAGISel::PrepareEHLandingPad. We can safely
// delete it now.
return;
case Intrinsic::aarch64_settag:
case Intrinsic::aarch64_settag_zero: {
const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
bool ZeroMemory = Intrinsic == Intrinsic::aarch64_settag_zero;
SDValue Val = TSI.EmitTargetCodeForSetTag(
DAG, getCurSDLoc(), getRoot(), getValue(I.getArgOperand(0)),
getValue(I.getArgOperand(1)), MachinePointerInfo(I.getArgOperand(0)),
ZeroMemory);
DAG.setRoot(Val);
setValue(&I, Val);
return;
}
}
}
void SelectionDAGBuilder::visitConstrainedFPIntrinsic(
const ConstrainedFPIntrinsic &FPI) {
SDLoc sdl = getCurSDLoc();
unsigned Opcode;
switch (FPI.getIntrinsicID()) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::experimental_constrained_fadd:
Opcode = ISD::STRICT_FADD;
break;
case Intrinsic::experimental_constrained_fsub:
Opcode = ISD::STRICT_FSUB;
break;
case Intrinsic::experimental_constrained_fmul:
Opcode = ISD::STRICT_FMUL;
break;
case Intrinsic::experimental_constrained_fdiv:
Opcode = ISD::STRICT_FDIV;
break;
case Intrinsic::experimental_constrained_frem:
Opcode = ISD::STRICT_FREM;
break;
case Intrinsic::experimental_constrained_fma:
Opcode = ISD::STRICT_FMA;
break;
case Intrinsic::experimental_constrained_fptrunc:
Opcode = ISD::STRICT_FP_ROUND;
break;
case Intrinsic::experimental_constrained_fpext:
Opcode = ISD::STRICT_FP_EXTEND;
break;
case Intrinsic::experimental_constrained_sqrt:
Opcode = ISD::STRICT_FSQRT;
break;
case Intrinsic::experimental_constrained_pow:
Opcode = ISD::STRICT_FPOW;
break;
case Intrinsic::experimental_constrained_powi:
Opcode = ISD::STRICT_FPOWI;
break;
case Intrinsic::experimental_constrained_sin:
Opcode = ISD::STRICT_FSIN;
break;
case Intrinsic::experimental_constrained_cos:
Opcode = ISD::STRICT_FCOS;
break;
case Intrinsic::experimental_constrained_exp:
Opcode = ISD::STRICT_FEXP;
break;
case Intrinsic::experimental_constrained_exp2:
Opcode = ISD::STRICT_FEXP2;
break;
case Intrinsic::experimental_constrained_log:
Opcode = ISD::STRICT_FLOG;
break;
case Intrinsic::experimental_constrained_log10:
Opcode = ISD::STRICT_FLOG10;
break;
case Intrinsic::experimental_constrained_log2:
Opcode = ISD::STRICT_FLOG2;
break;
case Intrinsic::experimental_constrained_rint:
Opcode = ISD::STRICT_FRINT;
break;
case Intrinsic::experimental_constrained_nearbyint:
Opcode = ISD::STRICT_FNEARBYINT;
break;
case Intrinsic::experimental_constrained_maxnum:
Opcode = ISD::STRICT_FMAXNUM;
break;
case Intrinsic::experimental_constrained_minnum:
Opcode = ISD::STRICT_FMINNUM;
break;
case Intrinsic::experimental_constrained_ceil:
Opcode = ISD::STRICT_FCEIL;
break;
case Intrinsic::experimental_constrained_floor:
Opcode = ISD::STRICT_FFLOOR;
break;
case Intrinsic::experimental_constrained_round:
Opcode = ISD::STRICT_FROUND;
break;
case Intrinsic::experimental_constrained_trunc:
Opcode = ISD::STRICT_FTRUNC;
break;
}
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue Chain = getRoot();
SmallVector<EVT, 4> ValueVTs;
ComputeValueVTs(TLI, DAG.getDataLayout(), FPI.getType(), ValueVTs);
ValueVTs.push_back(MVT::Other); // Out chain
SDVTList VTs = DAG.getVTList(ValueVTs);
SDValue Result;
if (Opcode == ISD::STRICT_FP_ROUND)
Result = DAG.getNode(Opcode, sdl, VTs,
{ Chain, getValue(FPI.getArgOperand(0)),
DAG.getTargetConstant(0, sdl,
TLI.getPointerTy(DAG.getDataLayout())) });
else if (FPI.isUnaryOp())
Result = DAG.getNode(Opcode, sdl, VTs,
{ Chain, getValue(FPI.getArgOperand(0)) });
else if (FPI.isTernaryOp())
Result = DAG.getNode(Opcode, sdl, VTs,
{ Chain, getValue(FPI.getArgOperand(0)),
getValue(FPI.getArgOperand(1)),
getValue(FPI.getArgOperand(2)) });
else
Result = DAG.getNode(Opcode, sdl, VTs,
{ Chain, getValue(FPI.getArgOperand(0)),
getValue(FPI.getArgOperand(1)) });
if (FPI.getExceptionBehavior() !=
ConstrainedFPIntrinsic::ExceptionBehavior::ebIgnore) {
SDNodeFlags Flags;
Flags.setFPExcept(true);
Result->setFlags(Flags);
}
assert(Result.getNode()->getNumValues() == 2);
SDValue OutChain = Result.getValue(1);
DAG.setRoot(OutChain);
SDValue FPResult = Result.getValue(0);
setValue(&FPI, FPResult);
}
/// Lower the call described by \p CLI through TargetLowering::LowerCallTo.
/// When \p EHPadBB is non-null, the call is bracketed by a pair of EH labels
/// and the labelled range is recorded for exception handling. Returns the
/// pair produced by LowerCallTo; a null chain (second) means a tail call was
/// emitted.
std::pair<SDValue, SDValue>
SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI,
                                    const BasicBlock *EHPadBB) {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineModuleInfo &MMI = MF.getMMI();
  MCSymbol *TryBegin = nullptr;

  if (EHPadBB) {
    // Label placed ahead of the invoke to open the try range. It can also be
    // used to detect deletion of the invoke via the MachineModuleInfo.
    TryBegin = MMI.getContext().createTempSymbol();

    // For SjLj, remember which landing pads belong to which invokes so the
    // ordering of pads in the LSDA is maintained.
    if (unsigned SiteIdx = MMI.getCurrentCallSite()) {
      MF.setCallSiteBeginLabel(TryBegin, SiteIdx);
      LPadToCallSiteMap[FuncInfo.MBBMap[EHPadBB]].push_back(SiteIdx);

      // The call site has been dealt with; stop tracking it.
      MMI.setCurrentCallSite(0);
    }

    // The call might not return, so PendingLoads and PendingExports both
    // have to be flushed before the label is emitted.
    (void)getRoot();
    DAG.setRoot(DAG.getEHLabel(getCurSDLoc(), getControlRoot(), TryBegin));
    CLI.setChain(getRoot());
  }

  std::pair<SDValue, SDValue> Result =
      DAG.getTargetLoweringInfo().LowerCallTo(CLI);

  assert((CLI.IsTailCall || Result.second.getNode()) &&
         "Non-null chain expected with non-tail call!");
  assert((Result.second.getNode() || !Result.first.getNode()) &&
         "Null value expected with tail call!");

  if (Result.second.getNode()) {
    DAG.setRoot(Result.second);
  } else {
    // Special case: a null chain signals that a tail call was emitted and
    // that the DAG root has already been updated.
    HasTailCall = true;

    // This block has no real continuation, so nothing can depend on vregs
    // being set on its behalf.
    PendingExports.clear();
  }

  if (!EHPadBB)
    return Result;

  // Label placed after the invoke to close the try range. It can also be
  // used to detect deletion of the invoke via the MachineModuleInfo.
  MCSymbol *TryEnd = MMI.getContext().createTempSymbol();
  DAG.setRoot(DAG.getEHLabel(getCurSDLoc(), getRoot(), TryEnd));

  // Report the range to the EH bookkeeping. Some platforms (e.g. wasm) use
  // funclet-style IR without actually using outlined funclets and their
  // LSDA info style, hence the additional hasEHFunclets() check.
  const EHPersonality Personality =
      classifyEHPersonality(FuncInfo.Fn->getPersonalityFn());
  if (MF.hasEHFunclets() && isFuncletEHPersonality(Personality)) {
    assert(CLI.CS);
    WinEHFuncInfo *WinEHInfo = MF.getWinEHFuncInfo();
    WinEHInfo->addIPToStateRange(cast<InvokeInst>(CLI.CS.getInstruction()),
                                 TryBegin, TryEnd);
  } else if (!isScopedEHPersonality(Personality)) {
    MF.addInvoke(FuncInfo.MBBMap[EHPadBB], TryBegin, TryEnd);
  }

  return Result;
}
/// Lower the call or invoke at \p CS to \p Callee. Builds the argument list,
/// clears \p isTailCall wherever a tail call is not permitted, hands the call
/// to lowerInvokable (forwarding \p EHPadBB), and records the returned value,
/// if any, for the call instruction. A swifterror argument is passed in, and
/// copied back out to, a virtual register when the target supports it.
void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
                                      bool isTailCall,
                                      const BasicBlock *EHPadBB) {
  auto &DL = DAG.getDataLayout();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  const Instruction *CallInstr = CS.getInstruction();
  FunctionType *FTy = CS.getFunctionType();
  Type *RetTy = CS.getType();

  TargetLowering::ArgListTy Args;
  Args.reserve(CS.arg_size());

  // Set to the swifterror argument of this call, if one is seen.
  const Value *SwiftErrorVal = nullptr;

  // Tail calls are not possible inside a function that has a swifterror
  // argument: lowering does not support it yet, as the value would have to be
  // moved into the swifterror register before the call.
  const Function *Caller = CallInstr->getParent()->getParent();
  if (TLI.supportSwiftError() &&
      Caller->getAttributes().hasAttrSomewhere(Attribute::SwiftError))
    isTailCall = false;

  const auto ArgBegin = CS.arg_begin();
  for (auto ArgIt = ArgBegin, ArgEnd = CS.arg_end(); ArgIt != ArgEnd;
       ++ArgIt) {
    const Value *ArgVal = *ArgIt;

    // Arguments of empty type are not passed at all.
    if (ArgVal->getType()->isEmptyTy())
      continue;

    TargetLowering::ArgListEntry Entry;
    Entry.Node = getValue(ArgVal);
    Entry.Ty = ArgVal->getType();
    Entry.setAttributes(&CS, ArgIt - ArgBegin);

    // A swifterror argument is fed to the call through its virtual register
    // rather than through the IR value itself.
    if (Entry.IsSwiftError && TLI.supportSwiftError()) {
      SwiftErrorVal = ArgVal;
      Entry.Node = DAG.getRegister(
          SwiftError.getOrCreateVRegUseAt(CallInstr, FuncInfo.MBB, ArgVal),
          EVT(TLI.getPointerTy(DL)));
    }

    Args.push_back(Entry);

    // An explicit sret argument that is an Instruction might point to
    // function-local memory, so a tail call would not be meaningful.
    if (Entry.IsSRet && isa<Instruction>(ArgVal))
      isTailCall = false;
  }

  // Apply the target-independent tail-call constraints. Target-dependent
  // ones are checked within TLI->LowerCallTo.
  isTailCall = isTailCall && isInTailCallPosition(CS, DAG.getTarget());

  // Targets have not been updated to support tail calls that carry a
  // swifterror argument, so disable them.
  if (TLI.supportSwiftError() && SwiftErrorVal)
    isTailCall = false;

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(getCurSDLoc())
      .setChain(getRoot())
      .setCallee(RetTy, FTy, Callee, std::move(Args), CS)
      .setTailCall(isTailCall)
      .setConvergent(CS.isConvergent());
  std::pair<SDValue, SDValue> Result = lowerInvokable(CLI, EHPadBB);

  // If the call produced a value, record it (after lowerRangeToAssertZExt)
  // as the value of the call instruction.
  if (Result.first.getNode()) {
    Result.first = lowerRangeToAssertZExt(DAG, *CallInstr, Result.first);
    setValue(CallInstr, Result.first);
  }

  // The swifterror return is the last element of CLI.InVals. Copy it into a
  // virtual register and update SwiftErrorMap for book-keeping.
  if (SwiftErrorVal && TLI.supportSwiftError()) {
    SDValue ErrorOut = CLI.InVals.back();
    unsigned ErrorVReg =
        SwiftError.getOrCreateVRegDefAt(CallInstr, FuncInfo.MBB, SwiftErrorVal);
    SDValue CopyOut =
        CLI.DAG.getCopyToReg(Result.second, CLI.DL, ErrorVReg, ErrorOut);
    DAG.setRoot(CopyOut);
  }
}
/// Produce the value of type \p LoadVT read from \p PtrVal. If the pointer is
/// a constant whose load can be folded, the folded constant is returned;
/// otherwise a load with alignment 1 is emitted through \p Builder.
static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,
                             SelectionDAGBuilder &Builder) {
  // A load from a constant pointer (e.g. into a string literal) may fold
  // away trivially.
  if (const auto *ConstPtr = dyn_cast<Constant>(PtrVal)) {
    // Build the integer (or integer-vector) IR type that matches LoadVT and
    // cast the pointer so that it points to that type.
    Type *FoldTy =
        Type::getIntNTy(PtrVal->getContext(), LoadVT.getScalarSizeInBits());
    if (LoadVT.isVector())
      FoldTy = VectorType::get(FoldTy, LoadVT.getVectorNumElements());

    Constant *TypedPtr = ConstantExpr::getBitCast(
        const_cast<Constant *>(ConstPtr), PointerType::getUnqual(FoldTy));
    if (const Constant *Folded =
            ConstantFoldLoadFromConstPtr(TypedPtr, FoldTy, *Builder.DL))
      return Builder.getValue(Folded);
  }

  // No fold was possible, so a real load is needed. Memory that alias
  // analysis reports as constant does not have to be ordered against
  // anything and can hang off the entry node; any other load is chained to
  // the current root, which keeps non-volatile loads unordered with respect
  // to each other.
  const bool IsConstMem =
      Builder.AA && Builder.AA->pointsToConstantMemory(PtrVal);
  SDValue Chain =
      IsConstMem ? Builder.DAG.getEntryNode() : Builder.DAG.getRoot();

  SDValue Addr = Builder.getValue(PtrVal);
  SDValue Loaded =
      Builder.DAG.getLoad(LoadVT, Builder.getCurSDLoc(), Chain, Addr,
                          MachinePointerInfo(PtrVal), /* Alignment = */ 1);

  // Queue the load's chain result with the other pending loads, unless it
  // read constant memory.
  if (!IsConstMem)
    Builder.PendingLoads.push_back(Loaded.getValue(1));
  return Loaded;
}
/// Record the value for an instruction that produces an integer result,
/// converting the type where necessary.
void SelectionDAGBuilder::processIntegerCallValue(const Instruction &I,
SDValue Value,
bool IsSigned) {
EVT VT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
I.getType(), true);
if (IsSigned)
Value = DAG.getSExtOrTrunc(Value, getCurSDLoc(), VT);
else
Value = DAG.getZExtOrTrunc(Value, getCurSDLoc(), VT);
setValue(&I, Value);
}
/// See if we can lower a memcmp call into an optimized form. If so, return
/// true and lower it. Otherwise return false, and it will be lowered like a
/// normal call.
/// The caller already checked that \p I calls the appropriate LibFunc with a
/// correct prototype.
///
/// Three strategies are tried in order: a constant zero length folds to the
/// constant 0, then the target's own memcmp expansion hook is consulted, and
/// finally a constant-size compare that is only used in zero-equality
/// comparisons is turned into a pair of loads and a SETNE.
bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
const Value *LHS = I.getArgOperand(0), *RHS = I.getArgOperand(1);
const Value *Size = I.getArgOperand(2);
const ConstantInt *CSize = dyn_cast<ConstantInt>(Size);
// A compare with a constant length of zero is lowered to the constant 0.
if (CSize && CSize->getZExtValue() == 0) {
EVT CallVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
I.getType(), true);
setValue(&I, DAG.getConstant(0, getCurSDLoc(), CallVT));
return true;
}
// Give the target a chance to emit its own expansion. A non-null first
// result is used as the call's (signed) value; the second result is queued on
// PendingLoads.
const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
std::pair<SDValue, SDValue> Res = TSI.EmitTargetCodeForMemcmp(
DAG, getCurSDLoc(), DAG.getRoot(), getValue(LHS), getValue(RHS),
getValue(Size), MachinePointerInfo(LHS), MachinePointerInfo(RHS));
if (Res.first.getNode()) {
processIntegerCallValue(I, Res.first, true);
PendingLoads.push_back(Res.second);
return true;
}
// memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS) != 0
// memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS) != 0
// The rewrite below needs a constant size and a result that is only ever
// compared against zero.
if (!CSize || !isOnlyUsedInZeroEqualityComparison(&I))
return false;
// If the target has a fast compare for the given size, it will return a
// preferred load type for that size. Require that the load VT is legal and
// that the target supports unaligned loads of that type. Otherwise, return
// INVALID.
auto hasFastLoadsAndCompare = [&](unsigned NumBits) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
MVT LVT = TLI.hasFastEqualityCompare(NumBits);
if (LVT != MVT::INVALID_SIMPLE_VALUE_TYPE) {
// TODO: Handle 5 byte compare as 4-byte + 1 byte.
// TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
// TODO: Check alignment of src and dest ptrs.
unsigned DstAS = LHS->getType()->getPointerAddressSpace();
unsigned SrcAS = RHS->getType()->getPointerAddressSpace();
if (!TLI.isTypeLegal(LVT) ||
!TLI.allowsMisalignedMemoryAccesses(LVT, SrcAS) ||
!TLI.allowsMisalignedMemoryAccesses(LVT, DstAS))
LVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
}
return LVT;
};
// This turns into unaligned loads. We only do this if the target natively
// supports the MVT we'll be loading or if it is small enough (<= 4) that
// we'll only produce a small number of byte loads.
MVT LoadVT;
unsigned NumBitsToCompare = CSize->getZExtValue() * 8;
// 2- and 4-byte compares always use i16/i32; 8-, 16- and 32-byte compares
// depend on the target reporting a usable load type. Any other size is
// rejected.
switch (NumBitsToCompare) {
default:
return false;
case 16:
LoadVT = MVT::i16;
break;
case 32:
LoadVT = MVT::i32;
break;
case 64:
case 128:
case 256:
LoadVT = hasFastLoadsAndCompare(NumBitsToCompare);
break;
}
if (LoadVT == MVT::INVALID_SIMPLE_VALUE_TYPE)
return false;
SDValue LoadL = getMemCmpLoad(LHS, LoadVT, *this);
SDValue LoadR = getMemCmpLoad(RHS, LoadVT, *this);
// Bitcast to a wide integer type if the loads are vectors.
if (LoadVT.isVector()) {
EVT CmpVT = EVT::getIntegerVT(LHS->getContext(), LoadVT.getSizeInBits());
LoadL = DAG.getBitcast(CmpVT, LoadL);
LoadR = DAG.getBitcast(CmpVT, LoadR);
}
// The i1 "not equal" result is zero-extended (or truncated) to the call's
// value type by processIntegerCallValue.
SDValue Cmp = DAG.getSetCC(getCurSDLoc(), MVT::i1, LoadL, LoadR, ISD::SETNE);
processIntegerCallValue(I, Cmp, false);
return true;
}
/// See if we can lower a memchr call into an optimized form. If so, return
/// true and lower it. Otherwise return false, and it will be lowered like a
/// normal call.
/// The caller already checked that \p I calls the appropriate LibFunc with a
/// correct prototype.
bool SelectionDAGBuilder::visitMemChrCall(const CallInst &I) {
const Value *Src = I.getArgOperand(0);
const Value *Char = I.getArgOperand(1);
const Value *Length = I.getArgOperand(2);
const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
std::pair<SDValue, SDValue> Res =
TSI.EmitTargetCodeForMemchr(DAG, getCurSDLoc(), DAG.getRoot(),
getValue(Src), getValue(Char), getValue(Length),
MachinePointerInfo(Src));
if (Res.first.getNode()) {
setValue(&I, Res.first);
PendingLoads.push_back(Res.second);
return true;
}
return false;
}
/// See if we can lower a mempcpy call into an optimized form. If so, return
/// true and lower it. Otherwise return false, and it will be lowered like a
/// normal call.
/// The caller already checked that \p I calls the appropriate LibFunc with a
/// correct prototype.
bool SelectionDAGBuilder::visitMemPCpyCall(const CallInst &I) {
SDValue Dst = getValue(I.getArgOperand(0));
SDValue Src = getValue(I.getArgOperand(1));
SDValue Size = getValue(I.getArgOperand(2));
unsigned DstAlign = DAG.InferPtrAlignment(Dst);
unsigned SrcAlign = DAG.InferPtrAlignment(Src);
unsigned Align = std::min(DstAlign, SrcAlign);
if (Align == 0) // Alignment of one or both could not be inferred.
Align = 1; // 0 and 1 both specify no alignment, but 0 is reserved.
bool isVol = false;
SDLoc sdl = getCurSDLoc();
// In the mempcpy context we need to pass in a false value for isTailCall
// because the return pointer needs to be adjusted by the size of
// the copied memory.
SDValue MC = DAG.getMemcpy(getRoot(), sdl, Dst, Src, Size, Align, isVol,
false, /*isTailCall=*/false,
MachinePointerInfo(I.getArgOperand(0)),
MachinePointerInfo(I.getArgOperand(1)));
assert(MC.getNode() != nullptr &&
"** memcpy should not be lowered as TailCall in mempcpy context **");
DAG.setRoot(MC);
// Check if Size needs to be truncated or extended.
Size = DAG.getSExtOrTrunc(Size, sdl, Dst.getValueType());
// Adjust return pointer to point just past the last dst byte.
SDValue DstPlusSize = DAG.getNode(ISD::ADD, sdl, Dst.getValueType(),
Dst, Size);
setValue(&I, DstPlusSize);
return true;
}
/// See if we can lower a strcpy call into an optimized form. If so, return
/// true and lower it, otherwise return false and it will be lowered like a
/// normal call.
/// The caller already checked that \p I calls the appropriate LibFunc with a
/// correct prototype.
bool SelectionDAGBuilder::visitStrCpyCall(const CallInst &I, bool isStpcpy) {
const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1);
const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
std::pair<SDValue, SDValue> Res =
TSI.EmitTargetCodeForStrcpy(DAG, getCurSDLoc(), getRoot(),
getValue(Arg0), getValue(Arg1),
MachinePointerInfo(Arg0),
MachinePointerInfo(Arg1), isStpcpy);
if (Res.first.getNode()) {
setValue(&I, Res.first);
DAG.setRoot(Res.second);
return true;
}
return false;
}
/// See if we can lower a strcmp call into an optimized form. If so, return
/// true and lower it, otherwise return false and it will be lowered like a
/// normal call.
/// The caller already checked that \p I calls the appropriate LibFunc with a
/// correct prototype.
bool SelectionDAGBuilder::visitStrCmpCall(const CallInst &I) {
const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1);
const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
std::pair<SDValue, SDValue> Res =
TSI.EmitTargetCodeForStrcmp(DAG, getCurSDLoc(), DAG.getRoot(),
getValue(Arg0), getValue(Arg1),
MachinePointerInfo(Arg0),
MachinePointerInfo(Arg1));
if (Res.first.getNode()) {
processIntegerCallValue(I, Res.first, true);
PendingLoads.push_back(Res.second);
return true;
}
return false;
}
/// See if we can lower a strlen call into an optimized form. If so, return
/// true and lower it, otherwise return false and it will be lowered like a
/// normal call.
/// The caller already checked that \p I calls the appropriate LibFunc with a
/// correct prototype.
bool SelectionDAGBuilder::visitStrLenCall(const CallInst &I) {
const Value *Arg0 = I.getArgOperand(0);
const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
std::pair<SDValue, SDValue> Res =
TSI.EmitTargetCodeForStrlen(DAG, getCurSDLoc(), DAG.getRoot(),
getValue(Arg0), MachinePointerInfo(Arg0));
if (Res.first.getNode()) {
processIntegerCallValue(I, Res.first, false);
PendingLoads.push_back(Res.second);
return true;
}
return false;
}
/// See if we can lower a strnlen call into an optimized form. If so, return
/// true and lower it, otherwise return false and it will be lowered like a
/// normal call.
/// The caller already checked that \p I calls the appropriate LibFunc with a
/// correct prototype.
bool SelectionDAGBuilder::visitStrNLenCall(const CallInst &I) {
const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1);
const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
std::pair<SDValue, SDValue> Res =
TSI.EmitTargetCodeForStrnlen(DAG, getCurSDLoc(), DAG.getRoot(),
getValue(Arg0), getValue(Arg1),
MachinePointerInfo(Arg0));
if (Res.first.getNode()) {
processIntegerCallValue(I, Res.first, false);
PendingLoads.push_back(Res.second);
return true;
}
return false;
}
/// See if we can lower a unary floating-point operation into an SDNode with
/// the specified Opcode. If so, return true and lower it, otherwise return
/// false and it will be lowered like a normal call.
/// The caller already checked that \p I calls the appropriate LibFunc with a
/// correct prototype.
bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I,
unsigned Opcode) {
// We already checked this call's prototype; verify it doesn't modify errno.
if (!I.onlyReadsMemory())
return false;
SDValue Tmp = getValue(I.getArgOperand(0));
setValue(&I, DAG.getNode(Opcode, getCurSDLoc(), Tmp.getValueType(), Tmp));
return true;
}
/// See if we can lower a binary floating-point operation into an SDNode with
/// the specified Opcode. If so, return true and lower it. Otherwise return
/// false, and it will be lowered like a normal call.
/// The caller already checked that \p I calls the appropriate LibFunc with a
/// correct prototype.
bool SelectionDAGBuilder::visitBinaryFloatCall(const CallInst &I,
unsigned Opcode) {
// We already checked this call's prototype; verify it doesn't modify errno.
if (!I.onlyReadsMemory())
return false;
SDValue Tmp0 = getValue(I.getArgOperand(0));
SDValue Tmp1 = getValue(I.getArgOperand(1));
EVT VT = Tmp0.getValueType();
setValue(&I, DAG.getNode(Opcode, getCurSDLoc(), VT, Tmp0, Tmp1));
return true;
}
/// Lower a call instruction.
///
/// Inline asm is forwarded to visitInlineAsm. Calls to declarations that
/// resolve to an intrinsic ID are forwarded to visitIntrinsicCall. Calls to
/// recognized library functions are offered to the specialized visit* helpers;
/// when a helper declines, or for every other call, the call is lowered
/// generically via LowerCallSiteWithDeoptBundle or LowerCallTo.
void SelectionDAGBuilder::visitCall(const CallInst &I) {
// Handle inline assembly differently.
if (isa<InlineAsm>(I.getCalledValue())) {
visitInlineAsm(&I);
return;
}
if (Function *F = I.getCalledFunction()) {
if (F->isDeclaration()) {
// Is this an LLVM intrinsic or a target-specific intrinsic?
unsigned IID = F->getIntrinsicID();
if (!IID)
if (const TargetIntrinsicInfo *II = TM.getIntrinsicInfo())
IID = II->getIntrinsicID(F);
if (IID) {
visitIntrinsicCall(I, IID);
return;
}
}
// Check for well-known libc/libm calls. If the function is internal, it
// can't be a library call. Don't do the check if marked as nobuiltin for
// some reason or the call site requires strict floating point semantics.
LibFunc Func;
if (!I.isNoBuiltin() && !I.isStrictFP() && !F->hasLocalLinkage() &&
F->hasName() && LibInfo->getLibFunc(*F, Func) &&
LibInfo->hasOptimizedCodeGen(Func)) {
// Each case returns when its specialized lowering succeeded; otherwise it
// breaks out of the switch and the call is lowered generically below.
switch (Func) {
default: break;
case LibFunc_copysign:
case LibFunc_copysignf:
case LibFunc_copysignl:
// We already checked this call's prototype; verify it doesn't modify
// errno.
if (I.onlyReadsMemory()) {
SDValue LHS = getValue(I.getArgOperand(0));
SDValue RHS = getValue(I.getArgOperand(1));
setValue(&I, DAG.getNode(ISD::FCOPYSIGN, getCurSDLoc(),
LHS.getValueType(), LHS, RHS));
return;
}
break;
case LibFunc_fabs:
case LibFunc_fabsf:
case LibFunc_fabsl:
if (visitUnaryFloatCall(I, ISD::FABS))
return;
break;
case LibFunc_fmin:
case LibFunc_fminf:
case LibFunc_fminl:
if (visitBinaryFloatCall(I, ISD::FMINNUM))
return;
break;
case LibFunc_fmax:
case LibFunc_fmaxf:
case LibFunc_fmaxl:
if (visitBinaryFloatCall(I, ISD::FMAXNUM))
return;
break;
case LibFunc_sin:
case LibFunc_sinf:
case LibFunc_sinl:
if (visitUnaryFloatCall(I, ISD::FSIN))
return;
break;
case LibFunc_cos:
case LibFunc_cosf:
case LibFunc_cosl:
if (visitUnaryFloatCall(I, ISD::FCOS))
return;
break;
case LibFunc_sqrt:
case LibFunc_sqrtf:
case LibFunc_sqrtl:
case LibFunc_sqrt_finite:
case LibFunc_sqrtf_finite:
case LibFunc_sqrtl_finite:
if (visitUnaryFloatCall(I, ISD::FSQRT))
return;
break;
case LibFunc_floor:
case LibFunc_floorf:
case LibFunc_floorl:
if (visitUnaryFloatCall(I, ISD::FFLOOR))
return;
break;
case LibFunc_nearbyint:
case LibFunc_nearbyintf:
case LibFunc_nearbyintl:
if (visitUnaryFloatCall(I, ISD::FNEARBYINT))
return;
break;
case LibFunc_ceil:
case LibFunc_ceilf:
case LibFunc_ceill:
if (visitUnaryFloatCall(I, ISD::FCEIL))
return;
break;
case LibFunc_rint:
case LibFunc_rintf:
case LibFunc_rintl:
if (visitUnaryFloatCall(I, ISD::FRINT))
return;
break;
case LibFunc_round:
case LibFunc_roundf:
case LibFunc_roundl:
if (visitUnaryFloatCall(I, ISD::FROUND))
return;
break;
case LibFunc_trunc:
case LibFunc_truncf:
case LibFunc_truncl:
if (visitUnaryFloatCall(I, ISD::FTRUNC))
return;
break;
case LibFunc_log2:
case LibFunc_log2f:
case LibFunc_log2l:
if (visitUnaryFloatCall(I, ISD::FLOG2))
return;
break;
case LibFunc_exp2:
case LibFunc_exp2f:
case LibFunc_exp2l:
if (visitUnaryFloatCall(I, ISD::FEXP2))
return;
break;
case LibFunc_memcmp:
if (visitMemCmpCall(I))
return;
break;
case LibFunc_mempcpy:
if (visitMemPCpyCall(I))
return;
break;
case LibFunc_memchr:
if (visitMemChrCall(I))
return;
break;
case LibFunc_strcpy:
if (visitStrCpyCall(I, false))
return;
break;
case LibFunc_stpcpy:
if (visitStrCpyCall(I, true))
return;
break;
case LibFunc_strcmp:
if (visitStrCmpCall(I))
return;
break;
case LibFunc_strlen:
if (visitStrLenCall(I))
return;
break;
case LibFunc_strnlen:
if (visitStrNLenCall(I))
return;
break;
}
}
}
// Deopt bundles are lowered in LowerCallSiteWithDeoptBundle, and we don't
// have to do anything here to lower funclet bundles.
assert(!I.hasOperandBundlesOtherThan(
{LLVMContext::OB_deopt, LLVMContext::OB_funclet}) &&
"Cannot lower calls with arbitrary operand bundles!");
SDValue Callee = getValue(I.getCalledValue());
if (I.countOperandBundlesOfType(LLVMContext::OB_deopt))
LowerCallSiteWithDeoptBundle(&I, Callee, nullptr);
else
// Check if we can potentially perform a tail call. More detailed checking
// is done within LowerCallTo, after more information about the call is
// known.
LowerCallTo(&I, Callee, I.isTailCall());
}
namespace {
/// AsmOperandInfo - This contains information for each constraint that we are
/// lowering.
class SDISelAsmOperandInfo : public TargetLowering::AsmOperandInfo {
public:
/// CallOperand - If this is the result output operand or a clobber
/// this is null, otherwise it is the incoming operand to the CallInst.
/// This gets modified as the asm is processed.
SDValue CallOperand;
/// AssignedRegs - If this is a register or register class operand, this
/// contains the set of register corresponding to the operand.
RegsForValue AssignedRegs;
explicit SDISelAsmOperandInfo(const TargetLowering::AsmOperandInfo &info)
: TargetLowering::AsmOperandInfo(info), CallOperand(nullptr, 0) {
}
/// Whether or not this operand accesses memory
bool hasMemory(const TargetLowering &TLI) const {
// Indirect operand accesses access memory.
if (isIndirect)
return true;
for (const auto &Code : Codes)
if (TLI.getConstraintType(Code) == TargetLowering::C_Memory)
return true;
return false;
}
/// getCallOperandValEVT - Return the EVT of the Value* that this operand
/// corresponds to. If there is no Value* for this operand, it returns
/// MVT::Other.
EVT getCallOperandValEVT(LLVMContext &Context, const TargetLowering &TLI,
const DataLayout &DL) const {
if (!CallOperandVal) return MVT::Other;
if (isa<BasicBlock>(CallOperandVal))
return TLI.getPointerTy(DL);
llvm::Type *OpTy = CallOperandVal->getType();
// FIXME: code duplicated from TargetLowering::ParseConstraints().
// If this is an indirect operand, the operand is a pointer to the
// accessed type.
if (isIndirect) {
PointerType *PtrTy = dyn_cast<PointerType>(OpTy);
if (!PtrTy)
report_fatal_error("Indirect operand for inline asm not a pointer!");
OpTy = PtrTy->getElementType();
}
// Look for vector wrapped in a struct. e.g. { <16 x i8> }.
if (StructType *STy = dyn_cast<StructType>(OpTy))
if (STy->getNumElements() == 1)
OpTy = STy->getElementType(0);
// If OpTy is not a single value, it may be a struct/union that we
// can tile with integers.
if (!OpTy->isSingleValueType() && OpTy->isSized()) {
unsigned BitSize = DL.getTypeSizeInBits(OpTy);
switch (BitSize) {
default: break;
case 1:
case 8:
case 16:
case 32:
case 64:
case 128:
OpTy = IntegerType::get(Context, BitSize);
break;
}
}
return TLI.getValueType(DL, OpTy, true);
}
};
using SDISelAsmOperandInfoVector = SmallVector<SDISelAsmOperandInfo, 16>;
} // end anonymous namespace
/// Make sure that the output operand \p OpInfo and its corresponding input
/// operand \p MatchingOpInfo have compatible constraint types (otherwise error
/// out).
static void patchMatchingInput(const SDISelAsmOperandInfo &OpInfo,
SDISelAsmOperandInfo &MatchingOpInfo,
SelectionDAG &DAG) {
if (OpInfo.ConstraintVT == MatchingOpInfo.ConstraintVT)
return;
const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
const auto &TLI = DAG.getTargetLoweringInfo();
std::pair<unsigned, const TargetRegisterClass *> MatchRC =
TLI.getRegForInlineAsmConstraint(TRI, OpInfo.ConstraintCode,
OpInfo.ConstraintVT);
std::pair<unsigned, const TargetRegisterClass *> InputRC =
TLI.getRegForInlineAsmConstraint(TRI, MatchingOpInfo.ConstraintCode,
MatchingOpInfo.ConstraintVT);
if ((OpInfo.ConstraintVT.isInteger() !=
MatchingOpInfo.ConstraintVT.isInteger()) ||
(MatchRC.second != InputRC.second)) {
// FIXME: error out in a more elegant fashion
report_fatal_error("Unsupported asm: input constraint"
" with a matching output constraint of"
" incompatible type!");
}
MatchingOpInfo.ConstraintVT = OpInfo.ConstraintVT;
}
/// Turn a direct memory input into something usable as an indirect operand by
/// replacing OpInfo.CallOperand with an address.
///
/// Float, integer and vector constants are placed in the constant pool; any
/// other value is stored to a fresh stack slot, which is why a \p Chain is
/// needed.
/// \return The (possibly updated) chain.
static SDValue getAddressForMemoryInput(SDValue Chain, const SDLoc &Location,
                                        SDISelAsmOperandInfo &OpInfo,
                                        SelectionDAG &DAG) {
  const TargetLowering &Lowering = DAG.getTargetLoweringInfo();

  // TODO: This isn't quite right. We need to handle these according to
  // the addressing mode that the constraint wants. Also, this may take
  // an additional register for the computation and we don't want that
  // either.

  const Value *Operand = OpInfo.CallOperandVal;
  const bool IsPoolableConstant =
      isa<ConstantFP>(Operand) || isa<ConstantInt>(Operand) ||
      isa<ConstantVector>(Operand) || isa<ConstantDataVector>(Operand);

  // Constants get their address from a constant pool entry; no store is
  // emitted, so the chain is returned unchanged.
  if (IsPoolableConstant) {
    OpInfo.CallOperand =
        DAG.getConstantPool(cast<Constant>(Operand),
                            Lowering.getPointerTy(DAG.getDataLayout()));
    return Chain;
  }

  // Everything else is spilled: create a stack object sized and aligned for
  // the operand's type and store the operand to it ahead of the asm.
  Type *OperandTy = Operand->getType();
  const DataLayout &Layout = DAG.getDataLayout();
  uint64_t SlotSize = Layout.getTypeAllocSize(OperandTy);
  unsigned SlotAlign = Layout.getPrefTypeAlignment(OperandTy);
  MachineFunction &MF = DAG.getMachineFunction();
  int FrameIdx = MF.getFrameInfo().CreateStackObject(SlotSize, SlotAlign, false);
  SDValue SlotAddr =
      DAG.getFrameIndex(FrameIdx, Lowering.getFrameIndexTy(Layout));

  Chain = DAG.getTruncStore(Chain, Location, OpInfo.CallOperand, SlotAddr,
                            MachinePointerInfo::getFixedStack(MF, FrameIdx),
                            Lowering.getMemValueType(Layout, OperandTy));
  OpInfo.CallOperand = SlotAddr;
  return Chain;
}
/// GetRegistersForValue - Assign registers (virtual or physical) for the
/// specified operand. We prefer to assign virtual registers, to allow the
/// register allocator to handle the assignment process. However, if the asm
/// uses features that we can't model on machineinstrs, we have SDISel do the
/// allocation. This produces generally horrible, but correct, code.
///
/// \param DAG       the DAG being built; used to create bitcast nodes and
///                  virtual registers.
/// \param DL        the location attached to any bitcast node created here.
/// \param OpInfo    describes the operand. Its ConstraintVT, CallOperand and
///                  AssignedRegs members may be updated.
/// \param RefOpInfo describes the matching operand if any, the operand
///                  otherwise.
///
/// OpInfo.AssignedRegs is left unassigned when no register can be provided:
/// for memory constraints, when the target returns no register class, for
/// matching input constraints, and when a requested physical register cannot
/// be found in (or runs off the end of) the register class.
static void GetRegistersForValue(SelectionDAG &DAG, const SDLoc &DL,
                                 SDISelAsmOperandInfo &OpInfo,
                                 SDISelAsmOperandInfo &RefOpInfo) {
  LLVMContext &Context = *DAG.getContext();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  MachineFunction &MF = DAG.getMachineFunction();
  SmallVector<unsigned, 4> Regs;
  const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();

  // No work to do for memory operations.
  if (OpInfo.ConstraintType == TargetLowering::C_Memory)
    return;

  // If this is a constraint for a single physreg, or a constraint for a
  // register class, find it.
  unsigned AssignedReg;
  const TargetRegisterClass *RC;
  std::tie(AssignedReg, RC) = TLI.getRegForInlineAsmConstraint(
      &TRI, RefOpInfo.ConstraintCode, RefOpInfo.ConstraintVT);
  // RC is unset only on failure. Return immediately.
  if (!RC)
    return;

  // Get the actual register value type. This is important, because the user
  // may have asked for (e.g.) the AX register in i32 type. We need to
  // remember that AX is actually i16 to get the right extension.
  const MVT RegVT = *TRI.legalclasstypes_begin(*RC);

  if (OpInfo.ConstraintVT != MVT::Other) {
    // If this is an FP operand in an integer register (or vice versa), or more
    // generally if the operand value disagrees with the register class we plan
    // to stick it in, fix the operand type.
    //
    // If this is an input value, the bitcast to the new type is done now.
    // Bitcast for output value is done at the end of visitInlineAsm().
    if ((OpInfo.Type == InlineAsm::isOutput ||
         OpInfo.Type == InlineAsm::isInput) &&
        !TRI.isTypeLegalForClass(*RC, OpInfo.ConstraintVT)) {
      // Try to convert to the first EVT that the reg class contains. If the
      // types are identical size, use a bitcast to convert (e.g. two differing
      // vector types). Note: output bitcast is done at the end of
      // visitInlineAsm().
      if (RegVT.getSizeInBits() == OpInfo.ConstraintVT.getSizeInBits()) {
        // Exclude indirect inputs while they are unsupported because the code
        // to perform the load is missing and thus OpInfo.CallOperand still
        // refers to the input address rather than the pointed-to value.
        if (OpInfo.Type == InlineAsm::isInput && !OpInfo.isIndirect)
          OpInfo.CallOperand =
              DAG.getNode(ISD::BITCAST, DL, RegVT, OpInfo.CallOperand);
        OpInfo.ConstraintVT = RegVT;
        // If the operand is an FP value and we want it in integer registers,
        // use the corresponding integer type. This turns an f64 value into
        // i64, which can be passed with two i32 values on a 32-bit machine.
      } else if (RegVT.isInteger() && OpInfo.ConstraintVT.isFloatingPoint()) {
        MVT VT = MVT::getIntegerVT(OpInfo.ConstraintVT.getSizeInBits());
        if (OpInfo.Type == InlineAsm::isInput)
          OpInfo.CallOperand =
              DAG.getNode(ISD::BITCAST, DL, VT, OpInfo.CallOperand);
        OpInfo.ConstraintVT = VT;
      }
    }
  }

  // No need to allocate a matching input constraint since the constraint it's
  // matching to has already been allocated.
  if (OpInfo.isMatchingInputConstraint())
    return;

  EVT ValueVT = OpInfo.ConstraintVT;
  if (OpInfo.ConstraintVT == MVT::Other)
    ValueVT = RegVT;

  // Initialize NumRegs.
  unsigned NumRegs = 1;
  if (OpInfo.ConstraintVT != MVT::Other)
    NumRegs = TLI.getNumRegisters(Context, OpInfo.ConstraintVT);

  // If this is a constraint for a specific physical register, like {r17},
  // assign it now.

  // If this associated to a specific register, initialize iterator to correct
  // place. If virtual, make sure we have enough registers

  // Initialize iterator if necessary
  TargetRegisterClass::iterator I = RC->begin();
  MachineRegisterInfo &RegInfo = MF.getRegInfo();

  // Do not check for single registers.
  if (AssignedReg) {
    // Test the iterator against end() before dereferencing it, so that a
    // register missing from RC cannot cause a read past the end of the class's
    // register list.
    while (I != RC->end() && *I != AssignedReg)
      ++I;
    assert(I != RC->end() && "AssignedReg should be member of RC");
    // With assertions compiled out, treat this like the !RC failure above:
    // return without assigning any registers.
    if (I == RC->end())
      return;
  }

  for (; NumRegs; --NumRegs, ++I) {
    assert(I != RC->end() && "Ran out of registers to allocate!");
    // Only the physical-register path dereferences I, so only that path needs
    // a bound check when assertions are compiled out.
    if (AssignedReg && I == RC->end())
      return;
    Register R = AssignedReg ? Register(*I) : RegInfo.createVirtualRegister(RC);
    Regs.push_back(R);
  }

  OpInfo.AssignedRegs = RegsForValue(Regs, RegVT, ValueVT);
}
/// Return the index in \p AsmNodeOperands of the flag word belonging to the
/// already-emitted operand number \p OperandNo.
///
/// Starting at InlineAsm::Op_FirstOperand, each preceding operand is skipped
/// by reading its flag word and stepping over the flag plus the number of
/// registers the flag records.
static unsigned
findMatchingInlineAsmOperand(unsigned OperandNo,
                             const std::vector<SDValue> &AsmNodeOperands) {
  unsigned Index = InlineAsm::Op_FirstOperand;
  for (unsigned Remaining = OperandNo; Remaining != 0; --Remaining) {
    // Decode the flag word of the operand being skipped.
    const unsigned Flag =
        cast<ConstantSDNode>(AsmNodeOperands[Index])->getZExtValue();
    assert((InlineAsm::isRegDefKind(Flag) ||
            InlineAsm::isRegDefEarlyClobberKind(Flag) ||
            InlineAsm::isMemKind(Flag)) &&
           "Skipped past definitions?");
    // Step over the flag word and the registers it covers.
    Index += InlineAsm::getNumOperandRegisters(Flag) + 1;
  }
  return Index;
}
namespace {

/// Accumulates the InlineAsm::Extra_* bits for one inline asm call site.
class ExtraFlags {
  unsigned Flags = 0;

public:
  /// Seed the flags from the properties of the inline asm and its call site:
  /// side effects, stack alignment, convergence and the asm dialect.
  explicit ExtraFlags(ImmutableCallSite CS) {
    const InlineAsm *Asm = cast<InlineAsm>(CS.getCalledValue());
    if (Asm->hasSideEffects())
      Flags |= InlineAsm::Extra_HasSideEffects;
    if (Asm->isAlignStack())
      Flags |= InlineAsm::Extra_IsAlignStack;
    if (CS.isConvergent())
      Flags |= InlineAsm::Extra_IsConvergent;
    Flags |= Asm->getDialect() * InlineAsm::Extra_AsmDialect;
  }

  /// Fold the MayLoad/MayStore implications of one constraint into the flags.
  void update(const TargetLowering::AsmOperandInfo &OpInfo) {
    // Ideally only memory constraints would matter here. The meaning of an
    // Other constraint can be target-specific and is hard to reason about, so
    // it is conservatively treated like a memory constraint.
    const bool MayTouchMemory =
        OpInfo.ConstraintType == TargetLowering::C_Memory ||
        OpInfo.ConstraintType == TargetLowering::C_Other;
    if (!MayTouchMemory)
      return;

    if (OpInfo.Type == InlineAsm::isInput) {
      Flags |= InlineAsm::Extra_MayLoad;
    } else if (OpInfo.Type == InlineAsm::isOutput) {
      Flags |= InlineAsm::Extra_MayStore;
    } else if (OpInfo.Type == InlineAsm::isClobber) {
      Flags |= (InlineAsm::Extra_MayLoad | InlineAsm::Extra_MayStore);
    }
  }

  /// Return the accumulated flag bits.
  unsigned get() const { return Flags; }
};

} // end anonymous namespace
/// visitInlineAsm - Handle a call to an InlineAsm object.
void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
const InlineAsm *IA = cast<InlineAsm>(CS.getCalledValue());
/// ConstraintOperands - Information about all of the constraints.
SDISelAsmOperandInfoVector ConstraintOperands;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
TargetLowering::AsmOperandInfoVector TargetConstraints = TLI.ParseConstraints(
DAG.getDataLayout(), DAG.getSubtarget().getRegisterInfo(), CS);
// First Pass: Calculate HasSideEffects and ExtraFlags (AlignStack,
// AsmDialect, MayLoad, MayStore).
bool HasSideEffect = IA->hasSideEffects();
ExtraFlags ExtraInfo(CS);
unsigned ArgNo = 0; // ArgNo - The argument of the CallInst.
unsigned ResNo = 0; // ResNo - The result number of the next output.
for (auto &T : TargetConstraints) {
ConstraintOperands.push_back(SDISelAsmOperandInfo(T));
SDISelAsmOperandInfo &OpInfo = ConstraintOperands.back();
// Compute the value type for each operand.
if (OpInfo.Type == InlineAsm::isInput ||
(OpInfo.Type == InlineAsm::isOutput && OpInfo.isIndirect)) {
OpInfo.CallOperandVal = const_cast<Value *>(CS.getArgument(ArgNo++));
// Process the call argument. BasicBlocks are labels, currently appearing
// only in asm's.
const Instruction *I = CS.getInstruction();
if (isa<CallBrInst>(I) &&
(ArgNo - 1) >= (cast<CallBrInst>(I)->getNumArgOperands() -
cast<CallBrInst>(I)->getNumIndirectDests())) {
const auto *BA = cast<BlockAddress>(OpInfo.CallOperandVal);
EVT VT = TLI.getValueType(DAG.getDataLayout(), BA->getType(), true);
OpInfo.CallOperand = DAG.getTargetBlockAddress(BA, VT);
} else if (const auto *BB = dyn_cast<BasicBlock>(OpInfo.CallOperandVal)) {
OpInfo.CallOperand = DAG.getBasicBlock(FuncInfo.MBBMap[BB]);
} else {
OpInfo.CallOperand = getValue(OpInfo.CallOperandVal);
}
OpInfo.ConstraintVT =
OpInfo
.getCallOperandValEVT(*DAG.getContext(), TLI, DAG.getDataLayout())
.getSimpleVT();
} else if (OpInfo.Type == InlineAsm::isOutput && !OpInfo.isIndirect) {
// The return value of the call is this value. As such, there is no
// corresponding argument.
assert(!CS.getType()->isVoidTy() && "Bad inline asm!");
if (StructType *STy = dyn_cast<StructType>(CS.getType())) {
OpInfo.ConstraintVT = TLI.getSimpleValueType(
DAG.getDataLayout(), STy->getElementType(ResNo));
} else {
assert(ResNo == 0 && "Asm only has one result!");
OpInfo.ConstraintVT =
TLI.getSimpleValueType(DAG.getDataLayout(), CS.getType());
}
++ResNo;
} else {
OpInfo.ConstraintVT = MVT::Other;
}
if (!HasSideEffect)
HasSideEffect = OpInfo.hasMemory(TLI);
// Determine if this InlineAsm MayLoad or MayStore based on the constraints.
// FIXME: Could we compute this on OpInfo rather than T?
// Compute the constraint code and ConstraintType to use.
TLI.ComputeConstraintToUse(T, SDValue());
+ if (T.ConstraintType == TargetLowering::C_Immediate &&
+ OpInfo.CallOperand && !isa<ConstantSDNode>(OpInfo.CallOperand))
+ // We've delayed emitting a diagnostic like the "n" constraint because
+ // inlining could cause an integer showing up.
+ return emitInlineAsmError(
+ CS, "constraint '" + Twine(T.ConstraintCode) + "' expects an "
+ "integer constant expression");
+
ExtraInfo.update(T);
}
// We won't need to flush pending loads if this asm doesn't touch
// memory and is nonvolatile.
SDValue Flag, Chain = (HasSideEffect) ? getRoot() : DAG.getRoot();
bool IsCallBr = isa<CallBrInst>(CS.getInstruction());
if (IsCallBr) {
// If this is a callbr we need to flush pending exports since inlineasm_br
// is a terminator. We need to do this before nodes are glued to
// the inlineasm_br node.
Chain = getControlRoot();
}
// Second pass over the constraints: compute which constraint option to use.
for (SDISelAsmOperandInfo &OpInfo : ConstraintOperands) {
// If this is an output operand with a matching input operand, look up the
// matching input. If their types mismatch, e.g. one is an integer, the
// other is floating point, or their sizes are different, flag it as an
// error.
if (OpInfo.hasMatchingInput()) {
SDISelAsmOperandInfo &Input = ConstraintOperands[OpInfo.MatchingInput];
patchMatchingInput(OpInfo, Input, DAG);
}
// Compute the constraint code and ConstraintType to use.
TLI.ComputeConstraintToUse(OpInfo, OpInfo.CallOperand, &DAG);
if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
OpInfo.Type == InlineAsm::isClobber)
continue;
// If this is a memory input, and if the operand is not indirect, do what we
// need to provide an address for the memory input.
if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
!OpInfo.isIndirect) {
assert((OpInfo.isMultipleAlternative ||
(OpInfo.Type == InlineAsm::isInput)) &&
"Can only indirectify direct input operands!");
// Memory operands really want the address of the value.
Chain = getAddressForMemoryInput(Chain, getCurSDLoc(), OpInfo, DAG);
// There is no longer a Value* corresponding to this operand.
OpInfo.CallOperandVal = nullptr;
// It is now an indirect operand.
OpInfo.isIndirect = true;
}
}
// AsmNodeOperands - The operands for the ISD::INLINEASM node.
std::vector<SDValue> AsmNodeOperands;
AsmNodeOperands.push_back(SDValue()); // reserve space for input chain
AsmNodeOperands.push_back(DAG.getTargetExternalSymbol(
IA->getAsmString().c_str(), TLI.getPointerTy(DAG.getDataLayout())));
// If we have a !srcloc metadata node associated with it, we want to attach
// this to the ultimately generated inline asm machineinstr. To do this, we
// pass in the third operand as this (potentially null) inline asm MDNode.
const MDNode *SrcLoc = CS.getInstruction()->getMetadata("srcloc");
AsmNodeOperands.push_back(DAG.getMDNode(SrcLoc));
// Remember the HasSideEffect, AlignStack, AsmDialect, MayLoad and MayStore
// bits as operand 3.
AsmNodeOperands.push_back(DAG.getTargetConstant(
ExtraInfo.get(), getCurSDLoc(), TLI.getPointerTy(DAG.getDataLayout())));
// Third pass: Loop over operands to prepare DAG-level operands.. As part of
// this, assign virtual and physical registers for inputs and otput.
for (SDISelAsmOperandInfo &OpInfo : ConstraintOperands) {
// Assign Registers.
SDISelAsmOperandInfo &RefOpInfo =
OpInfo.isMatchingInputConstraint()
? ConstraintOperands[OpInfo.getMatchedOperand()]
: OpInfo;
GetRegistersForValue(DAG, getCurSDLoc(), OpInfo, RefOpInfo);
switch (OpInfo.Type) {
case InlineAsm::isOutput:
if (OpInfo.ConstraintType == TargetLowering::C_Memory ||
- (OpInfo.ConstraintType == TargetLowering::C_Other &&
+ ((OpInfo.ConstraintType == TargetLowering::C_Immediate ||
+ OpInfo.ConstraintType == TargetLowering::C_Other) &&
OpInfo.isIndirect)) {
unsigned ConstraintID =
TLI.getInlineAsmMemConstraint(OpInfo.ConstraintCode);
assert(ConstraintID != InlineAsm::Constraint_Unknown &&
"Failed to convert memory constraint code to constraint id.");
// Add information to the INLINEASM node to know about this output.
unsigned OpFlags = InlineAsm::getFlagWord(InlineAsm::Kind_Mem, 1);
OpFlags = InlineAsm::getFlagWordForMem(OpFlags, ConstraintID);
AsmNodeOperands.push_back(DAG.getTargetConstant(OpFlags, getCurSDLoc(),
MVT::i32));
AsmNodeOperands.push_back(OpInfo.CallOperand);
break;
- } else if ((OpInfo.ConstraintType == TargetLowering::C_Other &&
+ } else if (((OpInfo.ConstraintType == TargetLowering::C_Immediate ||
+ OpInfo.ConstraintType == TargetLowering::C_Other) &&
!OpInfo.isIndirect) ||
OpInfo.ConstraintType == TargetLowering::C_Register ||
OpInfo.ConstraintType == TargetLowering::C_RegisterClass) {
// Otherwise, this outputs to a register (directly for C_Register /
- // C_RegisterClass, and a target-defined fashion for C_Other). Find a
- // register that we can use.
+ // C_RegisterClass, and a target-defined fashion for
+ // C_Immediate/C_Other). Find a register that we can use.
if (OpInfo.AssignedRegs.Regs.empty()) {
emitInlineAsmError(
CS, "couldn't allocate output register for constraint '" +
Twine(OpInfo.ConstraintCode) + "'");
return;
}
// Add information to the INLINEASM node to know that this register is
// set.
OpInfo.AssignedRegs.AddInlineAsmOperands(
OpInfo.isEarlyClobber ? InlineAsm::Kind_RegDefEarlyClobber
: InlineAsm::Kind_RegDef,
false, 0, getCurSDLoc(), DAG, AsmNodeOperands);
}
break;
case InlineAsm::isInput: {
SDValue InOperandVal = OpInfo.CallOperand;
if (OpInfo.isMatchingInputConstraint()) {
// If this is required to match an output register we have already set,
// just use its register.
auto CurOp = findMatchingInlineAsmOperand(OpInfo.getMatchedOperand(),
AsmNodeOperands);
unsigned OpFlag =
cast<ConstantSDNode>(AsmNodeOperands[CurOp])->getZExtValue();
if (InlineAsm::isRegDefKind(OpFlag) ||
InlineAsm::isRegDefEarlyClobberKind(OpFlag)) {
// Add (OpFlag&0xffff)>>3 registers to MatchedRegs.
if (OpInfo.isIndirect) {
// This happens on gcc/testsuite/gcc.dg/pr8788-1.c
emitInlineAsmError(CS, "inline asm not supported yet:"
" don't know how to handle tied "
"indirect register inputs");
return;
}
MVT RegVT = AsmNodeOperands[CurOp+1].getSimpleValueType();
SmallVector<unsigned, 4> Regs;
if (const TargetRegisterClass *RC = TLI.getRegClassFor(RegVT)) {
unsigned NumRegs = InlineAsm::getNumOperandRegisters(OpFlag);
MachineRegisterInfo &RegInfo =
DAG.getMachineFunction().getRegInfo();
for (unsigned i = 0; i != NumRegs; ++i)
Regs.push_back(RegInfo.createVirtualRegister(RC));
} else {
emitInlineAsmError(CS, "inline asm error: This value type register "
"class is not natively supported!");
return;
}
RegsForValue MatchedRegs(Regs, RegVT, InOperandVal.getValueType());
SDLoc dl = getCurSDLoc();
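// Use the produced MatchedRegs object to copy the input value into the new
// virtual registers and to add them as register-use operands tied to the
// matched operand.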
MatchedRegs.getCopyToRegs(InOperandVal, DAG, dl, Chain, &Flag,
CS.getInstruction());
MatchedRegs.AddInlineAsmOperands(InlineAsm::Kind_RegUse,
true, OpInfo.getMatchedOperand(), dl,
DAG, AsmNodeOperands);
break;
}
assert(InlineAsm::isMemKind(OpFlag) && "Unknown matching constraint!");
assert(InlineAsm::getNumOperandRegisters(OpFlag) == 1 &&
"Unexpected number of operands");
// Add information to the INLINEASM node to know about this input.
// See InlineAsm.h isUseOperandTiedToDef.
OpFlag = InlineAsm::convertMemFlagWordToMatchingFlagWord(OpFlag);
OpFlag = InlineAsm::getFlagWordForMatchingOp(OpFlag,
OpInfo.getMatchedOperand());
AsmNodeOperands.push_back(DAG.getTargetConstant(
OpFlag, getCurSDLoc(), TLI.getPointerTy(DAG.getDataLayout())));
AsmNodeOperands.push_back(AsmNodeOperands[CurOp+1]);
break;
}
// Treat indirect 'X' constraint as memory.
- if (OpInfo.ConstraintType == TargetLowering::C_Other &&
+ if ((OpInfo.ConstraintType == TargetLowering::C_Immediate ||
+ OpInfo.ConstraintType == TargetLowering::C_Other) &&
OpInfo.isIndirect)
OpInfo.ConstraintType = TargetLowering::C_Memory;
- if (OpInfo.ConstraintType == TargetLowering::C_Other) {
+ if (OpInfo.ConstraintType == TargetLowering::C_Immediate ||
+ OpInfo.ConstraintType == TargetLowering::C_Other) {
std::vector<SDValue> Ops;
TLI.LowerAsmOperandForConstraint(InOperandVal, OpInfo.ConstraintCode,
Ops, DAG);
if (Ops.empty()) {
+ if (OpInfo.ConstraintType == TargetLowering::C_Immediate)
+ if (isa<ConstantSDNode>(InOperandVal)) {
+ emitInlineAsmError(CS, "value out of range for constraint '" +
+ Twine(OpInfo.ConstraintCode) + "'");
+ return;
+ }
+
emitInlineAsmError(CS, "invalid operand for inline asm constraint '" +
Twine(OpInfo.ConstraintCode) + "'");
return;
}
// Add information to the INLINEASM node to know about this input.
unsigned ResOpType =
InlineAsm::getFlagWord(InlineAsm::Kind_Imm, Ops.size());
AsmNodeOperands.push_back(DAG.getTargetConstant(
ResOpType, getCurSDLoc(), TLI.getPointerTy(DAG.getDataLayout())));
AsmNodeOperands.insert(AsmNodeOperands.end(), Ops.begin(), Ops.end());
break;
}
if (OpInfo.ConstraintType == TargetLowering::C_Memory) {
assert(OpInfo.isIndirect && "Operand must be indirect to be a mem!");
assert(InOperandVal.getValueType() ==
TLI.getPointerTy(DAG.getDataLayout()) &&
"Memory operands expect pointer values");
unsigned ConstraintID =
TLI.getInlineAsmMemConstraint(OpInfo.ConstraintCode);
assert(ConstraintID != InlineAsm::Constraint_Unknown &&
"Failed to convert memory constraint code to constraint id.");
// Add information to the INLINEASM node to know about this input.
unsigned ResOpType = InlineAsm::getFlagWord(InlineAsm::Kind_Mem, 1);
ResOpType = InlineAsm::getFlagWordForMem(ResOpType, ConstraintID);
AsmNodeOperands.push_back(DAG.getTargetConstant(ResOpType,
getCurSDLoc(),
MVT::i32));
AsmNodeOperands.push_back(InOperandVal);
break;
}
assert((OpInfo.ConstraintType == TargetLowering::C_RegisterClass ||
- OpInfo.ConstraintType == TargetLowering::C_Register) &&
+ OpInfo.ConstraintType == TargetLowering::C_Register ||
+ OpInfo.ConstraintType == TargetLowering::C_Immediate) &&
"Unknown constraint type!");
// TODO: Support this.
if (OpInfo.isIndirect) {
emitInlineAsmError(
CS, "Don't know how to handle indirect register inputs yet "
"for constraint '" +
Twine(OpInfo.ConstraintCode) + "'");
return;
}
// Copy the input into the appropriate registers.
if (OpInfo.AssignedRegs.Regs.empty()) {
emitInlineAsmError(CS, "couldn't allocate input reg for constraint '" +
Twine(OpInfo.ConstraintCode) + "'");
return;
}
SDLoc dl = getCurSDLoc();
OpInfo.AssignedRegs.getCopyToRegs(InOperandVal, DAG, dl,
Chain, &Flag, CS.getInstruction());
OpInfo.AssignedRegs.AddInlineAsmOperands(InlineAsm::Kind_RegUse, false, 0,
dl, DAG, AsmNodeOperands);
break;
}
case InlineAsm::isClobber:
// Add the clobbered value to the operand list, so that the register
// allocator is aware that the physreg got clobbered.
if (!OpInfo.AssignedRegs.Regs.empty())
OpInfo.AssignedRegs.AddInlineAsmOperands(InlineAsm::Kind_Clobber,
false, 0, getCurSDLoc(), DAG,
AsmNodeOperands);
break;
}
}
// Finish up input operands. Set the input chain and add the flag last.
AsmNodeOperands[InlineAsm::Op_InputChain] = Chain;
if (Flag.getNode()) AsmNodeOperands.push_back(Flag);
unsigned ISDOpc = IsCallBr ? ISD::INLINEASM_BR : ISD::INLINEASM;
Chain = DAG.getNode(ISDOpc, getCurSDLoc(),
DAG.getVTList(MVT::Other, MVT::Glue), AsmNodeOperands);
Flag = Chain.getValue(1);
// Do additional work to generate outputs.
SmallVector<EVT, 1> ResultVTs;
SmallVector<SDValue, 1> ResultValues;
SmallVector<SDValue, 8> OutChains;
llvm::Type *CSResultType = CS.getType();
ArrayRef<Type *> ResultTypes;
if (StructType *StructResult = dyn_cast<StructType>(CSResultType))
ResultTypes = StructResult->elements();
else if (!CSResultType->isVoidTy())
ResultTypes = makeArrayRef(CSResultType);
auto CurResultType = ResultTypes.begin();
auto handleRegAssign = [&](SDValue V) {
assert(CurResultType != ResultTypes.end() && "Unexpected value");
assert((*CurResultType)->isSized() && "Unexpected unsized type");
EVT ResultVT = TLI.getValueType(DAG.getDataLayout(), *CurResultType);
++CurResultType;
// If the type of the inline asm call site return value is different but has
// same size as the type of the asm output bitcast it. One example of this
// is for vectors with different width / number of elements. This can
// happen for register classes that can contain multiple different value
// types. The preg or vreg allocated may not have the same VT as was
// expected.
//
// This can also happen for a return value that disagrees with the register
// class it is put in, eg. a double in a general-purpose register on a
// 32-bit machine.
if (ResultVT != V.getValueType() &&
ResultVT.getSizeInBits() == V.getValueSizeInBits())
V = DAG.getNode(ISD::BITCAST, getCurSDLoc(), ResultVT, V);
else if (ResultVT != V.getValueType() && ResultVT.isInteger() &&
V.getValueType().isInteger()) {
// If a result value was tied to an input value, the computed result
// may have a wider width than the expected result. Extract the
// relevant portion.
V = DAG.getNode(ISD::TRUNCATE, getCurSDLoc(), ResultVT, V);
}
assert(ResultVT == V.getValueType() && "Asm result value mismatch!");
ResultVTs.push_back(ResultVT);
ResultValues.push_back(V);
};
// Deal with output operands.
for (SDISelAsmOperandInfo &OpInfo : ConstraintOperands) {
if (OpInfo.Type == InlineAsm::isOutput) {
SDValue Val;
// Skip trivial output operands.
if (OpInfo.AssignedRegs.Regs.empty())
continue;
switch (OpInfo.ConstraintType) {
case TargetLowering::C_Register:
case TargetLowering::C_RegisterClass:
Val = OpInfo.AssignedRegs.getCopyFromRegs(
DAG, FuncInfo, getCurSDLoc(), Chain, &Flag, CS.getInstruction());
break;
+ case TargetLowering::C_Immediate:
case TargetLowering::C_Other:
Val = TLI.LowerAsmOutputForConstraint(Chain, Flag, getCurSDLoc(),
OpInfo, DAG);
break;
case TargetLowering::C_Memory:
break; // Already handled.
case TargetLowering::C_Unknown:
assert(false && "Unexpected unknown constraint");
}
// Indirect output manifest as stores. Record output chains.
if (OpInfo.isIndirect) {
const Value *Ptr = OpInfo.CallOperandVal;
assert(Ptr && "Expected value CallOperandVal for indirect asm operand");
SDValue Store = DAG.getStore(Chain, getCurSDLoc(), Val, getValue(Ptr),
MachinePointerInfo(Ptr));
OutChains.push_back(Store);
} else {
// generate CopyFromRegs to associated registers.
assert(!CS.getType()->isVoidTy() && "Bad inline asm!");
if (Val.getOpcode() == ISD::MERGE_VALUES) {
for (const SDValue &V : Val->op_values())
handleRegAssign(V);
} else
handleRegAssign(Val);
}
}
}
// Set results.
if (!ResultValues.empty()) {
assert(CurResultType == ResultTypes.end() &&
"Mismatch in number of ResultTypes");
assert(ResultValues.size() == ResultTypes.size() &&
"Mismatch in number of output operands in asm result");
SDValue V = DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
DAG.getVTList(ResultVTs), ResultValues);
setValue(CS.getInstruction(), V);
}
// Collect store chains.
if (!OutChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other, OutChains);
// Only Update Root if inline assembly has a memory effect.
if (ResultValues.empty() || HasSideEffect || !OutChains.empty() || IsCallBr)
DAG.setRoot(Chain);
}
/// Emit \p Message as an error attached to the inline asm call \p CS, then
/// define the call's result (if it has one) as UNDEF so that the DAG is left
/// in a valid state.
void SelectionDAGBuilder::emitInlineAsmError(ImmutableCallSite CS,
                                             const Twine &Message) {
  const Instruction *AsmInst = CS.getInstruction();
  DAG.getContext()->emitError(AsmInst, Message);

  // Make sure we leave the DAG in a valid state: work out which value types
  // the call's result is made of.
  SmallVector<EVT, 1> ResultVTs;
  ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(),
                  CS->getType(), ResultVTs);

  // A call without result values needs no placeholder.
  if (ResultVTs.empty())
    return;

  // One UNDEF per result value, merged into a single value for the call.
  SmallVector<SDValue, 1> Undefs;
  for (EVT VT : ResultVTs)
    Undefs.push_back(DAG.getUNDEF(VT));
  setValue(AsmInst, DAG.getMergeValues(Undefs, getCurSDLoc()));
}
/// Lower \p I to an ISD::VASTART node chained on the current root, and make
/// that node the new DAG root.
void SelectionDAGBuilder::visitVAStart(const CallInst &I) {
// Argument 0 is supplied twice: once as a lowered value and once as a source
// value.
DAG.setRoot(DAG.getNode(ISD::VASTART, getCurSDLoc(),
MVT::Other, getRoot(),
getValue(I.getArgOperand(0)),
DAG.getSrcValue(I.getArgOperand(0))));
}
/// Lower the va_arg instruction \p I to a VAArg node built by
/// SelectionDAG::getVAArg, update the DAG root, and record the loaded value
/// as the result of \p I.
void SelectionDAGBuilder::visitVAArg(const VAArgInst &I) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
const DataLayout &DL = DAG.getDataLayout();
// The node is created with the in-memory value type of the result and the
// ABI alignment of that type; operand 0 of the instruction is passed both as
// a lowered value and as a source value.
SDValue V = DAG.getVAArg(
TLI.getMemValueType(DAG.getDataLayout(), I.getType()), getCurSDLoc(),
getRoot(), getValue(I.getOperand(0)), DAG.getSrcValue(I.getOperand(0)),
DL.getABITypeAlignment(I.getType()));
// Result 1 of the node becomes the new root.
DAG.setRoot(V.getValue(1));
// For pointer results, extend or truncate from the memory value type to the
// regular value type of the pointer.
if (I.getType()->isPointerTy())
V = DAG.getPtrExtOrTrunc(
V, getCurSDLoc(), TLI.getValueType(DAG.getDataLayout(), I.getType()));
setValue(&I, V);
}
/// Lower \p I to an ISD::VAEND node chained on the current root, and make
/// that node the new DAG root.
void SelectionDAGBuilder::visitVAEnd(const CallInst &I) {
// Argument 0 is supplied twice: once as a lowered value and once as a source
// value.
DAG.setRoot(DAG.getNode(ISD::VAEND, getCurSDLoc(),
MVT::Other, getRoot(),
getValue(I.getArgOperand(0)),
DAG.getSrcValue(I.getArgOperand(0))));
}
/// Lower \p I to an ISD::VACOPY node chained on the current root, and make
/// that node the new DAG root.
void SelectionDAGBuilder::visitVACopy(const CallInst &I) {
// Operand layout after the chain: the lowered values of arguments 0 and 1,
// followed by the source values of arguments 0 and 1.
DAG.setRoot(DAG.getNode(ISD::VACOPY, getCurSDLoc(),
MVT::Other, getRoot(),
getValue(I.getArgOperand(0)),
getValue(I.getArgOperand(1)),
DAG.getSrcValue(I.getArgOperand(0)),
DAG.getSrcValue(I.getArgOperand(1))));
}
/// If \p I carries !range metadata describing an unsigned range that starts
/// at zero, wrap result 0 of \p Op in an AssertZext node sized to the range's
/// largest value. Otherwise \p Op is returned unchanged.
///
/// When the node behind \p Op produces several values, the remaining values
/// are passed through untouched and everything is merged back together.
SDValue SelectionDAGBuilder::lowerRangeToAssertZExt(SelectionDAG &DAG,
                                                    const Instruction &I,
                                                    SDValue Op) {
  const MDNode *RangeMD = I.getMetadata(LLVMContext::MD_range);
  if (!RangeMD)
    return Op;

  // Full, empty and upper-wrapped ranges are left alone.
  ConstantRange Range = getConstantRangeFromMetadata(*RangeMD);
  if (CR_isUnusable(Range))
    return Op;

  // Only ranges whose unsigned minimum is the minimum value are handled.
  APInt UMin = Range.getUnsignedMin();
  if (!UMin.isMinValue())
    return Op;

  // Width needed to hold the unsigned maximum, but never narrower than the
  // smallest integer type.
  APInt UMax = Range.getUnsignedMax();
  unsigned Width = std::max(UMax.getActiveBits(),
                            static_cast<unsigned>(IntegerType::MIN_INT_BITS));
  EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Width);

  SDLoc Loc = getCurSDLoc();
  SDValue NarrowVTNode = DAG.getValueType(NarrowVT);
  SDValue Asserted =
      DAG.getNode(ISD::AssertZext, Loc, Op.getValueType(), Op, NarrowVTNode);

  unsigned NumResults = Op.getNode()->getNumValues();
  if (NumResults == 1)
    return Asserted;

  // Replace only the first result; forward the others as they are.
  SmallVector<SDValue, 4> Merged;
  Merged.push_back(Asserted);
  for (unsigned ResNo = 1; ResNo != NumResults; ++ResNo)
    Merged.push_back(Op.getValue(ResNo));
  return DAG.getMergeValues(Merged, Loc);
}
/// Populate a CallLoweringInfo (into \p CLI) based on the properties of
/// the call being lowered.
///
/// This is a helper for lowering intrinsics that follow a target calling
/// convention or require stack pointer adjustment. Only a subset of the
/// intrinsic's operands need to participate in the calling convention: the
/// \p NumArgs operands of \p Call starting at operand index \p ArgIdx.
void SelectionDAGBuilder::populateCallLoweringInfo(
    TargetLowering::CallLoweringInfo &CLI, const CallBase *Call,
    unsigned ArgIdx, unsigned NumArgs, SDValue Callee, Type *ReturnTy,
    bool IsPatchPoint) {
  TargetLowering::ArgListTy ArgList;
  ArgList.reserve(NumArgs);

  // Build one argument list entry per participating operand, taking the
  // entry's attributes from the call at the same operand index.
  const unsigned EndIdx = ArgIdx + NumArgs;
  for (unsigned OpIdx = ArgIdx; OpIdx != EndIdx; ++OpIdx) {
    const Value *V = Call->getOperand(OpIdx);
    assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic.");

    TargetLowering::ArgListEntry Arg;
    Arg.Node = getValue(V);
    Arg.Ty = V->getType();
    Arg.setAttributes(Call, OpIdx);
    ArgList.push_back(Arg);
  }

  // The result is discarded when the call has no uses.
  CLI.setDebugLoc(getCurSDLoc())
      .setChain(getRoot())
      .setCallee(Call->getCallingConv(), ReturnTy, Callee, std::move(ArgList))
      .setDiscardResult(Call->use_empty())
      .setIsPatchPoint(IsPatchPoint);
}
/// Add a stack map intrinsic call's live variable operands to a stackmap
/// or patchpoint target node's operand list.
///
/// The live variables are the arguments of \p CS from index \p StartIdx
/// onwards; they are appended to \p Ops.
///
/// Constants are converted to TargetConstants purely as an optimization to
/// avoid constant materialization and register allocation.
///
/// FrameIndex operands are converted to TargetFrameIndex so that ISEL does not
/// generate address computation nodes, and so FinalizeISel can convert the
/// TargetFrameIndex into a DirectMemRefOp StackMap location. This avoids
/// address materialization and register allocation, but may also be required
/// for correctness. If a StackMap (or PatchPoint) intrinsic directly uses an
/// alloca in the entry block, then the runtime may assume that the alloca's
/// StackMap location can be read immediately after compilation and that the
/// location is valid at any point during execution (this is similar to the
/// assumption made by the llvm.gcroot intrinsic). If the alloca's location were
/// only available in a register, then the runtime would need to trap when
/// execution reaches the StackMap in order to read the alloca's location.
static void addStackMapLiveVars(ImmutableCallSite CS, unsigned StartIdx,
                                const SDLoc &DL, SmallVectorImpl<SDValue> &Ops,
                                SelectionDAGBuilder &Builder) {
  SelectionDAG &DAG = Builder.DAG;
  const unsigned NumArgs = CS.arg_size();

  for (unsigned ArgNo = StartIdx; ArgNo != NumArgs; ++ArgNo) {
    SDValue LiveVar = Builder.getValue(CS.getArgument(ArgNo));

    // Constant: a ConstantOp marker followed by the sign-extended value, both
    // as i64 target constants.
    if (auto *Const = dyn_cast<ConstantSDNode>(LiveVar)) {
      Ops.push_back(DAG.getTargetConstant(StackMaps::ConstantOp, DL, MVT::i64));
      Ops.push_back(DAG.getTargetConstant(Const->getSExtValue(), DL, MVT::i64));
      continue;
    }

    // Frame index: re-emit as a target frame index of the target's frame
    // index type.
    if (auto *FrameIdx = dyn_cast<FrameIndexSDNode>(LiveVar)) {
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      Ops.push_back(DAG.getTargetFrameIndex(
          FrameIdx->getIndex(), TLI.getFrameIndexTy(DAG.getDataLayout())));
      continue;
    }

    // Anything else is passed through unchanged.
    Ops.push_back(LiveVar);
  }
}
/// Lower llvm.experimental.stackmap directly to its target opcode.
void SelectionDAGBuilder::visitStackmap(const CallInst &CI) {
// void @llvm.experimental.stackmap(i32 <id>, i32 <numShadowBytes>,
// [live variables...])
assert(CI.getType()->isVoidTy() && "Stackmap cannot return a value.");
SDValue Chain, InFlag, Callee, NullPtr;
SmallVector<SDValue, 32> Ops;
SDLoc DL = getCurSDLoc();
Callee = getValue(CI.getCalledValue());
NullPtr = DAG.getIntPtrConstant(0, DL, true);
// The stackmap intrinsic only records the live variables (the arguemnts
// passed to it) and emits NOPS (if requested). Unlike the patchpoint
// intrinsic, this won't be lowered to a function call. This means we don't
// have to worry about calling conventions and target specific lowering code.
// Instead we perform the call lowering right here.
//
// chain, flag = CALLSEQ_START(chain, 0, 0)
// chain, flag = STACKMAP(id, nbytes, ..., chain, flag)
// chain, flag = CALLSEQ_END(chain, 0, 0, flag)
//
Chain = DAG.getCALLSEQ_START(getRoot(), 0, 0, DL);
InFlag = Chain.getValue(1);
// Add the <id> and <numBytes> constants.
SDValue IDVal = getValue(CI.getOperand(PatchPointOpers::IDPos));
Ops.push_back(DAG.getTargetConstant(
cast<ConstantSDNode>(IDVal)->getZExtValue(), DL, MVT::i64));
SDValue NBytesVal = getValue(CI.getOperand(PatchPointOpers::NBytesPos));
Ops.push_back(DAG.getTargetConstant(
cast<ConstantSDNode>(NBytesVal)->getZExtValue(), DL,
MVT::i32));
// Push live variables for the stack map.
addStackMapLiveVars(&CI, 2, DL, Ops, *this);
// We are not pushing any register mask info here on the operands list,
// because the stackmap doesn't clobber anything.
// Push the chain and the glue flag.
Ops.push_back(Chain);
Ops.push_back(InFlag);
// Create the STACKMAP node.
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
SDNode *SM = DAG.getMachineNode(TargetOpcode::STACKMAP, DL, NodeTys, Ops);
Chain = SDValue(SM, 0);
InFlag = Chain.getValue(1);
Chain = DAG.getCALLSEQ_END(Chain, NullPtr, NullPtr, InFlag, DL);
// Stackmaps don't generate values, so nothing goes into the NodeMap.
// Set the root to the target-lowered call chain.
DAG.setRoot(Chain);
// Inform the Frame Information that we have a stackmap in this function.
FuncInfo.MF->getFrameInfo().setHasStackMap();
}
/// Lower llvm.experimental.patchpoint directly to its target opcode.
///
/// The call \p CS is first lowered like an ordinary call (via
/// populateCallLoweringInfo and lowerInvokable, to which \p EHPadBB is
/// forwarded); the resulting target call node is then replaced by a
/// TargetOpcode::PATCHPOINT machine node built from its operands.
void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS,
const BasicBlock *EHPadBB) {
// void|i64 @llvm.experimental.patchpoint.void|i64(i64 <id>,
// i32 <numBytes>,
// i8* <target>,
// i32 <numArgs>,
// [Args...],
// [live variables...])
CallingConv::ID CC = CS.getCallingConv();
bool IsAnyRegCC = CC == CallingConv::AnyReg;
bool HasDef = !CS->getType()->isVoidTy();
SDLoc dl = getCurSDLoc();
SDValue Callee = getValue(CS->getOperand(PatchPointOpers::TargetPos));
// Handle immediate and symbolic callees.
if (auto* ConstCallee = dyn_cast<ConstantSDNode>(Callee))
Callee = DAG.getIntPtrConstant(ConstCallee->getZExtValue(), dl,
/*isTarget=*/true);
else if (auto* SymbolicCallee = dyn_cast<GlobalAddressSDNode>(Callee))
Callee = DAG.getTargetGlobalAddress(SymbolicCallee->getGlobal(),
SDLoc(SymbolicCallee),
SymbolicCallee->getValueType(0));
// Get the real number of arguments participating in the call <numArgs>
SDValue NArgVal = getValue(CS.getArgument(PatchPointOpers::NArgPos));
unsigned NumArgs = cast<ConstantSDNode>(NArgVal)->getZExtValue();
// Skip the four meta args: <id>, <numNopBytes>, <target>, <numArgs>
// Intrinsics include all meta-operands up to but not including CC.
unsigned NumMetaOpers = PatchPointOpers::CCPos;
assert(CS.arg_size() >= NumMetaOpers + NumArgs &&
"Not enough arguments provided to the patchpoint intrinsic");
// For AnyRegCC the arguments are lowered later on manually.
unsigned NumCallArgs = IsAnyRegCC ? 0 : NumArgs;
// With AnyRegCC the call is lowered as returning void; any result is taken
// from the PATCHPOINT node itself further down.
Type *ReturnTy =
IsAnyRegCC ? Type::getVoidTy(*DAG.getContext()) : CS->getType();
TargetLowering::CallLoweringInfo CLI(DAG);
populateCallLoweringInfo(CLI, cast<CallBase>(CS.getInstruction()),
NumMetaOpers, NumCallArgs, Callee, ReturnTy, true);
std::pair<SDValue, SDValue> Result = lowerInvokable(CLI, EHPadBB);
SDNode *CallEnd = Result.second.getNode();
// When the call defines a value, the returned node may be a CopyFromReg;
// step to its operand 0 to reach the end of the call sequence.
if (HasDef && (CallEnd->getOpcode() == ISD::CopyFromReg))
CallEnd = CallEnd->getOperand(0).getNode();
/// Get a call instruction from the call sequence chain.
/// Tail calls are not allowed.
assert(CallEnd->getOpcode() == ISD::CALLSEQ_END &&
"Expected a callseq node.");
SDNode *Call = CallEnd->getOperand(0).getNode();
bool HasGlue = Call->getGluedNode();
// Replace the target specific call node with the patchable intrinsic.
SmallVector<SDValue, 8> Ops;
// Add the <id> and <numBytes> constants.
SDValue IDVal = getValue(CS->getOperand(PatchPointOpers::IDPos));
Ops.push_back(DAG.getTargetConstant(
cast<ConstantSDNode>(IDVal)->getZExtValue(), dl, MVT::i64));
SDValue NBytesVal = getValue(CS->getOperand(PatchPointOpers::NBytesPos));
Ops.push_back(DAG.getTargetConstant(
cast<ConstantSDNode>(NBytesVal)->getZExtValue(), dl,
MVT::i32));
// Add the callee.
Ops.push_back(Callee);
// Adjust <numArgs> to account for any arguments that have been passed on the
// stack instead.
// Call Node: Chain, Target, {Args}, RegMask, [Glue]
// Subtract the non-argument operands listed above: three of them, or four
// when the glue operand is present.
unsigned NumCallRegArgs = Call->getNumOperands() - (HasGlue ? 4 : 3);
NumCallRegArgs = IsAnyRegCC ? NumArgs : NumCallRegArgs;
Ops.push_back(DAG.getTargetConstant(NumCallRegArgs, dl, MVT::i32));
// Add the calling convention
Ops.push_back(DAG.getTargetConstant((unsigned)CC, dl, MVT::i32));
// Add the arguments we omitted previously. The register allocator should
// place these in any free register.
if (IsAnyRegCC)
for (unsigned i = NumMetaOpers, e = NumMetaOpers + NumArgs; i != e; ++i)
Ops.push_back(getValue(CS.getArgument(i)));
// Push the arguments from the call instruction up to the register mask.
// op_begin() + 2 skips the chain and the target; the end iterator stops just
// before the register mask.
SDNode::op_iterator e = HasGlue ? Call->op_end()-2 : Call->op_end()-1;
Ops.append(Call->op_begin() + 2, e);
// Push live variables for the stack map.
addStackMapLiveVars(CS, NumMetaOpers + NumArgs, dl, Ops, *this);
// Push the register mask info.
if (HasGlue)
Ops.push_back(*(Call->op_end()-2));
else
Ops.push_back(*(Call->op_end()-1));
// Push the chain (this is originally the first operand of the call, but
// becomes now the last or second to last operand).
Ops.push_back(*(Call->op_begin()));
// Push the glue flag (last operand).
if (HasGlue)
Ops.push_back(*(Call->op_end()-1));
SDVTList NodeTys;
if (IsAnyRegCC && HasDef) {
// Create the return types based on the intrinsic definition
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SmallVector<EVT, 3> ValueVTs;
ComputeValueVTs(TLI, DAG.getDataLayout(), CS->getType(), ValueVTs);
assert(ValueVTs.size() == 1 && "Expected only one return value type.");
// There is always a chain and a glue type at the end
ValueVTs.push_back(MVT::Other);
ValueVTs.push_back(MVT::Glue);
NodeTys = DAG.getVTList(ValueVTs);
} else
NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
// Replace the target specific call node with a PATCHPOINT node.
MachineSDNode *MN = DAG.getMachineNode(TargetOpcode::PATCHPOINT,
dl, NodeTys, Ops);
// Update the NodeMap.
// With AnyRegCC the value is result 0 of the PATCHPOINT node; otherwise it is
// the value produced by the regular call lowering.
if (HasDef) {
if (IsAnyRegCC)
setValue(CS.getInstruction(), SDValue(MN, 0));
else
setValue(CS.getInstruction(), Result.first);
}
// Fixup the consumers of the intrinsic. The chain and glue may be used in the
// call sequence. Furthermore the location of the chain and glue can change
// when the AnyReg calling convention is used and the intrinsic returns a
// value.
if (IsAnyRegCC && HasDef) {
// Results 0 and 1 of the call map to results 1 and 2 of the PATCHPOINT
// node, because its result 0 is the returned value.
SDValue From[] = {SDValue(Call, 0), SDValue(Call, 1)};
SDValue To[] = {SDValue(MN, 1), SDValue(MN, 2)};
DAG.ReplaceAllUsesOfValuesWith(From, To, 2);
} else
DAG.ReplaceAllUsesWith(Call, MN);
// All uses have been redirected, so the original call node can be deleted.
DAG.DeleteNode(Call);
// Inform the Frame Information that we have a patchpoint in this function.
FuncInfo.MF->getFrameInfo().setHasPatchPoint();
}
void SelectionDAGBuilder::visitVectorReduce(const CallInst &I,
unsigned Intrinsic) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue Op1 = getValue(I.getArgOperand(0));
SDValue Op2;
if (I.getNumArgOperands() > 1)
Op2 = getValue(I.getArgOperand(1));
SDLoc dl = getCurSDLoc();
EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
SDValue Res;
FastMathFlags FMF;
if (isa<FPMathOperator>(I))
FMF = I.getFastMathFlags();
switch (Intrinsic) {
case Intrinsic::experimental_vector_reduce_v2_fadd:
if (FMF.allowReassoc())
Res = DAG.getNode(ISD::FADD, dl, VT, Op1,
DAG.getNode(ISD::VECREDUCE_FADD, dl, VT, Op2));
else
Res = DAG.getNode(ISD::VECREDUCE_STRICT_FADD, dl, VT, Op1, Op2);
break;
case Intrinsic::experimental_vector_reduce_v2_fmul:
if (FMF.allowReassoc())
Res = DAG.getNode(ISD::FMUL, dl, VT, Op1,
DAG.getNode(ISD::VECREDUCE_FMUL, dl, VT, Op2));
else
Res = DAG.getNode(ISD::VECREDUCE_STRICT_FMUL, dl, VT, Op1, Op2);
break;
case Intrinsic::experimental_vector_reduce_add:
Res = DAG.getNode(ISD::VECREDUCE_ADD, dl, VT, Op1);
break;
case Intrinsic::experimental_vector_reduce_mul:
Res = DAG.getNode(ISD::VECREDUCE_MUL, dl, VT, Op1);
break;
case Intrinsic::experimental_vector_reduce_and:
Res = DAG.getNode(ISD::VECREDUCE_AND, dl, VT, Op1);
break;
case Intrinsic::experimental_vector_reduce_or:
Res = DAG.getNode(ISD::VECREDUCE_OR, dl, VT, Op1);
break;
case Intrinsic::experimental_vector_reduce_xor:
Res = DAG.getNode(ISD::VECREDUCE_XOR, dl, VT, Op1);
break;
case Intrinsic::experimental_vector_reduce_smax:
Res = DAG.getNode(ISD::VECREDUCE_SMAX, dl, VT, Op1);
break;
case Intrinsic::experimental_vector_reduce_smin:
Res = DAG.getNode(ISD::VECREDUCE_SMIN, dl, VT, Op1);
break;
case Intrinsic::experimental_vector_reduce_umax:
Res = DAG.getNode(ISD::VECREDUCE_UMAX, dl, VT, Op1);
break;
case Intrinsic::experimental_vector_reduce_umin:
Res = DAG.getNode(ISD::VECREDUCE_UMIN, dl, VT, Op1);
break;
case Intrinsic::experimental_vector_reduce_fmax:
Res = DAG.getNode(ISD::VECREDUCE_FMAX, dl, VT, Op1);
break;
case Intrinsic::experimental_vector_reduce_fmin:
Res = DAG.getNode(ISD::VECREDUCE_FMIN, dl, VT, Op1);
break;
default:
llvm_unreachable("Unhandled vector reduce intrinsic");
}
setValue(&I, Res);
}
/// Returns an AttributeList representing the attributes applied to the return
/// value of the given call.
///
/// The list is attached at the return index and contains SExt, ZExt and InReg
/// (in that order) for whichever of RetSExt, RetZExt and IsInReg are set in
/// \p CLI.
static AttributeList getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) {
  SmallVector<Attribute::AttrKind, 2> Kinds;
  auto AddIf = [&Kinds](bool Wanted, Attribute::AttrKind Kind) {
    if (Wanted)
      Kinds.push_back(Kind);
  };

  AddIf(CLI.RetSExt, Attribute::SExt);
  AddIf(CLI.RetZExt, Attribute::ZExt);
  AddIf(CLI.IsInReg, Attribute::InReg);

  LLVMContext &Ctx = CLI.RetTy->getContext();
  return AttributeList::get(Ctx, AttributeList::ReturnIndex, Kinds);
}
/// TargetLowering::LowerCallTo - This is the default LowerCallTo
/// implementation, which just calls LowerCall.
/// FIXME: When all targets are
/// migrated to using LowerCall, this hook should be integrated into SDISel.
std::pair<SDValue, SDValue>
TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
// Handle the incoming return values from the call.
CLI.Ins.clear();
Type *OrigRetTy = CLI.RetTy;
SmallVector<EVT, 4> RetTys;
SmallVector<uint64_t, 4> Offsets;
auto &DL = CLI.DAG.getDataLayout();
ComputeValueVTs(*this, DL, CLI.RetTy, RetTys, &Offsets);
if (CLI.IsPostTypeLegalization) {
// If we are lowering a libcall after legalization, split the return type.
SmallVector<EVT, 4> OldRetTys;
SmallVector<uint64_t, 4> OldOffsets;
RetTys.swap(OldRetTys);
Offsets.swap(OldOffsets);
for (size_t i = 0, e = OldRetTys.size(); i != e; ++i) {
EVT RetVT = OldRetTys[i];
uint64_t Offset = OldOffsets[i];
MVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), RetVT);
unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), RetVT);
unsigned RegisterVTByteSZ = RegisterVT.getSizeInBits() / 8;
RetTys.append(NumRegs, RegisterVT);
for (unsigned j = 0; j != NumRegs; ++j)
Offsets.push_back(Offset + j * RegisterVTByteSZ);
}
}
SmallVector<ISD::OutputArg, 4> Outs;
GetReturnInfo(CLI.CallConv, CLI.RetTy, getReturnAttrs(CLI), Outs, *this, DL);
bool CanLowerReturn =
this->CanLowerReturn(CLI.CallConv, CLI.DAG.getMachineFunction(),
CLI.IsVarArg, Outs, CLI.RetTy->getContext());
SDValue DemoteStackSlot;
int DemoteStackIdx = -100;
if (!CanLowerReturn) {
// FIXME: equivalent assert?
// assert(!CS.hasInAllocaArgument() &&
// "sret demotion is incompatible with inalloca");
uint64_t TySize = DL.getTypeAllocSize(CLI.RetTy);
unsigned Align = DL.getPrefTypeAlignment(CLI.RetTy);
MachineFunction &MF = CLI.DAG.getMachineFunction();
DemoteStackIdx = MF.getFrameInfo().CreateStackObject(TySize, Align, false);
Type *StackSlotPtrType = PointerType::get(CLI.RetTy,
DL.getAllocaAddrSpace());
DemoteStackSlot = CLI.DAG.getFrameIndex(DemoteStackIdx, getFrameIndexTy(DL));
ArgListEntry Entry;
Entry.Node = DemoteStackSlot;
Entry.Ty = StackSlotPtrType;
Entry.IsSExt = false;
Entry.IsZExt = false;
Entry.IsInReg = false;
Entry.IsSRet = true;
Entry.IsNest = false;
Entry.IsByVal = false;
Entry.IsReturned = false;
Entry.IsSwiftSelf = false;
Entry.IsSwiftError = false;
Entry.Alignment = Align;
CLI.getArgs().insert(CLI.getArgs().begin(), Entry);
CLI.NumFixedArgs += 1;
CLI.RetTy = Type::getVoidTy(CLI.RetTy->getContext());
// sret demotion isn't compatible with tail-calls, since the sret argument
// points into the callers stack frame.
CLI.IsTailCall = false;
} else {
bool NeedsRegBlock = functionArgumentNeedsConsecutiveRegisters(
CLI.RetTy, CLI.CallConv, CLI.IsVarArg);
for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
ISD::ArgFlagsTy Flags;
if (NeedsRegBlock) {
Flags.setInConsecutiveRegs();
if (I == RetTys.size() - 1)
Flags.setInConsecutiveRegsLast();
}
EVT VT = RetTys[I];
MVT RegisterVT = getRegisterTypeForCallingConv(CLI.RetTy->getContext(),
CLI.CallConv, VT);
unsigned NumRegs = getNumRegistersForCallingConv(CLI.RetTy->getContext(),
CLI.CallConv, VT);
for (unsigned i = 0; i != NumRegs; ++i) {
ISD::InputArg MyFlags;
MyFlags.Flags = Flags;
MyFlags.VT = RegisterVT;
MyFlags.ArgVT = VT;
MyFlags.Used = CLI.IsReturnValueUsed;
if (CLI.RetTy->isPointerTy()) {
MyFlags.Flags.setPointer();
MyFlags.Flags.setPointerAddrSpace(
cast<PointerType>(CLI.RetTy)->getAddressSpace());
}
if (CLI.RetSExt)
MyFlags.Flags.setSExt();
if (CLI.RetZExt)
MyFlags.Flags.setZExt();
if (CLI.IsInReg)
MyFlags.Flags.setInReg();
CLI.Ins.push_back(MyFlags);
}
}
}
// We push in swifterror return as the last element of CLI.Ins.
ArgListTy &Args = CLI.getArgs();
if (supportSwiftError()) {
for (unsigned i = 0, e = Args.size(); i != e; ++i) {
if (Args[i].IsSwiftError) {
ISD::InputArg MyFlags;
MyFlags.VT = getPointerTy(DL);
MyFlags.ArgVT = EVT(getPointerTy(DL));
MyFlags.Flags.setSwiftError();
CLI.Ins.push_back(MyFlags);
}
}
}
// Handle all of the outgoing arguments.
CLI.Outs.clear();
CLI.OutVals.clear();
for (unsigned i = 0, e = Args.size(); i != e; ++i) {
SmallVector<EVT, 4> ValueVTs;
ComputeValueVTs(*this, DL, Args[i].Ty, ValueVTs);
// FIXME: Split arguments if CLI.IsPostTypeLegalization
Type *FinalType = Args[i].Ty;
if (Args[i].IsByVal)
FinalType = cast<PointerType>(Args[i].Ty)->getElementType();
bool NeedsRegBlock = functionArgumentNeedsConsecutiveRegisters(
FinalType, CLI.CallConv, CLI.IsVarArg);
for (unsigned Value = 0, NumValues = ValueVTs.size(); Value != NumValues;
++Value) {
EVT VT = ValueVTs[Value];
Type *ArgTy = VT.getTypeForEVT(CLI.RetTy->getContext());
SDValue Op = SDValue(Args[i].Node.getNode(),
Args[i].Node.getResNo() + Value);
ISD::ArgFlagsTy Flags;
// Certain targets (such as MIPS), may have a different ABI alignment
// for a type depending on the context. Give the target a chance to
// specify the alignment it wants.
unsigned OriginalAlignment = getABIAlignmentForCallingConv(ArgTy, DL);
if (Args[i].Ty->isPointerTy()) {
Flags.setPointer();
Flags.setPointerAddrSpace(
cast<PointerType>(Args[i].Ty)->getAddressSpace());
}
if (Args[i].IsZExt)
Flags.setZExt();
if (Args[i].IsSExt)
Flags.setSExt();
if (Args[i].IsInReg) {
// If we are using vectorcall calling convention, a structure that is
// passed InReg - is surely an HVA
if (CLI.CallConv == CallingConv::X86_VectorCall &&
isa<StructType>(FinalType)) {
// The first value of a structure is marked
if (0 == Value)
Flags.setHvaStart();
Flags.setHva();
}
// Set InReg Flag
Flags.setInReg();
}
if (Args[i].IsSRet)
Flags.setSRet();
if (Args[i].IsSwiftSelf)
Flags.setSwiftSelf();
if (Args[i].IsSwiftError)
Flags.setSwiftError();
if (Args[i].IsByVal)
Flags.setByVal();
if (Args[i].IsInAlloca) {
Flags.setInAlloca();
// Set the byval flag for CCAssignFn callbacks that don't know about
// inalloca. This way we can know how many bytes we should've allocated
// and how many bytes a callee cleanup function will pop. If we port
// inalloca to more targets, we'll have to add custom inalloca handling
// in the various CC lowering callbacks.
Flags.setByVal();
}
if (Args[i].IsByVal || Args[i].IsInAlloca) {
PointerType *Ty = cast<PointerType>(Args[i].Ty);
Type *ElementTy = Ty->getElementType();
unsigned FrameSize = DL.getTypeAllocSize(
Args[i].ByValType ? Args[i].ByValType : ElementTy);
Flags.setByValSize(FrameSize);
// For ByVal, size and alignment should be passed from FE. BE will
// guess if this info is not there but there are cases it cannot get
// right.
unsigned FrameAlign;
if (Args[i].Alignment)
FrameAlign = Args[i].Alignment;
else
FrameAlign = getByValTypeAlignment(ElementTy, DL);
Flags.setByValAlign(FrameAlign);
}
if (Args[i].IsNest)
Flags.setNest();
if (NeedsRegBlock)
Flags.setInConsecutiveRegs();
Flags.setOrigAlign(OriginalAlignment);
MVT PartVT = getRegisterTypeForCallingConv(CLI.RetTy->getContext(),
CLI.CallConv, VT);
unsigned NumParts = getNumRegistersForCallingConv(CLI.RetTy->getContext(),
CLI.CallConv, VT);
SmallVector<SDValue, 4> Parts(NumParts);
ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
if (Args[i].IsSExt)
ExtendKind = ISD::SIGN_EXTEND;
else if (Args[i].IsZExt)
ExtendKind = ISD::ZERO_EXTEND;
// Conservatively only handle 'returned' on non-vectors that can be lowered,
// for now.
if (Args[i].IsReturned && !Op.getValueType().isVector() &&
CanLowerReturn) {
assert((CLI.RetTy == Args[i].Ty ||
(CLI.RetTy->isPointerTy() && Args[i].Ty->isPointerTy() &&
CLI.RetTy->getPointerAddressSpace() ==
Args[i].Ty->getPointerAddressSpace())) &&
RetTys.size() == NumValues && "unexpected use of 'returned'");
// Before passing 'returned' to the target lowering code, ensure that
// either the register MVT and the actual EVT are the same size or that
// the return value and argument are extended in the same way; in these
// cases it's safe to pass the argument register value unchanged as the
// return register value (although it's at the target's option whether
// to do so)
// TODO: allow code generation to take advantage of partially preserved
// registers rather than clobbering the entire register when the
// parameter extension method is not compatible with the return
// extension method
if ((NumParts * PartVT.getSizeInBits() == VT.getSizeInBits()) ||
(ExtendKind != ISD::ANY_EXTEND && CLI.RetSExt == Args[i].IsSExt &&
CLI.RetZExt == Args[i].IsZExt))
Flags.setReturned();
}
getCopyToParts(CLI.DAG, CLI.DL, Op, &Parts[0], NumParts, PartVT,
CLI.CS.getInstruction(), CLI.CallConv, ExtendKind);
for (unsigned j = 0; j != NumParts; ++j) {
// if it isn't first piece, alignment must be 1
ISD::OutputArg MyFlags(Flags, Parts[j].getValueType(), VT,
i < CLI.NumFixedArgs,
i, j*Parts[j].getValueType().getStoreSize());
if (NumParts > 1 && j == 0)
MyFlags.Flags.setSplit();
else if (j != 0) {
MyFlags.Flags.setOrigAlign(1);
if (j == NumParts - 1)
MyFlags.Flags.setSplitEnd();
}
CLI.Outs.push_back(MyFlags);
CLI.OutVals.push_back(Parts[j]);
}
if (NeedsRegBlock && Value == NumValues - 1)
CLI.Outs[CLI.Outs.size() - 1].Flags.setInConsecutiveRegsLast();
}
}
SmallVector<SDValue, 4> InVals;
CLI.Chain = LowerCall(CLI, InVals);
// Update CLI.InVals to use outside of this function.
CLI.InVals = InVals;
// Verify that the target's LowerCall behaved as expected.
assert(CLI.Chain.getNode() && CLI.Chain.getValueType() == MVT::Other &&
"LowerCall didn't return a valid chain!");
assert((!CLI.IsTailCall || InVals.empty()) &&
"LowerCall emitted a return value for a tail call!");
assert((CLI.IsTailCall || InVals.size() == CLI.Ins.size()) &&
"LowerCall didn't emit the correct number of values!");
// For a tail call, the return value is merely live-out and there aren't
// any nodes in the DAG representing it. Return a special value to
// indicate that a tail call has been emitted and no more Instructions
// should be processed in the current block.
if (CLI.IsTailCall) {
CLI.DAG.setRoot(CLI.Chain);
return std::make_pair(SDValue(), SDValue());
}
#ifndef NDEBUG
for (unsigned i = 0, e = CLI.Ins.size(); i != e; ++i) {
assert(InVals[i].getNode() && "LowerCall emitted a null value!");
assert(EVT(CLI.Ins[i].VT) == InVals[i].getValueType() &&
"LowerCall emitted a value with the wrong type!");
}
#endif
SmallVector<SDValue, 4> ReturnValues;
if (!CanLowerReturn) {
// The instruction result is the result of loading from the
// hidden sret parameter.
SmallVector<EVT, 1> PVTs;
Type *PtrRetTy = OrigRetTy->getPointerTo(DL.getAllocaAddrSpace());
ComputeValueVTs(*this, DL, PtrRetTy, PVTs);
assert(PVTs.size() == 1 && "Pointers should fit in one register");
EVT PtrVT = PVTs[0];
unsigned NumValues = RetTys.size();
ReturnValues.resize(NumValues);
SmallVector<SDValue, 4> Chains(NumValues);
// An aggregate return value cannot wrap around the address space, so
// offsets to its parts don't wrap either.
SDNodeFlags Flags;
Flags.setNoUnsignedWrap(true);
for (unsigned i = 0; i < NumValues; ++i) {
SDValue Add = CLI.DAG.getNode(ISD::ADD, CLI.DL, PtrVT, DemoteStackSlot,
CLI.DAG.getConstant(Offsets[i], CLI.DL,
PtrVT), Flags);
SDValue L = CLI.DAG.getLoad(
RetTys[i], CLI.DL, CLI.Chain, Add,
MachinePointerInfo::getFixedStack(CLI.DAG.getMachineFunction(),
DemoteStackIdx, Offsets[i]),
/* Alignment = */ 1);
ReturnValues[i] = L;
Chains[i] = L.getValue(1);
}
CLI.Chain = CLI.DAG.getNode(ISD::TokenFactor, CLI.DL, MVT::Other, Chains);
} else {
// Collect the legal value parts into potentially illegal values
// that correspond to the original function's return values.
Optional<ISD::NodeType> AssertOp;
if (CLI.RetSExt)
AssertOp = ISD::AssertSext;
else if (CLI.RetZExt)
AssertOp = ISD::AssertZext;
unsigned CurReg = 0;
for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
EVT VT = RetTys[I];
MVT RegisterVT = getRegisterTypeForCallingConv(CLI.RetTy->getContext(),
CLI.CallConv, VT);
unsigned NumRegs = getNumRegistersForCallingConv(CLI.RetTy->getContext(),
CLI.CallConv, VT);
ReturnValues.push_back(getCopyFromParts(CLI.DAG, CLI.DL, &InVals[CurReg],
NumRegs, RegisterVT, VT, nullptr,
CLI.CallConv, AssertOp));
CurReg += NumRegs;
}
// For a function returning void, there is no return value. We can't create
// such a node, so we just return a null return value in that case. In
// that case, nothing will actually look at the value.
if (ReturnValues.empty())
return std::make_pair(SDValue(), CLI.Chain);
}
SDValue Res = CLI.DAG.getNode(ISD::MERGE_VALUES, CLI.DL,
CLI.DAG.getVTList(RetTys), ReturnValues);
return std::make_pair(Res, CLI.Chain);
}
/// Lower result 0 of \p N through LowerOperation and, when that yields a
/// non-null value, append it to \p Results. Nothing is appended otherwise.
void TargetLowering::LowerOperationWrapper(SDNode *N,
                                           SmallVectorImpl<SDValue> &Results,
                                           SelectionDAG &DAG) const {
  SDValue Lowered = LowerOperation(SDValue(N, 0), DAG);
  if (!Lowered)
    return;
  Results.push_back(Lowered);
}
/// Base implementation of the operation-lowering hook. It unconditionally
/// hits llvm_unreachable, so reaching it means no lowering was implemented
/// for the current target; \p Op and \p DAG are not inspected.
SDValue TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
llvm_unreachable("LowerOperation not implemented for this target!");
}
void
SelectionDAGBuilder::CopyValueToVirtualRegister(const Value *V, unsigned Reg) {
SDValue Op = getNonRegisterValue(V);
assert((Op.getOpcode() != ISD::CopyFromReg ||
cast<RegisterSDNode>(Op.getOperand(1))->getReg() != Reg) &&
"Copy from a reg to the same reg!");
assert(!TargetRegisterInfo::isPhysicalRegister(Reg) && "Is a physreg");
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// If this is an InlineAsm we have to match the registers required, not the
// notional registers required by the type.
RegsForValue RFV(V->getContext(), TLI, DAG.getDataLayout(), Reg, V->getType(),
None); // This is not an ABI copy.
SDValue Chain = DAG.getEntryNode();
ISD::NodeType ExtendType = (FuncInfo.PreferredExtendType.find(V) ==
FuncInfo.PreferredExtendType.end())
? ISD::ANY_EXTEND
: FuncInfo.PreferredExtendType[V];
RFV.getCopyToRegs(Op, DAG, getCurSDLoc(), Chain, nullptr, V, ExtendType);
PendingExports.push_back(Chain);
}
#include "llvm/CodeGen/SelectionDAGISel.h"
/// Return true if every use of argument \p A is an instruction located in the
/// entry block of its function. A use by a switch always yields false, since
/// the switch may expand into multiple basic blocks.
///
/// When \p FastISel is set the answer is simply whether \p A has no uses:
/// blocks may be split, so every non-dead argument must get a virtual
/// register.
static bool isOnlyUsedInEntryBlock(const Argument *A, bool FastISel) {
  if (FastISel)
    return A->use_empty();

  const BasicBlock *EntryBB = &A->getParent()->front();
  for (const User *ArgUser : A->users()) {
    if (isa<SwitchInst>(ArgUser))
      return false; // Switches may be lowered across several blocks.
    if (cast<Instruction>(ArgUser)->getParent() != EntryBB)
      return false; // Use not in entry block.
  }
  return true;
}
/// Map from a function argument to the pair (static alloca the argument is
/// stored into, store instruction performing that copy) identified as an
/// argument copy elision candidate.
using ArgCopyElisionMapTy =
DenseMap<const Argument *,
std::pair<const AllocaInst *, const StoreInst *>>;
/// Walk the entry block of the function held by \p FuncInfo looking for
/// stores that copy an incoming argument into a static alloca. Each such
/// (argument -> alloca, store) triple is added to
/// \p ArgCopyElisionCandidates.
///
/// An alloca only qualifies if the first thing that touches it is a store of
/// an argument that fully initializes it; any other use seen first marks the
/// alloca as clobbered.
static void
findArgumentCopyElisionCandidates(const DataLayout &DL,
                                  FunctionLoweringInfo *FuncInfo,
                                  ArgCopyElisionMapTy &ArgCopyElisionCandidates) {
  // Per-alloca state for every static alloca touched in the entry block.
  // Argument allocas are all used in the entry block, so roughly one entry per
  // argument is needed.
  enum StaticAllocaInfo { Unknown, Clobbered, Elidable };
  SmallDenseMap<const AllocaInst *, StaticAllocaInfo, 8> StaticAllocas;
  const unsigned NumArgs = FuncInfo->Fn->arg_size();
  StaticAllocas.reserve(NumArgs * 2);

  // Returns the state slot for V if, after stripping pointer casts, it is a
  // static alloca known to StaticAllocaMap; creates the slot (as Unknown) on
  // first sight. Returns null for anything else.
  auto GetInfoIfStaticAlloca = [&](const Value *V) -> StaticAllocaInfo * {
    if (!V)
      return nullptr;
    const auto *Alloca = dyn_cast<AllocaInst>(V->stripPointerCasts());
    if (!Alloca)
      return nullptr;
    if (!Alloca->isStaticAlloca())
      return nullptr;
    if (!FuncInfo->StaticAllocaMap.count(Alloca))
      return nullptr;
    return &StaticAllocas.insert({Alloca, Unknown}).first->second;
  };

  // Look for stores of arguments to static allocas. Bitcasts and GEPs are
  // looked through to handle type coercions, provided the store fully
  // initializes the alloca. Any non-store use of an alloca escapes it, and any
  // subsequent unanalyzed store might write it.
  // FIXME: Handle structs initialized with multiple stores.
  for (const Instruction &I : FuncInfo->Fn->getEntryBlock()) {
    const auto *SI = dyn_cast<StoreInst>(&I);
    if (!SI) {
      // Casts are looked through at their users, and debug info intrinsics
      // neither escape nor store to allocas; both are ignored.
      if (I.isCast() || isa<DbgInfoIntrinsic>(I))
        continue;
      // Unknown instruction: conservatively assume it escapes or writes every
      // static alloca it takes as an operand.
      for (const Use &U : I.operands())
        if (StaticAllocaInfo *OperandInfo = GetInfoIfStaticAlloca(U))
          *OperandInfo = StaticAllocaInfo::Clobbered;
      continue;
    }

    // Storing the address of a static alloca escapes it.
    if (StaticAllocaInfo *StoredInfo =
            GetInfoIfStaticAlloca(SI->getValueOperand()))
      *StoredInfo = StaticAllocaInfo::Clobbered;

    // Only stores into a static alloca that has not yet been initialized or
    // clobbered are interesting.
    const Value *Dst = SI->getPointerOperand()->stripPointerCasts();
    StaticAllocaInfo *DstInfo = GetInfoIfStaticAlloca(Dst);
    if (!DstInfo || *DstInfo != StaticAllocaInfo::Unknown)
      continue;
    const AllocaInst *AI = cast<AllocaInst>(Dst);

    // The stored value must be an argument whose store fully initializes the
    // alloca, and the same argument must not be elided twice.
    const auto *Arg =
        dyn_cast<Argument>(SI->getValueOperand()->stripPointerCasts());
    const bool IsCandidate =
        Arg && !Arg->hasInAllocaAttr() && !Arg->hasByValAttr() &&
        !Arg->getType()->isEmptyTy() &&
        DL.getTypeStoreSize(Arg->getType()) ==
            DL.getTypeAllocSize(AI->getAllocatedType()) &&
        !ArgCopyElisionCandidates.count(Arg);
    if (!IsCandidate) {
      *DstInfo = StaticAllocaInfo::Clobbered;
      continue;
    }

    LLVM_DEBUG(dbgs() << "Found argument copy elision candidate: " << *AI
                      << '\n');

    // Mark this alloca and store for argument copy elision.
    *DstInfo = StaticAllocaInfo::Elidable;
    ArgCopyElisionCandidates.insert({Arg, {AI, SI}});

    // Stop scanning once every argument has a candidate. This happens early in
    // -O0 builds, which is useful because those have large entry blocks and
    // many allocas.
    if (ArgCopyElisionCandidates.size() == NumArgs)
      break;
  }
}
/// Attempt to elide the copy of \p Arg from its incoming stack slot into a
/// local alloca. This succeeds only when \p ArgVal is a load whose base
/// pointer is a frame index, and that fixed stack object has the same size
/// as, and at least the alignment required by, the alloca.
///
/// On success the alloca is redirected to the fixed object, the old-to-new
/// frame index pair is recorded in \p ArgCopyElisionFrameIndexMap, the load's
/// chain is appended to \p Chains, the copying store is added to
/// \p ElidedArgCopyInstrs, and \p ArgHasUses is set to true if the argument
/// has any user other than that store.
static void tryToElideArgumentCopy(
    FunctionLoweringInfo *FuncInfo, SmallVectorImpl<SDValue> &Chains,
    DenseMap<int, int> &ArgCopyElisionFrameIndexMap,
    SmallPtrSetImpl<const Instruction *> &ElidedArgCopyInstrs,
    ArgCopyElisionMapTy &ArgCopyElisionCandidates, const Argument &Arg,
    SDValue ArgVal, bool &ArgHasUses) {
  // The argument value must be a load from a fixed stack object.
  auto *Load = dyn_cast<LoadSDNode>(ArgVal);
  auto *FrameIdxNode =
      Load ? dyn_cast<FrameIndexSDNode>(Load->getBasePtr().getNode()) : nullptr;
  if (!FrameIdxNode)
    return;

  // Fetch the alloca/store pair recorded for this argument.
  auto CandidateIt = ArgCopyElisionCandidates.find(&Arg);
  assert(CandidateIt != ArgCopyElisionCandidates.end());
  const AllocaInst *AI = CandidateIt->second.first;
  const StoreInst *SI = CandidateIt->second.second;

  const int FixedIndex = FrameIdxNode->getIndex();
  int &AllocaIndex = FuncInfo->StaticAllocaMap[AI];
  const int OldIndex = AllocaIndex;
  MachineFrameInfo &MFI = FuncInfo->MF->getFrameInfo();

  // The fixed stack object must be exactly as large as the alloca's object.
  if (MFI.getObjectSize(FixedIndex) != MFI.getObjectSize(OldIndex)) {
    LLVM_DEBUG(
        dbgs() << " argument copy elision failed due to bad fixed stack "
                  "object size\n");
    return;
  }

  // For the alignment check, use what the user wrote on the alloca rather than
  // the alignment of its stack object; fall back to the ABI type alignment
  // when the alloca carries none.
  unsigned RequiredAlignment = AI->getAlignment();
  if (RequiredAlignment == 0)
    RequiredAlignment = FuncInfo->MF->getDataLayout().getABITypeAlignment(
        AI->getAllocatedType());
  if (MFI.getObjectAlignment(FixedIndex) < RequiredAlignment) {
    LLVM_DEBUG(dbgs() << " argument copy elision failed: alignment of alloca "
                         "greater than stack argument alignment ("
                      << RequiredAlignment << " vs "
                      << MFI.getObjectAlignment(FixedIndex) << ")\n");
    return;
  }

  // Perform the elision. Delete the old stack object and replace its only use
  // in the variable info map. Mark the stack object as mutable.
  LLVM_DEBUG({
    dbgs() << "Eliding argument copy from " << Arg << " to " << *AI << '\n'
           << " Replacing frame index " << OldIndex << " with " << FixedIndex
           << '\n';
  });
  MFI.RemoveStackObject(OldIndex);
  MFI.setIsImmutableObjectIndex(FixedIndex, false);
  AllocaIndex = FixedIndex;
  ArgCopyElisionFrameIndexMap.insert({OldIndex, FixedIndex});
  Chains.push_back(ArgVal.getValue(1));

  // Avoid emitting code for the store implementing the copy.
  ElidedArgCopyInstrs.insert(SI);

  // Check for uses of the argument again so that exporting ArgVal can be
  // avoided if it isn't used by anything other than the store.
  for (const Value *ArgUser : Arg.users()) {
    if (ArgUser == SI)
      continue;
    ArgHasUses = true;
    break;
  }
}
/// Lower the incoming (formal) arguments of \p F into the SelectionDAG.
///
/// The work happens in three phases:
///  1. Build one ISD::InputArg descriptor per register-sized part of every
///     argument (preceded by a hidden sret pointer when
///     FuncInfo->CanLowerReturn is false), translating IR parameter
///     attributes into ISD::ArgFlagsTy flags.
///  2. Pass the descriptors to TLI->LowerFormalArguments and install the
///     returned chain as the DAG root.
///  3. Walk the arguments again, reassembling each used argument from its
///     parts with getCopyFromParts, registering it with the builder, and
///     trying argument copy elision for arguments flagged as candidates.
///
/// Finally stale dbg.declare frame indices are remapped after any elision and
/// EmitFunctionEntryCode() is called.
void SelectionDAGISel::LowerArguments(const Function &F) {
SelectionDAG &DAG = SDB->DAG;
SDLoc dl = SDB->getCurSDLoc();
const DataLayout &DL = DAG.getDataLayout();
SmallVector<ISD::InputArg, 16> Ins;
if (!FuncInfo->CanLowerReturn) {
// Put in an sret pointer parameter before all the other parameters.
SmallVector<EVT, 1> ValueVTs;
ComputeValueVTs(*TLI, DAG.getDataLayout(),
F.getReturnType()->getPointerTo(
DAG.getDataLayout().getAllocaAddrSpace()),
ValueVTs);
// NOTE: Assuming that a pointer will never break down to more than one VT
// or one register.
ISD::ArgFlagsTy Flags;
Flags.setSRet();
MVT RegisterVT = TLI->getRegisterType(*DAG.getContext(), ValueVTs[0]);
ISD::InputArg RetArg(Flags, RegisterVT, ValueVTs[0], true,
ISD::InputArg::NoArgIndex, 0);
Ins.push_back(RetArg);
}
// Look for stores of arguments to static allocas. Mark such arguments with a
// flag to ask the target to give us the memory location of that argument if
// available.
ArgCopyElisionMapTy ArgCopyElisionCandidates;
findArgumentCopyElisionCandidates(DL, FuncInfo, ArgCopyElisionCandidates);
// Set up the incoming argument description vector.
for (const Argument &Arg : F.args()) {
unsigned ArgNo = Arg.getArgNo();
SmallVector<EVT, 4> ValueVTs;
ComputeValueVTs(*TLI, DAG.getDataLayout(), Arg.getType(), ValueVTs);
bool isArgValueUsed = !Arg.use_empty();
// Running offset passed as the last InputArg constructor argument; it is
// advanced by each value's store size at the end of the loop body.
unsigned PartBase = 0;
// For byval arguments the register-block query below is made on the
// byval type rather than on the argument's own type.
Type *FinalType = Arg.getType();
if (Arg.hasAttribute(Attribute::ByVal))
FinalType = Arg.getParamByValType();
bool NeedsRegBlock = TLI->functionArgumentNeedsConsecutiveRegisters(
FinalType, F.getCallingConv(), F.isVarArg());
for (unsigned Value = 0, NumValues = ValueVTs.size();
Value != NumValues; ++Value) {
EVT VT = ValueVTs[Value];
Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
ISD::ArgFlagsTy Flags;
// Certain targets (such as MIPS), may have a different ABI alignment
// for a type depending on the context. Give the target a chance to
// specify the alignment it wants.
unsigned OriginalAlignment =
TLI->getABIAlignmentForCallingConv(ArgTy, DL);
if (Arg.getType()->isPointerTy()) {
Flags.setPointer();
Flags.setPointerAddrSpace(
cast<PointerType>(Arg.getType())->getAddressSpace());
}
if (Arg.hasAttribute(Attribute::ZExt))
Flags.setZExt();
if (Arg.hasAttribute(Attribute::SExt))
Flags.setSExt();
if (Arg.hasAttribute(Attribute::InReg)) {
// If we are using vectorcall calling convention, a structure that is
// passed InReg - is surely an HVA
if (F.getCallingConv() == CallingConv::X86_VectorCall &&
isa<StructType>(Arg.getType())) {
// The first value of a structure is marked
if (0 == Value)
Flags.setHvaStart();
Flags.setHva();
}
// Set InReg Flag
Flags.setInReg();
}
if (Arg.hasAttribute(Attribute::StructRet))
Flags.setSRet();
if (Arg.hasAttribute(Attribute::SwiftSelf))
Flags.setSwiftSelf();
if (Arg.hasAttribute(Attribute::SwiftError))
Flags.setSwiftError();
if (Arg.hasAttribute(Attribute::ByVal))
Flags.setByVal();
if (Arg.hasAttribute(Attribute::InAlloca)) {
Flags.setInAlloca();
// Set the byval flag for CCAssignFn callbacks that don't know about
// inalloca. This way we can know how many bytes we should've allocated
// and how many bytes a callee cleanup function will pop. If we port
// inalloca to more targets, we'll have to add custom inalloca handling
// in the various CC lowering callbacks.
Flags.setByVal();
}
if (F.getCallingConv() == CallingConv::X86_INTR) {
// IA Interrupt passes frame (1st parameter) by value in the stack.
if (ArgNo == 0)
Flags.setByVal();
}
if (Flags.isByVal() || Flags.isInAlloca()) {
Type *ElementTy = Arg.getParamByValType();
// For ByVal, size and alignment should be passed from FE. BE will
// guess if this info is not there but there are cases it cannot get
// right.
unsigned FrameSize = DL.getTypeAllocSize(Arg.getParamByValType());
Flags.setByValSize(FrameSize);
// Prefer the alignment given on the parameter; otherwise ask the
// target for the byval alignment of the element type.
unsigned FrameAlign;
if (Arg.getParamAlignment())
FrameAlign = Arg.getParamAlignment();
else
FrameAlign = TLI->getByValTypeAlignment(ElementTy, DL);
Flags.setByValAlign(FrameAlign);
}
if (Arg.hasAttribute(Attribute::Nest))
Flags.setNest();
if (NeedsRegBlock)
Flags.setInConsecutiveRegs();
Flags.setOrigAlign(OriginalAlignment);
if (ArgCopyElisionCandidates.count(&Arg))
Flags.setCopyElisionCandidate();
MVT RegisterVT = TLI->getRegisterTypeForCallingConv(
*CurDAG->getContext(), F.getCallingConv(), VT);
unsigned NumRegs = TLI->getNumRegistersForCallingConv(
*CurDAG->getContext(), F.getCallingConv(), VT);
// Emit one InputArg per register part. When a value is split across
// several parts, the first part is flagged Split and the last SplitEnd.
for (unsigned i = 0; i != NumRegs; ++i) {
ISD::InputArg MyFlags(Flags, RegisterVT, VT, isArgValueUsed,
ArgNo, PartBase+i*RegisterVT.getStoreSize());
if (NumRegs > 1 && i == 0)
MyFlags.Flags.setSplit();
// if it isn't first piece, alignment must be 1
else if (i > 0) {
MyFlags.Flags.setOrigAlign(1);
if (i == NumRegs - 1)
MyFlags.Flags.setSplitEnd();
}
Ins.push_back(MyFlags);
}
// Mark the final part of the final value as the end of the consecutive
// register block.
if (NeedsRegBlock && Value == NumValues - 1)
Ins[Ins.size() - 1].Flags.setInConsecutiveRegsLast();
PartBase += VT.getStoreSize();
}
}
// Call the target to set up the argument values.
SmallVector<SDValue, 8> InVals;
SDValue NewRoot = TLI->LowerFormalArguments(
DAG.getRoot(), F.getCallingConv(), F.isVarArg(), Ins, dl, DAG, InVals);
// Verify that the target's LowerFormalArguments behaved as expected.
assert(NewRoot.getNode() && NewRoot.getValueType() == MVT::Other &&
"LowerFormalArguments didn't return a valid chain!");
assert(InVals.size() == Ins.size() &&
"LowerFormalArguments didn't emit the correct number of values!");
LLVM_DEBUG({
for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
assert(InVals[i].getNode() &&
"LowerFormalArguments emitted a null value!");
assert(EVT(Ins[i].VT) == InVals[i].getValueType() &&
"LowerFormalArguments emitted a value with the wrong type!");
}
});
// Update the DAG with the new chain value resulting from argument lowering.
DAG.setRoot(NewRoot);
// Set up the argument values.
// i is the running index into Ins/InVals; it advances by the number of
// register parts consumed for each value.
unsigned i = 0;
if (!FuncInfo->CanLowerReturn) {
// Create a virtual register for the sret pointer, and put in a copy
// from the sret argument into it.
SmallVector<EVT, 1> ValueVTs;
ComputeValueVTs(*TLI, DAG.getDataLayout(),
F.getReturnType()->getPointerTo(
DAG.getDataLayout().getAllocaAddrSpace()),
ValueVTs);
MVT VT = ValueVTs[0].getSimpleVT();
MVT RegVT = TLI->getRegisterType(*CurDAG->getContext(), VT);
Optional<ISD::NodeType> AssertOp = None;
SDValue ArgValue = getCopyFromParts(DAG, dl, &InVals[0], 1, RegVT, VT,
nullptr, F.getCallingConv(), AssertOp);
MachineFunction& MF = SDB->DAG.getMachineFunction();
MachineRegisterInfo& RegInfo = MF.getRegInfo();
unsigned SRetReg = RegInfo.createVirtualRegister(TLI->getRegClassFor(RegVT));
FuncInfo->DemoteRegister = SRetReg;
NewRoot =
SDB->DAG.getCopyToReg(NewRoot, SDB->getCurSDLoc(), SRetReg, ArgValue);
DAG.setRoot(NewRoot);
// i indexes lowered arguments. Bump it past the hidden sret argument.
++i;
}
// Chains collects the chain results of loads whose copies were elided; they
// are merged into the root with a TokenFactor after the loop.
SmallVector<SDValue, 4> Chains;
DenseMap<int, int> ArgCopyElisionFrameIndexMap;
for (const Argument &Arg : F.args()) {
SmallVector<SDValue, 4> ArgValues;
SmallVector<EVT, 4> ValueVTs;
ComputeValueVTs(*TLI, DAG.getDataLayout(), Arg.getType(), ValueVTs);
unsigned NumValues = ValueVTs.size();
if (NumValues == 0)
continue;
bool ArgHasUses = !Arg.use_empty();
// Elide the copying store if the target loaded this argument from a
// suitable fixed stack object.
if (Ins[i].Flags.isCopyElisionCandidate()) {
tryToElideArgumentCopy(FuncInfo, Chains, ArgCopyElisionFrameIndexMap,
ElidedArgCopyInstrs, ArgCopyElisionCandidates, Arg,
InVals[i], ArgHasUses);
}
// If this argument is unused then remember its value. It is used to generate
// debugging information.
bool isSwiftErrorArg =
TLI->supportSwiftError() &&
Arg.hasAttribute(Attribute::SwiftError);
if (!ArgHasUses && !isSwiftErrorArg) {
SDB->setUnusedArgValue(&Arg, InVals[i]);
// Also remember any frame index for use in FastISel.
if (FrameIndexSDNode *FI =
dyn_cast<FrameIndexSDNode>(InVals[i].getNode()))
FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex());
}
for (unsigned Val = 0; Val != NumValues; ++Val) {
EVT VT = ValueVTs[Val];
MVT PartVT = TLI->getRegisterTypeForCallingConv(*CurDAG->getContext(),
F.getCallingConv(), VT);
unsigned NumParts = TLI->getNumRegistersForCallingConv(
*CurDAG->getContext(), F.getCallingConv(), VT);
// Even an apparent 'unused' swifterror argument needs to be returned. So
// we do generate a copy for it that can be used on return from the
// function.
if (ArgHasUses || isSwiftErrorArg) {
Optional<ISD::NodeType> AssertOp;
if (Arg.hasAttribute(Attribute::SExt))
AssertOp = ISD::AssertSext;
else if (Arg.hasAttribute(Attribute::ZExt))
AssertOp = ISD::AssertZext;
ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i], NumParts,
PartVT, VT, nullptr,
F.getCallingConv(), AssertOp));
}
// Advance past this value's parts whether or not a copy was generated.
i += NumParts;
}
// We don't need to do anything else for unused arguments.
if (ArgValues.empty())
continue;
// Note down frame index.
if (FrameIndexSDNode *FI =
dyn_cast<FrameIndexSDNode>(ArgValues[0].getNode()))
FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex());
SDValue Res = DAG.getMergeValues(makeArrayRef(ArgValues.data(), NumValues),
SDB->getCurSDLoc());
SDB->setValue(&Arg, Res);
if (!TM.Options.EnableFastISel && Res.getOpcode() == ISD::BUILD_PAIR) {
// We want to associate the argument with the frame index, among
// involved operands, that correspond to the lowest address. The
// getCopyFromParts function, called earlier, is swapping the order of
// the operands to BUILD_PAIR depending on endianness. The result of
// that swapping is that the least significant bits of the argument will
// be in the first operand of the BUILD_PAIR node, and the most
// significant bits will be in the second operand.
unsigned LowAddressOp = DAG.getDataLayout().isBigEndian() ? 1 : 0;
if (LoadSDNode *LNode =
dyn_cast<LoadSDNode>(Res.getOperand(LowAddressOp).getNode()))
if (FrameIndexSDNode *FI =
dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode()))
FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex());
}
// Update the SwiftErrorVRegDefMap.
if (Res.getOpcode() == ISD::CopyFromReg && isSwiftErrorArg) {
unsigned Reg = cast<RegisterSDNode>(Res.getOperand(1))->getReg();
if (TargetRegisterInfo::isVirtualRegister(Reg))
SwiftError->setCurrentVReg(FuncInfo->MBB, SwiftError->getFunctionArg(),
Reg);
}
// If this argument is live outside of the entry block, insert a copy from
// wherever we got it to the vreg that other BB's will reference it as.
if (Res.getOpcode() == ISD::CopyFromReg) {
// If we can, though, try to skip creating an unnecessary vreg.
// FIXME: This isn't very clean... it would be nice to make this more
// general.
unsigned Reg = cast<RegisterSDNode>(Res.getOperand(1))->getReg();
if (TargetRegisterInfo::isVirtualRegister(Reg)) {
FuncInfo->ValueMap[&Arg] = Reg;
continue;
}
}
if (!isOnlyUsedInEntryBlock(&Arg, TM.Options.EnableFastISel)) {
FuncInfo->InitializeRegForValue(&Arg);
SDB->CopyToExportRegsIfNeeded(&Arg);
}
}
// Fold the chains of any elided-copy loads into the root.
if (!Chains.empty()) {
Chains.push_back(NewRoot);
NewRoot = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
}
DAG.setRoot(NewRoot);
assert(i == InVals.size() && "Argument register count mismatch!");
// If any argument copy elisions occurred and we have debug info, update the
// stale frame indices used in the dbg.declare variable info table.
MachineFunction::VariableDbgInfoMapTy &DbgDeclareInfo = MF->getVariableDbgInfo();
if (!DbgDeclareInfo.empty() && !ArgCopyElisionFrameIndexMap.empty()) {
for (MachineFunction::VariableDbgInfo &VI : DbgDeclareInfo) {
auto I = ArgCopyElisionFrameIndexMap.find(VI.Slot);
if (I != ArgCopyElisionFrameIndexMap.end())
VI.Slot = I->second;
}
}
// Finally, if the target has anything special to do, allow it to do so.
EmitFunctionEntryCode();
}
/// Handle PHI nodes in successor blocks. Emit code into the SelectionDAG to
/// ensure constants are generated when needed. Remember the virtual registers
/// that need to be added to the Machine PHI nodes as input. We cannot just
/// directly add them, because expansion might result in multiple MBB's for one
/// BB. As such, the start of the BB might correspond to a different MBB than
/// the end.
void
SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
const Instruction *TI = LLVMBB->getTerminator();
SmallPtrSet<MachineBasicBlock *, 4> SuccsHandled;
// Check PHI nodes in successors that expect a value to be available from this
// block.
for (unsigned succ = 0, e = TI->getNumSuccessors(); succ != e; ++succ) {
const BasicBlock *SuccBB = TI->getSuccessor(succ);
if (!isa<PHINode>(SuccBB->begin())) continue;
MachineBasicBlock *SuccMBB = FuncInfo.MBBMap[SuccBB];
// If this terminator has multiple identical successors (common for
// switches), only handle each succ once.
if (!SuccsHandled.insert(SuccMBB).second)
continue;
MachineBasicBlock::iterator MBBI = SuccMBB->begin();
// At this point we know that there is a 1-1 correspondence between LLVM PHI
// nodes and Machine PHI nodes, but the incoming operands have not been
// emitted yet.
for (const PHINode &PN : SuccBB->phis()) {
// Ignore dead phi's.
if (PN.use_empty())
continue;
// Skip empty types
if (PN.getType()->isEmptyTy())
continue;
unsigned Reg;
const Value *PHIOp = PN.getIncomingValueForBlock(LLVMBB);
if (const Constant *C = dyn_cast<Constant>(PHIOp)) {
unsigned &RegOut = ConstantsOut[C];
if (RegOut == 0) {
RegOut = FuncInfo.CreateRegs(C);
CopyValueToVirtualRegister(C, RegOut);
}
Reg = RegOut;
} else {
DenseMap<const Value *, unsigned>::iterator I =
FuncInfo.ValueMap.find(PHIOp);
if (I != FuncInfo.ValueMap.end())
Reg = I->second;
else {
assert(isa<AllocaInst>(PHIOp) &&
FuncInfo.StaticAllocaMap.count(cast<AllocaInst>(PHIOp)) &&
"Didn't codegen value into a register!??");
Reg = FuncInfo.CreateRegs(PHIOp);
CopyValueToVirtualRegister(PHIOp, Reg);
}
}
// Remember that this register needs to added to the machine PHI node as
// the input for this MBB.
SmallVector<EVT, 4> ValueVTs;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
ComputeValueVTs(TLI, DAG.getDataLayout(), PN.getType(), ValueVTs);
for (unsigned vti = 0, vte = ValueVTs.size(); vti != vte; ++vti) {
EVT VT = ValueVTs[vti];
unsigned NumRegisters = TLI.getNumRegisters(*DAG.getContext(), VT);
for (unsigned i = 0, e = NumRegisters; i != e; ++i)
FuncInfo.PHINodesToUpdate.push_back(
std::make_pair(&*MBBI++, Reg + i));
Reg += NumRegisters;
}
}
}
ConstantsOut.clear();
}
/// Add a successor MBB to \p ParentMBB, creating a new MachineBasicBlock for
/// \p BB (inserted right after \p ParentMBB) if \p SuccMBB is null. The edge
/// gets the stack-protector branch probability selected by \p IsLikely.
/// Returns the successor block.
MachineBasicBlock *
SelectionDAGBuilder::StackProtectorDescriptor::
AddSuccessorMBB(const BasicBlock *BB,
                MachineBasicBlock *ParentMBB,
                bool IsLikely,
                MachineBasicBlock *SuccMBB) {
  if (SuccMBB == nullptr) {
    // No successor block yet: create one and place it directly after the
    // parent in the function's block list.
    MachineFunction *MF = ParentMBB->getParent();
    MachineFunction::iterator InsertPos(ParentMBB);
    ++InsertPos;
    SuccMBB = MF->CreateMachineBasicBlock(BB);
    MF->insert(InsertPos, SuccMBB);
  }

  const BranchProbability EdgeProb =
      BranchProbabilityInfo::getBranchProbStackProtector(IsLikely);
  ParentMBB->addSuccessor(SuccMBB, EdgeProb);
  return SuccMBB;
}
/// Return the block that follows \p MBB in the machine function's block list,
/// or null if \p MBB is the last block.
MachineBasicBlock *SelectionDAGBuilder::NextBlock(MachineBasicBlock *MBB) {
  MachineFunction::iterator Following(MBB);
  ++Following;
  return Following != FuncInfo.MF->end() ? &*Following : nullptr;
}
/// During lowering new call nodes can be created (such as memset, etc.).
/// Those become new roots of the current DAG, but tail calls complicate this:
/// the call lowering updates the root itself, yet the builder must still
/// learn that a tail call was lowered so it does not generate an additional
/// return.
///
/// A null \p MaybeTC signals that a tail call was emitted; otherwise
/// \p MaybeTC becomes the new DAG root.
void SelectionDAGBuilder::updateDAGForMaybeTailCall(SDValue MaybeTC) {
  if (!MaybeTC.getNode()) {
    HasTailCall = true;
    return;
  }
  DAG.setRoot(MaybeTC);
}
/// Lower the case clusters [W.FirstCluster, W.LastCluster] as a chain of
/// checks that starts in W.MBB and finally falls through to \p DefaultMBB.
///
/// Every cluster except the last gets a freshly created fallthrough block that
/// hosts the check for the next cluster; the last cluster falls through to
/// \p DefaultMBB. Depending on the cluster kind the check is a jump table
/// header, a bit test header or a plain (range) comparison.
///
/// \param W          Work item providing the starting block, the cluster range
///                   and the probability of reaching the default destination.
/// \param Cond       The switch condition value.
/// \param SwitchMBB  The block containing the switch. A check that lands in
///                   this block is emitted right away; checks for any other
///                   block are only recorded (HeaderBB/Parent fields,
///                   SL->SwitchCases) and left unemitted here.
/// \param DefaultMBB Destination used when no cluster matches.
void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond,
MachineBasicBlock *SwitchMBB,
MachineBasicBlock *DefaultMBB) {
MachineFunction *CurMF = FuncInfo.MF;
// NextMBB is the block currently laid out right after W.MBB (if any). BBI is
// left pointing at that position and is used below as the insertion point for
// all newly created blocks.
MachineBasicBlock *NextMBB = nullptr;
MachineFunction::iterator BBI(W.MBB);
if (++BBI != FuncInfo.MF->end())
NextMBB = &*BBI;
unsigned Size = W.LastCluster - W.FirstCluster + 1;
BranchProbabilityInfo *BPI = FuncInfo.BPI;
if (Size == 2 && W.MBB == SwitchMBB) {
// If the two cases have the same destination, and if one value
// is the same as the other, but has one bit unset that the other has set,
// use bit manipulation to do two compares at once. For example:
// "if (X == 6 || X == 4)" -> "if ((X|2) == 6)"
// TODO: This could be extended to merge any 2 cases in switches with 3
// cases.
// TODO: Handle cases where W.MBB != SwitchMBB.
CaseCluster &Small = *W.FirstCluster;
CaseCluster &Big = *W.LastCluster;
if (Small.Low == Small.High && Big.Low == Big.High &&
Small.MBB == Big.MBB) {
const APInt &SmallValue = Small.Low->getValue();
const APInt &BigValue = Big.Low->getValue();
// Check that there is only one bit different. Despite its name, CommonBit
// holds the bits in which the two values differ (their XOR).
APInt CommonBit = BigValue ^ SmallValue;
if (CommonBit.isPowerOf2()) {
SDValue CondLHS = getValue(Cond);
EVT VT = CondLHS.getValueType();
SDLoc DL = getCurSDLoc();
SDValue Or = DAG.getNode(ISD::OR, DL, VT, CondLHS,
DAG.getConstant(CommonBit, DL, VT));
// Note: this local SDValue shadows the Value *Cond parameter for the rest
// of this scope.
SDValue Cond = DAG.getSetCC(
DL, MVT::i1, Or, DAG.getConstant(BigValue | SmallValue, DL, VT),
ISD::SETEQ);
// Update successor info.
// Both Small and Big will jump to Small.MBB, so we sum up the
// probabilities.
addSuccessorWithProb(SwitchMBB, Small.MBB, Small.Prob + Big.Prob);
if (BPI)
addSuccessorWithProb(
SwitchMBB, DefaultMBB,
// The default destination is the first successor in IR.
BPI->getEdgeProbability(SwitchMBB->getBasicBlock(), (unsigned)0));
else
addSuccessorWithProb(SwitchMBB, DefaultMBB);
// Insert the true branch.
SDValue BrCond =
DAG.getNode(ISD::BRCOND, DL, MVT::Other, getControlRoot(), Cond,
DAG.getBasicBlock(Small.MBB));
// Insert the false branch.
BrCond = DAG.getNode(ISD::BR, DL, MVT::Other, BrCond,
DAG.getBasicBlock(DefaultMBB));
DAG.setRoot(BrCond);
return;
}
}
}
if (TM.getOptLevel() != CodeGenOpt::None) {
// Here, we order cases by probability so the most likely case will be
// checked first. However, two clusters can have the same probability in
// which case their relative ordering is non-deterministic. So we use Low
// as a tie-breaker as clusters are guaranteed to never overlap.
llvm::sort(W.FirstCluster, W.LastCluster + 1,
[](const CaseCluster &a, const CaseCluster &b) {
return a.Prob != b.Prob ?
a.Prob > b.Prob :
a.Low->getValue().slt(b.Low->getValue());
});
// Rearrange the case blocks so that the last one falls through if possible
// without changing the order of probabilities.
for (CaseClusterIt I = W.LastCluster; I > W.FirstCluster; ) {
--I;
// Only clusters as unlikely as the current last one may trade places with
// it; stop at the first more probable cluster.
if (I->Prob > W.LastCluster->Prob)
break;
if (I->Kind == CC_Range && I->MBB == NextMBB) {
std::swap(*I, *W.LastCluster);
break;
}
}
}
// Compute total probability: UnhandledProbs starts as the probability of the
// default plus every cluster and is reduced as each cluster is handled below.
BranchProbability DefaultProb = W.DefaultProb;
BranchProbability UnhandledProbs = DefaultProb;
for (CaseClusterIt I = W.FirstCluster; I <= W.LastCluster; ++I)
UnhandledProbs += I->Prob;
MachineBasicBlock *CurMBB = W.MBB;
for (CaseClusterIt I = W.FirstCluster, E = W.LastCluster; I <= E; ++I) {
bool FallthroughUnreachable = false;
MachineBasicBlock *Fallthrough;
if (I == W.LastCluster) {
// For the last cluster, fall through to the default destination.
Fallthrough = DefaultMBB;
FallthroughUnreachable = isa<UnreachableInst>(
DefaultMBB->getBasicBlock()->getFirstNonPHIOrDbg());
} else {
// Any other cluster falls through to a new block that will hold the check
// for the next cluster.
Fallthrough = CurMF->CreateMachineBasicBlock(CurMBB->getBasicBlock());
CurMF->insert(BBI, Fallthrough);
// Put Cond in a virtual register to make it available from the new blocks.
ExportFromCurrentBlock(Cond);
}
// From here on UnhandledProbs is the probability of everything that is
// checked after the current cluster (including the default).
UnhandledProbs -= I->Prob;
switch (I->Kind) {
case CC_JumpTable: {
// FIXME: Optimize away range check based on pivot comparisons.
JumpTableHeader *JTH = &SL->JTCases[I->JTCasesIndex].first;
SwitchCG::JumpTable *JT = &SL->JTCases[I->JTCasesIndex].second;
// The jump block hasn't been inserted yet; insert it here.
MachineBasicBlock *JumpMBB = JT->MBB;
CurMF->insert(BBI, JumpMBB);
auto JumpProb = I->Prob;
auto FallthroughProb = UnhandledProbs;
// If the default statement is a target of the jump table, we evenly
// distribute the default probability to successors of CurMBB. Also
// update the probability on the edge from JumpMBB to Fallthrough.
for (MachineBasicBlock::succ_iterator SI = JumpMBB->succ_begin(),
SE = JumpMBB->succ_end();
SI != SE; ++SI) {
if (*SI == DefaultMBB) {
JumpProb += DefaultProb / 2;
FallthroughProb -= DefaultProb / 2;
JumpMBB->setSuccProbability(SI, DefaultProb / 2);
JumpMBB->normalizeSuccProbs();
break;
}
}
if (FallthroughUnreachable) {
// Skip the range check if the fallthrough block is unreachable.
JTH->OmitRangeCheck = true;
}
// Without a range check there is no edge from CurMBB to Fallthrough.
if (!JTH->OmitRangeCheck)
addSuccessorWithProb(CurMBB, Fallthrough, FallthroughProb);
addSuccessorWithProb(CurMBB, JumpMBB, JumpProb);
CurMBB->normalizeSuccProbs();
// The jump table header will be inserted in our current block, do the
// range check, and fall through to our fallthrough block.
JTH->HeaderBB = CurMBB;
JT->Default = Fallthrough; // FIXME: Move Default to JumpTableHeader.
// If we're in the right place, emit the jump table header right now.
if (CurMBB == SwitchMBB) {
visitJumpTableHeader(*JT, *JTH, SwitchMBB);
JTH->Emitted = true;
}
break;
}
case CC_BitTests: {
// FIXME: If Fallthrough is unreachable, skip the range check.
// FIXME: Optimize away range check based on pivot comparisons.
BitTestBlock *BTB = &SL->BitTestCases[I->BTCasesIndex];
// The bit test blocks haven't been inserted yet; insert them here.
for (BitTestCase &BTC : BTB->Cases)
CurMF->insert(BBI, BTC.ThisBB);
// Fill in fields of the BitTestBlock.
BTB->Parent = CurMBB;
BTB->Default = Fallthrough;
BTB->DefaultProb = UnhandledProbs;
// If the cases in bit test don't form a contiguous range, we evenly
// distribute the probability on the edge to Fallthrough to two
// successors of CurMBB.
if (!BTB->ContiguousRange) {
BTB->Prob += DefaultProb / 2;
BTB->DefaultProb -= DefaultProb / 2;
}
// If we're in the right place, emit the bit test header right now.
if (CurMBB == SwitchMBB) {
visitBitTestHeader(*BTB, SwitchMBB);
BTB->Emitted = true;
}
break;
}
case CC_Range: {
const Value *RHS, *LHS, *MHS;
ISD::CondCode CC;
if (I->Low == I->High) {
// Check Cond == I->Low.
CC = ISD::SETEQ;
LHS = Cond;
RHS=I->Low;
MHS = nullptr;
} else {
// Check I->Low <= Cond <= I->High.
CC = ISD::SETLE;
LHS = I->Low;
MHS = Cond;
RHS = I->High;
}
// If Fallthrough is unreachable, fold away the comparison.
if (FallthroughUnreachable)
CC = ISD::SETTRUE;
// The false probability is the sum of all unhandled cases.
CaseBlock CB(CC, LHS, RHS, MHS, I->MBB, Fallthrough, CurMBB,
getCurSDLoc(), I->Prob, UnhandledProbs);
// Emit the comparison now if it belongs to the switch block itself;
// otherwise queue it in SL->SwitchCases.
if (CurMBB == SwitchMBB)
visitSwitchCase(CB, SwitchMBB);
else
SL->SwitchCases.push_back(CB);
break;
}
}
// The next cluster is checked in the fallthrough block.
CurMBB = Fallthrough;
}
}
unsigned SelectionDAGBuilder::caseClusterRank(const CaseCluster &CC,
CaseClusterIt First,
CaseClusterIt Last) {
return std::count_if(First, Last + 1, [&](const CaseCluster &X) {
if (X.Prob != CC.Prob)
return X.Prob > CC.Prob;
// Ties are broken by comparing the case value.
return X.Low->getValue().slt(CC.Low->getValue());
});
}
/// Split the cluster range of work item \p W around a pivot into a left part
/// (values < pivot) and a right part (values >= pivot), and emit or queue the
/// "Cond < Pivot" branch that selects between them.
///
/// Each side either branches directly to its single cluster's destination
/// (when that cluster exactly fills the known value bounds) or gets a new
/// block plus a new entry pushed onto \p WorkList.
///
/// \param WorkList  Work list receiving the items for the two halves.
/// \param W         The work item to split; must contain at least two clusters
///                  sorted by case value (both asserted below).
/// \param Cond      The switch condition value.
/// \param SwitchMBB The block containing the switch; the branch is emitted
///                  immediately if W.MBB is this block, otherwise it is
///                  appended to SL->SwitchCases.
void SelectionDAGBuilder::splitWorkItem(SwitchWorkList &WorkList,
const SwitchWorkListItem &W,
Value *Cond,
MachineBasicBlock *SwitchMBB) {
assert(W.FirstCluster->Low->getValue().slt(W.LastCluster->Low->getValue()) &&
"Clusters not sorted?");
assert(W.LastCluster - W.FirstCluster + 1 >= 2 && "Too small to split!");
// Balance the tree based on branch probabilities to create a near-optimal (in
// terms of search time given key frequency) binary search tree. See e.g. Kurt
// Mehlhorn "Nearly Optimal Binary Search Trees" (1975).
CaseClusterIt LastLeft = W.FirstCluster;
CaseClusterIt FirstRight = W.LastCluster;
// Each side starts with its outermost cluster plus half of the default
// probability.
auto LeftProb = LastLeft->Prob + W.DefaultProb / 2;
auto RightProb = FirstRight->Prob + W.DefaultProb / 2;
// Move LastLeft and FirstRight towards each other from opposite directions to
// find a partitioning of the clusters which balances the probability on both
// sides. If LeftProb and RightProb are equal, alternate which side is
// taken to ensure 0-probability nodes are distributed evenly.
// I counts the iterations; its low bit decides which side wins a tie.
unsigned I = 0;
while (LastLeft + 1 < FirstRight) {
if (LeftProb < RightProb || (LeftProb == RightProb && (I & 1)))
LeftProb += (++LastLeft)->Prob;
else
RightProb += (--FirstRight)->Prob;
I++;
}
while (true) {
// Our binary search tree differs from a typical BST in that ours can have up
// to three values in each leaf. The pivot selection above doesn't take that
// into account, which means the tree might require more nodes and be less
// efficient. We compensate for this here.
unsigned NumLeft = LastLeft - W.FirstCluster + 1;
unsigned NumRight = W.LastCluster - FirstRight + 1;
if (std::min(NumLeft, NumRight) < 3 && std::max(NumLeft, NumRight) > 3) {
// If one side has less than 3 clusters, and the other has more than 3,
// consider taking a cluster from the other side.
if (NumLeft < NumRight) {
// Consider moving the first cluster on the right to the left side.
CaseCluster &CC = *FirstRight;
unsigned RightSideRank = caseClusterRank(CC, FirstRight, W.LastCluster);
unsigned LeftSideRank = caseClusterRank(CC, W.FirstCluster, LastLeft);
if (LeftSideRank <= RightSideRank) {
// Moving the cluster to the left does not demote it.
++LastLeft;
++FirstRight;
continue;
}
} else {
assert(NumRight < NumLeft);
// Consider moving the last element on the left to the right side.
CaseCluster &CC = *LastLeft;
unsigned LeftSideRank = caseClusterRank(CC, W.FirstCluster, LastLeft);
unsigned RightSideRank = caseClusterRank(CC, FirstRight, W.LastCluster);
if (RightSideRank <= LeftSideRank) {
// Moving the cluster to the right does not demote it.
--LastLeft;
--FirstRight;
continue;
}
}
}
break;
}
// The two halves must be adjacent and both non-empty.
assert(LastLeft + 1 == FirstRight);
assert(LastLeft >= W.FirstCluster);
assert(FirstRight <= W.LastCluster);
// Use the first element on the right as pivot since we will make less-than
// comparisons against it.
CaseClusterIt PivotCluster = FirstRight;
assert(PivotCluster > W.FirstCluster);
assert(PivotCluster <= W.LastCluster);
CaseClusterIt FirstLeft = W.FirstCluster;
CaseClusterIt LastRight = W.LastCluster;
const ConstantInt *Pivot = PivotCluster->Low;
// New blocks will be inserted immediately after the current one.
MachineFunction::iterator BBI(W.MBB);
++BBI;
// We will branch to the LHS if Value < Pivot. If LHS is a single cluster,
// we can branch to its destination directly if it's squeezed exactly in
// between the known lower bound and Pivot - 1.
MachineBasicBlock *LeftMBB;
if (FirstLeft == LastLeft && FirstLeft->Kind == CC_Range &&
FirstLeft->Low == W.GE &&
(FirstLeft->High->getValue() + 1LL) == Pivot->getValue()) {
LeftMBB = FirstLeft->MBB;
} else {
// Otherwise the left half gets its own block and work item, bounded by
// [W.GE, Pivot) and carrying half of the default probability.
LeftMBB = FuncInfo.MF->CreateMachineBasicBlock(W.MBB->getBasicBlock());
FuncInfo.MF->insert(BBI, LeftMBB);
WorkList.push_back(
{LeftMBB, FirstLeft, LastLeft, W.GE, Pivot, W.DefaultProb / 2});
// Put Cond in a virtual register to make it available from the new blocks.
ExportFromCurrentBlock(Cond);
}
// Similarly, we will branch to the RHS if Value >= Pivot. If RHS is a
// single cluster, RHS.Low == Pivot, and we can branch to its destination
// directly if RHS.High equals the current upper bound.
MachineBasicBlock *RightMBB;
if (FirstRight == LastRight && FirstRight->Kind == CC_Range &&
W.LT && (FirstRight->High->getValue() + 1ULL) == W.LT->getValue()) {
RightMBB = FirstRight->MBB;
} else {
// Otherwise the right half gets its own block and work item, bounded by
// [Pivot, W.LT) and carrying the other half of the default probability.
RightMBB = FuncInfo.MF->CreateMachineBasicBlock(W.MBB->getBasicBlock());
FuncInfo.MF->insert(BBI, RightMBB);
WorkList.push_back(
{RightMBB, FirstRight, LastRight, Pivot, W.LT, W.DefaultProb / 2});
// Put Cond in a virtual register to make it available from the new blocks.
ExportFromCurrentBlock(Cond);
}
// Create the CaseBlock record that will be used to lower the branch.
CaseBlock CB(ISD::SETLT, Cond, Pivot, nullptr, LeftMBB, RightMBB, W.MBB,
getCurSDLoc(), LeftProb, RightProb);
// Emit the branch now if it lives in the switch block, otherwise queue it.
if (W.MBB == SwitchMBB)
visitSwitchCase(CB, SwitchMBB);
else
SL->SwitchCases.push_back(CB);
}
// Rescale CaseProb after a case with probability PeeledCaseProb has been
// peeled from the switch statement: the numerator is kept and the denominator
// is scaled by the complement of PeeledCaseProb.
static BranchProbability scaleCaseProbality(BranchProbability CaseProb,
                                            BranchProbability PeeledCaseProb) {
  // If the peeled case is always taken, the remaining cases get probability
  // zero.
  if (PeeledCaseProb == BranchProbability::getOne())
    return BranchProbability::getZero();

  const BranchProbability RemainingProb = PeeledCaseProb.getCompl();
  const uint32_t Num = CaseProb.getNumerator();
  const uint32_t ScaledDenom = RemainingProb.scale(CaseProb.getDenominator());
  // Never let the denominator drop below the numerator, so that the result
  // does not exceed one.
  const uint32_t Denom = ScaledDenom < Num ? Num : ScaledDenom;
  return BranchProbability(Num, Denom);
}
// Peel the most probable case cluster off the switch if its probability
// reaches SwitchPeelThreshold percent.
// When nothing is peeled, the current MachineBasicBlock of the switch
// statement is returned and Clusters/PeeledCaseProb are left untouched.
// When a cluster is peeled, it is lowered in the current block, removed from
// Clusters, the probabilities of the remaining clusters are rescaled,
// PeeledCaseProb is set to the probability of the peeled cluster, and the
// newly created MachineBasicBlock for the rest of the switch is returned.
MachineBasicBlock *SelectionDAGBuilder::peelDominantCaseCluster(
    const SwitchInst &SI, CaseClusterVector &Clusters,
    BranchProbability &PeeledCaseProb) {
  MachineBasicBlock *SwitchMBB = FuncInfo.MBB;

  // No peeling when the threshold is above 100, when there is no branch
  // probability info, when there are fewer than two clusters, when not
  // optimizing, or when optimizing for minimum size.
  if (SwitchPeelThreshold > 100 || !FuncInfo.BPI)
    return SwitchMBB;
  if (Clusters.size() < 2 || TM.getOptLevel() == CodeGenOpt::None)
    return SwitchMBB;
  if (SwitchMBB->getParent()->getFunction().hasMinSize())
    return SwitchMBB;

  // Look for the cluster with the highest probability that is at least the
  // threshold; among equally probable candidates the last one wins.
  BranchProbability BestProb = BranchProbability(SwitchPeelThreshold, 100);
  unsigned BestIdx = 0;
  bool FoundCandidate = false;
  for (unsigned Idx = 0; Idx < Clusters.size(); ++Idx) {
    CaseCluster &Candidate = Clusters[Idx];
    if (!(Candidate.Prob < BestProb)) {
      BestProb = Candidate.Prob;
      BestIdx = Idx;
      FoundCandidate = true;
    }
  }
  if (!FoundCandidate)
    return SwitchMBB;

  LLVM_DEBUG(dbgs() << "Peeled one top case in switch stmt, prob: "
                    << BestProb << "\n");

  // Create the block that will hold the remainder of the switch and place it
  // right after the current switch block.
  MachineFunction::iterator InsertPos(SwitchMBB);
  ++InsertPos;
  MachineBasicBlock *RestMBB =
      FuncInfo.MF->CreateMachineBasicBlock(SwitchMBB->getBasicBlock());
  FuncInfo.MF->insert(InsertPos, RestMBB);

  // The condition must be visible from the new block as well.
  ExportFromCurrentBlock(SI.getCondition());

  // Lower the peeled cluster on its own, with the new block acting as its
  // default destination.
  auto PeeledIt = Clusters.begin() + BestIdx;
  SwitchWorkListItem W = {SwitchMBB, PeeledIt, PeeledIt,
                          nullptr,   nullptr,  BestProb.getCompl()};
  lowerWorkItem(W, SI.getCondition(), SwitchMBB, RestMBB);

  // Drop the peeled cluster and rescale what is left.
  Clusters.erase(PeeledIt);
  for (CaseCluster &Remaining : Clusters) {
    LLVM_DEBUG(
        dbgs() << "Scale the probablity for one cluster, before scaling: "
               << Remaining.Prob << "\n");
    Remaining.Prob = scaleCaseProbality(Remaining.Prob, BestProb);
    LLVM_DEBUG(dbgs() << "After scaling: " << Remaining.Prob << "\n");
  }

  PeeledCaseProb = BestProb;
  return RestMBB;
}
/// Lower the switch instruction \p SI.
///
/// The cases are turned into case clusters, adjacent cases are merged, a
/// dominant case is optionally peeled off, jump tables and bit test clusters
/// are formed, and the remaining clusters are lowered through a work list
/// that either splits large ranges (splitWorkItem) or lowers them directly
/// (lowerWorkItem).
void SelectionDAGBuilder::visitSwitch(const SwitchInst &SI) {
// Extract cases from the switch.
BranchProbabilityInfo *BPI = FuncInfo.BPI;
CaseClusterVector Clusters;
Clusters.reserve(SI.getNumCases());
for (auto I : SI.cases()) {
MachineBasicBlock *Succ = FuncInfo.MBBMap[I.getCaseSuccessor()];
const ConstantInt *CaseVal = I.getCaseValue();
// Without branch probability info every case (and the default) is given
// the same probability.
BranchProbability Prob =
BPI ? BPI->getEdgeProbability(SI.getParent(), I.getSuccessorIndex())
: BranchProbability(1, SI.getNumCases() + 1);
Clusters.push_back(CaseCluster::range(CaseVal, CaseVal, Succ, Prob));
}
MachineBasicBlock *DefaultMBB = FuncInfo.MBBMap[SI.getDefaultDest()];
// Cluster adjacent cases with the same destination. We do this at all
// optimization levels because it's cheap to do and will make codegen faster
// if there are many clusters.
sortAndRangeify(Clusters);
// The branch probability of the peeled case.
BranchProbability PeeledCaseProb = BranchProbability::getZero();
// PeeledSwitchMBB is the block in which the (rest of the) switch is lowered;
// it equals the current block when no case was peeled.
MachineBasicBlock *PeeledSwitchMBB =
peelDominantCaseCluster(SI, Clusters, PeeledCaseProb);
// If there is only the default destination, jump there directly.
MachineBasicBlock *SwitchMBB = FuncInfo.MBB;
if (Clusters.empty()) {
assert(PeeledSwitchMBB == SwitchMBB);
SwitchMBB->addSuccessor(DefaultMBB);
// No explicit branch is needed when the default block is the next block.
if (DefaultMBB != NextBlock(SwitchMBB)) {
DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other,
getControlRoot(), DAG.getBasicBlock(DefaultMBB)));
}
return;
}
SL->findJumpTables(Clusters, &SI, DefaultMBB);
SL->findBitTestClusters(Clusters, &SI);
// Debug output: print every cluster as [JT:|BT:]Low[-High].
LLVM_DEBUG({
dbgs() << "Case clusters: ";
for (const CaseCluster &C : Clusters) {
if (C.Kind == CC_JumpTable)
dbgs() << "JT:";
if (C.Kind == CC_BitTests)
dbgs() << "BT:";
C.Low->getValue().print(dbgs(), true);
if (C.Low != C.High) {
dbgs() << '-';
C.High->getValue().print(dbgs(), true);
}
dbgs() << ' ';
}
dbgs() << '\n';
});
assert(!Clusters.empty());
SwitchWorkList WorkList;
CaseClusterIt First = Clusters.begin();
CaseClusterIt Last = Clusters.end() - 1;
auto DefaultProb = getEdgeProbability(PeeledSwitchMBB, DefaultMBB);
// Scale the branch probability for DefaultMBB if the peel occurs and
// DefaultMBB is not replaced.
if (PeeledCaseProb != BranchProbability::getZero() &&
DefaultMBB == FuncInfo.MBBMap[SI.getDefaultDest()])
DefaultProb = scaleCaseProbality(DefaultProb, PeeledCaseProb);
// Seed the work list with a single item covering all clusters.
WorkList.push_back(
{PeeledSwitchMBB, First, Last, nullptr, nullptr, DefaultProb});
while (!WorkList.empty()) {
SwitchWorkListItem W = WorkList.back();
WorkList.pop_back();
unsigned NumClusters = W.LastCluster - W.FirstCluster + 1;
if (NumClusters > 3 && TM.getOptLevel() != CodeGenOpt::None &&
!DefaultMBB->getParent()->getFunction().hasMinSize()) {
// For optimized builds, lower large range as a balanced binary tree.
splitWorkItem(WorkList, W, SI.getCondition(), SwitchMBB);
continue;
}
lowerWorkItem(W, SI.getCondition(), SwitchMBB, DefaultMBB);
}
}
Index: vendor/llvm/dist-release_90/lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- vendor/llvm/dist-release_90/lib/CodeGen/SelectionDAG/TargetLowering.cpp (revision 351302)
+++ vendor/llvm/dist-release_90/lib/CodeGen/SelectionDAG/TargetLowering.cpp (revision 351303)
@@ -1,6284 +1,6288 @@
//===-- TargetLowering.cpp - Implement the TargetLowering class -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This implements the TargetLowering class.
//
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include <cctype>
using namespace llvm;
/// Construct the lowering object for \p tm by forwarding it to the
/// TargetLoweringBase constructor; nothing else is initialized here.
/// NOTE: The TargetMachine owns TLOF.
TargetLowering::TargetLowering(const TargetMachine &tm)
    : TargetLoweringBase(tm) {
}
/// Base implementation of the target node name lookup: it knows no name for
/// any \p Opcode and therefore always returns nullptr.
const char *TargetLowering::getTargetNodeName(unsigned Opcode) const {
  return nullptr;
}
bool TargetLowering::isPositionIndependent() const {
return getTargetMachine().isPositionIndependent();
}
/// Determine whether the call node \p Node sits in tail position within its
/// function. On success \p Chain is set (by isUsedByReturnOnly) to the input
/// chain of the tail call.
bool TargetLowering::isInTailCallPosition(SelectionDAG &DAG, SDNode *Node,
                                          SDValue &Chain) const {
  AttributeList CallerAttrs =
      DAG.getMachineFunction().getFunction().getAttributes();

  // Be conservative: the caller's return attributes must not contain anything
  // besides NoAlias and NonNull, which have no effect on the call sequence.
  AttrBuilder RetAttrs(CallerAttrs, AttributeList::ReturnIndex);
  RetAttrs.removeAttribute(Attribute::NoAlias);
  RetAttrs.removeAttribute(Attribute::NonNull);
  if (RetAttrs.hasAttributes())
    return false;

  // Dropping a sign or zero extension of the return value would be unsafe.
  if (CallerAttrs.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt))
    return false;
  if (CallerAttrs.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt))
    return false;

  // Finally, the node must be used by a function return only.
  return isUsedByReturnOnly(Node, Chain);
}
bool TargetLowering::parametersInCSRMatch(const MachineRegisterInfo &MRI,
const uint32_t *CallerPreservedMask,
const SmallVectorImpl<CCValAssign> &ArgLocs,
const SmallVectorImpl<SDValue> &OutVals) const {
for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
const CCValAssign &ArgLoc = ArgLocs[I];
if (!ArgLoc.isRegLoc())
continue;
unsigned Reg = ArgLoc.getLocReg();
// Only look at callee saved registers.
if (MachineOperand::clobbersPhysReg(CallerPreservedMask, Reg))
continue;
// Check that we pass the value used for the caller.
// (We look for a CopyFromReg reading a virtual register that is used
// for the function live-in value of register Reg)
SDValue Value = OutVals[I];
if (Value->getOpcode() != ISD::CopyFromReg)
return false;
unsigned ArgReg = cast<RegisterSDNode>(Value->getOperand(1))->getReg();
if (MRI.getLiveInPhysReg(ArgReg) != Reg)
return false;
}
return true;
}
/// Set CallLoweringInfo attribute flags based on a call instruction
/// and called function attributes.
void TargetLoweringBase::ArgListEntry::setAttributes(const CallBase *Call,
unsigned ArgIdx) {
IsSExt = Call->paramHasAttr(ArgIdx, Attribute::SExt);
IsZExt = Call->paramHasAttr(ArgIdx, Attribute::ZExt);
IsInReg = Call->paramHasAttr(ArgIdx, Attribute::InReg);
IsSRet = Call->paramHasAttr(ArgIdx, Attribute::StructRet);
IsNest = Call->paramHasAttr(ArgIdx, Attribute::Nest);
IsByVal = Call->paramHasAttr(ArgIdx, Attribute::ByVal);
IsInAlloca = Call->paramHasAttr(ArgIdx, Attribute::InAlloca);
IsReturned = Call->paramHasAttr(ArgIdx, Attribute::Returned);
IsSwiftSelf = Call->paramHasAttr(ArgIdx, Attribute::SwiftSelf);
IsSwiftError = Call->paramHasAttr(ArgIdx, Attribute::SwiftError);
Alignment = Call->getParamAlignment(ArgIdx);
ByValType = nullptr;
if (Call->paramHasAttr(ArgIdx, Attribute::ByVal))
ByValType = Call->getParamByValType(ArgIdx);
}
/// Emit a call to the runtime library routine \p LC with \p Ops as arguments
/// and a result of type \p RetVT.
///
/// Arguments and the result are marked sign- or zero-extended according to
/// shouldSignExtendTypeInLibCall(type, \p isSigned). Reports a fatal error if
/// \p LC is RTLIB::UNKNOWN_LIBCALL.
///
/// \returns the pair produced by LowerCallTo for the call.
std::pair<SDValue, SDValue>
TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT,
                            ArrayRef<SDValue> Ops, bool isSigned,
                            const SDLoc &dl, bool doesNotReturn,
                            bool isReturnValueUsed,
                            bool isPostTypeLegalization) const {
  // Build one argument list entry per operand.
  TargetLowering::ArgListTy ArgList;
  ArgList.reserve(Ops.size());
  TargetLowering::ArgListEntry Arg;
  for (const SDValue &Operand : Ops) {
    const EVT OperandVT = Operand.getValueType();
    const bool SExtArg = shouldSignExtendTypeInLibCall(OperandVT, isSigned);
    Arg.Node = Operand;
    Arg.Ty = OperandVT.getTypeForEVT(*DAG.getContext());
    Arg.IsSExt = SExtArg;
    Arg.IsZExt = !SExtArg;
    ArgList.push_back(Arg);
  }

  if (LC == RTLIB::UNKNOWN_LIBCALL)
    report_fatal_error("Unsupported library call operation!");

  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
                                         getPointerTy(DAG.getDataLayout()));
  Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
  const bool SExtResult = shouldSignExtendTypeInLibCall(RetVT, isSigned);

  // The call is chained to the entry node of the DAG.
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl);
  CLI.setChain(DAG.getEntryNode());
  CLI.setLibCallee(getLibcallCallingConv(LC), RetTy, Callee,
                   std::move(ArgList));
  CLI.setNoReturn(doesNotReturn);
  CLI.setDiscardResult(!isReturnValueUsed);
  CLI.setIsPostTypeLegalization(isPostTypeLegalization);
  CLI.setSExtResult(SExtResult);
  CLI.setZExtResult(!SExtResult);
  return LowerCallTo(CLI);
}
/// Compute the sequence of value types to use for the individual loads/stores
/// of a memory operation covering \p Size bytes and append them to \p MemOps.
///
/// \param MemOps         Output: one value type per memory operation, in order.
/// \param Limit          Maximum number of memory operations allowed.
/// \param Size           Number of bytes to cover.
/// \param DstAlign       Destination alignment; zero means it may be changed.
/// \param SrcAlign       Source alignment; zero means no load is needed.
/// \param IsMemset, ZeroMemset, MemcpyStrSrc, FuncAttributes
///                       Forwarded to getOptimalMemOpType.
/// \param AllowOverlap   Whether the last operation may overlap the previous
///                       one instead of using smaller types for the tail.
/// \param DstAS          Destination address space, used for the misaligned
///                       access queries.
/// \param SrcAS          Source address space (not used in this function).
/// \returns false if SrcAlign is nonzero but smaller than DstAlign, or if more
///          than \p Limit operations would be required; true otherwise. Note
///          that \p MemOps may already contain entries when false is returned.
bool
TargetLowering::findOptimalMemOpLowering(std::vector<EVT> &MemOps,
unsigned Limit, uint64_t Size,
unsigned DstAlign, unsigned SrcAlign,
bool IsMemset,
bool ZeroMemset,
bool MemcpyStrSrc,
bool AllowOverlap,
unsigned DstAS, unsigned SrcAS,
const AttributeList &FuncAttributes) const {
// If 'SrcAlign' is zero, that means the memory operation does not need to
// load the value, i.e. memset or memcpy from constant string. Otherwise,
// it's the inferred alignment of the source. 'DstAlign', on the other hand,
// is the specified alignment of the memory operation. If it is zero, that
// means it's possible to change the alignment of the destination.
// 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does
// not need to be loaded.
if (!(SrcAlign == 0 || SrcAlign >= DstAlign))
return false;
// Ask the target for its preferred type first; MVT::Other means it has no
// preference and a generic choice is made below.
EVT VT = getOptimalMemOpType(Size, DstAlign, SrcAlign,
IsMemset, ZeroMemset, MemcpyStrSrc,
FuncAttributes);
if (VT == MVT::Other) {
// Use the largest integer type whose alignment constraints are satisfied.
// We only need to check DstAlign here as SrcAlign is always greater or
// equal to DstAlign (or zero).
VT = MVT::i64;
// Step down to the next smaller simple type while the destination alignment
// is insufficient and misaligned accesses of this type are not allowed.
while (DstAlign && DstAlign < VT.getSizeInBits() / 8 &&
!allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign))
VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1);
assert(VT.isInteger());
// Find the largest legal integer type.
MVT LVT = MVT::i64;
while (!isTypeLegal(LVT))
LVT = (MVT::SimpleValueType)(LVT.SimpleTy - 1);
assert(LVT.isInteger());
// If the type we've chosen is larger than the largest legal integer type
// then use that instead.
if (VT.bitsGT(LVT))
VT = LVT;
}
unsigned NumMemOps = 0;
// Emit one type per iteration until all Size bytes are covered.
while (Size != 0) {
unsigned VTSize = VT.getSizeInBits() / 8;
// Shrink the type while it is larger than what is left to cover.
while (VTSize > Size) {
// For now, only use non-vector load / store's for the left-over pieces.
EVT NewVT = VT;
unsigned NewVTSize;
bool Found = false;
if (VT.isVector() || VT.isFloatingPoint()) {
NewVT = (VT.getSizeInBits() > 64) ? MVT::i64 : MVT::i32;
if (isOperationLegalOrCustom(ISD::STORE, NewVT) &&
isSafeMemOpType(NewVT.getSimpleVT()))
Found = true;
else if (NewVT == MVT::i64 &&
isOperationLegalOrCustom(ISD::STORE, MVT::f64) &&
isSafeMemOpType(MVT::f64)) {
// i64 is usually not legal on 32-bit targets, but f64 may be.
NewVT = MVT::f64;
Found = true;
}
}
if (!Found) {
// Walk down the simple value types until one is safe for memory
// operations; i8 is accepted unconditionally.
do {
NewVT = (MVT::SimpleValueType)(NewVT.getSimpleVT().SimpleTy - 1);
if (NewVT == MVT::i8)
break;
} while (!isSafeMemOpType(NewVT.getSimpleVT()));
}
NewVTSize = NewVT.getSizeInBits() / 8;
// If the new VT cannot cover all of the remaining bits, then consider
// issuing a (or a pair of) unaligned and overlapping load / store.
bool Fast;
if (NumMemOps && AllowOverlap && NewVTSize < Size &&
allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign,
MachineMemOperand::MONone, &Fast) &&
Fast)
// Keep the current (larger) VT and let it overlap the previous operation;
// it then accounts for exactly the remaining bytes.
VTSize = Size;
else {
VT = NewVT;
VTSize = NewVTSize;
}
}
if (++NumMemOps > Limit)
return false;
MemOps.push_back(VT);
Size -= VTSize;
}
return true;
}
/// Soften the operands of a floating-point comparison by replacing it with one
/// or two soft-float comparison libcalls. This code is shared among the BR_CC,
/// SELECT_CC, and SETCC handlers.
///
/// \param VT      Type of the compared operands (f32, f64, f128 or ppcf128).
/// \param NewLHS  In: left operand. Out: the value to compare or use instead.
/// \param NewRHS  In: right operand. Out: the constant to compare the libcall
///                result against, or an empty SDValue when two libcalls were
///                combined into \p NewLHS.
/// \param CCCode  In: the floating-point condition. Out: the condition to use
///                for comparing the (first) libcall result with \p NewRHS.
void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT,
                                         SDValue &NewLHS, SDValue &NewRHS,
                                         ISD::CondCode &CCCode,
                                         const SDLoc &dl) const {
  assert((VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f128 || VT == MVT::ppcf128)
         && "Unsupported setcc type!");

  // Pick the libcall variant that matches the operand type VT.
  auto ForVT = [&VT](RTLIB::Libcall F32, RTLIB::Libcall F64,
                     RTLIB::Libcall F128, RTLIB::Libcall PPCF128) {
    return (VT == MVT::f32) ? F32 :
           (VT == MVT::f64) ? F64 :
           (VT == MVT::f128) ? F128 : PPCF128;
  };

  // Expand into one or more soft-fp libcall(s).
  RTLIB::Libcall LC1 = RTLIB::UNKNOWN_LIBCALL;
  RTLIB::Libcall LC2 = RTLIB::UNKNOWN_LIBCALL;
  bool ShouldInvertCC = false;
  switch (CCCode) {
  case ISD::SETEQ:
  case ISD::SETOEQ:
    LC1 = ForVT(RTLIB::OEQ_F32, RTLIB::OEQ_F64, RTLIB::OEQ_F128,
                RTLIB::OEQ_PPCF128);
    break;
  case ISD::SETNE:
  case ISD::SETUNE:
    LC1 = ForVT(RTLIB::UNE_F32, RTLIB::UNE_F64, RTLIB::UNE_F128,
                RTLIB::UNE_PPCF128);
    break;
  case ISD::SETGE:
  case ISD::SETOGE:
    LC1 = ForVT(RTLIB::OGE_F32, RTLIB::OGE_F64, RTLIB::OGE_F128,
                RTLIB::OGE_PPCF128);
    break;
  case ISD::SETLT:
  case ISD::SETOLT:
    LC1 = ForVT(RTLIB::OLT_F32, RTLIB::OLT_F64, RTLIB::OLT_F128,
                RTLIB::OLT_PPCF128);
    break;
  case ISD::SETLE:
  case ISD::SETOLE:
    LC1 = ForVT(RTLIB::OLE_F32, RTLIB::OLE_F64, RTLIB::OLE_F128,
                RTLIB::OLE_PPCF128);
    break;
  case ISD::SETGT:
  case ISD::SETOGT:
    LC1 = ForVT(RTLIB::OGT_F32, RTLIB::OGT_F64, RTLIB::OGT_F128,
                RTLIB::OGT_PPCF128);
    break;
  case ISD::SETUO:
    LC1 = ForVT(RTLIB::UO_F32, RTLIB::UO_F64, RTLIB::UO_F128,
                RTLIB::UO_PPCF128);
    break;
  case ISD::SETO:
    LC1 = ForVT(RTLIB::O_F32, RTLIB::O_F64, RTLIB::O_F128, RTLIB::O_PPCF128);
    break;
  case ISD::SETONE:
    // SETONE = SETOLT | SETOGT
    LC1 = ForVT(RTLIB::OLT_F32, RTLIB::OLT_F64, RTLIB::OLT_F128,
                RTLIB::OLT_PPCF128);
    LC2 = ForVT(RTLIB::OGT_F32, RTLIB::OGT_F64, RTLIB::OGT_F128,
                RTLIB::OGT_PPCF128);
    break;
  case ISD::SETUEQ:
    // SETUEQ = SETUO | SETOEQ
    LC1 = ForVT(RTLIB::UO_F32, RTLIB::UO_F64, RTLIB::UO_F128,
                RTLIB::UO_PPCF128);
    LC2 = ForVT(RTLIB::OEQ_F32, RTLIB::OEQ_F64, RTLIB::OEQ_F128,
                RTLIB::OEQ_PPCF128);
    break;
  // The remaining unordered comparisons are lowered by calling the libcall of
  // the opposite ordered comparison and inverting the resulting condition.
  case ISD::SETULT:
    ShouldInvertCC = true;
    LC1 = ForVT(RTLIB::OGE_F32, RTLIB::OGE_F64, RTLIB::OGE_F128,
                RTLIB::OGE_PPCF128);
    break;
  case ISD::SETULE:
    ShouldInvertCC = true;
    LC1 = ForVT(RTLIB::OGT_F32, RTLIB::OGT_F64, RTLIB::OGT_F128,
                RTLIB::OGT_PPCF128);
    break;
  case ISD::SETUGT:
    ShouldInvertCC = true;
    LC1 = ForVT(RTLIB::OLE_F32, RTLIB::OLE_F64, RTLIB::OLE_F128,
                RTLIB::OLE_PPCF128);
    break;
  case ISD::SETUGE:
    ShouldInvertCC = true;
    LC1 = ForVT(RTLIB::OLT_F32, RTLIB::OLT_F64, RTLIB::OLT_F128,
                RTLIB::OLT_PPCF128);
    break;
  default:
    llvm_unreachable("Do not know how to soften this setcc!");
  }

  // Use the target specific return value for comparison lib calls.
  EVT RetVT = getCmpLibcallReturnType();
  SDValue Ops[2] = {NewLHS, NewRHS};
  NewLHS = makeLibCall(DAG, LC1, RetVT, Ops, false /*sign irrelevant*/,
                       dl).first;
  NewRHS = DAG.getConstant(0, dl, RetVT);
  CCCode = getCmpLibcallCC(LC1);
  if (ShouldInvertCC)
    CCCode = getSetCCInverse(CCCode, /*isInteger=*/true);

  // A single libcall is enough: the caller compares NewLHS against NewRHS
  // using CCCode.
  if (LC2 == RTLIB::UNKNOWN_LIBCALL)
    return;

  // Two libcalls: materialize both comparisons and OR their results. NewRHS is
  // cleared because NewLHS then already is the final value.
  SDValue FirstCmp = DAG.getNode(
      ISD::SETCC, dl,
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), RetVT),
      NewLHS, NewRHS, DAG.getCondCode(CCCode));
  SDValue SecondCall = makeLibCall(DAG, LC2, RetVT, Ops,
                                   false /*sign irrelevant*/, dl).first;
  SDValue SecondCmp = DAG.getNode(
      ISD::SETCC, dl,
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), RetVT),
      SecondCall, NewRHS, DAG.getCondCode(getCmpLibcallCC(LC2)));
  NewLHS = DAG.getNode(ISD::OR, dl, FirstCmp.getValueType(), FirstCmp,
                       SecondCmp);
  NewRHS = SDValue();
}
/// Return the entry encoding for a jump table in the current function. The
/// returned value is a member of the MachineJumpTableInfo::JTEntryKind enum.
unsigned TargetLowering::getJumpTableEncoding() const {
  if (isPositionIndependent()) {
    // In PIC mode prefer a GPRel32 directive if the target has one, and fall
    // back to a label difference otherwise.
    const bool HasGPRel32 =
        getTargetMachine().getMCAsmInfo()->getGPRel32Directive() != nullptr;
    return HasGPRel32 ? MachineJumpTableInfo::EK_GPRel32BlockAddress
                      : MachineJumpTableInfo::EK_LabelDifference32;
  }
  // In non-pic modes, just use the address of a block.
  return MachineJumpTableInfo::EK_BlockAddress;
}
/// Return the value that PIC jump table entries are relative to: the global
/// offset table for the GP-relative encodings, and \p Table itself for every
/// other encoding.
SDValue TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                 SelectionDAG &DAG) const {
  switch (getJumpTableEncoding()) {
  case MachineJumpTableInfo::EK_GPRel64BlockAddress:
  case MachineJumpTableInfo::EK_GPRel32BlockAddress:
    // If our PIC model is GP relative, use the global offset table as the
    // base.
    return DAG.getGLOBAL_OFFSET_TABLE(getPointerTy(DAG.getDataLayout()));
  default:
    return Table;
  }
}
/// Return the relocation base for the PIC jump table \p JTI of \p MF, the same
/// as getPICJumpTableRelocBase, but as an MCExpr.
const MCExpr *
TargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
                                             unsigned JTI,
                                             MCContext &Ctx) const {
  // The normal PIC reloc base is the label at the start of the jump table.
  auto *TableLabel = MF->getJTISymbol(JTI, Ctx);
  return MCSymbolRefExpr::create(TableLabel, Ctx);
}
bool
TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
const TargetMachine &TM = getTargetMachine();
const GlobalValue *GV = GA->getGlobal();
// If the address is not even local to this DSO we will have to load it from
// a got and then add the offset.
if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
return false;
// If the code is position independent we will have to add a base register.
if (isPositionIndependent())
return false;
// Otherwise we can do it.
return true;
}
//===----------------------------------------------------------------------===//
// Optimization Methods
//===----------------------------------------------------------------------===//
/// For an AND/OR/XOR whose second operand is a constant integer, drop the
/// bits of that constant that are not in \p Demanded. Returns true when the
/// node was replaced through \p TLO, false when nothing was changed.
bool TargetLowering::ShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
                                            TargetLoweringOpt &TLO) const {
  SDLoc DL(Op);
  const unsigned Opc = Op.getOpcode();

  // The target gets the first chance to optimize the constant; if it claims
  // the node, report whether it actually recorded a replacement.
  if (targetShrinkDemandedConstant(Op, Demanded, TLO))
    return TLO.New.getNode();

  // FIXME: ISD::SELECT, ISD::SELECT_CC
  // Only the bitwise logic opcodes are handled here.
  const bool IsLogicOp = Opc == ISD::XOR || Opc == ISD::AND || Opc == ISD::OR;
  if (!IsLogicOp)
    return false;

  // The right-hand operand has to be a plain constant.
  auto *RHSConst = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!RHSConst)
    return false;
  const APInt &Imm = RHSConst->getAPIntValue();

  // An XOR that flips every demanded bit acts as a 'not'; that is a canonical
  // form, so leave it untouched.
  if (Opc == ISD::XOR && Demanded.isSubsetOf(Imm))
    return false;

  // Nothing to shrink when the constant has no bits outside the demanded set.
  if (Imm.isSubsetOf(Demanded))
    return false;

  // Rebuild the operation with the undemanded constant bits cleared.
  EVT VT = Op.getValueType();
  SDValue NarrowImm = TLO.DAG.getConstant(Demanded & Imm, DL, VT);
  SDValue Replacement =
      TLO.DAG.getNode(Opc, DL, VT, Op.getOperand(0), NarrowImm);
  return TLO.CombineTo(Op, Replacement);
}
/// Try to perform the binary operation \p Op in a narrower integer type:
/// x op y becomes (VT)((SmallVT)x op (SmallVT)y) when the casts are free.
/// Profitability is judged with isTruncateFree and isZExtFree; the narrowed
/// result is widened back with ANY_EXTEND. This could be generalized for
/// targets with other kinds of implicit widening casts.
bool TargetLowering::ShrinkDemandedOp(SDValue Op, unsigned BitWidth,
                                      const APInt &Demanded,
                                      TargetLoweringOpt &TLO) const {
  assert(Op.getNumOperands() == 2 &&
         "ShrinkDemandedOp only supports binary operators!");
  assert(Op.getNode()->getNumValues() == 1 &&
         "ShrinkDemandedOp only supports nodes with one result!");

  SelectionDAG &DAG = TLO.DAG;
  SDLoc dl(Op);
  const EVT WideVT = Op.getValueType();

  // Vector types are not handled, and a node with other users may still need
  // the full-width value, so bail out in both cases.
  if (WideVT.isVector() || !Op.getNode()->hasOneUse())
    return false;

  // Look for the smallest integer type with free casts to and from Op's type.
  // For expedience only power-of-2 widths are tried, starting from the first
  // one that covers every demanded bit.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  const unsigned DemandedSize = Demanded.getActiveBits();
  unsigned NarrowBits = DemandedSize;
  if (!isPowerOf2_32(NarrowBits))
    NarrowBits = NextPowerOf2(NarrowBits);

  while (NarrowBits < BitWidth) {
    EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), NarrowBits);
    const bool CastsAreFree = TLI.isTruncateFree(WideVT, NarrowVT) &&
                              TLI.isZExtFree(NarrowVT, WideVT);
    if (CastsAreFree) {
      // Found a usable type: truncate both operands, redo the operation in
      // the narrow type, then extend the result back to the original type.
      SDValue NarrowOp = DAG.getNode(
          Op.getOpcode(), dl, NarrowVT,
          DAG.getNode(ISD::TRUNCATE, dl, NarrowVT, Op.getOperand(0)),
          DAG.getNode(ISD::TRUNCATE, dl, NarrowVT, Op.getOperand(1)));
      assert(DemandedSize <= NarrowBits && "Narrowed below demanded bits?");
      SDValue Widened = DAG.getNode(ISD::ANY_EXTEND, dl, WideVT, NarrowOp);
      return TLO.CombineTo(Op, Widened);
    }
    NarrowBits = NextPowerOf2(NarrowBits);
  }
  return false;
}
/// DAG-combiner entry point: run SimplifyDemandedBits on \p Op and, when it
/// simplified something, re-queue the node on the combiner worklist and
/// commit the recorded replacement through \p DCI. Returns true if anything
/// was simplified.
bool TargetLowering::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
                                          DAGCombinerInfo &DCI) const {
  // The legality flags of the optimization context mirror the combiner's
  // current phase.
  TargetLoweringOpt TLO(DCI.DAG, !DCI.isBeforeLegalize(),
                        !DCI.isBeforeLegalizeOps());
  KnownBits Known;
  if (!SimplifyDemandedBits(Op, DemandedBits, Known, TLO))
    return false;

  DCI.AddToWorklist(Op.getNode());
  DCI.CommitTargetLoweringOpt(TLO);
  return true;
}
/// Convenience overload that demands every element of \p Op: all lanes for a
/// vector type, or a single one-bit element mask for anything else. The work
/// is forwarded to the overload that takes an explicit DemandedElts mask.
bool TargetLowering::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
                                          KnownBits &Known,
                                          TargetLoweringOpt &TLO,
                                          unsigned Depth,
                                          bool AssumeSingleUse) const {
  const EVT OpVT = Op.getValueType();
  // Non-vector default: a single demanded "element".
  APInt AllElts(1, 1);
  if (OpVT.isVector())
    AllElts = APInt::getAllOnesValue(OpVT.getVectorNumElements());
  return SimplifyDemandedBits(Op, DemandedBits, AllElts, Known, TLO, Depth,
                              AssumeSingleUse);
}
/// Look at Op. At this point, we know that only the OriginalDemandedBits of the
/// result of Op are ever used downstream. If we can use this information to
/// simplify Op, create a new simplified DAG node and return true, returning the
/// original and new nodes in Old and New. Otherwise, analyze the expression and
/// return a mask of Known bits for the expression (used to simplify the
/// caller). The Known bits may only be accurate for those bits in the
/// OriginalDemandedBits and OriginalDemandedElts.
bool TargetLowering::SimplifyDemandedBits(
SDValue Op, const APInt &OriginalDemandedBits,
const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
unsigned Depth, bool AssumeSingleUse) const {
unsigned BitWidth = OriginalDemandedBits.getBitWidth();
assert(Op.getScalarValueSizeInBits() == BitWidth &&
"Mask size mismatches value type size!");
unsigned NumElts = OriginalDemandedElts.getBitWidth();
assert((!Op.getValueType().isVector() ||
NumElts == Op.getValueType().getVectorNumElements()) &&
"Unexpected vector size");
APInt DemandedBits = OriginalDemandedBits;
APInt DemandedElts = OriginalDemandedElts;
SDLoc dl(Op);
auto &DL = TLO.DAG.getDataLayout();
// Don't know anything.
Known = KnownBits(BitWidth);
// Undef operand.
if (Op.isUndef())
return false;
if (Op.getOpcode() == ISD::Constant) {
// We know all of the bits for a constant!
Known.One = cast<ConstantSDNode>(Op)->getAPIntValue();
Known.Zero = ~Known.One;
return false;
}
// Other users may use these bits.
EVT VT = Op.getValueType();
if (!Op.getNode()->hasOneUse() && !AssumeSingleUse) {
if (Depth != 0) {
// If not at the root, Just compute the Known bits to
// simplify things downstream.
Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
return false;
}
// If this is the root being simplified, allow it to have multiple uses,
// just set the DemandedBits/Elts to all bits.
DemandedBits = APInt::getAllOnesValue(BitWidth);
DemandedElts = APInt::getAllOnesValue(NumElts);
} else if (OriginalDemandedBits == 0 || OriginalDemandedElts == 0) {
// Not demanding any bits/elts from Op.
return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
} else if (Depth == 6) { // Limit search depth.
return false;
}
KnownBits Known2, KnownOut;
switch (Op.getOpcode()) {
case ISD::SCALAR_TO_VECTOR: {
if (!DemandedElts[0])
return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
KnownBits SrcKnown;
SDValue Src = Op.getOperand(0);
unsigned SrcBitWidth = Src.getScalarValueSizeInBits();
APInt SrcDemandedBits = DemandedBits.zextOrSelf(SrcBitWidth);
if (SimplifyDemandedBits(Src, SrcDemandedBits, SrcKnown, TLO, Depth + 1))
return true;
Known = SrcKnown.zextOrTrunc(BitWidth, false);
break;
}
case ISD::BUILD_VECTOR:
// Collect the known bits that are shared by every demanded element.
// TODO: Call SimplifyDemandedBits for non-constant demanded elements.
Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
return false; // Don't fall through, will infinitely loop.
case ISD::LOAD: {
LoadSDNode *LD = cast<LoadSDNode>(Op);
if (getTargetConstantFromLoad(LD)) {
Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
return false; // Don't fall through, will infinitely loop.
}
break;
}
case ISD::INSERT_VECTOR_ELT: {
SDValue Vec = Op.getOperand(0);
SDValue Scl = Op.getOperand(1);
auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
EVT VecVT = Vec.getValueType();
// If index isn't constant, assume we need all vector elements AND the
// inserted element.
APInt DemandedVecElts(DemandedElts);
if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
unsigned Idx = CIdx->getZExtValue();
DemandedVecElts.clearBit(Idx);
// Inserted element is not required.
if (!DemandedElts[Idx])
return TLO.CombineTo(Op, Vec);
}
KnownBits KnownScl;
unsigned NumSclBits = Scl.getScalarValueSizeInBits();
APInt DemandedSclBits = DemandedBits.zextOrTrunc(NumSclBits);
if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
return true;
Known = KnownScl.zextOrTrunc(BitWidth, false);
KnownBits KnownVec;
if (SimplifyDemandedBits(Vec, DemandedBits, DemandedVecElts, KnownVec, TLO,
Depth + 1))
return true;
if (!!DemandedVecElts) {
Known.One &= KnownVec.One;
Known.Zero &= KnownVec.Zero;
}
return false;
}
case ISD::INSERT_SUBVECTOR: {
SDValue Base = Op.getOperand(0);
SDValue Sub = Op.getOperand(1);
EVT SubVT = Sub.getValueType();
unsigned NumSubElts = SubVT.getVectorNumElements();
// If index isn't constant, assume we need the original demanded base
// elements and ALL the inserted subvector elements.
APInt BaseElts = DemandedElts;
APInt SubElts = APInt::getAllOnesValue(NumSubElts);
if (isa<ConstantSDNode>(Op.getOperand(2))) {
const APInt &Idx = Op.getConstantOperandAPInt(2);
if (Idx.ule(NumElts - NumSubElts)) {
unsigned SubIdx = Idx.getZExtValue();
SubElts = DemandedElts.extractBits(NumSubElts, SubIdx);
BaseElts.insertBits(APInt::getNullValue(NumSubElts), SubIdx);
}
}
KnownBits KnownSub, KnownBase;
if (SimplifyDemandedBits(Sub, DemandedBits, SubElts, KnownSub, TLO,
Depth + 1))
return true;
if (SimplifyDemandedBits(Base, DemandedBits, BaseElts, KnownBase, TLO,
Depth + 1))
return true;
Known.Zero.setAllBits();
Known.One.setAllBits();
if (!!SubElts) {
Known.One &= KnownSub.One;
Known.Zero &= KnownSub.Zero;
}
if (!!BaseElts) {
Known.One &= KnownBase.One;
Known.Zero &= KnownBase.Zero;
}
break;
}
case ISD::CONCAT_VECTORS: {
Known.Zero.setAllBits();
Known.One.setAllBits();
EVT SubVT = Op.getOperand(0).getValueType();
unsigned NumSubVecs = Op.getNumOperands();
unsigned NumSubElts = SubVT.getVectorNumElements();
for (unsigned i = 0; i != NumSubVecs; ++i) {
APInt DemandedSubElts =
DemandedElts.extractBits(NumSubElts, i * NumSubElts);
if (SimplifyDemandedBits(Op.getOperand(i), DemandedBits, DemandedSubElts,
Known2, TLO, Depth + 1))
return true;
// Known bits are shared by every demanded subvector element.
if (!!DemandedSubElts) {
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
}
}
break;
}
case ISD::VECTOR_SHUFFLE: {
ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(Op)->getMask();
// Collect demanded elements from shuffle operands..
APInt DemandedLHS(NumElts, 0);
APInt DemandedRHS(NumElts, 0);
for (unsigned i = 0; i != NumElts; ++i) {
if (!DemandedElts[i])
continue;
int M = ShuffleMask[i];
if (M < 0) {
// For UNDEF elements, we don't know anything about the common state of
// the shuffle result.
DemandedLHS.clearAllBits();
DemandedRHS.clearAllBits();
break;
}
assert(0 <= M && M < (int)(2 * NumElts) && "Shuffle index out of range");
if (M < (int)NumElts)
DemandedLHS.setBit(M);
else
DemandedRHS.setBit(M - NumElts);
}
if (!!DemandedLHS || !!DemandedRHS) {
Known.Zero.setAllBits();
Known.One.setAllBits();
if (!!DemandedLHS) {
if (SimplifyDemandedBits(Op.getOperand(0), DemandedBits, DemandedLHS,
Known2, TLO, Depth + 1))
return true;
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
}
if (!!DemandedRHS) {
if (SimplifyDemandedBits(Op.getOperand(1), DemandedBits, DemandedRHS,
Known2, TLO, Depth + 1))
return true;
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
}
}
break;
}
case ISD::AND: {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
// If the RHS is a constant, check to see if the LHS would be zero without
// using the bits from the RHS. Below, we use knowledge about the RHS to
// simplify the LHS, here we're using information from the LHS to simplify
// the RHS.
if (ConstantSDNode *RHSC = isConstOrConstSplat(Op1)) {
// Do not increment Depth here; that can cause an infinite loop.
KnownBits LHSKnown = TLO.DAG.computeKnownBits(Op0, DemandedElts, Depth);
// If the LHS already has zeros where RHSC does, this 'and' is dead.
if ((LHSKnown.Zero & DemandedBits) ==
(~RHSC->getAPIntValue() & DemandedBits))
return TLO.CombineTo(Op, Op0);
// If any of the set bits in the RHS are known zero on the LHS, shrink
// the constant.
if (ShrinkDemandedConstant(Op, ~LHSKnown.Zero & DemandedBits, TLO))
return true;
// Bitwise-not (xor X, -1) is a special case: we don't usually shrink its
// constant, but if this 'and' is only clearing bits that were just set by
// the xor, then this 'and' can be eliminated by shrinking the mask of
// the xor. For example, for a 32-bit X:
// and (xor (srl X, 31), -1), 1 --> xor (srl X, 31), 1
if (isBitwiseNot(Op0) && Op0.hasOneUse() &&
LHSKnown.One == ~RHSC->getAPIntValue()) {
SDValue Xor = TLO.DAG.getNode(ISD::XOR, dl, VT, Op0.getOperand(0), Op1);
return TLO.CombineTo(Op, Xor);
}
}
if (SimplifyDemandedBits(Op1, DemandedBits, DemandedElts, Known, TLO,
Depth + 1))
return true;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
if (SimplifyDemandedBits(Op0, ~Known.Zero & DemandedBits, DemandedElts,
Known2, TLO, Depth + 1))
return true;
assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
// If all of the demanded bits are known one on one side, return the other.
// These bits cannot contribute to the result of the 'and'.
if (DemandedBits.isSubsetOf(Known2.Zero | Known.One))
return TLO.CombineTo(Op, Op0);
if (DemandedBits.isSubsetOf(Known.Zero | Known2.One))
return TLO.CombineTo(Op, Op1);
// If all of the demanded bits in the inputs are known zeros, return zero.
if (DemandedBits.isSubsetOf(Known.Zero | Known2.Zero))
return TLO.CombineTo(Op, TLO.DAG.getConstant(0, dl, VT));
// If the RHS is a constant, see if we can simplify it.
if (ShrinkDemandedConstant(Op, ~Known2.Zero & DemandedBits, TLO))
return true;
// If the operation can be done in a smaller type, do so.
if (ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO))
return true;
// Output known-1 bits are only known if set in both the LHS & RHS.
Known.One &= Known2.One;
// Output known-0 are known to be clear if zero in either the LHS | RHS.
Known.Zero |= Known2.Zero;
break;
}
case ISD::OR: {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
if (SimplifyDemandedBits(Op1, DemandedBits, DemandedElts, Known, TLO,
Depth + 1))
return true;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
if (SimplifyDemandedBits(Op0, ~Known.One & DemandedBits, DemandedElts,
Known2, TLO, Depth + 1))
return true;
assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
// If all of the demanded bits are known zero on one side, return the other.
// These bits cannot contribute to the result of the 'or'.
if (DemandedBits.isSubsetOf(Known2.One | Known.Zero))
return TLO.CombineTo(Op, Op0);
if (DemandedBits.isSubsetOf(Known.One | Known2.Zero))
return TLO.CombineTo(Op, Op1);
// If the RHS is a constant, see if we can simplify it.
if (ShrinkDemandedConstant(Op, DemandedBits, TLO))
return true;
// If the operation can be done in a smaller type, do so.
if (ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO))
return true;
// Output known-0 bits are only known if clear in both the LHS & RHS.
Known.Zero &= Known2.Zero;
// Output known-1 are known to be set if set in either the LHS | RHS.
Known.One |= Known2.One;
break;
}
case ISD::XOR: {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
if (SimplifyDemandedBits(Op1, DemandedBits, DemandedElts, Known, TLO,
Depth + 1))
return true;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
if (SimplifyDemandedBits(Op0, DemandedBits, DemandedElts, Known2, TLO,
Depth + 1))
return true;
assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
// If all of the demanded bits are known zero on one side, return the other.
// These bits cannot contribute to the result of the 'xor'.
if (DemandedBits.isSubsetOf(Known.Zero))
return TLO.CombineTo(Op, Op0);
if (DemandedBits.isSubsetOf(Known2.Zero))
return TLO.CombineTo(Op, Op1);
// If the operation can be done in a smaller type, do so.
if (ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO))
return true;
// If all of the unknown bits are known to be zero on one side or the other
// (but not both) turn this into an *inclusive* or.
// e.g. (A & C1)^(B & C2) -> (A & C1)|(B & C2) iff C1&C2 == 0
if (DemandedBits.isSubsetOf(Known.Zero | Known2.Zero))
return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::OR, dl, VT, Op0, Op1));
// Output known-0 bits are known if clear or set in both the LHS & RHS.
KnownOut.Zero = (Known.Zero & Known2.Zero) | (Known.One & Known2.One);
// Output known-1 are known to be set if set in only one of the LHS, RHS.
KnownOut.One = (Known.Zero & Known2.One) | (Known.One & Known2.Zero);
if (ConstantSDNode *C = isConstOrConstSplat(Op1)) {
// If one side is a constant, and all of the known set bits on the other
// side are also set in the constant, turn this into an AND, as we know
// the bits will be cleared.
// e.g. (X | C1) ^ C2 --> (X | C1) & ~C2 iff (C1&C2) == C2
// NB: it is okay if more bits are known than are requested
if (C->getAPIntValue() == Known2.One) {
SDValue ANDC =
TLO.DAG.getConstant(~C->getAPIntValue() & DemandedBits, dl, VT);
return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, dl, VT, Op0, ANDC));
}
// If the RHS is a constant, see if we can change it. Don't alter a -1
// constant because that's a 'not' op, and that is better for combining
// and codegen.
if (!C->isAllOnesValue()) {
if (DemandedBits.isSubsetOf(C->getAPIntValue())) {
// We're flipping all demanded bits. Flip the undemanded bits too.
SDValue New = TLO.DAG.getNOT(dl, Op0, VT);
return TLO.CombineTo(Op, New);
}
// If we can't turn this into a 'not', try to shrink the constant.
if (ShrinkDemandedConstant(Op, DemandedBits, TLO))
return true;
}
}
Known = std::move(KnownOut);
break;
}
case ISD::SELECT:
if (SimplifyDemandedBits(Op.getOperand(2), DemandedBits, Known, TLO,
Depth + 1))
return true;
if (SimplifyDemandedBits(Op.getOperand(1), DemandedBits, Known2, TLO,
Depth + 1))
return true;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
// If the operands are constants, see if we can simplify them.
if (ShrinkDemandedConstant(Op, DemandedBits, TLO))
return true;
// Only known if known in both the LHS and RHS.
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
break;
case ISD::SELECT_CC:
if (SimplifyDemandedBits(Op.getOperand(3), DemandedBits, Known, TLO,
Depth + 1))
return true;
if (SimplifyDemandedBits(Op.getOperand(2), DemandedBits, Known2, TLO,
Depth + 1))
return true;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
// If the operands are constants, see if we can simplify them.
if (ShrinkDemandedConstant(Op, DemandedBits, TLO))
return true;
// Only known if known in both the LHS and RHS.
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
break;
case ISD::SETCC: {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
// If (1) we only need the sign-bit, (2) the setcc operands are the same
// width as the setcc result, and (3) the result of a setcc conforms to 0 or
// -1, we may be able to bypass the setcc.
if (DemandedBits.isSignMask() &&
Op0.getScalarValueSizeInBits() == BitWidth &&
getBooleanContents(VT) ==
BooleanContent::ZeroOrNegativeOneBooleanContent) {
// If we're testing X < 0, then this compare isn't needed - just use X!
// FIXME: We're limiting to integer types here, but this should also work
// if we don't care about FP signed-zero. The use of SETLT with FP means
// that we don't care about NaNs.
if (CC == ISD::SETLT && Op1.getValueType().isInteger() &&
(isNullConstant(Op1) || ISD::isBuildVectorAllZeros(Op1.getNode())))
return TLO.CombineTo(Op, Op0);
// TODO: Should we check for other forms of sign-bit comparisons?
// Examples: X <= -1, X >= 0
}
if (getBooleanContents(Op0.getValueType()) ==
TargetLowering::ZeroOrOneBooleanContent &&
BitWidth > 1)
Known.Zero.setBitsFrom(1);
break;
}
case ISD::SHL: {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
if (ConstantSDNode *SA = isConstOrConstSplat(Op1, DemandedElts)) {
// If the shift count is an invalid immediate, don't do anything.
if (SA->getAPIntValue().uge(BitWidth))
break;
unsigned ShAmt = SA->getZExtValue();
if (ShAmt == 0)
return TLO.CombineTo(Op, Op0);
// If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
// single shift. We can do this if the bottom bits (which are shifted
// out) are never demanded.
// TODO - support non-uniform vector amounts.
if (Op0.getOpcode() == ISD::SRL) {
if ((DemandedBits & APInt::getLowBitsSet(BitWidth, ShAmt)) == 0) {
if (ConstantSDNode *SA2 =
isConstOrConstSplat(Op0.getOperand(1), DemandedElts)) {
if (SA2->getAPIntValue().ult(BitWidth)) {
unsigned C1 = SA2->getZExtValue();
unsigned Opc = ISD::SHL;
int Diff = ShAmt - C1;
if (Diff < 0) {
Diff = -Diff;
Opc = ISD::SRL;
}
SDValue NewSA = TLO.DAG.getConstant(Diff, dl, Op1.getValueType());
return TLO.CombineTo(
Op, TLO.DAG.getNode(Opc, dl, VT, Op0.getOperand(0), NewSA));
}
}
}
}
if (SimplifyDemandedBits(Op0, DemandedBits.lshr(ShAmt), DemandedElts,
Known, TLO, Depth + 1))
return true;
// Try shrinking the operation as long as the shift amount will still be
// in range.
if ((ShAmt < DemandedBits.getActiveBits()) &&
ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO))
return true;
// Convert (shl (anyext x, c)) to (anyext (shl x, c)) if the high bits
// are not demanded. This will likely allow the anyext to be folded away.
if (Op0.getOpcode() == ISD::ANY_EXTEND) {
SDValue InnerOp = Op0.getOperand(0);
EVT InnerVT = InnerOp.getValueType();
unsigned InnerBits = InnerVT.getScalarSizeInBits();
if (ShAmt < InnerBits && DemandedBits.getActiveBits() <= InnerBits &&
isTypeDesirableForOp(ISD::SHL, InnerVT)) {
EVT ShTy = getShiftAmountTy(InnerVT, DL);
if (!APInt(BitWidth, ShAmt).isIntN(ShTy.getSizeInBits()))
ShTy = InnerVT;
SDValue NarrowShl =
TLO.DAG.getNode(ISD::SHL, dl, InnerVT, InnerOp,
TLO.DAG.getConstant(ShAmt, dl, ShTy));
return TLO.CombineTo(
Op, TLO.DAG.getNode(ISD::ANY_EXTEND, dl, VT, NarrowShl));
}
// Repeat the SHL optimization above in cases where an extension
// intervenes: (shl (anyext (shr x, c1)), c2) to
// (shl (anyext x), c2-c1). This requires that the bottom c1 bits
// aren't demanded (as above) and that the shifted upper c1 bits of
// x aren't demanded.
if (Op0.hasOneUse() && InnerOp.getOpcode() == ISD::SRL &&
InnerOp.hasOneUse()) {
if (ConstantSDNode *SA2 =
isConstOrConstSplat(InnerOp.getOperand(1))) {
unsigned InnerShAmt = SA2->getLimitedValue(InnerBits);
if (InnerShAmt < ShAmt && InnerShAmt < InnerBits &&
DemandedBits.getActiveBits() <=
(InnerBits - InnerShAmt + ShAmt) &&
DemandedBits.countTrailingZeros() >= ShAmt) {
SDValue NewSA = TLO.DAG.getConstant(ShAmt - InnerShAmt, dl,
Op1.getValueType());
SDValue NewExt = TLO.DAG.getNode(ISD::ANY_EXTEND, dl, VT,
InnerOp.getOperand(0));
return TLO.CombineTo(
Op, TLO.DAG.getNode(ISD::SHL, dl, VT, NewExt, NewSA));
}
}
}
}
Known.Zero <<= ShAmt;
Known.One <<= ShAmt;
// low bits known zero.
Known.Zero.setLowBits(ShAmt);
}
break;
}
case ISD::SRL: {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
if (ConstantSDNode *SA = isConstOrConstSplat(Op1, DemandedElts)) {
// If the shift count is an invalid immediate, don't do anything.
if (SA->getAPIntValue().uge(BitWidth))
break;
unsigned ShAmt = SA->getZExtValue();
if (ShAmt == 0)
return TLO.CombineTo(Op, Op0);
EVT ShiftVT = Op1.getValueType();
APInt InDemandedMask = (DemandedBits << ShAmt);
// If the shift is exact, then it does demand the low bits (and knows that
// they are zero).
if (Op->getFlags().hasExact())
InDemandedMask.setLowBits(ShAmt);
// If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
// single shift. We can do this if the top bits (which are shifted out)
// are never demanded.
// TODO - support non-uniform vector amounts.
if (Op0.getOpcode() == ISD::SHL) {
if (ConstantSDNode *SA2 =
isConstOrConstSplat(Op0.getOperand(1), DemandedElts)) {
if ((DemandedBits & APInt::getHighBitsSet(BitWidth, ShAmt)) == 0) {
if (SA2->getAPIntValue().ult(BitWidth)) {
unsigned C1 = SA2->getZExtValue();
unsigned Opc = ISD::SRL;
int Diff = ShAmt - C1;
if (Diff < 0) {
Diff = -Diff;
Opc = ISD::SHL;
}
SDValue NewSA = TLO.DAG.getConstant(Diff, dl, ShiftVT);
return TLO.CombineTo(
Op, TLO.DAG.getNode(Opc, dl, VT, Op0.getOperand(0), NewSA));
}
}
}
}
// Compute the new bits that are at the top now.
if (SimplifyDemandedBits(Op0, InDemandedMask, DemandedElts, Known, TLO,
Depth + 1))
return true;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
Known.Zero.lshrInPlace(ShAmt);
Known.One.lshrInPlace(ShAmt);
Known.Zero.setHighBits(ShAmt); // High bits known zero.
}
break;
}
case ISD::SRA: {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
// If this is an arithmetic shift right and only the low-bit is set, we can
// always convert this into a logical shr, even if the shift amount is
// variable. The low bit of the shift cannot be an input sign bit unless
// the shift amount is >= the size of the datatype, which is undefined.
if (DemandedBits.isOneValue())
return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT, Op0, Op1));
if (ConstantSDNode *SA = isConstOrConstSplat(Op1, DemandedElts)) {
// If the shift count is an invalid immediate, don't do anything.
if (SA->getAPIntValue().uge(BitWidth))
break;
unsigned ShAmt = SA->getZExtValue();
if (ShAmt == 0)
return TLO.CombineTo(Op, Op0);
APInt InDemandedMask = (DemandedBits << ShAmt);
// If the shift is exact, then it does demand the low bits (and knows that
// they are zero).
if (Op->getFlags().hasExact())
InDemandedMask.setLowBits(ShAmt);
// If any of the demanded bits are produced by the sign extension, we also
// demand the input sign bit.
if (DemandedBits.countLeadingZeros() < ShAmt)
InDemandedMask.setSignBit();
if (SimplifyDemandedBits(Op0, InDemandedMask, DemandedElts, Known, TLO,
Depth + 1))
return true;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
Known.Zero.lshrInPlace(ShAmt);
Known.One.lshrInPlace(ShAmt);
// If the input sign bit is known to be zero, or if none of the top bits
// are demanded, turn this into an unsigned shift right.
if (Known.Zero[BitWidth - ShAmt - 1] ||
DemandedBits.countLeadingZeros() >= ShAmt) {
SDNodeFlags Flags;
Flags.setExact(Op->getFlags().hasExact());
return TLO.CombineTo(
Op, TLO.DAG.getNode(ISD::SRL, dl, VT, Op0, Op1, Flags));
}
int Log2 = DemandedBits.exactLogBase2();
if (Log2 >= 0) {
// The bit must come from the sign.
SDValue NewSA =
TLO.DAG.getConstant(BitWidth - 1 - Log2, dl, Op1.getValueType());
return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT, Op0, NewSA));
}
if (Known.One[BitWidth - ShAmt - 1])
// New bits are known one.
Known.One.setHighBits(ShAmt);
}
break;
}
case ISD::FSHL:
case ISD::FSHR: {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue Op2 = Op.getOperand(2);
bool IsFSHL = (Op.getOpcode() == ISD::FSHL);
if (ConstantSDNode *SA = isConstOrConstSplat(Op2, DemandedElts)) {
unsigned Amt = SA->getAPIntValue().urem(BitWidth);
// For fshl, 0-shift returns the 1st arg.
// For fshr, 0-shift returns the 2nd arg.
if (Amt == 0) {
if (SimplifyDemandedBits(IsFSHL ? Op0 : Op1, DemandedBits, DemandedElts,
Known, TLO, Depth + 1))
return true;
break;
}
// fshl: (Op0 << Amt) | (Op1 >> (BW - Amt))
// fshr: (Op0 << (BW - Amt)) | (Op1 >> Amt)
APInt Demanded0 = DemandedBits.lshr(IsFSHL ? Amt : (BitWidth - Amt));
APInt Demanded1 = DemandedBits << (IsFSHL ? (BitWidth - Amt) : Amt);
if (SimplifyDemandedBits(Op0, Demanded0, DemandedElts, Known2, TLO,
Depth + 1))
return true;
if (SimplifyDemandedBits(Op1, Demanded1, DemandedElts, Known, TLO,
Depth + 1))
return true;
Known2.One <<= (IsFSHL ? Amt : (BitWidth - Amt));
Known2.Zero <<= (IsFSHL ? Amt : (BitWidth - Amt));
Known.One.lshrInPlace(IsFSHL ? (BitWidth - Amt) : Amt);
Known.Zero.lshrInPlace(IsFSHL ? (BitWidth - Amt) : Amt);
Known.One |= Known2.One;
Known.Zero |= Known2.Zero;
}
break;
}
case ISD::BITREVERSE: {
SDValue Src = Op.getOperand(0);
APInt DemandedSrcBits = DemandedBits.reverseBits();
if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, Known2, TLO,
Depth + 1))
return true;
Known.One = Known2.One.reverseBits();
Known.Zero = Known2.Zero.reverseBits();
break;
}
case ISD::SIGN_EXTEND_INREG: {
SDValue Op0 = Op.getOperand(0);
EVT ExVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
unsigned ExVTBits = ExVT.getScalarSizeInBits();
// If we only care about the highest bit, don't bother shifting right.
if (DemandedBits.isSignMask()) {
unsigned NumSignBits = TLO.DAG.ComputeNumSignBits(Op0);
bool AlreadySignExtended = NumSignBits >= BitWidth - ExVTBits + 1;
// However if the input is already sign extended we expect the sign
// extension to be dropped altogether later and do not simplify.
if (!AlreadySignExtended) {
// Compute the correct shift amount type, which must be getShiftAmountTy
// for scalar types after legalization.
EVT ShiftAmtTy = VT;
if (TLO.LegalTypes() && !ShiftAmtTy.isVector())
ShiftAmtTy = getShiftAmountTy(ShiftAmtTy, DL);
SDValue ShiftAmt =
TLO.DAG.getConstant(BitWidth - ExVTBits, dl, ShiftAmtTy);
return TLO.CombineTo(Op,
TLO.DAG.getNode(ISD::SHL, dl, VT, Op0, ShiftAmt));
}
}
// If none of the extended bits are demanded, eliminate the sextinreg.
if (DemandedBits.getActiveBits() <= ExVTBits)
return TLO.CombineTo(Op, Op0);
APInt InputDemandedBits = DemandedBits.getLoBits(ExVTBits);
// Since the sign extended bits are demanded, we know that the sign
// bit is demanded.
InputDemandedBits.setBit(ExVTBits - 1);
if (SimplifyDemandedBits(Op0, InputDemandedBits, Known, TLO, Depth + 1))
return true;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
// If the sign bit of the input is known set or clear, then we know the
// top bits of the result.
// If the input sign bit is known zero, convert this into a zero extension.
if (Known.Zero[ExVTBits - 1])
return TLO.CombineTo(
Op, TLO.DAG.getZeroExtendInReg(Op0, dl, ExVT.getScalarType()));
APInt Mask = APInt::getLowBitsSet(BitWidth, ExVTBits);
if (Known.One[ExVTBits - 1]) { // Input sign bit known set
Known.One.setBitsFrom(ExVTBits);
Known.Zero &= Mask;
} else { // Input sign bit unknown
Known.Zero &= Mask;
Known.One &= Mask;
}
break;
}
case ISD::BUILD_PAIR: {
EVT HalfVT = Op.getOperand(0).getValueType();
unsigned HalfBitWidth = HalfVT.getScalarSizeInBits();
APInt MaskLo = DemandedBits.getLoBits(HalfBitWidth).trunc(HalfBitWidth);
APInt MaskHi = DemandedBits.getHiBits(HalfBitWidth).trunc(HalfBitWidth);
KnownBits KnownLo, KnownHi;
if (SimplifyDemandedBits(Op.getOperand(0), MaskLo, KnownLo, TLO, Depth + 1))
return true;
if (SimplifyDemandedBits(Op.getOperand(1), MaskHi, KnownHi, TLO, Depth + 1))
return true;
Known.Zero = KnownLo.Zero.zext(BitWidth) |
KnownHi.Zero.zext(BitWidth).shl(HalfBitWidth);
Known.One = KnownLo.One.zext(BitWidth) |
KnownHi.One.zext(BitWidth).shl(HalfBitWidth);
break;
}
case ISD::ZERO_EXTEND:
case ISD::ZERO_EXTEND_VECTOR_INREG: {
SDValue Src = Op.getOperand(0);
EVT SrcVT = Src.getValueType();
unsigned InBits = SrcVT.getScalarSizeInBits();
unsigned InElts = SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1;
bool IsVecInReg = Op.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG;
// If none of the top bits are demanded, convert this into an any_extend.
if (DemandedBits.getActiveBits() <= InBits) {
// If we only need the non-extended bits of the bottom element
// then we can just bitcast to the result.
if (IsVecInReg && DemandedElts == 1 &&
VT.getSizeInBits() == SrcVT.getSizeInBits() &&
TLO.DAG.getDataLayout().isLittleEndian())
return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Src));
unsigned Opc =
IsVecInReg ? ISD::ANY_EXTEND_VECTOR_INREG : ISD::ANY_EXTEND;
if (!TLO.LegalOperations() || isOperationLegal(Opc, VT))
return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT, Src));
}
APInt InDemandedBits = DemandedBits.trunc(InBits);
APInt InDemandedElts = DemandedElts.zextOrSelf(InElts);
if (SimplifyDemandedBits(Src, InDemandedBits, InDemandedElts, Known, TLO,
Depth + 1))
return true;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
assert(Known.getBitWidth() == InBits && "Src width has changed?");
Known = Known.zext(BitWidth, true /* ExtendedBitsAreKnownZero */);
break;
}
case ISD::SIGN_EXTEND:
case ISD::SIGN_EXTEND_VECTOR_INREG: {
SDValue Src = Op.getOperand(0);
EVT SrcVT = Src.getValueType();
unsigned InBits = SrcVT.getScalarSizeInBits();
unsigned InElts = SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1;
bool IsVecInReg = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG;
// If none of the top bits are demanded, convert this into an any_extend.
if (DemandedBits.getActiveBits() <= InBits) {
// If we only need the non-extended bits of the bottom element
// then we can just bitcast to the result.
if (IsVecInReg && DemandedElts == 1 &&
VT.getSizeInBits() == SrcVT.getSizeInBits() &&
TLO.DAG.getDataLayout().isLittleEndian())
return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Src));
unsigned Opc =
IsVecInReg ? ISD::ANY_EXTEND_VECTOR_INREG : ISD::ANY_EXTEND;
if (!TLO.LegalOperations() || isOperationLegal(Opc, VT))
return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT, Src));
}
APInt InDemandedBits = DemandedBits.trunc(InBits);
APInt InDemandedElts = DemandedElts.zextOrSelf(InElts);
// Since some of the sign extended bits are demanded, we know that the sign
// bit is demanded.
InDemandedBits.setBit(InBits - 1);
if (SimplifyDemandedBits(Src, InDemandedBits, InDemandedElts, Known, TLO,
Depth + 1))
return true;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
assert(Known.getBitWidth() == InBits && "Src width has changed?");
// If the sign bit is known one, the top bits match.
Known = Known.sext(BitWidth);
// If the sign bit is known zero, convert this to a zero extend.
if (Known.isNonNegative()) {
unsigned Opc =
IsVecInReg ? ISD::ZERO_EXTEND_VECTOR_INREG : ISD::ZERO_EXTEND;
if (!TLO.LegalOperations() || isOperationLegal(Opc, VT))
return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT, Src));
}
break;
}
case ISD::ANY_EXTEND:
case ISD::ANY_EXTEND_VECTOR_INREG: {
SDValue Src = Op.getOperand(0);
EVT SrcVT = Src.getValueType();
unsigned InBits = SrcVT.getScalarSizeInBits();
unsigned InElts = SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1;
bool IsVecInReg = Op.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG;
// If we only need the bottom element then we can just bitcast.
// TODO: Handle ANY_EXTEND?
if (IsVecInReg && DemandedElts == 1 &&
VT.getSizeInBits() == SrcVT.getSizeInBits() &&
TLO.DAG.getDataLayout().isLittleEndian())
return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Src));
APInt InDemandedBits = DemandedBits.trunc(InBits);
APInt InDemandedElts = DemandedElts.zextOrSelf(InElts);
if (SimplifyDemandedBits(Src, InDemandedBits, InDemandedElts, Known, TLO,
Depth + 1))
return true;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
assert(Known.getBitWidth() == InBits && "Src width has changed?");
Known = Known.zext(BitWidth, false /* => any extend */);
break;
}
case ISD::TRUNCATE: {
SDValue Src = Op.getOperand(0);
// Simplify the input, using demanded bit information, and compute the known
// zero/one bits live out.
unsigned OperandBitWidth = Src.getScalarValueSizeInBits();
APInt TruncMask = DemandedBits.zext(OperandBitWidth);
if (SimplifyDemandedBits(Src, TruncMask, Known, TLO, Depth + 1))
return true;
Known = Known.trunc(BitWidth);
// If the input is only used by this truncate, see if we can shrink it based
// on the known demanded bits.
if (Src.getNode()->hasOneUse()) {
switch (Src.getOpcode()) {
default:
break;
case ISD::SRL:
// Shrink SRL by a constant if none of the high bits shifted in are
// demanded.
if (TLO.LegalTypes() && !isTypeDesirableForOp(ISD::SRL, VT))
// Do not turn (vt1 truncate (vt2 srl)) into (vt1 srl) if vt1 is
// undesirable.
break;
auto *ShAmt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
if (!ShAmt || ShAmt->getAPIntValue().uge(BitWidth))
break;
SDValue Shift = Src.getOperand(1);
uint64_t ShVal = ShAmt->getZExtValue();
if (TLO.LegalTypes())
Shift = TLO.DAG.getConstant(ShVal, dl, getShiftAmountTy(VT, DL));
APInt HighBits =
APInt::getHighBitsSet(OperandBitWidth, OperandBitWidth - BitWidth);
HighBits.lshrInPlace(ShVal);
HighBits = HighBits.trunc(BitWidth);
if (!(HighBits & DemandedBits)) {
// None of the shifted in bits are needed. Add a truncate of the
// shift input, then shift it.
SDValue NewTrunc =
TLO.DAG.getNode(ISD::TRUNCATE, dl, VT, Src.getOperand(0));
return TLO.CombineTo(
Op, TLO.DAG.getNode(ISD::SRL, dl, VT, NewTrunc, Shift));
}
break;
}
}
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
break;
}
case ISD::AssertZext: {
// AssertZext demands all of the high bits, plus any of the low bits
// demanded by its users.
EVT ZVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
APInt InMask = APInt::getLowBitsSet(BitWidth, ZVT.getSizeInBits());
if (SimplifyDemandedBits(Op.getOperand(0), ~InMask | DemandedBits, Known,
TLO, Depth + 1))
return true;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
Known.Zero |= ~InMask;
break;
}
case ISD::EXTRACT_VECTOR_ELT: {
SDValue Src = Op.getOperand(0);
SDValue Idx = Op.getOperand(1);
unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
unsigned EltBitWidth = Src.getScalarValueSizeInBits();
// Demand the bits from every vector element without a constant index.
APInt DemandedSrcElts = APInt::getAllOnesValue(NumSrcElts);
if (auto *CIdx = dyn_cast<ConstantSDNode>(Idx))
if (CIdx->getAPIntValue().ult(NumSrcElts))
DemandedSrcElts = APInt::getOneBitSet(NumSrcElts, CIdx->getZExtValue());
// If BitWidth > EltBitWidth the value is anyext:ed. So we do not know
// anything about the extended bits.
APInt DemandedSrcBits = DemandedBits;
if (BitWidth > EltBitWidth)
DemandedSrcBits = DemandedSrcBits.trunc(EltBitWidth);
if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedSrcElts, Known2, TLO,
Depth + 1))
return true;
Known = Known2;
if (BitWidth > EltBitWidth)
Known = Known.zext(BitWidth, false /* => any extend */);
break;
}
case ISD::BITCAST: {
SDValue Src = Op.getOperand(0);
EVT SrcVT = Src.getValueType();
unsigned NumSrcEltBits = SrcVT.getScalarSizeInBits();
// If this is an FP->Int bitcast and if the sign bit is the only
// thing demanded, turn this into a FGETSIGN.
if (!TLO.LegalOperations() && !VT.isVector() && !SrcVT.isVector() &&
DemandedBits == APInt::getSignMask(Op.getValueSizeInBits()) &&
SrcVT.isFloatingPoint()) {
bool OpVTLegal = isOperationLegalOrCustom(ISD::FGETSIGN, VT);
bool i32Legal = isOperationLegalOrCustom(ISD::FGETSIGN, MVT::i32);
if ((OpVTLegal || i32Legal) && VT.isSimple() && SrcVT != MVT::f16 &&
SrcVT != MVT::f128) {
// Cannot eliminate/lower SHL for f128 yet.
EVT Ty = OpVTLegal ? VT : MVT::i32;
// Make a FGETSIGN + SHL to move the sign bit into the appropriate
// place. We expect the SHL to be eliminated by other optimizations.
SDValue Sign = TLO.DAG.getNode(ISD::FGETSIGN, dl, Ty, Src);
unsigned OpVTSizeInBits = Op.getValueSizeInBits();
if (!OpVTLegal && OpVTSizeInBits > 32)
Sign = TLO.DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Sign);
unsigned ShVal = Op.getValueSizeInBits() - 1;
SDValue ShAmt = TLO.DAG.getConstant(ShVal, dl, VT);
return TLO.CombineTo(Op,
TLO.DAG.getNode(ISD::SHL, dl, VT, Sign, ShAmt));
}
}
// Bitcast from a vector using SimplifyDemanded Bits/VectorElts.
// Demand the elt/bit if any of the original elts/bits are demanded.
// TODO - bigendian once we have test coverage.
// TODO - bool vectors once SimplifyDemandedVectorElts has SETCC support.
if (SrcVT.isVector() && NumSrcEltBits > 1 &&
(BitWidth % NumSrcEltBits) == 0 &&
TLO.DAG.getDataLayout().isLittleEndian()) {
unsigned Scale = BitWidth / NumSrcEltBits;
unsigned NumSrcElts = SrcVT.getVectorNumElements();
APInt DemandedSrcBits = APInt::getNullValue(NumSrcEltBits);
APInt DemandedSrcElts = APInt::getNullValue(NumSrcElts);
for (unsigned i = 0; i != Scale; ++i) {
unsigned Offset = i * NumSrcEltBits;
APInt Sub = DemandedBits.extractBits(NumSrcEltBits, Offset);
if (!Sub.isNullValue()) {
DemandedSrcBits |= Sub;
for (unsigned j = 0; j != NumElts; ++j)
if (DemandedElts[j])
DemandedSrcElts.setBit((j * Scale) + i);
}
}
APInt KnownSrcUndef, KnownSrcZero;
if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, KnownSrcUndef,
KnownSrcZero, TLO, Depth + 1))
return true;
KnownBits KnownSrcBits;
if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedSrcElts,
KnownSrcBits, TLO, Depth + 1))
return true;
} else if ((NumSrcEltBits % BitWidth) == 0 &&
TLO.DAG.getDataLayout().isLittleEndian()) {
unsigned Scale = NumSrcEltBits / BitWidth;
unsigned NumSrcElts = SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1;
APInt DemandedSrcBits = APInt::getNullValue(NumSrcEltBits);
APInt DemandedSrcElts = APInt::getNullValue(NumSrcElts);
for (unsigned i = 0; i != NumElts; ++i)
if (DemandedElts[i]) {
unsigned Offset = (i % Scale) * BitWidth;
DemandedSrcBits.insertBits(DemandedBits, Offset);
DemandedSrcElts.setBit(i / Scale);
}
if (SrcVT.isVector()) {
APInt KnownSrcUndef, KnownSrcZero;
if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, KnownSrcUndef,
KnownSrcZero, TLO, Depth + 1))
return true;
}
KnownBits KnownSrcBits;
if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedSrcElts,
KnownSrcBits, TLO, Depth + 1))
return true;
}
// If this is a bitcast, let computeKnownBits handle it. Only do this on a
// recursive call where Known may be useful to the caller.
if (Depth > 0) {
Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
return false;
}
break;
}
case ISD::ADD:
case ISD::MUL:
case ISD::SUB: {
// Add, Sub, and Mul don't demand any bits in positions beyond that
// of the highest bit demanded of them.
SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
unsigned DemandedBitsLZ = DemandedBits.countLeadingZeros();
APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
if (SimplifyDemandedBits(Op0, LoMask, DemandedElts, Known2, TLO,
Depth + 1) ||
SimplifyDemandedBits(Op1, LoMask, DemandedElts, Known2, TLO,
Depth + 1) ||
// See if the operation should be performed at a smaller bit width.
ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO)) {
SDNodeFlags Flags = Op.getNode()->getFlags();
if (Flags.hasNoSignedWrap() || Flags.hasNoUnsignedWrap()) {
// Disable the nsw and nuw flags. We can no longer guarantee that we
// won't wrap after simplification.
Flags.setNoSignedWrap(false);
Flags.setNoUnsignedWrap(false);
SDValue NewOp =
TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Op1, Flags);
return TLO.CombineTo(Op, NewOp);
}
return true;
}
// If we have a constant operand, we may be able to turn it into -1 if we
// do not demand the high bits. This can make the constant smaller to
// encode, allow more general folding, or match specialized instruction
// patterns (eg, 'blsr' on x86). Don't bother changing 1 to -1 because that
// is probably not useful (and could be detrimental).
ConstantSDNode *C = isConstOrConstSplat(Op1);
APInt HighMask = APInt::getHighBitsSet(BitWidth, DemandedBitsLZ);
if (C && !C->isAllOnesValue() && !C->isOne() &&
(C->getAPIntValue() | HighMask).isAllOnesValue()) {
SDValue Neg1 = TLO.DAG.getAllOnesConstant(dl, VT);
// We can't guarantee that the new math op doesn't wrap, so explicitly
// clear those flags to prevent folding with a potential existing node
// that has those flags set.
SDNodeFlags Flags;
Flags.setNoSignedWrap(false);
Flags.setNoUnsignedWrap(false);
SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Neg1, Flags);
return TLO.CombineTo(Op, NewOp);
}
LLVM_FALLTHROUGH;
}
default:
if (Op.getOpcode() >= ISD::BUILTIN_OP_END) {
if (SimplifyDemandedBitsForTargetNode(Op, DemandedBits, DemandedElts,
Known, TLO, Depth))
return true;
break;
}
// Just use computeKnownBits to compute output bits.
Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
break;
}
// If we know the value of all of the demanded bits, return this as a
// constant.
if (DemandedBits.isSubsetOf(Known.Zero | Known.One)) {
// Avoid folding to a constant if any OpaqueConstant is involved.
const SDNode *N = Op.getNode();
for (SDNodeIterator I = SDNodeIterator::begin(N),
E = SDNodeIterator::end(N);
I != E; ++I) {
SDNode *Op = *I;
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op))
if (C->isOpaque())
return false;
}
// TODO: Handle float bits as well.
if (VT.isInteger())
return TLO.CombineTo(Op, TLO.DAG.getConstant(Known.One, dl, VT));
}
return false;
}
/// DAG-combiner entry point for demanded-vector-element simplification.
///
/// Builds a TargetLoweringOpt from the combiner's current legalization phase,
/// runs the TargetLoweringOpt-based overload on \p Op, and, only if that
/// overload reports a simplification, queues \p Op's node on the combiner
/// worklist and commits the pending replacement.
///
/// \returns the result of the TargetLoweringOpt-based overload.
bool TargetLowering::SimplifyDemandedVectorElts(SDValue Op,
                                                const APInt &DemandedElts,
                                                APInt &KnownUndef,
                                                APInt &KnownZero,
                                                DAGCombinerInfo &DCI) const {
  const bool TypesLegalized = !DCI.isBeforeLegalize();
  const bool OpsLegalized = !DCI.isBeforeLegalizeOps();
  TargetLoweringOpt TLO(DCI.DAG, TypesLegalized, OpsLegalized);

  // Nothing changed: leave the combiner state untouched.
  if (!SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, TLO))
    return false;

  DCI.AddToWorklist(Op.getNode());
  DCI.CommitTargetLoweringOpt(TLO);
  return true;
}
/// For the vector binary operation \p BO, work out which result elements are
/// undef, given per-element undef masks (\p UndefOp0, \p UndefOp1) for its two
/// operands. Both masks must be as wide as the result's element count.
///
/// An element is reported undef only when both of its inputs are available as
/// an undef or non-opaque constant scalar of the element type and getNode()
/// folds the scalar operation to undef.
static APInt getKnownUndefForVectorBinop(SDValue BO, SelectionDAG &DAG,
                                         const APInt &UndefOp0,
                                         const APInt &UndefOp1) {
  EVT VT = BO.getValueType();
  assert(DAG.getTargetLoweringInfo().isBinOp(BO.getOpcode()) && VT.isVector() &&
         "Vector binop only");

  EVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  assert(UndefOp0.getBitWidth() == NumElts &&
         UndefOp1.getBitWidth() == NumElts && "Bad type for undef analysis");

  // Produce the scalar feeding lane `Lane` of `Vec` if it is undef or a
  // foldable constant; otherwise produce an empty SDValue.
  auto ScalarForLane = [&](SDValue Vec, unsigned Lane,
                           const APInt &UndefLanes) -> SDValue {
    if (UndefLanes[Lane])
      return DAG.getUNDEF(EltVT);

    auto *BV = dyn_cast<BuildVectorSDNode>(Vec);
    if (!BV)
      return SDValue();

    // Be careful that the later getNode() call does not create temporary
    // nodes. Opaque integers are skipped because they do not constant fold.
    SDValue Elt = BV->getOperand(Lane);
    if (isa<ConstantFPSDNode>(Elt) || Elt.isUndef())
      return Elt;
    auto *IntC = dyn_cast<ConstantSDNode>(Elt);
    if (IntC && !IntC->isOpaque())
      return Elt;
    return SDValue();
  };

  const unsigned Opcode = BO.getOpcode();
  SDValue LHS = BO.getOperand(0);
  SDValue RHS = BO.getOperand(1);

  APInt Result = APInt::getNullValue(NumElts);
  for (unsigned Lane = 0; Lane != NumElts; ++Lane) {
    // TODO: Ideally we would use FoldConstantArithmetic() here, but that does
    // not handle FP constants. The code within getNode() should be refactored
    // to avoid the danger of creating a bogus temporary node here.
    SDValue S0 = ScalarForLane(LHS, Lane, UndefOp0);
    SDValue S1 = ScalarForLane(RHS, Lane, UndefOp1);

    // Both inputs must be known scalars of exactly the element type.
    if (!S0 || !S1)
      continue;
    if (S0.getValueType() != EltVT || S1.getValueType() != EltVT)
      continue;

    if (DAG.getNode(Opcode, SDLoc(BO), EltVT, S0, S1).isUndef())
      Result.setBit(Lane);
  }
  return Result;
}
/// Try to simplify the vector value \p Op given that only the elements
/// selected by \p OriginalDemandedElts are needed.
///
/// \p Op must have a vector type whose element count equals the bit width of
/// \p OriginalDemandedElts (both are asserted). \p KnownUndef and
/// \p KnownZero are cleared on entry and then hold one bit per element of
/// \p Op for the elements this analysis established as undef / zero.
///
/// Unless \p AssumeSingleUse is set, a node with more than one use is treated
/// as if every element were demanded. Recursion stops once \p Depth reaches 6.
///
/// \returns true when a recursive call reported a simplification or when a
/// replacement was handed to \p TLO via CombineTo (presumably CombineTo always
/// returns true -- confirm against TargetLoweringOpt); false otherwise.
bool TargetLowering::SimplifyDemandedVectorElts(
SDValue Op, const APInt &OriginalDemandedElts, APInt &KnownUndef,
APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth,
bool AssumeSingleUse) const {
EVT VT = Op.getValueType();
// Work on a private copy: the mask may be widened below, while the default
// case still forwards the caller's original mask.
APInt DemandedElts = OriginalDemandedElts;
unsigned NumElts = DemandedElts.getBitWidth();
assert(VT.isVector() && "Expected vector op");
assert(VT.getVectorNumElements() == NumElts &&
"Mask size mismatches value type element count!");
KnownUndef = KnownZero = APInt::getNullValue(NumElts);
// Undef operand.
if (Op.isUndef()) {
KnownUndef.setAllBits();
return false;
}
// If Op has other users, assume that all elements are needed.
if (!Op.getNode()->hasOneUse() && !AssumeSingleUse)
DemandedElts.setAllBits();
// Not demanding any elements from Op.
if (DemandedElts == 0) {
KnownUndef.setAllBits();
return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
}
// Limit search depth.
if (Depth >= 6)
return false;
SDLoc DL(Op);
unsigned EltSizeInBits = VT.getScalarSizeInBits();
switch (Op.getOpcode()) {
case ISD::SCALAR_TO_VECTOR: {
// Only element 0 carries the scalar; if it is not demanded the whole node
// can become undef, otherwise every other element is reported as undef.
if (!DemandedElts[0]) {
KnownUndef.setAllBits();
return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
}
KnownUndef.setHighBits(NumElts - 1);
break;
}
case ISD::BITCAST: {
SDValue Src = Op.getOperand(0);
EVT SrcVT = Src.getValueType();
// We only handle vectors here.
// TODO - investigate calling SimplifyDemandedBits/ComputeKnownBits?
if (!SrcVT.isVector())
break;
// Fast handling of 'identity' bitcasts.
unsigned NumSrcElts = SrcVT.getVectorNumElements();
if (NumSrcElts == NumElts)
return SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef,
KnownZero, TLO, Depth + 1);
APInt SrcZero, SrcUndef;
APInt SrcDemandedElts = APInt::getNullValue(NumSrcElts);
// Bitcast from 'large element' src vector to 'small element' vector, we
// must demand a source element if any DemandedElt maps to it.
if ((NumElts % NumSrcElts) == 0) {
unsigned Scale = NumElts / NumSrcElts;
for (unsigned i = 0; i != NumElts; ++i)
if (DemandedElts[i])
SrcDemandedElts.setBit(i / Scale);
if (SimplifyDemandedVectorElts(Src, SrcDemandedElts, SrcUndef, SrcZero,
TLO, Depth + 1))
return true;
// Try calling SimplifyDemandedBits, converting demanded elts to the bits
// of the large element.
// TODO - bigendian once we have test coverage.
if (TLO.DAG.getDataLayout().isLittleEndian()) {
unsigned SrcEltSizeInBits = SrcVT.getScalarSizeInBits();
APInt SrcDemandedBits = APInt::getNullValue(SrcEltSizeInBits);
for (unsigned i = 0; i != NumElts; ++i)
if (DemandedElts[i]) {
unsigned Ofs = (i % Scale) * EltSizeInBits;
SrcDemandedBits.setBits(Ofs, Ofs + EltSizeInBits);
}
KnownBits Known;
if (SimplifyDemandedBits(Src, SrcDemandedBits, Known, TLO, Depth + 1))
return true;
}
// If the src element is zero/undef then all the output elements will be -
// only demanded elements are guaranteed to be correct.
for (unsigned i = 0; i != NumSrcElts; ++i) {
if (SrcDemandedElts[i]) {
if (SrcZero[i])
KnownZero.setBits(i * Scale, (i + 1) * Scale);
if (SrcUndef[i])
KnownUndef.setBits(i * Scale, (i + 1) * Scale);
}
}
}
// Bitcast from 'small element' src vector to 'large element' vector, we
// demand all smaller source elements covered by the larger demanded element
// of this vector.
if ((NumSrcElts % NumElts) == 0) {
unsigned Scale = NumSrcElts / NumElts;
for (unsigned i = 0; i != NumElts; ++i)
if (DemandedElts[i])
SrcDemandedElts.setBits(i * Scale, (i + 1) * Scale);
if (SimplifyDemandedVectorElts(Src, SrcDemandedElts, SrcUndef, SrcZero,
TLO, Depth + 1))
return true;
// If all the src elements covering an output element are zero/undef, then
// the output element will be as well, assuming it was demanded.
for (unsigned i = 0; i != NumElts; ++i) {
if (DemandedElts[i]) {
if (SrcZero.extractBits(Scale, i * Scale).isAllOnesValue())
KnownZero.setBit(i);
if (SrcUndef.extractBits(Scale, i * Scale).isAllOnesValue())
KnownUndef.setBit(i);
}
}
}
break;
}
case ISD::BUILD_VECTOR: {
// Check all elements and simplify any unused elements with UNDEF.
if (!DemandedElts.isAllOnesValue()) {
// Don't simplify BROADCASTS.
if (llvm::any_of(Op->op_values(),
[&](SDValue Elt) { return Op.getOperand(0) != Elt; })) {
SmallVector<SDValue, 32> Ops(Op->op_begin(), Op->op_end());
bool Updated = false;
for (unsigned i = 0; i != NumElts; ++i) {
if (!DemandedElts[i] && !Ops[i].isUndef()) {
Ops[i] = TLO.DAG.getUNDEF(Ops[0].getValueType());
KnownUndef.setBit(i);
Updated = true;
}
}
if (Updated)
return TLO.CombineTo(Op, TLO.DAG.getBuildVector(VT, DL, Ops));
}
}
// Record undef operands, and zero operands whose width matches the vector
// element width (i.e. operands that are not implicitly truncated).
for (unsigned i = 0; i != NumElts; ++i) {
SDValue SrcOp = Op.getOperand(i);
if (SrcOp.isUndef()) {
KnownUndef.setBit(i);
} else if (EltSizeInBits == SrcOp.getScalarValueSizeInBits() &&
(isNullConstant(SrcOp) || isNullFPConstant(SrcOp))) {
KnownZero.setBit(i);
}
}
break;
}
case ISD::CONCAT_VECTORS: {
// Split the demanded mask into one slice per concatenated operand, recurse
// on each operand, and stitch the per-operand results back together.
EVT SubVT = Op.getOperand(0).getValueType();
unsigned NumSubVecs = Op.getNumOperands();
unsigned NumSubElts = SubVT.getVectorNumElements();
for (unsigned i = 0; i != NumSubVecs; ++i) {
SDValue SubOp = Op.getOperand(i);
APInt SubElts = DemandedElts.extractBits(NumSubElts, i * NumSubElts);
APInt SubUndef, SubZero;
if (SimplifyDemandedVectorElts(SubOp, SubElts, SubUndef, SubZero, TLO,
Depth + 1))
return true;
KnownUndef.insertBits(SubUndef, i * NumSubElts);
KnownZero.insertBits(SubZero, i * NumSubElts);
}
break;
}
case ISD::INSERT_SUBVECTOR: {
// Only constant, in-range insertion indices are analysed.
if (!isa<ConstantSDNode>(Op.getOperand(2)))
break;
SDValue Base = Op.getOperand(0);
SDValue Sub = Op.getOperand(1);
EVT SubVT = Sub.getValueType();
unsigned NumSubElts = SubVT.getVectorNumElements();
const APInt &Idx = Op.getConstantOperandAPInt(2);
if (Idx.ugt(NumElts - NumSubElts))
break;
unsigned SubIdx = Idx.getZExtValue();
APInt SubElts = DemandedElts.extractBits(NumSubElts, SubIdx);
APInt SubUndef, SubZero;
if (SimplifyDemandedVectorElts(Sub, SubElts, SubUndef, SubZero, TLO,
Depth + 1))
return true;
// The base vector's elements that are overwritten by the subvector are not
// demanded from the base.
APInt BaseElts = DemandedElts;
BaseElts.insertBits(APInt::getNullValue(NumSubElts), SubIdx);
if (SimplifyDemandedVectorElts(Base, BaseElts, KnownUndef, KnownZero, TLO,
Depth + 1))
return true;
KnownUndef.insertBits(SubUndef, SubIdx);
KnownZero.insertBits(SubZero, SubIdx);
break;
}
case ISD::EXTRACT_SUBVECTOR: {
SDValue Src = Op.getOperand(0);
ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) {
// Offset the demanded elts by the subvector index.
uint64_t Idx = SubIdx->getZExtValue();
APInt SrcElts = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
APInt SrcUndef, SrcZero;
if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
Depth + 1))
return true;
KnownUndef = SrcUndef.extractBits(NumElts, Idx);
KnownZero = SrcZero.extractBits(NumElts, Idx);
}
break;
}
case ISD::INSERT_VECTOR_ELT: {
SDValue Vec = Op.getOperand(0);
SDValue Scl = Op.getOperand(1);
auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
// For a legal, constant insertion index, if we don't need this insertion
// then strip it, else remove it from the demanded elts.
if (CIdx && CIdx->getAPIntValue().ult(NumElts)) {
unsigned Idx = CIdx->getZExtValue();
if (!DemandedElts[Idx])
return TLO.CombineTo(Op, Vec);
APInt DemandedVecElts(DemandedElts);
DemandedVecElts.clearBit(Idx);
if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
KnownZero, TLO, Depth + 1))
return true;
// The inserted lane's state comes from the scalar, not from the base
// vector, so overwrite whatever the recursion reported for it.
KnownUndef.clearBit(Idx);
if (Scl.isUndef())
KnownUndef.setBit(Idx);
KnownZero.clearBit(Idx);
if (isNullConstant(Scl) || isNullFPConstant(Scl))
KnownZero.setBit(Idx);
break;
}
APInt VecUndef, VecZero;
if (SimplifyDemandedVectorElts(Vec, DemandedElts, VecUndef, VecZero, TLO,
Depth + 1))
return true;
// Without knowing the insertion index we can't set KnownUndef/KnownZero.
break;
}
case ISD::VSELECT: {
// Try to transform the select condition based on the current demanded
// elements.
// TODO: If a condition element is undef, we can choose from one arm of the
// select (and if one arm is undef, then we can propagate that to the
// result).
// TODO - add support for constant vselect masks (see IR version of this).
APInt UnusedUndef, UnusedZero;
if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, UnusedUndef,
UnusedZero, TLO, Depth + 1))
return true;
// See if we can simplify either vselect operand.
APInt DemandedLHS(DemandedElts);
APInt DemandedRHS(DemandedElts);
APInt UndefLHS, ZeroLHS;
APInt UndefRHS, ZeroRHS;
if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedLHS, UndefLHS,
ZeroLHS, TLO, Depth + 1))
return true;
if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedRHS, UndefRHS,
ZeroRHS, TLO, Depth + 1))
return true;
// A result element is known undef/zero only if it is so in both arms.
KnownUndef = UndefLHS & UndefRHS;
KnownZero = ZeroLHS & ZeroRHS;
break;
}
case ISD::VECTOR_SHUFFLE: {
ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(Op)->getMask();
// Collect demanded elements from shuffle operands..
APInt DemandedLHS(NumElts, 0);
APInt DemandedRHS(NumElts, 0);
for (unsigned i = 0; i != NumElts; ++i) {
int M = ShuffleMask[i];
if (M < 0 || !DemandedElts[i])
continue;
assert(0 <= M && M < (int)(2 * NumElts) && "Shuffle index out of range");
if (M < (int)NumElts)
DemandedLHS.setBit(M);
else
DemandedRHS.setBit(M - NumElts);
}
// See if we can simplify either shuffle operand.
APInt UndefLHS, ZeroLHS;
APInt UndefRHS, ZeroRHS;
if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, UndefLHS,
ZeroLHS, TLO, Depth + 1))
return true;
if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, UndefRHS,
ZeroRHS, TLO, Depth + 1))
return true;
// Simplify mask using undef elements from LHS/RHS.
bool Updated = false;
bool IdentityLHS = true, IdentityRHS = true;
SmallVector<int, 32> NewMask(ShuffleMask.begin(), ShuffleMask.end());
for (unsigned i = 0; i != NumElts; ++i) {
int &M = NewMask[i];
if (M < 0)
continue;
if (!DemandedElts[i] || (M < (int)NumElts && UndefLHS[M]) ||
(M >= (int)NumElts && UndefRHS[M - NumElts])) {
Updated = true;
M = -1;
}
IdentityLHS &= (M < 0) || (M == (int)i);
IdentityRHS &= (M < 0) || ((M - NumElts) == i);
}
// Update legal shuffle masks based on demanded elements if it won't reduce
// to Identity which can cause premature removal of the shuffle mask.
if (Updated && !IdentityLHS && !IdentityRHS && !TLO.LegalOps &&
isShuffleMaskLegal(NewMask, VT))
return TLO.CombineTo(Op,
TLO.DAG.getVectorShuffle(VT, DL, Op.getOperand(0),
Op.getOperand(1), NewMask));
// Propagate undef/zero elements from LHS/RHS.
for (unsigned i = 0; i != NumElts; ++i) {
int M = ShuffleMask[i];
if (M < 0) {
KnownUndef.setBit(i);
} else if (M < (int)NumElts) {
if (UndefLHS[M])
KnownUndef.setBit(i);
if (ZeroLHS[M])
KnownZero.setBit(i);
} else {
if (UndefRHS[M - NumElts])
KnownUndef.setBit(i);
if (ZeroRHS[M - NumElts])
KnownZero.setBit(i);
}
}
break;
}
case ISD::ANY_EXTEND_VECTOR_INREG:
case ISD::SIGN_EXTEND_VECTOR_INREG:
case ISD::ZERO_EXTEND_VECTOR_INREG: {
APInt SrcUndef, SrcZero;
SDValue Src = Op.getOperand(0);
unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
APInt DemandedSrcElts = DemandedElts.zextOrSelf(NumSrcElts);
if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, SrcUndef, SrcZero, TLO,
Depth + 1))
return true;
KnownZero = SrcZero.zextOrTrunc(NumElts);
KnownUndef = SrcUndef.zextOrTrunc(NumElts);
if (Op.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG &&
Op.getValueSizeInBits() == Src.getValueSizeInBits() &&
DemandedSrcElts == 1 && TLO.DAG.getDataLayout().isLittleEndian()) {
// aext - if we just need the bottom element then we can bitcast.
return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Src));
}
if (Op.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) {
// zext(undef) upper bits are guaranteed to be zero.
if (DemandedElts.isSubsetOf(KnownUndef))
return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
KnownUndef.clearAllBits();
}
break;
}
// TODO: There are more binop opcodes that could be handled here - MUL, MIN,
// MAX, saturated math, etc.
case ISD::OR:
case ISD::XOR:
case ISD::ADD:
case ISD::SUB:
case ISD::FADD:
case ISD::FSUB:
case ISD::FMUL:
case ISD::FDIV:
case ISD::FREM: {
// The right-hand operand is simplified before the left-hand one.
APInt UndefRHS, ZeroRHS;
if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, UndefRHS,
ZeroRHS, TLO, Depth + 1))
return true;
APInt UndefLHS, ZeroLHS;
if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, UndefLHS,
ZeroLHS, TLO, Depth + 1))
return true;
// NOTE(review): the zero masks are intersected for every opcode in this
// group, including FDIV/FREM where 0.0 op 0.0 is presumably NaN rather than
// zero -- confirm whether any caller relies on KnownZero for those opcodes.
KnownZero = ZeroLHS & ZeroRHS;
KnownUndef = getKnownUndefForVectorBinop(Op, TLO.DAG, UndefLHS, UndefRHS);
break;
}
case ISD::SHL:
case ISD::SRL:
case ISD::SRA:
case ISD::ROTL:
case ISD::ROTR: {
APInt UndefRHS, ZeroRHS;
if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, UndefRHS,
ZeroRHS, TLO, Depth + 1))
return true;
APInt UndefLHS, ZeroLHS;
if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, UndefLHS,
ZeroLHS, TLO, Depth + 1))
return true;
// Shifting or rotating a zero value yields zero, so only the shifted
// operand's zero mask matters.
KnownZero = ZeroLHS;
KnownUndef = UndefLHS & UndefRHS; // TODO: use getKnownUndefForVectorBinop?
break;
}
case ISD::MUL:
case ISD::AND: {
APInt SrcUndef, SrcZero;
if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, SrcUndef,
SrcZero, TLO, Depth + 1))
return true;
if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, KnownUndef,
KnownZero, TLO, Depth + 1))
return true;
// If either side has a zero element, then the result element is zero, even
// if the other is an UNDEF.
// TODO: Extend getKnownUndefForVectorBinop to also deal with known zeros
// and then handle 'and' nodes with the rest of the binop opcodes.
KnownZero |= SrcZero;
KnownUndef &= SrcUndef;
KnownUndef &= ~KnownZero;
break;
}
case ISD::TRUNCATE:
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
// Element-wise conversions: the demanded mask passes straight through to
// the source operand.
if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, KnownUndef,
KnownZero, TLO, Depth + 1))
return true;
if (Op.getOpcode() == ISD::ZERO_EXTEND) {
// zext(undef) upper bits are guaranteed to be zero.
if (DemandedElts.isSubsetOf(KnownUndef))
return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
KnownUndef.clearAllBits();
}
break;
default: {
// Target-specific opcodes go to the target hook; every other opcode falls
// back to SimplifyDemandedBits with all bits of each element demanded.
if (Op.getOpcode() >= ISD::BUILTIN_OP_END) {
if (SimplifyDemandedVectorEltsForTargetNode(Op, DemandedElts, KnownUndef,
KnownZero, TLO, Depth))
return true;
} else {
KnownBits Known;
APInt DemandedBits = APInt::getAllOnesValue(EltSizeInBits);
if (SimplifyDemandedBits(Op, DemandedBits, OriginalDemandedElts, Known,
TLO, Depth, AssumeSingleUse))
return true;
}
break;
}
}
assert((KnownUndef & KnownZero) == 0 && "Elements flagged as undef AND zero");
// Constant fold all undef cases.
// TODO: Handle zero cases as well.
if (DemandedElts.isSubsetOf(KnownUndef))
return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
return false;
}
/// Default known-bits hook for target-specific and intrinsic nodes.
///
/// This base implementation reports no known bits: it asserts that \p Op is
/// a target or intrinsic node and then resets \p Known.
void TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                   KnownBits &Known,
                                                   const APInt &DemandedElts,
                                                   const SelectionDAG &DAG,
                                                   unsigned Depth) const {
  const unsigned Opc = Op.getOpcode();
  assert((Opc >= ISD::BUILTIN_OP_END ||
          Opc == ISD::INTRINSIC_WO_CHAIN ||
          Opc == ISD::INTRINSIC_W_CHAIN ||
          Opc == ISD::INTRINSIC_VOID) &&
         "Should use MaskedValueIsZero if you don't know whether Op"
         " is a target node!");
  (void)Opc; // Only read by the assert above.
  Known.resetAll();
}
/// Known-bits hook for frame-index nodes.
///
/// When the DAG can infer a non-zero alignment for the frame index \p Op, the
/// low log2(alignment) bits of \p Known.Zero are set; otherwise \p Known is
/// left untouched.
void TargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
                                                   KnownBits &Known,
                                                   const APInt &DemandedElts,
                                                   const SelectionDAG &DAG,
                                                   unsigned Depth) const {
  assert(isa<FrameIndexSDNode>(Op) && "expected FrameIndex");

  unsigned PtrAlign = DAG.InferPtrAlignment(Op);
  if (PtrAlign == 0)
    return;

  // An aligned pointer has its low bits clear.
  Known.Zero.setLowBits(Log2_32(PtrAlign));
}
/// Hook through which targets may report extra sign-bit information for
/// their own nodes to the DAG Combiner.
///
/// This base implementation asserts that \p Op is a target or intrinsic node
/// and returns 1.
unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
                                                         const APInt &,
                                                         const SelectionDAG &,
                                                         unsigned Depth) const {
  const unsigned Opc = Op.getOpcode();
  assert((Opc >= ISD::BUILTIN_OP_END ||
          Opc == ISD::INTRINSIC_WO_CHAIN ||
          Opc == ISD::INTRINSIC_W_CHAIN ||
          Opc == ISD::INTRINSIC_VOID) &&
         "Should use ComputeNumSignBits if you don't know whether Op"
         " is a target node!");
  (void)Opc; // Only read by the assert above.
  return 1;
}
/// Default demanded-vector-elements hook for target-specific and intrinsic
/// nodes.
///
/// This base implementation performs no simplification: it asserts that
/// \p Op is a target or intrinsic node and returns false without touching
/// \p KnownUndef or \p KnownZero.
bool TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
    SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
    TargetLoweringOpt &TLO, unsigned Depth) const {
  const unsigned Opc = Op.getOpcode();
  assert((Opc >= ISD::BUILTIN_OP_END ||
          Opc == ISD::INTRINSIC_WO_CHAIN ||
          Opc == ISD::INTRINSIC_W_CHAIN ||
          Opc == ISD::INTRINSIC_VOID) &&
         "Should use SimplifyDemandedVectorElts if you don't know whether Op"
         " is a target node!");
  (void)Opc; // Only read by the assert above.
  return false;
}
/// Default demanded-bits hook for target-specific and intrinsic nodes.
///
/// This base implementation performs no simplification: it asserts that
/// \p Op is a target or intrinsic node, fills \p Known by calling
/// computeKnownBitsForTargetNode, and returns false.
bool TargetLowering::SimplifyDemandedBitsForTargetNode(
    SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
    KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const {
  const unsigned Opc = Op.getOpcode();
  assert((Opc >= ISD::BUILTIN_OP_END ||
          Opc == ISD::INTRINSIC_WO_CHAIN ||
          Opc == ISD::INTRINSIC_W_CHAIN ||
          Opc == ISD::INTRINSIC_VOID) &&
         "Should use SimplifyDemandedBits if you don't know whether Op"
         " is a target node!");
  (void)Opc; // Only read by the assert above.

  computeKnownBitsForTargetNode(Op, Known, DemandedElts, TLO.DAG, Depth);
  return false;
}
/// Base implementation of the load-to-constant hook: it ignores the load node
/// and always returns a null pointer.
const Constant *
TargetLowering::getTargetConstantFromLoad(LoadSDNode * /*unused*/) const {
  return nullptr;
}
/// Default never-NaN hook for target-specific and intrinsic nodes.
///
/// This base implementation asserts that \p Op is a target or intrinsic node
/// and returns false, i.e. it never claims that the value cannot be NaN.
bool TargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
                                                  const SelectionDAG &DAG,
                                                  bool SNaN,
                                                  unsigned Depth) const {
  const unsigned Opc = Op.getOpcode();
  assert((Opc >= ISD::BUILTIN_OP_END ||
          Opc == ISD::INTRINSIC_WO_CHAIN ||
          Opc == ISD::INTRINSIC_W_CHAIN ||
          Opc == ISD::INTRINSIC_VOID) &&
         "Should use isKnownNeverNaN if you don't know whether Op"
         " is a target node!");
  (void)Opc; // Only read by the assert above.
  return false;
}
// FIXME: Ideally, this would use ISD::isConstantSplatVector(), but that must
// work with truncating build vectors and vectors with elements of less than
// 8 bits.
bool TargetLowering::isConstTrueVal(const SDNode *N) const {
if (!N)
return false;
APInt CVal;
if (auto *CN = dyn_cast<ConstantSDNode>(N)) {
CVal = CN->getAPIntValue();
} else if (auto *BV = dyn_cast<BuildVectorSDNode>(N)) {
auto *CN = BV->getConstantSplatNode();
if (!CN)
return false;
// If this is a truncating build vector, truncate the splat value.
// Otherwise, we may fail to match the expected values below.
unsigned BVEltWidth = BV->getValueType(0).getScalarSizeInBits();
CVal = CN->getAPIntValue();
if (BVEltWidth < CVal.getBitWidth())
CVal = CVal.trunc(BVEltWidth);
} else {
return false;
}
switch (getBooleanContents(N->getValueType(0))) {
case UndefinedBooleanContent:
return CVal[0];
case ZeroOrOneBooleanContent:
return CVal.isOneValue();
case ZeroOrNegativeOneBooleanContent:
return CVal.isAllOnesValue();
}
llvm_unreachable("Invalid boolean contents");
}
bool TargetLowering::isConstFalseVal(const SDNode *N) const {
if (!N)
return false;
const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N);
if (!CN) {
const BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N);
if (!BV)
return false;
// Only interested in constant splats, we don't care about undef
// elements in identifying boolean constants and getConstantSplatNode
// returns NULL if all ops are undef;
CN = BV->getConstantSplatNode();
if (!CN)
return false;
}
if (getBooleanContents(N->getValueType(0)) == UndefinedBooleanContent)
return !CN->getAPIntValue()[0];
return CN->isNullValue();
}
bool TargetLowering::isExtendedTrueVal(const ConstantSDNode *N, EVT VT,
bool SExt) const {
if (VT == MVT::i1)
return N->isOne();
TargetLowering::BooleanContent Cnt = getBooleanContents(VT);
switch (Cnt) {
case TargetLowering::ZeroOrOneBooleanContent:
// An extended value of 1 is always true, unless its original type is i1,
// in which case it will be sign extended to -1.
return (N->isOne() && !SExt) || (SExt && (N->getValueType(0) != MVT::i1));
case TargetLowering::UndefinedBooleanContent:
case TargetLowering::ZeroOrNegativeOneBooleanContent:
return N->isAllOnesValue() && SExt;
}
llvm_unreachable("Unexpected enumeration.");
}
/// SimplifySetCC helper for equality comparisons in which one side is a
/// bitwise 'and' and the other side is one of that 'and's operands. Returns
/// the replacement setcc, or a null SDValue when nothing was folded.
SDValue TargetLowering::foldSetCCWithAnd(EVT VT, SDValue N0, SDValue N1,
                                         ISD::CondCode Cond, const SDLoc &DL,
                                         DAGCombinerInfo &DCI) const {
  // The patterns of interest, in any operand order, are:
  //   (X & Y) == Y
  //   (X & Y) != Y
  // Put the 'and' on the left if it is only present on the right.
  if (N0.getOpcode() != ISD::AND && N1.getOpcode() == ISD::AND)
    std::swap(N0, N1);

  EVT OpVT = N0.getValueType();
  const bool IsEquality = Cond == ISD::SETEQ || Cond == ISD::SETNE;
  if (N0.getOpcode() != ISD::AND || !OpVT.isInteger() || !IsEquality)
    return SDValue();

  // Y is the 'and' operand that also appears as the other setcc operand; X is
  // the remaining 'and' operand.
  const SDValue AndOp0 = N0.getOperand(0);
  const SDValue AndOp1 = N0.getOperand(1);
  SDValue X, Y;
  if (AndOp0 == N1) {
    X = AndOp1;
    Y = AndOp0;
  } else if (AndOp1 == N1) {
    X = AndOp0;
    Y = AndOp1;
  } else {
    return SDValue();
  }

  SelectionDAG &DAG = DCI.DAG;
  SDValue Zero = DAG.getConstant(0, DL, OpVT);

  if (DAG.isKnownToBeAPowerOfTwo(Y)) {
    // Y has exactly one bit set, so X & Y == Y can become X & Y != 0.
    // This is not valid when Y is merely known to have at most one bit set
    // (for example Z & 1): the two forms disagree when Y == 0.
    Cond = ISD::getSetCCInverse(Cond, /*isInteger=*/true);
    const bool CanEmit = DCI.isBeforeLegalizeOps() ||
                         isCondCodeLegal(Cond, N0.getSimpleValueType());
    if (CanEmit)
      return DAG.getSetCC(DL, VT, N0, Zero, Cond);
    return SDValue();
  }

  // If the target has an 'and-not' / 'and-complement' operation, use it to
  // make the comparison cheaper. Single-bit masks were handled above because
  // there are better ways to test those (for example 'bt' on x86 or 'rlwinm'
  // on PPC).
  if (!N0.hasOneUse() || !hasAndNotCompare(Y))
    return SDValue();

  // Bail out if the compare operand that we want to turn into a zero is
  // already a zero (otherwise, infinite loop).
  if (auto *YConst = dyn_cast<ConstantSDNode>(Y))
    if (YConst->isNullValue())
      return SDValue();

  // Rewrite as: ~X & Y == 0 (same condition code).
  SDValue NotX = DAG.getNOT(SDLoc(X), X, OpVT);
  SDValue AndNot = DAG.getNode(ISD::AND, SDLoc(N0), OpVT, NotX, Y);
  return DAG.getSetCC(DL, VT, AndNot, Zero, Cond);
}
/// Several IR patterns can express "would truncating this signed value to
/// KeptBits bits lose information?". The form preferred at IR level does not
/// necessarily lower well, so it is unfolded here.
/// The pattern matched is (KeptBits is a constant):
///   (add %x, (1 << (KeptBits-1))) srccond (1 << KeptBits)
/// KeptBits won't be bitwidth(x), that will be constant-folded to true/false.
/// KeptBits also can't be 1, that would have been folded to %x dstcond 0
/// The result is the natural trunc+sext form:
///   ((%x << C) a>> C) dstcond %x
/// where C = bitwidth(x) - KeptBits and C u< bitwidth(x).
/// Returns a null SDValue when the pattern does not match or the target
/// declines the transform.
SDValue TargetLowering::optimizeSetCCOfSignedTruncationCheck(
    EVT SCCVT, SDValue N0, SDValue N1, ISD::CondCode Cond, DAGCombinerInfo &DCI,
    const SDLoc &DL) const {
  // The right-hand side of the comparison must be a constant.
  auto *C1 = dyn_cast<ConstantSDNode>(N1);
  if (!C1)
    return SDValue();

  // The left-hand side must be: add %x, (1 << (KeptBits-1)) ...
  if (N0->getOpcode() != ISD::ADD)
    return SDValue();

  // ... with a constant as the second addend.
  auto *C01 = dyn_cast<ConstantSDNode>(N0->getOperand(1));
  if (!C01)
    return SDValue();

  SDValue X = N0->getOperand(0);
  EVT XVT = X.getValueType();

  // Map the unsigned predicate onto eq/ne. The inclusive predicates need the
  // compared constant bumped by one to reach the canonical form.
  APInt I1 = C1->getAPIntValue();
  ISD::CondCode NewCond;
  switch (Cond) {
  case ISD::SETULT:
    NewCond = ISD::SETEQ;
    break;
  case ISD::SETULE:
    NewCond = ISD::SETEQ;
    ++I1;
    break;
  case ISD::SETUGT:
    NewCond = ISD::SETNE;
    ++I1;
    break;
  case ISD::SETUGE:
    NewCond = ISD::SETNE;
    break;
  default:
    return SDValue();
  }

  APInt I01 = C01->getAPIntValue();

  // Both constants must be powers of two, with the setcc constant the larger.
  const auto ConstantsFit = [&]() -> bool {
    return I1.ugt(I01) && I1.isPowerOf2() && I01.isPowerOf2();
  };

  // Direct match, e.g. icmp ult i16 (add i16 %x, 128), 256.
  if (!ConstantsFit()) {
    // Otherwise try the negated constants together with the inverted target
    // predicate, e.g. icmp uge i16 (add i16 %x, -128), -256.
    I1.negate();
    I01.negate();
    NewCond = ISD::getSetCCInverse(NewCond, /*isInteger=*/true);
    if (!ConstantsFit())
      return SDValue();
  }

  // Both are powers of two; find out which bit each one has set.
  const unsigned KeptBits = I1.logBase2();
  const unsigned KeptBitsMinusOne = I01.logBase2();

  // The added constant must be exactly half of the compared constant.
  if (KeptBits != (KeptBitsMinusOne + 1))
    return SDValue();
  assert(KeptBits > 0 && KeptBits < XVT.getSizeInBits() && "unreachable");

  // Let the target veto the transform.
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.shouldTransformSignedTruncationCheck(XVT, KeptBits))
    return SDValue();

  const unsigned MaskedBits = XVT.getSizeInBits() - KeptBits;
  assert(MaskedBits > 0 && MaskedBits < XVT.getSizeInBits() && "unreachable");

  // Emit ((%x << C) a>> C) cond %x, where 'cond' is either 'eq' or 'ne'.
  SDValue ShiftAmt = DAG.getConstant(MaskedBits, DL, XVT);
  SDValue Shl = DAG.getNode(ISD::SHL, DL, XVT, X, ShiftAmt);
  SDValue Sra = DAG.getNode(ISD::SRA, DL, XVT, Shl, ShiftAmt);
  return DAG.getSetCC(DL, SCCVT, Sra, X, NewCond);
}
/// Fold an eq/ne comparison whose first operand (N0) is an add, sub or xor
/// and whose second operand (N1) is one of that binary operation's operands.
/// Only N0 is inspected as the binary operation; callers are expected to swap
/// N0/N1 themselves to cover the commuted forms. Returns a null SDValue when
/// no fold applies.
SDValue TargetLowering::foldSetCCWithBinOp(EVT VT, SDValue N0, SDValue N1,
                                           ISD::CondCode Cond, const SDLoc &DL,
                                           DAGCombinerInfo &DCI) const {
  unsigned BOpcode = N0.getOpcode();
  assert((BOpcode == ISD::ADD || BOpcode == ISD::SUB || BOpcode == ISD::XOR) &&
         "Unexpected binop");
  assert((Cond == ISD::SETEQ || Cond == ISD::SETNE) && "Unexpected condcode");

  SelectionDAG &DAG = DCI.DAG;
  EVT OpVT = N0.getValueType();
  SDValue X = N0.getOperand(0);
  SDValue Y = N0.getOperand(1);

  // Builds "V cond 0" in the operand type.
  auto CompareWithZero = [&](SDValue V) {
    return DAG.getSetCC(DL, VT, V, DAG.getConstant(0, DL, OpVT), Cond);
  };

  // (X + Y) == X --> Y == 0
  // (X - Y) == X --> Y == 0
  // (X ^ Y) == X --> Y == 0
  if (X == N1)
    return CompareWithZero(Y);

  // Every remaining fold needs the second operand to be the compared value.
  if (Y != N1)
    return SDValue();

  // (X + Y) == Y --> X == 0
  // (X ^ Y) == Y --> X == 0
  const bool IsAddOrXor = BOpcode == ISD::ADD || BOpcode == ISD::XOR;
  if (IsAddOrXor)
    return CompareWithZero(X);

  // Only the subtract case is left. The shift below would not be valid if
  // the operands are boolean (i1), and the fold is limited to a single-use
  // N0.
  if (!N0.hasOneUse() || OpVT.getScalarSizeInBits() == 1)
    return SDValue();

  // (X - Y) == Y --> X == Y << 1
  const bool LegalTypes = !DCI.isBeforeLegalize();
  EVT ShiftVT = getShiftAmountTy(OpVT, DAG.getDataLayout(), LegalTypes);
  SDValue ShlAmt = DAG.getConstant(1, DL, ShiftVT);
  SDValue DoubledY = DAG.getNode(ISD::SHL, DL, N1.getValueType(), Y, ShlAmt);
  if (!DCI.isCalledByLegalizer())
    DCI.AddToWorklist(DoubledY.getNode());
  return DAG.getSetCC(DL, VT, X, DoubledY, Cond);
}
/// Try to simplify a setcc built with the specified operands and cc. If it is
/// unable to simplify it, return a null SDValue.
SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
ISD::CondCode Cond, bool foldBooleans,
DAGCombinerInfo &DCI,
const SDLoc &dl) const {
SelectionDAG &DAG = DCI.DAG;
EVT OpVT = N0.getValueType();
// Constant fold or commute setcc.
if (SDValue Fold = DAG.FoldSetCC(VT, N0, N1, Cond, dl))
return Fold;
// Ensure that the constant occurs on the RHS and fold constant comparisons.
// TODO: Handle non-splat vector constants. All undef causes trouble.
ISD::CondCode SwappedCC = ISD::getSetCCSwappedOperands(Cond);
if (isConstOrConstSplat(N0) &&
(DCI.isBeforeLegalizeOps() ||
isCondCodeLegal(SwappedCC, N0.getSimpleValueType())))
return DAG.getSetCC(dl, VT, N1, N0, SwappedCC);
// If we have a subtract with the same 2 non-constant operands as this setcc
// -- but in reverse order -- then try to commute the operands of this setcc
// to match. A matching pair of setcc (cmp) and sub may be combined into 1
// instruction on some targets.
if (!isConstOrConstSplat(N0) && !isConstOrConstSplat(N1) &&
(DCI.isBeforeLegalizeOps() ||
isCondCodeLegal(SwappedCC, N0.getSimpleValueType())) &&
DAG.getNodeIfExists(ISD::SUB, DAG.getVTList(OpVT), { N1, N0 } ) &&
!DAG.getNodeIfExists(ISD::SUB, DAG.getVTList(OpVT), { N0, N1 } ))
return DAG.getSetCC(dl, VT, N1, N0, SwappedCC);
if (auto *N1C = dyn_cast<ConstantSDNode>(N1.getNode())) {
const APInt &C1 = N1C->getAPIntValue();
// If the LHS is '(srl (ctlz x), 5)', the RHS is 0/1, and this is an
// equality comparison, then we're just comparing whether X itself is
// zero.
if (N0.getOpcode() == ISD::SRL && (C1.isNullValue() || C1.isOneValue()) &&
N0.getOperand(0).getOpcode() == ISD::CTLZ &&
N0.getOperand(1).getOpcode() == ISD::Constant) {
const APInt &ShAmt = N0.getConstantOperandAPInt(1);
if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
ShAmt == Log2_32(N0.getValueSizeInBits())) {
if ((C1 == 0) == (Cond == ISD::SETEQ)) {
// (srl (ctlz x), 5) == 0 -> X != 0
// (srl (ctlz x), 5) != 1 -> X != 0
Cond = ISD::SETNE;
} else {
// (srl (ctlz x), 5) != 0 -> X == 0
// (srl (ctlz x), 5) == 1 -> X == 0
Cond = ISD::SETEQ;
}
SDValue Zero = DAG.getConstant(0, dl, N0.getValueType());
return DAG.getSetCC(dl, VT, N0.getOperand(0).getOperand(0),
Zero, Cond);
}
}
SDValue CTPOP = N0;
// Look through truncs that don't change the value of a ctpop.
if (N0.hasOneUse() && N0.getOpcode() == ISD::TRUNCATE)
CTPOP = N0.getOperand(0);
if (CTPOP.hasOneUse() && CTPOP.getOpcode() == ISD::CTPOP &&
(N0 == CTPOP ||
N0.getValueSizeInBits() > Log2_32_Ceil(CTPOP.getValueSizeInBits()))) {
EVT CTVT = CTPOP.getValueType();
SDValue CTOp = CTPOP.getOperand(0);
// (ctpop x) u< 2 -> (x & x-1) == 0
// (ctpop x) u> 1 -> (x & x-1) != 0
if ((Cond == ISD::SETULT && C1 == 2) || (Cond == ISD::SETUGT && C1 == 1)){
SDValue Sub = DAG.getNode(ISD::SUB, dl, CTVT, CTOp,
DAG.getConstant(1, dl, CTVT));
SDValue And = DAG.getNode(ISD::AND, dl, CTVT, CTOp, Sub);
ISD::CondCode CC = Cond == ISD::SETULT ? ISD::SETEQ : ISD::SETNE;
return DAG.getSetCC(dl, VT, And, DAG.getConstant(0, dl, CTVT), CC);
}
// If ctpop is not supported, expand a power-of-2 comparison based on it.
if (C1 == 1 && !isOperationLegalOrCustom(ISD::CTPOP, CTVT) &&
(Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
// (ctpop x) == 1 --> (x != 0) && ((x & x-1) == 0)
// (ctpop x) != 1 --> (x == 0) || ((x & x-1) != 0)
SDValue Zero = DAG.getConstant(0, dl, CTVT);
SDValue NegOne = DAG.getAllOnesConstant(dl, CTVT);
ISD::CondCode InvCond = ISD::getSetCCInverse(Cond, true);
SDValue Add = DAG.getNode(ISD::ADD, dl, CTVT, CTOp, NegOne);
SDValue And = DAG.getNode(ISD::AND, dl, CTVT, CTOp, Add);
SDValue LHS = DAG.getSetCC(dl, VT, CTOp, Zero, InvCond);
SDValue RHS = DAG.getSetCC(dl, VT, And, Zero, Cond);
unsigned LogicOpcode = Cond == ISD::SETEQ ? ISD::AND : ISD::OR;
return DAG.getNode(LogicOpcode, dl, VT, LHS, RHS);
}
}
// (zext x) == C --> x == (trunc C)
// (sext x) == C --> x == (trunc C)
if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
DCI.isBeforeLegalize() && N0->hasOneUse()) {
unsigned MinBits = N0.getValueSizeInBits();
SDValue PreExt;
bool Signed = false;
if (N0->getOpcode() == ISD::ZERO_EXTEND) {
// ZExt
MinBits = N0->getOperand(0).getValueSizeInBits();
PreExt = N0->getOperand(0);
} else if (N0->getOpcode() == ISD::AND) {
// DAGCombine turns costly ZExts into ANDs
if (auto *C = dyn_cast<ConstantSDNode>(N0->getOperand(1)))
if ((C->getAPIntValue()+1).isPowerOf2()) {
MinBits = C->getAPIntValue().countTrailingOnes();
PreExt = N0->getOperand(0);
}
} else if (N0->getOpcode() == ISD::SIGN_EXTEND) {
// SExt
MinBits = N0->getOperand(0).getValueSizeInBits();
PreExt = N0->getOperand(0);
Signed = true;
} else if (auto *LN0 = dyn_cast<LoadSDNode>(N0)) {
// ZEXTLOAD / SEXTLOAD
if (LN0->getExtensionType() == ISD::ZEXTLOAD) {
MinBits = LN0->getMemoryVT().getSizeInBits();
PreExt = N0;
} else if (LN0->getExtensionType() == ISD::SEXTLOAD) {
Signed = true;
MinBits = LN0->getMemoryVT().getSizeInBits();
PreExt = N0;
}
}
// Figure out how many bits we need to preserve this constant.
unsigned ReqdBits = Signed ?
C1.getBitWidth() - C1.getNumSignBits() + 1 :
C1.getActiveBits();
// Make sure we're not losing bits from the constant.
if (MinBits > 0 &&
MinBits < C1.getBitWidth() &&
MinBits >= ReqdBits) {
EVT MinVT = EVT::getIntegerVT(*DAG.getContext(), MinBits);
if (isTypeDesirableForOp(ISD::SETCC, MinVT)) {
// Will get folded away.
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, MinVT, PreExt);
if (MinBits == 1 && C1 == 1)
// Invert the condition.
return DAG.getSetCC(dl, VT, Trunc, DAG.getConstant(0, dl, MVT::i1),
Cond == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ);
SDValue C = DAG.getConstant(C1.trunc(MinBits), dl, MinVT);
return DAG.getSetCC(dl, VT, Trunc, C, Cond);
}
// If truncating the setcc operands is not desirable, we can still
// simplify the expression in some cases:
// setcc ([sz]ext (setcc x, y, cc)), 0, setne) -> setcc (x, y, cc)
// setcc ([sz]ext (setcc x, y, cc)), 0, seteq) -> setcc (x, y, inv(cc))
// setcc (zext (setcc x, y, cc)), 1, setne) -> setcc (x, y, inv(cc))
// setcc (zext (setcc x, y, cc)), 1, seteq) -> setcc (x, y, cc)
// setcc (sext (setcc x, y, cc)), -1, setne) -> setcc (x, y, inv(cc))
// setcc (sext (setcc x, y, cc)), -1, seteq) -> setcc (x, y, cc)
SDValue TopSetCC = N0->getOperand(0);
unsigned N0Opc = N0->getOpcode();
bool SExt = (N0Opc == ISD::SIGN_EXTEND);
if (TopSetCC.getValueType() == MVT::i1 && VT == MVT::i1 &&
TopSetCC.getOpcode() == ISD::SETCC &&
(N0Opc == ISD::ZERO_EXTEND || N0Opc == ISD::SIGN_EXTEND) &&
(isConstFalseVal(N1C) ||
isExtendedTrueVal(N1C, N0->getValueType(0), SExt))) {
bool Inverse = (N1C->isNullValue() && Cond == ISD::SETEQ) ||
(!N1C->isNullValue() && Cond == ISD::SETNE);
if (!Inverse)
return TopSetCC;
ISD::CondCode InvCond = ISD::getSetCCInverse(
cast<CondCodeSDNode>(TopSetCC.getOperand(2))->get(),
TopSetCC.getOperand(0).getValueType().isInteger());
return DAG.getSetCC(dl, VT, TopSetCC.getOperand(0),
TopSetCC.getOperand(1),
InvCond);
}
}
}
// If the LHS is '(and load, const)', the RHS is 0, the test is for
// equality or unsigned, and all 1 bits of the const are in the same
// partial word, see if we can shorten the load.
if (DCI.isBeforeLegalize() &&
!ISD::isSignedIntSetCC(Cond) &&
N0.getOpcode() == ISD::AND && C1 == 0 &&
N0.getNode()->hasOneUse() &&
isa<LoadSDNode>(N0.getOperand(0)) &&
N0.getOperand(0).getNode()->hasOneUse() &&
isa<ConstantSDNode>(N0.getOperand(1))) {
LoadSDNode *Lod = cast<LoadSDNode>(N0.getOperand(0));
APInt bestMask;
unsigned bestWidth = 0, bestOffset = 0;
if (!Lod->isVolatile() && Lod->isUnindexed()) {
unsigned origWidth = N0.getValueSizeInBits();
unsigned maskWidth = origWidth;
// We can narrow (e.g.) 16-bit extending loads on 32-bit target to
// 8 bits, but have to be careful...
if (Lod->getExtensionType() != ISD::NON_EXTLOAD)
origWidth = Lod->getMemoryVT().getSizeInBits();
const APInt &Mask = N0.getConstantOperandAPInt(1);
for (unsigned width = origWidth / 2; width>=8; width /= 2) {
APInt newMask = APInt::getLowBitsSet(maskWidth, width);
for (unsigned offset=0; offset<origWidth/width; offset++) {
if (Mask.isSubsetOf(newMask)) {
if (DAG.getDataLayout().isLittleEndian())
bestOffset = (uint64_t)offset * (width/8);
else
bestOffset = (origWidth/width - offset - 1) * (width/8);
bestMask = Mask.lshr(offset * (width/8) * 8);
bestWidth = width;
break;
}
newMask <<= width;
}
}
}
if (bestWidth) {
EVT newVT = EVT::getIntegerVT(*DAG.getContext(), bestWidth);
if (newVT.isRound() &&
shouldReduceLoadWidth(Lod, ISD::NON_EXTLOAD, newVT)) {
EVT PtrType = Lod->getOperand(1).getValueType();
SDValue Ptr = Lod->getBasePtr();
if (bestOffset != 0)
Ptr = DAG.getNode(ISD::ADD, dl, PtrType, Lod->getBasePtr(),
DAG.getConstant(bestOffset, dl, PtrType));
unsigned NewAlign = MinAlign(Lod->getAlignment(), bestOffset);
SDValue NewLoad = DAG.getLoad(
newVT, dl, Lod->getChain(), Ptr,
Lod->getPointerInfo().getWithOffset(bestOffset), NewAlign);
return DAG.getSetCC(dl, VT,
DAG.getNode(ISD::AND, dl, newVT, NewLoad,
DAG.getConstant(bestMask.trunc(bestWidth),
dl, newVT)),
DAG.getConstant(0LL, dl, newVT), Cond);
}
}
}
// If the LHS is a ZERO_EXTEND, perform the comparison on the input.
if (N0.getOpcode() == ISD::ZERO_EXTEND) {
unsigned InSize = N0.getOperand(0).getValueSizeInBits();
// If the comparison constant has bits in the upper part, the
// zero-extended value could never match.
if (C1.intersects(APInt::getHighBitsSet(C1.getBitWidth(),
C1.getBitWidth() - InSize))) {
switch (Cond) {
case ISD::SETUGT:
case ISD::SETUGE:
case ISD::SETEQ:
return DAG.getConstant(0, dl, VT);
case ISD::SETULT:
case ISD::SETULE:
case ISD::SETNE:
return DAG.getConstant(1, dl, VT);
case ISD::SETGT:
case ISD::SETGE:
// True if the sign bit of C1 is set.
return DAG.getConstant(C1.isNegative(), dl, VT);
case ISD::SETLT:
case ISD::SETLE:
// True if the sign bit of C1 isn't set.
return DAG.getConstant(C1.isNonNegative(), dl, VT);
default:
break;
}
}
// Otherwise, we can perform the comparison with the low bits.
switch (Cond) {
case ISD::SETEQ:
case ISD::SETNE:
case ISD::SETUGT:
case ISD::SETUGE:
case ISD::SETULT:
case ISD::SETULE: {
EVT newVT = N0.getOperand(0).getValueType();
if (DCI.isBeforeLegalizeOps() ||
(isOperationLegal(ISD::SETCC, newVT) &&
isCondCodeLegal(Cond, newVT.getSimpleVT()))) {
EVT NewSetCCVT =
getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), newVT);
SDValue NewConst = DAG.getConstant(C1.trunc(InSize), dl, newVT);
SDValue NewSetCC = DAG.getSetCC(dl, NewSetCCVT, N0.getOperand(0),
NewConst, Cond);
return DAG.getBoolExtOrTrunc(NewSetCC, dl, VT, N0.getValueType());
}
break;
}
default:
break; // todo, be more careful with signed comparisons
}
} else if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG &&
(Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
EVT ExtSrcTy = cast<VTSDNode>(N0.getOperand(1))->getVT();
unsigned ExtSrcTyBits = ExtSrcTy.getSizeInBits();
EVT ExtDstTy = N0.getValueType();
unsigned ExtDstTyBits = ExtDstTy.getSizeInBits();
// If the constant doesn't fit into the number of bits for the source of
// the sign extension, it is impossible for both sides to be equal.
if (C1.getMinSignedBits() > ExtSrcTyBits)
return DAG.getConstant(Cond == ISD::SETNE, dl, VT);
SDValue ZextOp;
EVT Op0Ty = N0.getOperand(0).getValueType();
if (Op0Ty == ExtSrcTy) {
ZextOp = N0.getOperand(0);
} else {
APInt Imm = APInt::getLowBitsSet(ExtDstTyBits, ExtSrcTyBits);
ZextOp = DAG.getNode(ISD::AND, dl, Op0Ty, N0.getOperand(0),
DAG.getConstant(Imm, dl, Op0Ty));
}
if (!DCI.isCalledByLegalizer())
DCI.AddToWorklist(ZextOp.getNode());
// Otherwise, make this a use of a zext.
return DAG.getSetCC(dl, VT, ZextOp,
DAG.getConstant(C1 & APInt::getLowBitsSet(
ExtDstTyBits,
ExtSrcTyBits),
dl, ExtDstTy),
Cond);
} else if ((N1C->isNullValue() || N1C->isOne()) &&
(Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
// SETCC (SETCC), [0|1], [EQ|NE] -> SETCC
if (N0.getOpcode() == ISD::SETCC &&
isTypeLegal(VT) && VT.bitsLE(N0.getValueType())) {
bool TrueWhenTrue = (Cond == ISD::SETEQ) ^ (!N1C->isOne());
if (TrueWhenTrue)
return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
// Invert the condition.
ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
CC = ISD::getSetCCInverse(CC,
N0.getOperand(0).getValueType().isInteger());
if (DCI.isBeforeLegalizeOps() ||
isCondCodeLegal(CC, N0.getOperand(0).getSimpleValueType()))
return DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
}
if ((N0.getOpcode() == ISD::XOR ||
(N0.getOpcode() == ISD::AND &&
N0.getOperand(0).getOpcode() == ISD::XOR &&
N0.getOperand(1) == N0.getOperand(0).getOperand(1))) &&
isa<ConstantSDNode>(N0.getOperand(1)) &&
cast<ConstantSDNode>(N0.getOperand(1))->isOne()) {
// If this is (X^1) == 0/1, swap the RHS and eliminate the xor. We
// can only do this if the top bits are known zero.
unsigned BitWidth = N0.getValueSizeInBits();
if (DAG.MaskedValueIsZero(N0,
APInt::getHighBitsSet(BitWidth,
BitWidth-1))) {
// Okay, get the un-inverted input value.
SDValue Val;
if (N0.getOpcode() == ISD::XOR) {
Val = N0.getOperand(0);
} else {
assert(N0.getOpcode() == ISD::AND &&
N0.getOperand(0).getOpcode() == ISD::XOR);
// ((X^1)&1)^1 -> X & 1
Val = DAG.getNode(ISD::AND, dl, N0.getValueType(),
N0.getOperand(0).getOperand(0),
N0.getOperand(1));
}
return DAG.getSetCC(dl, VT, Val, N1,
Cond == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ);
}
} else if (N1C->isOne() &&
(VT == MVT::i1 ||
getBooleanContents(N0->getValueType(0)) ==
ZeroOrOneBooleanContent)) {
SDValue Op0 = N0;
if (Op0.getOpcode() == ISD::TRUNCATE)
Op0 = Op0.getOperand(0);
if ((Op0.getOpcode() == ISD::XOR) &&
Op0.getOperand(0).getOpcode() == ISD::SETCC &&
Op0.getOperand(1).getOpcode() == ISD::SETCC) {
// (xor (setcc), (setcc)) == / != 1 -> (setcc) != / == (setcc)
Cond = (Cond == ISD::SETEQ) ? ISD::SETNE : ISD::SETEQ;
return DAG.getSetCC(dl, VT, Op0.getOperand(0), Op0.getOperand(1),
Cond);
}
if (Op0.getOpcode() == ISD::AND &&
isa<ConstantSDNode>(Op0.getOperand(1)) &&
cast<ConstantSDNode>(Op0.getOperand(1))->isOne()) {
// If this is (X&1) == / != 1, normalize it to (X&1) != / == 0.
if (Op0.getValueType().bitsGT(VT))
Op0 = DAG.getNode(ISD::AND, dl, VT,
DAG.getNode(ISD::TRUNCATE, dl, VT, Op0.getOperand(0)),
DAG.getConstant(1, dl, VT));
else if (Op0.getValueType().bitsLT(VT))
Op0 = DAG.getNode(ISD::AND, dl, VT,
DAG.getNode(ISD::ANY_EXTEND, dl, VT, Op0.getOperand(0)),
DAG.getConstant(1, dl, VT));
return DAG.getSetCC(dl, VT, Op0,
DAG.getConstant(0, dl, Op0.getValueType()),
Cond == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ);
}
if (Op0.getOpcode() == ISD::AssertZext &&
cast<VTSDNode>(Op0.getOperand(1))->getVT() == MVT::i1)
return DAG.getSetCC(dl, VT, Op0,
DAG.getConstant(0, dl, Op0.getValueType()),
Cond == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ);
}
}
// Given:
// icmp eq/ne (urem %x, %y), 0
// Iff %x has 0 or 1 bits set, and %y has at least 2 bits set, omit 'urem':
// icmp eq/ne %x, 0
if (N0.getOpcode() == ISD::UREM && N1C->isNullValue() &&
(Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
KnownBits XKnown = DAG.computeKnownBits(N0.getOperand(0));
KnownBits YKnown = DAG.computeKnownBits(N0.getOperand(1));
if (XKnown.countMaxPopulation() == 1 && YKnown.countMinPopulation() >= 2)
return DAG.getSetCC(dl, VT, N0.getOperand(0), N1, Cond);
}
if (SDValue V =
optimizeSetCCOfSignedTruncationCheck(VT, N0, N1, Cond, DCI, dl))
return V;
}
// These simplifications apply to splat vectors as well.
// TODO: Handle more splat vector cases.
if (auto *N1C = isConstOrConstSplat(N1)) {
const APInt &C1 = N1C->getAPIntValue();
APInt MinVal, MaxVal;
unsigned OperandBitSize = N1C->getValueType(0).getScalarSizeInBits();
if (ISD::isSignedIntSetCC(Cond)) {
MinVal = APInt::getSignedMinValue(OperandBitSize);
MaxVal = APInt::getSignedMaxValue(OperandBitSize);
} else {
MinVal = APInt::getMinValue(OperandBitSize);
MaxVal = APInt::getMaxValue(OperandBitSize);
}
// Canonicalize GE/LE comparisons to use GT/LT comparisons.
if (Cond == ISD::SETGE || Cond == ISD::SETUGE) {
// X >= MIN --> true
if (C1 == MinVal)
return DAG.getBoolConstant(true, dl, VT, OpVT);
if (!VT.isVector()) { // TODO: Support this for vectors.
// X >= C0 --> X > (C0 - 1)
APInt C = C1 - 1;
ISD::CondCode NewCC = (Cond == ISD::SETGE) ? ISD::SETGT : ISD::SETUGT;
if ((DCI.isBeforeLegalizeOps() ||
isCondCodeLegal(NewCC, VT.getSimpleVT())) &&
(!N1C->isOpaque() || (C.getBitWidth() <= 64 &&
isLegalICmpImmediate(C.getSExtValue())))) {
return DAG.getSetCC(dl, VT, N0,
DAG.getConstant(C, dl, N1.getValueType()),
NewCC);
}
}
}
if (Cond == ISD::SETLE || Cond == ISD::SETULE) {
// X <= MAX --> true
if (C1 == MaxVal)
return DAG.getBoolConstant(true, dl, VT, OpVT);
// X <= C0 --> X < (C0 + 1)
if (!VT.isVector()) { // TODO: Support this for vectors.
APInt C = C1 + 1;
ISD::CondCode NewCC = (Cond == ISD::SETLE) ? ISD::SETLT : ISD::SETULT;
if ((DCI.isBeforeLegalizeOps() ||
isCondCodeLegal(NewCC, VT.getSimpleVT())) &&
(!N1C->isOpaque() || (C.getBitWidth() <= 64 &&
isLegalICmpImmediate(C.getSExtValue())))) {
return DAG.getSetCC(dl, VT, N0,
DAG.getConstant(C, dl, N1.getValueType()),
NewCC);
}
}
}
if (Cond == ISD::SETLT || Cond == ISD::SETULT) {
if (C1 == MinVal)
return DAG.getBoolConstant(false, dl, VT, OpVT); // X < MIN --> false
// TODO: Support this for vectors after legalize ops.
if (!VT.isVector() || DCI.isBeforeLegalizeOps()) {
// Canonicalize setlt X, Max --> setne X, Max
if (C1 == MaxVal)
return DAG.getSetCC(dl, VT, N0, N1, ISD::SETNE);
// If we have setult X, 1, turn it into seteq X, 0
if (C1 == MinVal+1)
return DAG.getSetCC(dl, VT, N0,
DAG.getConstant(MinVal, dl, N0.getValueType()),
ISD::SETEQ);
}
}
if (Cond == ISD::SETGT || Cond == ISD::SETUGT) {
if (C1 == MaxVal)
return DAG.getBoolConstant(false, dl, VT, OpVT); // X > MAX --> false
// TODO: Support this for vectors after legalize ops.
if (!VT.isVector() || DCI.isBeforeLegalizeOps()) {
// Canonicalize setgt X, Min --> setne X, Min
if (C1 == MinVal)
return DAG.getSetCC(dl, VT, N0, N1, ISD::SETNE);
// If we have setugt X, Max-1, turn it into seteq X, Max
if (C1 == MaxVal-1)
return DAG.getSetCC(dl, VT, N0,
DAG.getConstant(MaxVal, dl, N0.getValueType()),
ISD::SETEQ);
}
}
// If we have "setcc X, C0", check to see if we can shrink the immediate
// by changing cc.
// TODO: Support this for vectors after legalize ops.
if (!VT.isVector() || DCI.isBeforeLegalizeOps()) {
// SETUGT X, SINTMAX -> SETLT X, 0
if (Cond == ISD::SETUGT &&
C1 == APInt::getSignedMaxValue(OperandBitSize))
return DAG.getSetCC(dl, VT, N0,
DAG.getConstant(0, dl, N1.getValueType()),
ISD::SETLT);
// SETULT X, SINTMIN -> SETGT X, -1
if (Cond == ISD::SETULT &&
C1 == APInt::getSignedMinValue(OperandBitSize)) {
SDValue ConstMinusOne =
DAG.getConstant(APInt::getAllOnesValue(OperandBitSize), dl,
N1.getValueType());
return DAG.getSetCC(dl, VT, N0, ConstMinusOne, ISD::SETGT);
}
}
}
// Back to non-vector simplifications.
// TODO: Can we do these for vector splats?
if (auto *N1C = dyn_cast<ConstantSDNode>(N1.getNode())) {
const APInt &C1 = N1C->getAPIntValue();
// Fold bit comparisons when we can.
if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
(VT == N0.getValueType() ||
(isTypeLegal(VT) && VT.bitsLE(N0.getValueType()))) &&
N0.getOpcode() == ISD::AND) {
auto &DL = DAG.getDataLayout();
if (auto *AndRHS = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
EVT ShiftTy = getShiftAmountTy(N0.getValueType(), DL,
!DCI.isBeforeLegalize());
if (Cond == ISD::SETNE && C1 == 0) {// (X & 8) != 0 --> (X & 8) >> 3
// Perform the xform if the AND RHS is a single bit.
if (AndRHS->getAPIntValue().isPowerOf2()) {
return DAG.getNode(ISD::TRUNCATE, dl, VT,
DAG.getNode(ISD::SRL, dl, N0.getValueType(), N0,
DAG.getConstant(AndRHS->getAPIntValue().logBase2(), dl,
ShiftTy)));
}
} else if (Cond == ISD::SETEQ && C1 == AndRHS->getAPIntValue()) {
// (X & 8) == 8 --> (X & 8) >> 3
// Perform the xform if C1 is a single bit.
if (C1.isPowerOf2()) {
return DAG.getNode(ISD::TRUNCATE, dl, VT,
DAG.getNode(ISD::SRL, dl, N0.getValueType(), N0,
DAG.getConstant(C1.logBase2(), dl,
ShiftTy)));
}
}
}
}
if (C1.getMinSignedBits() <= 64 &&
!isLegalICmpImmediate(C1.getSExtValue())) {
// (X & -256) == 256 -> (X >> 8) == 1
if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
if (auto *AndRHS = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
const APInt &AndRHSC = AndRHS->getAPIntValue();
if ((-AndRHSC).isPowerOf2() && (AndRHSC & C1) == C1) {
unsigned ShiftBits = AndRHSC.countTrailingZeros();
auto &DL = DAG.getDataLayout();
EVT ShiftTy = getShiftAmountTy(N0.getValueType(), DL,
!DCI.isBeforeLegalize());
EVT CmpTy = N0.getValueType();
SDValue Shift = DAG.getNode(ISD::SRL, dl, CmpTy, N0.getOperand(0),
DAG.getConstant(ShiftBits, dl,
ShiftTy));
SDValue CmpRHS = DAG.getConstant(C1.lshr(ShiftBits), dl, CmpTy);
return DAG.getSetCC(dl, VT, Shift, CmpRHS, Cond);
}
}
} else if (Cond == ISD::SETULT || Cond == ISD::SETUGE ||
Cond == ISD::SETULE || Cond == ISD::SETUGT) {
bool AdjOne = (Cond == ISD::SETULE || Cond == ISD::SETUGT);
// X < 0x100000000 -> (X >> 32) < 1
// X >= 0x100000000 -> (X >> 32) >= 1
// X <= 0x0ffffffff -> (X >> 32) < 1
// X > 0x0ffffffff -> (X >> 32) >= 1
unsigned ShiftBits;
APInt NewC = C1;
ISD::CondCode NewCond = Cond;
if (AdjOne) {
ShiftBits = C1.countTrailingOnes();
NewC = NewC + 1;
NewCond = (Cond == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
} else {
ShiftBits = C1.countTrailingZeros();
}
NewC.lshrInPlace(ShiftBits);
if (ShiftBits && NewC.getMinSignedBits() <= 64 &&
isLegalICmpImmediate(NewC.getSExtValue())) {
auto &DL = DAG.getDataLayout();
EVT ShiftTy = getShiftAmountTy(N0.getValueType(), DL,
!DCI.isBeforeLegalize());
EVT CmpTy = N0.getValueType();
SDValue Shift = DAG.getNode(ISD::SRL, dl, CmpTy, N0,
DAG.getConstant(ShiftBits, dl, ShiftTy));
SDValue CmpRHS = DAG.getConstant(NewC, dl, CmpTy);
return DAG.getSetCC(dl, VT, Shift, CmpRHS, NewCond);
}
}
}
}
if (!isa<ConstantFPSDNode>(N0) && isa<ConstantFPSDNode>(N1)) {
auto *CFP = cast<ConstantFPSDNode>(N1);
assert(!CFP->getValueAPF().isNaN() && "Unexpected NaN value");
// Otherwise, we know the RHS is not a NaN. Simplify the node to drop the
// constant if knowing that the operand is non-nan is enough. We prefer to
// have SETO(x,x) instead of SETO(x, 0.0) because this avoids having to
// materialize 0.0.
if (Cond == ISD::SETO || Cond == ISD::SETUO)
return DAG.getSetCC(dl, VT, N0, N0, Cond);
// setcc (fneg x), C -> setcc swap(pred) x, -C
if (N0.getOpcode() == ISD::FNEG) {
ISD::CondCode SwapCond = ISD::getSetCCSwappedOperands(Cond);
if (DCI.isBeforeLegalizeOps() ||
isCondCodeLegal(SwapCond, N0.getSimpleValueType())) {
SDValue NegN1 = DAG.getNode(ISD::FNEG, dl, N0.getValueType(), N1);
return DAG.getSetCC(dl, VT, N0.getOperand(0), NegN1, SwapCond);
}
}
// If the condition is not legal, see if we can find an equivalent one
// which is legal.
if (!isCondCodeLegal(Cond, N0.getSimpleValueType())) {
// If the comparison was an awkward floating-point == or != and one of
// the comparison operands is infinity or negative infinity, convert the
// condition to a less-awkward <= or >=.
if (CFP->getValueAPF().isInfinity()) {
if (CFP->getValueAPF().isNegative()) {
if (Cond == ISD::SETOEQ &&
isCondCodeLegal(ISD::SETOLE, N0.getSimpleValueType()))
return DAG.getSetCC(dl, VT, N0, N1, ISD::SETOLE);
if (Cond == ISD::SETUEQ &&
isCondCodeLegal(ISD::SETOLE, N0.getSimpleValueType()))
return DAG.getSetCC(dl, VT, N0, N1, ISD::SETULE);
if (Cond == ISD::SETUNE &&
isCondCodeLegal(ISD::SETUGT, N0.getSimpleValueType()))
return DAG.getSetCC(dl, VT, N0, N1, ISD::SETUGT);
if (Cond == ISD::SETONE &&
isCondCodeLegal(ISD::SETUGT, N0.getSimpleValueType()))
return DAG.getSetCC(dl, VT, N0, N1, ISD::SETOGT);
} else {
if (Cond == ISD::SETOEQ &&
isCondCodeLegal(ISD::SETOGE, N0.getSimpleValueType()))
return DAG.getSetCC(dl, VT, N0, N1, ISD::SETOGE);
if (Cond == ISD::SETUEQ &&
isCondCodeLegal(ISD::SETOGE, N0.getSimpleValueType()))
return DAG.getSetCC(dl, VT, N0, N1, ISD::SETUGE);
if (Cond == ISD::SETUNE &&
isCondCodeLegal(ISD::SETULT, N0.getSimpleValueType()))
return DAG.getSetCC(dl, VT, N0, N1, ISD::SETULT);
if (Cond == ISD::SETONE &&
isCondCodeLegal(ISD::SETULT, N0.getSimpleValueType()))
return DAG.getSetCC(dl, VT, N0, N1, ISD::SETOLT);
}
}
}
}
if (N0 == N1) {
// The sext(setcc()) => setcc() optimization relies on the appropriate
// constant being emitted.
assert(!N0.getValueType().isInteger() &&
"Integer types should be handled by FoldSetCC");
bool EqTrue = ISD::isTrueWhenEqual(Cond);
unsigned UOF = ISD::getUnorderedFlavor(Cond);
if (UOF == 2) // FP operators that are undefined on NaNs.
return DAG.getBoolConstant(EqTrue, dl, VT, OpVT);
if (UOF == unsigned(EqTrue))
return DAG.getBoolConstant(EqTrue, dl, VT, OpVT);
// Otherwise, we can't fold it. However, we can simplify it to SETUO/SETO
// if it is not already.
ISD::CondCode NewCond = UOF == 0 ? ISD::SETO : ISD::SETUO;
if (NewCond != Cond &&
(DCI.isBeforeLegalizeOps() ||
isCondCodeLegal(NewCond, N0.getSimpleValueType())))
return DAG.getSetCC(dl, VT, N0, N1, NewCond);
}
if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
N0.getValueType().isInteger()) {
if (N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::SUB ||
N0.getOpcode() == ISD::XOR) {
// Simplify (X+Y) == (X+Z) --> Y == Z
if (N0.getOpcode() == N1.getOpcode()) {
if (N0.getOperand(0) == N1.getOperand(0))
return DAG.getSetCC(dl, VT, N0.getOperand(1), N1.getOperand(1), Cond);
if (N0.getOperand(1) == N1.getOperand(1))
return DAG.getSetCC(dl, VT, N0.getOperand(0), N1.getOperand(0), Cond);
if (isCommutativeBinOp(N0.getOpcode())) {
// If X op Y == Y op X, try other combinations.
if (N0.getOperand(0) == N1.getOperand(1))
return DAG.getSetCC(dl, VT, N0.getOperand(1), N1.getOperand(0),
Cond);
if (N0.getOperand(1) == N1.getOperand(0))
return DAG.getSetCC(dl, VT, N0.getOperand(0), N1.getOperand(1),
Cond);
}
}
// If RHS is a legal immediate value for a compare instruction, we need
// to be careful about increasing register pressure needlessly.
bool LegalRHSImm = false;
if (auto *RHSC = dyn_cast<ConstantSDNode>(N1)) {
if (auto *LHSR = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
// Turn (X+C1) == C2 --> X == C2-C1
if (N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse()) {
return DAG.getSetCC(dl, VT, N0.getOperand(0),
DAG.getConstant(RHSC->getAPIntValue()-
LHSR->getAPIntValue(),
dl, N0.getValueType()), Cond);
}
// Turn (X^C1) == C2 into X == C1^C2 iff X&~C1 = 0.
if (N0.getOpcode() == ISD::XOR)
// If we know that all of the inverted bits are zero, don't bother
// performing the inversion.
if (DAG.MaskedValueIsZero(N0.getOperand(0), ~LHSR->getAPIntValue()))
return
DAG.getSetCC(dl, VT, N0.getOperand(0),
DAG.getConstant(LHSR->getAPIntValue() ^
RHSC->getAPIntValue(),
dl, N0.getValueType()),
Cond);
}
// Turn (C1-X) == C2 --> X == C1-C2
if (auto *SUBC = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
if (N0.getOpcode() == ISD::SUB && N0.getNode()->hasOneUse()) {
return
DAG.getSetCC(dl, VT, N0.getOperand(1),
DAG.getConstant(SUBC->getAPIntValue() -
RHSC->getAPIntValue(),
dl, N0.getValueType()),
Cond);
}
}
// Could RHSC fold directly into a compare?
if (RHSC->getValueType(0).getSizeInBits() <= 64)
LegalRHSImm = isLegalICmpImmediate(RHSC->getSExtValue());
}
// (X+Y) == X --> Y == 0 and similar folds.
// Don't do this if X is an immediate that can fold into a cmp
// instruction and X+Y has other uses. It could be an induction variable
// chain, and the transform would increase register pressure.
if (!LegalRHSImm || N0.hasOneUse())
if (SDValue V = foldSetCCWithBinOp(VT, N0, N1, Cond, dl, DCI))
return V;
}
if (N1.getOpcode() == ISD::ADD || N1.getOpcode() == ISD::SUB ||
N1.getOpcode() == ISD::XOR)
if (SDValue V = foldSetCCWithBinOp(VT, N1, N0, Cond, dl, DCI))
return V;
if (SDValue V = foldSetCCWithAnd(VT, N0, N1, Cond, dl, DCI))
return V;
}
// Fold remainder of division by a constant.
if (N0.getOpcode() == ISD::UREM && N0.hasOneUse() &&
(Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
// When division is cheap or optimizing for minimum size,
// fall through to DIVREM creation by skipping this fold.
if (!isIntDivCheap(VT, Attr) && !Attr.hasFnAttribute(Attribute::MinSize))
if (SDValue Folded = buildUREMEqFold(VT, N0, N1, Cond, DCI, dl))
return Folded;
}
// Fold away ALL boolean setcc's.
if (N0.getValueType().getScalarType() == MVT::i1 && foldBooleans) {
SDValue Temp;
switch (Cond) {
default: llvm_unreachable("Unknown integer setcc!");
case ISD::SETEQ: // X == Y -> ~(X^Y)
Temp = DAG.getNode(ISD::XOR, dl, OpVT, N0, N1);
N0 = DAG.getNOT(dl, Temp, OpVT);
if (!DCI.isCalledByLegalizer())
DCI.AddToWorklist(Temp.getNode());
break;
case ISD::SETNE: // X != Y --> (X^Y)
N0 = DAG.getNode(ISD::XOR, dl, OpVT, N0, N1);
break;
case ISD::SETGT: // X >s Y --> X == 0 & Y == 1 --> ~X & Y
case ISD::SETULT: // X <u Y --> X == 0 & Y == 1 --> ~X & Y
Temp = DAG.getNOT(dl, N0, OpVT);
N0 = DAG.getNode(ISD::AND, dl, OpVT, N1, Temp);
if (!DCI.isCalledByLegalizer())
DCI.AddToWorklist(Temp.getNode());
break;
case ISD::SETLT: // X <s Y --> X == 1 & Y == 0 --> ~Y & X
case ISD::SETUGT: // X >u Y --> X == 1 & Y == 0 --> ~Y & X
Temp = DAG.getNOT(dl, N1, OpVT);
N0 = DAG.getNode(ISD::AND, dl, OpVT, N0, Temp);
if (!DCI.isCalledByLegalizer())
DCI.AddToWorklist(Temp.getNode());
break;
case ISD::SETULE: // X <=u Y --> X == 0 | Y == 1 --> ~X | Y
case ISD::SETGE: // X >=s Y --> X == 0 | Y == 1 --> ~X | Y
Temp = DAG.getNOT(dl, N0, OpVT);
N0 = DAG.getNode(ISD::OR, dl, OpVT, N1, Temp);
if (!DCI.isCalledByLegalizer())
DCI.AddToWorklist(Temp.getNode());
break;
case ISD::SETUGE: // X >=u Y --> X == 1 | Y == 0 --> ~Y | X
case ISD::SETLE: // X <=s Y --> X == 1 | Y == 0 --> ~Y | X
Temp = DAG.getNOT(dl, N1, OpVT);
N0 = DAG.getNode(ISD::OR, dl, OpVT, N0, Temp);
break;
}
if (VT.getScalarType() != MVT::i1) {
if (!DCI.isCalledByLegalizer())
DCI.AddToWorklist(N0.getNode());
// FIXME: If running after legalize, we probably can't do this.
ISD::NodeType ExtendCode = getExtendForContent(getBooleanContents(OpVT));
N0 = DAG.getNode(ExtendCode, dl, VT, N0);
}
return N0;
}
// Could not fold it.
return SDValue();
}
/// Decide whether \p WN (after unwrapAddress) is a global address, or an
/// ISD::ADD of such an address and a constant.
///
/// On success \p GA receives the global and the matched displacement is
/// *added* to \p Offset (the incoming value is not reset). Note that \p GA and
/// \p Offset may already have been updated by a recursive match even when the
/// function ends up returning false.
bool TargetLowering::isGAPlusOffset(SDNode *WN, const GlobalValue *&GA,
                                    int64_t &Offset) const {
  SDNode *Addr = unwrapAddress(SDValue(WN, 0)).getNode();

  // Base case: a plain global address node.
  if (auto *GlobalNode = dyn_cast<GlobalAddressSDNode>(Addr)) {
    GA = GlobalNode->getGlobal();
    Offset += GlobalNode->getOffset();
    return true;
  }

  if (Addr->getOpcode() != ISD::ADD)
    return false;

  SDValue LHS = Addr->getOperand(0);
  SDValue RHS = Addr->getOperand(1);

  // Whichever operand matches as the base, the *other* one has to be a
  // constant. The second operand is only tried as a base when the first one
  // did not match.
  ConstantSDNode *Addend = nullptr;
  if (isGAPlusOffset(LHS.getNode(), GA, Offset))
    Addend = dyn_cast<ConstantSDNode>(RHS);
  else if (isGAPlusOffset(RHS.getNode(), GA, Offset))
    Addend = dyn_cast<ConstantSDNode>(LHS);

  if (!Addend)
    return false;

  Offset += Addend->getSExtValue();
  return true;
}
/// Default implementation of the DAG-combine entry point: it ignores both
/// \p N and \p DCI and reports "nothing changed" via an empty SDValue.
SDValue TargetLowering::PerformDAGCombine(SDNode *N,
                                          DAGCombinerInfo &DCI) const {
  return SDValue(); // No optimization is attempted here.
}
//===----------------------------------------------------------------------===//
// Inline Assembler Implementation Methods
//===----------------------------------------------------------------------===//
/// Classify an inline-asm constraint string.
///
/// Single-character constraints are classified by letter in the switch below.
/// A constraint wrapped in braces is C_Register, except the exact string
/// "{memory}", which is C_Memory. Everything else is C_Unknown.
///
/// The lines carrying '-' / '+' markers belong to the change shown in this
/// listing: with the '+' lines applied, 'n', 'E' and 'F' yield C_Immediate,
/// while 'i' joins the group that yields C_Other.
TargetLowering::ConstraintType
TargetLowering::getConstraintType(StringRef Constraint) const {
unsigned S = Constraint.size();
if (S == 1) {
switch (Constraint[0]) {
// Unlisted letters leave the switch and end up as C_Unknown below (a single
// character can never satisfy the brace check).
default: break;
- case 'r': return C_RegisterClass;
+ case 'r':
+ return C_RegisterClass;
case 'm': // memory
case 'o': // offsetable
case 'V': // not offsetable
return C_Memory;
- case 'i': // Simple Integer or Relocatable Constant
case 'n': // Simple Integer
case 'E': // Floating Point Constant
case 'F': // Floating Point Constant
+ return C_Immediate;
+ case 'i': // Simple Integer or Relocatable Constant
case 's': // Relocatable Constant
case 'p': // Address.
case 'X': // Allow ANY value.
case 'I': // Target registers.
case 'J':
case 'K':
case 'L':
case 'M':
case 'N':
case 'O':
case 'P':
case '<':
case '>':
return C_Other;
}
}
// Brace-enclosed constraints, e.g. "{name}".
if (S > 1 && Constraint[0] == '{' && Constraint[S - 1] == '}') {
if (S == 8 && Constraint.substr(1, 6) == "memory") // "{memory}"
return C_Memory;
return C_Register;
}
return C_Unknown;
}
/// Pick a more specific replacement for the catch-all "X" constraint, based
/// solely on the operand's value type: "r" for integer types, "f" for
/// floating-point types, and nullptr when neither applies.
const char *TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
  if (ConstraintVT.isInteger())
    return "r";
  // "f" works for many targets.
  return ConstraintVT.isFloatingPoint() ? "f" : nullptr;
}
/// Default implementation: none of the arguments are inspected and an empty
/// SDValue is returned.
SDValue TargetLowering::LowerAsmOutputForConstraint(
    SDValue &Chain, SDValue &Flag, SDLoc DL, const AsmOperandInfo &OpInfo,
    SelectionDAG &DAG) const {
  return SDValue(); // Nothing is lowered here.
}
/// Lower the specified operand into the Ops vector.
/// If it is invalid, don't add anything to Ops.
///
/// Only single-character constraints are considered here, and of those only
/// 'X', 'i', 'n' and 's'. On success exactly one target node (global address,
/// constant or block address) is appended to \p Ops; callers detect failure by
/// \p Ops being left unchanged.
void TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
std::string &Constraint,
std::vector<SDValue> &Ops,
SelectionDAG &DAG) const {
// Multi-letter constraints are not handled by this implementation.
if (Constraint.length() > 1) return;
char ConstraintLetter = Constraint[0];
switch (ConstraintLetter) {
default: break;
case 'X': // Allows any operand; labels (basic block) use this.
if (Op.getOpcode() == ISD::BasicBlock ||
Op.getOpcode() == ISD::TargetBlockAddress) {
Ops.push_back(Op);
return;
}
// Any other 'X' operand is treated like an immediate below.
LLVM_FALLTHROUGH;
case 'i': // Simple Integer or Relocatable Constant
case 'n': // Simple Integer
case 's': { // Relocatable Constant
GlobalAddressSDNode *GA;
ConstantSDNode *C;
BlockAddressSDNode *BA;
// Sum of the constant displacements peeled off so far.
uint64_t Offset = 0;
// Match (GA) or (C) or (GA+C) or (GA-C) or ((GA+C)+C) or (((GA+C)+C)+C),
// etc., since getelementpointer is variadic. We can't use
// SelectionDAG::FoldSymbolOffset because it expects the GA to be accessible
// while in this case the GA may be furthest from the root node which is
// likely an ISD::ADD.
while (1) {
// 'n' only accepts plain integers, so symbols are rejected for it.
if ((GA = dyn_cast<GlobalAddressSDNode>(Op)) && ConstraintLetter != 'n') {
Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
GA->getValueType(0),
Offset + GA->getOffset()));
return;
// 's' only accepts relocatable constants, so plain integers are rejected.
} else if ((C = dyn_cast<ConstantSDNode>(Op)) &&
ConstraintLetter != 's') {
// gcc prints these as sign extended. Sign extend value to 64 bits
// now; without this it would get ZExt'd later in
// ScheduleDAGSDNodes::EmitNode, which is very generic.
// 1-bit constants instead follow the extension implied by the boolean
// contents reported for MVT::i64.
bool IsBool = C->getConstantIntValue()->getBitWidth() == 1;
BooleanContent BCont = getBooleanContents(MVT::i64);
ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
: ISD::SIGN_EXTEND;
int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? C->getZExtValue()
: C->getSExtValue();
Ops.push_back(DAG.getTargetConstant(Offset + ExtVal,
SDLoc(C), MVT::i64));
return;
} else if ((BA = dyn_cast<BlockAddressSDNode>(Op)) &&
ConstraintLetter != 'n') {
Ops.push_back(DAG.getTargetBlockAddress(
BA->getBlockAddress(), BA->getValueType(0),
Offset + BA->getOffset(), BA->getTargetFlags()));
return;
} else {
// Peel one constant off an ADD/SUB and keep looking at the other operand.
const unsigned OpCode = Op.getOpcode();
if (OpCode == ISD::ADD || OpCode == ISD::SUB) {
// NOTE(review): for ISD::SUB only operand 0 is accepted as the constant,
// i.e. the node has the shape (C - X), yet the loop continues with X and
// subtracts C from Offset, which corresponds to (X - C). Confirm whether
// operand 1 was meant to be the constant for the SUB case.
if ((C = dyn_cast<ConstantSDNode>(Op.getOperand(0))))
Op = Op.getOperand(1);
// Subtraction is not commutative.
else if (OpCode == ISD::ADD &&
(C = dyn_cast<ConstantSDNode>(Op.getOperand(1))))
Op = Op.getOperand(0);
else
return;
Offset += (OpCode == ISD::ADD ? 1 : -1) * C->getSExtValue();
continue;
}
}
// Nothing recognisable: leave Ops untouched.
return;
}
break;
}
}
}
std::pair<unsigned, const TargetRegisterClass *>
TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *RI,
StringRef Constraint,
MVT VT) const {
if (Constraint.empty() || Constraint[0] != '{')
return std::make_pair(0u, static_cast<TargetRegisterClass *>(nullptr));
assert(*(Constraint.end() - 1) == '}' && "Not a brace enclosed constraint?");
// Remove the braces from around the name.
StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
std::pair<unsigned, const TargetRegisterClass *> R =
std::make_pair(0u, static_cast<const TargetRegisterClass *>(nullptr));
// Figure out which register class contains this reg.
for (const TargetRegisterClass *RC : RI->regclasses()) {
// If none of the value types for this register class are valid, we
// can't use it. For example, 64-bit reg classes on 32-bit targets.
if (!isLegalRC(*RI, *RC))
continue;
for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
I != E; ++I) {
if (RegName.equals_lower(RI->getRegAsmName(*I))) {
std::pair<unsigned, const TargetRegisterClass *> S =
std::make_pair(*I, RC);
// If this register class has the requested value type, return it,
// otherwise keep searching and return the first class found
// if no other is found which explicitly has the requested type.
if (RI->isTypeLegalForClass(*RC, VT))
return S;
if (!R.second)
R = S;
}
}
}
return R;
}
//===----------------------------------------------------------------------===//
// Constraint Selection.
/// Return true if this is an input operand that is a matching constraint like
/// "4", i.e. its constraint code begins with a decimal digit.
bool TargetLowering::AsmOperandInfo::isMatchingInputConstraint() const {
  assert(!ConstraintCode.empty() && "No known constraint!");
  // Go through unsigned char before handing the character to isdigit.
  const unsigned char FirstChar = static_cast<unsigned char>(ConstraintCode[0]);
  return isdigit(FirstChar);
}
/// For an input matching constraint, return the index of the output operand
/// it is tied to, obtained by parsing the constraint code as a decimal number.
unsigned TargetLowering::AsmOperandInfo::getMatchedOperand() const {
  assert(!ConstraintCode.empty() && "No known constraint!");
  const int OperandIndex = atoi(ConstraintCode.c_str());
  return OperandIndex;
}
/// Split up the constraint string from the inline assembly value into the
/// specific constraints and their prefixes, and also tie in the associated
/// operand values.
/// If this returns an empty vector, and if the constraint string itself
/// isn't empty, there was an error parsing.
///
/// The work happens in three passes over the operands:
///  1. build one AsmOperandInfo per constraint, attach the call argument (if
///     any) and compute its ConstraintVT;
///  2. when multiple-alternative constraints are present, score every
///     alternative and select the best one in each operand;
///  3. validate tied (matching) operands, aborting via report_fatal_error on
///     an incompatible pair.
TargetLowering::AsmOperandInfoVector
TargetLowering::ParseConstraints(const DataLayout &DL,
const TargetRegisterInfo *TRI,
ImmutableCallSite CS) const {
/// Information about all of the constraints.
AsmOperandInfoVector ConstraintOperands;
const InlineAsm *IA = cast<InlineAsm>(CS.getCalledValue());
unsigned maCount = 0; // Largest number of multiple alternative constraints.
// Do a prepass over the constraints, canonicalizing them, and building up the
// ConstraintOperands list.
unsigned ArgNo = 0; // ArgNo - The argument of the CallInst.
unsigned ResNo = 0; // ResNo - The result number of the next output.
for (InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) {
ConstraintOperands.emplace_back(std::move(CI));
AsmOperandInfo &OpInfo = ConstraintOperands.back();
// Update multiple alternative constraint count.
if (OpInfo.multipleAlternatives.size() > maCount)
maCount = OpInfo.multipleAlternatives.size();
// MVT::Other is the placeholder until a real type is computed below.
OpInfo.ConstraintVT = MVT::Other;
// Compute the value type for each operand.
switch (OpInfo.Type) {
case InlineAsm::isOutput:
// Indirect outputs just consume an argument.
if (OpInfo.isIndirect) {
OpInfo.CallOperandVal = const_cast<Value *>(CS.getArgument(ArgNo++));
break;
}
// The return value of the call is this value. As such, there is no
// corresponding argument.
assert(!CS.getType()->isVoidTy() &&
"Bad inline asm!");
// With a struct return type, each direct output maps to one struct element.
if (StructType *STy = dyn_cast<StructType>(CS.getType())) {
OpInfo.ConstraintVT =
getSimpleValueType(DL, STy->getElementType(ResNo));
} else {
assert(ResNo == 0 && "Asm only has one result!");
OpInfo.ConstraintVT = getSimpleValueType(DL, CS.getType());
}
++ResNo;
break;
case InlineAsm::isInput:
OpInfo.CallOperandVal = const_cast<Value *>(CS.getArgument(ArgNo++));
break;
case InlineAsm::isClobber:
// Nothing to do.
break;
}
// Operands backed by a call argument derive their type from that argument.
if (OpInfo.CallOperandVal) {
llvm::Type *OpTy = OpInfo.CallOperandVal->getType();
if (OpInfo.isIndirect) {
// Indirect operands are passed by pointer; use the pointee type.
llvm::PointerType *PtrTy = dyn_cast<PointerType>(OpTy);
if (!PtrTy)
report_fatal_error("Indirect operand for inline asm not a pointer!");
OpTy = PtrTy->getElementType();
}
// Look for vector wrapped in a struct. e.g. { <16 x i8> }.
if (StructType *STy = dyn_cast<StructType>(OpTy))
if (STy->getNumElements() == 1)
OpTy = STy->getElementType(0);
// If OpTy is not a single value, it may be a struct/union that we
// can tile with integers.
if (!OpTy->isSingleValueType() && OpTy->isSized()) {
unsigned BitSize = DL.getTypeSizeInBits(OpTy);
// Only the listed sizes are mapped to an integer type; for any other size
// ConstraintVT keeps the value assigned above.
switch (BitSize) {
default: break;
case 1:
case 8:
case 16:
case 32:
case 64:
case 128:
OpInfo.ConstraintVT =
MVT::getVT(IntegerType::get(OpTy->getContext(), BitSize), true);
break;
}
} else if (PointerType *PT = dyn_cast<PointerType>(OpTy)) {
// Pointers are represented as integers of the address space's pointer size.
unsigned PtrSize = DL.getPointerSizeInBits(PT->getAddressSpace());
OpInfo.ConstraintVT = MVT::getIntegerVT(PtrSize);
} else {
OpInfo.ConstraintVT = MVT::getVT(OpTy, true);
}
}
}
// If we have multiple alternative constraints, select the best alternative.
if (!ConstraintOperands.empty()) {
if (maCount) {
// If no alternative scores above -1, index 0 remains the selection.
unsigned bestMAIndex = 0;
int bestWeight = -1;
// weight: -1 = invalid match, and 0 = so-so match to 5 = good match.
int weight = -1;
unsigned maIndex;
// Compute the sums of the weights for each alternative, keeping track
// of the best (highest weight) one so far.
for (maIndex = 0; maIndex < maCount; ++maIndex) {
int weightSum = 0;
for (unsigned cIndex = 0, eIndex = ConstraintOperands.size();
cIndex != eIndex; ++cIndex) {
AsmOperandInfo &OpInfo = ConstraintOperands[cIndex];
if (OpInfo.Type == InlineAsm::isClobber)
continue;
// If this is an output operand with a matching input operand,
// look up the matching input. If their types mismatch, e.g. one
// is an integer, the other is floating point, or their sizes are
// different, flag it as an maCantMatch.
if (OpInfo.hasMatchingInput()) {
AsmOperandInfo &Input = ConstraintOperands[OpInfo.MatchingInput];
if (OpInfo.ConstraintVT != Input.ConstraintVT) {
if ((OpInfo.ConstraintVT.isInteger() !=
Input.ConstraintVT.isInteger()) ||
(OpInfo.ConstraintVT.getSizeInBits() !=
Input.ConstraintVT.getSizeInBits())) {
weightSum = -1; // Can't match.
break;
}
}
}
weight = getMultipleConstraintMatchWeight(OpInfo, maIndex);
// A single unmatchable operand disqualifies the whole alternative.
if (weight == -1) {
weightSum = -1;
break;
}
weightSum += weight;
}
// Update best.
if (weightSum > bestWeight) {
bestWeight = weightSum;
bestMAIndex = maIndex;
}
}
// Now select chosen alternative in each constraint.
for (unsigned cIndex = 0, eIndex = ConstraintOperands.size();
cIndex != eIndex; ++cIndex) {
AsmOperandInfo &cInfo = ConstraintOperands[cIndex];
if (cInfo.Type == InlineAsm::isClobber)
continue;
cInfo.selectAlternative(bestMAIndex);
}
}
}
// Check and hook up tied operands, choose constraint code to use.
for (unsigned cIndex = 0, eIndex = ConstraintOperands.size();
cIndex != eIndex; ++cIndex) {
AsmOperandInfo &OpInfo = ConstraintOperands[cIndex];
// If this is an output operand with a matching input operand, look up the
// matching input. If their types mismatch, e.g. one is an integer, the
// other is floating point, or their sizes are different, flag it as an
// error.
if (OpInfo.hasMatchingInput()) {
AsmOperandInfo &Input = ConstraintOperands[OpInfo.MatchingInput];
if (OpInfo.ConstraintVT != Input.ConstraintVT) {
// Differing value types are tolerated as long as both sides are
// integer/non-integer alike and resolve to the same register class.
std::pair<unsigned, const TargetRegisterClass *> MatchRC =
getRegForInlineAsmConstraint(TRI, OpInfo.ConstraintCode,
OpInfo.ConstraintVT);
std::pair<unsigned, const TargetRegisterClass *> InputRC =
getRegForInlineAsmConstraint(TRI, Input.ConstraintCode,
Input.ConstraintVT);
if ((OpInfo.ConstraintVT.isInteger() !=
Input.ConstraintVT.isInteger()) ||
(MatchRC.second != InputRC.second)) {
report_fatal_error("Unsupported asm: input constraint"
" with a matching output constraint of"
" incompatible type!");
}
}
}
}
return ConstraintOperands;
}
/// Return an integer indicating how general CT is.
/// Larger values mean "more general": C_Memory (3) > C_RegisterClass (2) >
/// C_Register (1) > C_Other / C_Unknown (0). The line marked '+' makes
/// C_Immediate rank 0 as well.
static unsigned getConstraintGenerality(TargetLowering::ConstraintType CT) {
switch (CT) {
+ case TargetLowering::C_Immediate:
case TargetLowering::C_Other:
case TargetLowering::C_Unknown:
return 0;
case TargetLowering::C_Register:
return 1;
case TargetLowering::C_RegisterClass:
return 2;
case TargetLowering::C_Memory:
return 3;
}
// Reached only if CT holds a value not covered by the cases above.
llvm_unreachable("Invalid constraint type");
}
/// Compute the match weight of one multiple-alternative choice for an operand.
///
/// The constraint codes of alternative \p maIndex are used when that index is
/// below the number of alternatives recorded in \p info; otherwise the
/// operand's own code list is used. The result is the highest weight that
/// getSingleConstraintMatchWeight reports for any of those codes, or
/// CW_Invalid when none of them scores higher than that.
TargetLowering::ConstraintWeight
  TargetLowering::getMultipleConstraintMatchWeight(
    AsmOperandInfo &info, int maIndex) const {
  const bool HasAlternative = maIndex < (int)info.multipleAlternatives.size();
  InlineAsm::ConstraintCodeVector &Codes =
      HasAlternative ? info.multipleAlternatives[maIndex].Codes : info.Codes;

  ConstraintWeight Heaviest = CW_Invalid;
  for (const auto &Code : Codes) {
    const ConstraintWeight Current =
        getSingleConstraintMatchWeight(info, Code.c_str());
    if (Current > Heaviest)
      Heaviest = Current;
  }
  return Heaviest;
}
/// Weigh how well the operand in \p info fits a single constraint.
///
/// Only the first character of \p constraint is examined. Operands without a
/// call value get CW_Default. Otherwise the weight depends on the constraint
/// letter and, for immediates and registers, on the kind of IR value supplied;
/// a letter whose requirement is not met yields CW_Invalid.
TargetLowering::ConstraintWeight
  TargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  Value *Operand = info.CallOperandVal;
  // Without a value no real match can be made; allow it at the lowest weight.
  if (!Operand)
    return CW_Default;

  switch (*constraint) {
  case 'i': // immediate integer.
  case 'n': // immediate integer with a known value.
    return isa<ConstantInt>(Operand) ? CW_Constant : CW_Invalid;

  case 's': // non-explicit integral immediate.
    return isa<GlobalValue>(Operand) ? CW_Constant : CW_Invalid;

  case 'E': // immediate float if host format.
  case 'F': // immediate float.
    return isa<ConstantFP>(Operand) ? CW_Constant : CW_Invalid;

  case '<': // memory operand with autodecrement.
  case '>': // memory operand with autoincrement.
  case 'm': // memory operand.
  case 'o': // offsettable memory operand
  case 'V': // non-offsettable memory operand
    return CW_Memory;

  case 'r': // general register.
  case 'g': // general register, memory operand or immediate integer.
            // note: Clang converts "g" to "imr".
    return Operand->getType()->isIntegerTy() ? CW_Register : CW_Invalid;

  case 'X': // any operand.
  default:
    return CW_Default;
  }
}
/// If there are multiple different constraints that we could pick for this
/// operand (e.g. "imr") try to pick the 'best' one.
/// This is somewhat tricky: constraints fall into four classes:
/// Other -> immediates and magic values
/// Register -> one specific register
/// RegisterClass -> a group of regs
/// Memory -> memory
/// (With the '+' lines of this change, C_Immediate is handled exactly like
/// Other in this function.)
/// Ideally, we would pick the most specific constraint possible: if we have
/// something that fits into a register, we would pick it. The problem here
/// is that if we have something that could either be in a register or in
/// memory that use of the register could cause selection of *other*
/// operands to fail: they might only succeed if we pick memory. Because of
/// this the heuristic we use is:
///
/// 1) If there is an 'other' constraint, and if the operand is valid for
/// that constraint, use it. This makes us take advantage of 'i'
/// constraints when available.
/// 2) Otherwise, pick the most general constraint present. This prefers
/// 'm' over 'r', for example.
///
/// The result is written to OpInfo.ConstraintCode and OpInfo.ConstraintType.
static void ChooseConstraint(TargetLowering::AsmOperandInfo &OpInfo,
const TargetLowering &TLI,
SDValue Op, SelectionDAG *DAG) {
assert(OpInfo.Codes.size() > 1 && "Doesn't have multiple constraint options");
unsigned BestIdx = 0;
TargetLowering::ConstraintType BestType = TargetLowering::C_Unknown;
// Starts below every real generality so the first candidate always wins.
int BestGenerality = -1;
// Loop over the options, keeping track of the most general one.
for (unsigned i = 0, e = OpInfo.Codes.size(); i != e; ++i) {
TargetLowering::ConstraintType CType =
TLI.getConstraintType(OpInfo.Codes[i]);
- // If this is an 'other' constraint, see if the operand is valid for it.
- // For example, on X86 we might have an 'rI' constraint. If the operand
- // is an integer in the range [0..31] we want to use I (saving a load
- // of a register), otherwise we must use 'r'.
- if (CType == TargetLowering::C_Other && Op.getNode()) {
+ // If this is an 'other' or 'immediate' constraint, see if the operand is
+ // valid for it. For example, on X86 we might have an 'rI' constraint. If
+ // the operand is an integer in the range [0..31] we want to use I (saving a
+ // load of a register), otherwise we must use 'r'.
+ if ((CType == TargetLowering::C_Other ||
+ CType == TargetLowering::C_Immediate) && Op.getNode()) {
assert(OpInfo.Codes[i].size() == 1 &&
"Unhandled multi-letter 'other' constraint");
// Trial lowering: the constraint is usable iff it produces any operand.
std::vector<SDValue> ResultOps;
TLI.LowerAsmOperandForConstraint(Op, OpInfo.Codes[i],
ResultOps, *DAG);
if (!ResultOps.empty()) {
BestType = CType;
BestIdx = i;
break;
}
}
// Things with matching constraints can only be registers, per gcc
// documentation. This mainly affects "g" constraints.
if (CType == TargetLowering::C_Memory && OpInfo.hasMatchingInput())
continue;
// This constraint letter is more general than the previous one, use it.
int Generality = getConstraintGenerality(CType);
if (Generality > BestGenerality) {
BestType = CType;
BestIdx = i;
BestGenerality = Generality;
}
}
OpInfo.ConstraintCode = OpInfo.Codes[BestIdx];
OpInfo.ConstraintType = BestType;
}
/// Settle on the constraint code and constraint type for \p OpInfo, storing
/// them in OpInfo.ConstraintCode and OpInfo.ConstraintType.
///
/// A lone constraint code is taken as-is; with several codes the decision is
/// delegated to ChooseConstraint. If the outcome is the catch-all "X" and the
/// operand has an IR value, the code is then narrowed via LowerXConstraint
/// unless the value is a basic block, a constant integer, a function, or the
/// DAG operand is a TargetBlockAddress.
void TargetLowering::ComputeConstraintToUse(AsmOperandInfo &OpInfo,
                                            SDValue Op,
                                            SelectionDAG *DAG) const {
  assert(!OpInfo.Codes.empty() && "Must have at least one constraint");

  if (OpInfo.Codes.size() != 1) {
    ChooseConstraint(OpInfo, *this, Op, DAG);
  } else {
    // Single-letter constraints ('r') are very common.
    OpInfo.ConstraintCode = OpInfo.Codes[0];
    OpInfo.ConstraintType = getConstraintType(OpInfo.ConstraintCode);
  }

  // Only 'X' (matches anything) with an attached IR value needs more work.
  if (OpInfo.ConstraintCode != "X" || !OpInfo.CallOperandVal)
    return;

  // Labels and constants are handled elsewhere ('X' is the only thing that
  // matches labels). For Functions, the type here is the type of the result,
  // which is not what we want to look at; leave them alone.
  Value *Operand = OpInfo.CallOperandVal;
  if (isa<BasicBlock>(Operand) || isa<ConstantInt>(Operand) ||
      isa<Function>(Operand))
    return;

  if (Op.getNode() && Op.getOpcode() == ISD::TargetBlockAddress)
    return;

  // Otherwise, try to resolve it to something we know about by looking at the
  // actual operand type.
  const char *Replacement = LowerXConstraint(OpInfo.ConstraintVT);
  if (!Replacement)
    return;
  OpInfo.ConstraintCode = Replacement;
  OpInfo.ConstraintType = getConstraintType(OpInfo.ConstraintCode);
}
/// Given an exact SDIV by a constant, create a multiplication
/// with the multiplicative inverse of the constant.
///
/// For every constant divisor element the trailing zero bits are split off
/// into a shift amount and the remaining factor is inverted. The result is
/// (Op0 >>s Shift) * Factor, where the shift is emitted only if at least one
/// element had trailing zeros. Returns an empty SDValue when operand 1 does
/// not satisfy ISD::matchUnaryPredicate with the pattern below (which rejects
/// a zero divisor). Only the intermediate SRA node is recorded in \p Created;
/// the final MUL is the returned value.
static SDValue BuildExactSDIV(const TargetLowering &TLI, SDNode *N,
const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDNode *> &Created) {
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
EVT VT = N->getValueType(0);
EVT SVT = VT.getScalarType();
EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
EVT ShSVT = ShVT.getScalarType();
// Set by the lambda as soon as any divisor element has trailing zero bits.
bool UseSRA = false;
// Per-element results collected by the lambda.
SmallVector<SDValue, 16> Shifts, Factors;
auto BuildSDIVPattern = [&](ConstantSDNode *C) {
if (C->isNullValue())
return false;
APInt Divisor = C->getAPIntValue();
unsigned Shift = Divisor.countTrailingZeros();
if (Shift) {
Divisor.ashrInPlace(Shift);
UseSRA = true;
}
// Calculate the multiplicative inverse, using Newton's method.
// Iterates Factor *= (2 - Divisor * Factor) until Divisor * Factor == 1.
APInt t;
APInt Factor = Divisor;
while ((t = Divisor * Factor) != 1)
Factor *= APInt(Divisor.getBitWidth(), 2) - t;
Shifts.push_back(DAG.getConstant(Shift, dl, ShSVT));
Factors.push_back(DAG.getConstant(Factor, dl, SVT));
return true;
};
// Collect all magic values from the build vector.
if (!ISD::matchUnaryPredicate(Op1, BuildSDIVPattern))
return SDValue();
SDValue Shift, Factor;
if (VT.isVector()) {
Shift = DAG.getBuildVector(ShVT, dl, Shifts);
Factor = DAG.getBuildVector(VT, dl, Factors);
} else {
// Scalar case: use the first collected element.
Shift = Shifts[0];
Factor = Factors[0];
}
SDValue Res = Op0;
// Shift the value upfront if it is even, so the LSB is one.
if (UseSRA) {
// TODO: For UDIV use SRL instead of SRA.
SDNodeFlags Flags;
Flags.setExact(true);
Res = DAG.getNode(ISD::SRA, dl, VT, Res, Shift, Flags);
Created.push_back(Res.getNode());
}
return DAG.getNode(ISD::MUL, dl, VT, Res, Factor);
}
/// Default handling for an SDIV node \p N: when isIntDivCheap holds for the
/// node's value type, return the node itself so it stays an SDIV; otherwise
/// return an empty SDValue. \p Divisor and \p Created are not used here.
SDValue TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
                                      SelectionDAG &DAG,
                                      SmallVectorImpl<SDNode *> &Created) const {
  const AttributeList FnAttrs =
      DAG.getMachineFunction().getFunction().getAttributes();
  const TargetLowering &Lowering = DAG.getTargetLoweringInfo();

  if (!Lowering.isIntDivCheap(N->getValueType(0), FnAttrs))
    return SDValue();

  return SDValue(N, 0); // Lower SDIV as SDIV
}
/// Given an ISD::SDIV node expressing a divide by constant,
/// return a DAG expression to select that will generate the same value by
/// multiplying by a magic number.
/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
///
/// Returns an empty SDValue when the value type is not legal, when operand 1
/// does not satisfy ISD::matchUnaryPredicate with the pattern below (which
/// rejects a zero divisor), or when neither MULHS nor SMUL_LOHI is available
/// for the type. Intermediate nodes are appended to \p Created; the final ADD
/// is the returned value. \p IsAfterLegalization selects whether the multiply
/// must be strictly legal or may also be custom-lowered.
SDValue TargetLowering::BuildSDIV(SDNode *N, SelectionDAG &DAG,
bool IsAfterLegalization,
SmallVectorImpl<SDNode *> &Created) const {
SDLoc dl(N);
EVT VT = N->getValueType(0);
EVT SVT = VT.getScalarType();
EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
EVT ShSVT = ShVT.getScalarType();
unsigned EltBits = VT.getScalarSizeInBits();
// Check to see if we can do this.
// FIXME: We should be more aggressive here.
if (!isTypeLegal(VT))
return SDValue();
// If the sdiv has an 'exact' bit we can use a simpler lowering.
if (N->getFlags().hasExact())
return BuildExactSDIV(*this, N, dl, DAG, Created);
// Per-element constants collected by the lambda below.
SmallVector<SDValue, 16> MagicFactors, Factors, Shifts, ShiftMasks;
auto BuildSDIVPattern = [&](ConstantSDNode *C) {
if (C->isNullValue())
return false;
const APInt &Divisor = C->getAPIntValue();
APInt::ms magics = Divisor.magic();
// NumeratorFactor scales the numerator that is added to the high product;
// ShiftMask (-1 or 0) enables or disables the final sign-bit correction.
int NumeratorFactor = 0;
int ShiftMask = -1;
if (Divisor.isOneValue() || Divisor.isAllOnesValue()) {
// If d is +1/-1, we just multiply the numerator by +1/-1.
NumeratorFactor = Divisor.getSExtValue();
magics.m = 0;
magics.s = 0;
ShiftMask = 0;
} else if (Divisor.isStrictlyPositive() && magics.m.isNegative()) {
// If d > 0 and m < 0, add the numerator.
NumeratorFactor = 1;
} else if (Divisor.isNegative() && magics.m.isStrictlyPositive()) {
// If d < 0 and m > 0, subtract the numerator.
NumeratorFactor = -1;
}
MagicFactors.push_back(DAG.getConstant(magics.m, dl, SVT));
Factors.push_back(DAG.getConstant(NumeratorFactor, dl, SVT));
Shifts.push_back(DAG.getConstant(magics.s, dl, ShSVT));
ShiftMasks.push_back(DAG.getConstant(ShiftMask, dl, SVT));
return true;
};
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// Collect the shifts / magic values from each element.
if (!ISD::matchUnaryPredicate(N1, BuildSDIVPattern))
return SDValue();
SDValue MagicFactor, Factor, Shift, ShiftMask;
if (VT.isVector()) {
MagicFactor = DAG.getBuildVector(VT, dl, MagicFactors);
Factor = DAG.getBuildVector(VT, dl, Factors);
Shift = DAG.getBuildVector(ShVT, dl, Shifts);
ShiftMask = DAG.getBuildVector(VT, dl, ShiftMasks);
} else {
// Scalar case: use the first collected element of each list.
MagicFactor = MagicFactors[0];
Factor = Factors[0];
Shift = Shifts[0];
ShiftMask = ShiftMasks[0];
}
// Multiply the numerator (operand 0) by the magic value.
// FIXME: We should support doing a MUL in a wider type.
SDValue Q;
if (IsAfterLegalization ? isOperationLegal(ISD::MULHS, VT)
: isOperationLegalOrCustom(ISD::MULHS, VT))
Q = DAG.getNode(ISD::MULHS, dl, VT, N0, MagicFactor);
else if (IsAfterLegalization ? isOperationLegal(ISD::SMUL_LOHI, VT)
: isOperationLegalOrCustom(ISD::SMUL_LOHI, VT)) {
SDValue LoHi =
DAG.getNode(ISD::SMUL_LOHI, dl, DAG.getVTList(VT, VT), N0, MagicFactor);
// Result 1 of SMUL_LOHI is used as the high half of the product.
Q = SDValue(LoHi.getNode(), 1);
} else
return SDValue(); // No mulhs or equivalent.
Created.push_back(Q.getNode());
// (Optionally) Add/subtract the numerator using Factor.
Factor = DAG.getNode(ISD::MUL, dl, VT, N0, Factor);
Created.push_back(Factor.getNode());
Q = DAG.getNode(ISD::ADD, dl, VT, Q, Factor);
Created.push_back(Q.getNode());
// Shift right algebraic by shift value.
Q = DAG.getNode(ISD::SRA, dl, VT, Q, Shift);
Created.push_back(Q.getNode());
// Extract the sign bit, mask it and add it to the quotient.
SDValue SignShift = DAG.getConstant(EltBits - 1, dl, ShVT);
SDValue T = DAG.getNode(ISD::SRL, dl, VT, Q, SignShift);
Created.push_back(T.getNode());
T = DAG.getNode(ISD::AND, dl, VT, T, ShiftMask);
Created.push_back(T.getNode());
return DAG.getNode(ISD::ADD, dl, VT, Q, T);
}
/// Given an ISD::UDIV node expressing a divide by constant,
/// return a DAG expression to select that will generate the same value by
/// multiplying by a magic number.
/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
///
/// \param N                   The UDIV node; operand 0 is the numerator and
///                            operand 1 the (constant) divisor.
/// \param DAG                 The DAG in which the replacement is built.
/// \param IsAfterLegalization When true only strictly legal MULHU/UMUL_LOHI
///                            are used; otherwise custom lowering is accepted.
/// \param Created             Receives the intermediate nodes that were built.
/// \returns The replacement value, or an empty SDValue when the type is not
///          legal, a divisor element is zero or is rejected by
///          ISD::matchUnaryPredicate, or no high-multiply is available.
SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
                                  bool IsAfterLegalization,
                                  SmallVectorImpl<SDNode *> &Created) const {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  EVT SVT = VT.getScalarType();
  EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
  EVT ShSVT = ShVT.getScalarType();
  unsigned EltBits = VT.getScalarSizeInBits();

  // Check to see if we can do this.
  // FIXME: We should be more aggressive here.
  if (!isTypeLegal(VT))
    return SDValue();

  bool UseNPQ = false;
  SmallVector<SDValue, 16> PreShifts, PostShifts, MagicFactors, NPQFactors;

  // Computes the per-element pre-shift, magic factor, NPQ factor and
  // post-shift for one divisor constant. Rejects a zero divisor.
  auto BuildUDIVPattern = [&](ConstantSDNode *C) {
    if (C->isNullValue())
      return false;
    // FIXME: We should use a narrower constant when the upper
    // bits are known to be zero.
    APInt Divisor = C->getAPIntValue();
    APInt::mu magics = Divisor.magicu();
    unsigned PreShift = 0, PostShift = 0;

    // If the divisor is even, we can avoid using the expensive fixup by
    // shifting the divided value upfront.
    if (magics.a != 0 && !Divisor[0]) {
      PreShift = Divisor.countTrailingZeros();
      // Get magic number for the shifted divisor.
      magics = Divisor.lshr(PreShift).magicu(PreShift);
      assert(magics.a == 0 && "Should use cheap fixup now");
    }

    APInt Magic = magics.m;

    // Whether this element needs the "numerator plus quotient" (NPQ) fixup.
    bool SelNPQ;
    if (magics.a == 0 || Divisor.isOneValue()) {
      assert(magics.s < Divisor.getBitWidth() &&
             "We shouldn't generate an undefined shift!");
      PostShift = magics.s;
      SelNPQ = false;
    } else {
      PostShift = magics.s - 1;
      SelNPQ = true;
    }

    PreShifts.push_back(DAG.getConstant(PreShift, dl, ShSVT));
    MagicFactors.push_back(DAG.getConstant(Magic, dl, SVT));
    NPQFactors.push_back(
        DAG.getConstant(SelNPQ ? APInt::getOneBitSet(EltBits, EltBits - 1)
                               : APInt::getNullValue(EltBits),
                        dl, SVT));
    PostShifts.push_back(DAG.getConstant(PostShift, dl, ShSVT));
    UseNPQ |= SelNPQ;
    return true;
  };

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Collect the shifts/magic values from each element.
  if (!ISD::matchUnaryPredicate(N1, BuildUDIVPattern))
    return SDValue();

  SDValue PreShift, PostShift, MagicFactor, NPQFactor;
  if (VT.isVector()) {
    PreShift = DAG.getBuildVector(ShVT, dl, PreShifts);
    MagicFactor = DAG.getBuildVector(VT, dl, MagicFactors);
    NPQFactor = DAG.getBuildVector(VT, dl, NPQFactors);
    PostShift = DAG.getBuildVector(ShVT, dl, PostShifts);
  } else {
    // NPQFactor is only consumed on the vector path below.
    PreShift = PreShifts[0];
    MagicFactor = MagicFactors[0];
    PostShift = PostShifts[0];
  }

  SDValue Q = N0;
  Q = DAG.getNode(ISD::SRL, dl, VT, Q, PreShift);
  Created.push_back(Q.getNode());

  // FIXME: We should support doing a MUL in a wider type.
  auto GetMULHU = [&](SDValue X, SDValue Y) {
    if (IsAfterLegalization ? isOperationLegal(ISD::MULHU, VT)
                            : isOperationLegalOrCustom(ISD::MULHU, VT))
      return DAG.getNode(ISD::MULHU, dl, VT, X, Y);
    if (IsAfterLegalization ? isOperationLegal(ISD::UMUL_LOHI, VT)
                            : isOperationLegalOrCustom(ISD::UMUL_LOHI, VT)) {
      SDValue LoHi =
          DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(VT, VT), X, Y);
      return SDValue(LoHi.getNode(), 1);
    }
    return SDValue(); // No mulhu or equivalent
  };

  // Multiply the numerator (operand 0) by the magic value.
  Q = GetMULHU(Q, MagicFactor);
  if (!Q)
    return SDValue();

  Created.push_back(Q.getNode());

  if (UseNPQ) {
    SDValue NPQ = DAG.getNode(ISD::SUB, dl, VT, N0, Q);
    Created.push_back(NPQ.getNode());

    // For vectors we might have a mix of non-NPQ/NPQ paths, so use
    // MULHU to act as a SRL-by-1 for NPQ, else multiply by zero.
    if (VT.isVector())
      NPQ = GetMULHU(NPQ, NPQFactor);
    else
      NPQ = DAG.getNode(ISD::SRL, dl, VT, NPQ, DAG.getConstant(1, dl, ShVT));

    Created.push_back(NPQ.getNode());

    Q = DAG.getNode(ISD::ADD, dl, VT, NPQ, Q);
    Created.push_back(Q.getNode());
  }

  Q = DAG.getNode(ISD::SRL, dl, VT, Q, PostShift);
  Created.push_back(Q.getNode());

  // Elements whose divisor is exactly 1 take the numerator unchanged. The
  // comparison must be typed with the target's setcc result type, which is
  // not necessarily VT.
  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue One = DAG.getConstant(1, dl, VT);
  SDValue IsOne = DAG.getSetCC(dl, SetCCVT, N1, One, ISD::SETEQ);
  return DAG.getSelect(dl, VT, IsOne, N0, Q);
}
/// Try to rewrite an (in)equality-with-zero test of an ISD::UREM by a constant
/// divisor into an equivalent comparison that uses only multiplications,
/// additions and shifts/rotations.
/// Ref: "Hacker's Delight" 10-17.
///
/// On success every node built for the replacement is queued on the combiner
/// worklist and the new comparison is returned; otherwise an empty SDValue is
/// returned and nothing is queued.
SDValue TargetLowering::buildUREMEqFold(EVT SETCCVT, SDValue REMNode,
                                        SDValue CompTargetNode,
                                        ISD::CondCode Cond,
                                        DAGCombinerInfo &DCI,
                                        const SDLoc &DL) const {
  SmallVector<SDNode *, 2> NewNodes;
  SDValue Replacement = prepareUREMEqFold(SETCCVT, REMNode, CompTargetNode,
                                          Cond, DCI, DL, NewNodes);
  if (!Replacement)
    return SDValue();

  for (SDNode *NewNode : NewNodes)
    DCI.AddToWorklist(NewNode);
  return Replacement;
}
/// Build the replacement for (seteq/ne (urem N, D), 0) with a constant (or
/// constant-splat) divisor D. Nodes created on the way are appended to
/// \p Created. Returns an empty SDValue when the fold does not apply.
SDValue
TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode,
                                  SDValue CompTargetNode, ISD::CondCode Cond,
                                  DAGCombinerInfo &DCI, const SDLoc &DL,
                                  SmallVectorImpl<SDNode *> &Created) const {
  // fold (seteq/ne (urem N, D), 0) -> (setule/ugt (rotr (mul N, P), K), Q)
  // - D must be constant with D = D0 * 2^K where D0 is odd and D0 != 1
  // - P is the multiplicative inverse of D0 modulo 2^W
  // - Q = floor((2^W - 1) / D)
  // where W is the width of the common type of N and D.
  assert((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
         "Only applicable for (in)equality comparisons.");

  EVT VT = REMNode.getValueType();

  // Without a usable MUL there is nothing we can do.
  if (!isOperationLegalOrCustom(ISD::MUL, VT))
    return SDValue();

  // TODO: Add non-uniform constant support.
  ConstantSDNode *Divisor = isConstOrConstSplat(REMNode->getOperand(1));
  ConstantSDNode *CompTarget = isConstOrConstSplat(CompTargetNode);
  if (!Divisor || !CompTarget)
    return SDValue();
  // The divisor must be non-zero and the comparison must be against zero.
  if (Divisor->isNullValue() || !CompTarget->isNullValue())
    return SDValue();

  const APInt &D = Divisor->getAPIntValue();

  // Split D into an odd part D0 and a power of two 2^NumTrailingZeros.
  unsigned NumTrailingZeros = D.countTrailingZeros();
  const bool NeedsRotate = NumTrailingZeros != 0;
  APInt D0 = D.lshr(NumTrailingZeros);

  // The fold is invalid when D0 == 1.
  // This is reachable because visitSetCC happens before visitREM.
  if (D0.isOneValue())
    return SDValue();

  // P = inv(D0, 2^W)
  // 2^W needs W + 1 bits, so compute in the wider type and truncate back.
  unsigned BitWidth = D.getBitWidth();
  const APInt Modulus = APInt::getSignedMinValue(BitWidth + 1);
  APInt P =
      D0.zext(BitWidth + 1).multiplicativeInverse(Modulus).trunc(BitWidth);
  assert(!P.isNullValue() && "No multiplicative inverse!"); // unreachable
  assert((D0 * P).isOneValue() && "Multiplicative inverse sanity check.");

  // Q = floor((2^W - 1) / D)
  APInt Q = APInt::getAllOnesValue(BitWidth).udiv(D);

  SelectionDAG &DAG = DCI.DAG;

  SDValue PVal = DAG.getConstant(P, DL, VT);
  SDValue QVal = DAG.getConstant(Q, DL, VT);

  // (mul N, P)
  SDValue Product =
      DAG.getNode(ISD::MUL, DL, VT, REMNode->getOperand(0), PVal);
  Created.push_back(Product.getNode());

  // An even divisor additionally requires a rotate right by the number of
  // trailing zero bits.
  if (NeedsRotate) {
    // We need ROTR to do this.
    if (!isOperationLegalOrCustom(ISD::ROTR, VT))
      return SDValue();

    SDValue RotAmt = DAG.getConstant(
        NumTrailingZeros, DL, getShiftAmountTy(VT, DAG.getDataLayout()));
    SDNodeFlags RotFlags;
    RotFlags.setExact(true);
    // UREM: (rotr (mul N, P), K)
    Product = DAG.getNode(ISD::ROTR, DL, VT, Product, RotAmt, RotFlags);
    Created.push_back(Product.getNode());
  }

  // UREM: (setule/setugt (rotr (mul N, P), K), Q)
  const ISD::CondCode NewCond =
      (Cond == ISD::SETEQ) ? ISD::SETULE : ISD::SETUGT;
  return DAG.getSetCC(DL, SETCCVT, Product, QVal, NewCond);
}
/// Check that operand 0 of \p Op is a ConstantSDNode. If it is not, an error
/// is emitted through the DAG's context and true is returned; false means the
/// argument is a constant.
bool TargetLowering::
verifyReturnAddressArgumentIsConstant(SDValue Op, SelectionDAG &DAG) const {
  // A constant argument is the valid case: nothing to report.
  if (isa<ConstantSDNode>(Op.getOperand(0)))
    return false;

  DAG.getContext()->emitError("argument to '__builtin_return_address' must "
                              "be a constant integer");
  return true;
}
//===----------------------------------------------------------------------===//
// Legalization Utilities
//===----------------------------------------------------------------------===//
/// Expand a MUL, UMUL_LOHI or SMUL_LOHI of type VT into operations on the
/// half-width type HiLoVT.
///
/// On success the HiLoVT pieces of the product are appended to Result, lowest
/// part first: two entries for ISD::MUL, four for the *_LOHI opcodes.
/// LL/LH/RL/RH optionally supply the already-split low/high halves of LHS and
/// RHS; they must be either all set or all null.
/// Returns false when the required half-width operations are not available.
/// Note that Result may already contain the low part when false is returned
/// from one of the later MakeMUL_LOHI failures.
bool TargetLowering::expandMUL_LOHI(unsigned Opcode, EVT VT, SDLoc dl,
SDValue LHS, SDValue RHS,
SmallVectorImpl<SDValue> &Result,
EVT HiLoVT, SelectionDAG &DAG,
MulExpansionKind Kind, SDValue LL,
SDValue LH, SDValue RL, SDValue RH) const {
assert(Opcode == ISD::MUL || Opcode == ISD::UMUL_LOHI ||
Opcode == ISD::SMUL_LOHI);
// With MulExpansionKind::Always every half-width multiply flavour is treated
// as available; otherwise ask the target.
bool HasMULHS = (Kind == MulExpansionKind::Always) ||
isOperationLegalOrCustom(ISD::MULHS, HiLoVT);
bool HasMULHU = (Kind == MulExpansionKind::Always) ||
isOperationLegalOrCustom(ISD::MULHU, HiLoVT);
bool HasSMUL_LOHI = (Kind == MulExpansionKind::Always) ||
isOperationLegalOrCustom(ISD::SMUL_LOHI, HiLoVT);
bool HasUMUL_LOHI = (Kind == MulExpansionKind::Always) ||
isOperationLegalOrCustom(ISD::UMUL_LOHI, HiLoVT);
if (!HasMULHU && !HasMULHS && !HasUMUL_LOHI && !HasSMUL_LOHI)
return false;
unsigned OuterBitSize = VT.getScalarSizeInBits();
unsigned InnerBitSize = HiLoVT.getScalarSizeInBits();
unsigned LHSSB = DAG.ComputeNumSignBits(LHS);
unsigned RHSSB = DAG.ComputeNumSignBits(RHS);
// LL, LH, RL, and RH must be either all NULL or all set to a value.
assert((LL.getNode() && LH.getNode() && RL.getNode() && RH.getNode()) ||
(!LL.getNode() && !LH.getNode() && !RL.getNode() && !RH.getNode()));
SDVTList VTs = DAG.getVTList(HiLoVT, HiLoVT);
// Build the HiLoVT x HiLoVT product of L and R as a (Lo, Hi) pair, using a
// *MUL_LOHI node when available and otherwise a MUL plus MULHS/MULHU.
// Returns false if neither form exists for the requested signedness.
auto MakeMUL_LOHI = [&](SDValue L, SDValue R, SDValue &Lo, SDValue &Hi,
bool Signed) -> bool {
if ((Signed && HasSMUL_LOHI) || (!Signed && HasUMUL_LOHI)) {
Lo = DAG.getNode(Signed ? ISD::SMUL_LOHI : ISD::UMUL_LOHI, dl, VTs, L, R);
Hi = SDValue(Lo.getNode(), 1);
return true;
}
if ((Signed && HasMULHS) || (!Signed && HasMULHU)) {
Lo = DAG.getNode(ISD::MUL, dl, HiLoVT, L, R);
Hi = DAG.getNode(Signed ? ISD::MULHS : ISD::MULHU, dl, HiLoVT, L, R);
return true;
}
return false;
};
SDValue Lo, Hi;
// Derive the low halves by truncation when the caller did not supply them.
if (!LL.getNode() && !RL.getNode() &&
isOperationLegalOrCustom(ISD::TRUNCATE, HiLoVT)) {
LL = DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, LHS);
RL = DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, RHS);
}
if (!LL.getNode())
return false;
APInt HighMask = APInt::getHighBitsSet(OuterBitSize, InnerBitSize);
if (DAG.MaskedValueIsZero(LHS, HighMask) &&
DAG.MaskedValueIsZero(RHS, HighMask)) {
// The inputs are both zero-extended.
if (MakeMUL_LOHI(LL, RL, Lo, Hi, false)) {
Result.push_back(Lo);
Result.push_back(Hi);
// For the *_LOHI opcodes the two upper result pieces are zero.
if (Opcode != ISD::MUL) {
SDValue Zero = DAG.getConstant(0, dl, HiLoVT);
Result.push_back(Zero);
Result.push_back(Zero);
}
return true;
}
}
if (!VT.isVector() && Opcode == ISD::MUL && LHSSB > InnerBitSize &&
RHSSB > InnerBitSize) {
// The input values are both sign-extended.
// TODO non-MUL case?
if (MakeMUL_LOHI(LL, RL, Lo, Hi, true)) {
Result.push_back(Lo);
Result.push_back(Hi);
return true;
}
}
unsigned ShiftAmount = OuterBitSize - InnerBitSize;
EVT ShiftAmountTy = getShiftAmountTy(VT, DAG.getDataLayout());
if (APInt::getMaxValue(ShiftAmountTy.getSizeInBits()).ult(ShiftAmount)) {
// FIXME getShiftAmountTy does not always return a sensible result when VT
// is an illegal type, and so the type may be too small to fit the shift
// amount. Override it with i32. The shift will have to be legalized.
ShiftAmountTy = MVT::i32;
}
SDValue Shift = DAG.getConstant(ShiftAmount, dl, ShiftAmountTy);
// Derive the high halves by shift + truncate when the caller did not supply
// them.
if (!LH.getNode() && !RH.getNode() &&
isOperationLegalOrCustom(ISD::SRL, VT) &&
isOperationLegalOrCustom(ISD::TRUNCATE, HiLoVT)) {
LH = DAG.getNode(ISD::SRL, dl, VT, LHS, Shift);
LH = DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, LH);
RH = DAG.getNode(ISD::SRL, dl, VT, RHS, Shift);
RH = DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, RH);
}
if (!LH.getNode())
return false;
// LL * RL provides the lowest result piece in every remaining case.
if (!MakeMUL_LOHI(LL, RL, Lo, Hi, false))
return false;
Result.push_back(Lo);
if (Opcode == ISD::MUL) {
// Only two pieces are produced: add the low halves of the cross products
// LL*RH and LH*RL to the high half of LL*RL.
RH = DAG.getNode(ISD::MUL, dl, HiLoVT, LL, RH);
LH = DAG.getNode(ISD::MUL, dl, HiLoVT, LH, RL);
Hi = DAG.getNode(ISD::ADD, dl, HiLoVT, Hi, RH);
Hi = DAG.getNode(ISD::ADD, dl, HiLoVT, Hi, LH);
Result.push_back(Hi);
return true;
}
// Compute the full width result.
// Merge reassembles a (Lo, Hi) HiLoVT pair into a single VT value:
// zext(Lo) | (zext(Hi) << Shift).
auto Merge = [&](SDValue Lo, SDValue Hi) -> SDValue {
Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Lo);
Hi = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Hi);
Hi = DAG.getNode(ISD::SHL, dl, VT, Hi, Shift);
return DAG.getNode(ISD::OR, dl, VT, Lo, Hi);
};
SDValue Next = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Hi);
if (!MakeMUL_LOHI(LL, RH, Lo, Hi, false))
return false;
// This is effectively the add part of a multiply-add of half-sized operands,
// so it cannot overflow.
Next = DAG.getNode(ISD::ADD, dl, VT, Next, Merge(Lo, Hi));
if (!MakeMUL_LOHI(LH, RL, Lo, Hi, false))
return false;
SDValue Zero = DAG.getConstant(0, dl, HiLoVT);
EVT BoolType = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
// Use glue-based ADDC/ADDE when both are available on VT; otherwise use
// ADDCARRY with a BoolType carry value.
bool UseGlue = (isOperationLegalOrCustom(ISD::ADDC, VT) &&
isOperationLegalOrCustom(ISD::ADDE, VT));
if (UseGlue)
Next = DAG.getNode(ISD::ADDC, dl, DAG.getVTList(VT, MVT::Glue), Next,
Merge(Lo, Hi));
else
Next = DAG.getNode(ISD::ADDCARRY, dl, DAG.getVTList(VT, BoolType), Next,
Merge(Lo, Hi), DAG.getConstant(0, dl, BoolType));
// The carry out of this addition is folded into the high product below.
SDValue Carry = Next.getValue(1);
Result.push_back(DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, Next));
Next = DAG.getNode(ISD::SRL, dl, VT, Next, Shift);
// LH * RH is the only partial product computed as signed, and only for
// SMUL_LOHI.
if (!MakeMUL_LOHI(LH, RH, Lo, Hi, Opcode == ISD::SMUL_LOHI))
return false;
if (UseGlue)
Hi = DAG.getNode(ISD::ADDE, dl, DAG.getVTList(HiLoVT, MVT::Glue), Hi, Zero,
Carry);
else
Hi = DAG.getNode(ISD::ADDCARRY, dl, DAG.getVTList(HiLoVT, BoolType), Hi,
Zero, Carry);
Next = DAG.getNode(ISD::ADD, dl, VT, Next, Merge(Lo, Hi));
if (Opcode == ISD::SMUL_LOHI) {
// Signed correction: subtract zext(RL) when LH is negative and zext(LL)
// when RH is negative.
SDValue NextSub = DAG.getNode(ISD::SUB, dl, VT, Next,
DAG.getNode(ISD::ZERO_EXTEND, dl, VT, RL));
Next = DAG.getSelectCC(dl, LH, Zero, NextSub, Next, ISD::SETLT);
NextSub = DAG.getNode(ISD::SUB, dl, VT, Next,
DAG.getNode(ISD::ZERO_EXTEND, dl, VT, LL));
Next = DAG.getSelectCC(dl, RH, Zero, NextSub, Next, ISD::SETLT);
}
// Emit the two upper result pieces from the accumulated value.
Result.push_back(DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, Next));
Next = DAG.getNode(ISD::SRL, dl, VT, Next, Shift);
Result.push_back(DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, Next));
return true;
}
/// Convenience wrapper around expandMUL_LOHI for a node \p N: forwards the
/// node's opcode, result type and two operands, and on success stores the two
/// resulting HiLoVT pieces in \p Lo and \p Hi. Returns whether the expansion
/// succeeded; \p Lo and \p Hi are left untouched on failure.
bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
                               SelectionDAG &DAG, MulExpansionKind Kind,
                               SDValue LL, SDValue LH, SDValue RL,
                               SDValue RH) const {
  SmallVector<SDValue, 2> Result;
  if (!expandMUL_LOHI(N->getOpcode(), N->getValueType(0), N,
                      N->getOperand(0), N->getOperand(1), Result, HiLoVT,
                      DAG, Kind, LL, LH, RL, RH))
    return false;

  // Exactly a low and a high piece are expected here.
  assert(Result.size() == 2);
  Lo = Result[0];
  Hi = Result[1];
  return true;
}
/// Expand an ISD::FSHL / ISD::FSHR node into two shifts, an OR and a select:
///   fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
///   fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
/// Returns false, leaving \p Result untouched, for vector types that lack the
/// required SHL/SRL/SUB/OR support.
bool TargetLowering::expandFunnelShift(SDNode *Node, SDValue &Result,
                                       SelectionDAG &DAG) const {
  EVT VT = Node->getValueType(0);

  if (VT.isVector()) {
    const bool HasVectorOps = isOperationLegalOrCustom(ISD::SHL, VT) &&
                              isOperationLegalOrCustom(ISD::SRL, VT) &&
                              isOperationLegalOrCustom(ISD::SUB, VT) &&
                              isOperationLegalOrCustomOrPromote(ISD::OR, VT);
    if (!HasVectorOps)
      return false;
  }

  SDValue X = Node->getOperand(0);
  SDValue Y = Node->getOperand(1);
  SDValue Z = Node->getOperand(2);

  const unsigned BW = VT.getScalarSizeInBits();
  const bool IsFSHL = Node->getOpcode() == ISD::FSHL;
  SDLoc DL(SDValue(Node, 0));

  EVT AmtVT = Z.getValueType();
  SDValue BitWidthC = DAG.getConstant(BW, DL, AmtVT);
  SDValue ZeroC = DAG.getConstant(0, DL, AmtVT);

  // Z % BW: a mask suffices for power-of-two widths, otherwise use UREM.
  SDValue ModAmt;
  if (!isPowerOf2_32(BW)) {
    ModAmt = DAG.getNode(ISD::UREM, DL, AmtVT, Z, BitWidthC);
  } else {
    SDValue LowBitsMask = DAG.getConstant(BW - 1, DL, AmtVT);
    ModAmt = DAG.getNode(ISD::AND, DL, AmtVT, Z, LowBitsMask);
  }

  // BW - (Z % BW)
  SDValue InvAmt = DAG.getNode(ISD::SUB, DL, AmtVT, BitWidthC, ModAmt);
  SDValue ShiftedX =
      DAG.getNode(ISD::SHL, DL, VT, X, IsFSHL ? ModAmt : InvAmt);
  SDValue ShiftedY =
      DAG.getNode(ISD::SRL, DL, VT, Y, IsFSHL ? InvAmt : ModAmt);
  SDValue Combined = DAG.getNode(ISD::OR, DL, VT, ShiftedX, ShiftedY);

  // When (Z % BW) is zero the opposite-direction shift is a shift by the full
  // bit width, which is undefined, so that case is selected away explicitly:
  // fshl yields its first argument (X), fshr its second (Y).
  EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), AmtVT);
  SDValue AmtIsZero = DAG.getSetCC(DL, CCVT, ModAmt, ZeroC, ISD::SETEQ);
  Result = DAG.getSelect(DL, VT, AmtIsZero, IsFSHL ? X : Y, Combined);
  return true;
}
// TODO: Merge with expandFunnelShift.
/// Expand an ISD::ROTL / ISD::ROTR node. If the rotate in the opposite
/// direction is legal it is used with the amount (bitwidth - c); otherwise the
/// rotate is built from two masked shifts and an OR. Returns false, leaving
/// \p Result untouched, for vector types lacking the required operations.
bool TargetLowering::expandROT(SDNode *Node, SDValue &Result,
                               SelectionDAG &DAG) const {
  EVT VT = Node->getValueType(0);
  unsigned EltSizeInBits = VT.getScalarSizeInBits();
  const bool IsLeft = Node->getOpcode() == ISD::ROTL;
  SDValue Val = Node->getOperand(0);
  SDValue Amt = Node->getOperand(1);
  SDLoc DL(SDValue(Node, 0));

  EVT AmtVT = Amt.getValueType();
  SDValue BitWidthC = DAG.getConstant(EltSizeInBits, DL, AmtVT);

  // Prefer a legal rotate in the other direction.
  const unsigned OppositeRot = IsLeft ? ISD::ROTR : ISD::ROTL;
  if (isOperationLegal(OppositeRot, VT)) {
    SDValue InvAmt = DAG.getNode(ISD::SUB, DL, AmtVT, BitWidthC, Amt);
    Result = DAG.getNode(OppositeRot, DL, VT, Val, InvAmt);
    return true;
  }

  if (VT.isVector()) {
    const bool HasVectorOps =
        isOperationLegalOrCustom(ISD::SHL, VT) &&
        isOperationLegalOrCustom(ISD::SRL, VT) &&
        isOperationLegalOrCustom(ISD::SUB, VT) &&
        isOperationLegalOrCustomOrPromote(ISD::OR, VT) &&
        isOperationLegalOrCustomOrPromote(ISD::AND, VT);
    if (!HasVectorOps)
      return false;
  }

  // Fall back to shifts:
  // (rotl x, c) -> (or (shl x, (and c, w-1)), (srl x, (and w-c, w-1)))
  // (rotr x, c) -> (or (srl x, (and c, w-1)), (shl x, (and w-c, w-1)))
  assert(isPowerOf2_32(EltSizeInBits) && EltSizeInBits > 1 &&
         "Expecting the type bitwidth to be a power of 2");

  const unsigned SameDirShift = IsLeft ? ISD::SHL : ISD::SRL;
  const unsigned OtherDirShift = IsLeft ? ISD::SRL : ISD::SHL;

  SDValue LowBitsMask = DAG.getConstant(EltSizeInBits - 1, DL, AmtVT);
  SDValue InvAmt = DAG.getNode(ISD::SUB, DL, AmtVT, BitWidthC, Amt);
  SDValue MaskedAmt = DAG.getNode(ISD::AND, DL, AmtVT, Amt, LowBitsMask);
  SDValue MaskedInvAmt = DAG.getNode(ISD::AND, DL, AmtVT, InvAmt, LowBitsMask);
  Result = DAG.getNode(ISD::OR, DL, VT,
                       DAG.getNode(SameDirShift, DL, VT, Val, MaskedAmt),
                       DAG.getNode(OtherDirShift, DL, VT, Val, MaskedInvAmt));
  return true;
}
/// Expand an FP to signed-integer conversion into integer bit manipulation
/// of the source value. Only f32 -> i64 is handled; for any other type pair
/// the function returns false and leaves Result untouched.
bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result,
SelectionDAG &DAG) const {
SDValue Src = Node->getOperand(0);
EVT SrcVT = Src.getValueType();
EVT DstVT = Node->getValueType(0);
SDLoc dl(SDValue(Node, 0));
// FIXME: Only f32 to i64 conversions are supported.
if (SrcVT != MVT::f32 || DstVT != MVT::i64)
return false;
// Expand f32 -> i64 conversion
// This algorithm comes from compiler-rt's implementation of fixsfdi:
// https://github.com/llvm/llvm-project/blob/master/compiler-rt/lib/builtins/fixsfdi.c
unsigned SrcEltBits = SrcVT.getScalarSizeInBits();
EVT IntVT = SrcVT.changeTypeToInteger();
EVT IntShVT = getShiftAmountTy(IntVT, DAG.getDataLayout());
// Field constants for the f32 bit pattern: exponent mask and position,
// exponent bias, sign bit mask and position, and mantissa mask.
SDValue ExponentMask = DAG.getConstant(0x7F800000, dl, IntVT);
SDValue ExponentLoBit = DAG.getConstant(23, dl, IntVT);
SDValue Bias = DAG.getConstant(127, dl, IntVT);
SDValue SignMask = DAG.getConstant(APInt::getSignMask(SrcEltBits), dl, IntVT);
SDValue SignLowBit = DAG.getConstant(SrcEltBits - 1, dl, IntVT);
SDValue MantissaMask = DAG.getConstant(0x007FFFFF, dl, IntVT);
SDValue Bits = DAG.getNode(ISD::BITCAST, dl, IntVT, Src);
// Exponent = ((Bits & ExponentMask) >> 23) - 127
SDValue ExponentBits = DAG.getNode(
ISD::SRL, dl, IntVT, DAG.getNode(ISD::AND, dl, IntVT, Bits, ExponentMask),
DAG.getZExtOrTrunc(ExponentLoBit, dl, IntShVT));
SDValue Exponent = DAG.getNode(ISD::SUB, dl, IntVT, ExponentBits, Bias);
// Arithmetic shift of the isolated sign bit: all-ones when the sign bit is
// set, zero otherwise; then widened to the destination type.
SDValue Sign = DAG.getNode(ISD::SRA, dl, IntVT,
DAG.getNode(ISD::AND, dl, IntVT, Bits, SignMask),
DAG.getZExtOrTrunc(SignLowBit, dl, IntShVT));
Sign = DAG.getSExtOrTrunc(Sign, dl, DstVT);
// Mantissa bits with bit 23 (0x00800000) set on top of them.
SDValue R = DAG.getNode(ISD::OR, dl, IntVT,
DAG.getNode(ISD::AND, dl, IntVT, Bits, MantissaMask),
DAG.getConstant(0x00800000, dl, IntVT));
R = DAG.getZExtOrTrunc(R, dl, DstVT);
// Scale by the exponent: shift left by (Exponent - 23) when Exponent > 23,
// otherwise shift right by (23 - Exponent).
R = DAG.getSelectCC(
dl, Exponent, ExponentLoBit,
DAG.getNode(ISD::SHL, dl, DstVT, R,
DAG.getZExtOrTrunc(
DAG.getNode(ISD::SUB, dl, IntVT, Exponent, ExponentLoBit),
dl, IntShVT)),
DAG.getNode(ISD::SRL, dl, DstVT, R,
DAG.getZExtOrTrunc(
DAG.getNode(ISD::SUB, dl, IntVT, ExponentLoBit, Exponent),
dl, IntShVT)),
ISD::SETGT);
// (R ^ Sign) - Sign negates R when Sign is all-ones and leaves it unchanged
// when Sign is zero.
SDValue Ret = DAG.getNode(ISD::SUB, dl, DstVT,
DAG.getNode(ISD::XOR, dl, DstVT, R, Sign), Sign);
// A negative exponent selects a result of 0.
Result = DAG.getSelectCC(dl, Exponent, DAG.getConstant(0, dl, IntVT),
DAG.getConstant(0, dl, DstVT), Ret, ISD::SETLT);
return true;
}
/// Expand an FP to unsigned-integer conversion in terms of FP_TO_SINT.
/// Returns false, leaving \p Result untouched, for vector destination types
/// that lack the required FP_TO_SINT / XOR support; otherwise stores the
/// expansion in \p Result and returns true.
bool TargetLowering::expandFP_TO_UINT(SDNode *Node, SDValue &Result,
                                      SelectionDAG &DAG) const {
  SDLoc dl(SDValue(Node, 0));
  SDValue Src = Node->getOperand(0);

  EVT SrcVT = Src.getValueType();
  EVT DstVT = Node->getValueType(0);
  EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT);

  // Only expand vector types if we have the appropriate vector bit operations.
  if (DstVT.isVector()) {
    const bool HasVectorOps =
        isOperationLegalOrCustom(ISD::FP_TO_SINT, DstVT) &&
        isOperationLegalOrCustomOrPromote(ISD::XOR, SrcVT);
    if (!HasVectorOps)
      return false;
  }

  // If converting the destination sign mask to the source FP type overflows,
  // the FP type cannot reach the sign mask, so a plain FP_TO_SINT covers the
  // whole range.
  const fltSemantics &APFSem = DAG.EVTToAPFloatSemantics(SrcVT);
  APFloat APF(APFSem, APInt::getNullValue(SrcVT.getScalarSizeInBits()));
  APInt SignMask = APInt::getSignMask(DstVT.getScalarSizeInBits());
  APFloat::opStatus ConvStatus =
      APF.convertFromAPInt(SignMask, false, APFloat::rmNearestTiesToEven);
  if (ConvStatus & APFloat::opOverflow) {
    Result = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, Src);
    return true;
  }

  // Cst is the sign mask as an FP constant; InSignedRange is (Src < Cst).
  SDValue Cst = DAG.getConstantFP(APF, dl, SrcVT);
  SDValue InSignedRange = DAG.getSetCC(dl, SetCCVT, Src, Cst, ISD::SETLT);

  if (shouldUseStrictFP_TO_INT(SrcVT, DstVT, /*IsSigned*/ false)) {
    // Expand based on maximum range of FP_TO_SINT, if the value exceeds the
    // signmask then offset (the result of which should be fully representable).
    // Sel = Src < 0x8000000000000000
    // Val = select Sel, Src, Src - 0x8000000000000000
    // Ofs = select Sel, 0, 0x8000000000000000
    // Result = fp_to_sint(Val) ^ Ofs

    // TODO: Should any fast-math-flags be set for the FSUB?
    SDValue Adjusted =
        DAG.getSelect(dl, SrcVT, InSignedRange, Src,
                      DAG.getNode(ISD::FSUB, dl, SrcVT, Src, Cst));
    SDValue Offset =
        DAG.getSelect(dl, DstVT, InSignedRange, DAG.getConstant(0, dl, DstVT),
                      DAG.getConstant(SignMask, dl, DstVT));
    Result = DAG.getNode(ISD::XOR, dl, DstVT,
                         DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, Adjusted),
                         Offset);
    return true;
  }

  // Expand based on maximum range of FP_TO_SINT:
  // True = fp_to_sint(Src)
  // False = 0x8000000000000000 + fp_to_sint(Src - 0x8000000000000000)
  // Result = select (Src < 0x8000000000000000), True, False
  SDValue Direct = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, Src);
  // TODO: Should any fast-math-flags be set for the FSUB?
  SDValue Rebased = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT,
                                DAG.getNode(ISD::FSUB, dl, SrcVT, Src, Cst));
  Rebased = DAG.getNode(ISD::XOR, dl, DstVT, Rebased,
                        DAG.getConstant(SignMask, dl, DstVT));
  Result = DAG.getSelect(dl, DstVT, InSignedRange, Direct, Rebased);
  return true;
}
/// Expand an unsigned i64 (scalar or vector element) to f32/f64 conversion.
/// Returns false, leaving \p Result untouched, when the source element type
/// is not i64, the destination element type is neither f32 nor f64, or a
/// vector type lacks the operations the expansion needs.
bool TargetLowering::expandUINT_TO_FP(SDNode *Node, SDValue &Result,
                                      SelectionDAG &DAG) const {
  SDValue Src = Node->getOperand(0);
  EVT SrcVT = Src.getValueType();
  EVT DstVT = Node->getValueType(0);

  // Only 64-bit integer sources are handled.
  if (SrcVT.getScalarType() != MVT::i64)
    return false;

  SDLoc dl(SDValue(Node, 0));
  EVT ShiftVT = getShiftAmountTy(SrcVT, DAG.getDataLayout());

  if (DstVT.getScalarType() == MVT::f32) {
    // Only expand vector types if we have the appropriate vector bit
    // operations.
    if (SrcVT.isVector()) {
      const bool HasVectorOps =
          isOperationLegalOrCustom(ISD::SRL, SrcVT) &&
          isOperationLegalOrCustom(ISD::FADD, DstVT) &&
          isOperationLegalOrCustom(ISD::SINT_TO_FP, SrcVT) &&
          isOperationLegalOrCustomOrPromote(ISD::OR, SrcVT) &&
          isOperationLegalOrCustomOrPromote(ISD::AND, SrcVT);
      if (!HasVectorOps)
        return false;
    }

    // For unsigned conversions, convert them to signed conversions using the
    // algorithm from the x86_64 __floatundidf in compiler_rt.
    // Direct path: a plain signed conversion of the source.
    SDValue DirectCvt = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);

    // Halving path: ((Src & 1) | (Src >> 1)) converted as signed, then
    // doubled by adding the result to itself.
    SDValue OneShift = DAG.getConstant(1, dl, ShiftVT);
    SDValue Halved = DAG.getNode(ISD::SRL, dl, SrcVT, Src, OneShift);
    SDValue OneBit = DAG.getConstant(1, dl, SrcVT);
    SDValue LowBit = DAG.getNode(ISD::AND, dl, SrcVT, Src, OneBit);
    SDValue HalvedSticky = DAG.getNode(ISD::OR, dl, SrcVT, LowBit, Halved);
    SDValue HalvedCvt = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, HalvedSticky);
    SDValue DoubledCvt =
        DAG.getNode(ISD::FADD, dl, DstVT, HalvedCvt, HalvedCvt);

    // TODO: This really should be implemented using a branch rather than a
    // select. We happen to get lucky and machinesink does the right
    // thing most of the time. This would be a good candidate for a
    // pseudo-op, or, even better, for whole-function isel.
    EVT SetCCVT =
        getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT);

    // The halving path is chosen when Src compares signed-less-than zero.
    SDValue SrcIsNegative = DAG.getSetCC(
        dl, SetCCVT, Src, DAG.getConstant(0, dl, SrcVT), ISD::SETLT);
    Result = DAG.getSelect(dl, DstVT, SrcIsNegative, DoubledCvt, DirectCvt);
    return true;
  }

  if (DstVT.getScalarType() == MVT::f64) {
    // Only expand vector types if we have the appropriate vector bit
    // operations.
    if (SrcVT.isVector()) {
      const bool HasVectorOps =
          isOperationLegalOrCustom(ISD::SRL, SrcVT) &&
          isOperationLegalOrCustom(ISD::FADD, DstVT) &&
          isOperationLegalOrCustom(ISD::FSUB, DstVT) &&
          isOperationLegalOrCustomOrPromote(ISD::OR, SrcVT) &&
          isOperationLegalOrCustomOrPromote(ISD::AND, SrcVT);
      if (!HasVectorOps)
        return false;
    }

    // Implementation of unsigned i64 to f64 following the algorithm in
    // __floatundidf in compiler_rt. This implementation has the advantage
    // of performing rounding correctly, both in the default rounding mode
    // and in all alternate rounding modes.
    SDValue TwoP52 = DAG.getConstant(UINT64_C(0x4330000000000000), dl, SrcVT);
    SDValue TwoP84PlusTwoP52 = DAG.getConstantFP(
        BitsToDouble(UINT64_C(0x4530000000100000)), dl, DstVT);
    SDValue TwoP84 = DAG.getConstant(UINT64_C(0x4530000000000000), dl, SrcVT);
    SDValue Low32Mask =
        DAG.getConstant(UINT64_C(0x00000000FFFFFFFF), dl, SrcVT);
    SDValue High32Shift = DAG.getConstant(32, dl, ShiftVT);

    // Split the source into 32-bit halves, OR each into its bias bit pattern
    // and reinterpret the results as doubles.
    SDValue LoHalf = DAG.getNode(ISD::AND, dl, SrcVT, Src, Low32Mask);
    SDValue HiHalf = DAG.getNode(ISD::SRL, dl, SrcVT, Src, High32Shift);
    SDValue LoBiased = DAG.getNode(ISD::OR, dl, SrcVT, LoHalf, TwoP52);
    SDValue HiBiased = DAG.getNode(ISD::OR, dl, SrcVT, HiHalf, TwoP84);
    SDValue LoAsFP = DAG.getBitcast(DstVT, LoBiased);
    SDValue HiAsFP = DAG.getBitcast(DstVT, HiBiased);

    // Subtract the combined bias from the high part, then add the low part.
    SDValue HiUnbiased =
        DAG.getNode(ISD::FSUB, dl, DstVT, HiAsFP, TwoP84PlusTwoP52);
    Result = DAG.getNode(ISD::FADD, dl, DstVT, LoAsFP, HiUnbiased);
    return true;
  }

  return false;
}
/// Expand ISD::FMINNUM / ISD::FMAXNUM using either the *_IEEE variants (with
/// operands canonicalized when they might be signaling NaNs) or, when the node
/// carries the no-NaNs flag, FMINIMUM / FMAXIMUM. Returns an empty SDValue if
/// neither alternative is legal or custom for the node's type.
SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node,
                                              SelectionDAG &DAG) const {
  SDLoc dl(Node);
  const bool IsMin = Node->getOpcode() == ISD::FMINNUM;
  unsigned IEEEOp = IsMin ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
  EVT VT = Node->getValueType(0);

  if (isOperationLegalOrCustom(IEEEOp, VT)) {
    // Wrap an operand in FCANONICALIZE unless it is known never to be an sNaN.
    auto Quieted = [&](SDValue V) {
      if (DAG.isKnownNeverSNaN(V))
        return V;
      return DAG.getNode(ISD::FCANONICALIZE, dl, VT, V, Node->getFlags());
    };

    SDValue LHS = Node->getOperand(0);
    SDValue RHS = Node->getOperand(1);

    // Insert canonicalizes if it's possible we need to quiet to get correct
    // sNaN behavior.
    if (!Node->getFlags().hasNoNaNs()) {
      LHS = Quieted(LHS);
      RHS = Quieted(RHS);
    }

    return DAG.getNode(IEEEOp, dl, VT, LHS, RHS, Node->getFlags());
  }

  // If the target has FMINIMUM/FMAXIMUM but not FMINNUM/FMAXNUM use that
  // instead if there are no NaNs.
  if (!Node->getFlags().hasNoNaNs())
    return SDValue();

  unsigned IEEE2018Op = IsMin ? ISD::FMINIMUM : ISD::FMAXIMUM;
  if (!isOperationLegalOrCustom(IEEE2018Op, VT))
    return SDValue();

  return DAG.getNode(IEEE2018Op, dl, VT, Node->getOperand(0),
                     Node->getOperand(1), Node->getFlags());
}
/// Expand a CTPOP node into shifts, masks, adds and (for element widths above
/// 8 bits) a multiply, using the parallel bit-count algorithm referenced
/// below. On success the expansion is stored in Result and true is returned.
/// Returns false, leaving Result untouched, when the element width is larger
/// than 128 bits or not a multiple of 8, or when a vector type lacks the
/// required operations.
bool TargetLowering::expandCTPOP(SDNode *Node, SDValue &Result,
SelectionDAG &DAG) const {
SDLoc dl(Node);
EVT VT = Node->getValueType(0);
EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
SDValue Op = Node->getOperand(0);
unsigned Len = VT.getScalarSizeInBits();
assert(VT.isInteger() && "CTPOP not implemented for this type.");
// TODO: Add support for irregular type lengths.
if (!(Len <= 128 && Len % 8 == 0))
return false;
// Only expand vector types if we have the appropriate vector bit operations.
// MUL is only required when the final multiply step below is emitted
// (Len != 8).
if (VT.isVector() && (!isOperationLegalOrCustom(ISD::ADD, VT) ||
!isOperationLegalOrCustom(ISD::SUB, VT) ||
!isOperationLegalOrCustom(ISD::SRL, VT) ||
(Len != 8 && !isOperationLegalOrCustom(ISD::MUL, VT)) ||
!isOperationLegalOrCustomOrPromote(ISD::AND, VT)))
return false;
// This is the "best" algorithm from
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
// Each mask is an 8-bit pattern replicated across the full element width.
SDValue Mask55 =
DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), dl, VT);
SDValue Mask33 =
DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), dl, VT);
SDValue Mask0F =
DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), dl, VT);
SDValue Mask01 =
DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x01)), dl, VT);
// v = v - ((v >> 1) & 0x55555555...)
Op = DAG.getNode(ISD::SUB, dl, VT, Op,
DAG.getNode(ISD::AND, dl, VT,
DAG.getNode(ISD::SRL, dl, VT, Op,
DAG.getConstant(1, dl, ShVT)),
Mask55));
// v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
Op = DAG.getNode(ISD::ADD, dl, VT, DAG.getNode(ISD::AND, dl, VT, Op, Mask33),
DAG.getNode(ISD::AND, dl, VT,
DAG.getNode(ISD::SRL, dl, VT, Op,
DAG.getConstant(2, dl, ShVT)),
Mask33));
// v = (v + (v >> 4)) & 0x0F0F0F0F...
Op = DAG.getNode(ISD::AND, dl, VT,
DAG.getNode(ISD::ADD, dl, VT, Op,
DAG.getNode(ISD::SRL, dl, VT, Op,
DAG.getConstant(4, dl, ShVT))),
Mask0F);
// v = (v * 0x01010101...) >> (Len - 8)
// Skipped for 8-bit elements, where Op already holds the final value.
if (Len > 8)
Op =
DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::MUL, dl, VT, Op, Mask01),
DAG.getConstant(Len - 8, dl, ShVT));
Result = Op;
return true;
}
/// Expand a CTLZ / CTLZ_ZERO_UNDEF node.
///
/// Strategies, in order of preference:
///  1. CTLZ_ZERO_UNDEF -> CTLZ when CTLZ is legal or custom.
///  2. CTLZ_ZERO_UNDEF plus an explicit select for a zero input.
///  3. Smear the highest set bit to the right and count the zero bits that
///     remain with CTPOP.
///
/// On success the expansion is stored in \p Result and true is returned.
/// Returns false, leaving \p Result untouched, when a vector type lacks the
/// operations needed by strategy 3.
bool TargetLowering::expandCTLZ(SDNode *Node, SDValue &Result,
                                SelectionDAG &DAG) const {
  SDLoc dl(Node);
  EVT VT = Node->getValueType(0);
  EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
  SDValue Op = Node->getOperand(0);
  unsigned NumBitsPerElt = VT.getScalarSizeInBits();

  // If the non-ZERO_UNDEF version is supported we can use that instead.
  if (Node->getOpcode() == ISD::CTLZ_ZERO_UNDEF &&
      isOperationLegalOrCustom(ISD::CTLZ, VT)) {
    Result = DAG.getNode(ISD::CTLZ, dl, VT, Op);
    return true;
  }

  // If the ZERO_UNDEF version is supported use that and handle the zero case.
  if (isOperationLegalOrCustom(ISD::CTLZ_ZERO_UNDEF, VT)) {
    EVT SetCCVT =
        getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
    SDValue CTLZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, VT, Op);
    SDValue Zero = DAG.getConstant(0, dl, VT);
    SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ);
    // Use getSelect rather than a raw ISD::SELECT so that a vector condition
    // produces the matching vector select node.
    Result = DAG.getSelect(dl, VT, SrcIsZero,
                           DAG.getConstant(NumBitsPerElt, dl, VT), CTLZ);
    return true;
  }

  // Only expand vector types if we have the appropriate vector bit operations.
  if (VT.isVector() && (!isPowerOf2_32(NumBitsPerElt) ||
                        !isOperationLegalOrCustom(ISD::CTPOP, VT) ||
                        !isOperationLegalOrCustom(ISD::SRL, VT) ||
                        !isOperationLegalOrCustomOrPromote(ISD::OR, VT)))
    return false;

  // for now, we do this:
  // x = x | (x >> 1);
  // x = x | (x >> 2);
  // ...
  // x = x | (x >>16);
  // x = x | (x >>32); // for 64-bit input
  // return popcount(~x);
  //
  // Ref: "Hacker's Delight" by Henry Warren
  for (unsigned i = 0; (1U << i) <= (NumBitsPerElt / 2); ++i) {
    SDValue Tmp = DAG.getConstant(1ULL << i, dl, ShVT);
    Op = DAG.getNode(ISD::OR, dl, VT, Op,
                     DAG.getNode(ISD::SRL, dl, VT, Op, Tmp));
  }
  Op = DAG.getNOT(dl, Op, VT);
  Result = DAG.getNode(ISD::CTPOP, dl, VT, Op);
  return true;
}
/// Expand a CTTZ or CTTZ_ZERO_UNDEF node into operations the target supports.
///
/// \param Node   The count-trailing-zeros node being expanded.
/// \param Result Set to the replacement value when the expansion succeeds.
/// \param DAG    The DAG in which replacement nodes are created.
/// \returns true if \p Result was produced; false when \p Node has a vector
///          type for which the required bit operations are unavailable.
bool TargetLowering::expandCTTZ(SDNode *Node, SDValue &Result,
                                SelectionDAG &DAG) const {
  SDLoc dl(Node);
  EVT VT = Node->getValueType(0);
  SDValue Op = Node->getOperand(0);
  unsigned NumBitsPerElt = VT.getScalarSizeInBits();

  // If the non-ZERO_UNDEF version is supported we can use that instead.
  if (Node->getOpcode() == ISD::CTTZ_ZERO_UNDEF &&
      isOperationLegalOrCustom(ISD::CTTZ, VT)) {
    Result = DAG.getNode(ISD::CTTZ, dl, VT, Op);
    return true;
  }

  // If the ZERO_UNDEF version is supported use that and handle the zero case.
  if (isOperationLegalOrCustom(ISD::CTTZ_ZERO_UNDEF, VT)) {
    EVT SetCCVT =
        getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
    SDValue CTTZ = DAG.getNode(ISD::CTTZ_ZERO_UNDEF, dl, VT, Op);
    SDValue Zero = DAG.getConstant(0, dl, VT);
    SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ);
    // The condition has type SetCCVT, which is a vector type when VT is a
    // vector. Let getSelect choose between SELECT and VSELECT instead of
    // hard-coding ISD::SELECT with a vector condition.
    Result = DAG.getSelect(dl, VT, SrcIsZero,
                           DAG.getConstant(NumBitsPerElt, dl, VT), CTTZ);
    return true;
  }

  // Only expand vector types if we have the appropriate vector bit operations.
  if (VT.isVector() && (!isPowerOf2_32(NumBitsPerElt) ||
                        (!isOperationLegalOrCustom(ISD::CTPOP, VT) &&
                         !isOperationLegalOrCustom(ISD::CTLZ, VT)) ||
                        !isOperationLegalOrCustom(ISD::SUB, VT) ||
                        !isOperationLegalOrCustomOrPromote(ISD::AND, VT) ||
                        !isOperationLegalOrCustomOrPromote(ISD::XOR, VT)))
    return false;

  // for now, we use: { return popcount(~x & (x - 1)); }
  // unless the target has ctlz but not ctpop, in which case we use:
  // { return NumBitsPerElt - nlz(~x & (x-1)); }
  // Ref: "Hacker's Delight" by Henry Warren
  SDValue Tmp = DAG.getNode(
      ISD::AND, dl, VT, DAG.getNOT(dl, Op, VT),
      DAG.getNode(ISD::SUB, dl, VT, Op, DAG.getConstant(1, dl, VT)));

  // If ISD::CTLZ is legal and CTPOP isn't, then do that instead.
  if (isOperationLegal(ISD::CTLZ, VT) && !isOperationLegal(ISD::CTPOP, VT)) {
    Result =
        DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(NumBitsPerElt, dl, VT),
                    DAG.getNode(ISD::CTLZ, dl, VT, Tmp));
    return true;
  }

  Result = DAG.getNode(ISD::CTPOP, dl, VT, Tmp);
  return true;
}
/// Expand an ABS node as (X + Sign) ^ Sign, where Sign is X arithmetically
/// shifted right by (scalar bit width - 1).
///
/// \returns true and sets \p Result on success; false for vector types that
///          lack the SRA/ADD/XOR operations used by the expansion.
bool TargetLowering::expandABS(SDNode *N, SDValue &Result,
                               SelectionDAG &DAG) const {
  SDLoc Loc(N);
  EVT Ty = N->getValueType(0);
  EVT ShiftTy = getShiftAmountTy(Ty, DAG.getDataLayout());
  SDValue Src = N->getOperand(0);

  // Vectors are only expanded when every operation we are about to emit is
  // available on the vector type.
  if (Ty.isVector()) {
    const bool HaveVectorOps = isOperationLegalOrCustom(ISD::SRA, Ty) &&
                               isOperationLegalOrCustom(ISD::ADD, Ty) &&
                               isOperationLegalOrCustomOrPromote(ISD::XOR, Ty);
    if (!HaveVectorOps)
      return false;
  }

  // Sign is all-ones for negative inputs and zero otherwise.
  SDValue SignBitIdx =
      DAG.getConstant(Ty.getScalarSizeInBits() - 1, Loc, ShiftTy);
  SDValue Sign = DAG.getNode(ISD::SRA, Loc, Ty, Src, SignBitIdx);
  SDValue Biased = DAG.getNode(ISD::ADD, Loc, Ty, Src, Sign);
  Result = DAG.getNode(ISD::XOR, Loc, Ty, Biased, Sign);
  return true;
}
/// Replace a vector load by one (possibly extending) scalar load per element.
///
/// The element loads all hang off the original chain; their output chains are
/// joined with a TokenFactor. The result is a MERGE_VALUES-style pair of the
/// rebuilt vector and that new chain.
SDValue TargetLowering::scalarizeVectorLoad(LoadSDNode *LD,
                                            SelectionDAG &DAG) const {
  SDLoc SL(LD);
  SDValue InChain = LD->getChain();
  SDValue EltPtr = LD->getBasePtr();
  EVT MemVT = LD->getMemoryVT();
  ISD::LoadExtType ExtKind = LD->getExtensionType();

  const unsigned NumElem = MemVT.getVectorNumElements();
  EVT MemEltVT = MemVT.getScalarType();
  EVT ResEltVT = LD->getValueType(0).getScalarType();

  // Distance in bytes between consecutive elements in memory.
  const unsigned Stride = MemEltVT.getSizeInBits() / 8;
  assert(MemEltVT.isByteSized());

  SmallVector<SDValue, 8> Elts;
  SmallVector<SDValue, 8> EltChains;

  // ByteOffset tracks Idx * Stride for the element currently being loaded.
  unsigned ByteOffset = 0;
  for (unsigned Idx = 0; Idx != NumElem; ++Idx, ByteOffset += Stride) {
    SDValue EltLoad =
        DAG.getExtLoad(ExtKind, SL, ResEltVT, InChain, EltPtr,
                       LD->getPointerInfo().getWithOffset(ByteOffset),
                       MemEltVT, MinAlign(LD->getAlignment(), ByteOffset),
                       LD->getMemOperand()->getFlags(), LD->getAAInfo());

    // Advance the address to the next element.
    EltPtr = DAG.getObjectPtrOffset(SL, EltPtr, Stride);

    Elts.push_back(EltLoad.getValue(0));
    EltChains.push_back(EltLoad.getValue(1));
  }

  SDValue OutChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other, EltChains);
  SDValue Vec = DAG.getBuildVector(LD->getValueType(0), SL, Elts);
  return DAG.getMergeValues({Vec, OutChain}, SL);
}
/// Replace a vector store by scalar stores.
///
/// When the in-memory element type is not byte-sized, the elements are packed
/// into a single integer that is stored in one go; otherwise one truncating
/// store is emitted per element and the stores are joined by a TokenFactor.
SDValue TargetLowering::scalarizeVectorStore(StoreSDNode *ST,
                                             SelectionDAG &DAG) const {
  SDLoc SL(ST);
  SDValue Chain = ST->getChain();
  SDValue BasePtr = ST->getBasePtr();
  SDValue Vec = ST->getValue();
  EVT MemVecVT = ST->getMemoryVT();

  // Element type of the value being stored (register side) ...
  EVT RegVecVT = Vec.getValueType();
  EVT RegEltVT = RegVecVT.getScalarType();
  // ... and element type as it is laid out in memory.
  EVT MemEltVT = MemVecVT.getScalarType();

  EVT IdxVT = getVectorIdxTy(DAG.getDataLayout());
  const unsigned NumElem = MemVecVT.getVectorNumElements();

  // A vector must always be stored in memory as-is, i.e. without any padding
  // between the elements, since various code depend on it, e.g. in the
  // handling of a bitcast of a vector type to int, which may be done with a
  // vector store followed by an integer load. A vector that does not have
  // elements that are byte-sized must therefore be stored as an integer
  // built out of the extracted vector elements.
  if (!MemEltVT.isByteSized()) {
    const unsigned TotalBits = MemVecVT.getSizeInBits();
    EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), TotalBits);

    SDValue Packed = DAG.getConstant(0, SL, IntVT);
    for (unsigned Idx = 0; Idx != NumElem; ++Idx) {
      SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, RegEltVT, Vec,
                                DAG.getConstant(Idx, SL, IdxVT));
      SDValue Narrow = DAG.getNode(ISD::TRUNCATE, SL, MemEltVT, Elt);
      SDValue Wide = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Narrow);

      // Element 0 lands in the most significant slot on big-endian targets
      // and in the least significant slot otherwise.
      unsigned Slot = Idx;
      if (DAG.getDataLayout().isBigEndian())
        Slot = (NumElem - 1) - Idx;

      SDValue ShAmt =
          DAG.getConstant(Slot * MemEltVT.getSizeInBits(), SL, IntVT);
      SDValue Positioned = DAG.getNode(ISD::SHL, SL, IntVT, Wide, ShAmt);
      Packed = DAG.getNode(ISD::OR, SL, IntVT, Packed, Positioned);
    }

    return DAG.getStore(Chain, SL, Packed, BasePtr, ST->getPointerInfo(),
                        ST->getAlignment(), ST->getMemOperand()->getFlags(),
                        ST->getAAInfo());
  }

  // Distance in bytes between consecutive elements in memory.
  const unsigned Stride = MemEltVT.getSizeInBits() / 8;
  assert(Stride && "Zero stride!");

  // Extract each element of the original vector and store it on its own.
  SmallVector<SDValue, 8> EltStores;
  for (unsigned Idx = 0; Idx != NumElem; ++Idx) {
    const unsigned ByteOffset = Idx * Stride;

    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, RegEltVT, Vec,
                              DAG.getConstant(Idx, SL, IdxVT));
    SDValue EltPtr = DAG.getObjectPtrOffset(SL, BasePtr, ByteOffset);

    // This scalar TruncStore may be illegal, but we legalize it later.
    SDValue EltStore = DAG.getTruncStore(
        Chain, SL, Elt, EltPtr, ST->getPointerInfo().getWithOffset(ByteOffset),
        MemEltVT, MinAlign(ST->getAlignment(), ByteOffset),
        ST->getMemOperand()->getFlags(), ST->getAAInfo());
    EltStores.push_back(EltStore);
  }

  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, EltStores);
}
// Expand an unaligned, unindexed load into a sequence of other loads/stores.
// Returns a pair of (loaded value, output chain).
//
// Floating point and vector loads are handled either by an integer load of
// the same size followed by a bitcast, by scalarizing the vector load, or by
// copying the data through a stack slot. Scalar integer loads are split into
// two half-sized loads whose results are recombined with SHL/OR.
std::pair<SDValue, SDValue>
TargetLowering::expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const {
assert(LD->getAddressingMode() == ISD::UNINDEXED &&
"unaligned indexed loads not implemented!");
SDValue Chain = LD->getChain();
SDValue Ptr = LD->getBasePtr();
EVT VT = LD->getValueType(0);
EVT LoadedVT = LD->getMemoryVT();
SDLoc dl(LD);
auto &MF = DAG.getMachineFunction();
if (VT.isFloatingPoint() || VT.isVector()) {
// Integer type with the same number of bits as the in-memory type.
EVT intVT = EVT::getIntegerVT(*DAG.getContext(), LoadedVT.getSizeInBits());
if (isTypeLegal(intVT) && isTypeLegal(LoadedVT)) {
if (!isOperationLegalOrCustom(ISD::LOAD, intVT) &&
LoadedVT.isVector()) {
// Scalarize the load and let the individual components be handled.
SDValue Scalarized = scalarizeVectorLoad(LD, DAG);
// Unpack the (value, chain) pair, looking through a MERGE_VALUES node if
// that is what scalarizeVectorLoad produced.
if (Scalarized->getOpcode() == ISD::MERGE_VALUES)
return std::make_pair(Scalarized.getOperand(0), Scalarized.getOperand(1));
return std::make_pair(Scalarized.getValue(0), Scalarized.getValue(1));
}
// Expand to a (misaligned) integer load of the same size,
// then bitconvert to floating point or vector.
SDValue newLoad = DAG.getLoad(intVT, dl, Chain, Ptr,
LD->getMemOperand());
SDValue Result = DAG.getNode(ISD::BITCAST, dl, LoadedVT, newLoad);
// The original load was extending: widen the bitcast result to VT.
if (LoadedVT != VT)
Result = DAG.getNode(VT.isFloatingPoint() ? ISD::FP_EXTEND :
ISD::ANY_EXTEND, dl, VT, Result);
return std::make_pair(Result, newLoad.getValue(1));
}
// Copy the value to a (aligned) stack slot using (unaligned) integer
// loads and stores, then do a (aligned) load from the stack slot.
MVT RegVT = getRegisterType(*DAG.getContext(), intVT);
unsigned LoadedBytes = LoadedVT.getStoreSize();
unsigned RegBytes = RegVT.getSizeInBits() / 8;
// Number of register-sized pieces needed to cover LoadedBytes (rounded up).
unsigned NumRegs = (LoadedBytes + RegBytes - 1) / RegBytes;
// Make sure the stack slot is also aligned for the register type.
SDValue StackBase = DAG.CreateStackTemporary(LoadedVT, RegVT);
auto FrameIndex = cast<FrameIndexSDNode>(StackBase.getNode())->getIndex();
SmallVector<SDValue, 8> Stores;
SDValue StackPtr = StackBase;
unsigned Offset = 0;
EVT PtrVT = Ptr.getValueType();
EVT StackPtrVT = StackPtr.getValueType();
SDValue PtrIncrement = DAG.getConstant(RegBytes, dl, PtrVT);
SDValue StackPtrIncrement = DAG.getConstant(RegBytes, dl, StackPtrVT);
// Do all but one copies using the full register width.
for (unsigned i = 1; i < NumRegs; i++) {
// Load one integer register's worth from the original location.
SDValue Load = DAG.getLoad(
RegVT, dl, Chain, Ptr, LD->getPointerInfo().getWithOffset(Offset),
MinAlign(LD->getAlignment(), Offset), LD->getMemOperand()->getFlags(),
LD->getAAInfo());
// Follow the load with a store to the stack slot. Remember the store.
Stores.push_back(DAG.getStore(
Load.getValue(1), dl, Load, StackPtr,
MachinePointerInfo::getFixedStack(MF, FrameIndex, Offset)));
// Increment the pointers.
Offset += RegBytes;
Ptr = DAG.getObjectPtrOffset(dl, Ptr, PtrIncrement);
StackPtr = DAG.getObjectPtrOffset(dl, StackPtr, StackPtrIncrement);
}
// The last copy may be partial. Do an extending load.
// MemVT covers exactly the bytes that have not been copied yet.
EVT MemVT = EVT::getIntegerVT(*DAG.getContext(),
8 * (LoadedBytes - Offset));
SDValue Load =
DAG.getExtLoad(ISD::EXTLOAD, dl, RegVT, Chain, Ptr,
LD->getPointerInfo().getWithOffset(Offset), MemVT,
MinAlign(LD->getAlignment(), Offset),
LD->getMemOperand()->getFlags(), LD->getAAInfo());
// Follow the load with a store to the stack slot. Remember the store.
// On big-endian machines this requires a truncating store to ensure
// that the bits end up in the right place.
Stores.push_back(DAG.getTruncStore(
Load.getValue(1), dl, Load, StackPtr,
MachinePointerInfo::getFixedStack(MF, FrameIndex, Offset), MemVT));
// The order of the stores doesn't matter - say it with a TokenFactor.
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
// Finally, perform the original load only redirected to the stack slot.
Load = DAG.getExtLoad(LD->getExtensionType(), dl, VT, TF, StackBase,
MachinePointerInfo::getFixedStack(MF, FrameIndex, 0),
LoadedVT);
// Return the value reloaded from the stack slot, paired with the
// TokenFactor of the stack-slot stores as the chain.
return std::make_pair(Load, TF);
}
assert(LoadedVT.isInteger() && !LoadedVT.isVector() &&
"Unaligned load of unsupported type.");
// Compute the new VT that is half the size of the old one. This is an
// integer MVT.
unsigned NumBits = LoadedVT.getSizeInBits();
EVT NewLoadedVT;
NewLoadedVT = EVT::getIntegerVT(*DAG.getContext(), NumBits/2);
// From here on NumBits is the width of one half.
NumBits >>= 1;
unsigned Alignment = LD->getAlignment();
unsigned IncrementSize = NumBits / 8;
ISD::LoadExtType HiExtType = LD->getExtensionType();
// If the original load is NON_EXTLOAD, the hi part load must be ZEXTLOAD.
if (HiExtType == ISD::NON_EXTLOAD)
HiExtType = ISD::ZEXTLOAD;
// Load the value in two parts
// The low half is always zero-extended; the high half uses HiExtType. Which
// half sits at the lower address depends on the target's endianness.
SDValue Lo, Hi;
if (DAG.getDataLayout().isLittleEndian()) {
Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr, LD->getPointerInfo(),
NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(),
LD->getAAInfo());
Ptr = DAG.getObjectPtrOffset(dl, Ptr, IncrementSize);
Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr,
LD->getPointerInfo().getWithOffset(IncrementSize),
NewLoadedVT, MinAlign(Alignment, IncrementSize),
LD->getMemOperand()->getFlags(), LD->getAAInfo());
} else {
Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr, LD->getPointerInfo(),
NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(),
LD->getAAInfo());
Ptr = DAG.getObjectPtrOffset(dl, Ptr, IncrementSize);
Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr,
LD->getPointerInfo().getWithOffset(IncrementSize),
NewLoadedVT, MinAlign(Alignment, IncrementSize),
LD->getMemOperand()->getFlags(), LD->getAAInfo());
}
// aggregate the two parts
// Result = (Hi << NumBits) | Lo
SDValue ShiftAmount =
DAG.getConstant(NumBits, dl, getShiftAmountTy(Hi.getValueType(),
DAG.getDataLayout()));
SDValue Result = DAG.getNode(ISD::SHL, dl, VT, Hi, ShiftAmount);
Result = DAG.getNode(ISD::OR, dl, VT, Result, Lo);
// Both half loads hang off the original chain; merge their output chains.
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
Hi.getValue(1));
return std::make_pair(Result, TF);
}
// Expand an unaligned, unindexed store into a sequence of other loads/stores.
// Returns the chain (store or TokenFactor node) of the replacement sequence.
//
// Floating point and vector stores are handled either by bitcasting the value
// to an integer and storing that, by scalarizing the vector store, or by
// storing to a stack slot and copying from there. Scalar integer stores are
// split into two half-sized truncating stores.
SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST,
SelectionDAG &DAG) const {
assert(ST->getAddressingMode() == ISD::UNINDEXED &&
"unaligned indexed stores not implemented!");
SDValue Chain = ST->getChain();
SDValue Ptr = ST->getBasePtr();
SDValue Val = ST->getValue();
EVT VT = Val.getValueType();
int Alignment = ST->getAlignment();
auto &MF = DAG.getMachineFunction();
EVT StoreMemVT = ST->getMemoryVT();
SDLoc dl(ST);
if (StoreMemVT.isFloatingPoint() || StoreMemVT.isVector()) {
// Integer type with the same number of bits as the stored value's type.
EVT intVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
if (isTypeLegal(intVT)) {
if (!isOperationLegalOrCustom(ISD::STORE, intVT) &&
StoreMemVT.isVector()) {
// Scalarize the store and let the individual components be handled.
SDValue Result = scalarizeVectorStore(ST, DAG);
return Result;
}
// Expand to a bitconvert of the value to the integer type of the
// same size, then a (misaligned) int store.
// FIXME: Does not handle truncating floating point stores!
SDValue Result = DAG.getNode(ISD::BITCAST, dl, intVT, Val);
Result = DAG.getStore(Chain, dl, Result, Ptr, ST->getPointerInfo(),
Alignment, ST->getMemOperand()->getFlags());
return Result;
}
// Do a (aligned) store to a stack slot, then copy from the stack slot
// to the final destination using (unaligned) integer loads and stores.
MVT RegVT = getRegisterType(
*DAG.getContext(),
EVT::getIntegerVT(*DAG.getContext(), StoreMemVT.getSizeInBits()));
EVT PtrVT = Ptr.getValueType();
unsigned StoredBytes = StoreMemVT.getStoreSize();
unsigned RegBytes = RegVT.getSizeInBits() / 8;
// Number of register-sized pieces needed to cover StoredBytes (rounded up).
unsigned NumRegs = (StoredBytes + RegBytes - 1) / RegBytes;
// Make sure the stack slot is also aligned for the register type.
SDValue StackPtr = DAG.CreateStackTemporary(StoreMemVT, RegVT);
auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
// Perform the original store, only redirected to the stack slot.
SDValue Store = DAG.getTruncStore(
Chain, dl, Val, StackPtr,
MachinePointerInfo::getFixedStack(MF, FrameIndex, 0), StoreMemVT);
EVT StackPtrVT = StackPtr.getValueType();
SDValue PtrIncrement = DAG.getConstant(RegBytes, dl, PtrVT);
SDValue StackPtrIncrement = DAG.getConstant(RegBytes, dl, StackPtrVT);
SmallVector<SDValue, 8> Stores;
unsigned Offset = 0;
// Do all but one copies using the full register width.
for (unsigned i = 1; i < NumRegs; i++) {
// Load one integer register's worth from the stack slot.
// The load is chained on the stack-slot store so it observes the value.
SDValue Load = DAG.getLoad(
RegVT, dl, Store, StackPtr,
MachinePointerInfo::getFixedStack(MF, FrameIndex, Offset));
// Store it to the final location. Remember the store.
Stores.push_back(DAG.getStore(Load.getValue(1), dl, Load, Ptr,
ST->getPointerInfo().getWithOffset(Offset),
MinAlign(ST->getAlignment(), Offset),
ST->getMemOperand()->getFlags()));
// Increment the pointers.
Offset += RegBytes;
StackPtr = DAG.getObjectPtrOffset(dl, StackPtr, StackPtrIncrement);
Ptr = DAG.getObjectPtrOffset(dl, Ptr, PtrIncrement);
}
// The last store may be partial. Do a truncating store. On big-endian
// machines this requires an extending load from the stack slot to ensure
// that the bits are in the right place.
// LoadMemVT covers exactly the bytes that have not been copied yet.
EVT LoadMemVT =
EVT::getIntegerVT(*DAG.getContext(), 8 * (StoredBytes - Offset));
// Load from the stack slot.
SDValue Load = DAG.getExtLoad(
ISD::EXTLOAD, dl, RegVT, Store, StackPtr,
MachinePointerInfo::getFixedStack(MF, FrameIndex, Offset), LoadMemVT);
Stores.push_back(
DAG.getTruncStore(Load.getValue(1), dl, Load, Ptr,
ST->getPointerInfo().getWithOffset(Offset), LoadMemVT,
MinAlign(ST->getAlignment(), Offset),
ST->getMemOperand()->getFlags(), ST->getAAInfo()));
// The order of the stores doesn't matter - say it with a TokenFactor.
SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
return Result;
}
assert(StoreMemVT.isInteger() && !StoreMemVT.isVector() &&
"Unaligned store of unknown type.");
// Get the half-size VT
EVT NewStoredVT = StoreMemVT.getHalfSizedIntegerVT(*DAG.getContext());
int NumBits = NewStoredVT.getSizeInBits();
int IncrementSize = NumBits / 8;
// Divide the stored value in two parts.
// Lo is the unshifted value (the truncating store keeps only its low half);
// Hi is the value shifted right by the half width.
SDValue ShiftAmount = DAG.getConstant(
NumBits, dl, getShiftAmountTy(Val.getValueType(), DAG.getDataLayout()));
SDValue Lo = Val;
SDValue Hi = DAG.getNode(ISD::SRL, dl, VT, Val, ShiftAmount);
// Store the two parts
// The half stored at the lower address depends on the target's endianness.
// NOTE(review): Store1 (like the bitcast store and the full-register stores
// in the stack-slot loop above) is created without ST->getAAInfo(), whereas
// Store2 passes it - confirm whether the difference is intentional.
SDValue Store1, Store2;
Store1 = DAG.getTruncStore(Chain, dl,
DAG.getDataLayout().isLittleEndian() ? Lo : Hi,
Ptr, ST->getPointerInfo(), NewStoredVT, Alignment,
ST->getMemOperand()->getFlags());
Ptr = DAG.getObjectPtrOffset(dl, Ptr, IncrementSize);
Alignment = MinAlign(Alignment, IncrementSize);
Store2 = DAG.getTruncStore(
Chain, dl, DAG.getDataLayout().isLittleEndian() ? Hi : Lo, Ptr,
ST->getPointerInfo().getWithOffset(IncrementSize), NewStoredVT, Alignment,
ST->getMemOperand()->getFlags(), ST->getAAInfo());
// Both stores hang off the original chain; merge them with a TokenFactor.
SDValue Result =
DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);
return Result;
}
/// Compute \p Addr advanced past one \p DataVT-typed memory access.
///
/// For compressed memory the step is (number of set mask bits) * (element
/// size in bytes); otherwise it is the store size of \p DataVT.
SDValue
TargetLowering::IncrementMemoryAddress(SDValue Addr, SDValue Mask,
                                       const SDLoc &DL, EVT DataVT,
                                       SelectionDAG &DAG,
                                       bool IsCompressedMemory) const {
  EVT AddrVT = Addr.getValueType();
  EVT MaskVT = Mask.getValueType();
  assert(DataVT.getVectorNumElements() == MaskVT.getVectorNumElements() &&
         "Incompatible types of Data and Mask");

  // Plain (non-compressed) access: step over the whole vector.
  if (!IsCompressedMemory) {
    SDValue FixedStep = DAG.getConstant(DataVT.getStoreSize(), DL, AddrVT);
    return DAG.getNode(ISD::ADD, DL, AddrVT, Addr, FixedStep);
  }

  // Compressed access: step according to the number of '1's in the mask.
  EVT MaskIntVT = EVT::getIntegerVT(*DAG.getContext(), MaskVT.getSizeInBits());
  SDValue MaskBits = DAG.getBitcast(MaskIntVT, Mask);

  // Masks narrower than 32 bits are widened to i32 before counting.
  if (MaskIntVT.getSizeInBits() < 32) {
    MaskBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, MaskBits);
    MaskIntVT = MVT::i32;
  }

  // Count '1's with POPCNT and bring the count to the address type.
  SDValue ActiveLanes = DAG.getNode(ISD::CTPOP, DL, MaskIntVT, MaskBits);
  ActiveLanes = DAG.getZExtOrTrunc(ActiveLanes, DL, AddrVT);

  // Scale is an element size in bytes.
  SDValue Scale =
      DAG.getConstant(DataVT.getScalarSizeInBits() / 8, DL, AddrVT);
  SDValue ScaledStep = DAG.getNode(ISD::MUL, DL, AddrVT, ActiveLanes, Scale);

  return DAG.getNode(ISD::ADD, DL, AddrVT, Addr, ScaledStep);
}
/// Force a non-constant vector index into the range [0, NumElts - 1].
///
/// Constant indices are returned untouched. For power-of-two element counts
/// the index is masked with AND; otherwise it is clamped with UMIN.
static SDValue clampDynamicVectorIndex(SelectionDAG &DAG,
                                       SDValue Idx,
                                       EVT VecVT,
                                       const SDLoc &dl) {
  if (isa<ConstantSDNode>(Idx))
    return Idx;

  EVT IdxVT = Idx.getValueType();
  const unsigned NumElts = VecVT.getVectorNumElements();

  if (!isPowerOf2_32(NumElts)) {
    // General case: min(Idx, NumElts - 1), unsigned.
    SDValue LastIdx = DAG.getConstant(NumElts - 1, dl, IdxVT);
    return DAG.getNode(ISD::UMIN, dl, IdxVT, Idx, LastIdx);
  }

  // Power of two: keep only the low log2(NumElts) bits of the index.
  APInt LowBits =
      APInt::getLowBitsSet(IdxVT.getSizeInBits(), Log2_32(NumElts));
  SDValue MaskC = DAG.getConstant(LowBits, dl, IdxVT);
  return DAG.getNode(ISD::AND, dl, IdxVT, Idx, MaskC);
}
/// Build the address of element \p Index of the vector of type \p VecVT that
/// lives in memory at \p VecPtr.
///
/// A non-constant index is clamped to the vector's bounds before it is scaled
/// by the element size and added to the base pointer.
SDValue TargetLowering::getVectorElementPointer(SelectionDAG &DAG,
                                                SDValue VecPtr, EVT VecVT,
                                                SDValue Index) const {
  SDLoc dl(Index);

  // Make sure the index type is big enough to compute in.
  Index = DAG.getZExtOrTrunc(Index, dl, VecPtr.getValueType());

  // Element size in bytes; the element type must be a whole number of bytes.
  EVT EltVT = VecVT.getVectorElementType();
  const unsigned EltBytes =
      EltVT.getSizeInBits() / 8; // FIXME: should be ABI size.
  assert(EltBytes * 8 == EltVT.getSizeInBits() &&
         "Converting bits to bytes lost precision");

  Index = clampDynamicVectorIndex(DAG, Index, VecVT, dl);

  // Byte offset = index * element size, then add it to the base pointer.
  EVT IdxVT = Index.getValueType();
  SDValue EltBytesC = DAG.getConstant(EltBytes, dl, IdxVT);
  SDValue ByteOffset = DAG.getNode(ISD::MUL, dl, IdxVT, Index, EltBytesC);
  return DAG.getNode(ISD::ADD, dl, IdxVT, VecPtr, ByteOffset);
}
//===----------------------------------------------------------------------===//
// Implementation of Emulated TLS Model
//===----------------------------------------------------------------------===//
/// Lower the address of a TLS variable under the emulated TLS model.
///
/// Access to the address of TLS variable xyz becomes a call:
///   __emutls_get_address( address of global variable named "__emutls_v.xyz" )
/// The call's result value is returned.
SDValue TargetLowering::LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA,
                                                SelectionDAG &DAG) const {
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  PointerType *VoidPtrType = Type::getInt8PtrTy(*DAG.getContext());
  SDLoc dl(GA);

  // Look up the "__emutls_v.<name>" control variable in the module that owns
  // the TLS global.
  std::string ControlName =
      ("__emutls_v." + GA->getGlobal()->getName()).str();
  Module *OwningModule = const_cast<Module*>(GA->getGlobal()->getParent());
  GlobalVariable *EmuTlsVar =
      OwningModule->getNamedGlobal(StringRef(ControlName));
  assert(EmuTlsVar && "Cannot find EmuTlsVar ");

  // The single call argument is the address of the control variable.
  ArgListEntry ControlArg;
  ControlArg.Node = DAG.getGlobalAddress(EmuTlsVar, dl, PtrVT);
  ControlArg.Ty = VoidPtrType;
  ArgListTy Args;
  Args.push_back(ControlArg);

  SDValue EmuTlsGetAddr = DAG.getExternalSymbol("__emutls_get_address", PtrVT);

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(DAG.getEntryNode())
      .setLibCallee(CallingConv::C, VoidPtrType, EmuTlsGetAddr,
                    std::move(Args));
  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);

  // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
  // At least for X86 targets, maybe good for other targets too?
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  MFI.setAdjustsStack(true); // Is this only for X86 target?
  MFI.setHasCalls(true);

  assert((GA->getOffset() == 0) &&
         "Emulated TLS must have zero offset in GlobalAddressSDNode");
  return CallResult.first;
}
/// Lower (seteq X, 0) to (trunc (srl (ctlz (zext X)), log2(bitwidth))) when
/// the target reports that CTLZ is fast.
///
/// \returns the i32 replacement value, or an empty SDValue when the pattern
///          does not match or CTLZ is not fast.
SDValue TargetLowering::lowerCmpEqZeroToCtlzSrl(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert((Op->getOpcode() == ISD::SETCC) && "Input has to be a SETCC node.");
  if (!isCtlzFast())
    return SDValue();

  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
  SDLoc dl(Op);

  // Only an equality comparison against the constant zero qualifies.
  ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!RHSC || !RHSC->isNullValue() || CC != ISD::SETEQ)
    return SDValue();

  // Operate on at least 32 bits, zero-extending narrower inputs.
  SDValue Src = Op.getOperand(0);
  EVT VT = Src.getValueType();
  if (VT.bitsLT(MVT::i32)) {
    VT = MVT::i32;
    Src = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Op.getOperand(0));
  }

  // ctlz(X) equals the bit width only for X == 0, so shifting it right by
  // log2(bit width) leaves 1 exactly in that case.
  const unsigned Log2Bits = Log2_32(VT.getSizeInBits());
  SDValue LeadingZeros = DAG.getNode(ISD::CTLZ, dl, VT, Src);
  SDValue IsZero = DAG.getNode(ISD::SRL, dl, VT, LeadingZeros,
                               DAG.getConstant(Log2Bits, dl, MVT::i32));
  return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, IsZero);
}
/// Expand a saturating add/sub node (SADDSAT, UADDSAT, SSUBSAT, USUBSAT).
///
/// Unsigned forms use UMAX/UMIN when available. Otherwise the matching
/// overflow-reporting operation is emitted and its overflow flag selects
/// between the raw result and the saturation value.
SDValue TargetLowering::expandAddSubSat(SDNode *Node, SelectionDAG &DAG) const {
  const unsigned Opcode = Node->getOpcode();
  SDValue LHS = Node->getOperand(0);
  SDValue RHS = Node->getOperand(1);
  EVT VT = LHS.getValueType();
  SDLoc dl(Node);

  assert(VT == RHS.getValueType() && "Expected operands to be the same type");
  assert(VT.isInteger() && "Expected operands to be integers");

  const bool IsUSub = Opcode == ISD::USUBSAT;
  const bool IsUAdd = Opcode == ISD::UADDSAT;

  // usub.sat(a, b) -> umax(a, b) - b
  if (IsUSub && isOperationLegalOrCustom(ISD::UMAX, VT)) {
    SDValue Max = DAG.getNode(ISD::UMAX, dl, VT, LHS, RHS);
    return DAG.getNode(ISD::SUB, dl, VT, Max, RHS);
  }

  // uadd.sat(a, b) -> umin(a, ~b) + b
  if (IsUAdd && isOperationLegalOrCustom(ISD::UMIN, VT)) {
    SDValue InvRHS = DAG.getNOT(dl, RHS, VT);
    SDValue Min = DAG.getNode(ISD::UMIN, dl, VT, LHS, InvRHS);
    return DAG.getNode(ISD::ADD, dl, VT, Min, RHS);
  }

  // Pick the overflow-reporting counterpart of the saturating opcode.
  unsigned OverflowOp;
  switch (Opcode) {
  case ISD::SADDSAT:
    OverflowOp = ISD::SADDO;
    break;
  case ISD::UADDSAT:
    OverflowOp = ISD::UADDO;
    break;
  case ISD::SSUBSAT:
    OverflowOp = ISD::SSUBO;
    break;
  case ISD::USUBSAT:
    OverflowOp = ISD::USUBO;
    break;
  default:
    llvm_unreachable("Expected method to receive signed or unsigned saturation "
                     "addition or subtraction node.");
  }

  const unsigned BitWidth = LHS.getScalarValueSizeInBits();
  EVT BoolVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue OvfNode =
      DAG.getNode(OverflowOp, dl, DAG.getVTList(VT, BoolVT), LHS, RHS);
  SDValue SumDiff = OvfNode.getValue(0);
  SDValue Overflow = OvfNode.getValue(1);
  SDValue Zero = DAG.getConstant(0, dl, VT);
  SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);

  // When booleans are 0 / -1 the overflow flag can be used directly as a mask.
  if (IsUAdd) {
    if (getBooleanContents(VT) == ZeroOrNegativeOneBooleanContent) {
      // (LHS + RHS) | OverflowMask
      SDValue OverflowMask = DAG.getSExtOrTrunc(Overflow, dl, VT);
      return DAG.getNode(ISD::OR, dl, VT, SumDiff, OverflowMask);
    }
    // Overflow ? 0xffff.... : (LHS + RHS)
    return DAG.getSelect(dl, VT, Overflow, AllOnes, SumDiff);
  }

  if (IsUSub) {
    if (getBooleanContents(VT) == ZeroOrNegativeOneBooleanContent) {
      // (LHS - RHS) & ~OverflowMask
      SDValue OverflowMask = DAG.getSExtOrTrunc(Overflow, dl, VT);
      SDValue NotMask = DAG.getNOT(dl, OverflowMask, VT);
      return DAG.getNode(ISD::AND, dl, VT, SumDiff, NotMask);
    }
    // Overflow ? 0 : (LHS - RHS)
    return DAG.getSelect(dl, VT, Overflow, Zero, SumDiff);
  }

  // Signed forms:
  // SatMax -> Overflow && SumDiff < 0
  // SatMin -> Overflow && SumDiff >= 0
  APInt MinVal = APInt::getSignedMinValue(BitWidth);
  APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
  SDValue SatMin = DAG.getConstant(MinVal, dl, VT);
  SDValue SatMax = DAG.getConstant(MaxVal, dl, VT);
  SDValue SumNeg = DAG.getSetCC(dl, BoolVT, SumDiff, Zero, ISD::SETLT);
  SDValue Saturated = DAG.getSelect(dl, VT, SumNeg, SatMax, SatMin);
  return DAG.getSelect(dl, VT, Overflow, Saturated, SumDiff);
}
/// Expand a fixed point multiplication node (SMULFIX, UMULFIX, SMULFIXSAT).
///
/// Operand 2 of \p Node is the constant scale (number of fractional bits).
/// The double-width product is formed as a Lo/Hi pair, shifted right by the
/// scale and, for SMULFIXSAT, clamped to the signed range of the type.
///
/// \returns the expanded value, or an empty SDValue for vector types when no
///          suitable multiply-high operation is available. For scalar types
///          that case is reported via report_fatal_error.
SDValue
TargetLowering::expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const {
  assert((Node->getOpcode() == ISD::SMULFIX ||
          Node->getOpcode() == ISD::UMULFIX ||
          Node->getOpcode() == ISD::SMULFIXSAT) &&
         "Expected a fixed point multiplication opcode");

  SDLoc dl(Node);
  SDValue LHS = Node->getOperand(0);
  SDValue RHS = Node->getOperand(1);
  EVT VT = LHS.getValueType();
  unsigned Scale = Node->getConstantOperandVal(2);
  bool Saturating = Node->getOpcode() == ISD::SMULFIXSAT;
  EVT BoolVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  unsigned VTSize = VT.getScalarSizeInBits();

  if (!Scale) {
    // [us]mul.fix(a, b, 0) -> mul(a, b)
    if (!Saturating && isOperationLegalOrCustom(ISD::MUL, VT)) {
      return DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
    } else if (Saturating && isOperationLegalOrCustom(ISD::SMULO, VT)) {
      SDValue Result =
          DAG.getNode(ISD::SMULO, dl, DAG.getVTList(VT, BoolVT), LHS, RHS);
      SDValue Product = Result.getValue(0);
      SDValue Overflow = Result.getValue(1);
      SDValue Zero = DAG.getConstant(0, dl, VT);

      APInt MinVal = APInt::getSignedMinValue(VTSize);
      APInt MaxVal = APInt::getSignedMaxValue(VTSize);
      SDValue SatMin = DAG.getConstant(MinVal, dl, VT);
      SDValue SatMax = DAG.getConstant(MaxVal, dl, VT);
      SDValue ProdNeg = DAG.getSetCC(dl, BoolVT, Product, Zero, ISD::SETLT);
      Result = DAG.getSelect(dl, VT, ProdNeg, SatMax, SatMin);
      return DAG.getSelect(dl, VT, Overflow, Result, Product);
    }
    // Otherwise fall through to the generic Lo/Hi expansion below, which also
    // handles a zero scale.
  }

  bool Signed =
      Node->getOpcode() == ISD::SMULFIX || Node->getOpcode() == ISD::SMULFIXSAT;
  assert(((Signed && Scale < VTSize) || (!Signed && Scale <= VTSize)) &&
         "Expected scale to be less than the number of bits if signed or at "
         "most the number of bits if unsigned.");
  assert(LHS.getValueType() == RHS.getValueType() &&
         "Expected both operands to be the same type");

  // Get the upper and lower bits of the result.
  SDValue Lo, Hi;
  unsigned LoHiOp = Signed ? ISD::SMUL_LOHI : ISD::UMUL_LOHI;
  unsigned HiOp = Signed ? ISD::MULHS : ISD::MULHU;
  if (isOperationLegalOrCustom(LoHiOp, VT)) {
    SDValue Result = DAG.getNode(LoHiOp, dl, DAG.getVTList(VT, VT), LHS, RHS);
    Lo = Result.getValue(0);
    Hi = Result.getValue(1);
  } else if (isOperationLegalOrCustom(HiOp, VT)) {
    Lo = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
    Hi = DAG.getNode(HiOp, dl, VT, LHS, RHS);
  } else if (VT.isVector()) {
    return SDValue();
  } else {
    report_fatal_error("Unable to expand fixed point multiplication.");
  }

  if (Scale == VTSize)
    // Result is just the top half since we'd be shifting by the width of the
    // operand.
    return Hi;

  // The result will need to be shifted right by the scale since both operands
  // are scaled. The result is given to us in 2 halves, so we only want part of
  // both in the result.
  EVT ShiftTy = getShiftAmountTy(VT, DAG.getDataLayout());
  SDValue Result = DAG.getNode(ISD::FSHR, dl, VT, Hi, Lo,
                               DAG.getConstant(Scale, dl, ShiftTy));
  if (!Saturating)
    return Result;

  SDValue SatMin = DAG.getConstant(APInt::getSignedMinValue(VTSize), dl, VT);
  SDValue SatMax = DAG.getConstant(APInt::getSignedMaxValue(VTSize), dl, VT);

  if (Scale == 0) {
    // With a zero scale the masks computed below would need VTSize + 1 high
    // bits and -1 low bits, which cannot be expressed. Instead, the signed
    // product fits in VT exactly when Hi is the sign extension of Lo.
    SDValue Sign = DAG.getNode(ISD::SRA, dl, VT, Lo,
                               DAG.getConstant(VTSize - 1, dl, ShiftTy));
    SDValue Overflow = DAG.getSetCC(dl, BoolVT, Hi, Sign, ISD::SETNE);
    // Saturate to SatMin if the wide product is negative, SatMax otherwise ...
    SDValue Zero = DAG.getConstant(0, dl, VT);
    SDValue ResultIfOverflow =
        DAG.getSelectCC(dl, Hi, Zero, SatMin, SatMax, ISD::SETLT);
    // ... but only if we overflowed.
    return DAG.getSelect(dl, VT, Overflow, ResultIfOverflow, Result);
  }

  // Scale is non-zero here, so every bit that decides overflow lives in Hi.
  unsigned OverflowBits = VTSize - Scale + 1; // +1 for the sign
  SDValue HiMask =
      DAG.getConstant(APInt::getHighBitsSet(VTSize, OverflowBits), dl, VT);
  SDValue LoMask = DAG.getConstant(
      APInt::getLowBitsSet(VTSize, VTSize - OverflowBits), dl, VT);

  // Saturate to max if Hi > LoMask (signed), to min if Hi < HiMask (signed).
  Result = DAG.getSelectCC(dl, Hi, LoMask, SatMax, Result, ISD::SETGT);
  return DAG.getSelectCC(dl, Hi, HiMask, SatMin, Result, ISD::SETLT);
}
/// Expand UADDO / USUBO into a result value and an overflow flag.
///
/// ADDCARRY/SUBCARRY with a zero carry-in is used when available. Otherwise
/// a plain ADD/SUB is emitted and overflow is detected by comparing the
/// result with the left-hand operand.
void TargetLowering::expandUADDSUBO(
    SDNode *Node, SDValue &Result, SDValue &Overflow, SelectionDAG &DAG) const {
  SDLoc dl(Node);
  SDValue LHS = Node->getOperand(0);
  SDValue RHS = Node->getOperand(1);
  const bool IsAdd = Node->getOpcode() == ISD::UADDO;

  // If ADD/SUBCARRY is legal, use that instead.
  const unsigned CarryOpc = IsAdd ? ISD::ADDCARRY : ISD::SUBCARRY;
  if (isOperationLegalOrCustom(CarryOpc, Node->getValueType(0))) {
    SDValue NoCarryIn = DAG.getConstant(0, dl, Node->getValueType(1));
    SDValue WithCarry = DAG.getNode(CarryOpc, dl, Node->getVTList(),
                                    { LHS, RHS, NoCarryIn });
    Result = SDValue(WithCarry.getNode(), 0);
    Overflow = SDValue(WithCarry.getNode(), 1);
    return;
  }

  const unsigned ArithOpc = IsAdd ? ISD::ADD : ISD::SUB;
  Result = DAG.getNode(ArithOpc, dl, LHS.getValueType(), LHS, RHS);

  EVT OverflowVT = Node->getValueType(1);
  EVT CmpVT = getSetCCResultType(
      DAG.getDataLayout(), *DAG.getContext(), Node->getValueType(0));

  // Unsigned add overflowed iff Result < LHS; sub overflowed iff Result > LHS.
  ISD::CondCode Cond = IsAdd ? ISD::SETULT : ISD::SETUGT;
  SDValue Wrapped = DAG.getSetCC(dl, CmpVT, Result, LHS, Cond);
  Overflow = DAG.getBoolExtOrTrunc(Wrapped, dl, OverflowVT, OverflowVT);
}
/// Expand SADDO / SSUBO into a result value and an overflow flag.
///
/// When the matching saturating operation is available, overflow is detected
/// by comparing the plain result with the saturated one. Otherwise it is
/// derived from the signs of the operands and of the result.
void TargetLowering::expandSADDSUBO(
    SDNode *Node, SDValue &Result, SDValue &Overflow, SelectionDAG &DAG) const {
  SDLoc dl(Node);
  SDValue LHS = Node->getOperand(0);
  SDValue RHS = Node->getOperand(1);
  const bool IsAdd = Node->getOpcode() == ISD::SADDO;
  EVT OpVT = LHS.getValueType();

  const unsigned ArithOpc = IsAdd ? ISD::ADD : ISD::SUB;
  Result = DAG.getNode(ArithOpc, dl, OpVT, LHS, RHS);

  EVT OverflowVT = Node->getValueType(1);
  EVT CmpVT = getSetCCResultType(
      DAG.getDataLayout(), *DAG.getContext(), Node->getValueType(0));

  // If SADDSAT/SSUBSAT is legal, compare results to detect overflow.
  const unsigned SatOpc = IsAdd ? ISD::SADDSAT : ISD::SSUBSAT;
  if (isOperationLegalOrCustom(SatOpc, OpVT)) {
    SDValue Sat = DAG.getNode(SatOpc, dl, OpVT, LHS, RHS);
    SDValue Differs = DAG.getSetCC(dl, CmpVT, Result, Sat, ISD::SETNE);
    Overflow = DAG.getBoolExtOrTrunc(Differs, dl, OverflowVT, OverflowVT);
    return;
  }

  SDValue Zero = DAG.getConstant(0, dl, OpVT);

  // LHSSign -> LHS >= 0
  // RHSSign -> RHS >= 0
  // SumSign -> Result >= 0
  //
  // Add:
  // Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
  // Sub:
  // Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
  SDValue LHSSign = DAG.getSetCC(dl, CmpVT, LHS, Zero, ISD::SETGE);
  SDValue RHSSign = DAG.getSetCC(dl, CmpVT, RHS, Zero, ISD::SETGE);
  ISD::CondCode OperandSignCond = IsAdd ? ISD::SETEQ : ISD::SETNE;
  SDValue OperandSigns =
      DAG.getSetCC(dl, CmpVT, LHSSign, RHSSign, OperandSignCond);

  SDValue SumSign = DAG.getSetCC(dl, CmpVT, Result, Zero, ISD::SETGE);
  SDValue SignFlipped = DAG.getSetCC(dl, CmpVT, LHSSign, SumSign, ISD::SETNE);

  SDValue Both = DAG.getNode(ISD::AND, dl, CmpVT, OperandSigns, SignFlipped);
  Overflow = DAG.getBoolExtOrTrunc(Both, dl, OverflowVT, OverflowVT);
}
/// Expand an SMULO/UMULO node into a multiply plus an explicit overflow test.
///
/// \param Node     the [SU]MULO node; operands 0 and 1 are the two factors.
/// \param Result   receives the product value (X << S on the power-of-two
///                 path, otherwise the low half of the full product).
/// \param Overflow receives the overflow flag.
/// \param DAG      the DAG in which the replacement nodes are created.
/// \returns false only when the type is a vector and none of MULH[SU],
///          [SU]MUL_LOHI or a legal double-width type is available; true in
///          every other case.
bool TargetLowering::expandMULO(SDNode *Node, SDValue &Result,
SDValue &Overflow, SelectionDAG &DAG) const {
SDLoc dl(Node);
EVT VT = Node->getValueType(0);
EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
SDValue LHS = Node->getOperand(0);
SDValue RHS = Node->getOperand(1);
bool isSigned = Node->getOpcode() == ISD::SMULO;
// For power-of-two multiplications we can use a simpler shift expansion.
if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
const APInt &C = RHSC->getAPIntValue();
// mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
if (C.isPowerOf2()) {
// smulo(x, signed_min) is same as umulo(x, signed_min).
bool UseArithShift = isSigned && !C.isMinSignedValue();
EVT ShiftAmtTy = getShiftAmountTy(VT, DAG.getDataLayout());
SDValue ShiftAmt = DAG.getConstant(C.logBase2(), dl, ShiftAmtTy);
Result = DAG.getNode(ISD::SHL, dl, VT, LHS, ShiftAmt);
Overflow = DAG.getSetCC(dl, SetCCVT,
DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL,
dl, VT, Result, ShiftAmt),
LHS, ISD::SETNE);
return true;
}
}
// General case: compute both halves of the double-width product. The
// strategies are tried in this order: MULH[SU], [SU]MUL_LOHI, a multiply in
// the double-width type, and finally a multiply libcall.
EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), VT.getScalarSizeInBits() * 2);
if (VT.isVector())
WideVT = EVT::getVectorVT(*DAG.getContext(), WideVT,
VT.getVectorNumElements());
SDValue BottomHalf;
SDValue TopHalf;
// Indexed by isSigned: { high-multiply, lo/hi-multiply, extension } opcodes.
static const unsigned Ops[2][3] =
{ { ISD::MULHU, ISD::UMUL_LOHI, ISD::ZERO_EXTEND },
{ ISD::MULHS, ISD::SMUL_LOHI, ISD::SIGN_EXTEND }};
if (isOperationLegalOrCustom(Ops[isSigned][0], VT)) {
BottomHalf = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
TopHalf = DAG.getNode(Ops[isSigned][0], dl, VT, LHS, RHS);
} else if (isOperationLegalOrCustom(Ops[isSigned][1], VT)) {
BottomHalf = DAG.getNode(Ops[isSigned][1], dl, DAG.getVTList(VT, VT), LHS,
RHS);
TopHalf = BottomHalf.getValue(1);
} else if (isTypeLegal(WideVT)) {
// Extend both operands, multiply in the wide type, then split the product.
LHS = DAG.getNode(Ops[isSigned][2], dl, WideVT, LHS);
RHS = DAG.getNode(Ops[isSigned][2], dl, WideVT, RHS);
SDValue Mul = DAG.getNode(ISD::MUL, dl, WideVT, LHS, RHS);
BottomHalf = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
SDValue ShiftAmt = DAG.getConstant(VT.getScalarSizeInBits(), dl,
getShiftAmountTy(WideVT, DAG.getDataLayout()));
TopHalf = DAG.getNode(ISD::TRUNCATE, dl, VT,
DAG.getNode(ISD::SRL, dl, WideVT, Mul, ShiftAmt));
} else {
// The libcall fallback below only handles scalars.
if (VT.isVector())
return false;
// We can fall back to a libcall with an illegal type for the MUL if we
// have a libcall big enough.
// Also, we can fall back to a division in some cases, but that's a big
// performance hit in the general case.
RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
if (WideVT == MVT::i16)
LC = RTLIB::MUL_I16;
else if (WideVT == MVT::i32)
LC = RTLIB::MUL_I32;
else if (WideVT == MVT::i64)
LC = RTLIB::MUL_I64;
else if (WideVT == MVT::i128)
LC = RTLIB::MUL_I128;
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Cannot expand this operation!");
SDValue HiLHS;
SDValue HiRHS;
if (isSigned) {
// The high part is obtained by SRA'ing all but one of the bits of low
// part.
unsigned LoSize = VT.getSizeInBits();
HiLHS =
DAG.getNode(ISD::SRA, dl, VT, LHS,
DAG.getConstant(LoSize - 1, dl,
getPointerTy(DAG.getDataLayout())));
HiRHS =
DAG.getNode(ISD::SRA, dl, VT, RHS,
DAG.getConstant(LoSize - 1, dl,
getPointerTy(DAG.getDataLayout())));
} else {
// Unsigned operands are zero-extended, so their high parts are zero.
HiLHS = DAG.getConstant(0, dl, VT);
HiRHS = DAG.getConstant(0, dl, VT);
}
// Here we're passing the 2 arguments explicitly as 4 arguments that are
// pre-lowered to the correct types. This all depends upon WideVT not
// being a legal type for the architecture and thus has to be split to
// two arguments.
SDValue Ret;
if (shouldSplitFunctionArgumentsAsLittleEndian(DAG.getDataLayout())) {
// Halves of WideVT are packed into registers in different order
// depending on platform endianness. This is usually handled by
// the C calling convention, but we can't defer to it in
// the legalizer.
SDValue Args[] = { LHS, HiLHS, RHS, HiRHS };
Ret = makeLibCall(DAG, LC, WideVT, Args, isSigned, dl,
/* doesNotReturn */ false, /* isReturnValueUsed */ true,
/* isPostTypeLegalization */ true).first;
} else {
SDValue Args[] = { HiLHS, LHS, HiRHS, RHS };
Ret = makeLibCall(DAG, LC, WideVT, Args, isSigned, dl,
/* doesNotReturn */ false, /* isReturnValueUsed */ true,
/* isPostTypeLegalization */ true).first;
}
assert(Ret.getOpcode() == ISD::MERGE_VALUES &&
"Ret value is a collection of constituent nodes holding result.");
if (DAG.getDataLayout().isLittleEndian()) {
// Same as above.
BottomHalf = Ret.getOperand(0);
TopHalf = Ret.getOperand(1);
} else {
BottomHalf = Ret.getOperand(1);
TopHalf = Ret.getOperand(0);
}
}
Result = BottomHalf;
// Signed: overflow iff the top half differs from the sign bits of the bottom
// half. Unsigned: overflow iff the top half is non-zero.
if (isSigned) {
SDValue ShiftAmt = DAG.getConstant(
VT.getScalarSizeInBits() - 1, dl,
getShiftAmountTy(BottomHalf.getValueType(), DAG.getDataLayout()));
SDValue Sign = DAG.getNode(ISD::SRA, dl, VT, BottomHalf, ShiftAmt);
Overflow = DAG.getSetCC(dl, SetCCVT, TopHalf, Sign, ISD::SETNE);
} else {
Overflow = DAG.getSetCC(dl, SetCCVT, TopHalf,
DAG.getConstant(0, dl, VT), ISD::SETNE);
}
// Truncate the result if SetCC returns a larger type than needed.
EVT RType = Node->getValueType(1);
if (RType.getSizeInBits() < Overflow.getValueSizeInBits())
Overflow = DAG.getNode(ISD::TRUNCATE, dl, RType, Overflow);
assert(RType.getSizeInBits() == Overflow.getValueSizeInBits() &&
"Unexpected result type for S/UMULO legalization");
return true;
}
/// Expand a VECREDUCE_* node into a sequence of ordinary binary operations.
///
/// The vector operand is first halved repeatedly (combining the two halves
/// with the base operation) for as long as the vector has a power-of-two
/// element count and the base operation is legal or custom on the half-width
/// type. Whatever lanes remain are then extracted and folded one by one.
/// \returns the reduced value, any-extended to the node's result type when
///          that differs from the element type.
SDValue TargetLowering::expandVecReduce(SDNode *Node, SelectionDAG &DAG) const {
  SDLoc Loc(Node);
  const bool AssumeNoNaNs = Node->getFlags().hasNoNaNs();

  // Pick the binary opcode that combines two lanes of this reduction.
  unsigned CombineOpc = 0;
  switch (Node->getOpcode()) {
  case ISD::VECREDUCE_FADD:
    CombineOpc = ISD::FADD;
    break;
  case ISD::VECREDUCE_FMUL:
    CombineOpc = ISD::FMUL;
    break;
  case ISD::VECREDUCE_ADD:
    CombineOpc = ISD::ADD;
    break;
  case ISD::VECREDUCE_MUL:
    CombineOpc = ISD::MUL;
    break;
  case ISD::VECREDUCE_AND:
    CombineOpc = ISD::AND;
    break;
  case ISD::VECREDUCE_OR:
    CombineOpc = ISD::OR;
    break;
  case ISD::VECREDUCE_XOR:
    CombineOpc = ISD::XOR;
    break;
  case ISD::VECREDUCE_SMAX:
    CombineOpc = ISD::SMAX;
    break;
  case ISD::VECREDUCE_SMIN:
    CombineOpc = ISD::SMIN;
    break;
  case ISD::VECREDUCE_UMAX:
    CombineOpc = ISD::UMAX;
    break;
  case ISD::VECREDUCE_UMIN:
    CombineOpc = ISD::UMIN;
    break;
  case ISD::VECREDUCE_FMAX:
    CombineOpc = AssumeNoNaNs ? ISD::FMAXNUM : ISD::FMAXIMUM;
    break;
  case ISD::VECREDUCE_FMIN:
    CombineOpc = AssumeNoNaNs ? ISD::FMINNUM : ISD::FMINIMUM;
    break;
  default:
    llvm_unreachable("Expected VECREDUCE opcode");
  }

  SDValue Vec = Node->getOperand(0);
  EVT VecVT = Vec.getValueType();

  // Power-of-two vectors: fold the upper half onto the lower half while the
  // half-width operation is available.
  if (VecVT.isPow2VectorType()) {
    while (VecVT.getVectorNumElements() > 1) {
      EVT HalfVT = VecVT.getHalfNumVectorElementsVT(*DAG.getContext());
      if (!isOperationLegalOrCustom(CombineOpc, HalfVT))
        break;
      auto Halves = DAG.SplitVector(Vec, Loc);
      Vec = DAG.getNode(CombineOpc, Loc, HalfVT, Halves.first, Halves.second);
      VecVT = HalfVT;
    }
  }

  // Scalarise what is left and accumulate lane by lane.
  EVT EltVT = VecVT.getVectorElementType();
  unsigned NumElts = VecVT.getVectorNumElements();
  SmallVector<SDValue, 8> Lanes;
  DAG.ExtractVectorElements(Vec, Lanes, 0, NumElts);
  SDValue Acc = Lanes[0];
  for (unsigned Idx = 1; Idx < NumElts; ++Idx)
    Acc = DAG.getNode(CombineOpc, Loc, EltVT, Acc, Lanes[Idx],
                      Node->getFlags());

  // Result type may be wider than element type.
  EVT ResVT = Node->getValueType(0);
  if (EltVT == ResVT)
    return Acc;
  return DAG.getNode(ISD::ANY_EXTEND, Loc, ResVT, Acc);
}
Index: vendor/llvm/dist-release_90/lib/MC/MCContext.cpp
===================================================================
--- vendor/llvm/dist-release_90/lib/MC/MCContext.cpp (revision 351302)
+++ vendor/llvm/dist-release_90/lib/MC/MCContext.cpp (revision 351303)
@@ -1,701 +1,707 @@
//===- lib/MC/MCContext.cpp - Machine Code Context ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCContext.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/COFF.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCCodeView.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFragment.h"
#include "llvm/MC/MCLabel.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCSectionCOFF.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCSectionWasm.h"
#include "llvm/MC/MCSectionXCOFF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolCOFF.h"
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCSymbolMachO.h"
#include "llvm/MC/MCSymbolWasm.h"
#include "llvm/MC/MCSymbolXCOFF.h"
#include "llvm/MC/SectionKind.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cstdlib>
#include <tuple>
#include <utility>
using namespace llvm;
// Hidden command-line option "as-secure-log-file-name". Its initial value is
// whatever getenv("AS_SECURE_LOG_FILE") returns when this object is
// initialized (a null pointer if the variable is unset).
static cl::opt<char*>
AsSecureLogFileName("as-secure-log-file-name",
cl::desc("As secure log file name (initialized from "
"AS_SECURE_LOG_FILE env variable)"),
cl::init(getenv("AS_SECURE_LOG_FILE")), cl::Hidden);
/// Construct a context from the given target descriptions.
///
/// \param mai         assembler info, stored in MAI.
/// \param mri         register info, stored in MRI.
/// \param mofi        object-file info, stored in MOFI.
/// \param mgr         source manager, stored in SrcMgr; may be null (the body
///                    checks it before use).
/// \param DoAutoReset stored in AutoReset.
///
/// The symbol and name tables are bound to this context's Allocator.
/// SecureLogFile is taken from the as-secure-log-file-name option, and
/// MainFileName is set to the identifier of the source manager's main buffer
/// when a source manager with at least one buffer is supplied.
MCContext::MCContext(const MCAsmInfo *mai, const MCRegisterInfo *mri,
const MCObjectFileInfo *mofi, const SourceMgr *mgr,
bool DoAutoReset)
: SrcMgr(mgr), InlineSrcMgr(nullptr), MAI(mai), MRI(mri), MOFI(mofi),
Symbols(Allocator), UsedNames(Allocator),
+ InlineAsmUsedLabelNames(Allocator),
CurrentDwarfLoc(0, 0, 0, DWARF2_FLAG_IS_STMT, 0, 0),
AutoReset(DoAutoReset) {
SecureLogFile = AsSecureLogFileName;
if (SrcMgr && SrcMgr->getNumBuffers())
MainFileName =
SrcMgr->getMemoryBuffer(SrcMgr->getMainFileID())->getBufferIdentifier();
}
/// Tear the context down, running reset() first when AutoReset is set.
MCContext::~MCContext() {
  // Symbols come out of a bump pointer allocator, so there is nothing to free
  // for them individually here.
  if (!AutoReset)
    return;
  reset();
}
//===----------------------------------------------------------------------===//
// Module Lifetime Management
//===----------------------------------------------------------------------===//
/// Clear the per-module state held by this context.
///
/// Destroys every object held by the per-format section allocators and the
/// subtarget allocator, clears the symbol/name tables and the section
/// uniquing maps, resets the bump allocator, and restores the DWARF, CodeView
/// and error-tracking members to the default values assigned below.
/// NOTE(review): presumably this lets the context be reused for another
/// module -- confirm against callers.
void MCContext::reset() {
// Call the destructors so the fragments are freed
COFFAllocator.DestroyAll();
ELFAllocator.DestroyAll();
MachOAllocator.DestroyAll();
XCOFFAllocator.DestroyAll();
MCSubtargetAllocator.DestroyAll();
+ InlineAsmUsedLabelNames.clear();
UsedNames.clear();
Symbols.clear();
// The tables above are cleared before the allocator they were constructed
// with is reset.
Allocator.Reset();
Instances.clear();
CompilationDir.clear();
MainFileName.clear();
MCDwarfLineTablesCUMap.clear();
SectionsForRanges.clear();
MCGenDwarfLabelEntries.clear();
DwarfDebugFlags = StringRef();
DwarfCompileUnitID = 0;
CurrentDwarfLoc = MCDwarfLoc(0, 0, 0, DWARF2_FLAG_IS_STMT, 0, 0);
CVContext.reset();
MachOUniquingMap.clear();
ELFUniquingMap.clear();
COFFUniquingMap.clear();
WasmUniquingMap.clear();
XCOFFUniquingMap.clear();
NextID.clear();
AllowTemporaryLabels = true;
DwarfLocSeen = false;
GenDwarfForAssembly = false;
GenDwarfFileNumber = 0;
HadError = false;
}
//===----------------------------------------------------------------------===//
// Symbol Manipulation
//===----------------------------------------------------------------------===//
/// Return the symbol registered under \p Name, creating it on first use.
/// The name must be non-empty.
MCSymbol *MCContext::getOrCreateSymbol(const Twine &Name) {
  SmallString<128> Storage;
  StringRef Key = Name.toStringRef(Storage);
  assert(!Key.empty() && "Normal symbols cannot be unnamed!");

  MCSymbol *&Slot = Symbols[Key];
  if (Slot)
    return Slot;

  // First request for this name: create a plain (no suffix, named) symbol.
  Slot = createSymbol(Key, false, false);
  return Slot;
}
/// Get or create the symbol named
/// "<private-global-prefix><FuncName>$frame_escape_<Idx>".
MCSymbol *MCContext::getOrCreateFrameAllocSymbol(StringRef FuncName,
                                                 unsigned Idx) {
  StringRef Prefix = MAI->getPrivateGlobalPrefix();
  // The Twine is built and consumed within a single expression.
  return getOrCreateSymbol(Twine(Prefix) + FuncName + "$frame_escape_" +
                           Twine(Idx));
}
/// Get or create the symbol named
/// "<private-global-prefix><FuncName>$parent_frame_offset".
MCSymbol *MCContext::getOrCreateParentFrameOffsetSymbol(StringRef FuncName) {
  StringRef Prefix = MAI->getPrivateGlobalPrefix();
  return getOrCreateSymbol(Twine(Prefix) + FuncName + "$parent_frame_offset");
}
/// Get or create the symbol named
/// "<private-global-prefix>__ehtable$<FuncName>".
MCSymbol *MCContext::getOrCreateLSDASymbol(StringRef FuncName) {
  StringRef Prefix = MAI->getPrivateGlobalPrefix();
  return getOrCreateSymbol(Twine(Prefix) + "__ehtable$" + FuncName);
}
/// Allocate a symbol object of the class matching the object-file format.
///
/// \param Name        name-table entry the symbol refers to (may be null).
/// \param IsTemporary forwarded to the symbol constructor.
/// \returns a format-specific symbol when MOFI is set and reports one of the
///          handled formats, otherwise a generic MCSymbol with kind
///          SymbolKindUnset.
MCSymbol *MCContext::createSymbolImpl(const StringMapEntry<bool> *Name,
                                      bool IsTemporary) {
  if (MOFI) {
    const auto Format = MOFI->getObjectFileType();
    if (Format == MCObjectFileInfo::IsCOFF)
      return new (Name, *this) MCSymbolCOFF(Name, IsTemporary);
    if (Format == MCObjectFileInfo::IsELF)
      return new (Name, *this) MCSymbolELF(Name, IsTemporary);
    if (Format == MCObjectFileInfo::IsMachO)
      return new (Name, *this) MCSymbolMachO(Name, IsTemporary);
    if (Format == MCObjectFileInfo::IsWasm)
      return new (Name, *this) MCSymbolWasm(Name, IsTemporary);
    if (Format == MCObjectFileInfo::IsXCOFF)
      return new (Name, *this) MCSymbolXCOFF(Name, IsTemporary);
  }

  // No object-file info, or a format not listed above.
  return new (Name, *this)
      MCSymbol(MCSymbol::SymbolKindUnset, Name, IsTemporary);
}
/// Create a new symbol, choosing a name that is not already taken.
///
/// \param Name            requested base name.
/// \param AlwaysAddSuffix append a numeric suffix even on the first attempt.
/// \param CanBeUnnamed    when set and temp-label names are disabled, an
///                        unnamed temporary is returned immediately.
///
/// If the candidate name is already used by a non-section symbol, a counter
/// kept per base name in NextID is appended and the attempt is repeated; this
/// is only permitted for temporary symbols.
MCSymbol *MCContext::createSymbol(StringRef Name, bool AlwaysAddSuffix,
                                  bool CanBeUnnamed) {
  if (CanBeUnnamed && !UseNamesOnTempLabels)
    return createSymbolImpl(nullptr, true);

  // Determine whether this is a user written assembler temporary or normal
  // label, if used.
  bool IsTemporary = CanBeUnnamed;
  if (!IsTemporary && AllowTemporaryLabels)
    IsTemporary = Name.startswith(MAI->getPrivateGlobalPrefix());

  SmallString<128> Candidate = Name;
  unsigned &Counter = NextID[Name];
  for (bool WantSuffix = AlwaysAddSuffix;; WantSuffix = true) {
    if (WantSuffix) {
      // Drop any suffix from a previous attempt, then append the next id.
      Candidate.resize(Name.size());
      raw_svector_ostream(Candidate) << Counter++;
    }

    auto Inserted = UsedNames.insert(std::make_pair(Candidate, true));
    auto &NameEntry = *Inserted.first;
    if (Inserted.second || !NameEntry.second) {
      // Either a brand new name, or one so far only recorded with a false
      // flag. Mark it as used for a non-section symbol and let the MCSymbol
      // refer to the string stored inside the UsedNames entry.
      NameEntry.second = true;
      return createSymbolImpl(&NameEntry, IsTemporary);
    }

    assert(IsTemporary && "Cannot rename non-temporary symbols");
  }
  llvm_unreachable("Infinite loop");
}
/// Create a symbol whose name is the private global prefix followed by
/// \p Name; the remaining arguments are forwarded to createSymbol().
MCSymbol *MCContext::createTempSymbol(const Twine &Name, bool AlwaysAddSuffix,
                                      bool CanBeUnnamed) {
  SmallString<128> FullName;
  raw_svector_ostream OS(FullName);
  OS << MAI->getPrivateGlobalPrefix() << Name;
  return createSymbol(FullName, AlwaysAddSuffix, CanBeUnnamed);
}
/// Create a symbol named "<linker-private-global-prefix>tmp" followed by a
/// numeric suffix; the symbol is always named.
MCSymbol *MCContext::createLinkerPrivateTempSymbol() {
  SmallString<128> FullName;
  raw_svector_ostream OS(FullName);
  OS << MAI->getLinkerPrivateGlobalPrefix() << "tmp";
  return createSymbol(FullName, /*AlwaysAddSuffix=*/true,
                      /*CanBeUnnamed=*/false);
}
/// Create a temporary symbol with base name "tmp" that always receives a
/// numeric suffix.
MCSymbol *MCContext::createTempSymbol(bool CanBeUnnamed) {
  const bool AlwaysAddSuffix = true;
  return createTempSymbol("tmp", AlwaysAddSuffix, CanBeUnnamed);
}
unsigned MCContext::NextInstance(unsigned LocalLabelVal) {
MCLabel *&Label = Instances[LocalLabelVal];
if (!Label)
Label = new (*this) MCLabel(0);
return Label->incInstance();
}
unsigned MCContext::GetInstance(unsigned LocalLabelVal) {
MCLabel *&Label = Instances[LocalLabelVal];
if (!Label)
Label = new (*this) MCLabel(0);
return Label->getInstance();
}
/// Return the symbol cached for the (label value, instance) pair, creating a
/// named temporary symbol for it on first use.
MCSymbol *MCContext::getOrCreateDirectionalLocalSymbol(unsigned LocalLabelVal,
                                                       unsigned Instance) {
  MCSymbol *&Slot = LocalSymbols[std::make_pair(LocalLabelVal, Instance)];
  if (Slot == nullptr)
    Slot = createTempSymbol(false);
  return Slot;
}
/// Advance the instance counter of \p LocalLabelVal and return the symbol for
/// the instance number that NextInstance() reports.
MCSymbol *MCContext::createDirectionalLocalSymbol(unsigned LocalLabelVal) {
  return getOrCreateDirectionalLocalSymbol(LocalLabelVal,
                                           NextInstance(LocalLabelVal));
}
/// Return the symbol for a directional reference to \p LocalLabelVal: the
/// current instance when \p Before is set, otherwise the following one.
MCSymbol *MCContext::getDirectionalLocalSymbol(unsigned LocalLabelVal,
                                               bool Before) {
  const unsigned Current = GetInstance(LocalLabelVal);
  const unsigned Wanted = Before ? Current : Current + 1;
  return getOrCreateDirectionalLocalSymbol(LocalLabelVal, Wanted);
}
/// Look \p Name up in the symbol table without creating anything; returns
/// whatever Symbols.lookup() yields for the name.
MCSymbol *MCContext::lookupSymbol(const Twine &Name) const {
  SmallString<128> Storage;
  return Symbols.lookup(Name.toStringRef(Storage));
}
/// Get or create the symbol named \p Sym and emit, through \p Streamer, an
/// assignment of the constant \p Val to it.
void MCContext::setSymbolValue(MCStreamer &Streamer,
StringRef Sym,
uint64_t Val) {
auto Symbol = getOrCreateSymbol(Sym);
Streamer.EmitAssignment(Symbol, MCConstantExpr::create(Val, *this));
+}
+
/// Record \p Sym in InlineAsmUsedLabelNames under its own name; an existing
/// entry with the same name is overwritten.
+void MCContext::registerInlineAsmLabel(MCSymbol *Sym) {
+ InlineAsmUsedLabelNames[Sym->getName()] = Sym;
}
//===----------------------------------------------------------------------===//
// Section Management
//===----------------------------------------------------------------------===//
/// Return the Mach-O section identified by the "Segment,Section" pair,
/// creating it if needed.
///
/// Sections are uniqued by that pair only, so an existing section may not
/// carry the flags requested here; diagnosing that is left to the caller.
/// \param BeginSymName when non-null and a new section is created, a temporary
///                     symbol with this name becomes its begin symbol.
MCSectionMachO *MCContext::getMachOSection(StringRef Segment, StringRef Section,
                                           unsigned TypeAndAttributes,
                                           unsigned Reserved2, SectionKind Kind,
                                           const char *BeginSymName) {
  // Form the uniquing key "Segment,Section".
  SmallString<64> Key(Segment);
  Key += ',';
  Key += Section;

  MCSectionMachO *&Slot = MachOUniquingMap[Key];
  if (!Slot) {
    MCSymbol *Begin =
        BeginSymName ? createTempSymbol(BeginSymName, false) : nullptr;
    Slot = new (MachOAllocator.Allocate()) MCSectionMachO(
        Segment, Section, TypeAndAttributes, Reserved2, Kind, Begin);
  }
  return Slot;
}
/// Re-register \p Section in the ELF uniquing map under \p Name and make the
/// section refer to the name string stored in the new map key.
void MCContext::renameELFSection(MCSectionELF *Section, StringRef Name) {
  const MCSymbol *Group = Section->getGroup();
  StringRef GroupName = Group ? Group->getName() : StringRef();
  const unsigned UniqueID = Section->getUniqueID();

  // Drop the entry keyed by the old name before inserting the new one.
  ELFUniquingMap.erase(
      ELFSectionKey{Section->getSectionName(), GroupName, UniqueID});
  auto Pos = ELFUniquingMap
                 .insert(std::make_pair(
                     ELFSectionKey{Name, GroupName, UniqueID}, Section))
                 .first;

  // Point the section at the key's copy of the name rather than the caller's.
  Section->setSectionName(Pos->first.SectionName);
}
/// Create a new ELF section together with its local STT_SECTION symbol.
///
/// The section name is looked up in Symbols. If a symbol of that name exists
/// and is defined but is not the begin symbol of its own section, "invalid
/// symbol redefinition" is reported and creation still proceeds. An existing
/// undefined symbol is reused as the section symbol. Otherwise a new
/// MCSymbolELF is created whose name entry is inserted into UsedNames with
/// the value false, and it is stored in Symbols only if the slot was empty.
///
/// The new section is given an initial MCDataFragment, and the section symbol
/// is attached to that fragment.
MCSectionELF *MCContext::createELFSectionImpl(StringRef Section, unsigned Type,
unsigned Flags, SectionKind K,
unsigned EntrySize,
const MCSymbolELF *Group,
unsigned UniqueID,
const MCSymbolELF *Associated) {
MCSymbolELF *R;
MCSymbol *&Sym = Symbols[Section];
// A section symbol can not redefine regular symbols. There may be multiple
// sections with the same name, in which case the first such section wins.
if (Sym && Sym->isDefined() &&
(!Sym->isInSection() || Sym->getSection().getBeginSymbol() != Sym))
reportError(SMLoc(), "invalid symbol redefinition");
if (Sym && Sym->isUndefined()) {
R = cast<MCSymbolELF>(Sym);
} else {
auto NameIter = UsedNames.insert(std::make_pair(Section, false)).first;
R = new (&*NameIter, *this) MCSymbolELF(&*NameIter, /*isTemporary*/ false);
if (!Sym)
Sym = R;
}
R->setBinding(ELF::STB_LOCAL);
R->setType(ELF::STT_SECTION);
auto *Ret = new (ELFAllocator.Allocate()) MCSectionELF(
Section, Type, Flags, K, EntrySize, Group, UniqueID, R, Associated);
// Seed the section with one data fragment and attach the symbol to it.
auto *F = new MCDataFragment();
Ret->getFragmentList().insert(Ret->begin(), F);
F->setParent(Ret);
R->setFragment(F);
return Ret;
}
/// Create a read-only ELF section associated with the begin symbol of
/// \p RelInfoSection. The name is interned in RelSecNames and the interned
/// key is what the new section is created with.
MCSectionELF *MCContext::createELFRelSection(const Twine &Name, unsigned Type,
                                             unsigned Flags, unsigned EntrySize,
                                             const MCSymbolELF *Group,
                                             const MCSectionELF *RelInfoSection) {
  auto NameIt = RelSecNames.insert(std::make_pair(Name.str(), true)).first;
  // NOTE(review): the literal `true` below lands in the unsigned UniqueID
  // parameter of createELFSectionImpl (i.e. the value 1) -- confirm intent.
  return createELFSectionImpl(
      NameIt->getKey(), Type, Flags, SectionKind::getReadOnly(), EntrySize,
      Group, true, cast<MCSymbolELF>(RelInfoSection->getBeginSymbol()));
}
/// Get or create the ELF section named "<Prefix>.<Suffix>", passing
/// \p Suffix as the group name.
MCSectionELF *MCContext::getELFNamedSection(const Twine &Prefix,
const Twine &Suffix, unsigned Type,
unsigned Flags,
unsigned EntrySize) {
// The concatenated Twine is built and consumed within this one expression.
return getELFSection(Prefix + "." + Suffix, Type, Flags, EntrySize, Suffix);
}
/// Get or create an ELF section whose group is given by name. A non-empty
/// \p Group is turned into a group symbol (created if necessary); an empty
/// one means no group.
MCSectionELF *MCContext::getELFSection(const Twine &Section, unsigned Type,
                                       unsigned Flags, unsigned EntrySize,
                                       const Twine &Group, unsigned UniqueID,
                                       const MCSymbolELF *Associated) {
  const bool HasGroup = !Group.isTriviallyEmpty() && !Group.str().empty();
  MCSymbolELF *GroupSym =
      HasGroup ? cast<MCSymbolELF>(getOrCreateSymbol(Group)) : nullptr;
  return getELFSection(Section, Type, Flags, EntrySize, GroupSym, UniqueID,
                       Associated);
}
/// Get or create the ELF section uniquely identified by (name, group name,
/// UniqueID).
///
/// When a new section is created its SectionKind is derived from \p Flags:
/// execute-only for SHF_ARM_PURECODE, text for SHF_EXECINSTR, read-only
/// otherwise.
MCSectionELF *MCContext::getELFSection(const Twine &Section, unsigned Type,
                                       unsigned Flags, unsigned EntrySize,
                                       const MCSymbolELF *GroupSym,
                                       unsigned UniqueID,
                                       const MCSymbolELF *Associated) {
  StringRef Group = GroupSym ? GroupSym->getName() : StringRef("");

  // Reserve the slot first; a failed insertion means the section exists.
  auto Inserted = ELFUniquingMap.insert(
      std::make_pair(ELFSectionKey{Section.str(), Group, UniqueID}, nullptr));
  auto &Slot = *Inserted.first;
  if (!Inserted.second)
    return Slot.second;

  SectionKind Kind = (Flags & ELF::SHF_ARM_PURECODE)
                         ? SectionKind::getExecuteOnly()
                     : (Flags & ELF::SHF_EXECINSTR)
                         ? SectionKind::getText()
                         : SectionKind::getReadOnly();

  // Use the name stored in the map key so the section does not depend on the
  // caller's string.
  StringRef CachedName = Slot.first.SectionName;
  Slot.second = createELFSectionImpl(CachedName, Type, Flags, Kind, EntrySize,
                                     GroupSym, UniqueID, Associated);
  return Slot.second;
}
/// Create a ".group" section (SHT_GROUP, no flags, read-only kind, entry size
/// 4) for \p Group, with unique id ~0 and no associated symbol.
MCSectionELF *MCContext::createELFGroupSection(const MCSymbolELF *Group) {
  const unsigned NoFlags = 0;
  const unsigned EntrySize = 4;
  return createELFSectionImpl(".group", ELF::SHT_GROUP, NoFlags,
                              SectionKind::getReadOnly(), EntrySize, Group, ~0,
                              nullptr);
}
/// Get or create the COFF section identified by (name, COMDAT symbol name,
/// selection, UniqueID).
///
/// \param COMDATSymName when non-empty, the COMDAT symbol is created if
///                      needed and its stored name is used in the key.
/// \param BeginSymName  when non-null and a new section is created, a
///                      temporary symbol with this name becomes its begin
///                      symbol.
MCSectionCOFF *MCContext::getCOFFSection(StringRef Section,
                                         unsigned Characteristics,
                                         SectionKind Kind,
                                         StringRef COMDATSymName, int Selection,
                                         unsigned UniqueID,
                                         const char *BeginSymName) {
  MCSymbol *COMDATSymbol =
      COMDATSymName.empty() ? nullptr : getOrCreateSymbol(COMDATSymName);
  if (COMDATSymbol)
    COMDATSymName = COMDATSymbol->getName();

  // Reserve the slot first; a failed insertion means the section exists.
  COFFSectionKey Key{Section, COMDATSymName, Selection, UniqueID};
  auto Inserted = COFFUniquingMap.insert(std::make_pair(Key, nullptr));
  auto Pos = Inserted.first;
  if (!Inserted.second)
    return Pos->second;

  MCSymbol *Begin =
      BeginSymName ? createTempSymbol(BeginSymName, false) : nullptr;

  // The section refers to the name stored in the map key.
  StringRef CachedName = Pos->first.SectionName;
  Pos->second = new (COFFAllocator.Allocate()) MCSectionCOFF(
      CachedName, Characteristics, COMDATSymbol, Selection, Kind, Begin);
  return Pos->second;
}
/// Convenience overload: a COFF section with no COMDAT symbol, selection 0
/// and the generic section id.
MCSectionCOFF *MCContext::getCOFFSection(StringRef Section,
                                         unsigned Characteristics,
                                         SectionKind Kind,
                                         const char *BeginSymName) {
  const int NoSelection = 0;
  return getCOFFSection(Section, Characteristics, Kind, "", NoSelection,
                        GenericSectionID, BeginSymName);
}
/// Return a variant of \p Sec that is associative to \p KeySym and/or
/// distinguished by \p UniqueID.
///
/// With a key symbol, the result is a COMDAT section with the same name and
/// kind using associative selection. Without one, \p Sec itself is returned
/// for the generic id, and a same-named section with the given id otherwise.
MCSectionCOFF *MCContext::getAssociativeCOFFSection(MCSectionCOFF *Sec,
                                                    const MCSymbol *KeySym,
                                                    unsigned UniqueID) {
  if (KeySym) {
    unsigned Characteristics =
        Sec->getCharacteristics() | COFF::IMAGE_SCN_LNK_COMDAT;
    return getCOFFSection(Sec->getSectionName(), Characteristics,
                          Sec->getKind(), KeySym->getName(),
                          COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE, UniqueID);
  }

  // Neither associative nor unique: the normal section will do.
  if (UniqueID == GenericSectionID)
    return Sec;

  return getCOFFSection(Sec->getSectionName(), Sec->getCharacteristics(),
                        Sec->getKind(), "", 0, UniqueID);
}
/// Get or create a Wasm section whose group is given by name. A non-empty
/// \p Group is turned into a group symbol (created if necessary) that is
/// marked as a comdat.
MCSectionWasm *MCContext::getWasmSection(const Twine &Section, SectionKind K,
                                         const Twine &Group, unsigned UniqueID,
                                         const char *BeginSymName) {
  MCSymbolWasm *GroupSym = nullptr;
  const bool HasGroup = !Group.isTriviallyEmpty() && !Group.str().empty();
  if (HasGroup) {
    GroupSym = cast<MCSymbolWasm>(getOrCreateSymbol(Group));
    GroupSym->setComdat(true);
  }
  return getWasmSection(Section, K, GroupSym, UniqueID, BeginSymName);
}
/// Get or create the Wasm section uniquely identified by (name, group name,
/// UniqueID).
///
/// A new section gets a begin symbol named after the section, typed as a
/// section symbol, and an initial MCDataFragment to which that symbol is
/// attached. \p BeginSymName is not used by this overload.
MCSectionWasm *MCContext::getWasmSection(const Twine &Section, SectionKind Kind,
                                         const MCSymbolWasm *GroupSym,
                                         unsigned UniqueID,
                                         const char *BeginSymName) {
  StringRef Group = GroupSym ? GroupSym->getName() : StringRef("");

  // Reserve the slot first; a failed insertion means the section exists.
  auto Inserted = WasmUniquingMap.insert(
      std::make_pair(WasmSectionKey{Section.str(), Group, UniqueID}, nullptr));
  auto &Slot = *Inserted.first;
  if (!Inserted.second)
    return Slot.second;

  // The section and its symbol use the name stored in the map key.
  StringRef CachedName = Slot.first.SectionName;
  MCSymbol *Begin = createSymbol(CachedName, false, false);
  cast<MCSymbolWasm>(Begin)->setType(wasm::WASM_SYMBOL_TYPE_SECTION);

  MCSectionWasm *NewSection = new (WasmAllocator.Allocate())
      MCSectionWasm(CachedName, Kind, GroupSym, UniqueID, Begin);
  Slot.second = NewSection;

  // Seed the section with one data fragment and attach the begin symbol.
  auto *Frag = new MCDataFragment();
  NewSection->getFragmentList().insert(NewSection->begin(), Frag);
  Frag->setParent(NewSection);
  Begin->setFragment(Frag);
  return NewSection;
}
/// Get or create the XCOFF section uniquely identified by (name, storage
/// mapping class).
///
/// A new section gets an initial MCDataFragment. When \p BeginSymName is
/// non-null, a temporary symbol with that name becomes the begin symbol and
/// is attached to the fragment.
MCSectionXCOFF *MCContext::getXCOFFSection(StringRef Section,
                                           XCOFF::StorageMappingClass SMC,
                                           SectionKind Kind,
                                           const char *BeginSymName) {
  // Reserve the slot first; a failed insertion means the section exists.
  auto Inserted = XCOFFUniquingMap.insert(
      std::make_pair(XCOFFSectionKey{Section.str(), SMC}, nullptr));
  auto &Slot = *Inserted.first;
  if (!Inserted.second)
    return Slot.second;

  // The section refers to the name stored in the map key.
  StringRef CachedName = Slot.first.SectionName;
  MCSymbol *Begin =
      BeginSymName ? createTempSymbol(BeginSymName, false) : nullptr;

  MCSectionXCOFF *NewSection = new (XCOFFAllocator.Allocate())
      MCSectionXCOFF(CachedName, SMC, Kind, Begin);
  Slot.second = NewSection;

  auto *Frag = new MCDataFragment();
  NewSection->getFragmentList().insert(NewSection->begin(), Frag);
  Frag->setParent(NewSection);
  if (Begin)
    Begin->setFragment(Frag);
  return NewSection;
}
/// Copy-construct \p STI into storage obtained from MCSubtargetAllocator and
/// return a reference to the copy.
MCSubtargetInfo &MCContext::getSubtargetCopy(const MCSubtargetInfo &STI) {
  auto *Storage = MCSubtargetAllocator.Allocate();
  return *new (Storage) MCSubtargetInfo(STI);
}
/// Add a From -> To mapping to DebugPrefixMap. An existing entry for the same
/// \p From is left untouched.
void MCContext::addDebugPrefixMapEntry(const std::string &From,
                                       const std::string &To) {
  DebugPrefixMap.emplace(From, To);
}
void MCContext::RemapDebugPaths() {
const auto &DebugPrefixMap = this->DebugPrefixMap;
const auto RemapDebugPath = [&DebugPrefixMap](std::string &Path) {
for (const auto &Entry : DebugPrefixMap)
if (StringRef(Path).startswith(Entry.first)) {
std::string RemappedPath =
(Twine(Entry.second) + Path.substr(Entry.first.size())).str();
Path.swap(RemappedPath);
}
};
// Remap compilation directory.
std::string CompDir = CompilationDir.str();
RemapDebugPath(CompDir);
CompilationDir = CompDir;
// Remap MCDwarfDirs in all compilation units.
for (auto &CUIDTablePair : MCDwarfLineTablesCUMap)
for (auto &Dir : CUIDTablePair.second.getMCDwarfDirs())
RemapDebugPath(Dir);
}
//===----------------------------------------------------------------------===//
// Dwarf Management
//===----------------------------------------------------------------------===//
/// Set the root file of compilation unit 0's line table from the assembler
/// input.
///
/// \param InputFileName name of the input file; empty or "-" is treated as
///                      "<stdin>".
/// \param Buffer        contents hashed with MD5 to form the file checksum
///                      when the DWARF version is 5 or later.
void MCContext::setGenDwarfRootFile(StringRef InputFileName, StringRef Buffer) {
// MCDwarf needs the root file as well as the compilation directory.
// If we find a '.file 0' directive that will supersede these values.
Optional<MD5::MD5Result> Cksum;
if (getDwarfVersion() >= 5) {
MD5 Hash;
MD5::MD5Result Sum;
Hash.update(Buffer);
Hash.final(Sum);
Cksum = Sum;
}
// Canonicalize the root filename. It cannot be empty, and should not
// repeat the compilation dir.
// The MCContext ctor initializes MainFileName to the name associated with
// the SrcMgr's main file ID, which might be the same as InputFileName (and
// possibly include directory components).
// Or, MainFileName might have been overridden by a -main-file-name option,
// which is supposed to be just a base filename with no directory component.
// So, if the InputFileName and MainFileName are not equal, assume
// MainFileName is a substitute basename and replace the last component.
SmallString<1024> FileNameBuf = InputFileName;
if (FileNameBuf.empty() || FileNameBuf == "-")
FileNameBuf = "<stdin>";
if (!getMainFileName().empty() && FileNameBuf != getMainFileName()) {
llvm::sys::path::remove_filename(FileNameBuf);
llvm::sys::path::append(FileNameBuf, getMainFileName());
}
StringRef FileName = FileNameBuf;
// Strip a leading compilation directory and the separator that follows it.
// NOTE(review): FileName.front() is evaluated right after consume_front(),
// so this relies on the file name being longer than the compilation
// directory -- confirm callers guarantee that.
if (FileName.consume_front(getCompilationDir()))
if (llvm::sys::path::is_separator(FileName.front()))
FileName = FileName.drop_front();
assert(!FileName.empty());
setMCLineTableRootFile(
/*CUID=*/0, getCompilationDir(), FileName, Cksum, None);
}
/// getDwarfFile - place a file name and number in the DWARF file and
/// directory tables of compilation unit \p CUID. The request is forwarded to
/// that unit's line table, whose tryGetFile() result (a file number or an
/// error) is returned unchanged.
Expected<unsigned> MCContext::getDwarfFile(StringRef Directory,
                                           StringRef FileName,
                                           unsigned FileNumber,
                                           Optional<MD5::MD5Result> Checksum,
                                           Optional<StringRef> Source,
                                           unsigned CUID) {
  return MCDwarfLineTablesCUMap[CUID].tryGetFile(
      Directory, FileName, Checksum, Source, DwarfVersion, FileNumber);
}
/// isValidDwarfFileNumber - takes a dwarf file number and returns true if it
/// currently is assigned and false otherwise.
bool MCContext::isValidDwarfFileNumber(unsigned FileNumber, unsigned CUID) {
const MCDwarfLineTable &LineTable = getMCDwarfLineTable(CUID);
if (FileNumber == 0)
return getDwarfVersion() >= 5;
if (FileNumber >= LineTable.getMCDwarfFiles().size())
return false;
return !LineTable.getMCDwarfFiles()[FileNumber].Name.empty();
}
/// Drop from SectionsForRanges every section for which the streamer reports
/// that it cannot contain instructions, so that no useless debug info is
/// generated for it.
void MCContext::finalizeDwarfSections(MCStreamer &MCOS) {
  auto HasNoInstructions = [&MCOS](MCSection *Sec) {
    return !MCOS.mayHaveInstructions(*Sec);
  };
  SectionsForRanges.remove_if(HasNoInstructions);
}
/// Return the CodeView context, creating it on first use.
CodeViewContext &MCContext::getCVContext() {
  if (!CVContext)
    CVContext.reset(new CodeViewContext);
  return *CVContext;
}
//===----------------------------------------------------------------------===//
// Error Reporting
//===----------------------------------------------------------------------===//
/// Record that an error occurred and emit \p Msg at \p Loc.
///
/// The message goes to the source manager if there is one, otherwise to the
/// inline source manager; with neither, report_fatal_error() is used.
void MCContext::reportError(SMLoc Loc, const Twine &Msg) {
  HadError = true;

  if (SrcMgr) {
    SrcMgr->PrintMessage(Loc, SourceMgr::DK_Error, Msg);
    return;
  }
  if (InlineSrcMgr) {
    InlineSrcMgr->PrintMessage(Loc, SourceMgr::DK_Error, Msg);
    return;
  }
  // No source manager of either kind is available.
  report_fatal_error(Msg, false);
}
/// Report \p Msg at \p Loc via reportError(), run the registered interrupt
/// handlers, and terminate the process with exit status 1.
void MCContext::reportFatalError(SMLoc Loc, const Twine &Msg) {
reportError(Loc, Msg);
// If we reached here, we are failing ungracefully. Run the interrupt handlers
// to make sure any special cleanups get done, in particular that we remove
// files registered with RemoveFileOnSignal.
sys::RunInterruptHandlers();
exit(1);
}
Index: vendor/llvm/dist-release_90/lib/MC/MCParser/AsmParser.cpp
===================================================================
--- vendor/llvm/dist-release_90/lib/MC/MCParser/AsmParser.cpp (revision 351302)
+++ vendor/llvm/dist-release_90/lib/MC/MCParser/AsmParser.cpp (revision 351303)
@@ -1,5960 +1,5962 @@
//===- AsmParser.cpp - Parser for Assembly Files --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This class implements the parser for assembly files.
//
//===----------------------------------------------------------------------===//
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCCodeView.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInstPrinter.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCParser/AsmCond.h"
#include "llvm/MC/MCParser/AsmLexer.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCAsmParserExtension.h"
#include "llvm/MC/MCParser/MCAsmParserUtils.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCTargetOptions.h"
#include "llvm/MC/MCValue.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MD5.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cctype>
#include <climits>
#include <cstddef>
#include <cstdint>
#include <deque>
#include <memory>
#include <sstream>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
using namespace llvm;
// Out-of-line defaulted destructor for MCAsmParserSemaCallback, defined in
// this translation unit.
MCAsmParserSemaCallback::~MCAsmParserSemaCallback() = default;
static cl::opt<unsigned> AsmMacroMaxNestingDepth(
"asm-macro-max-nesting-depth", cl::init(20), cl::Hidden,
cl::desc("The maximum nesting depth allowed for assembly macros."));
namespace {
/// Helper types for tracking macro definitions.
typedef std::vector<AsmToken> MCAsmMacroArgument;
typedef std::vector<MCAsmMacroArgument> MCAsmMacroArguments;
/// Bookkeeping for one active macro instantiation: where it was instantiated
/// and where parsing resumes once the instantiation completes.
struct MacroInstantiation {
  /// Source location of the instantiation.
  SMLoc InstantiationLoc;

  /// Buffer in which parsing resumes after the instantiation completes.
  int ExitBuffer;

  /// Location at which parsing resumes after the instantiation completes.
  SMLoc ExitLoc;

  /// Depth of TheCondStack at the moment the instantiation started.
  size_t CondStackDepth;

  MacroInstantiation(SMLoc IL, int EB, SMLoc EL, size_t CondStackDepth);
};
/// Per-statement state filled in while a single statement is parsed.
struct ParseStatementInfo {
  /// Operands produced by the most recently parsed statement.
  SmallVector<std::unique_ptr<MCParsedAsmOperand>, 8> ParsedOperands;

  /// Opcode of the most recently parsed instruction (~0U until assigned).
  unsigned Opcode = ~0U;

  /// True when parsing the inline assembly hit an error.
  bool ParseError = false;

  /// Rewrite list handed in by the creator of this object.
  SmallVectorImpl<AsmRewrite> *AsmRewrites = nullptr;

  ParseStatementInfo() = delete;
  ParseStatementInfo(SmallVectorImpl<AsmRewrite> *Rewrites)
      : AsmRewrites(Rewrites) {}
};
/// The concrete assembly parser instance.
///
/// Implements the MCAsmParser interface. It owns the lexer, tracks the
/// conditional-assembly state and the stack of active macro instantiations,
/// and maps directive names either to the built-in DirectiveKind values or to
/// handlers registered by parser extensions.
class AsmParser : public MCAsmParser {
private:
AsmLexer Lexer;
MCContext &Ctx;
MCStreamer &Out;
const MCAsmInfo &MAI;
SourceMgr &SrcMgr;
/// Diagnostic handler and context that were installed on SrcMgr before this
/// parser replaced them; restored by the destructor.
SourceMgr::DiagHandlerTy SavedDiagHandler;
void *SavedDiagContext;
/// Object-file-format specific parser extension.
std::unique_ptr<MCAsmParserExtension> PlatformParser;
/// This is the current buffer index we're lexing from as managed by the
/// SourceMgr object.
unsigned CurBuffer;
/// Current conditional-assembly state and the stack of enclosing states.
AsmCond TheCondState;
std::vector<AsmCond> TheCondStack;
/// Maps directive names to handler methods in parser
/// extensions. Extensions register themselves in this map by calling
/// addDirectiveHandler.
StringMap<ExtensionDirectiveHandler> ExtensionDirectiveMap;
/// Stack of active macro instantiations.
std::vector<MacroInstantiation*> ActiveMacros;
/// List of bodies of anonymous macros.
std::deque<MCAsmMacro> MacroLikeBodies;
/// Boolean tracking whether macro substitution is enabled.
unsigned MacrosEnabledFlag : 1;
/// Keeps track of how many .macro's have been instantiated.
unsigned NumOfMacroInstantiations;
/// The values from the last parsed cpp hash file line comment if any.
struct CppHashInfoTy {
StringRef Filename;
int64_t LineNumber;
SMLoc Loc;
unsigned Buf;
CppHashInfoTy() : Filename(), LineNumber(0), Loc(), Buf(0) {}
};
CppHashInfoTy CppHashInfo;
/// The filename from the first cpp hash file line comment, if any.
StringRef FirstCppHashFilename;
/// List of forward directional labels for diagnosis at the end.
SmallVector<std::tuple<SMLoc, CppHashInfoTy, MCSymbol *>, 4> DirLabels;
/// AssemblerDialect. ~0U means unset value and use value provided by MAI.
unsigned AssemblerDialect = ~0U;
/// Is Darwin compatibility enabled?
bool IsDarwin = false;
/// Are we parsing ms-style inline assembly?
bool ParsingInlineAsm = false;
/// Did we already inform the user about inconsistent MD5 usage?
bool ReportedInconsistentMD5 = false;
/// Is alt macro mode enabled?
bool AltMacroMode = false;
public:
AsmParser(SourceMgr &SM, MCContext &Ctx, MCStreamer &Out,
const MCAsmInfo &MAI, unsigned CB);
AsmParser(const AsmParser &) = delete;
AsmParser &operator=(const AsmParser &) = delete;
~AsmParser() override;
bool Run(bool NoInitialTextSection, bool NoFinalize = false) override;
/// Register \p Handler as the extension handler for \p Directive, replacing
/// any handler previously stored under that name.
void addDirectiveHandler(StringRef Directive,
ExtensionDirectiveHandler Handler) override {
ExtensionDirectiveMap[Directive] = Handler;
}
/// Make \p Directive map to the DirectiveKind currently stored for \p Alias.
/// NOTE(review): the lookup uses operator[], so an unknown \p Alias is
/// presumably inserted with a value-initialized kind -- confirm against
/// StringMap.
void addAliasForDirective(StringRef Directive, StringRef Alias) override {
DirectiveKindMap[Directive] = DirectiveKindMap[Alias];
}
/// @name MCAsmParser Interface
/// @{
SourceMgr &getSourceManager() override { return SrcMgr; }
MCAsmLexer &getLexer() override { return Lexer; }
MCContext &getContext() override { return Ctx; }
MCStreamer &getStreamer() override { return Out; }
CodeViewContext &getCVContext() { return Ctx.getCVContext(); }
/// Return the explicitly set dialect, or the MAI default while it is unset.
unsigned getAssemblerDialect() override {
if (AssemblerDialect == ~0U)
return MAI.getAssemblerDialect();
else
return AssemblerDialect;
}
void setAssemblerDialect(unsigned i) override {
AssemblerDialect = i;
}
void Note(SMLoc L, const Twine &Msg, SMRange Range = None) override;
bool Warning(SMLoc L, const Twine &Msg, SMRange Range = None) override;
bool printError(SMLoc L, const Twine &Msg, SMRange Range = None) override;
const AsmToken &Lex() override;
void setParsingInlineAsm(bool V) override {
ParsingInlineAsm = V;
// When parsing MS inline asm, we must lex 0b1101 and 0ABCH as binary and
// hex integer literals.
Lexer.setLexMasmIntegers(V);
}
bool isParsingInlineAsm() override { return ParsingInlineAsm; }
bool parseMSInlineAsm(void *AsmLoc, std::string &AsmString,
unsigned &NumOutputs, unsigned &NumInputs,
SmallVectorImpl<std::pair<void *,bool>> &OpDecls,
SmallVectorImpl<std::string> &Constraints,
SmallVectorImpl<std::string> &Clobbers,
const MCInstrInfo *MII, const MCInstPrinter *IP,
MCAsmParserSemaCallback &SI) override;
bool parseExpression(const MCExpr *&Res);
bool parseExpression(const MCExpr *&Res, SMLoc &EndLoc) override;
bool parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) override;
bool parseParenExpression(const MCExpr *&Res, SMLoc &EndLoc) override;
bool parseParenExprOfDepth(unsigned ParenDepth, const MCExpr *&Res,
SMLoc &EndLoc) override;
bool parseAbsoluteExpression(int64_t &Res) override;
/// Parse a floating point expression using the float \p Semantics
/// and set \p Res to the value.
bool parseRealValue(const fltSemantics &Semantics, APInt &Res);
/// Parse an identifier or string (as a quoted identifier)
/// and set \p Res to the identifier contents.
bool parseIdentifier(StringRef &Res) override;
void eatToEndOfStatement() override;
bool checkForValidSection() override;
/// @}
private:
bool parseStatement(ParseStatementInfo &Info,
MCAsmParserSemaCallback *SI);
bool parseCurlyBlockScope(SmallVectorImpl<AsmRewrite>& AsmStrRewrites);
bool parseCppHashLineFilenameComment(SMLoc L);
void checkForBadMacro(SMLoc DirectiveLoc, StringRef Name, StringRef Body,
ArrayRef<MCAsmMacroParameter> Parameters);
bool expandMacro(raw_svector_ostream &OS, StringRef Body,
ArrayRef<MCAsmMacroParameter> Parameters,
ArrayRef<MCAsmMacroArgument> A, bool EnableAtPseudoVariable,
SMLoc L);
/// Are macros enabled in the parser?
bool areMacrosEnabled() {return MacrosEnabledFlag;}
/// Control a flag in the parser that enables or disables macros.
void setMacrosEnabled(bool Flag) {MacrosEnabledFlag = Flag;}
/// Are we inside a macro instantiation?
bool isInsideMacroInstantiation() {return !ActiveMacros.empty();}
/// Handle entry to macro instantiation.
///
/// \param M The macro.
/// \param NameLoc Instantiation location.
bool handleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc);
/// Handle exit from macro instantiation.
void handleMacroExit();
/// Extract AsmTokens for a macro argument.
bool parseMacroArgument(MCAsmMacroArgument &MA, bool Vararg);
/// Parse all macro arguments for a given macro.
bool parseMacroArguments(const MCAsmMacro *M, MCAsmMacroArguments &A);
/// Emit a "while in macro instantiation" note for each active macro.
void printMacroInstantiations();
/// Forward a single diagnostic of the given \p Kind to the source manager.
void printMessage(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Msg,
SMRange Range = None) const {
ArrayRef<SMRange> Ranges(Range);
SrcMgr.PrintMessage(Loc, Kind, Msg, Ranges);
}
static void DiagHandler(const SMDiagnostic &Diag, void *Context);
/// Should we emit DWARF describing this assembler source? (Returns false if
/// the source has .file directives, which means we don't want to generate
/// info describing the assembler source itself.)
bool enabledGenDwarfForAssembly();
/// Enter the specified file. This returns true on failure.
bool enterIncludeFile(const std::string &Filename);
/// Process the specified file for the .incbin directive.
/// This returns true on failure.
bool processIncbinFile(const std::string &Filename, int64_t Skip = 0,
const MCExpr *Count = nullptr, SMLoc Loc = SMLoc());
/// Reset the current lexer position to that given by \p Loc. The
/// current token is not set; clients should ensure Lex() is called
/// subsequently.
///
/// \param InBuffer If not 0, should be the known buffer id that contains the
/// location.
void jumpToLoc(SMLoc Loc, unsigned InBuffer = 0);
/// Parse up to the end of statement and return the contents from the
/// current token until the end of the statement; the current token on exit
/// will be either the EndOfStatement or EOF.
StringRef parseStringToEndOfStatement() override;
/// Parse until the end of a statement or a comma is encountered,
/// return the contents from the current token up to the end or comma.
StringRef parseStringToComma();
bool parseAssignment(StringRef Name, bool allow_redef,
bool NoDeadStrip = false);
unsigned getBinOpPrecedence(AsmToken::TokenKind K,
MCBinaryExpr::Opcode &Kind);
bool parseBinOpRHS(unsigned Precedence, const MCExpr *&Res, SMLoc &EndLoc);
bool parseParenExpr(const MCExpr *&Res, SMLoc &EndLoc);
bool parseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc);
bool parseRegisterOrRegisterNumber(int64_t &Register, SMLoc DirectiveLoc);
bool parseCVFunctionId(int64_t &FunctionId, StringRef DirectiveName);
bool parseCVFileId(int64_t &FileId, StringRef DirectiveName);
// Generic (target and platform independent) directive parsing.
enum DirectiveKind {
DK_NO_DIRECTIVE, // Placeholder
DK_SET,
DK_EQU,
DK_EQUIV,
DK_ASCII,
DK_ASCIZ,
DK_STRING,
DK_BYTE,
DK_SHORT,
DK_RELOC,
DK_VALUE,
DK_2BYTE,
DK_LONG,
DK_INT,
DK_4BYTE,
DK_QUAD,
DK_8BYTE,
DK_OCTA,
DK_DC,
DK_DC_A,
DK_DC_B,
DK_DC_D,
DK_DC_L,
DK_DC_S,
DK_DC_W,
DK_DC_X,
DK_DCB,
DK_DCB_B,
DK_DCB_D,
DK_DCB_L,
DK_DCB_S,
DK_DCB_W,
DK_DCB_X,
DK_DS,
DK_DS_B,
DK_DS_D,
DK_DS_L,
DK_DS_P,
DK_DS_S,
DK_DS_W,
DK_DS_X,
DK_SINGLE,
DK_FLOAT,
DK_DOUBLE,
DK_ALIGN,
DK_ALIGN32,
DK_BALIGN,
DK_BALIGNW,
DK_BALIGNL,
DK_P2ALIGN,
DK_P2ALIGNW,
DK_P2ALIGNL,
DK_ORG,
DK_FILL,
DK_ENDR,
DK_BUNDLE_ALIGN_MODE,
DK_BUNDLE_LOCK,
DK_BUNDLE_UNLOCK,
DK_ZERO,
DK_EXTERN,
DK_GLOBL,
DK_GLOBAL,
DK_LAZY_REFERENCE,
DK_NO_DEAD_STRIP,
DK_SYMBOL_RESOLVER,
DK_PRIVATE_EXTERN,
DK_REFERENCE,
DK_WEAK_DEFINITION,
DK_WEAK_REFERENCE,
DK_WEAK_DEF_CAN_BE_HIDDEN,
DK_COLD,
DK_COMM,
DK_COMMON,
DK_LCOMM,
DK_ABORT,
DK_INCLUDE,
DK_INCBIN,
DK_CODE16,
DK_CODE16GCC,
DK_REPT,
DK_IRP,
DK_IRPC,
DK_IF,
DK_IFEQ,
DK_IFGE,
DK_IFGT,
DK_IFLE,
DK_IFLT,
DK_IFNE,
DK_IFB,
DK_IFNB,
DK_IFC,
DK_IFEQS,
DK_IFNC,
DK_IFNES,
DK_IFDEF,
DK_IFNDEF,
DK_IFNOTDEF,
DK_ELSEIF,
DK_ELSE,
DK_ENDIF,
DK_SPACE,
DK_SKIP,
DK_FILE,
DK_LINE,
DK_LOC,
DK_STABS,
DK_CV_FILE,
DK_CV_FUNC_ID,
DK_CV_INLINE_SITE_ID,
DK_CV_LOC,
DK_CV_LINETABLE,
DK_CV_INLINE_LINETABLE,
DK_CV_DEF_RANGE,
DK_CV_STRINGTABLE,
DK_CV_STRING,
DK_CV_FILECHECKSUMS,
DK_CV_FILECHECKSUM_OFFSET,
DK_CV_FPO_DATA,
DK_CFI_SECTIONS,
DK_CFI_STARTPROC,
DK_CFI_ENDPROC,
DK_CFI_DEF_CFA,
DK_CFI_DEF_CFA_OFFSET,
DK_CFI_ADJUST_CFA_OFFSET,
DK_CFI_DEF_CFA_REGISTER,
DK_CFI_OFFSET,
DK_CFI_REL_OFFSET,
DK_CFI_PERSONALITY,
DK_CFI_LSDA,
DK_CFI_REMEMBER_STATE,
DK_CFI_RESTORE_STATE,
DK_CFI_SAME_VALUE,
DK_CFI_RESTORE,
DK_CFI_ESCAPE,
DK_CFI_RETURN_COLUMN,
DK_CFI_SIGNAL_FRAME,
DK_CFI_UNDEFINED,
DK_CFI_REGISTER,
DK_CFI_WINDOW_SAVE,
DK_CFI_B_KEY_FRAME,
DK_MACROS_ON,
DK_MACROS_OFF,
DK_ALTMACRO,
DK_NOALTMACRO,
DK_MACRO,
DK_EXITM,
DK_ENDM,
DK_ENDMACRO,
DK_PURGEM,
DK_SLEB128,
DK_ULEB128,
DK_ERR,
DK_ERROR,
DK_WARNING,
DK_PRINT,
DK_ADDRSIG,
DK_ADDRSIG_SYM,
DK_END
};
/// Maps directive name --> DirectiveKind enum, for
/// directives parsed by this class.
StringMap<DirectiveKind> DirectiveKindMap;
// ".ascii", ".asciz", ".string"
bool parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated);
bool parseDirectiveReloc(SMLoc DirectiveLoc); // ".reloc"
bool parseDirectiveValue(StringRef IDVal,
unsigned Size); // ".byte", ".long", ...
bool parseDirectiveOctaValue(StringRef IDVal); // ".octa", ...
bool parseDirectiveRealValue(StringRef IDVal,
const fltSemantics &); // ".single", ...
bool parseDirectiveFill(); // ".fill"
bool parseDirectiveZero(); // ".zero"
// ".set", ".equ", ".equiv"
bool parseDirectiveSet(StringRef IDVal, bool allow_redef);
bool parseDirectiveOrg(); // ".org"
// ".align{,32}", ".p2align{,w,l}"
bool parseDirectiveAlign(bool IsPow2, unsigned ValueSize);
// ".file", ".line", ".loc", ".stabs"
bool parseDirectiveFile(SMLoc DirectiveLoc);
bool parseDirectiveLine();
bool parseDirectiveLoc();
bool parseDirectiveStabs();
// ".cv_file", ".cv_func_id", ".cv_inline_site_id", ".cv_loc", ".cv_linetable",
// ".cv_inline_linetable", ".cv_def_range", ".cv_string"
bool parseDirectiveCVFile();
bool parseDirectiveCVFuncId();
bool parseDirectiveCVInlineSiteId();
bool parseDirectiveCVLoc();
bool parseDirectiveCVLinetable();
bool parseDirectiveCVInlineLinetable();
bool parseDirectiveCVDefRange();
bool parseDirectiveCVString();
bool parseDirectiveCVStringTable();
bool parseDirectiveCVFileChecksums();
bool parseDirectiveCVFileChecksumOffset();
bool parseDirectiveCVFPOData();
// .cfi directives
bool parseDirectiveCFIRegister(SMLoc DirectiveLoc);
bool parseDirectiveCFIWindowSave();
bool parseDirectiveCFISections();
bool parseDirectiveCFIStartProc();
bool parseDirectiveCFIEndProc();
bool parseDirectiveCFIDefCfaOffset();
bool parseDirectiveCFIDefCfa(SMLoc DirectiveLoc);
bool parseDirectiveCFIAdjustCfaOffset();
bool parseDirectiveCFIDefCfaRegister(SMLoc DirectiveLoc);
bool parseDirectiveCFIOffset(SMLoc DirectiveLoc);
bool parseDirectiveCFIRelOffset(SMLoc DirectiveLoc);
bool parseDirectiveCFIPersonalityOrLsda(bool IsPersonality);
bool parseDirectiveCFIRememberState();
bool parseDirectiveCFIRestoreState();
bool parseDirectiveCFISameValue(SMLoc DirectiveLoc);
bool parseDirectiveCFIRestore(SMLoc DirectiveLoc);
bool parseDirectiveCFIEscape();
bool parseDirectiveCFIReturnColumn(SMLoc DirectiveLoc);
bool parseDirectiveCFISignalFrame();
bool parseDirectiveCFIUndefined(SMLoc DirectiveLoc);
// macro directives
bool parseDirectivePurgeMacro(SMLoc DirectiveLoc);
bool parseDirectiveExitMacro(StringRef Directive);
bool parseDirectiveEndMacro(StringRef Directive);
bool parseDirectiveMacro(SMLoc DirectiveLoc);
bool parseDirectiveMacrosOnOff(StringRef Directive);
// alternate macro mode directives
bool parseDirectiveAltmacro(StringRef Directive);
// ".bundle_align_mode"
bool parseDirectiveBundleAlignMode();
// ".bundle_lock"
bool parseDirectiveBundleLock();
// ".bundle_unlock"
bool parseDirectiveBundleUnlock();
// ".space", ".skip"
bool parseDirectiveSpace(StringRef IDVal);
// ".dcb"
bool parseDirectiveDCB(StringRef IDVal, unsigned Size);
bool parseDirectiveRealDCB(StringRef IDVal, const fltSemantics &);
// ".ds"
bool parseDirectiveDS(StringRef IDVal, unsigned Size);
// .sleb128 (Signed=true) and .uleb128 (Signed=false)
bool parseDirectiveLEB128(bool Signed);
/// Parse a directive like ".globl" which
/// accepts a single symbol (which should be a label or an external).
bool parseDirectiveSymbolAttribute(MCSymbolAttr Attr);
bool parseDirectiveComm(bool IsLocal); // ".comm" and ".lcomm"
bool parseDirectiveAbort(); // ".abort"
bool parseDirectiveInclude(); // ".include"
bool parseDirectiveIncbin(); // ".incbin"
// ".if", ".ifeq", ".ifge", ".ifgt" , ".ifle", ".iflt" or ".ifne"
bool parseDirectiveIf(SMLoc DirectiveLoc, DirectiveKind DirKind);
// ".ifb" or ".ifnb", depending on ExpectBlank.
bool parseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank);
// ".ifc" or ".ifnc", depending on ExpectEqual.
bool parseDirectiveIfc(SMLoc DirectiveLoc, bool ExpectEqual);
// ".ifeqs" or ".ifnes", depending on ExpectEqual.
bool parseDirectiveIfeqs(SMLoc DirectiveLoc, bool ExpectEqual);
// ".ifdef" or ".ifndef", depending on expect_defined
bool parseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined);
bool parseDirectiveElseIf(SMLoc DirectiveLoc); // ".elseif"
bool parseDirectiveElse(SMLoc DirectiveLoc); // ".else"
bool parseDirectiveEndIf(SMLoc DirectiveLoc); // .endif
bool parseEscapedString(std::string &Data) override;
const MCExpr *applyModifierToExpr(const MCExpr *E,
MCSymbolRefExpr::VariantKind Variant);
// Macro-like directives
MCAsmMacro *parseMacroLikeBody(SMLoc DirectiveLoc);
void instantiateMacroLikeBody(MCAsmMacro *M, SMLoc DirectiveLoc,
raw_svector_ostream &OS);
bool parseDirectiveRept(SMLoc DirectiveLoc, StringRef Directive);
bool parseDirectiveIrp(SMLoc DirectiveLoc); // ".irp"
bool parseDirectiveIrpc(SMLoc DirectiveLoc); // ".irpc"
bool parseDirectiveEndr(SMLoc DirectiveLoc); // ".endr"
// "_emit" or "__emit"
bool parseDirectiveMSEmit(SMLoc DirectiveLoc, ParseStatementInfo &Info,
size_t Len);
// "align"
bool parseDirectiveMSAlign(SMLoc DirectiveLoc, ParseStatementInfo &Info);
// "end"
bool parseDirectiveEnd(SMLoc DirectiveLoc);
// ".err" or ".error"
bool parseDirectiveError(SMLoc DirectiveLoc, bool WithMessage);
// ".warning"
bool parseDirectiveWarning(SMLoc DirectiveLoc);
// .print <double-quotes-string>
bool parseDirectivePrint(SMLoc DirectiveLoc);
// Directives to support address-significance tables.
bool parseDirectiveAddrsig();
bool parseDirectiveAddrsigSym();
void initializeDirectiveKindMap();
};
} // end anonymous namespace
namespace llvm {
// Factories for the object-file-format specific parser extensions. They are
// only declared here; the definitions live outside this chunk.
extern MCAsmParserExtension *createDarwinAsmParser();
extern MCAsmParserExtension *createELFAsmParser();
extern MCAsmParserExtension *createCOFFAsmParser();
extern MCAsmParserExtension *createWasmAsmParser();
} // end namespace llvm
// Named constant for address space 0.
// NOTE(review): presumably the address space used when none is specified;
// the use sites are outside this chunk -- confirm.
enum { DEFAULT_ADDRSPACE = 0 };
/// Construct a parser that reads from buffer \p CB of \p SM, or from the main
/// file of \p SM when \p CB is 0.
///
/// The constructor installs this parser's diagnostic handler on \p SM (saving
/// the previous handler and context so the destructor can restore them),
/// points the lexer at the chosen buffer, creates and initializes the
/// platform parser extension matching the object file type, and populates the
/// directive kind map.
AsmParser::AsmParser(SourceMgr &SM, MCContext &Ctx, MCStreamer &Out,
                     const MCAsmInfo &MAI, unsigned CB = 0)
    : Lexer(MAI), Ctx(Ctx), Out(Out), MAI(MAI), SrcMgr(SM),
      CurBuffer(CB ? CB : SM.getMainFileID()), MacrosEnabledFlag(true) {
  HadError = false;
  // Save the old handler.
  SavedDiagHandler = SrcMgr.getDiagHandler();
  SavedDiagContext = SrcMgr.getDiagContext();
  // Set our own handler which calls the saved handler.
  SrcMgr.setDiagHandler(DiagHandler, this);
  Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer());

  // Initialize the platform / file format parser.
  switch (Ctx.getObjectFileInfo()->getObjectFileType()) {
  case MCObjectFileInfo::IsCOFF:
    PlatformParser.reset(createCOFFAsmParser());
    break;
  case MCObjectFileInfo::IsMachO:
    PlatformParser.reset(createDarwinAsmParser());
    IsDarwin = true;
    break;
  case MCObjectFileInfo::IsELF:
    PlatformParser.reset(createELFAsmParser());
    break;
  case MCObjectFileInfo::IsWasm:
    PlatformParser.reset(createWasmAsmParser());
    break;
  case MCObjectFileInfo::IsXCOFF:
    // There is no XCOFF platform parser yet. Falling through with a null
    // PlatformParser would dereference a null pointer just below, so stop
    // with a clear diagnostic instead.
    report_fatal_error(
        "Need to implement createXCOFFAsmParser for XCOFF format.");
    break;
  }

  PlatformParser->Initialize(*this);
  initializeDirectiveKindMap();

  NumOfMacroInstantiations = 0;
}
/// Tear down the parser and hand the source manager's diagnostic handler back
/// to whoever owned it before this parser was constructed.
AsmParser::~AsmParser() {
  // Instantiations may only remain on the stack if parsing ended in error.
  assert((ActiveMacros.empty() || HadError) &&
         "Unexpected active macro instantiation!");

  // Put back the handler and context saved by the constructor so they are in
  // effect during finalization.
  SrcMgr.setDiagHandler(SavedDiagHandler, SavedDiagContext);
}
void AsmParser::printMacroInstantiations() {
// Print the active macro instantiation stack.
for (std::vector<MacroInstantiation *>::const_reverse_iterator
it = ActiveMacros.rbegin(),
ie = ActiveMacros.rend();
it != ie; ++it)
printMessage((*it)->InstantiationLoc, SourceMgr::DK_Note,
"while in macro instantiation");
}
/// Emit a note diagnostic with message \p Msg at location \p L, optionally
/// highlighting \p Range, followed by the macro instantiation notes.
void AsmParser::Note(SMLoc L, const Twine &Msg, SMRange Range) {
// Pending errors are printed before the note itself.
printPendingErrors();
printMessage(L, SourceMgr::DK_Note, Msg, Range);
printMacroInstantiations();
}
/// Emit a warning at \p L unless warnings are suppressed by the target
/// options; when warnings are fatal the message is reported through Error()
/// and its result is returned. Otherwise returns false.
bool AsmParser::Warning(SMLoc L, const Twine &Msg, SMRange Range) {
  const auto &Options = getTargetParser().getTargetOptions();

  // Warnings switched off entirely: nothing to print.
  if (Options.MCNoWarn)
    return false;

  // Warnings promoted to errors.
  if (Options.MCFatalWarnings)
    return Error(L, Msg, Range);

  printMessage(L, SourceMgr::DK_Warning, Msg, Range);
  printMacroInstantiations();
  return false;
}
/// Record that an error occurred, print \p Msg as an error diagnostic at
/// \p L followed by the macro instantiation notes, and return true.
bool AsmParser::printError(SMLoc L, const Twine &Msg, SMRange Range) {
HadError = true;
printMessage(L, SourceMgr::DK_Error, Msg, Range);
printMacroInstantiations();
// Always true, so callers can write `return printError(...)`.
return true;
}
/// Switch lexing to the include file \p Filename. Returns true on failure,
/// i.e. when the source manager could not add the file.
bool AsmParser::enterIncludeFile(const std::string &Filename) {
  std::string ResolvedPath;
  const unsigned BufferID =
      SrcMgr.AddIncludeFile(Filename, Lexer.getLoc(), ResolvedPath);
  if (BufferID == 0)
    return true;

  // Continue lexing from the start of the newly added buffer.
  CurBuffer = BufferID;
  Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer());
  return false;
}
/// Process the specified .incbin file by searching for it in the include paths
/// then just emitting the byte contents of the file to the streamer. This
/// returns true on failure.
///
/// \param Filename File to look up through the source manager.
/// \param Skip Number of leading bytes of the file to leave out.
/// \param Count Optional expression limiting how many bytes are emitted; it
///        must evaluate to an absolute value.
/// \param Loc Location used for diagnostics about \p Count.
bool AsmParser::processIncbinFile(const std::string &Filename, int64_t Skip,
                                  const MCExpr *Count, SMLoc Loc) {
  std::string IncludedFile;
  unsigned NewBuf =
      SrcMgr.AddIncludeFile(Filename, Lexer.getLoc(), IncludedFile);
  if (!NewBuf)
    return true;

  // Pick up the bytes from the file and emit them.
  StringRef Bytes = SrcMgr.getMemoryBuffer(NewBuf)->getBuffer();
  // Use substr() rather than drop_front(): a skip larger than the file is
  // user input, and drop_front() asserts on it, whereas substr() clamps to
  // the end of the buffer and yields an empty range.
  // NOTE(review): a negative Skip also ends up as an empty range here;
  // presumably the caller rejects negative values -- confirm.
  Bytes = Bytes.substr(Skip);
  if (Count) {
    int64_t Res;
    if (!Count->evaluateAsAbsolute(Res, getStreamer().getAssemblerPtr()))
      return Error(Loc, "expected absolute expression");
    if (Res < 0)
      return Warning(Loc, "negative count has no effect");
    Bytes = Bytes.take_front(Res);
  }
  getStreamer().EmitBytes(Bytes);
  return false;
}
/// Reposition the lexer at \p Loc. When \p InBuffer is non-zero it is taken
/// as the buffer containing \p Loc; otherwise the source manager is asked
/// which buffer holds the location.
void AsmParser::jumpToLoc(SMLoc Loc, unsigned InBuffer) {
  if (InBuffer)
    CurBuffer = InBuffer;
  else
    CurBuffer = SrcMgr.FindBufferContainingLoc(Loc);

  StringRef Contents = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
  Lexer.setBuffer(Contents, Loc.getPointer());
}
/// Advance to the next token and return it. Comment tokens are skipped (and
/// forwarded to the streamer when the target preserves assembly comments),
/// and reaching the end of an included file resumes lexing in the parent.
const AsmToken &AsmParser::Lex() {
  // Report a lexer error sitting on the current token before moving past it.
  if (Lexer.getTok().is(AsmToken::Error))
    Error(Lexer.getErrLoc(), Lexer.getErr());

  // An end-of-statement token whose text does not start with a line break
  // carries a line comment; pass it on when comments are preserved.
  if (getTok().is(AsmToken::EndOfStatement)) {
    StringRef Text = getTok().getString();
    if (!Text.empty() && Text.front() != '\n' && Text.front() != '\r' &&
        MAI.preserveAsmComments())
      Out.addExplicitComment(Twine(Text));
  }

  // Comments are consumed here so that they are deferred until the end of
  // the next statement.
  const AsmToken *Next = &Lexer.Lex();
  for (; Next->is(AsmToken::Comment); Next = &Lexer.Lex())
    if (MAI.preserveAsmComments())
      Out.addExplicitComment(Twine(Next->getString()));

  if (!Next->is(AsmToken::Eof))
    return *Next;

  // End of a buffer: if it was an included file, pop back to the parent file
  // on the include stack and keep lexing from there.
  SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
  if (ParentIncludeLoc == SMLoc())
    return *Next;

  jumpToLoc(ParentIncludeLoc);
  return Lex();
}
bool AsmParser::enabledGenDwarfForAssembly() {
// Check whether the user specified -g.
if (!getContext().getGenDwarfForAssembly())
return false;
// If we haven't encountered any .file directives (which would imply that
// the assembler source was produced with debug info already) then emit one
// describing the assembler source file itself.
if (getContext().getGenDwarfFileNumber() == 0) {
// Use the first #line directive for this, if any. It's preprocessed, so
// there is no checksum, and of course no source directive.
if (!FirstCppHashFilename.empty())
getContext().setMCLineTableRootFile(/*CUID=*/0,
getContext().getCompilationDir(),
FirstCppHashFilename,
/*Cksum=*/None, /*Source=*/None);
const MCDwarfFile &RootFile =
getContext().getMCDwarfLineTable(/*CUID=*/0).getRootFile();
getContext().setGenDwarfFileNumber(getStreamer().EmitDwarfFileDirective(
/*CUID=*/0, getContext().getCompilationDir(), RootFile.Name,
RootFile.Checksum, RootFile.Source));
}
return true;
}
/// Parse the whole input, statement by statement, then run the end-of-file
/// checks and (unless \p NoFinalize is set or an error occurred) finish the
/// output stream.
///
/// \param NoInitialTextSection When false, the streamer's initial sections
///        are created before parsing starts.
/// \param NoFinalize When true, the undefined-symbol checks and the final
///        Out.Finish() call are skipped.
/// \returns true if this parser or the context recorded an error.
bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
// Create the initial section, if requested.
if (!NoInitialTextSection)
Out.InitSections(false);
// Prime the lexer.
Lex();
HadError = false;
// Remember the conditional state on entry so unmatched .if/.else directives
// can be detected once parsing is done.
AsmCond StartingCondState = TheCondState;
SmallVector<AsmRewrite, 4> AsmStrRewrites;
// If we are generating dwarf for assembly source files save the initial text
// section. (Don't use enabledGenDwarfForAssembly() here, as we aren't
// emitting any actual debug info yet and haven't had a chance to parse any
// embedded .file directives.)
if (getContext().getGenDwarfForAssembly()) {
MCSection *Sec = getStreamer().getCurrentSectionOnly();
if (!Sec->getBeginSymbol()) {
MCSymbol *SectionStartSym = getContext().createTempSymbol();
getStreamer().EmitLabel(SectionStartSym);
Sec->setBeginSymbol(SectionStartSym);
}
bool InsertResult = getContext().addGenDwarfSection(Sec);
assert(InsertResult && ".text section should not have debug info yet");
(void)InsertResult;
}
// While we have input, parse each statement.
while (Lexer.isNot(AsmToken::Eof)) {
ParseStatementInfo Info(&AsmStrRewrites);
if (!parseStatement(Info, nullptr))
continue;
// If we have a Lexer Error we are on an Error Token. Load in Lexer Error
// for printing ErrMsg via Lex() only if no (presumably better) parser error
// exists.
if (!hasPendingError() && Lexer.getTok().is(AsmToken::Error)) {
Lex();
}
// parseStatement returned true so may need to emit an error.
printPendingErrors();
// Skipping to the next line if needed.
if (!getLexer().isAtStartOfStatement())
eatToEndOfStatement();
}
getTargetParser().onEndOfFile();
printPendingErrors();
// All errors should have been emitted.
assert(!hasPendingError() && "unexpected error from parseStatement");
getTargetParser().flushPendingInstructions(getStreamer());
if (TheCondState.TheCond != StartingCondState.TheCond ||
TheCondState.Ignore != StartingCondState.Ignore)
printError(getTok().getLoc(), "unmatched .ifs or .elses");
// Check to see there are no empty DwarfFile slots.
const auto &LineTables = getContext().getMCDwarfLineTables();
if (!LineTables.empty()) {
unsigned Index = 0;
for (const auto &File : LineTables.begin()->second.getMCDwarfFiles()) {
// Slot 0 is exempt from the check.
if (File.Name.empty() && Index != 0)
printError(getTok().getLoc(), "unassigned file number: " +
Twine(Index) +
" for .file directives");
++Index;
}
}
// Check to see that all assembler local symbols were actually defined.
// Targets that don't do subsections via symbols may not want this, though,
// so conservatively exclude them. Only do this if we're finalizing, though,
// as otherwise we won't necessarily have seen everything yet.
if (!NoFinalize) {
if (MAI.hasSubsectionsViaSymbols()) {
for (const auto &TableEntry : getContext().getSymbols()) {
MCSymbol *Sym = TableEntry.getValue();
// Variable symbols may not be marked as defined, so check those
// explicitly. If we know it's a variable, we have a definition for
// the purposes of this check.
if (Sym->isTemporary() && !Sym->isVariable() && !Sym->isDefined())
// FIXME: We would really like to refer back to where the symbol was
// first referenced for a source location. We need to add something
// to track that. Currently, we just point to the end of the file.
printError(getTok().getLoc(), "assembler local symbol '" +
Sym->getName() + "' not defined");
}
}
// Temporary symbols like the ones for directional jumps don't go in the
// symbol table. They also need to be diagnosed in all (final) cases.
for (std::tuple<SMLoc, CppHashInfoTy, MCSymbol *> &LocSym : DirLabels) {
if (std::get<2>(LocSym)->isUndefined()) {
// Reset the state of any "# line file" directives we've seen to the
// context as it was at the diagnostic site.
CppHashInfo = std::get<1>(LocSym);
printError(std::get<0>(LocSym), "directional label undefined");
}
}
}
// Finalize the output stream if there are no errors and if the client wants
// us to.
if (!HadError && !NoFinalize)
Out.Finish();
return HadError || getContext().hadError();
}
/// Ensure a current section exists before a directive emits into it. Returns
/// false when everything is fine (or when parsing inline assembly); otherwise
/// creates the initial sections, reports an error and returns its result.
bool AsmParser::checkForValidSection() {
  if (ParsingInlineAsm || getStreamer().getCurrentSectionOnly())
    return false;

  // No section yet: create the initial ones and diagnose the missing
  // section directive.
  Out.InitSections(false);
  return Error(getTok().getLoc(),
               "expected section directive before assembly directive");
}
/// Discard tokens up to and including the next end-of-statement token, or up
/// to (but not past) end of file. Tokens are pulled straight from the lexer
/// rather than through AsmParser::Lex().
void AsmParser::eatToEndOfStatement() {
  for (;;) {
    // Never lex past the end of the input.
    if (Lexer.is(AsmToken::Eof))
      return;

    const bool AtEndOfStatement = Lexer.is(AsmToken::EndOfStatement);
    Lexer.Lex();

    // The end-of-statement token itself has just been eaten; done.
    if (AtEndOfStatement)
      return;
  }
}
/// Return the source text from the current token up to the end of the
/// statement. On exit the current token is EndOfStatement or Eof.
StringRef AsmParser::parseStringToEndOfStatement() {
  const char *Begin = getTok().getLoc().getPointer();

  while (!(Lexer.is(AsmToken::EndOfStatement) || Lexer.is(AsmToken::Eof)))
    Lexer.Lex();

  // The terminating token's start marks the end of the returned text.
  return StringRef(Begin, getTok().getLoc().getPointer() - Begin);
}
/// Return the source text from the current token up to the next comma, end
/// of statement or end of file, whichever comes first. The terminating token
/// is left as the current token.
StringRef AsmParser::parseStringToComma() {
  const char *Begin = getTok().getLoc().getPointer();

  while (!(Lexer.is(AsmToken::EndOfStatement) || Lexer.is(AsmToken::Comma) ||
           Lexer.is(AsmToken::Eof)))
    Lexer.Lex();

  return StringRef(Begin, getTok().getLoc().getPointer() - Begin);
}
/// Parse the remainder of a parenthesized expression into \p Res.
/// NOTE: The opening '(' must already have been consumed by the caller.
///
/// parenexpr ::= expr)
///
/// On success \p EndLoc is set to the end of the closing ')' and false is
/// returned; true is returned on error.
bool AsmParser::parseParenExpr(const MCExpr *&Res, SMLoc &EndLoc) {
  if (parseExpression(Res))
    return true;

  if (!Lexer.is(AsmToken::RParen))
    return TokError("expected ')' in parentheses expression");

  EndLoc = Lexer.getTok().getEndLoc();
  Lex(); // Consume the ')'.
  return false;
}
/// Parse the remainder of a bracketed expression into \p Res.
/// NOTE: The opening '[' must already have been consumed by the caller.
///
/// bracketexpr ::= expr]
///
/// \p EndLoc is set to the end of the token following the expression before
/// that token is checked to be ']'. Returns true on error.
bool AsmParser::parseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc) {
  if (parseExpression(Res))
    return true;

  EndLoc = getTok().getEndLoc();
  return parseToken(AsmToken::RBrac, "expected ']' in brackets expression");
}
/// Parse a primary expression and return it.
/// primaryexpr ::= (parenexpr
/// primaryexpr ::= symbol
/// primaryexpr ::= number
/// primaryexpr ::= '.'
/// primaryexpr ::= ~,+,- primaryexpr
bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
SMLoc FirstTokenLoc = getLexer().getLoc();
AsmToken::TokenKind FirstTokenKind = Lexer.getKind();
switch (FirstTokenKind) {
default:
return TokError("unknown token in expression");
// If we have an error assume that we've already handled it.
case AsmToken::Error:
return true;
case AsmToken::Exclaim:
Lex(); // Eat the operator.
if (parsePrimaryExpr(Res, EndLoc))
return true;
Res = MCUnaryExpr::createLNot(Res, getContext(), FirstTokenLoc);
return false;
case AsmToken::Dollar:
case AsmToken::At:
case AsmToken::String:
case AsmToken::Identifier: {
StringRef Identifier;
if (parseIdentifier(Identifier)) {
// We may have failed but $ may be a valid token.
if (getTok().is(AsmToken::Dollar)) {
if (Lexer.getMAI().getDollarIsPC()) {
Lex();
// This is a '$' reference, which references the current PC. Emit a
// temporary label to the streamer and refer to it.
MCSymbol *Sym = Ctx.createTempSymbol();
Out.EmitLabel(Sym);
Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None,
getContext());
EndLoc = FirstTokenLoc;
return false;
}
return Error(FirstTokenLoc, "invalid token in expression");
}
}
// Parse symbol variant
std::pair<StringRef, StringRef> Split;
if (!MAI.useParensForSymbolVariant()) {
if (FirstTokenKind == AsmToken::String) {
if (Lexer.is(AsmToken::At)) {
Lex(); // eat @
SMLoc AtLoc = getLexer().getLoc();
StringRef VName;
if (parseIdentifier(VName))
return Error(AtLoc, "expected symbol variant after '@'");
Split = std::make_pair(Identifier, VName);
}
} else {
Split = Identifier.split('@');
}
} else if (Lexer.is(AsmToken::LParen)) {
Lex(); // eat '('.
StringRef VName;
parseIdentifier(VName);
// eat ')'.
if (parseToken(AsmToken::RParen,
"unexpected token in variant, expected ')'"))
return true;
Split = std::make_pair(Identifier, VName);
}
EndLoc = SMLoc::getFromPointer(Identifier.end());
// This is a symbol reference.
StringRef SymbolName = Identifier;
if (SymbolName.empty())
return Error(getLexer().getLoc(), "expected a symbol reference");
MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
// Lookup the symbol variant if used.
if (!Split.second.empty()) {
Variant = MCSymbolRefExpr::getVariantKindForName(Split.second);
if (Variant != MCSymbolRefExpr::VK_Invalid) {
SymbolName = Split.first;
} else if (MAI.doesAllowAtInName() && !MAI.useParensForSymbolVariant()) {
Variant = MCSymbolRefExpr::VK_None;
} else {
return Error(SMLoc::getFromPointer(Split.second.begin()),
"invalid variant '" + Split.second + "'");
}
}
- MCSymbol *Sym = getContext().getOrCreateSymbol(SymbolName);
+ MCSymbol *Sym = getContext().getInlineAsmLabel(SymbolName);
+ if (!Sym)
+ Sym = getContext().getOrCreateSymbol(SymbolName);
// If this is an absolute variable reference, substitute it now to preserve
// semantics in the face of reassignment.
if (Sym->isVariable()) {
auto V = Sym->getVariableValue(/*SetUsed*/ false);
bool DoInline = isa<MCConstantExpr>(V) && !Variant;
if (auto TV = dyn_cast<MCTargetExpr>(V))
DoInline = TV->inlineAssignedExpr();
if (DoInline) {
if (Variant)
return Error(EndLoc, "unexpected modifier on variable reference");
Res = Sym->getVariableValue(/*SetUsed*/ false);
return false;
}
}
// Otherwise create a symbol ref.
Res = MCSymbolRefExpr::create(Sym, Variant, getContext(), FirstTokenLoc);
return false;
}
case AsmToken::BigNum:
return TokError("literal value out of range for directive");
case AsmToken::Integer: {
SMLoc Loc = getTok().getLoc();
int64_t IntVal = getTok().getIntVal();
Res = MCConstantExpr::create(IntVal, getContext());
EndLoc = Lexer.getTok().getEndLoc();
Lex(); // Eat token.
// Look for 'b' or 'f' following an Integer as a directional label
if (Lexer.getKind() == AsmToken::Identifier) {
StringRef IDVal = getTok().getString();
// Lookup the symbol variant if used.
std::pair<StringRef, StringRef> Split = IDVal.split('@');
MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
if (Split.first.size() != IDVal.size()) {
Variant = MCSymbolRefExpr::getVariantKindForName(Split.second);
if (Variant == MCSymbolRefExpr::VK_Invalid)
return TokError("invalid variant '" + Split.second + "'");
IDVal = Split.first;
}
if (IDVal == "f" || IDVal == "b") {
MCSymbol *Sym =
Ctx.getDirectionalLocalSymbol(IntVal, IDVal == "b");
Res = MCSymbolRefExpr::create(Sym, Variant, getContext());
if (IDVal == "b" && Sym->isUndefined())
return Error(Loc, "directional label undefined");
DirLabels.push_back(std::make_tuple(Loc, CppHashInfo, Sym));
EndLoc = Lexer.getTok().getEndLoc();
Lex(); // Eat identifier.
}
}
return false;
}
case AsmToken::Real: {
APFloat RealVal(APFloat::IEEEdouble(), getTok().getString());
uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
Res = MCConstantExpr::create(IntVal, getContext());
EndLoc = Lexer.getTok().getEndLoc();
Lex(); // Eat token.
return false;
}
case AsmToken::Dot: {
// This is a '.' reference, which references the current PC. Emit a
// temporary label to the streamer and refer to it.
MCSymbol *Sym = Ctx.createTempSymbol();
Out.EmitLabel(Sym);
Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
EndLoc = Lexer.getTok().getEndLoc();
Lex(); // Eat identifier.
return false;
}
case AsmToken::LParen:
Lex(); // Eat the '('.
return parseParenExpr(Res, EndLoc);
case AsmToken::LBrac:
if (!PlatformParser->HasBracketExpressions())
return TokError("brackets expression not supported on this target");
Lex(); // Eat the '['.
return parseBracketExpr(Res, EndLoc);
case AsmToken::Minus:
Lex(); // Eat the operator.
if (parsePrimaryExpr(Res, EndLoc))
return true;
Res = MCUnaryExpr::createMinus(Res, getContext(), FirstTokenLoc);
return false;
case AsmToken::Plus:
Lex(); // Eat the operator.
if (parsePrimaryExpr(Res, EndLoc))
return true;
Res = MCUnaryExpr::createPlus(Res, getContext(), FirstTokenLoc);
return false;
case AsmToken::Tilde:
Lex(); // Eat the operator.
if (parsePrimaryExpr(Res, EndLoc))
return true;
Res = MCUnaryExpr::createNot(Res, getContext(), FirstTokenLoc);
return false;
// MIPS unary expression operators. The lexer won't generate these tokens if
// MCAsmInfo::HasMipsExpressions is false for the target.
case AsmToken::PercentCall16:
case AsmToken::PercentCall_Hi:
case AsmToken::PercentCall_Lo:
case AsmToken::PercentDtprel_Hi:
case AsmToken::PercentDtprel_Lo:
case AsmToken::PercentGot:
case AsmToken::PercentGot_Disp:
case AsmToken::PercentGot_Hi:
case AsmToken::PercentGot_Lo:
case AsmToken::PercentGot_Ofst:
case AsmToken::PercentGot_Page:
case AsmToken::PercentGottprel:
case AsmToken::PercentGp_Rel:
case AsmToken::PercentHi:
case AsmToken::PercentHigher:
case AsmToken::PercentHighest:
case AsmToken::PercentLo:
case AsmToken::PercentNeg:
case AsmToken::PercentPcrel_Hi:
case AsmToken::PercentPcrel_Lo:
case AsmToken::PercentTlsgd:
case AsmToken::PercentTlsldm:
case AsmToken::PercentTprel_Hi:
case AsmToken::PercentTprel_Lo:
Lex(); // Eat the operator.
if (Lexer.isNot(AsmToken::LParen))
return TokError("expected '(' after operator");
Lex(); // Eat the operator.
if (parseExpression(Res, EndLoc))
return true;
if (Lexer.isNot(AsmToken::RParen))
return TokError("expected ')'");
Lex(); // Eat the operator.
Res = getTargetParser().createTargetUnaryExpr(Res, FirstTokenKind, Ctx);
return !Res;
}
}
bool AsmParser::parseExpression(const MCExpr *&Res) {
SMLoc EndLoc;
return parseExpression(Res, EndLoc);
}
/// Rebuild \p E so that the symbol references it contains carry \p Variant.
///
/// The target parser is consulted first and any non-null answer it gives is
/// returned as is. Otherwise the expression tree is walked recursively:
///  - constants and target expressions yield nullptr (no symbol to modify);
///  - an unmodified symbol reference is recreated with \p Variant;
///  - a symbol reference that already has a variant raises a parse error and
///    is returned unchanged;
///  - unary/binary nodes are rebuilt around their rewritten operands, and a
///    binary node keeps the original operand on a side that had no symbol.
///
/// \returns the rewritten expression, or nullptr if nothing could be modified.
const MCExpr *
AsmParser::applyModifierToExpr(const MCExpr *E,
                               MCSymbolRefExpr::VariantKind Variant) {
  // The target implementation gets the first shot at the expression.
  if (const MCExpr *TargetResult =
          getTargetParser().applyModifierToExpr(E, Variant, Ctx))
    return TargetResult;

  switch (E->getKind()) {
  case MCExpr::Target:
  case MCExpr::Constant:
    return nullptr;

  case MCExpr::SymbolRef: {
    const auto *SymRef = cast<MCSymbolRefExpr>(E);
    if (SymRef->getKind() == MCSymbolRefExpr::VK_None)
      return MCSymbolRefExpr::create(&SymRef->getSymbol(), Variant,
                                     getContext());
    // A second modifier on the same symbol is diagnosed, not applied.
    TokError("invalid variant on expression '" + getTok().getIdentifier() +
             "' (already modified)");
    return E;
  }

  case MCExpr::Unary: {
    const auto *Unary = cast<MCUnaryExpr>(E);
    if (const MCExpr *NewOperand =
            applyModifierToExpr(Unary->getSubExpr(), Variant))
      return MCUnaryExpr::create(Unary->getOpcode(), NewOperand, getContext());
    return nullptr;
  }

  case MCExpr::Binary: {
    const auto *Binary = cast<MCBinaryExpr>(E);
    const MCExpr *NewLHS = applyModifierToExpr(Binary->getLHS(), Variant);
    const MCExpr *NewRHS = applyModifierToExpr(Binary->getRHS(), Variant);

    // Neither side contained anything to modify.
    if (!NewLHS && !NewRHS)
      return nullptr;

    // Keep the untouched operand on whichever side produced nothing.
    if (!NewLHS)
      NewLHS = Binary->getLHS();
    if (!NewRHS)
      NewRHS = Binary->getRHS();
    return MCBinaryExpr::create(Binary->getOpcode(), NewLHS, NewRHS,
                                getContext());
  }
  }

  llvm_unreachable("Invalid expression kind!");
}
/// Decide whether the text starting at \p StrLoc is an altmacro <string> or
/// the start of an arithmetic expression.
///
/// The text is scanned forward from \p StrLoc until a '>', a line break
/// ('\n' or '\r') or the terminating NUL is reached. A '!' escapes the
/// character that follows it, so an escaped '>' does not close the string.
///
/// \param StrLoc  Location to start scanning from; its pointer must not be
///                null.
/// \param EndLoc  Set only on success, to the location just past the closing
///                '>'.
/// \returns true if a closing '>' was found before the end of the line or
///          buffer, false otherwise (the text is then treated as arithmetic).
///
/// There is a gap between the AltMacro's documentation and the single quote
/// implementation. GCC does not fully support this feature and so we will not
/// support it.
/// TODO: Adding single quote as a string.
static bool isAltmacroString(SMLoc &StrLoc, SMLoc &EndLoc) {
  assert((StrLoc.getPointer() != nullptr) &&
         "Argument to the function cannot be a NULL value");
  const char *CharPtr = StrLoc.getPointer();
  while ((*CharPtr != '>') && (*CharPtr != '\n') && (*CharPtr != '\r') &&
         (*CharPtr != '\0')) {
    // '!' escapes the next character, so step over both. Never step over the
    // terminating NUL though: a '!' at the very end of the buffer would
    // otherwise send the scan past the end of the input.
    if (*CharPtr == '!' && CharPtr[1] != '\0')
      CharPtr++;
    CharPtr++;
  }
  if (*CharPtr == '>') {
    EndLoc = StrLoc.getFromPointer(CharPtr + 1);
    return true;
  }
  return false;
}
/// Build a copy of \p AltMacroStr with the '!' escape characters removed.
///
/// Each '!' is dropped and the character following it is copied verbatim
/// (so "!!" yields a single '!'). A '!' that is the last character of the
/// input has nothing to escape and is copied through unchanged.
///
/// \param AltMacroStr  The text to unescape.
/// \returns the unescaped string.
static std::string altMacroString(StringRef AltMacroStr) {
  std::string Res;
  const size_t Size = AltMacroStr.size();
  for (size_t Pos = 0; Pos < Size; Pos++) {
    // Only skip the escape when there is a character after it; otherwise the
    // index below would read one element past the end of the string.
    if (AltMacroStr[Pos] == '!' && Pos + 1 < Size)
      Pos++;
    Res += AltMacroStr[Pos];
  }
  return Res;
}
/// Parse an expression and return it.
///
/// expr ::= expr &&,|| expr -> lowest.
/// expr ::= expr |,^,&,! expr
/// expr ::= expr ==,!=,<>,<,<=,>,>= expr
/// expr ::= expr <<,>> expr
/// expr ::= expr +,- expr
/// expr ::= expr *,/,% expr -> highest.
/// expr ::= primaryexpr
///
bool AsmParser::parseExpression(const MCExpr *&Res, SMLoc &EndLoc) {
// Parse the expression.
Res = nullptr;
if (getTargetParser().parsePrimaryExpr(Res, EndLoc) ||
parseBinOpRHS(1, Res, EndLoc))
return true;
// As a special case, we support 'a op b @ modifier' by rewriting the
// expression to include the modifier. This is inefficient, but in general we
// expect users to use 'a@modifier op b'.
if (Lexer.getKind() == AsmToken::At) {
Lex();
if (Lexer.isNot(AsmToken::Identifier))
return TokError("unexpected symbol modifier following '@'");
MCSymbolRefExpr::VariantKind Variant =
MCSymbolRefExpr::getVariantKindForName(getTok().getIdentifier());
if (Variant == MCSymbolRefExpr::VK_Invalid)
return TokError("invalid variant '" + getTok().getIdentifier() + "'");
const MCExpr *ModifiedRes = applyModifierToExpr(Res, Variant);
if (!ModifiedRes) {
return TokError("invalid modifier '" + getTok().getIdentifier() +
"' (no symbols present)");
}
Res = ModifiedRes;
Lex();
}
// Try to constant fold it up front, if possible. Do not exploit
// assembler here.
int64_t Value;
if (Res->evaluateAsAbsolute(Value))
Res = MCConstantExpr::create(Value, getContext());
return false;
}
/// Parse a parenthesized expression followed by any binary operators that
/// continue it. \p Res is reset before parsing. Returns true on error.
bool AsmParser::parseParenExpression(const MCExpr *&Res, SMLoc &EndLoc) {
  Res = nullptr;
  if (parseParenExpr(Res, EndLoc))
    return true;
  return parseBinOpRHS(1, Res, EndLoc);
}
/// Parse a parenthesized expression nested \p ParenDepth levels deep.
///
/// After the innermost parenthesized expression, each level parses the binary
/// operators that follow and then consumes its closing ')'. The ')' of the
/// last level is left unconsumed, matching parseParenExpression().
///
/// \returns true on error, false on success.
bool AsmParser::parseParenExprOfDepth(unsigned ParenDepth, const MCExpr *&Res,
                                      SMLoc &EndLoc) {
  if (parseParenExpr(Res, EndLoc))
    return true;

  while (ParenDepth != 0) {
    if (parseBinOpRHS(1, Res, EndLoc))
      return true;

    // The final RParen is not lexed here.
    const bool IsLastLevel = (ParenDepth == 1);
    if (!IsLastLevel) {
      EndLoc = getTok().getEndLoc();
      if (parseToken(AsmToken::RParen,
                     "expected ')' in parentheses expression"))
        return true;
    }

    --ParenDepth;
  }

  return false;
}
bool AsmParser::parseAbsoluteExpression(int64_t &Res) {
const MCExpr *Expr;
SMLoc StartLoc = Lexer.getLoc();
if (parseExpression(Expr))
return true;
if (!Expr->evaluateAsAbsolute(Res, getStreamer().getAssemblerPtr()))
return Error(StartLoc, "expected absolute expression");
return false;
}
/// Map token \p K to a binary operator using the Darwin precedence table.
///
/// \param K                    Token to classify.
/// \param Kind                 Receives the opcode when \p K is a binary
///                             operator; left untouched otherwise.
/// \param ShouldUseLogicalShr  Selects LShr instead of AShr for '>>'.
/// \returns the precedence level (1 = lowest, 6 = highest), or 0 if \p K is
///          not a binary operator.
static unsigned getDarwinBinOpPrecedence(AsmToken::TokenKind K,
                                         MCBinaryExpr::Opcode &Kind,
                                         bool ShouldUseLogicalShr) {
  // Store the opcode and hand back the precedence in one step.
  auto Select = [&Kind](MCBinaryExpr::Opcode Op, unsigned Prec) -> unsigned {
    Kind = Op;
    return Prec;
  };

  switch (K) {
  // Lowest Precedence: &&, ||
  case AsmToken::AmpAmp:
    return Select(MCBinaryExpr::LAnd, 1);
  case AsmToken::PipePipe:
    return Select(MCBinaryExpr::LOr, 1);

  // Low Precedence: |, &, ^
  //
  // FIXME: gas seems to support '!' as an infix operator?
  case AsmToken::Pipe:
    return Select(MCBinaryExpr::Or, 2);
  case AsmToken::Caret:
    return Select(MCBinaryExpr::Xor, 2);
  case AsmToken::Amp:
    return Select(MCBinaryExpr::And, 2);

  // Low Intermediate Precedence: ==, !=, <>, <, <=, >, >=
  case AsmToken::EqualEqual:
    return Select(MCBinaryExpr::EQ, 3);
  case AsmToken::ExclaimEqual:
  case AsmToken::LessGreater:
    return Select(MCBinaryExpr::NE, 3);
  case AsmToken::Less:
    return Select(MCBinaryExpr::LT, 3);
  case AsmToken::LessEqual:
    return Select(MCBinaryExpr::LTE, 3);
  case AsmToken::Greater:
    return Select(MCBinaryExpr::GT, 3);
  case AsmToken::GreaterEqual:
    return Select(MCBinaryExpr::GTE, 3);

  // Intermediate Precedence: <<, >>
  case AsmToken::LessLess:
    return Select(MCBinaryExpr::Shl, 4);
  case AsmToken::GreaterGreater:
    return Select(
        ShouldUseLogicalShr ? MCBinaryExpr::LShr : MCBinaryExpr::AShr, 4);

  // High Intermediate Precedence: +, -
  case AsmToken::Plus:
    return Select(MCBinaryExpr::Add, 5);
  case AsmToken::Minus:
    return Select(MCBinaryExpr::Sub, 5);

  // Highest Precedence: *, /, %
  case AsmToken::Star:
    return Select(MCBinaryExpr::Mul, 6);
  case AsmToken::Slash:
    return Select(MCBinaryExpr::Div, 6);
  case AsmToken::Percent:
    return Select(MCBinaryExpr::Mod, 6);

  default:
    return 0; // not a binop.
  }
}
/// Map token \p K to a binary operator using the GNU precedence table.
///
/// Unlike the Darwin table, '||' (1) binds less tightly than '&&' (2), the
/// bitwise operators sit above '+'/'-', and the shifts share the highest
/// level with '*', '/' and '%'.
///
/// \param K                    Token to classify.
/// \param Kind                 Receives the opcode when \p K is a binary
///                             operator; left untouched otherwise.
/// \param ShouldUseLogicalShr  Selects LShr instead of AShr for '>>'.
/// \returns the precedence level (1 = lowest, 6 = highest), or 0 if \p K is
///          not a binary operator.
static unsigned getGNUBinOpPrecedence(AsmToken::TokenKind K,
                                      MCBinaryExpr::Opcode &Kind,
                                      bool ShouldUseLogicalShr) {
  // Store the opcode and hand back the precedence in one step.
  auto Select = [&Kind](MCBinaryExpr::Opcode Op, unsigned Prec) -> unsigned {
    Kind = Op;
    return Prec;
  };

  switch (K) {
  // Lowest Precedence: &&, ||
  case AsmToken::AmpAmp:
    return Select(MCBinaryExpr::LAnd, 2);
  case AsmToken::PipePipe:
    return Select(MCBinaryExpr::LOr, 1);

  // Low Precedence: ==, !=, <>, <, <=, >, >=
  case AsmToken::EqualEqual:
    return Select(MCBinaryExpr::EQ, 3);
  case AsmToken::ExclaimEqual:
  case AsmToken::LessGreater:
    return Select(MCBinaryExpr::NE, 3);
  case AsmToken::Less:
    return Select(MCBinaryExpr::LT, 3);
  case AsmToken::LessEqual:
    return Select(MCBinaryExpr::LTE, 3);
  case AsmToken::Greater:
    return Select(MCBinaryExpr::GT, 3);
  case AsmToken::GreaterEqual:
    return Select(MCBinaryExpr::GTE, 3);

  // Low Intermediate Precedence: +, -
  case AsmToken::Plus:
    return Select(MCBinaryExpr::Add, 4);
  case AsmToken::Minus:
    return Select(MCBinaryExpr::Sub, 4);

  // High Intermediate Precedence: |, &, ^
  //
  // FIXME: gas seems to support '!' as an infix operator?
  case AsmToken::Pipe:
    return Select(MCBinaryExpr::Or, 5);
  case AsmToken::Caret:
    return Select(MCBinaryExpr::Xor, 5);
  case AsmToken::Amp:
    return Select(MCBinaryExpr::And, 5);

  // Highest Precedence: *, /, %, <<, >>
  case AsmToken::Star:
    return Select(MCBinaryExpr::Mul, 6);
  case AsmToken::Slash:
    return Select(MCBinaryExpr::Div, 6);
  case AsmToken::Percent:
    return Select(MCBinaryExpr::Mod, 6);
  case AsmToken::LessLess:
    return Select(MCBinaryExpr::Shl, 6);
  case AsmToken::GreaterGreater:
    return Select(
        ShouldUseLogicalShr ? MCBinaryExpr::LShr : MCBinaryExpr::AShr, 6);

  default:
    return 0; // not a binop.
  }
}
/// Look up the precedence of token \p K, storing its opcode in \p Kind.
/// Dispatches to the Darwin or the GNU precedence table depending on
/// IsDarwin; the target's MCAsmInfo decides whether '>>' is a logical shift.
unsigned AsmParser::getBinOpPrecedence(AsmToken::TokenKind K,
                                       MCBinaryExpr::Opcode &Kind) {
  const bool LogicalShr = MAI.shouldUseLogicalShr();
  if (IsDarwin)
    return getDarwinBinOpPrecedence(K, Kind, LogicalShr);
  return getGNUBinOpPrecedence(K, Kind, LogicalShr);
}
/// Parse all binary operators with precedence >= 'Precedence'.
/// Res contains the LHS of the expression on input.
bool AsmParser::parseBinOpRHS(unsigned Precedence, const MCExpr *&Res,
SMLoc &EndLoc) {
SMLoc StartLoc = Lexer.getLoc();
while (true) {
MCBinaryExpr::Opcode Kind = MCBinaryExpr::Add;
unsigned TokPrec = getBinOpPrecedence(Lexer.getKind(), Kind);
// If the next token is lower precedence than we are allowed to eat, return
// successfully with what we ate already.
if (TokPrec < Precedence)
return false;
Lex();
// Eat the next primary expression.
const MCExpr *RHS;
if (getTargetParser().parsePrimaryExpr(RHS, EndLoc))
return true;
// If BinOp binds less tightly with RHS than the operator after RHS, let
// the pending operator take RHS as its LHS.
MCBinaryExpr::Opcode Dummy;
unsigned NextTokPrec = getBinOpPrecedence(Lexer.getKind(), Dummy);
if (TokPrec < NextTokPrec && parseBinOpRHS(TokPrec + 1, RHS, EndLoc))
return true;
// Merge LHS and RHS according to operator.
Res = MCBinaryExpr::create(Kind, Res, RHS, getContext(), StartLoc);
}
}
/// ParseStatement:
/// ::= EndOfStatement
/// ::= Label* Directive ...Operands... EndOfStatement
/// ::= Label* Identifier OperandList* EndOfStatement
///
/// Parses a single statement from the lexer: an empty statement, a label, an
/// assignment, a macro instantiation, a directive or an instruction.
///
/// \param Info  Receives the parsed operands and the parse-error flag of an
///              instruction, and collects inline-asm rewrites.
/// \param SI    When non-null while parsing inline asm, it is asked to map a
///              label name to its internal name.
/// \returns true on error, false otherwise.
bool AsmParser::parseStatement(ParseStatementInfo &Info,
MCAsmParserSemaCallback *SI) {
assert(!hasPendingError() && "parseStatement started with pending error");
// Eat initial spaces and comments
while (Lexer.is(AsmToken::Space))
Lex();
if (Lexer.is(AsmToken::EndOfStatement)) {
// if this is a line comment we can drop it safely
if (getTok().getString().empty() || getTok().getString().front() == '\r' ||
getTok().getString().front() == '\n')
Out.AddBlankLine();
Lex();
return false;
}
// Statements always start with an identifier.
AsmToken ID = getTok();
SMLoc IDLoc = ID.getLoc();
StringRef IDVal;
// Stays -1 unless the statement starts with an integer token.
int64_t LocalLabelVal = -1;
// A cpp line marker ('# number "filename"') is handled separately.
if (Lexer.is(AsmToken::HashDirective))
return parseCppHashLineFilenameComment(IDLoc);
// Allow an integer followed by a ':' as a directional local label.
if (Lexer.is(AsmToken::Integer)) {
LocalLabelVal = getTok().getIntVal();
if (LocalLabelVal < 0) {
// Only diagnose when the statement is not being skipped by an inactive
// conditional block.
if (!TheCondState.Ignore) {
Lex(); // always eat a token
return Error(IDLoc, "unexpected token at start of statement");
}
IDVal = "";
} else {
IDVal = getTok().getString();
Lex(); // Consume the integer token to be used as an identifier token.
if (Lexer.getKind() != AsmToken::Colon) {
if (!TheCondState.Ignore) {
Lex(); // always eat a token
return Error(IDLoc, "unexpected token at start of statement");
}
}
}
} else if (Lexer.is(AsmToken::Dot)) {
// Treat '.' as a valid identifier in this context.
Lex();
IDVal = ".";
} else if (Lexer.is(AsmToken::LCurly)) {
// Treat '{' as a valid identifier in this context.
Lex();
IDVal = "{";
} else if (Lexer.is(AsmToken::RCurly)) {
// Treat '}' as a valid identifier in this context.
Lex();
IDVal = "}";
} else if (Lexer.is(AsmToken::Star) &&
getTargetParser().starIsStartOfStatement()) {
// Accept '*' as a valid start of statement.
Lex();
IDVal = "*";
} else if (parseIdentifier(IDVal)) {
if (!TheCondState.Ignore) {
Lex(); // always eat a token
return Error(IDLoc, "unexpected token at start of statement");
}
IDVal = "";
}
// Handle conditional assembly here before checking for skipping. We
// have to do this so that .endif isn't skipped in a ".if 0" block for
// example.
StringMap<DirectiveKind>::const_iterator DirKindIt =
DirectiveKindMap.find(IDVal);
DirectiveKind DirKind = (DirKindIt == DirectiveKindMap.end())
? DK_NO_DIRECTIVE
: DirKindIt->getValue();
switch (DirKind) {
default:
break;
case DK_IF:
case DK_IFEQ:
case DK_IFGE:
case DK_IFGT:
case DK_IFLE:
case DK_IFLT:
case DK_IFNE:
return parseDirectiveIf(IDLoc, DirKind);
case DK_IFB:
return parseDirectiveIfb(IDLoc, true);
case DK_IFNB:
return parseDirectiveIfb(IDLoc, false);
case DK_IFC:
return parseDirectiveIfc(IDLoc, true);
case DK_IFEQS:
return parseDirectiveIfeqs(IDLoc, true);
case DK_IFNC:
return parseDirectiveIfc(IDLoc, false);
case DK_IFNES:
return parseDirectiveIfeqs(IDLoc, false);
case DK_IFDEF:
return parseDirectiveIfdef(IDLoc, true);
case DK_IFNDEF:
case DK_IFNOTDEF:
return parseDirectiveIfdef(IDLoc, false);
case DK_ELSEIF:
return parseDirectiveElseIf(IDLoc);
case DK_ELSE:
return parseDirectiveElse(IDLoc);
case DK_ENDIF:
return parseDirectiveEndIf(IDLoc);
}
// Ignore the statement if in the middle of inactive conditional
// (e.g. ".if 0").
if (TheCondState.Ignore) {
eatToEndOfStatement();
return false;
}
// FIXME: Recurse on local labels?
// See what kind of statement we have.
switch (Lexer.getKind()) {
case AsmToken::Colon: {
// The target may decline to treat this token as a label; it is then parsed
// as a normal instruction or directive below.
if (!getTargetParser().isLabel(ID))
break;
if (checkForValidSection())
return true;
// identifier ':' -> Label.
Lex();
// Diagnose attempt to use '.' as a label.
if (IDVal == ".")
return Error(IDLoc, "invalid use of pseudo-symbol '.' as a label");
// Diagnose attempt to use a variable as a label.
//
// FIXME: Diagnostics. Note the location of the definition as a label.
// FIXME: This doesn't diagnose assignment to a symbol which has been
// implicitly marked as external.
MCSymbol *Sym;
if (LocalLabelVal == -1) {
// Named label. For inline asm the label is replaced by the internal name
// supplied by the Sema callback, and the replacement is recorded as a
// rewrite.
if (ParsingInlineAsm && SI) {
StringRef RewrittenLabel =
SI->LookupInlineAsmLabel(IDVal, getSourceManager(), IDLoc, true);
assert(!RewrittenLabel.empty() &&
"We should have an internal name here.");
Info.AsmRewrites->emplace_back(AOK_Label, IDLoc, IDVal.size(),
RewrittenLabel);
IDVal = RewrittenLabel;
}
Sym = getContext().getOrCreateSymbol(IDVal);
} else
// 'N:' defines a directional local label.
Sym = Ctx.createDirectionalLocalSymbol(LocalLabelVal);
// End of Labels should be treated as end of line for lexing
// purposes but that information is not available to the Lexer who
// does not understand Labels. This may cause us to see a Hash
// here instead of a preprocessor line comment.
if (getTok().is(AsmToken::Hash)) {
StringRef CommentStr = parseStringToEndOfStatement();
Lexer.Lex();
Lexer.UnLex(AsmToken(AsmToken::EndOfStatement, CommentStr));
}
// Consume any end of statement token, if present, to avoid spurious
// AddBlankLine calls().
if (getTok().is(AsmToken::EndOfStatement)) {
Lex();
}
getTargetParser().doBeforeLabelEmit(Sym);
// Emit the label.
if (!getTargetParser().isParsingInlineAsm())
Out.EmitLabel(Sym, IDLoc);
// If we are generating dwarf for assembly source files then gather the
// info to make a dwarf label entry for this label if needed.
if (enabledGenDwarfForAssembly())
MCGenDwarfLabelEntry::Make(Sym, &getStreamer(), getSourceManager(),
IDLoc);
getTargetParser().onLabelParsed(Sym);
return false;
}
case AsmToken::Equal:
if (!getTargetParser().equalIsAsmAssignment())
break;
// identifier '=' ... -> assignment statement
Lex();
return parseAssignment(IDVal, true);
default: // Normal instruction or directive.
break;
}
// If macros are enabled, check to see if this is a macro instantiation.
if (areMacrosEnabled())
if (const MCAsmMacro *M = getContext().lookupMacro(IDVal)) {
return handleMacroEntry(M, IDLoc);
}
// Otherwise, we have a normal instruction or directive.
// Directives start with "."
if (IDVal.startswith(".") && IDVal != ".") {
// There are several entities interested in parsing directives:
//
// 1. The target-specific assembly parser. Some directives are target
// specific or may potentially behave differently on certain targets.
// 2. Asm parser extensions. For example, platform-specific parsers
// (like the ELF parser) register themselves as extensions.
// 3. The generic directive parser implemented by this class. These are
// all the directives that behave in a target and platform independent
// manner, or at least have a default behavior that's shared between
// all targets and platforms.
getTargetParser().flushPendingInstructions(getStreamer());
// Remember the current token location so we can tell afterwards whether the
// target parser consumed anything.
SMLoc StartTokLoc = getTok().getLoc();
bool TPDirectiveReturn = getTargetParser().ParseDirective(ID);
if (hasPendingError())
return true;
// Currently the return value should be true if we are
// uninterested but as this is at odds with the standard parsing
// convention (return true = error) we have instances of a parsed
// directive that fails returning true as an error. Catch these
// cases as best as possible errors here.
if (TPDirectiveReturn && StartTokLoc != getTok().getLoc())
return true;
// Return if we did some parsing or believe we succeeded.
if (!TPDirectiveReturn || StartTokLoc != getTok().getLoc())
return false;
// Next, check the extension directive map to see if any extension has
// registered itself to parse this directive.
std::pair<MCAsmParserExtension *, DirectiveHandler> Handler =
ExtensionDirectiveMap.lookup(IDVal);
if (Handler.first)
return (*Handler.second)(Handler.first, IDVal, IDLoc);
// Finally, if no one else is interested in this directive, it must be
// generic and familiar to this class.
switch (DirKind) {
default:
break;
case DK_SET:
case DK_EQU:
return parseDirectiveSet(IDVal, true);
case DK_EQUIV:
return parseDirectiveSet(IDVal, false);
case DK_ASCII:
return parseDirectiveAscii(IDVal, false);
case DK_ASCIZ:
case DK_STRING:
return parseDirectiveAscii(IDVal, true);
case DK_BYTE:
case DK_DC_B:
return parseDirectiveValue(IDVal, 1);
case DK_DC:
case DK_DC_W:
case DK_SHORT:
case DK_VALUE:
case DK_2BYTE:
return parseDirectiveValue(IDVal, 2);
case DK_LONG:
case DK_INT:
case DK_4BYTE:
case DK_DC_L:
return parseDirectiveValue(IDVal, 4);
case DK_QUAD:
case DK_8BYTE:
return parseDirectiveValue(IDVal, 8);
case DK_DC_A:
return parseDirectiveValue(
IDVal, getContext().getAsmInfo()->getCodePointerSize());
case DK_OCTA:
return parseDirectiveOctaValue(IDVal);
case DK_SINGLE:
case DK_FLOAT:
case DK_DC_S:
return parseDirectiveRealValue(IDVal, APFloat::IEEEsingle());
case DK_DOUBLE:
case DK_DC_D:
return parseDirectiveRealValue(IDVal, APFloat::IEEEdouble());
case DK_ALIGN: {
bool IsPow2 = !getContext().getAsmInfo()->getAlignmentIsInBytes();
return parseDirectiveAlign(IsPow2, /*ExprSize=*/1);
}
case DK_ALIGN32: {
bool IsPow2 = !getContext().getAsmInfo()->getAlignmentIsInBytes();
return parseDirectiveAlign(IsPow2, /*ExprSize=*/4);
}
case DK_BALIGN:
return parseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/1);
case DK_BALIGNW:
return parseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/2);
case DK_BALIGNL:
return parseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/4);
case DK_P2ALIGN:
return parseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/1);
case DK_P2ALIGNW:
return parseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/2);
case DK_P2ALIGNL:
return parseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/4);
case DK_ORG:
return parseDirectiveOrg();
case DK_FILL:
return parseDirectiveFill();
case DK_ZERO:
return parseDirectiveZero();
case DK_EXTERN:
eatToEndOfStatement(); // .extern is the default, ignore it.
return false;
case DK_GLOBL:
case DK_GLOBAL:
return parseDirectiveSymbolAttribute(MCSA_Global);
case DK_LAZY_REFERENCE:
return parseDirectiveSymbolAttribute(MCSA_LazyReference);
case DK_NO_DEAD_STRIP:
return parseDirectiveSymbolAttribute(MCSA_NoDeadStrip);
case DK_SYMBOL_RESOLVER:
return parseDirectiveSymbolAttribute(MCSA_SymbolResolver);
case DK_PRIVATE_EXTERN:
return parseDirectiveSymbolAttribute(MCSA_PrivateExtern);
case DK_REFERENCE:
return parseDirectiveSymbolAttribute(MCSA_Reference);
case DK_WEAK_DEFINITION:
return parseDirectiveSymbolAttribute(MCSA_WeakDefinition);
case DK_WEAK_REFERENCE:
return parseDirectiveSymbolAttribute(MCSA_WeakReference);
case DK_WEAK_DEF_CAN_BE_HIDDEN:
return parseDirectiveSymbolAttribute(MCSA_WeakDefAutoPrivate);
case DK_COLD:
return parseDirectiveSymbolAttribute(MCSA_Cold);
case DK_COMM:
case DK_COMMON:
return parseDirectiveComm(/*IsLocal=*/false);
case DK_LCOMM:
return parseDirectiveComm(/*IsLocal=*/true);
case DK_ABORT:
return parseDirectiveAbort();
case DK_INCLUDE:
return parseDirectiveInclude();
case DK_INCBIN:
return parseDirectiveIncbin();
case DK_CODE16:
case DK_CODE16GCC:
return TokError(Twine(IDVal) +
" not currently supported for this target");
case DK_REPT:
return parseDirectiveRept(IDLoc, IDVal);
case DK_IRP:
return parseDirectiveIrp(IDLoc);
case DK_IRPC:
return parseDirectiveIrpc(IDLoc);
case DK_ENDR:
return parseDirectiveEndr(IDLoc);
case DK_BUNDLE_ALIGN_MODE:
return parseDirectiveBundleAlignMode();
case DK_BUNDLE_LOCK:
return parseDirectiveBundleLock();
case DK_BUNDLE_UNLOCK:
return parseDirectiveBundleUnlock();
case DK_SLEB128:
return parseDirectiveLEB128(true);
case DK_ULEB128:
return parseDirectiveLEB128(false);
case DK_SPACE:
case DK_SKIP:
return parseDirectiveSpace(IDVal);
case DK_FILE:
return parseDirectiveFile(IDLoc);
case DK_LINE:
return parseDirectiveLine();
case DK_LOC:
return parseDirectiveLoc();
case DK_STABS:
return parseDirectiveStabs();
case DK_CV_FILE:
return parseDirectiveCVFile();
case DK_CV_FUNC_ID:
return parseDirectiveCVFuncId();
case DK_CV_INLINE_SITE_ID:
return parseDirectiveCVInlineSiteId();
case DK_CV_LOC:
return parseDirectiveCVLoc();
case DK_CV_LINETABLE:
return parseDirectiveCVLinetable();
case DK_CV_INLINE_LINETABLE:
return parseDirectiveCVInlineLinetable();
case DK_CV_DEF_RANGE:
return parseDirectiveCVDefRange();
case DK_CV_STRING:
return parseDirectiveCVString();
case DK_CV_STRINGTABLE:
return parseDirectiveCVStringTable();
case DK_CV_FILECHECKSUMS:
return parseDirectiveCVFileChecksums();
case DK_CV_FILECHECKSUM_OFFSET:
return parseDirectiveCVFileChecksumOffset();
case DK_CV_FPO_DATA:
return parseDirectiveCVFPOData();
case DK_CFI_SECTIONS:
return parseDirectiveCFISections();
case DK_CFI_STARTPROC:
return parseDirectiveCFIStartProc();
case DK_CFI_ENDPROC:
return parseDirectiveCFIEndProc();
case DK_CFI_DEF_CFA:
return parseDirectiveCFIDefCfa(IDLoc);
case DK_CFI_DEF_CFA_OFFSET:
return parseDirectiveCFIDefCfaOffset();
case DK_CFI_ADJUST_CFA_OFFSET:
return parseDirectiveCFIAdjustCfaOffset();
case DK_CFI_DEF_CFA_REGISTER:
return parseDirectiveCFIDefCfaRegister(IDLoc);
case DK_CFI_OFFSET:
return parseDirectiveCFIOffset(IDLoc);
case DK_CFI_REL_OFFSET:
return parseDirectiveCFIRelOffset(IDLoc);
case DK_CFI_PERSONALITY:
return parseDirectiveCFIPersonalityOrLsda(true);
case DK_CFI_LSDA:
return parseDirectiveCFIPersonalityOrLsda(false);
case DK_CFI_REMEMBER_STATE:
return parseDirectiveCFIRememberState();
case DK_CFI_RESTORE_STATE:
return parseDirectiveCFIRestoreState();
case DK_CFI_SAME_VALUE:
return parseDirectiveCFISameValue(IDLoc);
case DK_CFI_RESTORE:
return parseDirectiveCFIRestore(IDLoc);
case DK_CFI_ESCAPE:
return parseDirectiveCFIEscape();
case DK_CFI_RETURN_COLUMN:
return parseDirectiveCFIReturnColumn(IDLoc);
case DK_CFI_SIGNAL_FRAME:
return parseDirectiveCFISignalFrame();
case DK_CFI_UNDEFINED:
return parseDirectiveCFIUndefined(IDLoc);
case DK_CFI_REGISTER:
return parseDirectiveCFIRegister(IDLoc);
case DK_CFI_WINDOW_SAVE:
return parseDirectiveCFIWindowSave();
case DK_MACROS_ON:
case DK_MACROS_OFF:
return parseDirectiveMacrosOnOff(IDVal);
case DK_MACRO:
return parseDirectiveMacro(IDLoc);
case DK_ALTMACRO:
case DK_NOALTMACRO:
return parseDirectiveAltmacro(IDVal);
case DK_EXITM:
return parseDirectiveExitMacro(IDVal);
case DK_ENDM:
case DK_ENDMACRO:
return parseDirectiveEndMacro(IDVal);
case DK_PURGEM:
return parseDirectivePurgeMacro(IDLoc);
case DK_END:
return parseDirectiveEnd(IDLoc);
case DK_ERR:
return parseDirectiveError(IDLoc, false);
case DK_ERROR:
return parseDirectiveError(IDLoc, true);
case DK_WARNING:
return parseDirectiveWarning(IDLoc);
case DK_RELOC:
return parseDirectiveReloc(IDLoc);
case DK_DCB:
case DK_DCB_W:
return parseDirectiveDCB(IDVal, 2);
case DK_DCB_B:
return parseDirectiveDCB(IDVal, 1);
case DK_DCB_D:
return parseDirectiveRealDCB(IDVal, APFloat::IEEEdouble());
case DK_DCB_L:
return parseDirectiveDCB(IDVal, 4);
case DK_DCB_S:
return parseDirectiveRealDCB(IDVal, APFloat::IEEEsingle());
case DK_DC_X:
case DK_DCB_X:
return TokError(Twine(IDVal) +
" not currently supported for this target");
case DK_DS:
case DK_DS_W:
return parseDirectiveDS(IDVal, 2);
case DK_DS_B:
return parseDirectiveDS(IDVal, 1);
case DK_DS_D:
return parseDirectiveDS(IDVal, 8);
case DK_DS_L:
case DK_DS_S:
return parseDirectiveDS(IDVal, 4);
case DK_DS_P:
case DK_DS_X:
return parseDirectiveDS(IDVal, 12);
case DK_PRINT:
return parseDirectivePrint(IDLoc);
case DK_ADDRSIG:
return parseDirectiveAddrsig();
case DK_ADDRSIG_SYM:
return parseDirectiveAddrsigSym();
}
return Error(IDLoc, "unknown directive");
}
// __asm _emit or __asm __emit
if (ParsingInlineAsm && (IDVal == "_emit" || IDVal == "__emit" ||
IDVal == "_EMIT" || IDVal == "__EMIT"))
return parseDirectiveMSEmit(IDLoc, Info, IDVal.size());
// __asm align
if (ParsingInlineAsm && (IDVal == "align" || IDVal == "ALIGN"))
return parseDirectiveMSAlign(IDLoc, Info);
// __asm even: only a rewrite is recorded here; parsing then continues with
// the normal instruction path below.
if (ParsingInlineAsm && (IDVal == "even" || IDVal == "EVEN"))
Info.AsmRewrites->emplace_back(AOK_EVEN, IDLoc, 4);
if (checkForValidSection())
return true;
// Canonicalize the opcode to lower case.
std::string OpcodeStr = IDVal.lower();
ParseInstructionInfo IInfo(Info.AsmRewrites);
bool ParseHadError = getTargetParser().ParseInstruction(IInfo, OpcodeStr, ID,
Info.ParsedOperands);
Info.ParseError = ParseHadError;
// Dump the parsed representation, if requested.
if (getShowParsedOperands()) {
SmallString<256> Str;
raw_svector_ostream OS(Str);
OS << "parsed instruction: [";
for (unsigned i = 0; i != Info.ParsedOperands.size(); ++i) {
if (i != 0)
OS << ", ";
Info.ParsedOperands[i]->print(OS);
}
OS << "]";
printMessage(IDLoc, SourceMgr::DK_Note, OS.str());
}
// Fail even if ParseInstruction erroneously returns false.
if (hasPendingError() || ParseHadError)
return true;
// If we are generating dwarf for the current section then generate a .loc
// directive for the instruction.
// NOTE: ParseHadError is always false from here on because of the early
// return above, so the !ParseHadError checks below are redundant.
if (!ParseHadError && enabledGenDwarfForAssembly() &&
getContext().getGenDwarfSectionSyms().count(
getStreamer().getCurrentSectionOnly())) {
unsigned Line;
// While macros are active, use the instantiation location of the first
// active macro instead of the instruction's own location.
if (ActiveMacros.empty())
Line = SrcMgr.FindLineNumber(IDLoc, CurBuffer);
else
Line = SrcMgr.FindLineNumber(ActiveMacros.front()->InstantiationLoc,
ActiveMacros.front()->ExitBuffer);
// If we previously parsed a cpp hash file line comment then make sure the
// current Dwarf File is for the CppHashFilename if not then emit the
// Dwarf File table for it and adjust the line number for the .loc.
if (!CppHashInfo.Filename.empty()) {
unsigned FileNumber = getStreamer().EmitDwarfFileDirective(
0, StringRef(), CppHashInfo.Filename);
getContext().setGenDwarfFileNumber(FileNumber);
unsigned CppHashLocLineNo =
SrcMgr.FindLineNumber(CppHashInfo.Loc, CppHashInfo.Buf);
// Rebase the line number onto the one recorded from the cpp line comment.
Line = CppHashInfo.LineNumber - 1 + (Line - CppHashLocLineNo);
}
getStreamer().EmitDwarfLocDirective(
getContext().getGenDwarfFileNumber(), Line, 0,
DWARF2_LINE_DEFAULT_IS_STMT ? DWARF2_FLAG_IS_STMT : 0, 0, 0,
StringRef());
}
// If parsing succeeded, match the instruction.
if (!ParseHadError) {
uint64_t ErrorInfo;
if (getTargetParser().MatchAndEmitInstruction(
IDLoc, Info.Opcode, Info.ParsedOperands, Out, ErrorInfo,
getTargetParser().isParsingInlineAsm()))
return true;
}
return false;
}
// Consume a '{' or '}' that marks the start or end of a block, together with
// an EndOfStatement token that directly follows it, and record a rewrite that
// removes the consumed text from the output asm string.
//
// Returns true if a brace was consumed, false if the current token is not a
// curly brace (in which case nothing is consumed).
bool
AsmParser::parseCurlyBlockScope(SmallVectorImpl<AsmRewrite> &AsmStrRewrites) {
  const bool AtBrace =
      Lexer.is(AsmToken::LCurly) || Lexer.is(AsmToken::RCurly);
  if (!AtBrace)
    return false;

  const SMLoc BraceLoc = Lexer.getLoc();
  Lex(); // Eat the brace

  // Eat EndOfStatement following the brace
  if (Lexer.is(AsmToken::EndOfStatement))
    Lex();

  // The skipped span runs from the brace up to the current lexer position.
  AsmStrRewrites.emplace_back(AOK_Skip, BraceLoc,
                              Lexer.getLoc().getPointer() -
                                  BraceLoc.getPointer());
  return true;
}
/// parseCppHashLineFilenameComment
///   ::= # number "filename"
///
/// Records the location \p L, the line number and the unquoted filename of a
/// preprocessor line marker in CppHashInfo. Always returns false.
bool AsmParser::parseCppHashLineFilenameComment(SMLoc L) {
  // Drop the '#'.
  Lex();

  // The two asserts below treat a malformed marker as an internal error: the
  // lexer is expected to have validated the shape before we get here.
  assert(getTok().is(AsmToken::Integer) &&
         "Lexing Cpp line comment: Expected Integer");
  const int64_t MarkerLine = getTok().getIntVal();
  Lex();

  assert(getTok().is(AsmToken::String) &&
         "Lexing Cpp line comment: Expected String");
  const StringRef Quoted = getTok().getString();
  Lex();

  // Strip the first and last character (the enclosing quotes).
  const StringRef Unquoted = Quoted.substr(1, Quoted.size() - 2);

  // Remember the marker for later diagnostics and possibly DWARF file info.
  CppHashInfo.Buf = CurBuffer;
  CppHashInfo.Loc = L;
  CppHashInfo.LineNumber = MarkerLine;
  CppHashInfo.Filename = Unquoted;

  // Only the first marker's filename is kept in FirstCppHashFilename.
  if (FirstCppHashFilename.empty())
    FirstCppHashFilename = Unquoted;
  return false;
}
/// Diagnostic handler. \p Context is the AsmParser that owns the handler.
///
/// When a cpp hash line filename comment has been parsed and the diagnostic
/// lies in the same source manager and buffer as that comment, the diagnostic
/// is re-created with the cpp hash filename and a line number computed
/// relative to the comment. Otherwise the diagnostic is passed through
/// unchanged. In both cases it goes to SavedDiagHandler when one is set, and
/// is printed to errs() otherwise.
void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) {
  const AsmParser *Parser = static_cast<const AsmParser *>(Context);
  raw_ostream &OS = errs();

  const SourceMgr &DiagSrcMgr = *Diag.getSourceMgr();
  SMLoc DiagLoc = Diag.getLoc();
  // Buffer holding the diagnostic location. It is looked up once and reused
  // for both the include-stack check and the cpp-hash buffer comparison.
  unsigned DiagBuf = DiagSrcMgr.FindBufferContainingLoc(DiagLoc);
  unsigned CppHashBuf =
      Parser->SrcMgr.FindBufferContainingLoc(Parser->CppHashInfo.Loc);

  // Like SourceMgr::printMessage() we need to print the include stack if any
  // before printing the message.
  if (!Parser->SavedDiagHandler && DiagBuf &&
      DiagBuf != DiagSrcMgr.getMainFileID()) {
    SMLoc ParentIncludeLoc = DiagSrcMgr.getParentIncludeLoc(DiagBuf);
    DiagSrcMgr.PrintIncludeStack(ParentIncludeLoc, OS);
  }

  // If we have not parsed a cpp hash line filename comment or the source
  // manager changed or buffer changed (like in a nested include) then just
  // print the normal diagnostic using its Filename and LineNo.
  if (!Parser->CppHashInfo.LineNumber || &DiagSrcMgr != &Parser->SrcMgr ||
      DiagBuf != CppHashBuf) {
    if (Parser->SavedDiagHandler)
      Parser->SavedDiagHandler(Diag, Parser->SavedDiagContext);
    else
      Diag.print(nullptr, OS);
    return;
  }

  // Use the CppHashFilename and calculate a line number based on the
  // CppHashInfo.Loc and CppHashInfo.LineNumber relative to this Diag's SMLoc
  // for the diagnostic.
  const std::string &Filename = Parser->CppHashInfo.Filename;

  int DiagLocLineNo = DiagSrcMgr.FindLineNumber(DiagLoc, DiagBuf);
  int CppHashLocLineNo =
      Parser->SrcMgr.FindLineNumber(Parser->CppHashInfo.Loc, CppHashBuf);
  int LineNo =
      Parser->CppHashInfo.LineNumber - 1 + (DiagLocLineNo - CppHashLocLineNo);

  SMDiagnostic NewDiag(*Diag.getSourceMgr(), Diag.getLoc(), Filename, LineNo,
                       Diag.getColumnNo(), Diag.getKind(), Diag.getMessage(),
                       Diag.getLineContents(), Diag.getRanges());

  if (Parser->SavedDiagHandler)
    Parser->SavedDiagHandler(NewDiag, Parser->SavedDiagContext);
  else
    NewDiag.print(nullptr, OS);
}
// FIXME: This is mostly duplicated from the function in AsmLexer.cpp. The
// difference being that that function accepts '@' as part of identifiers and
// we can't do that. AsmLexer.cpp should probably be changed to handle
// '@' as a special case when needed.
//
// Returns true for ASCII letters and digits (per isalnum) and for the
// punctuation characters '_', '$' and '.'.
static bool isIdentifierChar(char c) {
  switch (c) {
  case '_':
  case '$':
  case '.':
    return true;
  default:
    // Go through unsigned char so negative char values are not handed to
    // isalnum.
    return isalnum(static_cast<unsigned char>(c)) != 0;
  }
}
/// Write \p Body to \p OS with macro parameter references replaced by the
/// corresponding entries of \p A.
///
/// Two substitution syntaxes are handled:
///  - Darwin macros declared without parameters: "$$" emits '$', "$n" emits
///    the number of arguments, and "$0".."$9" emit the matching argument
///    (missing arguments expand to nothing).
///  - Everything else: "\name" emits the argument bound to parameter "name",
///    "\@" (when \p EnableAtPseudoVariable is set) emits the instantiation
///    counter, "\()" emits nothing, and an unknown "\name" is copied through.
///
/// \param L location used for the argument-count diagnostic.
/// \returns true after reporting "Wrong number of arguments", else false.
bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body,
                            ArrayRef<MCAsmMacroParameter> Parameters,
                            ArrayRef<MCAsmMacroArgument> A,
                            bool EnableAtPseudoVariable, SMLoc L) {
  unsigned NParameters = Parameters.size();
  bool HasVararg = NParameters ? Parameters.back().Vararg : false;
  if ((!IsDarwin || NParameters != 0) && NParameters != A.size())
    return Error(L, "Wrong number of arguments");

  // A macro without parameters is handled differently on Darwin:
  // gas accepts no arguments and does no substitutions
  while (!Body.empty()) {
    // Scan for the next substitution.
    std::size_t End = Body.size(), Pos = 0;
    for (; Pos != End; ++Pos) {
      // Check for a substitution or escape.
      if (IsDarwin && !NParameters) {
        // This macro has no parameters, look for $0, $1, etc.
        if (Body[Pos] != '$' || Pos + 1 == End)
          continue;

        char Next = Body[Pos + 1];
        if (Next == '$' || Next == 'n' ||
            isdigit(static_cast<unsigned char>(Next)))
          break;
      } else {
        // This macro has parameters, look for \foo, \bar, etc.
        if (Body[Pos] == '\\' && Pos + 1 != End)
          break;
      }
    }

    // Add the prefix.
    OS << Body.slice(0, Pos);

    // Check if we reached the end.
    if (Pos == End)
      break;

    if (IsDarwin && !NParameters) {
      switch (Body[Pos + 1]) {
      // $$ => $
      case '$':
        OS << '$';
        break;

      // $n => number of arguments
      case 'n':
        OS << A.size();
        break;

      // $[0-9] => argument
      default: {
        // Missing arguments are ignored.
        unsigned Index = Body[Pos + 1] - '0';
        if (Index >= A.size())
          break;

        // Otherwise substitute with the token values, with spaces eliminated.
        for (const AsmToken &Token : A[Index])
          OS << Token.getString();
        break;
      }
      }
      Pos += 2;
    } else {
      // The scan above guarantees Pos + 1 != End, so Body[I] is in range.
      unsigned I = Pos + 1;

      // Check for the \@ pseudo-variable.
      if (EnableAtPseudoVariable && Body[I] == '@' && I + 1 != End)
        ++I;
      else
        while (isIdentifierChar(Body[I]) && I + 1 != End)
          ++I;

      const char *Begin = Body.data() + Pos + 1;
      StringRef Argument(Begin, I - (Pos + 1));
      unsigned Index = 0;

      if (Argument == "@") {
        OS << NumOfMacroInstantiations;
        Pos += 2;
      } else {
        for (; Index < NParameters; ++Index)
          if (Parameters[Index].Name == Argument)
            break;

        if (Index == NParameters) {
          // "\()" expands to nothing. Only look at Body[Pos + 2] when it is
          // inside the body: a body ending in "\(" has Pos + 2 == End, and
          // indexing there would read past the end of the StringRef.
          if (Body[Pos + 1] == '(' && Pos + 2 != End && Body[Pos + 2] == ')')
            Pos += 3;
          else {
            // Unknown name: copy the backslash and the name through as-is.
            OS << '\\' << Argument;
            Pos = I;
          }
        } else {
          bool VarargParameter = HasVararg && Index == (NParameters - 1);
          for (const AsmToken &Token : A[Index]) {
            // For altmacro mode, you can write '%expr'.
            // The prefix '%' evaluates the expression 'expr'
            // and uses the result as a string (e.g. replace %(1+2) with the
            // string "3").
            // Here, we identify the integer token which is the result of the
            // absolute expression evaluation and replace it with its string
            // representation.
            if (AltMacroMode && Token.getString().front() == '%' &&
                Token.is(AsmToken::Integer))
              // Emit an integer value to the buffer.
              OS << Token.getIntVal();
            // Only Token that was validated as a string and begins with '<'
            // is considered altMacroString!!!
            else if (AltMacroMode && Token.getString().front() == '<' &&
                     Token.is(AsmToken::String)) {
              OS << altMacroString(Token.getStringContents());
            }
            // We expect no quotes around the string's contents when
            // parsing for varargs.
            else if (Token.isNot(AsmToken::String) || VarargParameter)
              OS << Token.getString();
            else
              OS << Token.getStringContents();
          }

          Pos += 1 + Argument.size();
        }
      }
    }
    // Update the scan point.
    Body = Body.substr(Pos);
  }

  return false;
}
// Construct a macro instantiation record. Each argument is stored unchanged
// in the member it initializes: IL -> InstantiationLoc, EB -> ExitBuffer,
// EL -> ExitLoc, CondStackDepth -> CondStackDepth.
MacroInstantiation::MacroInstantiation(SMLoc IL, int EB, SMLoc EL,
size_t CondStackDepth)
: InstantiationLoc(IL), ExitBuffer(EB), ExitLoc(EL),
CondStackDepth(CondStackDepth) {}
/// Returns true when \p kind is one of the operator token kinds listed below,
/// false for every other token kind.
static bool isOperator(AsmToken::TokenKind kind) {
  switch (kind) {
  // Arithmetic and miscellaneous operators.
  case AsmToken::Plus:
  case AsmToken::Minus:
  case AsmToken::Star:
  case AsmToken::Slash:
  case AsmToken::Tilde:
  case AsmToken::Dot:
  // Bitwise and logical operators.
  case AsmToken::Pipe:
  case AsmToken::PipePipe:
  case AsmToken::Amp:
  case AsmToken::AmpAmp:
  case AsmToken::Caret:
  case AsmToken::Exclaim:
  // Assignment, comparison and shift operators.
  case AsmToken::Equal:
  case AsmToken::EqualEqual:
  case AsmToken::ExclaimEqual:
  case AsmToken::Less:
  case AsmToken::LessEqual:
  case AsmToken::LessLess:
  case AsmToken::LessGreater:
  case AsmToken::Greater:
  case AsmToken::GreaterEqual:
  case AsmToken::GreaterGreater:
    return true;
  default:
    break;
  }
  return false;
}
namespace {

/// Scoped override of the lexer's skip-space setting.
///
/// The constructor calls setSkipSpace(SkipSpace) on the lexer; the destructor
/// calls setSkipSpace(true). Note that the destructor does not restore the
/// value that was in effect before construction.
class AsmLexerSkipSpaceRAII {
public:
  AsmLexerSkipSpaceRAII(AsmLexer &Lexer, bool SkipSpace) : Lexer(Lexer) {
    Lexer.setSkipSpace(SkipSpace);
  }

  // Not copyable: destroying a copy would reset the lexer to skip-space mode
  // while the original guard is still supposed to be in effect.
  AsmLexerSkipSpaceRAII(const AsmLexerSkipSpaceRAII &) = delete;
  AsmLexerSkipSpaceRAII &operator=(const AsmLexerSkipSpaceRAII &) = delete;

  ~AsmLexerSkipSpaceRAII() {
    Lexer.setSkipSpace(true);
  }

private:
  AsmLexer &Lexer;
};

} // end anonymous namespace
/// Collect the tokens of one macro argument into \p MA.
///
/// With \p Vararg set, the remainder of the statement is stored as a single
/// string token (nothing is stored when already at end of statement).
/// Otherwise tokens are appended until a top-level comma, a delimiting space,
/// or the end of the statement is reached; the terminating token itself is
/// left in the lexer.
///
/// \returns true after reporting an error (unexpected Eof or '=', or
/// unbalanced parentheses), false on success.
bool AsmParser::parseMacroArgument(MCAsmMacroArgument &MA, bool Vararg) {
  if (Vararg) {
    if (Lexer.is(AsmToken::EndOfStatement))
      return false;
    StringRef Rest = parseStringToEndOfStatement();
    MA.emplace_back(AsmToken::String, Rest);
    return false;
  }

  // Darwin doesn't use spaces to delimit arguments, so spaces are skipped
  // there; elsewhere the lexer is asked to hand out space tokens.
  AsmLexerSkipSpaceRAII ScopedSkipSpace(Lexer, IsDarwin);

  unsigned OpenParens = 0;
  for (;;) {
    if (Lexer.is(AsmToken::Eof) || Lexer.is(AsmToken::Equal))
      return TokError("unexpected token in macro instantiation");

    // Delimiters are only recognized outside of parentheses.
    if (OpenParens == 0) {
      if (Lexer.is(AsmToken::Comma))
        break;

      const bool SawSpace = Lexer.is(AsmToken::Space);
      if (SawSpace)
        Lexer.Lex(); // Eat spaces

      // Spaces can delimit parameters, but could also be part of an
      // expression. If the token after a space is an operator, the operator
      // is added to this argument and scanning continues.
      if (!IsDarwin && isOperator(Lexer.getKind())) {
        MA.push_back(getTok());
        Lexer.Lex();

        // Whitespace after an operator can be ignored.
        if (Lexer.is(AsmToken::Space))
          Lexer.Lex();

        continue;
      }

      if (SawSpace)
        break;
    }

    // handleMacroEntry relies on not advancing the lexer here
    // to be able to fill in the remaining default parameter values
    if (Lexer.is(AsmToken::EndOfStatement))
      break;

    // Track the parenthesis nesting; a stray ')' at depth zero is ignored.
    if (Lexer.is(AsmToken::LParen))
      ++OpenParens;
    else if (OpenParens && Lexer.is(AsmToken::RParen))
      --OpenParens;

    // The token belongs to the current argument.
    MA.push_back(getTok());
    Lexer.Lex();
  }

  if (OpenParens != 0)
    return TokError("unbalanced parentheses in macro argument");
  return false;
}
/// Parse the macro instantiation arguments into \p A.
///
/// \p M may be null, in which case no parameters are declared and an
/// arbitrary number of positional arguments is accepted. Arguments may be
/// positional or given as "name=value"; once a keyword argument has been
/// seen, positional ones are rejected. At the end of the statement,
/// parameters that received no value get their default, and a missing
/// required parameter is reported.
///
/// \returns true if an error was reported, false on success.
bool AsmParser::parseMacroArguments(const MCAsmMacro *M,
                                    MCAsmMacroArguments &A) {
  const unsigned NParameters = M ? M->Parameters.size() : 0;
  bool NamedParametersFound = false;
  SmallVector<SMLoc, 4> FALocs;

  A.resize(NParameters);
  FALocs.resize(NParameters);

  // Parse two kinds of macro invocations:
  // - macros defined without any parameters accept an arbitrary number of them
  // - macros defined with parameters accept at most that many of them
  bool HasVararg = NParameters ? M->Parameters.back().Vararg : false;
  for (unsigned Parameter = 0; !NParameters || Parameter < NParameters;
       ++Parameter) {
    SMLoc IDLoc = Lexer.getLoc();
    MCAsmMacroParameter FA;

    // "identifier =" introduces a keyword argument.
    if (Lexer.is(AsmToken::Identifier) && Lexer.peekTok().is(AsmToken::Equal)) {
      if (parseIdentifier(FA.Name))
        return Error(IDLoc, "invalid argument identifier for formal argument");

      if (Lexer.isNot(AsmToken::Equal))
        return TokError("expected '=' after formal parameter identifier");

      Lex();

      NamedParametersFound = true;
    }
    bool Vararg = HasVararg && Parameter == (NParameters - 1);

    if (NamedParametersFound && FA.Name.empty())
      return Error(IDLoc, "cannot mix positional and keyword arguments");

    SMLoc StrLoc = Lexer.getLoc();
    SMLoc EndLoc;
    if (AltMacroMode && Lexer.is(AsmToken::Percent)) {
      const MCExpr *AbsoluteExp;
      int64_t Value;
      // Eat '%'
      Lex();
      // A failed expression parse is a failure of this function as well;
      // report it to the caller instead of returning success.
      if (parseExpression(AbsoluteExp, EndLoc))
        return true;
      if (!AbsoluteExp->evaluateAsAbsolute(Value,
                                           getStreamer().getAssemblerPtr()))
        return Error(StrLoc, "expected absolute expression");
      // The token text spans from the '%' to the end of the expression; the
      // evaluated result is carried as the token's integer value.
      const char *StrChar = StrLoc.getPointer();
      const char *EndChar = EndLoc.getPointer();
      AsmToken newToken(AsmToken::Integer,
                        StringRef(StrChar, EndChar - StrChar), Value);
      FA.Value.push_back(newToken);
    } else if (AltMacroMode && Lexer.is(AsmToken::Less) &&
               isAltmacroString(StrLoc, EndLoc)) {
      const char *StrChar = StrLoc.getPointer();
      const char *EndChar = EndLoc.getPointer();
      jumpToLoc(EndLoc, CurBuffer);
      // Eat from '<' to '>'
      Lex();
      AsmToken newToken(AsmToken::String,
                        StringRef(StrChar, EndChar - StrChar));
      FA.Value.push_back(newToken);
    } else if (parseMacroArgument(FA.Value, Vararg))
      return true;

    // Slot the value goes into: the running position for positional
    // arguments, or the index of the named parameter for keyword arguments.
    unsigned PI = Parameter;
    if (!FA.Name.empty()) {
      unsigned FAI = 0;
      for (FAI = 0; FAI < NParameters; ++FAI)
        if (M->Parameters[FAI].Name == FA.Name)
          break;

      if (FAI >= NParameters) {
        assert(M && "expected macro to be defined");
        return Error(IDLoc, "parameter named '" + FA.Name +
                                "' does not exist for macro '" + M->Name + "'");
      }
      PI = FAI;
    }

    if (!FA.Value.empty()) {
      if (A.size() <= PI)
        A.resize(PI + 1);
      A[PI] = FA.Value;

      if (FALocs.size() <= PI)
        FALocs.resize(PI + 1);

      FALocs[PI] = Lexer.getLoc();
    }

    // At the end of the statement, fill in remaining arguments that have
    // default values. If there aren't any, then the next argument is
    // required but missing
    if (Lexer.is(AsmToken::EndOfStatement)) {
      bool Failure = false;
      for (unsigned FAI = 0; FAI < NParameters; ++FAI) {
        if (A[FAI].empty()) {
          if (M->Parameters[FAI].Required) {
            Error(FALocs[FAI].isValid() ? FALocs[FAI] : Lexer.getLoc(),
                  "missing value for required parameter "
                  "'" + M->Parameters[FAI].Name + "' in macro '" + M->Name + "'");
            Failure = true;
          }

          if (!M->Parameters[FAI].Value.empty())
            A[FAI] = M->Parameters[FAI].Value;
        }
      }
      return Failure;
    }

    if (Lexer.is(AsmToken::Comma))
      Lex();
  }

  // Only reachable when the macro declares parameters and more arguments
  // were supplied than there are parameters.
  return TokError("too many positional arguments");
}
bool AsmParser::handleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc) {
// Arbitrarily limit macro nesting depth (default matches 'as'). We can
// eliminate this, although we should protect against infinite loops.
unsigned MaxNestingDepth = AsmMacroMaxNestingDepth;
if (ActiveMacros.size() == MaxNestingDepth) {
std::ostringstream MaxNestingDepthError;
MaxNestingDepthError << "macros cannot be nested more than "
<< MaxNestingDepth << " levels deep."
<< " Use -asm-macro-max-nesting-depth to increase "
"this limit.";
return TokError(MaxNestingDepthError.str());
}
MCAsmMacroArguments A;
if (parseMacroArguments(M, A))
return true;
// Macro instantiation is lexical, unfortunately. We construct a new buffer
// to hold the macro body with substitutions.
SmallString<256> Buf;
StringRef Body = M->Body;
raw_svector_ostream OS(Buf);
if (expandMacro(OS, Body, M->Parameters, A, true, getTok().getLoc()))
return true;
// We include the .endmacro in the buffer as our cue to exit the macro
// instantiation.
OS << ".endmacro\n";
std::unique_ptr<MemoryBuffer> Instantiation =
MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>");
// Create the macro instantiation object and add to the current macro
// instantiation stack.
MacroInstantiation *MI = new MacroInstantiation(
NameLoc, CurBuffer, getTok().getLoc(), TheCondStack.size());
ActiveMacros.push_back(MI);
++NumOfMacroInstantiations;
// Jump to the macro instantiation and prime the lexer.
CurBuffer = SrcMgr.AddNewSourceBuffer(std::move(Instantiation), SMLoc());
Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer());
Lex();
return false;
}
void AsmParser::handleMacroExit() {
// Jump to the EndOfStatement we should return to, and consume it.
jumpToLoc(ActiveMacros.back()->ExitLoc, ActiveMacros.back()->ExitBuffer);
Lex();
// Pop the instantiation entry.
delete ActiveMacros.back();
ActiveMacros.pop_back();
}
/// Parse the right-hand side of an assignment to \p Name and emit it.
///
/// \param allow_redef forwarded to parseAssignmentExpression.
/// \param NoDeadStrip when set, the symbol is additionally marked with
///        MCSA_NoDeadStrip.
/// \returns true if parsing the assignment expression failed, else false.
bool AsmParser::parseAssignment(StringRef Name, bool allow_redef,
                                bool NoDeadStrip) {
  MCSymbol *Target = nullptr;
  const MCExpr *Expr = nullptr;
  const bool Failed = MCParserUtils::parseAssignmentExpression(
      Name, allow_redef, *this, Target, Expr);
  if (Failed)
    return true;

  // In the case where we parse an expression starting with a '.', we will
  // not generate an error, nor will we create a symbol; there is then
  // nothing to emit.
  if (Target) {
    Out.EmitAssignment(Target, Expr);
    if (NoDeadStrip)
      Out.EmitSymbolAttribute(Target, MCSA_NoDeadStrip);
  }
  return false;
}
/// parseIdentifier:
///   ::= identifier
///   ::= string
///
/// Stores the identifier text in \p Res and consumes it. A '$' or '@'
/// directly followed by an identifier is returned as one joined identifier.
///
/// \returns true (without reporting an error) when no identifier is present.
bool AsmParser::parseIdentifier(StringRef &Res) {
  const bool HasPrefix = Lexer.is(AsmToken::Dollar) || Lexer.is(AsmToken::At);

  if (!HasPrefix) {
    const bool IsName =
        Lexer.is(AsmToken::Identifier) || Lexer.is(AsmToken::String);
    if (!IsName)
      return true;

    Res = getTok().getIdentifier();
    Lex(); // Consume the identifier token.
    return false;
  }

  // The assembler has relaxed rules for accepting identifiers, in particular
  // we allow things like '.globl $foo' and '.def @feat.00', which would
  // normally be separate tokens. At this level, we have already lexed so we
  // cannot (currently) handle this as a context dependent token, instead we
  // detect adjacent tokens and return the combined identifier.
  const SMLoc PrefixLoc = getLexer().getLoc();

  // Look one token ahead without consuming the prefix yet.
  AsmToken Peeked[1];
  Lexer.peekTokens(Peeked, false);

  // The prefix must be followed by an identifier that starts on the very
  // next character.
  if (Peeked[0].isNot(AsmToken::Identifier) ||
      PrefixLoc.getPointer() + 1 != Peeked[0].getLoc().getPointer())
    return true;

  // eat $ or @
  Lexer.Lex(); // Lexer's Lex guarantees consecutive token.

  // The joined identifier starts at the prefix and is one character longer
  // than the identifier token itself.
  const size_t JoinedLen = getTok().getIdentifier().size() + 1;
  Res = StringRef(PrefixLoc.getPointer(), JoinedLen);
  Lex(); // Parser Lex to maintain invariants.
  return false;
}
/// parseDirectiveSet:
///   ::= .equ identifier ',' expression
///   ::= .equiv identifier ',' expression
///   ::= .set identifier ',' expression
///
/// \p IDVal is the directive spelling, used only in the error suffix.
bool AsmParser::parseDirectiveSet(StringRef IDVal, bool allow_redef) {
  StringRef SymName;

  // Each step only runs when the previous one succeeded.
  bool Failed = check(parseIdentifier(SymName), "expected identifier");
  Failed = Failed || parseToken(AsmToken::Comma);
  Failed = Failed || parseAssignment(SymName, allow_redef, true);

  if (!Failed)
    return false;
  return addErrorSuffix(" in '" + Twine(IDVal) + "' directive");
}
bool AsmParser::parseEscapedString(std::string &Data) {
if (check(getTok().isNot(AsmToken::String), "expected string"))
return true;
Data = "";
StringRef Str = getTok().getStringContents();
for (unsigned i = 0, e = Str.size(); i != e; ++i) {
if (Str[i] != '\\') {
Data += Str[i];
continue;
}
// Recognize escaped characters. Note that this escape semantics currently
// loosely follows Darwin 'as'. Notably, it doesn't support hex escapes.
++i;
if (i == e)
return TokError("unexpected backslash at end of string");
// Recognize octal sequences.
if ((unsigned)(Str[i] - '0') <= 7) {
// Consume up to three octal characters.
unsigned Value = Str[i] - '0';
if (i + 1 != e && ((unsigned)(Str[i + 1] - '0')) <= 7) {
++i;
Value = Value * 8 + (Str[i] - '0');
if (i + 1 != e && ((unsigned)(Str[i + 1] - '0')) <= 7) {
++i;
Value = Value * 8 + (Str[i] - '0');
}
}
if (Value > 255)
return TokError("invalid octal escape sequence (out of range)");
Data += (unsigned char)Value;
continue;
}
// Otherwise recognize individual escapes.
switch (Str[i]) {
default:
// Just reject invalid escape sequences for now.
return TokError("invalid escape sequence (unrecognized character)");
case 'b': Data += '\b'; break;
case 'f': Data += '\f'; break;
case 'n': Data += '\n'; break;
case 'r': Data += '\r'; break;
case 't': Data += '\t'; break;
case '"': Data += '"'; break;
case '\\': Data += '\\'; break;
}
}
Lex();
return false;
}
/// parseDirectiveAscii:
///   ::= ( .ascii | .asciz | .string ) [ "string" ( , "string" )* ]
///
/// Emits each decoded string, followed by a single zero byte when
/// \p ZeroTerminated is set. \p IDVal is used only in the error suffix.
bool AsmParser::parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated) {
  auto EmitOneString = [&]() -> bool {
    if (checkForValidSection())
      return true;

    std::string Bytes;
    if (parseEscapedString(Bytes))
      return true;

    getStreamer().EmitBytes(Bytes);
    if (ZeroTerminated) {
      // Explicit length so the NUL itself is part of the StringRef.
      getStreamer().EmitBytes(StringRef("\0", 1));
    }
    return false;
  };

  if (!parseMany(EmitOneString))
    return false;
  return addErrorSuffix(" in '" + Twine(IDVal) + "' directive");
}
/// parseDirectiveReloc
///   ::= .reloc expression , identifier [ , expression ]
///
/// The offset must be a constant or symbol reference and, when it evaluates
/// to an absolute value, non-negative. The optional trailing expression must
/// be relocatable.
bool AsmParser::parseDirectiveReloc(SMLoc DirectiveLoc) {
  const MCExpr *Offset;
  const MCExpr *Expr = nullptr;
  int64_t OffsetValue;
  const SMLoc OffsetLoc = Lexer.getTok().getLoc();

  if (parseExpression(Offset))
    return true;

  // Reject an offset that is known to be negative.
  if (Offset->evaluateAsAbsolute(OffsetValue,
                                 getStreamer().getAssemblerPtr()) &&
      check(OffsetValue < 0, OffsetLoc, "expression is negative"))
    return true;

  // Only plain constants and symbol references are accepted as offsets.
  const bool BadOffsetKind = Offset->getKind() != llvm::MCExpr::Constant &&
                             Offset->getKind() != llvm::MCExpr::SymbolRef;
  if (check(BadOffsetKind, OffsetLoc,
            "expected non-negative number or a label"))
    return true;

  if (parseToken(AsmToken::Comma, "expected comma"))
    return true;
  if (check(getTok().isNot(AsmToken::Identifier), "expected relocation name"))
    return true;

  const SMLoc NameLoc = Lexer.getTok().getLoc();
  const StringRef Name = Lexer.getTok().getIdentifier();
  Lex();

  // Optional ", expression".
  if (Lexer.is(AsmToken::Comma)) {
    Lex();
    const SMLoc ExprLoc = Lexer.getLoc();
    if (parseExpression(Expr))
      return true;

    MCValue Value;
    if (!Expr->evaluateAsRelocatable(Value, nullptr, nullptr))
      return Error(ExprLoc, "expression must be relocatable");
  }

  if (parseToken(AsmToken::EndOfStatement,
                 "unexpected token in .reloc directive"))
    return true;

  const MCTargetAsmParser &TargetParser = getTargetParser();
  const MCSubtargetInfo &STI = TargetParser.getSTI();
  // A true result from the streamer is reported as an unknown relocation name.
  const bool Rejected =
      getStreamer().EmitRelocDirective(*Offset, Name, Expr, DirectiveLoc, STI);
  if (Rejected)
    return Error(NameLoc, "unknown relocation name");

  return false;
}
/// parseDirectiveValue
///   ::= (.byte | .short | ... ) [ expression (, expression)* ]
///
/// Emits each expression as a \p Size byte value. \p IDVal is used only in
/// the error suffix.
bool AsmParser::parseDirectiveValue(StringRef IDVal, unsigned Size) {
  auto EmitOneValue = [&]() -> bool {
    const MCExpr *Expr;
    const SMLoc ExprLoc = getLexer().getLoc();
    if (checkForValidSection())
      return true;
    if (parseExpression(Expr))
      return true;

    const MCConstantExpr *Const = dyn_cast<MCConstantExpr>(Expr);
    if (!Const) {
      getStreamer().EmitValue(Expr, Size, ExprLoc);
      return false;
    }

    // Special case constant expressions to match code generator: the literal
    // has to fit in Size bytes either as an unsigned or as a signed value.
    assert(Size <= 8 && "Invalid size");
    const uint64_t IntValue = Const->getValue();
    const bool Fits =
        isUIntN(8 * Size, IntValue) || isIntN(8 * Size, IntValue);
    if (!Fits)
      return Error(ExprLoc, "out of range literal value");

    getStreamer().EmitIntValue(IntValue, Size);
    return false;
  };

  if (!parseMany(EmitOneValue))
    return false;
  return addErrorSuffix(" in '" + Twine(IDVal) + "' directive");
}
/// Parse an Integer or BigNum token of at most 128 bits and split it into a
/// high and a low 64-bit half.
///
/// \param hi receives the bits above the low 64 (0 when the value fits in 64
///        bits).
/// \param lo receives the low 64 bits.
/// \returns true after reporting an error, false on success.
static bool parseHexOcta(AsmParser &Asm, uint64_t &hi, uint64_t &lo) {
  const AsmToken &Tok = Asm.getTok();
  const bool IsNumber = Tok.is(AsmToken::Integer) || Tok.is(AsmToken::BigNum);
  if (!IsNumber)
    return Asm.TokError("unknown token in expression");

  // Copy what we need out of the token before it is consumed.
  const SMLoc LiteralLoc = Tok.getLoc();
  const APInt Literal = Tok.getAPIntVal();
  Asm.Lex();

  if (!Literal.isIntN(128))
    return Asm.Error(LiteralLoc, "out of range literal value");

  if (Literal.isIntN(64)) {
    hi = 0;
    lo = Literal.getZExtValue();
    return false;
  }

  hi = Literal.getHiBits(Literal.getBitWidth() - 64).getZExtValue();
  lo = Literal.getLoBits(64).getZExtValue();
  return false;
}
/// ParseDirectiveOctaValue
///   ::= .octa [ hexconstant (, hexconstant)* ]
///
/// Emits each constant as two 8-byte values, ordered according to the
/// target's endianness. \p IDVal is used only in the error suffix.
bool AsmParser::parseDirectiveOctaValue(StringRef IDVal) {
  auto EmitOneOcta = [&]() -> bool {
    if (checkForValidSection())
      return true;

    uint64_t hi, lo;
    if (parseHexOcta(*this, hi, lo))
      return true;

    // Little-endian targets get the low half first, others the high half.
    const bool LowFirst = MAI.isLittleEndian();
    getStreamer().EmitIntValue(LowFirst ? lo : hi, 8);
    getStreamer().EmitIntValue(LowFirst ? hi : lo, 8);
    return false;
  };

  if (!parseMany(EmitOneOcta))
    return false;
  return addErrorSuffix(" in '" + Twine(IDVal) + "' directive");
}
bool AsmParser::parseRealValue(const fltSemantics &Semantics, APInt &Res) {
// We don't truly support arithmetic on floating point expressions, so we
// have to manually parse unary prefixes.
bool IsNeg = false;
if (getLexer().is(AsmToken::Minus)) {
Lexer.Lex();
IsNeg = true;
} else if (getLexer().is(AsmToken::Plus))
Lexer.Lex();
if (Lexer.is(AsmToken::Error))
return TokError(Lexer.getErr());
if (Lexer.isNot(AsmToken::Integer) && Lexer.isNot(AsmToken::Real) &&
Lexer.isNot(AsmToken::Identifier))
return TokError("unexpected token in directive");
// Convert to an APFloat.
APFloat Value(Semantics);
StringRef IDVal = getTok().getString();
if (getLexer().is(AsmToken::Identifier)) {
if (!IDVal.compare_lower("infinity") || !IDVal.compare_lower("inf"))
Value = APFloat::getInf(Semantics);
else if (!IDVal.compare_lower("nan"))
Value = APFloat::getNaN(Semantics, false, ~0);
else
return TokError("invalid floating point literal");
} else if (Value.convertFromString(IDVal, APFloat::rmNearestTiesToEven) ==
APFloat::opInvalidOp)
return TokError("invalid floating point literal");
if (IsNeg)
Value.changeSign();
// Consume the numeric token.
Lex();
Res = Value.bitcastToAPInt();
return false;
}
/// parseDirectiveRealValue
///   ::= (.single | .double) [ expression (, expression)* ]
///
/// Each literal is emitted as an integer whose size in bytes is the bit width
/// of its encoding divided by eight. \p IDVal is used only in the error
/// suffix.
bool AsmParser::parseDirectiveRealValue(StringRef IDVal,
                                        const fltSemantics &Semantics) {
  auto EmitOneReal = [&]() -> bool {
    if (checkForValidSection())
      return true;

    APInt Bits;
    if (parseRealValue(Semantics, Bits))
      return true;

    const unsigned ByteSize = Bits.getBitWidth() / 8;
    getStreamer().EmitIntValue(Bits.getLimitedValue(), ByteSize);
    return false;
  };

  if (!parseMany(EmitOneReal))
    return false;
  return addErrorSuffix(" in '" + Twine(IDVal) + "' directive");
}
/// parseDirectiveZero
/// ::= .zero expression
bool AsmParser::parseDirectiveZero() {
SMLoc NumBytesLoc = Lexer.getLoc();
const MCExpr *NumBytes;
if (checkForValidSection() || parseExpression(NumBytes))
return true;
int64_t Val = 0;
if (getLexer().is(AsmToken::Comma)) {
Lex();
if (parseAbsoluteExpression(Val))
return true;
}
if (parseToken(AsmToken::EndOfStatement,
"unexpected token in '.zero' directive"))
return true;
getStreamer().emitFill(*NumBytes, Val, NumBytesLoc);
return false;
}
/// parseDirectiveFill
/// ::= .fill expression [ , expression [ , expression ] ]
bool AsmParser::parseDirectiveFill() {
SMLoc NumValuesLoc = Lexer.getLoc();
const MCExpr *NumValues;
if (checkForValidSection() || parseExpression(NumValues))
return true;
int64_t FillSize = 1;
int64_t FillExpr = 0;
SMLoc SizeLoc, ExprLoc;
if (parseOptionalToken(AsmToken::Comma)) {
SizeLoc = getTok().getLoc();
if (parseAbsoluteExpression(FillSize))
return true;
if (parseOptionalToken(AsmToken::Comma)) {
ExprLoc = getTok().getLoc();
if (parseAbsoluteExpression(FillExpr))
return true;
}
}
if (parseToken(AsmToken::EndOfStatement,
"unexpected token in '.fill' directive"))
return true;
if (FillSize < 0) {
Warning(SizeLoc, "'.fill' directive with negative size has no effect");
return false;
}
if (FillSize > 8) {
Warning(SizeLoc, "'.fill' directive with size greater than 8 has been truncated to 8");
FillSize = 8;
}
if (!isUInt<32>(FillExpr) && FillSize > 4)
Warning(ExprLoc, "'.fill' directive pattern has been truncated to 32-bits");
getStreamer().emitFill(*NumValues, FillSize, FillExpr, NumValuesLoc);
return false;
}
/// parseDirectiveOrg
/// ::= .org expression [ , expression ]
bool AsmParser::parseDirectiveOrg() {
const MCExpr *Offset;
SMLoc OffsetLoc = Lexer.getLoc();
if (checkForValidSection() || parseExpression(Offset))
return true;
// Parse optional fill expression.
int64_t FillExpr = 0;
if (parseOptionalToken(AsmToken::Comma))
if (parseAbsoluteExpression(FillExpr))
return addErrorSuffix(" in '.org' directive");
if (parseToken(AsmToken::EndOfStatement))
return addErrorSuffix(" in '.org' directive");
getStreamer().emitValueToOffset(Offset, FillExpr, OffsetLoc);
return false;
}
/// parseDirectiveAlign
///   ::= {.align, ...} expression [ , expression [ , expression ]]
///
/// Operands are the alignment, an optional fill value and an optional maximum
/// number of bytes to skip.
///
/// \param IsPow2    when set, the first operand is an exponent and the
///                  alignment is 2 to that power; otherwise it is a byte
///                  count that must be zero or a power of two.
/// \param ValueSize size in bytes of the fill value.
/// \returns true if an error was reported. An alignment is still emitted
///          after an invalid alignment or max-bytes value has been diagnosed.
bool AsmParser::parseDirectiveAlign(bool IsPow2, unsigned ValueSize) {
  SMLoc AlignmentLoc = getLexer().getLoc();
  int64_t Alignment;
  SMLoc MaxBytesLoc;
  bool HasFillExpr = false;
  int64_t FillExpr = 0;
  int64_t MaxBytesToFill = 0;

  auto parseAlign = [&]() -> bool {
    if (parseAbsoluteExpression(Alignment))
      return true;
    if (parseOptionalToken(AsmToken::Comma)) {
      // The fill expression can be omitted while specifying a maximum number of
      // alignment bytes, e.g:
      //  .align 3,,4
      if (getTok().isNot(AsmToken::Comma)) {
        HasFillExpr = true;
        if (parseAbsoluteExpression(FillExpr))
          return true;
      }
      if (parseOptionalToken(AsmToken::Comma))
        if (parseTokenLoc(MaxBytesLoc) ||
            parseAbsoluteExpression(MaxBytesToFill))
          return true;
    }
    return parseToken(AsmToken::EndOfStatement);
  };

  if (checkForValidSection())
    return addErrorSuffix(" in directive");
  // Ignore empty '.p2align' directives for GNU-as compatibility
  if (IsPow2 && (ValueSize == 1) && getTok().is(AsmToken::EndOfStatement)) {
    Warning(AlignmentLoc, "p2align directive with no operand(s) is ignored");
    return parseToken(AsmToken::EndOfStatement);
  }
  if (parseAlign())
    return addErrorSuffix(" in directive");

  // Always emit an alignment here even if we thrown an error.
  bool ReturnVal = false;

  // Compute alignment in bytes.
  if (IsPow2) {
    // FIXME: Diagnose overflow.
    // Exponents outside [0, 31] are rejected. Negative values are included
    // because shifting by a negative count below is undefined behaviour. The
    // value is clamped into range so that an alignment can still be emitted.
    if (Alignment < 0 || Alignment >= 32) {
      ReturnVal |= Error(AlignmentLoc, "invalid alignment value");
      Alignment = Alignment < 0 ? 0 : 31;
    }

    Alignment = 1ULL << Alignment;
  } else {
    // Reject alignments that aren't either a power of two or zero,
    // for gas compatibility. Alignment of zero is silently rounded
    // up to one.
    if (Alignment == 0)
      Alignment = 1;
    if (!isPowerOf2_64(Alignment))
      ReturnVal |= Error(AlignmentLoc, "alignment must be a power of 2");
  }

  // Diagnose non-sensical max bytes to align.
  if (MaxBytesLoc.isValid()) {
    if (MaxBytesToFill < 1) {
      ReturnVal |= Error(MaxBytesLoc,
                         "alignment directive can never be satisfied in this "
                         "many bytes, ignoring maximum bytes expression");
      MaxBytesToFill = 0;
    }

    if (MaxBytesToFill >= Alignment) {
      Warning(MaxBytesLoc, "maximum bytes expression exceeds alignment and "
                           "has no effect");
      MaxBytesToFill = 0;
    }
  }

  // Check whether we should use optimal code alignment for this .align
  // directive.
  const MCSection *Section = getStreamer().getCurrentSectionOnly();
  assert(Section && "must have section to emit alignment");
  bool UseCodeAlign = Section->UseCodeAlign();
  if ((!HasFillExpr || Lexer.getMAI().getTextAlignFillValue() == FillExpr) &&
      ValueSize == 1 && UseCodeAlign) {
    getStreamer().EmitCodeAlignment(Alignment, MaxBytesToFill);
  } else {
    // FIXME: Target specific behavior about how the "extra" bytes are filled.
    getStreamer().EmitValueToAlignment(Alignment, FillExpr, ValueSize,
                                       MaxBytesToFill);
  }

  return ReturnVal;
}
/// parseDirectiveFile
/// ::= .file filename
/// ::= .file number [directory] filename [md5 checksum] [source source-text]
///
/// Without a file number, the filename is passed to EmitFileDirective, and
/// only when the target's asm info reports hasSingleParameterDotFile().
/// With a file number, a DWARF file entry is emitted (file 0 requires
/// DWARF version 5 or later).
///
/// Returns true after reporting an error, false otherwise; the warning paths
/// return the result of Warning().
bool AsmParser::parseDirectiveFile(SMLoc DirectiveLoc) {
// FIXME: I'm not sure what this is.
// -1 means that no file number was given.
int64_t FileNumber = -1;
if (getLexer().is(AsmToken::Integer)) {
FileNumber = getTok().getIntVal();
Lex();
if (FileNumber < 0)
return TokError("negative file number");
}
std::string Path;
// Usually the directory and filename together, otherwise just the directory.
// Allow the strings to have escaped octal character sequence.
if (check(getTok().isNot(AsmToken::String),
"unexpected token in '.file' directive") ||
parseEscapedString(Path))
return true;
StringRef Directory;
StringRef Filename;
std::string FilenameData;
// A second string means the first one was the directory and this one is the
// filename; that form is only accepted together with a file number.
if (getLexer().is(AsmToken::String)) {
if (check(FileNumber == -1,
"explicit path specified, but no file number") ||
parseEscapedString(FilenameData))
return true;
Filename = FilenameData;
Directory = Path;
} else {
Filename = Path;
}
// MD5Hi/MD5Lo are only read below when HasMD5 is set, i.e. after
// parseHexOcta has filled them in.
uint64_t MD5Hi, MD5Lo;
bool HasMD5 = false;
Optional<StringRef> Source;
bool HasSource = false;
std::string SourceString;
// Optional trailing "md5 <value>" and "source <string>" clauses, in any
// order, up to the end of the statement. Both require a file number.
while (!parseOptionalToken(AsmToken::EndOfStatement)) {
StringRef Keyword;
if (check(getTok().isNot(AsmToken::Identifier),
"unexpected token in '.file' directive") ||
parseIdentifier(Keyword))
return true;
if (Keyword == "md5") {
HasMD5 = true;
if (check(FileNumber == -1,
"MD5 checksum specified, but no file number") ||
parseHexOcta(*this, MD5Hi, MD5Lo))
return true;
} else if (Keyword == "source") {
HasSource = true;
if (check(FileNumber == -1,
"source specified, but no file number") ||
check(getTok().isNot(AsmToken::String),
"unexpected token in '.file' directive") ||
parseEscapedString(SourceString))
return true;
} else {
return TokError("unexpected token in '.file' directive");
}
}
if (FileNumber == -1) {
// Ignore the directive if there is no number and the target doesn't support
// numberless .file directives. This allows some portability of assembler
// between different object file formats.
if (getContext().getAsmInfo()->hasSingleParameterDotFile())
getStreamer().EmitFileDirective(Filename);
} else {
// In case there is a -g option as well as debug info from directive .file,
// we turn off the -g option, directly use the existing debug info instead.
// Throw away any implicit file table for the assembler source.
if (Ctx.getGenDwarfForAssembly()) {
Ctx.getMCDwarfLineTable(0).resetFileTable();
Ctx.setGenDwarfForAssembly(false);
}
Optional<MD5::MD5Result> CKMem;
if (HasMD5) {
MD5::MD5Result Sum;
// Store the 128-bit value most significant byte first: bytes 0-7 come
// from MD5Hi, bytes 8-15 from MD5Lo.
for (unsigned i = 0; i != 8; ++i) {
Sum.Bytes[i] = uint8_t(MD5Hi >> ((7 - i) * 8));
Sum.Bytes[i + 8] = uint8_t(MD5Lo >> ((7 - i) * 8));
}
CKMem = Sum;
}
if (HasSource) {
// Copy the source text into storage obtained from the context, so that
// the StringRef does not point into the local SourceString.
char *SourceBuf = static_cast<char *>(Ctx.allocate(SourceString.size()));
memcpy(SourceBuf, SourceString.data(), SourceString.size());
Source = StringRef(SourceBuf, SourceString.size());
}
if (FileNumber == 0) {
if (Ctx.getDwarfVersion() < 5)
return Warning(DirectiveLoc, "file 0 not supported prior to DWARF-5");
getStreamer().emitDwarfFile0Directive(Directory, Filename, CKMem, Source);
} else {
Expected<unsigned> FileNumOrErr = getStreamer().tryEmitDwarfFileDirective(
FileNumber, Directory, Filename, CKMem, Source);
if (!FileNumOrErr)
return Error(DirectiveLoc, toString(FileNumOrErr.takeError()));
}
// Alert the user if there are some .file directives with MD5 and some not.
// But only do that once.
if (!ReportedInconsistentMD5 && !Ctx.isDwarfMD5UsageConsistent(0)) {
ReportedInconsistentMD5 = true;
return Warning(DirectiveLoc, "inconsistent use of MD5 checksums");
}
}
return false;
}
/// parseDirectiveLine
///   ::= .line [number]
///
/// The optional line number is parsed and validated as an integer token but is
/// otherwise discarded. Returns true if an error was reported.
bool AsmParser::parseDirectiveLine() {
  if (getLexer().is(AsmToken::Integer)) {
    int64_t IgnoredLine;
    if (parseIntToken(IgnoredLine, "unexpected token in '.line' directive"))
      return true;
    // FIXME: Do something with the .line.
    (void)IgnoredLine;
  }
  return parseToken(AsmToken::EndOfStatement,
                    "unexpected token in '.line' directive");
}
/// parseDirectiveLoc
/// ::= .loc FileNumber [LineNumber] [ColumnPos] [basic_block] [prologue_end]
/// [epilogue_begin] [is_stmt VALUE] [isa VALUE]
/// The first number is a file number, must have been previously assigned with
/// a .file directive, the second number is the line number and optionally the
/// third number is a column position (zero if not specified). The remaining
/// optional items are .loc sub-directives.
bool AsmParser::parseDirectiveLoc() {
int64_t FileNumber = 0, LineNumber = 0;
SMLoc Loc = getTok().getLoc();
if (parseIntToken(FileNumber, "unexpected token in '.loc' directive") ||
check(FileNumber < 1 && Ctx.getDwarfVersion() < 5, Loc,
"file number less than one in '.loc' directive") ||
check(!getContext().isValidDwarfFileNumber(FileNumber), Loc,
"unassigned file number in '.loc' directive"))
return true;
// optional
if (getLexer().is(AsmToken::Integer)) {
LineNumber = getTok().getIntVal();
if (LineNumber < 0)
return TokError("line number less than zero in '.loc' directive");
Lex();
}
int64_t ColumnPos = 0;
if (getLexer().is(AsmToken::Integer)) {
ColumnPos = getTok().getIntVal();
if (ColumnPos < 0)
return TokError("column position less than zero in '.loc' directive");
Lex();
}
unsigned Flags = DWARF2_LINE_DEFAULT_IS_STMT ? DWARF2_FLAG_IS_STMT : 0;
unsigned Isa = 0;
int64_t Discriminator = 0;
auto parseLocOp = [&]() -> bool {
StringRef Name;
SMLoc Loc = getTok().getLoc();
if (parseIdentifier(Name))
return TokError("unexpected token in '.loc' directive");
if (Name == "basic_block")
Flags |= DWARF2_FLAG_BASIC_BLOCK;
else if (Name == "prologue_end")
Flags |= DWARF2_FLAG_PROLOGUE_END;
else if (Name == "epilogue_begin")
Flags |= DWARF2_FLAG_EPILOGUE_BEGIN;
else if (Name == "is_stmt") {
Loc = getTok().getLoc();
const MCExpr *Value;
if (parseExpression(Value))
return true;
// The expression must be the constant 0 or 1.
if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value)) {
int Value = MCE->getValue();
if (Value == 0)
Flags &= ~DWARF2_FLAG_IS_STMT;
else if (Value == 1)
Flags |= DWARF2_FLAG_IS_STMT;
else
return Error(Loc, "is_stmt value not 0 or 1");
} else {
return Error(Loc, "is_stmt value not the constant value of 0 or 1");
}
} else if (Name == "isa") {
Loc = getTok().getLoc();
const MCExpr *Value;
if (parseExpression(Value))
return true;
// The expression must be a constant greater or equal to 0.
if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value)) {
int Value = MCE->getValue();
if (Value < 0)
return Error(Loc, "isa number less than zero");
Isa = Value;
} else {
return Error(Loc, "isa number not a constant value");
}
} else if (Name == "discriminator") {
if (parseAbsoluteExpression(Discriminator))
return true;
} else {
return Error(Loc, "unknown sub-directive in '.loc' directive");
}
return false;
};
if (parseMany(parseLocOp, false /*hasComma*/))
return true;
getStreamer().EmitDwarfLocDirective(FileNumber, LineNumber, ColumnPos, Flags,
Isa, Discriminator, StringRef());
return false;
}
/// parseDirectiveStabs
/// ::= .stabs string, number, number, number
///
/// The '.stabs' directive is not supported: no operands are parsed and an
/// error is always reported via TokError.
bool AsmParser::parseDirectiveStabs() {
return TokError("unsupported directive '.stabs'");
}
/// parseDirectiveCVFile
///   ::= .cv_file number filename [checksum] [checksumkind]
///
/// Parses a file number (must be >= 1) and a quoted filename, optionally
/// followed by a quoted hex checksum string and an integer checksum kind. The
/// decoded checksum bytes are copied into storage allocated from the context
/// before being handed to the streamer. Returns true if an error was reported.
bool AsmParser::parseDirectiveCVFile() {
  SMLoc FileNumberLoc = getTok().getLoc();
  int64_t FileNumber;
  std::string Filename;

  // Mandatory operands: file number and filename.
  if (parseIntToken(FileNumber,
                    "expected file number in '.cv_file' directive"))
    return true;
  if (check(FileNumber < 1, FileNumberLoc, "file number less than one"))
    return true;
  if (check(getTok().isNot(AsmToken::String),
            "unexpected token in '.cv_file' directive"))
    return true;
  if (parseEscapedString(Filename))
    return true;

  // Optional operands: checksum string followed by checksum kind.
  std::string Checksum;
  int64_t ChecksumKind = 0;
  if (!parseOptionalToken(AsmToken::EndOfStatement)) {
    if (check(getTok().isNot(AsmToken::String),
              "unexpected token in '.cv_file' directive"))
      return true;
    if (parseEscapedString(Checksum))
      return true;
    if (parseIntToken(ChecksumKind,
                      "expected checksum kind in '.cv_file' directive"))
      return true;
    if (parseToken(AsmToken::EndOfStatement,
                   "unexpected token in '.cv_file' directive"))
      return true;
  }

  // Decode the hex text and copy the raw bytes into context-owned memory.
  Checksum = fromHex(Checksum);
  const size_t NumBytes = Checksum.size();
  void *CKMem = Ctx.allocate(NumBytes, 1);
  memcpy(CKMem, Checksum.data(), NumBytes);
  ArrayRef<uint8_t> ChecksumAsBytes(reinterpret_cast<const uint8_t *>(CKMem),
                                    NumBytes);

  if (getStreamer().EmitCVFileDirective(FileNumber, Filename, ChecksumAsBytes,
                                        static_cast<uint8_t>(ChecksumKind)))
    return false;
  return Error(FileNumberLoc, "file number already allocated");
}
/// Parse an integer CodeView function id into \p FunctionId on behalf of the
/// directive named \p DirectiveName (used only in diagnostics). The id must
/// lie in [0, UINT_MAX). Returns true if an error was reported.
bool AsmParser::parseCVFunctionId(int64_t &FunctionId,
                                  StringRef DirectiveName) {
  SMLoc Loc;
  if (parseTokenLoc(Loc))
    return true;
  if (parseIntToken(FunctionId, "expected function id in '" + DirectiveName +
                                    "' directive"))
    return true;
  const bool OutOfRange = FunctionId < 0 || FunctionId >= UINT_MAX;
  return check(OutOfRange, Loc,
               "expected function id within range [0, UINT_MAX)");
}
/// Parse an integer CodeView file number into \p FileNumber on behalf of the
/// directive named \p DirectiveName (used only in diagnostics). The number
/// must be at least one and must be a file number the CodeView context
/// reports as valid. Returns true if an error was reported.
bool AsmParser::parseCVFileId(int64_t &FileNumber, StringRef DirectiveName) {
  SMLoc Loc;
  if (parseTokenLoc(Loc))
    return true;
  if (parseIntToken(FileNumber,
                    "expected integer in '" + DirectiveName + "' directive"))
    return true;
  if (check(FileNumber < 1, Loc,
            "file number less than one in '" + DirectiveName + "' directive"))
    return true;
  return check(!getCVContext().isValidFileNumber(FileNumber), Loc,
               "unassigned file number in '" + DirectiveName + "' directive");
}
/// parseDirectiveCVFuncId
///   ::= .cv_func_id FunctionId
///
/// Introduces a function ID that can be used with .cv_loc. Reports an error
/// if the streamer refuses the id.
bool AsmParser::parseDirectiveCVFuncId() {
  SMLoc FunctionIdLoc = getTok().getLoc();
  int64_t FunctionId;

  if (parseCVFunctionId(FunctionId, ".cv_func_id"))
    return true;
  if (parseToken(AsmToken::EndOfStatement,
                 "unexpected token in '.cv_func_id' directive"))
    return true;

  if (getStreamer().EmitCVFuncIdDirective(FunctionId))
    return false;
  return Error(FunctionIdLoc, "function id already allocated");
}
/// parseDirectiveCVInlineSiteId
///   ::= .cv_inline_site_id FunctionId
///         "within" IAFunc
///         "inlined_at" IAFile IALine [IACol]
///
/// Introduces a function ID that can be used with .cv_loc. Includes "inlined
/// at" source location information for use in the line table of the caller,
/// whether the caller is a real function or another inlined call site.
bool AsmParser::parseDirectiveCVInlineSiteId() {
  SMLoc FunctionIdLoc = getTok().getLoc();

  // FunctionId
  int64_t FunctionId;
  if (parseCVFunctionId(FunctionId, ".cv_inline_site_id"))
    return true;

  // "within"
  const bool SawWithin = getLexer().is(AsmToken::Identifier) &&
                         getTok().getIdentifier() == "within";
  if (check(!SawWithin,
            "expected 'within' identifier in '.cv_inline_site_id' directive"))
    return true;
  Lex();

  // IAFunc
  int64_t IAFunc;
  if (parseCVFunctionId(IAFunc, ".cv_inline_site_id"))
    return true;

  // "inlined_at"
  const bool SawInlinedAt = getLexer().is(AsmToken::Identifier) &&
                            getTok().getIdentifier() == "inlined_at";
  if (check(!SawInlinedAt,
            "expected 'inlined_at' identifier in '.cv_inline_site_id' "
            "directive"))
    return true;
  Lex();

  // IAFile IALine
  int64_t IAFile;
  int64_t IALine;
  if (parseCVFileId(IAFile, ".cv_inline_site_id"))
    return true;
  if (parseIntToken(IALine, "expected line number after 'inlined_at'"))
    return true;

  // [IACol] defaults to zero when omitted.
  int64_t IACol = 0;
  if (getLexer().is(AsmToken::Integer)) {
    IACol = getTok().getIntVal();
    Lex();
  }

  if (parseToken(AsmToken::EndOfStatement,
                 "unexpected token in '.cv_inline_site_id' directive"))
    return true;

  if (getStreamer().EmitCVInlineSiteIdDirective(FunctionId, IAFunc, IAFile,
                                                IALine, IACol, FunctionIdLoc))
    return false;
  return Error(FunctionIdLoc, "function id already allocated");
}
/// parseDirectiveCVLoc
///   ::= .cv_loc FunctionId FileNumber [LineNumber] [ColumnPos] [prologue_end]
///       [is_stmt VALUE]
/// The first number is a function id and the second a file number; they are
/// validated by parseCVFunctionId and parseCVFileId respectively. The optional
/// third number is the line number and the optional fourth number is a column
/// position (both zero if not specified). The remaining optional items are
/// .cv_loc sub-directives.
bool AsmParser::parseDirectiveCVLoc() {
  SMLoc DirectiveLoc = getTok().getLoc();
  int64_t FunctionId, FileNumber;
  if (parseCVFunctionId(FunctionId, ".cv_loc"))
    return true;
  if (parseCVFileId(FileNumber, ".cv_loc"))
    return true;

  // Optional line number.
  int64_t LineNumber = 0;
  if (getLexer().is(AsmToken::Integer)) {
    LineNumber = getTok().getIntVal();
    if (LineNumber < 0)
      return TokError("line number less than zero in '.cv_loc' directive");
    Lex();
  }

  // Optional column position.
  int64_t ColumnPos = 0;
  if (getLexer().is(AsmToken::Integer)) {
    ColumnPos = getTok().getIntVal();
    if (ColumnPos < 0)
      return TokError("column position less than zero in '.cv_loc' directive");
    Lex();
  }

  bool PrologueEnd = false;
  uint64_t IsStmt = 0;

  // Parses one sub-directive; returns true if an error was reported.
  auto parseOp = [&]() -> bool {
    StringRef Name;
    SMLoc Loc = getTok().getLoc();
    if (parseIdentifier(Name))
      return TokError("unexpected token in '.cv_loc' directive");

    if (Name == "prologue_end") {
      PrologueEnd = true;
      return false;
    }
    if (Name != "is_stmt")
      return Error(Loc, "unknown sub-directive in '.cv_loc' directive");

    Loc = getTok().getLoc();
    const MCExpr *Value;
    if (parseExpression(Value))
      return true;
    // The expression must be the constant 0 or 1. A non-constant expression
    // is mapped to an all-ones sentinel so the range check below rejects it.
    if (const auto *MCE = dyn_cast<MCConstantExpr>(Value))
      IsStmt = MCE->getValue();
    else
      IsStmt = ~0ULL;
    if (IsStmt > 1)
      return Error(Loc, "is_stmt value not 0 or 1");
    return false;
  };

  if (parseMany(parseOp, false /*hasComma*/))
    return true;

  getStreamer().EmitCVLocDirective(FunctionId, FileNumber, LineNumber,
                                   ColumnPos, PrologueEnd, IsStmt, StringRef(),
                                   DirectiveLoc);
  return false;
}
/// parseDirectiveCVLinetable
///   ::= .cv_linetable FunctionId, FnStart, FnEnd
///
/// Parses the function id and the two comma-separated symbol names, then asks
/// the streamer to emit the line table delimited by those symbols.
bool AsmParser::parseDirectiveCVLinetable() {
  int64_t FunctionId;
  StringRef FnStartName, FnEndName;
  SMLoc Loc = getTok().getLoc();

  if (parseCVFunctionId(FunctionId, ".cv_linetable"))
    return true;

  // ", FnStart"
  if (parseToken(AsmToken::Comma,
                 "unexpected token in '.cv_linetable' directive"))
    return true;
  if (parseTokenLoc(Loc))
    return true;
  if (check(parseIdentifier(FnStartName), Loc,
            "expected identifier in directive"))
    return true;

  // ", FnEnd"
  if (parseToken(AsmToken::Comma,
                 "unexpected token in '.cv_linetable' directive"))
    return true;
  if (parseTokenLoc(Loc))
    return true;
  if (check(parseIdentifier(FnEndName), Loc,
            "expected identifier in directive"))
    return true;

  MCSymbol *FnStartSym = getContext().getOrCreateSymbol(FnStartName);
  MCSymbol *FnEndSym = getContext().getOrCreateSymbol(FnEndName);
  getStreamer().EmitCVLinetableDirective(FunctionId, FnStartSym, FnEndSym);
  return false;
}
/// parseDirectiveCVInlineLinetable
///   ::= .cv_inline_linetable PrimaryFunctionId FileId LineNum FnStart FnEnd
///
/// The file id must be positive and the line number non-negative. Returns
/// true if an error was reported.
bool AsmParser::parseDirectiveCVInlineLinetable() {
  int64_t PrimaryFunctionId, SourceFileId, SourceLineNum;
  StringRef FnStartName, FnEndName;
  SMLoc Loc = getTok().getLoc();

  if (parseCVFunctionId(PrimaryFunctionId, ".cv_inline_linetable"))
    return true;

  // FileId
  if (parseTokenLoc(Loc))
    return true;
  if (parseIntToken(SourceFileId,
                    "expected SourceField in '.cv_inline_linetable' directive"))
    return true;
  if (check(SourceFileId <= 0, Loc,
            "File id less than zero in '.cv_inline_linetable' directive"))
    return true;

  // LineNum
  if (parseTokenLoc(Loc))
    return true;
  if (parseIntToken(
          SourceLineNum,
          "expected SourceLineNum in '.cv_inline_linetable' directive"))
    return true;
  if (check(SourceLineNum < 0, Loc,
            "Line number less than zero in '.cv_inline_linetable' directive"))
    return true;

  // FnStart
  if (parseTokenLoc(Loc))
    return true;
  if (check(parseIdentifier(FnStartName), Loc,
            "expected identifier in directive"))
    return true;

  // FnEnd
  if (parseTokenLoc(Loc))
    return true;
  if (check(parseIdentifier(FnEndName), Loc,
            "expected identifier in directive"))
    return true;

  if (parseToken(AsmToken::EndOfStatement, "Expected End of Statement"))
    return true;

  MCSymbol *FnStartSym = getContext().getOrCreateSymbol(FnStartName);
  MCSymbol *FnEndSym = getContext().getOrCreateSymbol(FnEndName);
  getStreamer().EmitCVInlineLinetableDirective(PrimaryFunctionId, SourceFileId,
                                               SourceLineNum, FnStartSym,
                                               FnEndSym);
  return false;
}
/// parseDirectiveCVDefRange
/// ::= .cv_def_range RangeStart RangeEnd (GapStart GapEnd)*, bytes*
bool AsmParser::parseDirectiveCVDefRange() {
SMLoc Loc;
std::vector<std::pair<const MCSymbol *, const MCSymbol *>> Ranges;
while (getLexer().is(AsmToken::Identifier)) {
Loc = getLexer().getLoc();
StringRef GapStartName;
if (parseIdentifier(GapStartName))
return Error(Loc, "expected identifier in directive");
MCSymbol *GapStartSym = getContext().getOrCreateSymbol(GapStartName);
Loc = getLexer().getLoc();
StringRef GapEndName;
if (parseIdentifier(GapEndName))
return Error(Loc, "expected identifier in directive");
MCSymbol *GapEndSym = getContext().getOrCreateSymbol(GapEndName);
Ranges.push_back({GapStartSym, GapEndSym});
}
std::string FixedSizePortion;
if (parseToken(AsmToken::Comma, "unexpected token in directive") ||
parseEscapedString(FixedSizePortion))
return true;
getStreamer().EmitCVDefRangeDirective(Ranges, FixedSizePortion);
return false;
}
/// parseDirectiveCVString
///   ::= .cv_string "string"
///
/// Adds the string to the CodeView string table and emits the value returned
/// for it by the table as a 4-byte integer.
bool AsmParser::parseDirectiveCVString() {
  std::string Data;
  const bool Failed = checkForValidSection() || parseEscapedString(Data);
  if (Failed)
    return addErrorSuffix(" in '.cv_string' directive");

  // Put the string in the table and emit the offset.
  const unsigned Offset = getCVContext().addToStringTable(Data).second;
  getStreamer().EmitIntValue(Offset, 4);
  return false;
}
/// parseDirectiveCVStringTable
/// ::= .cv_stringtable
///
/// Takes no operands: forwards directly to the streamer and always returns
/// false (no error). No tokens are consumed or checked here.
bool AsmParser::parseDirectiveCVStringTable() {
getStreamer().EmitCVStringTableDirective();
return false;
}
/// parseDirectiveCVFileChecksums
/// ::= .cv_filechecksums
///
/// Takes no operands: forwards directly to the streamer and always returns
/// false (no error). No tokens are consumed or checked here.
bool AsmParser::parseDirectiveCVFileChecksums() {
getStreamer().EmitCVFileChecksumsDirective();
return false;
}
/// parseDirectiveCVFileChecksumOffset
///   ::= .cv_filechecksumoffset fileno
///
/// Parses a single integer file number followed by the end of the statement
/// and forwards the number to the streamer.
bool AsmParser::parseDirectiveCVFileChecksumOffset() {
  int64_t FileNo;
  if (parseIntToken(FileNo, "expected identifier in directive") ||
      parseToken(AsmToken::EndOfStatement, "Expected End of Statement"))
    return true;

  getStreamer().EmitCVFileChecksumOffsetDirective(FileNo);
  return false;
}
/// parseDirectiveCVFPOData
///   ::= .cv_fpo_data procsym
///
/// Parses the procedure symbol name and passes the resolved symbol, together
/// with the location recorded at entry, to the streamer.
bool AsmParser::parseDirectiveCVFPOData() {
  // Capture the location before any operand is consumed.
  SMLoc DirLoc = getLexer().getLoc();

  StringRef ProcName;
  if (parseIdentifier(ProcName))
    return TokError("expected symbol name");
  if (parseEOL("unexpected tokens"))
    return addErrorSuffix(" in '.cv_fpo_data' directive");

  getStreamer().EmitCVFPOData(getContext().getOrCreateSymbol(ProcName), DirLoc);
  return false;
}
/// parseDirectiveCFISections
///   ::= .cfi_sections section [, section]
///
/// Recognizes the names ".eh_frame" and ".debug_frame"; any other identifier
/// is accepted and ignored. At most two names are parsed.
bool AsmParser::parseDirectiveCFISections() {
  bool EH = false;
  bool Debug = false;

  // Parses one section name and records which frame section it selects.
  auto parseSectionName = [&]() -> bool {
    StringRef Name;
    if (parseIdentifier(Name))
      return TokError("Expected an identifier");
    if (Name == ".eh_frame")
      EH = true;
    else if (Name == ".debug_frame")
      Debug = true;
    return false;
  };

  if (parseSectionName())
    return true;

  if (getLexer().is(AsmToken::Comma)) {
    Lex();
    if (parseSectionName())
      return true;
  }

  getStreamer().EmitCFISections(EH, Debug);
  return false;
}
/// parseDirectiveCFIStartProc
/// ::= .cfi_startproc [simple]
bool AsmParser::parseDirectiveCFIStartProc() {
StringRef Simple;
if (!parseOptionalToken(AsmToken::EndOfStatement)) {
if (check(parseIdentifier(Simple) || Simple != "simple",
"unexpected token") ||
parseToken(AsmToken::EndOfStatement))
return addErrorSuffix(" in '.cfi_startproc' directive");
}
// TODO(kristina): Deal with a corner case of incorrect diagnostic context
// being produced if this directive is emitted as part of preprocessor macro
// expansion which can *ONLY* happen if Clang's cc1as is the API consumer.
// Tools like llvm-mc on the other hand are not affected by it, and report
// correct context information.
getStreamer().EmitCFIStartProc(!Simple.empty(), Lexer.getLoc());
return false;
}
/// parseDirectiveCFIEndProc
/// ::= .cfi_endproc
///
/// Takes no operands: forwards directly to the streamer and always returns
/// false (no error). No tokens are consumed or checked here.
bool AsmParser::parseDirectiveCFIEndProc() {
getStreamer().EmitCFIEndProc();
return false;
}
/// Parse a register operand given either as a number or as a register name.
///
/// If the next token is an integer, the operand is parsed as an absolute
/// expression and stored in \p Register unchanged. Otherwise the target
/// parser parses a register name, which is translated to its DWARF register
/// number. Returns true if an error was reported.
bool AsmParser::parseRegisterOrRegisterNumber(int64_t &Register,
                                              SMLoc DirectiveLoc) {
  // Numeric form: the value is used directly.
  if (getLexer().is(AsmToken::Integer))
    return parseAbsoluteExpression(Register);

  // Named form: let the target resolve it, then map to the DWARF number.
  unsigned RegNo;
  if (getTargetParser().ParseRegister(RegNo, DirectiveLoc, DirectiveLoc))
    return true;
  Register = getContext().getRegisterInfo()->getDwarfRegNum(RegNo, true);
  return false;
}
/// parseDirectiveCFIDefCfa
///   ::= .cfi_def_cfa register, offset
///
/// Returns true if an error was reported while parsing either operand.
bool AsmParser::parseDirectiveCFIDefCfa(SMLoc DirectiveLoc) {
  int64_t Register = 0;
  if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
    return true;
  if (parseToken(AsmToken::Comma, "unexpected token in directive"))
    return true;

  int64_t Offset = 0;
  if (parseAbsoluteExpression(Offset))
    return true;

  getStreamer().EmitCFIDefCfa(Register, Offset);
  return false;
}
/// parseDirectiveCFIDefCfaOffset
///   ::= .cfi_def_cfa_offset offset
///
/// Returns true if the offset could not be parsed as an absolute expression.
bool AsmParser::parseDirectiveCFIDefCfaOffset() {
  int64_t Offset = 0;
  if (!parseAbsoluteExpression(Offset)) {
    getStreamer().EmitCFIDefCfaOffset(Offset);
    return false;
  }
  return true;
}
/// parseDirectiveCFIRegister
///   ::= .cfi_register register, register
///
/// Returns true if an error was reported while parsing either register.
bool AsmParser::parseDirectiveCFIRegister(SMLoc DirectiveLoc) {
  int64_t Register1 = 0;
  if (parseRegisterOrRegisterNumber(Register1, DirectiveLoc))
    return true;
  if (parseToken(AsmToken::Comma, "unexpected token in directive"))
    return true;

  int64_t Register2 = 0;
  if (parseRegisterOrRegisterNumber(Register2, DirectiveLoc))
    return true;

  getStreamer().EmitCFIRegister(Register1, Register2);
  return false;
}
/// parseDirectiveCFIWindowSave
/// ::= .cfi_window_save
///
/// Takes no operands: forwards directly to the streamer and always returns
/// false (no error). No tokens are consumed or checked here.
bool AsmParser::parseDirectiveCFIWindowSave() {
getStreamer().EmitCFIWindowSave();
return false;
}
/// parseDirectiveCFIAdjustCfaOffset
///   ::= .cfi_adjust_cfa_offset adjustment
///
/// Returns true if the adjustment could not be parsed as an absolute
/// expression.
bool AsmParser::parseDirectiveCFIAdjustCfaOffset() {
  int64_t Adjustment = 0;
  if (!parseAbsoluteExpression(Adjustment)) {
    getStreamer().EmitCFIAdjustCfaOffset(Adjustment);
    return false;
  }
  return true;
}
/// parseDirectiveCFIDefCfaRegister
///   ::= .cfi_def_cfa_register register
///
/// Returns true if the register operand could not be parsed.
bool AsmParser::parseDirectiveCFIDefCfaRegister(SMLoc DirectiveLoc) {
  int64_t Register = 0;
  if (!parseRegisterOrRegisterNumber(Register, DirectiveLoc)) {
    getStreamer().EmitCFIDefCfaRegister(Register);
    return false;
  }
  return true;
}
/// parseDirectiveCFIOffset
///   ::= .cfi_offset register, offset
///
/// Returns true if an error was reported while parsing either operand.
bool AsmParser::parseDirectiveCFIOffset(SMLoc DirectiveLoc) {
  int64_t Register = 0;
  if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
    return true;
  if (parseToken(AsmToken::Comma, "unexpected token in directive"))
    return true;

  int64_t Offset = 0;
  if (parseAbsoluteExpression(Offset))
    return true;

  getStreamer().EmitCFIOffset(Register, Offset);
  return false;
}
/// parseDirectiveCFIRelOffset
///   ::= .cfi_rel_offset register, offset
///
/// Returns true if an error was reported while parsing either operand.
bool AsmParser::parseDirectiveCFIRelOffset(SMLoc DirectiveLoc) {
  int64_t Register = 0;
  if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
    return true;
  if (parseToken(AsmToken::Comma, "unexpected token in directive"))
    return true;

  int64_t Offset = 0;
  if (parseAbsoluteExpression(Offset))
    return true;

  getStreamer().EmitCFIRelOffset(Register, Offset);
  return false;
}
/// Return true if \p Encoding is a pointer encoding this parser accepts.
///
/// The value must fit in one byte. DW_EH_PE_omit is always accepted.
/// Otherwise the low nibble must be one of the listed data formats and the
/// bits selected by 0x70 must be either absptr or pcrel.
static bool isValidEncoding(int64_t Encoding) {
  // Anything outside a single byte cannot be a valid encoding.
  if ((Encoding & ~0xff) != 0)
    return false;

  if (Encoding == dwarf::DW_EH_PE_omit)
    return true;

  const unsigned Format = Encoding & 0xf;
  const bool KnownFormat =
      Format == dwarf::DW_EH_PE_absptr || Format == dwarf::DW_EH_PE_udata2 ||
      Format == dwarf::DW_EH_PE_udata4 || Format == dwarf::DW_EH_PE_udata8 ||
      Format == dwarf::DW_EH_PE_sdata2 || Format == dwarf::DW_EH_PE_sdata4 ||
      Format == dwarf::DW_EH_PE_sdata8 || Format == dwarf::DW_EH_PE_signed;
  if (!KnownFormat)
    return false;

  const unsigned Application = Encoding & 0x70;
  return Application == dwarf::DW_EH_PE_absptr ||
         Application == dwarf::DW_EH_PE_pcrel;
}
/// parseDirectiveCFIPersonalityOrLsda
/// IsPersonality true for cfi_personality, false for cfi_lsda
///   ::= .cfi_personality encoding, [symbol_name]
///   ::= .cfi_lsda encoding, [symbol_name]
///
/// An encoding of DW_EH_PE_omit ends the directive successfully without
/// parsing a symbol or emitting anything.
bool AsmParser::parseDirectiveCFIPersonalityOrLsda(bool IsPersonality) {
  int64_t Encoding = 0;
  if (parseAbsoluteExpression(Encoding))
    return true;
  if (Encoding == dwarf::DW_EH_PE_omit)
    return false;

  StringRef Name;
  if (check(!isValidEncoding(Encoding), "unsupported encoding."))
    return true;
  if (parseToken(AsmToken::Comma, "unexpected token in directive"))
    return true;
  if (check(parseIdentifier(Name), "expected identifier in directive"))
    return true;

  MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
  if (!IsPersonality)
    getStreamer().EmitCFILsda(Sym, Encoding);
  else
    getStreamer().EmitCFIPersonality(Sym, Encoding);
  return false;
}
/// parseDirectiveCFIRememberState
/// ::= .cfi_remember_state
///
/// Takes no operands: forwards directly to the streamer and always returns
/// false (no error). No tokens are consumed or checked here.
bool AsmParser::parseDirectiveCFIRememberState() {
getStreamer().EmitCFIRememberState();
return false;
}
/// parseDirectiveCFIRestoreState
/// ::= .cfi_restore_state
///
/// Takes no operands: forwards directly to the streamer and always returns
/// false (no error). No tokens are consumed or checked here.
bool AsmParser::parseDirectiveCFIRestoreState() {
getStreamer().EmitCFIRestoreState();
return false;
}
/// parseDirectiveCFISameValue
///   ::= .cfi_same_value register
///
/// Returns true if the register operand could not be parsed.
bool AsmParser::parseDirectiveCFISameValue(SMLoc DirectiveLoc) {
  int64_t Register = 0;
  if (!parseRegisterOrRegisterNumber(Register, DirectiveLoc)) {
    getStreamer().EmitCFISameValue(Register);
    return false;
  }
  return true;
}
/// parseDirectiveCFIRestore
///   ::= .cfi_restore register
///
/// Returns true if the register operand could not be parsed.
bool AsmParser::parseDirectiveCFIRestore(SMLoc DirectiveLoc) {
  int64_t Register = 0;
  if (!parseRegisterOrRegisterNumber(Register, DirectiveLoc)) {
    getStreamer().EmitCFIRestore(Register);
    return false;
  }
  return true;
}
/// parseDirectiveCFIEscape
///   ::= .cfi_escape expression[,...]
///
/// Each comma-separated absolute expression is truncated to a single byte and
/// appended to the byte string handed to the streamer. At least one
/// expression is required.
bool AsmParser::parseDirectiveCFIEscape() {
  std::string Values;
  int64_t CurrValue;
  for (;;) {
    if (parseAbsoluteExpression(CurrValue))
      return true;
    Values.push_back((uint8_t)CurrValue);

    // Continue only while another comma-separated value follows.
    if (getLexer().isNot(AsmToken::Comma))
      break;
    Lex();
  }

  getStreamer().EmitCFIEscape(Values);
  return false;
}
/// parseDirectiveCFIReturnColumn
///   ::= .cfi_return_column register
///
/// Returns true if the register operand could not be parsed.
bool AsmParser::parseDirectiveCFIReturnColumn(SMLoc DirectiveLoc) {
  int64_t Register = 0;
  if (!parseRegisterOrRegisterNumber(Register, DirectiveLoc)) {
    getStreamer().EmitCFIReturnColumn(Register);
    return false;
  }
  return true;
}
/// parseDirectiveCFISignalFrame
///   ::= .cfi_signal_frame
///
/// The directive takes no operands; anything before the end of the statement
/// is an error.
bool AsmParser::parseDirectiveCFISignalFrame() {
  const bool HasTrailingTokens = parseToken(
      AsmToken::EndOfStatement, "unexpected token in '.cfi_signal_frame'");
  if (HasTrailingTokens)
    return true;

  getStreamer().EmitCFISignalFrame();
  return false;
}
/// parseDirectiveCFIUndefined
///   ::= .cfi_undefined register
///
/// Returns true if the register operand could not be parsed.
bool AsmParser::parseDirectiveCFIUndefined(SMLoc DirectiveLoc) {
  int64_t Register = 0;
  if (!parseRegisterOrRegisterNumber(Register, DirectiveLoc)) {
    getStreamer().EmitCFIUndefined(Register);
    return false;
  }
  return true;
}
/// parseDirectiveAltmacro
///   ::= .altmacro
///   ::= .noaltmacro
///
/// Enables alternate macro mode when \p Directive is ".altmacro" and disables
/// it otherwise. The end-of-statement token is checked but not consumed.
bool AsmParser::parseDirectiveAltmacro(StringRef Directive) {
  if (getLexer().is(AsmToken::EndOfStatement)) {
    AltMacroMode = (Directive == ".altmacro");
    return false;
  }
  return TokError("unexpected token in '" + Directive + "' directive");
}
/// parseDirectiveMacrosOnOff
///   ::= .macros_on
///   ::= .macros_off
///
/// Enables macros when \p Directive is ".macros_on" and disables them
/// otherwise.
bool AsmParser::parseDirectiveMacrosOnOff(StringRef Directive) {
  const bool HasTrailingTokens =
      parseToken(AsmToken::EndOfStatement,
                 "unexpected token in '" + Directive + "' directive");
  if (HasTrailingTokens)
    return true;

  const bool Enable = (Directive == ".macros_on");
  setMacrosEnabled(Enable);
  return false;
}
/// parseDirectiveMacro
/// ::= .macro name[,] [parameters]
///
/// Parses the macro name and its parameter list, then scans forward to the
/// matching '.endm'/'.endmacro' to capture the macro body as raw text, and
/// finally registers the macro with the context.
///
/// \param DirectiveLoc location used for the "no matching '.endmacro'" and
/// "already defined" diagnostics and passed on to checkForBadMacro.
/// \returns true if an error was reported, false on success.
bool AsmParser::parseDirectiveMacro(SMLoc DirectiveLoc) {
StringRef Name;
if (parseIdentifier(Name))
return TokError("expected identifier in '.macro' directive");
// An optional comma may separate the macro name from its parameters.
if (getLexer().is(AsmToken::Comma))
Lex();
MCAsmMacroParameters Parameters;
// Parse parameters until the end of the '.macro' statement.
while (getLexer().isNot(AsmToken::EndOfStatement)) {
// A vararg parameter must be the last one: seeing another parameter after
// it is an error.
if (!Parameters.empty() && Parameters.back().Vararg)
return Error(Lexer.getLoc(),
"Vararg parameter '" + Parameters.back().Name +
"' should be last one in the list of parameters.");
MCAsmMacroParameter Parameter;
if (parseIdentifier(Parameter.Name))
return TokError("expected identifier in '.macro' directive");
// Emit an error if two (or more) named parameters share the same name
for (const MCAsmMacroParameter& CurrParam : Parameters)
if (CurrParam.Name.equals(Parameter.Name))
return TokError("macro '" + Name + "' has multiple parameters"
" named '" + Parameter.Name + "'");
// Optional ':qualifier' suffix; only 'req' and 'vararg' are accepted.
if (Lexer.is(AsmToken::Colon)) {
Lex(); // consume ':'
SMLoc QualLoc;
StringRef Qualifier;
QualLoc = Lexer.getLoc();
if (parseIdentifier(Qualifier))
return Error(QualLoc, "missing parameter qualifier for "
"'" + Parameter.Name + "' in macro '" + Name + "'");
if (Qualifier == "req")
Parameter.Required = true;
else if (Qualifier == "vararg")
Parameter.Vararg = true;
else
return Error(QualLoc, Qualifier + " is not a valid parameter qualifier "
"for '" + Parameter.Name + "' in macro '" + Name + "'");
}
// Optional '=default' value. A default on a required parameter is accepted
// but diagnosed with a warning.
if (getLexer().is(AsmToken::Equal)) {
Lex();
SMLoc ParamLoc;
ParamLoc = Lexer.getLoc();
if (parseMacroArgument(Parameter.Value, /*Vararg=*/false ))
return true;
if (Parameter.Required)
Warning(ParamLoc, "pointless default value for required parameter "
"'" + Parameter.Name + "' in macro '" + Name + "'");
}
Parameters.push_back(std::move(Parameter));
// Parameters may optionally be separated by commas.
if (getLexer().is(AsmToken::Comma))
Lex();
}
// Eat just the end of statement.
Lexer.Lex();
// Consuming deferred text, so use Lexer.Lex to ignore Lexing Errors
// StartToken marks where the body text begins; EndToken is filled in below
// with the terminating '.endm'/'.endmacro' token.
AsmToken EndToken, StartToken = getTok();
// Nesting level of '.macro' directives seen inside this definition.
unsigned MacroDepth = 0;
// Lex the macro definition.
while (true) {
// Ignore Lexing errors in macros.
while (Lexer.is(AsmToken::Error)) {
Lexer.Lex();
}
// Check whether we have reached the end of the file.
if (getLexer().is(AsmToken::Eof))
return Error(DirectiveLoc, "no matching '.endmacro' in definition");
// Otherwise, check whether we have reached the .endmacro.
if (getLexer().is(AsmToken::Identifier)) {
if (getTok().getIdentifier() == ".endm" ||
getTok().getIdentifier() == ".endmacro") {
if (MacroDepth == 0) { // Outermost macro.
EndToken = getTok();
Lexer.Lex();
// Nothing may follow the terminator on the same statement.
if (getLexer().isNot(AsmToken::EndOfStatement))
return TokError("unexpected token in '" + EndToken.getIdentifier() +
"' directive");
break;
} else {
// Otherwise we just found the end of an inner macro.
--MacroDepth;
}
} else if (getTok().getIdentifier() == ".macro") {
// We allow nested macros. Those aren't instantiated until the outermost
// macro is expanded so just ignore them for now.
++MacroDepth;
}
}
// Otherwise, scan til the end of the statement.
eatToEndOfStatement();
}
// Redefinition is only diagnosed after the whole definition has been
// consumed.
if (getContext().lookupMacro(Name)) {
return Error(DirectiveLoc, "macro '" + Name + "' is already defined");
}
// The body is the raw source text from the first token after the '.macro'
// statement up to (but not including) the terminating directive token.
const char *BodyStart = StartToken.getLoc().getPointer();
const char *BodyEnd = EndToken.getLoc().getPointer();
StringRef Body = StringRef(BodyStart, BodyEnd - BodyStart);
checkForBadMacro(DirectiveLoc, Name, Body, Parameters);
MCAsmMacro Macro(Name, Body, std::move(Parameters));
DEBUG_WITH_TYPE("asm-macros", dbgs() << "Defining new macro:\n";
Macro.dump());
getContext().defineMacro(Name, std::move(Macro));
return false;
}
/// checkForBadMacro
///
/// With the support added for named parameters there may be code out there that
/// is transitioning from positional parameters. In versions of gas that did
/// not support named parameters they would be ignored on the macro definition.
/// But to support both styles of parameters this is not possible, so if a macro
/// definition has named parameters but does not use them and has what appear
/// to be positional parameters (strings like $1, $2, ... and $n), then issue a
/// warning that the positional parameters found in the body have no effect.
/// The hope is that the developer will either remove the named parameters from
/// the macro definition, so the positional parameters get used if that was
/// what was intended, or change the macro to use the named parameters. It is
/// possible this warning will trigger when none of the named parameters are
/// used and the strings like $1 are in fact simply meant to be passed through
/// unchanged.
///
/// \param DirectiveLoc location at which the warning is reported.
/// \param Name macro name (not used by the scan itself).
/// \param Body raw text of the macro body to scan.
/// \param Parameters the named parameters declared on the macro.
void AsmParser::checkForBadMacro(SMLoc DirectiveLoc, StringRef Name,
StringRef Body,
ArrayRef<MCAsmMacroParameter> Parameters) {
// If this macro is not defined with named parameters the warning we are
// checking for here doesn't apply.
unsigned NParameters = Parameters.size();
if (NParameters == 0)
return;
bool NamedParametersFound = false;
bool PositionalParametersFound = false;
// Look at the body of the macro for use of both the named parameters and what
// are likely to be positional parameters. This is what expandMacro() is
// doing when it finds the parameters in the body.
while (!Body.empty()) {
// Scan for the next possible parameter.
std::size_t End = Body.size(), Pos = 0;
for (; Pos != End; ++Pos) {
// Check for a substitution or escape.
// This macro is defined with parameters, look for \foo, \bar, etc.
if (Body[Pos] == '\\' && Pos + 1 != End)
break;
// This macro should have parameters, but look for $0, $1, ..., $n too.
if (Body[Pos] != '$' || Pos + 1 == End)
continue;
char Next = Body[Pos + 1];
if (Next == '$' || Next == 'n' ||
isdigit(static_cast<unsigned char>(Next)))
break;
}
// Check if we reached the end.
if (Pos == End)
break;
if (Body[Pos] == '$') {
switch (Body[Pos + 1]) {
// $$ => $
case '$':
break;
// $n => number of arguments
case 'n':
PositionalParametersFound = true;
break;
// $[0-9] => argument
default: {
PositionalParametersFound = true;
break;
}
}
// Skip the '$' and the character following it.
Pos += 2;
} else {
// Body[Pos] is a backslash: collect the identifier that follows it.
// NOTE(review): the scan stops when I + 1 == End, so an identifier that
// runs to the very end of Body has its last character excluded from
// Argument -- confirm this is intended.
unsigned I = Pos + 1;
while (isIdentifierChar(Body[I]) && I + 1 != End)
++I;
const char *Begin = Body.data() + Pos + 1;
StringRef Argument(Begin, I - (Pos + 1));
// Linear search for a declared parameter with this name.
unsigned Index = 0;
for (; Index < NParameters; ++Index)
if (Parameters[Index].Name == Argument)
break;
if (Index == NParameters) {
// Not a named parameter: skip a literal "\()" sequence, otherwise resume
// after the scanned identifier.
// NOTE(review): Body[Pos + 2] is read without checking Pos + 2 < End --
// confirm Body can never end in "\(".
if (Body[Pos + 1] == '(' && Body[Pos + 2] == ')')
Pos += 3;
else {
Pos = I;
}
} else {
NamedParametersFound = true;
Pos += 1 + Argument.size();
}
}
// Update the scan point.
Body = Body.substr(Pos);
}
// Only warn when positional-looking parameters were seen and no named
// parameter was referenced anywhere in the body.
if (!NamedParametersFound && PositionalParametersFound)
Warning(DirectiveLoc, "macro defined with named parameters which are not "
"used in macro body, possible positional parameter "
"found in body which will have no effect");
}
/// parseDirectiveExitMacro
///   ::= .exitm
///
/// Leaves the current macro instantiation early, first unwinding every
/// conditional that was opened inside it.
bool AsmParser::parseDirectiveExitMacro(StringRef Directive) {
  const bool HasTrailingTokens =
      parseToken(AsmToken::EndOfStatement,
                 "unexpected token in '" + Directive + "' directive");
  if (HasTrailingTokens)
    return true;

  if (!isInsideMacroInstantiation())
    return TokError("unexpected '" + Directive +
                    "' in file, no current macro definition");

  // Exit all conditionals that are active in the current macro.
  const auto TargetDepth = ActiveMacros.back()->CondStackDepth;
  while (TheCondStack.size() != TargetDepth) {
    TheCondState = TheCondStack.back();
    TheCondStack.pop_back();
  }

  handleMacroExit();
  return false;
}
/// parseDirectiveEndMacro
///   ::= .endm
///   ::= .endmacro
///
/// Terminates the current macro instantiation. Outside of an instantiation
/// the directive is a stray entry and is reported as an error.
bool AsmParser::parseDirectiveEndMacro(StringRef Directive) {
  if (getLexer().isNot(AsmToken::EndOfStatement))
    return TokError("unexpected token in '" + Directive + "' directive");

  // Well formed .endmacro directives are handled during the macro definition
  // parsing, so reaching here outside an instantiation means it is stray.
  if (!isInsideMacroInstantiation())
    return TokError("unexpected '" + Directive +
                    "' in file, no current macro definition");

  // We are inside a macro instantiation: terminate it.
  handleMacroExit();
  return false;
}
/// parseDirectivePurgeMacro
///   ::= .purgem
///
/// Removes a previously defined macro. Purging a macro that is not defined is
/// an error reported at \p DirectiveLoc.
bool AsmParser::parseDirectivePurgeMacro(SMLoc DirectiveLoc) {
  StringRef Name;
  SMLoc Loc;
  if (parseTokenLoc(Loc))
    return true;
  if (check(parseIdentifier(Name), Loc,
            "expected identifier in '.purgem' directive"))
    return true;
  if (parseToken(AsmToken::EndOfStatement,
                 "unexpected token in '.purgem' directive"))
    return true;

  const bool IsDefined = getContext().lookupMacro(Name);
  if (!IsDefined)
    return Error(DirectiveLoc, "macro '" + Name + "' is not defined");

  getContext().undefineMacro(Name);
  DEBUG_WITH_TYPE("asm-macros", dbgs()
                                    << "Un-defining macro: " << Name << "\n");
  return false;
}
/// parseDirectiveBundleAlignMode
/// ::= {.bundle_align_mode} expression
bool AsmParser::parseDirectiveBundleAlignMode() {
// Expect a single argument: an expression that evaluates to a constant
// in the inclusive range 0-30.
SMLoc ExprLoc = getLexer().getLoc();
int64_t AlignSizePow2;
if (checkForValidSection() || parseAbsoluteExpression(AlignSizePow2) ||
parseToken(AsmToken::EndOfStatement, "unexpected token after expression "
"in '.bundle_align_mode' "
"directive") ||
check(AlignSizePow2 < 0 || AlignSizePow2 > 30, ExprLoc,
"invalid bundle alignment size (expected between 0 and 30)"))
return true;
// Because of AlignSizePow2's verified range we can safely truncate it to
// unsigned.
getStreamer().EmitBundleAlignMode(static_cast<unsigned>(AlignSizePow2));
return false;
}
/// parseDirectiveBundleLock
/// ::= {.bundle_lock} [align_to_end]
bool AsmParser::parseDirectiveBundleLock() {
if (checkForValidSection())
return true;
bool AlignToEnd = false;
StringRef Option;
SMLoc Loc = getTok().getLoc();
const char *kInvalidOptionError =
"invalid option for '.bundle_lock' directive";
if (!parseOptionalToken(AsmToken::EndOfStatement)) {
if (check(parseIdentifier(Option), Loc, kInvalidOptionError) ||
check(Option != "align_to_end", Loc, kInvalidOptionError) ||
parseToken(AsmToken::EndOfStatement,
"unexpected token after '.bundle_lock' directive option"))
return true;
AlignToEnd = true;
}
getStreamer().EmitBundleLock(AlignToEnd);
return false;
}
/// parseDirectiveBundleUnlock
///   ::= {.bundle_unlock}
///
/// Takes no operands. Returns true if a diagnostic was emitted.
bool AsmParser::parseDirectiveBundleUnlock() {
  if (checkForValidSection())
    return true;
  if (parseToken(AsmToken::EndOfStatement,
                 "unexpected token in '.bundle_unlock' directive"))
    return true;

  getStreamer().EmitBundleUnlock();
  return false;
}
/// parseDirectiveSpace
///   ::= (.skip | .space) expression [ , expression ]
///
/// The first expression is the number of bytes; the optional second one is an
/// absolute fill value that defaults to 0. \p IDVal is the directive spelling
/// used in diagnostics. Returns true if a diagnostic was emitted.
bool AsmParser::parseDirectiveSpace(StringRef IDVal) {
  const SMLoc NumBytesLoc = Lexer.getLoc();
  const MCExpr *NumBytes;
  if (checkForValidSection())
    return true;
  if (parseExpression(NumBytes))
    return true;

  // Optional fill value after a comma.
  int64_t FillExpr = 0;
  if (parseOptionalToken(AsmToken::Comma) && parseAbsoluteExpression(FillExpr))
    return addErrorSuffix("in '" + Twine(IDVal) + "' directive");

  if (parseToken(AsmToken::EndOfStatement))
    return addErrorSuffix("in '" + Twine(IDVal) + "' directive");

  // FIXME: Sometimes the fill expr is 'nop' if it isn't supplied, instead of 0.
  getStreamer().emitFill(*NumBytes, FillExpr, NumBytesLoc);
  return false;
}
/// parseDirectiveDCB
///   ::= .dcb.{b, l, w} expression, expression
///
/// The first expression is an absolute repeat count, the second the value to
/// emit that many times using \p Size bytes each. A negative count only
/// produces a warning. \p IDVal is the directive spelling used in
/// diagnostics. Returns true if an error was emitted.
bool AsmParser::parseDirectiveDCB(StringRef IDVal, unsigned Size) {
  const SMLoc NumValuesLoc = Lexer.getLoc();
  int64_t NumValues;
  if (checkForValidSection())
    return true;
  if (parseAbsoluteExpression(NumValues))
    return true;

  if (NumValues < 0) {
    Warning(NumValuesLoc, "'" + Twine(IDVal) + "' directive with negative repeat count has no effect");
    return false;
  }

  if (parseToken(AsmToken::Comma,
                 "unexpected token in '" + Twine(IDVal) + "' directive"))
    return true;

  const MCExpr *Value;
  const SMLoc ExprLoc = getLexer().getLoc();
  if (parseExpression(Value))
    return true;

  const uint64_t RepeatCount = NumValues;
  const auto *Constant = dyn_cast<MCConstantExpr>(Value);
  if (!Constant) {
    // General expression: emit it as-is for every repetition.
    for (uint64_t Left = RepeatCount; Left != 0; --Left)
      getStreamer().EmitValue(Value, Size, ExprLoc);
  } else {
    // Special case constant expressions to match code generator.
    assert(Size <= 8 && "Invalid size");
    const uint64_t IntValue = Constant->getValue();
    if (!isUIntN(8 * Size, IntValue) && !isIntN(8 * Size, IntValue))
      return Error(ExprLoc, "literal value out of range for directive");
    for (uint64_t Left = RepeatCount; Left != 0; --Left)
      getStreamer().EmitIntValue(IntValue, Size);
  }

  // Note that the end of statement is only checked after the values have been
  // emitted.
  return parseToken(AsmToken::EndOfStatement,
                    "unexpected token in '" + Twine(IDVal) + "' directive");
}
/// parseDirectiveRealDCB
///   ::= .dcb.{d, s} expression, expression
///
/// The first expression is an absolute repeat count; the second is a real
/// value parsed according to \p Semantics and emitted that many times. A
/// negative count only produces a warning. \p IDVal is the directive spelling
/// used in diagnostics. Returns true if an error was emitted.
bool AsmParser::parseDirectiveRealDCB(StringRef IDVal, const fltSemantics &Semantics) {
  const SMLoc NumValuesLoc = Lexer.getLoc();
  int64_t NumValues;
  if (checkForValidSection())
    return true;
  if (parseAbsoluteExpression(NumValues))
    return true;

  if (NumValues < 0) {
    Warning(NumValuesLoc, "'" + Twine(IDVal) + "' directive with negative repeat count has no effect");
    return false;
  }

  if (parseToken(AsmToken::Comma,
                 "unexpected token in '" + Twine(IDVal) + "' directive"))
    return true;

  APInt AsInt;
  if (parseRealValue(Semantics, AsInt))
    return true;

  if (parseToken(AsmToken::EndOfStatement,
                 "unexpected token in '" + Twine(IDVal) + "' directive"))
    return true;

  // The bit pattern and its width in bytes are the same for every repetition.
  const uint64_t Bits = AsInt.getLimitedValue();
  const unsigned NumBytes = AsInt.getBitWidth() / 8;
  for (uint64_t Left = NumValues; Left != 0; --Left)
    getStreamer().EmitIntValue(Bits, NumBytes);
  return false;
}
/// parseDirectiveDS
///   ::= .ds.{b, d, l, p, s, w, x} expression
///
/// The expression is an absolute repeat count; for each repetition a fill of
/// \p Size bytes with value 0 is emitted. A negative count only produces a
/// warning. \p IDVal is the directive spelling used in diagnostics. Returns
/// true if an error was emitted.
bool AsmParser::parseDirectiveDS(StringRef IDVal, unsigned Size) {
  const SMLoc NumValuesLoc = Lexer.getLoc();
  int64_t NumValues;
  if (checkForValidSection())
    return true;
  if (parseAbsoluteExpression(NumValues))
    return true;

  if (NumValues < 0) {
    Warning(NumValuesLoc, "'" + Twine(IDVal) + "' directive with negative repeat count has no effect");
    return false;
  }

  if (parseToken(AsmToken::EndOfStatement,
                 "unexpected token in '" + Twine(IDVal) + "' directive"))
    return true;

  for (uint64_t Left = NumValues; Left != 0; --Left)
    getStreamer().emitFill(Size, 0);
  return false;
}
/// parseDirectiveLEB128
/// ::= (.sleb128 | .uleb128) [ expression (, expression)* ]
bool AsmParser::parseDirectiveLEB128(bool Signed) {
if (checkForValidSection())
return true;
auto parseOp = [&]() -> bool {
const MCExpr *Value;
if (parseExpression(Value))
return true;
if (Signed)
getStreamer().EmitSLEB128Value(Value);
else
getStreamer().EmitULEB128Value(Value);
return false;
};
if (parseMany(parseOp))
return addErrorSuffix(" in directive");
return false;
}
/// parseDirectiveSymbolAttribute
///   ::= { ".globl", ".weak", ... } [ identifier ( , identifier )* ]
///
/// Applies \p Attr to every symbol named in the operand list. Returns true if
/// a diagnostic was emitted.
bool AsmParser::parseDirectiveSymbolAttribute(MCSymbolAttr Attr) {
  // Handles a single identifier of the list; returns true on failure.
  auto ApplyToOneSymbol = [this, Attr]() -> bool {
    const SMLoc NameLoc = getTok().getLoc();
    StringRef SymName;
    if (parseIdentifier(SymName))
      return Error(NameLoc, "expected identifier");

    MCSymbol *Symbol = getContext().getOrCreateSymbol(SymName);

    // Assembler local symbols don't make any sense here. Complain loudly.
    if (Symbol->isTemporary())
      return Error(NameLoc, "non-local symbol required");

    if (getStreamer().EmitSymbolAttribute(Symbol, Attr))
      return false;
    return Error(NameLoc, "unable to emit symbol attribute");
  };

  if (!parseMany(ApplyToOneSymbol))
    return false;
  return addErrorSuffix(" in directive");
}
/// parseDirectiveComm
///   ::= ( .comm | .lcomm ) identifier , size_expression [ , align_expression ]
///
/// Declares a common symbol (or a local common symbol when \p IsLocal is
/// true). The size and the optional alignment must be absolute expressions.
/// Depending on the target's assembler info the alignment operand is either a
/// byte count, which must be a power of two and is converted to its log2, or
/// already a power-of-two exponent. The streamer is always handed the
/// alignment in bytes. Returns true if a diagnostic was emitted.
bool AsmParser::parseDirectiveComm(bool IsLocal) {
  if (checkForValidSection())
    return true;

  SMLoc IDLoc = getLexer().getLoc();
  StringRef Name;
  if (parseIdentifier(Name))
    return TokError("expected identifier in directive");

  // Handle the identifier as the key symbol.
  MCSymbol *Sym = getContext().getOrCreateSymbol(Name);

  if (getLexer().isNot(AsmToken::Comma))
    return TokError("unexpected token in directive");
  Lex();

  int64_t Size;
  SMLoc SizeLoc = getLexer().getLoc();
  if (parseAbsoluteExpression(Size))
    return true;

  int64_t Pow2Alignment = 0;
  SMLoc Pow2AlignmentLoc;
  if (getLexer().is(AsmToken::Comma)) {
    Lex();
    Pow2AlignmentLoc = getLexer().getLoc();
    if (parseAbsoluteExpression(Pow2Alignment))
      return true;

    LCOMM::LCOMMType LCOMM = Lexer.getMAI().getLCOMMDirectiveAlignmentType();
    if (IsLocal && LCOMM == LCOMM::NoAlignment)
      return Error(Pow2AlignmentLoc, "alignment not supported on this target");

    // If this target takes alignments in bytes (not log) validate and convert.
    if ((!IsLocal && Lexer.getMAI().getCOMMDirectiveAlignmentIsInBytes()) ||
        (IsLocal && LCOMM == LCOMM::ByteAlignment)) {
      if (!isPowerOf2_64(Pow2Alignment))
        return Error(Pow2AlignmentLoc, "alignment must be a power of 2");
      Pow2Alignment = Log2_64(Pow2Alignment);
    }
  }

  if (parseToken(AsmToken::EndOfStatement,
                 "unexpected token in '.comm' or '.lcomm' directive"))
    return true;

  // NOTE: a size of zero for a .comm should create a undefined symbol
  // but a size of .lcomm creates a bss symbol of size zero.
  if (Size < 0)
    return Error(SizeLoc, "invalid '.comm' or '.lcomm' directive size, can't "
                          "be less than zero");

  // NOTE: The alignment in the directive is a power of 2 value, the assembler
  // may internally end up wanting an alignment in bytes.
  if (Pow2Alignment < 0)
    return Error(Pow2AlignmentLoc, "invalid '.comm' or '.lcomm' directive "
                                   "alignment, can't be less than zero");

  // The byte alignment is computed below with a 32-bit shift. Shifting by 32
  // or more bits is undefined behaviour, so diagnose exponents that cannot be
  // represented instead of silently producing a garbage alignment.
  if (Pow2Alignment > 31)
    return Error(Pow2AlignmentLoc, "invalid '.comm' or '.lcomm' directive "
                                   "alignment, too large");

  Sym->redefineIfPossible();
  if (!Sym->isUndefined())
    return Error(IDLoc, "invalid symbol redefinition");

  // The exponent is now known to be in [0, 31], so the unsigned shift is
  // well defined.
  const unsigned ByteAlignment = 1u << Pow2Alignment;

  // Create the Symbol as a common or local common with Size and Pow2Alignment
  if (IsLocal) {
    getStreamer().EmitLocalCommonSymbol(Sym, Size, ByteAlignment);
    return false;
  }

  getStreamer().EmitCommonSymbol(Sym, Size, ByteAlignment);
  return false;
}
/// parseDirectiveAbort
/// ::= .abort [... message ...]
bool AsmParser::parseDirectiveAbort() {
// FIXME: Use loc from directive.
SMLoc Loc = getLexer().getLoc();
StringRef Str = parseStringToEndOfStatement();
if (parseToken(AsmToken::EndOfStatement,
"unexpected token in '.abort' directive"))
return true;
if (Str.empty())
return Error(Loc, ".abort detected. Assembly stopping.");
else
return Error(Loc, ".abort '" + Str + "' detected. Assembly stopping.");
// FIXME: Actually abort assembly here.
return false;
}
/// parseDirectiveInclude
/// ::= .include "filename"
bool AsmParser::parseDirectiveInclude() {
// Allow the strings to have escaped octal character sequence.
std::string Filename;
SMLoc IncludeLoc = getTok().getLoc();
if (check(getTok().isNot(AsmToken::String),
"expected string in '.include' directive") ||
parseEscapedString(Filename) ||
check(getTok().isNot(AsmToken::EndOfStatement),
"unexpected token in '.include' directive") ||
// Attempt to switch the lexer to the included file before consuming the
// end of statement to avoid losing it when we switch.
check(enterIncludeFile(Filename), IncludeLoc,
"Could not find include file '" + Filename + "'"))
return true;
return false;
}
/// parseDirectiveIncbin
/// ::= .incbin "filename" [ , skip [ , count ] ]
bool AsmParser::parseDirectiveIncbin() {
// Allow the strings to have escaped octal character sequence.
std::string Filename;
SMLoc IncbinLoc = getTok().getLoc();
if (check(getTok().isNot(AsmToken::String),
"expected string in '.incbin' directive") ||
parseEscapedString(Filename))
return true;
int64_t Skip = 0;
const MCExpr *Count = nullptr;
SMLoc SkipLoc, CountLoc;
if (parseOptionalToken(AsmToken::Comma)) {
// The skip expression can be omitted while specifying the count, e.g:
// .incbin "filename",,4
if (getTok().isNot(AsmToken::Comma)) {
if (parseTokenLoc(SkipLoc) || parseAbsoluteExpression(Skip))
return true;
}
if (parseOptionalToken(AsmToken::Comma)) {
CountLoc = getTok().getLoc();
if (parseExpression(Count))
return true;
}
}
if (parseToken(AsmToken::EndOfStatement,
"unexpected token in '.incbin' directive"))
return true;
if (check(Skip < 0, SkipLoc, "skip is negative"))
return true;
// Attempt to process the included file.
if (processIncbinFile(Filename, Skip, Count, CountLoc))
return Error(IncbinLoc, "Could not find incbin file '" + Filename + "'");
return false;
}
/// parseDirectiveIf
///   ::= .if{,eq,ge,gt,le,lt,ne} expression
///
/// Saves the current conditional state on the stack and opens a new
/// conditional. \p DirKind selects how the absolute expression is compared
/// against zero. Returns true if a diagnostic was emitted.
bool AsmParser::parseDirectiveIf(SMLoc DirectiveLoc, DirectiveKind DirKind) {
  TheCondStack.push_back(TheCondState);
  TheCondState.TheCond = AsmCond::IfCond;

  // Already ignoring input: the operand is skipped without being evaluated.
  if (TheCondState.Ignore) {
    eatToEndOfStatement();
    return false;
  }

  int64_t ExprValue;
  if (parseAbsoluteExpression(ExprValue))
    return true;
  if (parseToken(AsmToken::EndOfStatement,
                 "unexpected token in '.if' directive"))
    return true;

  // Turn the parsed value into the truth value of the requested comparison.
  switch (DirKind) {
  case DK_IF:
  case DK_IFNE:
    // The value itself is the condition (non-zero means met).
    break;
  case DK_IFEQ:
    ExprValue = ExprValue == 0;
    break;
  case DK_IFGE:
    ExprValue = ExprValue >= 0;
    break;
  case DK_IFGT:
    ExprValue = ExprValue > 0;
    break;
  case DK_IFLE:
    ExprValue = ExprValue <= 0;
    break;
  case DK_IFLT:
    ExprValue = ExprValue < 0;
    break;
  default:
    llvm_unreachable("unsupported directive");
  }

  TheCondState.CondMet = ExprValue;
  TheCondState.Ignore = !TheCondState.CondMet;
  return false;
}
/// parseDirectiveIfb
///   ::= .ifb string
///
/// Opens a conditional that is met when the emptiness of the rest of the
/// statement matches \p ExpectBlank. Returns true if a diagnostic was emitted.
bool AsmParser::parseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank) {
  TheCondStack.push_back(TheCondState);
  TheCondState.TheCond = AsmCond::IfCond;

  // Already ignoring input: skip the operand without looking at it.
  if (TheCondState.Ignore) {
    eatToEndOfStatement();
    return false;
  }

  const StringRef Str = parseStringToEndOfStatement();
  if (parseToken(AsmToken::EndOfStatement,
                 "unexpected token in '.ifb' directive"))
    return true;

  const bool IsBlank = Str.empty();
  TheCondState.CondMet = (IsBlank == ExpectBlank);
  TheCondState.Ignore = !TheCondState.CondMet;
  return false;
}
/// parseDirectiveIfc
///   ::= .ifc string1, string2
///   ::= .ifnc string1, string2
///
/// Opens a conditional based on comparing the two operands after trimming
/// them; \p ExpectEqual selects whether equality or inequality satisfies the
/// condition. Returns true if a diagnostic was emitted.
bool AsmParser::parseDirectiveIfc(SMLoc DirectiveLoc, bool ExpectEqual) {
  TheCondStack.push_back(TheCondState);
  TheCondState.TheCond = AsmCond::IfCond;

  // Already ignoring input: skip the operands without comparing them.
  if (TheCondState.Ignore) {
    eatToEndOfStatement();
    return false;
  }

  const StringRef Str1 = parseStringToComma();
  if (parseToken(AsmToken::Comma, "unexpected token in '.ifc' directive"))
    return true;

  const StringRef Str2 = parseStringToEndOfStatement();
  if (parseToken(AsmToken::EndOfStatement,
                 "unexpected token in '.ifc' directive"))
    return true;

  const bool Identical = Str1.trim() == Str2.trim();
  TheCondState.CondMet = (Identical == ExpectEqual);
  TheCondState.Ignore = !TheCondState.CondMet;
  return false;
}
/// parseDirectiveIfeqs
///   ::= .ifeqs string1, string2
///
/// Also used for '.ifnes': \p ExpectEqual is true for '.ifeqs' and false for
/// '.ifnes', and selects both the comparison sense and the directive name in
/// diagnostics. Both operands must be string tokens. Returns the result of
/// TokError on malformed input, false otherwise.
bool AsmParser::parseDirectiveIfeqs(SMLoc DirectiveLoc, bool ExpectEqual) {
  // First string operand.
  if (Lexer.isNot(AsmToken::String))
    return TokError(
        ExpectEqual ? "expected string parameter for '.ifeqs' directive"
                    : "expected string parameter for '.ifnes' directive");
  const StringRef String1 = getTok().getStringContents();
  Lex();

  // Separating comma.
  if (Lexer.isNot(AsmToken::Comma))
    return TokError(
        ExpectEqual
            ? "expected comma after first string for '.ifeqs' directive"
            : "expected comma after first string for '.ifnes' directive");
  Lex();

  // Second string operand.
  if (Lexer.isNot(AsmToken::String))
    return TokError(
        ExpectEqual ? "expected string parameter for '.ifeqs' directive"
                    : "expected string parameter for '.ifnes' directive");
  const StringRef String2 = getTok().getStringContents();
  Lex();

  // Open the new conditional only once both operands were accepted.
  TheCondStack.push_back(TheCondState);
  TheCondState.TheCond = AsmCond::IfCond;
  const bool Identical = String1 == String2;
  TheCondState.CondMet = (Identical == ExpectEqual);
  TheCondState.Ignore = !TheCondState.CondMet;
  return false;
}
/// parseDirectiveIfdef
///   ::= .ifdef symbol
///
/// Opens a conditional that depends on whether the named symbol is known to
/// the context and not undefined. \p expect_defined selects whether a defined
/// or a non-defined symbol satisfies the condition. Returns true if a
/// diagnostic was emitted.
bool AsmParser::parseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined) {
  StringRef Name;
  TheCondStack.push_back(TheCondState);
  TheCondState.TheCond = AsmCond::IfCond;

  // Already ignoring input: skip the operand without looking it up.
  if (TheCondState.Ignore) {
    eatToEndOfStatement();
    return false;
  }

  if (check(parseIdentifier(Name), "expected identifier after '.ifdef'"))
    return true;
  if (parseToken(AsmToken::EndOfStatement, "unexpected token in '.ifdef'"))
    return true;

  // A symbol counts as defined when it exists and is not reported as
  // undefined.
  MCSymbol *Sym = getContext().lookupSymbol(Name);
  const bool IsDefined = Sym && !Sym->isUndefined(false);
  TheCondState.CondMet = (IsDefined == expect_defined);
  TheCondState.Ignore = !TheCondState.CondMet;
  return false;
}
/// parseDirectiveElseIf
///   ::= .elseif expression
///
/// Valid only directly after an '.if' or another '.elseif'. The expression is
/// evaluated only when neither the enclosing conditional is ignoring input
/// nor an earlier branch of this conditional was taken. Returns true if a
/// diagnostic was emitted.
bool AsmParser::parseDirectiveElseIf(SMLoc DirectiveLoc) {
  const bool FollowsIfOrElseIf = TheCondState.TheCond == AsmCond::IfCond ||
                                 TheCondState.TheCond == AsmCond::ElseIfCond;
  if (!FollowsIfOrElseIf)
    return Error(DirectiveLoc, "Encountered a .elseif that doesn't follow an"
                               " .if or an .elseif");
  TheCondState.TheCond = AsmCond::ElseIfCond;

  // Ignore state of the enclosing conditional, if there is one.
  const bool LastIgnoreState =
      !TheCondStack.empty() && TheCondStack.back().Ignore;

  if (LastIgnoreState || TheCondState.CondMet) {
    TheCondState.Ignore = true;
    eatToEndOfStatement();
    return false;
  }

  int64_t ExprValue;
  if (parseAbsoluteExpression(ExprValue) ||
      parseToken(AsmToken::EndOfStatement,
                 "unexpected token in '.elseif' directive"))
    return true;

  TheCondState.CondMet = ExprValue;
  TheCondState.Ignore = !TheCondState.CondMet;
  return false;
}
/// parseDirectiveElse
///   ::= .else
///
/// Valid only directly after an '.if' or an '.elseif'. The else branch is
/// ignored when the enclosing conditional is ignoring input or when an
/// earlier branch of this conditional was already taken. Returns true if a
/// diagnostic was emitted.
bool AsmParser::parseDirectiveElse(SMLoc DirectiveLoc) {
  if (parseToken(AsmToken::EndOfStatement,
                 "unexpected token in '.else' directive"))
    return true;

  const bool FollowsIfOrElseIf = TheCondState.TheCond == AsmCond::IfCond ||
                                 TheCondState.TheCond == AsmCond::ElseIfCond;
  if (!FollowsIfOrElseIf)
    return Error(DirectiveLoc, "Encountered a .else that doesn't follow "
                               " an .if or an .elseif");
  TheCondState.TheCond = AsmCond::ElseCond;

  // Ignore state of the enclosing conditional, if there is one.
  const bool LastIgnoreState =
      !TheCondStack.empty() && TheCondStack.back().Ignore;

  TheCondState.Ignore = LastIgnoreState || TheCondState.CondMet;
  return false;
}
/// parseDirectiveEnd
///   ::= .end
///
/// Discards all remaining tokens up to the end of the input. Returns true if
/// the directive is followed by unexpected tokens.
bool AsmParser::parseDirectiveEnd(SMLoc DirectiveLoc) {
  const bool SawTrailingToken = parseToken(
      AsmToken::EndOfStatement, "unexpected token in '.end' directive");
  if (SawTrailingToken)
    return true;

  // Drain the lexer until it reports end of file.
  while (!Lexer.is(AsmToken::Eof))
    Lexer.Lex();

  return false;
}
/// parseDirectiveError
///   ::= .err
///   ::= .error [string]
///
/// \p L is the location used for the reported error. \p WithMessage is false
/// for the '.err' form, which takes no message. Returns false when the
/// directive is skipped because the conditional on top of the stack is
/// ignoring input; otherwise returns the result of Error/TokError.
bool AsmParser::parseDirectiveError(SMLoc L, bool WithMessage) {
  // Skip the directive entirely when the innermost saved conditional state is
  // ignoring input.
  if (!TheCondStack.empty() && TheCondStack.back().Ignore) {
    eatToEndOfStatement();
    return false;
  }

  if (!WithMessage)
    return Error(L, ".err encountered");

  // Default text, replaced by the user supplied string when one is present.
  StringRef Message = ".error directive invoked in source file";
  if (Lexer.isNot(AsmToken::EndOfStatement)) {
    if (!Lexer.is(AsmToken::String))
      return TokError(".error argument must be a string");

    Message = getTok().getStringContents();
    Lex();
  }

  return Error(L, Message);
}
/// parseDirectiveWarning
///   ::= .warning [string]
///
/// \p L is the location used for the emitted warning. Returns false when the
/// directive is skipped because the conditional on top of the stack is
/// ignoring input; otherwise returns true on a parse error or the result of
/// Warning.
bool AsmParser::parseDirectiveWarning(SMLoc L) {
  // Skip the directive entirely when the innermost saved conditional state is
  // ignoring input.
  if (!TheCondStack.empty() && TheCondStack.back().Ignore) {
    eatToEndOfStatement();
    return false;
  }

  // Default text, replaced by the user supplied string when one is present.
  StringRef Message = ".warning directive invoked in source file";

  if (!parseOptionalToken(AsmToken::EndOfStatement)) {
    if (!Lexer.is(AsmToken::String))
      return TokError(".warning argument must be a string");

    Message = getTok().getStringContents();
    Lex();
    if (parseToken(AsmToken::EndOfStatement,
                   "expected end of statement in '.warning' directive"))
      return true;
  }

  return Warning(L, Message);
}
/// parseDirectiveEndIf
///   ::= .endif
///
/// Closes the innermost conditional by restoring the state saved when it was
/// opened. Returns true if a diagnostic was emitted.
bool AsmParser::parseDirectiveEndIf(SMLoc DirectiveLoc) {
  if (parseToken(AsmToken::EndOfStatement,
                 "unexpected token in '.endif' directive"))
    return true;

  const bool NoOpenConditional =
      (TheCondState.TheCond == AsmCond::NoCond) || TheCondStack.empty();
  if (NoOpenConditional)
    return Error(DirectiveLoc, "Encountered a .endif that doesn't follow "
                               "an .if or .else");

  // The stack is known to be non-empty here, so the saved state can be
  // restored unconditionally.
  TheCondState = TheCondStack.back();
  TheCondStack.pop_back();
  return false;
}
/// Populates DirectiveKindMap with every directive spelling known to this
/// function, keyed by the spelling including its leading '.', and mapping to
/// the corresponding DK_* enumerator. Each line is a plain assignment.
void AsmParser::initializeDirectiveKindMap() {
// Symbol assignment.
DirectiveKindMap[".set"] = DK_SET;
DirectiveKindMap[".equ"] = DK_EQU;
DirectiveKindMap[".equiv"] = DK_EQUIV;
// String and integer/floating-point data.
DirectiveKindMap[".ascii"] = DK_ASCII;
DirectiveKindMap[".asciz"] = DK_ASCIZ;
DirectiveKindMap[".string"] = DK_STRING;
DirectiveKindMap[".byte"] = DK_BYTE;
DirectiveKindMap[".short"] = DK_SHORT;
DirectiveKindMap[".value"] = DK_VALUE;
DirectiveKindMap[".2byte"] = DK_2BYTE;
DirectiveKindMap[".long"] = DK_LONG;
DirectiveKindMap[".int"] = DK_INT;
DirectiveKindMap[".4byte"] = DK_4BYTE;
DirectiveKindMap[".quad"] = DK_QUAD;
DirectiveKindMap[".8byte"] = DK_8BYTE;
DirectiveKindMap[".octa"] = DK_OCTA;
DirectiveKindMap[".single"] = DK_SINGLE;
DirectiveKindMap[".float"] = DK_FLOAT;
DirectiveKindMap[".double"] = DK_DOUBLE;
// Alignment, origin and fill.
DirectiveKindMap[".align"] = DK_ALIGN;
DirectiveKindMap[".align32"] = DK_ALIGN32;
DirectiveKindMap[".balign"] = DK_BALIGN;
DirectiveKindMap[".balignw"] = DK_BALIGNW;
DirectiveKindMap[".balignl"] = DK_BALIGNL;
DirectiveKindMap[".p2align"] = DK_P2ALIGN;
DirectiveKindMap[".p2alignw"] = DK_P2ALIGNW;
DirectiveKindMap[".p2alignl"] = DK_P2ALIGNL;
DirectiveKindMap[".org"] = DK_ORG;
DirectiveKindMap[".fill"] = DK_FILL;
DirectiveKindMap[".zero"] = DK_ZERO;
// Symbol attributes.
DirectiveKindMap[".extern"] = DK_EXTERN;
DirectiveKindMap[".globl"] = DK_GLOBL;
DirectiveKindMap[".global"] = DK_GLOBAL;
DirectiveKindMap[".lazy_reference"] = DK_LAZY_REFERENCE;
DirectiveKindMap[".no_dead_strip"] = DK_NO_DEAD_STRIP;
DirectiveKindMap[".symbol_resolver"] = DK_SYMBOL_RESOLVER;
DirectiveKindMap[".private_extern"] = DK_PRIVATE_EXTERN;
DirectiveKindMap[".reference"] = DK_REFERENCE;
DirectiveKindMap[".weak_definition"] = DK_WEAK_DEFINITION;
DirectiveKindMap[".weak_reference"] = DK_WEAK_REFERENCE;
DirectiveKindMap[".weak_def_can_be_hidden"] = DK_WEAK_DEF_CAN_BE_HIDDEN;
DirectiveKindMap[".cold"] = DK_COLD;
// Common symbols, abort, file inclusion and code mode.
DirectiveKindMap[".comm"] = DK_COMM;
DirectiveKindMap[".common"] = DK_COMMON;
DirectiveKindMap[".lcomm"] = DK_LCOMM;
DirectiveKindMap[".abort"] = DK_ABORT;
DirectiveKindMap[".include"] = DK_INCLUDE;
DirectiveKindMap[".incbin"] = DK_INCBIN;
DirectiveKindMap[".code16"] = DK_CODE16;
DirectiveKindMap[".code16gcc"] = DK_CODE16GCC;
// Repetition blocks; note that ".rep" and ".rept" share DK_REPT.
DirectiveKindMap[".rept"] = DK_REPT;
DirectiveKindMap[".rep"] = DK_REPT;
DirectiveKindMap[".irp"] = DK_IRP;
DirectiveKindMap[".irpc"] = DK_IRPC;
DirectiveKindMap[".endr"] = DK_ENDR;
// Bundling.
DirectiveKindMap[".bundle_align_mode"] = DK_BUNDLE_ALIGN_MODE;
DirectiveKindMap[".bundle_lock"] = DK_BUNDLE_LOCK;
DirectiveKindMap[".bundle_unlock"] = DK_BUNDLE_UNLOCK;
// Conditional assembly.
DirectiveKindMap[".if"] = DK_IF;
DirectiveKindMap[".ifeq"] = DK_IFEQ;
DirectiveKindMap[".ifge"] = DK_IFGE;
DirectiveKindMap[".ifgt"] = DK_IFGT;
DirectiveKindMap[".ifle"] = DK_IFLE;
DirectiveKindMap[".iflt"] = DK_IFLT;
DirectiveKindMap[".ifne"] = DK_IFNE;
DirectiveKindMap[".ifb"] = DK_IFB;
DirectiveKindMap[".ifnb"] = DK_IFNB;
DirectiveKindMap[".ifc"] = DK_IFC;
DirectiveKindMap[".ifeqs"] = DK_IFEQS;
DirectiveKindMap[".ifnc"] = DK_IFNC;
DirectiveKindMap[".ifnes"] = DK_IFNES;
DirectiveKindMap[".ifdef"] = DK_IFDEF;
DirectiveKindMap[".ifndef"] = DK_IFNDEF;
DirectiveKindMap[".ifnotdef"] = DK_IFNOTDEF;
DirectiveKindMap[".elseif"] = DK_ELSEIF;
DirectiveKindMap[".else"] = DK_ELSE;
DirectiveKindMap[".end"] = DK_END;
DirectiveKindMap[".endif"] = DK_ENDIF;
// Space reservation.
DirectiveKindMap[".skip"] = DK_SKIP;
DirectiveKindMap[".space"] = DK_SPACE;
// Source location and debug information.
DirectiveKindMap[".file"] = DK_FILE;
DirectiveKindMap[".line"] = DK_LINE;
DirectiveKindMap[".loc"] = DK_LOC;
DirectiveKindMap[".stabs"] = DK_STABS;
DirectiveKindMap[".cv_file"] = DK_CV_FILE;
DirectiveKindMap[".cv_func_id"] = DK_CV_FUNC_ID;
DirectiveKindMap[".cv_loc"] = DK_CV_LOC;
DirectiveKindMap[".cv_linetable"] = DK_CV_LINETABLE;
DirectiveKindMap[".cv_inline_linetable"] = DK_CV_INLINE_LINETABLE;
DirectiveKindMap[".cv_inline_site_id"] = DK_CV_INLINE_SITE_ID;
DirectiveKindMap[".cv_def_range"] = DK_CV_DEF_RANGE;
DirectiveKindMap[".cv_string"] = DK_CV_STRING;
DirectiveKindMap[".cv_stringtable"] = DK_CV_STRINGTABLE;
DirectiveKindMap[".cv_filechecksums"] = DK_CV_FILECHECKSUMS;
DirectiveKindMap[".cv_filechecksumoffset"] = DK_CV_FILECHECKSUM_OFFSET;
DirectiveKindMap[".cv_fpo_data"] = DK_CV_FPO_DATA;
// LEB128 data.
DirectiveKindMap[".sleb128"] = DK_SLEB128;
DirectiveKindMap[".uleb128"] = DK_ULEB128;
// .cfi_* directives.
DirectiveKindMap[".cfi_sections"] = DK_CFI_SECTIONS;
DirectiveKindMap[".cfi_startproc"] = DK_CFI_STARTPROC;
DirectiveKindMap[".cfi_endproc"] = DK_CFI_ENDPROC;
DirectiveKindMap[".cfi_def_cfa"] = DK_CFI_DEF_CFA;
DirectiveKindMap[".cfi_def_cfa_offset"] = DK_CFI_DEF_CFA_OFFSET;
DirectiveKindMap[".cfi_adjust_cfa_offset"] = DK_CFI_ADJUST_CFA_OFFSET;
DirectiveKindMap[".cfi_def_cfa_register"] = DK_CFI_DEF_CFA_REGISTER;
DirectiveKindMap[".cfi_offset"] = DK_CFI_OFFSET;
DirectiveKindMap[".cfi_rel_offset"] = DK_CFI_REL_OFFSET;
DirectiveKindMap[".cfi_personality"] = DK_CFI_PERSONALITY;
DirectiveKindMap[".cfi_lsda"] = DK_CFI_LSDA;
DirectiveKindMap[".cfi_remember_state"] = DK_CFI_REMEMBER_STATE;
DirectiveKindMap[".cfi_restore_state"] = DK_CFI_RESTORE_STATE;
DirectiveKindMap[".cfi_same_value"] = DK_CFI_SAME_VALUE;
DirectiveKindMap[".cfi_restore"] = DK_CFI_RESTORE;
DirectiveKindMap[".cfi_escape"] = DK_CFI_ESCAPE;
DirectiveKindMap[".cfi_return_column"] = DK_CFI_RETURN_COLUMN;
DirectiveKindMap[".cfi_signal_frame"] = DK_CFI_SIGNAL_FRAME;
DirectiveKindMap[".cfi_undefined"] = DK_CFI_UNDEFINED;
DirectiveKindMap[".cfi_register"] = DK_CFI_REGISTER;
DirectiveKindMap[".cfi_window_save"] = DK_CFI_WINDOW_SAVE;
DirectiveKindMap[".cfi_b_key_frame"] = DK_CFI_B_KEY_FRAME;
// Macros.
DirectiveKindMap[".macros_on"] = DK_MACROS_ON;
DirectiveKindMap[".macros_off"] = DK_MACROS_OFF;
DirectiveKindMap[".macro"] = DK_MACRO;
DirectiveKindMap[".exitm"] = DK_EXITM;
DirectiveKindMap[".endm"] = DK_ENDM;
DirectiveKindMap[".endmacro"] = DK_ENDMACRO;
DirectiveKindMap[".purgem"] = DK_PURGEM;
// Diagnostics.
DirectiveKindMap[".err"] = DK_ERR;
DirectiveKindMap[".error"] = DK_ERROR;
DirectiveKindMap[".warning"] = DK_WARNING;
DirectiveKindMap[".altmacro"] = DK_ALTMACRO;
DirectiveKindMap[".noaltmacro"] = DK_NOALTMACRO;
DirectiveKindMap[".reloc"] = DK_RELOC;
// .dc / .dcb / .ds families.
DirectiveKindMap[".dc"] = DK_DC;
DirectiveKindMap[".dc.a"] = DK_DC_A;
DirectiveKindMap[".dc.b"] = DK_DC_B;
DirectiveKindMap[".dc.d"] = DK_DC_D;
DirectiveKindMap[".dc.l"] = DK_DC_L;
DirectiveKindMap[".dc.s"] = DK_DC_S;
DirectiveKindMap[".dc.w"] = DK_DC_W;
DirectiveKindMap[".dc.x"] = DK_DC_X;
DirectiveKindMap[".dcb"] = DK_DCB;
DirectiveKindMap[".dcb.b"] = DK_DCB_B;
DirectiveKindMap[".dcb.d"] = DK_DCB_D;
DirectiveKindMap[".dcb.l"] = DK_DCB_L;
DirectiveKindMap[".dcb.s"] = DK_DCB_S;
DirectiveKindMap[".dcb.w"] = DK_DCB_W;
DirectiveKindMap[".dcb.x"] = DK_DCB_X;
DirectiveKindMap[".ds"] = DK_DS;
DirectiveKindMap[".ds.b"] = DK_DS_B;
DirectiveKindMap[".ds.d"] = DK_DS_D;
DirectiveKindMap[".ds.l"] = DK_DS_L;
DirectiveKindMap[".ds.p"] = DK_DS_P;
DirectiveKindMap[".ds.s"] = DK_DS_S;
DirectiveKindMap[".ds.w"] = DK_DS_W;
DirectiveKindMap[".ds.x"] = DK_DS_X;
// Miscellaneous.
DirectiveKindMap[".print"] = DK_PRINT;
DirectiveKindMap[".addrsig"] = DK_ADDRSIG;
DirectiveKindMap[".addrsig_sym"] = DK_ADDRSIG_SYM;
}
/// Scans forward from the current token to the '.endr' that closes the
/// current repetition block, skipping over nested .rep/.rept/.irp/.irpc
/// blocks, and records the text in between as an anonymous macro body.
///
/// \p DirectiveLoc is used to report a missing '.endr'. Returns a pointer to
/// the new entry in MacroLikeBodies, or nullptr after printing an error.
MCAsmMacro *AsmParser::parseMacroLikeBody(SMLoc DirectiveLoc) {
  const AsmToken StartToken = getTok();
  AsmToken EndToken;
  unsigned Depth = 0;

  for (;;) {
    // Check whether we have reached the end of the file.
    if (getLexer().is(AsmToken::Eof)) {
      printError(DirectiveLoc, "no matching '.endr' in definition");
      return nullptr;
    }

    if (Lexer.is(AsmToken::Identifier)) {
      const StringRef Ident = getTok().getIdentifier();
      if (Ident == ".rep" || Ident == ".rept" || Ident == ".irp" ||
          Ident == ".irpc") {
        // A nested block needs its own '.endr' before ours can match.
        ++Depth;
      } else if (Ident == ".endr") {
        if (Depth == 0) {
          // This is the '.endr' that closes our block.
          EndToken = getTok();
          Lex();
          if (Lexer.isNot(AsmToken::EndOfStatement)) {
            printError(getTok().getLoc(),
                       "unexpected token in '.endr' directive");
            return nullptr;
          }
          break;
        }
        --Depth;
      }
    }

    // Otherwise, scan till the end of the statement.
    eatToEndOfStatement();
  }

  // The body is the source text from the first token up to the start of the
  // matching '.endr' token.
  const char *BodyStart = StartToken.getLoc().getPointer();
  const char *BodyEnd = EndToken.getLoc().getPointer();

  // We Are Anonymous.
  MacroLikeBodies.emplace_back(StringRef(),
                               StringRef(BodyStart, BodyEnd - BodyStart),
                               MCAsmMacroParameters());
  return &MacroLikeBodies.back();
}
/// Turns the text accumulated in \p OS into a new source buffer and makes the
/// lexer continue from it. A trailing ".endr" line is appended to the text
/// first. \p DirectiveLoc is recorded in the new MacroInstantiation; \p M is
/// not used by this function.
void AsmParser::instantiateMacroLikeBody(MCAsmMacro *M, SMLoc DirectiveLoc,
                                         raw_svector_ostream &OS) {
  OS << ".endr\n";

  std::unique_ptr<MemoryBuffer> Instantiation =
      MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>");

  // Create the macro instantiation object and add to the current macro
  // instantiation stack. It captures the buffer and token location to come
  // back to, plus the current depth of the conditional stack.
  ActiveMacros.push_back(new MacroInstantiation(
      DirectiveLoc, CurBuffer, getTok().getLoc(), TheCondStack.size()));

  // Jump to the macro instantiation and prime the lexer.
  CurBuffer = SrcMgr.AddNewSourceBuffer(std::move(Instantiation), SMLoc());
  Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer());
  Lex();
}
/// parseDirectiveRept
///   ::= .rep | .rept count
///
/// Expands the block body \c count times and switches the lexer to the
/// expansion. \p Dir is the directive spelling used in diagnostics. Returns
/// true if a diagnostic was emitted.
bool AsmParser::parseDirectiveRept(SMLoc DirectiveLoc, StringRef Dir) {
  const MCExpr *CountExpr;
  const SMLoc CountLoc = getTok().getLoc();
  if (parseExpression(CountExpr))
    return true;

  // The count has to evaluate to an absolute value.
  int64_t Count;
  const bool IsAbsolute =
      CountExpr->evaluateAsAbsolute(Count, getStreamer().getAssemblerPtr());
  if (!IsAbsolute)
    return Error(CountLoc, "unexpected token in '" + Dir + "' directive");

  if (check(Count < 0, CountLoc, "Count is negative"))
    return true;
  if (parseToken(AsmToken::EndOfStatement,
                 "unexpected token in '" + Dir + "' directive"))
    return true;

  // Lex the rept definition.
  MCAsmMacro *M = parseMacroLikeBody(DirectiveLoc);
  if (M == nullptr)
    return true;

  // Macro instantiation is lexical, unfortunately. We construct a new buffer
  // to hold the macro body with substitutions.
  SmallString<256> Buf;
  raw_svector_ostream OS(Buf);
  for (int64_t Iteration = 0; Iteration != Count; ++Iteration) {
    // Note that the AtPseudoVariable is disabled for instantiations of .rep(t).
    if (expandMacro(OS, M->Body, None, None, false, getTok().getLoc()))
      return true;
  }

  instantiateMacroLikeBody(M, DirectiveLoc, OS);
  return false;
}
/// parseDirectiveIrp
///   ::= .irp symbol,values
///
/// Expands the block body once per argument in the value list, with the named
/// parameter bound to that argument, then switches the lexer to the
/// expansion. Returns true if a diagnostic was emitted.
bool AsmParser::parseDirectiveIrp(SMLoc DirectiveLoc) {
  MCAsmMacroParameter Parameter;
  MCAsmMacroArguments A;

  if (check(parseIdentifier(Parameter.Name),
            "expected identifier in '.irp' directive"))
    return true;
  if (parseToken(AsmToken::Comma, "expected comma in '.irp' directive"))
    return true;
  if (parseMacroArguments(nullptr, A))
    return true;
  if (parseToken(AsmToken::EndOfStatement, "expected End of Statement"))
    return true;

  // Lex the irp definition.
  MCAsmMacro *M = parseMacroLikeBody(DirectiveLoc);
  if (M == nullptr)
    return true;

  // Macro instantiation is lexical, unfortunately. We construct a new buffer
  // to hold the macro body with substitutions.
  SmallString<256> Buf;
  raw_svector_ostream OS(Buf);

  for (const MCAsmMacroArgument &Arg : A) {
    // Note that the AtPseudoVariable is enabled for instantiations of .irp.
    // This is undocumented, but GAS seems to support it.
    const bool Failed =
        expandMacro(OS, M->Body, Parameter, Arg, true, getTok().getLoc());
    if (Failed)
      return true;
  }

  instantiateMacroLikeBody(M, DirectiveLoc, OS);
  return false;
}
/// parseDirectiveIrpc
///   ::= .irpc symbol,values
///
/// The value list must consist of exactly one argument made of exactly one
/// token. The block body is expanded once per character of that token's
/// string, with the named parameter bound to the single character. Returns
/// true if a diagnostic was emitted.
bool AsmParser::parseDirectiveIrpc(SMLoc DirectiveLoc) {
  MCAsmMacroParameter Parameter;
  MCAsmMacroArguments A;

  if (check(parseIdentifier(Parameter.Name),
            "expected identifier in '.irpc' directive"))
    return true;
  if (parseToken(AsmToken::Comma, "expected comma in '.irpc' directive"))
    return true;
  if (parseMacroArguments(nullptr, A))
    return true;

  const bool IsSingleToken = A.size() == 1 && A.front().size() == 1;
  if (!IsSingleToken)
    return TokError("unexpected token in '.irpc' directive");

  // Eat the end of statement.
  if (parseToken(AsmToken::EndOfStatement, "expected end of statement"))
    return true;

  // Lex the irpc definition.
  MCAsmMacro *M = parseMacroLikeBody(DirectiveLoc);
  if (M == nullptr)
    return true;

  // Macro instantiation is lexical, unfortunately. We construct a new buffer
  // to hold the macro body with substitutions.
  SmallString<256> Buf;
  raw_svector_ostream OS(Buf);

  const StringRef Values = A.front().front().getString();
  const std::size_t NumChars = Values.size();
  for (std::size_t Pos = 0; Pos != NumChars; ++Pos) {
    // Wrap the single character at Pos into a one-token argument.
    MCAsmMacroArgument Arg;
    Arg.emplace_back(AsmToken::Identifier, Values.slice(Pos, Pos + 1));

    // Note that the AtPseudoVariable is enabled for instantiations of .irpc.
    // This is undocumented, but GAS seems to support it.
    if (expandMacro(OS, M->Body, Parameter, Arg, true, getTok().getLoc()))
      return true;
  }

  instantiateMacroLikeBody(M, DirectiveLoc, OS);
  return false;
}
/// Handles an '.endr' directive reached during normal statement parsing.
/// Without an active macro instantiation the directive is unmatched and an
/// error is returned; otherwise the current instantiation is exited.
bool AsmParser::parseDirectiveEndr(SMLoc DirectiveLoc) {
  const bool HasActiveMacro = !ActiveMacros.empty();
  if (!HasActiveMacro)
    return TokError("unmatched '.endr' directive");

  // The only .endr that should get here are the ones created by
  // instantiateMacroLikeBody.
  assert(getLexer().is(AsmToken::EndOfStatement));

  handleMacroExit();
  return false;
}
/// Parses the operand of an MS-style '_emit' directive. The operand must be a
/// constant expression whose value fits in 8 bits, signed or unsigned. On
/// success an AOK_Emit rewrite covering \p Len characters at \p IDLoc is
/// appended to \p Info's rewrite list. Returns true if a diagnostic was
/// emitted.
bool AsmParser::parseDirectiveMSEmit(SMLoc IDLoc, ParseStatementInfo &Info,
                                     size_t Len) {
  const SMLoc ExprLoc = getLexer().getLoc();
  const MCExpr *Value;
  if (parseExpression(Value))
    return true;

  const auto *Constant = dyn_cast<MCConstantExpr>(Value);
  if (Constant == nullptr)
    return Error(ExprLoc, "unexpected expression in _emit");

  const uint64_t IntValue = Constant->getValue();
  const bool FitsInByte = isUInt<8>(IntValue) || isInt<8>(IntValue);
  if (!FitsInByte)
    return Error(ExprLoc, "literal value out of range for directive");

  Info.AsmRewrites->emplace_back(AOK_Emit, IDLoc, Len);
  return false;
}
/// Parses the operand of an MS-style 'align' directive. The operand must be a
/// constant expression whose value is a power of two. On success an AOK_Align
/// rewrite of length 5 at \p IDLoc, carrying the log2 of the value, is
/// appended to \p Info's rewrite list. Returns true if a diagnostic was
/// emitted.
bool AsmParser::parseDirectiveMSAlign(SMLoc IDLoc, ParseStatementInfo &Info) {
  const SMLoc ExprLoc = getLexer().getLoc();
  const MCExpr *Value;
  if (parseExpression(Value))
    return true;

  const auto *Constant = dyn_cast<MCConstantExpr>(Value);
  if (Constant == nullptr)
    return Error(ExprLoc, "unexpected expression in align");

  const uint64_t IntValue = Constant->getValue();
  if (!isPowerOf2_64(IntValue))
    return Error(ExprLoc, "literal value not a power of two greater then zero");

  Info.AsmRewrites->emplace_back(AOK_Align, IDLoc, 5, Log2_64(IntValue));
  return false;
}
/// Handles the '.print' directive.
///
/// Consumes one token, requires it to be a double-quoted string followed by
/// the end of the statement, and writes the string contents plus a newline to
/// llvm::outs().
/// \returns true on error, false on success.
bool AsmParser::parseDirectivePrint(SMLoc DirectiveLoc) {
  const AsmToken Message = getTok();
  Lex();

  // Both failure modes share one diagnostic; the quote check only runs once
  // the token is known to be a string.
  if (Message.isNot(AsmToken::String))
    return Error(DirectiveLoc, "expected double quoted string after .print");
  if (Message.getString().front() != '"')
    return Error(DirectiveLoc, "expected double quoted string after .print");

  if (parseToken(AsmToken::EndOfStatement, "expected end of statement"))
    return true;

  llvm::outs() << Message.getStringContents() << '\n';
  return false;
}
bool AsmParser::parseDirectiveAddrsig() {
getStreamer().EmitAddrsig();
return false;
}
bool AsmParser::parseDirectiveAddrsigSym() {
StringRef Name;
if (check(parseIdentifier(Name),
"expected identifier in '.addrsig_sym' directive"))
return true;
MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
getStreamer().EmitAddrsigSym(Sym);
return false;
}
/// Ordering callback for AsmRewrite entries.
///
/// Entries are ordered by source location first. The locations are pointers
/// into one and the same string, so comparing them is deterministic. Entries
/// at the same location are ordered by descending AsmRewritePrecedence of
/// their kind.
/// \returns -1 if \p AsmRewriteA sorts first, 1 if \p AsmRewriteB does.
static int rewritesSort(const AsmRewrite *AsmRewriteA,
                        const AsmRewrite *AsmRewriteB) {
  const char *PosA = AsmRewriteA->Loc.getPointer();
  const char *PosB = AsmRewriteB->Loc.getPointer();
  if (PosA < PosB)
    return -1;
  if (PosB < PosA)
    return 1;

  // A SizeDirective, an Imm/ImmPrefix and an Input/Output rewrite may all
  // target the same location. The SizeDirective has to be applied first, then
  // the Imm/ImmPrefix and finally the Input/Output; ranking by precedence
  // keeps the sort stable.
  const auto RankA = AsmRewritePrecedence[AsmRewriteA->Kind];
  const auto RankB = AsmRewritePrecedence[AsmRewriteB->Kind];
  if (RankA > RankB)
    return -1;
  if (RankA < RankB)
    return 1;

  llvm_unreachable("Unstable rewrite sort.");
}
/// Parses a blob of MS-style inline assembly and rewrites it into \p AsmString.
///
/// The statements are parsed one by one. For every parsed instruction the
/// operands are classified as clobbers, outputs or inputs, and a rewrite is
/// recorded for each symbolic operand. Afterwards the main source buffer is
/// copied into \p AsmString with all recorded rewrites applied.
///
/// \param AsmLoc        Not referenced in this function body.
/// \param AsmString     Receives the rewritten assembly text.
/// \param NumOutputs    Receives the number of output operands found.
/// \param NumInputs     Receives the number of input operands found.
/// \param OpDecls       Receives (declaration, needs-address-of) pairs,
///                      outputs first, then inputs.
/// \param Constraints   Receives the constraint strings, in OpDecls order.
/// \param Clobbers      Receives the printed names of the unique clobbered
///                      registers.
/// \param MII           Used to look up the instruction description of each
///                      parsed opcode.
/// \param IP            Used to print register names for \p Clobbers.
/// \param SI            Passed through to parseStatement.
/// \returns true if a statement failed to parse, false on success.
bool AsmParser::parseMSInlineAsm(
void *AsmLoc, std::string &AsmString, unsigned &NumOutputs,
unsigned &NumInputs, SmallVectorImpl<std::pair<void *, bool>> &OpDecls,
SmallVectorImpl<std::string> &Constraints,
SmallVectorImpl<std::string> &Clobbers, const MCInstrInfo *MII,
const MCInstPrinter *IP, MCAsmParserSemaCallback &SI) {
SmallVector<void *, 4> InputDecls;
SmallVector<void *, 4> OutputDecls;
SmallVector<bool, 4> InputDeclsAddressOf;
SmallVector<bool, 4> OutputDeclsAddressOf;
SmallVector<std::string, 4> InputConstraints;
SmallVector<std::string, 4> OutputConstraints;
SmallVector<unsigned, 4> ClobberRegs;
SmallVector<AsmRewrite, 4> AsmStrRewrites;
// Prime the lexer.
Lex();
// While we have input, parse each statement.
// InputIdx/OutputIdx are the running $N numbers used when the string is
// rewritten further below.
unsigned InputIdx = 0;
unsigned OutputIdx = 0;
while (getLexer().isNot(AsmToken::Eof)) {
// Parse curly braces marking block start/end
if (parseCurlyBlockScope(AsmStrRewrites))
continue;
ParseStatementInfo Info(&AsmStrRewrites);
bool StatementErr = parseStatement(Info, &SI);
if (StatementErr || Info.ParseError) {
// Emit pending errors if any exist.
printPendingErrors();
return true;
}
// No pending error should exist here.
assert(!hasPendingError() && "unexpected error from parseStatement");
// Statements that recorded no opcode contribute no operands or clobbers.
if (Info.Opcode == ~0U)
continue;
const MCInstrDesc &Desc = MII->get(Info.Opcode);
// Build the list of clobbers, outputs and inputs.
// The scan starts at index 1; index 0 of ParsedOperands is not examined.
for (unsigned i = 1, e = Info.ParsedOperands.size(); i != e; ++i) {
MCParsedAsmOperand &Operand = *Info.ParsedOperands[i];
// Immediate.
if (Operand.isImm())
continue;
// Register operand.
if (Operand.isReg() && !Operand.needAddressOf() &&
!getTargetParser().OmitRegisterFromClobberLists(Operand.getReg())) {
unsigned NumDefs = Desc.getNumDefs();
// Clobber.
if (NumDefs && Operand.getMCOperandNum() < NumDefs)
ClobberRegs.push_back(Operand.getReg());
continue;
}
// Expr/Input or Output.
StringRef SymName = Operand.getSymName();
if (SymName.empty())
continue;
void *OpDecl = Operand.getOpDecl();
if (!OpDecl)
continue;
// Only the operand at index 1 can be an output, and only when the
// instruction may store.
bool isOutput = (i == 1) && Desc.mayStore();
SMLoc Start = SMLoc::getFromPointer(SymName.data());
if (isOutput) {
// Outputs are numbered before inputs in the rewritten string, so every
// output shifts the first input index by one.
++InputIdx;
OutputDecls.push_back(OpDecl);
OutputDeclsAddressOf.push_back(Operand.needAddressOf());
OutputConstraints.push_back(("=" + Operand.getConstraint()).str());
AsmStrRewrites.emplace_back(AOK_Output, Start, SymName.size());
} else {
InputDecls.push_back(OpDecl);
InputDeclsAddressOf.push_back(Operand.needAddressOf());
InputConstraints.push_back(Operand.getConstraint().str());
AsmStrRewrites.emplace_back(AOK_Input, Start, SymName.size());
}
}
// Consider implicit defs to be clobbers. Think of cpuid and push.
ArrayRef<MCPhysReg> ImpDefs(Desc.getImplicitDefs(),
Desc.getNumImplicitDefs());
ClobberRegs.insert(ClobberRegs.end(), ImpDefs.begin(), ImpDefs.end());
}
// Set the number of Outputs and Inputs.
NumOutputs = OutputDecls.size();
NumInputs = InputDecls.size();
// Set the unique clobbers.
// Sorting first lets std::unique drop every duplicate register.
array_pod_sort(ClobberRegs.begin(), ClobberRegs.end());
ClobberRegs.erase(std::unique(ClobberRegs.begin(), ClobberRegs.end()),
ClobberRegs.end());
Clobbers.assign(ClobberRegs.size(), std::string());
for (unsigned I = 0, E = ClobberRegs.size(); I != E; ++I) {
raw_string_ostream OS(Clobbers[I]);
IP->printRegName(OS, ClobberRegs[I]);
}
// Merge the various outputs and inputs. Output are expected first.
if (NumOutputs || NumInputs) {
unsigned NumExprs = NumOutputs + NumInputs;
OpDecls.resize(NumExprs);
Constraints.resize(NumExprs);
for (unsigned i = 0; i < NumOutputs; ++i) {
OpDecls[i] = std::make_pair(OutputDecls[i], OutputDeclsAddressOf[i]);
Constraints[i] = OutputConstraints[i];
}
for (unsigned i = 0, j = NumOutputs; i < NumInputs; ++i, ++j) {
OpDecls[j] = std::make_pair(InputDecls[i], InputDeclsAddressOf[i]);
Constraints[j] = InputConstraints[i];
}
}
// Build the IR assembly string.
std::string AsmStringIR;
raw_string_ostream OS(AsmStringIR);
StringRef ASMString =
SrcMgr.getMemoryBuffer(SrcMgr.getMainFileID())->getBuffer();
// AsmStart tracks how far the original text has been consumed so far.
const char *AsmStart = ASMString.begin();
const char *AsmEnd = ASMString.end();
// Apply the rewrites in source order (ties are resolved by rewritesSort).
array_pod_sort(AsmStrRewrites.begin(), AsmStrRewrites.end(), rewritesSort);
for (const AsmRewrite &AR : AsmStrRewrites) {
AsmRewriteKind Kind = AR.Kind;
const char *Loc = AR.Loc.getPointer();
assert(Loc >= AsmStart && "Expected Loc to be at or after Start!");
// Emit everything up to the immediate/expression.
if (unsigned Len = Loc - AsmStart)
OS << StringRef(AsmStart, Len);
// Skip the original expression.
if (Kind == AOK_Skip) {
AsmStart = Loc + AR.Len;
continue;
}
// Extra characters of original text to drop beyond AR.Len (AOK_Align only).
unsigned AdditionalSkip = 0;
// Rewrite expressions in $N notation.
switch (Kind) {
default:
break;
case AOK_IntelExpr:
assert(AR.IntelExp.isValid() && "cannot write invalid intel expression");
if (AR.IntelExp.NeedBracs)
OS << "[";
if (AR.IntelExp.hasBaseReg())
OS << AR.IntelExp.BaseReg;
if (AR.IntelExp.hasIndexReg())
OS << (AR.IntelExp.hasBaseReg() ? " + " : "")
<< AR.IntelExp.IndexReg;
if (AR.IntelExp.Scale > 1)
OS << " * $$" << AR.IntelExp.Scale;
if (AR.IntelExp.Imm || !AR.IntelExp.hasRegs())
OS << (AR.IntelExp.hasRegs() ? " + $$" : "$$") << AR.IntelExp.Imm;
if (AR.IntelExp.NeedBracs)
OS << "]";
break;
case AOK_Label:
OS << Ctx.getAsmInfo()->getPrivateLabelPrefix() << AR.Label;
break;
case AOK_Input:
OS << '$' << InputIdx++;
break;
case AOK_Output:
OS << '$' << OutputIdx++;
break;
case AOK_SizeDirective:
switch (AR.Val) {
default: break;
case 8: OS << "byte ptr "; break;
case 16: OS << "word ptr "; break;
case 32: OS << "dword ptr "; break;
case 64: OS << "qword ptr "; break;
case 80: OS << "xword ptr "; break;
case 128: OS << "xmmword ptr "; break;
case 256: OS << "ymmword ptr "; break;
}
break;
case AOK_Emit:
OS << ".byte";
break;
case AOK_Align: {
// MS alignment directives are measured in bytes. If the native assembler
// measures alignment in bytes, we can pass it straight through.
OS << ".align";
if (getContext().getAsmInfo()->getAlignmentIsInBytes())
break;
// Alignment is in log2 form, so print that instead and skip the original
// immediate.
unsigned Val = AR.Val;
OS << ' ' << Val;
assert(Val < 10 && "Expected alignment less then 2^10.");
// NOTE(review): presumably one separator character plus the decimal digits
// of the original byte value (1 digit up to 2^3, 2 up to 2^6, 3 up to 2^9).
AdditionalSkip = (Val < 4) ? 2 : Val < 7 ? 3 : 4;
break;
}
case AOK_EVEN:
OS << ".even";
break;
case AOK_EndOfStatement:
OS << "\n\t";
break;
}
// Skip the original expression.
AsmStart = Loc + AR.Len + AdditionalSkip;
}
// Emit the remainder of the asm string.
if (AsmStart != AsmEnd)
OS << StringRef(AsmStart, AsmEnd - AsmStart);
AsmString = OS.str();
return false;
}
namespace llvm {
namespace MCParserUtils {
/// Returns whether the given symbol is used anywhere in the given expression,
/// or subexpressions.
static bool isSymbolUsedInExpression(const MCSymbol *Sym, const MCExpr *Value) {
switch (Value->getKind()) {
case MCExpr::Binary: {
const MCBinaryExpr *BE = static_cast<const MCBinaryExpr *>(Value);
return isSymbolUsedInExpression(Sym, BE->getLHS()) ||
isSymbolUsedInExpression(Sym, BE->getRHS());
}
case MCExpr::Target:
case MCExpr::Constant:
return false;
case MCExpr::SymbolRef: {
const MCSymbol &S =
static_cast<const MCSymbolRefExpr *>(Value)->getSymbol();
if (S.isVariable())
return isSymbolUsedInExpression(Sym, S.getVariableValue());
return &S == Sym;
}
case MCExpr::Unary:
return isSymbolUsedInExpression(
Sym, static_cast<const MCUnaryExpr *>(Value)->getSubExpr());
}
llvm_unreachable("Unknown expr kind!");
}
/// Parses the right-hand side of an assignment to \p Name and validates that
/// \p Name may be (re)assigned.
///
/// \param Name        Name of the symbol being assigned.
/// \param allow_redef Whether redefinition of an existing variable is
///                    permitted; also stored on the resulting symbol.
/// \param Parser      Parser used for lexing, diagnostics, context and
///                    streamer access.
/// \param Sym         Receives the symbol looked up or created for \p Name.
///                    It is left null when \p Name is "." and no such symbol
///                    exists.
/// \param Value       Receives the parsed expression.
/// \returns true if an error was reported, false otherwise.
bool parseAssignmentExpression(StringRef Name, bool allow_redef,
MCAsmParser &Parser, MCSymbol *&Sym,
const MCExpr *&Value) {
// FIXME: Use better location, we should use proper tokens.
SMLoc EqualLoc = Parser.getTok().getLoc();
if (Parser.parseExpression(Value))
return Parser.TokError("missing expression");
// Note: we don't count b as used in "a = b". This is to allow
// a = b
// b = c
if (Parser.parseToken(AsmToken::EndOfStatement))
return true;
// Validate that the LHS is allowed to be a variable (either it has not been
// used as a symbol, or it is an absolute symbol).
Sym = Parser.getContext().lookupSymbol(Name);
if (Sym) {
// Diagnose assignment to a label.
//
// FIXME: Diagnostics. Note the location of the definition as a label.
// FIXME: Diagnose assignment to protected identifier (e.g., register name).
// The branches below are tried in order; the two empty statements are the
// accepted cases and fall through to setRedefinable().
// NOTE(review): the later isUndefined() call omits the SetUsed argument
// passed in the second branch; presumably it marks the symbol as used --
// confirm before reordering these branches.
if (isSymbolUsedInExpression(Sym, Value))
return Parser.Error(EqualLoc, "Recursive use of '" + Name + "'");
else if (Sym->isUndefined(/*SetUsed*/ false) && !Sym->isUsed() &&
!Sym->isVariable())
; // Allow redefinitions of undefined symbols only used in directives.
else if (Sym->isVariable() && !Sym->isUsed() && allow_redef)
; // Allow redefinitions of variables that haven't yet been used.
else if (!Sym->isUndefined() && (!Sym->isVariable() || !allow_redef))
return Parser.Error(EqualLoc, "redefinition of '" + Name + "'");
else if (!Sym->isVariable())
return Parser.Error(EqualLoc, "invalid assignment to '" + Name + "'");
else if (!isa<MCConstantExpr>(Sym->getVariableValue()))
return Parser.Error(EqualLoc,
"invalid reassignment of non-absolute variable '" +
Name + "'");
} else if (Name == ".") {
// Assignment to "." is emitted directly as an offset; no symbol is created.
Parser.getStreamer().emitValueToOffset(Value, 0, EqualLoc);
return false;
} else
Sym = Parser.getContext().getOrCreateSymbol(Name);
Sym->setRedefinable(allow_redef);
return false;
}
} // end namespace MCParserUtils
} // end namespace llvm
/// Factory for the generic assembly parser.
///
/// Allocates a new AsmParser from the given arguments and returns it through
/// the MCAsmParser interface. The object is created with `new`.
MCAsmParser *llvm::createMCAsmParser(SourceMgr &SM, MCContext &C,
                                     MCStreamer &Out, const MCAsmInfo &MAI,
                                     unsigned CB) {
  MCAsmParser *Parser = new AsmParser(SM, C, Out, MAI, CB);
  return Parser;
}
Index: vendor/llvm/dist-release_90/lib/Object/RelocationResolver.cpp
===================================================================
--- vendor/llvm/dist-release_90/lib/Object/RelocationResolver.cpp (revision 351302)
+++ vendor/llvm/dist-release_90/lib/Object/RelocationResolver.cpp (revision 351303)
@@ -1,550 +1,550 @@
//===- RelocationResolver.cpp ------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines utilities to resolve relocations in object files.
//
//===----------------------------------------------------------------------===//
#include "llvm/Object/RelocationResolver.h"
namespace llvm {
namespace object {
/// Reads the addend of the ELF relocation \p R.
///
/// Any error produced while reading the addend is turned into a fatal error
/// carrying the error's message.
static int64_t getELFAddend(RelocationRef R) {
  const ELFRelocationRef ELFReloc(R);
  Expected<int64_t> MaybeAddend = ELFReloc.getAddend();
  handleAllErrors(MaybeAddend.takeError(), [](const ErrorInfoBase &Info) {
    report_fatal_error(Info.message());
  });
  return *MaybeAddend;
}
/// Returns true if \p Type is an ELF x86-64 relocation type that
/// resolveX86_64 handles.
static bool supportsX86_64(uint64_t Type) {
  return Type == ELF::R_X86_64_NONE || Type == ELF::R_X86_64_64 ||
         Type == ELF::R_X86_64_DTPOFF32 || Type == ELF::R_X86_64_DTPOFF64 ||
         Type == ELF::R_X86_64_PC32 || Type == ELF::R_X86_64_32 ||
         Type == ELF::R_X86_64_32S;
}
/// Computes the value of the ELF x86-64 relocation \p R from \p S and \p A.
///
/// NONE yields \p A unchanged. The other types add the relocation's ELF
/// addend to \p S; PC32 additionally subtracts the relocation offset and the
/// 32-bit types keep only the low 32 bits.
static uint64_t resolveX86_64(RelocationRef R, uint64_t S, uint64_t A) {
  const auto Kind = R.getType();
  if (Kind == ELF::R_X86_64_NONE)
    return A;
  if (Kind == ELF::R_X86_64_64 || Kind == ELF::R_X86_64_DTPOFF32 ||
      Kind == ELF::R_X86_64_DTPOFF64)
    return S + getELFAddend(R);
  if (Kind == ELF::R_X86_64_PC32)
    return S + getELFAddend(R) - R.getOffset();
  if (Kind == ELF::R_X86_64_32 || Kind == ELF::R_X86_64_32S)
    return (S + getELFAddend(R)) & 0xFFFFFFFF;
  llvm_unreachable("Invalid relocation type");
}
/// Returns true if \p Type is an ELF AArch64 relocation type that
/// resolveAArch64 handles.
static bool supportsAArch64(uint64_t Type) {
  return Type == ELF::R_AARCH64_ABS32 || Type == ELF::R_AARCH64_ABS64;
}
/// Computes the value of the ELF AArch64 relocation \p R: \p S plus the ELF
/// addend, truncated to 32 bits for ABS32.
static uint64_t resolveAArch64(RelocationRef R, uint64_t S, uint64_t A) {
  const auto Kind = R.getType();
  if (Kind == ELF::R_AARCH64_ABS32)
    return (S + getELFAddend(R)) & 0xFFFFFFFF;
  if (Kind == ELF::R_AARCH64_ABS64)
    return S + getELFAddend(R);
  llvm_unreachable("Invalid relocation type");
}
/// Returns true if \p Type is an ELF BPF relocation type that resolveBPF
/// handles.
static bool supportsBPF(uint64_t Type) {
  return Type == ELF::R_BPF_64_32 || Type == ELF::R_BPF_64_64;
}
/// Computes the value of the ELF BPF relocation \p R from \p S and \p A,
/// keeping only the low 32 bits for R_BPF_64_32.
// NOTE(review): this function is shown as a diff hunk. The '-' lines return a
// value derived from S alone; the '+' lines that replace them add A to S.
static uint64_t resolveBPF(RelocationRef R, uint64_t S, uint64_t A) {
switch (R.getType()) {
case ELF::R_BPF_64_32:
- return S & 0xFFFFFFFF;
+ return (S + A) & 0xFFFFFFFF;
case ELF::R_BPF_64_64:
- return S;
+ return S + A;
default:
llvm_unreachable("Invalid relocation type");
}
}
/// Returns true if \p Type is an ELF MIPS64 relocation type that
/// resolveMips64 handles.
static bool supportsMips64(uint64_t Type) {
  return Type == ELF::R_MIPS_32 || Type == ELF::R_MIPS_64 ||
         Type == ELF::R_MIPS_TLS_DTPREL64;
}
/// Computes the value of the ELF MIPS64 relocation \p R: \p S plus the ELF
/// addend, truncated to 32 bits for R_MIPS_32 and reduced by 0x8000 for
/// R_MIPS_TLS_DTPREL64.
static uint64_t resolveMips64(RelocationRef R, uint64_t S, uint64_t A) {
  const auto Kind = R.getType();
  if (Kind == ELF::R_MIPS_32)
    return (S + getELFAddend(R)) & 0xFFFFFFFF;
  if (Kind == ELF::R_MIPS_64)
    return S + getELFAddend(R);
  if (Kind == ELF::R_MIPS_TLS_DTPREL64)
    return S + getELFAddend(R) - 0x8000;
  llvm_unreachable("Invalid relocation type");
}
/// Returns true if \p Type is an ELF PPC64 relocation type that resolvePPC64
/// handles.
static bool supportsPPC64(uint64_t Type) {
  return Type == ELF::R_PPC64_ADDR32 || Type == ELF::R_PPC64_ADDR64;
}
/// Computes the value of the ELF PPC64 relocation \p R: \p S plus the ELF
/// addend, truncated to 32 bits for ADDR32.
static uint64_t resolvePPC64(RelocationRef R, uint64_t S, uint64_t A) {
  const auto Kind = R.getType();
  if (Kind == ELF::R_PPC64_ADDR32)
    return (S + getELFAddend(R)) & 0xFFFFFFFF;
  if (Kind == ELF::R_PPC64_ADDR64)
    return S + getELFAddend(R);
  llvm_unreachable("Invalid relocation type");
}
/// Returns true if \p Type is an ELF SystemZ relocation type that
/// resolveSystemZ handles.
static bool supportsSystemZ(uint64_t Type) {
  return Type == ELF::R_390_32 || Type == ELF::R_390_64;
}
/// Computes the value of the ELF SystemZ relocation \p R: \p S plus the ELF
/// addend, truncated to 32 bits for R_390_32.
static uint64_t resolveSystemZ(RelocationRef R, uint64_t S, uint64_t A) {
  const auto Kind = R.getType();
  if (Kind == ELF::R_390_32)
    return (S + getELFAddend(R)) & 0xFFFFFFFF;
  if (Kind == ELF::R_390_64)
    return S + getELFAddend(R);
  llvm_unreachable("Invalid relocation type");
}
/// Returns true if \p Type is an ELF SPARC64 relocation type that
/// resolveSparc64 handles.
static bool supportsSparc64(uint64_t Type) {
  return Type == ELF::R_SPARC_32 || Type == ELF::R_SPARC_64 ||
         Type == ELF::R_SPARC_UA32 || Type == ELF::R_SPARC_UA64;
}
/// Computes the value of the ELF SPARC64 relocation \p R: \p S plus the ELF
/// addend for every supported type (no truncation is applied).
static uint64_t resolveSparc64(RelocationRef R, uint64_t S, uint64_t A) {
  const auto Kind = R.getType();
  const bool Known = Kind == ELF::R_SPARC_32 || Kind == ELF::R_SPARC_64 ||
                     Kind == ELF::R_SPARC_UA32 || Kind == ELF::R_SPARC_UA64;
  if (Known)
    return S + getELFAddend(R);
  llvm_unreachable("Invalid relocation type");
}
/// Returns true if \p Type is an ELF AMDGPU relocation type that
/// resolveAmdgpu handles.
static bool supportsAmdgpu(uint64_t Type) {
  return Type == ELF::R_AMDGPU_ABS32 || Type == ELF::R_AMDGPU_ABS64;
}
/// Computes the value of the ELF AMDGPU relocation \p R: \p S plus the ELF
/// addend for both supported types.
static uint64_t resolveAmdgpu(RelocationRef R, uint64_t S, uint64_t A) {
  const auto Kind = R.getType();
  if (Kind == ELF::R_AMDGPU_ABS32 || Kind == ELF::R_AMDGPU_ABS64)
    return S + getELFAddend(R);
  llvm_unreachable("Invalid relocation type");
}
/// Returns true if \p Type is an ELF i386 relocation type that resolveX86
/// handles.
static bool supportsX86(uint64_t Type) {
  return Type == ELF::R_386_NONE || Type == ELF::R_386_32 ||
         Type == ELF::R_386_PC32;
}
/// Computes the value of the ELF i386 relocation \p R from \p S and \p A.
///
/// NONE yields \p A, R_386_32 yields S + A, and R_386_PC32 additionally
/// subtracts the relocation offset. The ELF addend is not consulted.
static uint64_t resolveX86(RelocationRef R, uint64_t S, uint64_t A) {
  const auto Kind = R.getType();
  if (Kind == ELF::R_386_NONE)
    return A;
  if (Kind == ELF::R_386_32)
    return S + A;
  if (Kind == ELF::R_386_PC32)
    return S - R.getOffset() + A;
  llvm_unreachable("Invalid relocation type");
}
/// Returns true if \p Type is the one ELF PPC32 relocation type that
/// resolvePPC32 handles.
static bool supportsPPC32(uint64_t Type) {
  switch (Type) {
  case ELF::R_PPC_ADDR32:
    return true;
  default:
    return false;
  }
}
/// Computes the value of the ELF PPC32 relocation \p R: the low 32 bits of
/// \p S plus the ELF addend.
static uint64_t resolvePPC32(RelocationRef R, uint64_t S, uint64_t A) {
  switch (R.getType()) {
  case ELF::R_PPC_ADDR32:
    return (S + getELFAddend(R)) & 0xFFFFFFFF;
  default:
    llvm_unreachable("Invalid relocation type");
  }
}
/// Returns true if \p Type is the one ELF ARM relocation type that resolveARM
/// handles.
static bool supportsARM(uint64_t Type) {
  switch (Type) {
  case ELF::R_ARM_ABS32:
    return true;
  default:
    return false;
  }
}
/// Computes the value of the ELF ARM relocation \p R: the low 32 bits of
/// S + A. The ELF addend is not consulted.
static uint64_t resolveARM(RelocationRef R, uint64_t S, uint64_t A) {
  switch (R.getType()) {
  case ELF::R_ARM_ABS32:
    return (S + A) & 0xFFFFFFFF;
  default:
    llvm_unreachable("Invalid relocation type");
  }
}
/// Returns true if \p Type is an ELF AVR relocation type that resolveAVR
/// handles.
static bool supportsAVR(uint64_t Type) {
  return Type == ELF::R_AVR_16 || Type == ELF::R_AVR_32;
}
/// Computes the value of the ELF AVR relocation \p R: \p S plus the ELF
/// addend, masked to 16 bits for R_AVR_16 and to 32 bits for R_AVR_32.
static uint64_t resolveAVR(RelocationRef R, uint64_t S, uint64_t A) {
  const auto Kind = R.getType();
  if (Kind == ELF::R_AVR_16)
    return (S + getELFAddend(R)) & 0xFFFF;
  if (Kind == ELF::R_AVR_32)
    return (S + getELFAddend(R)) & 0xFFFFFFFF;
  llvm_unreachable("Invalid relocation type");
}
/// Returns true if \p Type is the one ELF Lanai relocation type that
/// resolveLanai handles.
static bool supportsLanai(uint64_t Type) {
  switch (Type) {
  case ELF::R_LANAI_32:
    return true;
  default:
    return false;
  }
}
/// Computes the value of the ELF Lanai relocation \p R: the low 32 bits of
/// \p S plus the ELF addend.
static uint64_t resolveLanai(RelocationRef R, uint64_t S, uint64_t A) {
  switch (R.getType()) {
  case ELF::R_LANAI_32:
    return (S + getELFAddend(R)) & 0xFFFFFFFF;
  default:
    llvm_unreachable("Invalid relocation type");
  }
}
/// Returns true if \p Type is an ELF MIPS32 relocation type that
/// resolveMips32 handles.
static bool supportsMips32(uint64_t Type) {
  return Type == ELF::R_MIPS_32 || Type == ELF::R_MIPS_TLS_DTPREL32;
}
/// Computes the value of the ELF MIPS32 relocation \p R: the low 32 bits of
/// S + A for both supported types. The ELF addend is not consulted.
static uint64_t resolveMips32(RelocationRef R, uint64_t S, uint64_t A) {
  // FIXME: Take in account implicit addends to get correct results.
  const uint32_t Kind = R.getType();
  const bool Known =
      Kind == ELF::R_MIPS_32 || Kind == ELF::R_MIPS_TLS_DTPREL32;
  if (Known)
    return (S + A) & 0xFFFFFFFF;
  llvm_unreachable("Invalid relocation type");
}
/// Returns true if \p Type is an ELF SPARC32 relocation type that
/// resolveSparc32 handles.
static bool supportsSparc32(uint64_t Type) {
  return Type == ELF::R_SPARC_32 || Type == ELF::R_SPARC_UA32;
}
/// Computes the value of the ELF SPARC32 relocation \p R: \p S plus the ELF
/// addend for R_SPARC_32 and R_SPARC_UA32. Any other type yields \p A
/// unchanged rather than being treated as unreachable.
static uint64_t resolveSparc32(RelocationRef R, uint64_t S, uint64_t A) {
  const uint32_t Kind = R.getType();
  switch (Kind) {
  case ELF::R_SPARC_32:
  case ELF::R_SPARC_UA32:
    return S + getELFAddend(R);
  default:
    return A;
  }
}
/// Returns true if \p Type is the one ELF Hexagon relocation type that
/// resolveHexagon handles.
static bool supportsHexagon(uint64_t Type) {
  switch (Type) {
  case ELF::R_HEX_32:
    return true;
  default:
    return false;
  }
}
/// Computes the value of the ELF Hexagon relocation \p R: \p S plus the ELF
/// addend (no truncation is applied).
static uint64_t resolveHexagon(RelocationRef R, uint64_t S, uint64_t A) {
  switch (R.getType()) {
  case ELF::R_HEX_32:
    return S + getELFAddend(R);
  default:
    llvm_unreachable("Invalid relocation type");
  }
}
/// Returns true if \p Type is an ELF RISC-V relocation type that resolveRISCV
/// handles.
static bool supportsRISCV(uint64_t Type) {
  bool Supported = false;
  switch (Type) {
  case ELF::R_RISCV_NONE:
  case ELF::R_RISCV_32:
  case ELF::R_RISCV_64:
  case ELF::R_RISCV_ADD8:
  case ELF::R_RISCV_SUB8:
  case ELF::R_RISCV_ADD16:
  case ELF::R_RISCV_SUB16:
  case ELF::R_RISCV_ADD32:
  case ELF::R_RISCV_SUB32:
  case ELF::R_RISCV_ADD64:
  case ELF::R_RISCV_SUB64:
    Supported = true;
    break;
  default:
    break;
  }
  return Supported;
}
/// Computes the value of the ELF RISC-V relocation \p R from \p S and \p A.
///
/// NONE yields \p A. R_RISCV_32/64 yield \p S plus the ELF addend. The
/// ADD*/SUB* types add that sum to, or subtract it from, \p A. Results are
/// masked to the width named in the relocation type.
static uint64_t resolveRISCV(RelocationRef R, uint64_t S, uint64_t A) {
  // The addend is read up front for every type, including NONE.
  const int64_t Addend = getELFAddend(R);
  const uint64_t Target = S + Addend;
  const uint64_t Sum = A + Target;
  const uint64_t Diff = A - Target;

  switch (R.getType()) {
  case ELF::R_RISCV_NONE:
    return A;
  case ELF::R_RISCV_32:
    return Target & 0xFFFFFFFF;
  case ELF::R_RISCV_64:
    return Target;
  case ELF::R_RISCV_ADD8:
    return Sum & 0xFF;
  case ELF::R_RISCV_SUB8:
    return Diff & 0xFF;
  case ELF::R_RISCV_ADD16:
    return Sum & 0xFFFF;
  case ELF::R_RISCV_SUB16:
    return Diff & 0xFFFF;
  case ELF::R_RISCV_ADD32:
    return Sum & 0xFFFFFFFF;
  case ELF::R_RISCV_SUB32:
    return Diff & 0xFFFFFFFF;
  case ELF::R_RISCV_ADD64:
    return Sum;
  case ELF::R_RISCV_SUB64:
    return Diff;
  default:
    llvm_unreachable("Invalid relocation type");
  }
}
/// Returns true if \p Type is a COFF i386 relocation type that resolveCOFFX86
/// handles.
static bool supportsCOFFX86(uint64_t Type) {
  return Type == COFF::IMAGE_REL_I386_SECREL ||
         Type == COFF::IMAGE_REL_I386_DIR32;
}
/// Computes the value of the COFF i386 relocation \p R: the low 32 bits of
/// S + A for both supported types.
static uint64_t resolveCOFFX86(RelocationRef R, uint64_t S, uint64_t A) {
  const auto Kind = R.getType();
  if (Kind == COFF::IMAGE_REL_I386_SECREL ||
      Kind == COFF::IMAGE_REL_I386_DIR32)
    return (S + A) & 0xFFFFFFFF;
  llvm_unreachable("Invalid relocation type");
}
/// Returns true if \p Type is a COFF x86-64 relocation type that
/// resolveCOFFX86_64 handles.
static bool supportsCOFFX86_64(uint64_t Type) {
  return Type == COFF::IMAGE_REL_AMD64_SECREL ||
         Type == COFF::IMAGE_REL_AMD64_ADDR64;
}
/// Computes the value of the COFF x86-64 relocation \p R: S + A, truncated to
/// 32 bits for SECREL.
static uint64_t resolveCOFFX86_64(RelocationRef R, uint64_t S, uint64_t A) {
  const auto Kind = R.getType();
  if (Kind == COFF::IMAGE_REL_AMD64_SECREL)
    return (S + A) & 0xFFFFFFFF;
  if (Kind == COFF::IMAGE_REL_AMD64_ADDR64)
    return S + A;
  llvm_unreachable("Invalid relocation type");
}
/// Returns true if \p Type is the one Mach-O x86-64 relocation type that
/// resolveMachOX86_64 handles.
static bool supportsMachOX86_64(uint64_t Type) {
  switch (Type) {
  case MachO::X86_64_RELOC_UNSIGNED:
    return true;
  default:
    return false;
  }
}
/// Computes the value of the Mach-O x86-64 relocation \p R, which is \p S
/// itself; \p A is not used.
static uint64_t resolveMachOX86_64(RelocationRef R, uint64_t S, uint64_t A) {
  switch (R.getType()) {
  case MachO::X86_64_RELOC_UNSIGNED:
    return S;
  default:
    llvm_unreachable("Invalid relocation type");
  }
}
/// Returns true if \p Type is a wasm relocation type that resolveWasm32
/// handles.
static bool supportsWasm32(uint64_t Type) {
  bool Supported = false;
  switch (Type) {
  case wasm::R_WASM_FUNCTION_INDEX_LEB:
  case wasm::R_WASM_TABLE_INDEX_SLEB:
  case wasm::R_WASM_TABLE_INDEX_I32:
  case wasm::R_WASM_MEMORY_ADDR_LEB:
  case wasm::R_WASM_MEMORY_ADDR_SLEB:
  case wasm::R_WASM_MEMORY_ADDR_I32:
  case wasm::R_WASM_TYPE_INDEX_LEB:
  case wasm::R_WASM_GLOBAL_INDEX_LEB:
  case wasm::R_WASM_FUNCTION_OFFSET_I32:
  case wasm::R_WASM_SECTION_OFFSET_I32:
  case wasm::R_WASM_EVENT_INDEX_LEB:
    Supported = true;
    break;
  default:
    break;
  }
  return Supported;
}
/// Computes the value of the wasm relocation \p R, which is \p A for every
/// supported type; \p S is not used.
static uint64_t resolveWasm32(RelocationRef R, uint64_t S, uint64_t A) {
  switch (R.getType()) {
  case wasm::R_WASM_FUNCTION_INDEX_LEB:
  case wasm::R_WASM_TABLE_INDEX_SLEB:
  case wasm::R_WASM_TABLE_INDEX_I32:
  case wasm::R_WASM_MEMORY_ADDR_LEB:
  case wasm::R_WASM_MEMORY_ADDR_SLEB:
  case wasm::R_WASM_MEMORY_ADDR_I32:
  case wasm::R_WASM_TYPE_INDEX_LEB:
  case wasm::R_WASM_GLOBAL_INDEX_LEB:
  case wasm::R_WASM_FUNCTION_OFFSET_I32:
  case wasm::R_WASM_SECTION_OFFSET_I32:
  case wasm::R_WASM_EVENT_INDEX_LEB:
    break;
  default:
    llvm_unreachable("Invalid relocation type");
  }
  // For wasm section, its offset at 0 -- ignoring Value
  return A;
}
std::pair<bool (*)(uint64_t), RelocationResolver>
getRelocationResolver(const ObjectFile &Obj) {
if (Obj.isCOFF()) {
if (Obj.getBytesInAddress() == 8)
return {supportsCOFFX86_64, resolveCOFFX86_64};
return {supportsCOFFX86, resolveCOFFX86};
} else if (Obj.isELF()) {
if (Obj.getBytesInAddress() == 8) {
switch (Obj.getArch()) {
case Triple::x86_64:
return {supportsX86_64, resolveX86_64};
case Triple::aarch64:
case Triple::aarch64_be:
return {supportsAArch64, resolveAArch64};
case Triple::bpfel:
case Triple::bpfeb:
return {supportsBPF, resolveBPF};
case Triple::mips64el:
case Triple::mips64:
return {supportsMips64, resolveMips64};
case Triple::ppc64le:
case Triple::ppc64:
return {supportsPPC64, resolvePPC64};
case Triple::systemz:
return {supportsSystemZ, resolveSystemZ};
case Triple::sparcv9:
return {supportsSparc64, resolveSparc64};
case Triple::amdgcn:
return {supportsAmdgpu, resolveAmdgpu};
case Triple::riscv64:
return {supportsRISCV, resolveRISCV};
default:
return {nullptr, nullptr};
}
}
// 32-bit object file
assert(Obj.getBytesInAddress() == 4 &&
"Invalid word size in object file");
switch (Obj.getArch()) {
case Triple::x86:
return {supportsX86, resolveX86};
case Triple::ppc:
return {supportsPPC32, resolvePPC32};
case Triple::arm:
case Triple::armeb:
return {supportsARM, resolveARM};
case Triple::avr:
return {supportsAVR, resolveAVR};
case Triple::lanai:
return {supportsLanai, resolveLanai};
case Triple::mipsel:
case Triple::mips:
return {supportsMips32, resolveMips32};
case Triple::sparc:
return {supportsSparc32, resolveSparc32};
case Triple::hexagon:
return {supportsHexagon, resolveHexagon};
case Triple::riscv32:
return {supportsRISCV, resolveRISCV};
default:
return {nullptr, nullptr};
}
} else if (Obj.isMachO()) {
if (Obj.getArch() == Triple::x86_64)
return {supportsMachOX86_64, resolveMachOX86_64};
return {nullptr, nullptr};
} else if (Obj.isWasm()) {
if (Obj.getArch() == Triple::wasm32)
return {supportsWasm32, resolveWasm32};
return {nullptr, nullptr};
}
llvm_unreachable("Invalid object file");
}
} // namespace object
} // namespace llvm
Index: vendor/llvm/dist-release_90/lib/Support/AArch64TargetParser.cpp
===================================================================
--- vendor/llvm/dist-release_90/lib/Support/AArch64TargetParser.cpp (revision 351302)
+++ vendor/llvm/dist-release_90/lib/Support/AArch64TargetParser.cpp (revision 351303)
@@ -1,215 +1,215 @@
//===-- AArch64TargetParser - Parser for AArch64 features -------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a target parser to recognise AArch64 hardware features
// such as FPU/CPU/ARCH and extension names.
//
//===----------------------------------------------------------------------===//
#include "llvm/Support/AArch64TargetParser.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include <cctype>
using namespace llvm;
/// Extracts the single-digit architecture version from a name of the form
/// "v<digit>...".
///
/// \returns the digit following the leading 'v' as a number, or 0 when
/// \p Arch is shorter than two characters, does not start with 'v', or has no
/// decimal digit in second position.
static unsigned checkArchVersion(llvm::StringRef Arch) {
  // std::isdigit has undefined behaviour for negative values other than EOF,
  // so the (possibly signed) char must go through unsigned char first.
  if (Arch.size() >= 2 && Arch[0] == 'v' &&
      std::isdigit(static_cast<unsigned char>(Arch[1])))
    return Arch[1] - '0';
  return 0;
}
/// Returns the default FPU for \p CPU.
///
/// For the CPU name "generic" the value comes from the AArch64ARCHNames entry
/// indexed by \p AK. For any other name the CPU table in
/// AArch64TargetParser.def is consulted; an unknown name yields
/// ARM::FK_INVALID.
unsigned AArch64::getDefaultFPU(StringRef CPU, AArch64::ArchKind AK) {
if (CPU == "generic")
return AArch64ARCHNames[static_cast<unsigned>(AK)].DefaultFPU;
// The .def include below expands to one .Case(...) per AARCH64_CPU_NAME entry.
return StringSwitch<unsigned>(CPU)
#define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \
.Case(NAME, ARM::DEFAULT_FPU)
#include "../../include/llvm/Support/AArch64TargetParser.def"
.Default(ARM::FK_INVALID);
}
/// Returns the default extension bit mask for \p CPU.
///
/// For the CPU name "generic" the mask is the ArchBaseExtensions of the
/// AArch64ARCHNames entry indexed by \p AK. For any other name the CPU table
/// in AArch64TargetParser.def is consulted and the result is the base
/// extensions of that CPU's architecture OR-ed with the CPU's own default
/// extensions; an unknown name yields AArch64::AEK_INVALID.
unsigned AArch64::getDefaultExtensions(StringRef CPU, AArch64::ArchKind AK) {
if (CPU == "generic")
return AArch64ARCHNames[static_cast<unsigned>(AK)].ArchBaseExtensions;
// The .def include below expands to one .Case(...) per AARCH64_CPU_NAME entry.
return StringSwitch<unsigned>(CPU)
#define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \
.Case(NAME, AArch64ARCHNames[static_cast<unsigned>(ArchKind::ID)] \
.ArchBaseExtensions | \
DEFAULT_EXT)
#include "../../include/llvm/Support/AArch64TargetParser.def"
.Default(AArch64::AEK_INVALID);
}
/// Returns the architecture kind of \p CPU.
///
/// The CPU name "generic" maps to ArchKind::ARMV8A. Any other name is looked
/// up in the CPU table in AArch64TargetParser.def; an unknown name yields
/// ArchKind::INVALID.
AArch64::ArchKind AArch64::getCPUArchKind(StringRef CPU) {
if (CPU == "generic")
return ArchKind::ARMV8A;
// The .def include below expands to one .Case(...) per AARCH64_CPU_NAME entry.
return StringSwitch<AArch64::ArchKind>(CPU)
#define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \
.Case(NAME, ArchKind::ID)
#include "../../include/llvm/Support/AArch64TargetParser.def"
.Default(ArchKind::INVALID);
}
/// Appends the "+feature" string of every extension bit set in \p Extensions
/// to \p Features.
///
/// \returns false, without appending anything, when \p Extensions equals
/// AArch64::AEK_INVALID; true otherwise.
// NOTE(review): this function is shown as a diff hunk. The '-' lines test
// AEK_BITPERM and push "+bitperm"; the '+' lines that replace them test
// AEK_SVE2BITPERM and push "+sve2-bitperm".
bool AArch64::getExtensionFeatures(unsigned Extensions,
std::vector<StringRef> &Features) {
if (Extensions == AArch64::AEK_INVALID)
return false;
if (Extensions & AEK_FP)
Features.push_back("+fp-armv8");
if (Extensions & AEK_SIMD)
Features.push_back("+neon");
if (Extensions & AEK_CRC)
Features.push_back("+crc");
if (Extensions & AEK_CRYPTO)
Features.push_back("+crypto");
if (Extensions & AEK_DOTPROD)
Features.push_back("+dotprod");
if (Extensions & AEK_FP16FML)
Features.push_back("+fp16fml");
if (Extensions & AEK_FP16)
Features.push_back("+fullfp16");
if (Extensions & AEK_PROFILE)
Features.push_back("+spe");
if (Extensions & AEK_RAS)
Features.push_back("+ras");
if (Extensions & AEK_LSE)
Features.push_back("+lse");
if (Extensions & AEK_RDM)
Features.push_back("+rdm");
if (Extensions & AEK_SVE)
Features.push_back("+sve");
if (Extensions & AEK_SVE2)
Features.push_back("+sve2");
if (Extensions & AEK_SVE2AES)
Features.push_back("+sve2-aes");
if (Extensions & AEK_SVE2SM4)
Features.push_back("+sve2-sm4");
if (Extensions & AEK_SVE2SHA3)
Features.push_back("+sve2-sha3");
- if (Extensions & AEK_BITPERM)
- Features.push_back("+bitperm");
+ if (Extensions & AEK_SVE2BITPERM)
+ Features.push_back("+sve2-bitperm");
if (Extensions & AEK_RCPC)
Features.push_back("+rcpc");
return true;
}
/// Appends the "+v8.Na" feature string matching \p AK to \p Features.
///
/// Nothing is appended for architecture kinds other than ARMV8_1A through
/// ARMV8_5A.
/// \returns false when \p AK is ArchKind::INVALID, true otherwise.
bool AArch64::getArchFeatures(AArch64::ArchKind AK,
                              std::vector<StringRef> &Features) {
  switch (AK) {
  case ArchKind::ARMV8_1A:
    Features.push_back("+v8.1a");
    break;
  case ArchKind::ARMV8_2A:
    Features.push_back("+v8.2a");
    break;
  case ArchKind::ARMV8_3A:
    Features.push_back("+v8.3a");
    break;
  case ArchKind::ARMV8_4A:
    Features.push_back("+v8.4a");
    break;
  case ArchKind::ARMV8_5A:
    Features.push_back("+v8.5a");
    break;
  default:
    break;
  }
  return AK != ArchKind::INVALID;
}
/// Returns the name stored in the AArch64ARCHNames entry indexed by \p AK.
StringRef AArch64::getArchName(AArch64::ArchKind AK) {
  const auto &Entry = AArch64ARCHNames[static_cast<unsigned>(AK)];
  return Entry.getName();
}
/// Returns the CPU attribute string stored in the AArch64ARCHNames entry
/// indexed by \p AK.
StringRef AArch64::getCPUAttr(AArch64::ArchKind AK) {
  const auto &Entry = AArch64ARCHNames[static_cast<unsigned>(AK)];
  return Entry.getCPUAttr();
}
/// Returns the sub-architecture string stored in the AArch64ARCHNames entry
/// indexed by \p AK.
StringRef AArch64::getSubArch(AArch64::ArchKind AK) {
  const auto &Entry = AArch64ARCHNames[static_cast<unsigned>(AK)];
  return Entry.getSubArch();
}
/// Returns the ArchAttr field of the AArch64ARCHNames entry indexed by
/// \p AK, converted to unsigned.
unsigned AArch64::getArchAttr(AArch64::ArchKind AK) {
  const auto &Entry = AArch64ARCHNames[static_cast<unsigned>(AK)];
  return Entry.ArchAttr;
}
/// Returns the name of the first AArch64ARCHExtNames entry whose ID equals
/// \p ArchExtKind, or an empty StringRef when there is none.
StringRef AArch64::getArchExtName(unsigned ArchExtKind) {
  for (const auto &Entry : AArch64ARCHExtNames) {
    if (ArchExtKind != Entry.ID)
      continue;
    return Entry.getName();
  }
  return StringRef();
}
/// Maps an extension name to its feature string.
///
/// A name starting with "no" is first tried as the negated form of an
/// extension that has a negative feature string. If that finds nothing, or
/// the name has no "no" prefix, the full name is matched against extensions
/// that have a positive feature string.
/// \returns the matching feature string, or an empty StringRef.
StringRef AArch64::getArchExtFeature(StringRef ArchExt) {
  if (ArchExt.startswith("no")) {
    const StringRef Positive(ArchExt.substr(2));
    for (const auto &Entry : AArch64ARCHExtNames) {
      if (!Entry.NegFeature)
        continue;
      if (Positive == Entry.getName())
        return StringRef(Entry.NegFeature);
    }
  }

  for (const auto &Entry : AArch64ARCHExtNames) {
    if (!Entry.Feature)
      continue;
    if (ArchExt == Entry.getName())
      return StringRef(Entry.Feature);
  }
  return StringRef();
}
/// Returns the default CPU name for the architecture named \p Arch.
///
/// \returns an empty StringRef when \p Arch does not parse to a valid
/// architecture kind, the name of the first CPU flagged as default for that
/// kind, or "generic" when no CPU is flagged.
StringRef AArch64::getDefaultCPU(StringRef Arch) {
  const ArchKind Kind = parseArch(Arch);
  if (Kind == ArchKind::INVALID)
    return StringRef();

  // Look for multiple AKs to find the default for pair AK+Name.
  for (const auto &Candidate : AArch64CPUNames) {
    const bool IsDefaultForKind = Candidate.ArchID == Kind && Candidate.Default;
    if (IsDefaultForKind)
      return Candidate.getName();
  }

  // Without a default CPU the architecture itself is targeted.
  return "generic";
}
/// Appends to \p Values the name of every AArch64CPUNames entry whose
/// architecture kind is not ArchKind::INVALID.
void AArch64::fillValidCPUArchList(SmallVectorImpl<StringRef> &Values) {
  for (const auto &CPU : AArch64CPUNames) {
    if (CPU.ArchID == ArchKind::INVALID)
      continue;
    Values.push_back(CPU.getName());
  }
}
/// Returns true when the target triple \p TT is Android, Darwin, Fuchsia or
/// Windows.
bool AArch64::isX18ReservedByDefault(const Triple &TT) {
  if (TT.isAndroid())
    return true;
  if (TT.isOSDarwin())
    return true;
  if (TT.isOSFuchsia())
    return true;
  return TT.isOSWindows();
}
/// Parses an architecture name into an ArchKind.
///
/// The name is canonicalised first and rejected when its version digit is
/// below 8. Partial matches are allowed: the first table entry whose name ends
/// with the architecture synonym wins, e.g. "v8a" matches "armv8a".
/// \returns the matching kind, or ArchKind::INVALID.
AArch64::ArchKind AArch64::parseArch(StringRef Arch) {
  Arch = ARM::getCanonicalArchName(Arch);
  if (checkArchVersion(Arch) < 8)
    return ArchKind::INVALID;

  StringRef Syn = ARM::getArchSynonym(Arch);
  // Bind by reference: a by-value loop variable copies each table entry.
  for (const auto &A : AArch64ARCHNames) {
    if (A.getName().endswith(Syn))
      return A.ID;
  }
  return ArchKind::INVALID;
}
/// Parses an extension name into an ArchExtKind.
///
/// \returns the ID of the first AArch64ARCHExtNames entry whose name equals
/// \p ArchExt, or AArch64::AEK_INVALID when there is none.
AArch64::ArchExtKind AArch64::parseArchExt(StringRef ArchExt) {
  // Bind by reference: a by-value loop variable copies each table entry.
  for (const auto &A : AArch64ARCHExtNames) {
    if (ArchExt == A.getName())
      return static_cast<ArchExtKind>(A.ID);
  }
  return AArch64::AEK_INVALID;
}
/// Parses a CPU name into the ArchKind of that CPU.
///
/// \returns the ArchID of the first AArch64CPUNames entry whose name equals
/// \p CPU, or ArchKind::INVALID when there is none.
AArch64::ArchKind AArch64::parseCPUArch(StringRef CPU) {
  // Bind by reference: a by-value loop variable copies each table entry.
  for (const auto &C : AArch64CPUNames) {
    if (CPU == C.getName())
      return C.ArchID;
  }
  return ArchKind::INVALID;
}
Index: vendor/llvm/dist-release_90/lib/Support/Unix/Path.inc
===================================================================
--- vendor/llvm/dist-release_90/lib/Support/Unix/Path.inc (revision 351302)
+++ vendor/llvm/dist-release_90/lib/Support/Unix/Path.inc (revision 351303)
@@ -1,1226 +1,1226 @@
//===- llvm/Support/Unix/Path.inc - Unix Path Implementation ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the Unix specific implementation of the Path API.
//
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
//=== WARNING: Implementation here must contain only generic UNIX code that
//=== is guaranteed to work on *all* UNIX variants.
//===----------------------------------------------------------------------===//
#include "Unix.h"
#include <limits.h>
#include <stdio.h>
#if HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif
#if HAVE_FCNTL_H
#include <fcntl.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_SYS_MMAN_H
#include <sys/mman.h>
#endif
#include <dirent.h>
#include <pwd.h>
#ifdef __APPLE__
#include <mach-o/dyld.h>
#include <sys/attr.h>
#include <copyfile.h>
#elif defined(__DragonFly__)
#include <sys/mount.h>
#endif
// Both stdio.h and cstdio are included via different paths and
// stdcxx's cstdio doesn't include stdio.h, so it doesn't #undef the macros
// either.
#undef ferror
#undef feof
// For GNU Hurd
#if defined(__GNU__) && !defined(PATH_MAX)
# define PATH_MAX 4096
# define MAXPATHLEN 4096
#endif
#include <sys/types.h>
#if !defined(__APPLE__) && !defined(__OpenBSD__) && !defined(__FreeBSD__) && \
!defined(__linux__) && !defined(__FreeBSD_kernel__) && !defined(_AIX)
#include <sys/statvfs.h>
#define STATVFS statvfs
#define FSTATVFS fstatvfs
#define STATVFS_F_FRSIZE(vfs) vfs.f_frsize
#else
#if defined(__OpenBSD__) || defined(__FreeBSD__)
#include <sys/mount.h>
#include <sys/param.h>
#elif defined(__linux__)
#if defined(HAVE_LINUX_MAGIC_H)
#include <linux/magic.h>
#else
#if defined(HAVE_LINUX_NFS_FS_H)
#include <linux/nfs_fs.h>
#endif
#if defined(HAVE_LINUX_SMB_H)
#include <linux/smb.h>
#endif
#endif
#include <sys/vfs.h>
#elif defined(_AIX)
#include <sys/statfs.h>
// <sys/vmount.h> depends on `uint` to be a typedef from <sys/types.h> to
// `uint_t`; however, <sys/types.h> does not always declare `uint`. We provide
// the typedef prior to including <sys/vmount.h> to work around this issue.
typedef uint_t uint;
#include <sys/vmount.h>
#else
#include <sys/mount.h>
#endif
#define STATVFS statfs
#define FSTATVFS fstatfs
#define STATVFS_F_FRSIZE(vfs) static_cast<uint64_t>(vfs.f_bsize)
#endif
#if defined(__NetBSD__) || defined(__DragonFly__) || defined(__GNU__)
#define STATVFS_F_FLAG(vfs) (vfs).f_flag
#else
#define STATVFS_F_FLAG(vfs) (vfs).f_flags
#endif
using namespace llvm;
namespace llvm {
namespace sys {
namespace fs {
const file_t kInvalidFile = -1;
#if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
defined(__minix) || defined(__FreeBSD_kernel__) || defined(__linux__) || \
defined(__CYGWIN__) || defined(__DragonFly__) || defined(_AIX) || defined(__GNU__)
/// Checks whether "<dir>/<bin>" names an existing file.
///
/// On success the canonical path produced by realpath() is stored in \p ret
/// and 0 is returned. Returns 1 if the joined path does not fit in PATH_MAX
/// bytes, cannot be resolved, or cannot be stat()ed.
static int
test_dir(char ret[PATH_MAX], const char *dir, const char *bin)
{
  char candidate[PATH_MAX];
  const int wanted = snprintf(candidate, PATH_MAX, "%s/%s", dir, bin);

  // snprintf reports the length it would have written without the
  // terminating null, so anything >= PATH_MAX means the path was truncated.
  if (wanted >= PATH_MAX)
    return 1;

  if (realpath(candidate, ret) == nullptr)
    return 1;

  struct stat info;
  return stat(candidate, &info) != 0 ? 1 : 0;
}
/// Locates the program named \p bin and stores its resolved path in \p ret.
///
/// Three cases are handled, in order: a name starting with '/', a name
/// containing a '/' (resolved against the current working directory), and a
/// bare name (searched for in each ':'-separated entry of $PATH).
/// Returns \p ret on success and nullptr otherwise.
static char *
getprogpath(char ret[PATH_MAX], const char *bin)
{
  /* Absolute path: probe it relative to the root directory. */
  if (bin[0] == '/')
    return test_dir(ret, "/", bin) == 0 ? ret : nullptr;

  /* Relative path containing a directory component. */
  if (strchr(bin, '/') != nullptr) {
    char workdir[PATH_MAX];
    if (getcwd(workdir, PATH_MAX) == nullptr)
      return nullptr;
    return test_dir(ret, workdir, bin) == 0 ? ret : nullptr;
  }

  /* Bare name: walk $PATH. */
  const char *pathEnv = getenv("PATH");
  if (pathEnv == nullptr)
    return nullptr;

  // strtok_r modifies its input, so tokenise a private copy.
  char *pathCopy = strdup(pathEnv);
  if (pathCopy == nullptr)
    return nullptr;

  char *found = nullptr;
  char *cursor = nullptr;
  char *entry = strtok_r(pathCopy, ":", &cursor);
  while (entry != nullptr) {
    if (test_dir(ret, entry, bin) == 0) {
      found = ret;
      break;
    }
    entry = strtok_r(nullptr, ":", &cursor);
  }

  free(pathCopy);
  return found;
}
#endif // __FreeBSD__ || __NetBSD__ || __FreeBSD_kernel__
/// GetMainExecutable - Return the path to the main executable, given the
/// value of argv[0] from program startup.
///
/// Exactly one of the platform-specific strategies below is compiled in.
/// Each returns the path it found; when a strategy fails, control reaches
/// the final `return ""`, so an empty string means the path is unknown.
///
/// \param argv0 argv[0] of the process. Only the branches that fall back to
///        getprogpath() read it.
/// \param MainAddr an address handed to dladdr(). Only the dladdr branch
///        reads it.
std::string getMainExecutable(const char *argv0, void *MainAddr) {
#if defined(__APPLE__)
// On OS X the executable path is saved to the stack by dyld. Reading it
// from there is much faster than calling dladdr, especially for large
// binaries with symbols.
char exe_path[MAXPATHLEN];
uint32_t size = sizeof(exe_path);
if (_NSGetExecutablePath(exe_path, &size) == 0) {
// Canonicalise the reported path before returning it.
char link_path[MAXPATHLEN];
if (realpath(exe_path, link_path))
return link_path;
}
#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
defined(__minix) || defined(__DragonFly__) || \
defined(__FreeBSD_kernel__) || defined(_AIX)
StringRef curproc("/proc/curproc/file");
char exe_path[PATH_MAX];
// /proc is not mounted by default under FreeBSD, but gives more accurate
// information than argv[0] when it is.
if (sys::fs::exists(curproc)) {
ssize_t len = readlink(curproc.str().c_str(), exe_path, sizeof(exe_path));
if (len > 0) {
// Null terminate the string for realpath. readlink never null
// terminates its output.
// The length is clamped so the terminator always fits in exe_path; if
// readlink filled the whole buffer the last byte is overwritten.
len = std::min(len, ssize_t(sizeof(exe_path) - 1));
exe_path[len] = '\0';
return exe_path;
}
}
// If we don't have procfs mounted, fall back to argv[0]
if (getprogpath(exe_path, argv0) != NULL)
return exe_path;
#elif defined(__linux__) || defined(__CYGWIN__)
char exe_path[MAXPATHLEN];
StringRef aPath("/proc/self/exe");
if (sys::fs::exists(aPath)) {
// /proc is not always mounted under Linux (chroot for example).
ssize_t len = readlink(aPath.str().c_str(), exe_path, sizeof(exe_path));
// The link exists but could not be read: give up without trying argv0.
if (len < 0)
return "";
// Null terminate the string for realpath. readlink never null
// terminates its output.
len = std::min(len, ssize_t(sizeof(exe_path) - 1));
exe_path[len] = '\0';
// On Linux, /proc/self/exe always looks through symlinks. However, on
// GNU/Hurd, /proc/self/exe is a symlink to the path that was used to start
// the program, and not the eventual binary file. Therefore, call realpath
// so this behaves the same on all platforms.
#if _POSIX_VERSION >= 200112 || defined(__GLIBC__)
// realpath() is asked to allocate the result buffer here (NULL second
// argument), so the result must be free()d after it has been copied.
if (char *real_path = realpath(exe_path, NULL)) {
std::string ret = std::string(real_path);
free(real_path);
return ret;
}
#else
// Otherwise resolve into a fixed-size buffer supplied by the caller.
char real_path[MAXPATHLEN];
if (realpath(exe_path, real_path))
return std::string(real_path);
#endif
}
// Fall back to the classical detection.
if (getprogpath(exe_path, argv0))
return exe_path;
#elif defined(HAVE_DLFCN_H) && defined(HAVE_DLADDR)
// Use dladdr to get executable path if available.
Dl_info DLInfo;
int err = dladdr(MainAddr, &DLInfo);
// A zero return from dladdr is treated as failure.
if (err == 0)
return "";
// If the filename is a symlink, we need to resolve and return the location of
// the actual executable.
char link_path[MAXPATHLEN];
if (realpath(DLInfo.dli_fname, link_path))
return link_path;
#else
#error GetMainExecutable is not implemented on this host yet.
#endif
// Every strategy above failed to produce a path.
return "";
}
/// Returns the stored access time (seconds plus nanoseconds) as a TimePoint.
TimePoint<> basic_file_status::getLastAccessedTime() const {
  TimePoint<> Accessed = toTimePoint(fs_st_atime, fs_st_atime_nsec);
  return Accessed;
}
/// Returns the stored modification time (seconds plus nanoseconds) as a
/// TimePoint.
TimePoint<> basic_file_status::getLastModificationTime() const {
  TimePoint<> Modified = toTimePoint(fs_st_mtime, fs_st_mtime_nsec);
  return Modified;
}
/// Builds a UniqueID from the stored device and inode numbers.
UniqueID file_status::getUniqueID() const {
  UniqueID ID(fs_st_dev, fs_st_ino);
  return ID;
}
/// Returns the stored link count.
uint32_t file_status::getLinkCount() const {
  const uint32_t Links = fs_st_nlinks;
  return Links;
}
/// Queries the filesystem containing \p Path and reports its capacity, free
/// and available space in bytes (block counts multiplied by the block size
/// selected by STATVFS_F_FRSIZE). Returns the errno-based error code when
/// the query fails.
ErrorOr<space_info> disk_space(const Twine &Path) {
  const std::string PathStr = Path.str();
  struct STATVFS FsStats;
  const int Rc = ::STATVFS(const_cast<char *>(PathStr.c_str()), &FsStats);
  if (Rc != 0)
    return std::error_code(errno, std::generic_category());

  auto BlockSize = STATVFS_F_FRSIZE(FsStats);
  space_info Info;
  Info.capacity = static_cast<uint64_t>(FsStats.f_blocks) * BlockSize;
  Info.free = static_cast<uint64_t>(FsStats.f_bfree) * BlockSize;
  Info.available = static_cast<uint64_t>(FsStats.f_bavail) * BlockSize;
  return Info;
}
/// Stores the current working directory in \p result.
///
/// $PWD is preferred when it is an absolute path that refers to the same
/// file (same UniqueID) as "."; otherwise getcwd() is used, doubling the
/// buffer each time it reports ENOMEM.
std::error_code current_path(SmallVectorImpl<char> &result) {
  result.clear();

  const char *pwd = ::getenv("PWD");
  llvm::sys::fs::file_status PWDStatus, DotStatus;
  const bool UsePwd = pwd && llvm::sys::path::is_absolute(pwd) &&
                      !llvm::sys::fs::status(pwd, PWDStatus) &&
                      !llvm::sys::fs::status(".", DotStatus) &&
                      PWDStatus.getUniqueID() == DotStatus.getUniqueID();
  if (UsePwd) {
    result.append(pwd, pwd + strlen(pwd));
    return std::error_code();
  }

#ifdef MAXPATHLEN
  result.reserve(MAXPATHLEN);
#else
  // For GNU Hurd
  result.reserve(1024);
#endif

  // Retry with a larger buffer for as long as getcwd says it did not fit.
  while (::getcwd(result.data(), result.capacity()) == nullptr) {
    // Anything other than ENOMEM is a real error.
    if (errno != ENOMEM)
      return std::error_code(errno, std::generic_category());
    result.reserve(result.capacity() * 2);
  }

  // getcwd wrote straight into the reserved storage; publish its length.
  result.set_size(strlen(result.data()));
  return std::error_code();
}
std::error_code set_current_path(const Twine &path) {
SmallString<128> path_storage;
StringRef p = path.toNullTerminatedStringRef(path_storage);
if (::chdir(p.begin()) == -1)
return std::error_code(errno, std::generic_category());
return std::error_code();
}
std::error_code create_directory(const Twine &path, bool IgnoreExisting,
perms Perms) {
SmallString<128> path_storage;
StringRef p = path.toNullTerminatedStringRef(path_storage);
if (::mkdir(p.begin(), Perms) == -1) {
if (errno != EEXIST || !IgnoreExisting)
return std::error_code(errno, std::generic_category());
}
return std::error_code();
}
// Note that we are using symbolic link because hard links are not supported by
// all filesystems (SMB doesn't).
std::error_code create_link(const Twine &to, const Twine &from) {
// Get arguments.
SmallString<128> from_storage;
SmallString<128> to_storage;
StringRef f = from.toNullTerminatedStringRef(from_storage);
StringRef t = to.toNullTerminatedStringRef(to_storage);
if (::symlink(t.begin(), f.begin()) == -1)
return std::error_code(errno, std::generic_category());
return std::error_code();
}
std::error_code create_hard_link(const Twine &to, const Twine &from) {
// Get arguments.
SmallString<128> from_storage;
SmallString<128> to_storage;
StringRef f = from.toNullTerminatedStringRef(from_storage);
StringRef t = to.toNullTerminatedStringRef(to_storage);
if (::link(t.begin(), f.begin()) == -1)
return std::error_code(errno, std::generic_category());
return std::error_code();
}
std::error_code remove(const Twine &path, bool IgnoreNonExisting) {
SmallString<128> path_storage;
StringRef p = path.toNullTerminatedStringRef(path_storage);
struct stat buf;
if (lstat(p.begin(), &buf) != 0) {
if (errno != ENOENT || !IgnoreNonExisting)
return std::error_code(errno, std::generic_category());
return std::error_code();
}
// Note: this check catches strange situations. In all cases, LLVM should
// only be involved in the creation and deletion of regular files. This
// check ensures that what we're trying to erase is a regular file. It
// effectively prevents LLVM from erasing things like /dev/null, any block
// special file, or other things that aren't "regular" files.
if (!S_ISREG(buf.st_mode) && !S_ISDIR(buf.st_mode) && !S_ISLNK(buf.st_mode))
return make_error_code(errc::operation_not_permitted);
if (::remove(p.begin()) == -1) {
if (errno != ENOENT || !IgnoreNonExisting)
return std::error_code(errno, std::generic_category());
}
return std::error_code();
}
/// Decides whether the filesystem described by \p Vfs is local.
///
/// Returns true for "local" and false for "remote". The test is
/// platform-specific; exactly one of the branches below is compiled in.
static bool is_local_impl(struct STATVFS &Vfs) {
#if defined(__linux__) || defined(__GNU__)
// Fallback definitions for the filesystem magic numbers, used when the
// system headers included so far did not provide them.
#ifndef NFS_SUPER_MAGIC
#define NFS_SUPER_MAGIC 0x6969
#endif
#ifndef SMB_SUPER_MAGIC
#define SMB_SUPER_MAGIC 0x517B
#endif
#ifndef CIFS_MAGIC_NUMBER
#define CIFS_MAGIC_NUMBER 0xFF534D42
#endif
// The filesystem type field has a different name under __GNU__.
#ifdef __GNU__
switch ((uint32_t)Vfs.__f_type) {
#else
switch ((uint32_t)Vfs.f_type) {
#endif
// NFS, SMB and CIFS are reported as remote; everything else as local.
case NFS_SUPER_MAGIC:
case SMB_SUPER_MAGIC:
case CIFS_MAGIC_NUMBER:
return false;
default:
return true;
}
#elif defined(__CYGWIN__)
// Cygwin doesn't expose this information; would need to use Win32 API.
return false;
#elif defined(__Fuchsia__)
// Fuchsia doesn't yet support remote filesystem mounts.
return true;
#elif defined(__EMSCRIPTEN__)
// Emscripten doesn't currently support remote filesystem mounts.
return true;
#elif defined(__HAIKU__)
// Haiku doesn't expose this information.
return false;
#elif defined(__sun)
// statvfs::f_basetype contains a null-terminated FSType name of the mounted target
StringRef fstype(Vfs.f_basetype);
// NFS is the only non-local fstype??
return !fstype.equals("nfs");
#elif defined(_AIX)
// Call mntctl; try more than twice in case of timing issues with a concurrent
// mount.
int Ret;
size_t BufSize = 2048u;
std::unique_ptr<char[]> Buf;
int Tries = 3;
while (Tries--) {
Buf = llvm::make_unique<char[]>(BufSize);
Ret = mntctl(MCTL_QUERY, BufSize, Buf.get());
// Any non-zero result (an error or an entry count) ends the retry loop.
if (Ret != 0)
break;
// A zero result is handled as "buffer too small": the first unsigned int
// of the buffer is read as the new size and the query is retried.
// NOTE(review): confirm this reading against the mntctl documentation.
BufSize = *reinterpret_cast<unsigned int *>(Buf.get());
Buf.reset();
}
if (Ret == -1)
// There was an error; "remote" is the conservative answer.
return false;
// Look for the correct vmount entry.
// Ret is used as the number of entries to walk; each entry gives its own
// length in vmt_length, which is the stride to the next one.
char *CurObjPtr = Buf.get();
while (Ret--) {
struct vmount *Vp = reinterpret_cast<struct vmount *>(CurObjPtr);
static_assert(sizeof(Vfs.f_fsid) == sizeof(Vp->vmt_fsid),
"fsid length mismatch");
// The entry whose fsid matches Vfs decides the answer via MNT_REMOTE.
if (memcmp(&Vfs.f_fsid, &Vp->vmt_fsid, sizeof Vfs.f_fsid) == 0)
return (Vp->vmt_flags & MNT_REMOTE) == 0;
CurObjPtr += Vp->vmt_length;
}
// vmount entry not found; "remote" is the conservative answer.
return false;
#else
// Everywhere else the MNT_LOCAL bit of the mount flags decides.
return !!(STATVFS_F_FLAG(Vfs) & MNT_LOCAL);
#endif
}
std::error_code is_local(const Twine &Path, bool &Result) {
struct STATVFS Vfs;
if (::STATVFS(const_cast<char *>(Path.str().c_str()), &Vfs))
return std::error_code(errno, std::generic_category());
Result = is_local_impl(Vfs);
return std::error_code();
}
std::error_code is_local(int FD, bool &Result) {
struct STATVFS Vfs;
if (::FSTATVFS(FD, &Vfs))
return std::error_code(errno, std::generic_category());
Result = is_local_impl(Vfs);
return std::error_code();
}
std::error_code rename(const Twine &from, const Twine &to) {
// Get arguments.
SmallString<128> from_storage;
SmallString<128> to_storage;
StringRef f = from.toNullTerminatedStringRef(from_storage);
StringRef t = to.toNullTerminatedStringRef(to_storage);
if (::rename(f.begin(), t.begin()) == -1)
return std::error_code(errno, std::generic_category());
return std::error_code();
}
/// Resizes the file behind \p FD to \p Size bytes.
///
/// When posix_fallocate is available it runs first; an error from it is
/// returned unless it is EINVAL or the platform's "not supported" code.
/// ftruncate is then always called.
std::error_code resize_file(int FD, uint64_t Size) {
#if defined(HAVE_POSIX_FALLOCATE)
#ifdef _AIX
  constexpr int NotSupportedError = ENOTSUP;
#else
  constexpr int NotSupportedError = EOPNOTSUPP;
#endif
  // If we have posix_fallocate use it. Unlike ftruncate it always allocates
  // space, so we get an error if the disk is full.
  const int AllocErr = ::posix_fallocate(FD, 0, Size);
  const bool Proceed = AllocErr == 0 || AllocErr == EINVAL ||
                       AllocErr == NotSupportedError;
  if (!Proceed)
    return std::error_code(AllocErr, std::generic_category());
#endif
  // ftruncate may or may not allocate space. At least on OS X with HFS+ it
  // does.
  const int Rc = ::ftruncate(FD, Size);
  if (Rc == -1)
    return std::error_code(errno, std::generic_category());
  return std::error_code();
}
/// Translates an AccessMode into the matching access(2) mode bits.
static int convertAccessMode(AccessMode Mode) {
  if (Mode == AccessMode::Exist)
    return F_OK;
  if (Mode == AccessMode::Write)
    return W_OK;
  if (Mode == AccessMode::Execute)
    return R_OK | X_OK; // scripts also need R_OK.
  llvm_unreachable("invalid enum");
}
std::error_code access(const Twine &Path, AccessMode Mode) {
SmallString<128> PathStorage;
StringRef P = Path.toNullTerminatedStringRef(PathStorage);
if (::access(P.begin(), convertAccessMode(Mode)) == -1)
return std::error_code(errno, std::generic_category());
if (Mode == AccessMode::Execute) {
// Don't say that directories are executable.
struct stat buf;
if (0 != stat(P.begin(), &buf))
return errc::permission_denied;
if (!S_ISREG(buf.st_mode))
return errc::permission_denied;
}
return std::error_code();
}
/// Returns true when access() with AccessMode::Execute reports no error for
/// \p Path.
bool can_execute(const Twine &Path) {
  std::error_code EC = access(Path, AccessMode::Execute);
  return !EC;
}
/// Two known statuses are equivalent when both their device and inode
/// numbers match.
bool equivalent(file_status A, file_status B) {
  assert(status_known(A) && status_known(B));
  const bool SameDevice = A.fs_st_dev == B.fs_st_dev;
  const bool SameInode = A.fs_st_ino == B.fs_st_ino;
  return SameDevice && SameInode;
}
std::error_code equivalent(const Twine &A, const Twine &B, bool &result) {
file_status fsA, fsB;
if (std::error_code ec = status(A, fsA))
return ec;
if (std::error_code ec = status(B, fsB))
return ec;
result = equivalent(fsA, fsB);
return std::error_code();
}
/// Expands a leading "~" or "~user" in \p Path in place.
///
/// "~" is replaced by the result of path::home_directory(); "~user" by the
/// pw_dir of that user's password database entry. \p Path is left unchanged
/// if it does not start with '~' or if the lookup fails.
static void expandTildeExpr(SmallVectorImpl<char> &Path) {
  StringRef Whole(Path.begin(), Path.size());
  const bool HasTilde = !Whole.empty() && Whole.startswith("~");
  if (!HasTilde)
    return;

  // Everything between the '~' and the first separator is the user name.
  StringRef AfterTilde = Whole.drop_front();
  StringRef UserName =
      AfterTilde.take_until([](char c) { return path::is_separator(c); });

  SmallString<128> Scratch;
  if (UserName.empty()) {
    // Plain "~/...": use the current user's home directory.
    if (!path::home_directory(Scratch)) {
      // The home directory could not be determined; keep the path as is.
      return;
    }
    // Replace the '~' with the first character of the home directory and
    // splice the rest of it in right behind.
    Path[0] = Scratch[0];
    Path.insert(Path.begin() + 1, Scratch.begin() + 1, Scratch.end());
    return;
  }

  // "~username/...": consult the password database.
  std::string User = UserName.str();
  struct passwd *Entry = ::getpwnam(User.c_str());
  if (Entry == nullptr) {
    // Unknown user; hand back the original path.
    return;
  }

  // The remainder still points into Path, so copy it out before clearing.
  Scratch = AfterTilde.substr(UserName.size() + 1);
  Path.clear();
  Path.append(Entry->pw_dir, Entry->pw_dir + strlen(Entry->pw_dir));
  llvm::sys::path::append(Path, Scratch);
}
/// Copies \p path into \p dest and expands a leading tilde expression.
/// \p dest ends up empty when \p path is trivially empty.
void expand_tilde(const Twine &path, SmallVectorImpl<char> &dest) {
  dest.clear();
  if (!path.isTriviallyEmpty()) {
    path.toVector(dest);
    expandTildeExpr(dest);
  }
}
/// Classifies a stat mode value using the S_IS* macros; returns
/// file_type::type_unknown when none of them matches.
static file_type typeForMode(mode_t Mode) {
  if (S_ISDIR(Mode))
    return file_type::directory_file;
  if (S_ISREG(Mode))
    return file_type::regular_file;
  if (S_ISBLK(Mode))
    return file_type::block_file;
  if (S_ISCHR(Mode))
    return file_type::character_file;
  if (S_ISFIFO(Mode))
    return file_type::fifo_file;
  if (S_ISSOCK(Mode))
    return file_type::socket_file;
  if (S_ISLNK(Mode))
    return file_type::symlink_file;
  return file_type::type_unknown;
}
static std::error_code fillStatus(int StatRet, const struct stat &Status,
file_status &Result) {
if (StatRet != 0) {
std::error_code EC(errno, std::generic_category());
if (EC == errc::no_such_file_or_directory)
Result = file_status(file_type::file_not_found);
else
Result = file_status(file_type::status_error);
return EC;
}
uint32_t atime_nsec, mtime_nsec;
#if defined(HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC)
atime_nsec = Status.st_atimespec.tv_nsec;
mtime_nsec = Status.st_mtimespec.tv_nsec;
#elif defined(HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC)
atime_nsec = Status.st_atim.tv_nsec;
mtime_nsec = Status.st_mtim.tv_nsec;
#else
atime_nsec = mtime_nsec = 0;
#endif
perms Perms = static_cast<perms>(Status.st_mode) & all_perms;
Result = file_status(typeForMode(Status.st_mode), Perms, Status.st_dev,
Status.st_nlink, Status.st_ino,
Status.st_atime, atime_nsec, Status.st_mtime, mtime_nsec,
Status.st_uid, Status.st_gid, Status.st_size);
return std::error_code();
}
std::error_code status(const Twine &Path, file_status &Result, bool Follow) {
SmallString<128> PathStorage;
StringRef P = Path.toNullTerminatedStringRef(PathStorage);
struct stat Status;
int StatRet = (Follow ? ::stat : ::lstat)(P.begin(), &Status);
return fillStatus(StatRet, Status, Result);
}
std::error_code status(int FD, file_status &Result) {
struct stat Status;
int StatRet = ::fstat(FD, &Status);
return fillStatus(StatRet, Status, Result);
}
/// Returns the process's current file mode creation mask.
unsigned getUmask() {
  // umask() can only be read by setting it, so install an arbitrary mask to
  // learn the old one and put the old one straight back. umask(2) never
  // fails, so the result of the restoring call is ignored.
  const unsigned Saved = ::umask(0);
  (void)::umask(Saved);
  return Saved;
}
std::error_code setPermissions(const Twine &Path, perms Permissions) {
SmallString<128> PathStorage;
StringRef P = Path.toNullTerminatedStringRef(PathStorage);
if (::chmod(P.begin(), Permissions))
return std::error_code(errno, std::generic_category());
return std::error_code();
}
/// Applies \p Permissions to the open descriptor \p FD with fchmod().
std::error_code setPermissions(int FD, perms Permissions) {
  const int Rc = ::fchmod(FD, Permissions);
  if (Rc != 0)
    return std::error_code(errno, std::generic_category());
  return std::error_code();
}
/// Sets the access and modification times of the file behind \p FD.
///
/// futimens() is used when available, otherwise futimes() with the times
/// cast to microseconds. Without either, function_not_supported is
/// returned.
std::error_code setLastAccessAndModificationTime(int FD, TimePoint<> AccessTime,
                                                 TimePoint<> ModificationTime) {
#if defined(HAVE_FUTIMENS)
  timespec Stamps[2];
  Stamps[0] = sys::toTimeSpec(AccessTime);
  Stamps[1] = sys::toTimeSpec(ModificationTime);
  const int Rc = ::futimens(FD, Stamps);
  if (Rc != 0)
    return std::error_code(errno, std::generic_category());
  return std::error_code();
#elif defined(HAVE_FUTIMES)
  timeval Stamps[2];
  auto AccessUs =
      std::chrono::time_point_cast<std::chrono::microseconds>(AccessTime);
  Stamps[0] = sys::toTimeVal(AccessUs);
  auto ModifyUs =
      std::chrono::time_point_cast<std::chrono::microseconds>(ModificationTime);
  Stamps[1] = sys::toTimeVal(ModifyUs);
  const int Rc = ::futimes(FD, Stamps);
  if (Rc != 0)
    return std::error_code(errno, std::generic_category());
  return std::error_code();
#else
#warning Missing futimes() and futimens()
  return make_error_code(errc::function_not_supported);
#endif
}
/// Maps Size bytes of \p FD starting at \p Offset and stores the address in
/// Mapping.
///
/// readwrite mappings are MAP_SHARED, all others MAP_PRIVATE; readonly
/// mappings get PROT_READ only, all others PROT_READ | PROT_WRITE.
/// Returns the errno-based error code when mmap() yields MAP_FAILED.
std::error_code mapped_file_region::init(int FD, uint64_t Offset,
                                         mapmode Mode) {
  assert(Size != 0);

  int MapFlags = MAP_PRIVATE;
  if (Mode == readwrite)
    MapFlags = MAP_SHARED;

  int Protection = PROT_READ | PROT_WRITE;
  if (Mode == readonly)
    Protection = PROT_READ;

#if defined(__APPLE__)
  //----------------------------------------------------------------------
  // Newer versions of MacOSX offer MAP_RESILIENT_CODESIGN, which lets us
  // read from binaries whose code signature is invalid without crashing,
  // and MAP_RESILIENT_MEDIA, which returns zeroes instead of crashing when
  // removable media backing the mapping becomes unavailable. Both flags are
  // only usable when mapping with PROT_READ, so they are added for readonly
  // mappings only.
  //----------------------------------------------------------------------
  if (Mode == readonly) {
#if defined(MAP_RESILIENT_CODESIGN)
    MapFlags |= MAP_RESILIENT_CODESIGN;
#endif
#if defined(MAP_RESILIENT_MEDIA)
    MapFlags |= MAP_RESILIENT_MEDIA;
#endif
  }
#endif // #if defined (__APPLE__)

  Mapping = ::mmap(nullptr, Size, Protection, MapFlags, FD, Offset);
  if (Mapping != MAP_FAILED)
    return std::error_code();
  return std::error_code(errno, std::generic_category());
}
/// Maps \p length bytes of \p fd at \p offset. \p ec receives the result of
/// init(); on failure Mapping is reset to nullptr.
mapped_file_region::mapped_file_region(int fd, mapmode mode, size_t length,
                                       uint64_t offset, std::error_code &ec)
    : Size(length), Mapping(), Mode(mode) {
  (void)Mode;
  ec = init(fd, offset, mode);
  if (!ec)
    return;
  // init() failed; record that there is no mapping.
  Mapping = nullptr;
}
/// Unmaps the region if one is held.
mapped_file_region::~mapped_file_region() {
  if (!Mapping)
    return;
  ::munmap(Mapping, Size);
}
/// Returns the size of the mapping; asserts that a mapping exists.
size_t mapped_file_region::size() const {
  assert(Mapping && "Mapping failed but used anyway!");
  const size_t Length = Size;
  return Length;
}
/// Returns the mapping as a writable char pointer; asserts that a mapping
/// exists.
char *mapped_file_region::data() const {
  assert(Mapping && "Mapping failed but used anyway!");
  char *Base = reinterpret_cast<char*>(Mapping);
  return Base;
}
/// Returns the mapping as a read-only char pointer; asserts that a mapping
/// exists.
const char *mapped_file_region::const_data() const {
  assert(Mapping && "Mapping failed but used anyway!");
  const char *Base = reinterpret_cast<const char*>(Mapping);
  return Base;
}
/// Returns Process::getPageSizeEstimate() as the mapping alignment.
int mapped_file_region::alignment() {
  auto PageSize = Process::getPageSizeEstimate();
  return PageSize;
}
/// Opens \p path with opendir(), stores the handle in \p it and advances to
/// the first entry. Returns the errno-based error code when the directory
/// cannot be opened.
std::error_code detail::directory_iterator_construct(detail::DirIterState &it,
                                                     StringRef path,
                                                     bool follow_symlinks) {
  SmallString<128> DirPath(path);
  DIR *Handle = ::opendir(DirPath.c_str());
  if (Handle == nullptr)
    return std::error_code(errno, std::generic_category());

  it.IterationHandle = reinterpret_cast<intptr_t>(Handle);
  // Append a placeholder file name so that replace_filename() has something
  // to replace when the first real entry is read.
  path::append(DirPath, ".");
  it.CurrentEntry = directory_entry(DirPath.str(), follow_symlinks);
  return directory_iterator_increment(it);
}
/// Closes the directory handle held by \p it (if any) and resets the state
/// to a zero handle and a default-constructed entry. Always succeeds.
std::error_code detail::directory_iterator_destruct(detail::DirIterState &it) {
  if (it.IterationHandle != 0) {
    DIR *Handle = reinterpret_cast<DIR *>(it.IterationHandle);
    ::closedir(Handle);
  }
  it.IterationHandle = 0;
  it.CurrentEntry = directory_entry();
  return std::error_code();
}
/// Returns the file type recorded in \p Entry, or type_unknown when the
/// platform's dirent does not carry one.
static file_type direntType(dirent* Entry) {
#if defined(_DIRENT_HAVE_D_TYPE) && defined(DTTOIF)
  // Most platforms (Linux/BSD/Mac) store the type in the dirent. DTTOIF
  // turns it into a stat mode so the status -> type conversion is reused.
  const auto Mode = DTTOIF(Entry->d_type);
  return typeForMode(Mode);
#else
  // Other platforms such as Solaris require a stat() to get the type.
  return file_type::type_unknown;
#endif
}
/// Advances \p It to the next directory entry, skipping "." and "..".
///
/// At the end of the directory the iterator state is torn down via
/// directory_iterator_destruct(); a readdir() failure is returned as an
/// errno-based error code.
std::error_code detail::directory_iterator_increment(detail::DirIterState &It) {
  // readdir() returns null both at end-of-directory and on error; clearing
  // errno first is what lets the two be told apart.
  errno = 0;
  dirent *Entry = ::readdir(reinterpret_cast<DIR *>(It.IterationHandle));

  if (Entry == nullptr) {
    if (errno != 0)
      return std::error_code(errno, std::generic_category());
    return directory_iterator_destruct(It);
  }

  StringRef Name(Entry->d_name);
  const bool IsDot = Name.size() == 1 && Name[0] == '.';
  const bool IsDotDot = Name.size() == 2 && Name[0] == '.' && Name[1] == '.';
  if (IsDot || IsDotDot)
    return directory_iterator_increment(It);

  It.CurrentEntry.replace_filename(Name, direntType(Entry));
  return std::error_code();
}
/// Queries the status of this entry's Path, honouring FollowSymlinks.
/// Returns the error code when the query fails.
ErrorOr<basic_file_status> directory_entry::status() const {
  file_status Queried;
  std::error_code EC = fs::status(Path, Queried, FollowSymlinks);
  if (EC)
    return EC;
  return Queried;
}
#if !defined(F_GETPATH)
/// Reports whether "/proc/self/fd" is readable. The check runs once; later
/// calls return the cached answer.
static bool hasProcSelfFD() {
  // With a /proc filesystem mounted, the real name of an open file can be
  // established quickly with readlink.
  static const bool Available = [] {
    return ::access("/proc/self/fd", R_OK) == 0;
  }();
  return Available;
}
#endif
/// Translates LLVM's creation disposition, open flags and access mode into
/// the flag word expected by open(2).
static int nativeOpenFlags(CreationDisposition Disp, OpenFlags Flags,
                           FileAccess Access) {
  int Native = 0;

  // Access mode.
  if (Access == FA_Read)
    Native |= O_RDONLY;
  else if (Access == FA_Write)
    Native |= O_WRONLY;
  else if (Access == (FA_Read | FA_Write))
    Native |= O_RDWR;

  // For compatibility with old code, F_Append forces the disposition to
  // CD_OpenAlways. See Windows/Path.inc for a longer comment.
  if (Flags & F_Append)
    Disp = CD_OpenAlways;

  switch (Disp) {
  case CD_CreateNew:
    Native |= O_CREAT | O_EXCL; // Create it, and fail if it already exists.
    break;
  case CD_CreateAlways:
    Native |= O_CREAT | O_TRUNC; // Create it, or truncate the existing file.
    break;
  case CD_OpenAlways:
    Native |= O_CREAT; // Create it if it doesn't exist.
    break;
  case CD_OpenExisting:
    // Leaving out O_CREAT is all that is needed for these semantics.
    break;
  default:
    break;
  }

  if (Flags & F_Append)
    Native |= O_APPEND;

#ifdef O_CLOEXEC
  // Descriptors are close-on-exec unless the caller asked for inheritance.
  if (!(Flags & OF_ChildInherit))
    Native |= O_CLOEXEC;
#endif
  return Native;
}
std::error_code openFile(const Twine &Name, int &ResultFD,
CreationDisposition Disp, FileAccess Access,
OpenFlags Flags, unsigned Mode) {
int OpenFlags = nativeOpenFlags(Disp, Flags, Access);
SmallString<128> Storage;
StringRef P = Name.toNullTerminatedStringRef(Storage);
// Call ::open in a lambda to avoid overload resolution in RetryAfterSignal
// when open is overloaded, such as in Bionic.
auto Open = [&]() { return ::open(P.begin(), OpenFlags, Mode); };
if ((ResultFD = sys::RetryAfterSignal(-1, Open)) < 0)
return std::error_code(errno, std::generic_category());
#ifndef O_CLOEXEC
if (!(Flags & OF_ChildInherit)) {
int r = fcntl(ResultFD, F_SETFD, FD_CLOEXEC);
(void)r;
assert(r == 0 && "fcntl(F_SETFD, FD_CLOEXEC) failed");
}
#endif
return std::error_code();
}
/// Expected-returning wrapper around openFile(): yields the descriptor on
/// success and the converted error otherwise.
Expected<int> openNativeFile(const Twine &Name, CreationDisposition Disp,
                             FileAccess Access, OpenFlags Flags,
                             unsigned Mode) {
  int Descriptor;
  if (std::error_code EC =
          openFile(Name, Descriptor, Disp, Access, Flags, Mode))
    return errorCodeToError(EC);
  return Descriptor;
}
std::error_code openFileForRead(const Twine &Name, int &ResultFD,
OpenFlags Flags,
SmallVectorImpl<char> *RealPath) {
std::error_code EC =
openFile(Name, ResultFD, CD_OpenExisting, FA_Read, Flags, 0666);
if (EC)
return EC;
// Attempt to get the real name of the file, if the user asked
if(!RealPath)
return std::error_code();
RealPath->clear();
#if defined(F_GETPATH)
// When F_GETPATH is availble, it is the quickest way to get
// the real path name.
char Buffer[MAXPATHLEN];
if (::fcntl(ResultFD, F_GETPATH, Buffer) != -1)
RealPath->append(Buffer, Buffer + strlen(Buffer));
#else
char Buffer[PATH_MAX];
if (hasProcSelfFD()) {
char ProcPath[64];
snprintf(ProcPath, sizeof(ProcPath), "/proc/self/fd/%d", ResultFD);
ssize_t CharCount = ::readlink(ProcPath, Buffer, sizeof(Buffer));
if (CharCount > 0)
RealPath->append(Buffer, Buffer + CharCount);
} else {
SmallString<128> Storage;
StringRef P = Name.toNullTerminatedStringRef(Storage);
// Use ::realpath to get the real path name
if (::realpath(P.begin(), Buffer) != nullptr)
RealPath->append(Buffer, Buffer + strlen(Buffer));
}
#endif
return std::error_code();
}
/// Expected-returning wrapper around openFileForRead(): yields the handle on
/// success and the converted error otherwise.
Expected<file_t> openNativeFileForRead(const Twine &Name, OpenFlags Flags,
                                       SmallVectorImpl<char> *RealPath) {
  file_t Handle;
  if (std::error_code EC = openFileForRead(Name, Handle, Flags, RealPath))
    return errorCodeToError(EC);
  return Handle;
}
/// Returns the native handle used for standard input (descriptor 0).
file_t getStdinHandle() {
  const file_t Handle = 0;
  return Handle;
}
/// Returns the native handle used for standard output (descriptor 1).
file_t getStdoutHandle() {
  const file_t Handle = 1;
  return Handle;
}
/// Returns the native handle used for standard error (descriptor 2).
file_t getStderrHandle() {
  const file_t Handle = 2;
  return Handle;
}
/// Performs one read() into \p Buf, retrying when interrupted by a signal.
///
/// \p BytesRead receives read()'s return value converted to size_t, which
/// includes the converted -1 when the read fails and an error is returned.
std::error_code readNativeFile(file_t FD, MutableArrayRef<char> Buf,
                               size_t *BytesRead) {
  const ssize_t Count =
      sys::RetryAfterSignal(-1, ::read, FD, Buf.data(), Buf.size());
  *BytesRead = Count;
  if (Count == -1)
    return std::error_code(errno, std::generic_category());
  return std::error_code();
}
/// Fills \p Buf with the bytes of \p FD starting at \p Offset.
///
/// Reading continues until the buffer is full. If end-of-file comes first,
/// the unread tail of the buffer is zero-filled. pread() is used when
/// available; otherwise the descriptor is seeked once and read() is used.
std::error_code readNativeFileSlice(file_t FD, MutableArrayRef<char> Buf,
                                    size_t Offset) {
  char *Cursor = Buf.data();
  size_t Remaining = Buf.size();

#ifndef HAVE_PREAD
  // If we don't have pread, seek to Offset.
  if (lseek(FD, Offset, SEEK_SET) == -1)
    return std::error_code(errno, std::generic_category());
#endif

  while (Remaining != 0) {
#ifdef HAVE_PREAD
    // The file position for this chunk is Offset plus what was read so far.
    ssize_t Got = sys::RetryAfterSignal(-1, ::pread, FD, Cursor, Remaining,
                                        Buf.size() - Remaining + Offset);
#else
    ssize_t Got = sys::RetryAfterSignal(-1, ::read, FD, Cursor, Remaining);
#endif
    // Error while reading.
    if (Got == -1)
      return std::error_code(errno, std::generic_category());

    if (Got == 0) {
      // End of file: zero-initialize the rest of the buffer and stop.
      memset(Cursor, 0, Remaining);
      break;
    }

    Cursor += Got;
    Remaining -= Got;
  }
  return std::error_code();
}
/// Closes \p F via Process::SafelyCloseFileDescriptor(). \p F is set to
/// kInvalidFile before the close is attempted.
std::error_code closeFile(file_t &F) {
  const file_t Handle = F;
  F = kInvalidFile;
  return Process::SafelyCloseFileDescriptor(Handle);
}
/// Recursively removes everything inside the directory denoted by \p Entry.
/// The directory itself is left for the caller to remove.
///
/// \param Entry the directory to empty; passed to directory_iterator.
/// \param IgnoreErrors when true, failures are skipped and iteration
///        continues; when false, the first failure is returned.
/// \returns the first error encountered when \p IgnoreErrors is false,
///          otherwise success.
template <typename T>
static std::error_code remove_directories_impl(const T &Entry,
                                               bool IgnoreErrors) {
  std::error_code EC;
  directory_iterator Begin(Entry, EC, false);
  directory_iterator End;
  while (Begin != End) {
    auto &Item = *Begin;
    ErrorOr<basic_file_status> st = Item.status();
    if (st) {
      // Only dereference the status once it is known to hold a value.
      if (is_directory(*st)) {
        EC = remove_directories_impl(Item, IgnoreErrors);
        if (EC && !IgnoreErrors)
          return EC;
      }

      EC = fs::remove(Item.path(), true);
      if (EC && !IgnoreErrors)
        return EC;
    } else if (!IgnoreErrors) {
      return st.getError();
    }
    // With IgnoreErrors set, an entry whose status cannot be read is simply
    // skipped; dereferencing the failed ErrorOr would be invalid.

    Begin.increment(EC);
    if (EC && !IgnoreErrors)
      return EC;
  }
  return std::error_code();
}
/// Removes the contents of \p path recursively and then \p path itself.
/// With \p IgnoreErrors set, failures are swallowed and success is returned.
std::error_code remove_directories(const Twine &path, bool IgnoreErrors) {
  const bool Strict = !IgnoreErrors;

  std::error_code EC = remove_directories_impl(path, IgnoreErrors);
  if (Strict && EC)
    return EC;

  EC = fs::remove(path, true);
  if (Strict && EC)
    return EC;

  return std::error_code();
}
std::error_code real_path(const Twine &path, SmallVectorImpl<char> &dest,
bool expand_tilde) {
dest.clear();
if (path.isTriviallyEmpty())
return std::error_code();
if (expand_tilde) {
SmallString<128> Storage;
path.toVector(Storage);
expandTildeExpr(Storage);
return real_path(Storage, dest, false);
}
SmallString<128> Storage;
StringRef P = path.toNullTerminatedStringRef(Storage);
char Buffer[PATH_MAX];
if (::realpath(P.begin(), Buffer) == nullptr)
return std::error_code(errno, std::generic_category());
dest.append(Buffer, Buffer + strlen(Buffer));
return std::error_code();
}
} // end namespace fs
namespace path {
/// Stores the current user's home directory in \p result.
///
/// $HOME is used when set; otherwise the pw_dir of the password database
/// entry for getuid(). Returns false, leaving \p result untouched, when
/// neither source provides a directory.
bool home_directory(SmallVectorImpl<char> &result) {
  char *Home = getenv("HOME");
  if (Home == nullptr) {
    struct passwd *Account = getpwuid(getuid());
    if (Account != nullptr && Account->pw_dir != nullptr)
      Home = Account->pw_dir;
  }

  if (Home == nullptr)
    return false;

  result.clear();
  result.append(Home, Home + strlen(Home));
  return true;
}
/// Fetches the per-user temp (\p TempDir true) or cache (\p TempDir false)
/// directory through confstr() and stores it in \p Result without its
/// trailing null.
///
/// Returns false when the _CS_DARWIN_USER_* macros are not defined or when
/// confstr() reports no value.
static bool getDarwinConfDir(bool TempDir, SmallVectorImpl<char> &Result) {
#if defined(_CS_DARWIN_USER_TEMP_DIR) && defined(_CS_DARWIN_USER_CACHE_DIR)
  // On Darwin, use DARWIN_USER_TEMP_DIR or DARWIN_USER_CACHE_DIR.
  // macros defined in <unistd.h> on darwin >= 9
  int ConfName = _CS_DARWIN_USER_CACHE_DIR;
  if (TempDir)
    ConfName = _CS_DARWIN_USER_TEMP_DIR;

  // First ask for the required length only.
  size_t Needed = confstr(ConfName, nullptr, 0);
  if (Needed == 0)
    return false;

  // Fetch the value, repeating until the reported length matches the
  // buffer size that was actually passed in.
  do {
    Result.resize(Needed);
    Needed = confstr(ConfName, Result.data(), Result.size());
  } while (Needed > 0 && Needed != Result.size());

  if (Needed == 0) {
    Result.clear();
    return false;
  }

  // Drop the terminating null that confstr() wrote.
  assert(Result.back() == 0);
  Result.pop_back();
  return true;
#else
  return false;
#endif
}
/// Return the temporary directory requested through the environment, or
/// nullptr if none of the recognised variables is set.
///
/// The variables are probed in the order TMPDIR, TMP, TEMP, TEMPDIR and the
/// first one that is set wins. The returned pointer is whatever getenv()
/// handed back; it is not copied.
static const char *getEnvTempDir() {
  const char *const Names[] = {"TMPDIR", "TMP", "TEMP", "TEMPDIR"};
  const unsigned NumNames = sizeof(Names) / sizeof(Names[0]);

  const char *Found = nullptr;
  for (unsigned I = 0; I != NumNames && Found == nullptr; ++I)
    Found = std::getenv(Names[I]);
  return Found;
}
/// Return the compiled-in default temporary directory.
///
/// If the C library defines P_tmpdir and it is non-null, that value is
/// returned regardless of \p ErasedOnReboot. Otherwise "/tmp" is returned for
/// directories that may be erased on reboot and "/var/tmp" for ones that
/// should persist.
static const char *getDefaultTempDir(bool ErasedOnReboot) {
#ifdef P_tmpdir
  if (const char *Configured = P_tmpdir)
    return Configured;
#endif
  return ErasedOnReboot ? "/tmp" : "/var/tmp";
}
/// Fill \p Result with the system temporary directory (when \p ErasedOnReboot
/// is true) or the persistent cache-style directory (when it is false).
///
/// Lookup order:
///   1. environment variables, consulted only when \p ErasedOnReboot is true;
///   2. the Darwin confstr() value, if available;
///   3. the compiled-in default from getDefaultTempDir().
void system_temp_directory(bool ErasedOnReboot, SmallVectorImpl<char> &Result) {
  Result.clear();

  // There is no env variable for the cache directory, so the environment is
  // only honoured for the erased-on-reboot case.
  const char *EnvDir = ErasedOnReboot ? getEnvTempDir() : nullptr;
  if (EnvDir) {
    Result.append(EnvDir, EnvDir + strlen(EnvDir));
    return;
  }

  if (getDarwinConfDir(ErasedOnReboot, Result))
    return;

  const char *Fallback = getDefaultTempDir(ErasedOnReboot);
  Result.append(Fallback, Fallback + strlen(Fallback));
}
} // end namespace path
namespace fs {
#ifdef __APPLE__
/// Copy the file named \p From to \p To with copyfile(3).
///
/// This implementation tries to perform an APFS CoW clone of the file,
/// which can be much faster and uses less space.
/// Unfortunately fcopyfile(3) does not support COPYFILE_CLONE, so the
/// file descriptor variant of this function still uses the default
/// implementation.
///
/// \returns success when copyfile() returns 0, otherwise errno wrapped in the
///          generic category. An error from is_symlink_file() is returned
///          as-is before any copy is attempted.
std::error_code copy_file(const Twine &From, const Twine &To) {
// Start with a plain data copy; upgraded to a clone below when permitted.
uint32_t Flag = COPYFILE_DATA;
-#if __has_builtin(__builtin_available)
+#if __has_builtin(__builtin_available) && defined(COPYFILE_CLONE)
// Cloning is only considered when running on macOS 10.12 or newer.
if (__builtin_available(macos 10.12, *)) {
bool IsSymlink;
if (std::error_code Error = is_symlink_file(From, IsSymlink))
return Error;
// COPYFILE_CLONE clones the symlink instead of following it
// and returns EEXIST if the target file already exists.
// Hence cloning is used only for non-symlink sources with no existing target.
if (!IsSymlink && !exists(To))
Flag = COPYFILE_CLONE;
}
#endif
int Status =
copyfile(From.str().c_str(), To.str().c_str(), /* State */ NULL, Flag);
if (Status == 0)
return std::error_code();
return std::error_code(errno, std::generic_category());
}
#endif // __APPLE__
} // end namespace fs
} // end namespace sys
} // end namespace llvm
Index: vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64.td
===================================================================
--- vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64.td (revision 351302)
+++ vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64.td (revision 351303)
@@ -1,822 +1,822 @@
//=- AArch64.td - Describe the AArch64 Target Machine --------*- tablegen -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// Target-independent interfaces which we are implementing.
//===----------------------------------------------------------------------===//
include "llvm/Target/Target.td"
//===----------------------------------------------------------------------===//
// AArch64 Subtarget features.
//
def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true",
"Enable ARMv8 FP">;
def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
"Enable Advanced SIMD instructions", [FeatureFPARMv8]>;
def FeatureSM4 : SubtargetFeature<
"sm4", "HasSM4", "true",
"Enable SM3 and SM4 support", [FeatureNEON]>;
def FeatureSHA2 : SubtargetFeature<
"sha2", "HasSHA2", "true",
"Enable SHA1 and SHA256 support", [FeatureNEON]>;
def FeatureSHA3 : SubtargetFeature<
"sha3", "HasSHA3", "true",
"Enable SHA512 and SHA3 support", [FeatureNEON, FeatureSHA2]>;
def FeatureAES : SubtargetFeature<
"aes", "HasAES", "true",
"Enable AES support", [FeatureNEON]>;
// Crypto has been split up and any combination is now valid (see the
// crypto definitions above). Also, crypto is now context sensitive:
// it has a different meaning for e.g. Armv8.4 than it has for Armv8.2.
// Therefore, we rely on Clang, the user-interfacing tool, to pass on the
// appropriate crypto options. But here in the backend, crypto has very little
// meaning anymore. We kept the Crypto definition here for backward
// compatibility, and now imply features SHA2 and AES, which was the
// "traditional" meaning of Crypto.
def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
"Enable cryptographic instructions", [FeatureNEON, FeatureSHA2, FeatureAES]>;
def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
"Enable ARMv8 CRC-32 checksum instructions">;
def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true",
"Enable ARMv8 Reliability, Availability and Serviceability Extensions">;
def FeatureLSE : SubtargetFeature<"lse", "HasLSE", "true",
"Enable ARMv8.1 Large System Extension (LSE) atomic instructions">;
def FeatureRDM : SubtargetFeature<"rdm", "HasRDM", "true",
"Enable ARMv8.1 Rounding Double Multiply Add/Subtract instructions">;
def FeaturePAN : SubtargetFeature<
"pan", "HasPAN", "true",
"Enables ARM v8.1 Privileged Access-Never extension">;
def FeatureLOR : SubtargetFeature<
"lor", "HasLOR", "true",
"Enables ARM v8.1 Limited Ordering Regions extension">;
def FeatureVH : SubtargetFeature<
"vh", "HasVH", "true",
"Enables ARM v8.1 Virtual Host extension">;
def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true",
"Enable ARMv8 PMUv3 Performance Monitors extension">;
def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true",
"Full FP16", [FeatureFPARMv8]>;
def FeatureFP16FML : SubtargetFeature<"fp16fml", "HasFP16FML", "true",
"Enable FP16 FML instructions", [FeatureFullFP16]>;
def FeatureSPE : SubtargetFeature<"spe", "HasSPE", "true",
"Enable Statistical Profiling extension">;
def FeaturePAN_RWV : SubtargetFeature<
"pan-rwv", "HasPAN_RWV", "true",
"Enable v8.2 PAN s1e1R and s1e1W Variants",
[FeaturePAN]>;
// UAO PState
def FeaturePsUAO : SubtargetFeature< "uaops", "HasPsUAO", "true",
"Enable v8.2 UAO PState">;
def FeatureCCPP : SubtargetFeature<"ccpp", "HasCCPP",
"true", "Enable v8.2 data Cache Clean to Point of Persistence" >;
def FeatureSVE : SubtargetFeature<"sve", "HasSVE", "true",
"Enable Scalable Vector Extension (SVE) instructions">;
def FeatureSVE2 : SubtargetFeature<"sve2", "HasSVE2", "true",
"Enable Scalable Vector Extension 2 (SVE2) instructions", [FeatureSVE]>;
def FeatureSVE2AES : SubtargetFeature<"sve2-aes", "HasSVE2AES", "true",
"Enable AES SVE2 instructions", [FeatureSVE2, FeatureAES]>;
def FeatureSVE2SM4 : SubtargetFeature<"sve2-sm4", "HasSVE2SM4", "true",
"Enable SM4 SVE2 instructions", [FeatureSVE2, FeatureSM4]>;
def FeatureSVE2SHA3 : SubtargetFeature<"sve2-sha3", "HasSVE2SHA3", "true",
"Enable SHA3 SVE2 instructions", [FeatureSVE2, FeatureSHA3]>;
-def FeatureSVE2BitPerm : SubtargetFeature<"bitperm", "HasSVE2BitPerm", "true",
+def FeatureSVE2BitPerm : SubtargetFeature<"sve2-bitperm", "HasSVE2BitPerm", "true",
"Enable bit permutation SVE2 instructions", [FeatureSVE2]>;
def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
"Has zero-cycle register moves">;
def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true",
"Has zero-cycle zeroing instructions for generic registers">;
def FeatureZCZeroingFP : SubtargetFeature<"zcz-fp", "HasZeroCycleZeroingFP", "true",
"Has zero-cycle zeroing instructions for FP registers">;
def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
"Has zero-cycle zeroing instructions",
[FeatureZCZeroingGP, FeatureZCZeroingFP]>;
/// ... but the floating-point version doesn't quite work in rare cases on older
/// CPUs.
def FeatureZCZeroingFPWorkaround : SubtargetFeature<"zcz-fp-workaround",
"HasZeroCycleZeroingFPWorkaround", "true",
"The zero-cycle floating-point zeroing instruction has a bug">;
def FeatureStrictAlign : SubtargetFeature<"strict-align",
"StrictAlign", "true",
"Disallow all unaligned memory "
"access">;
// Generates one "reserve-x<N>" feature per listed index; each sets the
// matching ReserveXRegister[<N>] entry to true. The list deliberately has
// gaps: no feature is generated here for X0, X8, X16, X17, X19, or X29 and
// above.
foreach i = {1-7,9-15,18,20-28} in
def FeatureReserveX#i : SubtargetFeature<"reserve-x"#i, "ReserveXRegister["#i#"]", "true",
"Reserve X"#i#", making it unavailable "
"as a GPR">;
// Generates one "call-saved-x<N>" feature for X8-X15 and X18; each sets the
// matching CustomCallSavedXRegs[<N>] entry to true.
foreach i = {8-15,18} in
def FeatureCallSavedX#i : SubtargetFeature<"call-saved-x"#i,
"CustomCallSavedXRegs["#i#"]", "true", "Make X"#i#" callee saved.">;
def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true",
"Use alias analysis during codegen">;
def FeatureBalanceFPOps : SubtargetFeature<"balance-fp-ops", "BalanceFPOps",
"true",
"balance mix of odd and even D-registers for fp multiply(-accumulate) ops">;
def FeaturePredictableSelectIsExpensive : SubtargetFeature<
"predictable-select-expensive", "PredictableSelectIsExpensive", "true",
"Prefer likely predicted branches over selects">;
def FeatureCustomCheapAsMoveHandling : SubtargetFeature<"custom-cheap-as-move",
"CustomAsCheapAsMove", "true",
"Use custom handling of cheap instructions">;
def FeatureExynosCheapAsMoveHandling : SubtargetFeature<"exynos-cheap-as-move",
"ExynosAsCheapAsMove", "true",
"Use Exynos specific handling of cheap instructions",
[FeatureCustomCheapAsMoveHandling]>;
def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler",
"UsePostRAScheduler", "true", "Schedule again after register allocation">;
def FeatureSlowMisaligned128Store : SubtargetFeature<"slow-misaligned-128store",
"Misaligned128StoreIsSlow", "true", "Misaligned 128 bit stores are slow">;
def FeatureSlowPaired128 : SubtargetFeature<"slow-paired-128",
"Paired128IsSlow", "true", "Paired 128 bit loads and stores are slow">;
def FeatureSlowSTRQro : SubtargetFeature<"slow-strqro-store", "STRQroIsSlow",
"true", "STR of Q register with register offset is slow">;
def FeatureAlternateSExtLoadCVTF32Pattern : SubtargetFeature<
"alternate-sextload-cvt-f32-pattern", "UseAlternateSExtLoadCVTF32Pattern",
"true", "Use alternative pattern for sextload convert to f32">;
def FeatureArithmeticBccFusion : SubtargetFeature<
"arith-bcc-fusion", "HasArithmeticBccFusion", "true",
"CPU fuses arithmetic+bcc operations">;
def FeatureArithmeticCbzFusion : SubtargetFeature<
"arith-cbz-fusion", "HasArithmeticCbzFusion", "true",
"CPU fuses arithmetic + cbz/cbnz operations">;
def FeatureFuseAddress : SubtargetFeature<
"fuse-address", "HasFuseAddress", "true",
"CPU fuses address generation and memory operations">;
def FeatureFuseAES : SubtargetFeature<
"fuse-aes", "HasFuseAES", "true",
"CPU fuses AES crypto operations">;
def FeatureFuseArithmeticLogic : SubtargetFeature<
"fuse-arith-logic", "HasFuseArithmeticLogic", "true",
"CPU fuses arithmetic and logic operations">;
def FeatureFuseCCSelect : SubtargetFeature<
"fuse-csel", "HasFuseCCSelect", "true",
"CPU fuses conditional select operations">;
def FeatureFuseCryptoEOR : SubtargetFeature<
"fuse-crypto-eor", "HasFuseCryptoEOR", "true",
"CPU fuses AES/PMULL and EOR operations">;
def FeatureFuseLiterals : SubtargetFeature<
"fuse-literals", "HasFuseLiterals", "true",
"CPU fuses literal generation operations">;
def FeatureDisableLatencySchedHeuristic : SubtargetFeature<
"disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true",
"Disable latency scheduling heuristic">;
def FeatureForce32BitJumpTables
: SubtargetFeature<"force-32bit-jump-tables", "Force32BitJumpTables", "true",
"Force jump table entries to be 32-bits wide except at MinSize">;
def FeatureRCPC : SubtargetFeature<"rcpc", "HasRCPC", "true",
"Enable support for RCPC extension">;
def FeatureUseRSqrt : SubtargetFeature<
"use-reciprocal-square-root", "UseRSqrt", "true",
"Use the reciprocal square root approximation">;
def FeatureDotProd : SubtargetFeature<
"dotprod", "HasDotProd", "true",
"Enable dot product support">;
def FeaturePA : SubtargetFeature<
"pa", "HasPA", "true",
"Enable v8.3-A Pointer Authentication enchancement">;
def FeatureJS : SubtargetFeature<
"jsconv", "HasJS", "true",
"Enable v8.3-A JavaScript FP conversion enchancement",
[FeatureFPARMv8]>;
def FeatureCCIDX : SubtargetFeature<
"ccidx", "HasCCIDX", "true",
"Enable v8.3-A Extend of the CCSIDR number of sets">;
def FeatureComplxNum : SubtargetFeature<
"complxnum", "HasComplxNum", "true",
"Enable v8.3-A Floating-point complex number support",
[FeatureNEON]>;
def FeatureNV : SubtargetFeature<
"nv", "HasNV", "true",
"Enable v8.4-A Nested Virtualization Enchancement">;
def FeatureRASv8_4 : SubtargetFeature<
"rasv8_4", "HasRASv8_4", "true",
"Enable v8.4-A Reliability, Availability and Serviceability extension",
[FeatureRAS]>;
def FeatureMPAM : SubtargetFeature<
"mpam", "HasMPAM", "true",
"Enable v8.4-A Memory system Partitioning and Monitoring extension">;
def FeatureDIT : SubtargetFeature<
"dit", "HasDIT", "true",
"Enable v8.4-A Data Independent Timing instructions">;
def FeatureTRACEV8_4 : SubtargetFeature<
"tracev8.4", "HasTRACEV8_4", "true",
"Enable v8.4-A Trace extension">;
def FeatureAM : SubtargetFeature<
"am", "HasAM", "true",
"Enable v8.4-A Activity Monitors extension">;
def FeatureSEL2 : SubtargetFeature<
"sel2", "HasSEL2", "true",
"Enable v8.4-A Secure Exception Level 2 extension">;
def FeatureTLB_RMI : SubtargetFeature<
"tlb-rmi", "HasTLB_RMI", "true",
"Enable v8.4-A TLB Range and Maintenance Instructions">;
def FeatureFMI : SubtargetFeature<
"fmi", "HasFMI", "true",
"Enable v8.4-A Flag Manipulation Instructions">;
// 8.4 RCPC enhancements: LDAPR & STLR instructions with Immediate Offset
def FeatureRCPC_IMMO : SubtargetFeature<"rcpc-immo", "HasRCPC_IMMO", "true",
"Enable v8.4-A RCPC instructions with Immediate Offsets",
[FeatureRCPC]>;
// Unlike the other features in this file, enabling "no-neg-immediates" stores
// "false" into its field (NegativeImmediates).
// NOTE(review): the description text below describes the conversion itself,
// which is presumably what this feature switches off -- confirm.
def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates",
"NegativeImmediates", "false",
"Convert immediates and instructions "
"to their negated or complemented "
"equivalent when the immediate does "
"not fit in the encoding.">;
def FeatureLSLFast : SubtargetFeature<
"lsl-fast", "HasLSLFast", "true",
"CPU has a fastpath logical shift of up to 3 places">;
def FeatureAggressiveFMA :
SubtargetFeature<"aggressive-fma",
"HasAggressiveFMA",
"true",
"Enable Aggressive FMA for floating-point.">;
def FeatureAltFPCmp : SubtargetFeature<"altnzcv", "HasAlternativeNZCV", "true",
"Enable alternative NZCV format for floating point comparisons">;
def FeatureFRInt3264 : SubtargetFeature<"fptoint", "HasFRInt3264", "true",
"Enable FRInt[32|64][Z|X] instructions that round a floating-point number to "
"an integer (in FP format) forcing it to fit into a 32- or 64-bit int" >;
def FeatureSpecRestrict : SubtargetFeature<"specrestrict", "HasSpecRestrict",
"true", "Enable architectural speculation restriction" >;
def FeatureSB : SubtargetFeature<"sb", "HasSB",
"true", "Enable v8.5 Speculation Barrier" >;
def FeatureSSBS : SubtargetFeature<"ssbs", "HasSSBS",
"true", "Enable Speculative Store Bypass Safe bit" >;
def FeaturePredRes : SubtargetFeature<"predres", "HasPredRes", "true",
"Enable v8.5a execution and data prediction invalidation instructions" >;
def FeatureCacheDeepPersist : SubtargetFeature<"ccdp", "HasCCDP",
"true", "Enable v8.5 Cache Clean to Point of Deep Persistence" >;
def FeatureBranchTargetId : SubtargetFeature<"bti", "HasBTI",
"true", "Enable Branch Target Identification" >;
def FeatureRandGen : SubtargetFeature<"rand", "HasRandGen",
"true", "Enable Random Number generation instructions" >;
def FeatureMTE : SubtargetFeature<"mte", "HasMTE",
"true", "Enable Memory Tagging Extension" >;
//===----------------------------------------------------------------------===//
// Architectures.
//
// Each architecture level from v8.2a onward lists the previous level first in
// its feature list, followed by the features new at that level, so the levels
// form a chain: v8.5a -> v8.4a -> v8.3a -> v8.2a -> v8.1a.
def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true",
"Support ARM v8.1a instructions", [FeatureCRC, FeatureLSE, FeatureRDM,
FeaturePAN, FeatureLOR, FeatureVH]>;
def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
"Support ARM v8.2a instructions", [HasV8_1aOps, FeaturePsUAO,
FeaturePAN_RWV, FeatureRAS, FeatureCCPP]>;
def HasV8_3aOps : SubtargetFeature<"v8.3a", "HasV8_3aOps", "true",
"Support ARM v8.3a instructions", [HasV8_2aOps, FeatureRCPC, FeaturePA,
FeatureJS, FeatureCCIDX, FeatureComplxNum]>;
def HasV8_4aOps : SubtargetFeature<"v8.4a", "HasV8_4aOps", "true",
"Support ARM v8.4a instructions", [HasV8_3aOps, FeatureDotProd,
FeatureNV, FeatureRASv8_4, FeatureMPAM, FeatureDIT,
FeatureTRACEV8_4, FeatureAM, FeatureSEL2, FeatureTLB_RMI,
FeatureFMI, FeatureRCPC_IMMO]>;
// Note: FeatureRandGen and FeatureMTE are defined above but are not part of
// the v8.5a list.
def HasV8_5aOps : SubtargetFeature<
"v8.5a", "HasV8_5aOps", "true", "Support ARM v8.5a instructions",
[HasV8_4aOps, FeatureAltFPCmp, FeatureFRInt3264, FeatureSpecRestrict,
FeatureSSBS, FeatureSB, FeaturePredRes, FeatureCacheDeepPersist,
FeatureBranchTargetId]
>;
//===----------------------------------------------------------------------===//
// Register File Description
//===----------------------------------------------------------------------===//
include "AArch64RegisterInfo.td"
include "AArch64RegisterBanks.td"
include "AArch64CallingConvention.td"
//===----------------------------------------------------------------------===//
// Instruction Descriptions
//===----------------------------------------------------------------------===//
include "AArch64Schedule.td"
include "AArch64InstrInfo.td"
include "AArch64SchedPredicates.td"
include "AArch64SchedPredExynos.td"
def AArch64InstrInfo : InstrInfo;
//===----------------------------------------------------------------------===//
// Named operands for MRS/MSR/TLBI/...
//===----------------------------------------------------------------------===//
include "AArch64SystemOperands.td"
//===----------------------------------------------------------------------===//
// Access to privileged registers
//===----------------------------------------------------------------------===//
// Generates "tpidr-el1", "tpidr-el2" and "tpidr-el3"; each sets the matching
// UseEL<N>ForTP field to true.
foreach i = 1-3 in
def FeatureUseEL#i#ForTP : SubtargetFeature<"tpidr-el"#i, "UseEL"#i#"ForTP",
"true", "Permit use of TPIDR_EL"#i#" for the TLS base">;
//===----------------------------------------------------------------------===//
// AArch64 Processors supported.
//
//===----------------------------------------------------------------------===//
// Unsupported features to disable for scheduling models
//===----------------------------------------------------------------------===//
// Carrier for a list of predicates (F) naming features a scheduling model
// does not cover.
class AArch64Unsupported { list<Predicate> F; }
// The SVE and SVE2 predicates, defined before the scheduling-model includes
// that follow.
// NOTE(review): presumably those models reference SVEUnsupported.F as their
// unsupported-feature list -- confirm in the AArch64Sched*.td files.
def SVEUnsupported : AArch64Unsupported {
let F = [HasSVE, HasSVE2, HasSVE2AES, HasSVE2SM4, HasSVE2SHA3,
HasSVE2BitPerm];
}
include "AArch64SchedA53.td"
include "AArch64SchedA57.td"
include "AArch64SchedCyclone.td"
include "AArch64SchedFalkor.td"
include "AArch64SchedKryo.td"
include "AArch64SchedExynosM1.td"
include "AArch64SchedExynosM3.td"
include "AArch64SchedExynosM4.td"
include "AArch64SchedThunderX.td"
include "AArch64SchedThunderX2T99.td"
def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
"Cortex-A35 ARM processors", [
FeatureCRC,
FeatureCrypto,
FeatureFPARMv8,
FeatureNEON,
FeaturePerfMon
]>;
def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
"Cortex-A53 ARM processors", [
FeatureBalanceFPOps,
FeatureCRC,
FeatureCrypto,
FeatureCustomCheapAsMoveHandling,
FeatureFPARMv8,
FeatureFuseAES,
FeatureNEON,
FeaturePerfMon,
FeaturePostRAScheduler,
FeatureUseAA
]>;
def ProcA55 : SubtargetFeature<"a55", "ARMProcFamily", "CortexA55",
"Cortex-A55 ARM processors", [
HasV8_2aOps,
FeatureCrypto,
FeatureFPARMv8,
FeatureFuseAES,
FeatureNEON,
FeatureFullFP16,
FeatureDotProd,
FeatureRCPC,
FeaturePerfMon
]>;
def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
"Cortex-A57 ARM processors", [
FeatureBalanceFPOps,
FeatureCRC,
FeatureCrypto,
FeatureCustomCheapAsMoveHandling,
FeatureFPARMv8,
FeatureFuseAES,
FeatureFuseLiterals,
FeatureNEON,
FeaturePerfMon,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive
]>;
def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72",
"Cortex-A72 ARM processors", [
FeatureCRC,
FeatureCrypto,
FeatureFPARMv8,
FeatureFuseAES,
FeatureNEON,
FeaturePerfMon
]>;
def ProcA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73",
"Cortex-A73 ARM processors", [
FeatureCRC,
FeatureCrypto,
FeatureFPARMv8,
FeatureFuseAES,
FeatureNEON,
FeaturePerfMon
]>;
def ProcA75 : SubtargetFeature<"a75", "ARMProcFamily", "CortexA75",
"Cortex-A75 ARM processors", [
HasV8_2aOps,
FeatureCrypto,
FeatureFPARMv8,
FeatureFuseAES,
FeatureNEON,
FeatureFullFP16,
FeatureDotProd,
FeatureRCPC,
FeaturePerfMon
]>;
def ProcA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76",
"Cortex-A76 ARM processors", [
HasV8_2aOps,
FeatureFPARMv8,
FeatureNEON,
FeatureRCPC,
FeatureCrypto,
FeatureFullFP16,
FeatureDotProd,
FeatureSSBS
]>;
// Note that cyclone does not fuse AES instructions, but newer apple chips do
// perform the fusion and cyclone is used by default when targeting apple OSes.
def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone",
"Cyclone", [
FeatureAlternateSExtLoadCVTF32Pattern,
FeatureArithmeticBccFusion,
FeatureArithmeticCbzFusion,
FeatureCrypto,
FeatureDisableLatencySchedHeuristic,
FeatureFPARMv8,
FeatureFuseAES,
FeatureFuseCryptoEOR,
FeatureNEON,
FeaturePerfMon,
FeatureZCRegMove,
FeatureZCZeroing,
FeatureZCZeroingFPWorkaround
]>;
def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
"Samsung Exynos-M1 processors",
[FeatureSlowPaired128,
FeatureCRC,
FeatureCrypto,
FeatureExynosCheapAsMoveHandling,
FeatureForce32BitJumpTables,
FeatureFuseAES,
FeaturePerfMon,
FeaturePostRAScheduler,
FeatureSlowMisaligned128Store,
FeatureUseRSqrt,
FeatureZCZeroingFP]>;
// Exynos-M2 shares the ExynosM1 processor family value. Its feature list is
// the same as ProcExynosM1's except that FeatureUseRSqrt is absent.
def ProcExynosM2 : SubtargetFeature<"exynosm2", "ARMProcFamily", "ExynosM1",
"Samsung Exynos-M2 processors",
[FeatureSlowPaired128,
FeatureCRC,
FeatureCrypto,
FeatureExynosCheapAsMoveHandling,
FeatureForce32BitJumpTables,
FeatureFuseAES,
FeaturePerfMon,
FeaturePostRAScheduler,
FeatureSlowMisaligned128Store,
FeatureZCZeroingFP]>;
def ProcExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
"Samsung Exynos-M3 processors",
[FeatureCRC,
FeatureCrypto,
FeatureExynosCheapAsMoveHandling,
FeatureForce32BitJumpTables,
FeatureFuseAddress,
FeatureFuseAES,
FeatureFuseCCSelect,
FeatureFuseLiterals,
FeatureLSLFast,
FeaturePerfMon,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
FeatureZCZeroingFP]>;
// Exynos-M4 sets ARMProcFamily to "ExynosM3" rather than a value of its own.
// NOTE(review): looks intentional (ProcExynosM2 reuses ExynosM1 the same
// way), but confirm that no ExynosM4 family value is expected by the
// subtarget code.
def ProcExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3",
"Samsung Exynos-M4 processors",
[HasV8_2aOps,
FeatureArithmeticBccFusion,
FeatureArithmeticCbzFusion,
FeatureCrypto,
FeatureDotProd,
FeatureExynosCheapAsMoveHandling,
FeatureForce32BitJumpTables,
FeatureFullFP16,
FeatureFuseAddress,
FeatureFuseAES,
FeatureFuseArithmeticLogic,
FeatureFuseCCSelect,
FeatureFuseLiterals,
FeatureLSLFast,
FeaturePerfMon,
FeaturePostRAScheduler,
FeatureZCZeroing]>;
def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
"Qualcomm Kryo processors", [
FeatureCRC,
FeatureCrypto,
FeatureCustomCheapAsMoveHandling,
FeatureFPARMv8,
FeatureNEON,
FeaturePerfMon,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
FeatureZCZeroing,
FeatureLSLFast
]>;
def ProcFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
"Qualcomm Falkor processors", [
FeatureCRC,
FeatureCrypto,
FeatureCustomCheapAsMoveHandling,
FeatureFPARMv8,
FeatureNEON,
FeaturePerfMon,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
FeatureRDM,
FeatureZCZeroing,
FeatureLSLFast,
FeatureSlowSTRQro
]>;
def ProcSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
"Qualcomm Saphira processors", [
FeatureCrypto,
FeatureCustomCheapAsMoveHandling,
FeatureFPARMv8,
FeatureNEON,
FeatureSPE,
FeaturePerfMon,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
FeatureZCZeroing,
FeatureLSLFast,
HasV8_4aOps]>;
// FeatureCRC and FeatureLSE are listed explicitly here even though
// HasV8_1aOps (last entry) already carries both in its own feature list, so
// the explicit entries are redundant but harmless.
def ProcThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily",
"ThunderX2T99",
"Cavium ThunderX2 processors", [
FeatureAggressiveFMA,
FeatureCRC,
FeatureCrypto,
FeatureFPARMv8,
FeatureArithmeticBccFusion,
FeatureNEON,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
FeatureLSE,
HasV8_1aOps]>;
def ProcThunderX : SubtargetFeature<"thunderx", "ARMProcFamily", "ThunderX",
"Cavium ThunderX processors", [
FeatureCRC,
FeatureCrypto,
FeatureFPARMv8,
FeaturePerfMon,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
FeatureNEON]>;
def ProcThunderXT88 : SubtargetFeature<"thunderxt88", "ARMProcFamily",
"ThunderXT88",
"Cavium ThunderX processors", [
FeatureCRC,
FeatureCrypto,
FeatureFPARMv8,
FeaturePerfMon,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
FeatureNEON]>;
def ProcThunderXT81 : SubtargetFeature<"thunderxt81", "ARMProcFamily",
"ThunderXT81",
"Cavium ThunderX processors", [
FeatureCRC,
FeatureCrypto,
FeatureFPARMv8,
FeaturePerfMon,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
FeatureNEON]>;
def ProcThunderXT83 : SubtargetFeature<"thunderxt83", "ARMProcFamily",
"ThunderXT83",
"Cavium ThunderX processors", [
FeatureCRC,
FeatureCrypto,
FeatureFPARMv8,
FeaturePerfMon,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
FeatureNEON]>;
def ProcTSV110 : SubtargetFeature<"tsv110", "ARMProcFamily", "TSV110",
"HiSilicon TS-V110 processors", [
HasV8_2aOps,
FeatureCrypto,
FeatureCustomCheapAsMoveHandling,
FeatureFPARMv8,
FeatureFuseAES,
FeatureNEON,
FeaturePerfMon,
FeaturePostRAScheduler,
FeatureSPE,
FeatureFullFP16,
FeatureFP16FML,
FeatureDotProd]>;
// The "generic" CPU has no scheduling model (NoSchedModel) and takes a plain
// feature list instead of a Proc* record.
def : ProcessorModel<"generic", NoSchedModel, [
FeatureFPARMv8,
FeatureFuseAES,
FeatureNEON,
FeaturePerfMon,
FeaturePostRAScheduler
]>;
// Maps each -mcpu name to a scheduling model and a Proc* feature record.
// Several CPUs borrow another core's model or record, as noted below.
// FIXME: Cortex-A35 and Cortex-A55 are currently modeled as a Cortex-A53.
def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>;
def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>;
def : ProcessorModel<"cortex-a55", CortexA53Model, [ProcA55]>;
// Cortex-A72/A73/A75/A76/A76AE all use the Cortex-A57 scheduling model, and
// cortex-a76ae shares the ProcA76 record with cortex-a76.
def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>;
def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA72]>;
def : ProcessorModel<"cortex-a73", CortexA57Model, [ProcA73]>;
def : ProcessorModel<"cortex-a75", CortexA57Model, [ProcA75]>;
def : ProcessorModel<"cortex-a76", CortexA57Model, [ProcA76]>;
def : ProcessorModel<"cortex-a76ae", CortexA57Model, [ProcA76]>;
def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
// exynos-m2 uses the M1 scheduling model; exynos-m5 reuses both the M4 model
// and the ProcExynosM4 record.
def : ProcessorModel<"exynos-m1", ExynosM1Model, [ProcExynosM1]>;
def : ProcessorModel<"exynos-m2", ExynosM1Model, [ProcExynosM2]>;
def : ProcessorModel<"exynos-m3", ExynosM3Model, [ProcExynosM3]>;
def : ProcessorModel<"exynos-m4", ExynosM4Model, [ProcExynosM4]>;
def : ProcessorModel<"exynos-m5", ExynosM4Model, [ProcExynosM4]>;
// saphira uses the Falkor scheduling model.
def : ProcessorModel<"falkor", FalkorModel, [ProcFalkor]>;
def : ProcessorModel<"saphira", FalkorModel, [ProcSaphira]>;
def : ProcessorModel<"kryo", KryoModel, [ProcKryo]>;
// Cavium ThunderX/ThunderX T8X Processors
def : ProcessorModel<"thunderx", ThunderXT8XModel, [ProcThunderX]>;
def : ProcessorModel<"thunderxt88", ThunderXT8XModel, [ProcThunderXT88]>;
def : ProcessorModel<"thunderxt81", ThunderXT8XModel, [ProcThunderXT81]>;
def : ProcessorModel<"thunderxt83", ThunderXT8XModel, [ProcThunderXT83]>;
// Cavium ThunderX2T9X Processors. Formerly Broadcom Vulcan.
def : ProcessorModel<"thunderx2t99", ThunderX2T99Model, [ProcThunderX2T99]>;
// FIXME: HiSilicon TSV110 is currently modeled as a Cortex-A57.
def : ProcessorModel<"tsv110", CortexA57Model, [ProcTSV110]>;
// Alias for the latest Apple processor model supported by LLVM.
// It currently resolves to the same model and record as "cyclone".
def : ProcessorModel<"apple-latest", CycloneModel, [ProcCyclone]>;
//===----------------------------------------------------------------------===//
// Assembly parser
//===----------------------------------------------------------------------===//
// Assembly syntax variant 0 ("generic"). It uses the same break and
// tokenizing character sets as the Apple variant; only the variant number and
// name differ.
def GenericAsmParserVariant : AsmParserVariant {
int Variant = 0;
string Name = "generic";
string BreakCharacters = ".";
string TokenizingCharacters = "[]*!/";
}
// Assembly syntax variant 1 ("apple-neon"). It uses the same break and
// tokenizing character sets as the generic variant.
def AppleAsmParserVariant : AsmParserVariant {
int Variant = 1;
string Name = "apple-neon";
string BreakCharacters = ".";
string TokenizingCharacters = "[]*!/";
}
//===----------------------------------------------------------------------===//
// Assembly printer
//===----------------------------------------------------------------------===//
// AArch64 Uses the MC printer for asm output, so make sure the TableGen
// AsmWriter bits get associated with the correct class.
// Writer for variant 0, the number used by GenericAsmParserVariant; it names
// the "InstPrinter" class and passes the subtarget (PassSubtarget = 1).
def GenericAsmWriter : AsmWriter {
string AsmWriterClassName = "InstPrinter";
int PassSubtarget = 1;
int Variant = 0;
bit isMCAsmWriter = 1;
}
// Writer for variant 1, the number used by AppleAsmParserVariant; it names
// the "AppleInstPrinter" class.
// NOTE(review): isMCAsmWriter is declared as `int` here but as `bit` in
// GenericAsmWriter, and AsmWriterClassName uses `let` here but a typed field
// declaration there -- confirm whether the inconsistency is intentional.
def AppleAsmWriter : AsmWriter {
let AsmWriterClassName = "AppleInstPrinter";
int PassSubtarget = 1;
int Variant = 1;
int isMCAsmWriter = 1;
}
//===----------------------------------------------------------------------===//
// Target Declaration
//===----------------------------------------------------------------------===//
// Top-level target record: ties together the instruction set, both assembly
// parser variants and both assembly writers defined above, and turns on
// AllowRegisterRenaming.
def AArch64 : Target {
let InstructionSet = AArch64InstrInfo;
let AssemblyParserVariants = [GenericAsmParserVariant, AppleAsmParserVariant];
let AssemblyWriters = [GenericAsmWriter, AppleAsmWriter];
let AllowRegisterRenaming = 1;
}
//===----------------------------------------------------------------------===//
// Pfm Counters
//===----------------------------------------------------------------------===//
include "AArch64PfmCounters.td"
Index: vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64ISelLowering.cpp (revision 351302)
+++ vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64ISelLowering.cpp (revision 351303)
@@ -1,12071 +1,12083 @@
//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64TargetLowering class.
//
//===----------------------------------------------------------------------===//
#include "AArch64ExpandImm.h"
#include "AArch64ISelLowering.h"
#include "AArch64CallingConvention.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64PerfectShuffle.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/OperandTraits.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <bitset>
#include <cassert>
#include <cctype>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <limits>
#include <tuple>
#include <utility>
#include <vector>
using namespace llvm;
using namespace llvm::PatternMatch;
// Debug category name used by this file's STATISTIC counters and debug output.
#define DEBUG_TYPE "aarch64-lower"
// Counters reported under the "aarch64-lower" debug type.
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");
STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
// Hidden flag "aarch64-shift-insert-generation" (off by default): allows
// formation of SLI/SRI (shift-and-insert) instructions.
static cl::opt<bool>
EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
cl::desc("Allow AArch64 SLI/SRI formation"),
cl::init(false));
// FIXME: The necessary dtprel relocations don't seem to be supported
// well in the GNU bfd and gold linkers at the moment. Therefore, by
// default, for now, fall back to GeneralDynamic code generation.
// Note: unlike the other options here, this one is not declared `static`.
cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
"aarch64-elf-ldtls-generation", cl::Hidden,
cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
cl::init(false));
// Hidden flag "aarch64-enable-logical-imm" (on by default): enables the
// logical-immediate instruction optimization.
static cl::opt<bool>
EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
cl::desc("Enable AArch64 logical imm instruction "
"optimization"),
cl::init(true));
/// Value type used for condition codes.
static const MVT MVT_CC = MVT::i32;
/// Build the AArch64 lowering configuration for target machine \p TM and
/// subtarget \p STI.
///
/// The constructor only records configuration. In order, it:
///  - selects the scalar and vector boolean representations,
///  - registers the scalar/FP/NEON value types with their register classes,
///  - assigns a legalization action (Legal/Custom/Expand/Promote) to
///    operations per value type,
///  - registers the node kinds that receive target DAG combines,
///  - sets the memset/memcpy/memmove/memcmp expansion limits, and
///  - sets scheduling, alignment and jump-table preferences.
///
/// Many settings depend on subtarget queries (FP-ARMv8, NEON, full FP16,
/// PerfMon, Windows/MachO, strict alignment, endianness-independent here).
///
/// Some (operation, type) pairs are set more than once below (for example the
/// scalar f16 entries under !hasFullFP16(), MUL on v2i64, and the vector
/// truncating stores), so the statement order must be kept intact.
AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
const AArch64Subtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
// AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
// we have to make something up. Arbitrarily, choose ZeroOrOne.
setBooleanContents(ZeroOrOneBooleanContent);
// When comparing vectors the result sets the different elements in the
// vector to all-one or all-zero.
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
// Set up the register classes.
addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
// Scalar FP types are only registered when the subtarget has FP-ARMv8.
if (Subtarget->hasFPARMv8()) {
addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
}
if (Subtarget->hasNEON()) {
// NOTE(review): v16i8 and v8i16 are registered again with FPR128 through
// addQRTypeForNEON() a few lines below -- confirm that these two initial
// registrations (FPR8/FPR16) are intentional.
addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
// Someone set us up the NEON.
// Vector types registered via addDRTypeForNEON (FPR64 register class).
addDRTypeForNEON(MVT::v2f32);
addDRTypeForNEON(MVT::v8i8);
addDRTypeForNEON(MVT::v4i16);
addDRTypeForNEON(MVT::v2i32);
addDRTypeForNEON(MVT::v1i64);
addDRTypeForNEON(MVT::v1f64);
addDRTypeForNEON(MVT::v4f16);
// Vector types registered via addQRTypeForNEON (FPR128 register class).
addQRTypeForNEON(MVT::v4f32);
addQRTypeForNEON(MVT::v2f64);
addQRTypeForNEON(MVT::v16i8);
addQRTypeForNEON(MVT::v8i16);
addQRTypeForNEON(MVT::v4i32);
addQRTypeForNEON(MVT::v2i64);
addQRTypeForNEON(MVT::v8f16);
}
// Compute derived properties from the register classes
computeRegisterProperties(Subtarget->getRegisterInfo());
// Provide all sorts of operation actions
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
setOperationAction(ISD::SETCC, MVT::i32, Custom);
setOperationAction(ISD::SETCC, MVT::i64, Custom);
setOperationAction(ISD::SETCC, MVT::f16, Custom);
setOperationAction(ISD::SETCC, MVT::f32, Custom);
setOperationAction(ISD::SETCC, MVT::f64, Custom);
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
setOperationAction(ISD::BRCOND, MVT::Other, Expand);
setOperationAction(ISD::BR_CC, MVT::i32, Custom);
setOperationAction(ISD::BR_CC, MVT::i64, Custom);
setOperationAction(ISD::BR_CC, MVT::f16, Custom);
setOperationAction(ISD::BR_CC, MVT::f32, Custom);
setOperationAction(ISD::BR_CC, MVT::f64, Custom);
setOperationAction(ISD::SELECT, MVT::i32, Custom);
setOperationAction(ISD::SELECT, MVT::i64, Custom);
setOperationAction(ISD::SELECT, MVT::f16, Custom);
setOperationAction(ISD::SELECT, MVT::f32, Custom);
setOperationAction(ISD::SELECT, MVT::f64, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
setOperationAction(ISD::BR_JT, MVT::Other, Custom);
setOperationAction(ISD::JumpTable, MVT::i64, Custom);
setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
setOperationAction(ISD::FREM, MVT::f32, Expand);
setOperationAction(ISD::FREM, MVT::f64, Expand);
setOperationAction(ISD::FREM, MVT::f80, Expand);
setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
// Custom lowering hooks are needed for XOR
// to fold it into CSINC/CSINV.
setOperationAction(ISD::XOR, MVT::i32, Custom);
setOperationAction(ISD::XOR, MVT::i64, Custom);
// Virtually no operation on f128 is legal, but LLVM can't expand them when
// there's a valid register class, so we need custom operations in most cases.
setOperationAction(ISD::FABS, MVT::f128, Expand);
setOperationAction(ISD::FADD, MVT::f128, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
setOperationAction(ISD::FCOS, MVT::f128, Expand);
setOperationAction(ISD::FDIV, MVT::f128, Custom);
setOperationAction(ISD::FMA, MVT::f128, Expand);
setOperationAction(ISD::FMUL, MVT::f128, Custom);
setOperationAction(ISD::FNEG, MVT::f128, Expand);
setOperationAction(ISD::FPOW, MVT::f128, Expand);
setOperationAction(ISD::FREM, MVT::f128, Expand);
setOperationAction(ISD::FRINT, MVT::f128, Expand);
setOperationAction(ISD::FSIN, MVT::f128, Expand);
setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
setOperationAction(ISD::FSQRT, MVT::f128, Expand);
setOperationAction(ISD::FSUB, MVT::f128, Custom);
setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
setOperationAction(ISD::SETCC, MVT::f128, Custom);
setOperationAction(ISD::BR_CC, MVT::f128, Custom);
setOperationAction(ISD::SELECT, MVT::f128, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
// Lowering for many of the conversions is actually specified by the non-f128
// type. The LowerXXX function will be trivial when f128 isn't involved.
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
// Variable arguments.
setOperationAction(ISD::VASTART, MVT::Other, Custom);
setOperationAction(ISD::VAARG, MVT::Other, Custom);
setOperationAction(ISD::VACOPY, MVT::Other, Custom);
setOperationAction(ISD::VAEND, MVT::Other, Expand);
// Variable-sized objects.
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
// Dynamic stack allocation is custom-lowered on Windows targets only.
if (Subtarget->isTargetWindows())
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
else
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
// Constant pool entries
setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
// BlockAddress
setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
// Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
setOperationAction(ISD::ADDC, MVT::i32, Custom);
setOperationAction(ISD::ADDE, MVT::i32, Custom);
setOperationAction(ISD::SUBC, MVT::i32, Custom);
setOperationAction(ISD::SUBE, MVT::i32, Custom);
setOperationAction(ISD::ADDC, MVT::i64, Custom);
setOperationAction(ISD::ADDE, MVT::i64, Custom);
setOperationAction(ISD::SUBC, MVT::i64, Custom);
setOperationAction(ISD::SUBE, MVT::i64, Custom);
// AArch64 lacks both left-rotate and popcount instructions.
setOperationAction(ISD::ROTL, MVT::i32, Expand);
setOperationAction(ISD::ROTL, MVT::i64, Expand);
// Both rotate directions are expanded for every vector type.
for (MVT VT : MVT::vector_valuetypes()) {
setOperationAction(ISD::ROTL, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
}
// AArch64 doesn't have {U|S}MUL_LOHI.
setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
setOperationAction(ISD::CTPOP, MVT::i32, Custom);
setOperationAction(ISD::CTPOP, MVT::i64, Custom);
// Combined divide/remainder and standalone remainder nodes are expanded.
setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
for (MVT VT : MVT::vector_valuetypes()) {
setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::UDIVREM, VT, Expand);
}
setOperationAction(ISD::SREM, MVT::i32, Expand);
setOperationAction(ISD::SREM, MVT::i64, Expand);
setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
setOperationAction(ISD::UREM, MVT::i32, Expand);
setOperationAction(ISD::UREM, MVT::i64, Expand);
// Custom lower Add/Sub/Mul with overflow.
setOperationAction(ISD::SADDO, MVT::i32, Custom);
setOperationAction(ISD::SADDO, MVT::i64, Custom);
setOperationAction(ISD::UADDO, MVT::i32, Custom);
setOperationAction(ISD::UADDO, MVT::i64, Custom);
setOperationAction(ISD::SSUBO, MVT::i32, Custom);
setOperationAction(ISD::SSUBO, MVT::i64, Custom);
setOperationAction(ISD::USUBO, MVT::i32, Custom);
setOperationAction(ISD::USUBO, MVT::i64, Custom);
setOperationAction(ISD::SMULO, MVT::i32, Custom);
setOperationAction(ISD::SMULO, MVT::i64, Custom);
setOperationAction(ISD::UMULO, MVT::i32, Custom);
setOperationAction(ISD::UMULO, MVT::i64, Custom);
// Scalar f32/f64 sin/cos/pow are expanded.
setOperationAction(ISD::FSIN, MVT::f32, Expand);
setOperationAction(ISD::FSIN, MVT::f64, Expand);
setOperationAction(ISD::FCOS, MVT::f32, Expand);
setOperationAction(ISD::FCOS, MVT::f64, Expand);
setOperationAction(ISD::FPOW, MVT::f32, Expand);
setOperationAction(ISD::FPOW, MVT::f64, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
// f16 FCOPYSIGN is custom-lowered only with full FP16 support; otherwise it
// is promoted.
if (Subtarget->hasFullFP16())
setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom);
else
setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
// Half-precision math operations: scalar f16 is promoted and the v4f16/v8f16
// vector forms are expanded, regardless of full FP16 support.
setOperationAction(ISD::FREM, MVT::f16, Promote);
setOperationAction(ISD::FREM, MVT::v4f16, Expand);
setOperationAction(ISD::FREM, MVT::v8f16, Expand);
setOperationAction(ISD::FPOW, MVT::f16, Promote);
setOperationAction(ISD::FPOW, MVT::v4f16, Expand);
setOperationAction(ISD::FPOW, MVT::v8f16, Expand);
setOperationAction(ISD::FPOWI, MVT::f16, Promote);
setOperationAction(ISD::FPOWI, MVT::v4f16, Expand);
setOperationAction(ISD::FPOWI, MVT::v8f16, Expand);
setOperationAction(ISD::FCOS, MVT::f16, Promote);
setOperationAction(ISD::FCOS, MVT::v4f16, Expand);
setOperationAction(ISD::FCOS, MVT::v8f16, Expand);
setOperationAction(ISD::FSIN, MVT::f16, Promote);
setOperationAction(ISD::FSIN, MVT::v4f16, Expand);
setOperationAction(ISD::FSIN, MVT::v8f16, Expand);
setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand);
setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand);
setOperationAction(ISD::FEXP, MVT::f16, Promote);
setOperationAction(ISD::FEXP, MVT::v4f16, Expand);
setOperationAction(ISD::FEXP, MVT::v8f16, Expand);
setOperationAction(ISD::FEXP2, MVT::f16, Promote);
setOperationAction(ISD::FEXP2, MVT::v4f16, Expand);
setOperationAction(ISD::FEXP2, MVT::v8f16, Expand);
setOperationAction(ISD::FLOG, MVT::f16, Promote);
setOperationAction(ISD::FLOG, MVT::v4f16, Expand);
setOperationAction(ISD::FLOG, MVT::v8f16, Expand);
setOperationAction(ISD::FLOG2, MVT::f16, Promote);
setOperationAction(ISD::FLOG2, MVT::v4f16, Expand);
setOperationAction(ISD::FLOG2, MVT::v8f16, Expand);
setOperationAction(ISD::FLOG10, MVT::f16, Promote);
setOperationAction(ISD::FLOG10, MVT::v4f16, Expand);
setOperationAction(ISD::FLOG10, MVT::v8f16, Expand);
// Without full FP16 support, the remaining scalar f16 operations are promoted
// and the v4f16/v8f16 operations are promoted or expanded. This re-sets the
// f16 SELECT, SELECT_CC, SETCC and BR_CC entries configured as Custom above.
if (!Subtarget->hasFullFP16()) {
setOperationAction(ISD::SELECT, MVT::f16, Promote);
setOperationAction(ISD::SELECT_CC, MVT::f16, Promote);
setOperationAction(ISD::SETCC, MVT::f16, Promote);
setOperationAction(ISD::BR_CC, MVT::f16, Promote);
setOperationAction(ISD::FADD, MVT::f16, Promote);
setOperationAction(ISD::FSUB, MVT::f16, Promote);
setOperationAction(ISD::FMUL, MVT::f16, Promote);
setOperationAction(ISD::FDIV, MVT::f16, Promote);
setOperationAction(ISD::FMA, MVT::f16, Promote);
setOperationAction(ISD::FNEG, MVT::f16, Promote);
setOperationAction(ISD::FABS, MVT::f16, Promote);
setOperationAction(ISD::FCEIL, MVT::f16, Promote);
setOperationAction(ISD::FSQRT, MVT::f16, Promote);
setOperationAction(ISD::FFLOOR, MVT::f16, Promote);
setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote);
setOperationAction(ISD::FRINT, MVT::f16, Promote);
setOperationAction(ISD::FROUND, MVT::f16, Promote);
setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
setOperationAction(ISD::FMINIMUM, MVT::f16, Promote);
setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote);
// Promote v4f16 to v4f32 when that is known to be safe.
setOperationAction(ISD::FADD, MVT::v4f16, Promote);
setOperationAction(ISD::FSUB, MVT::v4f16, Promote);
setOperationAction(ISD::FMUL, MVT::v4f16, Promote);
setOperationAction(ISD::FDIV, MVT::v4f16, Promote);
setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote);
setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote);
AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);
AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32);
AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32);
// The remaining v4f16 operations are expanded.
setOperationAction(ISD::FABS, MVT::v4f16, Expand);
setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
setOperationAction(ISD::FMA, MVT::v4f16, Expand);
setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);
// All listed v8f16 operations are expanded (none are promoted here).
setOperationAction(ISD::FABS, MVT::v8f16, Expand);
setOperationAction(ISD::FADD, MVT::v8f16, Expand);
setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
setOperationAction(ISD::FMA, MVT::v8f16, Expand);
setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
}
// AArch64 has implementations of a lot of rounding-like FP operations.
for (MVT Ty : {MVT::f32, MVT::f64}) {
setOperationAction(ISD::FFLOOR, Ty, Legal);
setOperationAction(ISD::FNEARBYINT, Ty, Legal);
setOperationAction(ISD::FCEIL, Ty, Legal);
setOperationAction(ISD::FRINT, Ty, Legal);
setOperationAction(ISD::FTRUNC, Ty, Legal);
setOperationAction(ISD::FROUND, Ty, Legal);
setOperationAction(ISD::FMINNUM, Ty, Legal);
setOperationAction(ISD::FMAXNUM, Ty, Legal);
setOperationAction(ISD::FMINIMUM, Ty, Legal);
setOperationAction(ISD::FMAXIMUM, Ty, Legal);
setOperationAction(ISD::LROUND, Ty, Legal);
setOperationAction(ISD::LLROUND, Ty, Legal);
setOperationAction(ISD::LRINT, Ty, Legal);
setOperationAction(ISD::LLRINT, Ty, Legal);
}
// With full FP16 support the same rounding and min/max operations are legal
// on scalar f16 as well.
if (Subtarget->hasFullFP16()) {
setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal);
setOperationAction(ISD::FFLOOR, MVT::f16, Legal);
setOperationAction(ISD::FCEIL, MVT::f16, Legal);
setOperationAction(ISD::FRINT, MVT::f16, Legal);
setOperationAction(ISD::FTRUNC, MVT::f16, Legal);
setOperationAction(ISD::FROUND, MVT::f16, Legal);
setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
}
setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
// Atomics that need custom lowering.
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
// Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
// This requires the Performance Monitors extension.
if (Subtarget->hasPerfMon())
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
// Issue __sincos_stret if available.
setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
} else {
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
}
// Make floating-point constants legal for the large code model, so they don't
// become loads from the constant pool.
if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
}
// AArch64 does not have floating-point extending loads, i1 sign-extending
// load, floating-point truncating stores, or v2i32->v2i16 truncating store.
// (The v2i32->v2i16 case is configured in the NEON section further down.)
for (MVT VT : MVT::fp_valuetypes()) {
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
}
for (MVT VT : MVT::integer_valuetypes())
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f128, MVT::f80, Expand);
setTruncStoreAction(MVT::f128, MVT::f64, Expand);
setTruncStoreAction(MVT::f128, MVT::f32, Expand);
setTruncStoreAction(MVT::f128, MVT::f16, Expand);
setOperationAction(ISD::BITCAST, MVT::i16, Custom);
setOperationAction(ISD::BITCAST, MVT::f16, Custom);
// Indexed loads and stores are supported.
// Every indexed mode from PRE_INC up to (not including) LAST_INDEXED_MODE is
// marked legal for the scalar integer and FP types below.
for (unsigned im = (unsigned)ISD::PRE_INC;
im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
setIndexedLoadAction(im, MVT::i8, Legal);
setIndexedLoadAction(im, MVT::i16, Legal);
setIndexedLoadAction(im, MVT::i32, Legal);
setIndexedLoadAction(im, MVT::i64, Legal);
setIndexedLoadAction(im, MVT::f64, Legal);
setIndexedLoadAction(im, MVT::f32, Legal);
setIndexedLoadAction(im, MVT::f16, Legal);
setIndexedStoreAction(im, MVT::i8, Legal);
setIndexedStoreAction(im, MVT::i16, Legal);
setIndexedStoreAction(im, MVT::i32, Legal);
setIndexedStoreAction(im, MVT::i64, Legal);
setIndexedStoreAction(im, MVT::f64, Legal);
setIndexedStoreAction(im, MVT::f32, Legal);
setIndexedStoreAction(im, MVT::f16, Legal);
}
// Trap.
setOperationAction(ISD::TRAP, MVT::Other, Legal);
// DEBUGTRAP is only marked legal on Windows targets.
if (Subtarget->isTargetWindows())
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
// We combine OR nodes for bitfield operations.
setTargetDAGCombine(ISD::OR);
// Try to create BICs for vector ANDs.
setTargetDAGCombine(ISD::AND);
// Vector add and sub nodes may conceal a high-half opportunity.
// Also, try to fold ADD into CSINC/CSINV.
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::XOR);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::FP_TO_SINT);
setTargetDAGCombine(ISD::FP_TO_UINT);
setTargetDAGCombine(ISD::FDIV);
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::BITCAST);
setTargetDAGCombine(ISD::CONCAT_VECTORS);
setTargetDAGCombine(ISD::STORE);
// LOAD combines are only requested when the subtarget supports ignoring the
// top byte of addresses.
if (Subtarget->supportsAddressTopByteIgnored())
setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::VSELECT);
setTargetDAGCombine(ISD::INTRINSIC_VOID);
setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
setTargetDAGCombine(ISD::GlobalAddress);
// In case of strict alignment, avoid an excessive number of byte wide stores.
// Under strict alignment the regular limits fall back to the (smaller)
// optimize-for-size limits.
MaxStoresPerMemsetOptSize = 8;
MaxStoresPerMemset = Subtarget->requiresStrictAlign()
? MaxStoresPerMemsetOptSize : 32;
MaxGluedStoresPerMemcpy = 4;
MaxStoresPerMemcpyOptSize = 4;
MaxStoresPerMemcpy = Subtarget->requiresStrictAlign()
? MaxStoresPerMemcpyOptSize : 16;
MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4;
// NOTE(review): the leading '+' on the following lines looks like unified-diff
// markup carried over from a patch view rather than C++ -- confirm before
// compiling. The statements themselves set the memcmp load limits in the same
// pattern as the memset/memcpy limits above.
+ MaxLoadsPerMemcmpOptSize = 4;
+ MaxLoadsPerMemcmp = Subtarget->requiresStrictAlign()
+ ? MaxLoadsPerMemcmpOptSize : 8;
+
setStackPointerRegisterToSaveRestore(AArch64::SP);
setSchedulingPreference(Sched::Hybrid);
EnableExtLdPromotion = true;
// Set required alignment.
setMinFunctionAlignment(2);
// Set preferred alignments.
setPrefFunctionAlignment(STI.getPrefFunctionAlignment());
setPrefLoopAlignment(STI.getPrefLoopAlignment());
// Only change the limit for entries in a jump table if specified by
// the sub target, but not at the command line.
unsigned MaxJT = STI.getMaximumJumpTableSize();
if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
setMaximumJumpTableSize(MaxJT);
setHasExtractBitsInsn(true);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
if (Subtarget->hasNEON()) {
// FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
// silliness like this:
setOperationAction(ISD::FABS, MVT::v1f64, Expand);
setOperationAction(ISD::FADD, MVT::v1f64, Expand);
setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
setOperationAction(ISD::FMA, MVT::v1f64, Expand);
setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
setOperationAction(ISD::FREM, MVT::v1f64, Expand);
setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);
setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);
setOperationAction(ISD::MUL, MVT::v1i64, Expand);
// AArch64 doesn't have direct vector -> f32 conversion instructions for
// elements smaller than i32, so promote the input to i32 first.
setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
// i8 vector elements also need promotion to i32 for v8i8
setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
// Similarly, there is no direct i32 -> f64 vector conversion instruction.
setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
// Nor is there a direct i32 -> f16 vector conversion. Set it to custom, so
// the conversion happens in two steps: v4i32 -> v4f32 -> v4f16
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
if (Subtarget->hasFullFP16()) {
setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
} else {
// When AArch64 doesn't have fullfp16 support, promote the input
// to i32 first.
setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
}
setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
// AArch64 doesn't have MUL.2d:
// NOTE(review): the same (MUL, v2i64) pair is set to Custom a few lines
// below -- confirm whether this Expand setting is still meant to take effect.
setOperationAction(ISD::MUL, MVT::v2i64, Expand);
// Custom handling for some quad-vector types to detect MULL.
setOperationAction(ISD::MUL, MVT::v8i16, Custom);
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
// Vector reductions
// Integer add/min/max reductions are custom-lowered for these vector types.
for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
}
// FP min/max reductions are custom-lowered for these vector types.
for (MVT VT : { MVT::v4f16, MVT::v2f32,
MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
}
setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
// Likewise, narrowing and extending vector loads/stores aren't handled
// directly.
for (MVT VT : MVT::vector_valuetypes()) {
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
// MULHS/MULHU are legal only for v16i8, v8i16 and v4i32; every other vector
// type expands them.
if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
setOperationAction(ISD::MULHS, VT, Legal);
setOperationAction(ISD::MULHU, VT, Legal);
} else {
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
}
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
setOperationAction(ISD::BSWAP, VT, Expand);
setOperationAction(ISD::CTTZ, VT, Expand);
// Expand truncating stores and extending loads between every pair of vector
// types.
for (MVT InnerVT : MVT::vector_valuetypes()) {
setTruncStoreAction(VT, InnerVT, Expand);
setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
}
}
// AArch64 has implementations of a lot of rounding-like FP operations.
for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
setOperationAction(ISD::FFLOOR, Ty, Legal);
setOperationAction(ISD::FNEARBYINT, Ty, Legal);
setOperationAction(ISD::FCEIL, Ty, Legal);
setOperationAction(ISD::FRINT, Ty, Legal);
setOperationAction(ISD::FTRUNC, Ty, Legal);
setOperationAction(ISD::FROUND, Ty, Legal);
}
// The half-precision vector forms are legal only with full FP16 support.
if (Subtarget->hasFullFP16()) {
for (MVT Ty : {MVT::v4f16, MVT::v8f16}) {
setOperationAction(ISD::FFLOOR, Ty, Legal);
setOperationAction(ISD::FNEARBYINT, Ty, Legal);
setOperationAction(ISD::FCEIL, Ty, Legal);
setOperationAction(ISD::FRINT, Ty, Legal);
setOperationAction(ISD::FTRUNC, Ty, Legal);
setOperationAction(ISD::FROUND, Ty, Legal);
}
}
// Set after the vector loop above, which marks every vector-to-vector
// truncating store (including this pair) as Expand.
setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
}
PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
}
/// Apply the per-type operation configuration shared by all NEON vector
/// types to \p VT.
///
/// \param VT                 Vector type being configured; must be a vector.
/// \param PromotedBitwiseVT  Not read by this function.
void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
  assert(VT.isVector() && "VT should be a vector type");

  const bool IsFP = VT.isFloatingPoint();

  // FP vectors are loaded and stored as the same-shaped integer vector.
  if (IsFP) {
    MVT IntVT = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT();
    for (unsigned MemOpc : {ISD::LOAD, ISD::STORE})
      setOperationPromotedToType(MemOpc, VT, IntVT);
  }

  // Math-library style float operations are expanded for the f32/f64 vector
  // types, while FCOPYSIGN gets custom lowering.
  const bool IsF32OrF64Vec =
      VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64;
  if (IsF32OrF64Vec) {
    for (unsigned MathOpc : {ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FLOG,
                             ISD::FLOG2, ISD::FLOG10, ISD::FEXP, ISD::FEXP2})
      setOperationAction(MathOpc, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Custom);
  }

  // Element access, shuffles, shifts, OR and compares are custom-lowered.
  for (unsigned CustomOpc :
       {ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT, ISD::BUILD_VECTOR,
        ISD::VECTOR_SHUFFLE, ISD::EXTRACT_SUBVECTOR, ISD::SRA, ISD::SRL,
        ISD::SHL, ISD::OR, ISD::SETCC})
    setOperationAction(CustomOpc, VT, Custom);

  setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);

  // All select flavours are expanded.
  for (unsigned SelectOpc : {ISD::SELECT, ISD::SELECT_CC, ISD::VSELECT})
    setOperationAction(SelectOpc, VT, Expand);

  // Any-extending loads from VT into any other value type are expanded.
  for (MVT WideVT : MVT::all_valuetypes())
    setLoadExtAction(ISD::EXTLOAD, WideVT, VT, Expand);

  // CNT supports only B element sizes, then use UADDLP to widen.
  if (!(VT == MVT::v8i8 || VT == MVT::v16i8))
    setOperationAction(ISD::CTPOP, VT, Custom);

  // Division and remainder are expanded.
  for (unsigned DivRemOpc :
       {ISD::UDIV, ISD::SDIV, ISD::UREM, ISD::SREM, ISD::FREM})
    setOperationAction(DivRemOpc, VT, Expand);

  for (unsigned CvtOpc : {ISD::FP_TO_SINT, ISD::FP_TO_UINT})
    setOperationAction(CvtOpc, VT, Custom);

  if (!IsFP) {
    setOperationAction(ISD::ABS, VT, Legal);

    // [SU][MIN|MAX] are available for all NEON types apart from i64.
    if (VT != MVT::v2i64 && VT != MVT::v1i64)
      for (unsigned MinMaxOpc : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
        setOperationAction(MinMaxOpc, VT, Legal);
  }

  // F[MIN|MAX][NUM|NAN] are available for all FP NEON types; f16 elements
  // additionally require full FP16 support.
  if (IsFP &&
      (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
    for (unsigned FPMinMaxOpc :
         {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM})
      setOperationAction(FPMinMaxOpc, VT, Legal);

  // Indexed vector loads/stores are only made legal on little-endian
  // subtargets.
  if (!Subtarget->isLittleEndian())
    return;

  const unsigned FirstMode = static_cast<unsigned>(ISD::PRE_INC);
  const unsigned EndMode = static_cast<unsigned>(ISD::LAST_INDEXED_MODE);
  for (unsigned Mode = FirstMode; Mode != EndMode; ++Mode) {
    setIndexedLoadAction(Mode, VT, Legal);
    setIndexedStoreAction(Mode, VT, Legal);
  }
}
/// Register \p VT in the FPR64 register class, then run the shared NEON
/// per-type setup for it with v2i32 as the accompanying type argument.
void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
  // 64-bit wide integer vector type handed to addTypeForNEON alongside VT.
  const MVT IntVT64 = MVT::v2i32;

  addRegisterClass(VT, &AArch64::FPR64RegClass);
  addTypeForNEON(VT, IntVT64);
}
/// Register \p VT in the FPR128 register class, then run the shared NEON
/// per-type setup for it with v4i32 as the accompanying type argument.
void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
  // 128-bit wide integer vector type handed to addTypeForNEON alongside VT.
  const MVT IntVT128 = MVT::v4i32;

  addRegisterClass(VT, &AArch64::FPR128RegClass);
  addTypeForNEON(VT, IntVT128);
}
/// Return the type a SETCC produces for operands of type \p VT.
///
/// Vector comparisons yield VT with its element type changed to integer;
/// every non-vector comparison yields i32. The DataLayout and LLVMContext
/// arguments are not consulted.
EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
                                              EVT VT) const {
  if (VT.isVector())
    return VT.changeVectorElementTypeToInteger();

  return MVT::i32;
}
/// Try to replace the immediate of the logical operation \p Op with one that
/// agrees on every demanded bit but is all-zeros, all-ones, or a replicated
/// shifted mask that can be passed to AArch64_AM::encodeLogicalImmediate().
///
/// \param Op       Node being rewritten; its operand 0 is reused unchanged.
/// \param Size     Width of the operation in bits. It sizes the initial mask
///                 and is passed to the immediate encoder.
/// \param Imm      Current immediate value.
/// \param Demanded Bits of the result that must keep their current value.
/// \param TLO      Supplies the DAG and receives the replacement node.
/// \param NewOpc   Machine opcode used when the new immediate is neither
///                 all-zeros nor all-ones.
/// \returns false when the immediate is left alone; otherwise the result of
///          TLO.CombineTo(Op, New).
///
/// NOTE(review): the initial mask shifts by (64 - Size), so Size must be in
/// [1, 64]; the only caller visible here asserts 32 or 64.
static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
const APInt &Demanded,
TargetLowering::TargetLoweringOpt &TLO,
unsigned NewOpc) {
uint64_t OldImm = Imm, NewImm, Enc;
// Mask covers the low Size bits; it shrinks as the element size is halved,
// while OrigMask keeps the full-width value for the all-ones test below.
uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
// Return if the immediate is already all zeros, all ones, a bimm32 or a
// bimm64.
if (Imm == 0 || Imm == Mask ||
AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
return false;
unsigned EltSize = Size;
uint64_t DemandedBits = Demanded.getZExtValue();
// Clear bits that are not demanded.
Imm &= DemandedBits;
while (true) {
// The goal here is to set the non-demanded bits in a way that minimizes
// the number of switching between 0 and 1. In order to achieve this goal,
// we set the non-demanded bits to the value of the preceding demanded bits.
// For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
// non-demanded bit), we copy bit0 (1) to the least significant 'x',
// bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
// The final result is 0b11000011.
uint64_t NonDemandedBits = ~DemandedBits;
uint64_t InvertedImm = ~Imm & DemandedBits;
// Rotate the inverted demanded bits left by one within the element so each
// one lines up with the non-demanded bit directly above it.
uint64_t RotatedImm =
((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
NonDemandedBits;
// The addition ripples through each run of non-demanded bits; Carry feeds
// a carry out of the element's top bit back in at bit 0 (wrap-around).
uint64_t Sum = RotatedImm + NonDemandedBits;
bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
uint64_t Ones = (Sum + Carry) & NonDemandedBits;
NewImm = (Imm | Ones) & Mask;
// If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
// or all-ones or all-zeros, in which case we can stop searching. Otherwise,
// we halve the element size and continue the search.
if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
break;
// We cannot shrink the element size any further if it is 2-bits.
if (EltSize == 2)
return false;
EltSize /= 2;
Mask >>= EltSize;
uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
// Return if there is mismatch in any of the demanded bits of Imm and Hi.
if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
return false;
// Merge the upper and lower halves of Imm and DemandedBits.
Imm |= Hi;
DemandedBits |= DemandedBitsHi;
}
++NumOptimizedImms;
// Replicate the element across the register width.
while (EltSize < Size) {
NewImm |= NewImm << EltSize;
EltSize *= 2;
}
// OldImm is only read by the asserts below; the cast silences the
// unused-variable warning in builds where asserts compile away.
(void)OldImm;
assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
"demanded bits should never be altered");
assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
// Create the new constant immediate node.
EVT VT = Op.getValueType();
SDLoc DL(Op);
SDValue New;
// If the new constant immediate is all-zeros or all-ones, let the target
// independent DAG combine optimize this node.
if (NewImm == 0 || NewImm == OrigMask) {
New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
TLO.DAG.getConstant(NewImm, DL, VT));
// Otherwise, create a machine node so that target independent DAG combine
// doesn't undo this optimization.
} else {
Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
New = SDValue(
TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
}
return TLO.CombineTo(Op, New);
}
/// Target hook that tries to rewrite the constant operand of a scalar
/// AND/OR/XOR so that only the demanded bits need to be honoured.
///
/// The rewrite is attempted only once operations are legal
/// (TLO.LegalOps), when EnableOptimizeLogicalImm is set, for non-vector types,
/// and when at least one bit is not demanded. The real work is delegated to
/// optimizeLogicalImm().
///
/// \returns true if \p Op was replaced, false otherwise.
bool AArch64TargetLowering::targetShrinkDemandedConstant(
    SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const {
  // Delay this optimization to as late as possible, and honour the option
  // that switches it off.
  if (!TLO.LegalOps || !EnableOptimizeLogicalImm)
    return false;

  const EVT OpVT = Op.getValueType();
  if (OpVT.isVector())
    return false;

  const unsigned Size = OpVT.getSizeInBits();
  assert((Size == 32 || Size == 64) &&
         "i32 or i64 is expected after legalization.");

  // Nothing can be gained when every bit is demanded.
  if (Demanded.countPopulation() == Size)
    return false;

  // Select the immediate-form machine opcode for the operation and width.
  const bool Is32Bit = Size == 32;
  unsigned NewOpc;
  switch (Op.getOpcode()) {
  case ISD::AND:
    NewOpc = Is32Bit ? AArch64::ANDWri : AArch64::ANDXri;
    break;
  case ISD::OR:
    NewOpc = Is32Bit ? AArch64::ORRWri : AArch64::ORRXri;
    break;
  case ISD::XOR:
    NewOpc = Is32Bit ? AArch64::EORWri : AArch64::EORXri;
    break;
  default:
    return false;
  }

  // Only a constant right-hand operand can be reshaped.
  auto *ImmNode = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!ImmNode)
    return false;

  return optimizeLogicalImm(Op, Size, ImmNode->getZExtValue(), Demanded, TLO,
                            NewOpc);
}
/// computeKnownBitsForTargetNode - Determine which of the bits specified in
/// Mask are known to be either zero or one and return them Known.
void AArch64TargetLowering::computeKnownBitsForTargetNode(
const SDValue Op, KnownBits &Known,
const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
switch (Op.getOpcode()) {
default:
break;
case AArch64ISD::CSEL: {
KnownBits Known2;
Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
Known.Zero &= Known2.Zero;
Known.One &= Known2.One;
break;
}
case ISD::INTRINSIC_W_CHAIN: {
ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
switch (IntID) {
default: return;
case Intrinsic::aarch64_ldaxr:
case Intrinsic::aarch64_ldxr: {
unsigned BitWidth = Known.getBitWidth();
EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
unsigned MemBits = VT.getScalarSizeInBits();
Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
return;
}
}
break;
}
case ISD::INTRINSIC_WO_CHAIN:
case ISD::INTRINSIC_VOID: {
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
switch (IntNo) {
default:
break;
case Intrinsic::aarch64_neon_umaxv:
case Intrinsic::aarch64_neon_uminv: {
// Figure out the datatype of the vector operand. The UMINV instruction
// will zero extend the result, so we can mark as known zero all the
// bits larger than the element datatype. 32-bit or larget doesn't need
// this as those are legal types and will be handled by isel directly.
MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
unsigned BitWidth = Known.getBitWidth();
if (VT == MVT::v8i8 || VT == MVT::v16i8) {
assert(BitWidth >= 8 && "Unexpected width!");
APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
Known.Zero |= Mask;
} else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
assert(BitWidth >= 16 && "Unexpected width!");
APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
Known.Zero |= Mask;
}
break;
} break;
}
}
}
}
/// Return the type used for scalar shift amounts. The answer is always i64;
/// neither the data layout nor the shifted value's type is consulted.
MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
                                                  EVT) const {
  const MVT ShiftAmountTy = MVT::i64;
  return ShiftAmountTy;
}
/// Report whether a misaligned access of type \p VT is permitted and, when
/// \p Fast is non-null, whether it should be treated as fast.
///
/// \returns false when the subtarget requires strict alignment, true
///          otherwise. \p Fast is written only in the latter case.
bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
    EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
    bool *Fast) const {
  if (Subtarget->requiresStrictAlign())
    return false;

  // The caller is not interested in the speed hint.
  if (!Fast)
    return true;

  // Some CPUs are fine with unaligned stores except for 128-bit ones, so an
  // access is only considered slow when all of the following hold:
  //  - the subtarget reports slow misaligned 128-bit stores,
  //  - the store size is exactly 16 bytes,
  //  - the alignment is greater than 2. Code that uses clang vector
  //    extensions can mark that it wants unaligned accesses to be treated as
  //    fast by underspecifying alignment to be 1 or 2 (see the comments in
  //    performSTORECombine() for more details),
  //  - the type is not v2i64. Memcpy lowering produces those and splitting
  //    them regresses performance on micro-benchmarks and olden/bh.
  const bool IsSlow = Subtarget->isMisaligned128StoreSlow() &&
                      VT.getStoreSize() == 16 && Align > 2 &&
                      VT != MVT::v2i64;
  *Fast = !IsSlow;
  return true;
}
/// Create the fast instruction selector for this target by forwarding both
/// arguments to AArch64::createFastISel().
FastISel *
AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                      const TargetLibraryInfo *libInfo) const {
  FastISel *Selector = AArch64::createFastISel(funcInfo, libInfo);
  return Selector;
}
const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((AArch64ISD::NodeType)Opcode) {
case AArch64ISD::FIRST_NUMBER: break;
case AArch64ISD::CALL: return "AArch64ISD::CALL";
case AArch64ISD::ADRP: return "AArch64ISD::ADRP";
case AArch64ISD::ADR: return "AArch64ISD::ADR";
case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow";
case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot";
case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG";
case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND";
case AArch64ISD::CSEL: return "AArch64ISD::CSEL";
case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL";
case AArch64ISD::CSINV: return "AArch64ISD::CSINV";
case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG";
case AArch64ISD::CSINC: return "AArch64ISD::CSINC";
case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ";
case AArch64ISD::ADC: return "AArch64ISD::ADC";
case AArch64ISD::SBC: return "AArch64ISD::SBC";
case AArch64ISD::ADDS: return "AArch64ISD::ADDS";
case AArch64ISD::SUBS: return "AArch64ISD::SUBS";
case AArch64ISD::ADCS: return "AArch64ISD::ADCS";
case AArch64ISD::SBCS: return "AArch64ISD::SBCS";
case AArch64ISD::ANDS: return "AArch64ISD::ANDS";
case AArch64ISD::CCMP: return "AArch64ISD::CCMP";
case AArch64ISD::CCMN: return "AArch64ISD::CCMN";
case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP";
case AArch64ISD::FCMP: return "AArch64ISD::FCMP";
case AArch64ISD::DUP: return "AArch64ISD::DUP";
case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8";
case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16";
case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32";
case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64";
case AArch64ISD::MOVI: return "AArch64ISD::MOVI";
case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift";
case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit";
case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl";
case AArch64ISD::FMOV: return "AArch64ISD::FMOV";
case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift";
case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl";
case AArch64ISD::BICi: return "AArch64ISD::BICi";
case AArch64ISD::ORRi: return "AArch64ISD::ORRi";
case AArch64ISD::BSL: return "AArch64ISD::BSL";
case AArch64ISD::NEG: return "AArch64ISD::NEG";
case AArch64ISD::EXTR: return "AArch64ISD::EXTR";
case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1";
case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2";
case AArch64ISD::UZP1: return "AArch64ISD::UZP1";
case AArch64ISD::UZP2: return "AArch64ISD::UZP2";
case AArch64ISD::TRN1: return "AArch64ISD::TRN1";
case AArch64ISD::TRN2: return "AArch64ISD::TRN2";
case AArch64ISD::REV16: return "AArch64ISD::REV16";
case AArch64ISD::REV32: return "AArch64ISD::REV32";
case AArch64ISD::REV64: return "AArch64ISD::REV64";
case AArch64ISD::EXT: return "AArch64ISD::EXT";
case AArch64ISD::VSHL: return "AArch64ISD::VSHL";
case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR";
case AArch64ISD::VASHR: return "AArch64ISD::VASHR";
case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ";
case AArch64ISD::CMGE: return "AArch64ISD::CMGE";
case AArch64ISD::CMGT: return "AArch64ISD::CMGT";
case AArch64ISD::CMHI: return "AArch64ISD::CMHI";
case AArch64ISD::CMHS: return "AArch64ISD::CMHS";
case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ";
case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE";
case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT";
case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz";
case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz";
case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz";
case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz";
case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz";
case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz";
case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz";
case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz";
case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz";
case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz";
case AArch64ISD::SADDV: return "AArch64ISD::SADDV";
case AArch64ISD::UADDV: return "AArch64ISD::UADDV";
case AArch64ISD::SMINV: return "AArch64ISD::SMINV";
case AArch64ISD::UMINV: return "AArch64ISD::UMINV";
case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV";
case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV";
case AArch64ISD::NOT: return "AArch64ISD::NOT";
case AArch64ISD::BIT: return "AArch64ISD::BIT";
case AArch64ISD::CBZ: return "AArch64ISD::CBZ";
case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ";
case AArch64ISD::TBZ: return "AArch64ISD::TBZ";
case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ";
case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN";
case AArch64ISD::PREFETCH: return "AArch64ISD::PREFETCH";
case AArch64ISD::SITOF: return "AArch64ISD::SITOF";
case AArch64ISD::UITOF: return "AArch64ISD::UITOF";
case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST";
case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I";
case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I";
case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I";
case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I";
case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I";
case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge";
case AArch64ISD::LD2post: return "AArch64ISD::LD2post";
case AArch64ISD::LD3post: return "AArch64ISD::LD3post";
case AArch64ISD::LD4post: return "AArch64ISD::LD4post";
case AArch64ISD::ST2post: return "AArch64ISD::ST2post";
case AArch64ISD::ST3post: return "AArch64ISD::ST3post";
case AArch64ISD::ST4post: return "AArch64ISD::ST4post";
case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post";
case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post";
case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post";
case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post";
case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post";
case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post";
case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost";
case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost";
case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost";
case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost";
case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost";
case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost";
case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost";
case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost";
case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost";
case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost";
case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
case AArch64ISD::SMULL: return "AArch64ISD::SMULL";
case AArch64ISD::UMULL: return "AArch64ISD::UMULL";
case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE";
case AArch64ISD::FRECPS: return "AArch64ISD::FRECPS";
case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE";
case AArch64ISD::FRSQRTS: return "AArch64ISD::FRSQRTS";
case AArch64ISD::STG: return "AArch64ISD::STG";
case AArch64ISD::STZG: return "AArch64ISD::STZG";
case AArch64ISD::ST2G: return "AArch64ISD::ST2G";
case AArch64ISD::STZ2G: return "AArch64ISD::STZ2G";
}
return nullptr;
}
/// Expand the F128CSEL pseudo-instruction \p MI, located in \p MBB, into a
/// conditional branch, a fall-through block and a PHI that selects between the
/// two source registers.
///
/// Operands read from \p MI: 0 = destination register, 1 = register used when
/// the condition holds, 2 = register used otherwise, 3 = condition code
/// immediate, 4 = operand whose kill flag tells whether NZCV dies here.
///
/// \returns the new block that receives everything that followed \p MI and
///          now starts with the PHI.
MachineBasicBlock *
AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
MachineBasicBlock *MBB) const {
// We materialise the F128CSEL pseudo-instruction as some control flow and a
// phi node:
// OrigBB:
// [... previous instrs leading to comparison ...]
// b.<cond> TrueBB
// b EndBB
// TrueBB:
// ; Fallthrough
// EndBB:
// Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
DebugLoc DL = MI.getDebugLoc();
// Insertion point for the new blocks: directly after MBB in the function.
MachineFunction::iterator It = ++MBB->getIterator();
unsigned DestReg = MI.getOperand(0).getReg();
unsigned IfTrueReg = MI.getOperand(1).getReg();
unsigned IfFalseReg = MI.getOperand(2).getReg();
unsigned CondCode = MI.getOperand(3).getImm();
bool NZCVKilled = MI.getOperand(4).isKill();
MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
MF->insert(It, TrueBB);
MF->insert(It, EndBB);
// Transfer rest of current basic-block to EndBB
EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
MBB->end());
EndBB->transferSuccessorsAndUpdatePHIs(MBB);
// Terminate the original block: branch to TrueBB when the condition holds,
// otherwise jump straight to EndBB.
BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
MBB->addSuccessor(TrueBB);
MBB->addSuccessor(EndBB);
// TrueBB falls through to the end.
TrueBB->addSuccessor(EndBB);
// If the flags are not killed by MI they are still needed afterwards, so
// they must be live into both new blocks.
if (!NZCVKilled) {
TrueBB->addLiveIn(AArch64::NZCV);
EndBB->addLiveIn(AArch64::NZCV);
}
BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
.addReg(IfTrueReg)
.addMBB(TrueBB)
.addReg(IfFalseReg)
.addMBB(MBB);
// The pseudo has been fully replaced; remove it.
MI.eraseFromParent();
return EndBB;
}
/// Custom-inserter hook for CATCHRET. It emits nothing and leaves \p MI in
/// place; in asserts-enabled builds it only checks that the enclosing
/// function's personality is not an asynchronous EH personality.
///
/// \returns \p BB unchanged.
MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
MachineInstr &MI, MachineBasicBlock *BB) const {
assert(!isAsynchronousEHPersonality(classifyEHPersonality(
BB->getParent()->getFunction().getPersonalityFn())) &&
"SEH does not use catchret!");
return BB;
}
/// Custom-inserter hook for CATCHPAD: the pseudo-instruction is simply erased
/// and nothing is emitted in its place.
///
/// \returns \p BB unchanged.
MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchPad(
MachineInstr &MI, MachineBasicBlock *BB) const {
MI.eraseFromParent();
return BB;
}
/// Dispatch a pseudo-instruction that needs custom insertion to the routine
/// that expands it.
///
/// Handles F128CSEL, STACKMAP/PATCHPOINT, CATCHRET and CATCHPAD. Any other
/// opcode is a programming error: the instruction is dumped (in builds
/// without NDEBUG) and llvm_unreachable is hit.
///
/// \returns the block returned by the selected expansion routine.
MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
    MachineInstr &MI, MachineBasicBlock *BB) const {
  const unsigned Opc = MI.getOpcode();

  if (Opc == AArch64::F128CSEL)
    return EmitF128CSEL(MI, BB);

  if (Opc == TargetOpcode::STACKMAP || Opc == TargetOpcode::PATCHPOINT)
    return emitPatchPoint(MI, BB);

  if (Opc == AArch64::CATCHRET)
    return EmitLoweredCatchRet(MI, BB);

  if (Opc == AArch64::CATCHPAD)
    return EmitLoweredCatchPad(MI, BB);

#ifndef NDEBUG
  MI.dump();
#endif
  llvm_unreachable("Unexpected instruction for custom inserter!");
}
//===----------------------------------------------------------------------===//
// AArch64 Lowering private implementation.
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// Lowering Code
//===----------------------------------------------------------------------===//
/// changeIntCCToAArch64CC - Map an integer DAG condition code onto the
/// AArch64 condition code that tests the same relation. Signed predicates map
/// to GT/GE/LT/LE, unsigned ones to HI/HS/LO/LS. Any other predicate is
/// unreachable.
static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
  switch (CC) {
  // Equality.
  case ISD::SETEQ:  return AArch64CC::EQ;
  case ISD::SETNE:  return AArch64CC::NE;
  // Signed ordering.
  case ISD::SETGT:  return AArch64CC::GT;
  case ISD::SETGE:  return AArch64CC::GE;
  case ISD::SETLT:  return AArch64CC::LT;
  case ISD::SETLE:  return AArch64CC::LE;
  // Unsigned ordering.
  case ISD::SETUGT: return AArch64CC::HI;
  case ISD::SETUGE: return AArch64CC::HS;
  case ISD::SETULT: return AArch64CC::LO;
  case ISD::SETULE: return AArch64CC::LS;
  default:
    llvm_unreachable("Unknown condition code!");
  }
}
/// changeFPCCToAArch64CC - Map a floating-point DAG condition code onto
/// AArch64 condition codes.
///
/// \p CondCode always receives a condition. \p CondCode2 receives AL unless
/// the predicate needs a second test (SETONE and SETUEQ), in which case it
/// receives that second condition.
static void changeFPCCToAArch64CC(ISD::CondCode CC,
                                  AArch64CC::CondCode &CondCode,
                                  AArch64CC::CondCode &CondCode2) {
  // Start from "no second condition"; only two predicates override this.
  CondCode2 = AArch64CC::AL;

  switch (CC) {
  // Predicates that need two condition codes.
  case ISD::SETONE:
    CondCode = AArch64CC::MI;
    CondCode2 = AArch64CC::GT;
    return;
  case ISD::SETUEQ:
    CondCode = AArch64CC::EQ;
    CondCode2 = AArch64CC::VS;
    return;

  // Predicates answered by a single condition code.
  case ISD::SETEQ:
  case ISD::SETOEQ:
    CondCode = AArch64CC::EQ;
    return;
  case ISD::SETGT:
  case ISD::SETOGT:
    CondCode = AArch64CC::GT;
    return;
  case ISD::SETGE:
  case ISD::SETOGE:
    CondCode = AArch64CC::GE;
    return;
  case ISD::SETOLT:
    CondCode = AArch64CC::MI;
    return;
  case ISD::SETOLE:
    CondCode = AArch64CC::LS;
    return;
  case ISD::SETO:
    CondCode = AArch64CC::VC;
    return;
  case ISD::SETUO:
    CondCode = AArch64CC::VS;
    return;
  case ISD::SETUGT:
    CondCode = AArch64CC::HI;
    return;
  case ISD::SETUGE:
    CondCode = AArch64CC::PL;
    return;
  case ISD::SETLT:
  case ISD::SETULT:
    CondCode = AArch64CC::LT;
    return;
  case ISD::SETLE:
  case ISD::SETULE:
    CondCode = AArch64CC::LE;
    return;
  case ISD::SETNE:
  case ISD::SETUNE:
    CondCode = AArch64CC::NE;
    return;

  default:
    llvm_unreachable("Unknown FP condition!");
  }
}
/// Map a floating-point DAG condition code onto AArch64 condition codes whose
/// results are meant to be AND'ed together, unlike changeFPCCToAArch64CC()
/// whose two results are meant to be OR'ed.
///
/// Only SETONE and SETUEQ produce a second condition; every other predicate is
/// forwarded to changeFPCCToAArch64CC() and leaves \p CondCode2 as AL.
static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
                                     AArch64CC::CondCode &CondCode,
                                     AArch64CC::CondCode &CondCode2) {
  CondCode2 = AArch64CC::AL;

  if (CC == ISD::SETONE) {
    // (a one b)
    //   == ((a olt b) || (a ogt b))
    //   == ((a ord b) && (a une b))
    CondCode = AArch64CC::VC;
    CondCode2 = AArch64CC::NE;
    return;
  }

  if (CC == ISD::SETUEQ) {
    // (a ueq b)
    //   == ((a uno b) || (a oeq b))
    //   == ((a ule b) && (a uge b))
    CondCode = AArch64CC::PL;
    CondCode2 = AArch64CC::LE;
    return;
  }

  // Everything else is a single-condition predicate.
  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
  assert(CondCode2 == AArch64CC::AL);
}
/// changeVectorFPCCToAArch64CC - Map a floating-point DAG condition code onto
/// AArch64 condition codes usable with the vector compare instructions.
/// Fewer operations are available without a real NZCV register, so some
/// predicates are expressed as the inverse of another one.
///
/// \param[out] Invert Set to true when the caller must invert the combined
///                    result of the returned condition(s); false otherwise.
static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
                                        AArch64CC::CondCode &CondCode,
                                        AArch64CC::CondCode &CondCode2,
                                        bool &Invert) {
  Invert = false;

  switch (CC) {
  case ISD::SETO:
  case ISD::SETUO:
    // Both use the same MI/GE pair; "unordered" is its inverse.
    Invert = (CC == ISD::SETUO);
    CondCode = AArch64CC::MI;
    CondCode2 = AArch64CC::GE;
    return;

  case ISD::SETUEQ:
  case ISD::SETULT:
  case ISD::SETULE:
  case ISD::SETUGT:
  case ISD::SETUGE:
    // All of the compare-mask comparisons are ordered, but we can switch
    // between the two by a double inversion. E.g. ULE == !OGT.
    Invert = true;
    changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2);
    return;

  default:
    // Mostly the scalar mappings work fine.
    changeFPCCToAArch64CC(CC, CondCode, CondCode2);
    return;
  }
}
/// Return true if \p C fits in the low 12 bits, or fits in bits [12, 24) with
/// the low 12 bits clear. The result is also printed to the debug stream.
static bool isLegalArithImmed(uint64_t C) {
  // Matches AArch64DAGToDAGISel::SelectArithImmed().
  const bool FitsUnshifted = (C >> 12) == 0;
  const bool FitsShiftedBy12 = (C & 0xFFFULL) == 0 && (C >> 24) == 0;
  const bool IsLegal = FitsUnshifted || FitsShiftedBy12;

  LLVM_DEBUG(dbgs() << "Is imm " << C
                    << " legal: " << (IsLegal ? "yes\n" : "no\n"));
  return IsLegal;
}
// Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on the
// grounds that "op1 - (-op2) == op1 + op2"? Not always: the C and V flags can
// be set differently by this operation. It comes down to whether
// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are equal
// then everything is fine; if not, the optimization is wrong. Thus general
// comparisons are only valid if op2 != 0.
//
// The only LLVM-native comparisons that don't mention C and V are SETEQ and
// SETNE, so they are the only ones for which CMN can safely be used in the
// absence of information about op2.
static bool isCMN(SDValue Op, ISD::CondCode CC) {
  // Op must have the shape (sub 0, x).
  if (Op.getOpcode() != ISD::SUB || !isNullConstant(Op.getOperand(0)))
    return false;

  return CC == ISD::SETEQ || CC == ISD::SETNE;
}
/// Emit the node that compares \p LHS with \p RHS and return the value that
/// carries the comparison result.
///
/// Floating-point operands produce an AArch64ISD::FCMP node (f16 operands are
/// first extended to f32 when the subtarget lacks full FP16 support). Integer
/// operands produce SUBS, ADDS (the CMN form) or ANDS (the TST form), and
/// result number 1 of that node is returned.
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                              const SDLoc &dl, SelectionDAG &DAG) {
  EVT VT = LHS.getValueType();

  if (VT.isFloatingPoint()) {
    assert(VT != MVT::f128);
    const auto &ST =
        static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
    if (VT == MVT::f16 && !ST.hasFullFP16()) {
      LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
      RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
      VT = MVT::f32;
    }
    return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
  }

  unsigned Opcode;
  if (isCMN(RHS, CC)) {
    // (CMP op1, (sub 0, op2)) becomes a CMN, i.e. an ADDS of op1 and op2.
    Opcode = AArch64ISD::ADDS;
    RHS = RHS.getOperand(1);
  } else if (isCMN(LHS, CC)) {
    // As we are looking for EQ/NE compares, the operands can be commuted, so
    // (CMP (sub 0, op1), op2) can become a CMN as well.
    Opcode = AArch64ISD::ADDS;
    LHS = LHS.getOperand(1);
  } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
             !isUnsignedIntSetCC(CC)) {
    // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
    // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
    // of the signed comparisons.
    const SDValue AndNode = LHS;
    Opcode = AArch64ISD::ANDS;
    LHS = AndNode.getOperand(0);
    RHS = AndNode.getOperand(1);
  } else {
    // The CMP instruction is just an alias for SUBS, and representing it as
    // SUBS means that it's possible to get CSE with subtract operations.
    // A later phase can perform the optimization of setting the destination
    // register to WZR/XZR if it ends up being unused.
    Opcode = AArch64ISD::SUBS;
  }

  SDValue FlagSetter =
      DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS);
  return FlagSetter.getValue(1);
}
/// \defgroup AArch64CCMP CMP;CCMP matching
///
/// These functions deal with the formation of CMP;CCMP;... sequences.
/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
/// a comparison. They set the NZCV flags to a predefined value if their
/// predicate is false. This allows to express arbitrary conjunctions, for
/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
/// expressed as:
/// cmp A
/// ccmp B, inv(CB), CA
/// check for CB flags
///
/// This naturally lets us implement chains of AND operations with SETCC
/// operands. And we can even implement some other situations by transforming
/// them:
/// - We can implement (NEG SETCC) i.e. negating a single comparison by
/// negating the flags used in a CCMP/FCCMP operations.
/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
/// by negating the flags we test for afterwards. i.e.
/// NEG (CMP CCMP CCMP ...) can be implemented.
/// - Note that we can only ever negate all previously processed results.
/// What we can not implement by flipping the flags to test is a negation
/// of two sub-trees (because the negation affects all sub-trees emitted so
/// far, so the 2nd sub-tree we emit would also affect the first).
/// With those tools we can implement some OR operations:
/// - (OR (SETCC A) (SETCC B)) can be implemented via:
/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
/// - After transforming OR to NEG/AND combinations we may be able to use NEG
/// elimination rules from earlier to implement the whole thing as a
/// CCMP/FCCMP chain.
///
/// As complete example:
/// "or (or (setCA (cmp A)) (setCB (cmp B)))
/// (and (setCC (cmp C)) (setCD (cmp D)))"
/// can be reassociated to:
/// "or (and (setCC (cmp C)) (setCD (cmp D)))
/// (or (setCA (cmp A)) (setCB (cmp B)))"
/// can be transformed to:
/// "not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
/// (and (not (setCA (cmp A))) (not (setCB (cmp B)))))"
/// which can be implemented as:
/// cmp C
/// ccmp D, inv(CD), CC
/// ccmp A, CA, inv(CD)
/// ccmp B, CB, inv(CA)
/// check for CB flags
///
/// A counterexample is "or (and A B) (and C D)" which translates to
/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
/// can only implement 1 of the inner (not) operations, but not both!
/// @{
/// Build one conditional-compare node: FCCMP for floating-point operands,
/// CCMN when \p RHS is (sub 0, x) under SETEQ/SETNE, and CCMP otherwise.
///
/// \param CCOp      Flags value produced by the previous comparison in the
///                  chain.
/// \param Predicate Condition passed to the node as its condition operand.
/// \param OutCC     Condition the caller will test afterwards; the node's NZCV
///                  immediate is chosen to satisfy the inverse of this
///                  condition.
static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
                                         ISD::CondCode CC, SDValue CCOp,
                                         AArch64CC::CondCode Predicate,
                                         AArch64CC::CondCode OutCC,
                                         const SDLoc &DL, SelectionDAG &DAG) {
  // Plain CCMP unless one of the special forms below applies.
  unsigned Opcode = AArch64ISD::CCMP;

  if (LHS.getValueType().isFloatingPoint()) {
    assert(LHS.getValueType() != MVT::f128);
    const auto &ST =
        static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
    if (LHS.getValueType() == MVT::f16 && !ST.hasFullFP16()) {
      LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
      RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
    }
    Opcode = AArch64ISD::FCCMP;
  } else if (RHS.getOpcode() == ISD::SUB &&
             isNullConstant(RHS.getOperand(0)) &&
             (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    // See emitComparison() on why we can only do this for SETEQ and SETNE.
    Opcode = AArch64ISD::CCMN;
    RHS = RHS.getOperand(1);
  }

  SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
  const unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(
      AArch64CC::getInvertedCondCode(OutCC));
  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
  return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
}
/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
/// expressed as a conjunction. See \ref AArch64CCMP.
/// \param CanNegate   Set to true if the whole sub-tree can be negated just by
///                    changing the conditions on the SETCC tests (meaning
///                    emitConjunctionRec() may be called with Negate==true on
///                    this sub-tree).
/// \param MustBeFirst Set to true if this sub-tree needs to be negated and the
///                    negation cannot be done naturally; the sub-tree then has
///                    to be emitted first.
/// \param WillNegate  True when the result of this sub-expression must be
///                    negated, which happens when the outer expression is an
///                    OR. This reveals a double negation (or (or ...) ...)
///                    that can be implemented for free.
/// \param Depth       Current recursion depth; recursion gives up beyond 6.
/// The two output parameters are only meaningful when true is returned.
static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
                               bool &MustBeFirst, bool WillNegate,
                               unsigned Depth = 0) {
  if (!Val.hasOneUse())
    return false;

  const unsigned Opcode = Val->getOpcode();

  // Leaf: a single comparison can always be negated in place.
  if (Opcode == ISD::SETCC) {
    if (Val->getOperand(0).getValueType() == MVT::f128)
      return false;
    CanNegate = true;
    MustBeFirst = false;
    return true;
  }

  // Protect against exponential runtime and stack overflow.
  if (Depth > 6)
    return false;

  if (Opcode != ISD::AND && Opcode != ISD::OR)
    return false;

  const bool IsOR = Opcode == ISD::OR;

  bool CanNegateL;
  bool MustBeFirstL;
  if (!canEmitConjunction(Val->getOperand(0), CanNegateL, MustBeFirstL, IsOR,
                          Depth + 1))
    return false;

  bool CanNegateR;
  bool MustBeFirstR;
  if (!canEmitConjunction(Val->getOperand(1), CanNegateR, MustBeFirstR, IsOR,
                          Depth + 1))
    return false;

  // Only one sub-tree can be emitted first.
  if (MustBeFirstL && MustBeFirstR)
    return false;

  if (!IsOR) {
    assert(Opcode == ISD::AND && "Must be OR or AND");
    // We cannot naturally negate an AND operation.
    CanNegate = false;
    MustBeFirst = MustBeFirstL || MustBeFirstR;
    return true;
  }

  // For an OR expression we need to be able to naturally negate at least
  // one side or we cannot do the transformation at all.
  if (!CanNegateL && !CanNegateR)
    return false;

  // If the result of the OR will be negated and both leaves negate
  // naturally, then this sub-tree as a whole negates naturally.
  CanNegate = WillNegate && CanNegateL && CanNegateR;
  // If we cannot naturally negate the whole sub-tree, then it must be
  // emitted first.
  MustBeFirst = !CanNegate;
  return true;
}
/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
/// of CCMP/FCCMP ops. See @ref AArch64CCMP.
/// Tries to transform the given i1 producing node @p Val to a series compare
/// and conditional compare operations. @returns an NZCV flags producing node
/// and sets @p OutCC to the flags that should be tested or returns SDValue() if
/// transformation was not possible.
/// \p Negate is true if we want this sub-tree being negated just by changing
/// SETCC conditions.
/// \p CCOp is the flags value of the comparison emitted before this sub-tree
/// (an empty SDValue when this sub-tree starts the chain), and \p Predicate is
/// the condition handed to the conditional comparison that consumes \p CCOp.
static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
AArch64CC::CondCode Predicate) {
// We're at a tree leaf, produce a conditional comparison operation.
unsigned Opcode = Val->getOpcode();
if (Opcode == ISD::SETCC) {
SDValue LHS = Val->getOperand(0);
SDValue RHS = Val->getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
bool isInteger = LHS.getValueType().isInteger();
if (Negate)
CC = getSetCCInverse(CC, isInteger);
SDLoc DL(Val);
// Determine OutCC and handle FP special case.
if (isInteger) {
OutCC = changeIntCCToAArch64CC(CC);
} else {
assert(LHS.getValueType().isFloatingPoint());
AArch64CC::CondCode ExtraCC;
changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
// Some floating point conditions can't be tested with a single condition
// code. Construct an additional comparison in this case.
if (ExtraCC != AArch64CC::AL) {
SDValue ExtraCmp;
if (!CCOp.getNode())
ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
else
ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
ExtraCC, DL, DAG);
// The extra comparison becomes the predecessor of the comparison emitted
// below, which is then predicated on ExtraCC.
CCOp = ExtraCmp;
Predicate = ExtraCC;
}
}
// Produce a normal comparison if we are first in the chain
if (!CCOp)
return emitComparison(LHS, RHS, CC, DL, DAG);
// Otherwise produce a ccmp.
return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
DAG);
}
assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
bool IsOR = Opcode == ISD::OR;
// Re-query both operands to learn which side can be negated naturally and
// which side, if any, has to be emitted first.
SDValue LHS = Val->getOperand(0);
bool CanNegateL;
bool MustBeFirstL;
bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
assert(ValidL && "Valid conjunction/disjunction tree");
(void)ValidL;
SDValue RHS = Val->getOperand(1);
bool CanNegateR;
bool MustBeFirstR;
bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
assert(ValidR && "Valid conjunction/disjunction tree");
(void)ValidR;
// Swap sub-tree that must come first to the right side.
// (The right side is the one emitted first below.)
if (MustBeFirstL) {
assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
std::swap(LHS, RHS);
std::swap(CanNegateL, CanNegateR);
std::swap(MustBeFirstL, MustBeFirstR);
}
// NegateR / NegateL: negate that sub-tree by flipping its SETCC conditions.
// NegateAfterR: invert the condition code returned for the right sub-tree.
// NegateAfterAll: invert the final condition code reported in OutCC.
bool NegateR;
bool NegateAfterR;
bool NegateL;
bool NegateAfterAll;
if (Opcode == ISD::OR) {
// Swap the sub-tree that we can negate naturally to the left.
if (!CanNegateL) {
assert(CanNegateR && "at least one side must be negatable");
assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
assert(!Negate);
std::swap(LHS, RHS);
NegateR = false;
NegateAfterR = true;
} else {
// Negate the right sub-tree if possible, otherwise negate its result.
NegateR = CanNegateR;
NegateAfterR = !CanNegateR;
}
NegateL = true;
NegateAfterAll = !Negate;
} else {
assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
assert(!Negate && "Valid conjunction/disjunction tree");
NegateL = false;
NegateR = false;
NegateAfterR = false;
NegateAfterAll = false;
}
// Emit sub-trees.
// The right sub-tree goes first; its flags and condition code then feed the
// left sub-tree as CCOp and Predicate.
AArch64CC::CondCode RHSCC;
SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
if (NegateAfterR)
RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
if (NegateAfterAll)
OutCC = AArch64CC::getInvertedCondCode(OutCC);
return CmpL;
}
/// Lower \p Val as a conjunction, i.e. a CMP/FCMP followed by a chain of
/// CCMP/CFCMP operations. Some expressions containing OR nodes can be handled
/// as well. See \ref AArch64CCMP.
///
/// \param OutCC receives the condition code to test on the returned flags.
/// \returns the flags producing node, or an empty SDValue when \p Val does not
/// form a tree that canEmitConjunction() accepts.
/// \see emitConjunctionRec().
static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
                               AArch64CC::CondCode &OutCC) {
  // Only the yes/no answer of the feasibility query matters here; its two
  // out-parameters are discarded.
  bool IgnoredCanNegate;
  bool IgnoredMustBeFirst;
  const bool Feasible =
      canEmitConjunction(Val, IgnoredCanNegate, IgnoredMustBeFirst, false);
  if (Feasible)
    return emitConjunctionRec(DAG, Val, OutCC, /*Negate=*/false,
                              /*CCOp=*/SDValue(), /*Predicate=*/AArch64CC::AL);
  return SDValue();
}
/// @}
/// Score how worthwhile it is to fold the shift and/or extension that
/// produces \p Op into a comparison.
///
/// \returns 0 when nothing can be folded (or \p Op has more than one use),
/// 1 for a lone extend or a lone constant shift, and 2 for an extend that is
/// shifted by a constant of at most 4.
static unsigned getCmpOperandFoldingProfit(SDValue Op) {
  // An "extend" is either SIGN_EXTEND_INREG or an AND with one of the
  // 8/16/32-bit all-ones masks.
  auto IsFoldableExtend = [](SDValue V) -> bool {
    const unsigned VOpc = V.getOpcode();
    if (VOpc == ISD::SIGN_EXTEND_INREG)
      return true;
    if (VOpc != ISD::AND)
      return false;
    auto *MaskNode = dyn_cast<ConstantSDNode>(V.getOperand(1));
    if (!MaskNode)
      return false;
    const uint64_t Bits = MaskNode->getZExtValue();
    return Bits == 0xFF || Bits == 0xFFFF || Bits == 0xFFFFFFFF;
  };

  // Folding an operand that has other users would not remove any work.
  if (!Op.hasOneUse())
    return 0;

  if (IsFoldableExtend(Op))
    return 1;

  const unsigned Opcode = Op.getOpcode();
  const bool IsShift =
      Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA;
  if (!IsShift)
    return 0;

  // Only shifts by a constant amount are considered.
  auto *AmountNode = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!AmountNode)
    return 0;
  const uint64_t Amount = AmountNode->getZExtValue();

  // Shifted extend: small shift amounts score higher.
  if (IsFoldableExtend(Op.getOperand(0)))
    return Amount <= 4 ? 2 : 1;

  // Plain shift: the amount has to be below the register width.
  const EVT VT = Op.getValueType();
  const bool AmountInRange = (VT == MVT::i32 && Amount <= 31) ||
                             (VT == MVT::i64 && Amount <= 63);
  return AmountInRange ? 1 : 0;
}
/// Emit a flag-setting comparison of \p LHS and \p RHS for the integer
/// condition \p CC.
///
/// The function works in four steps:
///  1. If \p RHS is a constant that is not a legal arithmetic immediate, try
///     to nudge it by one (adjusting the condition accordingly) so that it
///     becomes encodable.
///  2. Swap the operands when the left operand has the more profitable
///     shift/extend to fold (see getCmpOperandFoldingProfit()).
///  3. For equality tests against a constant, try two special forms: a
///     sign-extended compare for zero-extending i16 loads, and a CCMP chain
///     when comparing a boolean tree against 0 or 1.
///  4. Otherwise emit a plain comparison.
///
/// \param AArch64cc receives a constant node (type MVT_CC) holding the AArch64
///        condition code that must be tested on the returned flags.
/// \returns the node producing the flags.
static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                             SDValue &AArch64cc, SelectionDAG &DAG,
                             const SDLoc &dl) {
  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
    EVT VT = RHS.getValueType();
    uint64_t C = RHSC->getZExtValue();
    if (!isLegalArithImmed(C)) {
      // Constant does not fit, try adjusting it by one?
      switch (CC) {
      default:
        break;
      case ISD::SETLT:
      case ISD::SETGE:
        // x < C  <=>  x <= C - 1 (and x >= C  <=>  x > C - 1), unless C is the
        // minimum signed value of its type, where C - 1 would wrap around.
        if ((VT == MVT::i32 && C != 0x80000000 &&
             isLegalArithImmed((uint32_t)(C - 1))) ||
            (VT == MVT::i64 && C != 0x8000000000000000ULL &&
             isLegalArithImmed(C - 1ULL))) {
          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
          C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
          RHS = DAG.getConstant(C, dl, VT);
        }
        break;
      case ISD::SETULT:
      case ISD::SETUGE:
        // Same as above for unsigned compares; zero has no predecessor.
        if ((VT == MVT::i32 && C != 0 &&
             isLegalArithImmed((uint32_t)(C - 1))) ||
            (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
          C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
          RHS = DAG.getConstant(C, dl, VT);
        }
        break;
      case ISD::SETLE:
      case ISD::SETGT:
        // x <= C  <=>  x < C + 1, unless C is the maximum signed value.
        if ((VT == MVT::i32 && C != INT32_MAX &&
             isLegalArithImmed((uint32_t)(C + 1))) ||
            (VT == MVT::i64 && C != INT64_MAX &&
             isLegalArithImmed(C + 1ULL))) {
          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
          C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
          RHS = DAG.getConstant(C, dl, VT);
        }
        break;
      case ISD::SETULE:
      case ISD::SETUGT:
        // Unsigned variant; the maximum unsigned value has no successor.
        if ((VT == MVT::i32 && C != UINT32_MAX &&
             isLegalArithImmed((uint32_t)(C + 1))) ||
            (VT == MVT::i64 && C != UINT64_MAX &&
             isLegalArithImmed(C + 1ULL))) {
          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
          C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
          RHS = DAG.getConstant(C, dl, VT);
        }
        break;
      }
    }
  }
  // Comparisons are canonicalized so that the RHS operand is simpler than the
  // LHS one, the extreme case being when RHS is an immediate. However, AArch64
  // can fold some shift+extend operations on the RHS operand, so swap the
  // operands if that can be done.
  //
  // For example:
  //    lsl     w13, w11, #1
  //    cmp     w13, w12
  // can be turned into:
  //    cmp     w12, w11, lsl #1
  if (!isa<ConstantSDNode>(RHS) ||
      !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
    SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
    if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
      std::swap(LHS, RHS);
      CC = ISD::getSetCCSwappedOperands(CC);
    }
  }
  SDValue Cmp;
  AArch64CC::CondCode AArch64CC;
  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
    const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
    // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
    // For the i8 operand, the largest immediate is 255, so this can be easily
    // encoded in the compare instruction. For the i16 operand, however, the
    // largest immediate cannot be encoded in the compare.
    // Therefore, use a sign extending load and cmn to avoid materializing the
    // -1 constant. For example,
    //    movz w1, #65535
    //    ldrh w0, [x0, #0]
    //    cmp  w0, w1
    // >
    //    ldrsh w0, [x0, #0]
    //    cmn   w0, #1
    // Fundamentally, we're relying on the property that (zext LHS) == (zext
    // RHS) if and only if (sext LHS) == (sext RHS). The checks are in place to
    // ensure both the LHS and RHS are truly zero extended and to make sure the
    // transformation is profitable.
    if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
        cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
        cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
        LHS.getNode()->hasNUsesOfValue(1, 0)) {
      // Reinterpret the low 16 bits of the constant as a signed value.
      int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
      if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
        SDValue SExt =
            DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
                        DAG.getValueType(MVT::i16));
        Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
                                                   RHS.getValueType()),
                             CC, dl, DAG);
        AArch64CC = changeIntCCToAArch64CC(CC);
      }
    }
    // (setcc tree, 0/1, eq/ne): try to emit the boolean tree as a CCMP chain.
    if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
      if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
        // "tree != 1" and "tree == 0" test the negated tree.
        if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
          AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
      }
    }
  }
  // None of the special cases applied: emit a plain comparison.
  if (!Cmp) {
    Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
    AArch64CC = changeIntCCToAArch64CC(CC);
  }
  AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
  return Cmp;
}
/// Build the AArch64 nodes for an arithmetic-with-overflow operation \p Op
/// (SADDO, UADDO, SSUBO, USUBO, SMULO or UMULO on i32 or i64).
///
/// \param CC receives the condition code that has to be tested on the
///        returned overflow flags.
/// \returns a pair (Value, Overflow): the arithmetic result and the flags
///          result (value #1 of an ADDS/SUBS node) to test with \p CC.
static std::pair<SDValue, SDValue>
getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
"Unsupported value type");
SDValue Value, Overflow;
SDLoc DL(Op);
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
// Opc stays 0 for the multiply cases, which build Value/Overflow themselves.
unsigned Opc = 0;
switch (Op.getOpcode()) {
default:
llvm_unreachable("Unknown overflow instruction!");
case ISD::SADDO:
Opc = AArch64ISD::ADDS;
CC = AArch64CC::VS;
break;
case ISD::UADDO:
Opc = AArch64ISD::ADDS;
CC = AArch64CC::HS;
break;
case ISD::SSUBO:
Opc = AArch64ISD::SUBS;
CC = AArch64CC::VS;
break;
case ISD::USUBO:
Opc = AArch64ISD::SUBS;
CC = AArch64CC::LO;
break;
// Multiply needs a little bit extra work.
case ISD::SMULO:
case ISD::UMULO: {
// Every multiply variant below signals overflow through a SUBS whose
// operands differ exactly when the multiplication overflowed.
CC = AArch64CC::NE;
bool IsSigned = Op.getOpcode() == ISD::SMULO;
if (Op.getValueType() == MVT::i32) {
unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
// For a 32 bit multiply with overflow check we want the instruction
// selector to generate a widening multiply (SMADDL/UMADDL). For that we
// need to generate the following pattern:
// (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b))
LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
DAG.getConstant(0, DL, MVT::i64));
// On AArch64 the upper 32 bits are always zero extended for a 32 bit
// operation. We need to clear out the upper 32 bits, because we used a
// widening multiply that wrote all 64 bits. In the end this should be a
// noop.
Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
if (IsSigned) {
// The signed overflow check requires more than just a simple check for
// any bit set in the upper 32 bits of the result. These bits could be
// just the sign bits of a negative number. To perform the overflow
// check we arithmetic-shift the truncated 32-bit result right by 31 bits
// (replicating its sign bit) and compare that to the upper 32 bits of
// the 64-bit product.
SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
DAG.getConstant(32, DL, MVT::i64));
UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
DAG.getConstant(31, DL, MVT::i64));
// It is important that LowerBits is last, otherwise the arithmetic
// shift will not be folded into the compare (SUBS).
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
.getValue(1);
} else {
// The overflow check for unsigned multiply is easy. We only need to
// check if any of the upper 32 bits are set. This can be done with a
// CMP (shifted register). For that we need to generate the following
// pattern:
// (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32)
SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
DAG.getConstant(32, DL, MVT::i64));
SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
Overflow =
DAG.getNode(AArch64ISD::SUBS, DL, VTs,
DAG.getConstant(0, DL, MVT::i64),
UpperBits).getValue(1);
}
break;
}
assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
// For the 64 bit multiply
Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
if (IsSigned) {
// Signed: the high half (MULHS) must equal the sign-replication of the low
// half, i.e. the low half shifted right arithmetically by 63.
SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
DAG.getConstant(63, DL, MVT::i64));
// It is important that LowerBits is last, otherwise the arithmetic
// shift will not be folded into the compare (SUBS).
SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
.getValue(1);
} else {
// Unsigned: overflow iff the high half (MULHU) is non-zero.
SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
Overflow =
DAG.getNode(AArch64ISD::SUBS, DL, VTs,
DAG.getConstant(0, DL, MVT::i64),
UpperBits).getValue(1);
}
break;
}
} // switch (...)
// Add/sub cases: a single flag-setting node yields both the value (result 0)
// and the flags (result 1).
if (Opc) {
SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
// Emit the AArch64 operation with overflow check.
Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
Overflow = Value.getValue(1);
}
return std::make_pair(Value, Overflow);
}
/// Replace \p Op by a call to the runtime library routine \p Call, passing all
/// operands of \p Op as arguments. The call's return type is f128.
/// \returns the value returned by the library call.
SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
                                             RTLIB::Libcall Call) const {
  SDLoc DL(Op);
  // Every operand of the node is forwarded unchanged, in order.
  SmallVector<SDValue, 2> CallArgs;
  for (const SDValue &Operand : Op->op_values())
    CallArgs.push_back(Operand);
  return makeLibCall(DAG, Call, MVT::f128, CallArgs, /*isSigned=*/false, DL)
      .first;
}
// Tells whether Op is result #1 (the overflow flag) of one of the
// arithmetic-with-overflow nodes: [SU]ADDO, [SU]SUBO or [SU]MULO.
static bool isOverflowIntrOpRes(SDValue Op) {
  // The overflow bit is always the second result of these nodes.
  if (Op.getResNo() != 1)
    return false;
  switch (Op.getOpcode()) {
  case ISD::SADDO:
  case ISD::UADDO:
  case ISD::SSUBO:
  case ISD::USUBO:
  case ISD::SMULO:
  case ISD::UMULO:
    return true;
  default:
    return false;
  }
}
/// Custom lowering of ISD::XOR. Two patterns are recognised:
///
///  * (xor overflow_bit, 1), where overflow_bit is the flag result of an
///    overflow intrinsic: becomes a CSEL on the inverted overflow condition,
///    so that no separate "not" is needed.
///  * (xor x, (select_cc a, b, 0, -1, cc)) (or the commuted 0/-1 form):
///    becomes (csel x, (xor x, -1), cc, cmp), which can later be matched to a
///    CSINV instruction.
///
/// \returns the replacement node, \p Op itself if no pattern applies, or an
/// empty SDValue if the overflow operation has a type that is not legal.
static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
  SDValue Sel = Op.getOperand(0);
  SDValue Other = Op.getOperand(1);
  // The debug location is taken from the first operand, before any swapping.
  SDLoc dl(Sel);

  // Pattern 1: logical negation of an overflow flag.
  if (isOneConstant(Other) && isOverflowIntrOpRes(Sel)) {
    // Only lower legal XALUO ops.
    const bool LegalTy =
        DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0));
    if (!LegalTy)
      return SDValue();

    SDValue One = DAG.getConstant(1, dl, MVT::i32);
    SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
    AArch64CC::CondCode OverflowCC;
    std::pair<SDValue, SDValue> ValueAndFlags =
        getAArch64XALUOOp(OverflowCC, Sel.getValue(0), DAG);
    SDValue InvertedCC =
        DAG.getConstant(getInvertedCondCode(OverflowCC), dl, MVT::i32);
    return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), One, Zero,
                       InvertedCC, ValueAndFlags.second);
  }

  // Pattern 2: make sure Sel names the SELECT_CC operand, if there is one.
  if (Sel.getOpcode() != ISD::SELECT_CC) {
    std::swap(Sel, Other);
    if (Sel.getOpcode() != ISD::SELECT_CC)
      return Op;
  }

  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
  SDValue CmpLHS = Sel.getOperand(0);
  SDValue CmpRHS = Sel.getOperand(1);

  // FIXME: This could be generalized to non-integer comparisons.
  const EVT CmpVT = CmpLHS.getValueType();
  if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
    return Op;

  // Both selected values must be constants.
  ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(Sel.getOperand(3));
  ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(Sel.getOperand(2));
  if (!FalseC || !TrueC)
    return Op;

  // The fold needs "true -> 0, false -> -1". The mirrored arrangement is
  // accepted as well by inverting the condition.
  const bool Mirrored = TrueC->isAllOnesValue() && FalseC->isNullValue();
  const bool Canonical = TrueC->isNullValue() && FalseC->isAllOnesValue();
  if (!Mirrored && !Canonical)
    return Op;
  if (Mirrored)
    CC = ISD::getSetCCInverse(CC, true);

  // The constants line up, perform the transform.
  SDValue CCVal;
  SDValue Cmp = getAArch64Cmp(CmpLHS, CmpRHS, CC, CCVal, DAG, dl);
  const EVT OtherVT = Other.getValueType();
  SDValue NotOther = DAG.getNode(ISD::XOR, dl, OtherVT, Other,
                                 DAG.getConstant(-1ULL, dl, OtherVT));
  return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), Other, NotOther,
                     CCVal, Cmp);
}
/// Lower ADDC/SUBC/ADDE/SUBE to the flag-setting AArch64 nodes
/// ADDS/SUBS/ADCS/SBCS. The "E" variants forward their third operand as well.
/// \returns the new node, or an empty SDValue when the value type is not legal
/// yet (so that the legalizer expands the operation).
static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
  const EVT VT = Op.getValueType();
  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  // Results: the arithmetic value plus an i32 flags value.
  SDVTList ResultTys = DAG.getVTList(VT, MVT::i32);

  // Two-operand form, used for ADDC/SUBC.
  auto EmitPlain = [&](unsigned NewOpc) {
    return DAG.getNode(NewOpc, SDLoc(Op), ResultTys, Op.getOperand(0),
                       Op.getOperand(1));
  };
  // Three-operand form, used for ADDE/SUBE.
  auto EmitWithExtraOp = [&](unsigned NewOpc) {
    return DAG.getNode(NewOpc, SDLoc(Op), ResultTys, Op.getOperand(0),
                       Op.getOperand(1), Op.getOperand(2));
  };

  switch (Op.getOpcode()) {
  case ISD::ADDC:
    return EmitPlain(AArch64ISD::ADDS);
  case ISD::SUBC:
    return EmitPlain(AArch64ISD::SUBS);
  case ISD::ADDE:
    return EmitWithExtraOp(AArch64ISD::ADCS);
  case ISD::SUBE:
    return EmitWithExtraOp(AArch64ISD::SBCS);
  default:
    llvm_unreachable("Invalid code");
  }
}
/// Lower an arithmetic-with-overflow node into the flag-setting AArch64
/// operation plus a CSEL that materialises the overflow bit as 0 or 1.
/// \returns a MERGE_VALUES of (value, overflow bit), or an empty SDValue if
/// the value type is not legal yet.
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
  const EVT VT = Op.getValueType();
  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  SDLoc dl(Op);

  // Emit the operation that sets the overflow or carry flag; OverflowCC is
  // the condition under which overflow happened.
  AArch64CC::CondCode OverflowCC;
  std::pair<SDValue, SDValue> ValueAndFlags =
      getAArch64XALUOOp(OverflowCC, Op, DAG);

  // 1 stands for "overflow", 0 for "no overflow".
  SDValue One = DAG.getConstant(1, dl, MVT::i32);
  SDValue Zero = DAG.getConstant(0, dl, MVT::i32);

  // The condition is inverted and the select operands are swapped to match.
  // This allows selection of a single instruction:
  //   CSINC Wd, WZR, WZR, invert(cond).
  SDValue InvertedCC =
      DAG.getConstant(getInvertedCondCode(OverflowCC), dl, MVT::i32);
  SDValue OverflowBit = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, Zero, One,
                                    InvertedCC, ValueAndFlags.second);

  SDVTList ResultTys = DAG.getVTList(VT, MVT::i32);
  return DAG.getNode(ISD::MERGE_VALUES, dl, ResultTys, ValueAndFlags.first,
                     OverflowBit);
}
// Lower ISD::PREFETCH to AArch64ISD::PREFETCH.
// The operands read from the incoming node are:
// 0: (forwarded unchanged as first operand of the new node)
// 1: Address to prefetch
// 2: bool isWrite
// 3: int locality (0 = no locality ... 3 = extreme locality)
// 4: bool isDataCache
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  const unsigned WriteFlag =
      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
  const unsigned LocalityArg =
      cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
  const unsigned DataFlag =
      cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();

  // Locality 0 means "streaming" and leaves the cache level field at 0.
  unsigned CacheLevel = 0;
  unsigned StreamBit = 1;
  if (LocalityArg != 0) {
    // The front-end should have filtered out the out-of-range values
    assert(LocalityArg <= 3 && "Prefetch locality out-of-range");
    // The locality degree is the opposite of the cache speed, so the number
    // is mirrored. The encoding starts at 0 for level 1.
    CacheLevel = 3 - LocalityArg;
    StreamBit = 0;
  }

  // Build the mask value encoding the expected behavior.
  const unsigned InstrCacheBit = DataFlag ? 0u : 8u; // !isDataCache, bit 3
  const unsigned PrfOp = (WriteFlag << 4) |  // Load/Store bit
                         InstrCacheBit |     // IsDataCache bit (inverted)
                         (CacheLevel << 1) | // Cache level bits
                         StreamBit;          // Stream bit
  return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
                     DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
}
/// Lower an FP_EXTEND whose result type is f128 to the matching runtime
/// library call.
SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
                                              SelectionDAG &DAG) const {
  assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");

  // Select the extension routine from the (source, destination) type pair.
  const EVT SrcVT = Op.getOperand(0).getValueType();
  const EVT DstVT = Op.getValueType();
  return LowerF128Call(Op, DAG, RTLIB::getFPEXT(SrcVT, DstVT));
}
/// Lower FP_ROUND. Only rounding from f128 needs work (a library call);
/// every other source type is returned unchanged.
SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(0);
  const EVT SrcVT = Src.getValueType();
  // It's legal except when f128 is involved
  if (SrcVT != MVT::f128)
    return Op;

  const EVT DstVT = Op.getValueType();
  RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
  // FP_ROUND carries a second operand indicating whether the rounding is known
  // to be precise. It is not an argument of the library routine, which is why
  // LowerF128Call (which forwards all operands) cannot be used here.
  return makeLibCall(DAG, LC, DstVT, Src, /*isSigned*/ false, SDLoc(Op)).first;
}
/// Lower a vector FP_TO_SINT/FP_TO_UINT.
///  * f16 elements without full fp16 support: extend to f32 first.
///  * Result narrower than the source: convert at source width, then
///    truncate.
///  * Result wider than the source: extend the source, then convert.
///  * Same size: returned unchanged.
SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
                                                    SelectionDAG &DAG) const {
  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
  // Any additional optimization in this function should be recorded
  // in the cost tables.
  SDValue Src = Op.getOperand(0);
  EVT InVT = Src.getValueType();
  EVT VT = Op.getValueType();
  const unsigned NumElts = InVT.getVectorNumElements();
  const unsigned ConvOpc = Op.getOpcode();
  SDLoc dl(Op);

  // f16 conversions are promoted to f32 when full fp16 is not supported.
  const bool PromoteF16 =
      InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16();
  if (PromoteF16) {
    MVT F32VecVT = MVT::getVectorVT(MVT::f32, NumElts);
    SDValue Promoted = DAG.getNode(ISD::FP_EXTEND, dl, F32VecVT, Src);
    return DAG.getNode(ConvOpc, dl, Op.getValueType(), Promoted);
  }

  const unsigned OutBits = VT.getSizeInBits();
  const unsigned InBits = InVT.getSizeInBits();

  if (OutBits < InBits) {
    // Convert to integers of the source element width, then narrow.
    SDValue WideInt =
        DAG.getNode(ConvOpc, dl, InVT.changeVectorElementTypeToInteger(), Src);
    return DAG.getNode(ISD::TRUNCATE, dl, VT, WideInt);
  }

  if (OutBits > InBits) {
    // Widen the floating point source to the result element width first.
    MVT WideFPVT =
        MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
                         VT.getVectorNumElements());
    SDValue WideFP = DAG.getNode(ISD::FP_EXTEND, dl, WideFPVT, Src);
    return DAG.getNode(ConvOpc, dl, VT, WideFP);
  }

  // Source and result have the same total size: nothing to change.
  return Op;
}
/// Lower FP_TO_SINT/FP_TO_UINT. Vectors are delegated to
/// LowerVectorFP_TO_INT, f16 is promoted to f32 when full fp16 is missing,
/// f128 becomes a library call, and everything else is returned unchanged.
SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(0);
  const EVT SrcVT = Src.getValueType();
  const EVT DstVT = Op.getValueType();

  if (SrcVT.isVector())
    return LowerVectorFP_TO_INT(Op, DAG);

  // f16 conversions are promoted to f32 when full fp16 is not supported.
  if (SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) {
    SDLoc dl(Op);
    SDValue AsF32 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Src);
    return DAG.getNode(Op.getOpcode(), dl, DstVT, AsF32);
  }

  // It's legal except when f128 is involved
  if (SrcVT != MVT::f128)
    return Op;

  const bool ToSigned = Op.getOpcode() == ISD::FP_TO_SINT;
  RTLIB::Libcall LC = ToSigned ? RTLIB::getFPTOSINT(SrcVT, DstVT)
                               : RTLIB::getFPTOUINT(SrcVT, DstVT);
  SmallVector<SDValue, 2> CallArgs(Op->op_begin(), Op->op_end());
  return makeLibCall(DAG, LC, DstVT, CallArgs, false, SDLoc(Op)).first;
}
/// Lower a vector SINT_TO_FP/UINT_TO_FP whose source and result vectors differ
/// in size.
///  * Result narrower: convert at source element width, then FP_ROUND.
///  * Result wider: sign/zero extend the integers, then convert.
///  * Same size: returned unchanged.
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
  // Any additional optimization in this function should be recorded
  // in the cost tables.
  const unsigned ConvOpc = Op.getOpcode();
  EVT VT = Op.getValueType();
  SDLoc dl(Op);
  SDValue Src = Op.getOperand(0);
  EVT SrcVT = Src.getValueType();

  const unsigned OutBits = VT.getSizeInBits();
  const unsigned InBits = SrcVT.getSizeInBits();

  if (OutBits < InBits) {
    // Floating point vector with the same element width as the source.
    MVT WideFPVT =
        MVT::getVectorVT(MVT::getFloatingPointVT(SrcVT.getScalarSizeInBits()),
                         SrcVT.getVectorNumElements());
    SDValue WideFP = DAG.getNode(ConvOpc, dl, WideFPVT, Src);
    return DAG.getNode(ISD::FP_ROUND, dl, VT, WideFP,
                       DAG.getIntPtrConstant(0, dl));
  }

  if (OutBits > InBits) {
    // The extension kind follows the signedness of the conversion.
    unsigned ExtOpc = ISD::ZERO_EXTEND;
    if (ConvOpc == ISD::SINT_TO_FP)
      ExtOpc = ISD::SIGN_EXTEND;
    EVT WideIntVT = VT.changeVectorElementTypeToInteger();
    SDValue WideInt = DAG.getNode(ExtOpc, dl, WideIntVT, Src);
    return DAG.getNode(ConvOpc, dl, VT, WideInt);
  }

  return Op;
}
/// Lower SINT_TO_FP/UINT_TO_FP. Vectors are delegated to
/// LowerVectorINT_TO_FP, f16 results go through f32 when full fp16 is missing,
/// i128 sources yield an empty SDValue, f128 results become a library call,
/// and everything else is returned unchanged.
SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
                                              SelectionDAG &DAG) const {
  const EVT DstVT = Op.getValueType();
  if (DstVT.isVector())
    return LowerVectorINT_TO_FP(Op, DAG);

  // f16 conversions are promoted to f32 when full fp16 is not supported.
  if (DstVT == MVT::f16 && !Subtarget->hasFullFP16()) {
    SDLoc dl(Op);
    SDValue AsF32 =
        DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0));
    return DAG.getNode(ISD::FP_ROUND, dl, MVT::f16, AsF32,
                       DAG.getIntPtrConstant(0, dl));
  }

  // i128 conversions are libcalls.
  const EVT SrcVT = Op.getOperand(0).getValueType();
  if (SrcVT == MVT::i128)
    return SDValue();

  // Other conversions are legal, unless it's to the completely software-based
  // fp128.
  if (DstVT != MVT::f128)
    return Op;

  const bool FromSigned = Op.getOpcode() == ISD::SINT_TO_FP;
  RTLIB::Libcall LC = FromSigned ? RTLIB::getSINTTOFP(SrcVT, DstVT)
                                 : RTLIB::getUINTTOFP(SrcVT, DstVT);
  return LowerF128Call(Op, DAG, LC);
}
/// Lower FSINCOS to a call of the SINCOS_STRET library routine, whose return
/// type is a struct of two values of the argument type.
SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
                                            SelectionDAG &DAG) const {
  // For iOS, we want to call an alternative entry point: __sincos_stret,
  // which returns the values in two S / D registers.
  SDLoc dl(Op);
  SDValue Operand = Op.getOperand(0);
  const EVT OperandVT = Operand.getValueType();
  Type *OperandTy = OperandVT.getTypeForEVT(*DAG.getContext());

  // The single argument is passed without sign or zero extension.
  ArgListEntry Param;
  Param.Node = Operand;
  Param.Ty = OperandTy;
  Param.IsSExt = false;
  Param.IsZExt = false;
  ArgListTy CallArgs;
  CallArgs.push_back(Param);

  // f64 selects the double variant; anything else selects the float variant.
  const bool IsF64 = OperandVT == MVT::f64;
  const char *Symbol = getLibcallName(IsF64 ? RTLIB::SINCOS_STRET_F64
                                            : RTLIB::SINCOS_STRET_F32);
  SDValue Callee =
      DAG.getExternalSymbol(Symbol, getPointerTy(DAG.getDataLayout()));

  StructType *ResultTy = StructType::get(OperandTy, OperandTy);
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(DAG.getEntryNode())
      .setLibCallee(CallingConv::Fast, ResultTy, Callee, std::move(CallArgs));
  return LowerCallTo(CLI).first;
}
/// Lower (f16 bitcast i16): any-extend the integer to i32, bitcast that to
/// f32 and extract the hsub sub-register as f16.
/// \returns an empty SDValue for any result type other than f16.
static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
  if (Op.getValueType() != MVT::f16)
    return SDValue();
  assert(Op.getOperand(0).getValueType() == MVT::i16);

  SDLoc DL(Op);
  SDValue Widened =
      DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
  SDValue AsF32 = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Widened);
  auto *Extract =
      DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, AsF32,
                         DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32));
  return SDValue(Extract, 0);
}
/// Map a vector type narrower than 64 bits to the 64-bit vector type with the
/// same number of elements (v2i8/v2i16 -> v2i32, v4i8 -> v4i16). Types that
/// are already 64 bits or wider are returned unchanged.
static EVT getExtensionTo64Bits(const EVT &OrigVT) {
  const bool AlreadyWideEnough = OrigVT.getSizeInBits() >= 64;
  if (AlreadyWideEnough)
    return OrigVT;

  assert(OrigVT.isSimple() && "Expecting a simple value type");
  const MVT::SimpleValueType Ty = OrigVT.getSimpleVT().SimpleTy;
  if (Ty == MVT::v4i8)
    return MVT::v4i16;
  if (Ty == MVT::v2i8 || Ty == MVT::v2i16)
    return MVT::v2i32;
  llvm_unreachable("Unexpected Vector Type");
}
/// \p N had type \p OrigTy before it was extended (with \p ExtOpcode) to the
/// 128-bit type \p ExtTy. If \p OrigTy is narrower than 64 bits, re-extend
/// \p N to 64 bits so it can serve as a VMULL operand; otherwise return \p N
/// unchanged.
static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
                                                 const EVT &OrigTy,
                                                 const EVT &ExtTy,
                                                 unsigned ExtOpcode) {
  assert(ExtTy.is128BitVector() && "Unexpected extension size");

  // Operands of at least 64 bits can be used as they are.
  const bool NeedsWidening = OrigTy.getSizeInBits() < 64;
  if (!NeedsWidening)
    return N;

  // Must extend size to at least 64 bits to be used as an operand for VMULL.
  return DAG.getNode(ExtOpcode, SDLoc(N), getExtensionTo64Bits(OrigTy), N);
}
/// Tells whether \p N is a BUILD_VECTOR made only of constants that each fit
/// into half of the element width, interpreted as signed (\p isSigned) or
/// unsigned values.
static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
                                   bool isSigned) {
  if (N->getOpcode() != ISD::BUILD_VECTOR)
    return false;

  // Width every element has to fit into.
  const unsigned HalfBits = N->getValueType(0).getScalarSizeInBits() / 2;

  for (const SDValue &Elt : N->op_values()) {
    // Any non-constant element disqualifies the vector.
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt);
    if (!C)
      return false;
    const bool Fits = isSigned ? isIntN(HalfBits, C->getSExtValue())
                               : isUIntN(HalfBits, C->getZExtValue());
    if (!Fits)
      return false;
  }
  return true;
}
/// Produce the narrow operand for a vector MULL from \p N.
///  * For SIGN_EXTEND/ZERO_EXTEND the extension is peeled off (re-extending to
///    64 bits if the source is narrower than that).
///  * Otherwise \p N must be a BUILD_VECTOR of constants, which is rebuilt
///    with elements of half the width.
static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
  const unsigned Opc = N->getOpcode();
  if (Opc == ISD::SIGN_EXTEND || Opc == ISD::ZERO_EXTEND) {
    SDValue Inner = N->getOperand(0);
    return addRequiredExtensionForVectorMULL(
        Inner, DAG, Inner->getValueType(0), N->getValueType(0), Opc);
  }

  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
  EVT VT = N->getValueType(0);
  SDLoc dl(N);
  const unsigned HalfEltBits = VT.getScalarSizeInBits() / 2;
  const unsigned NumElts = VT.getVectorNumElements();
  MVT NarrowEltVT = MVT::getIntegerVT(HalfEltBits);

  SmallVector<SDValue, 8> NarrowElts;
  for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
    const APInt &Val = cast<ConstantSDNode>(N->getOperand(Idx))->getAPIntValue();
    // Element types smaller than 32 bits are not legal, so use i32 elements.
    // The values are implicitly truncated so sext vs. zext doesn't matter.
    NarrowElts.push_back(DAG.getConstant(Val.zextOrTrunc(32), dl, MVT::i32));
  }
  return DAG.getBuildVector(MVT::getVectorVT(NarrowEltVT, NumElts), dl,
                            NarrowElts);
}
/// True if \p N is a SIGN_EXTEND, or a constant BUILD_VECTOR whose elements
/// fit into half the element width as signed values.
static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
  if (N->getOpcode() == ISD::SIGN_EXTEND)
    return true;
  return isExtendedBUILD_VECTOR(N, DAG, /*isSigned=*/true);
}
/// True if \p N is a ZERO_EXTEND, or a constant BUILD_VECTOR whose elements
/// fit into half the element width as unsigned values.
static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
  if (N->getOpcode() == ISD::ZERO_EXTEND)
    return true;
  return isExtendedBUILD_VECTOR(N, DAG, /*isSigned=*/false);
}
/// True if \p N is an ADD or SUB whose two operands are both single-use and
/// sign-extended (see isSignExtended()).
static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
  const unsigned Opc = N->getOpcode();
  if (Opc != ISD::ADD && Opc != ISD::SUB)
    return false;

  SDNode *First = N->getOperand(0).getNode();
  SDNode *Second = N->getOperand(1).getNode();
  if (!First->hasOneUse() || !Second->hasOneUse())
    return false;
  return isSignExtended(First, DAG) && isSignExtended(Second, DAG);
}
/// True if \p N is an ADD or SUB whose two operands are both single-use and
/// zero-extended (see isZeroExtended()).
static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
  const unsigned Opc = N->getOpcode();
  if (Opc != ISD::ADD && Opc != ISD::SUB)
    return false;

  SDNode *First = N->getOperand(0).getNode();
  SDNode *Second = N->getOperand(1).getNode();
  if (!First->hasOneUse() || !Second->hasOneUse())
    return false;
  return isZeroExtended(First, DAG) && isZeroExtended(Second, DAG);
}
/// Lower FLT_ROUNDS_ by reading the FPCR and remapping its rounding mode
/// field.
SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                                                SelectionDAG &DAG) const {
  // The rounding mode is taken from bits 23:22 of the FPCR.
  // The rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
  // It is computed as (((FPCR + (1 << 22)) >> 22) & 3), so that the
  // shift + and get folded into a bitfield extract.
  SDLoc dl(Op);

  // Read the 64-bit control register and keep its low 32 bits.
  SDValue IntrinsicId =
      DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64);
  SDValue Fpcr64 =
      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i64, IntrinsicId);
  SDValue Fpcr32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Fpcr64);

  // Add one to the field in place (bit 22 is its least significant bit).
  SDValue FieldIncrement = DAG.getConstant(1U << 22, dl, MVT::i32);
  SDValue Incremented =
      DAG.getNode(ISD::ADD, dl, MVT::i32, Fpcr32, FieldIncrement);

  // Move the field down to bit 0 and discard everything above it.
  SDValue FieldShift = DAG.getConstant(22, dl, MVT::i32);
  SDValue Shifted =
      DAG.getNode(ISD::SRL, dl, MVT::i32, Incremented, FieldShift);
  SDValue FieldMask = DAG.getConstant(3, dl, MVT::i32);
  return DAG.getNode(ISD::AND, dl, MVT::i32, Shifted, FieldMask);
}
/// Custom lowering of ISD::MUL on 128-bit integer vectors.
///
/// If both operands are sign-extended (or both zero-extended) the multiply is
/// turned into an SMULL (or UMULL) of the narrow operands. A multiply of an
/// extended add/sub by an extended value is split into two S/UMULLs combined
/// by that add/sub. When no such form is found, v2i64 multiplies yield an
/// empty SDValue and all other types are returned unchanged.
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
// Multiplications are only custom-lowered for 128-bit vectors so that
// VMULL can be detected. Otherwise v2i64 multiplications are not legal.
EVT VT = Op.getValueType();
assert(VT.is128BitVector() && VT.isInteger() &&
"unexpected type for custom-lowering ISD::MUL");
SDNode *N0 = Op.getOperand(0).getNode();
SDNode *N1 = Op.getOperand(1).getNode();
// NewOpc stays 0 until a widening-multiply form has been recognised.
unsigned NewOpc = 0;
// isMLA: N0 is an add/sub of extended values that gets distributed over N1.
bool isMLA = false;
bool isN0SExt = isSignExtended(N0, DAG);
bool isN1SExt = isSignExtended(N1, DAG);
if (isN0SExt && isN1SExt)
NewOpc = AArch64ISD::SMULL;
else {
bool isN0ZExt = isZeroExtended(N0, DAG);
bool isN1ZExt = isZeroExtended(N1, DAG);
if (isN0ZExt && isN1ZExt)
NewOpc = AArch64ISD::UMULL;
else if (isN1SExt || isN1ZExt) {
// Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
// into (s/zext A * s/zext C) + (s/zext B * s/zext C)
if (isN1SExt && isAddSubSExt(N0, DAG)) {
NewOpc = AArch64ISD::SMULL;
isMLA = true;
} else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
NewOpc = AArch64ISD::UMULL;
isMLA = true;
} else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
// NOTE(review): this branch looks unreachable. It needs N1 to be an
// ADD/SUB, while the enclosing condition needs N1 to be an extend or a
// BUILD_VECTOR. Confirm whether the commuted form was meant to be
// handled.
std::swap(N0, N1);
NewOpc = AArch64ISD::UMULL;
isMLA = true;
}
}
if (!NewOpc) {
if (VT == MVT::v2i64)
// Fall through to expand this. It is not legal.
return SDValue();
else
// Other vector multiplications are legal.
return Op;
}
}
// Legalize to a S/UMULL instruction
SDLoc DL(Op);
SDValue Op0;
SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
if (!isMLA) {
Op0 = skipExtensionForVectorMULL(N0, DAG);
assert(Op0.getValueType().is64BitVector() &&
Op1.getValueType().is64BitVector() &&
"unexpected types for extended operands to VMULL");
return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
}
// Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
// isel lowering to take advantage of no-stall back to back s/umul + s/umla.
// This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
EVT Op1VT = Op1.getValueType();
// The outer node reuses N0's opcode (ADD or SUB); both narrow addends are
// bitcast to Op1's type before being multiplied.
return DAG.getNode(N0->getOpcode(), DL, VT,
DAG.getNode(NewOpc, DL, VT,
DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
DAG.getNode(NewOpc, DL, VT,
DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
}
/// Custom lowering for a handful of chain-less intrinsics; operand 0 of \p Op
/// is the intrinsic ID. An empty SDValue is returned for every intrinsic that
/// is not handled here.
SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                       SelectionDAG &DAG) const {
  const unsigned IntNo =
      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  SDLoc dl(Op);

  // The NEON min/max intrinsics map one-to-one onto generic ISD nodes taking
  // operands 1 and 2.
  auto LowerToBinaryNode = [&](unsigned GenericOpc) {
    return DAG.getNode(GenericOpc, dl, Op.getValueType(), Op.getOperand(1),
                       Op.getOperand(2));
  };

  switch (IntNo) {
  case Intrinsic::thread_pointer:
    return DAG.getNode(AArch64ISD::THREAD_POINTER, dl,
                       getPointerTy(DAG.getDataLayout()));

  case Intrinsic::aarch64_neon_abs: {
    EVT Ty = Op.getValueType();
    // Scalar i64: route through v1i64 so the generic ABS node can be used.
    if (Ty == MVT::i64) {
      SDValue AsVec =
          DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Op.getOperand(1));
      SDValue AbsVec = DAG.getNode(ISD::ABS, dl, MVT::v1i64, AsVec);
      return DAG.getNode(ISD::BITCAST, dl, MVT::i64, AbsVec);
    }
    // Legal integer vectors map directly onto ABS.
    if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty))
      return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
    report_fatal_error("Unexpected type for AArch64 NEON intrinic");
  }

  case Intrinsic::aarch64_neon_smax:
    return LowerToBinaryNode(ISD::SMAX);
  case Intrinsic::aarch64_neon_umax:
    return LowerToBinaryNode(ISD::UMAX);
  case Intrinsic::aarch64_neon_smin:
    return LowerToBinaryNode(ISD::SMIN);
  case Intrinsic::aarch64_neon_umin:
    return LowerToBinaryNode(ISD::UMIN);

  case Intrinsic::localaddress: {
    // Copy out of the register that the register info designates as the
    // local address register of this function.
    const auto &MF = DAG.getMachineFunction();
    const auto *RegInfo = Subtarget->getRegisterInfo();
    unsigned LocalAddrReg = RegInfo->getLocalAddressRegister(MF);
    return DAG.getCopyFromReg(DAG.getEntryNode(), dl, LocalAddrReg,
                              Op.getSimpleValueType());
  }

  case Intrinsic::eh_recoverfp: {
    // FIXME: This needs to be implemented to correctly handle highly aligned
    // stack objects. For now we simply return the incoming FP. Refer D53541
    // for more details.
    SDValue FnOp = Op.getOperand(1);
    SDValue IncomingFPOp = Op.getOperand(2);
    // The first argument has to be the address of a function.
    GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
    auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
    if (!Fn)
      report_fatal_error(
          "llvm.eh.recoverfp must take a function as the first argument");
    return IncomingFPOp;
  }

  default:
    return SDValue(); // Don't custom lower most intrinsics.
  }
}
// Custom lowering of a truncating store of a v4i16 value to v4i8 memory
// (the v4i8 value has been promoted to v4i16).
//
// The value is widened to v8i16 with an undef upper half, truncated to v8i8,
// and the low 32-bit lane -- which holds the four result bytes -- is stored
// as a plain i32. The intended code is:
//
//   xtn  v0.8b, v0.8h
//   str  s0, [x0]
static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
                                        EVT VT, EVT MemVT,
                                        SelectionDAG &DAG) {
  assert(VT.isVector() && "VT should be a vector type");
  assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);

  // Build the undef v4i16 that pads the stored value out to v8i16.
  SDValue UndefElt = DAG.getUNDEF(MVT::i16);
  SDValue UndefHalf = DAG.getBuildVector(
      MVT::v4i16, DL, {UndefElt, UndefElt, UndefElt, UndefElt});
  SDValue Widened = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
                                ST->getValue(), UndefHalf);

  // Narrow every lane to 8 bits, then view the result as two 32-bit words.
  SDValue Narrowed = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Widened);
  SDValue AsWords = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Narrowed);

  // Word 0 carries the v4i8 payload; store it through the original address
  // and memory operand.
  SDValue LaneZero = DAG.getConstant(0, DL, MVT::i64);
  SDValue LowWord =
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, AsWords, LaneZero);
  return DAG.getStore(ST->getChain(), DL, LowWord, ST->getBasePtr(),
                      ST->getMemOperand());
}
// Custom lowering entry point for stores. Only vector stores are expected
// here. Two situations are handled:
//   * an under-aligned store that the target cannot perform misaligned is
//     scalarized;
//   * a truncating store is forwarded to LowerTruncateVectorStore (currently
//     only v4i16 -> v4i8).
// Anything else returns an empty SDValue.
SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
                                          SelectionDAG &DAG) const {
  SDLoc Loc(Op);
  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  assert (StoreNode && "Can only custom lower store nodes");

  EVT VT = StoreNode->getValue().getValueType();
  EVT MemVT = StoreNode->getMemoryVT();
  assert (VT.isVector() && "Can only custom lower vector store types");

  unsigned AddrSpace = StoreNode->getAddressSpace();
  unsigned Alignment = StoreNode->getAlignment();

  // The misalignment query is only made when the alignment is below the
  // store size of the memory type.
  const bool UnderAligned = Alignment < MemVT.getStoreSize();
  if (UnderAligned &&
      !allowsMisalignedMemoryAccesses(MemVT, AddrSpace, Alignment,
                                      StoreNode->getMemOperand()->getFlags(),
                                      nullptr))
    return scalarizeVectorStore(StoreNode, DAG);

  if (!StoreNode->isTruncatingStore())
    return SDValue();

  return LowerTruncateVectorStore(Loc, StoreNode, VT, MemVT, DAG);
}
/// Dispatch a node that requested custom lowering to the matching Lower*
/// helper, keyed on the node's opcode. Reaching an opcode without a handler
/// is treated as unreachable.
SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
                                              SelectionDAG &DAG) const {
  LLVM_DEBUG(dbgs() << "Custom lowering: ");
  LLVM_DEBUG(Op.dump());

  const unsigned Opcode = Op.getOpcode();
  switch (Opcode) {
  // Casts, addresses and control flow.
  case ISD::BITCAST:            return LowerBITCAST(Op, DAG);
  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
  case ISD::SETCC:              return LowerSETCC(Op, DAG);
  case ISD::BR_CC:              return LowerBR_CC(Op, DAG);
  case ISD::SELECT:             return LowerSELECT(Op, DAG);
  case ISD::SELECT_CC:          return LowerSELECT_CC(Op, DAG);
  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
  case ISD::BR_JT:              return LowerBR_JT(Op, DAG);
  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);

  // Variadic argument handling.
  case ISD::VASTART:            return LowerVASTART(Op, DAG);
  case ISD::VACOPY:             return LowerVACOPY(Op, DAG);
  case ISD::VAARG:              return LowerVAARG(Op, DAG);

  // Carry and overflow arithmetic.
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
  case ISD::SADDO:
  case ISD::UADDO:
  case ISD::SSUBO:
  case ISD::USUBO:
  case ISD::SMULO:
  case ISD::UMULO:              return LowerXALUO(Op, DAG);

  // Floating-point arithmetic is routed through LowerF128Call with the
  // corresponding f128 runtime library call.
  case ISD::FADD:               return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
  case ISD::FSUB:               return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
  case ISD::FMUL:               return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
  case ISD::FDIV:               return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
  case ISD::FP_ROUND:           return LowerFP_ROUND(Op, DAG);
  case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);

  // Frame and return address queries.
  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
  case ISD::SPONENTRY:          return LowerSPONENTRY(Op, DAG);
  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
  case ISD::ADDROFRETURNADDR:   return LowerADDROFRETURNADDR(Op, DAG);

  // Vector construction and element access.
  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op, DAG);

  // Shifts.
  case ISD::SRA:
  case ISD::SRL:
  case ISD::SHL:                return LowerVectorSRA_SRL_SHL(Op, DAG);
  case ISD::SHL_PARTS:          return LowerShiftLeftParts(Op, DAG);
  case ISD::SRL_PARTS:
  case ISD::SRA_PARTS:          return LowerShiftRightParts(Op, DAG);

  // Miscellaneous integer, logical and conversion operations.
  case ISD::CTPOP:              return LowerCTPOP(Op, DAG);
  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
  case ISD::OR:                 return LowerVectorOR(Op, DAG);
  case ISD::XOR:                return LowerXOR(Op, DAG);
  case ISD::PREFETCH:           return LowerPREFETCH(Op, DAG);
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:         return LowerINT_TO_FP(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:         return LowerFP_TO_INT(Op, DAG);
  case ISD::FSINCOS:            return LowerFSINCOS(Op, DAG);
  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
  case ISD::MUL:                return LowerMUL(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::STORE:              return LowerSTORE(Op, DAG);

  // Vector reductions.
  case ISD::VECREDUCE_ADD:
  case ISD::VECREDUCE_SMAX:
  case ISD::VECREDUCE_SMIN:
  case ISD::VECREDUCE_UMAX:
  case ISD::VECREDUCE_UMIN:
  case ISD::VECREDUCE_FMAX:
  case ISD::VECREDUCE_FMIN:     return LowerVECREDUCE(Op, DAG);

  // Atomics and dynamic stack allocation.
  case ISD::ATOMIC_LOAD_SUB:    return LowerATOMIC_LOAD_SUB(Op, DAG);
  case ISD::ATOMIC_LOAD_AND:    return LowerATOMIC_LOAD_AND(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);

  default:
    llvm_unreachable("unimplemented operand");
    return SDValue();
  }
}
//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//
/// Pick the argument-assignment function for calling convention \p CC.
/// \p IsVarArg selects the variadic flavour where the convention has one.
/// An unrecognised convention is a fatal error.
CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                     bool IsVarArg) const {
  switch (CC) {
  case CallingConv::WebKit_JS:
    return CC_AArch64_WebKit_JS;

  case CallingConv::GHC:
    return CC_AArch64_GHC;

  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::PreserveMost:
  case CallingConv::CXX_FAST_TLS:
  case CallingConv::Swift: {
    // Variadic calls on Windows take priority over the Darwin/AAPCS split.
    if (IsVarArg && Subtarget->isTargetWindows())
      return CC_AArch64_Win64_VarArg;
    if (Subtarget->isTargetDarwin()) {
      if (IsVarArg)
        return CC_AArch64_DarwinPCS_VarArg;
      return CC_AArch64_DarwinPCS;
    }
    return CC_AArch64_AAPCS;
  }

  case CallingConv::Win64:
    if (IsVarArg)
      return CC_AArch64_Win64_VarArg;
    return CC_AArch64_AAPCS;

  case CallingConv::AArch64_VectorCall:
    return CC_AArch64_AAPCS;

  default:
    report_fatal_error("Unsupported calling convention.");
  }
}
/// Pick the return-value assignment function for calling convention \p CC:
/// WebKit_JS has its own, everything else uses the AAPCS one.
CCAssignFn *
AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
  if (CC == CallingConv::WebKit_JS)
    return RetCC_AArch64_WebKit_JS;
  return RetCC_AArch64_AAPCS;
}
/// Lower the incoming (formal) arguments of the current function.
///
/// Every entry of \p Ins is assigned a location with the calling convention's
/// CCAssignFn, and one SDValue per argument is appended to \p InVals: a
/// CopyFromReg for register arguments, a load from a fixed stack object for
/// memory arguments, or a frame index for byval arguments. The function also
/// records varargs bookkeeping, the Win64 "inreg" return pointer and the size
/// of the incoming stack argument area in AArch64FunctionInfo.
///
/// \returns the chain, updated when a Win64 inreg copy was emitted or when
/// variadic registers were saved.
SDValue AArch64TargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
// Note: this is computed from the function's own calling convention, not from
// the CallConv parameter.
bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
// At this point, Ins[].VT may already be promoted to i32. To correctly
// handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
// i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
// Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
// we use a special version of AnalyzeFormalArguments to pass in ValVT and
// LocVT.
unsigned NumArgs = Ins.size();
Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
unsigned CurArgIdx = 0;
for (unsigned i = 0; i != NumArgs; ++i) {
MVT ValVT = Ins[i].VT;
if (Ins[i].isOrigArg()) {
// Step the IR argument iterator forward to the argument this Ins entry
// was derived from.
std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
CurArgIdx = Ins[i].getOrigArgIndex();
// Get type of the original argument.
EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
/*AllowUnknown*/ true);
MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
// If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
ValVT = MVT::i8;
else if (ActualMVT == MVT::i16)
ValVT = MVT::i16;
}
// The non-variadic assignment function is requested here regardless of
// isVarArg; ValVT is passed as both the value type and the location type.
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
bool Res =
AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
assert(!Res && "Call operand has unhandled type");
(void)Res;
}
assert(ArgLocs.size() == Ins.size());
// NOTE(review): ArgValues is declared but never used in this function.
SmallVector<SDValue, 16> ArgValues;
// Materialize one value per assigned location, in argument order.
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
if (Ins[i].Flags.isByVal()) {
// Byval is used for HFAs in the PCS, but the system should work in a
// non-compliant manner for larger structs.
EVT PtrVT = getPointerTy(DAG.getDataLayout());
int Size = Ins[i].Flags.getByValSize();
// Round the byval size up to a whole number of 8-byte units.
unsigned NumRegs = (Size + 7) / 8;
// FIXME: This works on big-endian for composite byvals, which are the common
// case. It should also work for fundamental types too.
unsigned FrameIdx =
MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
// The argument value is the address of the fixed object, not a load.
SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
InVals.push_back(FrameIdxN);
continue;
}
if (VA.isRegLoc()) {
// Arguments stored in registers.
EVT RegVT = VA.getLocVT();
SDValue ArgValue;
// Choose the register class that matches the location type.
const TargetRegisterClass *RC;
if (RegVT == MVT::i32)
RC = &AArch64::GPR32RegClass;
else if (RegVT == MVT::i64)
RC = &AArch64::GPR64RegClass;
else if (RegVT == MVT::f16)
RC = &AArch64::FPR16RegClass;
else if (RegVT == MVT::f32)
RC = &AArch64::FPR32RegClass;
else if (RegVT == MVT::f64 || RegVT.is64BitVector())
RC = &AArch64::FPR64RegClass;
else if (RegVT == MVT::f128 || RegVT.is128BitVector())
RC = &AArch64::FPR128RegClass;
else
llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
// Transform the arguments in physical registers into virtual ones.
unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
// If this is an 8, 16 or 32-bit value, it is really passed promoted
// to 64 bits. Insert an assert[sz]ext to capture this, then
// truncate to the right size.
switch (VA.getLocInfo()) {
default:
llvm_unreachable("Unknown loc info!");
case CCValAssign::Full:
break;
case CCValAssign::BCvt:
ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
break;
case CCValAssign::AExt:
case CCValAssign::SExt:
case CCValAssign::ZExt:
// SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt
// nodes after our lowering.
assert(RegVT == Ins[i].VT && "incorrect register location selected");
break;
}
InVals.push_back(ArgValue);
} else { // !VA.isRegLoc(): the argument lives on the stack.
assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
unsigned ArgOffset = VA.getLocMemOffset();
unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
// On big-endian targets, an argument smaller than 8 bytes that is not part
// of a consecutive-register group is addressed at an offset of
// (8 - ArgSize) from its assigned location.
uint32_t BEAlign = 0;
if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
!Ins[i].Flags.isInConsecutiveRegs())
BEAlign = 8 - ArgSize;
int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
// Create load nodes to retrieve arguments from the stack.
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
SDValue ArgValue;
// For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
MVT MemVT = VA.getValVT();
// Map the location info onto the load extension kind; a bitcast location
// loads the location type directly instead.
switch (VA.getLocInfo()) {
default:
break;
case CCValAssign::BCvt:
MemVT = VA.getLocVT();
break;
case CCValAssign::SExt:
ExtType = ISD::SEXTLOAD;
break;
case CCValAssign::ZExt:
ExtType = ISD::ZEXTLOAD;
break;
case CCValAssign::AExt:
ExtType = ISD::EXTLOAD;
break;
}
ArgValue = DAG.getExtLoad(
ExtType, DL, VA.getLocVT(), Chain, FIN,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
MemVT);
InVals.push_back(ArgValue);
}
}
// varargs
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
if (isVarArg) {
if (!Subtarget->isTargetDarwin() || IsWin64) {
// The AAPCS variadic function ABI is identical to the non-variadic
// one. As a result there may be more arguments in registers and we should
// save them for future reference.
// Win64 variadic functions also pass arguments in registers, but all float
// arguments are passed in integer registers.
saveVarArgRegisters(CCInfo, DAG, DL, Chain);
}
// This will point to the next argument passed via stack.
unsigned StackOffset = CCInfo.getNextStackOffset();
// We currently pass all varargs at 8-byte alignment.
StackOffset = ((StackOffset + 7) & ~7);
// Record a fixed object at the first variadic stack slot.
FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
if (MFI.hasMustTailInVarArgFunc()) {
// Register types considered when computing the forwarded registers below.
SmallVector<MVT, 2> RegParmTypes;
RegParmTypes.push_back(MVT::i64);
RegParmTypes.push_back(MVT::f128);
// Compute the set of forwarded registers. The rest are scratch.
SmallVectorImpl<ForwardedRegister> &Forwards =
FuncInfo->getForwardedMustTailRegParms();
CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
CC_AArch64_AAPCS);
// Conservatively forward X8, since it might be used for aggregate return.
if (!CCInfo.isAllocated(AArch64::X8)) {
unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
}
}
}
// On Windows, InReg pointers must be returned, so record the pointer in a
// virtual register at the start of the function so it can be returned in the
// epilogue.
if (IsWin64) {
for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
if (Ins[I].Flags.isInReg()) {
assert(!FuncInfo->getSRetReturnReg());
MVT PtrTy = getPointerTy(DAG.getDataLayout());
unsigned Reg =
MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
FuncInfo->setSRetReturnReg(Reg);
SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
// Only the first inreg argument is recorded.
break;
}
}
}
unsigned StackArgSize = CCInfo.getNextStackOffset();
bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
// This is a non-standard ABI so by fiat I say we're allowed to make full
// use of the stack area to be popped, which must be aligned to 16 bytes in
// any case:
StackArgSize = alignTo(StackArgSize, 16);
// If we're expected to restore the stack (e.g. fastcc) then we'll be adding
// a multiple of 16.
FuncInfo->setArgumentStackToRestore(StackArgSize);
// This realignment carries over to the available bytes below. Our own
// callers will guarantee the space is free by giving an aligned value to
// CALLSEQ_START.
}
// Even if we're not expected to free up the space, it's useful to know how
// much is there while considering tail calls (because we can reuse it).
FuncInfo->setBytesInStackArgArea(StackArgSize);
if (Subtarget->hasCustomCallingConv())
Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
return Chain;
}
/// Store the argument registers that were not allocated to named arguments
/// into stack save areas (presumably for later use by va_start/va_arg
/// lowering -- confirm against the consumers of AArch64FunctionInfo).
///
/// The unallocated registers among X0-X7 are always considered; the
/// unallocated registers among Q0-Q7 are saved only when the subtarget has
/// FP support and the function does not use the Win64 convention. The frame
/// index and byte size of each save area are recorded in
/// AArch64FunctionInfo. If any store was emitted, \p Chain is replaced by a
/// TokenFactor of all the stores.
void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
SelectionDAG &DAG,
const SDLoc &DL,
SDValue &Chain) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
auto PtrVT = getPointerTy(DAG.getDataLayout());
bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
// Collects every register store so they can be joined by one TokenFactor.
SmallVector<SDValue, 8> MemOps;
static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
AArch64::X3, AArch64::X4, AArch64::X5,
AArch64::X6, AArch64::X7 };
static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
// Index of the first GPR argument register not taken by a named argument.
unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
// 8 bytes per remaining X register.
unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
int GPRIdx = 0;
if (GPRSaveSize != 0) {
if (IsWin64) {
// Win64: the save area is a fixed object at a negative offset of
// GPRSaveSize.
GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
if (GPRSaveSize & 15)
// The extra size here, if triggered, will always be 8.
MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
} else
GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false);
SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
// Store each remaining X register at consecutive 8-byte slots of the area.
for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
SDValue Store = DAG.getStore(
Val.getValue(1), DL, Val, FIN,
IsWin64
? MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
GPRIdx,
(i - FirstVariadicGPR) * 8)
: MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8));
MemOps.push_back(Store);
// Advance the store address to the next 8-byte slot.
FIN =
DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
}
}
// Recorded even when nothing was saved (index 0, size 0).
FuncInfo->setVarArgsGPRIndex(GPRIdx);
FuncInfo->setVarArgsGPRSize(GPRSaveSize);
// The FPR save area is skipped entirely for Win64 and when the subtarget has
// no FP support; in that case the FPR index/size are not written here.
if (Subtarget->hasFPARMv8() && !IsWin64) {
static const MCPhysReg FPRArgRegs[] = {
AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
// 16 bytes per remaining Q register.
unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
int FPRIdx = 0;
if (FPRSaveSize != 0) {
FPRIdx = MFI.CreateStackObject(FPRSaveSize, 16, false);
SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
// Store each remaining Q register at consecutive 16-byte slots.
for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
SDValue Store = DAG.getStore(
Val.getValue(1), DL, Val, FIN,
MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16));
MemOps.push_back(Store);
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
DAG.getConstant(16, DL, PtrVT));
}
}
FuncInfo->setVarArgsFPRIndex(FPRIdx);
FuncInfo->setVarArgsFPRSize(FPRSaveSize);
}
// Tie all the register stores together on the outgoing chain.
if (!MemOps.empty()) {
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
}
/// LowerCallResult - Turn the values returned by a call into copies out of
/// the physical registers the return convention assigned them to.
///
/// One value per return location is appended to \p InVals. When
/// \p isThisReturn is set, the first result is taken directly from
/// \p ThisVal instead of being copied out of a register. Returns the chain
/// threaded through the emitted copies.
SDValue AArch64TargetLowering::LowerCallResult(
    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
    SDValue ThisVal) const {
  // WebKit_JS has a dedicated return convention; all others use AAPCS.
  CCAssignFn *RetCC = RetCC_AArch64_AAPCS;
  if (CallConv == CallingConv::WebKit_JS)
    RetCC = RetCC_AArch64_WebKit_JS;

  // Compute where each returned value lives.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC);

  for (unsigned Idx = 0, NumLocs = RVLocs.size(); Idx != NumLocs; ++Idx) {
    const CCValAssign &VA = RVLocs[Idx];

    // Forward the 'this' argument straight to the result, to avoid reg unit
    // interference.
    const bool ForwardThis = isThisReturn && Idx == 0;
    if (ForwardThis) {
      assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
             "unexpected return calling convention register assignment");
      InVals.push_back(ThisVal);
      continue;
    }

    // Copy the result out of its physical register, threading both the chain
    // and the glue through the copy.
    SDValue Copied =
        DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
    Chain = Copied.getValue(1);
    InFlag = Copied.getValue(2);

    // Only full and bitcast locations are expected for return values.
    const CCValAssign::LocInfo Info = VA.getLocInfo();
    if (Info == CCValAssign::BCvt)
      Copied = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Copied);
    else if (Info != CCValAssign::Full)
      llvm_unreachable("Unknown loc info!");

    InVals.push_back(Copied);
  }

  return Chain;
}
/// Whether tail call optimization can be guaranteed for calling convention
/// \p CC. Only the fast calling convention qualifies.
static bool canGuaranteeTCO(CallingConv::ID CC) {
  const bool IsFastCC = (CC == CallingConv::Fast);
  return IsFastCC;
}
/// Whether a call using calling convention \p CC could ever be emitted as a
/// tail call: C, PreserveMost and Swift qualify, as does any convention for
/// which TCO can be guaranteed.
static bool mayTailCallThisCC(CallingConv::ID CC) {
  const bool IsListedCC = CC == CallingConv::C ||
                          CC == CallingConv::PreserveMost ||
                          CC == CallingConv::Swift;
  return IsListedCC || canGuaranteeTCO(CC);
}
/// Decide whether the call described by the arguments may be emitted as a
/// tail call from the current function.
///
/// The checks run in order and the first failing one rejects the call:
/// calling convention, byval/inreg caller arguments, guaranteed-TCO mode,
/// external weak callees, variadic memory operands, result compatibility,
/// preserved-register compatibility, stack argument space, and arguments
/// held in callee-saved registers.
bool AArch64TargetLowering::isEligibleForTailCallOptimization(
    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
  if (!mayTailCallThisCC(CalleeCC))
    return false;

  MachineFunction &MF = DAG.getMachineFunction();
  const Function &Caller = MF.getFunction();
  const CallingConv::ID CallerCC = Caller.getCallingConv();
  const bool SameCC = (CallerCC == CalleeCC);

  // Two kinds of caller argument rule out a tail call:
  //  * byval: the function was handed a pointer directly into the stack area
  //    a tail call would reuse. Working around this *is* possible (see X86)
  //    but less efficient and uglier in LowerCall.
  //  * inreg: on Windows this signifies a non-aggregate indirect return, for
  //    which X0 must be saved/restored in the callee; tail call opt
  //    interferes with that.
  // FIXME: Check whether the callee also has an "inreg" argument.
  for (const auto &Arg : Caller.args())
    if (Arg.hasByValAttr() || Arg.hasInRegAttr())
      return false;

  // With guaranteed TCO the answer depends only on the conventions.
  if (getTargetMachine().Options.GuaranteedTailCallOpt)
    return canGuaranteeTCO(CalleeCC) && SameCC;

  // Externally-defined functions with weak linkage should not be
  // tail-called on AArch64 when the OS does not support dynamic
  // pre-emption of symbols, as the AAELF spec requires normal calls
  // to undefined weak functions to be replaced with a NOP or jump to the
  // next instruction. The behaviour of branch instructions in this
  // situation (as used for tail calls) is implementation-defined, so we
  // cannot rely on the linker replacing the tail call with a return.
  if (auto *GA = dyn_cast<GlobalAddressSDNode>(Callee)) {
    const Triple &TT = getTargetMachine().getTargetTriple();
    const bool TargetRejectsWeak =
        !TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO();
    if (GA->getGlobal()->hasExternalWeakLinkage() && TargetRejectsWeak)
      return false;
  }

  // From here on we look for cases where a tail call is possible without
  // changing the ABI. Sibcall is used in some places (particularly gcc) to
  // refer to this concept.

  // I want anyone implementing a new calling convention to think long and hard
  // about this assert.
  assert((!isVarArg || CalleeCC == CallingConv::C) &&
         "Unexpected variadic calling convention");

  LLVMContext &C = *DAG.getContext();
  if (isVarArg && !Outs.empty()) {
    // At least two cases here: if caller is fastcc then we can't have any
    // memory arguments (we'd be expected to clean up the stack afterwards). If
    // caller is C then we could potentially use its argument area.

    // FIXME: for now we take the most conservative of these in both cases:
    // disallow all variadic memory operands.
    SmallVector<CCValAssign, 16> VarArgLocs;
    CCState VarArgInfo(CalleeCC, isVarArg, MF, VarArgLocs, C);
    VarArgInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
    for (const CCValAssign &Loc : VarArgLocs) {
      if (Loc.isRegLoc())
        continue;
      return false;
    }
  }

  // The call results must be passed the same way under both conventions.
  const bool ResultsMatch = CCState::resultsCompatible(
      CalleeCC, CallerCC, MF, C, Ins, CCAssignFnForCall(CalleeCC, isVarArg),
      CCAssignFnForCall(CallerCC, isVarArg));
  if (!ResultsMatch)
    return false;

  // The callee has to preserve all registers the caller needs to preserve.
  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (!SameCC) {
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (Subtarget->hasCustomCallingConv()) {
      TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
      TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
    }
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
      return false;
  }

  // A callee that takes no arguments needs no further checks.
  if (Outs.empty())
    return true;

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));

  // If the stack arguments for this call do not fit into our own save area
  // then the call cannot be made tail.
  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
    return false;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
}
/// Build a TokenFactor that joins \p Chain with the chain result of every
/// load of an incoming stack argument that overlaps the frame object
/// \p ClobberedFI.
///
/// Only loads that use the entry node and whose base pointer is a frame index
/// with a negative index are considered.
SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
                                                   SelectionDAG &DAG,
                                                   MachineFrameInfo &MFI,
                                                   int ClobberedFI) const {
  // Inclusive byte range covered by the object about to be overwritten.
  const int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
  const int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;

  // Include the original chain at the beginning of the list. When this is
  // used by target LowerCall hooks, this helps legalize find the
  // CALLSEQ_BEGIN node.
  SmallVector<SDValue, 8> ArgChains;
  ArgChains.push_back(Chain);

  // Add a chain value for each overlapping stack argument load.
  SDNode *Entry = DAG.getEntryNode().getNode();
  for (SDNode::use_iterator UI = Entry->use_begin(), UE = Entry->use_end();
       UI != UE; ++UI) {
    LoadSDNode *Load = dyn_cast<LoadSDNode>(*UI);
    if (!Load)
      continue;

    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Load->getBasePtr());
    if (!FINode || FINode->getIndex() >= 0)
      continue;

    // Inclusive byte range read by this load's frame object.
    const int FrameIdx = FINode->getIndex();
    const int64_t InFirstByte = MFI.getObjectOffset(FrameIdx);
    const int64_t InLastByte = InFirstByte + (MFI.getObjectSize(FrameIdx) - 1);

    // The ranges overlap when either one contains the other's first byte.
    const bool ClobberStartsInside =
        InFirstByte <= FirstByte && FirstByte <= InLastByte;
    const bool LoadStartsInside =
        FirstByte <= InFirstByte && InFirstByte <= LastByte;
    if (ClobberStartsInside || LoadStartsInside)
      ArgChains.push_back(SDValue(Load, 1));
  }

  // Build a tokenfactor for all the chains.
  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}
/// Whether the callee restores the argument stack: true only for the fast
/// calling convention when \p TailCallOpt is enabled.
bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
                                                   bool TailCallOpt) const {
  if (CallCC != CallingConv::Fast)
    return false;
  return TailCallOpt;
}
/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
/// and add input and output parameter nodes.
SDValue
AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
SDLoc &DL = CLI.DL;
SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
bool &IsTailCall = CLI.IsTailCall;
CallingConv::ID CallConv = CLI.CallConv;
bool IsVarArg = CLI.IsVarArg;
MachineFunction &MF = DAG.getMachineFunction();
bool IsThisReturn = false;
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
bool IsSibCall = false;
if (IsTailCall) {
// Check if it's really possible to do a tail call.
IsTailCall = isEligibleForTailCallOptimization(
Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
// A sibling call is one where we're under the usual C ABI and not planning
// to change that but can still do a tail call:
if (!TailCallOpt && IsTailCall)
IsSibCall = true;
if (IsTailCall)
++NumTailCalls;
}
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
if (IsVarArg) {
// Handle fixed and variable vector arguments differently.
// Variable vector arguments always go into memory.
unsigned NumArgs = Outs.size();
for (unsigned i = 0; i != NumArgs; ++i) {
MVT ArgVT = Outs[i].VT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
/*IsVarArg=*/ !Outs[i].IsFixed);
bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
assert(!Res && "Call operand has unhandled type");
(void)Res;
}
} else {
// At this point, Outs[].VT may already be promoted to i32. To correctly
// handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
// i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
// Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
// we use a special version of AnalyzeCallOperands to pass in ValVT and
// LocVT.
unsigned NumArgs = Outs.size();
for (unsigned i = 0; i != NumArgs; ++i) {
MVT ValVT = Outs[i].VT;
// Get type of the original argument.
EVT ActualVT = getValueType(DAG.getDataLayout(),
CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
/*AllowUnknown*/ true);
MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
// If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
ValVT = MVT::i8;
else if (ActualMVT == MVT::i16)
ValVT = MVT::i16;
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
assert(!Res && "Call operand has unhandled type");
(void)Res;
}
}
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
if (IsSibCall) {
// Since we're not changing the ABI to make this a tail call, the memory
// operands are already available in the caller's incoming argument space.
NumBytes = 0;
}
// FPDiff is the byte offset of the call's argument area from the callee's.
// Stores to callee stack arguments will be placed in FixedStackSlots offset
// by this amount for a tail call. In a sibling call it must be 0 because the
// caller will deallocate the entire stack and the callee still expects its
// arguments to begin at SP+0. Completely unused for non-tail calls.
int FPDiff = 0;
if (IsTailCall && !IsSibCall) {
unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
// Since callee will pop argument stack as a tail call, we must keep the
// popped size 16-byte aligned.
NumBytes = alignTo(NumBytes, 16);
// FPDiff will be negative if this tail call requires more space than we
// would automatically have in our incoming argument space. Positive if we
// can actually shrink the stack.
FPDiff = NumReusableBytes - NumBytes;
// The stack pointer must be 16-byte aligned at all times it's used for a
// memory operation, which in practice means at *all* times and in
// particular across call boundaries. Therefore our own arguments started at
// a 16-byte aligned SP and the delta applied for the tail call should
// satisfy the same constraint.
assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
}
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
if (!IsSibCall)
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
getPointerTy(DAG.getDataLayout()));
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
auto PtrVT = getPointerTy(DAG.getDataLayout());
if (IsVarArg && CLI.CS && CLI.CS.isMustTailCall()) {
const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
for (const auto &F : Forwards) {
SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
}
}
// Walk the register/memloc assignments, inserting copies/loads.
for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
++i, ++realArgIdx) {
CCValAssign &VA = ArgLocs[i];
SDValue Arg = OutVals[realArgIdx];
ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
// Promote the value if needed.
switch (VA.getLocInfo()) {
default:
llvm_unreachable("Unknown loc info!");
case CCValAssign::Full:
break;
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::ZExt:
Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::AExt:
if (Outs[realArgIdx].ArgVT == MVT::i1) {
// AAPCS requires i1 to be zero-extended to 8-bits by the caller.
Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
}
Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::BCvt:
Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::FPExt:
Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
break;
}
if (VA.isRegLoc()) {
if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
Outs[0].VT == MVT::i64) {
assert(VA.getLocVT() == MVT::i64 &&
"unexpected calling convention register assignment");
assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
"unexpected use of 'returned'");
IsThisReturn = true;
}
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
} else {
assert(VA.isMemLoc());
SDValue DstAddr;
MachinePointerInfo DstInfo;
// FIXME: This works on big-endian for composite byvals, which are the
// common case. It should also work for fundamental types too.
uint32_t BEAlign = 0;
unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
: VA.getValVT().getSizeInBits();
OpSize = (OpSize + 7) / 8;
if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
!Flags.isInConsecutiveRegs()) {
if (OpSize < 8)
BEAlign = 8 - OpSize;
}
unsigned LocMemOffset = VA.getLocMemOffset();
int32_t Offset = LocMemOffset + BEAlign;
SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
if (IsTailCall) {
Offset = Offset + FPDiff;
int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
DstAddr = DAG.getFrameIndex(FI, PtrVT);
DstInfo =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
// Make sure any stack arguments overlapping with where we're storing
// are loaded before this eventual operation. Otherwise they'll be
// clobbered.
Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
} else {
SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(),
LocMemOffset);
}
if (Outs[i].Flags.isByVal()) {
SDValue SizeNode =
DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
SDValue Cpy = DAG.getMemcpy(
Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
/*isVol = */ false, /*AlwaysInline = */ false,
/*isTailCall = */ false,
DstInfo, MachinePointerInfo());
MemOpChains.push_back(Cpy);
} else {
// Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
// promoted to a legal register type i32, we should truncate Arg back to
// i1/i8/i16.
if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
VA.getValVT() == MVT::i16)
Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
MemOpChains.push_back(Store);
}
}
}
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into the appropriate regs.
SDValue InFlag;
for (auto &RegToPass : RegsToPass) {
Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
RegToPass.second, InFlag);
InFlag = Chain.getValue(1);
}
// If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
// direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
// node so that legalize doesn't hack it.
if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
auto GV = G->getGlobal();
if (Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()) ==
AArch64II::MO_GOT) {
Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT);
Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
} else if (Subtarget->isTargetCOFF() && GV->hasDLLImportStorageClass()) {
assert(Subtarget->isTargetWindows() &&
"Windows is the only supported COFF target");
Callee = getGOT(G, DAG, AArch64II::MO_DLLIMPORT);
} else {
const GlobalValue *GV = G->getGlobal();
Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
}
} else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
if (getTargetMachine().getCodeModel() == CodeModel::Large &&
Subtarget->isTargetMachO()) {
const char *Sym = S->getSymbol();
Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
} else {
const char *Sym = S->getSymbol();
Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
}
}
// We don't usually want to end the call-sequence here because we would tidy
// the frame up *after* the call, however in the ABI-changing tail-call case
// we've carefully laid out the parameters so that when sp is reset they'll be
// in the correct location.
if (IsTailCall && !IsSibCall) {
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
InFlag = Chain.getValue(1);
}
std::vector<SDValue> Ops;
Ops.push_back(Chain);
Ops.push_back(Callee);
if (IsTailCall) {
// Each tail call may have to adjust the stack by a different amount, so
// this information must travel along with the operation for eventual
// consumption by emitEpilogue.
Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
}
// Add argument registers to the end of the list so that they are known live
// into the call.
for (auto &RegToPass : RegsToPass)
Ops.push_back(DAG.getRegister(RegToPass.first,
RegToPass.second.getValueType()));
// Add a register mask operand representing the call-preserved registers.
const uint32_t *Mask;
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
if (IsThisReturn) {
// For 'this' returns, use the X0-preserving mask if applicable
Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
if (!Mask) {
IsThisReturn = false;
Mask = TRI->getCallPreservedMask(MF, CallConv);
}
} else
Mask = TRI->getCallPreservedMask(MF, CallConv);
if (Subtarget->hasCustomCallingConv())
TRI->UpdateCustomCallPreservedMask(MF, &Mask);
if (TRI->isAnyArgRegReserved(MF))
TRI->emitReservedArgRegCallError(MF);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
if (InFlag.getNode())
Ops.push_back(InFlag);
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
// If we're doing a tail call, use a TC_RETURN here rather than an
// actual call instruction.
if (IsTailCall) {
MF.getFrameInfo().setHasTailCall();
return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
}
// Returns a chain and a flag for retval copy to use.
Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
InFlag = Chain.getValue(1);
uint64_t CalleePopBytes =
DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
DAG.getIntPtrConstant(CalleePopBytes, DL, true),
InFlag, DL);
if (!Ins.empty())
InFlag = Chain.getValue(1);
// Handle result values, copying them out of physregs into vregs that we
// return.
return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
InVals, IsThisReturn,
IsThisReturn ? OutVals[0] : SDValue());
}
/// Report whether the values described by \p Outs can be returned under the
/// return convention selected by \p CallConv, as decided by
/// CCState::CheckReturn.
bool AArch64TargetLowering::CanLowerReturn(
    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
  // WebKit_JS has its own return convention; everything else uses AAPCS.
  CCAssignFn *AssignFn = RetCC_AArch64_AAPCS;
  if (CallConv == CallingConv::WebKit_JS)
    AssignFn = RetCC_AArch64_WebKit_JS;

  SmallVector<CCValAssign, 16> Locs;
  CCState State(CallConv, isVarArg, MF, Locs, Context);
  return State.CheckReturn(Outs, AssignFn);
}
/// Lower a function return: copy every outgoing value into the register the
/// return convention assigned to it, append any extra implicit register
/// operands, and emit the final AArch64ISD::RET_FLAG node.
SDValue
AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                   bool isVarArg,
                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
                                   const SmallVectorImpl<SDValue> &OutVals,
                                   const SDLoc &DL, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();

  // WebKit_JS has its own return convention; everything else uses AAPCS.
  CCAssignFn *AssignFn = RetCC_AArch64_AAPCS;
  if (CallConv == CallingConv::WebKit_JS)
    AssignFn = RetCC_AArch64_WebKit_JS;

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, AssignFn);

  // Operand 0 is the chain; it is patched with the final chain at the end.
  SDValue Glue;
  SmallVector<SDValue, 4> RetOps;
  RetOps.push_back(Chain);

  // Move each result value into its assigned output register.
  for (unsigned Idx = 0, NumLocs = RVLocs.size(); Idx != NumLocs; ++Idx) {
    CCValAssign &VA = RVLocs[Idx];
    assert(VA.isRegLoc() && "Can only return in registers!");

    SDValue Val = OutVals[Idx];
    switch (VA.getLocInfo()) {
    case CCValAssign::Full:
      if (Outs[Idx].ArgVT == MVT::i1) {
        // AAPCS requires i1 to be zero-extended to i8 by the producer of the
        // value. This is strictly redundant on Darwin (which uses "zeroext
        // i1"), but will be optimised out before ISel.
        Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Val);
        Val = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Val);
      }
      break;
    case CCValAssign::BCvt:
      Val = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Val);
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }

    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Val, Glue);
    Glue = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  }

  // Windows AArch64 ABIs require that, when returning a struct by value, the
  // sret argument is copied into X0 for the return. The argument was saved
  // into a virtual register in the entry block, so read it back (off the
  // incoming chain still held in RetOps[0]) and copy it into X0.
  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
    SDValue SRetVal = DAG.getCopyFromReg(RetOps.front(), DL, SRetReg,
                                         getPointerTy(MF.getDataLayout()));
    const unsigned RetValReg = AArch64::X0;
    Chain = DAG.getCopyToReg(Chain, DL, RetValReg, SRetVal, Glue);
    Glue = Chain.getValue(1);
    RetOps.push_back(
        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
  }

  // Append the registers reported by getCalleeSavedRegsViaCopy (a
  // zero-terminated list, possibly null) as additional return operands.
  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  if (const MCPhysReg *CSR = TRI->getCalleeSavedRegsViaCopy(&MF)) {
    while (*CSR) {
      if (AArch64::GPR64RegClass.contains(*CSR))
        RetOps.push_back(DAG.getRegister(*CSR, MVT::i64));
      else if (AArch64::FPR64RegClass.contains(*CSR))
        RetOps.push_back(DAG.getRegister(*CSR, MVT::getFloatingPointVT(64)));
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
      ++CSR;
    }
  }

  // Install the up-to-date chain, and pass the glue along if there is any.
  RetOps[0] = Chain;
  if (Glue.getNode())
    RetOps.push_back(Glue);

  return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
}
//===----------------------------------------------------------------------===//
// Other Lowering Code
//===----------------------------------------------------------------------===//
// Build the target global-address node for N, keeping its global and offset
// and attaching the requested operand flags.
SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
                                             SelectionDAG &DAG,
                                             unsigned Flag) const {
  const SDLoc Loc(N);
  const GlobalValue *Global = N->getGlobal();
  return DAG.getTargetGlobalAddress(Global, Loc, Ty, N->getOffset(), Flag);
}
// Build the target jump-table node for N's table index with the requested
// operand flags.
SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
                                             SelectionDAG &DAG,
                                             unsigned Flag) const {
  const int TableIndex = N->getIndex();
  return DAG.getTargetJumpTable(TableIndex, Ty, Flag);
}
// Build the target constant-pool node for N, keeping its constant, alignment
// and offset and attaching the requested operand flags.
SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
                                             SelectionDAG &DAG,
                                             unsigned Flag) const {
  return DAG.getTargetConstantPool(N->getConstVal(), Ty,
                                   /*Alignment=*/N->getAlignment(),
                                   /*Offset=*/N->getOffset(),
                                   /*TargetFlags=*/Flag);
}
// Build the target block-address node for N's block address with the
// requested operand flags. The offset passed on is always zero.
SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode *N, EVT Ty,
                                             SelectionDAG &DAG,
                                             unsigned Flag) const {
  const BlockAddress *Target = N->getBlockAddress();
  return DAG.getTargetBlockAddress(Target, Ty, /*Offset=*/0, Flag);
}
// Produces: (loadGOT sym)
// The symbol operand carries MO_GOT in addition to the caller's flags.
template <class NodeTy>
SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
                                      unsigned Flags) const {
  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
  const SDLoc Loc(N);
  const EVT PtrTy = getPointerTy(DAG.getDataLayout());
  SDValue GotSym = getTargetNode(N, PtrTy, DAG, AArch64II::MO_GOT | Flags);
  // FIXME: Once remat is capable of dealing with instructions with register
  // operands, expand this into two nodes instead of using a wrapper node.
  return DAG.getNode(AArch64ISD::LOADgot, Loc, PtrTy, GotSym);
}
// Produces: (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
// The symbol is referenced four times, once per MO_G3..MO_G0 fragment; all but
// the MO_G3 fragment additionally carry MO_NC.
template <class NodeTy>
SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
                                            unsigned Flags) const {
  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
  const SDLoc Loc(N);
  const EVT PtrTy = getPointerTy(DAG.getDataLayout());

  SDValue G3 = getTargetNode(N, PtrTy, DAG, AArch64II::MO_G3 | Flags);
  SDValue G2 = getTargetNode(
      N, PtrTy, DAG, AArch64II::MO_G2 | AArch64II::MO_NC | Flags);
  SDValue G1 = getTargetNode(
      N, PtrTy, DAG, AArch64II::MO_G1 | AArch64II::MO_NC | Flags);
  SDValue G0 = getTargetNode(
      N, PtrTy, DAG, AArch64II::MO_G0 | AArch64II::MO_NC | Flags);

  return DAG.getNode(AArch64ISD::WrapperLarge, Loc, PtrTy, G3, G2, G1, G0);
}
// Produces: (addlow (adrp %hi(sym)) %lo(sym))
// The page reference carries MO_PAGE; the page-offset reference carries
// MO_PAGEOFF | MO_NC. Both also carry the caller's flags.
template <class NodeTy>
SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
                                       unsigned Flags) const {
  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
  const SDLoc Loc(N);
  const EVT PtrTy = getPointerTy(DAG.getDataLayout());

  SDValue PageSym = getTargetNode(N, PtrTy, DAG, AArch64II::MO_PAGE | Flags);
  SDValue PageOffSym = getTargetNode(
      N, PtrTy, DAG, AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);

  SDValue PageAddr = DAG.getNode(AArch64ISD::ADRP, Loc, PtrTy, PageSym);
  return DAG.getNode(AArch64ISD::ADDlow, Loc, PtrTy, PageAddr, PageOffSym);
}
// Produces: (adr sym)
// The symbol operand carries exactly the caller's flags.
template <class NodeTy>
SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
                                           unsigned Flags) const {
  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
  const SDLoc Loc(N);
  const EVT PtrTy = getPointerTy(DAG.getDataLayout());
  SDValue Target = getTargetNode(N, PtrTy, DAG, Flags);
  return DAG.getNode(AArch64ISD::ADR, Loc, PtrTy, Target);
}
/// Lower a GlobalAddress node into the address-materialisation sequence that
/// matches the reference classification of the global and the code model.
SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
                                                  SelectionDAG &DAG) const {
  GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
  const TargetMachine &TM = getTargetMachine();
  unsigned char OpFlags =
      Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM);

  // A reference that needs any operand flag must not carry an offset.
  assert((OpFlags == AArch64II::MO_NO_FLAG || GN->getOffset() == 0) &&
         "unexpected offset in global node");

  // This also catches the large code model case for Darwin, and tiny code
  // model with got relocations.
  if (OpFlags & AArch64II::MO_GOT)
    return getGOT(GN, DAG, OpFlags);

  // Otherwise pick the addressing sequence from the code model.
  SDValue Addr;
  switch (TM.getCodeModel()) {
  case CodeModel::Large:
    Addr = getAddrLarge(GN, DAG, OpFlags);
    break;
  case CodeModel::Tiny:
    Addr = getAddrTiny(GN, DAG, OpFlags);
    break;
  default:
    Addr = getAddr(GN, DAG, OpFlags);
    break;
  }

  // For dllimport and COFF-stub references the computed address is loaded
  // through once more to obtain the result.
  if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB)) {
    EVT PtrVT = getPointerTy(DAG.getDataLayout());
    SDLoc DL(GN);
    Addr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Addr,
                       MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  }
  return Addr;
}
/// Lower a thread-local address reference for Darwin into the loads and the
/// call that compute the variable's address, returning the SDValue holding the
/// final node.
///
/// Darwin has a single TLS scheme, so it has to cope with the fully general
/// worst case:
///   + an "extern __thread" declaration,
///   + defined in a possibly unknown dynamic library.
///
/// Every __thread variable has a [3 x i64] descriptor holding what the runtime
/// needs to calculate the address. The compiler only cares about the first
/// xword: a function pointer that must be called with the address of the whole
/// descriptor in "x0".
///
/// Because the descriptor may live in a different unit, in general even the
/// descriptor has to be reached through an indirect load. The "ideal" code
/// sequence is:
///     adrp x0, _var@TLVPPAGE
///     ldr x0, [x0, _var@TLVPPAGEOFF]   ; x0 now contains address of descriptor
///     ldr x1, [x0]                     ; x1 contains 1st entry of descriptor,
///                                      ; the function pointer
///     blr x1                           ; Uses descriptor address in x0
///                                      ; Address of _var is now in x0.
///
/// When the linker *does* know the address of _var's descriptor, it can turn
/// the first "ldr" into a suitable "add x0, x0, #imm" for a slight efficiency
/// gain.
SDValue
AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
                                                   SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin() &&
         "This function expects a Darwin target");

  SDLoc DL(Op);
  MVT PtrVT = getPointerTy(DAG.getDataLayout());
  MachineFunction &MF = DAG.getMachineFunction();
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();

  // Address of the variable's descriptor, fetched with a LOADgot node.
  SDValue DescSym =
      DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
  SDValue Descriptor = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, DescSym);

  // The first entry in the descriptor is a function pointer that we must call
  // to obtain the address of the variable.
  const MachineMemOperand::Flags LoadFlags =
      MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant |
      MachineMemOperand::MODereferenceable;
  SDValue Resolver =
      DAG.getLoad(MVT::i64, DL, DAG.getEntryNode(), Descriptor,
                  MachinePointerInfo::getGOT(MF),
                  /* Alignment = */ 8, LoadFlags);
  SDValue Chain = Resolver.getValue(1);

  MF.getFrameInfo().setAdjustsStack(true);

  // TLS calls preserve all registers except those that absolutely must be
  // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
  // silly).
  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  const uint32_t *Mask = TRI->getTLSCallPreservedMask();
  if (Subtarget->hasCustomCallingConv())
    TRI->UpdateCustomCallPreservedMask(MF, &Mask);

  // Finally, we can make the call. This is just a degenerate version of a
  // normal AArch64 call node: x0 takes the address of the descriptor, and
  // returns the address of the variable in this thread.
  Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, Descriptor, SDValue());
  Chain =
      DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
                  Chain, Resolver, DAG.getRegister(AArch64::X0, MVT::i64),
                  DAG.getRegisterMask(Mask), Chain.getValue(1));
  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
}
/// Thread-local accesses under the general-dynamic or local-dynamic model go
/// through a "TLS-descriptor" call. The variable has a descriptor, reachable
/// via a PC-relative ADRP, whose first entry is a function pointer that
/// performs the resolution.
///
/// The sequence is:
///    adrp  x0, :tlsdesc:var
///    ldr   x1, [x0, #:tlsdesc_lo12:var]
///    add   x0, x0, #:tlsdesc_lo12:var
///    .tlsdesccall var
///    blr   x1
///    (TPIDR_EL0 offset now in x0)
///
/// This sequence has to be emitted unscheduled so that the linker can
/// optimize/relax it. It is therefore represented by a single
/// pseudo-instruction (TLSDESC_CALLSEQ) that is expanded really late in the
/// compilation flow, which guarantees the sequence comes out exactly as above.
SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
                                                      const SDLoc &DL,
                                                      SelectionDAG &DAG) const {
  SDVTList ResultTys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue Ops[] = {DAG.getEntryNode(), SymAddr};
  SDValue CallSeq =
      DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, ResultTys, Ops);

  // The result is read out of X0, glued to the call sequence.
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  return DAG.getCopyFromReg(CallSeq, DL, AArch64::X0, PtrVT,
                            CallSeq.getValue(1));
}
/// Lower a thread-local GlobalAddress for ELF targets according to the TLS
/// model the target machine selects for the global.
SDValue
AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(Subtarget->isTargetELF() && "This function expects an ELF target");

  const TargetMachine &TM = getTargetMachine();
  if (TM.getCodeModel() == CodeModel::Large)
    report_fatal_error("ELF TLS only supported in small memory model");

  // Different choices can be made for the maximum size of the TLS area for a
  // module. For the small address model, the default TLS size is 16MiB and the
  // maximum TLS size is 4GiB.
  // FIXME: add -mtls-size command line option and make it control the 16MiB
  // vs. 4GiB code sequence generation.
  // FIXME: add tiny codemodel support. We currently generate the same code as
  // small, which may be larger than needed.
  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = GA->getGlobal();

  // Unless local-dynamic generation is enabled, treat it as general-dynamic.
  TLSModel::Model Model = TM.getTLSModel(GV);
  if (Model == TLSModel::LocalDynamic &&
      !EnableAArch64ELFLocalDynamicTLSGeneration)
    Model = TLSModel::GeneralDynamic;

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(Op);
  SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);

  // Emits "ADDXri Base, Sym, 0" and yields its first result.
  auto AddSymbolPart = [&](SDValue Base, SDValue Sym) {
    return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Base, Sym,
                                      DAG.getTargetConstant(0, DL, MVT::i32)),
                   0);
  };

  // Local-exec: add the HI12 and low parts of the variable's offset directly
  // onto the thread pointer.
  if (Model == TLSModel::LocalExec) {
    SDValue HiVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
    SDValue LoVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0,
        AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
    SDValue WithHi = AddSymbolPart(ThreadBase, HiVar);
    return AddSymbolPart(WithHi, LoVar);
  }

  // The remaining models compute an offset that is added to the thread
  // pointer at the end.
  SDValue TPOff;
  if (Model == TLSModel::InitialExec) {
    TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
    TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
  } else if (Model == TLSModel::LocalDynamic) {
    // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
    // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
    // the beginning of the module's TLS region, followed by a DTPREL offset
    // calculation.
    // These accesses will need deduplicating if there's more than one.
    AArch64FunctionInfo *MFI =
        DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
    MFI->incNumLocalDynamicTLSAccesses();

    // The call needs a relocation too for linker relaxation. It doesn't make
    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
    // the address.
    SDValue ModuleBase = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
                                                     AArch64II::MO_TLS);

    // Now we can calculate the offset from TPIDR_EL0 to this module's
    // thread-local area.
    TPOff = LowerELFTLSDescCallSeq(ModuleBase, DL, DAG);

    // Now use :dtprel_whatever: operations to calculate this variable's offset
    // in its thread-storage area.
    SDValue HiVar = DAG.getTargetGlobalAddress(
        GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
    SDValue LoVar = DAG.getTargetGlobalAddress(
        GV, DL, MVT::i64, 0,
        AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
    TPOff = AddSymbolPart(TPOff, HiVar);
    TPOff = AddSymbolPart(TPOff, LoVar);
  } else if (Model == TLSModel::GeneralDynamic) {
    // The call needs a relocation too for linker relaxation. It doesn't make
    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
    // the address.
    SDValue VarSym =
        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);

    // Finally we can make a call to calculate the offset from tpidr_el0.
    TPOff = LowerELFTLSDescCallSeq(VarSym, DL, DAG);
  } else {
    llvm_unreachable("Unsupported ELF TLS access model");
  }

  return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
}
/// Lower a thread-local GlobalAddress for Windows: fetch the TLS array pointer
/// from the TEB, index it with _tls_index to find this module's TLS block, and
/// add the variable's offset within that block.
SDValue
AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
                                                    SelectionDAG &DAG) const {
  assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(Op);

  // The TEB pointer is read from X18.
  SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);

  // Load the ThreadLocalStoragePointer from the TEB
  // A pointer to the TLS array is located at offset 0x58 from the TEB.
  SDValue TLSArrayAddr =
      DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
  SDValue TLSArray = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), TLSArrayAddr,
                                 MachinePointerInfo());

  // Load the TLS index from the C runtime;
  // This does the same as getAddr(), but without having a GlobalAddressSDNode.
  // This also does the same as LOADgot, but using a generic i32 load,
  // while LOADgot only loads i64.
  SDValue IndexPage =
      DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
  SDValue IndexPageOff = DAG.getTargetExternalSymbol(
      "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
  SDValue IndexPageAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, IndexPage);
  SDValue IndexAddr =
      DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, IndexPageAddr, IndexPageOff);
  SDValue TLSIndex = DAG.getLoad(MVT::i32, DL, TLSArray.getValue(1), IndexAddr,
                                 MachinePointerInfo());
  SDValue IndexChain = TLSIndex.getValue(1);

  // The pointer to the thread's TLS data area is at the TLS Index scaled by 8
  // offset into the TLSArray.
  TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
  SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
                             DAG.getConstant(3, DL, PtrVT));
  SDValue SlotAddr = DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot);
  SDValue TLSBase =
      DAG.getLoad(PtrVT, DL, IndexChain, SlotAddr, MachinePointerInfo());

  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  SDValue VarHi = DAG.getTargetGlobalAddress(
      GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
  SDValue VarLo = DAG.getTargetGlobalAddress(
      GV, DL, PtrVT, 0,
      AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);

  // Add the offset from the start of the .tls section (section base).
  SDValue WithHi =
      SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLSBase, VarHi,
                                 DAG.getTargetConstant(0, DL, MVT::i32)),
              0);
  return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, WithHi, VarLo);
}
/// Dispatch thread-local address lowering: emulated TLS takes priority,
/// otherwise the platform-specific lowering is chosen from the subtarget.
SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
                                                     SelectionDAG &DAG) const {
  const GlobalAddressSDNode *Global = cast<GlobalAddressSDNode>(Op);

  if (DAG.getTarget().useEmulatedTLS()) {
    return LowerToTLSEmulatedModel(Global, DAG);
  }
  if (Subtarget->isTargetDarwin()) {
    return LowerDarwinGlobalTLSAddress(Op, DAG);
  }
  if (Subtarget->isTargetELF()) {
    return LowerELFGlobalTLSAddress(Op, DAG);
  }
  if (Subtarget->isTargetWindows()) {
    return LowerWindowsGlobalTLSAddress(Op, DAG);
  }

  llvm_unreachable("Unexpected platform trying to use TLS");
}
/// Lower a BR_CC node into an AArch64 branch.
///
/// Operands are read as: 0 = chain, 1 = condition code, 2 = LHS, 3 = RHS,
/// 4 = branch destination. Depending on the operands this yields a
/// CBZ/CBNZ/TBZ/TBNZ node, or one (for some FP conditions two chained) BRCOND
/// node(s) fed by a comparison. An empty SDValue is returned when the
/// comparison is on the overflow result of an XALUO op whose type is not legal.
SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
SDValue LHS = Op.getOperand(2);
SDValue RHS = Op.getOperand(3);
SDValue Dest = Op.getOperand(4);
SDLoc dl(Op);
MachineFunction &MF = DAG.getMachineFunction();
// Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
// will not be produced, as they are conditional branch instructions that do
// not set flags.
bool ProduceNonFlagSettingCondBr =
!MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
// Handle f128 first, since lowering it will result in comparing the return
// value of a libcall against zero, which is just what the rest of LowerBR_CC
// is expecting to deal with.
if (LHS.getValueType() == MVT::f128) {
softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
// If softenSetCCOperands returned a scalar, we need to compare the result
// against zero to select between true and false values.
if (!RHS.getNode()) {
RHS = DAG.getConstant(0, dl, LHS.getValueType());
CC = ISD::SETNE;
}
}
// Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
// instruction.
if (isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
// Only lower legal XALUO ops.
if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
return SDValue();
// The actual operation with overflow check.
AArch64CC::CondCode OFCC;
SDValue Value, Overflow;
std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
// The overflow result is compared against one here, so for SETNE the
// condition code is inverted.
if (CC == ISD::SETNE)
OFCC = getInvertedCondCode(OFCC);
SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
Overflow);
}
// Integer comparisons: try the compare-and-branch / test-bit-and-branch forms
// first, then fall back to an explicit compare feeding BRCOND.
if (LHS.getValueType().isInteger()) {
assert((LHS.getValueType() == RHS.getValueType()) &&
(LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
// If the RHS of the comparison is zero, we can potentially fold this
// to a specialized branch.
const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
if (CC == ISD::SETEQ) {
// See if we can use a TBZ to fold in an AND as well.
// TBZ has a smaller branch displacement than CBZ. If the offset is
// out of bounds, a late MI-layer pass rewrites branches.
// 403.gcc is an example that hits this case.
if (LHS.getOpcode() == ISD::AND &&
isa<ConstantSDNode>(LHS.getOperand(1)) &&
isPowerOf2_64(LHS.getConstantOperandVal(1))) {
SDValue Test = LHS.getOperand(0);
uint64_t Mask = LHS.getConstantOperandVal(1);
return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
Dest);
}
return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
} else if (CC == ISD::SETNE) {
// See if we can use a TBNZ to fold in an AND as well.
// TBNZ has a smaller branch displacement than CBNZ. If the offset is
// out of bounds, a late MI-layer pass rewrites branches.
// 403.gcc is an example that hits this case.
if (LHS.getOpcode() == ISD::AND &&
isa<ConstantSDNode>(LHS.getOperand(1)) &&
isPowerOf2_64(LHS.getConstantOperandVal(1))) {
SDValue Test = LHS.getOperand(0);
uint64_t Mask = LHS.getConstantOperandVal(1);
return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
Dest);
}
return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
} else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
// Don't combine AND since emitComparison converts the AND to an ANDS
// (a.k.a. TST) and the test in the test bit and branch instruction
// becomes redundant. This would also increase register pressure.
// "LHS < 0" is a test of the top (sign) bit, so Mask holds that bit's
// index.
uint64_t Mask = LHS.getValueSizeInBits() - 1;
return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
DAG.getConstant(Mask, dl, MVT::i64), Dest);
}
}
if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
// Don't combine AND since emitComparison converts the AND to an ANDS
// (a.k.a. TST) and the test in the test bit and branch instruction
// becomes redundant. This would also increase register pressure.
// "LHS > -1" is a test that the top (sign) bit is clear, so Mask holds
// that bit's index.
uint64_t Mask = LHS.getValueSizeInBits() - 1;
return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
DAG.getConstant(Mask, dl, MVT::i64), Dest);
}
// General integer case: emit the comparison and branch on its condition.
SDValue CCVal;
SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
Cmp);
}
assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
LHS.getValueType() == MVT::f64);
// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
// clean. Some of them require two branches to implement.
SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
AArch64CC::CondCode CC1, CC2;
changeFPCCToAArch64CC(CC, CC1, CC2);
SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
SDValue BR1 =
DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
// A second condition other than AL means a second branch is needed; it is
// chained after the first and reuses the same comparison.
if (CC2 != AArch64CC::AL) {
SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
Cmp);
}
return BR1;
}
/// Lower FCOPYSIGN with a vector bit-select (AArch64ISD::BIT) under a
/// sign-bit mask. Scalar operands are placed into a vector register through
/// the matching sub-register and the result is extracted the same way; vector
/// operands are bitcast to the integer vector type and back.
SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
                                              SelectionDAG &DAG) const {
  const EVT VT = Op.getValueType();
  SDLoc DL(Op);
  SDValue Magnitude = Op.getOperand(0);
  SDValue Sign = Op.getOperand(1);

  // Bring the sign operand to the result type if the widths differ.
  const EVT SignVT = Sign.getValueType();
  if (SignVT.bitsLT(VT))
    Sign = DAG.getNode(ISD::FP_EXTEND, DL, VT, Sign);
  else if (SignVT.bitsGT(VT))
    Sign =
        DAG.getNode(ISD::FP_ROUND, DL, VT, Sign, DAG.getIntPtrConstant(0, DL));

  // Per element width: the integer vector type to operate in, the sign-bit
  // mask for one element, and the sub-register that holds a scalar.
  EVT VecVT;
  uint64_t EltMask = 0;
  int SubRegIdx = 0;
  bool NegateMask = false;
  if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
    VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32);
    EltMask = 0x80000000ULL;
    SubRegIdx = AArch64::ssub;
  } else if (VT == MVT::f64 || VT == MVT::v2f64) {
    VecVT = MVT::v2i64;
    // We want to materialize a mask with the high bit set, but the AdvSIMD
    // immediate moves cannot materialize that in a single instruction for
    // 64-bit elements. Instead, materialize zero and then negate it.
    EltMask = 0;
    NegateMask = true;
    SubRegIdx = AArch64::dsub;
  } else if (VT == MVT::f16 || VT == MVT::v4f16 || VT == MVT::v8f16) {
    VecVT = (VT == MVT::v4f16 ? MVT::v4i16 : MVT::v8i16);
    EltMask = 0x8000ULL;
    SubRegIdx = AArch64::hsub;
  } else {
    llvm_unreachable("Invalid type for copysign!");
  }

  // Move both operands into the integer vector type.
  const bool IsVector = VT.isVector();
  SDValue MagVec, SignVec;
  if (IsVector) {
    MagVec = DAG.getNode(ISD::BITCAST, DL, VecVT, Magnitude);
    SignVec = DAG.getNode(ISD::BITCAST, DL, VecVT, Sign);
  } else {
    MagVec = DAG.getTargetInsertSubreg(SubRegIdx, DL, VecVT,
                                       DAG.getUNDEF(VecVT), Magnitude);
    SignVec = DAG.getTargetInsertSubreg(SubRegIdx, DL, VecVT,
                                        DAG.getUNDEF(VecVT), Sign);
  }

  SDValue MaskVec = DAG.getConstant(EltMask, DL, VecVT);
  // If we couldn't materialize the mask above, then the mask vector will be
  // the zero vector, and we need to negate it here.
  if (NegateMask) {
    MaskVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, MaskVec);
    MaskVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, MaskVec);
    MaskVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, MaskVec);
  }

  SDValue Sel =
      DAG.getNode(AArch64ISD::BIT, DL, VecVT, MagVec, SignVec, MaskVec);

  // Vectors are cast back to the FP type; scalars are pulled out of the
  // sub-register they were inserted into.
  if (IsVector)
    return DAG.getNode(ISD::BITCAST, DL, VT, Sel);
  return DAG.getTargetExtractSubreg(SubRegIdx, DL, VT, Sel);
}
/// Custom lowering for CTPOP through AdvSIMD. Returns an empty SDValue when
/// the function carries the NoImplicitFloat attribute or the subtarget has no
/// NEON.
SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
  const bool NoImplicitFloat =
      DAG.getMachineFunction().getFunction().hasFnAttribute(
          Attribute::NoImplicitFloat);
  if (NoImplicitFloat || !Subtarget->hasNEON())
    return SDValue();

  // There is no integer popcount instruction, but as long as copies to/from
  // the AdvSIMD registers are cheap the following sequence is more efficient:
  //     FMOV    D0, X0        // copy 64-bit int to vector, high bits zero'd
  //     CNT     V0.8B, V0.8B  // 8xbyte pop-counts
  //     ADDV    B0, V0.8B     // sum 8xbyte pop-counts
  //     UMOV    X0, V0.B[0]   // copy byte result back to integer reg
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue Src = Op.getOperand(0);

  const bool IsI32 = VT == MVT::i32;
  const bool IsI64 = VT == MVT::i64;
  if (IsI32 || IsI64) {
    // Scalar: widen to 64 bits if needed, count per byte, then sum the bytes.
    if (IsI32)
      Src = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Src);
    Src = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Src);

    SDValue ByteCounts = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Src);
    SDValue Sum = DAG.getNode(
        ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
        DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32),
        ByteCounts);

    if (IsI64)
      Sum = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Sum);
    return Sum;
  }

  assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
          VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
         "Unexpected type for custom ctpop lowering");

  // Vector: count bits per byte first ...
  const bool Is64Bit = VT.is64BitVector();
  EVT ByteVT = Is64Bit ? MVT::v8i8 : MVT::v16i8;
  SDValue Counts = DAG.getBitcast(ByteVT, Src);
  Counts = DAG.getNode(ISD::CTPOP, DL, ByteVT, Counts);

  // ... then widen the v8i8/v16i8 result to VT by repeated pairwise widening
  // adds, doubling the element size and halving the element count each round.
  unsigned CurEltBits = 8;
  unsigned CurNumElts = Is64Bit ? 8 : 16;
  while (CurEltBits != VT.getScalarSizeInBits()) {
    CurEltBits *= 2;
    CurNumElts /= 2;
    MVT StepVT = MVT::getVectorVT(MVT::getIntegerVT(CurEltBits), CurNumElts);
    Counts = DAG.getNode(
        ISD::INTRINSIC_WO_CHAIN, DL, StepVT,
        DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Counts);
  }
  return Counts;
}
/// Custom lowering for SETCC. Vector results are forwarded to LowerVSETCC;
/// scalar results are produced by CSEL nodes that choose between the
/// constants 1 and 0 based on an AArch64 comparison.
SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
  if (Op.getValueType().isVector())
    return LowerVSETCC(Op, DAG);

  SDLoc dl(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();

  // We chose ZeroOrOneBooleanContents, so "true" is one and "false" is zero.
  EVT VT = Op.getValueType();
  SDValue One = DAG.getConstant(1, dl, VT);
  SDValue Zero = DAG.getConstant(0, dl, VT);

  // f128 goes first: softening may leave behind a plain integer comparison,
  // which is then picked up by the integer path below.
  if (LHS.getValueType() == MVT::f128) {
    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);

    // A null RHS means softening already produced the final scalar in LHS.
    if (!RHS.getNode()) {
      assert(LHS.getValueType() == Op.getValueType() &&
             "Unexpected setcc expansion!");
      return LHS;
    }
  }

  if (LHS.getValueType().isInteger()) {
    // Compare with the inverted condition and swap the true/false operands of
    // the CSEL to compensate. This lets the setcc match a single CSINC.
    ISD::CondCode InvCC = ISD::getSetCCInverse(CC, true);
    SDValue CCVal;
    SDValue Cmp = getAArch64Cmp(LHS, RHS, InvCC, CCVal, DAG, dl);
    return DAG.getNode(AArch64ISD::CSEL, dl, VT, Zero, One, CCVal, Cmp);
  }

  // Only floating-point operands are left.
  assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
         LHS.getValueType() == MVT::f64);

  // Emit the FP comparison and map the condition onto AArch64 codes.
  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
  AArch64CC::CondCode CC1, CC2;
  changeFPCCToAArch64CC(CC, CC1, CC2);

  if (CC2 != AArch64CC::AL) {
    // The mapping of LLVM FP CC's onto AArch64 CC's isn't totally clean; this
    // one needs two CSELs. The second uses the output of the first as its
    // false operand, effectively OR'ing the two conditions together.
    // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
    SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
    SDValue First =
        DAG.getNode(AArch64ISD::CSEL, dl, VT, One, Zero, CC1Val, Cmp);
    SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
    return DAG.getNode(AArch64ISD::CSEL, dl, VT, One, First, CC2Val, Cmp);
  }

  // A single condition suffices. As in the integer case, use the inverted
  // condition with swapped operands so a single CSINC can be matched.
  changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2);
  SDValue InvCCVal = DAG.getConstant(CC1, dl, MVT::i32);
  return DAG.getNode(AArch64ISD::CSEL, dl, VT, Zero, One, InvCCVal, Cmp);
}
/// Lower a select on a comparison: produce \p TVal when "\p LHS \p CC \p RHS"
/// holds and \p FVal otherwise.
///
/// f128 comparisons are softened first and f16 comparisons are extended to
/// f32 when the subtarget lacks full FP16 support. Integer comparisons yield a
/// single CSEL/CSINV/CSNEG/CSINC node (operands may be swapped and the
/// condition inverted to make one of the latter forms applicable).
/// Floating-point comparisons yield one or two CSEL nodes.
///
/// \param CC   condition code of the comparison.
/// \param LHS  left comparison operand.
/// \param RHS  right comparison operand.
/// \param TVal value selected when the comparison is true.
/// \param FVal value selected when the comparison is false.
/// \param dl   debug location attached to the created nodes.
/// \param DAG  DAG in which the nodes are created.
/// \returns the node computing the selected value.
SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
                                              SDValue RHS, SDValue TVal,
                                              SDValue FVal, const SDLoc &dl,
                                              SelectionDAG &DAG) const {
  // Handle f128 first, because it will result in a comparison of some RTLIB
  // call result against zero.
  if (LHS.getValueType() == MVT::f128) {
    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);

    // If softenSetCCOperands returned a scalar, we need to compare the result
    // against zero to select between true and false values.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  // Also handle f16, for which we need to do a f32 comparison.
  if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
    LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
    RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
  }

  // Next, handle integers.
  if (LHS.getValueType().isInteger()) {
    assert((LHS.getValueType() == RHS.getValueType()) &&
           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));

    unsigned Opcode = AArch64ISD::CSEL;

    // If both the TVal and the FVal are constants, see if we can swap them in
    // order to form a CSINV or CSINC out of them.
    ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
    ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);

    if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
      std::swap(TVal, FVal);
      std::swap(CTVal, CFVal);
      CC = ISD::getSetCCInverse(CC, true);
    } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
      std::swap(TVal, FVal);
      std::swap(CTVal, CFVal);
      CC = ISD::getSetCCInverse(CC, true);
    } else if (TVal.getOpcode() == ISD::XOR) {
      // If TVal is a NOT we want to swap TVal and FVal so that we can match
      // with a CSINV rather than a CSEL.
      if (isAllOnesConstant(TVal.getOperand(1))) {
        std::swap(TVal, FVal);
        std::swap(CTVal, CFVal);
        CC = ISD::getSetCCInverse(CC, true);
      }
    } else if (TVal.getOpcode() == ISD::SUB) {
      // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
      // that we can match with a CSNEG rather than a CSEL.
      if (isNullConstant(TVal.getOperand(0))) {
        std::swap(TVal, FVal);
        std::swap(CTVal, CFVal);
        CC = ISD::getSetCCInverse(CC, true);
      }
    } else if (CTVal && CFVal) {
      const int64_t TrueVal = CTVal->getSExtValue();
      const int64_t FalseVal = CFVal->getSExtValue();
      // Unsigned copies for the negation/increment checks below. Doing that
      // arithmetic on int64_t would be undefined behaviour when a constant is
      // INT64_MIN or INT64_MAX; unsigned arithmetic wraps and yields the same
      // bit pattern for the equality tests.
      const uint64_t TrueValU = static_cast<uint64_t>(TrueVal);
      const uint64_t FalseValU = static_cast<uint64_t>(FalseVal);
      bool Swap = false;

      // If both TVal and FVal are constants, see if FVal is the
      // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
      // instead of a CSEL in that case.
      if (TrueVal == ~FalseVal) {
        Opcode = AArch64ISD::CSINV;
      } else if (TrueValU == uint64_t(0) - FalseValU) {
        Opcode = AArch64ISD::CSNEG;
      } else if (TVal.getValueType() == MVT::i32) {
        // If our operands are only 32-bit wide, make sure we use 32-bit
        // arithmetic for the check whether we can use CSINC. This ensures that
        // the addition in the check will wrap around properly in case there is
        // an overflow (which would not be the case if we do the check with
        // 64-bit arithmetic).
        const uint32_t TrueVal32 = CTVal->getZExtValue();
        const uint32_t FalseVal32 = CFVal->getZExtValue();

        if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
          Opcode = AArch64ISD::CSINC;

          if (TrueVal32 > FalseVal32) {
            Swap = true;
          }
        }
        // 64-bit check whether we can use CSINC.
      } else if ((TrueValU == FalseValU + 1) || (TrueValU + 1 == FalseValU)) {
        Opcode = AArch64ISD::CSINC;

        if (TrueVal > FalseVal) {
          Swap = true;
        }
      }

      // Swap TVal and FVal if necessary.
      if (Swap) {
        std::swap(TVal, FVal);
        std::swap(CTVal, CFVal);
        CC = ISD::getSetCCInverse(CC, true);
      }

      if (Opcode != AArch64ISD::CSEL) {
        // Drop FVal since we can get its value by simply inverting/negating
        // TVal.
        FVal = TVal;
      }
    }

    // Avoid materializing a constant when possible by reusing a known value in
    // a register.  However, don't perform this optimization if the known value
    // is one, zero or negative one in the case of a CSEL.  We can always
    // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
    // FVal, respectively.
    ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
    if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
        !RHSVal->isNullValue() && !RHSVal->isAllOnesValue()) {
      AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
      // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
      // "a != C ? x : a" to avoid materializing C.
      if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
        TVal = LHS;
      else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
        FVal = LHS;
    } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
      assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
      // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
      // avoid materializing C.
      AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
      if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
        Opcode = AArch64ISD::CSINV;
        TVal = LHS;
        FVal = DAG.getConstant(0, dl, FVal.getValueType());
      }
    }

    SDValue CCVal;
    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
    EVT VT = TVal.getValueType();
    return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
  }

  // Now we know we're dealing with FP values.
  assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
         LHS.getValueType() == MVT::f64);
  assert(LHS.getValueType() == RHS.getValueType());
  EVT VT = TVal.getValueType();
  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);

  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
  // clean.  Some of them require two CSELs to implement.
  AArch64CC::CondCode CC1, CC2;
  changeFPCCToAArch64CC(CC, CC1, CC2);

  if (DAG.getTarget().Options.UnsafeFPMath) {
    // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
    // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
    ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
    if (RHSVal && RHSVal->isZero()) {
      ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
      ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);

      if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
          CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
        TVal = LHS;
      else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
               CFVal && CFVal->isZero() &&
               FVal.getValueType() == LHS.getValueType())
        FVal = LHS;
    }
  }

  // Emit first, and possibly only, CSEL.
  SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
  SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);

  // If we need a second CSEL, emit it, using the output of the first as the
  // RHS.  We're effectively OR'ing the two CC's together.
  if (CC2 != AArch64CC::AL) {
    SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
    return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
  }

  // Otherwise, return the output of the first CSEL.
  return CS1;
}
/// Custom lowering for a SELECT_CC node: unpack its operands (compare LHS,
/// compare RHS, true value, false value, condition code) and defer to the
/// operand-wise LowerSELECT_CC overload.
SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue CmpLHS = Op.getOperand(0);
  SDValue CmpRHS = Op.getOperand(1);
  SDValue TrueVal = Op.getOperand(2);
  SDValue FalseVal = Op.getOperand(3);
  ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  return LowerSELECT_CC(Cond, CmpLHS, CmpRHS, TrueVal, FalseVal, DL, DAG);
}
/// Custom lowering for SELECT. A condition that is the overflow result of a
/// {s|u}{add|sub|mul}.with.overflow operation is turned directly into a CSEL
/// on the overflow flag; every other select is lowered the same way as a
/// SELECT_CC node.
SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Cond = Op->getOperand(0);
  SDValue TVal = Op->getOperand(1);
  SDValue FVal = Op->getOperand(2);

  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
  // instruction.
  if (isOverflowIntrOpRes(Cond)) {
    // Only lower legal XALUO ops.
    if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
      return SDValue();

    AArch64CC::CondCode OFCC;
    SDValue Value, Overflow;
    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, Cond.getValue(0), DAG);
    SDValue OFCCVal = DAG.getConstant(OFCC, DL, MVT::i32);

    return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
                       OFCCVal, Overflow);
  }

  // Otherwise derive a comparison from the condition: reuse the operands of a
  // SETCC, or test an arbitrary condition value against zero with SETNE.
  SDValue LHS, RHS;
  ISD::CondCode CC;
  if (Cond.getOpcode() != ISD::SETCC) {
    LHS = Cond;
    RHS = DAG.getConstant(0, DL, Cond.getValueType());
    CC = ISD::SETNE;
  } else {
    LHS = Cond.getOperand(0);
    RHS = Cond.getOperand(1);
    CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
  }
  return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
}
/// Custom lowering for a JumpTable node: produce the address of the jump
/// table with the addressing sequence that matches the code model.
SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
                                              SelectionDAG &DAG) const {
  // Jump table entries are PC relative offsets, so no additional tweaking is
  // necessary here; only the table address is needed.
  JumpTableSDNode *Table = cast<JumpTableSDNode>(Op);
  const auto CM = getTargetMachine().getCodeModel();

  if (CM == CodeModel::Large && !Subtarget->isTargetMachO())
    return getAddrLarge(Table, DAG);
  if (CM == CodeModel::Tiny)
    return getAddrTiny(Table, DAG);
  return getAddr(Table, DAG);
}
/// Custom lowering for BR_JT: compute the branch destination with a
/// JumpTableDest32 machine node (fed the table, the entry index and the
/// target jump-table index) and branch to it indirectly with BRIND.
SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
                                          SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Table = Op.getOperand(1);
  SDValue Index = Op.getOperand(2);

  const int TableIdx = cast<JumpTableSDNode>(Table.getNode())->getIndex();
  SDValue TargetTable = DAG.getTargetJumpTable(TableIdx, MVT::i32);

  // Result 0 of the machine node is the value passed on to BRIND.
  SDNode *DestNode = DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64,
                                        MVT::i64, Table, Index, TargetTable);
  return DAG.getNode(ISD::BRIND, DL, MVT::Other, Chain, SDValue(DestNode, 0));
}
/// Custom lowering for a ConstantPool node: produce the address of the
/// constant-pool entry with the sequence that matches the code model.
SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
                                                 SelectionDAG &DAG) const {
  ConstantPoolSDNode *Pool = cast<ConstantPoolSDNode>(Op);
  const auto CM = getTargetMachine().getCodeModel();

  if (CM == CodeModel::Tiny)
    return getAddrTiny(Pool, DAG);
  if (CM != CodeModel::Large)
    return getAddr(Pool, DAG);

  // Large code model: MachO targets go through the GOT, all others use the
  // large addressing sequence.
  if (Subtarget->isTargetMachO())
    return getGOT(Pool, DAG);
  return getAddrLarge(Pool, DAG);
}
/// Custom lowering for a BlockAddress node: produce the block's address with
/// the sequence that matches the code model.
SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
                                                 SelectionDAG &DAG) const {
  BlockAddressSDNode *Block = cast<BlockAddressSDNode>(Op);

  switch (getTargetMachine().getCodeModel()) {
  case CodeModel::Large:
    // The large sequence is not used on MachO; fall back to the default one.
    if (!Subtarget->isTargetMachO())
      return getAddrLarge(Block, DAG);
    break;
  case CodeModel::Tiny:
    return getAddrTiny(Block, DAG);
  default:
    break;
  }
  return getAddr(Block, DAG);
}
/// VASTART lowering used for Darwin: store the frame index of the variadic
/// stack area through the va_list pointer (operand 1), chained on operand 0.
SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
                                                   SelectionDAG &DAG) const {
  SDLoc DL(Op);
  AArch64FunctionInfo *FI =
      DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();

  const auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue StackAddr = DAG.getFrameIndex(FI->getVarArgsStackIndex(), PtrVT);

  const Value *VAListSV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  SDValue Chain = Op.getOperand(0);
  SDValue VAListPtr = Op.getOperand(1);
  return DAG.getStore(Chain, DL, StackAddr, VAListPtr,
                      MachinePointerInfo(VAListSV));
}
/// VASTART lowering used for the Win64 calling convention: store a single
/// frame-index address through the va_list pointer (operand 1). The address
/// is the GPR save area when any variadic GPRs were saved, and the variadic
/// stack area otherwise.
SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SDLoc DL(Op);
  AArch64FunctionInfo *FI =
      DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();

  int FrameIdx;
  if (FI->getVarArgsGPRSize() > 0)
    FrameIdx = FI->getVarArgsGPRIndex();
  else
    FrameIdx = FI->getVarArgsStackIndex();

  SDValue StartAddr =
      DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));

  const Value *VAListSV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  SDValue Chain = Op.getOperand(0);
  SDValue VAListPtr = Op.getOperand(1);
  return DAG.getStore(Chain, DL, StartAddr, VAListPtr,
                      MachinePointerInfo(VAListSV));
}
/// VASTART lowering for the AAPCS64 va_list struct. Operand 0 is the incoming
/// chain, operand 1 the address of the va_list and operand 2 its source value.
/// Up to five independent stores (all chained on the incoming chain) fill in
/// the struct fields; they are joined by the TokenFactor that is returned.
SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
SelectionDAG &DAG) const {
// The layout of the va_list struct is specified in the AArch64 Procedure Call
// Standard, section B.3.
MachineFunction &MF = DAG.getMachineFunction();
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc DL(Op);
SDValue Chain = Op.getOperand(0);
SDValue VAList = Op.getOperand(1);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
// One entry per field store; merged into a single TokenFactor at the end.
SmallVector<SDValue, 4> MemOps;
// void *__stack at offset 0
// Always written: the address of the variadic stack area.
SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
MachinePointerInfo(SV), /* Alignment = */ 8));
// void *__gr_top at offset 8
// Only written when GPRs were saved: frame index of the GPR save area plus
// its size.
int GPRSize = FuncInfo->getVarArgsGPRSize();
if (GPRSize > 0) {
SDValue GRTop, GRTopAddr;
GRTopAddr =
DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(8, DL, PtrVT));
GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
DAG.getConstant(GPRSize, DL, PtrVT));
MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
MachinePointerInfo(SV, 8),
/* Alignment = */ 8));
}
// void *__vr_top at offset 16
// Only written when FPRs were saved: frame index of the FPR save area plus
// its size.
int FPRSize = FuncInfo->getVarArgsFPRSize();
if (FPRSize > 0) {
SDValue VRTop, VRTopAddr;
VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
DAG.getConstant(16, DL, PtrVT));
VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
DAG.getConstant(FPRSize, DL, PtrVT));
MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
MachinePointerInfo(SV, 16),
/* Alignment = */ 8));
}
// int __gr_offs at offset 24
// Always written: the negated GPR save area size as an i32.
SDValue GROffsAddr =
DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT));
MemOps.push_back(DAG.getStore(
Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), GROffsAddr,
MachinePointerInfo(SV, 24), /* Alignment = */ 4));
// int __vr_offs at offset 28
// Always written: the negated FPR save area size as an i32.
SDValue VROffsAddr =
DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT));
MemOps.push_back(DAG.getStore(
Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), VROffsAddr,
MachinePointerInfo(SV, 28), /* Alignment = */ 4));
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
/// Custom lowering for VASTART: dispatch to the Win64, Darwin or AAPCS
/// variant. The Win64 calling-convention check takes precedence over the
/// Darwin target check.
SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
                                            SelectionDAG &DAG) const {
  const Function &Fn = DAG.getMachineFunction().getFunction();

  if (Subtarget->isCallingConvWin64(Fn.getCallingConv()))
    return LowerWin64_VASTART(Op, DAG);
  if (Subtarget->isTargetDarwin())
    return LowerDarwin_VASTART(Op, DAG);
  return LowerAAPCS_VASTART(Op, DAG);
}
/// Custom lowering for VACOPY: copy the whole va_list object with a memcpy
/// from operand 2 to operand 1, chained on operand 0.
SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);

  // The AAPCS va_list has three pointers and two ints (= 32 bytes); on Darwin
  // and Windows targets it is copied as a single 8-byte pointer.
  const bool IsSinglePointer =
      Subtarget->isTargetDarwin() || Subtarget->isTargetWindows();
  const unsigned VaListSize = IsSinglePointer ? 8 : 32;

  const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();

  SDValue Chain = Op.getOperand(0);
  SDValue Dest = Op.getOperand(1);
  SDValue Src = Op.getOperand(2);
  SDValue Size = DAG.getConstant(VaListSize, DL, MVT::i32);
  return DAG.getMemcpy(Chain, DL, Dest, Src, Size, 8, false, false, false,
                       MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
}
/// Custom lowering for VAARG (asserted to be Darwin only). Loads the current
/// argument pointer from the va_list (operand 1), realigns it when the
/// requested alignment (operand 3) exceeds 8, stores the pointer advanced by
/// the argument size back to the va_list, and finally loads the argument from
/// the unadvanced pointer. The returned node carries the value and the chain.
SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
assert(Subtarget->isTargetDarwin() &&
"automatic va_arg instruction only works on Darwin");
const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
EVT VT = Op.getValueType();
SDLoc DL(Op);
SDValue Chain = Op.getOperand(0);
SDValue Addr = Op.getOperand(1);
unsigned Align = Op.getConstantOperandVal(3);
auto PtrVT = getPointerTy(DAG.getDataLayout());
// Load the current argument pointer and continue on the load's chain result.
SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V));
Chain = VAList.getValue(1);
// Round the pointer up to Align: (VAList + Align - 1) & -Align.
if (Align > 8) {
assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2");
VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
DAG.getConstant(Align - 1, DL, PtrVT));
VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
DAG.getConstant(-(int64_t)Align, DL, PtrVT));
}
Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
uint64_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
// Scalar integer and FP values smaller than 64 bits are implicitly extended
// up to 64 bits. At the very least, we have to increase the striding of the
// vaargs list to match this, and for FP values we need to introduce
// FP_ROUND nodes as well.
if (VT.isInteger() && !VT.isVector())
ArgSize = 8;
bool NeedFPTrunc = false;
if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
ArgSize = 8;
NeedFPTrunc = true;
}
// Increment the pointer, VAList, to the next vaarg
SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
DAG.getConstant(ArgSize, DL, PtrVT));
// Store the incremented VAList to the legalized pointer
SDValue APStore =
DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
// Load the actual argument out of the pointer VAList
// (both loads below are chained on the store of the updated pointer).
if (NeedFPTrunc) {
// Load the value as an f64.
SDValue WideFP =
DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
// Round the value down to the requested type VT.
SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
DAG.getIntPtrConstant(1, DL));
SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
// Merge the rounded value with the chain output of the load.
return DAG.getMergeValues(Ops, DL);
}
return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
}
/// Custom lowering for FRAMEADDR: start from the FP register and follow the
/// chain of saved frame pointers "depth" (operand 0) times. Also records in
/// the frame info that the frame address is taken.
SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
                                              SelectionDAG &DAG) const {
  DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);

  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  const unsigned Depth =
      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
  // Each level loads the next frame address from the current one.
  for (unsigned Level = 0; Level != Depth; ++Level)
    Frame =
        DAG.getLoad(VT, DL, DAG.getEntryNode(), Frame, MachinePointerInfo());
  return Frame;
}
/// Custom lowering for SPONENTRY: create a fixed stack object (size 4,
/// offset 0) and return its frame index as a pointer-typed value.
SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());

  MachineFrameInfo &FrameInfo = DAG.getMachineFunction().getFrameInfo();
  const int EntryObj = FrameInfo.CreateFixedObject(4, 0, false);
  return DAG.getFrameIndex(EntryObj, PtrVT);
}
#define GET_REGISTER_MATCHER
#include "AArch64GenAsmMatcher.inc"
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT,
SelectionDAG &DAG) const {
unsigned Reg = MatchRegisterName(RegName);
if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
const MCRegisterInfo *MRI = Subtarget->getRegisterInfo();
unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
if (!Subtarget->isXRegisterReserved(DwarfRegNum))
Reg = 0;
}
if (Reg)
return Reg;
report_fatal_error(Twine("Invalid register name \""
+ StringRef(RegName) + "\"."));
}
/// Custom lowering for ADDROFRETURNADDR: the result is the FP register value
/// plus 8. Also records in the frame info that the frame address is taken.
SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
                                                     SelectionDAG &DAG) const {
  MachineFrameInfo &FrameInfo = DAG.getMachineFunction().getFrameInfo();
  FrameInfo.setFrameAddressIsTaken(true);

  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue FP = DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
  SDValue Eight = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
  return DAG.getNode(ISD::ADD, DL, VT, FP, Eight);
}
/// Custom lowering for RETURNADDR. Depth 0 reads LR (added as a function
/// live-in); a non-zero depth loads from offset 8 of the frame address that
/// LowerFRAMEADDR computes for the same node. Also records in the frame info
/// that the return address is taken.
SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
                                               SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MF.getFrameInfo().setReturnAddressIsTaken(true);

  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  const unsigned Depth =
      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

  if (Depth == 0) {
    // Return LR, which contains the return address. Mark it an implicit
    // live-in.
    unsigned LRReg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
    return DAG.getCopyFromReg(DAG.getEntryNode(), DL, LRReg, VT);
  }

  SDValue Frame = LowerFRAMEADDR(Op, DAG);
  SDValue Eight = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
  SDValue SlotAddr = DAG.getNode(ISD::ADD, DL, VT, Frame, Eight);
  return DAG.getLoad(VT, DL, DAG.getEntryNode(), SlotAddr,
                     MachinePointerInfo());
}
/// LowerShiftRightParts - Lower SRA_PARTS and SRL_PARTS, which return two
/// i64 values and take a 2 x i64 value to shift plus a shift amount.
/// Operands are (lo, hi, shift amount); the merged result is { Lo, Hi }.
/// Both halves are computed for the "normal" case (amount below the part
/// width) and for the "big" case (amount at or above it), and a CSEL on
/// "amount - width >= 0" chooses between them.
SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
EVT VT = Op.getValueType();
unsigned VTBits = VT.getSizeInBits();
SDLoc dl(Op);
SDValue ShOpLo = Op.getOperand(0);
SDValue ShOpHi = Op.getOperand(1);
SDValue ShAmt = Op.getOperand(2);
// Arithmetic shift for SRA_PARTS, logical shift for SRL_PARTS.
unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
// Bits of the high part that move into the low part: hi << (width - amt).
SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
// Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which
// is "undef". We wanted 0, so CSEL it directly.
SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
ISD::SETEQ, dl, DAG);
SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
HiBitsForLo =
DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
HiBitsForLo, CCVal, Cmp);
// ExtraShAmt = amt - width; it is non-negative exactly in the "big" case.
SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
DAG.getConstant(VTBits, dl, MVT::i64));
SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
SDValue LoForNormalShift =
DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo);
// From here on Cmp/CCVal describe "ExtraShAmt >= 0" and are shared by the
// selects for both result halves.
Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
dl, DAG);
CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
LoForNormalShift, CCVal, Cmp);
// AArch64 shifts larger than the register width are wrapped rather than
// clamped, so we can't just emit "hi >> x".
SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
// Big case: SRA shifts the high part by width - 1, SRL yields zero.
SDValue HiForBigShift =
Opc == ISD::SRA
? DAG.getNode(Opc, dl, VT, ShOpHi,
DAG.getConstant(VTBits - 1, dl, MVT::i64))
: DAG.getConstant(0, dl, VT);
SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
HiForNormalShift, CCVal, Cmp);
SDValue Ops[2] = { Lo, Hi };
return DAG.getMergeValues(Ops, dl);
}
/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
/// i64 values and take a 2 x i64 value to shift plus a shift amount.
/// Operands are (lo, hi, shift amount); the merged result is { Lo, Hi }.
/// Both halves are computed for the "normal" case (amount below the part
/// width) and for the "big" case (amount at or above it), and a CSEL on
/// "amount - width >= 0" chooses between them.
SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
EVT VT = Op.getValueType();
unsigned VTBits = VT.getSizeInBits();
SDLoc dl(Op);
SDValue ShOpLo = Op.getOperand(0);
SDValue ShOpHi = Op.getOperand(1);
SDValue ShAmt = Op.getOperand(2);
assert(Op.getOpcode() == ISD::SHL_PARTS);
// Bits of the low part that move into the high part: lo >> (width - amt).
SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
// Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which
// is "undef". We wanted 0, so CSEL it directly.
SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
ISD::SETEQ, dl, DAG);
SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
LoBitsForHi =
DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
LoBitsForHi, CCVal, Cmp);
// ExtraShAmt = amt - width; it is non-negative exactly in the "big" case.
SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
DAG.getConstant(VTBits, dl, MVT::i64));
SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
SDValue HiForNormalShift =
DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi);
SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
// From here on Cmp/CCVal describe "ExtraShAmt >= 0" and are shared by the
// selects for both result halves.
Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
dl, DAG);
CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
HiForNormalShift, CCVal, Cmp);
// AArch64 shifts of larger than register sizes are wrapped rather than
// clamped, so we can't just emit "lo << a" if a is too big.
SDValue LoForBigShift = DAG.getConstant(0, dl, VT);
SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
LoForNormalShift, CCVal, Cmp);
SDValue Ops[2] = { Lo, Hi };
return DAG.getMergeValues(Ops, dl);
}
/// Reports whether an offset may be folded into the given global address
/// node. Always answers "no" on AArch64.
bool AArch64TargetLowering::isOffsetFoldingLegal(
    const GlobalAddressSDNode *GA) const {
  // Folding is deliberately left to the DAG combine, which can look at the
  // uses and pick an offset intelligently; nothing is folded at this point.
  return false;
}
/// Decide whether the floating-point immediate \p Imm of type \p VT is legal,
/// i.e. cheap enough to materialize with instructions.
///
/// An immediate is legal when it is +0.0 or encodable as an fmov immediate
/// (f64, f32, and f16 with full FP16 support). Failing that, f32/f64 values
/// are still legal if expanding the bit pattern as an integer move takes no
/// more than a small number of instructions (1 when \p OptForSize is set,
/// otherwise 5 with literal fusion and 2 without).
bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                         bool OptForSize) const {
  // We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and
  // 16-bit case when target has full fp16 support.
  // FIXME: We should be able to handle f128 as well with a clever lowering.
  const APInt ImmInt = Imm.bitcastToAPInt();
  int FMovEncoding = -1;
  bool FMovCandidate = true;
  if (VT == MVT::f64)
    FMovEncoding = AArch64_AM::getFP64Imm(ImmInt);
  else if (VT == MVT::f32)
    FMovEncoding = AArch64_AM::getFP32Imm(ImmInt);
  else if (VT == MVT::f16 && Subtarget->hasFullFP16())
    FMovEncoding = AArch64_AM::getFP16Imm(ImmInt);
  else
    FMovCandidate = false;
  // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
  //       generate that fmov.

  bool IsLegal = FMovCandidate && (FMovEncoding != -1 || Imm.isPosZero());

  // If we can not materialize in immediate field for fmov, check if the
  // value can be encoded as the immediate operand of a logical instruction.
  // The immediate value will be created with either MOVZ, MOVN, or ORR.
  const bool IsF32OrF64 = VT == MVT::f64 || VT == MVT::f32;
  if (!IsLegal && IsF32OrF64) {
    // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
    // however the mov+fmov sequence is always better because of the reduced
    // cache pressure. The timings are still the same if you consider
    // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
    // movw+movk is fused). So we limit the expansion to 2 instructions at
    // most by default.
    SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
    AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(),
                              Insn);

    unsigned Limit;
    if (OptForSize)
      Limit = 1;
    else if (Subtarget->hasFuseLiterals())
      Limit = 5;
    else
      Limit = 2;
    IsLegal = Insn.size() <= Limit;
  }

  LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT.getEVTString()
                    << " imm value: "; Imm.dump(););
  return IsLegal;
}
//===----------------------------------------------------------------------===//
// AArch64 Optimization Hooks
//===----------------------------------------------------------------------===//
/// Build the initial estimate node \p Opcode for \p Operand.
///
/// Returns an empty SDValue unless the subtarget has NEON and the operand type
/// is one of f64, v1f64, v2f64, f32, v1f32, v2f32 or v4f32. When \p ExtraSteps
/// arrives as ReciprocalEstimate::Unspecified it is replaced by the default
/// number of refinement steps for the element type.
static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
                           SDValue Operand, SelectionDAG &DAG,
                           int &ExtraSteps) {
  const EVT VT = Operand.getValueType();

  if (!ST->hasNEON())
    return SDValue();

  const bool IsDoubleTy =
      VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64;
  const bool IsSingleTy = VT == MVT::f32 || VT == MVT::v1f32 ||
                          VT == MVT::v2f32 || VT == MVT::v4f32;
  if (!IsDoubleTy && !IsSingleTy)
    return SDValue();

  // For the reciprocal estimates, convergence is quadratic, so the number of
  // digits is doubled after each iteration. In ARMv8, the accuracy of the
  // initial estimate is 2^-8. Thus the number of extra steps to refine the
  // result for float (23 mantissa bits) is 2 and for double (52 mantissa
  // bits) is 3.
  if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified)
    ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2;

  return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
}
/// Produce an estimate of sqrt(\p Operand) (or of 1/sqrt when \p Reciprocal is
/// set) built from FRSQRTE plus Newton refinement steps.
///
/// The estimate is only attempted when \p Enabled is
/// ReciprocalEstimate::Enabled, or is Unspecified and the subtarget prefers
/// rsqrt. On success every refinement step has already been emitted, so
/// \p ExtraSteps is reset to 0. An empty SDValue is returned otherwise.
SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
                                               SelectionDAG &DAG, int Enabled,
                                               int &ExtraSteps,
                                               bool &UseOneConst,
                                               bool Reciprocal) const {
  const bool WantEstimate =
      Enabled == ReciprocalEstimate::Enabled ||
      (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt());
  if (!WantEstimate)
    return SDValue();

  SDValue Estimate =
      getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand, DAG, ExtraSteps);
  if (!Estimate)
    return SDValue();

  SDLoc DL(Operand);
  EVT VT = Operand.getValueType();

  SDNodeFlags Flags;
  Flags.setAllowReassociation(true);

  // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
  // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
  int Remaining = ExtraSteps;
  while (Remaining > 0) {
    SDValue Squared = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate, Flags);
    SDValue Correction =
        DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Squared, Flags);
    Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Correction, Flags);
    --Remaining;
  }

  if (!Reciprocal) {
    // sqrt(X) is obtained as X * (1/sqrt(X)).
    EVT CCVT =
        getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
    SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
    SDValue IsZero = DAG.getSetCC(DL, CCVT, Operand, FPZero, ISD::SETEQ);

    Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
    // Correct the result if the operand is 0.0.
    Estimate = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL, VT,
                           IsZero, Operand, Estimate);
  }

  ExtraSteps = 0;
  return Estimate;
}
/// Produce an estimate of 1/\p Operand built from FRECPE plus Newton
/// refinement steps.
///
/// Only attempted when \p Enabled is ReciprocalEstimate::Enabled. On success
/// every refinement step has already been emitted, so \p ExtraSteps is reset
/// to 0. An empty SDValue is returned otherwise.
SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
                                                SelectionDAG &DAG, int Enabled,
                                                int &ExtraSteps) const {
  if (Enabled != ReciprocalEstimate::Enabled)
    return SDValue();

  SDValue Estimate =
      getEstimate(Subtarget, AArch64ISD::FRECPE, Operand, DAG, ExtraSteps);
  if (!Estimate)
    return SDValue();

  SDLoc DL(Operand);
  EVT VT = Operand.getValueType();

  SDNodeFlags Flags;
  Flags.setAllowReassociation(true);

  // Newton reciprocal iteration: E * (2 - X * E)
  // AArch64 reciprocal iteration instruction: (2 - M * N)
  int Remaining = ExtraSteps;
  while (Remaining > 0) {
    SDValue Correction =
        DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand, Estimate, Flags);
    Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Correction, Flags);
    --Remaining;
  }

  ExtraSteps = 0;
  return Estimate;
}
//===----------------------------------------------------------------------===//
// AArch64 Inline Assembly Support
//===----------------------------------------------------------------------===//
// Table of Constraints
// TODO: This is the current set of constraints supported by ARM for the
// compiler, not all of them may make sense.
//
// r - A general register
// w - An FP/SIMD register of some size in the range v0-v31
// x - An FP/SIMD register of some size in the range v0-v15
// I - Constant that can be used with an ADD instruction
// J - Constant that can be used with a SUB instruction
// K - Constant that can be used with a 32-bit logical instruction
// L - Constant that can be used with a 64-bit logical instruction
// M - Constant that can be used as a 32-bit MOV immediate
// N - Constant that can be used as a 64-bit MOV immediate
// Q - A memory reference with base register and no offset
// S - A symbolic address
// Y - Floating point constant zero
// Z - Integer constant zero
//
// Note that general register operands will be output using their 64-bit x
// register name, whatever the size of the variable, unless the asm operand
// is prefixed by the %w modifier. Floating-point and SIMD register operands
// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
// %q modifier.
/// Pick a concrete constraint ("r" or "w") to stand in for the permissive "X"
/// constraint on an operand of type \p ConstraintVT.
const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
  // The constraint has to be lowered to something at this point, so it becomes
  // "r" or "w". That forces the operand into a register even though X is far
  // more permissive.
  //
  // This is correct (anything may be emitted for an unconstrained operand),
  // but it may disappoint users who expect something more efficient.
  if (Subtarget->hasFPARMv8()) {
    // Floating-point values and 64-/128-bit vectors live in FP/SIMD registers.
    if (ConstraintVT.isFloatingPoint() ||
        (ConstraintVT.isVector() && (ConstraintVT.getSizeInBits() == 64 ||
                                     ConstraintVT.getSizeInBits() == 128)))
      return "w";
  }
  return "r";
}
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
AArch64TargetLowering::ConstraintType
AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
// Only single-letter constraints are classified here; anything longer goes
// straight to the generic implementation below.
if (Constraint.size() == 1) {
switch (Constraint[0]) {
default:
break;
- case 'z':
- return C_Other;
// 'x' and 'w' name FP/SIMD registers.
case 'x':
case 'w':
return C_RegisterClass;
// An address with a single base register. Due to the way we
// currently handle addresses it is the same as 'r'.
case 'Q':
return C_Memory;
+ case 'I':
+ case 'J':
+ case 'K':
+ case 'L':
+ case 'M':
+ case 'N':
+ case 'Y':
+ case 'Z':
+ return C_Immediate;
+ case 'z':
case 'S': // A symbolic address
return C_Other;
}
}
// Letters not handled above (and multi-character constraints) are classified
// by the target-independent code.
return TargetLowering::getConstraintType(Constraint);
}
/// Rate how well the operand described by \p info matches the constraint
/// letter that \p constraint points at.
/// \p info must already carry the operand type and the currently selected
/// constraint alternative.
TargetLowering::ConstraintWeight
AArch64TargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  Value *OperandVal = info.CallOperandVal;
  // Without a value no real match can be made, but the constraint is still
  // allowed at the lowest weight.
  if (!OperandVal)
    return CW_Default;

  Type *OperandTy = OperandVal->getType();
  switch (*constraint) {
  case 'x':
  case 'w': {
    // FP/SIMD register constraints only suit floating-point or vector values.
    const bool FitsFPRegister =
        OperandTy->isFloatingPointTy() || OperandTy->isVectorTy();
    return FitsFPRegister ? CW_Register : CW_Invalid;
  }
  case 'z':
    return CW_Constant;
  default:
    return TargetLowering::getSingleConstraintMatchWeight(info, constraint);
  }
}
std::pair<unsigned, const TargetRegisterClass *>
AArch64TargetLowering::getRegForInlineAsmConstraint(
const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'r':
if (VT.getSizeInBits() == 64)
return std::make_pair(0U, &AArch64::GPR64commonRegClass);
return std::make_pair(0U, &AArch64::GPR32commonRegClass);
case 'w':
if (!Subtarget->hasFPARMv8())
break;
if (VT.getSizeInBits() == 16)
return std::make_pair(0U, &AArch64::FPR16RegClass);
if (VT.getSizeInBits() == 32)
return std::make_pair(0U, &AArch64::FPR32RegClass);
if (VT.getSizeInBits() == 64)
return std::make_pair(0U, &AArch64::FPR64RegClass);
if (VT.getSizeInBits() == 128)
return std::make_pair(0U, &AArch64::FPR128RegClass);
break;
// The instructions that this constraint is designed for can
// only take 128-bit registers so just use that regclass.
case 'x':
if (!Subtarget->hasFPARMv8())
break;
if (VT.getSizeInBits() == 128)
return std::make_pair(0U, &AArch64::FPR128_loRegClass);
break;
}
}
if (StringRef("{cc}").equals_lower(Constraint))
return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
// Use the default implementation in TargetLowering to convert the register
// constraint into a member of a register class.
std::pair<unsigned, const TargetRegisterClass *> Res;
Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
// Not found as a standard register?
if (!Res.second) {
unsigned Size = Constraint.size();
if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
int RegNo;
bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
if (!Failed && RegNo >= 0 && RegNo <= 31) {
// v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
// By default we'll emit v0-v31 for this unless there's a modifier where
// we'll emit the correct register as well.
if (VT != MVT::Other && VT.getSizeInBits() == 64) {
Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
Res.second = &AArch64::FPR64RegClass;
} else {
Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
Res.second = &AArch64::FPR128RegClass;
}
}
}
}
if (Res.second && !Subtarget->hasFPARMv8() &&
!AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
!AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
return std::make_pair(0U, nullptr);
return Res;
}
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops.
///
/// Handled here:
///   'z'       - a zero constant, lowered to the XZR/WZR register;
///   'S'       - a global address, block address or external symbol;
///   'I'..'N'  - integer constants validated against the matching immediate
///               encoding and emitted as 64-bit target constants.
/// Any other single-letter constraint is forwarded to the generic
/// TargetLowering implementation. Constraints that are not exactly one
/// character long are ignored.
void AArch64TargetLowering::LowerAsmOperandForConstraint(
    SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
    SelectionDAG &DAG) const {
  SDValue Result;

  // Currently only support length 1 constraints.
  if (Constraint.length() != 1)
    return;

  char ConstraintLetter = Constraint[0];
  switch (ConstraintLetter) {
  default:
    break;

  // This set of constraints deal with valid constants for various instructions.
  // Validate and return a target constant for them if we can.
  case 'z': {
    // 'z' maps to xzr or wzr so it needs an input of 0.
    if (!isNullConstant(Op))
      return;

    if (Op.getValueType() == MVT::i64)
      Result = DAG.getRegister(AArch64::XZR, MVT::i64);
    else
      Result = DAG.getRegister(AArch64::WZR, MVT::i32);
    break;
  }
  case 'S': {
    // An absolute symbolic address or label reference.
    if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
      Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
                                          GA->getValueType(0));
    } else if (const BlockAddressSDNode *BA =
                   dyn_cast<BlockAddressSDNode>(Op)) {
      Result =
          DAG.getTargetBlockAddress(BA->getBlockAddress(), BA->getValueType(0));
    } else if (const ExternalSymbolSDNode *ES =
                   dyn_cast<ExternalSymbolSDNode>(Op)) {
      Result =
          DAG.getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0));
    } else
      return;
    break;
  }

  case 'I':
  case 'J':
  case 'K':
  case 'L':
  case 'M':
  case 'N': {
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
    if (!C)
      return;

    // Grab the value and do some validation.
    uint64_t CVal = C->getZExtValue();
    switch (ConstraintLetter) {
    // The I constraint applies only to simple ADD or SUB immediate operands:
    // i.e. 0 to 4095 with optional shift by 12
    // The J constraint applies only to ADD or SUB immediates that would be
    // valid when negated, i.e. if [an add pattern] were to be output as a SUB
    // instruction [or vice versa], in other words -1 to -4095 with optional
    // left shift by 12.
    case 'I':
      if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
        break;
      return;
    case 'J': {
      // Negate in unsigned arithmetic: negating the signed value directly
      // overflows (undefined behavior) when the constant is INT64_MIN. For
      // every other value the result is unchanged.
      uint64_t NVal = 0 - static_cast<uint64_t>(C->getSExtValue());
      if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
        CVal = C->getSExtValue();
        break;
      }
      return;
    }
    // The K and L constraints apply *only* to logical immediates, including
    // what used to be the MOVI alias for ORR (though the MOVI alias has now
    // been removed and MOV should be used). So these constraints have to
    // distinguish between bit patterns that are valid 32-bit or 64-bit
    // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
    // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
    // versa.
    case 'K':
      if (AArch64_AM::isLogicalImmediate(CVal, 32))
        break;
      return;
    case 'L':
      if (AArch64_AM::isLogicalImmediate(CVal, 64))
        break;
      return;
    // The M and N constraints are a superset of K and L respectively, for use
    // with the MOV (immediate) alias. As well as the logical immediates they
    // also match 32 or 64-bit immediates that can be loaded either using a
    // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca
    // (M) or 64-bit 0x1234000000000000 (N) etc.
    // As a note some of this code is liberally stolen from the asm parser.
    case 'M': {
      if (!isUInt<32>(CVal))
        return;
      if (AArch64_AM::isLogicalImmediate(CVal, 32))
        break;
      // A single MOVZ: only one 16-bit chunk is non-zero.
      if ((CVal & 0xFFFF) == CVal)
        break;
      if ((CVal & 0xFFFF0000ULL) == CVal)
        break;
      // A single MOVN: only one 16-bit chunk of the 32-bit complement is
      // non-zero.
      uint64_t NCVal = ~(uint32_t)CVal;
      if ((NCVal & 0xFFFFULL) == NCVal)
        break;
      if ((NCVal & 0xFFFF0000ULL) == NCVal)
        break;
      return;
    }
    case 'N': {
      if (AArch64_AM::isLogicalImmediate(CVal, 64))
        break;
      // A single MOVZ: only one 16-bit chunk is non-zero.
      if ((CVal & 0xFFFFULL) == CVal)
        break;
      if ((CVal & 0xFFFF0000ULL) == CVal)
        break;
      if ((CVal & 0xFFFF00000000ULL) == CVal)
        break;
      if ((CVal & 0xFFFF000000000000ULL) == CVal)
        break;
      // A single MOVN: only one 16-bit chunk of the complement is non-zero.
      uint64_t NCVal = ~CVal;
      if ((NCVal & 0xFFFFULL) == NCVal)
        break;
      if ((NCVal & 0xFFFF0000ULL) == NCVal)
        break;
      if ((NCVal & 0xFFFF00000000ULL) == NCVal)
        break;
      if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
        break;
      return;
    }
    default:
      return;
    }

    // All assembler immediates are 64-bit integers.
    Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }

  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
//===----------------------------------------------------------------------===//
// AArch64 Advanced SIMD Support
//===----------------------------------------------------------------------===//
/// WidenVector - Take a value that lives in the V64 register class and return
/// the equivalent value in the V128 register class. The result has twice as
/// many elements; the input is inserted at index 0 of an UNDEF wide vector.
static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
  const SDLoc DL(V64Reg);
  const EVT NarrowVT = V64Reg.getValueType();
  const unsigned NarrowElts = NarrowVT.getVectorNumElements();
  const MVT EltTy = NarrowVT.getVectorElementType().getSimpleVT();
  const MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowElts);

  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
                     V64Reg, DAG.getConstant(0, DL, MVT::i32));
}
/// getExtFactor - Return the factor by which an element position must be
/// scaled when generating an "extract from vector registers" instruction:
/// the size of one element of \p V in bytes.
static unsigned getExtFactor(SDValue &V) {
  const unsigned EltBits =
      V.getValueType().getVectorElementType().getSizeInBits();
  return EltBits / 8;
}
/// NarrowVector - Take a value that lives in the V128 register class and
/// return the equivalent value in the V64 register class, i.e. the dsub
/// subregister, typed as a vector with half as many elements.
static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
  const SDLoc DL(V128Reg);
  const EVT WideVT = V128Reg.getValueType();
  const unsigned WideElts = WideVT.getVectorNumElements();
  const MVT EltTy = WideVT.getVectorElementType().getSimpleVT();
  const MVT NarrowTy = MVT::getVectorVT(EltTy, WideElts / 2);

  return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg);
}
// Gather data to see if the operation can be modelled as a
// shuffle in combination with VEXTs.
//
// Op must be a BUILD_VECTOR. On success the result is a BITCAST (to Op's
// type) of a VECTOR_SHUFFLE over at most two source vectors. An empty SDValue
// is returned when an operand is not a constant-index EXTRACT_VECTOR_ELT (or
// undef), when more than two sources are involved, when a source spans too
// many lanes, or when the final mask is not legal.
SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
SDLoc dl(Op);
EVT VT = Op.getValueType();
unsigned NumElts = VT.getVectorNumElements();
// Bookkeeping for one distinct source vector of the BUILD_VECTOR.
struct ShuffleSourceInfo {
SDValue Vec;
// Smallest and largest lane of Vec referenced by the BUILD_VECTOR.
unsigned MinElt;
unsigned MaxElt;
// We may insert some combination of BITCASTs and VEXT nodes to force Vec to
// be compatible with the shuffle we intend to construct. As a result
// ShuffleVec will be some sliding window into the original Vec.
SDValue ShuffleVec;
// Code should guarantee that element i in Vec starts at element "WindowBase
// + i * WindowScale in ShuffleVec".
int WindowBase;
int WindowScale;
ShuffleSourceInfo(SDValue Vec)
: Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
// Lets find() locate an entry by its original source vector.
bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
};
// First gather all vectors used as an immediate source for this BUILD_VECTOR
// node.
SmallVector<ShuffleSourceInfo, 2> Sources;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
if (V.isUndef())
continue;
else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(V.getOperand(1))) {
LLVM_DEBUG(
dbgs() << "Reshuffle failed: "
"a shuffle can only come from building a vector from "
"various elements of other vectors, provided their "
"indices are constant\n");
return SDValue();
}
// Add this element source to the list if it's not already there.
SDValue SourceVec = V.getOperand(0);
auto Source = find(Sources, SourceVec);
if (Source == Sources.end())
Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
// Update the minimum and maximum lane number seen.
unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
Source->MinElt = std::min(Source->MinElt, EltNo);
Source->MaxElt = std::max(Source->MaxElt, EltNo);
}
if (Sources.size() > 2) {
LLVM_DEBUG(
dbgs() << "Reshuffle failed: currently only do something sane when at "
"most two source vectors are involved\n");
return SDValue();
}
// Find out the smallest element size among result and two sources, and use
// it as element size to build the shuffle_vector.
EVT SmallestEltTy = VT.getVectorElementType();
for (auto &Source : Sources) {
EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
if (SrcEltTy.bitsLT(SmallestEltTy)) {
SmallestEltTy = SrcEltTy;
}
}
// Number of shuffle lanes that make up one element of the result type.
unsigned ResMultiplier =
VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
// From here on NumElts counts lanes of the shuffle type, not of VT.
NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
// If the source vector is too wide or too narrow, we may nevertheless be able
// to construct a compatible shuffle either by concatenating it with UNDEF or
// extracting a suitable range of elements.
for (auto &Src : Sources) {
EVT SrcVT = Src.ShuffleVec.getValueType();
if (SrcVT.getSizeInBits() == VT.getSizeInBits())
continue;
// This stage of the search produces a source with the same element type as
// the original, but with a total width matching the BUILD_VECTOR output.
EVT EltVT = SrcVT.getVectorElementType();
unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits());
// We can pad out the smaller vector for free, so if it's part of a
// shuffle...
Src.ShuffleVec =
DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
DAG.getUNDEF(Src.ShuffleVec.getValueType()));
continue;
}
assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits());
if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
LLVM_DEBUG(
dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
return SDValue();
}
if (Src.MinElt >= NumSrcElts) {
// The extraction can just take the second half
Src.ShuffleVec =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
DAG.getConstant(NumSrcElts, dl, MVT::i64));
// Lane i of Vec is now lane i - NumSrcElts of ShuffleVec.
Src.WindowBase = -NumSrcElts;
} else if (Src.MaxElt < NumSrcElts) {
// The extraction can just take the first half
Src.ShuffleVec =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
DAG.getConstant(0, dl, MVT::i64));
} else {
// An actual VEXT is needed
SDValue VEXTSrc1 =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
DAG.getConstant(0, dl, MVT::i64));
SDValue VEXTSrc2 =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
DAG.getConstant(NumSrcElts, dl, MVT::i64));
// The EXT immediate is MinElt scaled by getExtFactor of the source.
unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
VEXTSrc2,
DAG.getConstant(Imm, dl, MVT::i32));
// The window now starts at the smallest lane that is referenced.
Src.WindowBase = -Src.MinElt;
}
}
// Another possible incompatibility occurs from the vector element types. We
// can fix this by bitcasting the source vectors to the same type we intend
// for the shuffle.
for (auto &Src : Sources) {
EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
if (SrcEltTy == SmallestEltTy)
continue;
assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
// Each original element now covers WindowScale shuffle lanes, so the base
// offset is rescaled to match.
Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
Src.WindowBase *= Src.WindowScale;
}
// Final sanity check before we try to actually produce a shuffle.
LLVM_DEBUG(for (auto Src
: Sources)
assert(Src.ShuffleVec.getValueType() == ShuffleVT););
// The stars all align, our next step is to produce the mask for the shuffle.
// Lanes that are never written stay -1 (undef).
SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
SDValue Entry = Op.getOperand(i);
if (Entry.isUndef())
continue;
auto Src = find(Sources, Entry.getOperand(0));
int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
// EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
// trunc. So only std::min(SrcBits, DestBits) actually get defined in this
// segment.
EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
int BitsDefined =
std::min(OrigEltTy.getSizeInBits(), VT.getScalarSizeInBits());
int LanesDefined = BitsDefined / BitsPerShuffleLane;
// This source is expected to fill ResMultiplier lanes of the final shuffle,
// starting at the appropriate offset.
int *LaneMask = &Mask[i * ResMultiplier];
int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
// Lanes of the second source are numbered after the NumElts lanes of the
// first one.
ExtractBase += NumElts * (Src - Sources.begin());
for (int j = 0; j < LanesDefined; ++j)
LaneMask[j] = ExtractBase + j;
}
// Final check before we try to produce nonsense...
if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
return SDValue();
}
// Operand slots without a corresponding source remain UNDEF.
SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
for (unsigned i = 0; i < Sources.size(); ++i)
ShuffleOps[i] = Sources[i].ShuffleVec;
SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
ShuffleOps[1], Mask);
// Convert the shuffle result back to the type of the original BUILD_VECTOR.
SDValue V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
dbgs() << "Reshuffle, creating node: "; V.dump(););
return V;
}
// Check whether an EXT instruction can implement the shuffle mask M when both
// vector sources of the shuffle are the same. On success Imm receives the
// first mask index; it is also overwritten when a later index mismatches.
static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
  const unsigned NumElts = VT.getVectorNumElements();

  // The first shuffle index anchors the rotation, so it must not be UNDEF.
  if (M[0] < 0)
    return false;
  Imm = M[0];

  // For a VEXT shuffle the immediate is the index of the first element and
  // every following index is its successor, wrapping from the last element
  // back to index zero.
  unsigned Expected = Imm;
  for (unsigned Pos = 1; Pos < NumElts; ++Pos) {
    Expected = (Expected + 1 == NumElts) ? 0 : Expected + 1;
    // UNDEF indices match anything.
    if (M[Pos] >= 0 && static_cast<unsigned>(M[Pos]) != Expected)
      return false;
  }
  return true;
}
// Check whether an EXT instruction can implement the shuffle mask M when the
// vector sources of the shuffle are different.
//
// On success Imm receives the EXT index (in elements) and ReverseEXT is set to
// true when the two inputs have to be swapped; ReverseEXT is left untouched
// otherwise. A mask consisting only of UNDEF entries is rejected.
static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
                      unsigned &Imm) {
  // Look for the first non-undef element.
  const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });

  // An all-undef mask has no element to anchor the sequence on; bail out
  // rather than reading past the end of the mask.
  if (FirstRealElt == M.end())
    return false;

  // Benefit from APInt to handle overflow when calculating expected element.
  unsigned NumElts = VT.getVectorNumElements();
  unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
  APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
  // The following shuffle indices must be the successive elements after the
  // first real element.
  const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(),
      [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;});
  if (FirstWrongElt != M.end())
    return false;

  // The index of an EXT is the first element if it is not UNDEF.
  // Watch out for the beginning UNDEFs. The EXT index should be the expected
  // value of the first element.  E.g.
  // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
  // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
  // ExpectedElt is the last mask index plus 1.
  Imm = ExpectedElt.getZExtValue();

  // There are two different cases requiring to reverse input vectors.
  // For example, for vector <4 x i32> we have the following cases,
  // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
  // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
  // For both cases, we finally use mask <5, 6, 7, 0>, which requires
  // to reverse two input vectors.
  if (Imm < NumElts)
    ReverseEXT = true;
  else
    Imm -= NumElts;

  return true;
}
/// isREVMask - Return true when the shuffle mask M describes a REV instruction
/// with the given block size in bits, i.e. the order of the elements within
/// each block of the vector is reversed. 64-bit elements never match.
static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
         "Only possible block sizes for REV are: 16, 32, 64");

  const unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  const unsigned NumElts = VT.getVectorNumElements();

  // The first index names the last element of its block, which yields the
  // block length. If the first shuffle index is UNDEF, be optimistic and
  // assume the block size being tested.
  const unsigned BlockElts = M[0] < 0 ? BlockSize / EltSz : M[0] + 1;
  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
    return false;

  for (unsigned Pos = 0; Pos < NumElts; ++Pos) {
    if (M[Pos] < 0)
      continue; // ignore UNDEF indices
    // Mirror the position inside its block.
    const unsigned InBlock = Pos % BlockElts;
    const unsigned Mirrored = (Pos - InBlock) + (BlockElts - 1 - InBlock);
    if (static_cast<unsigned>(M[Pos]) != Mirrored)
      return false;
  }
  return true;
}
/// isZIPMask - Check whether the shuffle mask M interleaves the low halves
/// (WhichResult == 0) or the high halves (WhichResult == 1) of the two
/// inputs, e.g. <0, 4, 1, 5> for the low halves of two 4-element vectors.
/// UNDEF entries match anything. WhichResult is derived from M[0]: 0 when
/// M[0] is 0, otherwise 1. Vectors with an odd number of elements never
/// match.
static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned NumElts = VT.getVectorNumElements();
  // The loop below consumes the mask two entries at a time; with an odd
  // element count it would step over its end condition and read past the
  // mask.
  if (NumElts % 2 != 0)
    return false;
  WhichResult = (M[0] == 0 ? 0 : 1);
  unsigned Idx = WhichResult * NumElts / 2;
  for (unsigned i = 0; i != NumElts; i += 2) {
    // Each pair takes element Idx from the first input, then the same
    // element from the second input.
    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
      return false;
    Idx += 1;
  }

  return true;
}
/// isUZPMask - Check whether the shuffle mask M selects every other element,
/// starting at element WhichResult (0 or 1). UNDEF entries match anything.
/// WhichResult is derived from M[0]: 0 when M[0] is 0, otherwise 1.
static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  const unsigned NumElts = VT.getVectorNumElements();
  WhichResult = (M[0] == 0 ? 0 : 1);

  for (unsigned Lane = 0; Lane != NumElts; ++Lane) {
    const int Selected = M[Lane];
    if (Selected >= 0 &&
        static_cast<unsigned>(Selected) != 2 * Lane + WhichResult)
      return false;
  }
  return true;
}
/// isTRNMask - Check whether the shuffle mask M pairs element
/// (2k + WhichResult) of the first input with the same element of the second
/// input, e.g. <0, 4, 2, 6> for 4-element vectors. UNDEF entries match
/// anything. WhichResult is derived from M[0]: 0 when M[0] is 0, otherwise 1.
/// Vectors with an odd number of elements never match.
static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  const unsigned NumElts = VT.getVectorNumElements();
  if (NumElts % 2 != 0)
    return false;

  WhichResult = (M[0] == 0 ? 0 : 1);
  for (unsigned Lane = 0; Lane < NumElts; ++Lane) {
    if (M[Lane] < 0)
      continue; // ignore UNDEF indices
    // Even lanes read from the first input, odd lanes from the second.
    const unsigned PairStart = Lane & ~1u;
    const unsigned Expected =
        (Lane % 2 == 0) ? PairStart + WhichResult
                        : PairStart + NumElts + WhichResult;
    if (static_cast<unsigned>(M[Lane]) != Expected)
      return false;
  }
  return true;
}
/// isZIP_v_undef_Mask - Variant of isZIPMask for the canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef", where both members
/// of each pair name the same element of the single input.
/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  const unsigned NumElts = VT.getVectorNumElements();
  if (NumElts % 2 != 0)
    return false;

  WhichResult = (M[0] == 0 ? 0 : 1);
  const unsigned FirstIdx = WhichResult * NumElts / 2;
  for (unsigned Lane = 0; Lane != NumElts; ++Lane) {
    // Both lanes of pair k must name element FirstIdx + k.
    const unsigned Expected = FirstIdx + Lane / 2;
    if (M[Lane] >= 0 && static_cast<unsigned>(M[Lane]) != Expected)
      return false;
  }
  return true;
}
/// isUZP_v_undef_Mask - Variant of isUZPMask for the canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef", where each half of
/// the mask repeats the same every-other-element sequence.
/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  const unsigned Half = VT.getVectorNumElements() / 2;
  WhichResult = (M[0] == 0 ? 0 : 1);

  // Walk both halves in one pass; the expected index restarts at WhichResult
  // at the beginning of the second half.
  const unsigned End = 2 * Half;
  for (unsigned Pos = 0; Pos != End; ++Pos) {
    const int Selected = M[Pos];
    const unsigned Expected = WhichResult + 2 * (Pos % Half);
    if (Selected >= 0 && static_cast<unsigned>(Selected) != Expected)
      return false;
  }
  return true;
}
/// isTRN_v_undef_Mask - Variant of isTRNMask for the canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef", where both members
/// of each pair name the same element of the single input.
/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  const unsigned NumElts = VT.getVectorNumElements();
  if (NumElts % 2 != 0)
    return false;

  WhichResult = (M[0] == 0 ? 0 : 1);
  for (unsigned Lane = 0; Lane < NumElts; ++Lane) {
    // Both lanes of a pair must name the pair's even lane plus WhichResult.
    const unsigned Expected = (Lane & ~1u) + WhichResult;
    if (M[Lane] >= 0 && static_cast<unsigned>(M[Lane]) != Expected)
      return false;
  }
  return true;
}
/// isINSMask - Check whether the mask M is the identity on one of the two
/// inputs except for exactly one lane.
///
/// On success DstIsLeft tells which input is the (almost) unchanged
/// destination and Anomaly is the last lane that deviates from it. Entries
/// equal to -1 count as matching both inputs. The mask must have exactly
/// NumInputElements entries.
static bool isINSMask(ArrayRef<int> M, int NumInputElements,
                      bool &DstIsLeft, int &Anomaly) {
  if (M.size() != static_cast<size_t>(NumInputElements))
    return false;

  int LHSMatches = 0, RHSMatches = 0;
  int LHSOddLane = -1, RHSOddLane = -1;
  for (int Lane = 0; Lane < NumInputElements; ++Lane) {
    const int Elt = M[Lane];
    const bool IsUndef = Elt == -1;
    const bool KeepsLHS = IsUndef || Elt == Lane;
    const bool KeepsRHS = IsUndef || Elt == Lane + NumInputElements;

    if (KeepsLHS)
      ++LHSMatches;
    else
      LHSOddLane = Lane;

    if (KeepsRHS)
      ++RHSMatches;
    else
      RHSOddLane = Lane;
  }

  // The left input takes precedence when both would qualify.
  const int Wanted = NumInputElements - 1;
  if (LHSMatches == Wanted) {
    DstIsLeft = true;
    Anomaly = LHSOddLane;
    return true;
  }
  if (RHSMatches == Wanted) {
    DstIsLeft = false;
    Anomaly = RHSOddLane;
    return true;
  }
  return false;
}
/// isConcatMask - Check whether Mask, for a 128-bit result type VT, takes its
/// low half from the start of the first operand and its high half from the
/// start of the second operand. When SplitLHS is set, the indices of the
/// second operand are offset by an additional half vector.
static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
  if (VT.getSizeInBits() != 128)
    return false;

  const unsigned NumElts = VT.getVectorNumElements();
  const int Half = NumElts / 2;
  const int End = NumElts;
  const int HighShift = SplitLHS ? Half : 0;

  for (int I = 0; I != End; ++I) {
    // Low half: identity. High half: identity plus the optional shift.
    const int Expected = I < Half ? I : I + HighShift;
    if (Mask[I] != Expected)
      return false;
  }
  return true;
}
/// tryFormConcatFromShuffle - Try to rewrite the vector shuffle Op as a
/// CONCAT_VECTORS of its two operands, extracting the low half of any operand
/// that is 128 bits wide. Returns an empty SDValue when the element types
/// differ or the mask is not a concatenation mask.
static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  const EVT VT = Op.getValueType();
  SDValue V0 = Op.getOperand(0);
  SDValue V1 = Op.getOperand(1);
  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();

  // All three vectors have to agree on the element type.
  const EVT EltVT = VT.getVectorElementType();
  if (EltVT != V0.getValueType().getVectorElementType() ||
      EltVT != V1.getValueType().getVectorElementType())
    return SDValue();

  const bool SplitV0 = V0.getValueSizeInBits() == 128;
  if (!isConcatMask(Mask, VT, SplitV0))
    return SDValue();

  const EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
                                      VT.getVectorNumElements() / 2);
  // Only the low half of a 128-bit operand takes part in the concatenation.
  auto ExtractLowHalf = [&](SDValue Vec) {
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec,
                       DAG.getConstant(0, DL, MVT::i64));
  };

  if (SplitV0)
    V0 = ExtractLowHalf(V0);
  if (V1.getValueSizeInBits() == 128)
    V1 = ExtractLowHalf(V1);

  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
}
/// GeneratePerfectShuffle - Emit the operations encoded by the perfect-shuffle
/// table entry PFEntry.
///
/// The entry packs an operation number (bits 26-29) and two 13-bit operand
/// IDs (bits 13-25 and 0-12). Unless the operation is a plain copy of LHS or
/// RHS, both operand IDs index PerfectShuffleTable again and are expanded
/// recursively before the operation itself is emitted.
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
                                      SDValue RHS, SelectionDAG &DAG,
                                      const SDLoc &dl) {
  enum {
    OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
    OP_VREV,
    OP_VDUP0,
    OP_VDUP1,
    OP_VDUP2,
    OP_VDUP3,
    OP_VEXT1,
    OP_VEXT2,
    OP_VEXT3,
    OP_VUZPL, // VUZP, left result
    OP_VUZPR, // VUZP, right result
    OP_VZIPL, // VZIP, left result
    OP_VZIPR, // VZIP, right result
    OP_VTRNL, // VTRN, left result
    OP_VTRNR  // VTRN, right result
  };

  const unsigned IDMask = (1 << 13) - 1;
  const unsigned OpNum = (PFEntry >> 26) & 0x0F;
  const unsigned LHSID = (PFEntry >> 13) & IDMask;
  const unsigned RHSID = (PFEntry >> 0) & IDMask;

  if (OpNum == OP_COPY) {
    // The ID is the mask in base 9: <0,1,2,3> selects LHS, <4,5,6,7> RHS.
    if (LHSID == (1 * 9 + 2) * 9 + 3)
      return LHS;
    assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
    return RHS;
  }

  SDValue OpLHS =
      GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
  SDValue OpRHS =
      GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
  EVT VT = OpLHS.getValueType();

  // UZP/ZIP/TRN only differ in the opcode that is emitted.
  auto EmitPermute = [&](unsigned Opcode) {
    return DAG.getNode(Opcode, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS);
  };

  switch (OpNum) {
  case OP_VREV: {
    // VREV divides the vector in half and swaps within the half.
    unsigned RevOpcode;
    if (VT.getVectorElementType() == MVT::i32 ||
        VT.getVectorElementType() == MVT::f32) {
      RevOpcode = AArch64ISD::REV64;
    } else if (VT.getVectorElementType() == MVT::i16 ||
               VT.getVectorElementType() == MVT::f16) {
      // vrev <4 x i16> -> REV32
      RevOpcode = AArch64ISD::REV32;
    } else {
      // vrev <4 x i8> -> REV16
      assert(VT.getVectorElementType() == MVT::i8);
      RevOpcode = AArch64ISD::REV16;
    }
    return DAG.getNode(RevOpcode, dl, VT, OpLHS);
  }
  case OP_VDUP0:
  case OP_VDUP1:
  case OP_VDUP2:
  case OP_VDUP3: {
    const EVT EltTy = VT.getVectorElementType();
    unsigned Opcode;
    if (EltTy == MVT::i8)
      Opcode = AArch64ISD::DUPLANE8;
    else if (EltTy == MVT::i16 || EltTy == MVT::f16)
      Opcode = AArch64ISD::DUPLANE16;
    else if (EltTy == MVT::i32 || EltTy == MVT::f32)
      Opcode = AArch64ISD::DUPLANE32;
    else if (EltTy == MVT::i64 || EltTy == MVT::f64)
      Opcode = AArch64ISD::DUPLANE64;
    else
      llvm_unreachable("Invalid vector element type?");

    // A 64-bit source is widened before the lane is duplicated.
    if (VT.getSizeInBits() == 64)
      OpLHS = WidenVector(OpLHS, DAG);
    SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
    return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
  }
  case OP_VEXT1:
  case OP_VEXT2:
  case OP_VEXT3: {
    // The element count is scaled by the extract factor of the operand.
    const unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
    return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
                       DAG.getConstant(Imm, dl, MVT::i32));
  }
  case OP_VUZPL:
    return EmitPermute(AArch64ISD::UZP1);
  case OP_VUZPR:
    return EmitPermute(AArch64ISD::UZP2);
  case OP_VZIPL:
    return EmitPermute(AArch64ISD::ZIP1);
  case OP_VZIPR:
    return EmitPermute(AArch64ISD::ZIP2);
  case OP_VTRNL:
    return EmitPermute(AArch64ISD::TRN1);
  case OP_VTRNR:
    return EmitPermute(AArch64ISD::TRN2);
  default:
    llvm_unreachable("Unknown shuffle opcode!");
  }
}
/// Lower a vector shuffle to a NEON table lookup (tbl1 / tbl2 intrinsic).
///
/// The element-level \p ShuffleMask is expanded into a byte-level index
/// vector, both shuffle inputs are bitcast to byte vectors, and the lookup
/// result is bitcast back to the type of \p Op.
///
/// \param Op           The shuffle node; operands 0 and 1 are its inputs.
/// \param ShuffleMask  Element indices selected by the shuffle.
/// \param DAG          DAG in which the nodes are created.
static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
                           SelectionDAG &DAG) {
  // Check to see if we can use the TBL instruction.
  SDLoc DL(Op);
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);

  // Turn every element index into the indices of the bytes it is made of.
  const unsigned BytesPerElt =
      Op.getValueType().getVectorElementType().getSizeInBits() / 8;
  SmallVector<SDValue, 8> TBLMask;
  for (int Elt : ShuffleMask) {
    for (unsigned Byte = 0; Byte != BytesPerElt; ++Byte) {
      unsigned Offset = Byte + Elt * BytesPerElt;
      TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
    }
  }

  // The index vector has one byte per result byte: 8 or 16 of them.
  const bool IsQuad = Op.getValueSizeInBits() == 128;
  const MVT IndexVT = IsQuad ? MVT::v16i8 : MVT::v8i8;
  const unsigned IndexLen = IsQuad ? 16 : 8;

  SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
  SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);

  auto BuildIndexVector = [&] {
    return DAG.getBuildVector(IndexVT, DL,
                              makeArrayRef(TBLMask.data(), IndexLen));
  };
  auto EmitTBL1 = [&](SDValue Table) {
    return DAG.getNode(
        ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
        DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), Table,
        BuildIndexVector());
  };

  const bool SecondIsUndef = V2.getNode()->isUndef();
  SDValue Shuffle;
  if (IndexLen == 8) {
    // Two 64-bit inputs fit into one 128-bit table; with an undef second
    // input the first one is simply repeated.
    SDValue Table = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst,
                                SecondIsUndef ? V1Cst : V2Cst);
    Shuffle = EmitTBL1(Table);
  } else if (SecondIsUndef) {
    Shuffle = EmitTBL1(V1Cst);
  } else {
    // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
    // cannot currently represent the register constraints on the input
    // table registers.
    //  Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
    //                   DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
    //                                      IndexLen));
    Shuffle = DAG.getNode(
        ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
        DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
        V2Cst, BuildIndexVector());
  }
  return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
}
/// Map a vector element type to the AArch64ISD::DUPLANE* opcode for elements
/// of that width. Accepts i8, i16/f16, i32/f32 and i64/f64; any other type is
/// a programming error.
static unsigned getDUPLANEOp(EVT EltType) {
  const bool Is64 = EltType == MVT::i64 || EltType == MVT::f64;
  const bool Is32 = EltType == MVT::i32 || EltType == MVT::f32;
  const bool Is16 = EltType == MVT::i16 || EltType == MVT::f16;
  const bool Is8 = EltType == MVT::i8;

  if (Is64)
    return AArch64ISD::DUPLANE64;
  if (Is32)
    return AArch64ISD::DUPLANE32;
  if (Is16)
    return AArch64ISD::DUPLANE16;
  if (Is8)
    return AArch64ISD::DUPLANE8;

  llvm_unreachable("Invalid vector element type?");
}
/// Custom-lower a VECTOR_SHUFFLE node.
///
/// The shuffle mask is matched against a sequence of patterns, and the first
/// one that matches decides the lowering: splat (DUP / DUPLANE), REV64/32/16,
/// EXT, ZIP/UZP/TRN (two-input and single-input forms), a concat of the
/// inputs, a single-lane insert, the perfect-shuffle table for 4-element
/// vectors, and finally a TBL lookup as the general fallback. The order of
/// the checks is therefore significant.
SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
EVT VT = Op.getValueType();
ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
// Convert shuffles that are directly supported on NEON to target-specific
// DAG nodes, instead of keeping them as shuffles and matching them again
// during code selection. This is more efficient and avoids the possibility
// of inconsistencies between legalization and selection.
ArrayRef<int> ShuffleMask = SVN->getMask();
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
// Splat: every result lane reads the same source lane.
if (SVN->isSplat()) {
int Lane = SVN->getSplatIndex();
// If this is undef splat, generate it via "just" vdup, if possible.
if (Lane == -1)
Lane = 0;
if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
V1.getOperand(0));
// Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
// constant. If so, we can just reference the lane's definition directly.
if (V1.getOpcode() == ISD::BUILD_VECTOR &&
!isa<ConstantSDNode>(V1.getOperand(Lane)))
return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
// Otherwise, duplicate from the lane of the input vector.
unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
// SelectionDAGBuilder may have "helpfully" already extracted or concatenated
// to make a vector of the same size as this SHUFFLE. We can ignore the
// extract entirely, and canonicalise the concat using WidenVector.
if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
// Look through the extract: rebase the lane onto the wider source vector.
Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
V1 = V1.getOperand(0);
} else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
// Idx selects the concat operand (0 or 1) holding the lane; the lane is
// then made relative to that operand.
unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
Lane -= Idx * VT.getVectorNumElements() / 2;
V1 = WidenVector(V1.getOperand(Idx), DAG);
} else if (VT.getSizeInBits() == 64)
V1 = WidenVector(V1, DAG);
return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64));
}
// Element reversal within 64-, 32- or 16-bit blocks.
if (isREVMask(ShuffleMask, VT, 64))
return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
if (isREVMask(ShuffleMask, VT, 32))
return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
if (isREVMask(ShuffleMask, VT, 16))
return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
// EXT: isEXTMask reports whether the operands must be swapped and the
// element offset, which is scaled by getExtFactor() before use.
bool ReverseEXT = false;
unsigned Imm;
if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
if (ReverseEXT)
std::swap(V1, V2);
Imm *= getExtFactor(V1);
return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
DAG.getConstant(Imm, dl, MVT::i32));
} else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
// Single-input form: V1 is used for both EXT operands.
Imm *= getExtFactor(V1);
return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
DAG.getConstant(Imm, dl, MVT::i32));
}
// Two-input ZIP / UZP / TRN; WhichResult == 0 selects the "1" variant,
// anything else the "2" variant.
unsigned WhichResult;
if (isZIPMask(ShuffleMask, VT, WhichResult)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
}
if (isUZPMask(ShuffleMask, VT, WhichResult)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
}
if (isTRNMask(ShuffleMask, VT, WhichResult)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
}
// The "_v_undef" forms of the same patterns pass V1 as both operands.
if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
}
if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
}
if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
}
if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
return Concat;
// Single-lane insert: isINSMask reports which input is the destination and
// the index ("Anomaly") of the one lane that is replaced.
bool DstIsLeft;
int Anomaly;
int NumInputElements = V1.getValueType().getVectorNumElements();
if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
SDValue DstVec = DstIsLeft ? V1 : V2;
SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
SDValue SrcVec = V1;
int SrcLane = ShuffleMask[Anomaly];
// Mask indices at or beyond the input length refer to lanes of V2.
if (SrcLane >= NumInputElements) {
SrcVec = V2;
SrcLane -= VT.getVectorNumElements();
}
SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
// Integer elements narrower than 32 bits are extracted as i32.
EVT ScalarVT = VT.getVectorElementType();
if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger())
ScalarVT = MVT::i32;
return DAG.getNode(
ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
DstLaneV);
}
// If the shuffle is not directly supported and it has 4 elements, use
// the PerfectShuffle-generated table to synthesize it from other shuffles.
unsigned NumElts = VT.getVectorNumElements();
if (NumElts == 4) {
unsigned PFIndexes[4];
for (unsigned i = 0; i != 4; ++i) {
// Undef (negative) mask entries are encoded as the digit 8.
if (ShuffleMask[i] < 0)
PFIndexes[i] = 8;
else
PFIndexes[i] = ShuffleMask[i];
}
// Compute the index in the perfect shuffle table.
unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
PFIndexes[2] * 9 + PFIndexes[3];
unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
// The top two bits of the entry hold its cost.
unsigned Cost = (PFEntry >> 30);
if (Cost <= 4)
return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
}
// Nothing cheaper matched: fall back to a table lookup.
return GenerateTBL(Op, ShuffleMask, DAG);
}
/// If \p BVN is a constant splat, replicate the splat across the full vector
/// width and accumulate the result into the two output bit-vectors.
///
/// For every repetition both outputs are shifted left by the splat width and
/// then OR-ed with the splat value (\p CnstBits) and with
/// `SplatBits ^ SplatUndef` (\p UndefBits), each resized to the vector width.
///
/// \returns true if \p BVN is a constant splat, false otherwise (in which
/// case the outputs are left untouched).
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
                               APInt &UndefBits) {
  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (!BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs))
    return false;

  const unsigned VecBits = BVN->getValueType(0).getSizeInBits();
  unsigned Remaining = VecBits / SplatBitSize;
  while (Remaining != 0) {
    CnstBits <<= SplatBitSize;
    UndefBits <<= SplatBitSize;
    CnstBits |= SplatBits.zextOrTrunc(VecBits);
    UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VecBits);
    --Remaining;
  }
  return true;
}
// Try 64-bit splatted SIMD immediate.
//
// Succeeds only when both 64-bit halves of Bits are equal and the low 64 bits
// are a "type 10" modified immediate. On success returns NewOp applied to the
// encoded immediate, NVCAST-ed to the type of Op; otherwise returns an empty
// SDValue.
static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
                                  const APInt &Bits) {
  if (Bits.getHiBits(64) != Bits.getLoBits(64))
    return SDValue();

  uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
  if (!AArch64_AM::isAdvSIMDModImmType10(Value))
    return SDValue();
  Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);

  EVT VT = Op.getValueType();
  MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;

  SDLoc dl(Op);
  SDValue EncodedImm = DAG.getConstant(Value, dl, MVT::i32);
  SDValue Mov = DAG.getNode(NewOp, dl, MovTy, EncodedImm);
  return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
// Try 32-bit splatted SIMD immediate.
//
// Succeeds only when both 64-bit halves of Bits are equal and the low 64 bits
// match modified-immediate type 1, 2, 3 or 4 (tried in that order, paired
// with shift operand 0, 8, 16 or 24). On success returns NewOp applied to
// (encoded immediate, shift) -- preceded by *LHS when LHS is non-null --
// NVCAST-ed to the type of Op; otherwise returns an empty SDValue.
static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
                                  const APInt &Bits,
                                  const SDValue *LHS = nullptr) {
  if (Bits.getHiBits(64) != Bits.getLoBits(64))
    return SDValue();

  uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
  uint64_t Shift;
  if (AArch64_AM::isAdvSIMDModImmType1(Value)) {
    Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
    Shift = 0;
  } else if (AArch64_AM::isAdvSIMDModImmType2(Value)) {
    Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
    Shift = 8;
  } else if (AArch64_AM::isAdvSIMDModImmType3(Value)) {
    Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
    Shift = 16;
  } else if (AArch64_AM::isAdvSIMDModImmType4(Value)) {
    Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
    Shift = 24;
  } else {
    // Not encodable in any of the 32-bit forms.
    return SDValue();
  }

  EVT VT = Op.getValueType();
  MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;

  SDLoc dl(Op);
  SDValue Mov;
  if (LHS)
    Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
                      DAG.getConstant(Value, dl, MVT::i32),
                      DAG.getConstant(Shift, dl, MVT::i32));
  else
    Mov = DAG.getNode(NewOp, dl, MovTy,
                      DAG.getConstant(Value, dl, MVT::i32),
                      DAG.getConstant(Shift, dl, MVT::i32));
  return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
// Try 16-bit splatted SIMD immediate.
//
// Succeeds only when both 64-bit halves of Bits are equal and the low 64 bits
// match modified-immediate type 5 or 6 (tried in that order, paired with
// shift operand 0 or 8). On success returns NewOp applied to (encoded
// immediate, shift) -- preceded by *LHS when LHS is non-null -- NVCAST-ed to
// the type of Op; otherwise returns an empty SDValue.
static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
                                  const APInt &Bits,
                                  const SDValue *LHS = nullptr) {
  if (Bits.getHiBits(64) != Bits.getLoBits(64))
    return SDValue();

  uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
  uint64_t Shift;
  if (AArch64_AM::isAdvSIMDModImmType5(Value)) {
    Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
    Shift = 0;
  } else if (AArch64_AM::isAdvSIMDModImmType6(Value)) {
    Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
    Shift = 8;
  } else {
    // Not encodable in either 16-bit form.
    return SDValue();
  }

  EVT VT = Op.getValueType();
  MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;

  SDLoc dl(Op);
  SDValue Mov;
  if (LHS)
    Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
                      DAG.getConstant(Value, dl, MVT::i32),
                      DAG.getConstant(Shift, dl, MVT::i32));
  else
    Mov = DAG.getNode(NewOp, dl, MovTy,
                      DAG.getConstant(Value, dl, MVT::i32),
                      DAG.getConstant(Shift, dl, MVT::i32));
  return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
// Try 32-bit splatted SIMD immediate with shifted ones.
//
// Succeeds only when both 64-bit halves of Bits are equal and the low 64 bits
// match modified-immediate type 7 or 8 (tried in that order, paired with
// shift operand 264 or 272). On success returns NewOp applied to (encoded
// immediate, shift), NVCAST-ed to the type of Op; otherwise returns an empty
// SDValue.
static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
                                    SelectionDAG &DAG, const APInt &Bits) {
  if (Bits.getHiBits(64) != Bits.getLoBits(64))
    return SDValue();

  uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
  uint64_t Shift;
  if (AArch64_AM::isAdvSIMDModImmType7(Value)) {
    Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
    Shift = 264;
  } else if (AArch64_AM::isAdvSIMDModImmType8(Value)) {
    Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
    Shift = 272;
  } else {
    // Not encodable in either shifted-ones form.
    return SDValue();
  }

  EVT VT = Op.getValueType();
  MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;

  SDLoc dl(Op);
  SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
                            DAG.getConstant(Value, dl, MVT::i32),
                            DAG.getConstant(Shift, dl, MVT::i32));
  return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
// Try 8-bit splatted SIMD immediate.
//
// Succeeds only when both 64-bit halves of Bits are equal and the low 64 bits
// are a "type 9" modified immediate. On success returns NewOp applied to the
// encoded immediate, NVCAST-ed to the type of Op; otherwise returns an empty
// SDValue.
static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
                                 const APInt &Bits) {
  if (Bits.getHiBits(64) != Bits.getLoBits(64))
    return SDValue();

  uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
  if (!AArch64_AM::isAdvSIMDModImmType9(Value))
    return SDValue();
  Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);

  EVT VT = Op.getValueType();
  MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;

  SDLoc dl(Op);
  SDValue EncodedImm = DAG.getConstant(Value, dl, MVT::i32);
  SDValue Mov = DAG.getNode(NewOp, dl, MovTy, EncodedImm);
  return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
// Try FP splatted SIMD immediate.
//
// Succeeds only when both 64-bit halves of Bits are equal and the low 64 bits
// match modified-immediate type 11 (moved as v4f32 / v2f32) or, for 128-bit
// results only, type 12 (moved as v2f64). On success returns NewOp applied to
// the encoded immediate, NVCAST-ed to the type of Op; otherwise returns an
// empty SDValue.
static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
                                  const APInt &Bits) {
  if (Bits.getHiBits(64) != Bits.getLoBits(64))
    return SDValue();

  uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
  EVT VT = Op.getValueType();
  const bool isWide = (VT.getSizeInBits() == 128);

  MVT MovTy;
  if (AArch64_AM::isAdvSIMDModImmType11(Value)) {
    Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
    MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
  } else if (isWide && AArch64_AM::isAdvSIMDModImmType12(Value)) {
    Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
    MovTy = MVT::v2f64;
  } else {
    // Not encodable as an FP immediate.
    return SDValue();
  }

  SDLoc dl(Op);
  SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
                            DAG.getConstant(Value, dl, MVT::i32));
  return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
// Specialized code to quickly find if PotentialBVec is a BuildVector that
// consists of only the same constant int value, returned in reference arg
// ConstVal
static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
uint64_t &ConstVal) {
BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
if (!Bvec)
return false;
ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
if (!FirstElt)
return false;
EVT VT = Bvec->getValueType(0);
unsigned NumElts = VT.getVectorNumElements();
for (unsigned i = 1; i < NumElts; ++i)
if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
return false;
ConstVal = FirstElt->getZExtValue();
return true;
}
/// Return the intrinsic ID carried by \p N if it is an INTRINSIC_WO_CHAIN
/// node whose first operand is below Intrinsic::num_intrinsics; otherwise
/// return Intrinsic::not_intrinsic.
static unsigned getIntrinsicID(const SDNode *N) {
  if (N->getOpcode() != ISD::INTRINSIC_WO_CHAIN)
    return Intrinsic::not_intrinsic;

  // Operand 0 of an INTRINSIC_WO_CHAIN node is read as the intrinsic ID.
  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
  if (IID >= Intrinsic::num_intrinsics)
    return Intrinsic::not_intrinsic;
  return IID;
}
// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
// BUILD_VECTORs with constant element C1, C2 is a constant, and:
//   - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
//   - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
// i.e. the AND must keep exactly the bits of X that the shifted Y does not
// overwrite: the low C2 bits for a left shift, the high C2 bits for a logical
// right shift.
//
// Returns the aarch64_neon_vsli / aarch64_neon_vsri intrinsic node on
// success, or an empty SDValue if N does not have the required shape.
static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);

  if (!VT.isVector())
    return SDValue();

  SDLoc DL(N);

  // Is the first op an AND?
  const SDValue And = N->getOperand(0);
  if (And.getOpcode() != ISD::AND)
    return SDValue();

  // Is the second op an shl or lshr?
  SDValue Shift = N->getOperand(1);
  // This will have been turned into: AArch64ISD::VSHL vector, #shift
  // or AArch64ISD::VLSHR vector, #shift
  unsigned ShiftOpc = Shift.getOpcode();
  if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR))
    return SDValue();
  bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR;

  // Is the shift amount constant?
  ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
  if (!C2node)
    return SDValue();

  // Is the and mask vector all constant?
  uint64_t C1;
  if (!isAllConstantBuildVector(And.getOperand(1), C1))
    return SDValue();

  // Does the mask keep exactly the bits the shift leaves untouched, taking
  // into account how much one can shift elements of a particular size?
  uint64_t C2 = C2node->getZExtValue();
  unsigned ElemSizeInBits = VT.getScalarSizeInBits();
  if (C2 > ElemSizeInBits)
    return SDValue();

  // Do the comparison in APInt arithmetic at the element width. The previous
  // form, (1 << ElemSizeInBits) - 1 on a plain int, was undefined for 32- and
  // 64-bit elements, and it compared C1 with ~C2 (the complement of the shift
  // amount) rather than with the mask of preserved bits.
  APInt C1AsAPInt(ElemSizeInBits, C1);
  APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
                                  : APInt::getLowBitsSet(ElemSizeInBits, C2);
  if (C1AsAPInt != RequiredC1)
    return SDValue();

  SDValue X = And.getOperand(0);
  SDValue Y = Shift.getOperand(0);

  unsigned Intrin =
      IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli;
  SDValue ResultSLI =
      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                  DAG.getConstant(Intrin, DL, MVT::i32), X, Y,
                  Shift.getOperand(1));

  LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
  LLVM_DEBUG(N->dump(&DAG));
  LLVM_DEBUG(dbgs() << "into: \n");
  LLVM_DEBUG(ResultSLI->dump(&DAG));

  ++NumShiftInserts;
  return ResultSLI;
}
/// Custom-lower a vector OR.
///
/// When EnableAArch64SlrGeneration is set, first tries to turn the node into
/// a shift-and-insert via tryLowerToSLI. Otherwise, if one operand is a
/// BUILD_VECTOR that resolves to a constant splat, tries to express the
/// operation as an ORRi with a 32-bit or 16-bit modified immediate. If
/// nothing applies, \p Op is returned unchanged.
SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
                                             SelectionDAG &DAG) const {
  // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
  if (EnableAArch64SlrGeneration)
    if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
      return Res;

  EVT VT = Op.getValueType();

  // Find the BUILD_VECTOR operand; LHS names the other one.
  SDValue LHS = Op.getOperand(0);
  BuildVectorSDNode *BVN =
      dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
  if (!BVN) {
    // OR commutes, so try swapping the operands.
    LHS = Op.getOperand(1);
    BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
  }
  if (!BVN)
    return Op;

  APInt DefBits(VT.getSizeInBits(), 0);
  APInt UndefBits(VT.getSizeInBits(), 0);
  if (!resolveBuildVector(BVN, DefBits, UndefBits))
    return Op; // We can always fall back to a non-immediate OR.

  // Try the defined bits first, then the alternative undef-derived pattern;
  // for each, the 32-bit form is attempted before the 16-bit form.
  const APInt *Candidates[] = {&DefBits, &UndefBits};
  for (const APInt *Imm : Candidates) {
    if (SDValue NewOp =
            tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG, *Imm, &LHS))
      return NewOp;
    if (SDValue NewOp =
            tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG, *Imm, &LHS))
      return NewOp;
  }

  // We can always fall back to a non-immediate OR.
  return Op;
}
// Normalize the operands of BUILD_VECTOR. The value of constant operands will
// be truncated to fit element width.
//
// Only integer vectors with elements of at most 16 bits are rewritten; every
// other BUILD_VECTOR is returned unchanged. In the rewritten node, constant
// lanes become i32 constants holding the truncated value and undef lanes
// become i32 undefs; any other lane is expected to already be of type i32.
static SDValue NormalizeBuildVector(SDValue Op,
                                    SelectionDAG &DAG) {
  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  EVT EltTy = VT.getVectorElementType();

  const bool IsNarrowInteger =
      !EltTy.isFloatingPoint() && EltTy.getSizeInBits() <= 16;
  if (!IsNarrowInteger)
    return Op;

  SmallVector<SDValue, 16> Ops;
  for (unsigned Idx = 0, NumOps = Op.getNumOperands(); Idx != NumOps; ++Idx) {
    SDValue Lane = Op.getOperand(Idx);

    // For integer vectors, type legalization would have promoted the
    // operands already. Otherwise, if Op is a floating-point splat
    // (with operands cast to integers), then the only possibilities
    // are constants and UNDEFs.
    if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
      // Constructing the APInt at element width drops the excess high bits.
      APInt LowBits(EltTy.getSizeInBits(),
                    CstLane->getZExtValue());
      Ops.push_back(DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32));
      continue;
    }
    if (Lane.getNode()->isUndef()) {
      Ops.push_back(DAG.getUNDEF(MVT::i32));
      continue;
    }
    assert(Lane.getValueType() == MVT::i32 &&
           "Unexpected BUILD_VECTOR operand type");
    Ops.push_back(Lane);
  }
  return DAG.getBuildVector(VT, dl, Ops);
}
/// Try to materialise a constant-splat BUILD_VECTOR with a single AdvSIMD
/// modified-immediate move.
///
/// The MOVI-style forms (64-bit, 32-bit shifted, 32-bit shifted-ones, 16-bit
/// shifted, 8-bit, FP) are tried on the resolved bits, then the MVNI-style
/// forms on their complement; the same two rounds are then repeated on the
/// alternative undef-derived bit pattern.
///
/// \returns The lowered node, or an empty SDValue if \p Op is not a constant
/// splat or no encoding fits.
static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();

  APInt DefBits(VT.getSizeInBits(), 0);
  APInt UndefBits(VT.getSizeInBits(), 0);
  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
  if (!resolveBuildVector(BVN, DefBits, UndefBits))
    return SDValue();

  // Direct moves of the given bit pattern.
  auto TryMoveForms = [&](const APInt &Imm) -> SDValue {
    if (SDValue R = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, Imm))
      return R;
    if (SDValue R = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, Imm))
      return R;
    if (SDValue R = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, Imm))
      return R;
    if (SDValue R = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, Imm))
      return R;
    if (SDValue R = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, Imm))
      return R;
    return tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, Imm);
  };

  // Inverted moves; the caller passes the complemented bit pattern.
  auto TryInvertedForms = [&](const APInt &Imm) -> SDValue {
    if (SDValue R = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, Imm))
      return R;
    if (SDValue R = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, Imm))
      return R;
    return tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, Imm);
  };

  if (SDValue R = TryMoveForms(DefBits))
    return R;
  if (SDValue R = TryInvertedForms(~DefBits))
    return R;
  if (SDValue R = TryMoveForms(UndefBits))
    return R;
  return TryInvertedForms(~UndefBits);
}
/// Custom-lower a BUILD_VECTOR node.
///
/// The strategies are tried in this order, the first applicable one winning:
///  - all-zeros / all-ones integer splats are returned as-is,
///  - a single modified-immediate move (ConstantBuildVector),
///  - UNDEF when every lane is undef,
///  - SCALAR_TO_VECTOR when only lane 0 is defined,
///  - UZP1 / UZP2 when the lanes are the even / odd elements of one vector,
///  - DUP / DUPLANE for splats (FP constant splats are retried as integers),
///  - splat of the single constant followed by inserts of the other lanes,
///  - a shuffle reconstructed from the operands (4 or more lanes),
///  - a chain of INSERT_VECTOR_ELT nodes.
///
/// \returns The lowered value, or an empty SDValue to request the default
/// expansion.
SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
                                                 SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  // Try to build a simple constant vector.
  Op = NormalizeBuildVector(Op, DAG);
  if (VT.isInteger()) {
    // Certain vector constants, used to express things like logical NOT and
    // arithmetic NEG, are passed through unmodified.  This allows special
    // patterns for these operations to match, which will lower these constants
    // to whatever is proven necessary.
    BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
    if (BVN->isConstant())
      if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
        unsigned BitSize = VT.getVectorElementType().getSizeInBits();
        APInt Val(BitSize,
                  Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
        if (Val.isNullValue() || Val.isAllOnesValue())
          return Op;
      }
  }

  if (SDValue V = ConstantBuildVector(Op, DAG))
    return V;

  // Scan through the operands to find some interesting properties we can
  // exploit:
  //   1) If only one value is used, we can use a DUP, or
  //   2) if only the low element is not undef, we can just insert that, or
  //   3) if only one constant value is used (w/ some non-constant lanes),
  //      we can splat the constant value into the whole vector then fill
  //      in the non-constant lanes.
  //   4) FIXME: If different constant values are used, but we can intelligently
  //             select the values we'll be overwriting for the non-constant
  //             lanes such that we can directly materialize the vector
  //             some other way (MOVI, e.g.), we can be sneaky.
  //   5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
  SDLoc dl(Op);
  unsigned NumElts = VT.getVectorNumElements();
  bool isOnlyLowElement = true;
  bool usesOnlyOneValue = true;
  bool usesOnlyOneConstantValue = true;
  bool isConstant = true;
  bool AllLanesExtractElt = true;
  unsigned NumConstantLanes = 0;
  SDValue Value;
  SDValue ConstantValue;
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      AllLanesExtractElt = false;
    // Undef lanes do not constrain any of the remaining properties.
    if (V.isUndef())
      continue;
    if (i > 0)
      isOnlyLowElement = false;
    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
      isConstant = false;

    if (isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V)) {
      ++NumConstantLanes;
      if (!ConstantValue.getNode())
        ConstantValue = V;
      else if (ConstantValue != V)
        usesOnlyOneConstantValue = false;
    }

    if (!Value.getNode())
      Value = V;
    else if (V != Value)
      usesOnlyOneValue = false;
  }

  // Value is still empty only if every lane was undef.
  if (!Value.getNode()) {
    LLVM_DEBUG(
        dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
    return DAG.getUNDEF(VT);
  }

  // Convert BUILD_VECTOR where all elements but the lowest are undef into
  // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
  // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
  if (isOnlyLowElement && !(NumElts == 1 && isa<ConstantSDNode>(Value))) {
    LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
                         "SCALAR_TO_VECTOR node\n");
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
  }

  if (AllLanesExtractElt) {
    SDNode *Vector = nullptr;
    bool Even = false;
    bool Odd = false;
    // Check whether the extract elements match the Even pattern <0,2,4,...> or
    // the Odd pattern <1,3,5,...>.
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue V = Op.getOperand(i);
      const SDNode *N = V.getNode();
      if (!isa<ConstantSDNode>(N->getOperand(1))) {
        // A variable index cannot be part of either pattern. Earlier lanes
        // may already have set Even or Odd, so clear both before bailing out;
        // otherwise a partially matching BUILD_VECTOR would be lowered to a
        // UZP below.
        Even = false;
        Odd = false;
        break;
      }
      SDValue N0 = N->getOperand(0);

      // All elements are extracted from the same vector.
      if (!Vector) {
        Vector = N0.getNode();
        // Check that the type of EXTRACT_VECTOR_ELT matches the type of
        // BUILD_VECTOR.
        if (VT.getVectorElementType() !=
            N0.getValueType().getVectorElementType())
          break;
      } else if (Vector != N0.getNode()) {
        Odd = false;
        Even = false;
        break;
      }

      // Extracted values are either at Even indices <0,2,4,...> or at Odd
      // indices <1,3,5,...>.
      uint64_t Val = N->getConstantOperandVal(1);
      if (Val == 2 * i) {
        Even = true;
        continue;
      }
      if (Val - 1 == 2 * i) {
        Odd = true;
        continue;
      }

      // Something does not match: abort.
      Odd = false;
      Even = false;
      break;
    }
    if (Even || Odd) {
      // Split the source into its low and high halves and unzip them.
      SDValue LHS =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
                      DAG.getConstant(0, dl, MVT::i64));
      SDValue RHS =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
                      DAG.getConstant(NumElts, dl, MVT::i64));

      // A mix of even and odd lanes matches neither form and falls through.
      if (Even && !Odd)
        return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS,
                           RHS);
      if (Odd && !Even)
        return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS,
                           RHS);
    }
  }

  // Use DUP for non-constant splats. For f32 constant splats, reduce to
  // i32 and try again.
  if (usesOnlyOneValue) {
    if (!isConstant) {
      if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
          Value.getValueType() != VT) {
        LLVM_DEBUG(
            dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
        return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
      }

      // This is actually a DUPLANExx operation, which keeps everything vectory.

      SDValue Lane = Value.getOperand(1);
      Value = Value.getOperand(0);
      if (Value.getValueSizeInBits() == 64) {
        LLVM_DEBUG(
            dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
                      "widening it\n");
        Value = WidenVector(Value, DAG);
      }

      unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
      return DAG.getNode(Opcode, dl, VT, Value, Lane);
    }

    if (VT.getVectorElementType().isFloatingPoint()) {
      SmallVector<SDValue, 8> Ops;
      EVT EltTy = VT.getVectorElementType();
      assert ((EltTy == MVT::f16 || EltTy == MVT::f32 || EltTy == MVT::f64) &&
              "Unsupported floating-point vector type");
      LLVM_DEBUG(
          dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
                    "BITCASTS, and try again\n");
      // Reinterpret every lane as an integer of the same width and lower the
      // resulting integer BUILD_VECTOR instead.
      MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
      for (unsigned i = 0; i < NumElts; ++i)
        Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
      EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
      SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
      LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
                 Val.dump(););
      Val = LowerBUILD_VECTOR(Val, DAG);
      if (Val.getNode())
        return DAG.getNode(ISD::BITCAST, dl, VT, Val);
    }
  }

  // If there was only one constant value used and for more than one lane,
  // start by splatting that value, then replace the non-constant lanes. This
  // is better than the default, which will perform a separate initialization
  // for each lane.
  if (NumConstantLanes > 0 && usesOnlyOneConstantValue) {
    // Firstly, try to materialize the splat constant.
    SDValue Vec = DAG.getSplatBuildVector(VT, dl, ConstantValue),
            Val = ConstantBuildVector(Vec, DAG);
    if (!Val) {
      // Otherwise, materialize the constant and splat it.
      Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
      DAG.ReplaceAllUsesWith(Vec.getNode(), &Val);
    }

    // Now insert the non-constant lanes.
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue V = Op.getOperand(i);
      SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
      if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V))
        // Note that type legalization likely mucked about with the VT of the
        // source operand, so we may have to convert it here before inserting.
        Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
    }
    return Val;
  }

  // This will generate a load from the constant pool.
  if (isConstant) {
    LLVM_DEBUG(
        dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
                  "expansion\n");
    return SDValue();
  }

  // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
  if (NumElts >= 4) {
    if (SDValue shuffle = ReconstructShuffle(Op, DAG))
      return shuffle;
  }

  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
  // know the default expansion would otherwise fall back on something even
  // worse. For a vector with one or two non-undef values, that's
  // scalar_to_vector for the elements followed by a shuffle (provided the
  // shuffle is valid for the target) and materialization element by element
  // on the stack followed by a load for everything else.
  if (!isConstant && !usesOnlyOneValue) {
    LLVM_DEBUG(
        dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
                  "of INSERT_VECTOR_ELT\n");

    SDValue Vec = DAG.getUNDEF(VT);
    SDValue Op0 = Op.getOperand(0);
    unsigned i = 0;

    // Use SCALAR_TO_VECTOR for lane zero to
    // a) Avoid a RMW dependency on the full vector register, and
    // b) Allow the register coalescer to fold away the copy if the
    //    value is already in an S or D register, and we're forced to emit an
    //    INSERT_SUBREG that we can't fold anywhere.
    //
    // We also allow types like i8 and i16 which are illegal scalar but legal
    // vector element types. After type-legalization the inserted value is
    // extended (i32) and it is safe to cast them to the vector type by ignoring
    // the upper bits of the lowest lane (e.g. v8i8, v4i16).
    if (!Op0.isUndef()) {
      LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
      Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
      ++i;
    }
    LLVM_DEBUG(if (i < NumElts) dbgs()
                   << "Creating nodes for the other vector elements:\n";);
    for (; i < NumElts; ++i) {
      SDValue V = Op.getOperand(i);
      if (V.isUndef())
        continue;
      SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
    }
    return Vec;
  }

  LLVM_DEBUG(
      dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
                "better alternative\n");
  return SDValue();
}
/// Custom-lower INSERT_VECTOR_ELT.
///
/// Inserts into the listed 128-bit vector types are returned unchanged.
/// Inserts into the listed 64-bit types are performed on the widened vector
/// and the result is narrowed again. A non-constant or out-of-range lane, or
/// any other vector type, yields an empty SDValue.
SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                      SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");

  // Check for non-constant or out of range lane.
  EVT VT = Op.getOperand(0).getValueType();
  ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
  if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
    return SDValue();

  // Insertion/extraction are legal for V128 types.
  static const MVT::SimpleValueType V128Types[] = {
      MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
      MVT::v4f32, MVT::v2f64, MVT::v8f16};
  for (MVT::SimpleValueType Ty : V128Types)
    if (VT == Ty)
      return Op;

  // Only these V64 types are handled below.
  static const MVT::SimpleValueType V64Types[] = {
      MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64, MVT::v2f32, MVT::v4f16};
  bool IsHandledV64 = false;
  for (MVT::SimpleValueType Ty : V64Types)
    if (VT == Ty)
      IsHandledV64 = true;
  if (!IsHandledV64)
    return SDValue();

  // For V64 types, we perform insertion by expanding the value
  // to a V128 type and perform the insertion on that.
  SDLoc DL(Op);
  SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
  SDValue Inserted =
      DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideVec.getValueType(), WideVec,
                  Op.getOperand(1), Op.getOperand(2));

  // Re-narrow the resultant vector.
  return NarrowVector(Inserted, DAG);
}
/// Custom lowering for EXTRACT_VECTOR_ELT.
///
/// Returns \p Op unchanged for the listed 128-bit vector types, an empty
/// SDValue when the lane is not an in-range constant or the type is not one of
/// the handled 64-bit types, and otherwise extracts from a widened copy of the
/// source vector.
SDValue
AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");

  // The lane index must be a constant that lies inside the vector.
  EVT VT = Op.getOperand(0).getValueType();
  auto *LaneC = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (LaneC == nullptr)
    return SDValue();
  if (LaneC->getZExtValue() >= VT.getVectorNumElements())
    return SDValue();

  // Extraction is legal as-is for the V128 types.
  const bool IsLegalV128 = VT == MVT::v16i8 || VT == MVT::v8i16 ||
                           VT == MVT::v4i32 || VT == MVT::v2i64 ||
                           VT == MVT::v4f32 || VT == MVT::v2f64 ||
                           VT == MVT::v8f16;
  if (IsLegalV128)
    return Op;

  // Anything that is not one of the known V64 types is not handled here.
  const bool IsHandledV64 = VT == MVT::v8i8 || VT == MVT::v4i16 ||
                            VT == MVT::v2i32 || VT == MVT::v1i64 ||
                            VT == MVT::v2f32 || VT == MVT::v4f16;
  if (!IsHandledV64)
    return SDValue();

  // V64: widen the source vector and extract from the wide vector instead.
  SDLoc DL(Op);
  SDValue Wide = WidenVector(Op.getOperand(0), DAG);
  EVT EltTy = Wide.getValueType().getVectorElementType();

  // i8 and i16 elements are produced as i32 results.
  const bool IsSmallInt = EltTy == MVT::i16 || EltTy == MVT::i8;
  EVT ResultTy = IsSmallInt ? EVT(MVT::i32) : EltTy;

  // The extracted scalar is the final result; no narrowing is needed.
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResultTy, Wide,
                     Op.getOperand(1));
}
/// Custom lowering for EXTRACT_SUBVECTOR.
///
/// Keeps \p Op as-is when it extracts starting at element 0, or when it
/// extracts a 64-bit result starting at bit offset 64 of the source. Every
/// other case returns an empty SDValue.
SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                      SelectionDAG &DAG) const {
  EVT SrcVT = Op.getOperand(0).getValueType();
  SDLoc dl(Op);

  // Defensive check: only vector sources are considered.
  if (!SrcVT.isVector())
    return SDValue();

  // The start index has to be a constant.
  auto *IdxC = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (IdxC == nullptr)
    return SDValue();

  const unsigned FirstElt = IdxC->getZExtValue();
  const unsigned ResultBits = Op.getValueSizeInBits();

  // Starting at element 0: ISel turns this into an EXTRACT_SUBREG.
  if (FirstElt == 0)
    return Op;

  // Extracting the upper 64 bits of a 128-bit vector is matched directly.
  const bool IsUpper64 =
      ResultBits == 64 && FirstElt * SrcVT.getScalarSizeInBits() == 64;
  if (IsUpper64)
    return Op;

  return SDValue();
}
/// Return true if the shuffle mask \p M is considered legal for type \p VT.
///
/// Four-element 64/128-bit vectors are first looked up in the perfect shuffle
/// table; after that the mask is tested against the individual mask
/// recognisers (splat, REV, EXT, TRN/UZP/ZIP, INS and concat).
bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
  if (VT.getVectorNumElements() == 4 &&
      (VT.is128BitVector() || VT.is64BitVector())) {
    // Build the perfect-shuffle table index as a four digit base-9 number,
    // most significant lane first. Negative (undef) lanes use digit 8.
    unsigned PFTableIndex = 0;
    for (unsigned Lane = 0; Lane != 4; ++Lane) {
      const unsigned Digit = M[Lane] < 0 ? 8 : M[Lane];
      PFTableIndex = PFTableIndex * 9 + Digit;
    }

    // The cost is stored in the bits above bit 29 of the table entry.
    const unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
    const unsigned Cost = PFEntry >> 30;
    if (Cost <= 4)
      return true;
  }

  // Out-parameters of the recognisers below; their values are not used here.
  bool DummyBool;
  int DummyInt;
  unsigned DummyUnsigned;

  if (ShuffleVectorSDNode::isSplatMask(&M[0], VT))
    return true;
  if (isREVMask(M, VT, 64) || isREVMask(M, VT, 32) || isREVMask(M, VT, 16))
    return true;
  if (isEXTMask(M, VT, DummyBool, DummyUnsigned))
    return true;
  // isTBLMask(M, VT) is not checked. FIXME: Port TBL support from ARM.
  if (isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
      isZIPMask(M, VT, DummyUnsigned))
    return true;
  if (isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
      isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
      isZIP_v_undef_Mask(M, VT, DummyUnsigned))
    return true;
  if (isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt))
    return true;
  return isConcatMask(M, VT, VT.getSizeInBits() == 128);
}
/// getVShiftImm - Determine whether \p Op (looking through bitcasts) is a
/// build_vector that can serve as the immediate operand of a vector shift,
/// i.e. a constant splat whose splat size does not exceed \p ElementBits.
/// On success the sign-extended splat value is stored in \p Cnt and true is
/// returned; otherwise \p Cnt is left untouched and false is returned.
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
  // Bitcasts do not matter for this check; strip them.
  while (Op.getOpcode() == ISD::BITCAST) {
    Op = Op.getOperand(0);
  }

  auto *BuildVec = dyn_cast<BuildVectorSDNode>(Op.getNode());
  if (!BuildVec)
    return false;

  APInt SplatValue, SplatUndefMask;
  unsigned SplatWidth;
  bool AnyUndefs;
  if (!BuildVec->isConstantSplat(SplatValue, SplatUndefMask, SplatWidth,
                                 AnyUndefs, ElementBits))
    return false;
  if (SplatWidth > ElementBits)
    return false;

  Cnt = SplatValue.getSExtValue();
  return true;
}
/// isVShiftLImm - Determine whether \p Op is a valid build_vector immediate
/// for a vector shift left. The splat value, returned in \p Cnt, must satisfy:
///   0 <= Value < ElementBits   for a regular left shift; or
///   0 <= Value <= ElementBits  for a long left shift (\p isLong).
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");
  int64_t ElementBits = VT.getScalarSizeInBits();
  if (!getVShiftImm(Op, ElementBits, Cnt))
    return false;

  if (Cnt < 0)
    return false;
  // A long shift admits one extra value at the top of the range.
  if (isLong)
    return Cnt - 1 < ElementBits;
  return Cnt < ElementBits;
}
/// isVShiftRImm - Determine whether \p Op is a valid build_vector immediate
/// for a vector shift right. The splat value, returned in \p Cnt, must
/// satisfy:
///   1 <= Value <= ElementBits      for a regular right shift; or
///   1 <= Value <= ElementBits / 2  for a narrowing right shift (\p isNarrow).
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");
  int64_t ElementBits = VT.getScalarSizeInBits();
  if (!getVShiftImm(Op, ElementBits, Cnt))
    return false;

  if (Cnt < 1)
    return false;
  const int64_t MaxShift = isNarrow ? ElementBits / 2 : ElementBits;
  return Cnt <= MaxShift;
}
/// Custom lowering for vector SHL / SRA / SRL.
///
/// Shifts by a suitable constant splat become the immediate-form nodes
/// (VSHL / VASHR / VLSHR). All other vector shifts are expressed through the
/// NEON ushl / sshl intrinsics; right shifts negate the shift amount first.
/// If the shift amount operand is not a vector, \p Op is returned unchanged.
SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
                                                      SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  int64_t Cnt;

  if (!Op.getOperand(1).getValueType().isVector())
    return Op;

  unsigned EltSize = VT.getScalarSizeInBits();
  const unsigned Opcode = Op.getOpcode();

  if (Opcode == ISD::SHL) {
    // Left shift by immediate.
    if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
      return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
                         DAG.getConstant(Cnt, DL, MVT::i32));
    // Left shift by register.
    return DAG.getNode(
        ISD::INTRINSIC_WO_CHAIN, DL, VT,
        DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL, MVT::i32),
        Op.getOperand(0), Op.getOperand(1));
  }

  if (Opcode == ISD::SRA || Opcode == ISD::SRL) {
    const bool IsArithmetic = Opcode == ISD::SRA;

    // Right shift by immediate.
    if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
      unsigned ImmOpc = IsArithmetic ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
      return DAG.getNode(ImmOpc, DL, VT, Op.getOperand(0),
                         DAG.getConstant(Cnt, DL, MVT::i32));
    }

    // Right shift by register. There is no shift-right-by-register
    // instruction, but the shift-left-by-register instruction takes a signed
    // amount where negative values mean a right shift, so negate the amount
    // and use that.
    unsigned IntrinsicID = IsArithmetic ? Intrinsic::aarch64_neon_sshl
                                        : Intrinsic::aarch64_neon_ushl;
    SDValue NegatedAmt =
        DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1));
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                       DAG.getConstant(IntrinsicID, DL, MVT::i32),
                       Op.getOperand(0), NegatedAmt);
  }

  llvm_unreachable("unexpected shift opcode");
}
/// Emit a single AArch64 vector comparison node for \p LHS and \p RHS under
/// condition \p CC, producing a value of type \p VT (which must have the same
/// total size as the operand type).
///
/// When \p RHS is a build_vector that resolves to all-zero bits, the
/// compare-against-zero node variants are used where one exists. "Less than"
/// style conditions are emitted as the mirrored "greater than" node with the
/// operands swapped. Returns an empty SDValue for conditions that cannot be
/// expressed with one comparison (plus an optional NOT for NE).
static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
                                    AArch64CC::CondCode CC, bool NoNans, EVT VT,
                                    const SDLoc &dl, SelectionDAG &DAG) {
  EVT SrcVT = LHS.getValueType();
  assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
         "function only supposed to emit natural comparisons");

  // Find out whether RHS is a constant vector of zeros.
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
  APInt CnstBits(VT.getSizeInBits(), 0);
  APInt UndefBits(VT.getSizeInBits(), 0);
  const bool IsZero =
      BVN && resolveBuildVector(BVN, CnstBits, UndefBits) && (CnstBits == 0);

  // Small builders shared by all the cases below.
  auto AgainstZero = [&](unsigned Opc) {
    return DAG.getNode(Opc, dl, VT, LHS);
  };
  auto Compare = [&](unsigned Opc, SDValue A, SDValue B) {
    return DAG.getNode(Opc, dl, VT, A, B);
  };

  if (SrcVT.getVectorElementType().isFloatingPoint()) {
    switch (CC) {
    case AArch64CC::NE: {
      // NE is NOT(EQ).
      SDValue Equal = IsZero ? AgainstZero(AArch64ISD::FCMEQz)
                             : Compare(AArch64ISD::FCMEQ, LHS, RHS);
      return DAG.getNode(AArch64ISD::NOT, dl, VT, Equal);
    }
    case AArch64CC::EQ:
      return IsZero ? AgainstZero(AArch64ISD::FCMEQz)
                    : Compare(AArch64ISD::FCMEQ, LHS, RHS);
    case AArch64CC::GE:
      return IsZero ? AgainstZero(AArch64ISD::FCMGEz)
                    : Compare(AArch64ISD::FCMGE, LHS, RHS);
    case AArch64CC::GT:
      return IsZero ? AgainstZero(AArch64ISD::FCMGTz)
                    : Compare(AArch64ISD::FCMGT, LHS, RHS);
    case AArch64CC::LS:
      return IsZero ? AgainstZero(AArch64ISD::FCMLEz)
                    : Compare(AArch64ISD::FCMGE, RHS, LHS);
    case AArch64CC::LT:
      // LT is only usable when NaNs can be ignored, in which case it shares
      // the MI implementation.
      if (!NoNans)
        return SDValue();
      LLVM_FALLTHROUGH;
    case AArch64CC::MI:
      return IsZero ? AgainstZero(AArch64ISD::FCMLTz)
                    : Compare(AArch64ISD::FCMGT, RHS, LHS);
    default:
      return SDValue();
    }
  }

  // Integer element types.
  switch (CC) {
  case AArch64CC::NE: {
    // NE is NOT(EQ).
    SDValue Equal = IsZero ? AgainstZero(AArch64ISD::CMEQz)
                           : Compare(AArch64ISD::CMEQ, LHS, RHS);
    return DAG.getNode(AArch64ISD::NOT, dl, VT, Equal);
  }
  case AArch64CC::EQ:
    return IsZero ? AgainstZero(AArch64ISD::CMEQz)
                  : Compare(AArch64ISD::CMEQ, LHS, RHS);
  case AArch64CC::GE:
    return IsZero ? AgainstZero(AArch64ISD::CMGEz)
                  : Compare(AArch64ISD::CMGE, LHS, RHS);
  case AArch64CC::GT:
    return IsZero ? AgainstZero(AArch64ISD::CMGTz)
                  : Compare(AArch64ISD::CMGT, LHS, RHS);
  case AArch64CC::LE:
    return IsZero ? AgainstZero(AArch64ISD::CMLEz)
                  : Compare(AArch64ISD::CMGE, RHS, LHS);
  case AArch64CC::LT:
    return IsZero ? AgainstZero(AArch64ISD::CMLTz)
                  : Compare(AArch64ISD::CMGT, RHS, LHS);
  // The unsigned conditions have no compare-against-zero form here.
  case AArch64CC::LS:
    return Compare(AArch64ISD::CMHS, RHS, LHS);
  case AArch64CC::LO:
    return Compare(AArch64ISD::CMHI, RHS, LHS);
  case AArch64CC::HI:
    return Compare(AArch64ISD::CMHI, LHS, RHS);
  case AArch64CC::HS:
    return Compare(AArch64ISD::CMHS, LHS, RHS);
  default:
    return SDValue();
  }
}
/// Custom lowering for a vector SETCC.
///
/// Integer element types are compared with a single EmitVectorComparison call
/// and the result is sign-extended or truncated to the type of \p Op.
/// Floating-point compares are mapped onto one or two AArch64 condition codes
/// whose results are OR'ed together and, if required, inverted. Returns an
/// empty SDValue when a needed comparison cannot be emitted, or for f16
/// vectors other than four elements when full FP16 is not available.
SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
SelectionDAG &DAG) const {
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
// The comparison itself is produced in the integer vector type that has the
// same shape as the operands.
EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
SDLoc dl(Op);
// Integer compares: one comparison node, then adjust to the result type.
if (LHS.getValueType().getVectorElementType().isInteger()) {
assert(LHS.getValueType() == RHS.getValueType());
AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
SDValue Cmp =
EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
}
const bool FullFP16 =
static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
// Make v4f16 (only) fcmp operations utilise vector instructions
// v8f16 support will be a little more complicated
if (!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) {
if (LHS.getValueType().getVectorNumElements() == 4) {
// Extend both operands to v4f32 and compare in v4i32 instead.
LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
// NOTE(review): all uses of Op are replaced with NewSetcc here, yet the
// lowering below still continues with the extended operands and returns its
// own result - confirm this interaction is intended.
SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
DAG.ReplaceAllUsesWith(Op, NewSetcc);
CmpVT = MVT::v4i32;
} else
return SDValue();
}
assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
LHS.getValueType().getVectorElementType() != MVT::f128);
// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
// clean. Some of them require two branches to implement.
AArch64CC::CondCode CC1, CC2;
bool ShouldInvert;
changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
// First (always present) comparison.
SDValue Cmp =
EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
if (!Cmp.getNode())
return SDValue();
// A second comparison is only emitted when CC2 is not AL; its result is
// OR'ed into the first.
if (CC2 != AArch64CC::AL) {
SDValue Cmp2 =
EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
if (!Cmp2.getNode())
return SDValue();
Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
}
// Bring the result to the type of Op before applying the optional inversion.
Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
if (ShouldInvert)
Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
return Cmp;
}
/// Build the across-vector reduction node \p Op over the vector operand of
/// \p ScalarOp (result in the operand's vector type) and extract lane 0 of it
/// as a value of \p ScalarOp's type.
static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
                                  SelectionDAG &DAG) {
  SDValue Vec = ScalarOp.getOperand(0);
  SDValue Reduced = DAG.getNode(Op, DL, Vec.getSimpleValueType(), Vec);
  SDValue LaneZero = DAG.getConstant(0, DL, MVT::i64);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(),
                     Reduced, LaneZero);
}
/// Custom lowering for the VECREDUCE_* nodes.
///
/// Integer add/min/max reductions map to the corresponding AArch64 across-
/// vector node via getReductionSDNode. FMAX / FMIN map to the fmaxnmv /
/// fminnmv intrinsics and assert that the no-NaNs flag is present.
SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc dl(Op);

  unsigned IntReduceOpc = 0;
  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("Unhandled reduction");

  // Floating-point reductions are emitted directly as intrinsic calls.
  case ISD::VECREDUCE_FMAX: {
    assert(Op->getFlags().hasNoNaNs() && "fmax vector reduction needs NoNaN flag");
    SDValue IntrinsicID =
        DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32);
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
                       IntrinsicID, Op.getOperand(0));
  }
  case ISD::VECREDUCE_FMIN: {
    assert(Op->getFlags().hasNoNaNs() && "fmin vector reduction needs NoNaN flag");
    SDValue IntrinsicID =
        DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32);
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
                       IntrinsicID, Op.getOperand(0));
  }

  // Integer reductions only differ in the node opcode.
  case ISD::VECREDUCE_ADD:
    IntReduceOpc = AArch64ISD::UADDV;
    break;
  case ISD::VECREDUCE_SMAX:
    IntReduceOpc = AArch64ISD::SMAXV;
    break;
  case ISD::VECREDUCE_SMIN:
    IntReduceOpc = AArch64ISD::SMINV;
    break;
  case ISD::VECREDUCE_UMAX:
    IntReduceOpc = AArch64ISD::UMAXV;
    break;
  case ISD::VECREDUCE_UMIN:
    IntReduceOpc = AArch64ISD::UMINV;
    break;
  }

  return getReductionSDNode(IntReduceOpc, dl, Op, DAG);
}
/// Custom lowering for ATOMIC_LOAD_SUB when LSE is available: rewrite it as an
/// ATOMIC_LOAD_ADD of the negated operand. Returns an empty SDValue without
/// LSE.
SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op,
                                                    SelectionDAG &DAG) const {
  const auto &ST = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
  if (!ST.hasLSE())
    return SDValue();

  // LSE has an atomic load-add instruction, but not a load-sub, so compute
  // (0 - value) and add that instead.
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  AtomicSDNode *Atomic = cast<AtomicSDNode>(Op.getNode());

  SDValue Zero = DAG.getConstant(0, dl, VT);
  SDValue Negated = DAG.getNode(ISD::SUB, dl, VT, Zero, Op.getOperand(2));

  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, Atomic->getMemoryVT(),
                       Op.getOperand(0), Op.getOperand(1), Negated,
                       Atomic->getMemOperand());
}
/// Custom lowering for ATOMIC_LOAD_AND when LSE is available: rewrite it as an
/// ATOMIC_LOAD_CLR of the bitwise-inverted operand. Returns an empty SDValue
/// without LSE.
SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
                                                    SelectionDAG &DAG) const {
  const auto &ST = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
  if (!ST.hasLSE())
    return SDValue();

  // LSE has an atomic load-clear instruction, but not a load-and, so clear
  // the bits of (value XOR all-ones) instead.
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  AtomicSDNode *Atomic = cast<AtomicSDNode>(Op.getNode());

  SDValue AllOnes = DAG.getConstant(-1ULL, dl, VT);
  SDValue Inverted = DAG.getNode(ISD::XOR, dl, VT, AllOnes, Op.getOperand(2));

  return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, Atomic->getMemoryVT(),
                       Op.getOperand(0), Op.getOperand(1), Inverted,
                       Atomic->getMemOperand());
}
/// Emit the call to "__chkstk" for a dynamic stack allocation.
///
/// \p Size is shifted right by 4 and copied into X15 before the call, and is
/// shifted left by 4 again afterwards (so the caller sees the updated value
/// through the reference). Returns the chain result of the call node.
SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
    SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue ProbeFn = DAG.getTargetExternalSymbol("__chkstk", PtrVT, 0);

  // Registers preserved across the stack probe call.
  const AArch64RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
  const uint32_t *PreservedMask = RegInfo->getWindowsStackProbePreservedMask();
  if (Subtarget->hasCustomCallingConv())
    RegInfo->UpdateCustomCallPreservedMask(DAG.getMachineFunction(),
                                           &PreservedMask);

  // The probe receives Size >> 4 in X15.
  Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
                     DAG.getConstant(4, dl, MVT::i64));
  Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());

  SDVTList CallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  Chain = DAG.getNode(AArch64ISD::CALL, dl, CallVTs, Chain, ProbeFn,
                      DAG.getRegister(AArch64::X15, MVT::i64),
                      DAG.getRegisterMask(PreservedMask), Chain.getValue(1));

  // To match the actual intent better, we should read the output from X15 here
  // again (instead of potentially spilling it to the stack), but rereading Size
  // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
  // here.
  Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
                     DAG.getConstant(4, dl, MVT::i64));
  return Chain;
}
/// Custom lowering for DYNAMIC_STACKALLOC (asserted to be a Windows target).
///
/// Produces the merged pair {new SP value, chain}. When the function carries
/// the "no-stack-arg-probe" attribute only the SP adjustment is emitted;
/// otherwise the adjustment is preceded by the stack probe call and wrapped in
/// a CALLSEQ_START / CALLSEQ_END pair.
SDValue
AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(Subtarget->isTargetWindows() &&
         "Only Windows alloca probing supported");
  SDLoc dl(Op);

  // Operands: chain, allocation size, alignment.
  SDValue Chain = Op.getOperand(0);
  SDValue Size = Op.getOperand(1);
  unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
  EVT VT = Op.getNode()->getValueType(0);

  // Emits SP = SP - Size, masked down to the requested alignment when Align
  // is non-zero, writes it back to SP, and threads Chain through the register
  // copies. Size is read at call time, so it observes any earlier update.
  auto EmitSPAdjust = [&]() -> SDValue {
    SDValue NewSP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
    Chain = NewSP.getValue(1);
    NewSP = DAG.getNode(ISD::SUB, dl, MVT::i64, NewSP, Size);
    if (Align)
      NewSP = DAG.getNode(ISD::AND, dl, VT, NewSP.getValue(0),
                          DAG.getConstant(-(uint64_t)Align, dl, VT));
    Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, NewSP);
    return NewSP;
  };

  const bool SkipProbe =
      DAG.getMachineFunction().getFunction().hasFnAttribute(
          "no-stack-arg-probe");
  if (SkipProbe) {
    SDValue SP = EmitSPAdjust();
    SDValue Ops[2] = {SP, Chain};
    return DAG.getMergeValues(Ops, dl);
  }

  // Probed path: call sequence start, probe call, SP adjustment, sequence end.
  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
  Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG);
  SDValue SP = EmitSPAdjust();
  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
                             DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);

  SDValue Ops[2] = {SP, Chain};
  return DAG.getMergeValues(Ops, dl);
}
/// getTgtMemIntrinsic - Describe the memory behaviour of the NEON structured
/// load/store intrinsics and of the exclusive load/store intrinsics by filling
/// in \p Info, so they can be represented as MemIntrinsicNodes.
/// Returns true if \p Intrinsic is one of the handled intrinsics (and \p Info
/// has been populated), false otherwise.
bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                               const CallInst &I,
                                               MachineFunction &MF,
                                               unsigned Intrinsic) const {
  auto &DL = I.getModule()->getDataLayout();

  switch (Intrinsic) {
  default:
    return false;

  // NEON structured loads: the pointer is the last argument.
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_ld4:
  case Intrinsic::aarch64_neon_ld1x2:
  case Intrinsic::aarch64_neon_ld1x3:
  case Intrinsic::aarch64_neon_ld1x4:
  case Intrinsic::aarch64_neon_ld2lane:
  case Intrinsic::aarch64_neon_ld3lane:
  case Intrinsic::aarch64_neon_ld4lane:
  case Intrinsic::aarch64_neon_ld2r:
  case Intrinsic::aarch64_neon_ld3r:
  case Intrinsic::aarch64_neon_ld4r: {
    // Conservatively treat the access as covering every vector that is
    // returned, expressed as a vector of i64.
    uint64_t NumI64Elts = DL.getTypeSizeInBits(I.getType()) / 64;
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT =
        EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumI64Elts);
    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
    Info.offset = 0;
    Info.align = 0;
    // volatile loads with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOLoad;
    return true;
  }

  // NEON structured stores: leading vector arguments, pointer last.
  case Intrinsic::aarch64_neon_st2:
  case Intrinsic::aarch64_neon_st3:
  case Intrinsic::aarch64_neon_st4:
  case Intrinsic::aarch64_neon_st1x2:
  case Intrinsic::aarch64_neon_st1x3:
  case Intrinsic::aarch64_neon_st1x4:
  case Intrinsic::aarch64_neon_st2lane:
  case Intrinsic::aarch64_neon_st3lane:
  case Intrinsic::aarch64_neon_st4lane: {
    // Conservatively treat the access as covering every stored vector: sum
    // the sizes of the leading vector arguments, stopping at the first
    // non-vector one.
    unsigned NumI64Elts = 0;
    const unsigned NumArgs = I.getNumArgOperands();
    for (unsigned Idx = 0; Idx < NumArgs; ++Idx) {
      Type *ArgTy = I.getArgOperand(Idx)->getType();
      if (!ArgTy->isVectorTy())
        break;
      NumI64Elts += DL.getTypeSizeInBits(ArgTy) / 64;
    }
    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT =
        EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumI64Elts);
    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
    Info.offset = 0;
    Info.align = 0;
    // volatile stores with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOStore;
    return true;
  }

  // Exclusive loads: pointer is argument 0, memory type is its pointee.
  case Intrinsic::aarch64_ldaxr:
  case Intrinsic::aarch64_ldxr: {
    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
    Type *PointeeTy = PtrTy->getElementType();
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(PointeeTy);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = DL.getABITypeAlignment(PointeeTy);
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
    return true;
  }

  // Exclusive stores: pointer is argument 1, memory type is its pointee.
  case Intrinsic::aarch64_stlxr:
  case Intrinsic::aarch64_stxr: {
    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
    Type *PointeeTy = PtrTy->getElementType();
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(PointeeTy);
    Info.ptrVal = I.getArgOperand(1);
    Info.offset = 0;
    Info.align = DL.getABITypeAlignment(PointeeTy);
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
    return true;
  }

  // Exclusive pair loads: an i128 access through argument 0.
  case Intrinsic::aarch64_ldaxp:
  case Intrinsic::aarch64_ldxp:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i128;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = 16;
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
    return true;

  // Exclusive pair stores: an i128 access through argument 2.
  case Intrinsic::aarch64_stlxp:
  case Intrinsic::aarch64_stxp:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i128;
    Info.ptrVal = I.getArgOperand(2);
    Info.offset = 0;
    Info.align = 16;
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
    return true;
  }
}
/// Decide whether narrowing \p Load to \p NewVT is worthwhile.
///
/// Defers to the base implementation first, always allows narrowing of
/// extending loads, and refuses only when the address is (add x, (shl y, C))
/// with a single-use shift whose amount equals log2 of the current load size
/// in bytes.
bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
                                                  ISD::LoadExtType ExtTy,
                                                  EVT NewVT) const {
  // TODO: This may be worth removing. Check regression tests for diffs.
  if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
    return false;

  // Narrowing that saves a separate extension instruction is probably a good
  // idea.
  if (ExtTy != ISD::NON_EXTLOAD)
    return true;

  MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
  assert(Mem);

  // Look for an address of the form (add base, (shl index, constant)).
  const SDValue &Base = Mem->getBasePtr();
  if (Base.getOpcode() != ISD::ADD)
    return true;
  const SDValue &Scaled = Base.getOperand(1);
  if (Scaled.getOpcode() != ISD::SHL || !Scaled.hasOneUse() ||
      Scaled.getOperand(1).getOpcode() != ISD::Constant)
    return true;

  // The shift can be combined into the offset only if it matches the size of
  // the value being loaded; narrowing the load would break that match, so do
  // not narrow in that case.
  uint64_t ShiftAmount = Scaled.getConstantOperandVal(1);
  uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits() / 8;
  return ShiftAmount != Log2_32(LoadBytes);
}
// Truncation from a 64-bit GPR to a 32-bit GPR is free. This overload reports
// any integer-to-narrower-integer truncation as free.
bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
  const bool BothInteger = Ty1->isIntegerTy() && Ty2->isIntegerTy();
  if (!BothInteger)
    return false;
  unsigned SrcBits = Ty1->getPrimitiveSizeInBits();
  unsigned DstBits = Ty2->getPrimitiveSizeInBits();
  return DstBits < SrcBits;
}
/// EVT overload: truncation is free between scalar integer types when the
/// destination \p VT2 is strictly narrower than the source \p VT1.
bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  if (VT1.isVector() || VT2.isVector())
    return false;
  if (!VT1.isInteger() || !VT2.isInteger())
    return false;
  unsigned SrcBits = VT1.getSizeInBits();
  unsigned DstBits = VT2.getSizeInBits();
  return DstBits < SrcBits;
}
/// Check whether it is profitable to hoist instruction \p I from a then/else
/// block into the if block.
/// Hoisting is reported as not profitable when \p I is a single-use FMul whose
/// user is an FAdd/FSub and the pair can be fused into an FMA, because
/// FMSUB/FMADD is preferred.
bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
  // Only a single-use FMul can be part of a fusable pair.
  if (I->getOpcode() != Instruction::FMul || !I->hasOneUse())
    return true;

  Instruction *User = I->user_back();
  if (User) {
    const unsigned UserOpc = User->getOpcode();
    if (UserOpc != Instruction::FSub && UserOpc != Instruction::FAdd)
      return true;
  }

  const TargetOptions &Options = getTargetMachine().Options;
  const DataLayout &DL = I->getModule()->getDataLayout();
  EVT VT = getValueType(DL, User->getOperand(0)->getType());

  // Fusion needs a fast, legal-or-custom FMA for this type ...
  if (!isFMAFasterThanFMulAndFAdd(VT))
    return true;
  if (!isOperationLegalOrCustom(ISD::FMA, VT))
    return true;

  // ... and options that permit fusing in the first place.
  const bool FusionAllowed =
      Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath;
  return !FusionAllowed;
}
// Every 32-bit GPR operation implicitly zeroes the high half of the
// corresponding 64-bit GPR, so an integer zero-extension from 32 to 64 bits
// is free.
bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
  const bool BothInteger = Ty1->isIntegerTy() && Ty2->isIntegerTy();
  if (!BothInteger)
    return false;
  unsigned SrcBits = Ty1->getPrimitiveSizeInBits();
  unsigned DstBits = Ty2->getPrimitiveSizeInBits();
  return SrcBits == 32 && DstBits == 64;
}
/// EVT overload: zero-extension is free only from a 32-bit scalar integer to a
/// 64-bit scalar integer.
bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
  if (VT1.isVector() || VT2.isVector())
    return false;
  if (!VT1.isInteger() || !VT2.isInteger())
    return false;
  unsigned SrcBits = VT1.getSizeInBits();
  unsigned DstBits = VT2.getSizeInBits();
  return SrcBits == 32 && DstBits == 64;
}
/// SDValue overload: in addition to the type-based rule, zero-extending the
/// result of a LOAD is free when both types are simple scalar integers and
/// the loaded type is at most 32 bits wide.
bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  EVT VT1 = Val.getValueType();
  if (isZExtFree(VT1, VT2))
    return true;

  if (Val.getOpcode() != ISD::LOAD)
    return false;

  // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
  auto IsSimpleScalarInt = [](EVT Ty) {
    return Ty.isSimple() && !Ty.isVector() && Ty.isInteger();
  };
  return IsSimpleScalarInt(VT1) && IsSimpleScalarInt(VT2) &&
         VT1.getSizeInBits() <= 32;
}
/// Return true if the scalar integer extension \p Ext is free, i.e. every one
/// of its uses can absorb it: a shift left by a constant, a GEP index whose
/// scaling corresponds to a shift amount between 1 and 4, or a truncation back
/// to the original type.
bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
  // FP extensions and vector extensions are never free.
  if (isa<FPExtInst>(Ext) || Ext->getType()->isVectorTy())
    return false;

  for (const Use &U : Ext->uses()) {
    // The extension is free if we can fold it with a left shift in an
    // addressing mode or an arithmetic operation: add, sub, and cmp.
    const Instruction *UserInst = cast<Instruction>(U.getUser());
    const unsigned UserOpc = UserInst->getOpcode();

    if (UserOpc == Instruction::Shl) {
      // Only a shift by a constant amount qualifies.
      if (!isa<ConstantInt>(UserInst->getOperand(1)))
        return false;
      // The bfm family can be used, so the extension is free for this use.
      continue;
    }

    if (UserOpc == Instruction::GetElementPtr) {
      gep_type_iterator GTI = gep_type_begin(UserInst);
      auto &DL = Ext->getModule()->getDataLayout();
      std::advance(GTI, U.getOperandNo()-1);
      Type *IdxTy = GTI.getIndexedType();
      // This extension will end up with a shift because of the scaling
      // factor. 8-bit sized types have a scaling factor of 1, thus a shift
      // amount of 0. The shift amount derived from the scaling factor is
      // log2(sizeof(IdxTy)) - log2(8).
      uint64_t ShiftAmt =
          countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy)) - 3;
      // The addressing mode can only fold shift amounts between 1 and 4
      // inclusive.
      if (ShiftAmt == 0 || ShiftAmt > 4)
        return false;
      // The bfm family can be used, so the extension is free for this use.
      continue;
    }

    // trunc(sext ty1 to ty2) to ty1 is a no-op, so this use does not count
    // against the extension.
    if (UserOpc == Instruction::Trunc &&
        UserInst->getType() == Ext->getOperand(0)->getType())
      continue;

    // Any other user means the extension is not free.
    return false;
  }
  return true;
}
/// Check whether \p Op1 and \p Op2 are both shufflevectors (second input
/// undef, constant mask) that extract the same half - either the lower or the
/// upper one - of the elements of their respective first input.
static bool areExtractShuffleVectors(Value *Op1, Value *Op2) {
  // True when Half has half the bit width and half the element count of Full.
  auto IsHalfOf = [](Value *Full, Value *Half) {
    auto *FullTy = cast<VectorType>(Full->getType());
    auto *HalfTy = cast<VectorType>(Half->getType());
    return FullTy->getBitWidth() == 2 * HalfTy->getBitWidth() &&
           FullTy->getNumElements() == 2 * HalfTy->getNumElements();
  };

  Constant *Mask1, *Mask2;
  Value *Src1, *Src2;
  if (!match(Op1, m_ShuffleVector(m_Value(Src1), m_Undef(), m_Constant(Mask1))))
    return false;
  if (!match(Op2, m_ShuffleVector(m_Value(Src2), m_Undef(), m_Constant(Mask2))))
    return false;

  // Each shuffle result must be half as wide as its input and hold half of
  // the input's elements.
  if (!IsHalfOf(Src1, Op1) || !IsHalfOf(Src2, Op2))
    return false;

  // Both masks must extract a sub-vector, starting at the same element, and
  // that start must be either the lower or the upper half.
  int Start1 = -1;
  int Start2 = -1;
  int NumElements = cast<VectorType>(Op1->getType())->getNumElements() * 2;
  if (!ShuffleVectorInst::isExtractSubvectorMask(Mask1, NumElements, Start1))
    return false;
  if (!ShuffleVectorInst::isExtractSubvectorMask(Mask2, NumElements, Start2))
    return false;
  if (Start1 != Start2)
    return false;
  return Start1 == 0 || Start2 == (NumElements / 2);
}
/// Check whether \p Ext1 and \p Ext2 are both zero- or sign-extensions that
/// each double the scalar bit width of their operand.
static bool areExtractExts(Value *Ext1, Value *Ext2) {
  // True when the result's scalar width is twice that of the operand.
  auto DoublesWidth = [](Instruction *Ext) {
    const auto ResultBits = Ext->getType()->getScalarSizeInBits();
    const auto SourceBits = Ext->getOperand(0)->getType()->getScalarSizeInBits();
    return ResultBits == 2 * SourceBits;
  };

  if (!match(Ext1, m_ZExtOrSExt(m_Value())))
    return false;
  if (!match(Ext2, m_ZExtOrSExt(m_Value())))
    return false;
  return DoublesWidth(cast<Instruction>(Ext1)) &&
         DoublesWidth(cast<Instruction>(Ext2));
}
/// Check whether sinking the operands of \p I into I's basic block is
/// profitable because they can be folded into a target instruction, e.g.
/// shufflevector extracts and/or sext/zext can be folded into (u,s)subl(2).
/// The uses that should be sunk are appended to \p Ops; returns true if any
/// sinking is recommended.
bool AArch64TargetLowering::shouldSinkOperands(
    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
  if (!I->getType()->isVectorTy())
    return false;

  // Intrinsic calls: only umull with two half-extracting shuffles qualifies.
  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
    if (II->getIntrinsicID() != Intrinsic::aarch64_neon_umull)
      return false;
    if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
      return false;
    Ops.push_back(&II->getOperandUse(0));
    Ops.push_back(&II->getOperandUse(1));
    return true;
  }

  // Otherwise only add/sub of two width-doubling extensions qualifies.
  const unsigned Opcode = I->getOpcode();
  if (Opcode != Instruction::Sub && Opcode != Instruction::Add)
    return false;
  if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
    return false;

  // If the exts' operands extract either the lower or upper elements, we
  // can sink them too.
  auto *FirstExt = cast<Instruction>(I->getOperand(0));
  auto *SecondExt = cast<Instruction>(I->getOperand(1));
  if (areExtractShuffleVectors(FirstExt, SecondExt)) {
    Ops.push_back(&FirstExt->getOperandUse(0));
    Ops.push_back(&SecondExt->getOperandUse(0));
  }
  Ops.push_back(&I->getOperandUse(0));
  Ops.push_back(&I->getOperandUse(1));
  return true;
}
/// Return true if a paired load exists for \p LoadedType, which is the case
/// for simple integer or floating-point types of 32 or 64 bits. Once the type
/// passes the simple int/FP check, \p RequiredAligment is set to 0.
bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
                                          unsigned &RequiredAligment) const {
  if (!LoadedType.isSimple())
    return false;
  if (!LoadedType.isInteger() && !LoadedType.isFloatingPoint())
    return false;

  // Cyclone supports unaligned accesses.
  RequiredAligment = 0;

  switch (LoadedType.getSizeInBits()) {
  case 32:
  case 64:
    return true;
  default:
    return false;
  }
}
/// Helper that determines how many interleaved accesses will be generated
/// when lowering accesses of type \p VecTy: the vector size in bits divided
/// by 128, rounded up.
unsigned
AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
                                                 const DataLayout &DL) const {
  const auto VecBits = DL.getTypeSizeInBits(VecTy);
  return (VecBits + 127) / 128;
}
/// Return the target-specific memory operand flags for \p I: MOStridedAccess
/// on Falkor when the instruction carries the FALKOR_STRIDED_ACCESS_MD
/// metadata, MONone otherwise.
MachineMemOperand::Flags
AArch64TargetLowering::getMMOFlags(const Instruction &I) const {
  const bool IsFalkor =
      Subtarget->getProcFamily() == AArch64Subtarget::Falkor;
  if (IsFalkor && I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr)
    return MOStridedAccess;
  return MachineMemOperand::MONone;
}
/// Return true if \p VecTy can be used for an interleaved access: it needs at
/// least two elements, an element size of 8, 16, 32 or 64 bits, and a total
/// size of 64 bits or a multiple of 128 bits.
bool AArch64TargetLowering::isLegalInterleavedAccessType(
    VectorType *VecTy, const DataLayout &DL) const {
  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
  unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());

  // A single-element vector is not interleavable.
  if (VecTy->getNumElements() < 2)
    return false;

  // The element type has to be a legal size.
  const bool IsLegalEltSize =
      ElSize == 8 || ElSize == 16 || ElSize == 32 || ElSize == 64;
  if (!IsLegalEltSize)
    return false;

  // The total size must be 64 bits or a multiple of 128 bits. Types larger
  // than 128 bits will be split into multiple interleaved accesses.
  if (VecSize == 64)
    return true;
  return VecSize % 128 == 0;
}
/// Lower an interleaved load into a ldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
///
/// Into:
/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
/// %vec0 = extractvalue { <4 x i32>, <4 x i32> } %ld2, 0
/// %vec1 = extractvalue { <4 x i32>, <4 x i32> } %ld2, 1
///
/// \p Shuffles and \p Indices are parallel arrays: Indices[i] is the field of
/// the ldN result that replaces Shuffles[i]. \p Factor selects ld2/ld3/ld4.
///
/// Returns false before emitting anything when NEON is unavailable or the
/// shuffle result type is not a legal interleaved access type. Otherwise the
/// ldN call(s) are emitted through an IRBuilder positioned at \p LI, every
/// shufflevector in \p Shuffles has its uses replaced, and true is returned.
/// This function itself does not erase \p LI or the shufflevectors.
bool AArch64TargetLowering::lowerInterleavedLoad(
LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
assert(!Shuffles.empty() && "Empty shufflevector input");
assert(Shuffles.size() == Indices.size() &&
"Unmatched number of shufflevectors and indices");
const DataLayout &DL = LI->getModule()->getDataLayout();
// All shuffles are assumed to produce the same type; the first one is used
// as the representative per-lane vector type.
VectorType *VecTy = Shuffles[0]->getType();
// Skip if we do not have NEON and skip illegal vector types. We can
// "legalize" wide vector types into multiple interleaved accesses as long as
// the vector types are divisible by 128.
if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL))
return false;
unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
// A pointer vector can not be the return type of the ldN intrinsics. Need to
// load integer vectors first and then convert to pointer vectors.
Type *EltTy = VecTy->getVectorElementType();
if (EltTy->isPointerTy())
VecTy =
VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
IRBuilder<> Builder(LI);
// The base address of the load.
Value *BaseAddr = LI->getPointerOperand();
if (NumLoads > 1) {
// If we're going to generate more than one load, reset the sub-vector type
// to something legal.
VecTy = VectorType::get(VecTy->getVectorElementType(),
VecTy->getVectorNumElements() / NumLoads);
// We will compute the pointer operand of each load from the original base
// address using GEPs. Cast the base address to a pointer to the scalar
// element type.
BaseAddr = Builder.CreateBitCast(
BaseAddr, VecTy->getVectorElementType()->getPointerTo(
LI->getPointerAddressSpace()));
}
// The intrinsic is overloaded on the (possibly narrowed) sub-vector type and
// on a pointer to that type in the load's address space.
Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace());
Type *Tys[2] = {VecTy, PtrTy};
// Indexed by Factor - 2; the assert above bounds Factor from below by 2.
static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
Intrinsic::aarch64_neon_ld3,
Intrinsic::aarch64_neon_ld4};
Function *LdNFunc =
Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
// Holds sub-vectors extracted from the load intrinsic return values. The
// sub-vectors are associated with the shufflevector instructions they will
// replace.
DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
// If we're generating more than one load, compute the base address of
// subsequent loads as an offset from the previous.
if (LoadCount > 0)
BaseAddr =
Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr,
VecTy->getVectorNumElements() * Factor);
CallInst *LdN = Builder.CreateCall(
LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN");
// Extract and store the sub-vectors returned by the load intrinsic.
for (unsigned i = 0; i < Shuffles.size(); i++) {
ShuffleVectorInst *SVI = Shuffles[i];
unsigned Index = Indices[i];
Value *SubVec = Builder.CreateExtractValue(LdN, Index);
// Convert the integer vector to pointer vector if the element is pointer.
if (EltTy->isPointerTy())
SubVec = Builder.CreateIntToPtr(
SubVec, VectorType::get(SVI->getType()->getVectorElementType(),
VecTy->getVectorNumElements()));
SubVecs[SVI].push_back(SubVec);
}
}
// Replace uses of the shufflevector instructions with the sub-vectors
// returned by the load intrinsic. If a shufflevector instruction is
// associated with more than one sub-vector, those sub-vectors will be
// concatenated into a single wide vector.
for (ShuffleVectorInst *SVI : Shuffles) {
auto &SubVec = SubVecs[SVI];
auto *WideVec =
SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
SVI->replaceAllUsesWith(WideVec);
}
return true;
}
/// Lower an interleaved store into a stN intrinsic.
///
/// E.g. Lower an interleaved store (Factor = 3):
///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
///                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr
///
///      Into:
///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> %v1, <0, 1, 2, 3>
///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> %v1, <4, 5, 6, 7>
///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> %v1, <8, 9, 10, 11>
///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
///
/// Note that the new shufflevectors will be removed and we'll only generate one
/// st3 instruction in CodeGen.
///
/// Example for a more general valid mask (Factor 3). Lower:
///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr
///
///      Into:
///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> %v1, <4, 5, 6, 7>
///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> %v1, <32, 33, 34, 35>
///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> %v1, <16, 17, 18, 19>
///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
///
/// \p SVI is the interleaving shufflevector feeding \p SI and \p Factor selects
/// st2/st3/st4. Returns false before emitting anything when NEON is
/// unavailable or the per-lane sub-vector type is not a legal interleaved
/// access type. Otherwise the stN call(s) are emitted through an IRBuilder
/// positioned at \p SI and true is returned. This function itself does not
/// erase \p SI or \p SVI.
bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
                                                  ShuffleVectorInst *SVI,
                                                  unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  VectorType *VecTy = SVI->getType();
  assert(VecTy->getVectorNumElements() % Factor == 0 &&
         "Invalid interleaved store");

  unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
  Type *EltTy = VecTy->getVectorElementType();
  VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);

  const DataLayout &DL = SI->getModule()->getDataLayout();

  // Skip if we do not have NEON and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
    return false;

  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);

  Value *Op0 = SVI->getOperand(0);
  Value *Op1 = SVI->getOperand(1);
  IRBuilder<> Builder(SI);

  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
  // vectors to integer vectors.
  if (EltTy->isPointerTy()) {
    Type *IntTy = DL.getIntPtrType(EltTy);
    unsigned NumOpElts = Op0->getType()->getVectorNumElements();

    // Convert to the corresponding integer vector.
    Type *IntVecTy = VectorType::get(IntTy, NumOpElts);
    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);

    SubVecTy = VectorType::get(IntTy, LaneLen);
  }

  // The base address of the store.
  Value *BaseAddr = SI->getPointerOperand();

  if (NumStores > 1) {
    // If we're going to generate more than one store, reset the lane length
    // and sub-vector type to something legal.
    LaneLen /= NumStores;
    SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);

    // We will compute the pointer operand of each store from the original base
    // address using GEPs. Cast the base address to a pointer to the scalar
    // element type.
    BaseAddr = Builder.CreateBitCast(
        BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
                      SI->getPointerAddressSpace()));
  }

  auto Mask = SVI->getShuffleMask();

  Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
  Type *Tys[2] = {SubVecTy, PtrTy};
  // Indexed by Factor - 2; the assert above bounds Factor from below by 2.
  static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2,
                                             Intrinsic::aarch64_neon_st3,
                                             Intrinsic::aarch64_neon_st4};
  Function *StNFunc =
      Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);

  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
    SmallVector<Value *, 5> Ops;

    // Split the shufflevector operands into sub vectors for the new stN call.
    for (unsigned i = 0; i < Factor; i++) {
      // Position in the interleaved mask of lane i's first element for this
      // store.
      unsigned IdxI = StoreCount * LaneLen * Factor + i;
      if (Mask[IdxI] >= 0) {
        Ops.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
      } else {
        // The lane's first element is undef. Look at the lane's later
        // elements, which sit Factor positions apart in the interleaved mask,
        // and derive the lane start from the first defined one.
        unsigned StartMask = 0;
        for (unsigned j = 1; j < LaneLen; j++) {
          // Element j of lane i within this store. The index stays inside the
          // mask for every StoreCount, and the lane start is the defined mask
          // value minus the element's offset j within the lane.
          unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
          if (Mask[IdxJ] >= 0) {
            StartMask = Mask[IdxJ] - j;
            break;
          }
        }
        // Note: Filling undef gaps with random elements is ok, since
        // those elements were being written anyway (with undefs).
        // In the case of all undefs we're defaulting to using elems from 0
        // Note: StartMask cannot be negative, it's checked in
        // isReInterleaveMask
        Ops.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
      }
    }

    // If we are generating more than one store, we compute the base address of
    // subsequent stores as an offset from the previous.
    if (StoreCount > 0)
      BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(),
                                            BaseAddr, LaneLen * Factor);

    Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
    Builder.CreateCall(StNFunc, Ops);
  }
  return true;
}
/// Return true when both \p SrcAlign and \p DstAlign satisfy \p AlignCheck.
/// An alignment of 0 passes the check unconditionally; any other value must
/// be a multiple of \p AlignCheck.
static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
                       unsigned AlignCheck) {
  const bool SrcOk = SrcAlign == 0 || SrcAlign % AlignCheck == 0;
  if (!SrcOk)
    return false;
  return DstAlign == 0 || DstAlign % AlignCheck == 0;
}
/// Choose the value type used to expand a memory operation of \p Size bytes.
///
/// Candidates are tried from widest to narrowest: v2i64 (memset only), f128,
/// i64, then i32. A candidate is taken when the source and destination
/// alignments satisfy its natural alignment, or when misaligned accesses of
/// that type are reported as allowed and fast. MVT::Other is returned when no
/// candidate fits. Vector/FP candidates are skipped when the function carries
/// the NoImplicitFloat attribute or the subtarget lacks the feature.
/// \p ZeroMemset and \p MemcpyStrSrc are not consulted here.
EVT AArch64TargetLowering::getOptimalMemOpType(
    uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
    bool ZeroMemset, bool MemcpyStrSrc,
    const AttributeList &FuncAttributes) const {
  bool CanImplicitFloat =
      !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat);
  bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
  bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
  // Only use AdvSIMD to implement memset of 32-byte and above. It would have
  // taken one instruction to materialize the v2i64 zero and one store (with
  // restrictive addressing mode). Just do i64 stores.
  bool IsSmallMemset = IsMemset && Size < 32;
  auto AlignmentIsAcceptable = [&](EVT VT, unsigned AlignCheck) {
    // Pass the alignments in the order memOpAlign declares them
    // (DstAlign, SrcAlign, AlignCheck).
    if (memOpAlign(DstAlign, SrcAlign, AlignCheck))
      return true;
    // Otherwise fall back to asking whether a misaligned access of this type
    // is both permitted and fast. Fast is only read when the query succeeds.
    bool Fast;
    return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone,
                                          &Fast) &&
           Fast;
  };

  if (CanUseNEON && IsMemset && !IsSmallMemset &&
      AlignmentIsAcceptable(MVT::v2i64, 16))
    return MVT::v2i64;
  if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, 16))
    return MVT::f128;
  if (Size >= 8 && AlignmentIsAcceptable(MVT::i64, 8))
    return MVT::i64;
  if (Size >= 4 && AlignmentIsAcceptable(MVT::i32, 4))
    return MVT::i32;
  return MVT::Other;
}
// An add immediate is legal when its magnitude is a 12-bit value, optionally
// shifted left by 12 bits. INT64_MIN is rejected up front because its
// magnitude cannot be taken.
bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
  if (Immed == std::numeric_limits<int64_t>::min()) {
    LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
                      << ": avoid UB for INT64_MIN\n");
    return false;
  }

  // add and sub share an encoding, so only the magnitude matters.
  Immed = std::abs(Immed);

  // Either the value fits in the low 12 bits, or its low 12 bits are clear
  // and the remainder fits in the next 12 bits.
  const bool FitsUnshifted = (Immed >> 12) == 0;
  const bool FitsShifted = (Immed & 0xfff) == 0 && (Immed >> 24) == 0;
  const bool IsLegal = FitsUnshifted || FitsShifted;

  LLVM_DEBUG(dbgs() << "Is " << Immed
                    << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
  return IsLegal;
}
// Integer compares are lowered through ADDS/SUBS, so a compare immediate is
// legal exactly when it is a legal add immediate.
bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
  const bool LegalAsAdd = isLegalAddImmediate(Immed);
  return LegalAsAdd;
}
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
///
/// \p AS and \p I are not consulted by this implementation.
bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                                  const AddrMode &AM, Type *Ty,
                                                  unsigned AS, Instruction *I) const {
  // AArch64 has five basic addressing modes:
  //  reg
  //  reg + 9-bit signed offset
  //  reg + SIZE_IN_BYTES * 12-bit unsigned offset
  //  reg1 + reg2
  //  reg + SIZE_IN_BYTES * reg

  // A global can never serve as the base.
  if (AM.BaseGV)
    return false;

  // reg + reg + imm is not encodable.
  if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
    return false;

  // Access size in bytes, or 0 when the type is unsized or its bit width is
  // not a power of two.
  uint64_t NumBytes = 0;
  if (Ty->isSized()) {
    const uint64_t NumBits = DL.getTypeSizeInBits(Ty);
    if (isPowerOf2_64(NumBits))
      NumBytes = NumBits / 8;
  }

  // Scaled-register forms: reg1 + reg2 and reg1 + SIZE_IN_BYTES * reg2.
  if (AM.Scale)
    return AM.Scale == 1 ||
           (AM.Scale > 0 && static_cast<uint64_t>(AM.Scale) == NumBytes);

  // Immediate forms: reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12.
  const int64_t Offset = AM.BaseOffs;

  // 9-bit signed offset.
  if (isInt<9>(Offset))
    return true;

  // 12-bit unsigned scaled offset: needs a known access size, a positive
  // offset, a quotient that fits in 12 bits ...
  if (!NumBytes || Offset <= 0)
    return false;
  if ((Offset / NumBytes) > (1LL << 12) - 1)
    return false;

  // ... and an offset that is a multiple of NumBytes (a power of two).
  const unsigned Shift = Log2_64(NumBytes);
  return (Offset >> Shift) << Shift == Offset;
}
/// Unconditionally opt in to splitting large struct/array GEP offsets.
bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
  return true;
}
/// Return the cost of the scaling factor used by addressing mode \p AM for an
/// access of type \p Ty: -1 when the mode is not legal, 1 when a scale other
/// than 0 or 1 is used, and 0 otherwise.
int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL,
                                                const AddrMode &AM, Type *Ty,
                                                unsigned AS) const {
  // Scaling factors are not free at all.
  // Operands                     | Rt Latency
  // -------------------------------------------
  // Rt, [Xn, Xm]                 | 4
  // -------------------------------------------
  // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
  // Rt, [Xn, Wm, <extend> #imm]  |
  if (!isLegalAddressingMode(DL, AM, Ty, AS))
    return -1;

  // Scale represents reg2 * scale; charge 1 unless the scale is 0 or 1.
  const bool UsesRealScale = AM.Scale != 0 && AM.Scale != 1;
  return UsesRealScale ? 1 : 0;
}
/// Return true when the scalar element type of \p VT is f32 or f64; every
/// other type (including non-simple ones) yields false.
bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
  const EVT ScalarVT = VT.getScalarType();
  if (!ScalarVT.isSimple())
    return false;

  const auto Ty = ScalarVT.getSimpleVT().SimpleTy;
  return Ty == MVT::f32 || Ty == MVT::f64;
}
/// Return the zero-terminated list of scratch registers (X16, X17 and LR).
/// The calling convention argument is ignored.
const MCPhysReg *
AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
  // LR is a callee-save register, but we must treat it as clobbered by any call
  // site. Hence we include LR in the scratch registers, which are in turn added
  // as implicit-defs for stackmaps and patchpoints.
  static const MCPhysReg Regs[] = {AArch64::X16, AArch64::X17, AArch64::LR, 0};
  return Regs;
}
bool
AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
CombineLevel Level) const {
N = N->getOperand(0).getNode();
EVT VT = N->getValueType(0);
// If N is unsigned bit extraction: ((x >> C) & mask), then do not combine
// it with shift to let it be lowered to UBFX.
if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
isa<ConstantSDNode>(N->getOperand(1))) {
uint64_t TruncMask = N->getConstantOperandVal(1);
if (isMask_64(TruncMask) &&
N->getOperand(0).getOpcode() == ISD::SRL &&
isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
return false;
}
return true;
}
bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
if (BitSize == 0)
return false;
int64_t Val = Imm.getSExtValue();
if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
return true;
if ((int64_t)Val < 0)
Val = ~Val;
if (BitSize == 32)
Val &= (1LL << 32) - 1;
unsigned LZ = countLeadingZeros((uint64_t)Val);
unsigned Shift = (63 - LZ) / 16;
// MOVZ is free so return true for one or fewer MOVK.
return Shift < 3;
}
/// An EXTRACT_SUBVECTOR of type \p ResVT is considered cheap when the
/// operation is legal or custom for \p ResVT and \p Index is either 0 or the
/// number of elements of \p ResVT. \p SrcVT is not consulted.
bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                                    unsigned Index) const {
  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
    return false;

  if (Index == 0)
    return true;
  return Index == ResVT.getVectorNumElements();
}
/// Turn vector tests of the signbit in the form of:
///   xor (sra X, elt_size(X)-1), -1
/// into:
///   cmge X, X, #0
///
/// Returns an empty SDValue when the pattern does not match.
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
                                         const AArch64Subtarget *Subtarget) {
  const EVT VT = N->getValueType(0);
  if (!Subtarget->hasNEON() || !VT.isVector())
    return SDValue();

  // The xor must be a 'not' (second operand all ones) of a single-use
  // arithmetic shift right.
  SDValue Sra = N->getOperand(0);
  SDValue AllOnes = N->getOperand(1);
  if (Sra.getOpcode() != AArch64ISD::VASHR)
    return SDValue();
  if (!Sra.hasOneUse())
    return SDValue();
  if (!ISD::isBuildVectorAllOnes(AllOnes.getNode()))
    return SDValue();

  // The shift amount must be the constant elt_size - 1, i.e. the shift smears
  // the sign bit across each element.
  auto *Amount = dyn_cast<ConstantSDNode>(Sra.getOperand(1));
  if (!Amount)
    return SDValue();
  const EVT EltTy = Sra.getValueType().getVectorElementType();
  if (Amount->getZExtValue() != EltTy.getSizeInBits() - 1)
    return SDValue();

  return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Sra.getOperand(0));
}
// Generate SUBS and CSEL for integer abs.
//
// Matches XOR(ADD(X, Y), Y) where Y is SRA(X, size(X)-1) and rewrites it as a
// CSEL between X and (0 - X), selected on the PL condition of SUBS(X, 0).
// Returns an empty SDValue when the pattern does not match.
static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  SDValue Sum = N->getOperand(0);
  SDValue SignSmear = N->getOperand(1);
  SDLoc DL(N);

  if (!VT.isInteger() || N->getOpcode() != ISD::XOR)
    return SDValue();
  if (Sum.getOpcode() != ISD::ADD || Sum.getOperand(1) != SignSmear)
    return SDValue();
  if (SignSmear.getOpcode() != ISD::SRA ||
      SignSmear.getOperand(0) != Sum.getOperand(0))
    return SDValue();

  // The shift amount must be the constant size(X) - 1.
  ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(SignSmear.getOperand(1));
  if (!ShAmt)
    return SDValue();
  if (!(ShAmt->getAPIntValue() == VT.getSizeInBits() - 1))
    return SDValue();

  SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
                            Sum.getOperand(0));
  // Generate SUBS & CSEL.
  SDValue Cmp =
      DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
                  Sum.getOperand(0), DAG.getConstant(0, DL, VT));
  return DAG.getNode(AArch64ISD::CSEL, DL, VT, Sum.getOperand(0), Neg,
                     DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
                     SDValue(Cmp.getNode(), 1));
}
/// Target combine for XOR nodes. Does nothing before operation legalization;
/// afterwards it first tries the vector sign-bit compare fold and then the
/// integer-abs fold, returning whichever succeeds (or an empty SDValue).
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const AArch64Subtarget *Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDValue Folded = foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
  if (!Folded)
    Folded = performIntegerAbsCombine(N, DAG);
  return Folded;
}
/// Expand a signed division of N's first operand by \p Divisor, where
/// \p Divisor is a power of two or the negation of one.
///
/// Returns SDValue(N, 0) unchanged when isIntDivCheap reports that integer
/// division is cheap for this function, and an empty SDValue when the type is
/// not i32/i64 or \p Divisor is not +/- a power of two. Otherwise builds
///   sra (csel (add X, 2^k - 1), X, X < 0), k
/// and negates the result for a negative divisor. Intermediate nodes are
/// appended to \p Created.
SDValue
AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
SelectionDAG &DAG,
SmallVectorImpl<SDNode *> &Created) const {
AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
if (isIntDivCheap(N->getValueType(0), Attr))
return SDValue(N,0); // Lower SDIV as SDIV
// fold (sdiv X, pow2)
EVT VT = N->getValueType(0);
if ((VT != MVT::i32 && VT != MVT::i64) ||
!(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
return SDValue();
SDLoc DL(N);
SDValue N0 = N->getOperand(0);
// k = log2(|Divisor|): the trailing-zero count of Divisor, which the check
// above restricts to +/- a power of two.
unsigned Lg2 = Divisor.countTrailingZeros();
SDValue Zero = DAG.getConstant(0, DL, VT);
SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
// Add (N0 < 0) ? Pow2 - 1 : 0;
SDValue CCVal;
SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);
Created.push_back(Cmp.getNode());
Created.push_back(Add.getNode());
Created.push_back(CSel.getNode());
// Divide by pow2.
SDValue SRA =
DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64));
// If we're dividing by a positive value, we're done. Otherwise, we must
// negate the result.
if (Divisor.isNonNegative())
return SRA;
// The shift is only an intermediate node in the negative-divisor case, so it
// is recorded in Created here rather than above.
Created.push_back(SRA.getNode());
return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
}
/// Target combine for MUL by a constant.
///
/// After operation legalization, rewrites a multiply whose second operand is
/// a constant of the form +/-(2^N +/- 1), or (2^N + 1) * 2^M, into shift and
/// add/sub nodes. Returns an empty SDValue when the combine runs before
/// legalization, the right-hand side is not a constant, the constant has none
/// of the supported shapes, or the multiply might instead be folded into
/// smul/umul/madd/msub. \p Subtarget is not consulted.
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
// The below optimizations require a constant RHS.
if (!isa<ConstantSDNode>(N->getOperand(1)))
return SDValue();
ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(1));
const APInt &ConstValue = C->getAPIntValue();
// Multiplication of a power of two plus/minus one can be done more
// cheaply as a shift+add/sub. For now, this is true unilaterally. If
// future CPUs have a cheaper MADD instruction, this may need to be
// gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
// 64-bit is 5 cycles, so this is always a win.
// More aggressively, some multiplications N0 * C can be lowered to
// shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
// e.g. 6=3*2=(2+1)*2.
// TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45
// which equals to (1+2)*16-(1+2).
SDValue N0 = N->getOperand(0);
// TrailingZeroes is used to test if the mul can be lowered to
// shift+add+shift.
unsigned TrailingZeroes = ConstValue.countTrailingZeros();
if (TrailingZeroes) {
// Conservatively do not lower to shift+add+shift if the mul might be
// folded into smul or umul.
if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) ||
isZeroExtended(N0.getNode(), DAG)))
return SDValue();
// Conservatively do not lower to shift+add+shift if the mul might be
// folded into madd or msub.
if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
N->use_begin()->getOpcode() == ISD::SUB))
return SDValue();
}
// Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
// and shift+add+shift.
APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
// Both are assigned on every path that does not return early below.
unsigned ShiftAmt, AddSubOpc;
// Is the shifted value the LHS operand of the add/sub?
bool ShiftValUseIsN0 = true;
// Do we need to negate the result?
bool NegateResult = false;
if (ConstValue.isNonNegative()) {
// (mul x, 2^N + 1) => (add (shl x, N), x)
// (mul x, 2^N - 1) => (sub (shl x, N), x)
// (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
APInt SCVMinus1 = ShiftedConstValue - 1;
APInt CVPlus1 = ConstValue + 1;
if (SCVMinus1.isPowerOf2()) {
ShiftAmt = SCVMinus1.logBase2();
AddSubOpc = ISD::ADD;
} else if (CVPlus1.isPowerOf2()) {
ShiftAmt = CVPlus1.logBase2();
AddSubOpc = ISD::SUB;
} else
return SDValue();
} else {
// (mul x, -(2^N - 1)) => (sub x, (shl x, N))
// (mul x, -(2^N + 1)) => - (add (shl x, N), x)
APInt CVNegPlus1 = -ConstValue + 1;
APInt CVNegMinus1 = -ConstValue - 1;
if (CVNegPlus1.isPowerOf2()) {
ShiftAmt = CVNegPlus1.logBase2();
AddSubOpc = ISD::SUB;
ShiftValUseIsN0 = false;
} else if (CVNegMinus1.isPowerOf2()) {
ShiftAmt = CVNegMinus1.logBase2();
AddSubOpc = ISD::ADD;
NegateResult = true;
} else
return SDValue();
}
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N0,
DAG.getConstant(ShiftAmt, DL, MVT::i64));
// Place the shifted value on the side of the add/sub chosen above.
SDValue AddSubN0 = ShiftValUseIsN0 ? ShiftedVal : N0;
SDValue AddSubN1 = ShiftValUseIsN0 ? N0 : ShiftedVal;
SDValue Res = DAG.getNode(AddSubOpc, DL, VT, AddSubN0, AddSubN1);
assert(!(NegateResult && TrailingZeroes) &&
"NegateResult and TrailingZeroes cannot both be true for now.");
// Negate the result.
if (NegateResult)
return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
// Shift the result.
if (TrailingZeroes)
return DAG.getNode(ISD::SHL, DL, VT, Res,
DAG.getConstant(TrailingZeroes, DL, MVT::i64));
return Res;
}
/// Take advantage of vector comparisons producing 0 or -1 in each lane to
/// apply a unary operation to a constant mask instead of to the masked value.
///
/// The general transformation is:
///    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
///       AND(VECTOR_CMP(x,y), constant2)
///    constant2 = UNARYOP(constant)
///
/// Returns an empty SDValue when \p N does not match this shape.
static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
                                                         SelectionDAG &DAG) {
  // Bail out unless this is a vector operation applied to an AND whose first
  // operand is a SETCC and whose total size matches the result's.
  const EVT VT = N->getValueType(0);
  if (!VT.isVector())
    return SDValue();
  if (N->getOperand(0)->getOpcode() != ISD::AND)
    return SDValue();
  if (N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC)
    return SDValue();
  if (VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
    return SDValue();

  // The other AND operand must be a constant build_vector. We could
  // make the transformation for non-constant splats as well, but it's unclear
  // that would be a benefit as it would not eliminate any operations, just
  // perform one more step in scalar code before moving to the vector unit.
  auto *Mask = dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1));
  if (!Mask)
    return SDValue();
  if (!Mask->isConstant())
    return SDValue();

  // Everything checks out. Build up the new and improved node.
  SDLoc DL(N);
  const EVT IntVT = Mask->getValueType(0);

  // Apply the unary operation to the constant itself.
  SDValue Converted = DAG.getNode(N->getOpcode(), DL, VT, SDValue(Mask, 0));

  // The AND node needs bitcasts to/from an integer vector type around it.
  SDValue ConvertedAsInt = DAG.getNode(ISD::BITCAST, DL, IntVT, Converted);
  SDValue Masked = DAG.getNode(ISD::AND, DL, IntVT,
                               N->getOperand(0)->getOperand(0), ConvertedAsInt);
  return DAG.getNode(ISD::BITCAST, DL, VT, Masked);
}
/// Target combine for integer-to-floating-point conversions.
///
/// Vector conversions of a compare-and-mask are handled by
/// performVectorCompareAndMaskUnaryOpCombine. For scalar f32/f64 results of
/// the same width as the source, a single-use, non-volatile normal load
/// feeding the conversion is re-issued as a load of the FP type followed by
/// SITOF/UITOF. Returns an empty SDValue when nothing applies.
static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
                                     const AArch64Subtarget *Subtarget) {
  // First try to optimize away the conversion when it's conditionally from
  // a constant. Vectors only.
  if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
    return Res;

  const EVT VT = N->getValueType(0);
  if (VT != MVT::f32 && VT != MVT::f64)
    return SDValue();

  // Only optimize when the source and destination types have the same width.
  if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
    return SDValue();

  // If the result of an integer load is only used by an integer-to-float
  // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead.
  // This eliminates an "integer-to-vector-move" UOP and improves throughput.
  SDValue Src = N->getOperand(0);
  if (!Subtarget->hasNEON() || !ISD::isNormalLoad(Src.getNode()) ||
      !Src.hasOneUse() ||
      // Do not change the width of a volatile load.
      cast<LoadSDNode>(Src)->isVolatile())
    return SDValue();

  LoadSDNode *OldLoad = cast<LoadSDNode>(Src);
  SDValue NewLoad =
      DAG.getLoad(VT, SDLoc(N), OldLoad->getChain(), OldLoad->getBasePtr(),
                  OldLoad->getPointerInfo(), OldLoad->getAlignment(),
                  OldLoad->getMemOperand()->getFlags());

  // Make sure successors of the original load stay after it by updating them
  // to use the new Chain.
  DAG.ReplaceAllUsesOfValueWith(SDValue(OldLoad, 1), NewLoad.getValue(1));

  const bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP;
  const unsigned Opcode = IsSigned ? AArch64ISD::SITOF : AArch64ISD::UITOF;
  return DAG.getNode(Opcode, SDLoc(N), VT, NewLoad);
}
/// Fold a floating-point multiply by power of two into floating-point to
/// fixed-point conversion.
///
/// \p N is the FP-to-integer conversion; its operand must be a vector FMUL
/// whose second operand is a build_vector splat of a power of two. Returns an
/// empty SDValue when the pattern or the types do not qualify.
static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const AArch64Subtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();
  if (!N->getValueType(0).isSimple())
    return SDValue();

  SDValue Mul = N->getOperand(0);
  if (!Mul.getValueType().isVector() || !Mul.getValueType().isSimple() ||
      Mul.getOpcode() != ISD::FMUL)
    return SDValue();

  // The multiplier has to be a build_vector.
  auto *BV = dyn_cast<BuildVectorSDNode>(Mul->getOperand(1));
  if (!BV)
    return SDValue();

  // Only f32/f64 lanes converted to i16/i32/i64 lanes are handled.
  const MVT FloatTy = Mul.getSimpleValueType().getVectorElementType();
  const uint32_t FloatBits = FloatTy.getSizeInBits();
  if (FloatBits != 32 && FloatBits != 64)
    return SDValue();

  const MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
  const uint32_t IntBits = IntTy.getSizeInBits();
  if (IntBits != 16 && IntBits != 32 && IntBits != 64)
    return SDValue();

  // Avoid conversions where iN is larger than the float (e.g., float -> i64).
  if (IntBits > FloatBits)
    return SDValue();

  // C is the log2 of the splatted power of two; it becomes the number of
  // fractional bits passed to the intrinsic and must lie in [1, Bits].
  BitVector UndefElements;
  const int32_t Bits = IntBits == 64 ? 64 : 32;
  const int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
  if (C == -1 || C == 0 || C > Bits)
    return SDValue();

  // The intrinsic produces integer lanes as wide as the FP lanes; only 2- and
  // 4-lane vectors are supported.
  MVT ResTy;
  const unsigned NumLanes = Mul.getValueType().getVectorNumElements();
  if (NumLanes == 2)
    ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
  else if (NumLanes == 4)
    ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
  else
    return SDValue();

  if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
    return SDValue();

  assert((ResTy != MVT::v4i64 || DCI.isBeforeLegalizeOps()) &&
         "Illegal vector type after legalization");

  SDLoc DL(N);
  const bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
  const unsigned IntrinsicOpcode = IsSigned
                                       ? Intrinsic::aarch64_neon_vcvtfp2fxs
                                       : Intrinsic::aarch64_neon_vcvtfp2fxu;
  SDValue FixConv =
      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
                  DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
                  Mul->getOperand(0), DAG.getConstant(C, DL, MVT::i32));

  // We can handle smaller integers by generating an extra trunc.
  if (IntBits < FloatBits)
    FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);

  return FixConv;
}
/// Fold a floating-point divide by power of two into fixed-point to
/// floating-point conversion.
///
/// \p N is the FDIV; its first operand must be a vector SINT_TO_FP or
/// UINT_TO_FP and its second operand a build_vector splat of a power of two.
/// Returns an empty SDValue when the pattern or the types do not qualify.
static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const AArch64Subtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();

  SDValue Op = N->getOperand(0);
  unsigned Opc = Op->getOpcode();
  // Check the opcode before looking at Op's own operand: only once Op is known
  // to be an integer-to-FP conversion is it guaranteed to have an operand 0.
  if (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP)
    return SDValue();
  if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
      !Op.getOperand(0).getValueType().isSimple())
    return SDValue();

  SDValue ConstVec = N->getOperand(1);
  if (!isa<BuildVectorSDNode>(ConstVec))
    return SDValue();

  // Only i16/i32/i64 lanes converted to f32/f64 lanes are handled.
  MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
  int32_t IntBits = IntTy.getSizeInBits();
  if (IntBits != 16 && IntBits != 32 && IntBits != 64)
    return SDValue();

  MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
  int32_t FloatBits = FloatTy.getSizeInBits();
  if (FloatBits != 32 && FloatBits != 64)
    return SDValue();

  // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
  if (IntBits > FloatBits)
    return SDValue();

  // C is the log2 of the splatted power of two; it becomes the number of
  // fractional bits passed to the intrinsic and must lie in [1, FloatBits].
  BitVector UndefElements;
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
  if (C == -1 || C == 0 || C > FloatBits)
    return SDValue();

  // The intrinsic consumes integer lanes as wide as the FP lanes; only 2- and
  // 4-lane vectors are supported.
  MVT ResTy;
  unsigned NumLanes = Op.getValueType().getVectorNumElements();
  switch (NumLanes) {
  default:
    return SDValue();
  case 2:
    ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
    break;
  case 4:
    ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
    break;
  }

  if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
    return SDValue();

  SDLoc DL(N);
  SDValue ConvInput = Op.getOperand(0);
  bool IsSigned = Opc == ISD::SINT_TO_FP;
  // Widen narrower integer inputs to the FP lane width first, using the
  // extension that matches the signedness of the conversion.
  if (IntBits < FloatBits)
    ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
                            ResTy, ConvInput);

  unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
                                      : Intrinsic::aarch64_neon_vcvtfxu2fp;
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
                     DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
                     DAG.getConstant(C, DL, MVT::i32));
}
/// An EXTR instruction is made up of two shifts, ORed together. This helper
/// searches for and classifies those shifts.
///
/// On success (return value true) \p Src receives the shifted value,
/// \p ShiftAmount the constant shift amount, and \p FromHi is true for SRL and
/// false for SHL. \p FromHi is also written when the opcode matches but the
/// shift amount turns out not to be a constant.
static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
                         bool &FromHi) {
  switch (N.getOpcode()) {
  case ISD::SHL:
    FromHi = false;
    break;
  case ISD::SRL:
    FromHi = true;
    break;
  default:
    return false;
  }

  // Only constant shift amounts can be encoded.
  if (!isa<ConstantSDNode>(N.getOperand(1)))
    return false;

  ShiftAmount = N->getConstantOperandVal(1);
  Src = N->getOperand(0);
  return true;
}
/// EXTR instruction extracts a contiguous chunk of bits from two existing
/// registers viewed as a high/low pair. This function looks for the pattern:
/// <tt>(or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N))</tt> and replaces it
/// with an EXTR. Can't quite be done in TableGen because the two immediates
/// aren't independent.
///
/// Returns an empty SDValue when \p N does not match.
static SDValue tryCombineToEXTR(SDNode *N,
                                TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  const EVT VT = N->getValueType(0);

  assert(N->getOpcode() == ISD::OR && "Unexpected root");

  if (VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  // Classify the first operand of the OR.
  SDValue First;
  uint32_t FirstShift = 0;
  bool FirstFromHi = false;
  if (!findEXTRHalf(N->getOperand(0), First, FirstShift, FirstFromHi))
    return SDValue();

  // Classify the second operand of the OR.
  SDValue Second;
  uint32_t SecondShift = 0;
  bool SecondFromHi = false;
  if (!findEXTRHalf(N->getOperand(1), Second, SecondShift, SecondFromHi))
    return SDValue();

  // One half must be a SHL and the other a SRL; two shifts in the same
  // direction are not really an EXTR.
  if (FirstFromHi == SecondFromHi)
    return SDValue();

  // The two shift amounts have to add up to the register width.
  if (FirstShift + SecondShift != VT.getSizeInBits())
    return SDValue();

  // Put the SHL half first and the SRL half second.
  if (FirstFromHi) {
    std::swap(First, Second);
    std::swap(FirstShift, SecondShift);
  }

  return DAG.getNode(AArch64ISD::EXTR, DL, VT, First, Second,
                     DAG.getConstant(SecondShift, DL, MVT::i64));
}
/// Try to turn (or (and X, MaskA), (and Y, MaskB)) on vectors into a BSL when
/// MaskA and MaskB are constant build_vectors that are element-wise bitwise
/// complements of each other. Returns an empty SDValue when \p N does not
/// match.
static SDValue tryCombineToBSL(SDNode *N,
                               TargetLowering::DAGCombinerInfo &DCI) {
  const EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  if (!VT.isVector())
    return SDValue();

  // Both operands of the OR have to be ANDs.
  SDValue LHSAnd = N->getOperand(0);
  if (LHSAnd.getOpcode() != ISD::AND)
    return SDValue();

  SDValue RHSAnd = N->getOperand(1);
  if (RHSAnd.getOpcode() != ISD::AND)
    return SDValue();

  // We only have to look for constant vectors here since the general, variable
  // case can be handled in TableGen.
  const unsigned Bits = VT.getScalarSizeInBits();
  const uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);

  // True when every element of A is a constant equal to the complement
  // (within the element width) of the matching constant element of B.
  auto IsComplementOf = [&](BuildVectorSDNode *A, BuildVectorSDNode *B) {
    for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
      ConstantSDNode *EltA = dyn_cast<ConstantSDNode>(A->getOperand(k));
      ConstantSDNode *EltB = dyn_cast<ConstantSDNode>(B->getOperand(k));
      if (!EltA || !EltB ||
          EltA->getZExtValue() != (BitMask & ~EltB->getZExtValue()))
        return false;
    }
    return true;
  };

  // Try each pairing of one operand from each AND as the two masks.
  for (int i = 1; i >= 0; --i)
    for (int j = 1; j >= 0; --j) {
      BuildVectorSDNode *MaskA =
          dyn_cast<BuildVectorSDNode>(LHSAnd->getOperand(i));
      BuildVectorSDNode *MaskB =
          dyn_cast<BuildVectorSDNode>(RHSAnd->getOperand(j));
      if (!MaskA || !MaskB)
        continue;

      if (IsComplementOf(MaskA, MaskB))
        return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(MaskA, 0),
                           LHSAnd->getOperand(1 - i),
                           RHSAnd->getOperand(1 - j));
    }

  return SDValue();
}
// Target combine for ISD::OR: on legal types, first try to form an EXTR from
// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)), then try to form a BSL.
static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                                const AArch64Subtarget *Subtarget) {
  SelectionDAG &DAG = DCI.DAG;
  const EVT ResultVT = N->getValueType(0);

  if (!DAG.getTargetLoweringInfo().isTypeLegal(ResultVT))
    return SDValue();

  SDValue Combined = tryCombineToEXTR(N, DCI);
  if (!Combined)
    Combined = tryCombineToBSL(N, DCI);
  // Empty when neither combine applied.
  return Combined;
}
// Target combine for a legal vector AND whose second operand is a
// build-vector: try to express it as a BIC-immediate node instead.
static SDValue performANDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  SDValue LHS = N->getOperand(0);
  EVT VT = N->getValueType(0);

  if (!VT.isVector() || !DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  BuildVectorSDNode *BVN =
      dyn_cast<BuildVectorSDNode>(N->getOperand(1).getNode());
  if (!BVN)
    return SDValue();

  // AND does not accept an immediate, so check if we can use a BIC immediate
  // instruction instead. This is done here rather than with an
  // (and x, (mvni imm)) isel pattern because some immediates may be lowered to
  // the preferred (and x, (movi imm)) form even though an mvni representation
  // also exists.
  APInt DefBits(VT.getSizeInBits(), 0);
  APInt UndefBits(VT.getSizeInBits(), 0);
  if (!resolveBuildVector(BVN, DefBits, UndefBits))
    return SDValue();

  // Attempt the 32-bit modified-immediate form, then the 16-bit one.
  auto TryBICImm = [&](const APInt &Bits) -> SDValue {
    if (SDValue Res32 = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0),
                                           DAG, Bits, &LHS))
      return Res32;
    if (SDValue Res16 = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0),
                                           DAG, Bits, &LHS))
      return Res16;
    return SDValue();
  };

  // BIC clears bits, so the immediates to encode are the inverted masks.
  DefBits = ~DefBits;
  if (SDValue NewOp = TryBICImm(DefBits))
    return NewOp;

  UndefBits = ~UndefBits;
  if (SDValue NewOp = TryBICImm(UndefBits))
    return NewOp;

  return SDValue();
}
// Target combine for ISD::SRL on i32/i64.
static SDValue performSRLCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if (VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  // Canonicalize (srl (bswap x), W/2) to (rotr (bswap x), W/2) when the high
  // W/2 bits of x are known to be zero, where W is 32 for i32 and 64 for i64.
  SDValue Swapped = N->getOperand(0);
  if (Swapped.getOpcode() != ISD::BSWAP)
    return SDValue();

  SDLoc DL(N);
  SDValue Amt = N->getOperand(1);
  SDValue SwapSrc = Swapped.getOperand(0);

  ConstantSDNode *AmtC = dyn_cast<ConstantSDNode>(Amt);
  if (!AmtC)
    return SDValue();

  const unsigned Width = (VT == MVT::i32) ? 32 : 64;
  const unsigned Half = Width / 2;
  const uint64_t ShiftAmt = AmtC->getZExtValue();
  if (ShiftAmt == Half &&
      DAG.MaskedValueIsZero(SwapSrc, APInt::getHighBitsSet(Width, Half)))
    return DAG.getNode(ISD::ROTR, DL, VT, Swapped, Amt);

  return SDValue();
}
// Target combine for a bitcast producing a 64-bit vector. When the operand is
// an EXTRACT_SUBVECTOR (or EXTRACT_SUBREG machine node) of another bitcast,
// and the innermost source vector has exactly twice as many elements as the
// result, rebuild the node as a direct extract of the low or high half of
// that source. Returns an empty SDValue when the pattern does not match or
// when operations have not been legalized yet.
static SDValue performBitcastCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
// Wait 'til after everything is legalized to try this. That way we have
// legal vector types and such.
if (DCI.isBeforeLegalizeOps())
return SDValue();
// Remove extraneous bitcasts around an extract_subvector.
// For example,
// (v4i16 (bitconvert
// (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1)))))
// becomes
// (extract_subvector ((v8i16 ...), (i64 4)))
// Only interested in 64-bit vectors as the ultimate result.
EVT VT = N->getValueType(0);
if (!VT.isVector())
return SDValue();
if (VT.getSimpleVT().getSizeInBits() != 64)
return SDValue();
// Is the operand an extract_subvector starting at the beginning or halfway
// point of the vector? A low half may also come through as an
// EXTRACT_SUBREG, so look for that, too.
SDValue Op0 = N->getOperand(0);
if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR &&
!(Op0->isMachineOpcode() &&
Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG))
return SDValue();
// Operand 1 is the start index for EXTRACT_SUBVECTOR and the subregister
// index for EXTRACT_SUBREG; cast<> requires it to be a ConstantSDNode.
uint64_t idx = cast<ConstantSDNode>(Op0->getOperand(1))->getZExtValue();
if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
// Accept only index 0 or an index equal to the element count of the
// extracted type.
if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0)
return SDValue();
} else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) {
if (idx != AArch64::dsub)
return SDValue();
// The dsub reference is equivalent to a lane zero subvector reference.
idx = 0;
}
// Look through the bitcast of the input to the extract.
if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST)
return SDValue();
SDValue Source = Op0->getOperand(0)->getOperand(0);
// If the source type has twice the number of elements as our destination
// type, we know this is an extract of the high or low half of the vector.
EVT SVT = Source->getValueType(0);
if (!SVT.isVector() ||
SVT.getVectorNumElements() != VT.getVectorNumElements() * 2)
return SDValue();
LLVM_DEBUG(
dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n");
// Create the simplified form to just extract the low or high half of the
// vector directly rather than bothering with the bitcasts.
SDLoc dl(N);
unsigned NumElements = VT.getVectorNumElements();
if (idx) {
// Non-zero index: extract the subvector of Source starting at element
// NumElements.
SDValue HalfIdx = DAG.getConstant(NumElements, dl, MVT::i64);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx);
} else {
// Zero index: take the dsub subregister of Source via an EXTRACT_SUBREG
// machine node.
SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, dl, MVT::i32);
return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT,
Source, SubReg),
0);
}
}
// Target combine for concat_vectors. Three rewrites are attempted, in order:
//  1. (any time) concat of two truncates from v2i64/v4i32 whose element size
//     is four times the result's -> truncate of a shuffle of the bitcast
//     sources;
//  2. (after op legalization) concat of a value with itself producing two
//     elements -> DUPLANE64 of lane 0;
//  3. (after op legalization) concat whose second operand is a bitcast of a
//     vector -> bitcast of a concat in that vector's type.
// Returns an empty SDValue when none applies.
static SDValue performConcatVectorsCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
// Optimize concat_vectors of truncated vectors, where the intermediate
// type is illegal, to avoid said illegality, e.g.,
// (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
// (v2i16 (truncate (v2i64)))))
// ->
// (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
// (v4i32 (bitcast (v2i64))),
// <0, 2, 4, 6>)))
// This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
// on both input and result type, so we might generate worse code.
// On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
if (N->getNumOperands() == 2 &&
N0->getOpcode() == ISD::TRUNCATE &&
N1->getOpcode() == ISD::TRUNCATE) {
SDValue N00 = N0->getOperand(0);
SDValue N10 = N1->getOperand(0);
EVT N00VT = N00.getValueType();
if (N00VT == N10.getValueType() &&
(N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
// MidVT has the same total width as the source with elements half as wide.
MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
// Select the even-numbered elements (0, 2, 4, ...) across both inputs.
SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
for (size_t i = 0; i < Mask.size(); ++i)
Mask[i] = i * 2;
return DAG.getNode(ISD::TRUNCATE, dl, VT,
DAG.getVectorShuffle(
MidVT, dl,
DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
}
}
// Wait 'til after everything is legalized to try this. That way we have
// legal vector types and such.
if (DCI.isBeforeLegalizeOps())
return SDValue();
// If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
// splat. The indexed instructions are going to be expecting a DUPLANE64, so
// canonicalise to that.
if (N0 == N1 && VT.getVectorNumElements() == 2) {
assert(VT.getScalarSizeInBits() == 64);
return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
DAG.getConstant(0, dl, MVT::i64));
}
// Canonicalise concat_vectors so that the right-hand vector has as few
// bit-casts as possible before its real operation. The primary matching
// destination for these operations will be the narrowing "2" instructions,
// which depend on the operation being performed on this right-hand vector.
// For example,
// (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
// becomes
// (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
if (N1->getOpcode() != ISD::BITCAST)
return SDValue();
SDValue RHS = N1->getOperand(0);
MVT RHSTy = RHS.getValueType().getSimpleVT();
// If the RHS is not a vector, this is not the pattern we're looking for.
if (!RHSTy.isVector())
return SDValue();
LLVM_DEBUG(
dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
// The concat is performed in RHS's element type with twice as many elements.
MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
RHSTy.getVectorNumElements() * 2);
return DAG.getNode(ISD::BITCAST, dl, VT,
DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
RHS));
}
// Transform a scalar fixed-point conversion intrinsic whose input is a lane
// extract into a lane extract of the vector form of the same intrinsic. E.g.,
// from foo1 to foo2:
//   double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
//   double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
//
// The second form interacts better with instruction selection and the
// register allocator to avoid cross-class register copies that aren't
// coalescable due to a lane reference.
static SDValue tryCombineFixedPointConvert(SDNode *N,
                                           TargetLowering::DAGCombinerInfo &DCI,
                                           SelectionDAG &DAG) {
  // Only run once operations are legalized, so that vector types are legal.
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  // The converted value must originate from a lane extract.
  SDValue Scalar = N->getOperand(1);
  if (Scalar.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return SDValue();

  // No additional predication needed. Perform the transform.
  SDValue IID = N->getOperand(0);
  SDValue Shift = N->getOperand(2);
  SDValue Vec = Scalar.getOperand(0);
  SDValue Lane = Scalar.getOperand(1);
  EVT ResTy = N->getValueType(0);
  SDLoc DL(N);

  // The vector width should be 128 bits by the time we get here, even
  // if it started as 64 bits (the extract_vector handling will have
  // done so).
  assert(Vec.getValueSizeInBits() == 128 &&
         "unexpected vector size on extract_vector_elt!");

  // Choose the floating-point vector type with the same lane layout.
  const EVT VecTy = Vec.getValueType();
  EVT VecResTy;
  if (VecTy == MVT::v4i32)
    VecResTy = MVT::v4f32;
  else if (VecTy == MVT::v2i64)
    VecResTy = MVT::v2f64;
  else
    llvm_unreachable("unexpected vector type!");

  SDValue VecConvert =
      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, VecConvert, Lane);
}
// AArch64 high-vector "long" operations are formed by performing the non-high
// version on an extract_subvector of each operand which gets the high half:
//
// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
//
// However, there are cases which don't have an extract_high explicitly, but
// have another operation that can be made compatible with one for free. For
// example:
//
// (dupv64 scalar) --> (extract_high (dup128 scalar))
//
// This routine does the actual conversion of such DUPs, once outer routines
// have determined that everything else is in order.
// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
// similarly here.
static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
  // Accept only DUP-like nodes: the DUP/DUPLANE family and the MOVI/MVNI
  // immediate forms. FMOV could be supported, but isn't very useful, as it
  // would only occur if you passed a bitcast floating point immediate to an
  // eligible long integer op (addl, smull, ...).
  auto IsDupLike = [](unsigned Opc) {
    switch (Opc) {
    case AArch64ISD::DUP:
    case AArch64ISD::DUPLANE8:
    case AArch64ISD::DUPLANE16:
    case AArch64ISD::DUPLANE32:
    case AArch64ISD::DUPLANE64:
    case AArch64ISD::MOVI:
    case AArch64ISD::MOVIshift:
    case AArch64ISD::MOVIedit:
    case AArch64ISD::MOVImsl:
    case AArch64ISD::MVNIshift:
    case AArch64ISD::MVNImsl:
      return true;
    default:
      return false;
    }
  };
  if (!IsDupLike(N.getOpcode()))
    return SDValue();

  // Only 64-bit vectors can be widened here.
  MVT NarrowTy = N.getSimpleValueType();
  if (!NarrowTy.is64BitVector())
    return SDValue();

  // Re-create the node with twice as many elements, then extract the
  // subvector that starts at the original element count.
  unsigned NumElems = NarrowTy.getVectorNumElements();
  MVT WideTy = MVT::getVectorVT(NarrowTy.getVectorElementType(), NumElems * 2);

  SDLoc dl(N);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy,
                     DAG.getNode(N->getOpcode(), dl, WideTy, N->ops()),
                     DAG.getConstant(NumElems, dl, MVT::i64));
}
// Returns true if N, after looking through at most one bitcast, is an
// EXTRACT_SUBVECTOR whose constant start index equals half the element count
// of the vector it extracts from.
static bool isEssentiallyExtractHighSubvector(SDValue N) {
  SDValue Extract = N;
  if (Extract.getOpcode() == ISD::BITCAST)
    Extract = Extract.getOperand(0);

  if (Extract.getOpcode() != ISD::EXTRACT_SUBVECTOR)
    return false;

  SDValue SrcVec = Extract.getOperand(0);
  SDValue StartIdx = Extract.getOperand(1);
  return cast<ConstantSDNode>(StartIdx)->getAPIntValue() ==
         SrcVec.getValueType().getVectorNumElements() / 2;
}
/// Helper structure to keep track of ISD::SET_CC operands.
struct GenericSetCCInfo {
/// Points at operand 0 of the ISD::SETCC node (first compared value).
const SDValue *Opnd0;
/// Points at operand 1 of the ISD::SETCC node (second compared value).
const SDValue *Opnd1;
/// Condition code read from operand 2 of the ISD::SETCC node.
ISD::CondCode CC;
};
/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
struct AArch64SetCCInfo {
/// Points at operand 3 of the AArch64ISD::CSEL node (the compare it uses).
const SDValue *Cmp;
/// Condition code from operand 2 of the CSEL; isSetCC stores the inverted
/// code when the CSEL's first value operand is not the constant 1.
AArch64CC::CondCode CC;
};
/// Helper structure to keep track of SetCC information.
union SetCCInfo {
/// Valid when the matched node was a generic ISD::SETCC.
GenericSetCCInfo Generic;
/// Valid when the matched node was an AArch64ISD::CSEL.
AArch64SetCCInfo AArch64;
};
/// Helper structure to be able to read SetCC information. If the IsAArch64
/// field is true, Info is an AArch64SetCCInfo; otherwise Info is a
/// GenericSetCCInfo.
struct SetCCInfoAndKind {
/// The payload; which union member is meaningful is given by IsAArch64.
SetCCInfo Info;
/// True when Info.AArch64 was filled in, false when Info.Generic was.
bool IsAArch64;
};
/// Check whether or not \p Op is a SET_CC operation, either a generic one or
/// an AArch64 lowered one.
/// \p SetCCInfo is filled accordingly.
/// \post SetCCInfo is meaningful only when this function returns true.
/// \return True when Op is a kind of SET_CC operation.
static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
  const unsigned Opc = Op.getOpcode();

  // A generic setcc: record its two operands and its condition code.
  if (Opc == ISD::SETCC) {
    SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
    SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
    SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
    SetCCInfo.IsAArch64 = false;
    return true;
  }

  // Otherwise the only candidate is a csel that selects between 1 and 0:
  //   - csel 1, 0, cc
  //   - csel 0, 1, !cc
  if (Opc != AArch64ISD::CSEL)
    return false;

  // Record the compare and the condition code up front.
  // TODO: we want the operands of the Cmp not the csel
  SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
  SetCCInfo.IsAArch64 = true;
  SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());

  // Both selected values must be constants ...
  ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(Op.getOperand(0));
  ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!TrueC || !FalseC)
    return false;

  // ... and one must be 1 while the other is 0. When the "true" value is not
  // 1, treat the csel as selecting on !cc and record the inverted code.
  const bool TrueIsOne = TrueC->isOne();
  if (!TrueIsOne)
    SetCCInfo.Info.AArch64.CC =
        AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);

  ConstantSDNode *ExpectedOne = TrueIsOne ? TrueC : FalseC;
  ConstantSDNode *ExpectedZero = TrueIsOne ? FalseC : TrueC;
  return ExpectedOne->isOne() && ExpectedZero->isNullValue();
}
// Returns true if Op is setcc or zext of setcc.
static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
  // Direct match first.
  if (isSetCC(Op, Info))
    return true;

  // Otherwise look through a single zero-extend.
  if (Op.getOpcode() != ISD::ZERO_EXTEND)
    return false;
  return isSetCC(Op->getOperand(0), Info);
}
// The folding we want to perform is:
// (add x, [zext] (setcc cc ...) )
// -->
// (csel x, (add x, 1), !cc ...)
//
// The latter will get matched to a CSINC instruction.
static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
  assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");

  // Arrange for SetCCOp to be the ([zext of a] SET_CC) operand and Addend to
  // be the other one; give up when neither operand qualifies.
  SDValue SetCCOp = Op->getOperand(0);
  SDValue Addend = Op->getOperand(1);
  SetCCInfoAndKind InfoAndKind;
  if (!isSetCCOrZExtSetCC(SetCCOp, InfoAndKind)) {
    std::swap(SetCCOp, Addend);
    if (!isSetCCOrZExtSetCC(SetCCOp, InfoAndKind))
      return SDValue();
  }

  // Only integer comparisons on i32/i64 are handled.
  // FIXME: This could be generalized to work for FP comparisons.
  EVT CmpVT;
  if (InfoAndKind.IsAArch64)
    CmpVT = InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType();
  else
    CmpVT = InfoAndKind.Info.Generic.Opnd0->getValueType();
  if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
    return SDValue();

  // Obtain the compare node together with the *inverted* condition code.
  SDValue CCVal;
  SDValue Cmp;
  SDLoc dl(Op);
  if (InfoAndKind.IsAArch64) {
    CCVal = DAG.getConstant(
        AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
        MVT::i32);
    Cmp = *InfoAndKind.Info.AArch64.Cmp;
  } else {
    // getAArch64Cmp also fills in CCVal.
    Cmp = getAArch64Cmp(*InfoAndKind.Info.Generic.Opnd0,
                        *InfoAndKind.Info.Generic.Opnd1,
                        ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true),
                        CCVal, DAG, dl);
  }

  // Build (csel x, (add x, 1), !cc, cmp).
  EVT VT = Op->getValueType(0);
  SDValue Incremented =
      DAG.getNode(ISD::ADD, dl, VT, Addend, DAG.getConstant(1, dl, VT));
  return DAG.getNode(AArch64ISD::CSEL, dl, VT, Addend, Incremented, CCVal,
                     Cmp);
}
// The basic add/sub long vector instructions have variants with "2" on the end
// which act on the high-half of their inputs. They are normally matched by
// patterns like:
//
// (add (zeroext (extract_high LHS)),
// (zeroext (extract_high RHS)))
// -> uaddl2 vD, vN, vM
//
// However, if one of the extracts is something like a duplicate, this
// instruction can still be used profitably. This function puts the DAG into a
// more appropriate form for those patterns to trigger.
static SDValue performAddSubLongCombine(SDNode *N,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        SelectionDAG &DAG) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  // Anything that is not a 128-bit vector is only a candidate for the
  // setcc+add folding, and only when this is an ADD.
  MVT VT = N->getSimpleValueType(0);
  if (!VT.is128BitVector())
    return N->getOpcode() == ISD::ADD ? performSetccAddFolding(N, DAG)
                                      : SDValue();

  // Both operands must be extended, and extended the same way.
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  const unsigned ExtType = LHS.getOpcode();
  const bool IsExtend =
      ExtType == ISD::ZERO_EXTEND || ExtType == ISD::SIGN_EXTEND;
  if (!IsExtend || ExtType != RHS.getOpcode())
    return SDValue();

  // Rebuilds Ext as the same kind of extend applied to an extract-high of a
  // widened DUP; yields an empty SDValue if the extend's input can't be
  // widened.
  auto RewriteAsHighExtend = [&](SDValue Ext) -> SDValue {
    SDValue High = tryExtendDUPToExtractHigh(Ext.getOperand(0), DAG);
    if (!High.getNode())
      return SDValue();
    return DAG.getNode(ExtType, SDLoc(N), VT, High);
  };

  // It's not worth doing if at least one of the inputs isn't already an
  // extract, but we don't know which it'll be so we have to try both.
  if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
    RHS = RewriteAsHighExtend(RHS);
    if (!RHS.getNode())
      return SDValue();
  } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
    LHS = RewriteAsHighExtend(LHS);
    if (!LHS.getNode())
      return SDValue();
  }

  return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
}
// Massage DAGs which we can use the high-half "long" operations on into
// something isel will recognize better. E.g.
//
// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
// (aarch64_neon_umull (extract_high (v2i64 vec))
// (extract_high (v2i64 (dup128 scalar))))
//
static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
                                       TargetLowering::DAGCombinerInfo &DCI,
                                       SelectionDAG &DAG) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  // Operand 0 is the intrinsic ID; operands 1 and 2 are the two inputs.
  SDValue Op1 = N->getOperand(1);
  SDValue Op2 = N->getOperand(2);
  assert(Op1.getValueType().is64BitVector() &&
         Op2.getValueType().is64BitVector() &&
         "unexpected shape for long operation");

  // Either node could be a DUP, but it's not worth doing both of them (you'd
  // just as well use the non-high version) so look for a corresponding extract
  // operation on the other "wing".
  if (isEssentiallyExtractHighSubvector(Op1)) {
    Op2 = tryExtendDUPToExtractHigh(Op2, DAG);
    if (!Op2.getNode())
      return SDValue();
  } else if (isEssentiallyExtractHighSubvector(Op2)) {
    Op1 = tryExtendDUPToExtractHigh(Op1, DAG);
    if (!Op1.getNode())
      return SDValue();
  }

  // Re-emit the intrinsic with the (possibly rewritten) inputs.
  SDValue IntrinsicID = N->getOperand(0);
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
                     IntrinsicID, Op1, Op2);
}
// Turn a NEON shift intrinsic whose shift operand is a constant (scalar or
// splat build-vector) into the matching immediate-shift node. The "rounding
// shift left" intrinsics map onto right-shift nodes and therefore need a
// negative constant; the saturating ones need a constant in [0, ElemBits).
static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
  MVT ElemTy = N->getSimpleValueType(0).getScalarType();
  unsigned ElemBits = ElemTy.getSizeInBits();

  // Extract the constant shift amount from operand 2.
  int64_t ShiftAmount;
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2));
  ConstantSDNode *CVN =
      BVN ? nullptr : dyn_cast<ConstantSDNode>(N->getOperand(2));
  if (BVN) {
    APInt SplatValue, SplatUndef;
    unsigned SplatBitSize;
    bool HasAnyUndefs;
    const bool IsSplat = BVN->isConstantSplat(
        SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs, ElemBits);
    if (!IsSplat || SplatBitSize != ElemBits)
      return SDValue();
    ShiftAmount = SplatValue.getSExtValue();
  } else if (CVN) {
    ShiftAmount = CVN->getSExtValue();
  } else {
    return SDValue();
  }

  // Map the intrinsic onto the immediate-form opcode.
  unsigned Opcode;
  bool IsRightShift = false;
  switch (IID) {
  case Intrinsic::aarch64_neon_sqshl:
    Opcode = AArch64ISD::SQSHL_I;
    break;
  case Intrinsic::aarch64_neon_uqshl:
    Opcode = AArch64ISD::UQSHL_I;
    break;
  case Intrinsic::aarch64_neon_sqshlu:
    Opcode = AArch64ISD::SQSHLU_I;
    break;
  case Intrinsic::aarch64_neon_srshl:
    Opcode = AArch64ISD::SRSHR_I;
    IsRightShift = true;
    break;
  case Intrinsic::aarch64_neon_urshl:
    Opcode = AArch64ISD::URSHR_I;
    IsRightShift = true;
    break;
  default:
    llvm_unreachable("Unknown shift intrinsic");
  }

  // Validate the amount for the chosen direction and compute the immediate.
  int64_t Imm;
  if (IsRightShift) {
    if (!(ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits))
      return SDValue();
    Imm = -ShiftAmount;
  } else {
    if (!(ShiftAmount >= 0 && ShiftAmount < ElemBits))
      return SDValue();
    Imm = ShiftAmount;
  }

  SDLoc dl(N);
  return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
                     DAG.getConstant(Imm, dl, MVT::i32));
}
// The CRC32[BH] instructions ignore the high bits of their data operand. Since
// the intrinsics must be legal and take an i32, this means there's almost
// certainly going to be a zext in the DAG which we can eliminate.
static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
  // The data operand must be (and X, Mask) with a constant mask equal to Mask.
  SDValue Data = N->getOperand(2);
  if (Data.getOpcode() != ISD::AND)
    return SDValue();

  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Data.getOperand(1));
  const bool MaskMatches = MaskC && MaskC->getZExtValue() == Mask;
  if (!MaskMatches)
    return SDValue();

  // Re-emit the intrinsic fed directly by X, dropping the AND.
  SDValue Unmasked = Data.getOperand(0);
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
                     N->getOperand(0), N->getOperand(1), Unmasked);
}
// Replace an across-lanes intrinsic by node Opc applied to the intrinsic's
// vector operand (producing that same vector type), followed by an extract of
// element 0 in the intrinsic's result type.
static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
                                           SelectionDAG &DAG) {
  SDLoc dl(N);
  SDValue Vec = N->getOperand(1);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
                     DAG.getNode(Opc, dl, Vec.getSimpleValueType(), Vec),
                     DAG.getConstant(0, dl, MVT::i64));
}
// Target combine for intrinsic nodes: dispatch on the intrinsic ID returned by
// getIntrinsicID(N) to the matching helper, or rewrite the intrinsic directly
// into a generic ISD node. Returns an empty SDValue for intrinsics that are
// not listed. Subtarget is not used in this function body.
static SDValue performIntrinsicCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
SelectionDAG &DAG = DCI.DAG;
unsigned IID = getIntrinsicID(N);
switch (IID) {
default:
break;
// Fixed-point -> floating-point conversions.
case Intrinsic::aarch64_neon_vcvtfxs2fp:
case Intrinsic::aarch64_neon_vcvtfxu2fp:
return tryCombineFixedPointConvert(N, DCI, DAG);
// Across-lanes reductions become the AArch64ISD reduction node plus an
// extract of element 0.
case Intrinsic::aarch64_neon_saddv:
return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
case Intrinsic::aarch64_neon_uaddv:
return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
case Intrinsic::aarch64_neon_sminv:
return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
case Intrinsic::aarch64_neon_uminv:
return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
case Intrinsic::aarch64_neon_smaxv:
return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
case Intrinsic::aarch64_neon_umaxv:
return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
// Floating-point min/max intrinsics map directly onto generic ISD nodes.
case Intrinsic::aarch64_neon_fmax:
return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
case Intrinsic::aarch64_neon_fmin:
return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
case Intrinsic::aarch64_neon_fmaxnm:
return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
case Intrinsic::aarch64_neon_fminnm:
return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
// Long multiplies: try to expose an extract-high on a DUP operand.
case Intrinsic::aarch64_neon_smull:
case Intrinsic::aarch64_neon_umull:
case Intrinsic::aarch64_neon_pmull:
case Intrinsic::aarch64_neon_sqdmull:
return tryCombineLongOpWithDup(IID, N, DCI, DAG);
// Shifts by a constant amount: try the immediate form.
case Intrinsic::aarch64_neon_sqshl:
case Intrinsic::aarch64_neon_uqshl:
case Intrinsic::aarch64_neon_sqshlu:
case Intrinsic::aarch64_neon_srshl:
case Intrinsic::aarch64_neon_urshl:
return tryCombineShiftImm(IID, N, DAG);
// CRC32 byte/halfword forms: drop a redundant AND with 0xff / 0xffff on the
// data operand.
case Intrinsic::aarch64_crc32b:
case Intrinsic::aarch64_crc32cb:
return tryCombineCRC32(0xff, N, DAG);
case Intrinsic::aarch64_crc32h:
case Intrinsic::aarch64_crc32ch:
return tryCombineCRC32(0xffff, N, DAG);
}
return SDValue();
}
// Target combine for extend nodes. After operation legalization it rewrites
// (zext (sabd/uabd ...)) via tryCombineLongOpWithDup. Before operation
// legalization it acts as a custom type legalization: an extend from a 64-bit
// simple vector type to an illegal vector type is rebuilt as one extend to
// double the element width, two half extracts, an extend of each half, and a
// concat. Returns an empty SDValue when neither rewrite applies.
static SDValue performExtendCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
// If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
// we can convert that DUP into another extract_high (of a bigger DUP), which
// helps the backend to decide that an sabdl2 would be useful, saving a real
// extract_high operation.
if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
SDNode *ABDNode = N->getOperand(0).getNode();
unsigned IID = getIntrinsicID(ABDNode);
if (IID == Intrinsic::aarch64_neon_sabd ||
IID == Intrinsic::aarch64_neon_uabd) {
SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG);
if (!NewABD.getNode())
return SDValue();
return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0),
NewABD);
}
}
// This is effectively a custom type legalization for AArch64.
//
// Type legalization will split an extend of a small, legal, type to a larger
// illegal type by first splitting the destination type, often creating
// illegal source types, which then get legalized in isel-confusing ways,
// leading to really terrible codegen. E.g.,
// %result = v8i32 sext v8i8 %value
// becomes
// %losrc = extract_subreg %value, ...
// %hisrc = extract_subreg %value, ...
// %lo = v4i32 sext v4i8 %losrc
// %hi = v4i32 sext v4i8 %hisrc
// Things go rapidly downhill from there.
//
// For AArch64, the [sz]ext vector instructions can only go up one element
// size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32
// take two instructions.
//
// This implies that the most efficient way to do the extend from v8i8
// to two v4i32 values is to first extend the v8i8 to v8i16, then do
// the normal splitting to happen for the v8i16->v8i32.
// This is pre-legalization to catch some cases where the default
// type legalization will create ill-tempered code.
if (!DCI.isBeforeLegalizeOps())
return SDValue();
// We're only interested in cleaning things up for non-legal vector types
// here. If both the source and destination are legal, things will just
// work naturally without any fiddling.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT ResVT = N->getValueType(0);
if (!ResVT.isVector() || TLI.isTypeLegal(ResVT))
return SDValue();
// If the vector type isn't a simple VT, it's beyond the scope of what
// we're worried about here. Let legalization do its thing and hope for
// the best.
SDValue Src = N->getOperand(0);
EVT SrcVT = Src->getValueType(0);
if (!ResVT.isSimple() || !SrcVT.isSimple())
return SDValue();
// If the source VT is a 64-bit vector, we can play games and get the
// better results we want.
if (SrcVT.getSizeInBits() != 64)
return SDValue();
// First step: extend the source to elements twice as wide, keeping the
// element count. From here on Src/SrcVT refer to this widened value.
unsigned SrcEltSize = SrcVT.getScalarSizeInBits();
unsigned ElementCount = SrcVT.getVectorNumElements();
SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount);
SDLoc DL(N);
Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src);
// Now split the rest of the operation into two halves, each with a 64
// bit source.
EVT LoVT, HiVT;
SDValue Lo, Hi;
unsigned NumElements = ResVT.getVectorNumElements();
assert(!(NumElements & 1) && "Splitting vector, but not in half!");
LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(),
ResVT.getVectorElementType(), NumElements / 2);
// InNVT: the widened source's element type with half the result's element
// count, i.e. the type of each extracted half.
EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
LoVT.getVectorNumElements());
Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
DAG.getConstant(0, DL, MVT::i64));
Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
DAG.getConstant(InNVT.getVectorNumElements(), DL, MVT::i64));
// Second step: extend each half to the final element type.
Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo);
Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);
// Now combine the parts back together so we still have a single result
// like the combiner expects.
return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
}
// Replace the vector store St by NumVecElts scalar stores of SplatVal at
// consecutive element-sized offsets, chained one after another. Returns the
// last store created. St must not be a truncating store.
static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
SDValue SplatVal, unsigned NumVecElts) {
assert(!St.isTruncatingStore() && "cannot split truncating vector store");
unsigned OrigAlignment = St.getAlignment();
// Size in bytes of one scalar element; also the stride between stores.
unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
// Create scalar stores. This is at least as good as the code sequence for a
// split unaligned store which is a dup.s, ext.b, and two stores.
// Most of the time the three stores should be replaced by store pair
// instructions (stp).
SDLoc DL(&St);
SDValue BasePtr = St.getBasePtr();
uint64_t BaseOffset = 0;
const MachinePointerInfo &PtrInfo = St.getPointerInfo();
// The first store reuses the original pointer, alignment and flags.
SDValue NewST1 =
DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
OrigAlignment, St.getMemOperand()->getFlags());
// As this is in ISel, we will not merge this add which may degrade results.
// So fold an existing (add base, constant) into the offsets computed below.
if (BasePtr->getOpcode() == ISD::ADD &&
isa<ConstantSDNode>(BasePtr->getOperand(1))) {
BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
BasePtr = BasePtr->getOperand(0);
}
unsigned Offset = EltOffset;
// Emit the remaining NumVecElts - 1 stores, each chained on the previous one.
while (--NumVecElts) {
unsigned Alignment = MinAlign(OrigAlignment, Offset);
SDValue OffsetPtr =
DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
PtrInfo.getWithOffset(Offset), Alignment,
St.getMemOperand()->getFlags());
Offset += EltOffset;
}
return NewST1;
}
/// Replace a vector store of a splat of zeros by scalar stores of WZR/XZR. The
/// load store optimizer pass will merge them to store pair stores. This should
/// be better than a movi to create the vector zero followed by a vector store
/// if the zero constant is not re-used, since one instruction and one register
/// live range will be removed.
///
/// For example, the final generated code should be:
///
/// stp xzr, xzr, [x0]
///
/// instead of:
///
/// movi v0.2d, #0
/// str q0, [x0]
///
static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
  SDValue StVal = St.getValue();
  EVT VT = StVal.getValueType();

  // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements
  // or 2, 3 or 4 i32 elements.
  int NumVecElts = VT.getVectorNumElements();
  const auto EltBits = VT.getVectorElementType().getSizeInBits();
  const bool TwoOrThree = NumVecElts == 2 || NumVecElts == 3;
  const bool Profitable = (TwoOrThree && EltBits == 64) ||
                          ((TwoOrThree || NumVecElts == 4) && EltBits == 32);
  if (!Profitable)
    return SDValue();

  if (StVal.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();

  // If the zero constant has more than one use then the vector store could be
  // better since the constant mov will be amortized and stp q instructions
  // should be able to be formed.
  if (!StVal.hasOneUse())
    return SDValue();

  // If the store is truncating then it's going down to i16 or smaller, which
  // means it can be implemented in a single store anyway.
  if (St.isTruncatingStore())
    return SDValue();

  // If the immediate offset of the address operand is too large for the stp
  // instruction, then bail out.
  if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
    int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
    if (Offset < -512 || Offset > 504)
      return SDValue();
  }

  // Every element must be an integer or floating-point zero constant.
  for (int Elt = 0; Elt != NumVecElts; ++Elt) {
    SDValue EltVal = StVal.getOperand(Elt);
    const bool IsZero = isNullConstant(EltVal) || isNullFPConstant(EltVal);
    if (!IsZero)
      return SDValue();
  }

  // Use a CopyFromReg WZR/XZR here to prevent
  // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
  SDLoc DL(&St);
  const bool Is32Bit = EltBits == 32;
  unsigned ZeroReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
  EVT ZeroVT = Is32Bit ? MVT::i32 : MVT::i64;
  SDValue SplatVal =
      DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
  return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
}
/// Rewrite a vector store whose value is a splat of one scalar as a series of
/// scalar stores of that scalar. The load store optimizer pass can then merge
/// the scalar stores into store pair instructions, which is cheaper than
/// materialising the splat and splitting the vector store: even when nothing
/// is merged it is four stores versus a dup, an ext.b and two stores.
///
/// \returns the replacement produced by splitStoreSplat, or an empty SDValue
/// when the stored value is not a 2- or 4-element non-floating-point splat
/// built from a chain of INSERT_VECTOR_ELT nodes.
static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
  SDValue Cursor = St.getValue();
  const EVT VT = Cursor.getValueType();

  // Floating point stores are left alone: they possibly won't be turned into
  // stp because of the store pair suppress pass.
  if (VT.isFloatingPoint())
    return SDValue();

  // Only 2 or 4 elements can be expressed as store pair(s).
  const unsigned NumVecElts = VT.getVectorNumElements();
  if (NumVecElts != 2 && NumVecElts != 4)
    return SDValue();

  // A truncating store goes down to i16 or smaller, so it can already be
  // implemented with a single store.
  if (St.isTruncatingStore())
    return SDValue();

  // Walk down the INSERT_VECTOR_ELT chain, one node per vector element, and
  // record which lanes are written (0 and 1 for v2i64; 0, 1, 2, 3 for v4i32).
  std::bitset<4> LanesWritten;
  SDValue SplatVal;
  for (unsigned Step = 0; Step != NumVecElts; ++Step) {
    if (Cursor.getOpcode() != ISD::INSERT_VECTOR_ELT)
      return SDValue();

    // Every insert has to write the very same value.
    SDValue Inserted = Cursor.getOperand(1);
    if (Step == 0)
      SplatVal = Inserted;
    else if (Inserted != SplatVal)
      return SDValue();

    // The lane must be a constant that is in range for the vector.
    auto *LaneC = dyn_cast<ConstantSDNode>(Cursor.getOperand(2));
    if (!LaneC)
      return SDValue();
    const uint64_t Lane = LaneC->getZExtValue();
    if (Lane >= NumVecElts)
      return SDValue();
    LanesWritten.set(Lane);

    Cursor = Cursor.getOperand(0);
  }

  // It is only a splat if every lane was written.
  if (LanesWritten.count() != NumVecElts)
    return SDValue();

  return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
}
/// Try to replace the vector store \p N by cheaper stores.
///
/// In order, this tries: replacing a store of a zero splat by scalar stores,
/// replacing a store of a scalar splat by scalar stores, and finally splitting
/// a misaligned 128-bit store into two stores of half-width vectors. The last
/// two are only attempted on subtargets where misaligned 128-bit stores are
/// slow, and never when optimizing for minimum size.
///
/// \returns the replacement value, or an empty SDValue if nothing was changed.
static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                           SelectionDAG &DAG,
                           const AArch64Subtarget *Subtarget) {
  auto *Store = cast<StoreSDNode>(N);
  if (Store->isVolatile() || Store->isIndexed())
    return SDValue();

  SDValue StoredVal = Store->getValue();
  const EVT VT = StoredVal.getValueType();
  if (!VT.isVector())
    return SDValue();

  // A splat of zeros becomes a store of scalars. They will be merged into
  // store pairs of xzr thereby removing one instruction and one register.
  if (SDValue ZeroSplat = replaceZeroVectorStore(DAG, *Store))
    return ZeroSplat;

  // FIXME: The logic for deciding if an unaligned store should be split should
  // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
  // a call to that function here.
  //
  // Nothing to gain unless misaligned 128-bit stores are slow, and never split
  // at -Oz.
  if (!Subtarget->isMisaligned128StoreSlow() ||
      DAG.getMachineFunction().getFunction().hasMinSize())
    return SDValue();

  // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
  // those up regresses performance on micro-benchmarks and olden/bh.
  if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
    return SDValue();

  // Only unaligned 16B stores are split; they are terrible for performance.
  // Stores with alignment of 1 or 2 are left alone: code that uses clang
  // vector extensions can underspecify the alignment that way to opt out of
  // splitting, and the chance of eliminating alignment hazards is only 1 in 8
  // for alignment of 2.
  const unsigned StoreAlign = Store->getAlignment();
  if (VT.getSizeInBits() != 128 || StoreAlign >= 16 || StoreAlign <= 2)
    return SDValue();

  // A splat of a scalar becomes a store of scalars. They will be merged into
  // store pairs thereby removing two instructions.
  if (SDValue Splat = replaceSplatVectorStore(DAG, *Store))
    return Splat;

  // Otherwise cut the vector in two and store each half separately.
  SDLoc DL(Store);
  const unsigned HalfElts = VT.getVectorNumElements() / 2;
  const EVT HalfVT =
      EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), HalfElts);
  auto ExtractHalf = [&](unsigned FirstElt) {
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StoredVal,
                       DAG.getConstant(FirstElt, DL, MVT::i64));
  };
  SDValue LowHalf = ExtractHalf(0);
  SDValue HighHalf = ExtractHalf(HalfElts);

  SDValue BasePtr = Store->getBasePtr();
  SDValue LowStore = DAG.getStore(Store->getChain(), DL, LowHalf, BasePtr,
                                  Store->getPointerInfo(), StoreAlign,
                                  Store->getMemOperand()->getFlags());
  // The upper half lives 8 bytes past the base address.
  SDValue HighPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
                                DAG.getConstant(8, DL, MVT::i64));
  return DAG.getStore(LowStore.getValue(0), DL, HighHalf, HighPtr,
                      Store->getPointerInfo(), StoreAlign,
                      Store->getMemOperand()->getFlags());
}
/// Target-specific DAG combine function for post-increment LD1 (lane) and
/// post-increment LD1R.
///
/// \p N is the node consuming a scalar load: when \p IsLaneOp is true the
/// load is operand 1 and the lane index operand 2, otherwise the load is
/// operand 0. If the load's address is also incremented by an ADD, the load,
/// the consumer and the ADD are replaced by a single LD1LANEpost / LD1DUPpost
/// node that yields the vector, the written-back address and the chain.
///
/// \returns always an empty SDValue; replacements are made through
/// DCI.CombineTo.
static SDValue performPostLD1Combine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     bool IsLaneOp) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);

  unsigned LoadIdx = IsLaneOp ? 1 : 0;
  SDNode *LD = N->getOperand(LoadIdx).getNode();
  // If it is not LOAD, can not do such combine.
  if (LD->getOpcode() != ISD::LOAD)
    return SDValue();

  // The vector lane must be a constant in the LD1LANE opcode.
  SDValue Lane;
  if (IsLaneOp) {
    Lane = N->getOperand(2);
    auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
    if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
      return SDValue();
  }

  LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
  EVT MemVT = LoadSDN->getMemoryVT();
  // Check if memory operand is the same type as the vector element.
  if (MemVT != VT.getVectorElementType())
    return SDValue();

  // Check if there are other uses. If so, do not combine as it will introduce
  // an extra load.
  for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
       ++UI) {
    if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
      continue;
    if (*UI != N)
      return SDValue();
  }

  SDValue Addr = LD->getOperand(1);
  SDValue Vector = N->getOperand(0);
  // Search for a use of the address operand that is an increment.
  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
                            UE = Addr.getNode()->use_end();
       UI != UE; ++UI) {
    SDNode *User = *UI;
    if (User->getOpcode() != ISD::ADD ||
        UI.getUse().getResNo() != Addr.getResNo())
      continue;

    // If the increment is a constant, it must match the memory ref size.
    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
      // Compare the full 64-bit value: narrowing it to 32 bits would make an
      // increment such as 0x100000008 look like 8 and fold an ADD that does
      // not actually advance the pointer by the access size.
      uint64_t IncVal = CInc->getZExtValue();
      unsigned NumBytes = VT.getScalarSizeInBits() / 8;
      if (IncVal != NumBytes)
        continue;
      // XZR as the increment operand stands for the matching constant
      // increment.
      Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
    }

    // To avoid cycle construction make sure that neither the load nor the add
    // are predecessors to each other or the Vector.
    SmallPtrSet<const SDNode *, 32> Visited;
    SmallVector<const SDNode *, 16> Worklist;
    Visited.insert(N);
    Worklist.push_back(User);
    Worklist.push_back(LD);
    Worklist.push_back(Vector.getNode());
    if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
        SDNode::hasPredecessorHelper(User, Visited, Worklist))
      continue;

    SmallVector<SDValue, 8> Ops;
    Ops.push_back(LD->getOperand(0)); // Chain
    if (IsLaneOp) {
      Ops.push_back(Vector); // The vector to be inserted
      Ops.push_back(Lane);   // The lane to be inserted in the vector
    }
    Ops.push_back(Addr);
    Ops.push_back(Inc);

    // Results: the vector, the written-back address and the chain.
    EVT Tys[3] = { VT, MVT::i64, MVT::Other };
    SDVTList SDTys = DAG.getVTList(Tys);
    unsigned NewOp =
        IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
    SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops, MemVT,
                                           LoadSDN->getMemOperand());

    // Update the uses.
    SDValue NewResults[] = {
        SDValue(LD, 0),            // The result of load
        SDValue(UpdN.getNode(), 2) // Chain
    };
    DCI.CombineTo(LD, NewResults);
    DCI.CombineTo(N, SDValue(UpdN.getNode(), 0));    // Dup/Inserted Result
    DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register

    break;
  }
  return SDValue();
}
/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
/// address translation, i.e. only the low 56 of its 64 bits are demanded.
///
/// \returns true if a simplification was found and committed through \p DCI.
static bool performTBISimplification(SDValue Addr,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     SelectionDAG &DAG) {
  const APInt AddrBits = APInt::getLowBitsSet(64, 56);
  KnownBits Known;
  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                        !DCI.isBeforeLegalizeOps());

  if (!DAG.getTargetLoweringInfo().SimplifyDemandedBits(Addr, AddrBits, Known,
                                                        TLO))
    return false;

  DCI.CommitTargetLoweringOpt(TLO);
  return true;
}
/// Combine a store node: first try to split or scalarize the store, then, on
/// subtargets where the top address byte is ignored, try to simplify the
/// address (operand 2) accordingly.
///
/// \returns the replacement from splitStores, \p N itself if its address was
/// simplified, or an empty SDValue if nothing changed.
static SDValue performSTORECombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   SelectionDAG &DAG,
                                   const AArch64Subtarget *Subtarget) {
  if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
    return Split;

  if (!Subtarget->supportsAddressTopByteIgnored())
    return SDValue();
  if (!performTBISimplification(N->getOperand(2), DCI, DAG))
    return SDValue();

  return SDValue(N, 0);
}
/// Target-specific DAG combine function for NEON load/store intrinsics
/// to merge base address updates.
///
/// \p N is a NEON structured load/store intrinsic node whose last operand is
/// the address. If that address is also incremented by an independent ADD,
/// the intrinsic and the ADD are replaced by the corresponding post-indexed
/// AArch64ISD node, which additionally yields the written-back address.
///
/// \returns always an empty SDValue; replacements are made through
/// DCI.CombineTo.
static SDValue performNEONPostLDSTCombine(SDNode *N,
                                          TargetLowering::DAGCombinerInfo &DCI,
                                          SelectionDAG &DAG) {
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  unsigned AddrOpIdx = N->getNumOperands() - 1;
  SDValue Addr = N->getOperand(AddrOpIdx);

  // Search for a use of the address operand that is an increment.
  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
                            UE = Addr.getNode()->use_end();
       UI != UE; ++UI) {
    SDNode *User = *UI;
    if (User->getOpcode() != ISD::ADD ||
        UI.getUse().getResNo() != Addr.getResNo())
      continue;

    // Check that the add is independent of the load/store. Otherwise, folding
    // it would create a cycle.
    SmallPtrSet<const SDNode *, 32> Visited;
    SmallVector<const SDNode *, 16> Worklist;
    Visited.insert(Addr.getNode());
    Worklist.push_back(N);
    Worklist.push_back(User);
    if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
        SDNode::hasPredecessorHelper(User, Visited, Worklist))
      continue;

    // Find the new opcode for the updating load/store.
    bool IsStore = false;
    bool IsLaneOp = false;
    bool IsDupOp = false;
    unsigned NewOpc = 0;
    unsigned NumVecs = 0;
    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
    switch (IntNo) {
    default: llvm_unreachable("unexpected intrinsic for Neon base update");
    case Intrinsic::aarch64_neon_ld2:       NewOpc = AArch64ISD::LD2post;
      NumVecs = 2; break;
    case Intrinsic::aarch64_neon_ld3:       NewOpc = AArch64ISD::LD3post;
      NumVecs = 3; break;
    case Intrinsic::aarch64_neon_ld4:       NewOpc = AArch64ISD::LD4post;
      NumVecs = 4; break;
    case Intrinsic::aarch64_neon_st2:       NewOpc = AArch64ISD::ST2post;
      NumVecs = 2; IsStore = true; break;
    case Intrinsic::aarch64_neon_st3:       NewOpc = AArch64ISD::ST3post;
      NumVecs = 3; IsStore = true; break;
    case Intrinsic::aarch64_neon_st4:       NewOpc = AArch64ISD::ST4post;
      NumVecs = 4; IsStore = true; break;
    case Intrinsic::aarch64_neon_ld1x2:     NewOpc = AArch64ISD::LD1x2post;
      NumVecs = 2; break;
    case Intrinsic::aarch64_neon_ld1x3:     NewOpc = AArch64ISD::LD1x3post;
      NumVecs = 3; break;
    case Intrinsic::aarch64_neon_ld1x4:     NewOpc = AArch64ISD::LD1x4post;
      NumVecs = 4; break;
    case Intrinsic::aarch64_neon_st1x2:     NewOpc = AArch64ISD::ST1x2post;
      NumVecs = 2; IsStore = true; break;
    case Intrinsic::aarch64_neon_st1x3:     NewOpc = AArch64ISD::ST1x3post;
      NumVecs = 3; IsStore = true; break;
    case Intrinsic::aarch64_neon_st1x4:     NewOpc = AArch64ISD::ST1x4post;
      NumVecs = 4; IsStore = true; break;
    case Intrinsic::aarch64_neon_ld2r:      NewOpc = AArch64ISD::LD2DUPpost;
      NumVecs = 2; IsDupOp = true; break;
    case Intrinsic::aarch64_neon_ld3r:      NewOpc = AArch64ISD::LD3DUPpost;
      NumVecs = 3; IsDupOp = true; break;
    case Intrinsic::aarch64_neon_ld4r:      NewOpc = AArch64ISD::LD4DUPpost;
      NumVecs = 4; IsDupOp = true; break;
    case Intrinsic::aarch64_neon_ld2lane:   NewOpc = AArch64ISD::LD2LANEpost;
      NumVecs = 2; IsLaneOp = true; break;
    case Intrinsic::aarch64_neon_ld3lane:   NewOpc = AArch64ISD::LD3LANEpost;
      NumVecs = 3; IsLaneOp = true; break;
    case Intrinsic::aarch64_neon_ld4lane:   NewOpc = AArch64ISD::LD4LANEpost;
      NumVecs = 4; IsLaneOp = true; break;
    case Intrinsic::aarch64_neon_st2lane:   NewOpc = AArch64ISD::ST2LANEpost;
      NumVecs = 2; IsStore = true; IsLaneOp = true; break;
    case Intrinsic::aarch64_neon_st3lane:   NewOpc = AArch64ISD::ST3LANEpost;
      NumVecs = 3; IsStore = true; IsLaneOp = true; break;
    case Intrinsic::aarch64_neon_st4lane:   NewOpc = AArch64ISD::ST4LANEpost;
      NumVecs = 4; IsStore = true; IsLaneOp = true; break;
    }

    // Stores take the vector type from their first data operand, loads from
    // their first result.
    EVT VecTy;
    if (IsStore)
      VecTy = N->getOperand(2).getValueType();
    else
      VecTy = N->getValueType(0);

    // If the increment is a constant, it must match the memory ref size.
    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
      // Compare the full 64-bit value: narrowing it to 32 bits would let an
      // increment whose low 32 bits happen to equal the access size be folded
      // even though the ADD advances the pointer by a different amount.
      uint64_t IncVal = CInc->getZExtValue();
      unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
      // Lane and dup operations only access one element per vector.
      if (IsLaneOp || IsDupOp)
        NumBytes /= VecTy.getVectorNumElements();
      if (IncVal != NumBytes)
        continue;
      // XZR as the increment operand stands for the matching constant
      // increment.
      Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
    }

    SmallVector<SDValue, 8> Ops;
    Ops.push_back(N->getOperand(0)); // Incoming chain
    // Load lane and store have vector list as input.
    if (IsLaneOp || IsStore)
      for (unsigned i = 2; i < AddrOpIdx; ++i)
        Ops.push_back(N->getOperand(i));
    Ops.push_back(Addr); // Base register
    Ops.push_back(Inc);

    // Return Types: up to four result vectors, then the write back register
    // and the chain.
    EVT Tys[6];
    unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
    unsigned n;
    for (n = 0; n < NumResultVecs; ++n)
      Tys[n] = VecTy;
    Tys[n++] = MVT::i64;  // Type of write back register
    Tys[n] = MVT::Other;  // Type of the chain
    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));

    MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
                                           MemInt->getMemoryVT(),
                                           MemInt->getMemOperand());

    // Update the uses: the old node's vector results and chain come from the
    // new node, skipping over the write back register result, which replaces
    // the ADD.
    std::vector<SDValue> NewResults;
    for (unsigned i = 0; i < NumResultVecs; ++i) {
      NewResults.push_back(SDValue(UpdN.getNode(), i));
    }
    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
    DCI.CombineTo(N, NewResults);
    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));

    break;
  }
  return SDValue();
}
// Checks to see if the value is the prescribed width and returns information
// about its extension mode.
//
// V is accepted when it is
//  - a load whose memory type is i8 (width == 8) or i16 (width == 16);
//    ExtType is set to the load's extension type,
//  - an AssertSext / AssertZext of i8 or i16 matching width; ExtType is set to
//    SEXTLOAD / ZEXTLOAD respectively,
//  - a constant whose sign-extended value lies strictly between
//    -2^(width-1) and 2^(width-1); ExtType stays NON_EXTLOAD.
// For every other node, false is returned with ExtType == NON_EXTLOAD.
static
bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
  ExtType = ISD::NON_EXTLOAD;

  // True if VT is the integer type that corresponds to the requested width.
  auto MatchesWidth = [width](EVT VT) {
    return (VT == MVT::i8 && width == 8) || (VT == MVT::i16 && width == 16);
  };

  switch (V.getNode()->getOpcode()) {
  default:
    return false;
  case ISD::LOAD: {
    LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
    if (!MatchesWidth(LoadNode->getMemoryVT()))
      return false;
    ExtType = LoadNode->getExtensionType();
    return true;
  }
  case ISD::AssertSext: {
    VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
    if (!MatchesWidth(TypeNode->getVT()))
      return false;
    ExtType = ISD::SEXTLOAD;
    return true;
  }
  case ISD::AssertZext: {
    VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
    if (!MatchesWidth(TypeNode->getVT()))
      return false;
    ExtType = ISD::ZEXTLOAD;
    return true;
  }
  case ISD::Constant:
  case ISD::TargetConstant: {
    // Equivalent to |Value| < 2^(width-1), but written as a two-sided range
    // check because taking the absolute value of INT64_MIN is undefined.
    const int64_t Value = cast<ConstantSDNode>(V.getNode())->getSExtValue();
    const int64_t Limit = 1LL << (width - 1);
    return Value > -Limit && Value < Limit;
  }
  }
}
// This function does a whole lot of voodoo to determine if the tests are
// equivalent without and with a mask. Essentially what happens is that given a
// DAG resembling:
//
//  +-------------+ +-------------+ +-------------+ +-------------+
//  |    Input    | | AddConstant | | CompConstant| |     CC      |
//  +-------------+ +-------------+ +-------------+ +-------------+
//           |           |           |               |
//           V           V           |    +----------+
//          +-------------+  +----+  |    |
//          |     ADD     |  |0xff|  |    |
//          +-------------+  +----+  |    |
//                  |           |    |    |
//                  V           V    |    |
//                 +-------------+   |    |
//                 |     AND     |   |    |
//                 +-------------+   |    |
//                        |          |    |
//                        +-----+    |    |
//                              |    |    |
//                              V    V    V
//                             +-------------+
//                             |     CMP     |
//                             +-------------+
//
// The AND node may be safely removed for some combinations of inputs. In
// particular we need to take into account the extension type of the Input,
// the exact values of AddConstant, CompConstant, and CC, along with the nominal
// width of the input (this can work for any width inputs, the above graph is
// specific to 8 bits).
//
// The specific equations were worked out by generating output tables for each
// AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
// problem was simplified by working with 4 bit inputs, which means we only
// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
// patterns present in both extensions (0,7). For every distinct set of
// AddConstant and CompConstant bit patterns we can consider the masked and
// unmasked versions to be equivalent if the result of this function is true for
// all 16 distinct bit patterns for the current extension type of Input (w0).
//
//   sub      w8, w0, w1
//   and      w10, w8, #0x0f
//   cmp      w8, w2
//   cset     w9, AArch64CC
//   cmp      w10, w2
//   cset     w11, AArch64CC
//   cmp      w9, w11
//   cset     w0, eq
//   ret
//
// Since the above function shows when the outputs are equivalent it defines
// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
// would be expensive to run during compiles. The equations below were written
// in a test harness that confirmed they gave equivalent outputs to the above
// function for all inputs, so they can be used to determine if the removal is
// legal instead.
//
// isEquivalentMaskless() is the code for testing if the AND can be removed,
// factored out of the DAG recognition as the DAG can take several forms.
static bool isEquivalentMaskless(unsigned CC, unsigned width,
                                 ISD::LoadExtType ExtType, int AddConstant,
                                 int CompConstant) {
  // The equations are written only in terms of symbolic values and well known
  // constants (0, 1, -1, MaxUInt), which makes them generally applicable to
  // all bit widths.
  const int MaxUInt = 1 << width;

  // For the purposes of these comparisons sign extending the type is
  // equivalent to zero extending the add and displacing it by half the integer
  // width. Provided the equations are valid over the whole range, adjusting
  // the input this way avoids writing separate equations for sign extended
  // inputs.
  const int Add = ExtType == ISD::SEXTLOAD ? AddConstant - (1 << (width - 1))
                                           : AddConstant;
  const int Comp = CompConstant;

  switch (CC) {
  case AArch64CC::LE:
  case AArch64CC::GT:
    return Add == 0 ||
           (Comp == MaxUInt - 1 && Add < 0) ||
           (Add >= 0 && Comp < 0) ||
           (Add <= 0 && Comp <= 0 && Comp < Add);
  case AArch64CC::LT:
  case AArch64CC::GE:
    return Add == 0 ||
           (Add >= 0 && Comp <= 0) ||
           (Add <= 0 && Comp <= 0 && Comp <= Add);
  case AArch64CC::HI:
  case AArch64CC::LS:
    return (Add >= 0 && Comp < 0) ||
           (Add <= 0 && Comp >= -1 && Comp < Add + MaxUInt);
  case AArch64CC::PL:
  case AArch64CC::MI:
    return Add == 0 ||
           (Add > 0 && Comp <= 0) ||
           (Add < 0 && Comp <= Add);
  case AArch64CC::LO:
  case AArch64CC::HS:
    return (Add >= 0 && Comp <= 0) ||
           (Add <= 0 && Comp >= 0 && Comp <= Add + MaxUInt);
  case AArch64CC::EQ:
  case AArch64CC::NE:
    return (Add > 0 && Comp < 0) ||
           (Add < 0 && Comp >= 0 && Comp < Add + MaxUInt) ||
           (Add >= 0 && Comp >= 0 && Comp >= Add) ||
           (Add <= 0 && Comp < 0 && Comp < Add);
  case AArch64CC::VS:
  case AArch64CC::VC:
  case AArch64CC::AL:
  case AArch64CC::NV:
    return true;
  case AArch64CC::Invalid:
    return false;
  }

  // Any value that is not a known condition code is never equivalent.
  return false;
}
/// Try to drop a redundant 0xff / 0xffff mask that feeds the SUBS used by a
/// conditional node.
///
/// Matches (SUBS (AND (ADD Input, AddConstant), Mask), CompConstant) where the
/// SUBS is operand \p CmpIndex of \p N and the condition code is the constant
/// operand \p CCIndex. When isEquivalentMaskless() shows the comparison gives
/// the same answer without the AND, all uses of the SUBS are redirected to a
/// new SUBS that consumes the ADD directly.
///
/// \returns \p N itself if the SUBS was replaced, otherwise an empty SDValue.
static
SDValue performCONDCombine(SDNode *N,
                           TargetLowering::DAGCombinerInfo &DCI,
                           SelectionDAG &DAG, unsigned CCIndex,
                           unsigned CmpIndex) {
  unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
  SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
  unsigned CondOpcode = SubsNode->getOpcode();

  if (CondOpcode != AArch64ISD::SUBS)
    return SDValue();

  // There is a SUBS feeding this condition. Is it fed by a mask we can
  // use?
  SDNode *AndNode = SubsNode->getOperand(0).getNode();
  unsigned MaskBits = 0;

  if (AndNode->getOpcode() != ISD::AND)
    return SDValue();

  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
    // Look at the full 64-bit mask: narrowing it to 32 bits would make a mask
    // such as 0x1000000ff pass for 0xff even though it keeps an extra bit.
    uint64_t CNV = CN->getZExtValue();
    if (CNV == 255)
      MaskBits = 8;
    else if (CNV == 65535)
      MaskBits = 16;
  }

  if (!MaskBits)
    return SDValue();

  SDValue AddValue = AndNode->getOperand(0);

  if (AddValue.getOpcode() != ISD::ADD)
    return SDValue();

  // The basic dag structure is correct, grab the inputs and validate them.

  SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
  SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
  SDValue SubsInputValue = SubsNode->getOperand(1);

  // The mask is present and the provenance of all the values is a smaller type,
  // lets see if the mask is superfluous.

  if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
      !isa<ConstantSDNode>(SubsInputValue.getNode()))
    return SDValue();

  // Each call overwrites ExtType, so the non-constant add input is checked
  // last: its extension kind is the one handed to isEquivalentMaskless().
  ISD::LoadExtType ExtType;

  if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
      !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
      !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
    return SDValue();

  if(!isEquivalentMaskless(CC, MaskBits, ExtType,
                cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
                cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
    return SDValue();

  // The AND is not necessary, remove it.

  SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
                               SubsNode->getValueType(1));
  SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };

  SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
  DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());

  return SDValue(N, 0);
}
// Optimize compare with zero and branch.
//
// An EQ/NE conditional branch on the flags of an ADDS/SUBS that compares an
// i32 or i64 value against zero is replaced by CBZ/CBNZ on that value. The
// fold is only done when the flag result has exactly one use and the value
// result has none. Always returns an empty SDValue; the replacement is made
// through DCI.CombineTo.
static SDValue performBRCONDCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    SelectionDAG &DAG) {
  // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
  // will not be produced, as they are conditional branch instructions that do
  // not set flags.
  const MachineFunction &MF = DAG.getMachineFunction();
  if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
    return SDValue();

  // Give the mask-removal combine a chance first (condition code is operand 2,
  // the compare is operand 3).
  if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
    N = NV.getNode();

  SDValue Chain = N->getOperand(0);
  SDValue Dest = N->getOperand(1);
  SDValue CCVal = N->getOperand(2);
  SDValue Cmp = N->getOperand(3);

  assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
  const unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
  if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
    return SDValue();

  switch (Cmp.getOpcode()) {
  case AArch64ISD::ADDS:
  case AArch64ISD::SUBS:
    break;
  default:
    return SDValue();
  }

  // Only attempt folding if there is only one use of the flag and no use of the
  // value.
  if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
    return SDValue();

  SDValue LHS = Cmp.getOperand(0);
  SDValue RHS = Cmp.getOperand(1);

  assert(LHS.getValueType() == RHS.getValueType() &&
         "Expected the value type to be the same for both operands!");
  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
    return SDValue();

  // Put the zero, if any, on the right hand side.
  if (isNullConstant(LHS))
    std::swap(LHS, RHS);
  if (!isNullConstant(RHS))
    return SDValue();

  // Values produced by a shift are not folded.
  switch (LHS.getOpcode()) {
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:
    return SDValue();
  default:
    break;
  }

  // Fold the compare into the branch instruction.
  const unsigned BranchOpc =
      CC == AArch64CC::EQ ? AArch64ISD::CBZ : AArch64ISD::CBNZ;
  SDValue BR = DAG.getNode(BranchOpc, SDLoc(N), MVT::Other, Chain, LHS, Dest);

  // Do not add new nodes to DAG combiner worklist.
  DCI.CombineTo(N, BR, false);

  return SDValue();
}
// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
// as well as whether the test should be inverted. This code is required to
// catch these cases (as opposed to standard dag combines) because
// AArch64ISD::TBZ is matched during legalization.
//
// Bit and Invert are updated in place as the function looks through the
// operations feeding Op; Op itself is returned when nothing can be looked
// through (in particular when Op has more than one use).
static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
                                 SelectionDAG &DAG) {
  if (!Op->hasOneUse())
    return Op;

  // We don't handle undef/constant-fold cases below, as they should have
  // already been taken care of (e.g. and of 0, test of undefined shifted bits,
  // etc.)

  const unsigned Opc = Op->getOpcode();

  // (tbz (trunc x), b) -> (tbz x, b)
  // This case is just here to enable more of the below cases to be caught.
  //
  // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
  if ((Opc == ISD::TRUNCATE && Bit < Op->getValueType(0).getSizeInBits()) ||
      (Opc == ISD::ANY_EXTEND &&
       Bit < Op->getOperand(0).getValueSizeInBits()))
    return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);

  // Everything else handled here is a binary operation with a constant on the
  // right hand side.
  if (Op->getNumOperands() != 2)
    return Op;
  auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
  if (!C)
    return Op;

  const uint64_t Imm = C->getZExtValue();
  const unsigned Width = Op->getValueType(0).getSizeInBits();

  // Each case either gives up by returning Op, or adjusts Bit / Invert and
  // breaks out to continue the search on the first operand.
  switch (Opc) {
  default:
    return Op;

  // (tbz (and x, m), b) -> (tbz x, b)
  case ISD::AND:
    if (!((Imm >> Bit) & 1))
      return Op;
    break;

  // (tbz (shl x, c), b) -> (tbz x, b-c)
  case ISD::SHL:
    if (Imm > Bit || (Bit - Imm) >= Width)
      return Op;
    Bit = Bit - Imm;
    break;

  // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
  case ISD::SRA:
    Bit = Bit + Imm;
    if (Bit >= Width)
      Bit = Width - 1;
    break;

  // (tbz (srl x, c), b) -> (tbz x, b+c)
  case ISD::SRL:
    if ((Bit + Imm) >= Width)
      return Op;
    Bit = Bit + Imm;
    break;

  // (tbz (xor x, -1), b) -> (tbnz x, b)
  case ISD::XOR:
    if ((Imm >> Bit) & 1)
      Invert = !Invert;
    break;
  }

  return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
}
// Optimize test single bit zero/non-zero and branch.
//
// Looks through the operations feeding the tested value (operand 1) with
// getTestBitOperand() and, if a simpler source was found, rebuilds the branch
// on that source with the adjusted bit number, swapping TBZ and TBNZ when the
// sense of the test was inverted. Returns an empty SDValue if nothing changed.
static SDValue performTBZCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 SelectionDAG &DAG) {
  unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
  bool Invert = false;
  const SDValue OldSrc = N->getOperand(1);
  const SDValue NewSrc = getTestBitOperand(OldSrc, Bit, Invert, DAG);

  // Nothing was looked through.
  if (NewSrc == OldSrc)
    return SDValue();

  unsigned NewOpc = N->getOpcode();
  if (Invert) {
    if (NewOpc != AArch64ISD::TBZ) {
      assert(NewOpc == AArch64ISD::TBNZ);
      NewOpc = AArch64ISD::TBZ;
    } else {
      NewOpc = AArch64ISD::TBNZ;
    }
  }

  SDLoc DL(N);
  SDValue Chain = N->getOperand(0);
  SDValue Dest = N->getOperand(3);
  return DAG.getNode(NewOpc, DL, MVT::Other, Chain, NewSrc,
                     DAG.getConstant(Bit, DL, MVT::i64), Dest);
}
// vselect (v1i1 setcc) ->
//     vselect (v1iXX setcc)  (XX is the size of the compared operand type)
// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
// such VSELECT.
//
// Returns the rebuilt VSELECT, or an empty SDValue when the condition is not a
// v1i1 SETCC or the result and the compared operands differ in size.
static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
  SDValue Cond = N->getOperand(0);
  const EVT CondVT = Cond.getValueType();

  if (Cond.getOpcode() != ISD::SETCC)
    return SDValue();
  if (CondVT.getVectorNumElements() != 1)
    return SDValue();
  if (CondVT.getVectorElementType() != MVT::i1)
    return SDValue();

  // Only combine when the result type is of the same size as the compared
  // operands.
  const EVT ResVT = N->getValueType(0);
  const EVT CmpVT = Cond.getOperand(0).getValueType();
  if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
    return SDValue();

  SDValue IfTrue = N->getOperand(1);
  SDValue IfFalse = N->getOperand(2);

  // Redo the comparison with an integer result as wide as its operands.
  SDLoc DL(N);
  const ISD::CondCode Pred = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
  SDValue WideSetCC =
      DAG.getSetCC(DL, CmpVT.changeVectorElementTypeToInteger(),
                   Cond.getOperand(0), Cond.getOperand(1), Pred);
  return DAG.getNode(ISD::VSELECT, DL, ResVT, WideSetCC, IfTrue, IfFalse);
}
/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
/// the compare-mask instructions rather than going via NZCV, even if LHS and
/// RHS are really scalar. This replaces any scalar setcc in the above pattern
/// with a vector one followed by a DUP shuffle on the result.
///
/// \returns the new select, or an empty SDValue if the pattern does not match
/// or no suitable vector comparison type exists.
static SDValue performSelectCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);
  const EVT ResVT = N->getValueType(0);

  if (N0.getOpcode() != ISD::SETCC)
    return SDValue();

  // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
  // scalar SetCCResultType. We also don't expect vectors, because we assume
  // that selects fed by vector SETCCs are canonicalized to VSELECT.
  assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
         "Scalar-SETCC feeding SELECT has unexpected result type!");

  // Don't try to do this optimization when the setcc itself has i1 operands.
  // There are no legal vectors of i1, so this would be pointless.
  const EVT ScalarVT = N0.getOperand(0).getValueType();
  if (ScalarVT == MVT::i1)
    return SDValue();

  // If NumMaskElts == 0, the comparison is larger than select result. The
  // largest real NEON comparison is 64-bits per lane, which means the result is
  // at most 32-bits and an illegal vector. Just bail out for now.
  int NumMaskElts = ResVT.getSizeInBits() / ScalarVT.getSizeInBits();
  if (!ResVT.isVector() || NumMaskElts == 0)
    return SDValue();

  // The vector type in which the comparison is done, and the integer vector
  // type of its result.
  EVT SrcVT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumMaskElts);
  const EVT CCVT = SrcVT.changeVectorElementTypeToInteger();

  // Also bail out if the vector CCVT isn't the same size as ResVT.
  // This can happen if the SETCC operand size doesn't divide the ResVT size
  // (e.g., f64 vs v3f32).
  if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
    return SDValue();

  // Make sure we didn't create illegal types, if we're not supposed to.
  assert(DCI.isBeforeLegalize() ||
         DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));

  // First perform a vector comparison, where lane 0 is the one we're interested
  // in.
  SDLoc DL(N0);
  SDValue VecLHS =
      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
  SDValue VecRHS =
      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
  SDValue VecSetCC =
      DAG.getNode(ISD::SETCC, DL, CCVT, VecLHS, VecRHS, N0.getOperand(2));

  // Now duplicate the comparison mask we want across all other lanes.
  SmallVector<int, 8> Lane0Splat(CCVT.getVectorNumElements(), 0);
  SDValue Mask =
      DAG.getVectorShuffle(CCVT, DL, VecSetCC, VecSetCC, Lane0Splat);
  Mask = DAG.getNode(ISD::BITCAST, DL,
                     ResVT.changeVectorElementTypeToInteger(), Mask);

  return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
}
/// Get rid of unnecessary NVCASTs (that don't change the type).
///
/// \returns the cast's operand when it already has the result type, otherwise
/// an empty SDValue.
static SDValue performNVCASTCombine(SDNode *N) {
  SDValue Src = N->getOperand(0);
  const bool IsNoOp = N->getValueType(0) == Src.getValueType();
  return IsNoOp ? Src : SDValue();
}
// If all users of the globaladdr are of the form (globaladdr + constant), find
// the smallest constant, fold it into the globaladdr's offset and rewrite the
// globaladdr as (globaladdr + constant) - constant.
//
// Returns the rewritten value, or an empty SDValue if the global reference
// carries operand flags, some user is not an add of a constant, or folding the
// offset is not considered legal.
static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
                                           const AArch64Subtarget *Subtarget,
                                           const TargetMachine &TM) {
  auto *GN = cast<GlobalAddressSDNode>(N);
  const GlobalValue *GV = GN->getGlobal();
  if (Subtarget->ClassifyGlobalReference(GV, TM) != AArch64II::MO_NO_FLAG)
    return SDValue();

  // Find the smallest constant (as an unsigned value) added by any user.
  uint64_t MinOffset = -1ull;
  for (SDNode *User : GN->uses()) {
    if (User->getOpcode() != ISD::ADD)
      return SDValue();

    // The constant may sit on either side of the add; the left one wins.
    const unsigned ConstIdx = isa<ConstantSDNode>(User->getOperand(0)) ? 0 : 1;
    auto *C = dyn_cast<ConstantSDNode>(User->getOperand(ConstIdx));
    if (!C)
      return SDValue();

    MinOffset = std::min(MinOffset, C->getZExtValue());
  }

  const int64_t OldOffset = GN->getOffset();
  const uint64_t Offset = MinOffset + OldOffset;

  // Require that the new offset is larger than the existing one. Otherwise, we
  // can end up oscillating between two possible DAGs, for example,
  // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
  if (Offset <= uint64_t(OldOffset))
    return SDValue();

  // Check whether folding this offset is legal. It must not go out of bounds of
  // the referenced object to avoid violating the code model, and must be
  // smaller than 2^21 because this is the largest offset expressible in all
  // object formats.
  //
  // This check also prevents us from folding negative offsets, which will end
  // up being treated in the same way as large positive ones. They could also
  // cause code model violations, and aren't really common enough to matter.
  if (Offset >= (1 << 21))
    return SDValue();

  Type *T = GV->getValueType();
  if (!T->isSized())
    return SDValue();
  if (Offset > GV->getParent()->getDataLayout().getTypeAllocSize(T))
    return SDValue();

  // (globaladdr + Offset) - MinOffset
  SDLoc DL(GN);
  SDValue Folded = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
  return DAG.getNode(ISD::SUB, DL, MVT::i64, Folded,
                     DAG.getConstant(MinOffset, DL, MVT::i64));
}
// Target hook for DAG combining: dispatches on the opcode of N to the matching
// perform*Combine helper. An empty SDValue means no combine was applied.
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  const unsigned Opcode = N->getOpcode();

  switch (Opcode) {
  case ISD::ADD:
  case ISD::SUB:
    return performAddSubLongCombine(N, DCI, DAG);
  case ISD::XOR:
    return performXorCombine(N, DAG, DCI, Subtarget);
  case ISD::MUL:
    return performMulCombine(N, DAG, DCI, Subtarget);
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
    return performIntToFpCombine(N, DAG, Subtarget);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return performFpToIntCombine(N, DAG, DCI, Subtarget);
  case ISD::FDIV:
    return performFDivCombine(N, DAG, DCI, Subtarget);
  case ISD::OR:
    return performORCombine(N, DCI, Subtarget);
  case ISD::AND:
    return performANDCombine(N, DCI);
  case ISD::SRL:
    return performSRLCombine(N, DCI);
  case ISD::INTRINSIC_WO_CHAIN:
    return performIntrinsicCombine(N, DCI, Subtarget);
  case ISD::ANY_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND:
    return performExtendCombine(N, DCI, DAG);
  case ISD::BITCAST:
    return performBitcastCombine(N, DCI, DAG);
  case ISD::CONCAT_VECTORS:
    return performConcatVectorsCombine(N, DCI, DAG);
  case ISD::SELECT:
    return performSelectCombine(N, DCI);
  case ISD::VSELECT:
    return performVSelectCombine(N, DCI.DAG);
  case ISD::LOAD:
    // When the TBI simplification of the address operand reports success,
    // N itself is returned as the result of the combine.
    return performTBISimplification(N->getOperand(1), DCI, DAG)
               ? SDValue(N, 0)
               : SDValue();
  case ISD::STORE:
    return performSTORECombine(N, DCI, DAG, Subtarget);
  case AArch64ISD::BRCOND:
    return performBRCONDCombine(N, DCI, DAG);
  case AArch64ISD::TBNZ:
  case AArch64ISD::TBZ:
    return performTBZCombine(N, DCI, DAG);
  case AArch64ISD::CSEL:
    return performCONDCombine(N, DCI, DAG, 2, 3);
  case AArch64ISD::DUP:
    return performPostLD1Combine(N, DCI, false);
  case AArch64ISD::NVCAST:
    return performNVCASTCombine(N);
  case ISD::INSERT_VECTOR_ELT:
    return performPostLD1Combine(N, DCI, true);
  case ISD::INTRINSIC_VOID:
  case ISD::INTRINSIC_W_CHAIN:
    // Operand 1 holds the intrinsic ID. Only the NEON structured load/store
    // intrinsics listed here are combined.
    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
    case Intrinsic::aarch64_neon_ld2:
    case Intrinsic::aarch64_neon_ld3:
    case Intrinsic::aarch64_neon_ld4:
    case Intrinsic::aarch64_neon_ld1x2:
    case Intrinsic::aarch64_neon_ld1x3:
    case Intrinsic::aarch64_neon_ld1x4:
    case Intrinsic::aarch64_neon_ld2lane:
    case Intrinsic::aarch64_neon_ld3lane:
    case Intrinsic::aarch64_neon_ld4lane:
    case Intrinsic::aarch64_neon_ld2r:
    case Intrinsic::aarch64_neon_ld3r:
    case Intrinsic::aarch64_neon_ld4r:
    case Intrinsic::aarch64_neon_st2:
    case Intrinsic::aarch64_neon_st3:
    case Intrinsic::aarch64_neon_st4:
    case Intrinsic::aarch64_neon_st1x2:
    case Intrinsic::aarch64_neon_st1x3:
    case Intrinsic::aarch64_neon_st1x4:
    case Intrinsic::aarch64_neon_st2lane:
    case Intrinsic::aarch64_neon_st3lane:
    case Intrinsic::aarch64_neon_st4lane:
      return performNEONPostLDSTCombine(N, DCI, DAG);
    default:
      return SDValue();
    }
  case ISD::GlobalAddress:
    return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
  default:
    LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
    break;
  }
  return SDValue();
}
// Decide whether the value produced by N feeds nothing but a return, which is
// a precondition for emitting a tail call. Target return nodes and other
// unusual constructs that the generic analysis would not necessarily catch are
// checked here. On success, Chain is updated to the chain to use.
bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
                                               SDValue &Chain) const {
  // N must define exactly one value, and that value must have one use.
  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
    return false;

  SDValue TCChain = Chain;
  SDNode *Copy = *N->use_begin();
  const unsigned UserOpc = Copy->getOpcode();
  if (UserOpc == ISD::CopyToReg) {
    // Be conservative: a CopyToReg whose last operand is glue is assumed to
    // make the tail call unsafe.
    const unsigned LastIdx = Copy->getNumOperands() - 1;
    if (Copy->getOperand(LastIdx).getValueType() == MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else if (UserOpc != ISD::FP_EXTEND) {
    return false;
  }

  // Every user of the copy must be a RET_FLAG, and there must be at least one.
  unsigned NumRets = 0;
  for (SDNode *User : Copy->uses()) {
    if (User->getOpcode() != AArch64ISD::RET_FLAG)
      return false;
    ++NumRets;
  }
  if (NumRets == 0)
    return false;

  Chain = TCChain;
  return true;
}
// Report whether a call instruction can potentially be optimized into a tail
// call. A positive answer makes the optimizers try to move or duplicate return
// instructions so that the tail call optimization can apply to this call.
bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  const bool MarkedTail = CI->isTailCall();
  return MarkedTail;
}
// Shared helper for the pre- and post-indexed address queries. Op must be an
// ADD or SUB whose second operand is a constant that fits the addressing mode.
// Base is set to operand 0 of any ADD/SUB, even when the query then fails;
// Offset and IsInc are only written on success. AM and DAG are not used here.
bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
                                                   SDValue &Offset,
                                                   ISD::MemIndexedMode &AM,
                                                   bool &IsInc,
                                                   SelectionDAG &DAG) const {
  const unsigned Opc = Op->getOpcode();
  const bool IsAdd = Opc == ISD::ADD;
  const bool IsSub = Opc == ISD::SUB;
  if (!IsAdd && !IsSub)
    return false;

  Base = Op->getOperand(0);

  auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
  if (!RHS)
    return false;

  // Every indexed addressing mode instruction takes a signed 9-bit immediate
  // offset. For SUB the constant is negated (via unsigned arithmetic) first.
  int64_t RHSC = RHS->getSExtValue();
  if (IsSub)
    RHSC = -(uint64_t)RHSC;
  if (!isInt<9>(RHSC))
    return false;

  IsInc = IsAdd;
  Offset = Op->getOperand(1);
  return true;
}
// Determine whether the load or store N can use pre-indexed addressing.
// N must be a LoadSDNode or StoreSDNode; its base pointer is handed to
// getIndexedAddressParts, which fills in Base and Offset. On success AM is set
// to PRE_INC or PRE_DEC and true is returned.
bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
                                                      SDValue &Offset,
                                                      ISD::MemIndexedMode &AM,
                                                      SelectionDAG &DAG) const {
  // Only the base pointer of the memory node is needed. The memory VT that
  // used to be fetched alongside it was never read, so it is no longer
  // computed.
  SDValue Ptr;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
    Ptr = LD->getBasePtr();
  else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
    Ptr = ST->getBasePtr();
  else
    return false;

  bool IsInc;
  if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
    return false;
  AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
  return true;
}
// Determine whether the load or store N, together with the address update Op,
// can use post-indexed addressing. N must be a LoadSDNode or StoreSDNode and
// Op is decomposed by getIndexedAddressParts. On success AM is set to POST_INC
// or POST_DEC and true is returned.
bool AArch64TargetLowering::getPostIndexedAddressParts(
    SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
    ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
  // Only the base pointer of the memory node is needed. The memory VT that
  // used to be fetched alongside it was never read, so it is no longer
  // computed.
  SDValue Ptr;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
    Ptr = LD->getBasePtr();
  else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
    Ptr = ST->getBasePtr();
  else
    return false;

  bool IsInc;
  if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
    return false;
  // Post-indexing updates the base, so it's not a valid transform
  // if that's not the same as the load's pointer.
  if (Ptr != Base)
    return false;
  AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
  return true;
}
// Custom result replacement for BITCAST. Only the f16 -> i16 case is handled:
// the f16 value is inserted into the hsub subregister of an f32, the f32 is
// bitcast to i32, and the i32 is truncated to i16. For any other type pair
// nothing is added to Results.
static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                  SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Op = N->getOperand(0);

  const bool IsF16ToI16 =
      N->getValueType(0) == MVT::i16 && Op.getValueType() == MVT::f16;
  if (!IsF16ToI16)
    return;

  SDNode *Widened =
      DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
                         DAG.getUNDEF(MVT::i32), Op,
                         DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32));
  SDValue AsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SDValue(Widened, 0));
  Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, AsInt));
}
// Custom result replacement for an across-vector reduction. Operand 0 of N is
// split into two halves, the halves are combined element-wise with InterOp,
// and AcrossOp is then applied to the combined half-width vector. The single
// resulting value is appended to Results.
static void ReplaceReductionResults(SDNode *N,
                                    SmallVectorImpl<SDValue> &Results,
                                    SelectionDAG &DAG, unsigned InterOp,
                                    unsigned AcrossOp) {
  SDLoc dl(N);

  // Only the low half's type is needed for the nodes built below.
  EVT LoVT = DAG.GetSplitDestVTs(N->getValueType(0)).first;
  auto Halves = DAG.SplitVectorOperand(N, 0);

  SDValue Combined =
      DAG.getNode(InterOp, dl, LoVT, Halves.first, Halves.second);
  Results.push_back(DAG.getNode(AcrossOp, dl, LoVT, Combined));
}
// Split the i128 value N into two i64 values. The first element of the pair is
// the truncation of N; the second is the truncation of N shifted right by 64.
static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, N);
  SDValue ShiftAmt = DAG.getConstant(64, DL, MVT::i64);
  SDValue Upper = DAG.getNode(ISD::SRL, DL, MVT::i128, N, ShiftAmt);
  SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, Upper);
  return {Lo, Hi};
}
// Build an even/odd pair of X registers that holds the integer value V. The
// two 64-bit halves of V are placed in the sube64 and subo64 subregisters of a
// REG_SEQUENCE over the XSeqPairsClass register class; the halves are swapped
// on big-endian targets.
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
  SDLoc dl(V.getNode());

  SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i64);
  SDValue ShiftAmt = DAG.getConstant(64, dl, MVT::i64);
  SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i128, V, ShiftAmt);
  SDValue VHi = DAG.getAnyExtOrTrunc(Upper, dl, MVT::i64);
  if (DAG.getDataLayout().isBigEndian())
    std::swap(VLo, VHi);

  SDValue RegClass =
      DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
  SDValue EvenIdx = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
  SDValue OddIdx = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);

  const SDValue Ops[] = {RegClass, VLo, EvenIdx, VHi, OddIdx};
  SDNode *Pair =
      DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops);
  return SDValue(Pair, 0);
}
// Custom result replacement for a 128-bit ATOMIC_CMP_SWAP. Three values are
// appended to Results: two i64 halves of the loaded value and the output
// chain. Without LSE a CMP_SWAP_128 machine node is built; with LSE one of the
// CASP* instructions is selected directly.
static void ReplaceCMP_SWAP_128Results(SDNode *N,
                                       SmallVectorImpl<SDValue> &Results,
                                       SelectionDAG &DAG,
                                       const AArch64Subtarget *Subtarget) {
  assert(N->getValueType(0) == MVT::i128 &&
         "AtomicCmpSwap on types less than 128 should be legal");

  if (!Subtarget->hasLSE()) {
    // No LSE: split both i128 operands into i64 halves and emit the
    // CMP_SWAP_128 node. Operands are pointer, compare lo/hi, new lo/hi, chain.
    auto Desired = splitInt128(N->getOperand(2), DAG);
    auto New = splitInt128(N->getOperand(3), DAG);
    SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
                     New.first,        New.second,    N->getOperand(0)};
    SDNode *CmpSwap = DAG.getMachineNode(
        AArch64::CMP_SWAP_128, SDLoc(N),
        DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other), Ops);

    MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
    DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});

    // Forward the two i64 results and the chain (value 3). Value 2, the i32
    // result, is not forwarded.
    Results.push_back(SDValue(CmpSwap, 0));
    Results.push_back(SDValue(CmpSwap, 1));
    Results.push_back(SDValue(CmpSwap, 3));
    return;
  }

  // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
  // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
  SDValue Ops[] = {
      createGPRPairNode(DAG, N->getOperand(2)), // Compare value
      createGPRPairNode(DAG, N->getOperand(3)), // Store value
      N->getOperand(1),                         // Ptr
      N->getOperand(0),                         // Chain in
  };

  // The CASP variant is chosen from the memory ordering of the operation.
  MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
  unsigned Opcode;
  switch (MemOp->getOrdering()) {
  case AtomicOrdering::Monotonic:
    Opcode = AArch64::CASPX;
    break;
  case AtomicOrdering::Acquire:
    Opcode = AArch64::CASPAX;
    break;
  case AtomicOrdering::Release:
    Opcode = AArch64::CASPLX;
    break;
  case AtomicOrdering::AcquireRelease:
  case AtomicOrdering::SequentiallyConsistent:
    Opcode = AArch64::CASPALX;
    break;
  default:
    llvm_unreachable("Unexpected ordering!");
  }

  MachineSDNode *CmpSwap = DAG.getMachineNode(
      Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
  DAG.setNodeMemRefs(CmpSwap, {MemOp});

  // Extract the two halves from the untyped register pair. The subregister
  // indices are swapped on big-endian targets.
  unsigned FirstSubReg = AArch64::sube64;
  unsigned SecondSubReg = AArch64::subo64;
  if (DAG.getDataLayout().isBigEndian())
    std::swap(FirstSubReg, SecondSubReg);

  SDValue Pair(CmpSwap, 0);
  Results.push_back(
      DAG.getTargetExtractSubreg(FirstSubReg, SDLoc(N), MVT::i64, Pair));
  Results.push_back(
      DAG.getTargetExtractSubreg(SecondSubReg, SDLoc(N), MVT::i64, Pair));
  Results.push_back(SDValue(CmpSwap, 1)); // Chain out
}
// Target hook for custom result replacement: dispatches on the opcode of N and
// lets the matching helper append the replacement values to Results. Opcodes
// not listed here are not expected to reach this function.
void AArch64TargetLowering::ReplaceNodeResults(
    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::BITCAST:
    ReplaceBITCASTResults(N, Results, DAG);
    break;

  case ISD::VECREDUCE_ADD:
  case ISD::VECREDUCE_SMAX:
  case ISD::VECREDUCE_SMIN:
  case ISD::VECREDUCE_UMAX:
  case ISD::VECREDUCE_UMIN:
    Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
    break;

  // Across-vector reductions: each pairs the element-wise operation with the
  // corresponding across-lanes node.
  case AArch64ISD::SADDV:
    ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
    break;
  case AArch64ISD::UADDV:
    ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
    break;
  case AArch64ISD::SMINV:
    ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
    break;
  case AArch64ISD::UMINV:
    ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
    break;
  case AArch64ISD::SMAXV:
    ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
    break;
  case AArch64ISD::UMAXV:
    ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
    break;

  case ISD::FP_TO_UINT:
  case ISD::FP_TO_SINT:
    assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
    // Results is deliberately left untouched so that the normal code handles
    // this conversion.
    break;

  case ISD::ATOMIC_CMP_SWAP:
    ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
    break;

  default:
    llvm_unreachable("Don't know how to custom expand this");
  }
}
// Android and Fuchsia defer to the generic TargetLowering answer; every other
// target uses the LOAD_STACK_GUARD node.
bool AArch64TargetLowering::useLoadStackGuardNode() const {
  const bool DeferToGeneric =
      Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia();
  if (!DeferToGeneric)
    return true;
  return TargetLowering::useLoadStackGuardNode();
}
// Number of FDIVs sharing one divisor that is required before they are turned
// into FMULs by the reciprocal: three or more.
unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
  const unsigned MinRepeatedDivisorUses = 3;
  return MinRepeatedDivisorUses;
}
// Choose the type legalization action for vector type VT. The single-element
// types v1i8, v1i16, v1i32 and v1f32 are widened rather than promoted; every
// other type gets the generic TargetLoweringBase answer.
TargetLoweringBase::LegalizeTypeAction
AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
  const bool IsWidenedV1Type = VT == MVT::v1i8 || VT == MVT::v1i16 ||
                               VT == MVT::v1i32 || VT == MVT::v1f32;
  if (!IsWidenedV1Type)
    return TargetLoweringBase::getPreferredVectorAction(VT);
  return TypeWidenVector;
}
// Atomic stores narrower than 128 bits are already atomic and are left alone.
// Anything wider than 128 bits cannot be handled here either way, so it is
// left to the default libcall. Only a store of exactly 128 bits is expanded.
bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  const unsigned Bits =
      SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
  const bool IsExactly128 = Bits == 128;
  return IsExactly128;
}
// Atomic loads narrower than 128 bits are already atomic and are left alone.
// Anything wider than 128 bits cannot be handled here either way, so it is
// left to the default libcall. Only a load of exactly 128 bits is expanded,
// using the LL/SC form.
TargetLowering::AtomicExpansionKind
AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
  const unsigned Bits = LI->getType()->getPrimitiveSizeInBits();
  if (Bits == 128)
    return AtomicExpansionKind::LLSC;
  return AtomicExpansionKind::None;
}
// Choose the IR-level expansion for an atomicrmw. ldxr/stxr are available for
// the real atomic operations up to 128 bits, so LL/SC is the fallback.
TargetLowering::AtomicExpansionKind
AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  // Floating-point operations go through a cmpxchg loop.
  if (AI->isFloatingPointOperation())
    return AtomicExpansionKind::CmpXChg;

  const unsigned Size = AI->getType()->getPrimitiveSizeInBits();
  if (Size > 128)
    return AtomicExpansionKind::None;

  // Nand not supported in LSE.
  if (AI->getOperation() == AtomicRMWInst::Nand)
    return AtomicExpansionKind::LLSC;

  // With LSE, operations narrower than 128 bits are left intact; 128-bit
  // operations are left to LL/SC.
  if (Subtarget->hasLSE() && Size < 128)
    return AtomicExpansionKind::None;
  return AtomicExpansionKind::LLSC;
}
// Choose the IR-level expansion for a cmpxchg: either leave it for codegen or
// expand it to an LL/SC loop.
TargetLowering::AtomicExpansionKind
AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
    AtomicCmpXchgInst *AI) const {
  // Two situations leave the cmpxchg intact for codegen:
  //  * The subtarget has LSE.
  //  * We are at -O0. There, fast-regalloc cannot cope with the live vregs
  //    necessary to implement cmpxchg without spilling. If the address being
  //    exchanged is also on the stack and close enough to the spill slot, the
  //    monitor can end up always being cleared, so the atomic operation never
  //    succeeds. A late-expanded pseudo-inst is needed instead.
  const bool LeaveForCodegen =
      Subtarget->hasLSE() || getTargetMachine().getOptLevel() == 0;
  return LeaveForCodegen ? AtomicExpansionKind::None
                         : AtomicExpansionKind::LLSC;
}
// Emit the load-linked half of an LL/SC sequence at the builder's insertion
// point and return the loaded value, typed as the pointee type of Addr. The
// acquire variants of the intrinsics are used when Ord is acquire or stronger.
Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                                             AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
  const bool IsAcquire = isAcquireOrStronger(Ord);

  if (ValTy->getPrimitiveSizeInBits() != 128) {
    // Common case: one ldxr/ldaxr overloaded on the pointer type. The result
    // is truncated to an integer as wide as the pointee and then bitcast to
    // the pointee type.
    Type *Tys[] = {Addr->getType()};
    Intrinsic::ID Int =
        IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
    Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);

    const DataLayout &DL = M->getDataLayout();
    IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValTy));

    Value *Loaded = Builder.CreateCall(Ldxr, Addr);
    Value *Trunc = Builder.CreateTrunc(Loaded, IntEltTy);
    return Builder.CreateBitCast(Trunc, ValTy);
  }

  // i128 isn't legal and intrinsics don't get type-lowered, so the
  // ldxp/ldaxp intrinsic returns {i64, i64}. The two halves are recombined
  // into a single i128 here.
  Intrinsic::ID Int =
      IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
  Function *Ldxr = Intrinsic::getDeclaration(M, Int);

  Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
  Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");

  Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
  Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
  Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
  Hi = Builder.CreateZExt(Hi, ValTy, "hi64");

  Value *HiShifted = Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64));
  return Builder.CreateOr(Lo, HiShifted, "val64");
}
// Emit a call to the aarch64_clrex intrinsic at the builder's insertion point.
void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
    IRBuilder<> &Builder) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  auto *Clrex = Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex);
  Builder.CreateCall(Clrex);
}
// Emit the store-conditional half of an LL/SC sequence at the builder's
// insertion point and return the call to the store-exclusive intrinsic. The
// release variants of the intrinsics are used when Ord is release or stronger.
Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
                                                   Value *Val, Value *Addr,
                                                   AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  const bool IsRelease = isReleaseOrStronger(Ord);

  if (Val->getType()->getPrimitiveSizeInBits() != 128) {
    // Common case: one stxr/stlxr overloaded on the pointer type. Val is
    // bitcast to an integer of its own width and then zero-extended (or
    // bitcast) to the type of the intrinsic's first parameter.
    Intrinsic::ID Int =
        IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
    Type *Tys[] = {Addr->getType()};
    Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);

    const DataLayout &DL = M->getDataLayout();
    IntegerType *IntValTy =
        Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
    Val = Builder.CreateBitCast(Val, IntValTy);

    Type *ParamTy = Stxr->getFunctionType()->getParamType(0);
    Value *Widened = Builder.CreateZExtOrBitCast(Val, ParamTy);
    return Builder.CreateCall(Stxr, {Widened, Addr});
  }

  // Since the intrinsics must have legal type, the i128 intrinsics take two
  // parameters: "i64, i64". Val is marshalled into that form before the call.
  Intrinsic::ID Int =
      IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
  Function *Stxr = Intrinsic::getDeclaration(M, Int);
  Type *Int64Ty = Type::getInt64Ty(M->getContext());

  Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
  Value *Upper = Builder.CreateLShr(Val, 64);
  Value *Hi = Builder.CreateTrunc(Upper, Int64Ty, "hi");
  Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
  return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
}
// Array-typed arguments are the ones that need consecutive registers. The
// calling convention and the vararg flag do not affect the answer.
bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
    Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
  const bool IsArray = Ty->isArrayTy();
  return IsArray;
}
// Normalization to a select sequence is never requested on AArch64, whatever
// the context or value type.
bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
                                                            EVT) const {
  const bool Normalize = false;
  return Normalize;
}
// Emit IR that computes the address Offset bytes from the thread pointer and
// return it cast to a pointer (address space 0) to i8*.
static Value *UseTlsOffset(IRBuilder<> &IRB, unsigned Offset) {
  Module *M = IRB.GetInsertBlock()->getParent()->getParent();
  Function *ThreadPointerFunc =
      Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);

  Value *ThreadPtr = IRB.CreateCall(ThreadPointerFunc);
  Value *SlotAddr = IRB.CreateConstGEP1_32(IRB.getInt8Ty(), ThreadPtr, Offset);
  return IRB.CreatePointerCast(SlotAddr, IRB.getInt8PtrTy()->getPointerTo(0));
}
// Return the IR value that addresses the stack guard. Android and Fuchsia keep
// the guard at a fixed offset from the thread pointer; all other targets use
// the generic TargetLowering implementation.
Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
  // Android provides a fixed TLS slot for the stack cookie. See the definition
  // of TLS_SLOT_STACK_GUARD in
  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
  const bool IsAndroid = Subtarget->isTargetAndroid();
  if (IsAndroid)
    return UseTlsOffset(IRB, 0x28);

  // Fuchsia is similar: <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with
  // the value used here.
  return Subtarget->isTargetFuchsia() ? UseTlsOffset(IRB, -0x10)
                                      : TargetLowering::getIRStackGuard(IRB);
}
// Insert the declarations needed for stack protection into module M. In an
// MSVC environment these are the CRT's cookie variable and its check function;
// everywhere else the generic TargetLowering implementation is used.
void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
  if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
    TargetLowering::insertSSPDeclarations(M);
    return;
  }

  // MSVC CRT provides functionalities for stack protection: a global variable
  // holding the security cookie ...
  auto *CookieTy = Type::getInt8PtrTy(M.getContext());
  M.getOrInsertGlobal("__security_cookie", CookieTy);

  // ... and a function to validate that cookie.
  FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
      "__security_check_cookie", Type::getVoidTy(M.getContext()),
      Type::getInt8PtrTy(M.getContext()));

  // When the callee is a plain Function, give it the Win64 calling convention
  // and mark the attribute slot with index 1 as InReg.
  if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
    F->setCallingConv(CallingConv::Win64);
    F->addAttribute(1, Attribute::AttrKind::InReg);
  }
}
// Return the stack guard value for SelectionDAG. In an MSVC environment this
// is the CRT global "__security_cookie" looked up in M; otherwise the generic
// TargetLowering answer is returned.
Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
  const bool IsMSVC = Subtarget->getTargetTriple().isWindowsMSVCEnvironment();
  if (!IsMSVC)
    return TargetLowering::getSDagStackGuard(M);
  return M.getGlobalVariable("__security_cookie");
}
// Return the function that validates the stack guard. In an MSVC environment
// this is the CRT function "__security_check_cookie" looked up in M; otherwise
// the generic TargetLowering answer is returned.
Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
  const bool IsMSVC = Subtarget->getTargetTriple().isWindowsMSVCEnvironment();
  if (!IsMSVC)
    return TargetLowering::getSSPStackGuardCheck(M);
  return M.getFunction("__security_check_cookie");
}
// Return the IR value that addresses the SafeStack pointer. Android and
// Fuchsia keep it at a fixed offset from the thread pointer; all other targets
// use the generic TargetLowering implementation.
Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
  // Android provides a fixed TLS slot for the SafeStack pointer. See the
  // definition of TLS_SLOT_SAFESTACK in
  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
  const bool IsAndroid = Subtarget->isTargetAndroid();
  if (IsAndroid)
    return UseTlsOffset(IRB, 0x48);

  // Fuchsia is similar: <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with
  // the value used here.
  return Subtarget->isTargetFuchsia()
             ? UseTlsOffset(IRB, -0x8)
             : TargetLowering::getSafeStackPointerLocation(IRB);
}
// Decide whether sinking the 'and' mask into the block of its cmp user is
// worthwhile. It is only done when the mask is a constant with a single bit
// set, since the and/cmp/br is then likely to fold into one tbz instruction.
// Sinking might help in other cases too, but that would need a check that the
// cmp does not get folded into the br to form a cbz.
bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
    const Instruction &AndI) const {
  if (auto *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1)))
    return Mask->getValue().isPowerOf2();
  return false;
}
// Mark the function that contains Entry as using split CSR by setting
// IsSplitCSR in its AArch64FunctionInfo.
void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  Entry->getParent()->getInfo<AArch64FunctionInfo>()->setIsSplitCSR(true);
}
// For every callee-saved register that is saved via copy, insert a COPY into a
// fresh virtual register at the start of Entry and a COPY back into the
// physical register before the first terminator of each block in Exits. Does
// nothing when the register info reports no such registers.
void AArch64TargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();

  // The register list is terminated by a zero entry.
  const MCPhysReg *I = IStart;
  while (*I) {
    // Only 64-bit GPR and FPR registers are expected in the list.
    const TargetRegisterClass *RC = nullptr;
    if (AArch64::GPR64RegClass.contains(*I))
      RC = &AArch64::GPR64RegClass;
    else if (AArch64::FPR64RegClass.contains(*I))
      RC = &AArch64::FPR64RegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    unsigned NewVR = MRI->createVirtualRegister(RC);

    // FIXME: no CFI pseudo-instructions are emitted for these copies. That is
    // fine for CXX_FAST_TLS, since the C++-style TLS access functions should
    // be nounwind. Generalizing this later may require emitting them.
    assert(Entry->getParent()->getFunction().hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");

    // Copy from the CSR into the virtual register in the entry block.
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Copy back into the CSR right before the terminator of every exit block.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);

    ++I;
  }
}
// Integer division on AArch64 is expensive, so it is only reported as cheap
// when aggressively optimizing for code size (MinSize), where a div
// instruction is usually smaller than the alternative sequence.
// Vector division is never reported as cheap: AArch64 has no vector integer
// division, so keeping the division means scalarizing it, which loses even on
// size, while the alternative sequence can be done in vector form.
bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
  if (VT.isVector())
    return false;
  return Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
}
// Scalar integers prefer the inc-of-add form; everything else, vectors in
// particular, prefers sub-of-not.
bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
  const bool IsScalarInt = VT.isScalarInteger();
  return IsScalarInt;
}
// Aggressive FMA fusion is enabled for floating-point types on subtargets that
// report hasAggressiveFMA().
bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
  if (!Subtarget->hasAggressiveFMA())
    return false;
  return VT.isFloatingPoint();
}
// Size of va_list in bits. On Darwin and Windows it is one pointer; on every
// other target it is three pointers plus two 32-bit fields.
unsigned
AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
  const unsigned PtrBits = getPointerTy(DL).getSizeInBits();
  const bool IsSinglePointer =
      Subtarget->isTargetDarwin() || Subtarget->isTargetWindows();
  if (IsSinglePointer)
    return PtrBits;
  return 3 * PtrBits + 2 * 32;
}
// Compute the maximum call frame size for MF, then run the generic
// TargetLoweringBase finalization.
void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
  auto &FrameInfo = MF.getFrameInfo();
  FrameInfo.computeMaxCallFrameSize(MF);
  TargetLoweringBase::finalizeLowering(MF);
}
// Fixed catch objects are not needed: unlike X86, frame lowering is left to
// assign the offsets of all catch objects.
bool AArch64TargetLowering::needsFixedCatchObjects() const {
  const bool NeedsFixed = false;
  return NeedsFixed;
}
Index: vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64InstrInfo.td (revision 351302)
+++ vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64InstrInfo.td (revision 351303)
@@ -1,6943 +1,6943 @@
//=- AArch64InstrInfo.td - Describe the AArch64 Instructions -*- tablegen -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// AArch64 Instruction definitions.
//
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// AArch64 Instruction Predicate Definitions.
//
// Each record below pairs a code-generation Predicate, whose string is a C++
// expression querying Subtarget, with an AssemblerPredicate that names the
// corresponding subtarget feature record followed by a short feature-name
// string.
// NOTE(review): the second AssemblerPredicate string is presumably the name
// printed in assembler diagnostics -- confirm against the Predicate classes.
//
// Architecture version predicates.
def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">,
AssemblerPredicate<"HasV8_1aOps", "armv8.1a">;
def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">,
AssemblerPredicate<"HasV8_2aOps", "armv8.2a">;
def HasV8_3a : Predicate<"Subtarget->hasV8_3aOps()">,
AssemblerPredicate<"HasV8_3aOps", "armv8.3a">;
def HasV8_4a : Predicate<"Subtarget->hasV8_4aOps()">,
AssemblerPredicate<"HasV8_4aOps", "armv8.4a">;
def HasV8_5a : Predicate<"Subtarget->hasV8_5aOps()">,
AssemblerPredicate<"HasV8_5aOps", "armv8.5a">;
// Individual feature predicates.
def HasVH : Predicate<"Subtarget->hasVH()">,
AssemblerPredicate<"FeatureVH", "vh">;
def HasLOR : Predicate<"Subtarget->hasLOR()">,
AssemblerPredicate<"FeatureLOR", "lor">;
def HasPA : Predicate<"Subtarget->hasPA()">,
AssemblerPredicate<"FeaturePA", "pa">;
def HasJS : Predicate<"Subtarget->hasJS()">,
AssemblerPredicate<"FeatureJS", "jsconv">;
def HasCCIDX : Predicate<"Subtarget->hasCCIDX()">,
AssemblerPredicate<"FeatureCCIDX", "ccidx">;
def HasComplxNum : Predicate<"Subtarget->hasComplxNum()">,
AssemblerPredicate<"FeatureComplxNum", "complxnum">;
def HasNV : Predicate<"Subtarget->hasNV()">,
AssemblerPredicate<"FeatureNV", "nv">;
def HasRASv8_4 : Predicate<"Subtarget->hasRASv8_4()">,
AssemblerPredicate<"FeatureRASv8_4", "rasv8_4">;
def HasMPAM : Predicate<"Subtarget->hasMPAM()">,
AssemblerPredicate<"FeatureMPAM", "mpam">;
def HasDIT : Predicate<"Subtarget->hasDIT()">,
AssemblerPredicate<"FeatureDIT", "dit">;
def HasTRACEV8_4 : Predicate<"Subtarget->hasTRACEV8_4()">,
AssemblerPredicate<"FeatureTRACEV8_4", "tracev8.4">;
def HasAM : Predicate<"Subtarget->hasAM()">,
AssemblerPredicate<"FeatureAM", "am">;
def HasSEL2 : Predicate<"Subtarget->hasSEL2()">,
AssemblerPredicate<"FeatureSEL2", "sel2">;
def HasTLB_RMI : Predicate<"Subtarget->hasTLB_RMI()">,
AssemblerPredicate<"FeatureTLB_RMI", "tlb-rmi">;
def HasFMI : Predicate<"Subtarget->hasFMI()">,
AssemblerPredicate<"FeatureFMI", "fmi">;
// Note that the Subtarget query here is hasRCPCImm(), while the record and
// feature names use RCPC_IMMO.
def HasRCPC_IMMO : Predicate<"Subtarget->hasRCPCImm()">,
AssemblerPredicate<"FeatureRCPC_IMMO", "rcpc-immo">;
def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">,
AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">;
def HasNEON : Predicate<"Subtarget->hasNEON()">,
AssemblerPredicate<"FeatureNEON", "neon">;
def HasCrypto : Predicate<"Subtarget->hasCrypto()">,
AssemblerPredicate<"FeatureCrypto", "crypto">;
def HasSM4 : Predicate<"Subtarget->hasSM4()">,
AssemblerPredicate<"FeatureSM4", "sm4">;
def HasSHA3 : Predicate<"Subtarget->hasSHA3()">,
AssemblerPredicate<"FeatureSHA3", "sha3">;
def HasSHA2 : Predicate<"Subtarget->hasSHA2()">,
AssemblerPredicate<"FeatureSHA2", "sha2">;
def HasAES : Predicate<"Subtarget->hasAES()">,
AssemblerPredicate<"FeatureAES", "aes">;
def HasDotProd : Predicate<"Subtarget->hasDotProd()">,
AssemblerPredicate<"FeatureDotProd", "dotprod">;
def HasCRC : Predicate<"Subtarget->hasCRC()">,
AssemblerPredicate<"FeatureCRC", "crc">;
def HasLSE : Predicate<"Subtarget->hasLSE()">,
AssemblerPredicate<"FeatureLSE", "lse">;
def HasRAS : Predicate<"Subtarget->hasRAS()">,
AssemblerPredicate<"FeatureRAS", "ras">;
def HasRDM : Predicate<"Subtarget->hasRDM()">,
AssemblerPredicate<"FeatureRDM", "rdm">;
// HasPerfMon carries only the Subtarget query; it has no AssemblerPredicate.
def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">;
def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">,
AssemblerPredicate<"FeatureFullFP16", "fullfp16">;
def HasFP16FML : Predicate<"Subtarget->hasFP16FML()">,
AssemblerPredicate<"FeatureFP16FML", "fp16fml">;
def HasSPE : Predicate<"Subtarget->hasSPE()">,
AssemblerPredicate<"FeatureSPE", "spe">;
def HasFuseAES : Predicate<"Subtarget->hasFuseAES()">,
AssemblerPredicate<"FeatureFuseAES",
"fuse-aes">;
// SVE and SVE2 predicates.
def HasSVE : Predicate<"Subtarget->hasSVE()">,
AssemblerPredicate<"FeatureSVE", "sve">;
def HasSVE2 : Predicate<"Subtarget->hasSVE2()">,
AssemblerPredicate<"FeatureSVE2", "sve2">;
def HasSVE2AES : Predicate<"Subtarget->hasSVE2AES()">,
AssemblerPredicate<"FeatureSVE2AES", "sve2-aes">;
def HasSVE2SM4 : Predicate<"Subtarget->hasSVE2SM4()">,
AssemblerPredicate<"FeatureSVE2SM4", "sve2-sm4">;
def HasSVE2SHA3 : Predicate<"Subtarget->hasSVE2SHA3()">,
AssemblerPredicate<"FeatureSVE2SHA3", "sve2-sha3">;
def HasSVE2BitPerm : Predicate<"Subtarget->hasSVE2BitPerm()">,
- AssemblerPredicate<"FeatureSVE2BitPerm", "bitperm">;
+ AssemblerPredicate<"FeatureSVE2BitPerm", "sve2-bitperm">;
// Further feature predicates, each pairing a Subtarget query with the
// AssemblerPredicate for the matching feature record.
def HasRCPC : Predicate<"Subtarget->hasRCPC()">,
AssemblerPredicate<"FeatureRCPC", "rcpc">;
// The Subtarget query is hasAlternativeNZCV(); the feature record is
// FeatureAltFPCmp.
def HasAltNZCV : Predicate<"Subtarget->hasAlternativeNZCV()">,
AssemblerPredicate<"FeatureAltFPCmp", "altnzcv">;
def HasFRInt3264 : Predicate<"Subtarget->hasFRInt3264()">,
AssemblerPredicate<"FeatureFRInt3264", "frint3264">;
def HasSB : Predicate<"Subtarget->hasSB()">,
AssemblerPredicate<"FeatureSB", "sb">;
def HasPredRes : Predicate<"Subtarget->hasPredRes()">,
AssemblerPredicate<"FeaturePredRes", "predres">;
def HasCCDP : Predicate<"Subtarget->hasCCDP()">,
AssemblerPredicate<"FeatureCacheDeepPersist", "ccdp">;
def HasBTI : Predicate<"Subtarget->hasBTI()">,
AssemblerPredicate<"FeatureBranchTargetId", "bti">;
def HasMTE : Predicate<"Subtarget->hasMTE()">,
AssemblerPredicate<"FeatureMTE", "mte">;
// The predicates below carry only a Subtarget query and no AssemblerPredicate.
// IsBE is the negation of the query used by IsLE.
def IsLE : Predicate<"Subtarget->isLittleEndian()">;
def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
def IsWindows : Predicate<"Subtarget->isTargetWindows()">;
def UseAlternateSExtLoadCVTF32
: Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">;
// The Predicate condition here is the literal "false"; the AssemblerPredicate
// tests the negation of FeatureNoNegativeImmediates.
def UseNegativeImmediates
: Predicate<"false">, AssemblerPredicate<"!FeatureNoNegativeImmediates",
"NegativeImmediates">;
// DAG node record for ISD::LOCAL_RECOVER. Its type profile has one result and
// one operand; value 0 must have the same type as value 1, and value 1 must be
// an integer.
def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER",
SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
SDTCisInt<1>]>>;
//===----------------------------------------------------------------------===//
// AArch64-specific DAG Nodes.
//
// SDTBinaryArithWithFlagsOut - RES1, FLAGS = op LHS, RHS
// Two results, two operands: result 0 and both operands share one integer
// type; result 1 (the flags) is i32.
def SDTBinaryArithWithFlagsOut : SDTypeProfile<2, 2,
[SDTCisSameAs<0, 2>,
SDTCisSameAs<0, 3>,
SDTCisInt<0>, SDTCisVT<1, i32>]>;
// SDTBinaryArithWithFlagsIn - RES1 = op LHS, RHS, FLAGS
// One result, three operands: the profile declares no flags result, only an
// i32 flags input as the last operand (index 3).
def SDTBinaryArithWithFlagsIn : SDTypeProfile<1, 3,
[SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>,
SDTCisInt<0>,
SDTCisVT<3, i32>]>;
// SDTBinaryArithWithFlagsInOut - RES1, FLAGS = op LHS, RHS, FLAGS
// Two results, three operands: i32 flags are both produced (index 1) and
// consumed (index 4).
def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
[SDTCisSameAs<0, 2>,
SDTCisSameAs<0, 3>,
SDTCisInt<0>,
SDTCisVT<1, i32>,
SDTCisVT<4, i32>]>;
// Type profiles for the AArch64-specific nodes defined later in this file.
// SDTypeProfile<R, N, [...]> gives R results, N operands and a constraint
// list; constraint indices count results first, then operands.

// Branch profiles: no results; the OtherVT operand is first for Brcond and
// last for cbz/tbz.
def SDT_AArch64Brcond : SDTypeProfile<0, 3,
[SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>,
SDTCisVT<2, i32>]>;
def SDT_AArch64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>;
def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>,
SDTCisVT<2, OtherVT>]>;
// Conditional select: the result and the first two operands share a type,
// followed by an integer operand and an i32 operand.
def SDT_AArch64CSel : SDTypeProfile<1, 4,
[SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>,
SDTCisInt<3>,
SDTCisVT<4, i32>]>;
// Conditional compare: i32 result and five operands. CCMP and FCCMP differ
// only in whether the two compared operands (indices 1 and 2) are integer or
// floating point.
def SDT_AArch64CCMP : SDTypeProfile<1, 5,
[SDTCisVT<0, i32>,
SDTCisInt<1>,
SDTCisSameAs<1, 2>,
SDTCisInt<3>,
SDTCisInt<4>,
SDTCisVT<5, i32>]>;
def SDT_AArch64FCCMP : SDTypeProfile<1, 5,
[SDTCisVT<0, i32>,
SDTCisFP<1>,
SDTCisSameAs<1, 2>,
SDTCisInt<3>,
SDTCisInt<4>,
SDTCisVT<5, i32>]>;
// Floating-point compare: no results, two FP operands of the same type.
def SDT_AArch64FCmp : SDTypeProfile<0, 2,
[SDTCisFP<0>,
SDTCisSameAs<0, 1>]>;
// Vector profiles. All of the following constrain result 0 to be a vector
// unless noted otherwise.
def SDT_AArch64Dup : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
def SDT_AArch64DupLane : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<2>]>;
def SDT_AArch64Zip : SDTypeProfile<1, 2, [SDTCisVec<0>,
SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>]>;
// The MOVI profiles constrain only their operands (integers), not the result.
def SDT_AArch64MOVIedit : SDTypeProfile<1, 1, [SDTCisInt<1>]>;
def SDT_AArch64MOVIshift : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
def SDT_AArch64vecimm : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisInt<2>, SDTCisInt<3>]>;
// Note: SDT_AArch64UnaryVec and SDT_AArch64unvec (below) carry identical
// constraints.
def SDT_AArch64UnaryVec: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
def SDT_AArch64ExtVec: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>, SDTCisInt<3>]>;
// vshift does not require a vector result; it only ties the result to
// operand 1 and requires operand 2 to be an integer.
def SDT_AArch64vshift : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, SDTCisInt<2>]>;
def SDT_AArch64unvec : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
// fcmpz places no type constraints at all; fcmp only requires its two
// operands to have the same type.
def SDT_AArch64fcmpz : SDTypeProfile<1, 1, []>;
def SDT_AArch64fcmp : SDTypeProfile<1, 2, [SDTCisSameAs<1,2>]>;
def SDT_AArch64binvec : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>]>;
def SDT_AArch64trivec : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>,
SDTCisSameAs<0,3>]>;
// Miscellaneous profiles.
// TCRET: two operands, only the first of which is constrained (pointer type).
def SDT_AArch64TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>;
def SDT_AArch64PREFETCH : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>]>;
// ITOF: floating-point result whose type is shared with the single operand.
def SDT_AArch64ITOF : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>;
// NOTE(review): the operand count of -2 is unusual; confirm its meaning
// against the SDTypeProfile definition before relying on it.
def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>,
SDTCisPtrTy<1>]>;
// Generates the general dynamic sequences, i.e.
// adrp x0, :tlsdesc:var
// ldr x1, [x0, #:tlsdesc_lo12:var]
// add x0, x0, #:tlsdesc_lo12:var
// .tlsdesccall var
// blr x1
// The TPIDR_EL0 offset is put directly in X0, hence the profile has no
// "result"; it takes a single pointer operand (the variable).
def SDT_AArch64TLSDescCallSeq : SDTypeProfile<0,1,
[SDTCisPtrTy<0>]>;
// WrapperLarge: an i64 result built from four operands that are all i32
// (operand 1 is i32 and operands 2-4 are tied to it).
def SDT_AArch64WrapperLarge : SDTypeProfile<1, 4,
[SDTCisVT<0, i64>, SDTCisVT<1, i32>,
SDTCisSameAs<1, 2>, SDTCisSameAs<1, 3>,
SDTCisSameAs<1, 4>]>;
// Node definitions.
// Address-forming nodes. None of them carries extra node properties.
def AArch64adrp : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>;
def AArch64adr : SDNode<"AArch64ISD::ADR", SDTIntUnaryOp, []>;
def AArch64addlow : SDNode<"AArch64ISD::ADDlow", SDTIntBinOp, []>;
def AArch64LOADgot : SDNode<"AArch64ISD::LOADgot", SDTIntUnaryOp>;
// Call-sequence markers wrap the generic ISD nodes with two i32 operands.
// Both are chained and produce glue; callseq_end may also consume glue.
def AArch64callseq_start : SDNode<"ISD::CALLSEQ_START",
SDCallSeqStart<[ SDTCisVT<0, i32>,
SDTCisVT<1, i32> ]>,
[SDNPHasChain, SDNPOutGlue]>;
def AArch64callseq_end : SDNode<"ISD::CALLSEQ_END",
SDCallSeqEnd<[ SDTCisVT<0, i32>,
SDTCisVT<1, i32> ]>,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
// Call: variadic, chained, with optional glue in and glue out; the first
// operand is a pointer.
def AArch64call : SDNode<"AArch64ISD::CALL",
SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
SDNPVariadic]>;
// Branch nodes: all chained. CBZ/CBNZ share one profile, as do TBZ/TBNZ.
def AArch64brcond : SDNode<"AArch64ISD::BRCOND", SDT_AArch64Brcond,
[SDNPHasChain]>;
def AArch64cbz : SDNode<"AArch64ISD::CBZ", SDT_AArch64cbz,
[SDNPHasChain]>;
def AArch64cbnz : SDNode<"AArch64ISD::CBNZ", SDT_AArch64cbz,
[SDNPHasChain]>;
def AArch64tbz : SDNode<"AArch64ISD::TBZ", SDT_AArch64tbz,
[SDNPHasChain]>;
def AArch64tbnz : SDNode<"AArch64ISD::TBNZ", SDT_AArch64tbz,
[SDNPHasChain]>;
// Conditional-select family: four nodes sharing SDT_AArch64CSel.
def AArch64csel : SDNode<"AArch64ISD::CSEL", SDT_AArch64CSel>;
def AArch64csinv : SDNode<"AArch64ISD::CSINV", SDT_AArch64CSel>;
def AArch64csneg : SDNode<"AArch64ISD::CSNEG", SDT_AArch64CSel>;
def AArch64csinc : SDNode<"AArch64ISD::CSINC", SDT_AArch64CSel>;
def AArch64retflag : SDNode<"AArch64ISD::RET_FLAG", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
// Arithmetic that reads and/or writes flags. Of the flag-producing nodes only
// ADDS and ANDS are marked commutative.
def AArch64adc : SDNode<"AArch64ISD::ADC", SDTBinaryArithWithFlagsIn >;
def AArch64sbc : SDNode<"AArch64ISD::SBC", SDTBinaryArithWithFlagsIn>;
def AArch64add_flag : SDNode<"AArch64ISD::ADDS", SDTBinaryArithWithFlagsOut,
[SDNPCommutative]>;
def AArch64sub_flag : SDNode<"AArch64ISD::SUBS", SDTBinaryArithWithFlagsOut>;
def AArch64and_flag : SDNode<"AArch64ISD::ANDS", SDTBinaryArithWithFlagsOut,
[SDNPCommutative]>;
def AArch64adc_flag : SDNode<"AArch64ISD::ADCS", SDTBinaryArithWithFlagsInOut>;
def AArch64sbc_flag : SDNode<"AArch64ISD::SBCS", SDTBinaryArithWithFlagsInOut>;
// Conditional compares: CCMP and CCMN share the integer profile, FCCMP uses
// the floating-point one.
def AArch64ccmp : SDNode<"AArch64ISD::CCMP", SDT_AArch64CCMP>;
def AArch64ccmn : SDNode<"AArch64ISD::CCMN", SDT_AArch64CCMP>;
def AArch64fccmp : SDNode<"AArch64ISD::FCCMP", SDT_AArch64FCCMP>;
def AArch64threadpointer : SDNode<"AArch64ISD::THREAD_POINTER", SDTPtrLeaf>;
def AArch64fcmp : SDNode<"AArch64ISD::FCMP", SDT_AArch64FCmp>;
// Vector duplicate nodes; the DUPLANE variants are split by the width suffix
// in their opcode name (8/16/32/64).
def AArch64dup : SDNode<"AArch64ISD::DUP", SDT_AArch64Dup>;
def AArch64duplane8 : SDNode<"AArch64ISD::DUPLANE8", SDT_AArch64DupLane>;
def AArch64duplane16 : SDNode<"AArch64ISD::DUPLANE16", SDT_AArch64DupLane>;
def AArch64duplane32 : SDNode<"AArch64ISD::DUPLANE32", SDT_AArch64DupLane>;
def AArch64duplane64 : SDNode<"AArch64ISD::DUPLANE64", SDT_AArch64DupLane>;
// Permute nodes: ZIP, UZP and TRN all share the two-operand SDT_AArch64Zip
// profile.
def AArch64zip1 : SDNode<"AArch64ISD::ZIP1", SDT_AArch64Zip>;
def AArch64zip2 : SDNode<"AArch64ISD::ZIP2", SDT_AArch64Zip>;
def AArch64uzp1 : SDNode<"AArch64ISD::UZP1", SDT_AArch64Zip>;
def AArch64uzp2 : SDNode<"AArch64ISD::UZP2", SDT_AArch64Zip>;
def AArch64trn1 : SDNode<"AArch64ISD::TRN1", SDT_AArch64Zip>;
def AArch64trn2 : SDNode<"AArch64ISD::TRN2", SDT_AArch64Zip>;
// Vector move-immediate nodes. The *edit/MOVI/FMOV forms take one integer
// operand; the shift/msl forms take two.
def AArch64movi_edit : SDNode<"AArch64ISD::MOVIedit", SDT_AArch64MOVIedit>;
def AArch64movi_shift : SDNode<"AArch64ISD::MOVIshift", SDT_AArch64MOVIshift>;
def AArch64movi_msl : SDNode<"AArch64ISD::MOVImsl", SDT_AArch64MOVIshift>;
def AArch64mvni_shift : SDNode<"AArch64ISD::MVNIshift", SDT_AArch64MOVIshift>;
def AArch64mvni_msl : SDNode<"AArch64ISD::MVNImsl", SDT_AArch64MOVIshift>;
def AArch64movi : SDNode<"AArch64ISD::MOVI", SDT_AArch64MOVIedit>;
def AArch64fmov : SDNode<"AArch64ISD::FMOV", SDT_AArch64MOVIedit>;
// Element-reversal and extract nodes.
def AArch64rev16 : SDNode<"AArch64ISD::REV16", SDT_AArch64UnaryVec>;
def AArch64rev32 : SDNode<"AArch64ISD::REV32", SDT_AArch64UnaryVec>;
def AArch64rev64 : SDNode<"AArch64ISD::REV64", SDT_AArch64UnaryVec>;
def AArch64ext : SDNode<"AArch64ISD::EXT", SDT_AArch64ExtVec>;
// Shift-by-immediate nodes, all using SDT_AArch64vshift (value, integer
// amount).
def AArch64vashr : SDNode<"AArch64ISD::VASHR", SDT_AArch64vshift>;
def AArch64vlshr : SDNode<"AArch64ISD::VLSHR", SDT_AArch64vshift>;
def AArch64vshl : SDNode<"AArch64ISD::VSHL", SDT_AArch64vshift>;
def AArch64sqshli : SDNode<"AArch64ISD::SQSHL_I", SDT_AArch64vshift>;
def AArch64uqshli : SDNode<"AArch64ISD::UQSHL_I", SDT_AArch64vshift>;
def AArch64sqshlui : SDNode<"AArch64ISD::SQSHLU_I", SDT_AArch64vshift>;
def AArch64srshri : SDNode<"AArch64ISD::SRSHR_I", SDT_AArch64vshift>;
def AArch64urshri : SDNode<"AArch64ISD::URSHR_I", SDT_AArch64vshift>;
// Bitwise vector nodes: NOT is unary, BIT and BSL take three vector operands.
def AArch64not: SDNode<"AArch64ISD::NOT", SDT_AArch64unvec>;
def AArch64bit: SDNode<"AArch64ISD::BIT", SDT_AArch64trivec>;
def AArch64bsl: SDNode<"AArch64ISD::BSL", SDT_AArch64trivec>;
// Two-operand integer vector compares.
def AArch64cmeq: SDNode<"AArch64ISD::CMEQ", SDT_AArch64binvec>;
def AArch64cmge: SDNode<"AArch64ISD::CMGE", SDT_AArch64binvec>;
def AArch64cmgt: SDNode<"AArch64ISD::CMGT", SDT_AArch64binvec>;
def AArch64cmhi: SDNode<"AArch64ISD::CMHI", SDT_AArch64binvec>;
def AArch64cmhs: SDNode<"AArch64ISD::CMHS", SDT_AArch64binvec>;
// Two-operand floating-point compares.
def AArch64fcmeq: SDNode<"AArch64ISD::FCMEQ", SDT_AArch64fcmp>;
def AArch64fcmge: SDNode<"AArch64ISD::FCMGE", SDT_AArch64fcmp>;
def AArch64fcmgt: SDNode<"AArch64ISD::FCMGT", SDT_AArch64fcmp>;
// Single-operand integer compares (the "z" suffixed opcodes).
def AArch64cmeqz: SDNode<"AArch64ISD::CMEQz", SDT_AArch64unvec>;
def AArch64cmgez: SDNode<"AArch64ISD::CMGEz", SDT_AArch64unvec>;
def AArch64cmgtz: SDNode<"AArch64ISD::CMGTz", SDT_AArch64unvec>;
def AArch64cmlez: SDNode<"AArch64ISD::CMLEz", SDT_AArch64unvec>;
def AArch64cmltz: SDNode<"AArch64ISD::CMLTz", SDT_AArch64unvec>;
// cmtst is not a node of its own: it is a pattern fragment that expands to
// NOT(CMEQz(LHS & RHS)).
def AArch64cmtst : PatFrag<(ops node:$LHS, node:$RHS),
(AArch64not (AArch64cmeqz (and node:$LHS, node:$RHS)))>;
// Single-operand floating-point compares; their profile places no type
// constraints.
def AArch64fcmeqz: SDNode<"AArch64ISD::FCMEQz", SDT_AArch64fcmpz>;
def AArch64fcmgez: SDNode<"AArch64ISD::FCMGEz", SDT_AArch64fcmpz>;
def AArch64fcmgtz: SDNode<"AArch64ISD::FCMGTz", SDT_AArch64fcmpz>;
def AArch64fcmlez: SDNode<"AArch64ISD::FCMLEz", SDT_AArch64fcmpz>;
def AArch64fcmltz: SDNode<"AArch64ISD::FCMLTz", SDT_AArch64fcmpz>;
// Vector-with-immediate logical nodes and vector negate.
def AArch64bici: SDNode<"AArch64ISD::BICi", SDT_AArch64vecimm>;
def AArch64orri: SDNode<"AArch64ISD::ORRi", SDT_AArch64vecimm>;
def AArch64neg : SDNode<"AArch64ISD::NEG", SDT_AArch64unvec>;
// Tail-call return: chained, variadic, with optional incoming glue.
def AArch64tcret: SDNode<"AArch64ISD::TC_RETURN", SDT_AArch64TCRET,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
// Prefetch is chained and flagged as having side effects.
def AArch64Prefetch : SDNode<"AArch64ISD::PREFETCH", SDT_AArch64PREFETCH,
[SDNPHasChain, SDNPSideEffect]>;
def AArch64sitof: SDNode<"AArch64ISD::SITOF", SDT_AArch64ITOF>;
def AArch64uitof: SDNode<"AArch64ISD::UITOF", SDT_AArch64ITOF>;
// TLS descriptor call sequence: unlike most nodes above, incoming glue is
// mandatory (SDNPInGlue rather than SDNPOptInGlue).
def AArch64tlsdesc_callseq : SDNode<"AArch64ISD::TLSDESC_CALLSEQ",
SDT_AArch64TLSDescCallSeq,
[SDNPInGlue, SDNPOutGlue, SDNPHasChain,
SDNPVariadic]>;
def AArch64WrapperLarge : SDNode<"AArch64ISD::WrapperLarge",
SDT_AArch64WrapperLarge>;
def AArch64NvCast : SDNode<"AArch64ISD::NVCAST", SDTUnaryOp>;
// Multiply-long profile: integer result, two integer operands of the same
// type as each other (the result type is not tied to the operand type).
def SDT_AArch64mull : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
SDTCisSameAs<1, 2>]>;
def AArch64smull : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull>;
def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull>;
// Reciprocal and reciprocal-square-root nodes: the *E forms are unary FP ops,
// the *S forms are binary FP ops.
def AArch64frecpe : SDNode<"AArch64ISD::FRECPE", SDTFPUnaryOp>;
def AArch64frecps : SDNode<"AArch64ISD::FRECPS", SDTFPBinOp>;
def AArch64frsqrte : SDNode<"AArch64ISD::FRSQRTE", SDTFPUnaryOp>;
def AArch64frsqrts : SDNode<"AArch64ISD::FRSQRTS", SDTFPBinOp>;
// Across-vector nodes. They use SDT_AArch64UnaryVec, so the result has the
// same vector type as the operand.
def AArch64saddv : SDNode<"AArch64ISD::SADDV", SDT_AArch64UnaryVec>;
def AArch64uaddv : SDNode<"AArch64ISD::UADDV", SDT_AArch64UnaryVec>;
def AArch64sminv : SDNode<"AArch64ISD::SMINV", SDT_AArch64UnaryVec>;
def AArch64uminv : SDNode<"AArch64ISD::UMINV", SDT_AArch64UnaryVec>;
def AArch64smaxv : SDNode<"AArch64ISD::SMAXV", SDT_AArch64UnaryVec>;
def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>;
// SETTAG nodes: no results, two pointer operands. All four are chained,
// may store, and carry a memory operand.
def SDT_AArch64SETTAG : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;
def AArch64stg : SDNode<"AArch64ISD::STG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def AArch64stzg : SDNode<"AArch64ISD::STZG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def AArch64st2g : SDNode<"AArch64ISD::ST2G", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def AArch64stz2g : SDNode<"AArch64ISD::STZ2G", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// AArch64 Instruction Predicate Definitions.
// We could compute these on a per-module basis but doing so requires accessing
// the Function object through the <Target>Subtarget and objections were raised
// to that (see post-commit review comments for r301750).
let RecomputePerFunction = 1 in {
// Both predicates test the same hasOptSize() query on the current function,
// with opposite polarity.
def ForCodeSize : Predicate<"MF->getFunction().hasOptSize()">;
def NotForCodeSize : Predicate<"!MF->getFunction().hasOptSize()">;
// Avoid generating STRQro if it is slow, unless we're optimizing for code size.
def UseSTRQro : Predicate<"!Subtarget->isSTRQroSlow() || MF->getFunction().hasOptSize()">;
// True/false depending on whether the function carries the
// "branch-target-enforcement" attribute.
def UseBTI : Predicate<[{ MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>;
def NotUseBTI : Predicate<[{ !MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>;
}
include "AArch64InstrFormats.td"
include "SVEInstrFormats.td"
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// Miscellaneous instructions.
//===----------------------------------------------------------------------===//
// Call-frame setup/teardown pseudos. Both read and write SP, are marked as
// having side effects, and are code-gen only. They select from the
// callseq_start / callseq_end nodes and take two i32 immediates.
let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in {
// We set Sched to empty list because we expect these instructions to simply get
// removed in most cases.
def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
[(AArch64callseq_start timm:$amt1, timm:$amt2)]>,
Sched<[]>;
def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
[(AArch64callseq_end timm:$amt1, timm:$amt2)]>,
Sched<[]>;
} // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1
// Rematerializable, code-gen-only pseudos for forming addresses. Each MOVaddr*
// variant matches the same (addlow (adrp hi), low) shape for a different kind
// of target symbol operand.
let isReMaterializable = 1, isCodeGenOnly = 1 in {
// FIXME: The following pseudo instructions are only needed because remat
// cannot handle multiple instructions. When that changes, they can be
// removed, along with the AArch64Wrapper node.
// The AddedComplexity below applies only to LOADgot (the `let` has no braces).
let AddedComplexity = 10 in
def LOADgot : Pseudo<(outs GPR64:$dst), (ins i64imm:$addr),
[(set GPR64:$dst, (AArch64LOADgot tglobaladdr:$addr))]>,
Sched<[WriteLDAdr]>;
// The MOVaddr instruction should match only when the add is not folded
// into a load or store address.
def MOVaddr
: Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
[(set GPR64:$dst, (AArch64addlow (AArch64adrp tglobaladdr:$hi),
tglobaladdr:$low))]>,
Sched<[WriteAdrAdr]>;
// Jump-table variant.
def MOVaddrJT
: Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
[(set GPR64:$dst, (AArch64addlow (AArch64adrp tjumptable:$hi),
tjumptable:$low))]>,
Sched<[WriteAdrAdr]>;
// Constant-pool variant.
def MOVaddrCP
: Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
[(set GPR64:$dst, (AArch64addlow (AArch64adrp tconstpool:$hi),
tconstpool:$low))]>,
Sched<[WriteAdrAdr]>;
// Block-address variant.
def MOVaddrBA
: Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
[(set GPR64:$dst, (AArch64addlow (AArch64adrp tblockaddress:$hi),
tblockaddress:$low))]>,
Sched<[WriteAdrAdr]>;
// Thread-local global variant.
def MOVaddrTLS
: Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
[(set GPR64:$dst, (AArch64addlow (AArch64adrp tglobaltlsaddr:$hi),
tglobaltlsaddr:$low))]>,
Sched<[WriteAdrAdr]>;
// External-symbol variant.
def MOVaddrEXT
: Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
[(set GPR64:$dst, (AArch64addlow (AArch64adrp texternalsym:$hi),
texternalsym:$low))]>,
Sched<[WriteAdrAdr]>;
// Normally AArch64addlow either gets folded into a following ldr/str,
// or together with an adrp into MOVaddr above. For cases with TLS, it
// might appear without either of them, so allow lowering it into a plain
// add.
def ADDlowTLS
: Pseudo<(outs GPR64:$dst), (ins GPR64:$src, i64imm:$low),
[(set GPR64:$dst, (AArch64addlow GPR64:$src,
tglobaltlsaddr:$low))]>,
Sched<[WriteAdr]>;
} // isReMaterializable, isCodeGenOnly
// Extra selection patterns that map AArch64LOADgot on TLS globals, external
// symbols and constant-pool entries onto the LOADgot pseudo, whose own
// pattern only covers tglobaladdr.
def : Pat<(AArch64LOADgot tglobaltlsaddr:$addr),
(LOADgot tglobaltlsaddr:$addr)>;
def : Pat<(AArch64LOADgot texternalsym:$addr),
(LOADgot texternalsym:$addr)>;
def : Pat<(AArch64LOADgot tconstpool:$addr),
(LOADgot tconstpool:$addr)>;
// 32-bit jump table destination is actually only 2 instructions since we can
// use the table itself as a PC-relative base. But optimization occurs after
// branch relaxation so be pessimistic.
// All three pseudos are given Size = 12 and mark both outputs ($dst and
// $scratch) as early-clobber. They have no selection pattern and an empty
// scheduling list; they differ only in the entry-width suffix of the name.
let Size = 12, Constraints = "@earlyclobber $dst,@earlyclobber $scratch" in {
def JumpTableDest32 : Pseudo<(outs GPR64:$dst, GPR64sp:$scratch),
(ins GPR64:$table, GPR64:$entry, i32imm:$jti), []>,
Sched<[]>;
def JumpTableDest16 : Pseudo<(outs GPR64:$dst, GPR64sp:$scratch),
(ins GPR64:$table, GPR64:$entry, i32imm:$jti), []>,
Sched<[]>;
def JumpTableDest8 : Pseudo<(outs GPR64:$dst, GPR64sp:$scratch),
(ins GPR64:$table, GPR64:$entry, i32imm:$jti), []>,
Sched<[]>;
}
// Space-consuming pseudo to aid testing of placement and reachability
// algorithms. Immediate operand is the number of bytes this "instruction"
// occupies; register operands can be used to enforce dependency and constrain
// the scheduler.
// Selected from the int_aarch64_space intrinsic. Marked as having side
// effects and as possibly loading and storing.
let hasSideEffects = 1, mayLoad = 1, mayStore = 1 in
def SPACE : Pseudo<(outs GPR64:$Rd), (ins i32imm:$size, GPR64:$Rn),
[(set GPR64:$Rd, (int_aarch64_space imm:$size, GPR64:$Rn))]>,
Sched<[]>;
// Code-gen-only pseudos with side effects, one per register width (X: GPR64,
// W: GPR32). Neither has a selection pattern, so they are not produced by
// pattern matching in this file.
let hasSideEffects = 1, isCodeGenOnly = 1 in {
def SpeculationSafeValueX
: Pseudo<(outs GPR64:$dst), (ins GPR64:$src), []>, Sched<[]>;
def SpeculationSafeValueW
: Pseudo<(outs GPR32:$dst), (ins GPR32:$src), []>, Sched<[]>;
}
//===----------------------------------------------------------------------===//
// System instructions.
//===----------------------------------------------------------------------===//
// The generic hint instruction, followed by named aliases that each stand for
// HINT with one fixed immediate.
def HINT : HintI<"hint">;
def : InstAlias<"nop", (HINT 0b000)>;
def : InstAlias<"yield",(HINT 0b001)>;
def : InstAlias<"wfe", (HINT 0b010)>;
def : InstAlias<"wfi", (HINT 0b011)>;
def : InstAlias<"sev", (HINT 0b100)>;
def : InstAlias<"sevl", (HINT 0b101)>;
// "esb" is only accepted when HasRAS holds.
def : InstAlias<"esb", (HINT 0b10000)>, Requires<[HasRAS]>;
def : InstAlias<"csdb", (HINT 20)>;
// "bti" without an operand is HINT 32; the form with an operand encodes it
// through btihint_op. Both require HasBTI.
def : InstAlias<"bti", (HINT 32)>, Requires<[HasBTI]>;
def : InstAlias<"bti $op", (HINT btihint_op:$op)>, Requires<[HasBTI]>;
// v8.2a Statistical Profiling extension
def : InstAlias<"psb $op", (HINT psbhint_op:$op)>, Requires<[HasSPE]>;
// As far as LLVM is concerned this writes to the system's exclusive monitors.
// Modelled conservatively as both loading and storing (see the comment above
// about the exclusive monitors). Takes a 0-15 immediate.
let mayLoad = 1, mayStore = 1 in
def CLREX : CRmSystemI<imm0_15, 0b010, "clrex">;
// NOTE: ideally, this would have mayStore = 0, mayLoad = 0, but we cannot
// model patterns with sufficiently fine granularity.
// mayLoad/mayStore are explicitly left unset ("?") for the barrier
// instructions in this group.
let mayLoad = ?, mayStore = ? in {
// DMB, DSB and ISB are each selected from the matching int_aarch64_*
// intrinsic with an i32 immediate in the range accepted by imm32_0_15.
def DMB : CRmSystemI<barrier_op, 0b101, "dmb",
[(int_aarch64_dmb (i32 imm32_0_15:$CRm))]>;
def DSB : CRmSystemI<barrier_op, 0b100, "dsb",
[(int_aarch64_dsb (i32 imm32_0_15:$CRm))]>;
def ISB : CRmSystemI<barrier_op, 0b110, "isb",
[(int_aarch64_isb (i32 imm32_0_15:$CRm))]>;
// TSB has no selection pattern. Its CRm field is fixed to 0b0010, bit 12 of
// the encoding is forced to 0, and it requires HasTRACEV8_4.
def TSB : CRmSystemI<barrier_op, 0b010, "tsb", []> {
let CRm = 0b0010;
let Inst{12} = 0;
let Predicates = [HasTRACEV8_4];
}
}
// ARMv8.2-A Dot Product
// Signed and unsigned dot product, in vector and by-element ("lane") forms.
// The first template argument selects signed (0) or unsigned (1); the lane
// forms reuse the same intrinsics as the vector forms.
let Predicates = [HasDotProd] in {
defm SDOT : SIMDThreeSameVectorDot<0, "sdot", int_aarch64_neon_sdot>;
defm UDOT : SIMDThreeSameVectorDot<1, "udot", int_aarch64_neon_udot>;
defm SDOTlane : SIMDThreeSameVectorDotIndex<0, "sdot", int_aarch64_neon_sdot>;
defm UDOTlane : SIMDThreeSameVectorDotIndex<1, "udot", int_aarch64_neon_udot>;
}
// ARMv8.2-A FP16 Fused Multiply-Add Long
// Requires both NEON and FP16FML. The first four defs are the vector forms,
// the "lane" defs are the indexed forms; each lane form shares its mnemonic
// and intrinsic with the corresponding vector form.
let Predicates = [HasNEON, HasFP16FML] in {
defm FMLAL : SIMDThreeSameVectorFML<0, 1, 0b001, "fmlal", int_aarch64_neon_fmlal>;
defm FMLSL : SIMDThreeSameVectorFML<0, 1, 0b101, "fmlsl", int_aarch64_neon_fmlsl>;
defm FMLAL2 : SIMDThreeSameVectorFML<1, 0, 0b001, "fmlal2", int_aarch64_neon_fmlal2>;
defm FMLSL2 : SIMDThreeSameVectorFML<1, 0, 0b101, "fmlsl2", int_aarch64_neon_fmlsl2>;
defm FMLALlane : SIMDThreeSameVectorFMLIndex<0, 0b0000, "fmlal", int_aarch64_neon_fmlal>;
defm FMLSLlane : SIMDThreeSameVectorFMLIndex<0, 0b0100, "fmlsl", int_aarch64_neon_fmlsl>;
defm FMLAL2lane : SIMDThreeSameVectorFMLIndex<1, 0b1000, "fmlal2", int_aarch64_neon_fmlal2>;
defm FMLSL2lane : SIMDThreeSameVectorFMLIndex<1, 0b1100, "fmlsl2", int_aarch64_neon_fmlsl2>;
}
// Armv8.2-A Crypto extensions
// Instructions gated on HasSHA3. Note that the SHA512* instructions are
// placed under this predicate as well. None of these defs supplies a
// selection pattern here.
let Predicates = [HasSHA3] in {
def SHA512H : CryptoRRRTied<0b0, 0b00, "sha512h">;
def SHA512H2 : CryptoRRRTied<0b0, 0b01, "sha512h2">;
def SHA512SU0 : CryptoRRTied_2D<0b0, 0b00, "sha512su0">;
def SHA512SU1 : CryptoRRRTied_2D<0b0, 0b10, "sha512su1">;
def RAX1 : CryptoRRR_2D<0b0,0b11, "rax1">;
def EOR3 : CryptoRRRR_16B<0b00, "eor3">;
def BCAX : CryptoRRRR_16B<0b01, "bcax">;
def XAR : CryptoRRRi6<"xar">;
} // HasSHA3
// Instructions gated on HasSM4; the SM3* instructions are placed under this
// predicate as well.
let Predicates = [HasSM4] in {
def SM3TT1A : CryptoRRRi2Tied<0b0, 0b00, "sm3tt1a">;
def SM3TT1B : CryptoRRRi2Tied<0b0, 0b01, "sm3tt1b">;
def SM3TT2A : CryptoRRRi2Tied<0b0, 0b10, "sm3tt2a">;
def SM3TT2B : CryptoRRRi2Tied<0b0, 0b11, "sm3tt2b">;
def SM3SS1 : CryptoRRRR_4S<0b10, "sm3ss1">;
def SM3PARTW1 : CryptoRRRTied_4S<0b1, 0b00, "sm3partw1">;
def SM3PARTW2 : CryptoRRRTied_4S<0b1, 0b01, "sm3partw2">;
// The record is named SM4ENCKEY but its assembly mnemonic is "sm4ekey".
def SM4ENCKEY : CryptoRRR_4S<0b1, 0b10, "sm4ekey">;
def SM4E : CryptoRRTied_4S<0b0, 0b01, "sm4e">;
} // HasSM4
let Predicates = [HasRCPC] in {
// v8.3 Release Consistent Processor Consistent support, optional in v8.2.
// The B/H/W forms all use GPR32; LDAPRW and LDAPRX share the "ldapr"
// mnemonic and differ in register class (GPR32 vs GPR64) and the size field.
def LDAPRB : RCPCLoad<0b00, "ldaprb", GPR32>;
def LDAPRH : RCPCLoad<0b01, "ldaprh", GPR32>;
def LDAPRW : RCPCLoad<0b10, "ldapr", GPR32>;
def LDAPRX : RCPCLoad<0b11, "ldapr", GPR64>;
}
// v8.3a complex add and multiply-accumulate. No predicate here, that is done
// inside the multiclass as the FP16 versions need different predicates.
// FCMLA is instantiated twice: once as the vector form and once as the
// indexed form. FCADD uses the complexrotateopodd operand rather than
// complexrotateop. All three pass null_frag, i.e. no selection pattern is
// attached here.
defm FCMLA : SIMDThreeSameVectorTiedComplexHSD<1, 0b110, complexrotateop,
"fcmla", null_frag>;
defm FCADD : SIMDThreeSameVectorComplexHSD<1, 0b111, complexrotateopodd,
"fcadd", null_frag>;
defm FCMLA : SIMDIndexedTiedComplexHSD<1, 0, 1, complexrotateop, "fcmla",
null_frag>;
// v8.3a Pointer Authentication
// These instructions inhabit part of the hint space and so can be used for
// armv8 targets
// Forms that read and write LR only.
let Uses = [LR], Defs = [LR] in {
def PACIAZ : SystemNoOperands<0b000, "paciaz">;
def PACIBZ : SystemNoOperands<0b010, "pacibz">;
def AUTIAZ : SystemNoOperands<0b100, "autiaz">;
def AUTIBZ : SystemNoOperands<0b110, "autibz">;
}
// Forms that additionally read SP.
let Uses = [LR, SP], Defs = [LR] in {
def PACIASP : SystemNoOperands<0b001, "paciasp">;
def PACIBSP : SystemNoOperands<0b011, "pacibsp">;
def AUTIASP : SystemNoOperands<0b101, "autiasp">;
def AUTIBSP : SystemNoOperands<0b111, "autibsp">;
}
// Forms that read X16 and X17 and write X17; these override CRm to 0b0001.
let Uses = [X16, X17], Defs = [X17], CRm = 0b0001 in {
def PACIA1716 : SystemNoOperands<0b000, "pacia1716">;
def PACIB1716 : SystemNoOperands<0b010, "pacib1716">;
def AUTIA1716 : SystemNoOperands<0b100, "autia1716">;
def AUTIB1716 : SystemNoOperands<0b110, "autib1716">;
}
// XPACLRI reads and writes LR and overrides CRm to 0b0000.
let Uses = [LR], Defs = [LR], CRm = 0b0000 in {
def XPACLRI : SystemNoOperands<0b111, "xpaclri">;
}
// These pointer authentication instructions require armv8.3a
let Predicates = [HasPA] in {
// Generates eight instructions per instantiation: four one-data forms
// (IA/IB/DA/DB) using `prefix`, and four zero-modifier forms
// (IZA/DZA/IZB/DZB) using `prefix_z`. Mnemonics are `asm` plus the suffix.
// Note the zero forms are listed in IZA, DZA, IZB, DZB order, so their
// second argument runs 0b00, 0b10, 0b01, 0b11.
multiclass SignAuth<bits<3> prefix, bits<3> prefix_z, string asm> {
def IA : SignAuthOneData<prefix, 0b00, !strconcat(asm, "ia")>;
def IB : SignAuthOneData<prefix, 0b01, !strconcat(asm, "ib")>;
def DA : SignAuthOneData<prefix, 0b10, !strconcat(asm, "da")>;
def DB : SignAuthOneData<prefix, 0b11, !strconcat(asm, "db")>;
def IZA : SignAuthZero<prefix_z, 0b00, !strconcat(asm, "iza")>;
def DZA : SignAuthZero<prefix_z, 0b10, !strconcat(asm, "dza")>;
def IZB : SignAuthZero<prefix_z, 0b01, !strconcat(asm, "izb")>;
def DZB : SignAuthZero<prefix_z, 0b11, !strconcat(asm, "dzb")>;
}
defm PAC : SignAuth<0b000, 0b010, "pac">;
defm AUT : SignAuth<0b001, 0b011, "aut">;
def XPACI : SignAuthZero<0b100, 0b00, "xpaci">;
def XPACD : SignAuthZero<0b100, 0b01, "xpacd">;
// PACGA passes null_frag, so no selection pattern is attached.
def PACGA : SignAuthTwoOperand<0b1100, "pacga", null_frag>;
// Combined Instructions
// Branch forms: the first argument distinguishes br* from blr*, the second
// selects the A or B variant.
def BRAA : AuthBranchTwoOperands<0, 0, "braa">;
def BRAB : AuthBranchTwoOperands<0, 1, "brab">;
def BLRAA : AuthBranchTwoOperands<1, 0, "blraa">;
def BLRAB : AuthBranchTwoOperands<1, 1, "blrab">;
def BRAAZ : AuthOneOperand<0b000, 0, "braaz">;
def BRABZ : AuthOneOperand<0b000, 1, "brabz">;
def BLRAAZ : AuthOneOperand<0b001, 0, "blraaz">;
def BLRABZ : AuthOneOperand<0b001, 1, "blrabz">;
// Return forms are marked as returns, terminators and barriers.
let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
def RETAA : AuthReturn<0b010, 0, "retaa">;
def RETAB : AuthReturn<0b010, 1, "retab">;
def ERETAA : AuthReturn<0b100, 0, "eretaa">;
def ERETAB : AuthReturn<0b100, 1, "eretab">;
}
// Load forms use the simm10Scaled operand for their offset.
defm LDRAA : AuthLoad<0, "ldraa", simm10Scaled>;
defm LDRAB : AuthLoad<1, "ldrab", simm10Scaled>;
}
// v8.3a floating point conversion for javascript
// Requires both HasJS and HasFPARMv8. Converts an FPR64 source into a GPR32
// destination and is selected from the int_aarch64_fjcvtzs intrinsic. The def
// body forces bit 31 of the encoding to 0.
let Predicates = [HasJS, HasFPARMv8] in
def FJCVTZS : BaseFPToIntegerUnscaled<0b01, 0b11, 0b110, FPR64, GPR32,
"fjcvtzs",
[(set GPR32:$Rd,
(int_aarch64_fjcvtzs FPR64:$Rn))]> {
let Inst{31} = 0;
} // HasJS, HasFPARMv8
// v8.4 Flag manipulation instructions
let Predicates = [HasFMI] in {
// CFINV takes no operands; bits 20-5 of its encoding are fixed.
def CFINV : SimpleSystemI<0, (ins), "cfinv", "">, Sched<[WriteSys]> {
let Inst{20-5} = 0b0000001000000000;
}
// SETF8 and SETF16 each take one GPR32 source and differ only in their second
// template argument.
def SETF8 : BaseFlagManipulation<0, 0, (ins GPR32:$Rn), "setf8", "{\t$Rn}">;
def SETF16 : BaseFlagManipulation<0, 1, (ins GPR32:$Rn), "setf16", "{\t$Rn}">;
// RMIF takes a GPR64 source, a 6-bit unsigned immediate and a 0-15 mask.
def RMIF : FlagRotate<(ins GPR64:$Rn, uimm6:$imm, imm0_15:$mask), "rmif",
"{\t$Rn, $imm, $mask}">;
} // HasFMI
// v8.5 flag manipulation instructions
// Both instructions read and write NZCV and take no operands. Their encodings
// are identical except for bits 7-5 (0b001 for XAFLAG, 0b010 for AXFLAG);
// bits 11-8 are encoded as zero and marked Unpredictable.
let Predicates = [HasAltNZCV], Uses = [NZCV], Defs = [NZCV] in {
def XAFLAG : PstateWriteSimple<(ins), "xaflag", "">, Sched<[WriteSys]> {
let Inst{18-16} = 0b000;
let Inst{11-8} = 0b0000;
let Unpredictable{11-8} = 0b1111;
let Inst{7-5} = 0b001;
}
def AXFLAG : PstateWriteSimple<(ins), "axflag", "">, Sched<[WriteSys]> {
let Inst{18-16} = 0b000;
let Inst{11-8} = 0b0000;
let Unpredictable{11-8} = 0b1111;
let Inst{7-5} = 0b010;
}
} // HasAltNZCV
// Armv8.5-A speculation barrier
// No operands and an empty scheduling list. Bits 20-5 of the encoding are
// fixed, bits 11-8 are marked Unpredictable, and the instruction requires
// HasSB. hasSideEffects is set explicitly.
def SB : SimpleSystemI<0, (ins), "sb", "">, Sched<[]> {
let Inst{20-5} = 0b0001100110000111;
let Unpredictable{11-8} = 0b1111;
let Predicates = [HasSB];
let hasSideEffects = 1;
}
// Operand-less spellings: "clrex" and "isb" stand for the instruction with
// immediate 0xf; "ssbb" and "pssbb" stand for DSB with immediates 0 and 4.
def : InstAlias<"clrex", (CLREX 0xf)>;
def : InstAlias<"isb", (ISB 0xf)>;
def : InstAlias<"ssbb", (DSB 0)>;
def : InstAlias<"pssbb", (DSB 4)>;
// System register read/write and PSTATE-immediate writes. The two MSRpstate
// records are instantiated from the 0-1 and 0-15 immediate classes.
def MRS : MRSI;
def MSR : MSRI;
def MSRpstateImm1 : MSRpstateImm0_1;
def MSRpstateImm4 : MSRpstateImm0_15;
// The thread pointer (on Linux, at least, where this has been implemented) is
// TPIDR_EL0.
// Pseudo with no inputs that produces the AArch64threadpointer node's value
// in a GPR64.
def MOVbaseTLS : Pseudo<(outs GPR64:$dst), (ins),
[(set GPR64:$dst, AArch64threadpointer)]>, Sched<[WriteSys]>;
// Selected from int_hwasan_check_memaccess, whose first argument is matched
// against the physical register X9. The pseudo reads X9 and is declared to
// clobber X16, X17, LR and NZCV. The pointer must be in GPR64noip.
let Uses = [ X9 ], Defs = [ X16, X17, LR, NZCV ] in {
def HWASAN_CHECK_MEMACCESS : Pseudo<
(outs), (ins GPR64noip:$ptr, i32imm:$accessinfo),
[(int_hwasan_check_memaccess X9, GPR64noip:$ptr, (i32 imm:$accessinfo))]>,
Sched<[]>;
}
// The cycle counter PMC register is PMCCNTR_EL0.
// readcyclecounter is selected as an MRS of encoding 0xdce8, but only when
// HasPerfMon holds (the `let` applies to this one pattern).
let Predicates = [HasPerfMon] in
def : Pat<(readcyclecounter), (MRS 0xdce8)>;
// FPCR register
// int_aarch64_get_fpcr is selected unconditionally as an MRS of encoding
// 0xda20.
def : Pat<(i64 (int_aarch64_get_fpcr)), (MRS 0xda20)>;
// Generic system instructions
def SYSxt : SystemXtI<0, "sys">;
def SYSLxt : SystemLXtI<1, "sysl">;
// Four-operand "sys" spelling: the register operand is omitted in the
// assembly and supplied as XZR.
def : InstAlias<"sys $op1, $Cn, $Cm, $op2",
(SYSxt imm0_7:$op1, sys_cr_op:$Cn,
sys_cr_op:$Cm, imm0_7:$op2, XZR)>;
//===----------------------------------------------------------------------===//
// Move immediate instructions.
//===----------------------------------------------------------------------===//
// MOVK is built from InsertImmediate; MOVN and MOVZ from MoveImmediate.
defm MOVK : InsertImmediate<0b11, "movk">;
defm MOVN : MoveImmediate<0b00, "movn">;
// Only MOVZ gets the "fixMOVZ" post-encoder method (the `let` has no braces).
let PostEncoderMethod = "fixMOVZ" in
defm MOVZ : MoveImmediate<0b10, "movz">;
// First group of aliases covers an implicit "lsl #0".
// The movk aliases in this section pass a trailing 0 as the last InstAlias
// argument; the movn/movz aliases omit it.
def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, imm0_65535:$imm, 0), 0>;
def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, imm0_65535:$imm, 0), 0>;
def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, imm0_65535:$imm, 0)>;
def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, imm0_65535:$imm, 0)>;
def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, imm0_65535:$imm, 0)>;
def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, imm0_65535:$imm, 0)>;
// Next, we have various ELF relocations with the ":XYZ_g0:sym" syntax.
// 64-bit forms: the g3/g2/g1/g0 symbol operands pair with shifts 48/32/16/0.
// Note that the movn aliases reuse the movz_symbol_* operand classes.
def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>;
def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>;
def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>;
def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>;
def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>;
def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>;
def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>;
def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>;
def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g3:$sym, 48), 0>;
def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g2:$sym, 32), 0>;
def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g1:$sym, 16), 0>;
def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g0:$sym, 0), 0>;
// 32-bit forms: only the g1 and g0 groups (shifts 16 and 0) are provided.
def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>;
def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>;
def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>;
def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>;
def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g1:$sym, 16), 0>;
def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g0:$sym, 0), 0>;
// Final group of aliases covers true "mov $Rd, $imm" cases.
// For one (instruction, register class, width, shift) combination this
// multiclass emits three records:
//   - an AsmOperandClass whose Name, PredicateMethod and RenderMethod strings
//     are assembled from basename, width and shift;
//   - an i32 Operand that uses that class for parsing;
//   - a "mov $Rd, $imm" alias onto INST with the given shift.
multiclass movw_mov_alias<string basename,Instruction INST, RegisterClass GPR,
int width, int shift> {
def _asmoperand : AsmOperandClass {
let Name = basename # width # "_lsl" # shift # "MovAlias";
let PredicateMethod = "is" # basename # "MovAlias<" # width # ", "
# shift # ">";
let RenderMethod = "add" # basename # "MovAliasOperands<" # shift # ">";
}
def _movimm : Operand<i32> {
let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_asmoperand");
}
def : InstAlias<"mov $Rd, $imm",
(INST GPR:$Rd, !cast<Operand>(NAME # "_movimm"):$imm, shift)>;
}
// Instantiations: for each of MOVZ and MOVN, shifts 0 and 16 for the 32-bit
// instruction and shifts 0, 16, 32 and 48 for the 64-bit instruction.
defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 0>;
defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 16>;
defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 0>;
defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 16>;
defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 32>;
defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 48>;
defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 0>;
defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 16>;
defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 0>;
defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 16>;
defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 32>;
defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 48>;
// Immediate-materialization pseudos for 32- and 64-bit registers. Besides
// being rematerializable and code-gen only, they are flagged as move-immediate
// and as cheap as a move (note the closing comment lists only the first two
// properties).
let isReMaterializable = 1, isCodeGenOnly = 1, isMoveImm = 1,
isAsCheapAsAMove = 1 in {
// FIXME: The following pseudo instructions are only needed because remat
// cannot handle multiple instructions. When that changes, we can select
// directly to the real instructions and get rid of these pseudos.
def MOVi32imm
: Pseudo<(outs GPR32:$dst), (ins i32imm:$src),
[(set GPR32:$dst, imm:$src)]>,
Sched<[WriteImm]>;
def MOVi64imm
: Pseudo<(outs GPR64:$dst), (ins i64imm:$src),
[(set GPR64:$dst, imm:$src)]>,
Sched<[WriteImm]>;
} // isReMaterializable, isCodeGenOnly
// If possible, we want to use MOVi32imm even for 64-bit moves. This gives the
// eventual expansion code fewer bits to worry about getting right. Marshalling
// the types is a little tricky though:
// Matches an i64 immediate whose value is unchanged by masking to the low 32
// bits, i.e. whose upper 32 bits are all zero.
def i64imm_32bit : ImmLeaf<i64, [{
return (Imm & 0xffffffffULL) == static_cast<uint64_t>(Imm);
}]>;
// Matches an i64 immediate that lies within the int32_t range when read as a
// signed value.
def s64imm_32bit : ImmLeaf<i64, [{
int64_t Imm64 = static_cast<int64_t>(Imm);
return Imm64 >= std::numeric_limits<int32_t>::min() &&
Imm64 <= std::numeric_limits<int32_t>::max();
}]>;
// Rebuilds the constant as an i32 target constant from its zero-extended
// value.
def trunc_imm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i32);
}]>;
// GlobalISel counterpart of trunc_imm, implemented by the "renderTruncImm"
// custom renderer.
def gi_trunc_imm : GICustomOperandRenderer<"renderTruncImm">,
GISDNodeXFormEquiv<trunc_imm>;
// A 64-bit immediate with zero upper half is materialized with MOVi32imm and
// then placed into a 64-bit register via SUBREG_TO_REG on sub_32.
def : Pat<(i64 i64imm_32bit:$src),
(SUBREG_TO_REG (i64 0), (MOVi32imm (trunc_imm imm:$src)), sub_32)>;
// Materialize FP constants via MOVi32imm/MOVi64imm (MachO large code model).
// Turns an FP immediate into an i32 target constant holding the same bit
// pattern.
def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{
return CurDAG->getTargetConstant(
N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i32);
}]>;
// Same as above, producing an i64 target constant.
def bitcast_fpimm_to_i64 : SDNodeXForm<fpimm, [{
return CurDAG->getTargetConstant(
N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i64);
}]>;
// f32/f64 constants are built by materializing the bit pattern with the
// integer immediate pseudo and copying the result into the FP register class.
def : Pat<(f32 fpimm:$in),
(COPY_TO_REGCLASS (MOVi32imm (bitcast_fpimm_to_i32 f32:$in)), FPR32)>;
def : Pat<(f64 fpimm:$in),
(COPY_TO_REGCLASS (MOVi64imm (bitcast_fpimm_to_i64 f64:$in)), FPR64)>;
// Deal with the various forms of (ELF) large addressing with MOVZ/MOVK
// sequences.
// Each pattern below has the same shape: the $g0 operand is placed by MOVZXi
// with shift 0, and $g1/$g2/$g3 are then inserted by a chain of MOVKXi at
// shifts 16, 32 and 48 respectively. Only the operand kind differs.
// Global addresses.
def : Pat<(AArch64WrapperLarge tglobaladdr:$g3, tglobaladdr:$g2,
tglobaladdr:$g1, tglobaladdr:$g0),
(MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g0, 0),
tglobaladdr:$g1, 16),
tglobaladdr:$g2, 32),
tglobaladdr:$g3, 48)>;
// Block addresses.
def : Pat<(AArch64WrapperLarge tblockaddress:$g3, tblockaddress:$g2,
tblockaddress:$g1, tblockaddress:$g0),
(MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g0, 0),
tblockaddress:$g1, 16),
tblockaddress:$g2, 32),
tblockaddress:$g3, 48)>;
// Constant pool entries.
def : Pat<(AArch64WrapperLarge tconstpool:$g3, tconstpool:$g2,
tconstpool:$g1, tconstpool:$g0),
(MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g0, 0),
tconstpool:$g1, 16),
tconstpool:$g2, 32),
tconstpool:$g3, 48)>;
// Jump tables.
def : Pat<(AArch64WrapperLarge tjumptable:$g3, tjumptable:$g2,
tjumptable:$g1, tjumptable:$g0),
(MOVKXi (MOVKXi (MOVKXi (MOVZXi tjumptable:$g0, 0),
tjumptable:$g1, 16),
tjumptable:$g2, 32),
tjumptable:$g3, 48)>;
//===----------------------------------------------------------------------===//
// Arithmetic instructions.
//===----------------------------------------------------------------------===//
// Add/subtract with carry.
// ADC and SBC are instantiated from the AddSubCarry multiclass, each with a
// plain and an "s"-suffixed mnemonic and a plain and a *_flag selection node.
defm ADC : AddSubCarry<0, "adc", "adcs", AArch64adc, AArch64adc_flag>;
defm SBC : AddSubCarry<1, "sbc", "sbcs", AArch64sbc, AArch64sbc_flag>;
// "ngc"/"ngcs" are assembler aliases for SBC/SBCS with the zero register
// (WZR/XZR) as the first source operand.
def : InstAlias<"ngc $dst, $src", (SBCWr GPR32:$dst, WZR, GPR32:$src)>;
def : InstAlias<"ngc $dst, $src", (SBCXr GPR64:$dst, XZR, GPR64:$src)>;
def : InstAlias<"ngcs $dst, $src", (SBCSWr GPR32:$dst, WZR, GPR32:$src)>;
def : InstAlias<"ngcs $dst, $src", (SBCSXr GPR64:$dst, XZR, GPR64:$src)>;
// Add/subtract
defm ADD : AddSub<0, "add", "sub", add>;
// NOTE(review): unlike ADD, SUB passes no selection node to AddSub; the plain
// `sub` node is matched by the explicit SUBS patterns further down this file.
defm SUB : AddSub<1, "sub", "add">;
// "mov" involving the stack pointer is an alias of ADD (immediate) with both
// trailing immediate operands set to 0. One alias covers an SP-only
// destination, the other an SP-only source, for each register width.
def : InstAlias<"mov $dst, $src",
(ADDWri GPR32sponly:$dst, GPR32sp:$src, 0, 0)>;
def : InstAlias<"mov $dst, $src",
(ADDWri GPR32sp:$dst, GPR32sponly:$src, 0, 0)>;
def : InstAlias<"mov $dst, $src",
(ADDXri GPR64sponly:$dst, GPR64sp:$src, 0, 0)>;
def : InstAlias<"mov $dst, $src",
(ADDXri GPR64sp:$dst, GPR64sponly:$src, 0, 0)>;
// Flag-setting variants, selected from the AArch64add_flag/AArch64sub_flag
// nodes. The extra strings are passed through to AddSubS
// (NOTE(review): presumably the compare alias and the mnemonics of the
// opposite operation -- confirm against the AddSubS definition).
defm ADDS : AddSubS<0, "adds", AArch64add_flag, "cmn", "subs", "cmp">;
defm SUBS : AddSubS<1, "subs", AArch64sub_flag, "cmp", "adds", "cmn">;
// Use SUBS instead of SUB to enable CSE between SUBS and SUB.
// Each form of the plain `sub` node is selected to the corresponding
// flag-setting SUBS instruction.
// Immediate operand.
def : Pat<(sub GPR32sp:$Rn, addsub_shifted_imm32:$imm),
(SUBSWri GPR32sp:$Rn, addsub_shifted_imm32:$imm)>;
def : Pat<(sub GPR64sp:$Rn, addsub_shifted_imm64:$imm),
(SUBSXri GPR64sp:$Rn, addsub_shifted_imm64:$imm)>;
// Plain register operand.
def : Pat<(sub GPR32:$Rn, GPR32:$Rm),
(SUBSWrr GPR32:$Rn, GPR32:$Rm)>;
def : Pat<(sub GPR64:$Rn, GPR64:$Rm),
(SUBSXrr GPR64:$Rn, GPR64:$Rm)>;
// Shifted register operand.
def : Pat<(sub GPR32:$Rn, arith_shifted_reg32:$Rm),
(SUBSWrs GPR32:$Rn, arith_shifted_reg32:$Rm)>;
def : Pat<(sub GPR64:$Rn, arith_shifted_reg64:$Rm),
(SUBSXrs GPR64:$Rn, arith_shifted_reg64:$Rm)>;
// Extended register operand; these patterns carry an added complexity of 1.
let AddedComplexity = 1 in {
def : Pat<(sub GPR32sp:$R2, arith_extended_reg32<i32>:$R3),
(SUBSWrx GPR32sp:$R2, arith_extended_reg32<i32>:$R3)>;
def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3),
(SUBSXrx GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3)>;
}
// Because of the immediate format for add/sub-imm instructions, the
// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1).
// These patterns capture that transformation.
// Note that the `add` forms select the flag-setting SUBS instructions, while
// the `sub` forms select the plain ADD instructions.
let AddedComplexity = 1 in {
def : Pat<(add GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
(SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
def : Pat<(add GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
(SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
def : Pat<(sub GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
(ADDWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
def : Pat<(sub GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
(ADDXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
}
// Because of the immediate format for add/sub-imm instructions, the
// flag-setting expression (AArch64add_flag x, -1) must be transformed to
// (SUBS{W,X}ri x, 1), and (AArch64sub_flag x, -1) to (ADDS{W,X}ri x, 1).
// These patterns capture that transformation for the flag-setting nodes.
let AddedComplexity = 1 in {
def : Pat<(AArch64add_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
(SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
def : Pat<(AArch64add_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
(SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
def : Pat<(AArch64sub_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
(ADDSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
def : Pat<(AArch64sub_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
(ADDSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
}
// "neg" is an alias of SUB (shifted register) with WZR/XZR as the first
// source. The unshifted spelling (shift operand 0) is given the value 3 as
// the final InstAlias argument, the shifted spelling the value 2
// (NOTE(review): presumably the alias emit priority -- confirm).
def : InstAlias<"neg $dst, $src", (SUBWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>;
def : InstAlias<"neg $dst, $src", (SUBXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>;
def : InstAlias<"neg $dst, $src$shift",
(SUBWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>;
def : InstAlias<"neg $dst, $src$shift",
(SUBXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>;
// "negs" is the same alias scheme applied to the flag-setting SUBS.
def : InstAlias<"negs $dst, $src", (SUBSWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>;
def : InstAlias<"negs $dst, $src", (SUBSXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>;
def : InstAlias<"negs $dst, $src$shift",
(SUBSWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>;
def : InstAlias<"negs $dst, $src$shift",
(SUBSXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>;
// Unsigned/Signed divide
// Instantiated from the Div multiclass and selected from the generic
// udiv/sdiv nodes.
defm UDIV : Div<0, "udiv", udiv>;
defm SDIV : Div<1, "sdiv", sdiv>;
// The int_aarch64_udiv/int_aarch64_sdiv intrinsics select the same
// instructions directly, for both register widths.
def : Pat<(int_aarch64_udiv GPR32:$Rn, GPR32:$Rm), (UDIVWr GPR32:$Rn, GPR32:$Rm)>;
def : Pat<(int_aarch64_udiv GPR64:$Rn, GPR64:$Rm), (UDIVXr GPR64:$Rn, GPR64:$Rm)>;
def : Pat<(int_aarch64_sdiv GPR32:$Rn, GPR32:$Rm), (SDIVWr GPR32:$Rn, GPR32:$Rm)>;
def : Pat<(int_aarch64_sdiv GPR64:$Rn, GPR64:$Rm), (SDIVXr GPR64:$Rn, GPR64:$Rm)>;
// Variable shift
// The records are named *V, but the mnemonics passed to the Shift multiclass
// are the short forms ("asr", "lsl", "lsr", "ror"); each is selected from the
// matching generic shift/rotate node.
defm ASRV : Shift<0b10, "asr", sra>;
defm LSLV : Shift<0b00, "lsl", shl>;
defm LSRV : Shift<0b01, "lsr", srl>;
defm RORV : Shift<0b11, "ror", rotr>;
// The long "*v" spellings are provided as aliases of the same instructions.
def : ShiftAlias<"asrv", ASRVWr, GPR32>;
def : ShiftAlias<"asrv", ASRVXr, GPR64>;
def : ShiftAlias<"lslv", LSLVWr, GPR32>;
def : ShiftAlias<"lslv", LSLVXr, GPR64>;
def : ShiftAlias<"lsrv", LSRVWr, GPR32>;
def : ShiftAlias<"lsrv", LSRVXr, GPR64>;
def : ShiftAlias<"rorv", RORVWr, GPR32>;
def : ShiftAlias<"rorv", RORVXr, GPR64>;
// Multiply-add
// All records and patterns in this scope carry an added complexity of 5.
let AddedComplexity = 5 in {
defm MADD : MulAccum<0, "madd", add>;
defm MSUB : MulAccum<1, "msub", sub>;
// A plain multiply is MADD with the zero register as the third operand.
def : Pat<(i32 (mul GPR32:$Rn, GPR32:$Rm)),
(MADDWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
def : Pat<(i64 (mul GPR64:$Rn, GPR64:$Rm)),
(MADDXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;
// A negated product is MSUB with the zero register as the third operand.
def : Pat<(i32 (ineg (mul GPR32:$Rn, GPR32:$Rm))),
(MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
def : Pat<(i64 (ineg (mul GPR64:$Rn, GPR64:$Rm))),
(MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;
// Negating one multiplicand instead of the product selects the same MSUB.
def : Pat<(i32 (mul (ineg GPR32:$Rn), GPR32:$Rm)),
(MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
def : Pat<(i64 (mul (ineg GPR64:$Rn), GPR64:$Rm)),
(MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;
} // AddedComplexity = 5
// Widening multiply-accumulate: 32-bit sources, 64-bit third operand/result.
// The signed forms are built on sext, the unsigned forms on zext. All records
// and patterns in this scope carry an added complexity of 5.
let AddedComplexity = 5 in {
def SMADDLrrr : WideMulAccum<0, 0b001, "smaddl", add, sext>;
def SMSUBLrrr : WideMulAccum<1, 0b001, "smsubl", sub, sext>;
def UMADDLrrr : WideMulAccum<0, 0b101, "umaddl", add, zext>;
def UMSUBLrrr : WideMulAccum<1, 0b101, "umsubl", sub, zext>;
// Product of two extended registers, with XZR as the third operand.
def : Pat<(i64 (mul (sext GPR32:$Rn), (sext GPR32:$Rm))),
(SMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
def : Pat<(i64 (mul (zext GPR32:$Rn), (zext GPR32:$Rm))),
(UMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
// Negated product of two extended registers.
def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (sext GPR32:$Rm)))),
(SMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (zext GPR32:$Rm)))),
(UMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
// Product of an extended register and a constant that fits in 32 bits
// (s64imm_32bit for the signed forms, i64imm_32bit for the unsigned forms).
// The constant is materialized into a 32-bit register with MOVi32imm.
def : Pat<(i64 (mul (sext GPR32:$Rn), (s64imm_32bit:$C))),
(SMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>;
def : Pat<(i64 (mul (zext GPR32:$Rn), (i64imm_32bit:$C))),
(UMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>;
// Same, where the sign extension appears as sext_inreg on a 64-bit register;
// the low half is taken with EXTRACT_SUBREG.
def : Pat<(i64 (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C))),
(SMADDLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)),
(MOVi32imm (trunc_imm imm:$C)), XZR)>;
// Negated products with a constant multiplicand.
def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (s64imm_32bit:$C)))),
(SMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>;
def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (i64imm_32bit:$C)))),
(UMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>;
def : Pat<(i64 (ineg (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C)))),
(SMSUBLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)),
(MOVi32imm (trunc_imm imm:$C)), XZR)>;
// Multiply-by-constant added to a 64-bit register $Ra.
def : Pat<(i64 (add (mul (sext GPR32:$Rn), (s64imm_32bit:$C)), GPR64:$Ra)),
(SMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
def : Pat<(i64 (add (mul (zext GPR32:$Rn), (i64imm_32bit:$C)), GPR64:$Ra)),
(UMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
def : Pat<(i64 (add (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C)),
GPR64:$Ra)),
(SMADDLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)),
(MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
// Multiply-by-constant subtracted from a 64-bit register $Ra.
def : Pat<(i64 (sub GPR64:$Ra, (mul (sext GPR32:$Rn), (s64imm_32bit:$C)))),
(SMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
def : Pat<(i64 (sub GPR64:$Ra, (mul (zext GPR32:$Rn), (i64imm_32bit:$C)))),
(UMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
def : Pat<(i64 (sub GPR64:$Ra, (mul (sext_inreg GPR64:$Rn, i32),
(s64imm_32bit:$C)))),
(SMSUBLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)),
(MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
} // AddedComplexity = 5
// Assembler aliases for the multiply-accumulate instructions: "mul"/"mneg"
// map to MADD/MSUB, and "smull"/"smnegl"/"umull"/"umnegl" map to the widening
// SMADDL/SMSUBL/UMADDL/UMSUBL.
def : MulAccumWAlias<"mul", MADDWrrr>;
def : MulAccumXAlias<"mul", MADDXrrr>;
def : MulAccumWAlias<"mneg", MSUBWrrr>;
def : MulAccumXAlias<"mneg", MSUBXrrr>;
def : WideMulAccumAlias<"smull", SMADDLrrr>;
def : WideMulAccumAlias<"smnegl", SMSUBLrrr>;
def : WideMulAccumAlias<"umull", UMADDLrrr>;
def : WideMulAccumAlias<"umnegl", UMSUBLrrr>;
// Multiply-high
// Selected from the generic mulhs (signed) and mulhu (unsigned) nodes.
def SMULHrr : MulHi<0b010, "smulh", mulhs>;
def UMULHrr : MulHi<0b110, "umulh", mulhu>;
// CRC32
// One record per data size (B/H/W/X), each selected from the matching
// int_aarch64_crc32* intrinsic. Only the X forms use GPR64 and pass 1 as the
// first BaseCRC32 argument; the others use GPR32 and pass 0.
def CRC32Brr : BaseCRC32<0, 0b00, 0, GPR32, int_aarch64_crc32b, "crc32b">;
def CRC32Hrr : BaseCRC32<0, 0b01, 0, GPR32, int_aarch64_crc32h, "crc32h">;
def CRC32Wrr : BaseCRC32<0, 0b10, 0, GPR32, int_aarch64_crc32w, "crc32w">;
def CRC32Xrr : BaseCRC32<1, 0b11, 0, GPR64, int_aarch64_crc32x, "crc32x">;
// The "crc32c*" variants differ from the above only in the third BaseCRC32
// argument (1 instead of 0), the intrinsic and the mnemonic.
def CRC32CBrr : BaseCRC32<0, 0b00, 1, GPR32, int_aarch64_crc32cb, "crc32cb">;
def CRC32CHrr : BaseCRC32<0, 0b01, 1, GPR32, int_aarch64_crc32ch, "crc32ch">;
def CRC32CWrr : BaseCRC32<0, 0b10, 1, GPR32, int_aarch64_crc32cw, "crc32cw">;
def CRC32CXrr : BaseCRC32<1, 0b11, 1, GPR64, int_aarch64_crc32cx, "crc32cx">;
// v8.1 atomic CAS
// The four variants enumerate the two boolean multiclass arguments, which
// correspond to the "a" and "l" mnemonic suffixes (none, "a", "l", "al").
defm CAS : CompareAndSwap<0, 0, "">;
defm CASA : CompareAndSwap<1, 0, "a">;
defm CASL : CompareAndSwap<0, 1, "l">;
defm CASAL : CompareAndSwap<1, 1, "al">;
// v8.1 atomic CASP
// Pair form, with the same four suffix variants.
defm CASP : CompareAndSwapPair<0, 0, "">;
defm CASPA : CompareAndSwapPair<1, 0, "a">;
defm CASPL : CompareAndSwapPair<0, 1, "l">;
defm CASPAL : CompareAndSwapPair<1, 1, "al">;
// v8.1 atomic SWP
// Swap, with the same four suffix variants.
defm SWP : Swap<0, 0, "">;
defm SWPA : Swap<1, 0, "a">;
defm SWPL : Swap<0, 1, "l">;
defm SWPAL : Swap<1, 1, "al">;
// v8.1 atomic LD<OP>(register). Performs load and then ST<OP>(register)
// Each operation is identified by a 3-bit code (first argument) and an
// operation name, and is instantiated in four variants that enumerate the two
// boolean arguments matching the "a"/"l" suffixes (none, "a", "l", "al").
// 0b000: add
defm LDADD : LDOPregister<0b000, "add", 0, 0, "">;
defm LDADDA : LDOPregister<0b000, "add", 1, 0, "a">;
defm LDADDL : LDOPregister<0b000, "add", 0, 1, "l">;
defm LDADDAL : LDOPregister<0b000, "add", 1, 1, "al">;
// 0b001: clr
defm LDCLR : LDOPregister<0b001, "clr", 0, 0, "">;
defm LDCLRA : LDOPregister<0b001, "clr", 1, 0, "a">;
defm LDCLRL : LDOPregister<0b001, "clr", 0, 1, "l">;
defm LDCLRAL : LDOPregister<0b001, "clr", 1, 1, "al">;
// 0b010: eor
defm LDEOR : LDOPregister<0b010, "eor", 0, 0, "">;
defm LDEORA : LDOPregister<0b010, "eor", 1, 0, "a">;
defm LDEORL : LDOPregister<0b010, "eor", 0, 1, "l">;
defm LDEORAL : LDOPregister<0b010, "eor", 1, 1, "al">;
// 0b011: set
defm LDSET : LDOPregister<0b011, "set", 0, 0, "">;
defm LDSETA : LDOPregister<0b011, "set", 1, 0, "a">;
defm LDSETL : LDOPregister<0b011, "set", 0, 1, "l">;
defm LDSETAL : LDOPregister<0b011, "set", 1, 1, "al">;
// 0b100: smax
defm LDSMAX : LDOPregister<0b100, "smax", 0, 0, "">;
defm LDSMAXA : LDOPregister<0b100, "smax", 1, 0, "a">;
defm LDSMAXL : LDOPregister<0b100, "smax", 0, 1, "l">;
defm LDSMAXAL : LDOPregister<0b100, "smax", 1, 1, "al">;
// 0b101: smin
defm LDSMIN : LDOPregister<0b101, "smin", 0, 0, "">;
defm LDSMINA : LDOPregister<0b101, "smin", 1, 0, "a">;
defm LDSMINL : LDOPregister<0b101, "smin", 0, 1, "l">;
defm LDSMINAL : LDOPregister<0b101, "smin", 1, 1, "al">;
// 0b110: umax
defm LDUMAX : LDOPregister<0b110, "umax", 0, 0, "">;
defm LDUMAXA : LDOPregister<0b110, "umax", 1, 0, "a">;
defm LDUMAXL : LDOPregister<0b110, "umax", 0, 1, "l">;
defm LDUMAXAL : LDOPregister<0b110, "umax", 1, 1, "al">;
// 0b111: umin
defm LDUMIN : LDOPregister<0b111, "umin", 0, 0, "">;
defm LDUMINA : LDOPregister<0b111, "umin", 1, 0, "a">;
defm LDUMINL : LDOPregister<0b111, "umin", 0, 1, "l">;
defm LDUMINAL : LDOPregister<0b111, "umin", 1, 1, "al">;
// v8.1 atomic ST<OP>(register) as aliases to "LD<OP>(register) when Rt=xZR"
// Each line ties a store mnemonic to the name prefix of the LD<OP> records it
// aliases; the trailing comment names the resulting alias family.
defm : STOPregister<"stadd","LDADD">; // STADDx
defm : STOPregister<"stclr","LDCLR">; // STCLRx
defm : STOPregister<"steor","LDEOR">; // STEORx
defm : STOPregister<"stset","LDSET">; // STSETx
defm : STOPregister<"stsmax","LDSMAX">;// STSMAXx
defm : STOPregister<"stsmin","LDSMIN">;// STSMINx
defm : STOPregister<"stumax","LDUMAX">;// STUMAXx
defm : STOPregister<"stumin","LDUMIN">;// STUMINx
// v8.5 Memory Tagging Extension
// Every record and pattern in this scope is guarded by the HasMTE predicate.
let Predicates = [HasMTE] in {
// IRG: selected from int_aarch64_irg; encoding bit 31 is forced to 1.
def IRG : BaseTwoOperand<0b0100, GPR64sp, "irg", int_aarch64_irg, GPR64sp, GPR64>,
Sched<[]>{
let Inst{31} = 1;
}
// GMI: selected from int_aarch64_gmi; encoding bit 31 is forced to 1 and the
// instruction is marked as not duplicable.
def GMI : BaseTwoOperand<0b0101, GPR64, "gmi", int_aarch64_gmi, GPR64sp>, Sched<[]>{
let Inst{31} = 1;
let isNotDuplicable = 1;
}
// ADDG/SUBG are defined with null_frag, i.e. without an inline selection
// pattern; ADDG is selected by the explicit int_aarch64_addg pattern below.
def ADDG : AddSubG<0, "addg", null_frag>;
def SUBG : AddSubG<1, "subg", null_frag>;
// Two-operand "irg" spelling: the third operand defaults to XZR.
def : InstAlias<"irg $dst, $src", (IRG GPR64sp:$dst, GPR64sp:$src, XZR), 1>;
// SUBP is selected from int_aarch64_subp. SUBPS has no inline pattern and
// additionally defines NZCV.
def SUBP : SUBP<0, "subp", int_aarch64_subp>, Sched<[]>;
def SUBPS : SUBP<1, "subps", null_frag>, Sched<[]>{
let Defs = [NZCV];
}
// "cmpp" is SUBPS with XZR as the destination.
def : InstAlias<"cmpp $lhs, $rhs", (SUBPS XZR, GPR64sp:$lhs, GPR64sp:$rhs), 0>;
def LDG : MemTagLoad<"ldg", "\t$Rt, [$Rn, $offset]">;
// int_aarch64_addg on a base-plus-scaled-offset address selects ADDG with the
// offset as $imm6 and the second intrinsic operand as $imm4.
def : Pat<(int_aarch64_addg (am_indexedu6s128 GPR64sp:$Rn, uimm6s16:$imm6), imm0_15:$imm4),
(ADDG GPR64sp:$Rn, imm0_63:$imm6, imm0_15:$imm4)>;
// int_aarch64_ldg on a base-plus-scaled-signed-offset address selects LDG.
def : Pat<(int_aarch64_ldg GPR64:$Rt, (am_indexeds9s128 GPR64sp:$Rn, simm9s16:$offset)),
(LDG GPR64:$Rt, GPR64sp:$Rn, simm9s16:$offset)>;
// "ldg" without an explicit offset uses offset 0.
def : InstAlias<"ldg $Rt, [$Rn]", (LDG GPR64:$Rt, GPR64sp:$Rn, 0), 1>;
// LDGM produces $Rt from the address in $Rn; STGM and STZGM take both as
// inputs and have no outputs. STZGM additionally clears encoding bit 23.
def LDGM : MemTagVector<1, "ldgm", "\t$Rt, [$Rn]",
(outs GPR64:$Rt), (ins GPR64sp:$Rn)>;
def STGM : MemTagVector<0, "stgm", "\t$Rt, [$Rn]",
(outs), (ins GPR64:$Rt, GPR64sp:$Rn)>;
def STZGM : MemTagVector<0, "stzgm", "\t$Rt, [$Rn]",
(outs), (ins GPR64:$Rt, GPR64sp:$Rn)> {
let Inst{23} = 0;
}
// The four tag-store families, distinguished by a 2-bit opcode.
defm STG : MemTagStore<0b00, "stg">;
defm STZG : MemTagStore<0b01, "stzg">;
defm ST2G : MemTagStore<0b10, "st2g">;
defm STZ2G : MemTagStore<0b11, "stz2g">;
// Each AArch64st*g node with a base-plus-scaled-signed-offset address selects
// the *Offset record of the matching tag-store family.
def : Pat<(AArch64stg GPR64sp:$Rn, (am_indexeds9s128 GPR64sp:$Rm, simm9s16:$imm)),
(STGOffset $Rn, $Rm, $imm)>;
def : Pat<(AArch64stzg GPR64sp:$Rn, (am_indexeds9s128 GPR64sp:$Rm, simm9s16:$imm)),
(STZGOffset $Rn, $Rm, $imm)>;
def : Pat<(AArch64st2g GPR64sp:$Rn, (am_indexeds9s128 GPR64sp:$Rm, simm9s16:$imm)),
(ST2GOffset $Rn, $Rm, $imm)>;
def : Pat<(AArch64stz2g GPR64sp:$Rn, (am_indexeds9s128 GPR64sp:$Rm, simm9s16:$imm)),
(STZ2GOffset $Rn, $Rm, $imm)>;
// STGP reuses the generic store-pair classes (offset, pre-indexed and
// post-indexed) with GPR64z registers and a simm7s16 offset.
defm STGP : StorePairOffset <0b01, 0, GPR64z, simm7s16, "stgp">;
def STGPpre : StorePairPreIdx <0b01, 0, GPR64z, simm7s16, "stgp">;
def STGPpost : StorePairPostIdx<0b01, 0, GPR64z, simm7s16, "stgp">;
// Intrinsic forms of the tag stores.
def : Pat<(int_aarch64_stg GPR64:$Rt, (am_indexeds9s128 GPR64sp:$Rn, simm9s16:$offset)),
(STGOffset GPR64:$Rt, GPR64sp:$Rn, simm9s16:$offset)>;
def : Pat<(int_aarch64_stgp (am_indexed7s128 GPR64sp:$Rn, simm7s16:$imm), GPR64:$Rt, GPR64:$Rt2),
(STGPi $Rt, $Rt2, $Rn, $imm)>;
// Pseudo instructions with no selection pattern of their own.
def IRGstack
: Pseudo<(outs GPR64sp:$Rd), (ins GPR64sp:$Rsp, GPR64:$Rm), []>,
Sched<[]>;
def TAGPstack
: Pseudo<(outs GPR64sp:$Rd), (ins GPR64sp:$Rn, uimm6s16:$imm6, GPR64sp:$Rm, imm0_15:$imm4), []>,
Sched<[]>;
// Explicit SP in the first operand prevents ShrinkWrap optimization
// from leaving this instruction out of the stack frame. When IRGstack
// is transformed into IRG, this operand is replaced with the actual
// register / expression for the tagged base pointer of the current function.
def : Pat<(int_aarch64_irg_sp i64:$Rm), (IRGstack SP, i64:$Rm)>;
// Large STG to be expanded into a loop. $Rm is the size, $Rn is start address.
// $Rn_wback is one past the end of the range.
// Both inputs are tied to early-clobber write-back outputs via the
// constraint string.
let isCodeGenOnly=1, mayStore=1 in {
def STGloop
: Pseudo<(outs GPR64common:$Rm_wback, GPR64sp:$Rn_wback), (ins GPR64common:$Rm, GPR64sp:$Rn),
[], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,$Rm = $Rm_wback,@earlyclobber $Rm_wback" >,
Sched<[WriteAdr, WriteST]>;
def STZGloop
: Pseudo<(outs GPR64common:$Rm_wback, GPR64sp:$Rn_wback), (ins GPR64common:$Rm, GPR64sp:$Rn),
[], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,$Rm = $Rm_wback,@earlyclobber $Rm_wback" >,
Sched<[WriteAdr, WriteST]>;
}
} // Predicates = [HasMTE]
//===----------------------------------------------------------------------===//
// Logical instructions.
//===----------------------------------------------------------------------===//
// (immediate)
// Logical operations with an immediate operand. Each passes its mnemonic, its
// selection node and a second mnemonic string to the multiclass
// (NOTE(review): the second string looks like the complemented-operand
// mnemonic used for aliases -- confirm against LogicalImm/LogicalImmS).
defm ANDS : LogicalImmS<0b11, "ands", AArch64and_flag, "bics">;
defm AND : LogicalImm<0b00, "and", and, "bic">;
defm EOR : LogicalImm<0b10, "eor", xor, "eon">;
defm ORR : LogicalImm<0b01, "orr", or, "orn">;
// FIXME: these aliases *are* canonical sometimes (when movz can't be
// used). Actually, it seems to be working right now, but putting logical_immXX
// here is a bit dodgy on the AsmParser side too.
// "mov" of a logical immediate is ORR (immediate) with WZR/XZR as the source.
def : InstAlias<"mov $dst, $imm", (ORRWri GPR32sp:$dst, WZR,
logical_imm32:$imm), 0>;
def : InstAlias<"mov $dst, $imm", (ORRXri GPR64sp:$dst, XZR,
logical_imm64:$imm), 0>;
// (register)
// Logical operations with a register operand. The records instantiated with
// second argument 1 (BICS, BIC, EON, ORN) are selected from fragments that
// apply `not` to the right-hand operand (or, for EON, to the xor result).
defm ANDS : LogicalRegS<0b11, 0, "ands", AArch64and_flag>;
defm BICS : LogicalRegS<0b11, 1, "bics",
BinOpFrag<(AArch64and_flag node:$LHS, (not node:$RHS))>>;
defm AND : LogicalReg<0b00, 0, "and", and>;
defm BIC : LogicalReg<0b00, 1, "bic",
BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
defm EON : LogicalReg<0b10, 1, "eon",
BinOpFrag<(not (xor node:$LHS, node:$RHS))>>;
defm EOR : LogicalReg<0b10, 0, "eor", xor>;
defm ORN : LogicalReg<0b01, 1, "orn",
BinOpFrag<(or node:$LHS, (not node:$RHS))>>;
defm ORR : LogicalReg<0b01, 0, "orr", or>;
// Register "mov" is ORR (shifted register) with the zero register as first
// source and a shift operand of 0.
def : InstAlias<"mov $dst, $src", (ORRWrs GPR32:$dst, WZR, GPR32:$src, 0), 2>;
def : InstAlias<"mov $dst, $src", (ORRXrs GPR64:$dst, XZR, GPR64:$src, 0), 2>;
// "mvn" is ORN with the zero register as first source, with and without an
// explicit shift.
def : InstAlias<"mvn $Wd, $Wm", (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, 0), 3>;
def : InstAlias<"mvn $Xd, $Xm", (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, 0), 3>;
def : InstAlias<"mvn $Wd, $Wm$sh",
(ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, logical_shift32:$sh), 2>;
def : InstAlias<"mvn $Xd, $Xm$sh",
(ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, logical_shift64:$sh), 2>;
// "tst" is ANDS with the zero register as destination: immediate form,
// unshifted register form, and shifted register form.
def : InstAlias<"tst $src1, $src2",
(ANDSWri WZR, GPR32:$src1, logical_imm32:$src2), 2>;
def : InstAlias<"tst $src1, $src2",
(ANDSXri XZR, GPR64:$src1, logical_imm64:$src2), 2>;
def : InstAlias<"tst $src1, $src2",
(ANDSWrs WZR, GPR32:$src1, GPR32:$src2, 0), 3>;
def : InstAlias<"tst $src1, $src2",
(ANDSXrs XZR, GPR64:$src1, GPR64:$src2, 0), 3>;
def : InstAlias<"tst $src1, $src2$sh",
(ANDSWrs WZR, GPR32:$src1, GPR32:$src2, logical_shift32:$sh), 2>;
def : InstAlias<"tst $src1, $src2$sh",
(ANDSXrs XZR, GPR64:$src1, GPR64:$src2, logical_shift64:$sh), 2>;
// Bitwise `not` of a register is selected as ORN with the zero register.
def : Pat<(not GPR32:$Wm), (ORNWrr WZR, GPR32:$Wm)>;
def : Pat<(not GPR64:$Xm), (ORNXrr XZR, GPR64:$Xm)>;
//===----------------------------------------------------------------------===//
// One operand data processing instructions.
//===----------------------------------------------------------------------===//
// CLS is given no selection node here; it is selected by the explicit ctlz
// patterns below. CLZ and RBIT are selected from ctlz and bitreverse.
defm CLS : OneOperandData<0b101, "cls">;
defm CLZ : OneOperandData<0b100, "clz", ctlz>;
defm RBIT : OneOperandData<0b000, "rbit", bitreverse>;
// 32-bit REV16 is selected from a byte swap followed by a rotate right by 16.
// The 64-bit form has no selection pattern (null_frag).
def REV16Wr : OneWRegData<0b001, "rev16",
UnOpFrag<(rotr (bswap node:$LHS), (i64 16))>>;
def REV16Xr : OneXRegData<0b001, "rev16", null_frag>;
// cttz is selected as RBIT followed by CLZ.
def : Pat<(cttz GPR32:$Rn),
(CLZWr (RBITWr GPR32:$Rn))>;
def : Pat<(cttz GPR64:$Rn),
(CLZXr (RBITXr GPR64:$Rn))>;
// The expression ctlz(((x >>s (bits-1)) ^ x) << 1 | 1) is selected as CLS.
def : Pat<(ctlz (or (shl (xor (sra GPR32:$Rn, (i64 31)), GPR32:$Rn), (i64 1)),
(i32 1))),
(CLSWr GPR32:$Rn)>;
def : Pat<(ctlz (or (shl (xor (sra GPR64:$Rn, (i64 63)), GPR64:$Rn), (i64 1)),
(i64 1))),
(CLSXr GPR64:$Rn)>;
// Unlike the other one operand instructions, the instructions with the "rev"
// mnemonic do *not* just differ in the size bit, but actually use different
// opcode bits for the different sizes.
def REVWr : OneWRegData<0b010, "rev", bswap>;
def REVXr : OneXRegData<0b011, "rev", bswap>;
// REV32 is selected from a 64-bit byte swap followed by a rotate right by 32.
def REV32Xr : OneXRegData<0b010, "rev32",
UnOpFrag<(rotr (bswap node:$LHS), (i64 32))>>;
// "rev64" is accepted as another spelling of the 64-bit REV.
def : InstAlias<"rev64 $Rd, $Rn", (REVXr GPR64:$Rd, GPR64:$Rn), 0>;
// The bswap commutes with the rotr so we want a pattern for both possible
// orders.
def : Pat<(bswap (rotr GPR32:$Rn, (i64 16))), (REV16Wr GPR32:$Rn)>;
def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>;
//===----------------------------------------------------------------------===//
// Bitfield immediate extraction instruction.
//===----------------------------------------------------------------------===//
// EXTR is declared free of side effects.
let hasSideEffects = 0 in
defm EXTR : ExtractImm<"extr">;
// "ror" by an immediate is EXTR with the same register as both sources.
def : InstAlias<"ror $dst, $src, $shift",
(EXTRWrri GPR32:$dst, GPR32:$src, GPR32:$src, imm0_31:$shift)>;
def : InstAlias<"ror $dst, $src, $shift",
(EXTRXrri GPR64:$dst, GPR64:$src, GPR64:$src, imm0_63:$shift)>;
// Likewise, a rotate right by a constant is selected as EXTR with the same
// register used for both sources.
def : Pat<(rotr GPR32:$Rn, (i64 imm0_31:$imm)),
(EXTRWrri GPR32:$Rn, GPR32:$Rn, imm0_31:$imm)>;
def : Pat<(rotr GPR64:$Rn, (i64 imm0_63:$imm)),
(EXTRXrri GPR64:$Rn, GPR64:$Rn, imm0_63:$imm)>;
//===----------------------------------------------------------------------===//
// Other bitfield immediate instructions.
//===----------------------------------------------------------------------===//
// The three bitfield-move families, all declared free of side effects. BFM
// uses a separate multiclass (BitfieldImmWith2RegArgs) from SBFM/UBFM.
let hasSideEffects = 0 in {
defm BFM : BitfieldImmWith2RegArgs<0b01, "bfm">;
defm SBFM : BitfieldImm<0b00, "sbfm">;
defm UBFM : BitfieldImm<0b10, "ubfm">;
}
// Immediate transform: (32 - shift_amt) masked to 5 bits, emitted as an i64
// target constant.
def i32shift_a : Operand<i64>, SDNodeXForm<imm, [{
  const uint64_t ShiftAmt = N->getZExtValue();
  const uint64_t Encoded = (32 - ShiftAmt) & 0x1f;
  return CurDAG->getTargetConstant(Encoded, SDLoc(N), MVT::i64);
}]>;
// Immediate transform: 31 - shift_amt, emitted as an i64 target constant.
def i32shift_b : Operand<i64>, SDNodeXForm<imm, [{
  const uint64_t ShiftAmt = N->getZExtValue();
  const uint64_t Encoded = 31 - ShiftAmt;
  return CurDAG->getTargetConstant(Encoded, SDLoc(N), MVT::i64);
}]>;
// min(7, 31 - shift_amt)
def i32shift_sext_i8 : Operand<i64>, SDNodeXForm<imm, [{
  uint64_t Field = 31 - N->getZExtValue();
  // Clamp to the top bit index of an 8-bit value.
  if (Field > 7)
    Field = 7;
  return CurDAG->getTargetConstant(Field, SDLoc(N), MVT::i64);
}]>;
// min(15, 31 - shift_amt)
def i32shift_sext_i16 : Operand<i64>, SDNodeXForm<imm, [{
  uint64_t Field = 31 - N->getZExtValue();
  // Clamp to the top bit index of a 16-bit value.
  if (Field > 15)
    Field = 15;
  return CurDAG->getTargetConstant(Field, SDLoc(N), MVT::i64);
}]>;
// Immediate transform: (64 - shift_amt) masked to 6 bits, emitted as an i64
// target constant.
def i64shift_a : Operand<i64>, SDNodeXForm<imm, [{
  const uint64_t ShiftAmt = N->getZExtValue();
  const uint64_t Encoded = (64 - ShiftAmt) & 0x3f;
  return CurDAG->getTargetConstant(Encoded, SDLoc(N), MVT::i64);
}]>;
// Immediate transform: 63 - shift_amt, emitted as an i64 target constant.
def i64shift_b : Operand<i64>, SDNodeXForm<imm, [{
  const uint64_t ShiftAmt = N->getZExtValue();
  const uint64_t Encoded = 63 - ShiftAmt;
  return CurDAG->getTargetConstant(Encoded, SDLoc(N), MVT::i64);
}]>;
// min(7, 63 - shift_amt)
def i64shift_sext_i8 : Operand<i64>, SDNodeXForm<imm, [{
  uint64_t Field = 63 - N->getZExtValue();
  // Clamp to the top bit index of an 8-bit value.
  if (Field > 7)
    Field = 7;
  return CurDAG->getTargetConstant(Field, SDLoc(N), MVT::i64);
}]>;
// min(15, 63 - shift_amt)
def i64shift_sext_i16 : Operand<i64>, SDNodeXForm<imm, [{
  uint64_t Field = 63 - N->getZExtValue();
  // Clamp to the top bit index of a 16-bit value.
  if (Field > 15)
    Field = 15;
  return CurDAG->getTargetConstant(Field, SDLoc(N), MVT::i64);
}]>;
// min(31, 63 - shift_amt)
def i64shift_sext_i32 : Operand<i64>, SDNodeXForm<imm, [{
  uint64_t Field = 63 - N->getZExtValue();
  // Clamp to the top bit index of a 32-bit value.
  if (Field > 31)
    Field = 31;
  return CurDAG->getTargetConstant(Field, SDLoc(N), MVT::i64);
}]>;
// A left shift by a constant is selected as UBFM, with its two immediate
// operands derived from the shift amount by the i32shift_a/i32shift_b
// (respectively i64shift_a/i64shift_b) transforms.
def : Pat<(shl GPR32:$Rn, (i64 imm0_31:$imm)),
(UBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)),
(i64 (i32shift_b imm0_31:$imm)))>;
def : Pat<(shl GPR64:$Rn, (i64 imm0_63:$imm)),
(UBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)),
(i64 (i64shift_b imm0_63:$imm)))>;
// An arithmetic right shift by a constant is selected as SBFM with the shift
// amount as the first immediate and 31 (32-bit) or 63 (64-bit) as the second.
// These patterns carry an added complexity of 10.
let AddedComplexity = 10 in {
def : Pat<(sra GPR32:$Rn, (i64 imm0_31:$imm)),
(SBFMWri GPR32:$Rn, imm0_31:$imm, 31)>;
def : Pat<(sra GPR64:$Rn, (i64 imm0_63:$imm)),
(SBFMXri GPR64:$Rn, imm0_63:$imm, 63)>;
}
// "asr" by an immediate is the assembler spelling of the same SBFM form.
def : InstAlias<"asr $dst, $src, $shift",
(SBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>;
def : InstAlias<"asr $dst, $src, $shift",
(SBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>;
// Sign-extension aliases: SBFM with first immediate 0 and second immediate
// 7 (sxtb), 15 (sxth) or 31 (sxtw, 64-bit only).
def : InstAlias<"sxtb $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 7)>;
def : InstAlias<"sxtb $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 7)>;
def : InstAlias<"sxth $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 15)>;
def : InstAlias<"sxth $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 15)>;
def : InstAlias<"sxtw $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 31)>;
// A logical right shift by a constant is selected as UBFM with the shift
// amount as the first immediate and 31 (32-bit) or 63 (64-bit) as the second.
def : Pat<(srl GPR32:$Rn, (i64 imm0_31:$imm)),
(UBFMWri GPR32:$Rn, imm0_31:$imm, 31)>;
def : Pat<(srl GPR64:$Rn, (i64 imm0_63:$imm)),
(UBFMXri GPR64:$Rn, imm0_63:$imm, 63)>;
// "lsr" by an immediate is the assembler spelling of the same UBFM form.
def : InstAlias<"lsr $dst, $src, $shift",
(UBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>;
def : InstAlias<"lsr $dst, $src, $shift",
(UBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>;
// Zero-extension aliases: UBFM with first immediate 0 and second immediate
// 7 (uxtb), 15 (uxth) or 31 (uxtw, 64-bit only).
def : InstAlias<"uxtb $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 7)>;
def : InstAlias<"uxtb $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 7)>;
def : InstAlias<"uxth $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 15)>;
def : InstAlias<"uxth $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 15)>;
def : InstAlias<"uxtw $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 31)>;
//===----------------------------------------------------------------------===//
// Conditional comparison instructions.
//===----------------------------------------------------------------------===//
// Conditional compare-negative and conditional compare, selected from the
// AArch64ccmn and AArch64ccmp nodes respectively.
defm CCMN : CondComparison<0, "ccmn", AArch64ccmn>;
defm CCMP : CondComparison<1, "ccmp", AArch64ccmp>;
//===----------------------------------------------------------------------===//
// Conditional select instructions.
//===----------------------------------------------------------------------===//
// Plain conditional select.
defm CSEL : CondSelect<0, 0b00, "csel">;
// Helper fragment: increment by one.
def inc : PatFrag<(ops node:$in), (add node:$in, 1)>;
// Conditional selects parameterized by a unary operation: increment (CSINC),
// bitwise not (CSINV) and integer negate (CSNEG).
defm CSINC : CondSelectOp<0, 0b01, "csinc", inc>;
defm CSINV : CondSelectOp<1, 0b00, "csinv", not>;
defm CSNEG : CondSelectOp<1, 0b01, "csneg", ineg>;
// Direct selection of the AArch64csinv/csneg/csinc nodes onto the
// corresponding instructions, for both register widths.
def : Pat<(AArch64csinv GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV),
(CSINVWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
def : Pat<(AArch64csinv GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV),
(CSINVXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
def : Pat<(AArch64csneg GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV),
(CSNEGWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
def : Pat<(AArch64csneg GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV),
(CSNEGXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
def : Pat<(AArch64csinc GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV),
(CSINCWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
def : Pat<(AArch64csinc GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV),
(CSINCXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
// csel between the constants 0 and 1: CSINC of the zero register with itself.
def : Pat<(AArch64csel (i32 0), (i32 1), (i32 imm:$cc), NZCV),
(CSINCWr WZR, WZR, (i32 imm:$cc))>;
def : Pat<(AArch64csel (i64 0), (i64 1), (i32 imm:$cc), NZCV),
(CSINCXr XZR, XZR, (i32 imm:$cc))>;
// csel whose false value is the constant 1: CSINC with the zero register as
// the second source.
def : Pat<(AArch64csel GPR32:$tval, (i32 1), (i32 imm:$cc), NZCV),
(CSINCWr GPR32:$tval, WZR, (i32 imm:$cc))>;
def : Pat<(AArch64csel GPR64:$tval, (i64 1), (i32 imm:$cc), NZCV),
(CSINCXr GPR64:$tval, XZR, (i32 imm:$cc))>;
// csel whose true value is the constant 1: operands are swapped and the
// condition code is inverted with inv_cond_XFORM.
def : Pat<(AArch64csel (i32 1), GPR32:$fval, (i32 imm:$cc), NZCV),
(CSINCWr GPR32:$fval, WZR, (i32 (inv_cond_XFORM imm:$cc)))>;
def : Pat<(AArch64csel (i64 1), GPR64:$fval, (i32 imm:$cc), NZCV),
(CSINCXr GPR64:$fval, XZR, (i32 (inv_cond_XFORM imm:$cc)))>;
// The same three shapes with the constant -1 instead of 1 select CSINV.
def : Pat<(AArch64csel (i32 0), (i32 -1), (i32 imm:$cc), NZCV),
(CSINVWr WZR, WZR, (i32 imm:$cc))>;
def : Pat<(AArch64csel (i64 0), (i64 -1), (i32 imm:$cc), NZCV),
(CSINVXr XZR, XZR, (i32 imm:$cc))>;
def : Pat<(AArch64csel GPR32:$tval, (i32 -1), (i32 imm:$cc), NZCV),
(CSINVWr GPR32:$tval, WZR, (i32 imm:$cc))>;
def : Pat<(AArch64csel GPR64:$tval, (i64 -1), (i32 imm:$cc), NZCV),
(CSINVXr GPR64:$tval, XZR, (i32 imm:$cc))>;
def : Pat<(AArch64csel (i32 -1), GPR32:$fval, (i32 imm:$cc), NZCV),
(CSINVWr GPR32:$fval, WZR, (i32 (inv_cond_XFORM imm:$cc)))>;
def : Pat<(AArch64csel (i64 -1), GPR64:$fval, (i32 imm:$cc), NZCV),
(CSINVXr GPR64:$fval, XZR, (i32 (inv_cond_XFORM imm:$cc)))>;
// The inverse of the condition code from the alias instruction is what is used
// in the aliased instruction. The parser already inverts the condition code
// for these aliases.
// cset/csetm: CSINC/CSINV with the zero register as both sources.
def : InstAlias<"cset $dst, $cc",
(CSINCWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>;
def : InstAlias<"cset $dst, $cc",
(CSINCXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>;
def : InstAlias<"csetm $dst, $cc",
(CSINVWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>;
def : InstAlias<"csetm $dst, $cc",
(CSINVXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>;
// cinc/cinv/cneg: CSINC/CSINV/CSNEG with the same register as both sources.
def : InstAlias<"cinc $dst, $src, $cc",
(CSINCWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>;
def : InstAlias<"cinc $dst, $src, $cc",
(CSINCXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>;
def : InstAlias<"cinv $dst, $src, $cc",
(CSINVWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>;
def : InstAlias<"cinv $dst, $src, $cc",
(CSINVXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>;
def : InstAlias<"cneg $dst, $src, $cc",
(CSNEGWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>;
def : InstAlias<"cneg $dst, $src, $cc",
(CSNEGXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>;
//===----------------------------------------------------------------------===//
// PC-relative instructions.
//===----------------------------------------------------------------------===//
// Both ADR and ADRP are rematerializable. Only ADR is additionally declared
// free of side effects, loads and stores; ADRP sits outside that inner scope.
let isReMaterializable = 1 in {
let hasSideEffects = 0, mayStore = 0, mayLoad = 0 in {
// Selected from AArch64adr on a global address.
def ADR : ADRI<0, "adr", adrlabel,
[(set GPR64:$Xd, (AArch64adr tglobaladdr:$label))]>;
} // hasSideEffects = 0, mayStore = 0, mayLoad = 0
// Selected from AArch64adrp on a global address.
def ADRP : ADRI<1, "adrp", adrplabel,
[(set GPR64:$Xd, (AArch64adrp tglobaladdr:$label))]>;
} // isReMaterializable = 1
// Address (ADR) of a constant pool entry, block address, external symbol or
// jump table.
def : Pat<(AArch64adr tconstpool:$cp), (ADR tconstpool:$cp)>;
def : Pat<(AArch64adr tblockaddress:$cp), (ADR tblockaddress:$cp)>;
def : Pat<(AArch64adr texternalsym:$sym), (ADR texternalsym:$sym)>;
def : Pat<(AArch64adr tjumptable:$sym), (ADR tjumptable:$sym)>;
// Page address (ADRP) of a constant pool entry, block address or external
// symbol. No jump-table pattern is defined for ADRP here.
def : Pat<(AArch64adrp tconstpool:$cp), (ADRP tconstpool:$cp)>;
def : Pat<(AArch64adrp tblockaddress:$cp), (ADRP tblockaddress:$cp)>;
def : Pat<(AArch64adrp texternalsym:$sym), (ADRP texternalsym:$sym)>;
//===----------------------------------------------------------------------===//
// Unconditional branch (register) instructions.
//===----------------------------------------------------------------------===//
// Return instructions: all three are returns, terminators and barriers. RET
// has an empty pattern list, so it is not selected directly from a DAG node.
let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
def RET : BranchReg<0b0010, "ret", []>;
def DRPS : SpecialReturn<0b0101, "drps">;
def ERET : SpecialReturn<0b0100, "eret">;
} // isReturn = 1, isTerminator = 1, isBarrier = 1
// Default to the LR register.
def : InstAlias<"ret", (RET LR)>;
// Indirect call through a register: defines LR and uses SP.
let isCall = 1, Defs = [LR], Uses = [SP] in {
def BLR : BranchReg<0b0001, "blr", [(AArch64call GPR64:$Rn)]>;
} // isCall
// Indirect branch through a register, selected from brind.
let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>;
} // isBranch, isTerminator, isBarrier, isIndirectBranch
// Create a separate pseudo-instruction for codegen to use so that we don't
// flag lr as used in every function. It'll be restored before the RET by the
// epilogue if it's legitimately used.
// The pseudo takes no operands and is selected from AArch64retflag; like RET
// it is a terminator, a barrier and a return.
def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]>,
Sched<[WriteBrReg]> {
let isTerminator = 1;
let isBarrier = 1;
let isReturn = 1;
}
// This is a directive-like pseudo-instruction. The purpose is to insert an
// R_AARCH64_TLSDESC_CALL relocation at the offset of the following instruction
// (which in the usual case is a BLR).
// It has no selection pattern, is marked as having side effects, and prints
// as the ".tlsdesccall" directive followed by its symbol operand.
let hasSideEffects = 1 in
def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []>, Sched<[]> {
let AsmString = ".tlsdesccall $sym";
}
// Pseudo instruction to tell the streamer to emit a 'B' character into the
// augmentation string.
// It takes no operands, produces no results and has no selection pattern.
def EMITBKEY : Pseudo<(outs), (ins), []>, Sched<[]> {}
// FIXME: maybe the scratch register used shouldn't be fixed to X1?
// FIXME: can "hasSideEffects" be dropped?
// Codegen-only call pseudo selected from AArch64tlsdesc_callseq on a global
// TLS address. It is declared to define LR, X0 and X1.
let isCall = 1, Defs = [LR, X0, X1], hasSideEffects = 1,
isCodeGenOnly = 1 in
def TLSDESC_CALLSEQ
: Pseudo<(outs), (ins i64imm:$sym),
[(AArch64tlsdesc_callseq tglobaltlsaddr:$sym)]>,
Sched<[WriteI, WriteLD, WriteI, WriteBrReg]>;
// The same pseudo is also used when the symbol is an external symbol.
def : Pat<(AArch64tlsdesc_callseq texternalsym:$sym),
(TLSDESC_CALLSEQ texternalsym:$sym)>;
//===----------------------------------------------------------------------===//
// Conditional branch (immediate) instruction.
//===----------------------------------------------------------------------===//
// B.cond: the single conditional-branch record; everything about it comes
// from the BranchCond class.
def Bcc : BranchCond;
//===----------------------------------------------------------------------===//
// Compare-and-branch instructions.
//===----------------------------------------------------------------------===//
// CBZ / CBNZ are instantiated from one multiclass; the first argument (0/1)
// distinguishes the two, and the last is the SelectionDAG node to match.
defm CBZ : CmpBranch<0, "cbz", AArch64cbz>;
defm CBNZ : CmpBranch<1, "cbnz", AArch64cbnz>;
//===----------------------------------------------------------------------===//
// Test-bit-and-branch instructions.
//===----------------------------------------------------------------------===//
// TBZ / TBNZ follow the same scheme as CBZ / CBNZ above.
defm TBZ : TestBranch<0, "tbz", AArch64tbz>;
defm TBNZ : TestBranch<1, "tbnz", AArch64tbnz>;
//===----------------------------------------------------------------------===//
// Unconditional branch (immediate) instructions.
//===----------------------------------------------------------------------===//
// Direct unconditional branch to a basic block.
let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
  def B : BranchImm<0, "b", [(br bb:$addr)]>;
} // isBranch, isTerminator, isBarrier

// Direct call to a global address. Modelled as defining LR and using SP.
let isCall = 1, Defs = [LR], Uses = [SP] in {
  def BL : CallImm<1, "bl", [(AArch64call tglobaladdr:$addr)]>;
} // isCall

// Calls to external symbols select BL as well.
def : Pat<(AArch64call texternalsym:$func),
          (BL texternalsym:$func)>;
//===----------------------------------------------------------------------===//
// Exception generation instructions.
//===----------------------------------------------------------------------===//
// BRK is the only exception-generating instruction marked as a trap here.
let isTrap = 1 in {
  def BRK : ExceptionGeneration<0b001, 0b00, "brk">;
}

def DCPS1 : ExceptionGeneration<0b101, 0b01, "dcps1">;
def DCPS2 : ExceptionGeneration<0b101, 0b10, "dcps2">;
def DCPS3 : ExceptionGeneration<0b101, 0b11, "dcps3">;
def HLT   : ExceptionGeneration<0b010, 0b00, "hlt">;
def HVC   : ExceptionGeneration<0b000, 0b10, "hvc">;
def SMC   : ExceptionGeneration<0b000, 0b11, "smc">;
def SVC   : ExceptionGeneration<0b000, 0b01, "svc">;

// When the immediate is omitted, DCPSn assembles with an operand of zero.
def : InstAlias<"dcps1", (DCPS1 0)>;
def : InstAlias<"dcps2", (DCPS2 0)>;
def : InstAlias<"dcps3", (DCPS3 0)>;

def UDF : UDFType<0, "udf">;
//===----------------------------------------------------------------------===//
// Load instructions.
//===----------------------------------------------------------------------===//
// Load-pair instructions. The name suffix follows the register class used:
// W/X take GPR32z/GPR64z, S/D/Q take FPR32Op/FPR64Op/FPR128Op. The immediate
// operand class (simm7s4/s8/s16) varies with the access size; LDPSW loads
// into GPR64z but uses the 4-byte-scaled immediate.
// Pair (indexed, offset)
defm LDPW : LoadPairOffset<0b00, 0, GPR32z, simm7s4, "ldp">;
defm LDPX : LoadPairOffset<0b10, 0, GPR64z, simm7s8, "ldp">;
defm LDPS : LoadPairOffset<0b00, 1, FPR32Op, simm7s4, "ldp">;
defm LDPD : LoadPairOffset<0b01, 1, FPR64Op, simm7s8, "ldp">;
defm LDPQ : LoadPairOffset<0b10, 1, FPR128Op, simm7s16, "ldp">;
defm LDPSW : LoadPairOffset<0b01, 0, GPR64z, simm7s4, "ldpsw">;
// Pair (pre-indexed)
def LDPWpre : LoadPairPreIdx<0b00, 0, GPR32z, simm7s4, "ldp">;
def LDPXpre : LoadPairPreIdx<0b10, 0, GPR64z, simm7s8, "ldp">;
def LDPSpre : LoadPairPreIdx<0b00, 1, FPR32Op, simm7s4, "ldp">;
def LDPDpre : LoadPairPreIdx<0b01, 1, FPR64Op, simm7s8, "ldp">;
def LDPQpre : LoadPairPreIdx<0b10, 1, FPR128Op, simm7s16, "ldp">;
def LDPSWpre : LoadPairPreIdx<0b01, 0, GPR64z, simm7s4, "ldpsw">;
// Pair (post-indexed)
def LDPWpost : LoadPairPostIdx<0b00, 0, GPR32z, simm7s4, "ldp">;
def LDPXpost : LoadPairPostIdx<0b10, 0, GPR64z, simm7s8, "ldp">;
def LDPSpost : LoadPairPostIdx<0b00, 1, FPR32Op, simm7s4, "ldp">;
def LDPDpost : LoadPairPostIdx<0b01, 1, FPR64Op, simm7s8, "ldp">;
def LDPQpost : LoadPairPostIdx<0b10, 1, FPR128Op, simm7s16, "ldp">;
def LDPSWpost : LoadPairPostIdx<0b01, 0, GPR64z, simm7s4, "ldpsw">;
// Pair (no allocate)
// Note: there is no LDNPSW entry in this group.
defm LDNPW : LoadPairNoAlloc<0b00, 0, GPR32z, simm7s4, "ldnp">;
defm LDNPX : LoadPairNoAlloc<0b10, 0, GPR64z, simm7s8, "ldnp">;
defm LDNPS : LoadPairNoAlloc<0b00, 1, FPR32Op, simm7s4, "ldnp">;
defm LDNPD : LoadPairNoAlloc<0b01, 1, FPR64Op, simm7s8, "ldnp">;
defm LDNPQ : LoadPairNoAlloc<0b10, 1, FPR128Op, simm7s16, "ldnp">;
//---
// (register offset)
//---
// Register-offset loads. Each defm passes the destination register class,
// the mnemonic, the value type produced and the load operator to match.
// Integer
defm LDRBB : Load8RO<0b00, 0, 0b01, GPR32, "ldrb", i32, zextloadi8>;
defm LDRHH : Load16RO<0b01, 0, 0b01, GPR32, "ldrh", i32, zextloadi16>;
defm LDRW : Load32RO<0b10, 0, 0b01, GPR32, "ldr", i32, load>;
defm LDRX : Load64RO<0b11, 0, 0b01, GPR64, "ldr", i64, load>;
// Floating-point
// The 8-bit FP register form is given the `untyped` value type.
defm LDRB : Load8RO<0b00, 1, 0b01, FPR8Op, "ldr", untyped, load>;
defm LDRH : Load16RO<0b01, 1, 0b01, FPR16Op, "ldr", f16, load>;
defm LDRS : Load32RO<0b10, 1, 0b01, FPR32Op, "ldr", f32, load>;
defm LDRD : Load64RO<0b11, 1, 0b01, FPR64Op, "ldr", f64, load>;
defm LDRQ : Load128RO<0b00, 1, 0b11, FPR128Op, "ldr", f128, load>;
// Load sign-extended half-word
defm LDRSHW : Load16RO<0b01, 0, 0b11, GPR32, "ldrsh", i32, sextloadi16>;
defm LDRSHX : Load16RO<0b01, 0, 0b10, GPR64, "ldrsh", i64, sextloadi16>;
// Load sign-extended byte
defm LDRSBW : Load8RO<0b00, 0, 0b11, GPR32, "ldrsb", i32, sextloadi8>;
defm LDRSBX : Load8RO<0b00, 0, 0b10, GPR64, "ldrsb", i64, sextloadi8>;
// Load sign-extended word
defm LDRSW : Load32RO<0b10, 0, 0b10, GPR64, "ldrsw", i64, sextloadi32>;
// Pre-fetch.
defm PRFM : PrefetchRO<0b11, 0, 0b10, "prfm">;
// Regular loads carry no alignment requirement, so a scalar load that feeds
// scalar_to_vector can be selected directly as a register-offset FP/SIMD load
// whose result is inserted into the `SubIdx` sub-register of an undefined
// vector.
// FIXME: We could do the same for bitconvert to floating point vectors.
//
// Template arguments (positional):
//   Mode   - register-offset addressing mode descriptor (W and X variants).
//   LoadOp - load operator to match.
//   EltTy  - scalar type produced by the load.
//   ResTy  - vector type produced by scalar_to_vector.
//   InstW  - instruction for the 32-bit index register form.
//   InstX  - instruction for the 64-bit index register form.
//   SubIdx - sub-register index that receives the loaded value.
multiclass ScalToVecROLoadPat<ROAddrMode Mode, SDPatternOperator LoadOp,
                              ValueType EltTy, ValueType ResTy,
                              Instruction InstW, Instruction InstX,
                              SubRegIndex SubIdx> {
  // 32-bit (W) index register.
  def : Pat<(ResTy (scalar_to_vector (EltTy
              (LoadOp (Mode.Wpat GPR64sp:$Rn, GPR32:$Rm, Mode.Wext:$offset))))),
            (INSERT_SUBREG (ResTy (IMPLICIT_DEF)),
                           (InstW GPR64sp:$Rn, GPR32:$Rm, Mode.Wext:$offset),
                           SubIdx)>;

  // 64-bit (X) index register.
  def : Pat<(ResTy (scalar_to_vector (EltTy
              (LoadOp (Mode.Xpat GPR64sp:$Rn, GPR64:$Rm, Mode.Xext:$offset))))),
            (INSERT_SUBREG (ResTy (IMPLICIT_DEF)),
                           (InstX GPR64sp:$Rn, GPR64:$Rm, Mode.Xext:$offset),
                           SubIdx)>;
}
// Instantiate the scalar-load-into-vector patterns for each element size,
// with raised complexity.
let AddedComplexity = 10 in {
  // 8-bit elements.
  defm : ScalToVecROLoadPat<ro8, extloadi8, i32, v8i8, LDRBroW, LDRBroX, bsub>;
  defm : ScalToVecROLoadPat<ro8, extloadi8, i32, v16i8, LDRBroW, LDRBroX, bsub>;

  // 16-bit elements.
  defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v4i16, LDRHroW, LDRHroX, hsub>;
  defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v8i16, LDRHroW, LDRHroX, hsub>;
  defm : ScalToVecROLoadPat<ro16, load, i32, v4f16, LDRHroW, LDRHroX, hsub>;
  defm : ScalToVecROLoadPat<ro16, load, i32, v8f16, LDRHroW, LDRHroX, hsub>;

  // 32-bit elements.
  defm : ScalToVecROLoadPat<ro32, load, i32, v2i32, LDRSroW, LDRSroX, ssub>;
  defm : ScalToVecROLoadPat<ro32, load, i32, v4i32, LDRSroW, LDRSroX, ssub>;
  defm : ScalToVecROLoadPat<ro32, load, f32, v2f32, LDRSroW, LDRSroX, ssub>;
  defm : ScalToVecROLoadPat<ro32, load, f32, v4f32, LDRSroW, LDRSroX, ssub>;

  // 64-bit elements.
  defm : ScalToVecROLoadPat<ro64, load, i64, v2i64, LDRDroW, LDRDroX, dsub>;
  defm : ScalToVecROLoadPat<ro64, load, f64, v2f64, LDRDroW, LDRDroX, dsub>;

  // v1i64 uses the D-register load result directly, with no INSERT_SUBREG.
  def : Pat <(v1i64 (scalar_to_vector (i64
                (load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
                                     ro_Wextend64:$extend))))),
             (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;
  def : Pat <(v1i64 (scalar_to_vector (i64
                (load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
                                     ro_Xextend64:$extend))))),
             (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
}
// Selects a whole-vector `load` of type `Ty` through a register-offset
// addressing mode as a single FP/SIMD register load.
//
// Template arguments (positional):
//   Mode  - register-offset addressing mode descriptor.
//   Ty    - vector type being loaded.
//   InstW - instruction for the 32-bit index register form.
//   InstX - instruction for the 64-bit index register form.
multiclass VecROLoadPat<ROAddrMode Mode, ValueType Ty,
                        Instruction InstW, Instruction InstX> {
  // 32-bit (W) index register.
  def : Pat<(Ty (load (Mode.Wpat GPR64sp:$Rn, GPR32:$Rm, Mode.Wext:$extend))),
            (InstW GPR64sp:$Rn, GPR32:$Rm, Mode.Wext:$extend)>;

  // 64-bit (X) index register.
  def : Pat<(Ty (load (Mode.Xpat GPR64sp:$Rn, GPR64:$Rm, Mode.Xext:$extend))),
            (InstX GPR64sp:$Rn, GPR64:$Rm, Mode.Xext:$extend)>;
}
let AddedComplexity = 10 in {
  // 64-bit vectors: match every type compatible with FPR64.
  let Predicates = [IsLE] in {
    // We must do vector loads with LD1 in big-endian, so these multi-element
    // types are little-endian only.
    defm : VecROLoadPat<ro64, v2i32, LDRDroW, LDRDroX>;
    defm : VecROLoadPat<ro64, v2f32, LDRDroW, LDRDroX>;
    defm : VecROLoadPat<ro64, v8i8, LDRDroW, LDRDroX>;
    defm : VecROLoadPat<ro64, v4i16, LDRDroW, LDRDroX>;
    defm : VecROLoadPat<ro64, v4f16, LDRDroW, LDRDroX>;
  }

  // The single-element 64-bit vectors carry no endianness predicate.
  defm : VecROLoadPat<ro64, v1i64, LDRDroW, LDRDroX>;
  defm : VecROLoadPat<ro64, v1f64, LDRDroW, LDRDroX>;

  // 128-bit vectors: match every type compatible with FPR128.
  let Predicates = [IsLE] in {
    // We must do vector loads with LD1 in big-endian.
    defm : VecROLoadPat<ro128, v2i64, LDRQroW, LDRQroX>;
    defm : VecROLoadPat<ro128, v2f64, LDRQroW, LDRQroX>;
    defm : VecROLoadPat<ro128, v4i32, LDRQroW, LDRQroX>;
    defm : VecROLoadPat<ro128, v4f32, LDRQroW, LDRQroX>;
    defm : VecROLoadPat<ro128, v8i16, LDRQroW, LDRQroX>;
    defm : VecROLoadPat<ro128, v8f16, LDRQroW, LDRQroX>;
    defm : VecROLoadPat<ro128, v16i8, LDRQroW, LDRQroX>;
  }
} // AddedComplexity = 10
// Extending load to i64 via a register-offset address: select the 32-bit
// load and wrap its result with SUBREG_TO_REG (sub_32) to form the i64 value.
//
// Template arguments (positional):
//   Mode   - register-offset addressing mode descriptor.
//   LoadOp - extending-load operator to match.
//   LoadW  - instruction for the 32-bit index register form.
//   LoadX  - instruction for the 64-bit index register form.
multiclass ExtLoadTo64ROPat<ROAddrMode Mode, SDPatternOperator LoadOp,
                            Instruction LoadW, Instruction LoadX> {
  // 32-bit (W) index register.
  def : Pat<(i64 (LoadOp (Mode.Wpat GPR64sp:$Rn, GPR32:$Rm, Mode.Wext:$extend))),
            (SUBREG_TO_REG (i64 0),
                           (LoadW GPR64sp:$Rn, GPR32:$Rm, Mode.Wext:$extend),
                           sub_32)>;

  // 64-bit (X) index register.
  def : Pat<(i64 (LoadOp (Mode.Xpat GPR64sp:$Rn, GPR64:$Rm, Mode.Xext:$extend))),
            (SUBREG_TO_REG (i64 0),
                           (LoadX GPR64sp:$Rn, GPR64:$Rm, Mode.Xext:$extend),
                           sub_32)>;
}
// All i64 extending loads below are implemented with the zero-extending
// 32-bit load instructions.
let AddedComplexity = 10 in {
  // zextload
  defm : ExtLoadTo64ROPat<ro8, zextloadi8, LDRBBroW, LDRBBroX>;
  defm : ExtLoadTo64ROPat<ro16, zextloadi16, LDRHHroW, LDRHHroX>;
  defm : ExtLoadTo64ROPat<ro32, zextloadi32, LDRWroW, LDRWroX>;

  // zextloadi1 -> zextloadi8
  defm : ExtLoadTo64ROPat<ro8, zextloadi1, LDRBBroW, LDRBBroX>;

  // extload -> zextload
  defm : ExtLoadTo64ROPat<ro8, extloadi8, LDRBBroW, LDRBBroX>;
  defm : ExtLoadTo64ROPat<ro16, extloadi16, LDRHHroW, LDRHHroX>;
  defm : ExtLoadTo64ROPat<ro32, extloadi32, LDRWroW, LDRWroX>;

  // extloadi1 -> zextloadi8
  defm : ExtLoadTo64ROPat<ro8, extloadi1, LDRBBroW, LDRBBroX>;
}
// extload/zextload -> i32
// Extending load to i32 via a register-offset address: the matched load maps
// straight onto the given instruction, with no sub-register wrapping.
//
// Template arguments (positional):
//   Mode   - register-offset addressing mode descriptor.
//   LoadOp - extending-load operator to match.
//   LoadW  - instruction for the 32-bit index register form.
//   LoadX  - instruction for the 64-bit index register form.
multiclass ExtLoadTo32ROPat<ROAddrMode Mode, SDPatternOperator LoadOp,
                            Instruction LoadW, Instruction LoadX> {
  // 32-bit (W) index register.
  def : Pat<(i32 (LoadOp (Mode.Wpat GPR64sp:$Rn, GPR32:$Rm, Mode.Wext:$extend))),
            (LoadW GPR64sp:$Rn, GPR32:$Rm, Mode.Wext:$extend)>;

  // 64-bit (X) index register.
  def : Pat<(i32 (LoadOp (Mode.Xpat GPR64sp:$Rn, GPR64:$Rm, Mode.Xext:$extend))),
            (LoadX GPR64sp:$Rn, GPR64:$Rm, Mode.Xext:$extend)>;
}
let AddedComplexity = 10 in {
  // extload -> zextload
  defm : ExtLoadTo32ROPat<ro8, extloadi8, LDRBBroW, LDRBBroX>;
  defm : ExtLoadTo32ROPat<ro16, extloadi16, LDRHHroW, LDRHHroX>;
  defm : ExtLoadTo32ROPat<ro32, extloadi32, LDRWroW, LDRWroX>;

  // zextloadi1 -> zextloadi8
  defm : ExtLoadTo32ROPat<ro8, zextloadi1, LDRBBroW, LDRBBroX>;
}
//---
// (unsigned immediate)
//---
// Scaled unsigned-immediate loads. Each pattern pairs the am_indexedN
// addressing mode with the uimm12sM operand of the matching access size.

// Integer registers.
defm LDRX : LoadUI<0b11, 0, 0b01, GPR64z, uimm12s8, "ldr",
                   [(set GPR64z:$Rt,
                         (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>;
defm LDRW : LoadUI<0b10, 0, 0b01, GPR32z, uimm12s4, "ldr",
                   [(set GPR32z:$Rt,
                         (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>;

// FP/SIMD registers.
defm LDRB : LoadUI<0b00, 1, 0b01, FPR8Op, uimm12s1, "ldr",
                   [(set FPR8Op:$Rt,
                         (load (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)))]>;
defm LDRH : LoadUI<0b01, 1, 0b01, FPR16Op, uimm12s2, "ldr",
                   [(set (f16 FPR16Op:$Rt),
                         (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)))]>;
defm LDRS : LoadUI<0b10, 1, 0b01, FPR32Op, uimm12s4, "ldr",
                   [(set (f32 FPR32Op:$Rt),
                         (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>;
defm LDRD : LoadUI<0b11, 1, 0b01, FPR64Op, uimm12s8, "ldr",
                   [(set (f64 FPR64Op:$Rt),
                         (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>;
defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128Op, uimm12s16, "ldr",
                   [(set (f128 FPR128Op:$Rt),
                         (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)))]>;
// Regular loads have no alignment requirement, so a scalar load feeding
// scalar_to_vector can be mapped directly onto an FP/SIMD load using the
// scaled-immediate addressing mode.
// FIXME: We could do the same for bitconvert to floating point vectors.

// 8-bit elements -> bsub.
def : Pat <(v8i8 (scalar_to_vector (i32
               (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
           (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)),
                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
def : Pat <(v16i8 (scalar_to_vector (i32
               (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;

// 16-bit elements -> hsub.
def : Pat <(v4i16 (scalar_to_vector (i32
               (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
           (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
                          (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
def : Pat <(v8i16 (scalar_to_vector (i32
               (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
           (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
                          (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;

// 32-bit elements -> ssub.
def : Pat <(v2i32 (scalar_to_vector (i32
               (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
           (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
                          (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
def : Pat <(v4i32 (scalar_to_vector (i32
               (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
           (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
                          (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;

// 64-bit elements. v1i64 uses the D-register load result directly; v2i64
// inserts it into dsub.
def : Pat <(v1i64 (scalar_to_vector (i64
               (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
           (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
def : Pat <(v2i64 (scalar_to_vector (i64
               (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
           (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
                          (LDRDui GPR64sp:$Rn, uimm12s8:$offset), dsub)>;
// 64-bit loads: match every type compatible with FPR64.
let Predicates = [IsLE] in {
  // We must use LD1 to perform vector loads in big-endian.
  def : Pat<(v2f32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
            (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
  def : Pat<(v8i8 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
            (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
  def : Pat<(v4i16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
            (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
  def : Pat<(v2i32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
            (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
  def : Pat<(v4f16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
            (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
}

// Single-element 64-bit vectors carry no endianness predicate.
def : Pat<(v1f64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
          (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
def : Pat<(v1i64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
          (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;

// 128-bit loads: match every type compatible with FPR128.
let Predicates = [IsLE] in {
  // We must use LD1 to perform vector loads in big-endian.
  def : Pat<(v4f32 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
  def : Pat<(v2f64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
  def : Pat<(v16i8 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
  def : Pat<(v8i16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
  def : Pat<(v4i32 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
  def : Pat<(v2i64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
  def : Pat<(v8f16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
}

// f128 is matched without an endianness predicate.
def : Pat<(f128 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
          (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
// Zero-extending half-word and byte loads into a 32-bit GPR.
defm LDRHH : LoadUI<0b01, 0, 0b01, GPR32, uimm12s2, "ldrh",
                    [(set GPR32:$Rt,
                          (zextloadi16 (am_indexed16 GPR64sp:$Rn,
                                                     uimm12s2:$offset)))]>;
defm LDRBB : LoadUI<0b00, 0, 0b01, GPR32, uimm12s1, "ldrb",
                    [(set GPR32:$Rt,
                          (zextloadi8 (am_indexed8 GPR64sp:$Rn,
                                                   uimm12s1:$offset)))]>;
// Extending loads through the scaled-immediate addressing mode. Every i64
// result is formed by wrapping the 32-bit load in SUBREG_TO_REG (sub_32).

// zextload -> i64
def : Pat<(i64 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
          (SUBREG_TO_REG (i64 0),
                         (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;
def : Pat<(i64 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
          (SUBREG_TO_REG (i64 0),
                         (LDRHHui GPR64sp:$Rn, uimm12s2:$offset), sub_32)>;

// zextloadi1 -> zextloadi8
def : Pat<(i32 (zextloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
          (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>;
def : Pat<(i64 (zextloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
          (SUBREG_TO_REG (i64 0),
                         (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;

// extload -> zextload (i32 results)
def : Pat<(i32 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
          (LDRHHui GPR64sp:$Rn, uimm12s2:$offset)>;
def : Pat<(i32 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
          (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>;
def : Pat<(i32 (extloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
          (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>;

// extload -> zextload (i64 results)
def : Pat<(i64 (extloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))),
          (SUBREG_TO_REG (i64 0),
                         (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>;
def : Pat<(i64 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
          (SUBREG_TO_REG (i64 0),
                         (LDRHHui GPR64sp:$Rn, uimm12s2:$offset), sub_32)>;
def : Pat<(i64 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
          (SUBREG_TO_REG (i64 0),
                         (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;
def : Pat<(i64 (extloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
          (SUBREG_TO_REG (i64 0),
                         (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;
// Sign-extending scaled-immediate loads. The W variants produce a GPR32
// result and the X variants a GPR64 result.

// load sign-extended half-word
defm LDRSHW : LoadUI<0b01, 0, 0b11, GPR32, uimm12s2, "ldrsh",
                     [(set GPR32:$Rt,
                           (sextloadi16 (am_indexed16 GPR64sp:$Rn,
                                                      uimm12s2:$offset)))]>;
defm LDRSHX : LoadUI<0b01, 0, 0b10, GPR64, uimm12s2, "ldrsh",
                     [(set GPR64:$Rt,
                           (sextloadi16 (am_indexed16 GPR64sp:$Rn,
                                                      uimm12s2:$offset)))]>;

// load sign-extended byte
defm LDRSBW : LoadUI<0b00, 0, 0b11, GPR32, uimm12s1, "ldrsb",
                     [(set GPR32:$Rt,
                           (sextloadi8 (am_indexed8 GPR64sp:$Rn,
                                                    uimm12s1:$offset)))]>;
defm LDRSBX : LoadUI<0b00, 0, 0b10, GPR64, uimm12s1, "ldrsb",
                     [(set GPR64:$Rt,
                           (sextloadi8 (am_indexed8 GPR64sp:$Rn,
                                                    uimm12s1:$offset)))]>;

// load sign-extended word
defm LDRSW : LoadUI<0b10, 0, 0b10, GPR64, uimm12s4, "ldrsw",
                    [(set GPR64:$Rt,
                          (sextloadi32 (am_indexed32 GPR64sp:$Rn,
                                                     uimm12s4:$offset)))]>;

// load zero-extended word: a 32-bit load wrapped in SUBREG_TO_REG.
def : Pat<(i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))),
          (SUBREG_TO_REG (i64 0),
                         (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>;
// Prefetch with a scaled unsigned-immediate offset; the address uses the
// 64-bit indexed mode.
def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm",
                        [(AArch64Prefetch imm:$Rt,
                                          (am_indexed64 GPR64sp:$Rn,
                                                        uimm12s8:$offset))]>;

// "prfm $Rt, [$Rn]" with no offset assembles as PRFMui with offset 0.
def : InstAlias<"prfm $Rt, [$Rn]", (PRFMui prfop:$Rt, GPR64sp:$Rn, 0)>;
//---
// (literal)
// Pattern leaf that accepts an address operand only when it is known to be
// 4-byte aligned:
//  - a global address whose pointer alignment is at least 4 and whose offset
//    is a multiple of 4, or
//  - a constant-pool entry whose alignment is at least 4 and whose offset is
//    a multiple of 4.
// Any other node kind is rejected.
def alignedglobal : PatLeaf<(iPTR iPTR:$label), [{
  if (auto *GA = dyn_cast<GlobalAddressSDNode>(N)) {
    if (GA->getOffset() % 4 != 0)
      return false;
    const DataLayout &Layout = MF->getDataLayout();
    unsigned GlobalAlign = GA->getGlobal()->getPointerAlignment(Layout);
    return GlobalAlign >= 4;
  }
  if (auto *CP = dyn_cast<ConstantPoolSDNode>(N)) {
    if (CP->getOffset() % 4 != 0)
      return false;
    return CP->getAlignment() >= 4;
  }
  return false;
}]>;
// Literal loads. Each one matches a load whose address is an AArch64adr of
// an `alignedglobal` label.
def LDRWl : LoadLiteral<0b00, 0, GPR32z, "ldr",
    [(set GPR32z:$Rt, (load (AArch64adr alignedglobal:$label)))]>;
def LDRXl : LoadLiteral<0b01, 0, GPR64z, "ldr",
    [(set GPR64z:$Rt, (load (AArch64adr alignedglobal:$label)))]>;
def LDRSl : LoadLiteral<0b00, 1, FPR32Op, "ldr",
    [(set (f32 FPR32Op:$Rt), (load (AArch64adr alignedglobal:$label)))]>;
def LDRDl : LoadLiteral<0b01, 1, FPR64Op, "ldr",
    [(set (f64 FPR64Op:$Rt), (load (AArch64adr alignedglobal:$label)))]>;
def LDRQl : LoadLiteral<0b10, 1, FPR128Op, "ldr",
    [(set (f128 FPR128Op:$Rt), (load (AArch64adr alignedglobal:$label)))]>;

// load sign-extended word
def LDRSWl : LoadLiteral<0b10, 0, GPR64z, "ldrsw",
    [(set GPR64z:$Rt, (sextloadi32 (AArch64adr alignedglobal:$label)))]>;

// Zero-extending 32-bit literal load to i64: LDRWl wrapped in SUBREG_TO_REG.
let AddedComplexity = 20 in {
  def : Pat<(i64 (zextloadi32 (AArch64adr alignedglobal:$label))),
            (SUBREG_TO_REG (i64 0), (LDRWl $label), sub_32)>;
}

// prefetch (defined without a selection pattern; the candidate pattern is
// left commented out below).
def PRFMl : PrefetchLiteral<0b11, 0, "prfm", []>;
// [(AArch64Prefetch imm:$Rt, tglobaladdr:$label)]>;
//---
// (unscaled immediate)
// Unscaled-immediate loads. Each pattern pairs the am_unscaledN addressing
// mode of the access size with a simm9 offset.

// Integer registers.
defm LDURX : LoadUnscaled<0b11, 0, 0b01, GPR64z, "ldur",
    [(set GPR64z:$Rt,
          (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>;
defm LDURW : LoadUnscaled<0b10, 0, 0b01, GPR32z, "ldur",
    [(set GPR32z:$Rt,
          (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;

// FP/SIMD registers.
defm LDURB : LoadUnscaled<0b00, 1, 0b01, FPR8Op, "ldur",
    [(set FPR8Op:$Rt,
          (load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
defm LDURH : LoadUnscaled<0b01, 1, 0b01, FPR16Op, "ldur",
    [(set FPR16Op:$Rt,
          (load (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
defm LDURS : LoadUnscaled<0b10, 1, 0b01, FPR32Op, "ldur",
    [(set (f32 FPR32Op:$Rt),
          (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
defm LDURD : LoadUnscaled<0b11, 1, 0b01, FPR64Op, "ldur",
    [(set (f64 FPR64Op:$Rt),
          (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>;
defm LDURQ : LoadUnscaled<0b00, 1, 0b11, FPR128Op, "ldur",
    [(set (f128 FPR128Op:$Rt),
          (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset)))]>;

// Zero-extending half-word load into a 32-bit GPR.
defm LDURHH
    : LoadUnscaled<0b01, 0, 0b01, GPR32, "ldurh",
        [(set GPR32:$Rt,
              (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
// Zero-extending byte load into a 32-bit GPR.
// The address uses the byte-sized unscaled addressing mode (am_unscaled8),
// in line with the other byte-sized unscaled patterns in this file (the
// extloadi8/zextloadi8 Pats and LDURSBW/LDURSBX).
defm LDURBB
    : LoadUnscaled<0b00, 0, 0b01, GPR32, "ldurb",
        [(set GPR32:$Rt,
              (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
// 64-bit loads: match every type compatible with FPR64.
// Multi-element vector types are selected for little-endian only.
let Predicates = [IsLE] in {
  def : Pat<(v2f32 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
            (LDURDi GPR64sp:$Rn, simm9:$offset)>;
  def : Pat<(v2i32 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
            (LDURDi GPR64sp:$Rn, simm9:$offset)>;
  def : Pat<(v4i16 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
            (LDURDi GPR64sp:$Rn, simm9:$offset)>;
  def : Pat<(v8i8 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
            (LDURDi GPR64sp:$Rn, simm9:$offset)>;
  def : Pat<(v4f16 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
            (LDURDi GPR64sp:$Rn, simm9:$offset)>;
}

// Single-element 64-bit vectors carry no endianness predicate.
def : Pat<(v1f64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
          (LDURDi GPR64sp:$Rn, simm9:$offset)>;
def : Pat<(v1i64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
          (LDURDi GPR64sp:$Rn, simm9:$offset)>;

// 128-bit loads: match every type compatible with FPR128.
let Predicates = [IsLE] in {
  def : Pat<(v2f64 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
  def : Pat<(v2i64 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
  def : Pat<(v4f32 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
  def : Pat<(v4i32 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
  def : Pat<(v8i16 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
  def : Pat<(v16i8 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
  def : Pat<(v8f16 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
}
// Extending loads through the unscaled addressing mode. Any-extending loads
// are implemented with the zero-extending instructions, and i64 results wrap
// the 32-bit load in SUBREG_TO_REG (sub_32).

// anyext -> zext (i32 results)
def : Pat<(i32 (extloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
          (LDURHHi GPR64sp:$Rn, simm9:$offset)>;
def : Pat<(i32 (extloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
          (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
def : Pat<(i32 (extloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
          (LDURBBi GPR64sp:$Rn, simm9:$offset)>;

// anyext -> zext (i64 results)
def : Pat<(i64 (extloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))),
          (SUBREG_TO_REG (i64 0),
                         (LDURWi GPR64sp:$Rn, simm9:$offset), sub_32)>;
def : Pat<(i64 (extloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
          (SUBREG_TO_REG (i64 0),
                         (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>;
def : Pat<(i64 (extloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
          (SUBREG_TO_REG (i64 0),
                         (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
def : Pat<(i64 (extloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
          (SUBREG_TO_REG (i64 0),
                         (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;

// unscaled zext (i32 results)
def : Pat<(i32 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
          (LDURHHi GPR64sp:$Rn, simm9:$offset)>;
def : Pat<(i32 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
          (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
def : Pat<(i32 (zextloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
          (LDURBBi GPR64sp:$Rn, simm9:$offset)>;

// unscaled zext (i64 results)
def : Pat<(i64 (zextloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))),
          (SUBREG_TO_REG (i64 0),
                         (LDURWi GPR64sp:$Rn, simm9:$offset), sub_32)>;
def : Pat<(i64 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
          (SUBREG_TO_REG (i64 0),
                         (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>;
def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
          (SUBREG_TO_REG (i64 0),
                         (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
def : Pat<(i64 (zextloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
          (SUBREG_TO_REG (i64 0),
                         (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
//---
// LDR mnemonics fall back to LDUR for negative or unaligned offsets.
// Define new assembler match classes as we want to only match these when
// they don't otherwise match the scaled addressing mode for LDR/STR. Don't
// associate a DiagnosticType either, as we want the diagnostic for the
// canonical form (the scaled operand) to take precedence.
// Assembler operand class for the simm9 "fallback" offsets, parameterised by
// access width in bits. The width is appended to the class name and passed as
// the template argument of the predicate method. No DiagnosticType is set.
class SImm9OffsetOperand<int Width> : AsmOperandClass {
  let Name            = "SImm9OffsetFB" # Width;
  let PredicateMethod = "isSImm9OffsetFB<" # Width # ">";
  let RenderMethod    = "addImmOperands";
}

// One match class per access width.
def SImm9OffsetFB8Operand   : SImm9OffsetOperand<8>;
def SImm9OffsetFB16Operand  : SImm9OffsetOperand<16>;
def SImm9OffsetFB32Operand  : SImm9OffsetOperand<32>;
def SImm9OffsetFB64Operand  : SImm9OffsetOperand<64>;
def SImm9OffsetFB128Operand : SImm9OffsetOperand<128>;

// i64 operands bound to the match classes above.
def simm9_offset_fb8 : Operand<i64> {
  let ParserMatchClass = SImm9OffsetFB8Operand;
}
def simm9_offset_fb16 : Operand<i64> {
  let ParserMatchClass = SImm9OffsetFB16Operand;
}
def simm9_offset_fb32 : Operand<i64> {
  let ParserMatchClass = SImm9OffsetFB32Operand;
}
def simm9_offset_fb64 : Operand<i64> {
  let ParserMatchClass = SImm9OffsetFB64Operand;
}
def simm9_offset_fb128 : Operand<i64> {
  let ParserMatchClass = SImm9OffsetFB128Operand;
}
// "ldr" spelled with a fallback simm9 offset assembles as the matching LDUR
// instruction. The trailing 0 is the alias's emit argument.

// Integer registers.
def : InstAlias<"ldr $Rt, [$Rn, $offset]",
                (LDURXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
def : InstAlias<"ldr $Rt, [$Rn, $offset]",
                (LDURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;

// FP/SIMD registers.
def : InstAlias<"ldr $Rt, [$Rn, $offset]",
                (LDURBi FPR8Op:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
def : InstAlias<"ldr $Rt, [$Rn, $offset]",
                (LDURHi FPR16Op:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
def : InstAlias<"ldr $Rt, [$Rn, $offset]",
                (LDURSi FPR32Op:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
def : InstAlias<"ldr $Rt, [$Rn, $offset]",
                (LDURDi FPR64Op:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
def : InstAlias<"ldr $Rt, [$Rn, $offset]",
                (LDURQi FPR128Op:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>;
// zextload -> i64
// Zero-extending byte / half-word loads to i64 through the unscaled
// addressing mode: select the 32-bit LDURBB/LDURHH and wrap the result in
// SUBREG_TO_REG (sub_32).
// NOTE(review): the "unscaled zext" group earlier in this file already
// contains identical i64 zextloadi8 / zextloadi16 patterns; these two look
// redundant - confirm before removing either copy.
def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
(SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
def : Pat<(i64 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
(SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>;
// Sign-extending unscaled loads. The W variants produce a GPR32 result and
// the X variants a GPR64 result.

// load sign-extended half-word
defm LDURSHW
    : LoadUnscaled<0b01, 0, 0b11, GPR32, "ldursh",
        [(set GPR32:$Rt,
              (sextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
defm LDURSHX
    : LoadUnscaled<0b01, 0, 0b10, GPR64, "ldursh",
        [(set GPR64:$Rt,
              (sextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;

// load sign-extended byte
defm LDURSBW
    : LoadUnscaled<0b00, 0, 0b11, GPR32, "ldursb",
        [(set GPR32:$Rt,
              (sextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
defm LDURSBX
    : LoadUnscaled<0b00, 0, 0b10, GPR64, "ldursb",
        [(set GPR64:$Rt,
              (sextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;

// load sign-extended word
defm LDURSW
    : LoadUnscaled<0b10, 0, 0b10, GPR64, "ldursw",
        [(set GPR64:$Rt,
              (sextloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
// Zero- and sign-extending aliases: the generic LDR* mnemonics with a
// fallback simm9 offset assemble as the corresponding LDUR* instruction.

// Zero-extending byte / half-word.
def : InstAlias<"ldrb $Rt, [$Rn, $offset]",
                (LDURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
def : InstAlias<"ldrh $Rt, [$Rn, $offset]",
                (LDURHHi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;

// Sign-extending byte (32- and 64-bit destination).
def : InstAlias<"ldrsb $Rt, [$Rn, $offset]",
                (LDURSBWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
def : InstAlias<"ldrsb $Rt, [$Rn, $offset]",
                (LDURSBXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;

// Sign-extending half-word (32- and 64-bit destination).
def : InstAlias<"ldrsh $Rt, [$Rn, $offset]",
                (LDURSHWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
def : InstAlias<"ldrsh $Rt, [$Rn, $offset]",
                (LDURSHXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;

// Sign-extending word.
def : InstAlias<"ldrsw $Rt, [$Rn, $offset]",
                (LDURSWi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
// Prefetch with an unscaled simm9 offset; the address uses the 64-bit
// unscaled mode.
defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum",
    [(AArch64Prefetch imm:$Rt,
                      (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>;
//---
// (unscaled immediate, unprivileged)
// Unprivileged loads (LDTR*). None of these carries a selection pattern; they
// pass only encoding fields, a GPR class and the mnemonic.
defm LDTRX : LoadUnprivileged<0b11, 0, 0b01, GPR64, "ldtr">;
defm LDTRW : LoadUnprivileged<0b10, 0, 0b01, GPR32, "ldtr">;
defm LDTRH : LoadUnprivileged<0b01, 0, 0b01, GPR32, "ldtrh">;
defm LDTRB : LoadUnprivileged<0b00, 0, 0b01, GPR32, "ldtrb">;
// load sign-extended half-word
defm LDTRSHW : LoadUnprivileged<0b01, 0, 0b11, GPR32, "ldtrsh">;
defm LDTRSHX : LoadUnprivileged<0b01, 0, 0b10, GPR64, "ldtrsh">;
// load sign-extended byte
defm LDTRSBW : LoadUnprivileged<0b00, 0, 0b11, GPR32, "ldtrsb">;
defm LDTRSBX : LoadUnprivileged<0b00, 0, 0b10, GPR64, "ldtrsb">;
// load sign-extended word
defm LDTRSW : LoadUnprivileged<0b10, 0, 0b10, GPR64, "ldtrsw">;
//---
// (immediate pre-indexed)
// Pre-indexed loads. The integer forms use the GPR32z/GPR64z classes and the
// FP/SIMD forms use FPR8Op..FPR128Op.
def LDRWpre : LoadPreIdx<0b10, 0, 0b01, GPR32z, "ldr">;
def LDRXpre : LoadPreIdx<0b11, 0, 0b01, GPR64z, "ldr">;
def LDRBpre : LoadPreIdx<0b00, 1, 0b01, FPR8Op, "ldr">;
def LDRHpre : LoadPreIdx<0b01, 1, 0b01, FPR16Op, "ldr">;
def LDRSpre : LoadPreIdx<0b10, 1, 0b01, FPR32Op, "ldr">;
def LDRDpre : LoadPreIdx<0b11, 1, 0b01, FPR64Op, "ldr">;
def LDRQpre : LoadPreIdx<0b00, 1, 0b11, FPR128Op, "ldr">;
// load sign-extended half-word
def LDRSHWpre : LoadPreIdx<0b01, 0, 0b11, GPR32z, "ldrsh">;
def LDRSHXpre : LoadPreIdx<0b01, 0, 0b10, GPR64z, "ldrsh">;
// load sign-extended byte
def LDRSBWpre : LoadPreIdx<0b00, 0, 0b11, GPR32z, "ldrsb">;
def LDRSBXpre : LoadPreIdx<0b00, 0, 0b10, GPR64z, "ldrsb">;
// load zero-extended byte and half-word
def LDRBBpre : LoadPreIdx<0b00, 0, 0b01, GPR32z, "ldrb">;
def LDRHHpre : LoadPreIdx<0b01, 0, 0b01, GPR32z, "ldrh">;
// load sign-extended word
def LDRSWpre : LoadPreIdx<0b10, 0, 0b10, GPR64z, "ldrsw">;
//---
// (immediate post-indexed)
// Post-indexed loads. These mirror the pre-indexed definitions one-for-one,
// using LoadPostIdx with the same encoding fields and register classes.
def LDRWpost : LoadPostIdx<0b10, 0, 0b01, GPR32z, "ldr">;
def LDRXpost : LoadPostIdx<0b11, 0, 0b01, GPR64z, "ldr">;
def LDRBpost : LoadPostIdx<0b00, 1, 0b01, FPR8Op, "ldr">;
def LDRHpost : LoadPostIdx<0b01, 1, 0b01, FPR16Op, "ldr">;
def LDRSpost : LoadPostIdx<0b10, 1, 0b01, FPR32Op, "ldr">;
def LDRDpost : LoadPostIdx<0b11, 1, 0b01, FPR64Op, "ldr">;
def LDRQpost : LoadPostIdx<0b00, 1, 0b11, FPR128Op, "ldr">;
// load sign-extended half-word
def LDRSHWpost : LoadPostIdx<0b01, 0, 0b11, GPR32z, "ldrsh">;
def LDRSHXpost : LoadPostIdx<0b01, 0, 0b10, GPR64z, "ldrsh">;
// load sign-extended byte
def LDRSBWpost : LoadPostIdx<0b00, 0, 0b11, GPR32z, "ldrsb">;
def LDRSBXpost : LoadPostIdx<0b00, 0, 0b10, GPR64z, "ldrsb">;
// load zero-extended byte and half-word
def LDRBBpost : LoadPostIdx<0b00, 0, 0b01, GPR32z, "ldrb">;
def LDRHHpost : LoadPostIdx<0b01, 0, 0b01, GPR32z, "ldrh">;
// load sign-extended word
def LDRSWpost : LoadPostIdx<0b10, 0, 0b10, GPR64z, "ldrsw">;
//===----------------------------------------------------------------------===//
// Store instructions.
//===----------------------------------------------------------------------===//
// Store-pair instructions. As with the load-pair definitions, W/X take
// GPR32z/GPR64z and S/D/Q take FPR32Op/FPR64Op/FPR128Op, and the immediate
// operand class (simm7s4/s8/s16) varies with the access size.
// Pair (indexed, offset)
// FIXME: Use dedicated range-checked addressing mode operand here.
defm STPW : StorePairOffset<0b00, 0, GPR32z, simm7s4, "stp">;
defm STPX : StorePairOffset<0b10, 0, GPR64z, simm7s8, "stp">;
defm STPS : StorePairOffset<0b00, 1, FPR32Op, simm7s4, "stp">;
defm STPD : StorePairOffset<0b01, 1, FPR64Op, simm7s8, "stp">;
defm STPQ : StorePairOffset<0b10, 1, FPR128Op, simm7s16, "stp">;
// Pair (pre-indexed)
def STPWpre : StorePairPreIdx<0b00, 0, GPR32z, simm7s4, "stp">;
def STPXpre : StorePairPreIdx<0b10, 0, GPR64z, simm7s8, "stp">;
def STPSpre : StorePairPreIdx<0b00, 1, FPR32Op, simm7s4, "stp">;
def STPDpre : StorePairPreIdx<0b01, 1, FPR64Op, simm7s8, "stp">;
def STPQpre : StorePairPreIdx<0b10, 1, FPR128Op, simm7s16, "stp">;
// Pair (pre-indexed)
def STPWpost : StorePairPostIdx<0b00, 0, GPR32z, simm7s4, "stp">;
def STPXpost : StorePairPostIdx<0b10, 0, GPR64z, simm7s8, "stp">;
def STPSpost : StorePairPostIdx<0b00, 1, FPR32Op, simm7s4, "stp">;
def STPDpost : StorePairPostIdx<0b01, 1, FPR64Op, simm7s8, "stp">;
def STPQpost : StorePairPostIdx<0b10, 1, FPR128Op, simm7s16, "stp">;
// Pair (no allocate)
defm STNPW : StorePairNoAlloc<0b00, 0, GPR32z, simm7s4, "stnp">;
defm STNPX : StorePairNoAlloc<0b10, 0, GPR64z, simm7s8, "stnp">;
defm STNPS : StorePairNoAlloc<0b00, 1, FPR32Op, simm7s4, "stnp">;
defm STNPD : StorePairNoAlloc<0b01, 1, FPR64Op, simm7s8, "stnp">;
defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128Op, simm7s16, "stnp">;
//---
// (Register offset)
// Each defm below yields a W-register-offset and an X-register-offset record
// with the suffixes roW / roX (for example STRQroW and STRQroX, which the
// explicit f128 patterns in this file refer to). The last two template
// arguments are the stored value type and the store operator to match.
// Integer
defm STRBB : Store8RO< 0b00, 0, 0b00, GPR32, "strb", i32, truncstorei8>;
defm STRHH : Store16RO<0b01, 0, 0b00, GPR32, "strh", i32, truncstorei16>;
defm STRW : Store32RO<0b10, 0, 0b00, GPR32, "str", i32, store>;
defm STRX : Store64RO<0b11, 0, 0b00, GPR64, "str", i64, store>;
// Floating-point
defm STRB : Store8RO< 0b00, 1, 0b00, FPR8Op, "str", untyped, store>;
defm STRH : Store16RO<0b01, 1, 0b00, FPR16Op, "str", f16, store>;
defm STRS : Store32RO<0b10, 1, 0b00, FPR32Op, "str", f32, store>;
defm STRD : Store64RO<0b11, 1, 0b00, FPR64Op, "str", f64, store>;
defm STRQ : Store128RO<0b00, 1, 0b10, FPR128Op, "str", f128, store>;
// Register-offset stores of an f128 value through STRQroW / STRQroX.
// The patterns are only active under the UseSTRQro predicate and carry
// AddedComplexity = 10.
let Predicates = [UseSTRQro], AddedComplexity = 10 in {
  // W-register offset form.
  def : Pat<(store (f128 FPR128:$Rt),
                   (ro_Windexed128 GPR64sp:$Rn, GPR32:$Rm,
                                   ro_Wextend128:$extend)),
            (STRQroW FPR128:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend128:$extend)>;
  // X-register offset form. The result uses ro_Xextend128 so that the extend
  // operand is typed the same way as in the source pattern above it.
  def : Pat<(store (f128 FPR128:$Rt),
                   (ro_Xindexed128 GPR64sp:$Rn, GPR64:$Rm,
                                   ro_Xextend128:$extend)),
            (STRQroX FPR128:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend128:$extend)>;
}
// Truncating stores of a 64-bit GPR value with register-offset addressing.
// The value is narrowed with EXTRACT_SUBREG ... sub_32 and handed to the
// supplied store instruction.
//   ro      - addressing-mode bundle providing Wpat/Wext and Xpat/Xext.
//   storeop - truncating store operator to match.
//   STRW    - instruction used with the W-register offset form.
//   STRX    - instruction used with the X-register offset form.
multiclass TruncStoreFrom64ROPat<ROAddrMode ro, SDPatternOperator storeop,
Instruction STRW, Instruction STRX> {
// W-register offset.
def : Pat<(storeop GPR64:$Rt,
(ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)),
(STRW (EXTRACT_SUBREG GPR64:$Rt, sub_32),
GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
// X-register offset.
def : Pat<(storeop GPR64:$Rt,
(ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)),
(STRX (EXTRACT_SUBREG GPR64:$Rt, sub_32),
GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
}
let AddedComplexity = 10 in {
// truncstore i64
defm : TruncStoreFrom64ROPat<ro8, truncstorei8, STRBBroW, STRBBroX>;
defm : TruncStoreFrom64ROPat<ro16, truncstorei16, STRHHroW, STRHHroX>;
defm : TruncStoreFrom64ROPat<ro32, truncstorei32, STRWroW, STRWroX>;
}
// Plain (non-truncating) store of a vector-typed value held in an FP/SIMD
// register, using register-offset addressing.
//   ro    - addressing-mode bundle providing Wpat/Wext and Xpat/Xext.
//   VecTy - value type of the stored register.
//   FPR   - register class holding the value.
//   STRW  - instruction used with the W-register offset form.
//   STRX  - instruction used with the X-register offset form.
multiclass VecROStorePat<ROAddrMode ro, ValueType VecTy, RegisterClass FPR,
Instruction STRW, Instruction STRX> {
def : Pat<(store (VecTy FPR:$Rt),
(ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)),
(STRW FPR:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
def : Pat<(store (VecTy FPR:$Rt),
(ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)),
(STRX FPR:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
}
let AddedComplexity = 10 in {
// Match all 64-bit-wide stores whose type is compatible with FPR64.
// The multi-element vector types are restricted to little-endian (IsLE).
let Predicates = [IsLE] in {
// We must use ST1 to store vectors in big-endian.
defm : VecROStorePat<ro64, v2i32, FPR64, STRDroW, STRDroX>;
defm : VecROStorePat<ro64, v2f32, FPR64, STRDroW, STRDroX>;
defm : VecROStorePat<ro64, v4i16, FPR64, STRDroW, STRDroX>;
defm : VecROStorePat<ro64, v8i8, FPR64, STRDroW, STRDroX>;
defm : VecROStorePat<ro64, v4f16, FPR64, STRDroW, STRDroX>;
}
// Single-element vectors are matched without the IsLE restriction.
defm : VecROStorePat<ro64, v1i64, FPR64, STRDroW, STRDroX>;
defm : VecROStorePat<ro64, v1f64, FPR64, STRDroW, STRDroX>;
// Match all 128-bit-wide stores whose type is compatible with FPR128.
// These additionally require the UseSTRQro predicate.
let Predicates = [IsLE, UseSTRQro] in {
// We must use ST1 to store vectors in big-endian.
defm : VecROStorePat<ro128, v2i64, FPR128, STRQroW, STRQroX>;
defm : VecROStorePat<ro128, v2f64, FPR128, STRQroW, STRQroX>;
defm : VecROStorePat<ro128, v4i32, FPR128, STRQroW, STRQroX>;
defm : VecROStorePat<ro128, v4f32, FPR128, STRQroW, STRQroX>;
defm : VecROStorePat<ro128, v8i16, FPR128, STRQroW, STRQroX>;
defm : VecROStorePat<ro128, v16i8, FPR128, STRQroW, STRQroX>;
defm : VecROStorePat<ro128, v8f16, FPR128, STRQroW, STRQroX>;
}
} // AddedComplexity = 10
// Match stores from lane 0 to the appropriate subreg's store.
// A store of (vector_extract vec, 0) is rewritten into a scalar FP/SIMD store
// of the sub-register SubRegIdx of the 128-bit vector register.
//   ro        - register-offset addressing-mode bundle (Wpat/Wext, Xpat/Xext).
//   storeop   - store operator to match (plain or truncating).
//   VecTy     - type of the source vector.
//   STy       - type of the extracted scalar.
//   SubRegIdx - sub-register index passed to EXTRACT_SUBREG.
//   STRW/STRX - instructions for the W- and X-register offset forms.
multiclass VecROStoreLane0Pat<ROAddrMode ro, SDPatternOperator storeop,
ValueType VecTy, ValueType STy,
SubRegIndex SubRegIdx,
Instruction STRW, Instruction STRX> {
def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)),
(ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)),
(STRW (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx),
GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)),
(ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)),
(STRX (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx),
GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
}
// AddedComplexity = 19 is higher than the 10 used for the whole-register
// store patterns in this file.
let AddedComplexity = 19 in {
defm : VecROStoreLane0Pat<ro16, truncstorei16, v8i16, i32, hsub, STRHroW, STRHroX>;
defm : VecROStoreLane0Pat<ro16, store, v8f16, f16, hsub, STRHroW, STRHroX>;
defm : VecROStoreLane0Pat<ro32, store, v4i32, i32, ssub, STRSroW, STRSroX>;
defm : VecROStoreLane0Pat<ro32, store, v4f32, f32, ssub, STRSroW, STRSroX>;
defm : VecROStoreLane0Pat<ro64, store, v2i64, i64, dsub, STRDroW, STRDroX>;
defm : VecROStoreLane0Pat<ro64, store, v2f64, f64, dsub, STRDroW, STRDroX>;
}
//---
// (unsigned immediate)
// Stores addressed through am_indexedN with a uimm12sN offset operand. Each
// defm carries its own selection pattern as the last template argument.
// STRQ passes an empty pattern list; 128-bit stores are instead matched by the
// explicit STRQui Pat records elsewhere in this file.
defm STRX : StoreUIz<0b11, 0, 0b00, GPR64z, uimm12s8, "str",
[(store GPR64z:$Rt,
(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>;
defm STRW : StoreUIz<0b10, 0, 0b00, GPR32z, uimm12s4, "str",
[(store GPR32z:$Rt,
(am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>;
defm STRB : StoreUI<0b00, 1, 0b00, FPR8Op, uimm12s1, "str",
[(store FPR8Op:$Rt,
(am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))]>;
defm STRH : StoreUI<0b01, 1, 0b00, FPR16Op, uimm12s2, "str",
[(store (f16 FPR16Op:$Rt),
(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))]>;
defm STRS : StoreUI<0b10, 1, 0b00, FPR32Op, uimm12s4, "str",
[(store (f32 FPR32Op:$Rt),
(am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>;
defm STRD : StoreUI<0b11, 1, 0b00, FPR64Op, uimm12s8, "str",
[(store (f64 FPR64Op:$Rt),
(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>;
defm STRQ : StoreUI<0b00, 1, 0b10, FPR128Op, uimm12s16, "str", []>;
// Truncating integer stores from a 32-bit register.
defm STRHH : StoreUIz<0b01, 0, 0b00, GPR32z, uimm12s2, "strh",
[(truncstorei16 GPR32z:$Rt,
(am_indexed16 GPR64sp:$Rn,
uimm12s2:$offset))]>;
defm STRBB : StoreUIz<0b00, 0, 0b00, GPR32z, uimm12s1, "strb",
[(truncstorei8 GPR32z:$Rt,
(am_indexed8 GPR64sp:$Rn,
uimm12s1:$offset))]>;
// Additional selection patterns for the unsigned-immediate store forms:
// vector types held in FPR64/FPR128, f128, and truncating stores of i64.
let AddedComplexity = 10 in {
// Match all 64-bit-wide stores whose type is compatible with FPR64.
// Single-element vectors are matched without the IsLE restriction.
def : Pat<(store (v1i64 FPR64:$Rt),
(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
(STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
def : Pat<(store (v1f64 FPR64:$Rt),
(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
(STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
let Predicates = [IsLE] in {
// We must use ST1 to store vectors in big-endian.
def : Pat<(store (v2f32 FPR64:$Rt),
(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
(STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
def : Pat<(store (v8i8 FPR64:$Rt),
(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
(STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
def : Pat<(store (v4i16 FPR64:$Rt),
(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
(STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
def : Pat<(store (v2i32 FPR64:$Rt),
(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
(STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
def : Pat<(store (v4f16 FPR64:$Rt),
(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
(STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
}
// Match all 128-bit-wide stores whose type is compatible with FPR128.
// f128 is matched without the IsLE restriction.
def : Pat<(store (f128 FPR128:$Rt),
(am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
(STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
let Predicates = [IsLE] in {
// We must use ST1 to store vectors in big-endian.
def : Pat<(store (v4f32 FPR128:$Rt),
(am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
(STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
def : Pat<(store (v2f64 FPR128:$Rt),
(am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
(STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
def : Pat<(store (v16i8 FPR128:$Rt),
(am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
(STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
def : Pat<(store (v8i16 FPR128:$Rt),
(am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
(STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
def : Pat<(store (v4i32 FPR128:$Rt),
(am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
(STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
def : Pat<(store (v2i64 FPR128:$Rt),
(am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
(STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
def : Pat<(store (v8f16 FPR128:$Rt),
(am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
(STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
}
// truncstore i64
// The 64-bit source is narrowed with EXTRACT_SUBREG ... sub_32 and stored
// with the 32-bit-register store of the matching width.
def : Pat<(truncstorei32 GPR64:$Rt,
(am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)),
(STRWui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s4:$offset)>;
def : Pat<(truncstorei16 GPR64:$Rt,
(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)),
(STRHHui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s2:$offset)>;
def : Pat<(truncstorei8 GPR64:$Rt, (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)),
(STRBBui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s1:$offset)>;
} // AddedComplexity = 10
// Match stores from lane 0 to the appropriate subreg's store.
// A store of (vector_extract vec, 0) becomes a scalar FP/SIMD store of the
// sub-register SubRegIdx of the 128-bit vector register.
//   UIAddrMode - addressing-mode operand wrapped around base and offset.
//   storeop    - store operator to match (plain or truncating).
//   VTy        - type of the source vector.
//   STy        - type of the extracted scalar.
//   SubRegIdx  - sub-register index passed to EXTRACT_SUBREG.
//   IndexType  - operand type of the immediate offset.
//   STR        - store instruction to emit.
multiclass VecStoreLane0Pat<Operand UIAddrMode, SDPatternOperator storeop,
ValueType VTy, ValueType STy,
SubRegIndex SubRegIdx, Operand IndexType,
Instruction STR> {
def : Pat<(storeop (STy (vector_extract (VTy VecListOne128:$Vt), 0)),
(UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
(STR (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx),
GPR64sp:$Rn, IndexType:$offset)>;
}
// Unsigned-immediate instantiations, one per element width / type.
let AddedComplexity = 19 in {
defm : VecStoreLane0Pat<am_indexed16, truncstorei16, v8i16, i32, hsub, uimm12s2, STRHui>;
defm : VecStoreLane0Pat<am_indexed16, store, v8f16, f16, hsub, uimm12s2, STRHui>;
defm : VecStoreLane0Pat<am_indexed32, store, v4i32, i32, ssub, uimm12s4, STRSui>;
defm : VecStoreLane0Pat<am_indexed32, store, v4f32, f32, ssub, uimm12s4, STRSui>;
defm : VecStoreLane0Pat<am_indexed64, store, v2i64, i64, dsub, uimm12s8, STRDui>;
defm : VecStoreLane0Pat<am_indexed64, store, v2f64, f64, dsub, uimm12s8, STRDui>;
}
//---
// (unscaled immediate)
// "stur"-family stores addressed through am_unscaledN with a simm9 offset
// operand. Each defm carries its selection pattern as the last template
// argument; the pattern's addressing mode matches the width being stored.
defm STURX : StoreUnscaled<0b11, 0, 0b00, GPR64z, "stur",
[(store GPR64z:$Rt,
(am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>;
defm STURW : StoreUnscaled<0b10, 0, 0b00, GPR32z, "stur",
[(store GPR32z:$Rt,
(am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>;
defm STURB : StoreUnscaled<0b00, 1, 0b00, FPR8Op, "stur",
[(store FPR8Op:$Rt,
(am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>;
defm STURH : StoreUnscaled<0b01, 1, 0b00, FPR16Op, "stur",
[(store (f16 FPR16Op:$Rt),
(am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>;
defm STURS : StoreUnscaled<0b10, 1, 0b00, FPR32Op, "stur",
[(store (f32 FPR32Op:$Rt),
(am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>;
defm STURD : StoreUnscaled<0b11, 1, 0b00, FPR64Op, "stur",
[(store (f64 FPR64Op:$Rt),
(am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>;
defm STURQ : StoreUnscaled<0b00, 1, 0b10, FPR128Op, "stur",
[(store (f128 FPR128Op:$Rt),
(am_unscaled128 GPR64sp:$Rn, simm9:$offset))]>;
// Truncating integer stores from a 32-bit register.
defm STURHH : StoreUnscaled<0b01, 0, 0b00, GPR32z, "sturh",
[(truncstorei16 GPR32z:$Rt,
(am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>;
defm STURBB : StoreUnscaled<0b00, 0, 0b00, GPR32z, "sturb",
[(truncstorei8 GPR32z:$Rt,
(am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>;
// Armv8.4 Weaker Release Consistency enhancements
// LDAPR & STLR with Immediate Offset instructions
// All records in this group are gated on the HasRCPC_IMMO predicate. The W/X
// suffix on the sign-extending loads tracks the destination register class
// (GPR32 / GPR64).
let Predicates = [HasRCPC_IMMO] in {
// Stores.
defm STLURB : BaseStoreUnscaleV84<"stlurb", 0b00, 0b00, GPR32>;
defm STLURH : BaseStoreUnscaleV84<"stlurh", 0b01, 0b00, GPR32>;
defm STLURW : BaseStoreUnscaleV84<"stlur", 0b10, 0b00, GPR32>;
defm STLURX : BaseStoreUnscaleV84<"stlur", 0b11, 0b00, GPR64>;
// Loads.
defm LDAPURB : BaseLoadUnscaleV84<"ldapurb", 0b00, 0b01, GPR32>;
defm LDAPURSBW : BaseLoadUnscaleV84<"ldapursb", 0b00, 0b11, GPR32>;
defm LDAPURSBX : BaseLoadUnscaleV84<"ldapursb", 0b00, 0b10, GPR64>;
defm LDAPURH : BaseLoadUnscaleV84<"ldapurh", 0b01, 0b01, GPR32>;
defm LDAPURSHW : BaseLoadUnscaleV84<"ldapursh", 0b01, 0b11, GPR32>;
defm LDAPURSHX : BaseLoadUnscaleV84<"ldapursh", 0b01, 0b10, GPR64>;
defm LDAPUR : BaseLoadUnscaleV84<"ldapur", 0b10, 0b01, GPR32>;
defm LDAPURSW : BaseLoadUnscaleV84<"ldapursw", 0b10, 0b10, GPR64>;
defm LDAPURX : BaseLoadUnscaleV84<"ldapur", 0b11, 0b01, GPR64>;
}
// Additional selection patterns for the unscaled-immediate (STUR) store forms
// covering vector types held in FPR64/FPR128 and f128.

// Match all 64-bit-wide stores whose type is compatible with FPR64.
// Single-element vectors are matched without the IsLE restriction and at
// default complexity.
def : Pat<(store (v1f64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
          (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
def : Pat<(store (v1i64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
          (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;

let AddedComplexity = 10 in {

let Predicates = [IsLE] in {
  // We must use ST1 to store vectors in big-endian.
  def : Pat<(store (v2f32 FPR64:$Rt),
                   (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
            (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
  def : Pat<(store (v8i8 FPR64:$Rt),
                   (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
            (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
  def : Pat<(store (v4i16 FPR64:$Rt),
                   (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
            (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
  def : Pat<(store (v2i32 FPR64:$Rt),
                   (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
            (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
  def : Pat<(store (v4f16 FPR64:$Rt),
                   (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
            (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
}

// Match all 128-bit-wide stores whose type is compatible with FPR128.
// f128 is matched without the IsLE restriction.
def : Pat<(store (f128 FPR128:$Rt), (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
          (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;

let Predicates = [IsLE] in {
  // We must use ST1 to store vectors in big-endian.
  // One pattern per 128-bit vector type; each type is listed exactly once.
  def : Pat<(store (v4f32 FPR128:$Rt),
                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
  def : Pat<(store (v2f64 FPR128:$Rt),
                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
  def : Pat<(store (v16i8 FPR128:$Rt),
                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
  def : Pat<(store (v8i16 FPR128:$Rt),
                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
  def : Pat<(store (v4i32 FPR128:$Rt),
                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
  def : Pat<(store (v2i64 FPR128:$Rt),
                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
  def : Pat<(store (v8f16 FPR128:$Rt),
                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
}

} // AddedComplexity = 10
// unscaled i64 truncating stores
// The 64-bit source is narrowed with EXTRACT_SUBREG ... sub_32 and stored
// with the 32-bit-register STUR instruction of the matching width.
def : Pat<(truncstorei32 GPR64:$Rt, (am_unscaled32 GPR64sp:$Rn, simm9:$offset)),
(STURWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
(STURHHi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
(STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
// Match stores from lane 0 to the appropriate subreg's store.
// Unscaled-immediate wrapper around VecStoreLane0Pat: it fixes the addressing
// mode to am_unscaled128 and the offset operand to simm9, and forwards the
// remaining arguments unchanged.
// NOTE(review): am_unscaled128 is used for every element size here, whereas
// the other unscaled store patterns in this file use the size-matching
// am_unscaledN (8/16/32/64) -- confirm this is intended.
multiclass VecStoreULane0Pat<SDPatternOperator StoreOp,
ValueType VTy, ValueType STy,
SubRegIndex SubRegIdx, Instruction STR> {
defm : VecStoreLane0Pat<am_unscaled128, StoreOp, VTy, STy, SubRegIdx, simm9, STR>;
}
let AddedComplexity = 19 in {
defm : VecStoreULane0Pat<truncstorei16, v8i16, i32, hsub, STURHi>;
defm : VecStoreULane0Pat<store, v8f16, f16, hsub, STURHi>;
defm : VecStoreULane0Pat<store, v4i32, i32, ssub, STURSi>;
defm : VecStoreULane0Pat<store, v4f32, f32, ssub, STURSi>;
defm : VecStoreULane0Pat<store, v2i64, i64, dsub, STURDi>;
defm : VecStoreULane0Pat<store, v2f64, f64, dsub, STURDi>;
}
//---
// STR mnemonics fall back to STUR for negative or unaligned offsets.
// One alias per STUR record; the offset operand is the simm9_offset_fbN
// variant whose N matches the width being stored. The trailing 0 is the
// InstAlias emit argument (presumably: parse-only, never used when printing
// -- confirm against the InstAlias class).
def : InstAlias<"str $Rt, [$Rn, $offset]",
(STURXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
def : InstAlias<"str $Rt, [$Rn, $offset]",
(STURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
def : InstAlias<"str $Rt, [$Rn, $offset]",
(STURBi FPR8Op:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
def : InstAlias<"str $Rt, [$Rn, $offset]",
(STURHi FPR16Op:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
def : InstAlias<"str $Rt, [$Rn, $offset]",
(STURSi FPR32Op:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
def : InstAlias<"str $Rt, [$Rn, $offset]",
(STURDi FPR64Op:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
def : InstAlias<"str $Rt, [$Rn, $offset]",
(STURQi FPR128Op:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>;
def : InstAlias<"strb $Rt, [$Rn, $offset]",
(STURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
def : InstAlias<"strh $Rt, [$Rn, $offset]",
(STURHHi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
//---
// (unscaled immediate, unprivileged)
// "sttr"-family stores; only general-purpose register classes are
// instantiated (GPR32 for the word, half-word and byte forms, GPR64 for the
// double-word form).
defm STTRW : StoreUnprivileged<0b10, 0, 0b00, GPR32, "sttr">;
defm STTRX : StoreUnprivileged<0b11, 0, 0b00, GPR64, "sttr">;
defm STTRH : StoreUnprivileged<0b01, 0, 0b00, GPR32, "sttrh">;
defm STTRB : StoreUnprivileged<0b00, 0, 0b00, GPR32, "sttrb">;
//---
// (immediate pre-indexed)
// The last two template arguments are the pre-indexed store operator the
// record matches and the value type of the stored register.
def STRWpre : StorePreIdx<0b10, 0, 0b00, GPR32z, "str", pre_store, i32>;
def STRXpre : StorePreIdx<0b11, 0, 0b00, GPR64z, "str", pre_store, i64>;
def STRBpre : StorePreIdx<0b00, 1, 0b00, FPR8Op, "str", pre_store, untyped>;
def STRHpre : StorePreIdx<0b01, 1, 0b00, FPR16Op, "str", pre_store, f16>;
def STRSpre : StorePreIdx<0b10, 1, 0b00, FPR32Op, "str", pre_store, f32>;
def STRDpre : StorePreIdx<0b11, 1, 0b00, FPR64Op, "str", pre_store, f64>;
def STRQpre : StorePreIdx<0b00, 1, 0b10, FPR128Op, "str", pre_store, f128>;
def STRBBpre : StorePreIdx<0b00, 0, 0b00, GPR32z, "strb", pre_truncsti8, i32>;
def STRHHpre : StorePreIdx<0b01, 0, 0b00, GPR32z, "strh", pre_truncsti16, i32>;
// truncstore i64
// The 64-bit source is narrowed with EXTRACT_SUBREG ... sub_32 first.
def : Pat<(pre_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
(STRWpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
simm9:$off)>;
def : Pat<(pre_truncsti16 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
(STRHHpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
simm9:$off)>;
def : Pat<(pre_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
(STRBBpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
simm9:$off)>;
// Pre-indexed stores of 64-bit vector types held in FPR64 use STRDpre.
// NOTE(review): unlike the offset-addressed vector store patterns in this
// file, these are not restricted to IsLE -- confirm this is intended.
def : Pat<(pre_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(pre_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(pre_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(pre_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(pre_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(pre_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(pre_store (v4f16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
// Pre-indexed stores of 128-bit vector types held in FPR128 use STRQpre.
def : Pat<(pre_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(pre_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(pre_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(pre_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(pre_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(pre_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(pre_store (v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
//---
// (immediate post-indexed)
// The last two template arguments are the post-indexed store operator the
// record matches and the value type of the stored register.
def STRWpost : StorePostIdx<0b10, 0, 0b00, GPR32z, "str", post_store, i32>;
def STRXpost : StorePostIdx<0b11, 0, 0b00, GPR64z, "str", post_store, i64>;
def STRBpost : StorePostIdx<0b00, 1, 0b00, FPR8Op, "str", post_store, untyped>;
def STRHpost : StorePostIdx<0b01, 1, 0b00, FPR16Op, "str", post_store, f16>;
def STRSpost : StorePostIdx<0b10, 1, 0b00, FPR32Op, "str", post_store, f32>;
def STRDpost : StorePostIdx<0b11, 1, 0b00, FPR64Op, "str", post_store, f64>;
def STRQpost : StorePostIdx<0b00, 1, 0b10, FPR128Op, "str", post_store, f128>;
def STRBBpost : StorePostIdx<0b00, 0, 0b00, GPR32z, "strb", post_truncsti8, i32>;
def STRHHpost : StorePostIdx<0b01, 0, 0b00, GPR32z, "strh", post_truncsti16, i32>;
// truncstore i64
// The 64-bit source is narrowed with EXTRACT_SUBREG ... sub_32 first.
def : Pat<(post_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
(STRWpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
simm9:$off)>;
def : Pat<(post_truncsti16 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
(STRHHpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
simm9:$off)>;
def : Pat<(post_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
(STRBBpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
simm9:$off)>;
// Post-indexed stores of 64-bit vector types held in FPR64 use STRDpost.
// NOTE(review): unlike the offset-addressed vector store patterns in this
// file, these are not restricted to IsLE -- confirm this is intended.
def : Pat<(post_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v4f16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
// Post-indexed stores of 128-bit vector types held in FPR128 use STRQpost.
def : Pat<(post_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
//===----------------------------------------------------------------------===//
// Load/store exclusive instructions.
//===----------------------------------------------------------------------===//
// Records are grouped by the class they instantiate; within each group the
// W/X/B/H suffix follows the mnemonic and register class shown on the line.
// The five leading template arguments are forwarded unchanged to the class,
// which is defined elsewhere.

// Load-acquire.
def LDARW : LoadAcquire <0b10, 1, 1, 0, 1, GPR32, "ldar">;
def LDARX : LoadAcquire <0b11, 1, 1, 0, 1, GPR64, "ldar">;
def LDARB : LoadAcquire <0b00, 1, 1, 0, 1, GPR32, "ldarb">;
def LDARH : LoadAcquire <0b01, 1, 1, 0, 1, GPR32, "ldarh">;
// Load-acquire exclusive.
def LDAXRW : LoadExclusive <0b10, 0, 1, 0, 1, GPR32, "ldaxr">;
def LDAXRX : LoadExclusive <0b11, 0, 1, 0, 1, GPR64, "ldaxr">;
def LDAXRB : LoadExclusive <0b00, 0, 1, 0, 1, GPR32, "ldaxrb">;
def LDAXRH : LoadExclusive <0b01, 0, 1, 0, 1, GPR32, "ldaxrh">;
// Load exclusive.
def LDXRW : LoadExclusive <0b10, 0, 1, 0, 0, GPR32, "ldxr">;
def LDXRX : LoadExclusive <0b11, 0, 1, 0, 0, GPR64, "ldxr">;
def LDXRB : LoadExclusive <0b00, 0, 1, 0, 0, GPR32, "ldxrb">;
def LDXRH : LoadExclusive <0b01, 0, 1, 0, 0, GPR32, "ldxrh">;
// Store-release.
def STLRW : StoreRelease <0b10, 1, 0, 0, 1, GPR32, "stlr">;
def STLRX : StoreRelease <0b11, 1, 0, 0, 1, GPR64, "stlr">;
def STLRB : StoreRelease <0b00, 1, 0, 0, 1, GPR32, "stlrb">;
def STLRH : StoreRelease <0b01, 1, 0, 0, 1, GPR32, "stlrh">;
// Store-release exclusive.
def STLXRW : StoreExclusive<0b10, 0, 0, 0, 1, GPR32, "stlxr">;
def STLXRX : StoreExclusive<0b11, 0, 0, 0, 1, GPR64, "stlxr">;
def STLXRB : StoreExclusive<0b00, 0, 0, 0, 1, GPR32, "stlxrb">;
def STLXRH : StoreExclusive<0b01, 0, 0, 0, 1, GPR32, "stlxrh">;
// Store exclusive.
def STXRW : StoreExclusive<0b10, 0, 0, 0, 0, GPR32, "stxr">;
def STXRX : StoreExclusive<0b11, 0, 0, 0, 0, GPR64, "stxr">;
def STXRB : StoreExclusive<0b00, 0, 0, 0, 0, GPR32, "stxrb">;
def STXRH : StoreExclusive<0b01, 0, 0, 0, 0, GPR32, "stxrh">;
// Exclusive pair loads and stores.
def LDAXPW : LoadExclusivePair<0b10, 0, 1, 1, 1, GPR32, "ldaxp">;
def LDAXPX : LoadExclusivePair<0b11, 0, 1, 1, 1, GPR64, "ldaxp">;
def LDXPW : LoadExclusivePair<0b10, 0, 1, 1, 0, GPR32, "ldxp">;
def LDXPX : LoadExclusivePair<0b11, 0, 1, 1, 0, GPR64, "ldxp">;
def STLXPW : StoreExclusivePair<0b10, 0, 0, 1, 1, GPR32, "stlxp">;
def STLXPX : StoreExclusivePair<0b11, 0, 0, 1, 1, GPR64, "stlxp">;
def STXPW : StoreExclusivePair<0b10, 0, 0, 1, 0, GPR32, "stxp">;
def STXPX : StoreExclusivePair<0b11, 0, 0, 1, 0, GPR64, "stxp">;
// The Limited Order Region records below are gated on the HasLOR predicate.
let Predicates = [HasLOR] in {
// v8.1a "Limited Order Region" extension load-acquire instructions
def LDLARW : LoadAcquire <0b10, 1, 1, 0, 0, GPR32, "ldlar">;
def LDLARX : LoadAcquire <0b11, 1, 1, 0, 0, GPR64, "ldlar">;
def LDLARB : LoadAcquire <0b00, 1, 1, 0, 0, GPR32, "ldlarb">;
def LDLARH : LoadAcquire <0b01, 1, 1, 0, 0, GPR32, "ldlarh">;
// v8.1a "Limited Order Region" extension store-release instructions
def STLLRW : StoreRelease <0b10, 1, 0, 0, 0, GPR32, "stllr">;
def STLLRX : StoreRelease <0b11, 1, 0, 0, 0, GPR64, "stllr">;
def STLLRB : StoreRelease <0b00, 1, 0, 0, 0, GPR32, "stllrb">;
def STLLRH : StoreRelease <0b01, 1, 0, 0, 0, GPR32, "stllrh">;
}
//===----------------------------------------------------------------------===//
// Floating point to integer conversion instructions (unscaled and scaled).
//===----------------------------------------------------------------------===//
// Unscaled conversions. The last template argument is the operator each
// record matches: an AArch64 NEON intrinsic for the A/M/N/P variants, and the
// generic fp_to_sint / fp_to_uint nodes for FCVTZS / FCVTZU.
defm FCVTAS : FPToIntegerUnscaled<0b00, 0b100, "fcvtas", int_aarch64_neon_fcvtas>;
defm FCVTAU : FPToIntegerUnscaled<0b00, 0b101, "fcvtau", int_aarch64_neon_fcvtau>;
defm FCVTMS : FPToIntegerUnscaled<0b10, 0b000, "fcvtms", int_aarch64_neon_fcvtms>;
defm FCVTMU : FPToIntegerUnscaled<0b10, 0b001, "fcvtmu", int_aarch64_neon_fcvtmu>;
defm FCVTNS : FPToIntegerUnscaled<0b00, 0b000, "fcvtns", int_aarch64_neon_fcvtns>;
defm FCVTNU : FPToIntegerUnscaled<0b00, 0b001, "fcvtnu", int_aarch64_neon_fcvtnu>;
defm FCVTPS : FPToIntegerUnscaled<0b01, 0b000, "fcvtps", int_aarch64_neon_fcvtps>;
defm FCVTPU : FPToIntegerUnscaled<0b01, 0b001, "fcvtpu", int_aarch64_neon_fcvtpu>;
defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", fp_to_sint>;
defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", fp_to_uint>;
// Scaled (fixed-point) conversions share the FCVTZS / FCVTZU prefixes with the
// unscaled defms above; patterns in this file refer to the resulting records
// by suffix, e.g. FCVTZSUWSr (unscaled) and FCVTZSSWSri (scaled).
defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", fp_to_sint>;
defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", fp_to_uint>;
// Selection patterns that map a float-to-integer intrinsic onto the
// instruction family named by INST.
//   round - intrinsic to match.
//   INST  - record-name prefix; the suffixes U{W,X}{H,S,D}r select the
//           unscaled forms and S{W,X}{H,S,D}ri the scaled (fixed-point) forms.
// The f16 patterns are guarded by HasFullFP16, matching the guard used on the
// other f16 patterns in this file that select the same H-form conversion
// instructions (lround / lrint).
multiclass FPToIntegerIntPats<Intrinsic round, string INST> {
  // Unscaled: (round x).
  let Predicates = [HasFullFP16] in {
  def : Pat<(i32 (round f16:$Rn)), (!cast<Instruction>(INST # UWHr) $Rn)>;
  def : Pat<(i64 (round f16:$Rn)), (!cast<Instruction>(INST # UXHr) $Rn)>;
  }
  def : Pat<(i32 (round f32:$Rn)), (!cast<Instruction>(INST # UWSr) $Rn)>;
  def : Pat<(i64 (round f32:$Rn)), (!cast<Instruction>(INST # UXSr) $Rn)>;
  def : Pat<(i32 (round f64:$Rn)), (!cast<Instruction>(INST # UWDr) $Rn)>;
  def : Pat<(i64 (round f64:$Rn)), (!cast<Instruction>(INST # UXDr) $Rn)>;

  // Scaled: (round (fmul x, scale)) folds the multiply into the instruction's
  // $scale operand.
  let Predicates = [HasFullFP16] in {
  def : Pat<(i32 (round (fmul f16:$Rn, fixedpoint_f16_i32:$scale))),
            (!cast<Instruction>(INST # SWHri) $Rn, $scale)>;
  def : Pat<(i64 (round (fmul f16:$Rn, fixedpoint_f16_i64:$scale))),
            (!cast<Instruction>(INST # SXHri) $Rn, $scale)>;
  }
  def : Pat<(i32 (round (fmul f32:$Rn, fixedpoint_f32_i32:$scale))),
            (!cast<Instruction>(INST # SWSri) $Rn, $scale)>;
  def : Pat<(i64 (round (fmul f32:$Rn, fixedpoint_f32_i64:$scale))),
            (!cast<Instruction>(INST # SXSri) $Rn, $scale)>;
  def : Pat<(i32 (round (fmul f64:$Rn, fixedpoint_f64_i32:$scale))),
            (!cast<Instruction>(INST # SWDri) $Rn, $scale)>;
  def : Pat<(i64 (round (fmul f64:$Rn, fixedpoint_f64_i64:$scale))),
            (!cast<Instruction>(INST # SXDri) $Rn, $scale)>;
}

defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzs, "FCVTZS">;
defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzu, "FCVTZU">;
// Fold a rounding node followed by a float-to-integer conversion into a
// single conversion instruction.
//   to_int - conversion node to match (fp_to_sint or fp_to_uint below).
//   round  - rounding node applied to the source before conversion.
//   INST   - record-name prefix; the suffixes U{W,X}{S,D}r select the form
//            for each result-width / source-type combination.
// Only f32 and f64 sources are covered here.
multiclass FPToIntegerPats<SDNode to_int, SDNode round, string INST> {
def : Pat<(i32 (to_int (round f32:$Rn))),
(!cast<Instruction>(INST # UWSr) f32:$Rn)>;
def : Pat<(i64 (to_int (round f32:$Rn))),
(!cast<Instruction>(INST # UXSr) f32:$Rn)>;
def : Pat<(i32 (to_int (round f64:$Rn))),
(!cast<Instruction>(INST # UWDr) f64:$Rn)>;
def : Pat<(i64 (to_int (round f64:$Rn))),
(!cast<Instruction>(INST # UXDr) f64:$Rn)>;
}
// One signed and one unsigned instantiation per rounding node.
defm : FPToIntegerPats<fp_to_sint, fceil, "FCVTPS">;
defm : FPToIntegerPats<fp_to_uint, fceil, "FCVTPU">;
defm : FPToIntegerPats<fp_to_sint, ffloor, "FCVTMS">;
defm : FPToIntegerPats<fp_to_uint, ffloor, "FCVTMU">;
defm : FPToIntegerPats<fp_to_sint, ftrunc, "FCVTZS">;
defm : FPToIntegerPats<fp_to_uint, ftrunc, "FCVTZU">;
defm : FPToIntegerPats<fp_to_sint, fround, "FCVTAS">;
defm : FPToIntegerPats<fp_to_uint, fround, "FCVTAU">;
// lround / llround select the FCVTAS unscaled conversions directly.
// Half-precision sources are only matched under HasFullFP16.
let Predicates = [HasFullFP16] in {
  def : Pat<(i32 (lround f16:$Rn)), (FCVTASUWHr f16:$Rn)>;
  def : Pat<(i64 (lround f16:$Rn)), (FCVTASUXHr f16:$Rn)>;
  def : Pat<(i64 (llround f16:$Rn)), (FCVTASUXHr f16:$Rn)>;
}
// Single- and double-precision sources; i32 results first, then i64.
def : Pat<(i32 (lround f32:$Rn)), (FCVTASUWSr f32:$Rn)>;
def : Pat<(i32 (lround f64:$Rn)), (FCVTASUWDr f64:$Rn)>;
def : Pat<(i64 (lround f32:$Rn)), (FCVTASUXSr f32:$Rn)>;
def : Pat<(i64 (lround f64:$Rn)), (FCVTASUXDr f64:$Rn)>;
def : Pat<(i64 (llround f32:$Rn)), (FCVTASUXSr f32:$Rn)>;
def : Pat<(i64 (llround f64:$Rn)), (FCVTASUXDr f64:$Rn)>;
//===----------------------------------------------------------------------===//
// Scaled integer to floating point conversion instructions.
//===----------------------------------------------------------------------===//
// Signed and unsigned integer-to-FP conversions, matched against sint_to_fp
// and uint_to_fp respectively.
defm SCVTF : IntegerToFP<0, "scvtf", sint_to_fp>;
defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>;
//===----------------------------------------------------------------------===//
// Unscaled integer to floating point conversion instruction.
//===----------------------------------------------------------------------===//
defm FMOV : UnscaledConversion<"fmov">;
// Add pseudo ops for FMOV 0 so we can mark them as isReMaterializable
// Each pseudo has no inputs and is selected for the fpimm0 constant of its
// type; the f16 variant additionally requires HasFullFP16.
let isReMaterializable = 1, isCodeGenOnly = 1, isAsCheapAsAMove = 1 in {
def FMOVH0 : Pseudo<(outs FPR16:$Rd), (ins), [(set f16:$Rd, (fpimm0))]>,
Sched<[WriteF]>, Requires<[HasFullFP16]>;
def FMOVS0 : Pseudo<(outs FPR32:$Rd), (ins), [(set f32:$Rd, (fpimm0))]>,
Sched<[WriteF]>;
def FMOVD0 : Pseudo<(outs FPR64:$Rd), (ins), [(set f64:$Rd, (fpimm0))]>,
Sched<[WriteF]>;
}
// Similarly add aliases
// "fmov $Rd, #0.0" is mapped onto a move from the zero register (WZR / XZR).
def : InstAlias<"fmov $Rd, #0.0", (FMOVWHr FPR16:$Rd, WZR), 0>,
Requires<[HasFullFP16]>;
def : InstAlias<"fmov $Rd, #0.0", (FMOVWSr FPR32:$Rd, WZR), 0>;
def : InstAlias<"fmov $Rd, #0.0", (FMOVXDr FPR64:$Rd, XZR), 0>;
//===----------------------------------------------------------------------===//
// Floating point conversion instruction.
//===----------------------------------------------------------------------===//
defm FCVT : FPConversion<"fcvt">;
//===----------------------------------------------------------------------===//
// Floating point single operand instructions.
//===----------------------------------------------------------------------===//
// Scalar single-operand FP instructions. Each instantiation passes a 4-bit
// opcode field, the mnemonic and, where one exists, the SelectionDAG node the
// instruction is selected from.
defm FABS : SingleOperandFPData<0b0001, "fabs", fabs>;
// The register-to-register FMOV is instantiated without a selection node.
defm FMOV : SingleOperandFPData<0b0000, "fmov">;
defm FNEG : SingleOperandFPData<0b0010, "fneg", fneg>;
// FRINT* family: each variant is selected from a different rounding node
// (FRINTN from the int_aarch64_neon_frintn intrinsic).
defm FRINTA : SingleOperandFPData<0b1100, "frinta", fround>;
defm FRINTI : SingleOperandFPData<0b1111, "frinti", fnearbyint>;
defm FRINTM : SingleOperandFPData<0b1010, "frintm", ffloor>;
defm FRINTN : SingleOperandFPData<0b1000, "frintn", int_aarch64_neon_frintn>;
defm FRINTP : SingleOperandFPData<0b1001, "frintp", fceil>;
// Also select the v1f64 form of the frintn intrinsic to the scalar f64
// instruction.
def : Pat<(v1f64 (int_aarch64_neon_frintn (v1f64 FPR64:$Rn))),
(FRINTNDr FPR64:$Rn)>;
defm FRINTX : SingleOperandFPData<0b1110, "frintx", frint>;
defm FRINTZ : SingleOperandFPData<0b1011, "frintz", ftrunc>;
// FSQRT is given the WriteFDiv scheduling class.
let SchedRW = [WriteFDiv] in {
defm FSQRT : SingleOperandFPData<0b0011, "fsqrt", fsqrt>;
}
// Scalar FRINT32Z/FRINT64Z/FRINT32X/FRINT64X, only available when the
// HasFRInt3264 predicate holds. The 2-bit argument distinguishes the four
// variants; no selection node is attached here.
let Predicates = [HasFRInt3264] in {
defm FRINT32Z : FRIntNNT<0b00, "frint32z">;
defm FRINT64Z : FRIntNNT<0b10, "frint64z">;
defm FRINT32X : FRIntNNT<0b01, "frint32x">;
defm FRINT64X : FRIntNNT<0b11, "frint64x">;
} // HasFRInt3264
// lrint/llrint are selected as a two-instruction sequence: FRINTX on the
// source, followed by the FCVTZS form of the requested integer width. The
// instruction records are named directly. The f16 sources are only available
// with full FP16 support.
let Predicates = [HasFullFP16] in {
  def : Pat<(i32 (lrint  f16:$Rn)), (FCVTZSUWHr (FRINTXHr f16:$Rn))>;
  def : Pat<(i64 (lrint  f16:$Rn)), (FCVTZSUXHr (FRINTXHr f16:$Rn))>;
  def : Pat<(i64 (llrint f16:$Rn)), (FCVTZSUXHr (FRINTXHr f16:$Rn))>;
}

// f32 and f64 sources.
def : Pat<(i32 (lrint  f32:$Rn)), (FCVTZSUWSr (FRINTXSr f32:$Rn))>;
def : Pat<(i32 (lrint  f64:$Rn)), (FCVTZSUWDr (FRINTXDr f64:$Rn))>;
def : Pat<(i64 (lrint  f32:$Rn)), (FCVTZSUXSr (FRINTXSr f32:$Rn))>;
def : Pat<(i64 (lrint  f64:$Rn)), (FCVTZSUXDr (FRINTXDr f64:$Rn))>;
def : Pat<(i64 (llrint f32:$Rn)), (FCVTZSUXSr (FRINTXSr f32:$Rn))>;
def : Pat<(i64 (llrint f64:$Rn)), (FCVTZSUXDr (FRINTXDr f64:$Rn))>;
//===----------------------------------------------------------------------===//
// Floating point two operand instructions.
//===----------------------------------------------------------------------===//
// Scalar two-operand FP instructions: 4-bit opcode field, mnemonic and the
// SelectionDAG node each is selected from.
defm FADD : TwoOperandFPData<0b0010, "fadd", fadd>;
// FDIV is given the WriteFDiv scheduling class.
let SchedRW = [WriteFDiv] in {
defm FDIV : TwoOperandFPData<0b0001, "fdiv", fdiv>;
}
// FMAXNM/FMINNM are selected from fmaxnum/fminnum, FMAX/FMIN from
// fmaximum/fminimum.
defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", fmaxnum>;
defm FMAX : TwoOperandFPData<0b0100, "fmax", fmaximum>;
defm FMINNM : TwoOperandFPData<0b0111, "fminnm", fminnum>;
defm FMIN : TwoOperandFPData<0b0101, "fmin", fminimum>;
// FMUL and FNMUL share the WriteFMul scheduling class. FNMUL is built from the
// same fmul node but through the TwoOperandFPDataNeg multiclass.
let SchedRW = [WriteFMul] in {
defm FMUL : TwoOperandFPData<0b0000, "fmul", fmul>;
defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", fmul>;
}
defm FSUB : TwoOperandFPData<0b0011, "fsub", fsub>;
// v1f64 min/max: a single-element f64 vector lives in FPR64, so these four
// nodes are selected to the scalar double-precision instructions.
def : Pat<(v1f64 (fmaximum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
(FMAXDrr FPR64:$Rn, FPR64:$Rm)>;
def : Pat<(v1f64 (fminimum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
(FMINDrr FPR64:$Rn, FPR64:$Rm)>;
def : Pat<(v1f64 (fmaxnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
(FMAXNMDrr FPR64:$Rn, FPR64:$Rm)>;
def : Pat<(v1f64 (fminnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
(FMINNMDrr FPR64:$Rn, FPR64:$Rm)>;
//===----------------------------------------------------------------------===//
// Floating point three operand instructions.
//===----------------------------------------------------------------------===//
// Scalar fused multiply-add family. The two leading bit arguments distinguish
// the four variants; the fragment shows which part of the fma is negated.
// FMADD: plain fma.
defm FMADD : ThreeOperandFPData<0, 0, "fmadd", fma>;
// FMSUB: fma with the middle (second multiplicand) operand negated.
defm FMSUB : ThreeOperandFPData<0, 1, "fmsub",
TriOpFrag<(fma node:$LHS, (fneg node:$MHS), node:$RHS)> >;
// FNMADD: the whole fma result negated.
defm FNMADD : ThreeOperandFPData<1, 0, "fnmadd",
TriOpFrag<(fneg (fma node:$LHS, node:$MHS, node:$RHS))> >;
// FNMSUB: fma with the addend negated.
defm FNMSUB : ThreeOperandFPData<1, 1, "fnmsub",
TriOpFrag<(fma node:$LHS, node:$MHS, (fneg node:$RHS))> >;
// The following def pats catch the case where the LHS of an FMA is negated.
// The TriOpFrag above catches the case where the middle operand is negated.
// N.b. FMSUB etc. take the accumulator ($Ra) as the *last* input operand,
// unlike the NEON variant, where the accumulator ($Rd) comes first.
def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, FPR32:$Ra)),
(FMSUBSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, FPR64:$Ra)),
(FMSUBDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
// We handled -(a + b*c) for FNMADD above, now it's time for "(-a) + (-b)*c" and
// "(-a) + b*(-c)".
// Here a is the addend ($Ra) and b, c are the multiplicands ($Rn, $Rm): the
// first pair negates $Rn together with $Ra, the second pair negates $Rm
// together with $Ra.
def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, (fneg FPR32:$Ra))),
(FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, (fneg FPR64:$Ra))),
(FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
def : Pat<(f32 (fma FPR32:$Rn, (fneg FPR32:$Rm), (fneg FPR32:$Ra))),
(FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
def : Pat<(f64 (fma FPR64:$Rn, (fneg FPR64:$Rm), (fneg FPR64:$Ra))),
(FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
//===----------------------------------------------------------------------===//
// Floating point comparison instructions.
//===----------------------------------------------------------------------===//
// FCMPE is defined without a selection node (assembler/disassembler use only
// as far as this chunk shows); FCMP is the form selected from AArch64fcmp.
defm FCMPE : FPComparison<1, "fcmpe">;
defm FCMP : FPComparison<0, "fcmp", AArch64fcmp>;
//===----------------------------------------------------------------------===//
// Floating point conditional comparison instructions.
//===----------------------------------------------------------------------===//
// As with the plain comparisons, only FCCMP carries a selection node
// (AArch64fccmp); FCCMPE is defined without one.
defm FCCMPE : FPCondComparison<1, "fccmpe">;
defm FCCMP : FPCondComparison<0, "fccmp", AArch64fccmp>;
//===----------------------------------------------------------------------===//
// Floating point conditional select instruction.
//===----------------------------------------------------------------------===//
// FP conditional select forms generated by the FPCondSelect multiclass
// (defined outside this chunk).
defm FCSEL : FPCondSelect<"fcsel">;
// CSEL instructions providing f128 types need to be handled by a
// pseudo-instruction since the eventual code will need to introduce basic
// blocks and control flow.
// The pseudo is selected from AArch64csel on two FPR128 values with an
// immediate condition code, and reads NZCV.
def F128CSEL : Pseudo<(outs FPR128:$Rd),
(ins FPR128:$Rn, FPR128:$Rm, ccode:$cond),
[(set (f128 FPR128:$Rd),
(AArch64csel FPR128:$Rn, FPR128:$Rm,
(i32 imm:$cond), NZCV))]> {
// The select consumes the condition flags.
let Uses = [NZCV];
// Expansion into control flow is done by the target's custom inserter hook.
let usesCustomInserter = 1;
// No scheduling information is attached to this pseudo.
let hasNoSchedulingInfo = 1;
}
//===----------------------------------------------------------------------===//
// Instructions used for emitting unwind opcodes on ARM64 Windows.
//===----------------------------------------------------------------------===//
// Marker pseudos for the ARM64 Windows unwind opcodes. None of them defines a
// value or has a selection pattern, and all use an empty Sched list. Operands
// are plain i32 immediates named after what they carry ($size, $reg/$reg0/
// $reg1, $offs).
// NOTE(review): the *_X variants presumably correspond to the pre-indexed
// (writeback) save forms — confirm against the code that emits them.
let isPseudo = 1 in {
// Stack allocation of $size bytes.
def SEH_StackAlloc : Pseudo<(outs), (ins i32imm:$size), []>, Sched<[]>;
// Save of the FP/LR pair at $offs.
def SEH_SaveFPLR : Pseudo<(outs), (ins i32imm:$offs), []>, Sched<[]>;
def SEH_SaveFPLR_X : Pseudo<(outs), (ins i32imm:$offs), []>, Sched<[]>;
// Save of a single register / register pair at $offs.
def SEH_SaveReg : Pseudo<(outs), (ins i32imm:$reg, i32imm:$offs), []>, Sched<[]>;
def SEH_SaveReg_X : Pseudo<(outs), (ins i32imm:$reg, i32imm:$offs), []>, Sched<[]>;
def SEH_SaveRegP : Pseudo<(outs), (ins i32imm:$reg0, i32imm:$reg1, i32imm:$offs), []>, Sched<[]>;
def SEH_SaveRegP_X : Pseudo<(outs), (ins i32imm:$reg0, i32imm:$reg1, i32imm:$offs), []>, Sched<[]>;
// Same shapes for the SaveFReg family.
def SEH_SaveFReg : Pseudo<(outs), (ins i32imm:$reg, i32imm:$offs), []>, Sched<[]>;
def SEH_SaveFReg_X : Pseudo<(outs), (ins i32imm:$reg, i32imm:$offs), []>, Sched<[]>;
def SEH_SaveFRegP : Pseudo<(outs), (ins i32imm:$reg0, i32imm:$reg1, i32imm:$offs), []>, Sched<[]>;
def SEH_SaveFRegP_X : Pseudo<(outs), (ins i32imm:$reg0, i32imm:$reg1, i32imm:$offs), []>, Sched<[]>;
// Frame-pointer setup markers.
def SEH_SetFP : Pseudo<(outs), (ins), []>, Sched<[]>;
def SEH_AddFP : Pseudo<(outs), (ins i32imm:$offs), []>, Sched<[]>;
// Operand-less markers: nop and prologue/epilogue boundaries.
def SEH_Nop : Pseudo<(outs), (ins), []>, Sched<[]>;
def SEH_PrologEnd : Pseudo<(outs), (ins), []>, Sched<[]>;
def SEH_EpilogStart : Pseudo<(outs), (ins), []>, Sched<[]>;
def SEH_EpilogEnd : Pseudo<(outs), (ins), []>, Sched<[]>;
}
//===----------------------------------------------------------------------===//
// Pseudo instructions for Windows EH
//===----------------------------------------------------------------------===//
// CLEANUPRET and CATCHRET are selected from the cleanupret/catchret nodes and
// are flagged as terminators, barriers, returns and EH-scope returns.
let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
isCodeGenOnly = 1, isReturn = 1, isEHScopeReturn = 1, isPseudo = 1 in {
def CLEANUPRET : Pseudo<(outs), (ins), [(cleanupret)]>, Sched<[]>;
// CATCHRET carries two basic-block operands ($dst, $src) and is expanded by
// the custom inserter.
let usesCustomInserter = 1 in
def CATCHRET : Pseudo<(outs), (ins am_brcond:$dst, am_brcond:$src), [(catchret bb:$dst, bb:$src)]>,
Sched<[]>;
}
// CATCHPAD is selected from the catchpad node; it is not a terminator and is
// likewise handled by the custom inserter.
let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1,
usesCustomInserter = 1 in
def CATCHPAD : Pseudo<(outs), (ins), [(catchpad)]>, Sched<[]>;
//===----------------------------------------------------------------------===//
// Floating point immediate move.
//===----------------------------------------------------------------------===//
// FMOV with an FP immediate; every form generated by FPMoveImmediate is marked
// rematerializable.
let isReMaterializable = 1 in {
defm FMOV : FPMoveImmediate<"fmov">;
}
//===----------------------------------------------------------------------===//
// Advanced SIMD two vector instructions.
//===----------------------------------------------------------------------===//
// UABDL, selected from the int_aarch64_neon_uabd intrinsic through the
// SIMDLongThreeVectorBHSabdl multiclass.
defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl",
int_aarch64_neon_uabd>;
// Match UABDL in log2-shuffle patterns.
// abs(zext(a) - zext(b)) on the low 8 x i8 halves.
def : Pat<(abs (v8i16 (sub (zext (v8i8 V64:$opA)),
(zext (v8i8 V64:$opB))))),
(UABDLv8i8_v8i16 V64:$opA, V64:$opB)>;
// The same absolute difference written as (x + s) ^ s, with s an arithmetic
// shift right by 15 of $src.
// NOTE(review): $src is only required to be the same value in both
// AArch64vashr operands; nothing in the pattern ties it to the sub expression.
// Presumably this relies on how abs is expanded before selection — confirm.
def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))),
(v8i16 (add (sub (zext (v8i8 V64:$opA)),
(zext (v8i8 V64:$opB))),
(AArch64vashr v8i16:$src, (i32 15))))),
(UABDLv8i8_v8i16 V64:$opA, V64:$opB)>;
// High-half (extract_high_v16i8) versions of the two patterns above.
def : Pat<(abs (v8i16 (sub (zext (extract_high_v16i8 V128:$opA)),
(zext (extract_high_v16i8 V128:$opB))))),
(UABDLv16i8_v8i16 V128:$opA, V128:$opB)>;
def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))),
(v8i16 (add (sub (zext (extract_high_v16i8 V128:$opA)),
(zext (extract_high_v16i8 V128:$opB))),
(AArch64vashr v8i16:$src, (i32 15))))),
(UABDLv16i8_v8i16 V128:$opA, V128:$opB)>;
// i16 -> i32 and i32 -> i64 element widths: only the abs(sub(zext, zext)) form
// is matched, for both the low-half and the extract_high operands.
def : Pat<(abs (v4i32 (sub (zext (v4i16 V64:$opA)),
(zext (v4i16 V64:$opB))))),
(UABDLv4i16_v4i32 V64:$opA, V64:$opB)>;
def : Pat<(abs (v4i32 (sub (zext (extract_high_v8i16 V128:$opA)),
(zext (extract_high_v8i16 V128:$opB))))),
(UABDLv8i16_v4i32 V128:$opA, V128:$opB)>;
def : Pat<(abs (v2i64 (sub (zext (v2i32 V64:$opA)),
(zext (v2i32 V64:$opB))))),
(UABDLv2i32_v2i64 V64:$opA, V64:$opB)>;
def : Pat<(abs (v2i64 (sub (zext (extract_high_v4i32 V128:$opA)),
(zext (extract_high_v4i32 V128:$opB))))),
(UABDLv4i32_v2i64 V128:$opA, V128:$opB)>;
// Integer two-vector operations: the multiclass name suffix (B, BHS, BHSD)
// names the element sizes instantiated; the last argument is the node or
// intrinsic each instruction is selected from.
defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", abs>;
defm CLS : SIMDTwoVectorBHS<0, 0b00100, "cls", int_aarch64_neon_cls>;
defm CLZ : SIMDTwoVectorBHS<1, 0b00100, "clz", ctlz>;
// Integer compares against zero, selected from the AArch64cm*z nodes.
defm CMEQ : SIMDCmpTwoVector<0, 0b01001, "cmeq", AArch64cmeqz>;
defm CMGE : SIMDCmpTwoVector<1, 0b01000, "cmge", AArch64cmgez>;
defm CMGT : SIMDCmpTwoVector<0, 0b01000, "cmgt", AArch64cmgtz>;
defm CMLE : SIMDCmpTwoVector<1, 0b01001, "cmle", AArch64cmlez>;
defm CMLT : SIMDCmpTwoVector<0, 0b01010, "cmlt", AArch64cmltz>;
defm CNT : SIMDTwoVectorB<0, 0b00, 0b00101, "cnt", ctpop>;
defm FABS : SIMDTwoVectorFP<0, 1, 0b01111, "fabs", fabs>;
// FP compares against zero, selected from the AArch64fcm*z nodes.
defm FCMEQ : SIMDFPCmpTwoVector<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>;
defm FCMGE : SIMDFPCmpTwoVector<1, 1, 0b01100, "fcmge", AArch64fcmgez>;
defm FCMGT : SIMDFPCmpTwoVector<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>;
defm FCMLE : SIMDFPCmpTwoVector<1, 1, 0b01101, "fcmle", AArch64fcmlez>;
defm FCMLT : SIMDFPCmpTwoVector<0, 1, 0b01110, "fcmlt", AArch64fcmltz>;
// FCVTAS/FCVTAU are selected from their NEON intrinsics; the first argument
// separates the signed (0) and unsigned (1) instruction.
defm FCVTAS : SIMDTwoVectorFPToInt<0,0,0b11100, "fcvtas",int_aarch64_neon_fcvtas>;
defm FCVTAU : SIMDTwoVectorFPToInt<1,0,0b11100, "fcvtau",int_aarch64_neon_fcvtau>;
// FCVTL is instantiated without a selection node; the patterns below supply
// the selection.
defm FCVTL : SIMDFPWidenTwoVector<0, 0, 0b10111, "fcvtl">;
// Half -> single via the vcvthf2fp intrinsic, where the half values are typed
// as i16 vectors. The second pattern takes the upper four elements of a
// 128-bit source (extract_subvector at index 4) and uses the instruction form
// that takes the full V128 register.
def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (v4i16 V64:$Rn))),
(FCVTLv4i16 V64:$Rn)>;
def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn),
(i64 4)))),
(FCVTLv8i16 V128:$Rn)>;
// Generic fpextend: f32 -> f64 and f16 -> f32, each with a low-half form and
// an upper-half (extract_subvector) form.
def : Pat<(v2f64 (fpextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>;
def : Pat<(v2f64 (fpextend (v2f32 (extract_subvector (v4f32 V128:$Rn),
(i64 2))))),
(FCVTLv4i32 V128:$Rn)>;
def : Pat<(v4f32 (fpextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>;
def : Pat<(v4f32 (fpextend (v4f16 (extract_subvector (v8f16 V128:$Rn),
(i64 4))))),
(FCVTLv8i16 V128:$Rn)>;
// FP -> integer conversions selected from their NEON intrinsics. Within each
// pair the first argument separates the signed (0) and unsigned (1) form.
defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_aarch64_neon_fcvtms>;
defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_aarch64_neon_fcvtmu>;
defm FCVTNS : SIMDTwoVectorFPToInt<0,0,0b11010, "fcvtns",int_aarch64_neon_fcvtns>;
defm FCVTNU : SIMDTwoVectorFPToInt<1,0,0b11010, "fcvtnu",int_aarch64_neon_fcvtnu>;
// FCVTN is instantiated without a selection node; the patterns below supply
// the selection.
defm FCVTN : SIMDFPNarrowTwoVector<0, 0, 0b10110, "fcvtn">;
// Single -> half via the vcvtfp2hf intrinsic (half values typed as v4i16).
def : Pat<(v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn))),
(FCVTNv4i16 V128:$Rn)>;
// When the narrowed result is concatenated onto an existing 64-bit value, use
// the form that also takes a 128-bit first operand: $Rd is placed in the dsub
// subregister of an otherwise undefined 128-bit register.
def : Pat<(concat_vectors V64:$Rd,
(v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn)))),
(FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
// Generic fpround: f64 -> f32 and f32 -> f16.
def : Pat<(v2f32 (fpround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>;
def : Pat<(v4f16 (fpround (v4f32 V128:$Rn))), (FCVTNv4i16 V128:$Rn)>;
// concat_vectors form of the f64 -> f32 narrowing, built the same way as the
// intrinsic concat pattern above.
def : Pat<(concat_vectors V64:$Rd, (v2f32 (fpround (v2f64 V128:$Rn)))),
(FCVTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
// FCVTPS/FCVTPU and FCVTXN are selected from their NEON intrinsics.
defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_aarch64_neon_fcvtps>;
defm FCVTPU : SIMDTwoVectorFPToInt<1,1,0b11010, "fcvtpu",int_aarch64_neon_fcvtpu>;
defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn",
int_aarch64_neon_fcvtxn>;
// FCVTZS/FCVTZU are selected from the generic fp_to_sint/fp_to_uint nodes.
defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", fp_to_sint>;
defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", fp_to_uint>;
// Because the multiclass was given the generic nodes, the fcvtzs/fcvtzu
// intrinsics are mapped onto the same instructions explicitly, one pattern per
// vector type.
def : Pat<(v4i16 (int_aarch64_neon_fcvtzs v4f16:$Rn)), (FCVTZSv4f16 $Rn)>;
def : Pat<(v8i16 (int_aarch64_neon_fcvtzs v8f16:$Rn)), (FCVTZSv8f16 $Rn)>;
def : Pat<(v2i32 (int_aarch64_neon_fcvtzs v2f32:$Rn)), (FCVTZSv2f32 $Rn)>;
def : Pat<(v4i32 (int_aarch64_neon_fcvtzs v4f32:$Rn)), (FCVTZSv4f32 $Rn)>;
def : Pat<(v2i64 (int_aarch64_neon_fcvtzs v2f64:$Rn)), (FCVTZSv2f64 $Rn)>;
def : Pat<(v4i16 (int_aarch64_neon_fcvtzu v4f16:$Rn)), (FCVTZUv4f16 $Rn)>;
def : Pat<(v8i16 (int_aarch64_neon_fcvtzu v8f16:$Rn)), (FCVTZUv8f16 $Rn)>;
def : Pat<(v2i32 (int_aarch64_neon_fcvtzu v2f32:$Rn)), (FCVTZUv2f32 $Rn)>;
def : Pat<(v4i32 (int_aarch64_neon_fcvtzu v4f32:$Rn)), (FCVTZUv4f32 $Rn)>;
def : Pat<(v2i64 (int_aarch64_neon_fcvtzu v2f64:$Rn)), (FCVTZUv2f64 $Rn)>;
// Vector FP two-vector operations. Arguments are two single-bit fields, a
// 5-bit opcode field, the mnemonic and the node/intrinsic selected from.
defm FNEG : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>;
defm FRECPE : SIMDTwoVectorFP<0, 1, 0b11101, "frecpe", int_aarch64_neon_frecpe>;
// Vector FRINT* family, tied to the same rounding nodes as the scalar forms.
defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", fround>;
defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", fnearbyint>;
defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", ffloor>;
defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", int_aarch64_neon_frintn>;
defm FRINTP : SIMDTwoVectorFP<0, 1, 0b11000, "frintp", fceil>;
defm FRINTX : SIMDTwoVectorFP<1, 0, 0b11001, "frintx", frint>;
defm FRINTZ : SIMDTwoVectorFP<0, 1, 0b11001, "frintz", ftrunc>;
// Vector FRINT32/64 variants, only available under HasFRInt3264. The two bit
// arguments distinguish the four variants; no selection node is attached.
let Predicates = [HasFRInt3264] in {
defm FRINT32Z : FRIntNNTVector<0, 0, "frint32z">;
defm FRINT64Z : FRIntNNTVector<0, 1, "frint64z">;
defm FRINT32X : FRIntNNTVector<1, 0, "frint32x">;
defm FRINT64X : FRIntNNTVector<1, 1, "frint64x">;
} // HasFRInt3264
defm FRSQRTE: SIMDTwoVectorFP<1, 1, 0b11101, "frsqrte", int_aarch64_neon_frsqrte>;
defm FSQRT : SIMDTwoVectorFP<1, 1, 0b11111, "fsqrt", fsqrt>;
// NEG is selected from "0 - x" (sub of the all-zeros vector).
defm NEG : SIMDTwoVectorBHSD<1, 0b01011, "neg",
UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
// NOT only exists in byte-vector forms (v8i8 / v16i8) and is selected from
// vnot.
defm NOT : SIMDTwoVectorB<1, 0b00, 0b00101, "not", vnot>;
// Aliases for MVN -> NOT.
def : InstAlias<"mvn{ $Vd.8b, $Vn.8b|.8b $Vd, $Vn}",
(NOTv8i8 V64:$Vd, V64:$Vn)>;
def : InstAlias<"mvn{ $Vd.16b, $Vn.16b|.16b $Vd, $Vn}",
(NOTv16i8 V128:$Vd, V128:$Vn)>;
// The target-specific AArch64neg node maps onto the NEG instruction of the
// matching element type.
def : Pat<(AArch64neg (v8i8 V64:$Rn)), (NEGv8i8 V64:$Rn)>;
def : Pat<(AArch64neg (v16i8 V128:$Rn)), (NEGv16i8 V128:$Rn)>;
def : Pat<(AArch64neg (v4i16 V64:$Rn)), (NEGv4i16 V64:$Rn)>;
def : Pat<(AArch64neg (v8i16 V128:$Rn)), (NEGv8i16 V128:$Rn)>;
def : Pat<(AArch64neg (v2i32 V64:$Rn)), (NEGv2i32 V64:$Rn)>;
def : Pat<(AArch64neg (v4i32 V128:$Rn)), (NEGv4i32 V128:$Rn)>;
def : Pat<(AArch64neg (v2i64 V128:$Rn)), (NEGv2i64 V128:$Rn)>;
// The target-specific AArch64not node: every 64-bit type uses NOTv8i8 and
// every 128-bit type uses NOTv16i8, since a bitwise NOT does not depend on the
// element size.
def : Pat<(AArch64not (v8i8 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
def : Pat<(AArch64not (v16i8 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
def : Pat<(AArch64not (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
def : Pat<(AArch64not (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
def : Pat<(AArch64not (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
def : Pat<(AArch64not (v1i64 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
def : Pat<(AArch64not (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
def : Pat<(AArch64not (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
// Generic vnot on the element types that the NOT multiclass (byte vectors
// only) does not instantiate. As with the AArch64not patterns, every 64-bit
// type is selected to NOTv8i8 and every 128-bit type to NOTv16i8, because a
// bitwise NOT does not depend on the element size.
// v1i64 is included so that this list covers the same types as the
// AArch64not patterns; previously it was the only legal integer vector type
// without a vnot pattern.
def : Pat<(vnot (v4i16 V64:$Rn)),  (NOTv8i8  V64:$Rn)>;
def : Pat<(vnot (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
def : Pat<(vnot (v2i32 V64:$Rn)),  (NOTv8i8  V64:$Rn)>;
def : Pat<(vnot (v1i64 V64:$Rn)),  (NOTv8i8  V64:$Rn)>;
def : Pat<(vnot (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
def : Pat<(vnot (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
// Bit/byte reversal instructions.
defm RBIT : SIMDTwoVectorB<1, 0b01, 0b00101, "rbit", int_aarch64_neon_rbit>;
defm REV16 : SIMDTwoVectorB<0, 0b00, 0b00001, "rev16", AArch64rev16>;
defm REV32 : SIMDTwoVectorBH<1, 0b00000, "rev32", AArch64rev32>;
defm REV64 : SIMDTwoVectorBHS<0, 0b00000, "rev64", AArch64rev64>;
// SADALP: the accumulator is the tied first operand ($LHS); the pairwise long
// add (saddlp) of $RHS is added to it.
defm SADALP : SIMDLongTwoVectorTied<0, 0b00110, "sadalp",
BinOpFrag<(add node:$LHS, (int_aarch64_neon_saddlp node:$RHS))> >;
defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", int_aarch64_neon_saddlp>;
defm SCVTF : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", sint_to_fp>;
// SHLL takes no template arguments; its selection patterns are supplied
// separately (see SIMDVectorLShiftLongBySizeBHSPats).
defm SHLL : SIMDVectorLShiftLongBySizeBHS;
defm SQABS : SIMDTwoVectorBHSD<0, 0b00111, "sqabs", int_aarch64_neon_sqabs>;
defm SQNEG : SIMDTwoVectorBHSD<1, 0b00111, "sqneg", int_aarch64_neon_sqneg>;
defm SQXTN : SIMDMixedTwoVector<0, 0b10100, "sqxtn", int_aarch64_neon_sqxtn>;
defm SQXTUN : SIMDMixedTwoVector<1, 0b10010, "sqxtun", int_aarch64_neon_sqxtun>;
defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",int_aarch64_neon_suqadd>;
// UADALP mirrors SADALP with the unsigned pairwise long add.
defm UADALP : SIMDLongTwoVectorTied<1, 0b00110, "uadalp",
BinOpFrag<(add node:$LHS, (int_aarch64_neon_uaddlp node:$RHS))> >;
defm UADDLP : SIMDLongTwoVector<1, 0b00010, "uaddlp",
int_aarch64_neon_uaddlp>;
defm UCVTF : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", uint_to_fp>;
defm UQXTN : SIMDMixedTwoVector<1, 0b10100, "uqxtn", int_aarch64_neon_uqxtn>;
defm URECPE : SIMDTwoVectorS<0, 1, 0b11100, "urecpe", int_aarch64_neon_urecpe>;
defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_aarch64_neon_ursqrte>;
defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_aarch64_neon_usqadd>;
// XTN is selected from the generic trunc node.
defm XTN : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>;
// Element reversal of FP-typed vectors reuses the integer REV32/REV64
// instruction with the same element width (f16 -> i16 forms, f32 -> i32 forms).
def : Pat<(v4f16 (AArch64rev32 V64:$Rn)), (REV32v4i16 V64:$Rn)>;
def : Pat<(v4f16 (AArch64rev64 V64:$Rn)), (REV64v4i16 V64:$Rn)>;
def : Pat<(v8f16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>;
def : Pat<(v8f16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>;
def : Pat<(v2f32 (AArch64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>;
def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>;
// Selection patterns for SHLL (vector shift-left-long by the source element
// width). The extension operator is a template parameter because the same six
// patterns are needed for zext, sext and anyext sources, which is why they are
// kept outside the instruction definition.
multiclass SIMDVectorLShiftLongBySizeBHSPats<SDPatternOperator ExtOp> {
  // i8 -> i16 elements, shifted left by 8: low half, then high half.
  def : Pat<(AArch64vshl (v8i16 (ExtOp (v8i8 V64:$Rn))), (i32 8)),
            (SHLLv8i8 V64:$Rn)>;
  def : Pat<(AArch64vshl (v8i16 (ExtOp (extract_high_v16i8 V128:$Rn))),
                         (i32 8)),
            (SHLLv16i8 V128:$Rn)>;

  // i16 -> i32 elements, shifted left by 16.
  def : Pat<(AArch64vshl (v4i32 (ExtOp (v4i16 V64:$Rn))), (i32 16)),
            (SHLLv4i16 V64:$Rn)>;
  def : Pat<(AArch64vshl (v4i32 (ExtOp (extract_high_v8i16 V128:$Rn))),
                         (i32 16)),
            (SHLLv8i16 V128:$Rn)>;

  // i32 -> i64 elements, shifted left by 32.
  def : Pat<(AArch64vshl (v2i64 (ExtOp (v2i32 V64:$Rn))), (i32 32)),
            (SHLLv2i32 V64:$Rn)>;
  def : Pat<(AArch64vshl (v2i64 (ExtOp (extract_high_v4i32 V128:$Rn))),
                         (i32 32)),
            (SHLLv4i32 V128:$Rn)>;
}

// One instantiation per extension kind.
defm : SIMDVectorLShiftLongBySizeBHSPats<anyext>;
defm : SIMDVectorLShiftLongBySizeBHSPats<zext>;
defm : SIMDVectorLShiftLongBySizeBHSPats<sext>;
//===----------------------------------------------------------------------===//
// Advanced SIMD three vector instructions.
//===----------------------------------------------------------------------===//
// Integer three-same-vector arithmetic and compares. Arguments are a single
// bit, a 5-bit opcode field, the mnemonic and the node selected from.
defm ADD : SIMDThreeSameVector<0, 0b10000, "add", add>;
defm ADDP : SIMDThreeSameVector<0, 0b10111, "addp", int_aarch64_neon_addp>;
// Register-register integer compares, selected from the AArch64cm* nodes.
defm CMEQ : SIMDThreeSameVector<1, 0b10001, "cmeq", AArch64cmeq>;
defm CMGE : SIMDThreeSameVector<0, 0b00111, "cmge", AArch64cmge>;
defm CMGT : SIMDThreeSameVector<0, 0b00110, "cmgt", AArch64cmgt>;
defm CMHI : SIMDThreeSameVector<1, 0b00110, "cmhi", AArch64cmhi>;
defm CMHS : SIMDThreeSameVector<1, 0b00111, "cmhs", AArch64cmhs>;
defm CMTST : SIMDThreeSameVector<0, 0b10001, "cmtst", AArch64cmtst>;
// FABD is selected from its intrinsic here; the generic fabs(fsub) form is
// matched by separate patterns.
defm FABD : SIMDThreeSameVectorFP<1,1,0b010,"fabd", int_aarch64_neon_fabd>;
// Select fabs(fsub(a, b)) to FABD. The instruction record is looked up by
// name, "FABD" followed by the value type (e.g. FABDv2f32), once per type in
// the list.
let Predicates = [HasNEON] in {
foreach VT = [ v2f32, v4f32, v2f64 ] in
def : Pat<(fabs (fsub VT:$Rn, VT:$Rm)), (!cast<Instruction>("FABD"#VT) VT:$Rn, VT:$Rm)>;
}
// The half-precision vector types additionally require full FP16 support.
let Predicates = [HasNEON, HasFullFP16] in {
foreach VT = [ v4f16, v8f16 ] in
def : Pat<(fabs (fsub VT:$Rn, VT:$Rm)), (!cast<Instruction>("FABD"#VT) VT:$Rn, VT:$Rm)>;
}
// FP three-same-vector compares and arithmetic. Arguments are two single-bit
// fields, a 3-bit opcode field, the mnemonic and the node selected from.
defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b101,"facge",int_aarch64_neon_facge>;
defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b101,"facgt",int_aarch64_neon_facgt>;
defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_faddp>;
defm FADD : SIMDThreeSameVectorFP<0,0,0b010,"fadd", fadd>;
defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>;
defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>;
defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>;
defm FDIV : SIMDThreeSameVectorFP<1,0,0b111,"fdiv", fdiv>;
// Min/max: the pairwise (*P) forms are selected from intrinsics, the
// element-wise forms from the generic fmaxnum/fmaximum/fminnum/fminimum nodes.
defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b000,"fmaxnmp", int_aarch64_neon_fmaxnmp>;
defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b000,"fmaxnm", fmaxnum>;
defm FMAXP : SIMDThreeSameVectorFP<1,0,0b110,"fmaxp", int_aarch64_neon_fmaxp>;
defm FMAX : SIMDThreeSameVectorFP<0,0,0b110,"fmax", fmaximum>;
defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b000,"fminnmp", int_aarch64_neon_fminnmp>;
defm FMINNM : SIMDThreeSameVectorFP<0,1,0b000,"fminnm", fminnum>;
defm FMINP : SIMDThreeSameVectorFP<1,1,0b110,"fminp", int_aarch64_neon_fminp>;
defm FMIN : SIMDThreeSameVectorFP<0,1,0b110,"fmin", fminimum>;
// NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the
// instruction expects the addend first, while the fma intrinsic puts it last.
// In the fragments below $LHS is therefore the addend (the tied accumulator)
// and appears as the last fma operand.
defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b001, "fmla",
TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >;
// FMLS negates the fragment's $RHS, which is the second multiplicand of the
// fma.
defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b001, "fmls",
TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
// The following def pats catch the case where the LHS of an FMA is negated.
// The TriOpFrag above catches the case where the middle operand is negated.
// Note the operand order of the result: accumulator $Rd first, then the two
// multiplicands.
def : Pat<(v2f32 (fma (fneg V64:$Rn), V64:$Rm, V64:$Rd)),
(FMLSv2f32 V64:$Rd, V64:$Rn, V64:$Rm)>;
def : Pat<(v4f32 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)),
(FMLSv4f32 V128:$Rd, V128:$Rn, V128:$Rm)>;
def : Pat<(v2f64 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)),
(FMLSv2f64 V128:$Rd, V128:$Rn, V128:$Rm)>;
// Remaining FP three-same-vector instructions: FMULX, FRECPS and FRSQRTS are
// selected from intrinsics, FMUL and FSUB from the generic nodes.
defm FMULX : SIMDThreeSameVectorFP<0,0,0b011,"fmulx", int_aarch64_neon_fmulx>;
defm FMUL : SIMDThreeSameVectorFP<1,0,0b011,"fmul", fmul>;
defm FRECPS : SIMDThreeSameVectorFP<0,0,0b111,"frecps", int_aarch64_neon_frecps>;
defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b111,"frsqrts", int_aarch64_neon_frsqrts>;
defm FSUB : SIMDThreeSameVectorFP<0,1,0b010,"fsub", fsub>;
// Integer multiply-accumulate: $LHS is the tied accumulator, to which the
// product of $MHS and $RHS is added (MLA) or from which it is subtracted (MLS).
defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla",
TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >;
defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls",
TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))> >;
defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>;
defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>;
// SABA accumulates the sabd intrinsic of $MHS and $RHS into $LHS.
defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba",
TriOpFrag<(add node:$LHS, (int_aarch64_neon_sabd node:$MHS, node:$RHS))> >;
// Signed instructions. Most are selected from NEON intrinsics; SMAX/SMIN and
// SUB use the generic smax/smin/sub nodes.
defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", int_aarch64_neon_sabd>;
defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", int_aarch64_neon_shadd>;
defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_aarch64_neon_shsub>;
defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp>;
defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", smax>;
defm SMINP : SIMDThreeSameVectorBHS<0,0b10101,"sminp", int_aarch64_neon_sminp>;
defm SMIN : SIMDThreeSameVectorBHS<0,0b01101,"smin", smin>;
defm SQADD : SIMDThreeSameVector<0,0b00001,"sqadd", int_aarch64_neon_sqadd>;
defm SQDMULH : SIMDThreeSameVectorHS<0,0b10110,"sqdmulh",int_aarch64_neon_sqdmulh>;
defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_aarch64_neon_sqrdmulh>;
defm SQRSHL : SIMDThreeSameVector<0,0b01011,"sqrshl", int_aarch64_neon_sqrshl>;
defm SQSHL : SIMDThreeSameVector<0,0b01001,"sqshl", int_aarch64_neon_sqshl>;
defm SQSUB : SIMDThreeSameVector<0,0b00101,"sqsub", int_aarch64_neon_sqsub>;
defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd",int_aarch64_neon_srhadd>;
defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>;
defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>;
defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>;
// Unsigned counterparts of the signed instructions above. They use the same
// opcode fields with the leading bit set to 1.
// UABA accumulates the uabd intrinsic of $MHS and $RHS into the tied $LHS.
defm UABA : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba",
TriOpFrag<(add node:$LHS, (int_aarch64_neon_uabd node:$MHS, node:$RHS))> >;
defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", int_aarch64_neon_uabd>;
defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", int_aarch64_neon_uhadd>;
defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>;
defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>;
defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", umax>;
defm UMINP : SIMDThreeSameVectorBHS<1,0b10101,"uminp", int_aarch64_neon_uminp>;
defm UMIN : SIMDThreeSameVectorBHS<1,0b01101,"umin", umin>;
defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_aarch64_neon_uqadd>;
defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>;
defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>;
defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", int_aarch64_neon_uqsub>;
defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", int_aarch64_neon_urhadd>;
defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>;
defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>;
// SQRDMLAH/SQRDMLSH pass the saturating add / saturating subtract intrinsic as
// the accumulate operation; how it is combined with the multiply is defined by
// SIMDThreeSameVectorSQRDMLxHTiedHS, which is outside this chunk.
defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah",
int_aarch64_neon_sqadd>;
defm SQRDMLSH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10001,"sqrdmlsh",
int_aarch64_neon_sqsub>;
// Vector bitwise logical instructions.
defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>;
// BIC: $LHS and-ed with the complement of $RHS.
defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic",
BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >;
// BIF is defined without a selection node.
defm BIF : SIMDLogicalThreeVector<1, 0b11, "bif">;
defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>;
// BSL: bitwise select with the tied first operand ($LHS) as the mask; bits of
// $MHS are taken where the mask is set and bits of $RHS where it is clear.
defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl",
TriOpFrag<(or (and node:$LHS, node:$MHS), (and (vnot node:$LHS), node:$RHS))>>;
defm EOR : SIMDLogicalThreeVector<1, 0b00, "eor", xor>;
// ORN: $LHS or-ed with the complement of $RHS.
defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn",
BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >;
defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>;
// The target-specific AArch64bsl node is selected to BSL for every integer
// vector type. All 64-bit types use BSLv8i8 and all 128-bit types BSLv16i8,
// since a bitwise select does not depend on the element size.
def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm),
(BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
def : Pat<(AArch64bsl (v4i16 V64:$Rd), V64:$Rn, V64:$Rm),
(BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
def : Pat<(AArch64bsl (v2i32 V64:$Rd), V64:$Rn, V64:$Rm),
(BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
def : Pat<(AArch64bsl (v1i64 V64:$Rd), V64:$Rn, V64:$Rm),
(BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
def : Pat<(AArch64bsl (v16i8 V128:$Rd), V128:$Rn, V128:$Rm),
(BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
def : Pat<(AArch64bsl (v8i16 V128:$Rd), V128:$Rn, V128:$Rm),
(BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
def : Pat<(AArch64bsl (v4i32 V128:$Rd), V128:$Rn, V128:$Rm),
(BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
def : Pat<(AArch64bsl (v2i64 V128:$Rd), V128:$Rn, V128:$Rm),
(BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
// Vector register move: "mov Vd, Vn" is an alias for ORR with both source
// operands equal. Every arrangement maps onto the byte-vector ORR; only the
// .16b and .8b spellings pass 1 as the final InstAlias argument, the other
// arrangements pass 0.
def : InstAlias<"mov{\t$dst.16b, $src.16b|.16b\t$dst, $src}",
(ORRv16i8 V128:$dst, V128:$src, V128:$src), 1>;
def : InstAlias<"mov{\t$dst.8h, $src.8h|.8h\t$dst, $src}",
(ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
def : InstAlias<"mov{\t$dst.4s, $src.4s|.4s\t$dst, $src}",
(ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
def : InstAlias<"mov{\t$dst.2d, $src.2d|.2d\t$dst, $src}",
(ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
// 64-bit register forms.
def : InstAlias<"mov{\t$dst.8b, $src.8b|.8b\t$dst, $src}",
(ORRv8i8 V64:$dst, V64:$src, V64:$src), 1>;
def : InstAlias<"mov{\t$dst.4h, $src.4h|.4h\t$dst, $src}",
(ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
def : InstAlias<"mov{\t$dst.2s, $src.2s|.2s\t$dst, $src}",
(ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
def : InstAlias<"mov{\t$dst.1d, $src.1d|.1d\t$dst, $src}",
(ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
// "cmls" is accepted as an alias for CMHS with the two source operands
// swapped. One alias per vector arrangement; each asm string carries both
// syntax variants and the final InstAlias argument is 0.
def : InstAlias<"{cmls\t$dst.8b, $src1.8b, $src2.8b|cmls.8b\t$dst, $src1, $src2}",
                (CMHSv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
def : InstAlias<"{cmls\t$dst.16b, $src1.16b, $src2.16b|cmls.16b\t$dst, $src1, $src2}",
                (CMHSv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
def : InstAlias<"{cmls\t$dst.4h, $src1.4h, $src2.4h|cmls.4h\t$dst, $src1, $src2}",
                (CMHSv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
def : InstAlias<"{cmls\t$dst.8h, $src1.8h, $src2.8h|cmls.8h\t$dst, $src1, $src2}",
                (CMHSv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
def : InstAlias<"{cmls\t$dst.2s, $src1.2s, $src2.2s|cmls.2s\t$dst, $src1, $src2}",
                (CMHSv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
def : InstAlias<"{cmls\t$dst.4s, $src1.4s, $src2.4s|cmls.4s\t$dst, $src1, $src2}",
                (CMHSv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
def : InstAlias<"{cmls\t$dst.2d, $src1.2d, $src2.2d|cmls.2d\t$dst, $src1, $src2}",
                (CMHSv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
// "cmlo" is accepted as an alias for CMHI with the two source operands
// swapped. One alias per vector arrangement; each asm string carries both
// syntax variants and the final InstAlias argument is 0.
def : InstAlias<"{cmlo\t$dst.8b, $src1.8b, $src2.8b|cmlo.8b\t$dst, $src1, $src2}",
                (CMHIv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
def : InstAlias<"{cmlo\t$dst.16b, $src1.16b, $src2.16b|cmlo.16b\t$dst, $src1, $src2}",
                (CMHIv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
def : InstAlias<"{cmlo\t$dst.4h, $src1.4h, $src2.4h|cmlo.4h\t$dst, $src1, $src2}",
                (CMHIv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
def : InstAlias<"{cmlo\t$dst.8h, $src1.8h, $src2.8h|cmlo.8h\t$dst, $src1, $src2}",
                (CMHIv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
def : InstAlias<"{cmlo\t$dst.2s, $src1.2s, $src2.2s|cmlo.2s\t$dst, $src1, $src2}",
                (CMHIv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
def : InstAlias<"{cmlo\t$dst.4s, $src1.4s, $src2.4s|cmlo.4s\t$dst, $src1, $src2}",
                (CMHIv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
def : InstAlias<"{cmlo\t$dst.2d, $src1.2d, $src2.2d|cmlo.2d\t$dst, $src1, $src2}",
                (CMHIv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
// Three-register "cmle" is accepted as an alias for CMGE with the two source
// operands swapped. One alias per vector arrangement; each asm string carries
// both syntax variants and the final InstAlias argument is 0.
def : InstAlias<"{cmle\t$dst.8b, $src1.8b, $src2.8b|cmle.8b\t$dst, $src1, $src2}",
                (CMGEv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
def : InstAlias<"{cmle\t$dst.16b, $src1.16b, $src2.16b|cmle.16b\t$dst, $src1, $src2}",
                (CMGEv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
def : InstAlias<"{cmle\t$dst.4h, $src1.4h, $src2.4h|cmle.4h\t$dst, $src1, $src2}",
                (CMGEv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
def : InstAlias<"{cmle\t$dst.8h, $src1.8h, $src2.8h|cmle.8h\t$dst, $src1, $src2}",
                (CMGEv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
def : InstAlias<"{cmle\t$dst.2s, $src1.2s, $src2.2s|cmle.2s\t$dst, $src1, $src2}",
                (CMGEv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
def : InstAlias<"{cmle\t$dst.4s, $src1.4s, $src2.4s|cmle.4s\t$dst, $src1, $src2}",
                (CMGEv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
def : InstAlias<"{cmle\t$dst.2d, $src1.2d, $src2.2d|cmle.2d\t$dst, $src1, $src2}",
                (CMGEv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
// Three-register "cmlt" is accepted as an alias for CMGT with the two source
// operands swapped. One alias per vector arrangement; each asm string carries
// both syntax variants and the final InstAlias argument is 0.
def : InstAlias<"{cmlt\t$dst.8b, $src1.8b, $src2.8b|cmlt.8b\t$dst, $src1, $src2}",
                (CMGTv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
def : InstAlias<"{cmlt\t$dst.16b, $src1.16b, $src2.16b|cmlt.16b\t$dst, $src1, $src2}",
                (CMGTv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
def : InstAlias<"{cmlt\t$dst.4h, $src1.4h, $src2.4h|cmlt.4h\t$dst, $src1, $src2}",
                (CMGTv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
def : InstAlias<"{cmlt\t$dst.8h, $src1.8h, $src2.8h|cmlt.8h\t$dst, $src1, $src2}",
                (CMGTv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
def : InstAlias<"{cmlt\t$dst.2s, $src1.2s, $src2.2s|cmlt.2s\t$dst, $src1, $src2}",
                (CMGTv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
def : InstAlias<"{cmlt\t$dst.4s, $src1.4s, $src2.4s|cmlt.4s\t$dst, $src1, $src2}",
                (CMGTv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
def : InstAlias<"{cmlt\t$dst.2d, $src1.2d, $src2.2d|cmlt.2d\t$dst, $src1, $src2}",
                (CMGTv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
// "fcmle" vector aliases: each maps to the FCMGE record of the same
// arrangement with $src1 and $src2 exchanged in the result dag.
// The half-precision (.4h/.8h) forms are only declared under
// Predicates = [HasNEON, HasFullFP16]; the .2s/.4s/.2d forms are outside that
// let-block.
let Predicates = [HasNEON, HasFullFP16] in {
def : InstAlias<"{fcmle\t$dst.4h, $src1.4h, $src2.4h" #
"|fcmle.4h\t$dst, $src1, $src2}",
(FCMGEv4f16 V64:$dst, V64:$src2, V64:$src1), 0>;
def : InstAlias<"{fcmle\t$dst.8h, $src1.8h, $src2.8h" #
"|fcmle.8h\t$dst, $src1, $src2}",
(FCMGEv8f16 V128:$dst, V128:$src2, V128:$src1), 0>;
}
def : InstAlias<"{fcmle\t$dst.2s, $src1.2s, $src2.2s" #
"|fcmle.2s\t$dst, $src1, $src2}",
(FCMGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
def : InstAlias<"{fcmle\t$dst.4s, $src1.4s, $src2.4s" #
"|fcmle.4s\t$dst, $src1, $src2}",
(FCMGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
def : InstAlias<"{fcmle\t$dst.2d, $src1.2d, $src2.2d" #
"|fcmle.2d\t$dst, $src1, $src2}",
(FCMGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
// "fcmlt" vector aliases: each maps to the FCMGT record of the same
// arrangement with $src1 and $src2 exchanged in the result dag.
// The half-precision (.4h/.8h) forms are only declared under
// Predicates = [HasNEON, HasFullFP16].
let Predicates = [HasNEON, HasFullFP16] in {
def : InstAlias<"{fcmlt\t$dst.4h, $src1.4h, $src2.4h" #
"|fcmlt.4h\t$dst, $src1, $src2}",
(FCMGTv4f16 V64:$dst, V64:$src2, V64:$src1), 0>;
def : InstAlias<"{fcmlt\t$dst.8h, $src1.8h, $src2.8h" #
"|fcmlt.8h\t$dst, $src1, $src2}",
(FCMGTv8f16 V128:$dst, V128:$src2, V128:$src1), 0>;
}
def : InstAlias<"{fcmlt\t$dst.2s, $src1.2s, $src2.2s" #
"|fcmlt.2s\t$dst, $src1, $src2}",
(FCMGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
def : InstAlias<"{fcmlt\t$dst.4s, $src1.4s, $src2.4s" #
"|fcmlt.4s\t$dst, $src1, $src2}",
(FCMGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
def : InstAlias<"{fcmlt\t$dst.2d, $src1.2d, $src2.2d" #
"|fcmlt.2d\t$dst, $src1, $src2}",
(FCMGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
// "facle" vector aliases: each maps to the FACGE record of the same
// arrangement with $src1 and $src2 exchanged in the result dag.
// The half-precision (.4h/.8h) forms are only declared under
// Predicates = [HasNEON, HasFullFP16].
let Predicates = [HasNEON, HasFullFP16] in {
def : InstAlias<"{facle\t$dst.4h, $src1.4h, $src2.4h" #
"|facle.4h\t$dst, $src1, $src2}",
(FACGEv4f16 V64:$dst, V64:$src2, V64:$src1), 0>;
def : InstAlias<"{facle\t$dst.8h, $src1.8h, $src2.8h" #
"|facle.8h\t$dst, $src1, $src2}",
(FACGEv8f16 V128:$dst, V128:$src2, V128:$src1), 0>;
}
def : InstAlias<"{facle\t$dst.2s, $src1.2s, $src2.2s" #
"|facle.2s\t$dst, $src1, $src2}",
(FACGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
def : InstAlias<"{facle\t$dst.4s, $src1.4s, $src2.4s" #
"|facle.4s\t$dst, $src1, $src2}",
(FACGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
def : InstAlias<"{facle\t$dst.2d, $src1.2d, $src2.2d" #
"|facle.2d\t$dst, $src1, $src2}",
(FACGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
// "faclt" vector aliases: each maps to the FACGT record of the same
// arrangement with $src1 and $src2 exchanged in the result dag.
// The half-precision (.4h/.8h) forms are only declared under
// Predicates = [HasNEON, HasFullFP16].
let Predicates = [HasNEON, HasFullFP16] in {
def : InstAlias<"{faclt\t$dst.4h, $src1.4h, $src2.4h" #
"|faclt.4h\t$dst, $src1, $src2}",
(FACGTv4f16 V64:$dst, V64:$src2, V64:$src1), 0>;
def : InstAlias<"{faclt\t$dst.8h, $src1.8h, $src2.8h" #
"|faclt.8h\t$dst, $src1, $src2}",
(FACGTv8f16 V128:$dst, V128:$src2, V128:$src1), 0>;
}
def : InstAlias<"{faclt\t$dst.2s, $src1.2s, $src2.2s" #
"|faclt.2s\t$dst, $src1, $src2}",
(FACGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
def : InstAlias<"{faclt\t$dst.4s, $src1.4s, $src2.4s" #
"|faclt.4s\t$dst, $src1, $src2}",
(FACGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
def : InstAlias<"{faclt\t$dst.2d, $src1.2d, $src2.2d" #
"|faclt.2d\t$dst, $src1, $src2}",
(FACGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
//===----------------------------------------------------------------------===//
// Advanced SIMD three scalar instructions.
//===----------------------------------------------------------------------===//
// Scalar three-operand integer add, compares and bit test, all instantiated
// through SIMDThreeScalarD. Each line passes two numeric arguments, the
// assembly mnemonic, and the node to select from (add or an AArch64cm* node).
// NOTE(review): the numeric arguments are presumably encoding bits — confirm
// in the SIMDThreeScalarD definition, which is not part of this section.
defm ADD : SIMDThreeScalarD<0, 0b10000, "add", add>;
defm CMEQ : SIMDThreeScalarD<1, 0b10001, "cmeq", AArch64cmeq>;
defm CMGE : SIMDThreeScalarD<0, 0b00111, "cmge", AArch64cmge>;
defm CMGT : SIMDThreeScalarD<0, 0b00110, "cmgt", AArch64cmgt>;
defm CMHI : SIMDThreeScalarD<1, 0b00110, "cmhi", AArch64cmhi>;
defm CMHS : SIMDThreeScalarD<1, 0b00111, "cmhs", AArch64cmhs>;
defm CMTST : SIMDThreeScalarD<0, 0b10001, "cmtst", AArch64cmtst>;
// Scalar FABD, instantiated with the int_aarch64_sisd_fabd intrinsic.
defm FABD : SIMDFPThreeScalar<1, 1, 0b010, "fabd", int_aarch64_sisd_fabd>;
// The v1f64 form of int_aarch64_neon_fabd also selects FABD64.
def : Pat<(v1f64 (int_aarch64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
(FABD64 FPR64:$Rn, FPR64:$Rm)>;
// fabs(fsub(a, b)) is selected as a single FABD; the f16 form is only
// available under HasFullFP16.
let Predicates = [HasFullFP16] in {
def : Pat<(fabs (fsub f16:$Rn, f16:$Rm)), (FABD16 f16:$Rn, f16:$Rm)>;
}
def : Pat<(fabs (fsub f32:$Rn, f32:$Rm)), (FABD32 f32:$Rn, f32:$Rm)>;
def : Pat<(fabs (fsub f64:$Rn, f64:$Rm)), (FABD64 f64:$Rn, f64:$Rm)>;
// Scalar FP absolute compares (FACGE/FACGT, selected from intrinsics) and
// FP compares (FCMEQ/FCMGE/FCMGT, selected from AArch64fcm* nodes), all via
// SIMDThreeScalarFPCmp.
defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b101, "facge",
int_aarch64_neon_facge>;
defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b101, "facgt",
int_aarch64_neon_facgt>;
defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>;
defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>;
defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>;
// Scalar FMULX / FRECPS / FRSQRTS, each selected from the intrinsic named in
// its last argument.
defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx>;
defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps>;
defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts>;
// Scalar saturating arithmetic, saturating/rounding shifts, plain shifts and
// SUB. The multiclass name suffix (BHSD, HS, D) differs per instruction;
// NOTE(review): presumably it names the element sizes that get a record —
// confirm in the multiclass definitions. Every entry except SUB selects from
// the intrinsic named in its last argument; SUB selects from the generic sub
// node.
defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>;
defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>;
defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
defm SQRSHL : SIMDThreeScalarBHSD<0, 0b01011, "sqrshl",int_aarch64_neon_sqrshl>;
defm SQSHL : SIMDThreeScalarBHSD<0, 0b01001, "sqshl", int_aarch64_neon_sqshl>;
defm SQSUB : SIMDThreeScalarBHSD<0, 0b00101, "sqsub", int_aarch64_neon_sqsub>;
defm SRSHL : SIMDThreeScalarD< 0, 0b01010, "srshl", int_aarch64_neon_srshl>;
defm SSHL : SIMDThreeScalarD< 0, 0b01000, "sshl", int_aarch64_neon_sshl>;
defm SUB : SIMDThreeScalarD< 1, 0b10000, "sub", sub>;
defm UQADD : SIMDThreeScalarBHSD<1, 0b00001, "uqadd", int_aarch64_neon_uqadd>;
defm UQRSHL : SIMDThreeScalarBHSD<1, 0b01011, "uqrshl",int_aarch64_neon_uqrshl>;
defm UQSHL : SIMDThreeScalarBHSD<1, 0b01001, "uqshl", int_aarch64_neon_uqshl>;
defm UQSUB : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_aarch64_neon_uqsub>;
defm URSHL : SIMDThreeScalarD< 1, 0b01010, "urshl", int_aarch64_neon_urshl>;
defm USHL : SIMDThreeScalarD< 1, 0b01000, "ushl", int_aarch64_neon_ushl>;
// Scalar SQRDMLAH / SQRDMLSH and their selection patterns, all gated on the
// HasRDM predicate.
let Predicates = [HasRDM] in {
defm SQRDMLAH : SIMDThreeScalarHSTied<1, 0, 0b10000, "sqrdmlah">;
defm SQRDMLSH : SIMDThreeScalarHSTied<1, 0, 0b10001, "sqrdmlsh">;
// i32: sqadd(Rd, sqrdmulh(Rn, Rm)) selects SQRDMLAHv1i32.
def : Pat<(i32 (int_aarch64_neon_sqadd
(i32 FPR32:$Rd),
(i32 (int_aarch64_neon_sqrdmulh (i32 FPR32:$Rn),
(i32 FPR32:$Rm))))),
(SQRDMLAHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>;
// i32: sqsub(Rd, sqrdmulh(Rn, Rm)) selects SQRDMLSHv1i32.
def : Pat<(i32 (int_aarch64_neon_sqsub
(i32 FPR32:$Rd),
(i32 (int_aarch64_neon_sqrdmulh (i32 FPR32:$Rn),
(i32 FPR32:$Rm))))),
(SQRDMLSHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>;
}
// Scalar compare aliases. Each mnemonic maps to the opposite-direction record
// with $src1 and $src2 exchanged in the result dag:
//   cmls -> CMHS, cmle -> CMGE, cmlo -> CMHI, cmlt -> CMGT   (FPR64 only)
//   fcmle -> FCMGE, fcmlt -> FCMGT, facle -> FACGE, faclt -> FACGT
//                                                  (FPR32 and FPR64 forms)
// NOTE(review): no FPR16 forms of the FP aliases are declared here — confirm
// whether that is intentional.
def : InstAlias<"cmls $dst, $src1, $src2",
(CMHSv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
def : InstAlias<"cmle $dst, $src1, $src2",
(CMGEv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
def : InstAlias<"cmlo $dst, $src1, $src2",
(CMHIv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
def : InstAlias<"cmlt $dst, $src1, $src2",
(CMGTv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
def : InstAlias<"fcmle $dst, $src1, $src2",
(FCMGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>;
def : InstAlias<"fcmle $dst, $src1, $src2",
(FCMGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
def : InstAlias<"fcmlt $dst, $src1, $src2",
(FCMGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>;
def : InstAlias<"fcmlt $dst, $src1, $src2",
(FCMGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
def : InstAlias<"facle $dst, $src1, $src2",
(FACGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>;
def : InstAlias<"facle $dst, $src1, $src2",
(FACGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
def : InstAlias<"faclt $dst, $src1, $src2",
(FACGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>;
def : InstAlias<"faclt $dst, $src1, $src2",
(FACGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
//===----------------------------------------------------------------------===//
// Advanced SIMD three scalar instructions (mixed operands).
//===----------------------------------------------------------------------===//
// Scalar SQDMULL (selected from int_aarch64_neon_sqdmulls_scalar) and the
// tied accumulate forms SQDMLAL / SQDMLSL, which take no selection node here.
defm SQDMULL : SIMDThreeScalarMixedHS<0, 0b11010, "sqdmull",
int_aarch64_neon_sqdmulls_scalar>;
defm SQDMLAL : SIMDThreeScalarMixedTiedHS<0, 0b10010, "sqdmlal">;
defm SQDMLSL : SIMDThreeScalarMixedTiedHS<0, 0b10110, "sqdmlsl">;
// i64 sqadd(Rd, sqdmulls_scalar(i32 Rn, i32 Rm)) selects SQDMLALi32.
def : Pat<(i64 (int_aarch64_neon_sqadd (i64 FPR64:$Rd),
(i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
(i32 FPR32:$Rm))))),
(SQDMLALi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
// i64 sqsub(Rd, sqdmulls_scalar(i32 Rn, i32 Rm)) selects SQDMLSLi32.
def : Pat<(i64 (int_aarch64_neon_sqsub (i64 FPR64:$Rd),
(i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
(i32 FPR32:$Rm))))),
(SQDMLSLi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
//===----------------------------------------------------------------------===//
// Advanced SIMD two scalar instructions.
//===----------------------------------------------------------------------===//
// Scalar ABS plus the single-source compare forms. The integer compares select
// from AArch64cm*z nodes and the FP compares from AArch64fcm*z nodes.
// NOTE(review): the "z" suffix presumably means compare-against-zero — confirm
// against the node definitions.
defm ABS : SIMDTwoScalarD< 0, 0b01011, "abs", abs>;
defm CMEQ : SIMDCmpTwoScalarD< 0, 0b01001, "cmeq", AArch64cmeqz>;
defm CMGE : SIMDCmpTwoScalarD< 1, 0b01000, "cmge", AArch64cmgez>;
defm CMGT : SIMDCmpTwoScalarD< 0, 0b01000, "cmgt", AArch64cmgtz>;
defm CMLE : SIMDCmpTwoScalarD< 1, 0b01001, "cmle", AArch64cmlez>;
defm CMLT : SIMDCmpTwoScalarD< 0, 0b01010, "cmlt", AArch64cmltz>;
defm FCMEQ : SIMDFPCmpTwoScalar<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>;
defm FCMGE : SIMDFPCmpTwoScalar<1, 1, 0b01100, "fcmge", AArch64fcmgez>;
defm FCMGT : SIMDFPCmpTwoScalar<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>;
defm FCMLE : SIMDFPCmpTwoScalar<1, 1, 0b01101, "fcmle", AArch64fcmlez>;
defm FCMLT : SIMDFPCmpTwoScalar<0, 1, 0b01110, "fcmlt", AArch64fcmltz>;
// Scalar FP convert and estimate instructions. None of these instantiations
// passes a selection node; the patterns that select them are written as
// separate Pat records.
defm FCVTAS : SIMDFPTwoScalar< 0, 0, 0b11100, "fcvtas">;
defm FCVTAU : SIMDFPTwoScalar< 1, 0, 0b11100, "fcvtau">;
defm FCVTMS : SIMDFPTwoScalar< 0, 0, 0b11011, "fcvtms">;
defm FCVTMU : SIMDFPTwoScalar< 1, 0, 0b11011, "fcvtmu">;
defm FCVTNS : SIMDFPTwoScalar< 0, 0, 0b11010, "fcvtns">;
defm FCVTNU : SIMDFPTwoScalar< 1, 0, 0b11010, "fcvtnu">;
defm FCVTPS : SIMDFPTwoScalar< 0, 1, 0b11010, "fcvtps">;
defm FCVTPU : SIMDFPTwoScalar< 1, 1, 0b11010, "fcvtpu">;
// FCVTXN is a single def (not a defm) using its own instruction class.
def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">;
defm FCVTZS : SIMDFPTwoScalar< 0, 1, 0b11011, "fcvtzs">;
defm FCVTZU : SIMDFPTwoScalar< 1, 1, 0b11011, "fcvtzu">;
defm FRECPE : SIMDFPTwoScalar< 0, 1, 0b11101, "frecpe">;
defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx">;
defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte">;
// Scalar NEG is selected from (sub immAllZerosV, x), expressed as a UnOpFrag.
defm NEG : SIMDTwoScalarD< 1, 0b01011, "neg",
UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
// Integer-to-FP converts (SCVTF/UCVTF select from AArch64sitof/AArch64uitof),
// saturating abs/neg, saturating narrows, and the tied saturating accumulates
// (SUQADD/USQADD); the rest select from the intrinsic named in the last
// argument.
defm SCVTF : SIMDFPTwoScalarCVT< 0, 0, 0b11101, "scvtf", AArch64sitof>;
defm SQABS : SIMDTwoScalarBHSD< 0, 0b00111, "sqabs", int_aarch64_neon_sqabs>;
defm SQNEG : SIMDTwoScalarBHSD< 1, 0b00111, "sqneg", int_aarch64_neon_sqneg>;
defm SQXTN : SIMDTwoScalarMixedBHS< 0, 0b10100, "sqxtn", int_aarch64_neon_scalar_sqxtn>;
defm SQXTUN : SIMDTwoScalarMixedBHS< 1, 0b10010, "sqxtun", int_aarch64_neon_scalar_sqxtun>;
defm SUQADD : SIMDTwoScalarBHSDTied< 0, 0b00011, "suqadd",
int_aarch64_neon_suqadd>;
defm UCVTF : SIMDFPTwoScalarCVT< 1, 0, 0b11101, "ucvtf", AArch64uitof>;
defm UQXTN : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_aarch64_neon_scalar_uqxtn>;
defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd",
int_aarch64_neon_usqadd>;
// AArch64neg on v1i64 selects the scalar NEGv1i64.
def : Pat<(AArch64neg (v1i64 V64:$Rn)), (NEGv1i64 V64:$Rn)>;
// v1f64 -> v1i64 FP-to-integer convert intrinsics select the scalar
// FCVT{A,M,N,P}{S,U}v1i64 records, one pattern per intrinsic.
def : Pat<(v1i64 (int_aarch64_neon_fcvtas (v1f64 FPR64:$Rn))),
(FCVTASv1i64 FPR64:$Rn)>;
def : Pat<(v1i64 (int_aarch64_neon_fcvtau (v1f64 FPR64:$Rn))),
(FCVTAUv1i64 FPR64:$Rn)>;
def : Pat<(v1i64 (int_aarch64_neon_fcvtms (v1f64 FPR64:$Rn))),
(FCVTMSv1i64 FPR64:$Rn)>;
def : Pat<(v1i64 (int_aarch64_neon_fcvtmu (v1f64 FPR64:$Rn))),
(FCVTMUv1i64 FPR64:$Rn)>;
def : Pat<(v1i64 (int_aarch64_neon_fcvtns (v1f64 FPR64:$Rn))),
(FCVTNSv1i64 FPR64:$Rn)>;
def : Pat<(v1i64 (int_aarch64_neon_fcvtnu (v1f64 FPR64:$Rn))),
(FCVTNUv1i64 FPR64:$Rn)>;
def : Pat<(v1i64 (int_aarch64_neon_fcvtps (v1f64 FPR64:$Rn))),
(FCVTPSv1i64 FPR64:$Rn)>;
def : Pat<(v1i64 (int_aarch64_neon_fcvtpu (v1f64 FPR64:$Rn))),
(FCVTPUv1i64 FPR64:$Rn)>;
// FRECPE selection from the int_aarch64_neon_frecpe intrinsic for f16, f32,
// f64 and v1f64 (f64 and v1f64 both use FRECPEv1i64).
// NOTE(review): the f16 pattern has no HasFullFP16 predicate visible in this
// section — confirm that the instruction record itself carries it.
def : Pat<(f16 (int_aarch64_neon_frecpe (f16 FPR16:$Rn))),
(FRECPEv1f16 FPR16:$Rn)>;
def : Pat<(f32 (int_aarch64_neon_frecpe (f32 FPR32:$Rn))),
(FRECPEv1i32 FPR32:$Rn)>;
def : Pat<(f64 (int_aarch64_neon_frecpe (f64 FPR64:$Rn))),
(FRECPEv1i64 FPR64:$Rn)>;
def : Pat<(v1f64 (int_aarch64_neon_frecpe (v1f64 FPR64:$Rn))),
(FRECPEv1i64 FPR64:$Rn)>;
// FRECPE selection from the AArch64frecpe node for scalar and vector
// f32/f64 types.
def : Pat<(f32 (AArch64frecpe (f32 FPR32:$Rn))),
(FRECPEv1i32 FPR32:$Rn)>;
def : Pat<(v2f32 (AArch64frecpe (v2f32 V64:$Rn))),
(FRECPEv2f32 V64:$Rn)>;
def : Pat<(v4f32 (AArch64frecpe (v4f32 FPR128:$Rn))),
(FRECPEv4f32 FPR128:$Rn)>;
def : Pat<(f64 (AArch64frecpe (f64 FPR64:$Rn))),
(FRECPEv1i64 FPR64:$Rn)>;
def : Pat<(v1f64 (AArch64frecpe (v1f64 FPR64:$Rn))),
(FRECPEv1i64 FPR64:$Rn)>;
def : Pat<(v2f64 (AArch64frecpe (v2f64 FPR128:$Rn))),
(FRECPEv2f64 FPR128:$Rn)>;
// FRECPS selection from the two-operand AArch64frecps node: scalar f32/f64
// use FRECPS32/FRECPS64, vectors use the FRECPSv* record of the same type.
def : Pat<(f32 (AArch64frecps (f32 FPR32:$Rn), (f32 FPR32:$Rm))),
(FRECPS32 FPR32:$Rn, FPR32:$Rm)>;
def : Pat<(v2f32 (AArch64frecps (v2f32 V64:$Rn), (v2f32 V64:$Rm))),
(FRECPSv2f32 V64:$Rn, V64:$Rm)>;
def : Pat<(v4f32 (AArch64frecps (v4f32 FPR128:$Rn), (v4f32 FPR128:$Rm))),
(FRECPSv4f32 FPR128:$Rn, FPR128:$Rm)>;
def : Pat<(f64 (AArch64frecps (f64 FPR64:$Rn), (f64 FPR64:$Rm))),
(FRECPS64 FPR64:$Rn, FPR64:$Rm)>;
def : Pat<(v2f64 (AArch64frecps (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
(FRECPSv2f64 FPR128:$Rn, FPR128:$Rm)>;
// FRECPX selection from the int_aarch64_neon_frecpx intrinsic for scalar
// f16, f32 and f64.
// NOTE(review): the f16 pattern has no HasFullFP16 predicate visible in this
// section — confirm that the instruction record itself carries it.
def : Pat<(f16 (int_aarch64_neon_frecpx (f16 FPR16:$Rn))),
(FRECPXv1f16 FPR16:$Rn)>;
def : Pat<(f32 (int_aarch64_neon_frecpx (f32 FPR32:$Rn))),
(FRECPXv1i32 FPR32:$Rn)>;
def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))),
(FRECPXv1i64 FPR64:$Rn)>;
// FRSQRTE selection from the int_aarch64_neon_frsqrte intrinsic for f16, f32,
// f64 and v1f64 (f64 and v1f64 both use FRSQRTEv1i64).
// NOTE(review): the f16 pattern has no HasFullFP16 predicate visible in this
// section — confirm that the instruction record itself carries it.
def : Pat<(f16 (int_aarch64_neon_frsqrte (f16 FPR16:$Rn))),
(FRSQRTEv1f16 FPR16:$Rn)>;
def : Pat<(f32 (int_aarch64_neon_frsqrte (f32 FPR32:$Rn))),
(FRSQRTEv1i32 FPR32:$Rn)>;
def : Pat<(f64 (int_aarch64_neon_frsqrte (f64 FPR64:$Rn))),
(FRSQRTEv1i64 FPR64:$Rn)>;
def : Pat<(v1f64 (int_aarch64_neon_frsqrte (v1f64 FPR64:$Rn))),
(FRSQRTEv1i64 FPR64:$Rn)>;
// FRSQRTE selection from the AArch64frsqrte node for scalar and vector
// f32/f64 types.
def : Pat<(f32 (AArch64frsqrte (f32 FPR32:$Rn))),
(FRSQRTEv1i32 FPR32:$Rn)>;
def : Pat<(v2f32 (AArch64frsqrte (v2f32 V64:$Rn))),
(FRSQRTEv2f32 V64:$Rn)>;
def : Pat<(v4f32 (AArch64frsqrte (v4f32 FPR128:$Rn))),
(FRSQRTEv4f32 FPR128:$Rn)>;
def : Pat<(f64 (AArch64frsqrte (f64 FPR64:$Rn))),
(FRSQRTEv1i64 FPR64:$Rn)>;
def : Pat<(v1f64 (AArch64frsqrte (v1f64 FPR64:$Rn))),
(FRSQRTEv1i64 FPR64:$Rn)>;
def : Pat<(v2f64 (AArch64frsqrte (v2f64 FPR128:$Rn))),
(FRSQRTEv2f64 FPR128:$Rn)>;
// FRSQRTS selection from the two-operand AArch64frsqrts node: scalar f32/f64
// use FRSQRTS32/FRSQRTS64, vectors use the FRSQRTSv* record of the same type.
def : Pat<(f32 (AArch64frsqrts (f32 FPR32:$Rn), (f32 FPR32:$Rm))),
(FRSQRTS32 FPR32:$Rn, FPR32:$Rm)>;
def : Pat<(v2f32 (AArch64frsqrts (v2f32 V64:$Rn), (v2f32 V64:$Rm))),
(FRSQRTSv2f32 V64:$Rn, V64:$Rm)>;
def : Pat<(v4f32 (AArch64frsqrts (v4f32 FPR128:$Rn), (v4f32 FPR128:$Rm))),
(FRSQRTSv4f32 FPR128:$Rn, FPR128:$Rm)>;
def : Pat<(f64 (AArch64frsqrts (f64 FPR64:$Rn), (f64 FPR64:$Rm))),
(FRSQRTS64 FPR64:$Rn, FPR64:$Rm)>;
def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
(FRSQRTSv2f64 FPR128:$Rn, FPR128:$Rm)>;
// If an integer is about to be converted to a floating point value,
// just load it on the floating point unit.
// Here are the patterns for 8 and 16-bits to float.
// 8-bits -> float.
// Selects uint_to_fp of a register-offset load as a load straight into an
// FP/SIMD subregister followed by UCVTF, for both offset-register widths.
//   DstTy  - floating point result type (also the type of the IMPLICIT_DEF
//            the loaded value is inserted into).
//   SrcTy  - integer type produced by the load.
//   loadop - load node to match.
//   UCVTF  - conversion instruction applied to the loaded register.
//   ro     - register-offset addressing mode bundle (supplies Wpat/Xpat and
//            Wext/Xext).
//   LDRW   - load instruction taking a GPR32 offset register.
//   LDRX   - load instruction taking a GPR64 offset register.
//   sub    - subregister index the loaded value occupies.
multiclass UIntToFPROLoadPat<ValueType DstTy, ValueType SrcTy,
                             SDPatternOperator loadop, Instruction UCVTF,
                             ROAddrMode ro, Instruction LDRW, Instruction LDRX,
                             SubRegIndex sub> {
  // Offset held in a 32-bit register.
  def : Pat<(DstTy (uint_to_fp (SrcTy
                     (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm,
                                      ro.Wext:$extend))))),
            (UCVTF (INSERT_SUBREG (DstTy (IMPLICIT_DEF)),
                                  (LDRW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend),
                                  sub))>;
  // Offset held in a 64-bit register. The extend operand is typed ro.Xext on
  // both the matched dag and the result so the two sides agree (the W form
  // above uses ro.Wext on both sides).
  def : Pat<(DstTy (uint_to_fp (SrcTy
                     (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm,
                                      ro.Xext:$extend))))),
            (UCVTF (INSERT_SUBREG (DstTy (IMPLICIT_DEF)),
                                  (LDRX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend),
                                  sub))>;
}
// zextloadi8 feeding uint_to_fp -> f32. Register-offset addressing is covered
// by UIntToFPROLoadPat (LDRBroW/LDRBroX); the two explicit patterns cover
// am_indexed8/uimm12s1 (LDRBui) and am_unscaled8/simm9 (LDURBi). In each case
// the loaded byte is inserted at bsub of an IMPLICIT_DEF f32 and converted
// with UCVTFv1i32.
defm : UIntToFPROLoadPat<f32, i32, zextloadi8,
UCVTFv1i32, ro8, LDRBroW, LDRBroX, bsub>;
def : Pat <(f32 (uint_to_fp (i32
(zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
(UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
(LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub))>;
def : Pat <(f32 (uint_to_fp (i32
(zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))),
(UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
(LDURBi GPR64sp:$Rn, simm9:$offset), bsub))>;
// 16-bits -> float.
// zextloadi16 feeding uint_to_fp -> f32. Register-offset addressing is covered
// by UIntToFPROLoadPat (LDRHroW/LDRHroX); the two explicit patterns cover
// am_indexed16/uimm12s2 (LDRHui) and am_unscaled16/simm9 (LDURHi). The loaded
// halfword is inserted at hsub of an IMPLICIT_DEF f32 and converted with
// UCVTFv1i32.
defm : UIntToFPROLoadPat<f32, i32, zextloadi16,
UCVTFv1i32, ro16, LDRHroW, LDRHroX, hsub>;
def : Pat <(f32 (uint_to_fp (i32
(zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
(UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
(LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub))>;
def : Pat <(f32 (uint_to_fp (i32
(zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))),
(UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
(LDURHi GPR64sp:$Rn, simm9:$offset), hsub))>;
// 32-bits are handled in target specific dag combine:
// performIntToFpCombine.
// Converting a 64-bit integer to a 32-bit floating point value is not
// possible with UCVTF on floating point registers (both source and
// destination must have the same size).
// Here are the patterns for 8, 16, 32, and 64-bits to double.
// 8-bits -> double.
// zextloadi8 feeding uint_to_fp -> f64. Same structure as the f32 case, but
// the byte is inserted at bsub of an IMPLICIT_DEF f64 and converted with
// UCVTFv1i64.
defm : UIntToFPROLoadPat<f64, i32, zextloadi8,
UCVTFv1i64, ro8, LDRBroW, LDRBroX, bsub>;
def : Pat <(f64 (uint_to_fp (i32
(zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
(UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
(LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub))>;
def : Pat <(f64 (uint_to_fp (i32
(zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))),
(UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
(LDURBi GPR64sp:$Rn, simm9:$offset), bsub))>;
// 16-bits -> double.
// zextloadi16 feeding uint_to_fp -> f64. The halfword is inserted at hsub of
// an IMPLICIT_DEF f64 and converted with UCVTFv1i64; register-offset,
// am_indexed16 (LDRHui) and am_unscaled16 (LDURHi) addressing are covered.
defm : UIntToFPROLoadPat<f64, i32, zextloadi16,
UCVTFv1i64, ro16, LDRHroW, LDRHroX, hsub>;
def : Pat <(f64 (uint_to_fp (i32
(zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
(UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
(LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub))>;
def : Pat <(f64 (uint_to_fp (i32
(zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))),
(UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
(LDURHi GPR64sp:$Rn, simm9:$offset), hsub))>;
// 32-bits -> double.
// Plain i32 load feeding uint_to_fp -> f64. The word is loaded with the
// S-register loads (LDRSroW/LDRSroX, LDRSui, LDURSi), inserted at ssub of an
// IMPLICIT_DEF f64 and converted with UCVTFv1i64.
defm : UIntToFPROLoadPat<f64, i32, load,
UCVTFv1i64, ro32, LDRSroW, LDRSroX, ssub>;
def : Pat <(f64 (uint_to_fp (i32
(load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
(UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
(LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub))>;
def : Pat <(f64 (uint_to_fp (i32
(load (am_unscaled32 GPR64sp:$Rn, simm9:$offset))))),
(UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
(LDURSi GPR64sp:$Rn, simm9:$offset), ssub))>;
// 64-bits -> double are handled in target specific dag combine:
// performIntToFpCombine.
//===----------------------------------------------------------------------===//
// Advanced SIMD three different-sized vector instructions.
//===----------------------------------------------------------------------===//
// Narrowing add/sub "high half" instructions (plain and R-prefixed forms) and
// PMULL, each selected from the intrinsic named in its last argument.
defm ADDHN : SIMDNarrowThreeVectorBHS<0,0b0100,"addhn", int_aarch64_neon_addhn>;
defm SUBHN : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_aarch64_neon_subhn>;
defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn>;
defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>;
defm PMULL : SIMDDifferentThreeVectorBD<0,0b1110,"pmull",int_aarch64_neon_pmull>;
// Signed long/wide three-vector instructions.
// SABAL and SABDL are both built on int_aarch64_neon_sabd.
defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal",
int_aarch64_neon_sabd>;
defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl",
int_aarch64_neon_sabd>;
// SADDL sign-extends both operands before the add; SADDW only the second.
defm SADDL : SIMDLongThreeVectorBHS< 0, 0b0000, "saddl",
BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>;
defm SADDW : SIMDWideThreeVectorBHS< 0, 0b0001, "saddw",
BinOpFrag<(add node:$LHS, (sext node:$RHS))>>;
// SMLAL/SMLSL: accumulator plus/minus int_aarch64_neon_smull of the other two
// operands.
defm SMLAL : SIMDLongThreeVectorTiedBHS<0, 0b1000, "smlal",
TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
defm SMLSL : SIMDLongThreeVectorTiedBHS<0, 0b1010, "smlsl",
TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
defm SMULL : SIMDLongThreeVectorBHS<0, 0b1100, "smull", int_aarch64_neon_smull>;
// SQDMLAL/SQDMLSL are parameterised by the saturating add/sub intrinsic.
defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal",
int_aarch64_neon_sqadd>;
defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl",
int_aarch64_neon_sqsub>;
defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull",
int_aarch64_neon_sqdmull>;
// SSUBL sign-extends both operands before the sub; SSUBW only the second.
defm SSUBL : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl",
BinOpFrag<(sub (sext node:$LHS), (sext node:$RHS))>>;
defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw",
BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>;
// Unsigned long/wide three-vector instructions; these mirror the signed
// entries with zext in place of sext and the u* intrinsics.
// NOTE(review): no UABDL instantiation appears in this run although SABDL has
// one — confirm it is defined elsewhere.
defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal",
int_aarch64_neon_uabd>;
defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl",
BinOpFrag<(add (zext node:$LHS), (zext node:$RHS))>>;
defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw",
BinOpFrag<(add node:$LHS, (zext node:$RHS))>>;
defm UMLAL : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal",
TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
defm UMLSL : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl",
TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
defm UMULL : SIMDLongThreeVectorBHS<1, 0b1100, "umull", int_aarch64_neon_umull>;
defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl",
BinOpFrag<(sub (zext node:$LHS), (zext node:$RHS))>>;
defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw",
BinOpFrag<(sub node:$LHS, (zext node:$RHS))>>;
// Additional patterns for SMULL and UMULL
// Maps a widening-multiply node onto long-multiply instructions, emitting one
// pattern per 64-bit source arrangement.
//   WidenOp - two-operand node to match.
//   Long8B  - instruction selected for v8i8 sources  (v8i16 result).
//   Long4H  - instruction selected for v4i16 sources (v4i32 result).
//   Long2S  - instruction selected for v2i32 sources (v2i64 result).
multiclass Neon_mul_widen_patterns<SDPatternOperator WidenOp,
                                   Instruction Long8B,
                                   Instruction Long4H,
                                   Instruction Long2S> {
  // v8i8 x v8i8 -> v8i16
  def : Pat<(v8i16 (WidenOp (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
            (Long8B V64:$Rn, V64:$Rm)>;
  // v4i16 x v4i16 -> v4i32
  def : Pat<(v4i32 (WidenOp (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
            (Long4H V64:$Rn, V64:$Rm)>;
  // v2i32 x v2i32 -> v2i64
  def : Pat<(v2i64 (WidenOp (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
            (Long2S V64:$Rn, V64:$Rm)>;
}
// Select SMULL / UMULL from the AArch64smull / AArch64umull nodes for the
// 8b, 4h and 2s source arrangements.
defm : Neon_mul_widen_patterns<AArch64smull, SMULLv8i8_v8i16,
SMULLv4i16_v4i32, SMULLv2i32_v2i64>;
defm : Neon_mul_widen_patterns<AArch64umull, UMULLv8i8_v8i16,
UMULLv4i16_v4i32, UMULLv2i32_v2i64>;
// Additional patterns for SMLAL/SMLSL and UMLAL/UMLSL
// Maps a three-operand accumulate node (128-bit accumulator plus two 64-bit
// sources) onto long multiply-accumulate instructions, one pattern per source
// arrangement.
//   AccOp  - three-operand node to match; operand 0 is the accumulator.
//   Acc8B  - instruction selected for v8i8 sources  (v8i16 accumulator).
//   Acc4H  - instruction selected for v4i16 sources (v4i32 accumulator).
//   Acc2S  - instruction selected for v2i32 sources (v2i64 accumulator).
multiclass Neon_mulacc_widen_patterns<SDPatternOperator AccOp,
                                      Instruction Acc8B,
                                      Instruction Acc4H,
                                      Instruction Acc2S> {
  // v8i16 accumulator, v8i8 sources
  def : Pat<(v8i16 (AccOp (v8i16 V128:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
            (Acc8B V128:$Rd, V64:$Rn, V64:$Rm)>;
  // v4i32 accumulator, v4i16 sources
  def : Pat<(v4i32 (AccOp (v4i32 V128:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
            (Acc4H V128:$Rd, V64:$Rn, V64:$Rm)>;
  // v2i64 accumulator, v2i32 sources
  def : Pat<(v2i64 (AccOp (v2i64 V128:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
            (Acc2S V128:$Rd, V64:$Rn, V64:$Rm)>;
}
// add(acc, AArch64smull/umull(a, b)) selects SMLAL/UMLAL;
// sub(acc, AArch64smull/umull(a, b)) selects SMLSL/UMLSL.
defm : Neon_mulacc_widen_patterns<
TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>,
SMLALv8i8_v8i16, SMLALv4i16_v4i32, SMLALv2i32_v2i64>;
defm : Neon_mulacc_widen_patterns<
TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>,
UMLALv8i8_v8i16, UMLALv4i16_v4i32, UMLALv2i32_v2i64>;
defm : Neon_mulacc_widen_patterns<
TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>,
SMLSLv8i8_v8i16, SMLSLv4i16_v4i32, SMLSLv2i32_v2i64>;
defm : Neon_mulacc_widen_patterns<
TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>,
UMLSLv8i8_v8i16, UMLSLv4i16_v4i32, UMLSLv2i32_v2i64>;
// Patterns for 64-bit pmull
// int_aarch64_neon_pmull64 on two V64 operands selects PMULLv1i64.
def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm),
(PMULLv1i64 V64:$Rn, V64:$Rm)>;
// When both operands are lane 1 extracted from v2i64 registers, PMULLv2i64 is
// selected on the whole 128-bit registers, so no separate extract is emitted.
def : Pat<(int_aarch64_neon_pmull64 (extractelt (v2i64 V128:$Rn), (i64 1)),
(extractelt (v2i64 V128:$Rm), (i64 1))),
(PMULLv2i64 V128:$Rn, V128:$Rm)>;
// CodeGen patterns for addhn and subhn instructions, which can actually be
// written in LLVM IR without too much difficulty.
// ADDHN
// trunc(AArch64vlshr(add(Rn, Rm), k)) with k = 8/16/32 for v8i16/v4i32/v2i64
// sources selects the 64-bit-result ADDHN record of the matching size.
def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm), (i32 8))))),
(ADDHNv8i16_v8i8 V128:$Rn, V128:$Rm)>;
def : Pat<(v4i16 (trunc (v4i32 (AArch64vlshr (add V128:$Rn, V128:$Rm),
(i32 16))))),
(ADDHNv4i32_v4i16 V128:$Rn, V128:$Rm)>;
def : Pat<(v2i32 (trunc (v2i64 (AArch64vlshr (add V128:$Rn, V128:$Rm),
(i32 32))))),
(ADDHNv2i64_v2i32 V128:$Rn, V128:$Rm)>;
// concat_vectors(Rd, <the same narrowing expression>) selects the
// 128-bit-result record (_v16i8/_v8i16/_v4i32). Rd is widened with
// SUBREG_TO_REG at dsub and passed as the first operand.
def : Pat<(concat_vectors (v8i8 V64:$Rd),
(trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm),
(i32 8))))),
(ADDHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
V128:$Rn, V128:$Rm)>;
def : Pat<(concat_vectors (v4i16 V64:$Rd),
(trunc (v4i32 (AArch64vlshr (add V128:$Rn, V128:$Rm),
(i32 16))))),
(ADDHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
V128:$Rn, V128:$Rm)>;
def : Pat<(concat_vectors (v2i32 V64:$Rd),
(trunc (v2i64 (AArch64vlshr (add V128:$Rn, V128:$Rm),
(i32 32))))),
(ADDHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
V128:$Rn, V128:$Rm)>;
// SUBHN
// trunc(AArch64vlshr(sub(Rn, Rm), k)) with k = 8/16/32 for v8i16/v4i32/v2i64
// sources selects the 64-bit-result SUBHN record of the matching size.
def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (sub V128:$Rn, V128:$Rm), (i32 8))))),
(SUBHNv8i16_v8i8 V128:$Rn, V128:$Rm)>;
def : Pat<(v4i16 (trunc (v4i32 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
(i32 16))))),
(SUBHNv4i32_v4i16 V128:$Rn, V128:$Rm)>;
def : Pat<(v2i32 (trunc (v2i64 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
(i32 32))))),
(SUBHNv2i64_v2i32 V128:$Rn, V128:$Rm)>;
// concat_vectors(Rd, <the same narrowing expression>) selects the
// 128-bit-result record (_v16i8/_v8i16/_v4i32). Rd is widened with
// SUBREG_TO_REG at dsub and passed as the first operand.
def : Pat<(concat_vectors (v8i8 V64:$Rd),
(trunc (v8i16 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
(i32 8))))),
(SUBHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
V128:$Rn, V128:$Rm)>;
def : Pat<(concat_vectors (v4i16 V64:$Rd),
(trunc (v4i32 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
(i32 16))))),
(SUBHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
V128:$Rn, V128:$Rm)>;
def : Pat<(concat_vectors (v2i32 V64:$Rd),
(trunc (v2i64 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
(i32 32))))),
(SUBHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
V128:$Rn, V128:$Rm)>;
//----------------------------------------------------------------------------
// AdvSIMD bitwise extract from vector instruction.
//----------------------------------------------------------------------------
// EXT instruction records; the patterns in this file refer to the resulting
// EXTv8i8 and EXTv16i8 records.
defm EXT : SIMDBitwiseExtract<"ext">;
// Immediate transform: rebuilds the matched constant as an i32 target constant
// whose value is the original zero-extended value plus 8.
def AdjustExtImm : SDNodeXForm<imm, [{
  const uint64_t Adjusted = 8 + N->getZExtValue();
  return CurDAG->getTargetConstant(Adjusted, SDLoc(N), MVT::i32);
}]>;
// Selection patterns for AArch64ext and for extracting the upper half of a
// 128-bit vector, instantiated once per element type.
//   VecTy64   - 64-bit vector type.
//   VecTy128  - 128-bit vector type with the same element type.
//   HiHalfIdx - element index at which the upper 64-bit half of VecTy128
//               starts.
multiclass ExtPat<ValueType VecTy64, ValueType VecTy128, int HiHalfIdx> {
  // Direct mappings: 64-bit operands use EXTv8i8, 128-bit operands EXTv16i8.
  def : Pat<(VecTy64 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
            (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
  def : Pat<(VecTy128 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
            (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;

  // Copying the upper 64 bits of a 128-bit vector (extract_subvector at
  // HiHalfIdx) is done with an EXT of the register against itself by 8 bytes,
  // keeping the low half of the result.
  def : Pat<(VecTy64 (extract_subvector V128:$Rn, (i64 HiHalfIdx))),
            (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;

  // A 64-bit EXT whose operands are the low and high halves of one 128-bit
  // register collapses into a single 128-bit EXT of that register with
  // itself.
  def : Pat<(VecTy64 (AArch64ext (extract_subvector V128:$Rn, (i64 0)),
                                 (extract_subvector V128:$Rn, (i64 HiHalfIdx)),
                                 (i32 imm:$imm))),
            (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, imm:$imm), dsub)>;

  // A 64-bit EXT whose first operand is the high half of a 128-bit register
  // uses a 128-bit EXT of the whole register with the immediate adjusted by
  // AdjustExtImm. The top half of the second operand is left unset, which is
  // harmless because it is never read.
  def : Pat<(VecTy64 (AArch64ext (extract_subvector V128:$Rn, (i64 HiHalfIdx)),
                                 V64:$Rm,
                                 (i32 imm:$imm))),
            (EXTRACT_SUBREG (EXTv16i8 V128:$Rn,
                                      (SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
                                      (AdjustExtImm imm:$imm)), dsub)>;
}
// Instantiate ExtPat for every 64-/128-bit vector type pair. The last argument
// is the element index of the upper half: half the 128-bit type's lane count.
defm : ExtPat<v8i8, v16i8, 8>;
defm : ExtPat<v4i16, v8i16, 4>;
defm : ExtPat<v4f16, v8f16, 4>;
defm : ExtPat<v2i32, v4i32, 2>;
defm : ExtPat<v2f32, v4f32, 2>;
defm : ExtPat<v1i64, v2i64, 1>;
defm : ExtPat<v1f64, v2f64, 1>;
//----------------------------------------------------------------------------
// AdvSIMD zip vector
//----------------------------------------------------------------------------
// TRN/UZP/ZIP permutes, each instantiated via SIMDZipVector with a 3-bit
// value, the mnemonic, and the AArch64 node of the same name to select from.
defm TRN1 : SIMDZipVector<0b010, "trn1", AArch64trn1>;
defm TRN2 : SIMDZipVector<0b110, "trn2", AArch64trn2>;
defm UZP1 : SIMDZipVector<0b001, "uzp1", AArch64uzp1>;
defm UZP2 : SIMDZipVector<0b101, "uzp2", AArch64uzp2>;
defm ZIP1 : SIMDZipVector<0b011, "zip1", AArch64zip1>;
defm ZIP2 : SIMDZipVector<0b111, "zip2", AArch64zip2>;
//----------------------------------------------------------------------------
// AdvSIMD TBL/TBX instructions
//----------------------------------------------------------------------------
// Table lookup instruction records; TBX uses the tied variant of the
// multiclass.
defm TBL : SIMDTableLookup< 0, "tbl">;
defm TBX : SIMDTableLookupTied<1, "tbx">;
// Single-register-table intrinsics map onto the "One" records.
// NOTE(review): the v16i8 patterns name the first (table) operand $Ri and the
// second $Rn, the reverse of the v8i8 patterns; operand order is the same, only
// the names differ.
def : Pat<(v8i8 (int_aarch64_neon_tbl1 (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))),
(TBLv8i8One VecListOne128:$Rn, V64:$Ri)>;
def : Pat<(v16i8 (int_aarch64_neon_tbl1 (v16i8 V128:$Ri), (v16i8 V128:$Rn))),
(TBLv16i8One V128:$Ri, V128:$Rn)>;
def : Pat<(v8i8 (int_aarch64_neon_tbx1 (v8i8 V64:$Rd),
(v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))),
(TBXv8i8One V64:$Rd, VecListOne128:$Rn, V64:$Ri)>;
def : Pat<(v16i8 (int_aarch64_neon_tbx1 (v16i8 V128:$Rd),
(v16i8 V128:$Ri), (v16i8 V128:$Rn))),
(TBXv16i8One V128:$Rd, V128:$Ri, V128:$Rn)>;
//----------------------------------------------------------------------------
// AdvSIMD scalar CPY instruction
//----------------------------------------------------------------------------
// Scalar CPY instruction records; only the mnemonic is supplied here.
defm CPY : SIMDScalarCPY<"cpy">;
//----------------------------------------------------------------------------
// AdvSIMD scalar pairwise instructions
//----------------------------------------------------------------------------
// Scalar pairwise instruction records. No selection node is passed here; the
// patterns selecting them are written as separate Pat records.
defm ADDP : SIMDPairwiseScalarD<0, 0b11011, "addp">;
defm FADDP : SIMDFPPairwiseScalar<0, 0b01101, "faddp">;
defm FMAXNMP : SIMDFPPairwiseScalar<0, 0b01100, "fmaxnmp">;
defm FMAXP : SIMDFPPairwiseScalar<0, 0b01111, "fmaxp">;
defm FMINNMP : SIMDFPPairwiseScalar<1, 0b01100, "fminnmp">;
defm FMINP : SIMDFPPairwiseScalar<1, 0b01111, "fminp">;
// v2i64 AArch64saddv and AArch64uaddv both select ADDPv2i64p; its result is
// inserted at dsub of an IMPLICIT_DEF v2i64 to form the vector-typed result.
def : Pat<(v2i64 (AArch64saddv V128:$Rn)),
(INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (ADDPv2i64p V128:$Rn), dsub)>;
def : Pat<(v2i64 (AArch64uaddv V128:$Rn)),
(INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (ADDPv2i64p V128:$Rn), dsub)>;
// Across-vector FP reduction intrinsics on two-element vectors select the
// scalar pairwise instruction directly (v2f32 -> *v2i32p, v2f64 -> *v2i64p).
def : Pat<(f32 (int_aarch64_neon_faddv (v2f32 V64:$Rn))),
(FADDPv2i32p V64:$Rn)>;
// v4f32 faddv takes two steps: a vector FADDP of Rn with itself, then the
// scalar pairwise add on the low 64 bits (dsub) of that result.
def : Pat<(f32 (int_aarch64_neon_faddv (v4f32 V128:$Rn))),
(FADDPv2i32p (EXTRACT_SUBREG (FADDPv4f32 V128:$Rn, V128:$Rn), dsub))>;
def : Pat<(f64 (int_aarch64_neon_faddv (v2f64 V128:$Rn))),
(FADDPv2i64p V128:$Rn)>;
def : Pat<(f32 (int_aarch64_neon_fmaxnmv (v2f32 V64:$Rn))),
(FMAXNMPv2i32p V64:$Rn)>;
def : Pat<(f64 (int_aarch64_neon_fmaxnmv (v2f64 V128:$Rn))),
(FMAXNMPv2i64p V128:$Rn)>;
def : Pat<(f32 (int_aarch64_neon_fmaxv (v2f32 V64:$Rn))),
(FMAXPv2i32p V64:$Rn)>;
def : Pat<(f64 (int_aarch64_neon_fmaxv (v2f64 V128:$Rn))),
(FMAXPv2i64p V128:$Rn)>;
def : Pat<(f32 (int_aarch64_neon_fminnmv (v2f32 V64:$Rn))),
(FMINNMPv2i32p V64:$Rn)>;
def : Pat<(f64 (int_aarch64_neon_fminnmv (v2f64 V128:$Rn))),
(FMINNMPv2i64p V128:$Rn)>;
def : Pat<(f32 (int_aarch64_neon_fminv (v2f32 V64:$Rn))),
(FMINPv2i32p V64:$Rn)>;
def : Pat<(f64 (int_aarch64_neon_fminv (v2f64 V128:$Rn))),
(FMINPv2i64p V128:$Rn)>;
//----------------------------------------------------------------------------
// AdvSIMD INS/DUP instructions
//----------------------------------------------------------------------------
// DUP from a general-purpose register, one record per arrangement. Arguments:
// a 0/1 flag (0 for the V64 results, 1 for the V128 results), a 5-entry bit
// list whose '?' entries are left unset, the arrangement suffix, the result
// vector type, the result register class, and the source register class
// (GPR32 for all but .2d, which uses GPR64).
def DUPv8i8gpr : SIMDDupFromMain<0, {?,?,?,?,1}, ".8b", v8i8, V64, GPR32>;
def DUPv16i8gpr : SIMDDupFromMain<1, {?,?,?,?,1}, ".16b", v16i8, V128, GPR32>;
def DUPv4i16gpr : SIMDDupFromMain<0, {?,?,?,1,0}, ".4h", v4i16, V64, GPR32>;
def DUPv8i16gpr : SIMDDupFromMain<1, {?,?,?,1,0}, ".8h", v8i16, V128, GPR32>;
def DUPv2i32gpr : SIMDDupFromMain<0, {?,?,1,0,0}, ".2s", v2i32, V64, GPR32>;
def DUPv4i32gpr : SIMDDupFromMain<1, {?,?,1,0,0}, ".4s", v4i32, V128, GPR32>;
def DUPv2i64gpr : SIMDDupFromMain<1, {?,1,0,0,0}, ".2d", v2i64, V128, GPR64>;
// DUP from a vector element, one record per arrangement. Each element size has
// its own class (SIMDDup64/32/16/8FromElement); the 64-bit element form takes
// no arguments, the others take a 0/1 flag (0 for V64 results, 1 for V128),
// the arrangement suffix, the result type and the result register class.
def DUPv2i64lane : SIMDDup64FromElement;
def DUPv2i32lane : SIMDDup32FromElement<0, ".2s", v2i32, V64>;
def DUPv4i32lane : SIMDDup32FromElement<1, ".4s", v4i32, V128>;
def DUPv4i16lane : SIMDDup16FromElement<0, ".4h", v4i16, V64>;
def DUPv8i16lane : SIMDDup16FromElement<1, ".8h", v8i16, V128>;
def DUPv8i8lane : SIMDDup8FromElement <0, ".8b", v8i8, V64>;
def DUPv16i8lane : SIMDDup8FromElement <1, ".16b", v16i8, V128>;
// DUP from a 64-bit register to a 64-bit register is just a copy
// Single-element 64-bit dups need no DUP instruction: the i64 source is
// copied from GPR64 into the FPR64 class, and the f64 source is already in
// FPR64.
def : Pat<(v1i64 (AArch64dup (i64 GPR64:$Rn))),
(COPY_TO_REGCLASS GPR64:$Rn, FPR64)>;
def : Pat<(v1f64 (AArch64dup (f64 FPR64:$Rn))),
(COPY_TO_REGCLASS FPR64:$Rn, FPR64)>;
// AArch64dup of an FP scalar: the scalar is inserted into the matching
// subregister (ssub/dsub/hsub) of an IMPLICIT_DEF 128-bit value, and the
// lane-indexed DUP then broadcasts lane 0.
// The IMPLICIT_DEF is typed v4i32 for the f32 and f64 cases and v8i16 for f16;
// NOTE(review): the type presumably only fixes the 128-bit register class, so
// v4i32 is harmless for the v2f64 case — confirm.
def : Pat<(v2f32 (AArch64dup (f32 FPR32:$Rn))),
(v2f32 (DUPv2i32lane
(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub),
(i64 0)))>;
def : Pat<(v4f32 (AArch64dup (f32 FPR32:$Rn))),
(v4f32 (DUPv4i32lane
(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub),
(i64 0)))>;
def : Pat<(v2f64 (AArch64dup (f64 FPR64:$Rn))),
(v2f64 (DUPv2i64lane
(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rn, dsub),
(i64 0)))>;
def : Pat<(v4f16 (AArch64dup (f16 FPR16:$Rn))),
(v4f16 (DUPv4i16lane
(INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub),
(i64 0)))>;
def : Pat<(v8f16 (AArch64dup (f16 FPR16:$Rn))),
(v8f16 (DUPv8i16lane
(INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub),
(i64 0)))>;
def : Pat<(v4f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)),
(DUPv4i16lane V128:$Rn, VectorIndexH:$imm)>;
def : Pat<(v8f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)),
(DUPv8i16lane V128:$Rn, VectorIndexH:$imm)>;
def : Pat<(v2f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
(DUPv2i32lane V128:$Rn, VectorIndexS:$imm)>;
def : Pat<(v4f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
(DUPv4i32lane V128:$Rn, VectorIndexS:$imm)>;
def : Pat<(v2f64 (AArch64duplane64 (v2f64 V128:$Rn), VectorIndexD:$imm)),
(DUPv2i64lane V128:$Rn, VectorIndexD:$imm)>;
// An (AArch64dup (vector_extract ...)) can be selected to a duplane
// instruction even when the element types differ, provided the lane number is
// rescaled to the narrower element size. N.b. this trick only applies to
// truncations.
//
// Each transform below multiplies the matched immediate lane index by a fixed
// factor and returns it as an i64 target constant.
def VecIndex_x2 : SDNodeXForm<imm, [{
  const uint64_t Lane = N->getZExtValue();
  return CurDAG->getTargetConstant(2 * Lane, SDLoc(N), MVT::i64);
}]>;
def VecIndex_x4 : SDNodeXForm<imm, [{
  const uint64_t Lane = N->getZExtValue();
  return CurDAG->getTargetConstant(4 * Lane, SDLoc(N), MVT::i64);
}]>;
def VecIndex_x8 : SDNodeXForm<imm, [{
  const uint64_t Lane = N->getZExtValue();
  return CurDAG->getTargetConstant(8 * Lane, SDLoc(N), MVT::i64);
}]>;
// Patterns selecting a DUP of a scalar extracted from a vector to a lane DUP.
//   ResVT     - result vector type of the dup.
//   Src64VT   - 64-bit source vector type the scalar may be extracted from.
//   Src128VT  - 128-bit source vector type the scalar may be extracted from.
//   ScalVT    - type of the extracted scalar.
//   DUP       - lane-DUP instruction to emit.
//   IdxXFORM  - transform applied to the extract index to form the DUP lane.
multiclass DUPWithTruncPats<ValueType ResVT, ValueType Src64VT,
ValueType Src128VT, ValueType ScalVT,
Instruction DUP, SDNodeXForm IdxXFORM> {
// 128-bit source: use the register directly.
def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src128VT V128:$Rn),
imm:$idx)))),
(DUP V128:$Rn, (IdxXFORM imm:$idx))>;
// 64-bit source: wrap it with SUBREG_TO_REG (dsub) to form the 128-bit
// operand the DUP instruction takes.
def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src64VT V64:$Rn),
imm:$idx)))),
(DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
}
// The index scale factor is the ratio of source to result element width
// (e.g. 16-bit source lanes to 8-bit result lanes uses VecIndex_x2).
defm : DUPWithTruncPats<v8i8, v4i16, v8i16, i32, DUPv8i8lane, VecIndex_x2>;
defm : DUPWithTruncPats<v8i8, v2i32, v4i32, i32, DUPv8i8lane, VecIndex_x4>;
defm : DUPWithTruncPats<v4i16, v2i32, v4i32, i32, DUPv4i16lane, VecIndex_x2>;
defm : DUPWithTruncPats<v16i8, v4i16, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
defm : DUPWithTruncPats<v16i8, v2i32, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
defm : DUPWithTruncPats<v8i16, v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
// Same idea as the truncating DUP patterns, for a scalar that is extracted
// from a 64-bit-element vector (v2i64 or v1i64) and explicitly truncated to
// i32 before being duplicated.
//   ResVT     - result vector type of the dup.
//   DUP       - lane-DUP instruction to emit.
//   IdxXFORM  - transform applied to the extract index to form the DUP lane.
multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP,
SDNodeXForm IdxXFORM> {
def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v2i64 V128:$Rn),
imm:$idx))))),
(DUP V128:$Rn, (IdxXFORM imm:$idx))>;
// The v1i64 source is widened with SUBREG_TO_REG (dsub) first.
def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v1i64 V64:$Rn),
imm:$idx))))),
(DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
}
defm : DUPWithTrunci64Pats<v8i8, DUPv8i8lane, VecIndex_x8>;
defm : DUPWithTrunci64Pats<v4i16, DUPv4i16lane, VecIndex_x4>;
defm : DUPWithTrunci64Pats<v2i32, DUPv2i32lane, VecIndex_x2>;
defm : DUPWithTrunci64Pats<v16i8, DUPv16i8lane, VecIndex_x8>;
defm : DUPWithTrunci64Pats<v8i16, DUPv8i16lane, VecIndex_x4>;
defm : DUPWithTrunci64Pats<v4i32, DUPv4i32lane, VecIndex_x2>;
// SMOV and UMOV definitions, with some extra patterns for convenience
defm SMOV : SMov;
defm UMOV : UMov;

// A sign-extended extract of a byte or halfword lane is selected to a single
// SMOV. Each source pattern appears once per destination width (i32 and i64).
def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
          (i32 (SMOVvi8to32 V128:$Rn, VectorIndexB:$idx))>;
def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
          (i64 (SMOVvi8to64 V128:$Rn, VectorIndexB:$idx))>;
def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
          (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>;
def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
          (i64 (SMOVvi16to64 V128:$Rn, VectorIndexH:$idx))>;
// (A second, identical copy of the v8i16 -> i32 pattern used to follow here;
// it was redundant and has been dropped.)

// Sign-extending a word lane to 64 bits.
def : Pat<(sext (i32 (vector_extract (v4i32 V128:$Rn), VectorIndexS:$idx))),
          (i64 (SMOVvi32to64 V128:$Rn, VectorIndexS:$idx))>;

// The extract may be any-extended to i64 before the in-register sign
// extension; that still folds into the 64-bit SMOV forms.
def : Pat<(sext_inreg (i64 (anyext (i32 (vector_extract (v16i8 V128:$Rn),
            VectorIndexB:$idx)))), i8),
          (i64 (SMOVvi8to64 V128:$Rn, VectorIndexB:$idx))>;
def : Pat<(sext_inreg (i64 (anyext (i32 (vector_extract (v8i16 V128:$Rn),
            VectorIndexH:$idx)))), i16),
          (i64 (SMOVvi16to64 V128:$Rn, VectorIndexH:$idx))>;
// Extracting i8 or i16 elements will have the zero-extend transformed to
// an 'and' mask by type legalization since neither i8 nor i16 are legal types
// for AArch64. Match these patterns here since UMOV already zeroes out the high
// bits of the destination register.
// Byte lane masked with 0xff.
def : Pat<(and (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx),
(i32 0xff)),
(i32 (UMOVvi8 V128:$Rn, VectorIndexB:$idx))>;
// Halfword lane masked with 0xffff.
def : Pat<(and (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),
(i32 0xffff)),
(i32 (UMOVvi16 V128:$Rn, VectorIndexH:$idx))>;
defm INS : SIMDIns;

// scalar_to_vector from a GPR32 into a byte/halfword vector: copy the GPR into
// an FPR32 and present it as the ssub subregister of the result via
// SUBREG_TO_REG.
def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)),
          (SUBREG_TO_REG (i32 0),
                         (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)),
          (SUBREG_TO_REG (i32 0),
                         (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)),
          (SUBREG_TO_REG (i32 0),
                         (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)),
          (SUBREG_TO_REG (i32 0),
                         (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;

// scalar_to_vector of an integer value that already lives in an FP register:
// insert it as the low subregister of an undefined vector.
def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))),
          (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
                                (i32 FPR32:$Rn), ssub))>;
def : Pat<(v4i32 (scalar_to_vector (i32 FPR32:$Rn))),
          (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
                                (i32 FPR32:$Rn), ssub))>;
def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))),
          (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
                                (i64 FPR64:$Rn), dsub))>;

// scalar_to_vector of a floating-point scalar: likewise an INSERT_SUBREG into
// an undefined vector. (The v4f16/v8f16 patterns were previously listed twice
// in this run; each is now defined exactly once.)
def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))),
          (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))),
          (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))),
          (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))),
          (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))),
          (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>;
// Inserting a floating-point scalar into a vector lane. The scalar is first
// placed in the low subregister of an undefined 128-bit vector, then lane 0 of
// that vector is copied into the destination lane with an INS lane
// instruction. For 64-bit destination vectors the destination is widened with
// INSERT_SUBREG (dsub) beforehand and the low half is extracted afterwards.
def : Pat<(v4f16 (vector_insert (v4f16 V64:$Rn),
(f16 FPR16:$Rm), (i64 VectorIndexS:$imm))),
(EXTRACT_SUBREG
(INSvi16lane
(v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), V64:$Rn, dsub)),
VectorIndexS:$imm,
(v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)),
(i64 0)),
dsub)>;
def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn),
(f16 FPR16:$Rm), (i64 VectorIndexH:$imm))),
(INSvi16lane
V128:$Rn, VectorIndexH:$imm,
(v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)),
(i64 0))>;
def : Pat<(v2f32 (vector_insert (v2f32 V64:$Rn),
(f32 FPR32:$Rm), (i64 VectorIndexS:$imm))),
(EXTRACT_SUBREG
(INSvi32lane
(v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)),
VectorIndexS:$imm,
(v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)),
(i64 0)),
dsub)>;
def : Pat<(v4f32 (vector_insert (v4f32 V128:$Rn),
(f32 FPR32:$Rm), (i64 VectorIndexS:$imm))),
(INSvi32lane
V128:$Rn, VectorIndexS:$imm,
(v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)),
(i64 0))>;
def : Pat<(v2f64 (vector_insert (v2f64 V128:$Rn),
(f64 FPR64:$Rm), (i64 VectorIndexD:$imm))),
(INSvi64lane
V128:$Rn, VectorIndexD:$imm,
(v2f64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rm, dsub)),
(i64 0))>;
// Copy an element at a constant index in one vector into a constant indexed
// element of another.
// FIXME refactor to a shared class/def parameterized on vector type, vector
// index type and INS extension
// The four patterns below differ only in the vector type, the index operand
// class and the INS lane instruction; $idx is the destination lane in $Vd and
// $idx2 is the source lane in $Vs.
def : Pat<(v16i8 (int_aarch64_neon_vcopy_lane
(v16i8 V128:$Vd), VectorIndexB:$idx, (v16i8 V128:$Vs),
VectorIndexB:$idx2)),
(v16i8 (INSvi8lane
V128:$Vd, VectorIndexB:$idx, V128:$Vs, VectorIndexB:$idx2)
)>;
def : Pat<(v8i16 (int_aarch64_neon_vcopy_lane
(v8i16 V128:$Vd), VectorIndexH:$idx, (v8i16 V128:$Vs),
VectorIndexH:$idx2)),
(v8i16 (INSvi16lane
V128:$Vd, VectorIndexH:$idx, V128:$Vs, VectorIndexH:$idx2)
)>;
def : Pat<(v4i32 (int_aarch64_neon_vcopy_lane
(v4i32 V128:$Vd), VectorIndexS:$idx, (v4i32 V128:$Vs),
VectorIndexS:$idx2)),
(v4i32 (INSvi32lane
V128:$Vd, VectorIndexS:$idx, V128:$Vs, VectorIndexS:$idx2)
)>;
def : Pat<(v2i64 (int_aarch64_neon_vcopy_lane
(v2i64 V128:$Vd), VectorIndexD:$idx, (v2i64 V128:$Vs),
VectorIndexD:$idx2)),
(v2i64 (INSvi64lane
V128:$Vd, VectorIndexD:$idx, V128:$Vs, VectorIndexD:$idx2)
)>;
// Patterns that fold (vector_insert dst, (vector_extract src, n), d) into a
// single lane-to-lane INS, for all four combinations of 64-/128-bit source
// and destination vectors.
//   VT128  - 128-bit vector type.
//   VT64   - 64-bit vector type with the same element type.
//   VTScal - element (scalar) type.
//   INS    - lane-to-lane INS instruction for this element size.
multiclass Neon_INS_elt_pattern<ValueType VT128, ValueType VT64,
ValueType VTScal, Instruction INS> {
// 128-bit destination, 128-bit source.
def : Pat<(VT128 (vector_insert V128:$src,
(VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)),
imm:$Immd)),
(INS V128:$src, imm:$Immd, V128:$Rn, imm:$Immn)>;
// 128-bit destination, 64-bit source (source widened via SUBREG_TO_REG).
def : Pat<(VT128 (vector_insert V128:$src,
(VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)),
imm:$Immd)),
(INS V128:$src, imm:$Immd,
(SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn)>;
// 64-bit destination, 128-bit source (destination widened, low half of the
// result extracted).
def : Pat<(VT64 (vector_insert V64:$src,
(VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)),
imm:$Immd)),
(EXTRACT_SUBREG (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub),
imm:$Immd, V128:$Rn, imm:$Immn),
dsub)>;
// 64-bit destination, 64-bit source (both widened, low half extracted).
def : Pat<(VT64 (vector_insert V64:$src,
(VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)),
imm:$Immd)),
(EXTRACT_SUBREG
(INS (SUBREG_TO_REG (i64 0), V64:$src, dsub), imm:$Immd,
(SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn),
dsub)>;
}
// Instantiated for the floating-point element types only.
defm : Neon_INS_elt_pattern<v8f16, v4f16, f16, INSvi16lane>;
defm : Neon_INS_elt_pattern<v4f32, v2f32, f32, INSvi32lane>;
defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, INSvi64lane>;
// Floating point vector extractions are codegen'd as either a sequence of
// subregister extractions, or a MOV (aka CPY here, alias for DUP) if
// the lane number is anything other than zero.
// Lane 0: a plain subregister extract of the matching width.
def : Pat<(vector_extract (v2f64 V128:$Rn), 0),
(f64 (EXTRACT_SUBREG V128:$Rn, dsub))>;
def : Pat<(vector_extract (v4f32 V128:$Rn), 0),
(f32 (EXTRACT_SUBREG V128:$Rn, ssub))>;
def : Pat<(vector_extract (v8f16 V128:$Rn), 0),
(f16 (EXTRACT_SUBREG V128:$Rn, hsub))>;
// Any other constant lane: CPY from that lane.
def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx),
(f64 (CPYi64 V128:$Rn, VectorIndexD:$idx))>;
def : Pat<(vector_extract (v4f32 V128:$Rn), VectorIndexS:$idx),
(f32 (CPYi32 V128:$Rn, VectorIndexS:$idx))>;
def : Pat<(vector_extract (v8f16 V128:$Rn), VectorIndexH:$idx),
(f16 (CPYi16 V128:$Rn, VectorIndexH:$idx))>;
// All concat_vectors operations are canonicalised to act on i64 vectors for
// AArch64. In the general case we need an instruction, which might just as
// well be INS.
// Both 64-bit halves are widened into undefined 128-bit registers; lane 0 of
// the second ($Rn) is then inserted into lane 1 of the first ($Rd) with
// INSvi64lane.
class ConcatPat<ValueType DstTy, ValueType SrcTy>
: Pat<(DstTy (concat_vectors (SrcTy V64:$Rd), V64:$Rn)),
(INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), 1,
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), 0)>;
def : ConcatPat<v2i64, v1i64>;
def : ConcatPat<v2f64, v1f64>;
def : ConcatPat<v4i32, v2i32>;
def : ConcatPat<v4f32, v2f32>;
def : ConcatPat<v8i16, v4i16>;
def : ConcatPat<v8f16, v4f16>;
def : ConcatPat<v16i8, v8i8>;
// If the high lanes are undef, though, we can just ignore them: the 64-bit
// source simply becomes the dsub subregister of an otherwise undefined 128-bit
// register, so no instruction is needed.
class ConcatUndefPat<ValueType DstTy, ValueType SrcTy>
  : Pat<(DstTy (concat_vectors (SrcTy V64:$Rn), undef)),
        (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub)>;

// Keep this list in step with the general two-operand concat_vectors patterns:
// every type pair handled there (including v8f16/v4f16) also gets the cheaper
// undef-high-half form here.
def : ConcatUndefPat<v2i64, v1i64>;
def : ConcatUndefPat<v2f64, v1f64>;
def : ConcatUndefPat<v4i32, v2i32>;
def : ConcatUndefPat<v4f32, v2f32>;
def : ConcatUndefPat<v8i16, v4i16>;
def : ConcatUndefPat<v8f16, v4f16>;
def : ConcatUndefPat<v16i8, v8i8>;
//----------------------------------------------------------------------------
// AdvSIMD across lanes instructions
//----------------------------------------------------------------------------
// Integer across-lanes instructions (add, signed/unsigned max and min).
defm ADDV : SIMDAcrossLanesBHS<0, 0b11011, "addv">;
defm SMAXV : SIMDAcrossLanesBHS<0, 0b01010, "smaxv">;
defm SMINV : SIMDAcrossLanesBHS<0, 0b11010, "sminv">;
defm UMAXV : SIMDAcrossLanesBHS<1, 0b01010, "umaxv">;
defm UMINV : SIMDAcrossLanesBHS<1, 0b11010, "uminv">;
// Signed/unsigned long add across lanes.
defm SADDLV : SIMDAcrossLanesHSD<0, 0b00011, "saddlv">;
defm UADDLV : SIMDAcrossLanesHSD<1, 0b00011, "uaddlv">;
// Floating-point across-lanes max/min, each tied to its NEON intrinsic.
defm FMAXNMV : SIMDFPAcrossLanes<0b01100, 0, "fmaxnmv", int_aarch64_neon_fmaxnmv>;
defm FMAXV : SIMDFPAcrossLanes<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>;
defm FMINNMV : SIMDFPAcrossLanes<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>;
defm FMINV : SIMDFPAcrossLanes<0b01111, 1, "fminv", int_aarch64_neon_fminv>;
// Patterns for across-vector intrinsics, that have a node equivalent, that
// returns a vector (with only the low lane defined) instead of a scalar.
// In effect, opNode is the same as (scalar_to_vector (IntNode)).
//   baseOpc - instruction name prefix; the concrete instruction is looked up
//             by appending the arrangement suffix ("v8i8v", "v16i8v", ...).
//   opNode  - the vector-returning DAG node to select.
multiclass SIMDAcrossLanesIntrinsic<string baseOpc,
SDPatternOperator opNode> {
// If a lane instruction caught the vector_extract around opNode, we can
// directly match the latter to the instruction.
// The instruction result is inserted into the bsub/hsub/ssub subregister
// (matching the element size) of an undefined vector of the result type.
def : Pat<(v8i8 (opNode V64:$Rn)),
(INSERT_SUBREG (v8i8 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub)>;
def : Pat<(v16i8 (opNode V128:$Rn)),
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub)>;
def : Pat<(v4i16 (opNode V64:$Rn)),
(INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub)>;
def : Pat<(v8i16 (opNode V128:$Rn)),
(INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub)>;
def : Pat<(v4i32 (opNode V128:$Rn)),
(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub)>;
// If none did, fallback to the explicit patterns, consuming the vector_extract.
// The 64-bit vector forms are matched through an insert_subvector into an
// undef wider vector; in every case the i32 result is read back via ssub.
def : Pat<(i32 (vector_extract (insert_subvector undef, (v8i8 (opNode V64:$Rn)),
(i32 0)), (i64 0))),
(EXTRACT_SUBREG (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn),
bsub), ssub)>;
def : Pat<(i32 (vector_extract (v16i8 (opNode V128:$Rn)), (i64 0))),
(EXTRACT_SUBREG (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn),
bsub), ssub)>;
def : Pat<(i32 (vector_extract (insert_subvector undef,
(v4i16 (opNode V64:$Rn)), (i32 0)), (i64 0))),
(EXTRACT_SUBREG (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn),
hsub), ssub)>;
def : Pat<(i32 (vector_extract (v8i16 (opNode V128:$Rn)), (i64 0))),
(EXTRACT_SUBREG (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn),
hsub), ssub)>;
def : Pat<(i32 (vector_extract (v4i32 (opNode V128:$Rn)), (i64 0))),
(EXTRACT_SUBREG (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn),
ssub), ssub)>;
}
// Signed across-lanes patterns: everything from SIMDAcrossLanesIntrinsic, plus
// patterns that absorb a following sext_inreg for byte and halfword results.
//   baseOpc - instruction name prefix (arrangement suffix is appended).
//   opNode  - the vector-returning DAG node to select.
multiclass SIMDAcrossLanesSignedIntrinsic<string baseOpc,
SDPatternOperator opNode>
: SIMDAcrossLanesIntrinsic<baseOpc, opNode> {
// If there is a sign extension after this intrinsic, consume it as smov already
// performed it
// Byte results: reduce, then SMOVvi8to32 from lane 0.
def : Pat<(i32 (sext_inreg (i32 (vector_extract (insert_subvector undef,
(opNode (v8i8 V64:$Rn)), (i32 0)), (i64 0))), i8)),
(i32 (SMOVvi8to32
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
(i64 0)))>;
def : Pat<(i32 (sext_inreg (i32 (vector_extract
(opNode (v16i8 V128:$Rn)), (i64 0))), i8)),
(i32 (SMOVvi8to32
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
(i64 0)))>;
// Halfword results: reduce, then SMOVvi16to32 from lane 0.
def : Pat<(i32 (sext_inreg (i32 (vector_extract (insert_subvector undef,
(opNode (v4i16 V64:$Rn)), (i32 0)), (i64 0))), i16)),
(i32 (SMOVvi16to32
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
(i64 0)))>;
def : Pat<(i32 (sext_inreg (i32 (vector_extract
(opNode (v8i16 V128:$Rn)), (i64 0))), i16)),
(i32 (SMOVvi16to32
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
(i64 0)))>;
}
// Unsigned across-lanes patterns: everything from SIMDAcrossLanesIntrinsic,
// plus patterns that absorb a following 'and' mask for byte and halfword
// results.
//   baseOpc - instruction name prefix (arrangement suffix is appended).
//   opNode  - the vector-returning DAG node to select.
multiclass SIMDAcrossLanesUnsignedIntrinsic<string baseOpc,
SDPatternOperator opNode>
: SIMDAcrossLanesIntrinsic<baseOpc, opNode> {
// If there is a masking operation keeping only what has been actually
// generated, consume it.
// Byte results masked with maski8_or_more: the 'and' is dropped and the
// result is read back through ssub.
def : Pat<(i32 (and (i32 (vector_extract (insert_subvector undef,
(opNode (v8i8 V64:$Rn)), (i32 0)), (i64 0))), maski8_or_more)),
(i32 (EXTRACT_SUBREG
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
ssub))>;
def : Pat<(i32 (and (i32 (vector_extract (opNode (v16i8 V128:$Rn)), (i64 0))),
maski8_or_more)),
(i32 (EXTRACT_SUBREG
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
ssub))>;
// Halfword results masked with maski16_or_more.
def : Pat<(i32 (and (i32 (vector_extract (insert_subvector undef,
(opNode (v4i16 V64:$Rn)), (i32 0)), (i64 0))), maski16_or_more)),
(i32 (EXTRACT_SUBREG
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
ssub))>;
def : Pat<(i32 (and (i32 (vector_extract (opNode (v8i16 V128:$Rn)), (i64 0))),
maski16_or_more)),
(i32 (EXTRACT_SUBREG
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
ssub))>;
}
// Instantiate the across-lanes patterns for each reduction node. In every
// group the v2i32 case is handled separately: it is selected to the pairwise
// instruction with $Rn supplied as both source operands.
defm : SIMDAcrossLanesSignedIntrinsic<"ADDV", AArch64saddv>;
// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm
def : Pat<(v2i32 (AArch64saddv (v2i32 V64:$Rn))),
(ADDPv2i32 V64:$Rn, V64:$Rn)>;
defm : SIMDAcrossLanesUnsignedIntrinsic<"ADDV", AArch64uaddv>;
// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm
def : Pat<(v2i32 (AArch64uaddv (v2i32 V64:$Rn))),
(ADDPv2i32 V64:$Rn, V64:$Rn)>;
defm : SIMDAcrossLanesSignedIntrinsic<"SMAXV", AArch64smaxv>;
def : Pat<(v2i32 (AArch64smaxv (v2i32 V64:$Rn))),
(SMAXPv2i32 V64:$Rn, V64:$Rn)>;
defm : SIMDAcrossLanesSignedIntrinsic<"SMINV", AArch64sminv>;
def : Pat<(v2i32 (AArch64sminv (v2i32 V64:$Rn))),
(SMINPv2i32 V64:$Rn, V64:$Rn)>;
defm : SIMDAcrossLanesUnsignedIntrinsic<"UMAXV", AArch64umaxv>;
def : Pat<(v2i32 (AArch64umaxv (v2i32 V64:$Rn))),
(UMAXPv2i32 V64:$Rn, V64:$Rn)>;
defm : SIMDAcrossLanesUnsignedIntrinsic<"UMINV", AArch64uminv>;
def : Pat<(v2i32 (AArch64uminv (v2i32 V64:$Rn))),
(UMINPv2i32 V64:$Rn, V64:$Rn)>;
// Patterns for a signed long across-lanes intrinsic that returns a scalar.
//   baseOpc - instruction name prefix (arrangement suffix is appended).
//   intOp   - the intrinsic to select.
// The instruction result is placed in the subregister of the widened element
// size: hsub for byte inputs, ssub for halfword inputs, dsub for word inputs.
multiclass SIMDAcrossLanesSignedLongIntrinsic<string baseOpc, Intrinsic intOp> {
// Byte inputs: the result in hsub is moved to an i32 with SMOVvi16to32.
def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
(i32 (SMOVvi16to32
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub),
(i64 0)))>;
def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
(i32 (SMOVvi16to32
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub),
(i64 0)))>;
// Halfword inputs: the result in ssub is read back directly as i32.
def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
(i32 (EXTRACT_SUBREG
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub),
ssub))>;
def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
(i32 (EXTRACT_SUBREG
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub),
ssub))>;
// Word inputs: the result in dsub is read back directly as i64.
def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
(i64 (EXTRACT_SUBREG
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub),
dsub))>;
}
// Patterns for an unsigned long across-lanes intrinsic that returns a scalar.
//   baseOpc - instruction name prefix (arrangement suffix is appended).
//   intOp   - the intrinsic to select.
// Unlike the signed variant, every case reads the result back with a plain
// EXTRACT_SUBREG; no SMOV is used.
multiclass SIMDAcrossLanesUnsignedLongIntrinsic<string baseOpc,
Intrinsic intOp> {
// Byte inputs: result inserted at hsub, read back as i32 through ssub.
def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
(i32 (EXTRACT_SUBREG
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub),
ssub))>;
def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
(i32 (EXTRACT_SUBREG
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub),
ssub))>;
// Halfword inputs: result inserted at ssub and read back through ssub.
def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
(i32 (EXTRACT_SUBREG
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub),
ssub))>;
def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
(i32 (EXTRACT_SUBREG
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub),
ssub))>;
// Word inputs: result inserted at dsub and read back as i64 through dsub.
def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
(i64 (EXTRACT_SUBREG
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
(!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub),
dsub))>;
}
// Instantiate the long across-lanes patterns for the signed and unsigned
// add-long intrinsics.
defm : SIMDAcrossLanesSignedLongIntrinsic<"SADDLV", int_aarch64_neon_saddlv>;
defm : SIMDAcrossLanesUnsignedLongIntrinsic<"UADDLV", int_aarch64_neon_uaddlv>;
// The v2i32 inputs are not covered by the multiclasses above; they are
// selected to the pairwise add-long instructions, with the 64-bit result read
// back through dsub.
// The vaddlv_s32 intrinsic gets mapped to SADDLP.
def : Pat<(i64 (int_aarch64_neon_saddlv (v2i32 V64:$Rn))),
(i64 (EXTRACT_SUBREG
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
(SADDLPv2i32_v1i64 V64:$Rn), dsub),
dsub))>;
// The vaddlv_u32 intrinsic gets mapped to UADDLP.
def : Pat<(i64 (int_aarch64_neon_uaddlv (v2i32 V64:$Rn))),
(i64 (EXTRACT_SUBREG
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
(UADDLPv2i32_v1i64 V64:$Rn), dsub),
dsub))>;
//------------------------------------------------------------------------------
// AdvSIMD modified immediate instructions
//------------------------------------------------------------------------------
// AdvSIMD BIC
defm BIC : SIMDModifiedImmVectorShiftTied<1, 0b11, 0b01, "bic", AArch64bici>;
// AdvSIMD ORR
defm ORR : SIMDModifiedImmVectorShiftTied<0, 0b11, 0b01, "orr", AArch64orri>;
// Assembly aliases that take only a register and an immediate; the alias
// result supplies 0 as the instruction's final operand (presumably the shift
// amount - confirm against SIMDModifiedImmVectorShiftTied). Both the
// "$Vd.<T>" and the "<mnemonic>.<T> $Vd" spellings are accepted.
def : InstAlias<"bic $Vd.4h, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0)>;
def : InstAlias<"bic $Vd.8h, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0)>;
def : InstAlias<"bic $Vd.2s, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0)>;
def : InstAlias<"bic $Vd.4s, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0)>;
def : InstAlias<"bic.4h $Vd, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0)>;
def : InstAlias<"bic.8h $Vd, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0)>;
def : InstAlias<"bic.2s $Vd, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0)>;
def : InstAlias<"bic.4s $Vd, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0)>;
def : InstAlias<"orr $Vd.4h, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0)>;
def : InstAlias<"orr $Vd.8h, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0)>;
def : InstAlias<"orr $Vd.2s, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0)>;
def : InstAlias<"orr $Vd.4s, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0)>;
def : InstAlias<"orr.4h $Vd, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0)>;
def : InstAlias<"orr.8h $Vd, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0)>;
def : InstAlias<"orr.2s $Vd, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0)>;
def : InstAlias<"orr.4s $Vd, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0)>;
// AdvSIMD FMOV
// Vector FMOV of an 8-bit encoded FP immediate. The assembly operand is
// fpimm8, while the selection pattern matches AArch64fmov with a plain
// imm0_255 operand.
def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1111, V128, fpimm8,
"fmov", ".2d",
[(set (v2f64 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>;
def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0, 0b1111, V64, fpimm8,
"fmov", ".2s",
[(set (v2f32 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>;
def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0, 0b1111, V128, fpimm8,
"fmov", ".4s",
[(set (v4f32 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>;
// The half-precision vector forms additionally require HasFullFP16.
let Predicates = [HasNEON, HasFullFP16] in {
def FMOVv4f16_ns : SIMDModifiedImmVectorNoShift<0, 0, 1, 0b1111, V64, fpimm8,
"fmov", ".4h",
[(set (v4f16 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>;
def FMOVv8f16_ns : SIMDModifiedImmVectorNoShift<1, 0, 1, 0b1111, V128, fpimm8,
"fmov", ".8h",
[(set (v8f16 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>;
} // Predicates = [HasNEON, HasFullFP16]
// AdvSIMD MOVI

// EDIT byte mask: scalar
let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
  def MOVID : SIMDModifiedImmScalarNoShift<0, 1, 0b1110, "movi",
                  [(set FPR64:$Rd, simdimmtype10:$imm8)]>;
}

// AArch64movi_edit carries the immediate in its already-encoded form, which is
// why the pattern operand is a plain imm0_255.
def : Pat<(f64 (AArch64movi_edit imm0_255:$shift)),
          (MOVID imm0_255:$shift)>;

// EDIT byte mask: 2d
// As for the scalar form, the movi_edit operand is already encoded, so the
// selection pattern uses a plain imm0_255.
let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
  def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1110, V128,
                       simdimmtype10,
                       "movi", ".2d",
                       [(set (v2i64 V128:$Rd),
                             (AArch64movi_edit imm0_255:$imm8))]>;
}

// 128-bit all-zeros / all-ones vectors of any integer element type are a
// single MOVIv2d_ns with encoded immediate 0 / 255.
foreach VT = [v2i64, v4i32, v8i16, v16i8] in {
  def : Pat<(VT immAllZerosV), (MOVIv2d_ns (i32 0))>;
  def : Pat<(VT immAllOnesV),  (MOVIv2d_ns (i32 255))>;
}

// Set 64-bit vectors to all 0/1 by extracting from a 128-bit register as the
// extract is free and this gives better MachineCSE results.
foreach VT = [v1i64, v2i32, v4i16, v8i8] in {
  def : Pat<(VT immAllZerosV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub)>;
  def : Pat<(VT immAllOnesV),  (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;
}
// EDIT per word & halfword: 2s, 4h, 4s, & 8h
let isReMaterializable = 1, isAsCheapAsAMove = 1 in
defm MOVI : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">;
// Assembly aliases without an explicit shift; the alias result passes 0 as the
// instruction's last operand. NOTE(review): the trailing ", 0" argument to
// InstAlias is presumably the emit/print flag - confirm against InstAlias.
def : InstAlias<"movi $Vd.4h, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
def : InstAlias<"movi $Vd.8h, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
def : InstAlias<"movi $Vd.2s, $imm", (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
def : InstAlias<"movi $Vd.4s, $imm", (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
def : InstAlias<"movi.4h $Vd, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
def : InstAlias<"movi.8h $Vd, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
def : InstAlias<"movi.2s $Vd, $imm", (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
def : InstAlias<"movi.4s $Vd, $imm", (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
// Selection patterns: AArch64movi_shift with an 8-bit immediate and a shift
// maps onto the shifted MOVI of the matching arrangement.
def : Pat<(v2i32 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
(MOVIv2i32 imm0_255:$imm8, imm:$shift)>;
def : Pat<(v4i32 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
(MOVIv4i32 imm0_255:$imm8, imm:$shift)>;
def : Pat<(v4i16 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
(MOVIv4i16 imm0_255:$imm8, imm:$shift)>;
def : Pat<(v8i16 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
(MOVIv8i16 imm0_255:$imm8, imm:$shift)>;
let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
// EDIT per word: 2s & 4s with MSL shifter
def MOVIv2s_msl : SIMDModifiedImmMoveMSL<0, 0, {1,1,0,?}, V64, "movi", ".2s",
[(set (v2i32 V64:$Rd),
(AArch64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
def MOVIv4s_msl : SIMDModifiedImmMoveMSL<1, 0, {1,1,0,?}, V128, "movi", ".4s",
[(set (v4i32 V128:$Rd),
(AArch64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
// Per byte: 8b & 16b
def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0, 0b1110, V64, imm0_255,
"movi", ".8b",
[(set (v8i8 V64:$Rd), (AArch64movi imm0_255:$imm8))]>;
def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0, 0b1110, V128, imm0_255,
"movi", ".16b",
[(set (v16i8 V128:$Rd), (AArch64movi imm0_255:$imm8))]>;
}
// AdvSIMD MVNI
// EDIT per word & halfword: 2s, 4h, 4s, & 8h
let isReMaterializable = 1, isAsCheapAsAMove = 1 in
defm MVNI : SIMDModifiedImmVectorShift<1, 0b10, 0b00, "mvni">;
// Assembly aliases without an explicit shift, in both the "$Vd.<T>" and the
// "mvni.<T> $Vd" spellings; the alias result passes 0 as the instruction's
// last operand.
def : InstAlias<"mvni $Vd.4h, $imm", (MVNIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
def : InstAlias<"mvni $Vd.8h, $imm", (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
def : InstAlias<"mvni $Vd.2s, $imm", (MVNIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
def : InstAlias<"mvni $Vd.4s, $imm", (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
def : InstAlias<"mvni.4h $Vd, $imm", (MVNIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
def : InstAlias<"mvni.8h $Vd, $imm", (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
def : InstAlias<"mvni.2s $Vd, $imm", (MVNIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
def : InstAlias<"mvni.4s $Vd, $imm", (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
// Selection patterns: AArch64mvni_shift maps onto the shifted MVNI of the
// matching arrangement.
def : Pat<(v2i32 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
(MVNIv2i32 imm0_255:$imm8, imm:$shift)>;
def : Pat<(v4i32 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
(MVNIv4i32 imm0_255:$imm8, imm:$shift)>;
def : Pat<(v4i16 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
(MVNIv4i16 imm0_255:$imm8, imm:$shift)>;
def : Pat<(v8i16 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
(MVNIv8i16 imm0_255:$imm8, imm:$shift)>;
// EDIT per word: 2s & 4s with MSL shifter
let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
def MVNIv2s_msl : SIMDModifiedImmMoveMSL<0, 1, {1,1,0,?}, V64, "mvni", ".2s",
[(set (v2i32 V64:$Rd),
(AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
def MVNIv4s_msl : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s",
[(set (v4i32 V128:$Rd),
(AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
}
//----------------------------------------------------------------------------
// AdvSIMD indexed element
//----------------------------------------------------------------------------
// Indexed (by-element) fused multiply-add / multiply-subtract instructions.
let hasSideEffects = 0 in {
defm FMLA : SIMDFPIndexedTied<0, 0b0001, "fmla">;
defm FMLS : SIMDFPIndexedTied<0, 0b0101, "fmls">;
}
// NOTE: Operands are reordered in the FMLA/FMLS PatFrags because the
// instruction expects the addend first, while the intrinsic expects it last.
// On the other hand, there are quite a few valid combinatorial options due to
// the commutativity of multiplication and the fact that (-x) * y = x * (-y).
// FMLA: both orderings of the two multiplicands.
defm : SIMDFPIndexedTiedPatterns<"FMLA",
TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)>>;
defm : SIMDFPIndexedTiedPatterns<"FMLA",
TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)>>;
// FMLS: the fneg may sit on either multiplicand, in either operand position,
// giving four fragments.
defm : SIMDFPIndexedTiedPatterns<"FMLS",
TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
defm : SIMDFPIndexedTiedPatterns<"FMLS",
TriOpFrag<(fma node:$RHS, (fneg node:$MHS), node:$LHS)> >;
defm : SIMDFPIndexedTiedPatterns<"FMLS",
TriOpFrag<(fma (fneg node:$RHS), node:$MHS, node:$LHS)> >;
defm : SIMDFPIndexedTiedPatterns<"FMLS",
TriOpFrag<(fma (fneg node:$MHS), node:$RHS, node:$LHS)> >;
// Selects the indexed FMLS instructions for an fma whose by-element
// multiplicand was negated *before* it was duplicated or extracted, i.e. the
// fneg sits inside the AArch64dup / AArch64duplane / vector_extract node
// instead of directly on an fma operand.
//
// OpNode is the three-operand fma fragment to match. In every pattern $Rd is
// the accumulator, $Rn the plain multiplicand and $Rm the (un-negated) source
// of the indexed multiplicand that is handed to FMLS.
multiclass FMLSIndexedAfterNegPatterns<SDPatternOperator OpNode> {
// 3 variants for the .2s version: DUPLANE from 128-bit, DUPLANE from 64-bit
// and DUP scalar.
def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
(AArch64duplane32 (v4f32 (fneg V128:$Rm)),
VectorIndexS:$idx))),
(FMLSv2i32_indexed V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>;
def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
(v2f32 (AArch64duplane32
(v4f32 (insert_subvector undef,
(v2f32 (fneg V64:$Rm)),
(i32 0))),
VectorIndexS:$idx)))),
(FMLSv2i32_indexed V64:$Rd, V64:$Rn,
(SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
VectorIndexS:$idx)>;
// DUP of a scalar: place the scalar in the low subregister and use lane 0.
def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
(AArch64dup (f32 (fneg FPR32Op:$Rm))))),
(FMLSv2i32_indexed V64:$Rd, V64:$Rn,
(SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
// 3 variants for the .4s version: DUPLANE from 128-bit, DUPLANE from 64-bit
// and DUP scalar.
def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
(AArch64duplane32 (v4f32 (fneg V128:$Rm)),
VectorIndexS:$idx))),
(FMLSv4i32_indexed V128:$Rd, V128:$Rn, V128:$Rm,
VectorIndexS:$idx)>;
def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
(v4f32 (AArch64duplane32
(v4f32 (insert_subvector undef,
(v2f32 (fneg V64:$Rm)),
(i32 0))),
VectorIndexS:$idx)))),
(FMLSv4i32_indexed V128:$Rd, V128:$Rn,
(SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
VectorIndexS:$idx)>;
def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
(AArch64dup (f32 (fneg FPR32Op:$Rm))))),
(FMLSv4i32_indexed V128:$Rd, V128:$Rn,
(SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
// 2 variants for the .2d version: DUPLANE from 128-bit, and DUP scalar
// (DUPLANE from 64-bit would be trivial).
// The lane index of a 64-bit element is a VectorIndexD; use the same operand
// class in the matched pattern and in the emitted instruction.
def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
(AArch64duplane64 (v2f64 (fneg V128:$Rm)),
VectorIndexD:$idx))),
(FMLSv2i64_indexed
V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexD:$idx)>;
def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
(AArch64dup (f64 (fneg FPR64Op:$Rm))))),
(FMLSv2i64_indexed V128:$Rd, V128:$Rn,
(SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>;
// 2 variants for 32-bit scalar version: extract from .2s or from .4s
def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
(vector_extract (v4f32 (fneg V128:$Rm)),
VectorIndexS:$idx))),
(FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn,
V128:$Rm, VectorIndexS:$idx)>;
def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
(vector_extract (v4f32 (insert_subvector undef,
(v2f32 (fneg V64:$Rm)),
(i32 0))),
VectorIndexS:$idx))),
(FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn,
(SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>;
// 1 variant for 64-bit scalar version: extract from .2d
// (a v2f64 source is indexed with VectorIndexD, not VectorIndexS).
def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn),
(vector_extract (v2f64 (fneg V128:$Rm)),
VectorIndexD:$idx))),
(FMLSv1i64_indexed FPR64:$Rd, FPR64:$Rn,
V128:$Rm, VectorIndexD:$idx)>;
}
// Instantiate the negate-before-dup FMLS patterns for both orders of the two
// fma multiplicands ($RHS,$MHS and $MHS,$RHS); $LHS is the accumulator.
defm : FMLSIndexedAfterNegPatterns<
TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >;
defm : FMLSIndexedAfterNegPatterns<
TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)> >;
// By-element floating-point multiplies: FMULX is bound to the
// int_aarch64_neon_fmulx intrinsic, FMUL to the generic fmul node.
defm FMULX : SIMDFPIndexed<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>;
defm FMUL : SIMDFPIndexed<0, 0b1001, "fmul", fmul>;
// fmul by an AArch64dup of a scalar FP register: select the indexed FMUL with
// lane 0, after inserting the scalar into the ssub/dsub subregister of an
// otherwise undefined (IMPLICIT_DEF) vector register.
def : Pat<(v2f32 (fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))),
(FMULv2i32_indexed V64:$Rn,
(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub),
(i64 0))>;
def : Pat<(v4f32 (fmul V128:$Rn, (AArch64dup (f32 FPR32:$Rm)))),
(FMULv4i32_indexed V128:$Rn,
(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub),
(i64 0))>;
def : Pat<(v2f64 (fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))),
(FMULv2i64_indexed V128:$Rn,
(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rm, dsub),
(i64 0))>;
// Integer by-element multiply and multiply-accumulate families.
// The accumulating forms are described by a TriOpFrag that adds to, or
// subtracts from, $LHS the product of $MHS and $RHS; the widening forms use
// the smull/umull intrinsics for the product. The SQDML*/SQRDML* families
// are parameterised with the int_aarch64_neon_sqadd / sqsub intrinsic.
defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_aarch64_neon_sqdmulh>;
defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
defm MLA : SIMDVectorIndexedHSTied<1, 0b0000, "mla",
TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))>>;
defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls",
TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))>>;
defm MUL : SIMDVectorIndexedHS<0, 0b1000, "mul", mul>;
defm SMLAL : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal",
TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
defm SMLSL : SIMDVectorIndexedLongSDTied<0, 0b0110, "smlsl",
TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull",
int_aarch64_neon_smull>;
defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal",
int_aarch64_neon_sqadd>;
defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl",
int_aarch64_neon_sqsub>;
defm SQRDMLAH : SIMDIndexedSQRDMLxHSDTied<1, 0b1101, "sqrdmlah",
int_aarch64_neon_sqadd>;
defm SQRDMLSH : SIMDIndexedSQRDMLxHSDTied<1, 0b1111, "sqrdmlsh",
int_aarch64_neon_sqsub>;
defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_aarch64_neon_sqdmull>;
defm UMLAL : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal",
TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
defm UMLSL : SIMDVectorIndexedLongSDTied<1, 0b0110, "umlsl",
TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
defm UMULL : SIMDVectorIndexedLongSD<1, 0b1010, "umull",
int_aarch64_neon_umull>;
// A scalar sqdmull with the second operand being a vector lane can be
// handled directly with the indexed instruction encoding.
def : Pat<(int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
(vector_extract (v4i32 V128:$Vm),
VectorIndexS:$idx)),
(SQDMULLv1i64_indexed FPR32:$Rn, V128:$Vm, VectorIndexS:$idx)>;
//----------------------------------------------------------------------------
// AdvSIMD scalar shift instructions
//----------------------------------------------------------------------------
// Scalar FP <-> integer conversion instructions that take a right-shift
// immediate operand (vecshiftR*). No selection pattern is attached here; the
// patterns follow separately (see the comment below for why).
defm FCVTZS : SIMDFPScalarRShift<0, 0b11111, "fcvtzs">;
defm FCVTZU : SIMDFPScalarRShift<1, 0b11111, "fcvtzu">;
defm SCVTF : SIMDFPScalarRShift<0, 0b11100, "scvtf">;
defm UCVTF : SIMDFPScalarRShift<1, 0b11100, "ucvtf">;
// Codegen patterns for the above. We don't put these directly on the
// instructions because TableGen's type inference can't handle the truth.
// Having the same base pattern for fp <--> int totally freaks it out.
// vcvtfp2fxs / vcvtfp2fxu -> FCVTZS / FCVTZU: 32-bit, 64-bit scalar and
// single-element (v1i64 / v1f64) vector forms.
def : Pat<(int_aarch64_neon_vcvtfp2fxs FPR32:$Rn, vecshiftR32:$imm),
(FCVTZSs FPR32:$Rn, vecshiftR32:$imm)>;
def : Pat<(int_aarch64_neon_vcvtfp2fxu FPR32:$Rn, vecshiftR32:$imm),
(FCVTZUs FPR32:$Rn, vecshiftR32:$imm)>;
def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxs (f64 FPR64:$Rn), vecshiftR64:$imm)),
(FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>;
def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxu (f64 FPR64:$Rn), vecshiftR64:$imm)),
(FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>;
def : Pat<(v1i64 (int_aarch64_neon_vcvtfp2fxs (v1f64 FPR64:$Rn),
vecshiftR64:$imm)),
(FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>;
def : Pat<(v1i64 (int_aarch64_neon_vcvtfp2fxu (v1f64 FPR64:$Rn),
vecshiftR64:$imm)),
(FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>;
// vcvtfxs2fp / vcvtfxu2fp -> SCVTF / UCVTF: the same three operand shapes.
def : Pat<(int_aarch64_neon_vcvtfxu2fp FPR32:$Rn, vecshiftR32:$imm),
(UCVTFs FPR32:$Rn, vecshiftR32:$imm)>;
def : Pat<(f64 (int_aarch64_neon_vcvtfxu2fp (i64 FPR64:$Rn), vecshiftR64:$imm)),
(UCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
def : Pat<(v1f64 (int_aarch64_neon_vcvtfxs2fp (v1i64 FPR64:$Rn),
vecshiftR64:$imm)),
(SCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
def : Pat<(f64 (int_aarch64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR64:$imm)),
(SCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
def : Pat<(v1f64 (int_aarch64_neon_vcvtfxu2fp (v1i64 FPR64:$Rn),
vecshiftR64:$imm)),
(UCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
def : Pat<(int_aarch64_neon_vcvtfxs2fp FPR32:$Rn, vecshiftR32:$imm),
(SCVTFs FPR32:$Rn, vecshiftR32:$imm)>;
// Patterns for FP16 intrinsics - requires reg copy to/from as i16s not supported.
// Integer -> f16: the integer source lives in an FPR32/FPR64 register and only
// its hsub subregister is handed to the H-form SCVTF/UCVTF. An explicit
// sext_inreg-from-i16 (signed) or mask with 65535 (unsigned) on the source is
// matched and dropped.
def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i32 (sext_inreg FPR32:$Rn, i16)), vecshiftR16:$imm)),
(SCVTFh (EXTRACT_SUBREG FPR32:$Rn, hsub), vecshiftR16:$imm)>;
def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i32 FPR32:$Rn), vecshiftR16:$imm)),
(SCVTFh (EXTRACT_SUBREG FPR32:$Rn, hsub), vecshiftR16:$imm)>;
def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR16:$imm)),
(SCVTFh (EXTRACT_SUBREG FPR64:$Rn, hsub), vecshiftR16:$imm)>;
def : Pat<(f16 (int_aarch64_neon_vcvtfxu2fp
(and FPR32:$Rn, (i32 65535)),
vecshiftR16:$imm)),
(UCVTFh (EXTRACT_SUBREG FPR32:$Rn, hsub), vecshiftR16:$imm)>;
def : Pat<(f16 (int_aarch64_neon_vcvtfxu2fp FPR32:$Rn, vecshiftR16:$imm)),
(UCVTFh (EXTRACT_SUBREG FPR32:$Rn, hsub), vecshiftR16:$imm)>;
def : Pat<(f16 (int_aarch64_neon_vcvtfxu2fp (i64 FPR64:$Rn), vecshiftR16:$imm)),
(UCVTFh (EXTRACT_SUBREG FPR64:$Rn, hsub), vecshiftR16:$imm)>;
// f16 -> i32/i64: the H-form FCVTZS/FCVTZU result is inserted into the hsub
// subregister of an otherwise undefined (IMPLICIT_DEF) i32/i64 value.
def : Pat<(i32 (int_aarch64_neon_vcvtfp2fxs (f16 FPR16:$Rn), vecshiftR32:$imm)),
(i32 (INSERT_SUBREG
(i32 (IMPLICIT_DEF)),
(FCVTZSh FPR16:$Rn, vecshiftR32:$imm),
hsub))>;
def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxs (f16 FPR16:$Rn), vecshiftR64:$imm)),
(i64 (INSERT_SUBREG
(i64 (IMPLICIT_DEF)),
(FCVTZSh FPR16:$Rn, vecshiftR64:$imm),
hsub))>;
def : Pat<(i32 (int_aarch64_neon_vcvtfp2fxu (f16 FPR16:$Rn), vecshiftR32:$imm)),
(i32 (INSERT_SUBREG
(i32 (IMPLICIT_DEF)),
(FCVTZUh FPR16:$Rn, vecshiftR32:$imm),
hsub))>;
def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxu (f16 FPR16:$Rn), vecshiftR64:$imm)),
(i64 (INSERT_SUBREG
(i64 (IMPLICIT_DEF)),
(FCVTZUh FPR16:$Rn, vecshiftR64:$imm),
hsub))>;
// f16 facge/facgt intrinsics returning i32: same hsub insertion of the
// 16-bit instruction result into an undefined i32.
def : Pat<(i32 (int_aarch64_neon_facge (f16 FPR16:$Rn), (f16 FPR16:$Rm))),
(i32 (INSERT_SUBREG
(i32 (IMPLICIT_DEF)),
(FACGE16 FPR16:$Rn, FPR16:$Rm),
hsub))>;
def : Pat<(i32 (int_aarch64_neon_facgt (f16 FPR16:$Rn), (f16 FPR16:$Rm))),
(i32 (INSERT_SUBREG
(i32 (IMPLICIT_DEF)),
(FACGT16 FPR16:$Rn, FPR16:$Rm),
hsub))>;
// Scalar shift-by-immediate instruction families. Each defm binds the
// mnemonic to the DAG node or intrinsic given as its last argument; SLI and
// SRI are defined without one. The accumulating forms (SRSRA, SSRA, URSRA,
// USRA) are described as an add of $LHS with the corresponding right shift of
// $MHS by $RHS.
defm SHL : SIMDScalarLShiftD< 0, 0b01010, "shl", AArch64vshl>;
defm SLI : SIMDScalarLShiftDTied<1, 0b01010, "sli">;
defm SQRSHRN : SIMDScalarRShiftBHS< 0, 0b10011, "sqrshrn",
int_aarch64_neon_sqrshrn>;
defm SQRSHRUN : SIMDScalarRShiftBHS< 1, 0b10001, "sqrshrun",
int_aarch64_neon_sqrshrun>;
defm SQSHLU : SIMDScalarLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>;
defm SQSHL : SIMDScalarLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>;
defm SQSHRN : SIMDScalarRShiftBHS< 0, 0b10010, "sqshrn",
int_aarch64_neon_sqshrn>;
defm SQSHRUN : SIMDScalarRShiftBHS< 1, 0b10000, "sqshrun",
int_aarch64_neon_sqshrun>;
defm SRI : SIMDScalarRShiftDTied< 1, 0b01000, "sri">;
defm SRSHR : SIMDScalarRShiftD< 0, 0b00100, "srshr", AArch64srshri>;
defm SRSRA : SIMDScalarRShiftDTied< 0, 0b00110, "srsra",
TriOpFrag<(add node:$LHS,
(AArch64srshri node:$MHS, node:$RHS))>>;
defm SSHR : SIMDScalarRShiftD< 0, 0b00000, "sshr", AArch64vashr>;
defm SSRA : SIMDScalarRShiftDTied< 0, 0b00010, "ssra",
TriOpFrag<(add node:$LHS,
(AArch64vashr node:$MHS, node:$RHS))>>;
defm UQRSHRN : SIMDScalarRShiftBHS< 1, 0b10011, "uqrshrn",
int_aarch64_neon_uqrshrn>;
defm UQSHL : SIMDScalarLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>;
defm UQSHRN : SIMDScalarRShiftBHS< 1, 0b10010, "uqshrn",
int_aarch64_neon_uqshrn>;
defm URSHR : SIMDScalarRShiftD< 1, 0b00100, "urshr", AArch64urshri>;
defm URSRA : SIMDScalarRShiftDTied< 1, 0b00110, "ursra",
TriOpFrag<(add node:$LHS,
(AArch64urshri node:$MHS, node:$RHS))>>;
defm USHR : SIMDScalarRShiftD< 1, 0b00000, "ushr", AArch64vlshr>;
defm USRA : SIMDScalarRShiftDTied< 1, 0b00010, "usra",
TriOpFrag<(add node:$LHS,
(AArch64vlshr node:$MHS, node:$RHS))>>;
//----------------------------------------------------------------------------
// AdvSIMD vector shift instructions
//----------------------------------------------------------------------------
// Vector shift-by-immediate instruction families, plus the vector forms of the
// FP <-> integer conversions that take a shift immediate. As with the scalar
// section, accumulating forms are written as an add of $LHS with the shifted
// $MHS, SHRN as a trunc of an arithmetic right shift, and SSHLL/USHLL as a
// left shift of a sign-/zero-extended $LHS.
defm FCVTZS:SIMDVectorRShiftSD<0, 0b11111, "fcvtzs", int_aarch64_neon_vcvtfp2fxs>;
defm FCVTZU:SIMDVectorRShiftSD<1, 0b11111, "fcvtzu", int_aarch64_neon_vcvtfp2fxu>;
defm SCVTF: SIMDVectorRShiftToFP<0, 0b11100, "scvtf",
int_aarch64_neon_vcvtfxs2fp>;
defm RSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn",
int_aarch64_neon_rshrn>;
defm SHL : SIMDVectorLShiftBHSD<0, 0b01010, "shl", AArch64vshl>;
defm SHRN : SIMDVectorRShiftNarrowBHS<0, 0b10000, "shrn",
BinOpFrag<(trunc (AArch64vashr node:$LHS, node:$RHS))>>;
defm SLI : SIMDVectorLShiftBHSDTied<1, 0b01010, "sli", int_aarch64_neon_vsli>;
// The v1i64 form of the vsli intrinsic is selected to the scalar SLId.
def : Pat<(v1i64 (int_aarch64_neon_vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
(i32 vecshiftL64:$imm))),
(SLId FPR64:$Rd, FPR64:$Rn, vecshiftL64:$imm)>;
defm SQRSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10011, "sqrshrn",
int_aarch64_neon_sqrshrn>;
defm SQRSHRUN: SIMDVectorRShiftNarrowBHS<1, 0b10001, "sqrshrun",
int_aarch64_neon_sqrshrun>;
defm SQSHLU : SIMDVectorLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>;
defm SQSHL : SIMDVectorLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>;
defm SQSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10010, "sqshrn",
int_aarch64_neon_sqshrn>;
defm SQSHRUN : SIMDVectorRShiftNarrowBHS<1, 0b10000, "sqshrun",
int_aarch64_neon_sqshrun>;
defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", int_aarch64_neon_vsri>;
// The v1i64 form of the vsri intrinsic is selected to the scalar SRId.
def : Pat<(v1i64 (int_aarch64_neon_vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
(i32 vecshiftR64:$imm))),
(SRId FPR64:$Rd, FPR64:$Rn, vecshiftR64:$imm)>;
defm SRSHR : SIMDVectorRShiftBHSD<0, 0b00100, "srshr", AArch64srshri>;
defm SRSRA : SIMDVectorRShiftBHSDTied<0, 0b00110, "srsra",
TriOpFrag<(add node:$LHS,
(AArch64srshri node:$MHS, node:$RHS))> >;
defm SSHLL : SIMDVectorLShiftLongBHSD<0, 0b10100, "sshll",
BinOpFrag<(AArch64vshl (sext node:$LHS), node:$RHS)>>;
defm SSHR : SIMDVectorRShiftBHSD<0, 0b00000, "sshr", AArch64vashr>;
defm SSRA : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra",
TriOpFrag<(add node:$LHS, (AArch64vashr node:$MHS, node:$RHS))>>;
defm UCVTF : SIMDVectorRShiftToFP<1, 0b11100, "ucvtf",
int_aarch64_neon_vcvtfxu2fp>;
defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn",
int_aarch64_neon_uqrshrn>;
defm UQSHL : SIMDVectorLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>;
defm UQSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10010, "uqshrn",
int_aarch64_neon_uqshrn>;
defm URSHR : SIMDVectorRShiftBHSD<1, 0b00100, "urshr", AArch64urshri>;
defm URSRA : SIMDVectorRShiftBHSDTied<1, 0b00110, "ursra",
TriOpFrag<(add node:$LHS,
(AArch64urshri node:$MHS, node:$RHS))> >;
defm USHLL : SIMDVectorLShiftLongBHSD<1, 0b10100, "ushll",
BinOpFrag<(AArch64vshl (zext node:$LHS), node:$RHS)>>;
defm USHR : SIMDVectorRShiftBHSD<1, 0b00000, "ushr", AArch64vlshr>;
defm USRA : SIMDVectorRShiftBHSDTied<1, 0b00010, "usra",
TriOpFrag<(add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))> >;
// SHRN patterns for when a logical right shift was used instead of arithmetic
// (the immediate guarantees no sign bits actually end up in the result so it
// doesn't matter).
// The first three patterns narrow into a fresh 64-bit result.
def : Pat<(v8i8 (trunc (AArch64vlshr (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))),
(SHRNv8i8_shift V128:$Rn, vecshiftR16Narrow:$imm)>;
def : Pat<(v4i16 (trunc (AArch64vlshr (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))),
(SHRNv4i16_shift V128:$Rn, vecshiftR32Narrow:$imm)>;
def : Pat<(v2i32 (trunc (AArch64vlshr (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))),
(SHRNv2i32_shift V128:$Rn, vecshiftR64Narrow:$imm)>;
// The next three match a concat_vectors of an existing low half ($Rd) with the
// narrowed value: $Rd is placed in the dsub subregister of an otherwise
// undefined 128-bit register and passed as the first operand of the
// 128-bit-result SHRN form.
def : Pat<(v16i8 (concat_vectors (v8i8 V64:$Rd),
(trunc (AArch64vlshr (v8i16 V128:$Rn),
vecshiftR16Narrow:$imm)))),
(SHRNv16i8_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
V128:$Rn, vecshiftR16Narrow:$imm)>;
def : Pat<(v8i16 (concat_vectors (v4i16 V64:$Rd),
(trunc (AArch64vlshr (v4i32 V128:$Rn),
vecshiftR32Narrow:$imm)))),
(SHRNv8i16_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
V128:$Rn, vecshiftR32Narrow:$imm)>;
// The v2i64 source is matched with vecshiftR64Narrow, so the emitted
// instruction operand uses the same operand class.
def : Pat<(v4i32 (concat_vectors (v2i32 V64:$Rd),
(trunc (AArch64vlshr (v2i64 V128:$Rn),
vecshiftR64Narrow:$imm)))),
(SHRNv4i32_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
V128:$Rn, vecshiftR64Narrow:$imm)>;
// Vector sign and zero extensions are implemented with SSHLL and USHLL.
// Anyexts are implemented as zexts.
// Every pattern passes a shift amount of (i32 0), so only the widening
// remains.
def : Pat<(v8i16 (sext (v8i8 V64:$Rn))), (SSHLLv8i8_shift V64:$Rn, (i32 0))>;
def : Pat<(v8i16 (zext (v8i8 V64:$Rn))), (USHLLv8i8_shift V64:$Rn, (i32 0))>;
def : Pat<(v8i16 (anyext (v8i8 V64:$Rn))), (USHLLv8i8_shift V64:$Rn, (i32 0))>;
def : Pat<(v4i32 (sext (v4i16 V64:$Rn))), (SSHLLv4i16_shift V64:$Rn, (i32 0))>;
def : Pat<(v4i32 (zext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>;
def : Pat<(v4i32 (anyext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>;
def : Pat<(v2i64 (sext (v2i32 V64:$Rn))), (SSHLLv2i32_shift V64:$Rn, (i32 0))>;
def : Pat<(v2i64 (zext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>;
def : Pat<(v2i64 (anyext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>;
// Also match an extend from the upper half of a 128 bit source register.
// The extract_subvector start index (8, 4 or 2 elements) is the midpoint of
// the source vector; the instruction forms used here take the whole V128
// register as their source operand.
def : Pat<(v8i16 (anyext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))),
(USHLLv16i8_shift V128:$Rn, (i32 0))>;
def : Pat<(v8i16 (zext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))),
(USHLLv16i8_shift V128:$Rn, (i32 0))>;
def : Pat<(v8i16 (sext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))),
(SSHLLv16i8_shift V128:$Rn, (i32 0))>;
def : Pat<(v4i32 (anyext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))),
(USHLLv8i16_shift V128:$Rn, (i32 0))>;
def : Pat<(v4i32 (zext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))),
(USHLLv8i16_shift V128:$Rn, (i32 0))>;
def : Pat<(v4i32 (sext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))),
(SSHLLv8i16_shift V128:$Rn, (i32 0))>;
def : Pat<(v2i64 (anyext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))),
(USHLLv4i32_shift V128:$Rn, (i32 0))>;
def : Pat<(v2i64 (zext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))),
(USHLLv4i32_shift V128:$Rn, (i32 0))>;
def : Pat<(v2i64 (sext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))),
(SSHLLv4i32_shift V128:$Rn, (i32 0))>;
// Assembler aliases for the extend mnemonics. Each alias maps to the matching
// SSHLL/USHLL instruction with an immediate of 0, and each is provided in two
// spellings: with the arrangement suffixed to the mnemonic ("sxtl.8h") and
// with the arrangement on each register operand ("sxtl $dst.8h, $src1.8b").
// Vector shift sxtl aliases
def : InstAlias<"sxtl.8h $dst, $src1",
(SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
def : InstAlias<"sxtl $dst.8h, $src1.8b",
(SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
def : InstAlias<"sxtl.4s $dst, $src1",
(SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
def : InstAlias<"sxtl $dst.4s, $src1.4h",
(SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
def : InstAlias<"sxtl.2d $dst, $src1",
(SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
def : InstAlias<"sxtl $dst.2d, $src1.2s",
(SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
// Vector shift sxtl2 aliases
def : InstAlias<"sxtl2.8h $dst, $src1",
(SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
def : InstAlias<"sxtl2 $dst.8h, $src1.16b",
(SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
def : InstAlias<"sxtl2.4s $dst, $src1",
(SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
def : InstAlias<"sxtl2 $dst.4s, $src1.8h",
(SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
def : InstAlias<"sxtl2.2d $dst, $src1",
(SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
def : InstAlias<"sxtl2 $dst.2d, $src1.4s",
(SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
// Vector shift uxtl aliases
def : InstAlias<"uxtl.8h $dst, $src1",
(USHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
def : InstAlias<"uxtl $dst.8h, $src1.8b",
(USHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
def : InstAlias<"uxtl.4s $dst, $src1",
(USHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
def : InstAlias<"uxtl $dst.4s, $src1.4h",
(USHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
def : InstAlias<"uxtl.2d $dst, $src1",
(USHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
def : InstAlias<"uxtl $dst.2d, $src1.2s",
(USHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
// Vector shift uxtl2 aliases
def : InstAlias<"uxtl2.8h $dst, $src1",
(USHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
def : InstAlias<"uxtl2 $dst.8h, $src1.16b",
(USHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
def : InstAlias<"uxtl2.4s $dst, $src1",
(USHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
def : InstAlias<"uxtl2 $dst.4s, $src1.8h",
(USHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
def : InstAlias<"uxtl2.2d $dst, $src1",
(USHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
def : InstAlias<"uxtl2 $dst.2d, $src1.4s",
(USHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
// If an integer is about to be converted to a floating point value,
// just load it on the floating point unit.
// These patterns are more complex because floating point loads do not
// support sign extension.
// The sign extension has to be explicitly added and is only supported for
// one step: byte-to-half, half-to-word, word-to-doubleword.
// SCVTF GPR -> FPR is 9 cycles.
// SCVTF FPR -> FPR is 4 cycles.
// (sign extension with lengthen) SXTL FPR -> FPR is 2 cycles.
// Therefore, we can do 2 sign extensions and one SCVTF FPR -> FPR
// and still be faster.
// However, this is not good for code size.
// 8-bits -> float. 2 sizes step-up.
// Matches sint_to_fp of a sign-extending i8 load to f32.
//   addrmode - the address operand dag matched under sextloadi8.
//   INST     - the byte load instruction dag to emit for that address.
// The emitted sequence inserts the loaded byte into bsub of an undefined f64,
// widens it with SSHLLv8i8 (shift 0), takes dsub of that result, widens again
// with SSHLLv4i16 (shift 0), takes ssub, and converts with SCVTFv1i32.
// Only enabled under NotForCodeSize and UseAlternateSExtLoadCVTF32.
class SExtLoadi8CVTf32Pat<dag addrmode, dag INST>
: Pat<(f32 (sint_to_fp (i32 (sextloadi8 addrmode)))),
(SCVTFv1i32 (f32 (EXTRACT_SUBREG
(SSHLLv4i16_shift
(f64
(EXTRACT_SUBREG
(SSHLLv8i8_shift
(INSERT_SUBREG (f64 (IMPLICIT_DEF)),
INST,
bsub),
0),
dsub)),
0),
ssub)))>,
Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;
// One instantiation per addressing mode: register offset (W and X index),
// scaled unsigned immediate, and unscaled signed immediate.
def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext),
(LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>;
def : SExtLoadi8CVTf32Pat<(ro8.Xpat GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$ext),
(LDRBroX GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$ext)>;
def : SExtLoadi8CVTf32Pat<(am_indexed8 GPR64sp:$Rn, uimm12s1:$offset),
(LDRBui GPR64sp:$Rn, uimm12s1:$offset)>;
def : SExtLoadi8CVTf32Pat<(am_unscaled8 GPR64sp:$Rn, simm9:$offset),
(LDURBi GPR64sp:$Rn, simm9:$offset)>;
// 16-bits -> float. 1 size step-up.
// Matches sint_to_fp of a sign-extending i16 load to f32.
//   addrmode - the address operand dag matched under sextloadi16.
//   INST     - the half-word load instruction dag to emit for that address.
// The loaded value is inserted into hsub of an undefined f64, widened once
// with SSHLLv4i16 (shift 0), and ssub of the result is converted with
// SCVTFv1i32. Only enabled under NotForCodeSize.
class SExtLoadi16CVTf32Pat<dag addrmode, dag INST>
: Pat<(f32 (sint_to_fp (i32 (sextloadi16 addrmode)))),
(SCVTFv1i32 (f32 (EXTRACT_SUBREG
(SSHLLv4i16_shift
(INSERT_SUBREG (f64 (IMPLICIT_DEF)),
INST,
hsub),
0),
ssub)))>, Requires<[NotForCodeSize]>;
// One instantiation per addressing mode.
def : SExtLoadi16CVTf32Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
(LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
def : SExtLoadi16CVTf32Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
(LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext)>;
def : SExtLoadi16CVTf32Pat<(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset),
(LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
def : SExtLoadi16CVTf32Pat<(am_unscaled16 GPR64sp:$Rn, simm9:$offset),
(LDURHi GPR64sp:$Rn, simm9:$offset)>;
// 32-bits to 32-bits are handled in target specific dag combine:
// performIntToFpCombine.
// 64-bits integer to 32-bits floating point, not possible with
// SCVTF on floating point registers (both source and destination
// must have the same size).
// Here are the patterns for 8, 16, 32, and 64-bits to double.
// 8-bits -> double. 3 size step-up: give up.
// 16-bits -> double. 2 size step.
// Matches sint_to_fp of a sign-extending i16 load to f64.
//   addrmode - the address operand dag matched under sextloadi16.
//   INST     - the half-word load instruction dag to emit for that address.
// The loaded value is inserted into hsub of an undefined f64, widened with
// SSHLLv4i16 then SSHLLv2i32 (both shift 0, taking dsub in between), and dsub
// of the final result is converted with SCVTFv1i64.
// NOTE(review): this f64 pattern is gated on UseAlternateSExtLoadCVTF32, the
// same predicate as the i8 -> f32 pattern; confirm that is intended.
class SExtLoadi16CVTf64Pat<dag addrmode, dag INST>
: Pat <(f64 (sint_to_fp (i32 (sextloadi16 addrmode)))),
(SCVTFv1i64 (f64 (EXTRACT_SUBREG
(SSHLLv2i32_shift
(f64
(EXTRACT_SUBREG
(SSHLLv4i16_shift
(INSERT_SUBREG (f64 (IMPLICIT_DEF)),
INST,
hsub),
0),
dsub)),
0),
dsub)))>,
Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;
// One instantiation per addressing mode.
def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
(LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
def : SExtLoadi16CVTf64Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
(LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext)>;
def : SExtLoadi16CVTf64Pat<(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset),
(LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
def : SExtLoadi16CVTf64Pat<(am_unscaled16 GPR64sp:$Rn, simm9:$offset),
(LDURHi GPR64sp:$Rn, simm9:$offset)>;
// 32-bits -> double. 1 size step-up.
// Matches sint_to_fp of a plain i32 load to f64.
//   addrmode - the address operand dag matched under load.
//   INST     - the 32-bit load instruction dag to emit for that address.
// The loaded word is inserted into ssub of an undefined f64, widened once with
// SSHLLv2i32 (shift 0), and dsub of the result is converted with SCVTFv1i64.
// Only enabled under NotForCodeSize.
class SExtLoadi32CVTf64Pat<dag addrmode, dag INST>
: Pat <(f64 (sint_to_fp (i32 (load addrmode)))),
(SCVTFv1i64 (f64 (EXTRACT_SUBREG
(SSHLLv2i32_shift
(INSERT_SUBREG (f64 (IMPLICIT_DEF)),
INST,
ssub),
0),
dsub)))>, Requires<[NotForCodeSize]>;
// One instantiation per addressing mode.
def : SExtLoadi32CVTf64Pat<(ro32.Wpat GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext),
(LDRSroW GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext)>;
def : SExtLoadi32CVTf64Pat<(ro32.Xpat GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$ext),
(LDRSroX GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$ext)>;
def : SExtLoadi32CVTf64Pat<(am_indexed32 GPR64sp:$Rn, uimm12s4:$offset),
(LDRSui GPR64sp:$Rn, uimm12s4:$offset)>;
def : SExtLoadi32CVTf64Pat<(am_unscaled32 GPR64sp:$Rn, simm9:$offset),
(LDURSi GPR64sp:$Rn, simm9:$offset)>;
// 64-bits -> double are handled in target specific dag combine:
// performIntToFpCombine.
//----------------------------------------------------------------------------
// AdvSIMD Load-Store Structure
//----------------------------------------------------------------------------
// Multiple-structure load/store instruction families, one defm per mnemonic
// (ld1-ld4, st1-st4).
defm LD1 : SIMDLd1Multiple<"ld1">;
defm LD2 : SIMDLd2Multiple<"ld2">;
defm LD3 : SIMDLd3Multiple<"ld3">;
defm LD4 : SIMDLd4Multiple<"ld4">;
defm ST1 : SIMDSt1Multiple<"st1">;
defm ST2 : SIMDSt2Multiple<"st2">;
defm ST3 : SIMDSt3Multiple<"st3">;
defm ST4 : SIMDSt4Multiple<"st4">;
// Selects a whole-vector load of type `ty` from a plain base register
// (GPR64sp:$Rn, no offset) to the one-register LD1 instruction INST.
class Ld1Pat<ValueType ty, Instruction INST>
: Pat<(ty (load GPR64sp:$Rn)), (INST GPR64sp:$Rn)>;
// 128-bit integer vector types.
def : Ld1Pat<v16i8, LD1Onev16b>;
def : Ld1Pat<v8i16, LD1Onev8h>;
def : Ld1Pat<v4i32, LD1Onev4s>;
def : Ld1Pat<v2i64, LD1Onev2d>;
// 64-bit integer vector types.
def : Ld1Pat<v8i8, LD1Onev8b>;
def : Ld1Pat<v4i16, LD1Onev4h>;
def : Ld1Pat<v2i32, LD1Onev2s>;
def : Ld1Pat<v1i64, LD1Onev1d>;
// Selects a whole-vector store of a value of type `ty` to a plain base
// register (GPR64sp:$Rn, no offset) to the one-register ST1 instruction INST.
class St1Pat<ValueType ty, Instruction INST>
: Pat<(store ty:$Vt, GPR64sp:$Rn),
(INST ty:$Vt, GPR64sp:$Rn)>;
// 128-bit integer vector types.
def : St1Pat<v16i8, ST1Onev16b>;
def : St1Pat<v8i16, ST1Onev8h>;
def : St1Pat<v4i32, ST1Onev4s>;
def : St1Pat<v2i64, ST1Onev2d>;
// 64-bit integer vector types.
def : St1Pat<v8i8, ST1Onev8b>;
def : St1Pat<v4i16, ST1Onev4h>;
def : St1Pat<v2i32, ST1Onev2s>;
def : St1Pat<v1i64, ST1Onev1d>;
//---
// Single-element
//---
// Load-and-replicate families (ld1r-ld4r).
// NOTE(review): the four trailing integers scale with the register count
// (1,2,4,8 for one register up to 4,8,16,32 for four); presumably they are
// the per-element-size byte counts used for post-indexing - confirm against
// the SIMDLdR multiclass.
defm LD1R : SIMDLdR<0, 0b110, 0, "ld1r", "One", 1, 2, 4, 8>;
defm LD2R : SIMDLdR<1, 0b110, 0, "ld2r", "Two", 2, 4, 8, 16>;
defm LD3R : SIMDLdR<0, 0b111, 0, "ld3r", "Three", 3, 6, 12, 24>;
defm LD4R : SIMDLdR<1, 0b111, 0, "ld4r", "Four", 4, 8, 16, 32>;
// Single-lane loads for B/H/S/D elements into one to four registers. They are
// flagged as loads with no other side effects; each is given the matching
// register-list operand and GPR64pi<N> operand.
let mayLoad = 1, hasSideEffects = 0 in {
defm LD1 : SIMDLdSingleBTied<0, 0b000, "ld1", VecListOneb, GPR64pi1>;
defm LD1 : SIMDLdSingleHTied<0, 0b010, 0, "ld1", VecListOneh, GPR64pi2>;
defm LD1 : SIMDLdSingleSTied<0, 0b100, 0b00, "ld1", VecListOnes, GPR64pi4>;
defm LD1 : SIMDLdSingleDTied<0, 0b100, 0b01, "ld1", VecListOned, GPR64pi8>;
defm LD2 : SIMDLdSingleBTied<1, 0b000, "ld2", VecListTwob, GPR64pi2>;
defm LD2 : SIMDLdSingleHTied<1, 0b010, 0, "ld2", VecListTwoh, GPR64pi4>;
defm LD2 : SIMDLdSingleSTied<1, 0b100, 0b00, "ld2", VecListTwos, GPR64pi8>;
defm LD2 : SIMDLdSingleDTied<1, 0b100, 0b01, "ld2", VecListTwod, GPR64pi16>;
defm LD3 : SIMDLdSingleBTied<0, 0b001, "ld3", VecListThreeb, GPR64pi3>;
defm LD3 : SIMDLdSingleHTied<0, 0b011, 0, "ld3", VecListThreeh, GPR64pi6>;
defm LD3 : SIMDLdSingleSTied<0, 0b101, 0b00, "ld3", VecListThrees, GPR64pi12>;
defm LD3 : SIMDLdSingleDTied<0, 0b101, 0b01, "ld3", VecListThreed, GPR64pi24>;
defm LD4 : SIMDLdSingleBTied<1, 0b001, "ld4", VecListFourb, GPR64pi4>;
defm LD4 : SIMDLdSingleHTied<1, 0b011, 0, "ld4", VecListFourh, GPR64pi8>;
defm LD4 : SIMDLdSingleSTied<1, 0b101, 0b00, "ld4", VecListFours, GPR64pi16>;
defm LD4 : SIMDLdSingleDTied<1, 0b101, 0b01, "ld4", VecListFourd, GPR64pi32>;
}
// An AArch64dup of a scalar loaded from a plain base register is selected to
// the LD1R form with the matching element size. The i8/i16 element cases match
// an extending load (extloadi8 / extloadi16) producing i32.
def : Pat<(v8i8 (AArch64dup (i32 (extloadi8 GPR64sp:$Rn)))),
(LD1Rv8b GPR64sp:$Rn)>;
def : Pat<(v16i8 (AArch64dup (i32 (extloadi8 GPR64sp:$Rn)))),
(LD1Rv16b GPR64sp:$Rn)>;
def : Pat<(v4i16 (AArch64dup (i32 (extloadi16 GPR64sp:$Rn)))),
(LD1Rv4h GPR64sp:$Rn)>;
def : Pat<(v8i16 (AArch64dup (i32 (extloadi16 GPR64sp:$Rn)))),
(LD1Rv8h GPR64sp:$Rn)>;
def : Pat<(v2i32 (AArch64dup (i32 (load GPR64sp:$Rn)))),
(LD1Rv2s GPR64sp:$Rn)>;
def : Pat<(v4i32 (AArch64dup (i32 (load GPR64sp:$Rn)))),
(LD1Rv4s GPR64sp:$Rn)>;
def : Pat<(v2i64 (AArch64dup (i64 (load GPR64sp:$Rn)))),
(LD1Rv2d GPR64sp:$Rn)>;
def : Pat<(v1i64 (AArch64dup (i64 (load GPR64sp:$Rn)))),
(LD1Rv1d GPR64sp:$Rn)>;
// Grab the floating point version too
// (these reuse the LD1R form of the same element size as the integer case).
def : Pat<(v2f32 (AArch64dup (f32 (load GPR64sp:$Rn)))),
(LD1Rv2s GPR64sp:$Rn)>;
def : Pat<(v4f32 (AArch64dup (f32 (load GPR64sp:$Rn)))),
(LD1Rv4s GPR64sp:$Rn)>;
def : Pat<(v2f64 (AArch64dup (f64 (load GPR64sp:$Rn)))),
(LD1Rv2d GPR64sp:$Rn)>;
def : Pat<(v1f64 (AArch64dup (f64 (load GPR64sp:$Rn)))),
(LD1Rv1d GPR64sp:$Rn)>;
def : Pat<(v4f16 (AArch64dup (f16 (load GPR64sp:$Rn)))),
(LD1Rv4h GPR64sp:$Rn)>;
def : Pat<(v8f16 (AArch64dup (f16 (load GPR64sp:$Rn)))),
(LD1Rv8h GPR64sp:$Rn)>;
// Selects a vector_insert of a freshly loaded scalar into a 128-bit vector to
// a single-lane LD1.
//   scalar_load - the load operator to match (plain or extending).
//   VecIndex    - operand class of the lane index.
//   VTy / STy   - vector type and the scalar type the load produces.
//   LD1         - lane-load instruction; takes the existing vector, the lane
//                 index and the base address.
class Ld1Lane128Pat<SDPatternOperator scalar_load, Operand VecIndex,
ValueType VTy, ValueType STy, Instruction LD1>
: Pat<(vector_insert (VTy VecListOne128:$Rd),
(STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx),
(LD1 VecListOne128:$Rd, VecIndex:$idx, GPR64sp:$Rn)>;
def : Ld1Lane128Pat<extloadi8, VectorIndexB, v16i8, i32, LD1i8>;
def : Ld1Lane128Pat<extloadi16, VectorIndexH, v8i16, i32, LD1i16>;
def : Ld1Lane128Pat<load, VectorIndexS, v4i32, i32, LD1i32>;
def : Ld1Lane128Pat<load, VectorIndexS, v4f32, f32, LD1i32>;
def : Ld1Lane128Pat<load, VectorIndexD, v2i64, i64, LD1i64>;
def : Ld1Lane128Pat<load, VectorIndexD, v2f64, f64, LD1i64>;
def : Ld1Lane128Pat<load, VectorIndexH, v8f16, f16, LD1i16>;
// 64-bit-vector counterpart of the lane-load pattern. The 64-bit source vector
// is first placed in the dsub subregister of a 128-bit register with
// SUBREG_TO_REG, the lane load is performed on that register, and the dsub
// half of the result is extracted again.
// Parameters are the load operator, lane-index operand class, vector type,
// loaded scalar type and the lane-load instruction.
class Ld1Lane64Pat<SDPatternOperator scalar_load, Operand VecIndex,
ValueType VTy, ValueType STy, Instruction LD1>
: Pat<(vector_insert (VTy VecListOne64:$Rd),
(STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx),
(EXTRACT_SUBREG
(LD1 (SUBREG_TO_REG (i32 0), VecListOne64:$Rd, dsub),
VecIndex:$idx, GPR64sp:$Rn),
dsub)>;
def : Ld1Lane64Pat<extloadi8, VectorIndexB, v8i8, i32, LD1i8>;
def : Ld1Lane64Pat<extloadi16, VectorIndexH, v4i16, i32, LD1i16>;
def : Ld1Lane64Pat<load, VectorIndexS, v2i32, i32, LD1i32>;
def : Ld1Lane64Pat<load, VectorIndexS, v2f32, f32, LD1i32>;
def : Ld1Lane64Pat<load, VectorIndexH, v4f16, f16, LD1i16>;
// Assembler aliases for the single-lane load mnemonics (ld1-ld4).
defm LD1 : SIMDLdSt1SingleAliases<"ld1">;
defm LD2 : SIMDLdSt2SingleAliases<"ld2">;
defm LD3 : SIMDLdSt3SingleAliases<"ld3">;
defm LD4 : SIMDLdSt4SingleAliases<"ld4">;
// Stores
// Single-lane ST1 for B/H/S/D elements.
// NOTE(review): unlike the ST2-ST4 single-lane definitions later in this
// section, these are not wrapped in `let mayStore = 1, hasSideEffects = 0`;
// presumably the multiclasses or attached patterns supply those flags - confirm.
defm ST1 : SIMDStSingleB<0, 0b000, "st1", VecListOneb, GPR64pi1>;
defm ST1 : SIMDStSingleH<0, 0b010, 0, "st1", VecListOneh, GPR64pi2>;
defm ST1 : SIMDStSingleS<0, 0b100, 0b00, "st1", VecListOnes, GPR64pi4>;
defm ST1 : SIMDStSingleD<0, 0b100, 0b01, "st1", VecListOned, GPR64pi8>;
// Selects a scalar store of an element extracted from a 128-bit vector to a
// single-lane ST1 taking the vector, the lane index and the base address.
//   scalar_store - the store operator to match (plain or truncating).
//   VecIndex     - operand class of the lane index.
//   VTy / STy    - vector type and the extracted scalar type.
//   ST1          - lane-store instruction to emit.
// AddedComplexity is set to 19 for records of this class.
// NOTE(review): the reason for the specific value 19 is not stated here.
let AddedComplexity = 19 in
class St1Lane128Pat<SDPatternOperator scalar_store, Operand VecIndex,
ValueType VTy, ValueType STy, Instruction ST1>
: Pat<(scalar_store
(STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)),
GPR64sp:$Rn),
(ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn)>;
def : St1Lane128Pat<truncstorei8, VectorIndexB, v16i8, i32, ST1i8>;
def : St1Lane128Pat<truncstorei16, VectorIndexH, v8i16, i32, ST1i16>;
def : St1Lane128Pat<store, VectorIndexS, v4i32, i32, ST1i32>;
def : St1Lane128Pat<store, VectorIndexS, v4f32, f32, ST1i32>;
def : St1Lane128Pat<store, VectorIndexD, v2i64, i64, ST1i64>;
def : St1Lane128Pat<store, VectorIndexD, v2f64, f64, ST1i64>;
def : St1Lane128Pat<store, VectorIndexH, v8f16, f16, ST1i16>;
// 64-bit-vector counterpart of the lane-store pattern: the 64-bit source
// vector is placed in the dsub subregister of a 128-bit register with
// SUBREG_TO_REG before being handed to the lane-store instruction.
// Parameters are the store operator, lane-index operand class, vector type,
// extracted scalar type and the lane-store instruction.
// AddedComplexity is set to 19, as for the 128-bit class.
let AddedComplexity = 19 in
class St1Lane64Pat<SDPatternOperator scalar_store, Operand VecIndex,
ValueType VTy, ValueType STy, Instruction ST1>
: Pat<(scalar_store
(STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)),
GPR64sp:$Rn),
(ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub),
VecIndex:$idx, GPR64sp:$Rn)>;
def : St1Lane64Pat<truncstorei8, VectorIndexB, v8i8, i32, ST1i8>;
def : St1Lane64Pat<truncstorei16, VectorIndexH, v4i16, i32, ST1i16>;
def : St1Lane64Pat<store, VectorIndexS, v2i32, i32, ST1i32>;
def : St1Lane64Pat<store, VectorIndexS, v2f32, f32, ST1i32>;
def : St1Lane64Pat<store, VectorIndexH, v4f16, f16, ST1i16>;
// Post-indexed lane stores from a 64-bit vector. Two patterns are generated:
//   1. the post-increment equals the constant `offset`; the instruction is
//      emitted with XZR as its increment operand.
//      NOTE(review): XZR presumably selects the immediate post-index form of
//      the _POST instruction - confirm against its definition.
//   2. the post-increment is a general register, passed through as $Rm.
// In both, the 64-bit vector is widened with SUBREG_TO_REG into dsub first.
//   scalar_store - post-indexed store operator to match.
//   VecIndex     - operand class of the lane index.
//   VTy / STy    - vector type and the extracted scalar type.
//   ST1          - post-indexed lane-store instruction.
//   offset       - constant increment matched by the first pattern.
multiclass St1LanePost64Pat<SDPatternOperator scalar_store, Operand VecIndex,
ValueType VTy, ValueType STy, Instruction ST1,
int offset> {
def : Pat<(scalar_store
(STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)),
GPR64sp:$Rn, offset),
(ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub),
VecIndex:$idx, GPR64sp:$Rn, XZR)>;
def : Pat<(scalar_store
(STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)),
GPR64sp:$Rn, GPR64:$Rm),
(ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub),
VecIndex:$idx, GPR64sp:$Rn, $Rm)>;
}
// The constant offset passed to each instantiation is the stored element's
// size in bytes (1, 2, 4 or 8).
defm : St1LanePost64Pat<post_truncsti8, VectorIndexB, v8i8, i32, ST1i8_POST, 1>;
defm : St1LanePost64Pat<post_truncsti16, VectorIndexH, v4i16, i32, ST1i16_POST,
2>;
defm : St1LanePost64Pat<post_store, VectorIndexS, v2i32, i32, ST1i32_POST, 4>;
defm : St1LanePost64Pat<post_store, VectorIndexS, v2f32, f32, ST1i32_POST, 4>;
defm : St1LanePost64Pat<post_store, VectorIndexD, v1i64, i64, ST1i64_POST, 8>;
defm : St1LanePost64Pat<post_store, VectorIndexD, v1f64, f64, ST1i64_POST, 8>;
defm : St1LanePost64Pat<post_store, VectorIndexH, v4f16, f16, ST1i16_POST, 2>;
// Post-indexed lane stores from a 128-bit vector. Two patterns are generated:
//   1. the post-increment equals the constant `offset`; the instruction is
//      emitted with XZR as its increment operand.
//      NOTE(review): XZR presumably selects the immediate post-index form of
//      the _POST instruction - confirm against its definition.
//   2. the post-increment is a general register, passed through as $Rm.
//   scalar_store - post-indexed store operator to match.
//   VecIndex     - operand class of the lane index.
//   VTy / STy    - vector type and the extracted scalar type.
//   ST1          - post-indexed lane-store instruction.
//   offset       - constant increment matched by the first pattern.
multiclass St1LanePost128Pat<SDPatternOperator scalar_store, Operand VecIndex,
ValueType VTy, ValueType STy, Instruction ST1,
int offset> {
def : Pat<(scalar_store
(STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)),
GPR64sp:$Rn, offset),
(ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn, XZR)>;
def : Pat<(scalar_store
(STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)),
GPR64sp:$Rn, GPR64:$Rm),
(ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn, $Rm)>;
}
// The constant offset passed to each instantiation is the stored element's
// size in bytes (1, 2, 4 or 8).
defm : St1LanePost128Pat<post_truncsti8, VectorIndexB, v16i8, i32, ST1i8_POST,
1>;
defm : St1LanePost128Pat<post_truncsti16, VectorIndexH, v8i16, i32, ST1i16_POST,
2>;
defm : St1LanePost128Pat<post_store, VectorIndexS, v4i32, i32, ST1i32_POST, 4>;
defm : St1LanePost128Pat<post_store, VectorIndexS, v4f32, f32, ST1i32_POST, 4>;
defm : St1LanePost128Pat<post_store, VectorIndexD, v2i64, i64, ST1i64_POST, 8>;
defm : St1LanePost128Pat<post_store, VectorIndexD, v2f64, f64, ST1i64_POST, 8>;
defm : St1LanePost128Pat<post_store, VectorIndexH, v8f16, f16, ST1i16_POST, 2>;
// ST2/ST3/ST4 stores built from the SIMDStSingle{B,H,S,D} multiclasses, one
// definition per element size. All of them are flagged as stores with no
// other side effects.
let mayStore = 1, hasSideEffects = 0 in {
  // Two-register lists.
  defm ST2 : SIMDStSingleB<1, 0b000,       "st2", VecListTwob, GPR64pi2>;
  defm ST2 : SIMDStSingleH<1, 0b010, 0,    "st2", VecListTwoh, GPR64pi4>;
  defm ST2 : SIMDStSingleS<1, 0b100, 0b00, "st2", VecListTwos, GPR64pi8>;
  defm ST2 : SIMDStSingleD<1, 0b100, 0b01, "st2", VecListTwod, GPR64pi16>;

  // Three-register lists.
  defm ST3 : SIMDStSingleB<0, 0b001,       "st3", VecListThreeb, GPR64pi3>;
  defm ST3 : SIMDStSingleH<0, 0b011, 0,    "st3", VecListThreeh, GPR64pi6>;
  defm ST3 : SIMDStSingleS<0, 0b101, 0b00, "st3", VecListThrees, GPR64pi12>;
  defm ST3 : SIMDStSingleD<0, 0b101, 0b01, "st3", VecListThreed, GPR64pi24>;

  // Four-register lists.
  defm ST4 : SIMDStSingleB<1, 0b001,       "st4", VecListFourb, GPR64pi4>;
  defm ST4 : SIMDStSingleH<1, 0b011, 0,    "st4", VecListFourh, GPR64pi8>;
  defm ST4 : SIMDStSingleS<1, 0b101, 0b00, "st4", VecListFours, GPR64pi16>;
  defm ST4 : SIMDStSingleD<1, 0b101, 0b01, "st4", VecListFourd, GPR64pi32>;
}
// Instantiate the SIMDLdSt{1,2,3,4}SingleAliases multiclasses for the "st1"
// through "st4" mnemonics. The multiclasses are defined outside this chunk;
// judging by their names they presumably add assembler aliases for the
// single-lane store forms (confirm against their definitions).
defm ST1 : SIMDLdSt1SingleAliases<"st1">;
defm ST2 : SIMDLdSt2SingleAliases<"st2">;
defm ST3 : SIMDLdSt3SingleAliases<"st3">;
defm ST4 : SIMDLdSt4SingleAliases<"st4">;
//----------------------------------------------------------------------------
// Crypto extensions
//----------------------------------------------------------------------------
// AES instructions, each bound to its crypto intrinsic. Only available when
// the HasAES predicate holds.
let Predicates = [HasAES] in {
  def AESErr   : AESTiedInst<0b0100, "aese", int_aarch64_crypto_aese>;
  def AESDrr   : AESTiedInst<0b0101, "aesd", int_aarch64_crypto_aesd>;
  def AESMCrr  : AESInst<0b0110, "aesmc", int_aarch64_crypto_aesmc>;
  def AESIMCrr : AESInst<0b0111, "aesimc", int_aarch64_crypto_aesimc>;
}
// Pseudo variants of AESMCrr/AESIMCrr whose source and destination are tied
// to the same register ("$Rn = $Rd"). Some CPUs need that register constraint
// for AES fusion. The pseudos carry no selection pattern of their own and are
// marked as neither loading, storing nor having side effects.
let hasSideEffects = 0, mayStore = 0, mayLoad = 0 in {
  def AESMCrrTied
      : Pseudo<(outs V128:$Rd), (ins V128:$Rn), [], "$Rn = $Rd">,
        Sched<[WriteV]>;

  def AESIMCrrTied
      : Pseudo<(outs V128:$Rd), (ins V128:$Rn), [], "$Rn = $Rd">,
        Sched<[WriteV]>;
}
// The tied AES(I)MC pseudos are selected only when the operand comes directly
// from the matching AESE/AESD, and only on subtargets with HasFuseAES.

// aesmc(aese(src1, src2))
def : Pat<(v16i8 (int_aarch64_crypto_aesmc
                     (v16i8 (int_aarch64_crypto_aese (v16i8 V128:$src1),
                                                     (v16i8 V128:$src2))))),
          (v16i8 (AESMCrrTied (v16i8 (AESErr (v16i8 V128:$src1),
                                             (v16i8 V128:$src2)))))>,
      Requires<[HasFuseAES]>;

// aesimc(aesd(src1, src2))
def : Pat<(v16i8 (int_aarch64_crypto_aesimc
                     (v16i8 (int_aarch64_crypto_aesd (v16i8 V128:$src1),
                                                     (v16i8 V128:$src2))))),
          (v16i8 (AESIMCrrTied (v16i8 (AESDrr (v16i8 V128:$src1),
                                              (v16i8 V128:$src2)))))>,
      Requires<[HasFuseAES]>;
// SHA-1 and SHA-256 instructions, each bound to its crypto intrinsic. Only
// available when the HasSHA2 predicate holds.
let Predicates = [HasSHA2] in {
  // Three-operand forms.
  def SHA1Crrr     : SHATiedInstQSV<0b000, "sha1c", int_aarch64_crypto_sha1c>;
  def SHA1Prrr     : SHATiedInstQSV<0b001, "sha1p", int_aarch64_crypto_sha1p>;
  def SHA1Mrrr     : SHATiedInstQSV<0b010, "sha1m", int_aarch64_crypto_sha1m>;
  def SHA1SU0rrr   : SHATiedInstVVV<0b011, "sha1su0",
                                    int_aarch64_crypto_sha1su0>;
  def SHA256Hrrr   : SHATiedInstQQV<0b100, "sha256h",
                                    int_aarch64_crypto_sha256h>;
  def SHA256H2rrr  : SHATiedInstQQV<0b101, "sha256h2",
                                    int_aarch64_crypto_sha256h2>;
  def SHA256SU1rrr : SHATiedInstVVV<0b110, "sha256su1",
                                    int_aarch64_crypto_sha256su1>;

  // Two-operand forms.
  def SHA1Hrr      : SHAInstSS<0b0000, "sha1h", int_aarch64_crypto_sha1h>;
  def SHA1SU1rr    : SHATiedInstVV<0b0001, "sha1su1",
                                   int_aarch64_crypto_sha1su1>;
  def SHA256SU0rr  : SHATiedInstVV<0b0010, "sha256su0",
                                   int_aarch64_crypto_sha256su0>;
}
//----------------------------------------------------------------------------
// Compiler-pseudos
//----------------------------------------------------------------------------
// FIXME: Like for X86, these should go in their own separate .td file.
// Pattern leaf matching an i32 GPR32 value for which the C++ helper
// isDef32() returns true. isDef32() is defined outside this file chunk;
// presumably it accepts only nodes whose 32-bit result is known to have been
// written with the upper 32 bits of the 64-bit register zeroed (verify
// against its definition).
def def32 : PatLeaf<(i32 GPR32:$src), [{
return isDef32(*N);
}]>;
// zext of a 32-bit def that is known to implicitly zero-extend: no
// instruction is needed, a SUBREG_TO_REG is enough.
def : Pat<(i64 (zext def32:$src)),
          (SUBREG_TO_REG (i64 0), GPR32:$src, sub_32)>;

// anyext: the high bits are don't-care, so the value is placed into an
// IMPLICIT_DEF with INSERT_SUBREG.
def : Pat<(i64 (anyext GPR32:$src)),
          (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>;

// Explicit zext: perform a 32-bit MOV (ORR with WZR) and then assert that the
// extension has happened.
def : Pat<(i64 (zext GPR32:$src)),
          (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;
// Sign extension of a 32-bit value: a signed bitfield move (SBFM) applied to
// the containing 64-bit super-register.
def : Pat<(i64 (sext GPR32:$src)),
          (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32),
                   0, 31)>;

// In-register sign extension to 64 bits; the last immediate is the source
// width minus one.
def : Pat<(i64 (sext_inreg GPR64:$src, i32)), (SBFMXri GPR64:$src, 0, 31)>;
def : Pat<(i64 (sext_inreg GPR64:$src, i16)), (SBFMXri GPR64:$src, 0, 15)>;
def : Pat<(i64 (sext_inreg GPR64:$src, i8)),  (SBFMXri GPR64:$src, 0, 7)>;
def : Pat<(i64 (sext_inreg GPR64:$src, i1)),  (SBFMXri GPR64:$src, 0, 0)>;

// In-register sign extension to 32 bits.
def : Pat<(i32 (sext_inreg GPR32:$src, i16)), (SBFMWri GPR32:$src, 0, 15)>;
def : Pat<(i32 (sext_inreg GPR32:$src, i8)),  (SBFMWri GPR32:$src, 0, 7)>;
def : Pat<(i32 (sext_inreg GPR32:$src, i1)),  (SBFMWri GPR32:$src, 0, 0)>;
// Left shift of a sign-extended value, selected as a single SBFM. Both SBFM
// immediates are derived from the shift amount by the i32shift_*/i64shift_*
// transforms, which are defined elsewhere in the file.

// Extended from i8.
def : Pat<(shl (sext_inreg GPR32:$Rn, i8), (i64 imm0_31:$imm)),
          (SBFMWri GPR32:$Rn,
                   (i64 (i32shift_a imm0_31:$imm)),
                   (i64 (i32shift_sext_i8 imm0_31:$imm)))>;
def : Pat<(shl (sext_inreg GPR64:$Rn, i8), (i64 imm0_63:$imm)),
          (SBFMXri GPR64:$Rn,
                   (i64 (i64shift_a imm0_63:$imm)),
                   (i64 (i64shift_sext_i8 imm0_63:$imm)))>;

// Extended from i16.
def : Pat<(shl (sext_inreg GPR32:$Rn, i16), (i64 imm0_31:$imm)),
          (SBFMWri GPR32:$Rn,
                   (i64 (i32shift_a imm0_31:$imm)),
                   (i64 (i32shift_sext_i16 imm0_31:$imm)))>;
def : Pat<(shl (sext_inreg GPR64:$Rn, i16), (i64 imm0_63:$imm)),
          (SBFMXri GPR64:$Rn,
                   (i64 (i64shift_a imm0_63:$imm)),
                   (i64 (i64shift_sext_i16 imm0_63:$imm)))>;

// Extended from a 32-bit register to 64 bits.
def : Pat<(shl (i64 (sext GPR32:$Rn)), (i64 imm0_63:$imm)),
          (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32),
                   (i64 (i64shift_a imm0_63:$imm)),
                   (i64 (i64shift_sext_i32 imm0_63:$imm)))>;
// Plain sra patterns carry an AddedComplexity of 10. The combined sext + sra
// patterns below get a higher value so that they are tried before a lone sra
// node is matched.
let AddedComplexity = 20 in {
  // Every sext + sra combination that keeps at least one bit of the value
  // being sign extended is supported, i.e. shift amounts up to bitwidth-1.

  // Extended from i8: shift amounts 0..7.
  def : Pat<(sra (sext_inreg GPR32:$Rn, i8), (i64 imm0_7:$imm)),
            (SBFMWri GPR32:$Rn, (i64 imm0_7:$imm), 7)>;
  def : Pat<(sra (sext_inreg GPR64:$Rn, i8), (i64 imm0_7:$imm)),
            (SBFMXri GPR64:$Rn, (i64 imm0_7:$imm), 7)>;

  // Extended from i16: shift amounts 0..15.
  def : Pat<(sra (sext_inreg GPR32:$Rn, i16), (i64 imm0_15:$imm)),
            (SBFMWri GPR32:$Rn, (i64 imm0_15:$imm), 15)>;
  def : Pat<(sra (sext_inreg GPR64:$Rn, i16), (i64 imm0_15:$imm)),
            (SBFMXri GPR64:$Rn, (i64 imm0_15:$imm), 15)>;

  // Extended from a 32-bit register: shift amounts 0..31.
  def : Pat<(sra (i64 (sext GPR32:$Rn)), (i64 imm0_31:$imm)),
            (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32),
                     (i64 imm0_31:$imm), 31)>;
} // AddedComplexity = 20
// Truncation from i64 to i32 is just an extraction of the sub_32
// subregister.
def : Pat<(i32 (trunc GPR64sp:$src)),
          (i32 (EXTRACT_SUBREG GPR64sp:$src, sub_32))>;

// __builtin_trap() is lowered to the BRK instruction on AArch64.
def : Pat<(trap), (BRK 1)>;

// debugtrap is selected as BRK 0xF000, but only under the IsWindows
// predicate.
def : Pat<(debugtrap), (BRK 0xF000)>,
      Requires<[IsWindows]>;
// Multiply-high patterns. The lower subvector is multiplied with a widening
// smull/umull and the full 128-bit operands are fed to the second widening
// multiply for the upper subvector; UZP2 then shuffles the high part of both
// results together.

// Signed multiply high.
def : Pat<(v16i8 (mulhs V128:$Rn, V128:$Rm)),
          (UZP2v16i8
              (SMULLv8i8_v8i16 (EXTRACT_SUBREG V128:$Rn, dsub),
                               (EXTRACT_SUBREG V128:$Rm, dsub)),
              (SMULLv16i8_v8i16 V128:$Rn, V128:$Rm))>;
def : Pat<(v8i16 (mulhs V128:$Rn, V128:$Rm)),
          (UZP2v8i16
              (SMULLv4i16_v4i32 (EXTRACT_SUBREG V128:$Rn, dsub),
                                (EXTRACT_SUBREG V128:$Rm, dsub)),
              (SMULLv8i16_v4i32 V128:$Rn, V128:$Rm))>;
def : Pat<(v4i32 (mulhs V128:$Rn, V128:$Rm)),
          (UZP2v4i32
              (SMULLv2i32_v2i64 (EXTRACT_SUBREG V128:$Rn, dsub),
                                (EXTRACT_SUBREG V128:$Rm, dsub)),
              (SMULLv4i32_v2i64 V128:$Rn, V128:$Rm))>;

// Unsigned multiply high.
def : Pat<(v16i8 (mulhu V128:$Rn, V128:$Rm)),
          (UZP2v16i8
              (UMULLv8i8_v8i16 (EXTRACT_SUBREG V128:$Rn, dsub),
                               (EXTRACT_SUBREG V128:$Rm, dsub)),
              (UMULLv16i8_v8i16 V128:$Rn, V128:$Rm))>;
def : Pat<(v8i16 (mulhu V128:$Rn, V128:$Rm)),
          (UZP2v8i16
              (UMULLv4i16_v4i32 (EXTRACT_SUBREG V128:$Rn, dsub),
                                (EXTRACT_SUBREG V128:$Rm, dsub)),
              (UMULLv8i16_v4i32 V128:$Rn, V128:$Rm))>;
def : Pat<(v4i32 (mulhu V128:$Rn, V128:$Rm)),
          (UZP2v4i32
              (UMULLv2i32_v2i64 (EXTRACT_SUBREG V128:$Rn, dsub),
                                (EXTRACT_SUBREG V128:$Rm, dsub)),
              (UMULLv4i32_v2i64 V128:$Rn, V128:$Rm))>;
// Conversions within AdvSIMD types in the same register size are free.
// But because we need a consistent lane ordering, in big endian many
// conversions require one or more REV instructions.
//
// Consider a simple memory load followed by a bitconvert then a store.
// v0 = load v2i32
// v1 = BITCAST v2i32 v0 to v4i16
// store v4i16 v1
//
// In big endian mode every memory access has an implicit byte swap. LDR and
// STR do a 64-bit byte swap, whereas LD1/ST1 do a byte swap per lane - that
// is, they treat the vector as a sequence of elements to be byte-swapped.
// The two pairs of instructions are fundamentally incompatible. We've decided
// to use LD1/ST1 only to simplify compiler implementation.
//
// LD1/ST1 perform the equivalent of a sequence of LDR/STR + REV. This makes
// the original code sequence:
// v0 = load v2i32
// v1 = REV v2i32 (implicit)
// v2 = BITCAST v2i32 v1 to v4i16
// v3 = REV v4i16 v2 (implicit)
// store v4i16 v3
//
// But this is now broken - the value stored is different to the value loaded
// due to lane reordering. To fix this, on every BITCAST we must perform two
// other REVs:
// v0 = load v2i32
// v1 = REV v2i32 (implicit)
// v2 = REV v2i32
// v3 = BITCAST v2i32 v2 to v4i16
// v4 = REV v4i16
// v5 = REV v4i16 v4 (implicit)
// store v4i16 v5
//
// This means an extra two instructions, but actually in most cases the two REV
// instructions can be combined into one. For example:
// (REV64_2s (REV64_4h X)) === (REV32_4h X)
//
// There is also no 128-bit REV instruction. This must be synthesized with an
// EXT instruction.
//
// Most bitconverts require some sort of conversion. The only exceptions are:
// a) Identity conversions - vNfX <-> vNiX
// b) Single-lane-to-scalar - v1fX <-> fX or v1iX <-> iX
//
// Natural vector casts (64 bit)
// An AArch64NvCast between the 64-bit types listed here is selected as a
// plain re-typing of the same FPR64 register: the output dag contains no
// instruction. The patterns carry no endianness predicate.

// From v2i32.
def : Pat<(v8i8 (AArch64NvCast (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>;
def : Pat<(v4i16 (AArch64NvCast (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4f16 (AArch64NvCast (v2i32 FPR64:$src))), (v4f16 FPR64:$src)>;
def : Pat<(v2i32 (AArch64NvCast (v2i32 FPR64:$src))), (v2i32 FPR64:$src)>;
def : Pat<(v2f32 (AArch64NvCast (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>;
def : Pat<(v1i64 (AArch64NvCast (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;
// From v4i16.
def : Pat<(v8i8 (AArch64NvCast (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>;
def : Pat<(v4i16 (AArch64NvCast (v4i16 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4f16 (AArch64NvCast (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>;
def : Pat<(v2i32 (AArch64NvCast (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>;
def : Pat<(v1i64 (AArch64NvCast (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>;
// From v8i8.
def : Pat<(v8i8 (AArch64NvCast (v8i8 FPR64:$src))), (v8i8 FPR64:$src)>;
def : Pat<(v4i16 (AArch64NvCast (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4f16 (AArch64NvCast (v8i8 FPR64:$src))), (v4f16 FPR64:$src)>;
def : Pat<(v2i32 (AArch64NvCast (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>;
def : Pat<(v2f32 (AArch64NvCast (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>;
def : Pat<(v1i64 (AArch64NvCast (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>;
// From scalar f64.
def : Pat<(v8i8 (AArch64NvCast (f64 FPR64:$src))), (v8i8 FPR64:$src)>;
def : Pat<(v4i16 (AArch64NvCast (f64 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4f16 (AArch64NvCast (f64 FPR64:$src))), (v4f16 FPR64:$src)>;
def : Pat<(v2i32 (AArch64NvCast (f64 FPR64:$src))), (v2i32 FPR64:$src)>;
def : Pat<(v2f32 (AArch64NvCast (f64 FPR64:$src))), (v2f32 FPR64:$src)>;
def : Pat<(v1i64 (AArch64NvCast (f64 FPR64:$src))), (v1i64 FPR64:$src)>;
def : Pat<(v1f64 (AArch64NvCast (f64 FPR64:$src))), (v1f64 FPR64:$src)>;
// From v2f32.
def : Pat<(v8i8 (AArch64NvCast (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>;
def : Pat<(v4i16 (AArch64NvCast (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v2i32 (AArch64NvCast (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>;
def : Pat<(v2f32 (AArch64NvCast (v2f32 FPR64:$src))), (v2f32 FPR64:$src)>;
def : Pat<(v1i64 (AArch64NvCast (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>;
def : Pat<(v1f64 (AArch64NvCast (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>;
// Natural vector casts (128 bit)
// An AArch64NvCast between any two of the 128-bit vector types below is
// selected as a plain re-typing of the same FPR128 register: the output dag
// contains no instruction. The patterns carry no endianness predicate. Each
// of the six source types is paired with all seven destination types.

// From v4i32.
def : Pat<(v16i8 (AArch64NvCast (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>;
def : Pat<(v8i16 (AArch64NvCast (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>;
def : Pat<(v8f16 (AArch64NvCast (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>;
def : Pat<(v4i32 (AArch64NvCast (v4i32 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v4f32 (AArch64NvCast (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>;
def : Pat<(v2i64 (AArch64NvCast (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>;
def : Pat<(v2f64 (AArch64NvCast (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>;
// From v8i16.
def : Pat<(v16i8 (AArch64NvCast (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>;
def : Pat<(v8i16 (AArch64NvCast (v8i16 FPR128:$src))), (v8i16 FPR128:$src)>;
def : Pat<(v8f16 (AArch64NvCast (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>;
def : Pat<(v4i32 (AArch64NvCast (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v2i64 (AArch64NvCast (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>;
def : Pat<(v4f32 (AArch64NvCast (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>;
def : Pat<(v2f64 (AArch64NvCast (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>;
// From v16i8.
def : Pat<(v16i8 (AArch64NvCast (v16i8 FPR128:$src))), (v16i8 FPR128:$src)>;
def : Pat<(v8i16 (AArch64NvCast (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>;
def : Pat<(v8f16 (AArch64NvCast (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>;
def : Pat<(v4i32 (AArch64NvCast (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v2i64 (AArch64NvCast (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>;
def : Pat<(v4f32 (AArch64NvCast (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>;
def : Pat<(v2f64 (AArch64NvCast (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>;
// From v2i64.
def : Pat<(v16i8 (AArch64NvCast (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>;
def : Pat<(v8i16 (AArch64NvCast (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>;
def : Pat<(v8f16 (AArch64NvCast (v2i64 FPR128:$src))), (v8f16 FPR128:$src)>;
def : Pat<(v4i32 (AArch64NvCast (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v2i64 (AArch64NvCast (v2i64 FPR128:$src))), (v2i64 FPR128:$src)>;
def : Pat<(v4f32 (AArch64NvCast (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>;
def : Pat<(v2f64 (AArch64NvCast (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>;
// From v4f32.
def : Pat<(v16i8 (AArch64NvCast (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>;
def : Pat<(v8i16 (AArch64NvCast (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>;
def : Pat<(v4i32 (AArch64NvCast (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v4f32 (AArch64NvCast (v4f32 FPR128:$src))), (v4f32 FPR128:$src)>;
def : Pat<(v2i64 (AArch64NvCast (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>;
def : Pat<(v8f16 (AArch64NvCast (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>;
def : Pat<(v2f64 (AArch64NvCast (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>;
// From v2f64.
def : Pat<(v16i8 (AArch64NvCast (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>;
def : Pat<(v8i16 (AArch64NvCast (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>;
def : Pat<(v4i32 (AArch64NvCast (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v2i64 (AArch64NvCast (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>;
def : Pat<(v2f64 (AArch64NvCast (v2f64 FPR128:$src))), (v2f64 FPR128:$src)>;
def : Pat<(v8f16 (AArch64NvCast (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>;
def : Pat<(v4f32 (AArch64NvCast (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>;
// Little-endian bitconverts between a 64-bit general-purpose register and a
// 64-bit vector: a register-class copy is all that is needed.
let Predicates = [IsLE] in {
  // GPR64 -> vector.
  def : Pat<(v8i8 (bitconvert GPR64:$Xn)),
            (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
  def : Pat<(v4i16 (bitconvert GPR64:$Xn)),
            (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
  def : Pat<(v2i32 (bitconvert GPR64:$Xn)),
            (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
  def : Pat<(v4f16 (bitconvert GPR64:$Xn)),
            (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
  def : Pat<(v2f32 (bitconvert GPR64:$Xn)),
            (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;

  // Vector -> i64.
  def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))),
            (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
  def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))),
            (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
  def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))),
            (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
  def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))),
            (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
  def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))),
            (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
  def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))),
            (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
}
// Big-endian bitconverts between a 64-bit general-purpose register and a
// multi-element 64-bit vector: the register-class copy is combined with a
// REV64 of the vector's element size.
// NOTE(review): in the vector -> i64 direction the REV64 is written around
// the copy into GPR64 rather than applied to the vector before the copy;
// confirm that this operand placement is intended.
let Predicates = [IsBE] in {
  // GPR64 -> vector.
  def : Pat<(v8i8 (bitconvert GPR64:$Xn)),
            (REV64v8i8 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
  def : Pat<(v4i16 (bitconvert GPR64:$Xn)),
            (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
  def : Pat<(v2i32 (bitconvert GPR64:$Xn)),
            (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
  def : Pat<(v4f16 (bitconvert GPR64:$Xn)),
            (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
  def : Pat<(v2f32 (bitconvert GPR64:$Xn)),
            (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;

  // Vector -> i64.
  def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))),
            (REV64v8i8 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
  def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))),
            (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
  def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))),
            (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
  def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))),
            (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
  def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))),
            (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
}
// Endian-independent conversions: single-element vectors and scalars only
// need a register-class copy (or nothing at all).

// GPR64 <-> single-element 64-bit vectors.
def : Pat<(v1i64 (bitconvert GPR64:$Xn)),
          (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
def : Pat<(v1f64 (bitconvert GPR64:$Xn)),
          (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
def : Pat<(i64 (bitconvert (v1i64 V64:$Vn))),
          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;

// scalar_to_vector producing a single-element 64-bit vector.
def : Pat<(v1i64 (scalar_to_vector GPR64:$Xn)),
          (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
def : Pat<(v1f64 (scalar_to_vector GPR64:$Xn)),
          (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Xn))),
          (v1f64 FPR64:$Xn)>;

// Integer <-> floating-point scalars of the same width.
def : Pat<(f32 (bitconvert (i32 GPR32:$Xn))),
          (COPY_TO_REGCLASS GPR32:$Xn, FPR32)>;
def : Pat<(i32 (bitconvert (f32 FPR32:$Xn))),
          (COPY_TO_REGCLASS FPR32:$Xn, GPR32)>;
def : Pat<(f64 (bitconvert (i64 GPR64:$Xn))),
          (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
def : Pat<(i64 (bitconvert (f64 FPR64:$Xn))),
          (COPY_TO_REGCLASS FPR64:$Xn, GPR64)>;

// v1f64 -> i64.
def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))),
          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
// Bitconverts producing v1i64.
// Little-endian: the source register is reused unchanged.
let Predicates = [IsLE] in {
  def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;
  def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>;
  def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>;
  def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))), (v1i64 FPR64:$src)>;
  def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>;
}

// Big-endian: a REV64 with the source element size is inserted.
let Predicates = [IsBE] in {
  def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))),
            (v1i64 (REV64v2i32 FPR64:$src))>;
  def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))),
            (v1i64 (REV64v4i16 FPR64:$src))>;
  def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))),
            (v1i64 (REV64v8i8 FPR64:$src))>;
  def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))),
            (v1i64 (REV64v4i16 FPR64:$src))>;
  def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))),
            (v1i64 (REV64v2i32 FPR64:$src))>;
}

// Single 64-bit element sources need no reversal on either endianness.
def : Pat<(v1i64 (bitconvert (v1f64 FPR64:$src))), (v1i64 FPR64:$src)>;
def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), (v1i64 FPR64:$src)>;
// Bitconverts producing v2i32.
// Little-endian: the source register is reused unchanged.
let Predicates = [IsLE] in {
  def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), (v2i32 FPR64:$src)>;
  def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>;
  def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>;
  def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>;
  def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 FPR64:$src)>;
  def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))), (v2i32 FPR64:$src)>;
}

// Big-endian: REV64 for 64-bit-element sources, REV32 for narrower elements.
let Predicates = [IsBE] in {
  def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))),
            (v2i32 (REV64v2i32 FPR64:$src))>;
  def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))),
            (v2i32 (REV32v4i16 FPR64:$src))>;
  def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))),
            (v2i32 (REV32v8i8 FPR64:$src))>;
  def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))),
            (v2i32 (REV64v2i32 FPR64:$src))>;
  def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))),
            (v2i32 (REV64v2i32 FPR64:$src))>;
  def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))),
            (v2i32 (REV32v4i16 FPR64:$src))>;
}

// Same lane layout (v2f32): no reversal on either endianness.
def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>;
// Bitconverts producing v4i16.
// Little-endian: the source register is reused unchanged.
let Predicates = [IsLE] in {
  def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), (v4i16 FPR64:$src)>;
  def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>;
  def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>;
  def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (v4i16 FPR64:$src)>;
  def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>;
  def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), (v4i16 FPR64:$src)>;
}

// Big-endian: REV64 / REV32 / REV16 chosen by the source element size.
let Predicates = [IsBE] in {
  def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))),
            (v4i16 (REV64v4i16 FPR64:$src))>;
  def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))),
            (v4i16 (REV32v4i16 FPR64:$src))>;
  def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))),
            (v4i16 (REV16v8i8 FPR64:$src))>;
  def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))),
            (v4i16 (REV64v4i16 FPR64:$src))>;
  def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))),
            (v4i16 (REV32v4i16 FPR64:$src))>;
  def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))),
            (v4i16 (REV64v4i16 FPR64:$src))>;
}

// Same lane layout (v4f16): no reversal on either endianness.
def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))), (v4i16 FPR64:$src)>;
// Bitconverts producing v4f16.
// Little-endian: the source register is reused unchanged.
let Predicates = [IsLE] in {
  def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))), (v4f16 FPR64:$src)>;
  def : Pat<(v4f16 (bitconvert (v2i32 FPR64:$src))), (v4f16 FPR64:$src)>;
  def : Pat<(v4f16 (bitconvert (v8i8 FPR64:$src))), (v4f16 FPR64:$src)>;
  def : Pat<(v4f16 (bitconvert (f64 FPR64:$src))), (v4f16 FPR64:$src)>;
  def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))), (v4f16 FPR64:$src)>;
  def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))), (v4f16 FPR64:$src)>;
}

// Big-endian: REV64 / REV32 / REV16 chosen by the source element size.
let Predicates = [IsBE] in {
  def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))),
            (v4f16 (REV64v4i16 FPR64:$src))>;
  def : Pat<(v4f16 (bitconvert (v2i32 FPR64:$src))),
            (v4f16 (REV32v4i16 FPR64:$src))>;
  def : Pat<(v4f16 (bitconvert (v8i8 FPR64:$src))),
            (v4f16 (REV16v8i8 FPR64:$src))>;
  def : Pat<(v4f16 (bitconvert (f64 FPR64:$src))),
            (v4f16 (REV64v4i16 FPR64:$src))>;
  def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))),
            (v4f16 (REV32v4i16 FPR64:$src))>;
  def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))),
            (v4f16 (REV64v4i16 FPR64:$src))>;
}

// Same lane layout (v4i16): no reversal on either endianness.
def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>;
// Bitconverts producing v8i8.
// Little-endian: the source register is reused unchanged.
let Predicates = [IsLE] in {
  def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>;
  def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>;
  def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>;
  def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 FPR64:$src)>;
  def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>;
  def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), (v8i8 FPR64:$src)>;
  def : Pat<(v8i8 (bitconvert (v4f16 FPR64:$src))), (v8i8 FPR64:$src)>;
}

// Big-endian: byte reversal within each source element (REV64 / REV32 /
// REV16 according to the source element size).
let Predicates = [IsBE] in {
  def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))),
            (v8i8 (REV64v8i8 FPR64:$src))>;
  def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))),
            (v8i8 (REV32v8i8 FPR64:$src))>;
  def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))),
            (v8i8 (REV16v8i8 FPR64:$src))>;
  def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))),
            (v8i8 (REV64v8i8 FPR64:$src))>;
  def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))),
            (v8i8 (REV32v8i8 FPR64:$src))>;
  def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))),
            (v8i8 (REV64v8i8 FPR64:$src))>;
  def : Pat<(v8i8 (bitconvert (v4f16 FPR64:$src))),
            (v8i8 (REV16v8i8 FPR64:$src))>;
}
// Bitconverts producing scalar f64.
// Little-endian: the source register is reused unchanged.
let Predicates = [IsLE] in {
  def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), (f64 FPR64:$src)>;
  def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), (f64 FPR64:$src)>;
  def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), (f64 FPR64:$src)>;
  def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), (f64 FPR64:$src)>;
  def : Pat<(f64 (bitconvert (v4f16 FPR64:$src))), (f64 FPR64:$src)>;
}

// Big-endian: a REV64 with the source element size is inserted.
let Predicates = [IsBE] in {
  def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))),
            (f64 (REV64v2i32 FPR64:$src))>;
  def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))),
            (f64 (REV64v4i16 FPR64:$src))>;
  def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))),
            (f64 (REV64v2i32 FPR64:$src))>;
  def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))),
            (f64 (REV64v8i8 FPR64:$src))>;
  def : Pat<(f64 (bitconvert (v4f16 FPR64:$src))),
            (f64 (REV64v4i16 FPR64:$src))>;
}

// Single 64-bit element sources need no reversal on either endianness.
def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>;
def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>;
// Bitconverts producing v1f64.
// Little-endian: the source register is reused unchanged.
let Predicates = [IsLE] in {
  def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), (v1f64 FPR64:$src)>;
  def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), (v1f64 FPR64:$src)>;
  def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), (v1f64 FPR64:$src)>;
  def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>;
  def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))), (v1f64 FPR64:$src)>;
}

// Big-endian: a REV64 with the source element size is inserted.
let Predicates = [IsBE] in {
  def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))),
            (v1f64 (REV64v2i32 FPR64:$src))>;
  def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))),
            (v1f64 (REV64v4i16 FPR64:$src))>;
  def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))),
            (v1f64 (REV64v8i8 FPR64:$src))>;
  def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))),
            (v1f64 (REV64v2i32 FPR64:$src))>;
  def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))),
            (v1f64 (REV64v4i16 FPR64:$src))>;
}

// Single 64-bit element sources need no reversal on either endianness.
def : Pat<(v1f64 (bitconvert (v1i64 FPR64:$src))), (v1f64 FPR64:$src)>;
def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>;
// Bitconverts producing v2f32.
// Little-endian: the source register is reused unchanged.
let Predicates = [IsLE] in {
  def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), (v2f32 FPR64:$src)>;
  def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))), (v2f32 FPR64:$src)>;
  def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>;
  def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 FPR64:$src)>;
  def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>;
  def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))), (v2f32 FPR64:$src)>;
}

// Big-endian: REV64 for 64-bit-element sources, REV32 for narrower elements.
let Predicates = [IsBE] in {
  def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))),
            (v2f32 (REV64v2i32 FPR64:$src))>;
  def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))),
            (v2f32 (REV32v4i16 FPR64:$src))>;
  def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))),
            (v2f32 (REV32v8i8 FPR64:$src))>;
  def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))),
            (v2f32 (REV64v2i32 FPR64:$src))>;
  def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))),
            (v2f32 (REV64v2i32 FPR64:$src))>;
  def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))),
            (v2f32 (REV32v4i16 FPR64:$src))>;
}

// Same lane layout (v2i32): no reversal on either endianness.
def : Pat<(v2f32 (bitconvert (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>;
// Bitconverts producing f128.
// Little-endian: the source register is reused unchanged.
let Predicates = [IsLE] in {
  def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))), (f128 FPR128:$src)>;
  def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))), (f128 FPR128:$src)>;
  def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), (f128 FPR128:$src)>;
  def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 FPR128:$src)>;
  def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), (f128 FPR128:$src)>;
  def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))), (f128 FPR128:$src)>;
  def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), (f128 FPR128:$src)>;
}

// Big-endian: there is no 128-bit REV, so the reversal is built from an EXT
// by 8 bytes of the value with itself, preceded by a REV64 of the source
// element size when the elements are narrower than 64 bits.
let Predicates = [IsBE] in {
  def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))),
            (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>;
  def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))),
            (f128 (EXTv16i8 (REV64v4i32 FPR128:$src),
                            (REV64v4i32 FPR128:$src), (i32 8)))>;
  def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))),
            (f128 (EXTv16i8 (REV64v8i16 FPR128:$src),
                            (REV64v8i16 FPR128:$src), (i32 8)))>;
  def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))),
            (f128 (EXTv16i8 (REV64v8i16 FPR128:$src),
                            (REV64v8i16 FPR128:$src), (i32 8)))>;
  def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))),
            (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>;
  def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))),
            (f128 (EXTv16i8 (REV64v4i32 FPR128:$src),
                            (REV64v4i32 FPR128:$src), (i32 8)))>;
  def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))),
            (f128 (EXTv16i8 (REV64v16i8 FPR128:$src),
                            (REV64v16i8 FPR128:$src), (i32 8)))>;
}
// Bitconverts producing v2f64.
// Little-endian: the source register is reused unchanged.
let Predicates = [IsLE] in {
  def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))), (v2f64 FPR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>;
}

// Big-endian: EXT by 8 bytes for an f128 source, REV64 of the source element
// size for vector sources.
let Predicates = [IsBE] in {
  def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))),
            (v2f64 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>;
  def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))),
            (v2f64 (REV64v4i32 FPR128:$src))>;
  def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))),
            (v2f64 (REV64v8i16 FPR128:$src))>;
  def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))),
            (v2f64 (REV64v8i16 FPR128:$src))>;
  def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))),
            (v2f64 (REV64v16i8 FPR128:$src))>;
  def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))),
            (v2f64 (REV64v4i32 FPR128:$src))>;
}

// Same lane layout (v2i64): no reversal on either endianness.
def : Pat<(v2f64 (bitconvert (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>;
// Bitconverts producing v4f32.
// Little-endian: the source register is reused unchanged.
let Predicates = [IsLE] in {
  def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))), (v4f32 FPR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>;
}

// Big-endian: REV64 + EXT for an f128 source, REV32 for narrower elements,
// REV64 for 64-bit elements.
let Predicates = [IsBE] in {
  def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))),
            (v4f32 (EXTv16i8 (REV64v4i32 FPR128:$src),
                             (REV64v4i32 FPR128:$src), (i32 8)))>;
  def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))),
            (v4f32 (REV32v8i16 FPR128:$src))>;
  def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))),
            (v4f32 (REV32v8i16 FPR128:$src))>;
  def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))),
            (v4f32 (REV32v16i8 FPR128:$src))>;
  def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))),
            (v4f32 (REV64v4i32 FPR128:$src))>;
  def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))),
            (v4f32 (REV64v4i32 FPR128:$src))>;
}

// Same lane layout (v4i32): no reversal on either endianness.
def : Pat<(v4f32 (bitconvert (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>;
// Bitconverts producing v2i64.
// Little-endian: the source register is reused unchanged.
let Predicates = [IsLE] in {
  def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), (v2i64 FPR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))), (v2i64 FPR128:$src)>;
}

// Big-endian: EXT by 8 bytes for an f128 source, REV64 of the source element
// size for vector sources.
let Predicates = [IsBE] in {
  def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))),
            (v2i64 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>;
  def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))),
            (v2i64 (REV64v4i32 FPR128:$src))>;
  def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))),
            (v2i64 (REV64v8i16 FPR128:$src))>;
  def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))),
            (v2i64 (REV64v16i8 FPR128:$src))>;
  def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))),
            (v2i64 (REV64v4i32 FPR128:$src))>;
  def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))),
            (v2i64 (REV64v8i16 FPR128:$src))>;
}

// Same lane layout (v2f64): no reversal on either endianness.
def : Pat<(v2i64 (bitconvert (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>;
// Little-endian: reinterpreting any of these 128-bit types as v4i32 selects
// the source register unchanged (the result pattern contains no instruction).
let Predicates = [IsLE] in {
foreach SrcVT = [f128, v2i64, v8i16, v16i8, v2f64, v8f16] in
def : Pat<(v4i32 (bitconvert (SrcVT FPR128:$src))), (v4i32 FPR128:$src)>;
}
// Big-endian: a bitconvert to v4i32 is not free. Every source type below
// selects an instruction:
//   f128          -> EXTv16i8 (immediate 8) of two REV64v4i32 results
//   v2i64 / v2f64 -> REV64v4i32
//   v8i16 / v8f16 -> REV32v8i16
//   v16i8         -> REV32v16i8
let Predicates = [IsBE] in {
def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))),
(v4i32 (EXTv16i8 (REV64v4i32 FPR128:$src),
(REV64v4i32 FPR128:$src),
(i32 8)))>;
def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))),
(v4i32 (REV64v4i32 FPR128:$src))>;
def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))),
(v4i32 (REV32v8i16 FPR128:$src))>;
def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))),
(v4i32 (REV32v16i8 FPR128:$src))>;
def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))),
(v4i32 (REV64v4i32 FPR128:$src))>;
def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))),
(v4i32 (REV32v8i16 FPR128:$src))>;
}
// v4f32 -> v4i32 carries no endianness predicate: the register is reused as is.
def : Pat<(v4i32 (bitconvert (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>;
// Little-endian: reinterpreting any of these 128-bit types as v8i16 selects
// the source register unchanged (the result pattern contains no instruction).
let Predicates = [IsLE] in {
foreach SrcVT = [f128, v2i64, v4i32, v16i8, v2f64, v4f32] in
def : Pat<(v8i16 (bitconvert (SrcVT FPR128:$src))), (v8i16 FPR128:$src)>;
}
// Big-endian: a bitconvert to v8i16 is not free. Every source type below
// selects an instruction:
//   f128          -> EXTv16i8 (immediate 8) of two REV64v8i16 results
//   v2i64 / v2f64 -> REV64v8i16
//   v4i32 / v4f32 -> REV32v8i16
//   v16i8         -> REV16v16i8
let Predicates = [IsBE] in {
def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))),
(v8i16 (EXTv16i8 (REV64v8i16 FPR128:$src),
(REV64v8i16 FPR128:$src),
(i32 8)))>;
def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))),
(v8i16 (REV64v8i16 FPR128:$src))>;
def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))),
(v8i16 (REV32v8i16 FPR128:$src))>;
def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))),
(v8i16 (REV16v16i8 FPR128:$src))>;
def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))),
(v8i16 (REV64v8i16 FPR128:$src))>;
def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))),
(v8i16 (REV32v8i16 FPR128:$src))>;
}
// v8f16 -> v8i16 carries no endianness predicate: the register is reused as is.
def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))), (v8i16 FPR128:$src)>;
// Little-endian: reinterpreting any of these 128-bit types as v8f16 selects
// the source register unchanged (the result pattern contains no instruction).
let Predicates = [IsLE] in {
foreach SrcVT = [f128, v2i64, v4i32, v16i8, v2f64, v4f32] in
def : Pat<(v8f16 (bitconvert (SrcVT FPR128:$src))), (v8f16 FPR128:$src)>;
}
// Big-endian: a bitconvert to v8f16 is not free. Every source type below
// selects an instruction (the same choices as for a v8i16 result):
//   f128          -> EXTv16i8 (immediate 8) of two REV64v8i16 results
//   v2i64 / v2f64 -> REV64v8i16
//   v4i32 / v4f32 -> REV32v8i16
//   v16i8         -> REV16v16i8
let Predicates = [IsBE] in {
def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))),
(v8f16 (EXTv16i8 (REV64v8i16 FPR128:$src),
(REV64v8i16 FPR128:$src),
(i32 8)))>;
def : Pat<(v8f16 (bitconvert (v2i64 FPR128:$src))),
(v8f16 (REV64v8i16 FPR128:$src))>;
def : Pat<(v8f16 (bitconvert (v4i32 FPR128:$src))),
(v8f16 (REV32v8i16 FPR128:$src))>;
def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))),
(v8f16 (REV16v16i8 FPR128:$src))>;
def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))),
(v8f16 (REV64v8i16 FPR128:$src))>;
def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))),
(v8f16 (REV32v8i16 FPR128:$src))>;
}
// v8i16 -> v8f16 carries no endianness predicate: the register is reused as is.
def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>;
// Little-endian: reinterpreting any of these 128-bit types as v16i8 selects
// the source register unchanged (the result pattern contains no instruction).
let Predicates = [IsLE] in {
foreach SrcVT = [f128, v2i64, v4i32, v8i16, v2f64, v4f32, v8f16] in
def : Pat<(v16i8 (bitconvert (SrcVT FPR128:$src))), (v16i8 FPR128:$src)>;
}
// Big-endian: a bitconvert to v16i8 is not free. Every source type below
// selects an instruction:
//   f128          -> EXTv16i8 (immediate 8) of two REV64v16i8 results
//   v2i64 / v2f64 -> REV64v16i8
//   v4i32 / v4f32 -> REV32v16i8
//   v8i16 / v8f16 -> REV16v16i8
let Predicates = [IsBE] in {
def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))),
(v16i8 (EXTv16i8 (REV64v16i8 FPR128:$src),
(REV64v16i8 FPR128:$src),
(i32 8)))>;
def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))),
(v16i8 (REV64v16i8 FPR128:$src))>;
def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))),
(v16i8 (REV32v16i8 FPR128:$src))>;
def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))),
(v16i8 (REV16v16i8 FPR128:$src))>;
def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))),
(v16i8 (REV64v16i8 FPR128:$src))>;
def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))),
(v16i8 (REV32v16i8 FPR128:$src))>;
def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))),
(v16i8 (REV16v16i8 FPR128:$src))>;
}
// Extracting the subvector at index 0 of a 128-bit vector, for any 64-bit
// result type, is a read of the dsub subregister.
foreach HalfVT = [v4i16, v8i8, v2f32, v4f16, v2i32, v1i64, v1f64] in
def : Pat<(HalfVT (extract_subvector V128:$Rn, (i64 0))),
          (EXTRACT_SUBREG V128:$Rn, dsub)>;
// Extracting the subvector at index 1 of an integer 128-bit vector: apply
// DUPv2i64lane with lane index 1, then read dsub of the result.
class ExtractHighSubvectorPat<ValueType HalfVT, ValueType FullVT>
  : Pat<(HalfVT (extract_subvector (FullVT FPR128:$Rn), (i64 1))),
        (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
def : ExtractHighSubvectorPat<v8i8, v16i8>;
def : ExtractHighSubvectorPat<v4i16, v8i16>;
def : ExtractHighSubvectorPat<v2i32, v4i32>;
def : ExtractHighSubvectorPat<v1i64, v2i64>;
// A 64-bit subvector insert to the first 128-bit vector position
// is a subregister copy that needs no instruction.
//
// Ty is the value type of the constant insertion index (always 0 here). Each
// pattern places a 64-bit vector into the dsub subregister of an IMPLICIT_DEF
// 128-bit vector with the same element type and twice as many lanes.
multiclass InsertSubvectorUndef<ValueType Ty> {
def : Pat<(insert_subvector undef, (v1i64 FPR64:$src), (Ty 0)),
(INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
def : Pat<(insert_subvector undef, (v1f64 FPR64:$src), (Ty 0)),
(INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
def : Pat<(insert_subvector undef, (v2i32 FPR64:$src), (Ty 0)),
(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
def : Pat<(insert_subvector undef, (v2f32 FPR64:$src), (Ty 0)),
(INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
def : Pat<(insert_subvector undef, (v4i16 FPR64:$src), (Ty 0)),
(INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
def : Pat<(insert_subvector undef, (v4f16 FPR64:$src), (Ty 0)),
(INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (Ty 0)),
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
}
// Instantiate the patterns for both i32 and i64 index types.
defm : InsertSubvectorUndef<i32>;
defm : InsertSubvectorUndef<i64>;
// Use pair-wise add instructions when summing up the lanes for v2f64, v2i64
// or v2f32.
// Both vector_extract operands must name the same register ($Rn), with lane
// indices 0 and 1.
// v2i64: integer add of lanes 0 and 1 -> ADDPv2i64p.
def : Pat<(i64 (add (vector_extract (v2i64 FPR128:$Rn), (i64 0)),
(vector_extract (v2i64 FPR128:$Rn), (i64 1)))),
(i64 (ADDPv2i64p (v2i64 FPR128:$Rn)))>;
// v2f64: floating-point add of lanes 0 and 1 -> FADDPv2i64p.
def : Pat<(f64 (fadd (vector_extract (v2f64 FPR128:$Rn), (i64 0)),
(vector_extract (v2f64 FPR128:$Rn), (i64 1)))),
(f64 (FADDPv2i64p (v2f64 FPR128:$Rn)))>;
// vector_extract on 64-bit vectors gets promoted to a 128 bit vector,
// so we match on v4f32 here, not v2f32. This will also catch adding
// the low two lanes of a true v4f32 vector.
// Only the dsub half of $Rn is passed to FADDPv2i32p.
def : Pat<(fadd (vector_extract (v4f32 FPR128:$Rn), (i64 0)),
(vector_extract (v4f32 FPR128:$Rn), (i64 1))),
(f32 (FADDPv2i32p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>;
// Scalar 64-bit shift intrinsics with both operands in FPR64 registers map
// one-to-one onto the matching v1i64 shift instruction.
class ScalarI64ShiftPat<SDPatternOperator ShiftOp, Instruction ShiftInst>
  : Pat<(i64 (ShiftOp (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
        (ShiftInst FPR64:$Rn, FPR64:$Rm)>;
def : ScalarI64ShiftPat<int_aarch64_neon_sshl, SSHLv1i64>;
def : ScalarI64ShiftPat<int_aarch64_neon_ushl, USHLv1i64>;
def : ScalarI64ShiftPat<int_aarch64_neon_srshl, SRSHLv1i64>;
def : ScalarI64ShiftPat<int_aarch64_neon_urshl, URSHLv1i64>;
// Patterns for nontemporal/no-allocate stores.
// We have to resort to tricks to turn a single-input store into a store pair,
// because there is no single-input nontemporal store, only STNP.
// All of these patterns are restricted to little-endian targets.
let Predicates = [IsLE] in {
let AddedComplexity = 15 in {
// 128-bit vector: the pair is (dsub half of $Rt, 64-bit lane 1 of $Rt obtained
// with CPYi64), stored with STNPDi.
class NTStore128Pat<ValueType VT> :
Pat<(nontemporalstore (VT FPR128:$Rt),
(am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),
(STNPDi (EXTRACT_SUBREG FPR128:$Rt, dsub),
(CPYi64 FPR128:$Rt, (i64 1)),
GPR64sp:$Rn, simm7s8:$offset)>;
def : NTStore128Pat<v2i64>;
def : NTStore128Pat<v4i32>;
def : NTStore128Pat<v8i16>;
def : NTStore128Pat<v16i8>;
// 64-bit vector: the pair is (ssub half of $Rt, 32-bit lane 1 obtained with
// CPYi32 after placing $Rt in a 128-bit register via SUBREG_TO_REG), stored
// with STNPSi.
class NTStore64Pat<ValueType VT> :
Pat<(nontemporalstore (VT FPR64:$Rt),
(am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)),
(STNPSi (EXTRACT_SUBREG FPR64:$Rt, ssub),
(CPYi32 (SUBREG_TO_REG (i64 0), FPR64:$Rt, dsub), (i64 1)),
GPR64sp:$Rn, simm7s4:$offset)>;
// FIXME: Shouldn't v1f64 loads/stores be promoted to v1i64?
def : NTStore64Pat<v1f64>;
def : NTStore64Pat<v1i64>;
def : NTStore64Pat<v2i32>;
def : NTStore64Pat<v4i16>;
def : NTStore64Pat<v8i8>;
// 64-bit GPR: the pair is (sub_32 of $Rt, sub_32 of UBFMXri $Rt, 32, 63 --
// i.e. $Rt shifted right by 32), stored with STNPWi.
def : Pat<(nontemporalstore GPR64:$Rt,
(am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)),
(STNPWi (EXTRACT_SUBREG GPR64:$Rt, sub_32),
(EXTRACT_SUBREG (UBFMXri GPR64:$Rt, 32, 63), sub_32),
GPR64sp:$Rn, simm7s4:$offset)>;
} // AddedComplexity = 15
} // Predicates = [IsLE]
// Tail call return handling. These are all compiler pseudo-instructions,
// so no encoding information or anything like that.
// Every pseudo below is flagged as a call, terminator, return and barrier,
// uses SP, takes a destination plus an i32 $FPDiff immediate, has no outputs
// and no selection pattern of its own, and is scheduled as WriteBrReg.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
// Direct form: destination is an i64 immediate operand.
def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff), []>,
Sched<[WriteBrReg]>;
// Indirect form: destination register drawn from tcGPR64.
def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>,
Sched<[WriteBrReg]>;
// Indirect tail-call with any register allowed, used by MachineOutliner when
// this is proven safe.
// FIXME: If we have to add any more hacks like this, we should instead relax
// some verifier checks for outlined functions.
def TCRETURNriALL : Pseudo<(outs), (ins GPR64:$dst, i32imm:$FPDiff), []>,
Sched<[WriteBrReg]>;
// Indirect tail-call limited to only use registers (x16 and x17) which are
// allowed to tail-call a "BTI c" instruction.
def TCRETURNriBTI : Pseudo<(outs), (ins rtcGPR64:$dst, i32imm:$FPDiff), []>,
Sched<[WriteBrReg]>;
}
// Selection of AArch64tcret nodes onto the TCRETURN pseudos.
// Register destination without BTI: use the tcGPR64 form.
def : Pat<(AArch64tcret tcGPR64:$dst, (i32 timm:$FPDiff)),
(TCRETURNri tcGPR64:$dst, imm:$FPDiff)>,
Requires<[NotUseBTI]>;
// Register destination with BTI: use the rtcGPR64 form.
def : Pat<(AArch64tcret rtcGPR64:$dst, (i32 timm:$FPDiff)),
(TCRETURNriBTI rtcGPR64:$dst, imm:$FPDiff)>,
Requires<[UseBTI]>;
// Symbolic destinations (global address or external symbol) use the direct
// form regardless of BTI.
// NOTE(review): the tglobaladdr pattern names $dst as texternalsym in its
// result; presumably harmless because the operand is bound by name -- confirm.
def : Pat<(AArch64tcret tglobaladdr:$dst, (i32 timm:$FPDiff)),
(TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;
def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)),
(TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;
// Pseudo producing a GPR64 result from an MC symbol operand; it has an empty
// scheduling list and is selected for AArch64LocalRecover nodes.
def MOVMCSym : Pseudo<(outs GPR64:$dst), (ins i64imm:$sym), []>, Sched<[]>;
def : Pat<(i64 (AArch64LocalRecover mcsym:$sym)), (MOVMCSym mcsym:$sym)>;
include "AArch64InstrAtomics.td"
include "AArch64SVEInstrInfo.td"
Index: vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64SVEInstrInfo.td
===================================================================
--- vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64SVEInstrInfo.td (revision 351302)
+++ vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64SVEInstrInfo.td (revision 351303)
@@ -1,1426 +1,1430 @@
//=- AArch64SVEInstrInfo.td - AArch64 SVE Instructions -*- tablegen -*-----=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// AArch64 Scalable Vector Extension (SVE) Instruction definitions.
//
//===----------------------------------------------------------------------===//
let Predicates = [HasSVE] in {
def RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr">;
def RDFFRS_PPz : sve_int_rdffr_pred<0b1, "rdffrs">;
def RDFFR_P : sve_int_rdffr_unpred<"rdffr">;
def SETFFR : sve_int_setffr<"setffr">;
def WRFFR : sve_int_wrffr<"wrffr">;
defm ADD_ZZZ : sve_int_bin_cons_arit_0<0b000, "add">;
defm SUB_ZZZ : sve_int_bin_cons_arit_0<0b001, "sub">;
defm SQADD_ZZZ : sve_int_bin_cons_arit_0<0b100, "sqadd">;
defm UQADD_ZZZ : sve_int_bin_cons_arit_0<0b101, "uqadd">;
defm SQSUB_ZZZ : sve_int_bin_cons_arit_0<0b110, "sqsub">;
defm UQSUB_ZZZ : sve_int_bin_cons_arit_0<0b111, "uqsub">;
defm AND_ZZZ : sve_int_bin_cons_log<0b00, "and">;
defm ORR_ZZZ : sve_int_bin_cons_log<0b01, "orr">;
defm EOR_ZZZ : sve_int_bin_cons_log<0b10, "eor">;
defm BIC_ZZZ : sve_int_bin_cons_log<0b11, "bic">;
defm ADD_ZPmZ : sve_int_bin_pred_arit_0<0b000, "add">;
defm SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub">;
defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr">;
defm ORR_ZPmZ : sve_int_bin_pred_log<0b000, "orr">;
defm EOR_ZPmZ : sve_int_bin_pred_log<0b001, "eor">;
defm AND_ZPmZ : sve_int_bin_pred_log<0b010, "and">;
defm BIC_ZPmZ : sve_int_bin_pred_log<0b011, "bic">;
defm ADD_ZI : sve_int_arith_imm0<0b000, "add">;
defm SUB_ZI : sve_int_arith_imm0<0b001, "sub">;
defm SUBR_ZI : sve_int_arith_imm0<0b011, "subr">;
defm SQADD_ZI : sve_int_arith_imm0<0b100, "sqadd">;
defm UQADD_ZI : sve_int_arith_imm0<0b101, "uqadd">;
defm SQSUB_ZI : sve_int_arith_imm0<0b110, "sqsub">;
defm UQSUB_ZI : sve_int_arith_imm0<0b111, "uqsub">;
defm MAD_ZPmZZ : sve_int_mladdsub_vvv_pred<0b0, "mad">;
defm MSB_ZPmZZ : sve_int_mladdsub_vvv_pred<0b1, "msb">;
defm MLA_ZPmZZ : sve_int_mlas_vvv_pred<0b0, "mla">;
defm MLS_ZPmZZ : sve_int_mlas_vvv_pred<0b1, "mls">;
// SVE predicated integer reductions.
defm SADDV_VPZ : sve_int_reduce_0_saddv<0b000, "saddv">;
defm UADDV_VPZ : sve_int_reduce_0_uaddv<0b001, "uaddv">;
defm SMAXV_VPZ : sve_int_reduce_1<0b000, "smaxv">;
defm UMAXV_VPZ : sve_int_reduce_1<0b001, "umaxv">;
defm SMINV_VPZ : sve_int_reduce_1<0b010, "sminv">;
defm UMINV_VPZ : sve_int_reduce_1<0b011, "uminv">;
defm ORV_VPZ : sve_int_reduce_2<0b000, "orv">;
defm EORV_VPZ : sve_int_reduce_2<0b001, "eorv">;
defm ANDV_VPZ : sve_int_reduce_2<0b010, "andv">;
defm ORR_ZI : sve_int_log_imm<0b00, "orr", "orn">;
defm EOR_ZI : sve_int_log_imm<0b01, "eor", "eon">;
defm AND_ZI : sve_int_log_imm<0b10, "and", "bic">;
defm SMAX_ZI : sve_int_arith_imm1<0b00, "smax", simm8>;
defm SMIN_ZI : sve_int_arith_imm1<0b10, "smin", simm8>;
defm UMAX_ZI : sve_int_arith_imm1<0b01, "umax", imm0_255>;
defm UMIN_ZI : sve_int_arith_imm1<0b11, "umin", imm0_255>;
defm MUL_ZI : sve_int_arith_imm2<"mul">;
defm MUL_ZPmZ : sve_int_bin_pred_arit_2<0b000, "mul">;
defm SMULH_ZPmZ : sve_int_bin_pred_arit_2<0b010, "smulh">;
defm UMULH_ZPmZ : sve_int_bin_pred_arit_2<0b011, "umulh">;
defm SDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b100, "sdiv">;
defm UDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b101, "udiv">;
defm SDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b110, "sdivr">;
defm UDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b111, "udivr">;
defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot">;
defm UDOT_ZZZ : sve_intx_dot<0b1, "udot">;
defm SDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b0, "sdot">;
defm UDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b1, "udot">;
defm SXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b000, "sxtb">;
defm UXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b001, "uxtb">;
defm SXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b010, "sxth">;
defm UXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b011, "uxth">;
defm SXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b100, "sxtw">;
defm UXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b101, "uxtw">;
defm ABS_ZPmZ : sve_int_un_pred_arit_0< 0b110, "abs">;
defm NEG_ZPmZ : sve_int_un_pred_arit_0< 0b111, "neg">;
defm CLS_ZPmZ : sve_int_un_pred_arit_1< 0b000, "cls">;
defm CLZ_ZPmZ : sve_int_un_pred_arit_1< 0b001, "clz">;
defm CNT_ZPmZ : sve_int_un_pred_arit_1< 0b010, "cnt">;
defm CNOT_ZPmZ : sve_int_un_pred_arit_1< 0b011, "cnot">;
defm NOT_ZPmZ : sve_int_un_pred_arit_1< 0b110, "not">;
defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs">;
defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg">;
defm SMAX_ZPmZ : sve_int_bin_pred_arit_1<0b000, "smax">;
defm UMAX_ZPmZ : sve_int_bin_pred_arit_1<0b001, "umax">;
defm SMIN_ZPmZ : sve_int_bin_pred_arit_1<0b010, "smin">;
defm UMIN_ZPmZ : sve_int_bin_pred_arit_1<0b011, "umin">;
defm SABD_ZPmZ : sve_int_bin_pred_arit_1<0b100, "sabd">;
defm UABD_ZPmZ : sve_int_bin_pred_arit_1<0b101, "uabd">;
defm FRECPE_ZZ : sve_fp_2op_u_zd<0b110, "frecpe">;
defm FRSQRTE_ZZ : sve_fp_2op_u_zd<0b111, "frsqrte">;
defm FADD_ZPmI : sve_fp_2op_i_p_zds<0b000, "fadd", sve_fpimm_half_one>;
defm FSUB_ZPmI : sve_fp_2op_i_p_zds<0b001, "fsub", sve_fpimm_half_one>;
defm FMUL_ZPmI : sve_fp_2op_i_p_zds<0b010, "fmul", sve_fpimm_half_two>;
defm FSUBR_ZPmI : sve_fp_2op_i_p_zds<0b011, "fsubr", sve_fpimm_half_one>;
defm FMAXNM_ZPmI : sve_fp_2op_i_p_zds<0b100, "fmaxnm", sve_fpimm_zero_one>;
defm FMINNM_ZPmI : sve_fp_2op_i_p_zds<0b101, "fminnm", sve_fpimm_zero_one>;
defm FMAX_ZPmI : sve_fp_2op_i_p_zds<0b110, "fmax", sve_fpimm_zero_one>;
defm FMIN_ZPmI : sve_fp_2op_i_p_zds<0b111, "fmin", sve_fpimm_zero_one>;
defm FADD_ZPmZ : sve_fp_2op_p_zds<0b0000, "fadd">;
defm FSUB_ZPmZ : sve_fp_2op_p_zds<0b0001, "fsub">;
defm FMUL_ZPmZ : sve_fp_2op_p_zds<0b0010, "fmul">;
defm FSUBR_ZPmZ : sve_fp_2op_p_zds<0b0011, "fsubr">;
defm FMAXNM_ZPmZ : sve_fp_2op_p_zds<0b0100, "fmaxnm">;
defm FMINNM_ZPmZ : sve_fp_2op_p_zds<0b0101, "fminnm">;
defm FMAX_ZPmZ : sve_fp_2op_p_zds<0b0110, "fmax">;
defm FMIN_ZPmZ : sve_fp_2op_p_zds<0b0111, "fmin">;
defm FABD_ZPmZ : sve_fp_2op_p_zds<0b1000, "fabd">;
defm FSCALE_ZPmZ : sve_fp_2op_p_zds<0b1001, "fscale">;
defm FMULX_ZPmZ : sve_fp_2op_p_zds<0b1010, "fmulx">;
defm FDIVR_ZPmZ : sve_fp_2op_p_zds<0b1100, "fdivr">;
defm FDIV_ZPmZ : sve_fp_2op_p_zds<0b1101, "fdiv">;
defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd">;
defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub">;
defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul">;
defm FTSMUL_ZZZ : sve_fp_3op_u_zd<0b011, "ftsmul">;
defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps">;
defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts">;
defm FTSSEL_ZZZ : sve_int_bin_cons_misc_0_b<"ftssel">;
defm FCADD_ZPmZ : sve_fp_fcadd<"fcadd">;
defm FCMLA_ZPmZZ : sve_fp_fcmla<"fcmla">;
defm FMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b00, "fmla">;
defm FMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b01, "fmls">;
defm FNMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b10, "fnmla">;
defm FNMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b11, "fnmls">;
defm FMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b00, "fmad">;
defm FMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b01, "fmsb">;
defm FNMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b10, "fnmad">;
defm FNMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b11, "fnmsb">;
defm FTMAD_ZZI : sve_fp_ftmad<"ftmad">;
defm FMLA_ZZZI : sve_fp_fma_by_indexed_elem<0b0, "fmla">;
defm FMLS_ZZZI : sve_fp_fma_by_indexed_elem<0b1, "fmls">;
defm FCMLA_ZZZI : sve_fp_fcmla_by_indexed_elem<"fcmla">;
defm FMUL_ZZZI : sve_fp_fmul_by_indexed_elem<"fmul">;
// SVE floating point reductions.
defm FADDA_VPZ : sve_fp_2op_p_vd<0b000, "fadda">;
defm FADDV_VPZ : sve_fp_fast_red<0b000, "faddv">;
defm FMAXNMV_VPZ : sve_fp_fast_red<0b100, "fmaxnmv">;
defm FMINNMV_VPZ : sve_fp_fast_red<0b101, "fminnmv">;
defm FMAXV_VPZ : sve_fp_fast_red<0b110, "fmaxv">;
defm FMINV_VPZ : sve_fp_fast_red<0b111, "fminv">;
// Splat immediate (unpredicated)
defm DUP_ZI : sve_int_dup_imm<"dup">;
defm FDUP_ZI : sve_int_dup_fpimm<"fdup">;
defm DUPM_ZI : sve_int_dup_mask_imm<"dupm">;
// Splat immediate (predicated)
defm CPY_ZPmI : sve_int_dup_imm_pred_merge<"cpy">;
defm CPY_ZPzI : sve_int_dup_imm_pred_zero<"cpy">;
defm FCPY_ZPmI : sve_int_dup_fpimm_pred<"fcpy">;
// Splat scalar register (unpredicated, GPR or vector + element index)
defm DUP_ZR : sve_int_perm_dup_r<"dup">;
defm DUP_ZZI : sve_int_perm_dup_i<"dup">;
// Splat scalar register (predicated)
defm CPY_ZPmR : sve_int_perm_cpy_r<"cpy">;
defm CPY_ZPmV : sve_int_perm_cpy_v<"cpy">;
// Select elements from either vector (predicated)
defm SEL_ZPZZ : sve_int_sel_vvv<"sel">;
defm SPLICE_ZPZ : sve_int_perm_splice<"splice">;
defm COMPACT_ZPZ : sve_int_perm_compact<"compact">;
defm INSR_ZR : sve_int_perm_insrs<"insr">;
defm INSR_ZV : sve_int_perm_insrv<"insr">;
def EXT_ZZI : sve_int_perm_extract_i<"ext">;
defm RBIT_ZPmZ : sve_int_perm_rev_rbit<"rbit">;
defm REVB_ZPmZ : sve_int_perm_rev_revb<"revb">;
defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh">;
defm REVW_ZPmZ : sve_int_perm_rev_revw<"revw">;
defm REV_PP : sve_int_perm_reverse_p<"rev">;
defm REV_ZZ : sve_int_perm_reverse_z<"rev">;
defm SUNPKLO_ZZ : sve_int_perm_unpk<0b00, "sunpklo">;
defm SUNPKHI_ZZ : sve_int_perm_unpk<0b01, "sunpkhi">;
defm UUNPKLO_ZZ : sve_int_perm_unpk<0b10, "uunpklo">;
defm UUNPKHI_ZZ : sve_int_perm_unpk<0b11, "uunpkhi">;
def PUNPKLO_PP : sve_int_perm_punpk<0b0, "punpklo">;
def PUNPKHI_PP : sve_int_perm_punpk<0b1, "punpkhi">;
defm MOVPRFX_ZPzZ : sve_int_movprfx_pred_zero<0b000, "movprfx">;
defm MOVPRFX_ZPmZ : sve_int_movprfx_pred_merge<0b001, "movprfx">;
def MOVPRFX_ZZ : sve_int_bin_cons_misc_0_c<0b00000001, "movprfx", ZPRAny>;
def FEXPA_ZZ_H : sve_int_bin_cons_misc_0_c<0b01000000, "fexpa", ZPR16>;
def FEXPA_ZZ_S : sve_int_bin_cons_misc_0_c<0b10000000, "fexpa", ZPR32>;
def FEXPA_ZZ_D : sve_int_bin_cons_misc_0_c<0b11000000, "fexpa", ZPR64>;
def BRKPA_PPzPP : sve_int_brkp<0b00, "brkpa">;
def BRKPAS_PPzPP : sve_int_brkp<0b10, "brkpas">;
def BRKPB_PPzPP : sve_int_brkp<0b01, "brkpb">;
def BRKPBS_PPzPP : sve_int_brkp<0b11, "brkpbs">;
def BRKN_PPzP : sve_int_brkn<0b0, "brkn">;
def BRKNS_PPzP : sve_int_brkn<0b1, "brkns">;
defm BRKA_PPzP : sve_int_break_z<0b000, "brka">;
defm BRKA_PPmP : sve_int_break_m<0b001, "brka">;
defm BRKAS_PPzP : sve_int_break_z<0b010, "brkas">;
defm BRKB_PPzP : sve_int_break_z<0b100, "brkb">;
defm BRKB_PPmP : sve_int_break_m<0b101, "brkb">;
defm BRKBS_PPzP : sve_int_break_z<0b110, "brkbs">;
def PTEST_PP : sve_int_ptest<0b010000, "ptest">;
def PFALSE : sve_int_pfalse<0b000000, "pfalse">;
defm PFIRST : sve_int_pfirst<0b00000, "pfirst">;
defm PNEXT : sve_int_pnext<0b00110, "pnext">;
def AND_PPzPP : sve_int_pred_log<0b0000, "and">;
def BIC_PPzPP : sve_int_pred_log<0b0001, "bic">;
def EOR_PPzPP : sve_int_pred_log<0b0010, "eor">;
def SEL_PPPP : sve_int_pred_log<0b0011, "sel">;
def ANDS_PPzPP : sve_int_pred_log<0b0100, "ands">;
def BICS_PPzPP : sve_int_pred_log<0b0101, "bics">;
def EORS_PPzPP : sve_int_pred_log<0b0110, "eors">;
def ORR_PPzPP : sve_int_pred_log<0b1000, "orr">;
def ORN_PPzPP : sve_int_pred_log<0b1001, "orn">;
def NOR_PPzPP : sve_int_pred_log<0b1010, "nor">;
def NAND_PPzPP : sve_int_pred_log<0b1011, "nand">;
def ORRS_PPzPP : sve_int_pred_log<0b1100, "orrs">;
def ORNS_PPzPP : sve_int_pred_log<0b1101, "orns">;
def NORS_PPzPP : sve_int_pred_log<0b1110, "nors">;
def NANDS_PPzPP : sve_int_pred_log<0b1111, "nands">;
defm CLASTA_RPZ : sve_int_perm_clast_rz<0, "clasta">;
defm CLASTB_RPZ : sve_int_perm_clast_rz<1, "clastb">;
defm CLASTA_VPZ : sve_int_perm_clast_vz<0, "clasta">;
defm CLASTB_VPZ : sve_int_perm_clast_vz<1, "clastb">;
defm CLASTA_ZPZ : sve_int_perm_clast_zz<0, "clasta">;
defm CLASTB_ZPZ : sve_int_perm_clast_zz<1, "clastb">;
defm LASTA_RPZ : sve_int_perm_last_r<0, "lasta">;
defm LASTB_RPZ : sve_int_perm_last_r<1, "lastb">;
defm LASTA_VPZ : sve_int_perm_last_v<0, "lasta">;
defm LASTB_VPZ : sve_int_perm_last_v<1, "lastb">;
// contiguous load with reg+immediate
defm LD1B_IMM : sve_mem_cld_si<0b0000, "ld1b", Z_b, ZPR8>;
defm LD1B_H_IMM : sve_mem_cld_si<0b0001, "ld1b", Z_h, ZPR16>;
defm LD1B_S_IMM : sve_mem_cld_si<0b0010, "ld1b", Z_s, ZPR32>;
defm LD1B_D_IMM : sve_mem_cld_si<0b0011, "ld1b", Z_d, ZPR64>;
defm LD1SW_D_IMM : sve_mem_cld_si<0b0100, "ld1sw", Z_d, ZPR64>;
defm LD1H_IMM : sve_mem_cld_si<0b0101, "ld1h", Z_h, ZPR16>;
defm LD1H_S_IMM : sve_mem_cld_si<0b0110, "ld1h", Z_s, ZPR32>;
defm LD1H_D_IMM : sve_mem_cld_si<0b0111, "ld1h", Z_d, ZPR64>;
defm LD1SH_D_IMM : sve_mem_cld_si<0b1000, "ld1sh", Z_d, ZPR64>;
defm LD1SH_S_IMM : sve_mem_cld_si<0b1001, "ld1sh", Z_s, ZPR32>;
defm LD1W_IMM : sve_mem_cld_si<0b1010, "ld1w", Z_s, ZPR32>;
defm LD1W_D_IMM : sve_mem_cld_si<0b1011, "ld1w", Z_d, ZPR64>;
defm LD1SB_D_IMM : sve_mem_cld_si<0b1100, "ld1sb", Z_d, ZPR64>;
defm LD1SB_S_IMM : sve_mem_cld_si<0b1101, "ld1sb", Z_s, ZPR32>;
defm LD1SB_H_IMM : sve_mem_cld_si<0b1110, "ld1sb", Z_h, ZPR16>;
defm LD1D_IMM : sve_mem_cld_si<0b1111, "ld1d", Z_d, ZPR64>;
// LD1R loads (splat scalar to vector)
defm LD1RB_IMM : sve_mem_ld_dup<0b00, 0b00, "ld1rb", Z_b, ZPR8, uimm6s1>;
defm LD1RB_H_IMM : sve_mem_ld_dup<0b00, 0b01, "ld1rb", Z_h, ZPR16, uimm6s1>;
defm LD1RB_S_IMM : sve_mem_ld_dup<0b00, 0b10, "ld1rb", Z_s, ZPR32, uimm6s1>;
defm LD1RB_D_IMM : sve_mem_ld_dup<0b00, 0b11, "ld1rb", Z_d, ZPR64, uimm6s1>;
defm LD1RSW_IMM : sve_mem_ld_dup<0b01, 0b00, "ld1rsw", Z_d, ZPR64, uimm6s4>;
defm LD1RH_IMM : sve_mem_ld_dup<0b01, 0b01, "ld1rh", Z_h, ZPR16, uimm6s2>;
defm LD1RH_S_IMM : sve_mem_ld_dup<0b01, 0b10, "ld1rh", Z_s, ZPR32, uimm6s2>;
defm LD1RH_D_IMM : sve_mem_ld_dup<0b01, 0b11, "ld1rh", Z_d, ZPR64, uimm6s2>;
defm LD1RSH_D_IMM : sve_mem_ld_dup<0b10, 0b00, "ld1rsh", Z_d, ZPR64, uimm6s2>;
defm LD1RSH_S_IMM : sve_mem_ld_dup<0b10, 0b01, "ld1rsh", Z_s, ZPR32, uimm6s2>;
defm LD1RW_IMM : sve_mem_ld_dup<0b10, 0b10, "ld1rw", Z_s, ZPR32, uimm6s4>;
defm LD1RW_D_IMM : sve_mem_ld_dup<0b10, 0b11, "ld1rw", Z_d, ZPR64, uimm6s4>;
defm LD1RSB_D_IMM : sve_mem_ld_dup<0b11, 0b00, "ld1rsb", Z_d, ZPR64, uimm6s1>;
defm LD1RSB_S_IMM : sve_mem_ld_dup<0b11, 0b01, "ld1rsb", Z_s, ZPR32, uimm6s1>;
defm LD1RSB_H_IMM : sve_mem_ld_dup<0b11, 0b10, "ld1rsb", Z_h, ZPR16, uimm6s1>;
defm LD1RD_IMM : sve_mem_ld_dup<0b11, 0b11, "ld1rd", Z_d, ZPR64, uimm6s8>;
// LD1RQ loads (load quadword-vector and splat to scalable vector)
defm LD1RQ_B_IMM : sve_mem_ldqr_si<0b00, "ld1rqb", Z_b, ZPR8>;
defm LD1RQ_H_IMM : sve_mem_ldqr_si<0b01, "ld1rqh", Z_h, ZPR16>;
defm LD1RQ_W_IMM : sve_mem_ldqr_si<0b10, "ld1rqw", Z_s, ZPR32>;
defm LD1RQ_D_IMM : sve_mem_ldqr_si<0b11, "ld1rqd", Z_d, ZPR64>;
defm LD1RQ_B : sve_mem_ldqr_ss<0b00, "ld1rqb", Z_b, ZPR8, GPR64NoXZRshifted8>;
defm LD1RQ_H : sve_mem_ldqr_ss<0b01, "ld1rqh", Z_h, ZPR16, GPR64NoXZRshifted16>;
defm LD1RQ_W : sve_mem_ldqr_ss<0b10, "ld1rqw", Z_s, ZPR32, GPR64NoXZRshifted32>;
defm LD1RQ_D : sve_mem_ldqr_ss<0b11, "ld1rqd", Z_d, ZPR64, GPR64NoXZRshifted64>;
// contiguous load with reg+reg addressing.
defm LD1B : sve_mem_cld_ss<0b0000, "ld1b", Z_b, ZPR8, GPR64NoXZRshifted8>;
defm LD1B_H : sve_mem_cld_ss<0b0001, "ld1b", Z_h, ZPR16, GPR64NoXZRshifted8>;
defm LD1B_S : sve_mem_cld_ss<0b0010, "ld1b", Z_s, ZPR32, GPR64NoXZRshifted8>;
defm LD1B_D : sve_mem_cld_ss<0b0011, "ld1b", Z_d, ZPR64, GPR64NoXZRshifted8>;
defm LD1SW_D : sve_mem_cld_ss<0b0100, "ld1sw", Z_d, ZPR64, GPR64NoXZRshifted32>;
defm LD1H : sve_mem_cld_ss<0b0101, "ld1h", Z_h, ZPR16, GPR64NoXZRshifted16>;
defm LD1H_S : sve_mem_cld_ss<0b0110, "ld1h", Z_s, ZPR32, GPR64NoXZRshifted16>;
defm LD1H_D : sve_mem_cld_ss<0b0111, "ld1h", Z_d, ZPR64, GPR64NoXZRshifted16>;
defm LD1SH_D : sve_mem_cld_ss<0b1000, "ld1sh", Z_d, ZPR64, GPR64NoXZRshifted16>;
defm LD1SH_S : sve_mem_cld_ss<0b1001, "ld1sh", Z_s, ZPR32, GPR64NoXZRshifted16>;
defm LD1W : sve_mem_cld_ss<0b1010, "ld1w", Z_s, ZPR32, GPR64NoXZRshifted32>;
defm LD1W_D : sve_mem_cld_ss<0b1011, "ld1w", Z_d, ZPR64, GPR64NoXZRshifted32>;
defm LD1SB_D : sve_mem_cld_ss<0b1100, "ld1sb", Z_d, ZPR64, GPR64NoXZRshifted8>;
defm LD1SB_S : sve_mem_cld_ss<0b1101, "ld1sb", Z_s, ZPR32, GPR64NoXZRshifted8>;
defm LD1SB_H : sve_mem_cld_ss<0b1110, "ld1sb", Z_h, ZPR16, GPR64NoXZRshifted8>;
defm LD1D : sve_mem_cld_ss<0b1111, "ld1d", Z_d, ZPR64, GPR64NoXZRshifted64>;
// non-faulting contiguous load with reg+immediate
defm LDNF1B_IMM : sve_mem_cldnf_si<0b0000, "ldnf1b", Z_b, ZPR8>;
defm LDNF1B_H_IMM : sve_mem_cldnf_si<0b0001, "ldnf1b", Z_h, ZPR16>;
defm LDNF1B_S_IMM : sve_mem_cldnf_si<0b0010, "ldnf1b", Z_s, ZPR32>;
defm LDNF1B_D_IMM : sve_mem_cldnf_si<0b0011, "ldnf1b", Z_d, ZPR64>;
defm LDNF1SW_D_IMM : sve_mem_cldnf_si<0b0100, "ldnf1sw", Z_d, ZPR64>;
defm LDNF1H_IMM : sve_mem_cldnf_si<0b0101, "ldnf1h", Z_h, ZPR16>;
defm LDNF1H_S_IMM : sve_mem_cldnf_si<0b0110, "ldnf1h", Z_s, ZPR32>;
defm LDNF1H_D_IMM : sve_mem_cldnf_si<0b0111, "ldnf1h", Z_d, ZPR64>;
defm LDNF1SH_D_IMM : sve_mem_cldnf_si<0b1000, "ldnf1sh", Z_d, ZPR64>;
defm LDNF1SH_S_IMM : sve_mem_cldnf_si<0b1001, "ldnf1sh", Z_s, ZPR32>;
defm LDNF1W_IMM : sve_mem_cldnf_si<0b1010, "ldnf1w", Z_s, ZPR32>;
defm LDNF1W_D_IMM : sve_mem_cldnf_si<0b1011, "ldnf1w", Z_d, ZPR64>;
defm LDNF1SB_D_IMM : sve_mem_cldnf_si<0b1100, "ldnf1sb", Z_d, ZPR64>;
defm LDNF1SB_S_IMM : sve_mem_cldnf_si<0b1101, "ldnf1sb", Z_s, ZPR32>;
defm LDNF1SB_H_IMM : sve_mem_cldnf_si<0b1110, "ldnf1sb", Z_h, ZPR16>;
defm LDNF1D_IMM : sve_mem_cldnf_si<0b1111, "ldnf1d", Z_d, ZPR64>;
// First-faulting loads with reg+reg addressing.
defm LDFF1B : sve_mem_cldff_ss<0b0000, "ldff1b", Z_b, ZPR8, GPR64shifted8>;
defm LDFF1B_H : sve_mem_cldff_ss<0b0001, "ldff1b", Z_h, ZPR16, GPR64shifted8>;
defm LDFF1B_S : sve_mem_cldff_ss<0b0010, "ldff1b", Z_s, ZPR32, GPR64shifted8>;
defm LDFF1B_D : sve_mem_cldff_ss<0b0011, "ldff1b", Z_d, ZPR64, GPR64shifted8>;
defm LDFF1SW_D : sve_mem_cldff_ss<0b0100, "ldff1sw", Z_d, ZPR64, GPR64shifted32>;
defm LDFF1H : sve_mem_cldff_ss<0b0101, "ldff1h", Z_h, ZPR16, GPR64shifted16>;
defm LDFF1H_S : sve_mem_cldff_ss<0b0110, "ldff1h", Z_s, ZPR32, GPR64shifted16>;
defm LDFF1H_D : sve_mem_cldff_ss<0b0111, "ldff1h", Z_d, ZPR64, GPR64shifted16>;
defm LDFF1SH_D : sve_mem_cldff_ss<0b1000, "ldff1sh", Z_d, ZPR64, GPR64shifted16>;
defm LDFF1SH_S : sve_mem_cldff_ss<0b1001, "ldff1sh", Z_s, ZPR32, GPR64shifted16>;
defm LDFF1W : sve_mem_cldff_ss<0b1010, "ldff1w", Z_s, ZPR32, GPR64shifted32>;
defm LDFF1W_D : sve_mem_cldff_ss<0b1011, "ldff1w", Z_d, ZPR64, GPR64shifted32>;
defm LDFF1SB_D : sve_mem_cldff_ss<0b1100, "ldff1sb", Z_d, ZPR64, GPR64shifted8>;
defm LDFF1SB_S : sve_mem_cldff_ss<0b1101, "ldff1sb", Z_s, ZPR32, GPR64shifted8>;
defm LDFF1SB_H : sve_mem_cldff_ss<0b1110, "ldff1sb", Z_h, ZPR16, GPR64shifted8>;
defm LDFF1D : sve_mem_cldff_ss<0b1111, "ldff1d", Z_d, ZPR64, GPR64shifted64>;
// LD(2|3|4) structured loads with reg+immediate
defm LD2B_IMM : sve_mem_eld_si<0b00, 0b01, ZZ_b, "ld2b", simm4s2>;
defm LD3B_IMM : sve_mem_eld_si<0b00, 0b10, ZZZ_b, "ld3b", simm4s3>;
defm LD4B_IMM : sve_mem_eld_si<0b00, 0b11, ZZZZ_b, "ld4b", simm4s4>;
defm LD2H_IMM : sve_mem_eld_si<0b01, 0b01, ZZ_h, "ld2h", simm4s2>;
defm LD3H_IMM : sve_mem_eld_si<0b01, 0b10, ZZZ_h, "ld3h", simm4s3>;
defm LD4H_IMM : sve_mem_eld_si<0b01, 0b11, ZZZZ_h, "ld4h", simm4s4>;
defm LD2W_IMM : sve_mem_eld_si<0b10, 0b01, ZZ_s, "ld2w", simm4s2>;
defm LD3W_IMM : sve_mem_eld_si<0b10, 0b10, ZZZ_s, "ld3w", simm4s3>;
defm LD4W_IMM : sve_mem_eld_si<0b10, 0b11, ZZZZ_s, "ld4w", simm4s4>;
defm LD2D_IMM : sve_mem_eld_si<0b11, 0b01, ZZ_d, "ld2d", simm4s2>;
defm LD3D_IMM : sve_mem_eld_si<0b11, 0b10, ZZZ_d, "ld3d", simm4s3>;
defm LD4D_IMM : sve_mem_eld_si<0b11, 0b11, ZZZZ_d, "ld4d", simm4s4>;
// LD(2|3|4) structured loads (register + register)
def LD2B : sve_mem_eld_ss<0b00, 0b01, ZZ_b, "ld2b", GPR64NoXZRshifted8>;
def LD3B : sve_mem_eld_ss<0b00, 0b10, ZZZ_b, "ld3b", GPR64NoXZRshifted8>;
def LD4B : sve_mem_eld_ss<0b00, 0b11, ZZZZ_b, "ld4b", GPR64NoXZRshifted8>;
def LD2H : sve_mem_eld_ss<0b01, 0b01, ZZ_h, "ld2h", GPR64NoXZRshifted16>;
def LD3H : sve_mem_eld_ss<0b01, 0b10, ZZZ_h, "ld3h", GPR64NoXZRshifted16>;
def LD4H : sve_mem_eld_ss<0b01, 0b11, ZZZZ_h, "ld4h", GPR64NoXZRshifted16>;
def LD2W : sve_mem_eld_ss<0b10, 0b01, ZZ_s, "ld2w", GPR64NoXZRshifted32>;
def LD3W : sve_mem_eld_ss<0b10, 0b10, ZZZ_s, "ld3w", GPR64NoXZRshifted32>;
def LD4W : sve_mem_eld_ss<0b10, 0b11, ZZZZ_s, "ld4w", GPR64NoXZRshifted32>;
def LD2D : sve_mem_eld_ss<0b11, 0b01, ZZ_d, "ld2d", GPR64NoXZRshifted64>;
def LD3D : sve_mem_eld_ss<0b11, 0b10, ZZZ_d, "ld3d", GPR64NoXZRshifted64>;
def LD4D : sve_mem_eld_ss<0b11, 0b11, ZZZZ_d, "ld4d", GPR64NoXZRshifted64>;
// Gathers using unscaled 32-bit offsets, e.g.
// ld1h z0.s, p0/z, [x0, z0.s, uxtw]
defm GLD1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0000, "ld1sb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
defm GLDFF1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0001, "ldff1sb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
defm GLD1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0010, "ld1b", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
defm GLDFF1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0011, "ldff1b", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
defm GLD1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0100, "ld1sh", ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
defm GLDFF1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0101, "ldff1sh", ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
defm GLD1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0110, "ld1h", ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
defm GLDFF1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0111, "ldff1h", ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
defm GLD1W : sve_mem_32b_gld_vs_32_unscaled<0b1010, "ld1w", ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
defm GLDFF1W : sve_mem_32b_gld_vs_32_unscaled<0b1011, "ldff1w", ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
// Gathers using scaled 32-bit offsets, e.g.
// ld1h z0.s, p0/z, [x0, z0.s, uxtw #1]
// Same opcode values and mnemonics as the unscaled halfword/word gathers,
// but the offset operand classes carry the access width (…16 for halfword,
// …32 for word). There are no byte entries in this section.
defm GLD1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0100, "ld1sh", ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
defm GLDFF1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0101, "ldff1sh", ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
defm GLD1H_S : sve_mem_32b_gld_sv_32_scaled<0b0110, "ld1h", ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
defm GLDFF1H_S : sve_mem_32b_gld_sv_32_scaled<0b0111, "ldff1h", ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
defm GLD1W : sve_mem_32b_gld_sv_32_scaled<0b1010, "ld1w", ZPR32ExtSXTW32, ZPR32ExtUXTW32>;
defm GLDFF1W : sve_mem_32b_gld_sv_32_scaled<0b1011, "ldff1w", ZPR32ExtSXTW32, ZPR32ExtUXTW32>;
// Gathers using scaled 32-bit pointers with offset, e.g.
// ld1h z0.s, p0/z, [z0.s, #16]
// Arguments: 4-bit opcode value, mnemonic, and the immediate operand class.
// The immediate class follows the access size: imm0_31 for bytes, uimm5s2
// for halfwords and uimm5s4 for words.
defm GLD1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0000, "ld1sb", imm0_31>;
defm GLDFF1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0001, "ldff1sb", imm0_31>;
defm GLD1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0010, "ld1b", imm0_31>;
defm GLDFF1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0011, "ldff1b", imm0_31>;
defm GLD1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0100, "ld1sh", uimm5s2>;
defm GLDFF1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0101, "ldff1sh", uimm5s2>;
defm GLD1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0110, "ld1h", uimm5s2>;
defm GLDFF1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0111, "ldff1h", uimm5s2>;
defm GLD1W : sve_mem_32b_gld_vi_32_ptrs<0b1010, "ld1w", uimm5s4>;
defm GLDFF1W : sve_mem_32b_gld_vi_32_ptrs<0b1011, "ldff1w", uimm5s4>;
// Gathers using scaled 64-bit pointers with offset, e.g.
// ld1h z0.d, p0/z, [z0.d, #16]
// Arguments: 4-bit opcode value, mnemonic, and the immediate operand class.
// Compared with the 32-bit pointer form this section adds the sign-extending
// word loads (ld1sw/ldff1sw) and the doubleword loads, the latter using
// uimm5s8.
defm GLD1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0000, "ld1sb", imm0_31>;
defm GLDFF1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0001, "ldff1sb", imm0_31>;
defm GLD1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0010, "ld1b", imm0_31>;
defm GLDFF1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0011, "ldff1b", imm0_31>;
defm GLD1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0100, "ld1sh", uimm5s2>;
defm GLDFF1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0101, "ldff1sh", uimm5s2>;
defm GLD1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0110, "ld1h", uimm5s2>;
defm GLDFF1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0111, "ldff1h", uimm5s2>;
defm GLD1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1000, "ld1sw", uimm5s4>;
defm GLDFF1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1001, "ldff1sw", uimm5s4>;
defm GLD1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1010, "ld1w", uimm5s4>;
defm GLDFF1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1011, "ldff1w", uimm5s4>;
defm GLD1D : sve_mem_64b_gld_vi_64_ptrs<0b1110, "ld1d", uimm5s8>;
defm GLDFF1D : sve_mem_64b_gld_vi_64_ptrs<0b1111, "ldff1d", uimm5s8>;
// Gathers using unscaled 64-bit offsets, e.g.
// ld1h z0.d, p0/z, [x0, z0.d]
// Arguments: 4-bit opcode value and mnemonic only; no operand class is
// passed here, so the offset operand is fixed by the multiclass itself.
defm GLD1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb">;
defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb">;
defm GLD1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b">;
defm GLDFF1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b">;
defm GLD1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh">;
defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh">;
defm GLD1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h">;
defm GLDFF1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h">;
defm GLD1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw">;
defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw">;
defm GLD1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w">;
defm GLDFF1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w">;
defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d">;
defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d">;
// Gathers using scaled 64-bit offsets, e.g.
// ld1h z0.d, p0/z, [x0, z0.d, lsl #1]
// Arguments: 4-bit opcode value, mnemonic, and the LSL-scaled offset operand
// class, whose numeric suffix follows the access width (16/32/64). There are
// no byte entries in this section.
defm GLD1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh", ZPR64ExtLSL16>;
defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", ZPR64ExtLSL16>;
defm GLD1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h", ZPR64ExtLSL16>;
defm GLDFF1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h", ZPR64ExtLSL16>;
defm GLD1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw", ZPR64ExtLSL32>;
defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", ZPR64ExtLSL32>;
defm GLD1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w", ZPR64ExtLSL32>;
defm GLDFF1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w", ZPR64ExtLSL32>;
defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", ZPR64ExtLSL64>;
defm GLDFF1D : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d", ZPR64ExtLSL64>;
// Gathers using unscaled 32-bit offsets unpacked in 64-bit elements, e.g.
// ld1h z0.d, p0/z, [x0, z0.d, uxtw]
// Arguments: 4-bit opcode value, mnemonic, then the SXTW and UXTW
// vector-offset operand classes. Byte accesses use the "...8Only" classes;
// every wider access uses the plain "...8" classes.
defm GLD1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0000, "ld1sb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
defm GLDFF1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0001, "ldff1sb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
defm GLD1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0010, "ld1b", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
defm GLDFF1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0011, "ldff1b", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
defm GLD1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0100, "ld1sh", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
defm GLDFF1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0101, "ldff1sh", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
defm GLD1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0110, "ld1h", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
defm GLDFF1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0111, "ldff1h", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
defm GLD1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1000, "ld1sw", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
defm GLDFF1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1001, "ldff1sw", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
defm GLD1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1010, "ld1w", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
defm GLDFF1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1011, "ldff1w", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
defm GLD1D : sve_mem_64b_gld_vs_32_unscaled<0b1110, "ld1d", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
defm GLDFF1D : sve_mem_64b_gld_vs_32_unscaled<0b1111, "ldff1d", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
// Gathers using scaled 32-bit offsets unpacked in 64-bit elements, e.g.
// ld1h z0.d, p0/z, [x0, z0.d, uxtw #1]
// Arguments: 4-bit opcode value, mnemonic, then the SXTW and UXTW
// vector-offset operand classes, whose numeric suffix follows the access
// width (16/32/64). There are no byte entries in this section.
defm GLD1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0100, "ld1sh", ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
defm GLDFF1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0101, "ldff1sh",ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
defm GLD1H_D : sve_mem_64b_gld_sv_32_scaled<0b0110, "ld1h", ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
defm GLDFF1H_D : sve_mem_64b_gld_sv_32_scaled<0b0111, "ldff1h", ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
defm GLD1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1000, "ld1sw", ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
defm GLDFF1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1001, "ldff1sw",ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
defm GLD1W_D : sve_mem_64b_gld_sv_32_scaled<0b1010, "ld1w", ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
defm GLDFF1W_D : sve_mem_64b_gld_sv_32_scaled<0b1011, "ldff1w", ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
defm GLD1D : sve_mem_64b_gld_sv_32_scaled<0b1110, "ld1d", ZPR64ExtSXTW64, ZPR64ExtUXTW64>;
defm GLDFF1D : sve_mem_64b_gld_sv_32_scaled<0b1111, "ldff1d", ZPR64ExtSXTW64, ZPR64ExtUXTW64>;
// Non-temporal contiguous loads (register + immediate)
// Arguments: 2-bit value that follows the element size (0b00 = b ... 0b11 = d),
// mnemonic, the vector-list operand and the matching ZPR register class.
defm LDNT1B_ZRI : sve_mem_cldnt_si<0b00, "ldnt1b", Z_b, ZPR8>;
defm LDNT1H_ZRI : sve_mem_cldnt_si<0b01, "ldnt1h", Z_h, ZPR16>;
defm LDNT1W_ZRI : sve_mem_cldnt_si<0b10, "ldnt1w", Z_s, ZPR32>;
defm LDNT1D_ZRI : sve_mem_cldnt_si<0b11, "ldnt1d", Z_d, ZPR64>;
// Non-temporal contiguous loads (register + register)
// Same arguments as the immediate form plus the scalar index register class,
// whose "shiftedN" suffix matches the element width in bits.
defm LDNT1B_ZRR : sve_mem_cldnt_ss<0b00, "ldnt1b", Z_b, ZPR8, GPR64NoXZRshifted8>;
defm LDNT1H_ZRR : sve_mem_cldnt_ss<0b01, "ldnt1h", Z_h, ZPR16, GPR64NoXZRshifted16>;
defm LDNT1W_ZRR : sve_mem_cldnt_ss<0b10, "ldnt1w", Z_s, ZPR32, GPR64NoXZRshifted32>;
defm LDNT1D_ZRR : sve_mem_cldnt_ss<0b11, "ldnt1d", Z_d, ZPR64, GPR64NoXZRshifted64>;
// contiguous store with immediates
// Arguments: two 2-bit values, mnemonic, vector-list operand and ZPR class.
// The first value follows the memory access size in the mnemonic
// (0b00 = st1b ... 0b11 = st1d); the second follows the vector element size
// (0b00 = b ... 0b11 = d). Only combinations where the element is at least as
// wide as the memory access are defined.
defm ST1B_IMM : sve_mem_cst_si<0b00, 0b00, "st1b", Z_b, ZPR8>;
defm ST1B_H_IMM : sve_mem_cst_si<0b00, 0b01, "st1b", Z_h, ZPR16>;
defm ST1B_S_IMM : sve_mem_cst_si<0b00, 0b10, "st1b", Z_s, ZPR32>;
defm ST1B_D_IMM : sve_mem_cst_si<0b00, 0b11, "st1b", Z_d, ZPR64>;
defm ST1H_IMM : sve_mem_cst_si<0b01, 0b01, "st1h", Z_h, ZPR16>;
defm ST1H_S_IMM : sve_mem_cst_si<0b01, 0b10, "st1h", Z_s, ZPR32>;
defm ST1H_D_IMM : sve_mem_cst_si<0b01, 0b11, "st1h", Z_d, ZPR64>;
defm ST1W_IMM : sve_mem_cst_si<0b10, 0b10, "st1w", Z_s, ZPR32>;
defm ST1W_D_IMM : sve_mem_cst_si<0b10, 0b11, "st1w", Z_d, ZPR64>;
defm ST1D_IMM : sve_mem_cst_si<0b11, 0b11, "st1d", Z_d, ZPR64>;
// contiguous store with reg+reg addressing.
// Here the two 2-bit values of the immediate form appear concatenated into a
// single 4-bit argument (e.g. 0b0110 for st1h with .s elements), followed by
// the mnemonic, vector-list operand, ZPR class and the scalar index register
// class. The index class suffix follows the memory access size, not the
// element size.
defm ST1B : sve_mem_cst_ss<0b0000, "st1b", Z_b, ZPR8, GPR64NoXZRshifted8>;
defm ST1B_H : sve_mem_cst_ss<0b0001, "st1b", Z_h, ZPR16, GPR64NoXZRshifted8>;
defm ST1B_S : sve_mem_cst_ss<0b0010, "st1b", Z_s, ZPR32, GPR64NoXZRshifted8>;
defm ST1B_D : sve_mem_cst_ss<0b0011, "st1b", Z_d, ZPR64, GPR64NoXZRshifted8>;
defm ST1H : sve_mem_cst_ss<0b0101, "st1h", Z_h, ZPR16, GPR64NoXZRshifted16>;
defm ST1H_S : sve_mem_cst_ss<0b0110, "st1h", Z_s, ZPR32, GPR64NoXZRshifted16>;
defm ST1H_D : sve_mem_cst_ss<0b0111, "st1h", Z_d, ZPR64, GPR64NoXZRshifted16>;
defm ST1W : sve_mem_cst_ss<0b1010, "st1w", Z_s, ZPR32, GPR64NoXZRshifted32>;
defm ST1W_D : sve_mem_cst_ss<0b1011, "st1w", Z_d, ZPR64, GPR64NoXZRshifted32>;
defm ST1D : sve_mem_cst_ss<0b1111, "st1d", Z_d, ZPR64, GPR64NoXZRshifted64>;
// Scatters using unscaled 32-bit offsets, e.g.
// st1h z0.s, p0, [x0, z0.s, uxtw]
// and unpacked:
// st1h z0.d, p0, [x0, z0.d, uxtw]
// Arguments: 3-bit opcode value, mnemonic, vector-list operand, ZPR class,
// then the SXTW and UXTW vector-offset operand classes. Even opcodes are the
// unpacked (.d) forms, odd opcodes the packed (.s) forms. Byte stores use
// the "...8Only" offset classes; wider stores use the plain "...8" classes.
defm SST1B_D : sve_mem_sst_sv_32_unscaled<0b000, "st1b", Z_d, ZPR64, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
defm SST1B_S : sve_mem_sst_sv_32_unscaled<0b001, "st1b", Z_s, ZPR32, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
defm SST1H_D : sve_mem_sst_sv_32_unscaled<0b010, "st1h", Z_d, ZPR64, ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
defm SST1H_S : sve_mem_sst_sv_32_unscaled<0b011, "st1h", Z_s, ZPR32, ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
defm SST1W_D : sve_mem_sst_sv_32_unscaled<0b100, "st1w", Z_d, ZPR64, ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
defm SST1W : sve_mem_sst_sv_32_unscaled<0b101, "st1w", Z_s, ZPR32, ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
defm SST1D : sve_mem_sst_sv_32_unscaled<0b110, "st1d", Z_d, ZPR64, ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
// Scatters using scaled 32-bit offsets, e.g.
// st1h z0.s, p0, [x0, z0.s, uxtw #1]
// and unpacked:
// st1h z0.d, p0, [x0, z0.d, uxtw #1]
// Same opcode values as the unscaled forms; the offset operand classes carry
// the access width (16/32/64) instead. There are no byte entries here.
defm SST1H_D : sve_mem_sst_sv_32_scaled<0b010, "st1h", Z_d, ZPR64, ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
defm SST1H_S : sve_mem_sst_sv_32_scaled<0b011, "st1h", Z_s, ZPR32, ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
defm SST1W_D : sve_mem_sst_sv_32_scaled<0b100, "st1w", Z_d, ZPR64, ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
defm SST1W : sve_mem_sst_sv_32_scaled<0b101, "st1w", Z_s, ZPR32, ZPR32ExtSXTW32, ZPR32ExtUXTW32>;
defm SST1D : sve_mem_sst_sv_32_scaled<0b110, "st1d", Z_d, ZPR64, ZPR64ExtSXTW64, ZPR64ExtUXTW64>;
// Scatters using 32/64-bit pointers with offset, e.g.
// st1h z0.s, p0, [z0.s, #16]
// st1h z0.d, p0, [z0.d, #16]
// Arguments: 3-bit opcode value, mnemonic, vector-list operand, ZPR class and
// the immediate operand class (imm0_31 for bytes, uimm5s2/4/8 for wider
// accesses).
defm SST1B_D : sve_mem_sst_vi_ptrs<0b000, "st1b", Z_d, ZPR64, imm0_31>;
defm SST1B_S : sve_mem_sst_vi_ptrs<0b001, "st1b", Z_s, ZPR32, imm0_31>;
defm SST1H_D : sve_mem_sst_vi_ptrs<0b010, "st1h", Z_d, ZPR64, uimm5s2>;
defm SST1H_S : sve_mem_sst_vi_ptrs<0b011, "st1h", Z_s, ZPR32, uimm5s2>;
defm SST1W_D : sve_mem_sst_vi_ptrs<0b100, "st1w", Z_d, ZPR64, uimm5s4>;
defm SST1W : sve_mem_sst_vi_ptrs<0b101, "st1w", Z_s, ZPR32, uimm5s4>;
defm SST1D : sve_mem_sst_vi_ptrs<0b110, "st1d", Z_d, ZPR64, uimm5s8>;
// Scatters using unscaled 64-bit offsets, e.g.
// st1h z0.d, p0, [x0, z0.d]
// Arguments: 2-bit value that follows the access size (0b00 = b ... 0b11 = d)
// and the mnemonic.
defm SST1B_D : sve_mem_sst_sv_64_unscaled<0b00, "st1b">;
defm SST1H_D : sve_mem_sst_sv_64_unscaled<0b01, "st1h">;
defm SST1W_D : sve_mem_sst_sv_64_unscaled<0b10, "st1w">;
defm SST1D : sve_mem_sst_sv_64_unscaled<0b11, "st1d">;
// Scatters using scaled 64-bit offsets, e.g.
// st1h z0.d, p0, [x0, z0.d, lsl #1]
// Unlike the sections above, these records carry an explicit _SCALED suffix
// in their defm names. The extra argument is the LSL-scaled offset operand
// class matching the access width.
defm SST1H_D_SCALED : sve_mem_sst_sv_64_scaled<0b01, "st1h", ZPR64ExtLSL16>;
defm SST1W_D_SCALED : sve_mem_sst_sv_64_scaled<0b10, "st1w", ZPR64ExtLSL32>;
defm SST1D_SCALED : sve_mem_sst_sv_64_scaled<0b11, "st1d", ZPR64ExtLSL64>;
// ST(2|3|4) structured stores (register + immediate)
// Arguments: 2-bit value that follows the element size (0b00 = b ... 0b11 = d),
// 2-bit value that follows the register count (0b01 = 2, 0b10 = 3, 0b11 = 4),
// the vector-list operand, the mnemonic, and the immediate operand class
// (simm4s2/3/4, chosen by register count).
defm ST2B_IMM : sve_mem_est_si<0b00, 0b01, ZZ_b, "st2b", simm4s2>;
defm ST3B_IMM : sve_mem_est_si<0b00, 0b10, ZZZ_b, "st3b", simm4s3>;
defm ST4B_IMM : sve_mem_est_si<0b00, 0b11, ZZZZ_b, "st4b", simm4s4>;
defm ST2H_IMM : sve_mem_est_si<0b01, 0b01, ZZ_h, "st2h", simm4s2>;
defm ST3H_IMM : sve_mem_est_si<0b01, 0b10, ZZZ_h, "st3h", simm4s3>;
defm ST4H_IMM : sve_mem_est_si<0b01, 0b11, ZZZZ_h, "st4h", simm4s4>;
defm ST2W_IMM : sve_mem_est_si<0b10, 0b01, ZZ_s, "st2w", simm4s2>;
defm ST3W_IMM : sve_mem_est_si<0b10, 0b10, ZZZ_s, "st3w", simm4s3>;
defm ST4W_IMM : sve_mem_est_si<0b10, 0b11, ZZZZ_s, "st4w", simm4s4>;
defm ST2D_IMM : sve_mem_est_si<0b11, 0b01, ZZ_d, "st2d", simm4s2>;
defm ST3D_IMM : sve_mem_est_si<0b11, 0b10, ZZZ_d, "st3d", simm4s3>;
defm ST4D_IMM : sve_mem_est_si<0b11, 0b11, ZZZZ_d, "st4d", simm4s4>;
// ST(2|3|4) structured stores (register + register)
// Same first four arguments as the immediate form; the last argument is the
// scalar index register class, whose "shiftedN" suffix matches the element
// width in bits. These are plain defs, not defms.
def ST2B : sve_mem_est_ss<0b00, 0b01, ZZ_b, "st2b", GPR64NoXZRshifted8>;
def ST3B : sve_mem_est_ss<0b00, 0b10, ZZZ_b, "st3b", GPR64NoXZRshifted8>;
def ST4B : sve_mem_est_ss<0b00, 0b11, ZZZZ_b, "st4b", GPR64NoXZRshifted8>;
def ST2H : sve_mem_est_ss<0b01, 0b01, ZZ_h, "st2h", GPR64NoXZRshifted16>;
def ST3H : sve_mem_est_ss<0b01, 0b10, ZZZ_h, "st3h", GPR64NoXZRshifted16>;
def ST4H : sve_mem_est_ss<0b01, 0b11, ZZZZ_h, "st4h", GPR64NoXZRshifted16>;
def ST2W : sve_mem_est_ss<0b10, 0b01, ZZ_s, "st2w", GPR64NoXZRshifted32>;
def ST3W : sve_mem_est_ss<0b10, 0b10, ZZZ_s, "st3w", GPR64NoXZRshifted32>;
def ST4W : sve_mem_est_ss<0b10, 0b11, ZZZZ_s, "st4w", GPR64NoXZRshifted32>;
def ST2D : sve_mem_est_ss<0b11, 0b01, ZZ_d, "st2d", GPR64NoXZRshifted64>;
def ST3D : sve_mem_est_ss<0b11, 0b10, ZZZ_d, "st3d", GPR64NoXZRshifted64>;
def ST4D : sve_mem_est_ss<0b11, 0b11, ZZZZ_d, "st4d", GPR64NoXZRshifted64>;
// Non-temporal contiguous stores (register + immediate)
// Arguments: 2-bit value that follows the element size (0b00 = b ... 0b11 = d),
// mnemonic, the vector-list operand and the matching ZPR register class.
defm STNT1B_ZRI : sve_mem_cstnt_si<0b00, "stnt1b", Z_b, ZPR8>;
defm STNT1H_ZRI : sve_mem_cstnt_si<0b01, "stnt1h", Z_h, ZPR16>;
defm STNT1W_ZRI : sve_mem_cstnt_si<0b10, "stnt1w", Z_s, ZPR32>;
defm STNT1D_ZRI : sve_mem_cstnt_si<0b11, "stnt1d", Z_d, ZPR64>;
// Non-temporal contiguous stores (register + register)
// Same arguments as the immediate form plus the scalar index register class,
// whose "shiftedN" suffix matches the element width in bits.
defm STNT1B_ZRR : sve_mem_cstnt_ss<0b00, "stnt1b", Z_b, ZPR8, GPR64NoXZRshifted8>;
defm STNT1H_ZRR : sve_mem_cstnt_ss<0b01, "stnt1h", Z_h, ZPR16, GPR64NoXZRshifted16>;
defm STNT1W_ZRR : sve_mem_cstnt_ss<0b10, "stnt1w", Z_s, ZPR32, GPR64NoXZRshifted32>;
defm STNT1D_ZRR : sve_mem_cstnt_ss<0b11, "stnt1d", Z_d, ZPR64, GPR64NoXZRshifted64>;
// Fill/Spill
// "ldr" instantiates the fill multiclasses and "str" the spill multiclasses;
// the _Z*/_P* records use the z (vector) and p (predicate) variants
// respectively. Only the mnemonic is passed, so the encoding is fixed by the
// multiclass.
defm LDR_ZXI : sve_mem_z_fill<"ldr">;
defm LDR_PXI : sve_mem_p_fill<"ldr">;
defm STR_ZXI : sve_mem_z_spill<"str">;
defm STR_PXI : sve_mem_p_spill<"str">;
// Contiguous prefetch (register + immediate)
// Arguments: 2-bit value that follows the access size in the mnemonic
// (0b00 = prfb, 0b01 = prfh, 0b10 = prfw, 0b11 = prfd) and the mnemonic.
defm PRFB_PRI : sve_mem_prfm_si<0b00, "prfb">;
defm PRFH_PRI : sve_mem_prfm_si<0b01, "prfh">;
defm PRFW_PRI : sve_mem_prfm_si<0b10, "prfw">;
defm PRFD_PRI : sve_mem_prfm_si<0b11, "prfd">;
// Contiguous prefetch (register + register)
// Arguments: 3-bit opcode value (the 2-bit size value above followed by a 1
// bit), mnemonic, and the scalar index register class.
// NOTE(review): the word-sized record is named PRFS_PRR although its
// mnemonic is "prfw" and its siblings follow the PRFB/PRFH/PRFD pattern.
// Renaming it would change the generated instruction name, so it is left
// as is - confirm whether anything refers to it before touching it.
def PRFB_PRR : sve_mem_prfm_ss<0b001, "prfb", GPR64NoXZRshifted8>;
def PRFH_PRR : sve_mem_prfm_ss<0b011, "prfh", GPR64NoXZRshifted16>;
def PRFS_PRR : sve_mem_prfm_ss<0b101, "prfw", GPR64NoXZRshifted32>;
def PRFD_PRR : sve_mem_prfm_ss<0b111, "prfd", GPR64NoXZRshifted64>;
// Gather prefetch using scaled 32-bit offsets, e.g.
// prfh pldl1keep, p0, [x0, z0.s, uxtw #1]
// Arguments: 2-bit size value, mnemonic, then the SXTW and UXTW
// vector-offset operand classes. The byte form uses the "...8Only" classes.
defm PRFB_S : sve_mem_32b_prfm_sv_scaled<0b00, "prfb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
defm PRFH_S : sve_mem_32b_prfm_sv_scaled<0b01, "prfh", ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
defm PRFW_S : sve_mem_32b_prfm_sv_scaled<0b10, "prfw", ZPR32ExtSXTW32, ZPR32ExtUXTW32>;
defm PRFD_S : sve_mem_32b_prfm_sv_scaled<0b11, "prfd", ZPR32ExtSXTW64, ZPR32ExtUXTW64>;
// Gather prefetch using unpacked, scaled 32-bit offsets, e.g.
// prfh pldl1keep, p0, [x0, z0.d, uxtw #1]
// Same argument layout as the packed form, with ZPR64 offset operand classes.
defm PRFB_D : sve_mem_64b_prfm_sv_ext_scaled<0b00, "prfb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
defm PRFH_D : sve_mem_64b_prfm_sv_ext_scaled<0b01, "prfh", ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
defm PRFW_D : sve_mem_64b_prfm_sv_ext_scaled<0b10, "prfw", ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
defm PRFD_D : sve_mem_64b_prfm_sv_ext_scaled<0b11, "prfd", ZPR64ExtSXTW64, ZPR64ExtUXTW64>;
// Gather prefetch using scaled 64-bit offsets, e.g.
// prfh pldl1keep, p0, [x0, z0.d, lsl #1]
// Arguments: 2-bit size value, mnemonic and the LSL-scaled offset operand
// class matching the access width.
defm PRFB_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b00, "prfb", ZPR64ExtLSL8>;
defm PRFH_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b01, "prfh", ZPR64ExtLSL16>;
defm PRFW_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b10, "prfw", ZPR64ExtLSL32>;
defm PRFD_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b11, "prfd", ZPR64ExtLSL64>;
// Gather prefetch using 32/64-bit pointers with offset, e.g.
// prfh pldl1keep, p0, [z0.s, #16]
// prfh pldl1keep, p0, [z0.d, #16]
// Arguments: 2-bit size value, mnemonic and the immediate operand class
// (imm0_31 for bytes, uimm5s2/4/8 for wider accesses). The _S_ records use
// the 32-bit multiclass and the _D_ records the 64-bit one.
defm PRFB_S_PZI : sve_mem_32b_prfm_vi<0b00, "prfb", imm0_31>;
defm PRFH_S_PZI : sve_mem_32b_prfm_vi<0b01, "prfh", uimm5s2>;
defm PRFW_S_PZI : sve_mem_32b_prfm_vi<0b10, "prfw", uimm5s4>;
defm PRFD_S_PZI : sve_mem_32b_prfm_vi<0b11, "prfd", uimm5s8>;
defm PRFB_D_PZI : sve_mem_64b_prfm_vi<0b00, "prfb", imm0_31>;
defm PRFH_D_PZI : sve_mem_64b_prfm_vi<0b01, "prfh", uimm5s2>;
defm PRFW_D_PZI : sve_mem_64b_prfm_vi<0b10, "prfw", uimm5s4>;
defm PRFD_D_PZI : sve_mem_64b_prfm_vi<0b11, "prfd", uimm5s8>;
// Vector address generation ("adr"). Each of the four variants uses its own
// multiclass (sxtw, uxtw, 32-bit lsl, 64-bit lsl) and a distinct 2-bit opcode.
defm ADR_SXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_sxtw<0b00, "adr">;
defm ADR_UXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_uxtw<0b01, "adr">;
defm ADR_LSL_ZZZ_S : sve_int_bin_cons_misc_0_a_32_lsl<0b10, "adr">;
defm ADR_LSL_ZZZ_D : sve_int_bin_cons_misc_0_a_64_lsl<0b11, "adr">;
// Table lookup; only the mnemonic is passed.
defm TBL_ZZZ : sve_int_perm_tbl<"tbl">;
// Binary permutes. The _ZZZ records use the "zz" multiclass and the _PPP
// records the "pp" multiclass; both use the same 3-bit opcode per mnemonic
// (zip1/zip2/uzp1/uzp2/trn1/trn2 = 0b000..0b101).
defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1">;
defm ZIP2_ZZZ : sve_int_perm_bin_perm_zz<0b001, "zip2">;
defm UZP1_ZZZ : sve_int_perm_bin_perm_zz<0b010, "uzp1">;
defm UZP2_ZZZ : sve_int_perm_bin_perm_zz<0b011, "uzp2">;
defm TRN1_ZZZ : sve_int_perm_bin_perm_zz<0b100, "trn1">;
defm TRN2_ZZZ : sve_int_perm_bin_perm_zz<0b101, "trn2">;
defm ZIP1_PPP : sve_int_perm_bin_perm_pp<0b000, "zip1">;
defm ZIP2_PPP : sve_int_perm_bin_perm_pp<0b001, "zip2">;
defm UZP1_PPP : sve_int_perm_bin_perm_pp<0b010, "uzp1">;
defm UZP2_PPP : sve_int_perm_bin_perm_pp<0b011, "uzp2">;
defm TRN1_PPP : sve_int_perm_bin_perm_pp<0b100, "trn1">;
defm TRN2_PPP : sve_int_perm_bin_perm_pp<0b101, "trn2">;
// Integer compares, vector with vector (same element size).
// Arguments: 3-bit opcode and mnemonic. Opcodes 0b010 and 0b011 are not used
// here; they are taken by the cmpeq/cmpne "wide" forms just below, which
// share the sve_int_cmp_0 name prefix.
defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs">;
defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi">;
defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge">;
defm CMPGT_PPzZZ : sve_int_cmp_0<0b101, "cmpgt">;
defm CMPEQ_PPzZZ : sve_int_cmp_0<0b110, "cmpeq">;
defm CMPNE_PPzZZ : sve_int_cmp_0<0b111, "cmpne">;
// Integer compares against "wide" elements. cmpeq/cmpne use the
// sve_int_cmp_0_wide multiclass; the remaining eight conditions use
// sve_int_cmp_1_wide with all eight 3-bit opcodes.
defm CMPEQ_WIDE_PPzZZ : sve_int_cmp_0_wide<0b010, "cmpeq">;
defm CMPNE_WIDE_PPzZZ : sve_int_cmp_0_wide<0b011, "cmpne">;
defm CMPGE_WIDE_PPzZZ : sve_int_cmp_1_wide<0b000, "cmpge">;
defm CMPGT_WIDE_PPzZZ : sve_int_cmp_1_wide<0b001, "cmpgt">;
defm CMPLT_WIDE_PPzZZ : sve_int_cmp_1_wide<0b010, "cmplt">;
defm CMPLE_WIDE_PPzZZ : sve_int_cmp_1_wide<0b011, "cmple">;
defm CMPHS_WIDE_PPzZZ : sve_int_cmp_1_wide<0b100, "cmphs">;
defm CMPHI_WIDE_PPzZZ : sve_int_cmp_1_wide<0b101, "cmphi">;
defm CMPLO_WIDE_PPzZZ : sve_int_cmp_1_wide<0b110, "cmplo">;
defm CMPLS_WIDE_PPzZZ : sve_int_cmp_1_wide<0b111, "cmpls">;
// Integer compares with an immediate. The ge/gt/lt/le/eq/ne conditions use
// the "scmp" multiclass (3-bit opcode); the hs/hi/lo/ls conditions use the
// "ucmp" multiclass (2-bit opcode).
defm CMPGE_PPzZI : sve_int_scmp_vi<0b000, "cmpge">;
defm CMPGT_PPzZI : sve_int_scmp_vi<0b001, "cmpgt">;
defm CMPLT_PPzZI : sve_int_scmp_vi<0b010, "cmplt">;
defm CMPLE_PPzZI : sve_int_scmp_vi<0b011, "cmple">;
defm CMPEQ_PPzZI : sve_int_scmp_vi<0b100, "cmpeq">;
defm CMPNE_PPzZI : sve_int_scmp_vi<0b101, "cmpne">;
defm CMPHS_PPzZI : sve_int_ucmp_vi<0b00, "cmphs">;
defm CMPHI_PPzZI : sve_int_ucmp_vi<0b01, "cmphi">;
defm CMPLO_PPzZI : sve_int_ucmp_vi<0b10, "cmplo">;
defm CMPLS_PPzZI : sve_int_ucmp_vi<0b11, "cmpls">;
// Floating-point compares, vector with vector (three-operand multiclass).
// Arguments: 3-bit opcode and mnemonic. Opcode 0b110 is not used in this
// list.
defm FCMGE_PPzZZ : sve_fp_3op_p_pd<0b000, "fcmge">;
defm FCMGT_PPzZZ : sve_fp_3op_p_pd<0b001, "fcmgt">;
defm FCMEQ_PPzZZ : sve_fp_3op_p_pd<0b010, "fcmeq">;
defm FCMNE_PPzZZ : sve_fp_3op_p_pd<0b011, "fcmne">;
defm FCMUO_PPzZZ : sve_fp_3op_p_pd<0b100, "fcmuo">;
defm FACGE_PPzZZ : sve_fp_3op_p_pd<0b101, "facge">;
defm FACGT_PPzZZ : sve_fp_3op_p_pd<0b111, "facgt">;
// Floating-point compares using the two-operand multiclass; the "Z0" record
// suffix suggests a comparison against zero (confirm in the multiclass).
// Opcodes 0b101 and 0b111 are not used in this list.
defm FCMGE_PPzZ0 : sve_fp_2op_p_pd<0b000, "fcmge">;
defm FCMGT_PPzZ0 : sve_fp_2op_p_pd<0b001, "fcmgt">;
defm FCMLT_PPzZ0 : sve_fp_2op_p_pd<0b010, "fcmlt">;
defm FCMLE_PPzZ0 : sve_fp_2op_p_pd<0b011, "fcmle">;
defm FCMEQ_PPzZ0 : sve_fp_2op_p_pd<0b100, "fcmeq">;
defm FCMNE_PPzZ0 : sve_fp_2op_p_pd<0b110, "fcmne">;
// WHILE* predicate generation. The _PWW records use the "while4" multiclass
// and the _PXX records the "while8" multiclass; both use the same 3-bit
// opcode per mnemonic (lt = 0b010, le = 0b011, lo = 0b110, ls = 0b111).
defm WHILELT_PWW : sve_int_while4_rr<0b010, "whilelt">;
defm WHILELE_PWW : sve_int_while4_rr<0b011, "whilele">;
defm WHILELO_PWW : sve_int_while4_rr<0b110, "whilelo">;
defm WHILELS_PWW : sve_int_while4_rr<0b111, "whilels">;
defm WHILELT_PXX : sve_int_while8_rr<0b010, "whilelt">;
defm WHILELE_PXX : sve_int_while8_rr<0b011, "whilele">;
defm WHILELO_PXX : sve_int_while8_rr<0b110, "whilelo">;
defm WHILELS_PXX : sve_int_while8_rr<0b111, "whilels">;
// CTERMEQ/CTERMNE. First argument: 0b0 with GPR32 (_WW), 0b1 with GPR64
// (_XX). Second argument: 0b0 for ctermeq, 0b1 for ctermne.
def CTERMEQ_WW : sve_int_cterm<0b0, 0b0, "ctermeq", GPR32>;
def CTERMNE_WW : sve_int_cterm<0b0, 0b1, "ctermne", GPR32>;
def CTERMEQ_XX : sve_int_cterm<0b1, 0b0, "ctermeq", GPR64>;
def CTERMNE_XX : sve_int_cterm<0b1, 0b1, "ctermne", GPR64>;
// RDVL, ADDVL and ADDPL. addvl and addpl share one class and differ only in
// the single opcode bit.
def RDVLI_XI : sve_int_read_vl_a<0b0, 0b11111, "rdvl">;
def ADDVL_XXI : sve_int_arith_vl<0b0, "addvl">;
def ADDPL_XXI : sve_int_arith_vl<0b1, "addpl">;
// Element count instructions (cntb/cnth/cntw/cntd). The upper two bits of
// the 3-bit opcode follow the element size; the low bit is 0 for all four.
defm CNTB_XPiI : sve_int_count<0b000, "cntb">;
defm CNTH_XPiI : sve_int_count<0b010, "cnth">;
defm CNTW_XPiI : sve_int_count<0b100, "cntw">;
defm CNTD_XPiI : sve_int_count<0b110, "cntd">;
// Predicate element count.
defm CNTP_XPP : sve_int_pcount_pred<0b0000, "cntp">;
// Scalar increment/decrement by element count. The upper two opcode bits
// follow the element size (b/h/w/d); the low bit is 0 for inc* and 1 for
// dec*.
defm INCB_XPiI : sve_int_pred_pattern_a<0b000, "incb">;
defm DECB_XPiI : sve_int_pred_pattern_a<0b001, "decb">;
defm INCH_XPiI : sve_int_pred_pattern_a<0b010, "inch">;
defm DECH_XPiI : sve_int_pred_pattern_a<0b011, "dech">;
defm INCW_XPiI : sve_int_pred_pattern_a<0b100, "incw">;
defm DECW_XPiI : sve_int_pred_pattern_a<0b101, "decw">;
defm INCD_XPiI : sve_int_pred_pattern_a<0b110, "incd">;
defm DECD_XPiI : sve_int_pred_pattern_a<0b111, "decd">;
// Saturating scalar increment/decrement by element count.
// The 5-bit opcode is laid out consistently across all 32 entries:
//   bits [4:3]  element size (00 = b, 01 = h, 10 = w, 11 = d)
//   bit  [2]    0 selects the s32/u32 multiclasses, 1 selects x64
//   bit  [1]    0 = inc, 1 = dec
//   bit  [0]    0 = signed (sq*), 1 = unsigned (uq*)
// The 32-bit signed records are suffixed _XPiWdI and the 32-bit unsigned
// records _WPiI; all 64-bit records are suffixed _XPiI.
defm SQINCB_XPiWdI : sve_int_pred_pattern_b_s32<0b00000, "sqincb">;
defm UQINCB_WPiI : sve_int_pred_pattern_b_u32<0b00001, "uqincb">;
defm SQDECB_XPiWdI : sve_int_pred_pattern_b_s32<0b00010, "sqdecb">;
defm UQDECB_WPiI : sve_int_pred_pattern_b_u32<0b00011, "uqdecb">;
defm SQINCB_XPiI : sve_int_pred_pattern_b_x64<0b00100, "sqincb">;
defm UQINCB_XPiI : sve_int_pred_pattern_b_x64<0b00101, "uqincb">;
defm SQDECB_XPiI : sve_int_pred_pattern_b_x64<0b00110, "sqdecb">;
defm UQDECB_XPiI : sve_int_pred_pattern_b_x64<0b00111, "uqdecb">;
defm SQINCH_XPiWdI : sve_int_pred_pattern_b_s32<0b01000, "sqinch">;
defm UQINCH_WPiI : sve_int_pred_pattern_b_u32<0b01001, "uqinch">;
defm SQDECH_XPiWdI : sve_int_pred_pattern_b_s32<0b01010, "sqdech">;
defm UQDECH_WPiI : sve_int_pred_pattern_b_u32<0b01011, "uqdech">;
defm SQINCH_XPiI : sve_int_pred_pattern_b_x64<0b01100, "sqinch">;
defm UQINCH_XPiI : sve_int_pred_pattern_b_x64<0b01101, "uqinch">;
defm SQDECH_XPiI : sve_int_pred_pattern_b_x64<0b01110, "sqdech">;
defm UQDECH_XPiI : sve_int_pred_pattern_b_x64<0b01111, "uqdech">;
defm SQINCW_XPiWdI : sve_int_pred_pattern_b_s32<0b10000, "sqincw">;
defm UQINCW_WPiI : sve_int_pred_pattern_b_u32<0b10001, "uqincw">;
defm SQDECW_XPiWdI : sve_int_pred_pattern_b_s32<0b10010, "sqdecw">;
defm UQDECW_WPiI : sve_int_pred_pattern_b_u32<0b10011, "uqdecw">;
defm SQINCW_XPiI : sve_int_pred_pattern_b_x64<0b10100, "sqincw">;
defm UQINCW_XPiI : sve_int_pred_pattern_b_x64<0b10101, "uqincw">;
defm SQDECW_XPiI : sve_int_pred_pattern_b_x64<0b10110, "sqdecw">;
defm UQDECW_XPiI : sve_int_pred_pattern_b_x64<0b10111, "uqdecw">;
defm SQINCD_XPiWdI : sve_int_pred_pattern_b_s32<0b11000, "sqincd">;
defm UQINCD_WPiI : sve_int_pred_pattern_b_u32<0b11001, "uqincd">;
defm SQDECD_XPiWdI : sve_int_pred_pattern_b_s32<0b11010, "sqdecd">;
defm UQDECD_WPiI : sve_int_pred_pattern_b_u32<0b11011, "uqdecd">;
defm SQINCD_XPiI : sve_int_pred_pattern_b_x64<0b11100, "sqincd">;
defm UQINCD_XPiI : sve_int_pred_pattern_b_x64<0b11101, "uqincd">;
defm SQDECD_XPiI : sve_int_pred_pattern_b_x64<0b11110, "sqdecd">;
defm UQDECD_XPiI : sve_int_pred_pattern_b_x64<0b11111, "uqdecd">;
// Vector increment/decrement by element count (saturating and plain).
// Arguments: 5-bit opcode, mnemonic and the ZPR register class. The top two
// opcode bits follow the element size (01 = h, 10 = w, 11 = d); there are no
// byte entries. Within each size the low three bits select
// sqinc/uqinc/sqdec/uqdec/inc/dec (000, 001, 010, 011, 100, 101).
defm SQINCH_ZPiI : sve_int_countvlv<0b01000, "sqinch", ZPR16>;
defm UQINCH_ZPiI : sve_int_countvlv<0b01001, "uqinch", ZPR16>;
defm SQDECH_ZPiI : sve_int_countvlv<0b01010, "sqdech", ZPR16>;
defm UQDECH_ZPiI : sve_int_countvlv<0b01011, "uqdech", ZPR16>;
defm INCH_ZPiI : sve_int_countvlv<0b01100, "inch", ZPR16>;
defm DECH_ZPiI : sve_int_countvlv<0b01101, "dech", ZPR16>;
defm SQINCW_ZPiI : sve_int_countvlv<0b10000, "sqincw", ZPR32>;
defm UQINCW_ZPiI : sve_int_countvlv<0b10001, "uqincw", ZPR32>;
defm SQDECW_ZPiI : sve_int_countvlv<0b10010, "sqdecw", ZPR32>;
defm UQDECW_ZPiI : sve_int_countvlv<0b10011, "uqdecw", ZPR32>;
defm INCW_ZPiI : sve_int_countvlv<0b10100, "incw", ZPR32>;
defm DECW_ZPiI : sve_int_countvlv<0b10101, "decw", ZPR32>;
defm SQINCD_ZPiI : sve_int_countvlv<0b11000, "sqincd", ZPR64>;
defm UQINCD_ZPiI : sve_int_countvlv<0b11001, "uqincd", ZPR64>;
defm SQDECD_ZPiI : sve_int_countvlv<0b11010, "sqdecd", ZPR64>;
defm UQDECD_ZPiI : sve_int_countvlv<0b11011, "uqdecd", ZPR64>;
defm INCD_ZPiI : sve_int_countvlv<0b11100, "incd", ZPR64>;
defm DECD_ZPiI : sve_int_countvlv<0b11101, "decd", ZPR64>;
// INCP/DECP family, scalar destination forms (sve_int_count_r_*).
// Arguments: 5-bit opcode and mnemonic. The s32/u32 records and the x64
// record for the same mnemonic differ only in opcode bit 1 (e.g. sqincp:
// 0b00000 for s32, 0b00010 for x64).
defm SQINCP_XPWd : sve_int_count_r_s32<0b00000, "sqincp">;
defm SQINCP_XP : sve_int_count_r_x64<0b00010, "sqincp">;
defm UQINCP_WP : sve_int_count_r_u32<0b00100, "uqincp">;
defm UQINCP_XP : sve_int_count_r_x64<0b00110, "uqincp">;
defm SQDECP_XPWd : sve_int_count_r_s32<0b01000, "sqdecp">;
defm SQDECP_XP : sve_int_count_r_x64<0b01010, "sqdecp">;
defm UQDECP_WP : sve_int_count_r_u32<0b01100, "uqdecp">;
defm UQDECP_XP : sve_int_count_r_x64<0b01110, "uqdecp">;
defm INCP_XP : sve_int_count_r_x64<0b10000, "incp">;
defm DECP_XP : sve_int_count_r_x64<0b10100, "decp">;
// INCP/DECP family, vector destination forms (sve_int_count_v). Each
// mnemonic uses the same opcode value as the first scalar record for that
// mnemonic above.
defm SQINCP_ZP : sve_int_count_v<0b00000, "sqincp">;
defm UQINCP_ZP : sve_int_count_v<0b00100, "uqincp">;
defm SQDECP_ZP : sve_int_count_v<0b01000, "sqdecp">;
defm UQDECP_ZP : sve_int_count_v<0b01100, "uqdecp">;
defm INCP_ZP : sve_int_count_v<0b10000, "incp">;
defm DECP_ZP : sve_int_count_v<0b10100, "decp">;
// INDEX. One multiclass per operand combination; the record suffix
// (RR/IR/RI/II) matches the multiclass suffix (rr/ir/ri/ii), presumably
// R = register and I = immediate - confirm in the multiclass definitions.
defm INDEX_RR : sve_int_index_rr<"index">;
defm INDEX_IR : sve_int_index_ir<"index">;
defm INDEX_RI : sve_int_index_ri<"index">;
defm INDEX_II : sve_int_index_ii<"index">;
// Unpredicated shifts
// Immediate forms: asr and lsr use the "..._imm_right" multiclass, lsl uses
// "..._imm_left". The wide-operand forms use the same 2-bit opcodes
// (asr = 0b00, lsr = 0b01, lsl = 0b11); 0b10 is not used here.
defm ASR_ZZI : sve_int_bin_cons_shift_imm_right<0b00, "asr">;
defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr">;
defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl">;
defm ASR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b00, "asr">;
defm LSR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b01, "lsr">;
defm LSL_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b11, "lsl">;
// Predicated shifts
// Immediate forms (4-bit opcode); asrd shares the right-shift multiclass
// with asr and lsr.
defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0000, "asr">;
defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0001, "lsr">;
defm LSL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0011, "lsl">;
defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<0b0100, "asrd">;
// Vector forms (3-bit opcode). The asrr/lsrr/lslr mnemonics use the opcode
// of the corresponding asr/lsr/lsl with the top bit set.
defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr">;
defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr">;
defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl">;
defm ASRR_ZPmZ : sve_int_bin_pred_shift<0b100, "asrr">;
defm LSRR_ZPmZ : sve_int_bin_pred_shift<0b101, "lsrr">;
defm LSLR_ZPmZ : sve_int_bin_pred_shift<0b111, "lslr">;
// Wide-operand predicated forms; same opcodes as the asr/lsr/lsl vector
// forms above.
defm ASR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b000, "asr">;
defm LSR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b001, "lsr">;
defm LSL_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b011, "lsl">;
// Predicated floating-point / integer conversions.
// Records are named <OP>_ZPmZ_<X>to<Y> with X, Y in {H, S, D}.
// Arguments: 7-bit opcode, mnemonic, two ZPR register classes and an
// ElementSize. The first register class corresponds to X and the second to
// Y (H = ZPR16, S = ZPR32, D = ZPR64). In every entry the ElementSize
// argument matches the wider of the two register classes.
// NOTE(review): which of the two classes is the source and which the
// destination is decided by sve_fp_2op_p_zd, defined elsewhere - confirm
// there before relying on the argument order.
// The entries are not sorted by opcode or by mnemonic; the order is kept
// as is.
def FCVT_ZPmZ_StoH : sve_fp_2op_p_zd<0b1001000, "fcvt", ZPR32, ZPR16, ElementSizeS>;
def FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd<0b1001001, "fcvt", ZPR16, ZPR32, ElementSizeS>;
def SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110010, "scvtf", ZPR16, ZPR16, ElementSizeH>;
def SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010100, "scvtf", ZPR32, ZPR32, ElementSizeS>;
def UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010101, "ucvtf", ZPR32, ZPR32, ElementSizeS>;
def UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110011, "ucvtf", ZPR16, ZPR16, ElementSizeH>;
def FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111010, "fcvtzs", ZPR16, ZPR16, ElementSizeH>;
def FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011100, "fcvtzs", ZPR32, ZPR32, ElementSizeS>;
def FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111011, "fcvtzu", ZPR16, ZPR16, ElementSizeH>;
def FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011101, "fcvtzu", ZPR32, ZPR32, ElementSizeS>;
def FCVT_ZPmZ_DtoH : sve_fp_2op_p_zd<0b1101000, "fcvt", ZPR64, ZPR16, ElementSizeD>;
def FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd<0b1101001, "fcvt", ZPR16, ZPR64, ElementSizeD>;
def FCVT_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1101010, "fcvt", ZPR64, ZPR32, ElementSizeD>;
def FCVT_ZPmZ_StoD : sve_fp_2op_p_zd<0b1101011, "fcvt", ZPR32, ZPR64, ElementSizeD>;
def SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110000, "scvtf", ZPR32, ZPR64, ElementSizeD>;
def UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110001, "ucvtf", ZPR32, ZPR64, ElementSizeD>;
def UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110101, "ucvtf", ZPR32, ZPR16, ElementSizeS>;
def SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110100, "scvtf", ZPR64, ZPR32, ElementSizeD>;
def SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110100, "scvtf", ZPR32, ZPR16, ElementSizeS>;
def SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110110, "scvtf", ZPR64, ZPR16, ElementSizeD>;
def UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110101, "ucvtf", ZPR64, ZPR32, ElementSizeD>;
def UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110111, "ucvtf", ZPR64, ZPR16, ElementSizeD>;
def SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110110, "scvtf", ZPR64, ZPR64, ElementSizeD>;
def UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110111, "ucvtf", ZPR64, ZPR64, ElementSizeD>;
def FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111000, "fcvtzs", ZPR64, ZPR32, ElementSizeD>;
def FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111001, "fcvtzu", ZPR64, ZPR32, ElementSizeD>;
def FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111100, "fcvtzs", ZPR32, ZPR64, ElementSizeD>;
def FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111100, "fcvtzs", ZPR16, ZPR32, ElementSizeS>;
def FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111110, "fcvtzs", ZPR16, ZPR64, ElementSizeD>;
def FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111101, "fcvtzu", ZPR16, ZPR32, ElementSizeS>;
def FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111111, "fcvtzu", ZPR16, ZPR64, ElementSizeD>;
def FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111101, "fcvtzu", ZPR32, ZPR64, ElementSizeD>;
def FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111110, "fcvtzs", ZPR64, ZPR64, ElementSizeD>;
def FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111111, "fcvtzu", ZPR64, ZPR64, ElementSizeD>;
// Predicated floating-point unary operations instantiated through the _HSD
// multiclass (the suffix suggests one record per H/S/D element size -
// confirm in the multiclass). Arguments: 5-bit opcode and mnemonic.
// Opcode 0b00101 is not used in the frint* run below.
defm FRINTN_ZPmZ : sve_fp_2op_p_zd_HSD<0b00000, "frintn">;
defm FRINTP_ZPmZ : sve_fp_2op_p_zd_HSD<0b00001, "frintp">;
defm FRINTM_ZPmZ : sve_fp_2op_p_zd_HSD<0b00010, "frintm">;
defm FRINTZ_ZPmZ : sve_fp_2op_p_zd_HSD<0b00011, "frintz">;
defm FRINTA_ZPmZ : sve_fp_2op_p_zd_HSD<0b00100, "frinta">;
defm FRINTX_ZPmZ : sve_fp_2op_p_zd_HSD<0b00110, "frintx">;
defm FRINTI_ZPmZ : sve_fp_2op_p_zd_HSD<0b00111, "frinti">;
defm FRECPX_ZPmZ : sve_fp_2op_p_zd_HSD<0b01100, "frecpx">;
defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt">;
// InstAliases
// Each alias maps an assembly string onto an existing instruction by
// repeating one of the alias operands in the result's operand list. All
// aliases in this group pass 1 as the last InstAlias argument.
// NOTE(review): that argument is presumably InstAlias's "Emit" flag (whether
// the alias is preferred when printing) - confirm against the InstAlias
// class.

// mov Zd, Zn  ==  ORR_ZZZ with $Zn used for both source operands.
def : InstAlias<"mov $Zd, $Zn",
(ORR_ZZZ ZPR64:$Zd, ZPR64:$Zn, ZPR64:$Zn), 1>;
// mov Pd, Pg/m, Pn  ==  SEL_PPPP with $Pd repeated as the last operand.
def : InstAlias<"mov $Pd, $Pg/m, $Pn",
(SEL_PPPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPR8:$Pd), 1>;
// mov Pd, Pn  ==  ORR_PPzPP with $Pn used for the remaining three operands.
def : InstAlias<"mov $Pd, $Pn",
(ORR_PPzPP PPR8:$Pd, PPR8:$Pn, PPR8:$Pn, PPR8:$Pn), 1>;
// mov Pd, Pg/z, Pn  ==  AND_PPzPP with $Pn used for the last two operands.
def : InstAlias<"mov $Pd, $Pg/z, $Pn",
(AND_PPzPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPR8:$Pn), 1>;
// movs: same operand patterns as the two mov aliases above, mapped onto the
// ORRS/ANDS instructions.
def : InstAlias<"movs $Pd, $Pn",
(ORRS_PPzPP PPR8:$Pd, PPR8:$Pn, PPR8:$Pn, PPR8:$Pn), 1>;
def : InstAlias<"movs $Pd, $Pg/z, $Pn",
(ANDS_PPzPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPR8:$Pn), 1>;
// not/nots Pd, Pg/z, Pn  ==  EOR/EORS with $Pg repeated as the last operand.
def : InstAlias<"not $Pd, $Pg/z, $Pn",
(EOR_PPzPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPRAny:$Pg), 1>;
def : InstAlias<"nots $Pd, $Pg/z, $Pn",
(EORS_PPzPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPRAny:$Pg), 1>;
// Reversed-operand compare aliases.
// Each alias's assembly string lists "$Zm, $Zn" while the result instruction
// receives "$Zn, $Zm", i.e. the two vector operands are swapped. The "less"
// mnemonics are thereby expressed through the existing "greater" ones:
//   cmple -> CMPGE   cmplo -> CMPHI   cmpls -> CMPHS   cmplt -> CMPGT
//   facle -> FACGE   faclt -> FACGT   fcmle -> FCMGE
// The integer aliases are provided for B/H/S/D; the floating-point ones for
// H/S/D. All of them pass 0 as the last InstAlias argument, unlike the
// mov/not aliases earlier in this file which pass 1.
// NOTE(review): presumably 0 makes these aliases parse-only - confirm
// against the InstAlias class.
def : InstAlias<"cmple $Zd, $Pg/z, $Zm, $Zn",
(CMPGE_PPzZZ_B PPR8:$Zd, PPR3bAny:$Pg, ZPR8:$Zn, ZPR8:$Zm), 0>;
def : InstAlias<"cmple $Zd, $Pg/z, $Zm, $Zn",
(CMPGE_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
def : InstAlias<"cmple $Zd, $Pg/z, $Zm, $Zn",
(CMPGE_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
def : InstAlias<"cmple $Zd, $Pg/z, $Zm, $Zn",
(CMPGE_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
def : InstAlias<"cmplo $Zd, $Pg/z, $Zm, $Zn",
(CMPHI_PPzZZ_B PPR8:$Zd, PPR3bAny:$Pg, ZPR8:$Zn, ZPR8:$Zm), 0>;
def : InstAlias<"cmplo $Zd, $Pg/z, $Zm, $Zn",
(CMPHI_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
def : InstAlias<"cmplo $Zd, $Pg/z, $Zm, $Zn",
(CMPHI_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
def : InstAlias<"cmplo $Zd, $Pg/z, $Zm, $Zn",
(CMPHI_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
def : InstAlias<"cmpls $Zd, $Pg/z, $Zm, $Zn",
(CMPHS_PPzZZ_B PPR8:$Zd, PPR3bAny:$Pg, ZPR8:$Zn, ZPR8:$Zm), 0>;
def : InstAlias<"cmpls $Zd, $Pg/z, $Zm, $Zn",
(CMPHS_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
def : InstAlias<"cmpls $Zd, $Pg/z, $Zm, $Zn",
(CMPHS_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
def : InstAlias<"cmpls $Zd, $Pg/z, $Zm, $Zn",
(CMPHS_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
def : InstAlias<"cmplt $Zd, $Pg/z, $Zm, $Zn",
(CMPGT_PPzZZ_B PPR8:$Zd, PPR3bAny:$Pg, ZPR8:$Zn, ZPR8:$Zm), 0>;
def : InstAlias<"cmplt $Zd, $Pg/z, $Zm, $Zn",
(CMPGT_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
def : InstAlias<"cmplt $Zd, $Pg/z, $Zm, $Zn",
(CMPGT_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
def : InstAlias<"cmplt $Zd, $Pg/z, $Zm, $Zn",
(CMPGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
def : InstAlias<"facle $Zd, $Pg/z, $Zm, $Zn",
(FACGE_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
def : InstAlias<"facle $Zd, $Pg/z, $Zm, $Zn",
(FACGE_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
def : InstAlias<"facle $Zd, $Pg/z, $Zm, $Zn",
(FACGE_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
def : InstAlias<"faclt $Zd, $Pg/z, $Zm, $Zn",
(FACGT_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
def : InstAlias<"faclt $Zd, $Pg/z, $Zm, $Zn",
(FACGT_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
def : InstAlias<"faclt $Zd, $Pg/z, $Zm, $Zn",
(FACGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
def : InstAlias<"fcmle $Zd, $Pg/z, $Zm, $Zn",
(FCMGE_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
def : InstAlias<"fcmle $Zd, $Pg/z, $Zm, $Zn",
(FCMGE_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
def : InstAlias<"fcmle $Zd, $Pg/z, $Zm, $Zn",
(FCMGE_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn",
(FCMGT_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn",
(FCMGT_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn",
(FCMGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
}
let Predicates = [HasSVE2] in {
// SVE2 integer multiply-add (indexed)
defm MLA_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b0, "mla">;
defm MLS_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b1, "mls">;
// SVE2 saturating multiply-add high (indexed)
defm SQRDMLAH_ZZZI : sve2_int_mla_by_indexed_elem<0b10, 0b0, "sqrdmlah">;
defm SQRDMLSH_ZZZI : sve2_int_mla_by_indexed_elem<0b10, 0b1, "sqrdmlsh">;
// SVE2 saturating multiply-add high (vectors, unpredicated)
defm SQRDMLAH_ZZZ : sve2_int_mla<0b0, "sqrdmlah">;
defm SQRDMLSH_ZZZ : sve2_int_mla<0b1, "sqrdmlsh">;
// SVE2 integer multiply (indexed)
defm MUL_ZZZI : sve2_int_mul_by_indexed_elem<0b1110, "mul">;
// SVE2 saturating multiply high (indexed)
defm SQDMULH_ZZZI : sve2_int_mul_by_indexed_elem<0b1100, "sqdmulh">;
defm SQRDMULH_ZZZI : sve2_int_mul_by_indexed_elem<0b1101, "sqrdmulh">;
// SVE2 signed saturating doubling multiply high (unpredicated)
defm SQDMULH_ZZZ : sve2_int_mul<0b100, "sqdmulh">;
defm SQRDMULH_ZZZ : sve2_int_mul<0b101, "sqrdmulh">;
// SVE2 integer multiply vectors (unpredicated)
defm MUL_ZZZ : sve2_int_mul<0b000, "mul">;
defm SMULH_ZZZ : sve2_int_mul<0b010, "smulh">;
defm UMULH_ZZZ : sve2_int_mul<0b011, "umulh">;
def PMUL_ZZZ_B : sve2_int_mul<0b00, 0b001, "pmul", ZPR8>;
// SVE2 complex integer dot product (indexed)
defm CDOT_ZZZI : sve2_cintx_dot_by_indexed_elem<"cdot">;
// SVE2 complex integer dot product
defm CDOT_ZZZ : sve2_cintx_dot<"cdot">;
// SVE2 complex integer multiply-add (indexed)
defm CMLA_ZZZI : sve2_cmla_by_indexed_elem<0b0, "cmla">;
// SVE2 complex saturating multiply-add (indexed)
defm SQRDCMLAH_ZZZI : sve2_cmla_by_indexed_elem<0b1, "sqrdcmlah">;
// SVE2 complex integer multiply-add
defm CMLA_ZZZ : sve2_int_cmla<0b0, "cmla">;
defm SQRDCMLAH_ZZZ : sve2_int_cmla<0b1, "sqrdcmlah">;
// SVE2 integer multiply long (indexed)
defm SMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b000, "smullb">;
defm SMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b001, "smullt">;
defm UMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b010, "umullb">;
defm UMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b011, "umullt">;
// SVE2 saturating multiply (indexed)
defm SQDMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b100, "sqdmullb">;
defm SQDMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b101, "sqdmullt">;
// SVE2 integer multiply-add long (indexed)
defm SMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1000, "smlalb">;
defm SMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1001, "smlalt">;
defm UMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1010, "umlalb">;
defm UMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1011, "umlalt">;
defm SMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1100, "smlslb">;
defm SMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1101, "smlslt">;
defm UMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1110, "umlslb">;
defm UMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1111, "umlslt">;
// SVE2 integer multiply-add long (vectors, unpredicated)
defm SMLALB_ZZZ : sve2_int_mla_long<0b10000, "smlalb">;
defm SMLALT_ZZZ : sve2_int_mla_long<0b10001, "smlalt">;
defm UMLALB_ZZZ : sve2_int_mla_long<0b10010, "umlalb">;
defm UMLALT_ZZZ : sve2_int_mla_long<0b10011, "umlalt">;
defm SMLSLB_ZZZ : sve2_int_mla_long<0b10100, "smlslb">;
defm SMLSLT_ZZZ : sve2_int_mla_long<0b10101, "smlslt">;
defm UMLSLB_ZZZ : sve2_int_mla_long<0b10110, "umlslb">;
defm UMLSLT_ZZZ : sve2_int_mla_long<0b10111, "umlslt">;
// SVE2 saturating multiply-add long (indexed)
defm SQDMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0100, "sqdmlalb">;
defm SQDMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0101, "sqdmlalt">;
defm SQDMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0110, "sqdmlslb">;
defm SQDMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0111, "sqdmlslt">;
// SVE2 saturating multiply-add long (vectors, unpredicated)
defm SQDMLALB_ZZZ : sve2_int_mla_long<0b11000, "sqdmlalb">;
defm SQDMLALT_ZZZ : sve2_int_mla_long<0b11001, "sqdmlalt">;
defm SQDMLSLB_ZZZ : sve2_int_mla_long<0b11010, "sqdmlslb">;
defm SQDMLSLT_ZZZ : sve2_int_mla_long<0b11011, "sqdmlslt">;
// SVE2 saturating multiply-add interleaved long
defm SQDMLALBT_ZZZ : sve2_int_mla_long<0b00010, "sqdmlalbt">;
defm SQDMLSLBT_ZZZ : sve2_int_mla_long<0b00011, "sqdmlslbt">;
// SVE2 integer halving add/subtract (predicated)
defm SHADD_ZPmZ : sve2_int_arith_pred<0b100000, "shadd">;
defm UHADD_ZPmZ : sve2_int_arith_pred<0b100010, "uhadd">;
defm SHSUB_ZPmZ : sve2_int_arith_pred<0b100100, "shsub">;
defm UHSUB_ZPmZ : sve2_int_arith_pred<0b100110, "uhsub">;
defm SRHADD_ZPmZ : sve2_int_arith_pred<0b101000, "srhadd">;
defm URHADD_ZPmZ : sve2_int_arith_pred<0b101010, "urhadd">;
defm SHSUBR_ZPmZ : sve2_int_arith_pred<0b101100, "shsubr">;
defm UHSUBR_ZPmZ : sve2_int_arith_pred<0b101110, "uhsubr">;
// SVE2 integer pairwise add and accumulate long
defm SADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<0, "sadalp">;
defm UADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<1, "uadalp">;
// SVE2 integer pairwise arithmetic
defm ADDP_ZPmZ : sve2_int_arith_pred<0b100011, "addp">;
defm SMAXP_ZPmZ : sve2_int_arith_pred<0b101001, "smaxp">;
defm UMAXP_ZPmZ : sve2_int_arith_pred<0b101011, "umaxp">;
defm SMINP_ZPmZ : sve2_int_arith_pred<0b101101, "sminp">;
defm UMINP_ZPmZ : sve2_int_arith_pred<0b101111, "uminp">;
// SVE2 integer unary operations (predicated)
defm URECPE_ZPmZ : sve2_int_un_pred_arit_s<0b000, "urecpe">;
defm URSQRTE_ZPmZ : sve2_int_un_pred_arit_s<0b001, "ursqrte">;
defm SQABS_ZPmZ : sve2_int_un_pred_arit<0b100, "sqabs">;
defm SQNEG_ZPmZ : sve2_int_un_pred_arit<0b101, "sqneg">;
// SVE2 saturating add/subtract
defm SQADD_ZPmZ : sve2_int_arith_pred<0b110000, "sqadd">;
defm UQADD_ZPmZ : sve2_int_arith_pred<0b110010, "uqadd">;
defm SQSUB_ZPmZ : sve2_int_arith_pred<0b110100, "sqsub">;
defm UQSUB_ZPmZ : sve2_int_arith_pred<0b110110, "uqsub">;
defm SUQADD_ZPmZ : sve2_int_arith_pred<0b111000, "suqadd">;
defm USQADD_ZPmZ : sve2_int_arith_pred<0b111010, "usqadd">;
defm SQSUBR_ZPmZ : sve2_int_arith_pred<0b111100, "sqsubr">;
defm UQSUBR_ZPmZ : sve2_int_arith_pred<0b111110, "uqsubr">;
// SVE2 saturating/rounding bitwise shift left (predicated)
defm SRSHL_ZPmZ : sve2_int_arith_pred<0b000100, "srshl">;
defm URSHL_ZPmZ : sve2_int_arith_pred<0b000110, "urshl">;
defm SRSHLR_ZPmZ : sve2_int_arith_pred<0b001100, "srshlr">;
defm URSHLR_ZPmZ : sve2_int_arith_pred<0b001110, "urshlr">;
defm SQSHL_ZPmZ : sve2_int_arith_pred<0b010000, "sqshl">;
defm UQSHL_ZPmZ : sve2_int_arith_pred<0b010010, "uqshl">;
defm SQRSHL_ZPmZ : sve2_int_arith_pred<0b010100, "sqrshl">;
defm UQRSHL_ZPmZ : sve2_int_arith_pred<0b010110, "uqrshl">;
defm SQSHLR_ZPmZ : sve2_int_arith_pred<0b011000, "sqshlr">;
defm UQSHLR_ZPmZ : sve2_int_arith_pred<0b011010, "uqshlr">;
defm SQRSHLR_ZPmZ : sve2_int_arith_pred<0b011100, "sqrshlr">;
defm UQRSHLR_ZPmZ : sve2_int_arith_pred<0b011110, "uqrshlr">;
+ // SVE2 predicated shifts
+ defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl">;
+ defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl">;
+ defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr">;
+ defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr">;
+ defm SQSHLU_ZPmI : sve_int_bin_pred_shift_imm_left< 0b1111, "sqshlu">;
+
// SVE2 integer add/subtract long
defm SADDLB_ZZZ : sve2_wide_int_arith_long<0b00000, "saddlb">;
defm SADDLT_ZZZ : sve2_wide_int_arith_long<0b00001, "saddlt">;
defm UADDLB_ZZZ : sve2_wide_int_arith_long<0b00010, "uaddlb">;
defm UADDLT_ZZZ : sve2_wide_int_arith_long<0b00011, "uaddlt">;
defm SSUBLB_ZZZ : sve2_wide_int_arith_long<0b00100, "ssublb">;
defm SSUBLT_ZZZ : sve2_wide_int_arith_long<0b00101, "ssublt">;
defm USUBLB_ZZZ : sve2_wide_int_arith_long<0b00110, "usublb">;
defm USUBLT_ZZZ : sve2_wide_int_arith_long<0b00111, "usublt">;
defm SABDLB_ZZZ : sve2_wide_int_arith_long<0b01100, "sabdlb">;
defm SABDLT_ZZZ : sve2_wide_int_arith_long<0b01101, "sabdlt">;
defm UABDLB_ZZZ : sve2_wide_int_arith_long<0b01110, "uabdlb">;
defm UABDLT_ZZZ : sve2_wide_int_arith_long<0b01111, "uabdlt">;
// SVE2 integer add/subtract wide
defm SADDWB_ZZZ : sve2_wide_int_arith_wide<0b000, "saddwb">;
defm SADDWT_ZZZ : sve2_wide_int_arith_wide<0b001, "saddwt">;
defm UADDWB_ZZZ : sve2_wide_int_arith_wide<0b010, "uaddwb">;
defm UADDWT_ZZZ : sve2_wide_int_arith_wide<0b011, "uaddwt">;
defm SSUBWB_ZZZ : sve2_wide_int_arith_wide<0b100, "ssubwb">;
defm SSUBWT_ZZZ : sve2_wide_int_arith_wide<0b101, "ssubwt">;
defm USUBWB_ZZZ : sve2_wide_int_arith_wide<0b110, "usubwb">;
defm USUBWT_ZZZ : sve2_wide_int_arith_wide<0b111, "usubwt">;
// SVE2 integer multiply long
defm SQDMULLB_ZZZ : sve2_wide_int_arith_long<0b11000, "sqdmullb">;
defm SQDMULLT_ZZZ : sve2_wide_int_arith_long<0b11001, "sqdmullt">;
defm SMULLB_ZZZ : sve2_wide_int_arith_long<0b11100, "smullb">;
defm SMULLT_ZZZ : sve2_wide_int_arith_long<0b11101, "smullt">;
defm UMULLB_ZZZ : sve2_wide_int_arith_long<0b11110, "umullb">;
defm UMULLT_ZZZ : sve2_wide_int_arith_long<0b11111, "umullt">;
defm PMULLB_ZZZ : sve2_pmul_long<0b0, "pmullb">;
defm PMULLT_ZZZ : sve2_pmul_long<0b1, "pmullt">;
// SVE2 bitwise shift and insert
- defm SRI_ZZI : sve2_int_bin_cons_shift_imm_right<0b0, "sri">;
- defm SLI_ZZI : sve2_int_bin_cons_shift_imm_left< 0b1, "sli">;
+ defm SRI_ZZI : sve2_int_bin_shift_imm_right<0b0, "sri">;
+ defm SLI_ZZI : sve2_int_bin_shift_imm_left< 0b1, "sli">;
// SVE2 bitwise shift right and accumulate
- defm SSRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b00, "ssra">;
- defm USRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b01, "usra">;
- defm SRSRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b10, "srsra">;
- defm URSRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b11, "ursra">;
+ defm SSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b00, "ssra">;
+ defm USRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b01, "usra">;
+ defm SRSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b10, "srsra">;
+ defm URSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b11, "ursra">;
// SVE2 complex integer add
defm CADD_ZZI : sve2_int_cadd<0b0, "cadd">;
defm SQCADD_ZZI : sve2_int_cadd<0b1, "sqcadd">;
// SVE2 integer absolute difference and accumulate
defm SABA_ZZZ : sve2_int_absdiff_accum<0b0, "saba">;
defm UABA_ZZZ : sve2_int_absdiff_accum<0b1, "uaba">;
// SVE2 integer absolute difference and accumulate long
defm SABALB_ZZZ : sve2_int_absdiff_accum_long<0b00, "sabalb">;
defm SABALT_ZZZ : sve2_int_absdiff_accum_long<0b01, "sabalt">;
defm UABALB_ZZZ : sve2_int_absdiff_accum_long<0b10, "uabalb">;
defm UABALT_ZZZ : sve2_int_absdiff_accum_long<0b11, "uabalt">;
// SVE2 integer add/subtract long with carry
defm ADCLB_ZZZ : sve2_int_addsub_long_carry<0b00, "adclb">;
defm ADCLT_ZZZ : sve2_int_addsub_long_carry<0b01, "adclt">;
defm SBCLB_ZZZ : sve2_int_addsub_long_carry<0b10, "sbclb">;
defm SBCLT_ZZZ : sve2_int_addsub_long_carry<0b11, "sbclt">;
- // SVE2 bitwise shift right narrow
- defm SQSHRUNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0000, "sqshrunb">;
- defm SQSHRUNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0001, "sqshrunt">;
- defm SQRSHRUNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0010, "sqrshrunb">;
- defm SQRSHRUNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0011, "sqrshrunt">;
- defm SHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0100, "shrnb">;
- defm SHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0101, "shrnt">;
- defm RSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0110, "rshrnb">;
- defm RSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0111, "rshrnt">;
- defm SQSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1000, "sqshrnb">;
- defm SQSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1001, "sqshrnt">;
- defm SQRSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1010, "sqrshrnb">;
- defm SQRSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1011, "sqrshrnt">;
- defm UQSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1100, "uqshrnb">;
- defm UQSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1101, "uqshrnt">;
- defm UQRSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1110, "uqrshrnb">;
- defm UQRSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1111, "uqrshrnt">;
+ // SVE2 bitwise shift right narrow (bottom)
+ defm SQSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b000, "sqshrunb">;
+ defm SQRSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b001, "sqrshrunb">;
+ defm SHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b010, "shrnb">;
+ defm RSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b011, "rshrnb">;
+ defm SQSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b100, "sqshrnb">;
+ defm SQRSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b101, "sqrshrnb">;
+ defm UQSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b110, "uqshrnb">;
+ defm UQRSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b111, "uqrshrnb">;
- // SVE2 integer add/subtract narrow high part
- defm ADDHNB_ZZZ : sve2_int_addsub_narrow_high<0b000, "addhnb">;
- defm ADDHNT_ZZZ : sve2_int_addsub_narrow_high<0b001, "addhnt">;
- defm RADDHNB_ZZZ : sve2_int_addsub_narrow_high<0b010, "raddhnb">;
- defm RADDHNT_ZZZ : sve2_int_addsub_narrow_high<0b011, "raddhnt">;
- defm SUBHNB_ZZZ : sve2_int_addsub_narrow_high<0b100, "subhnb">;
- defm SUBHNT_ZZZ : sve2_int_addsub_narrow_high<0b101, "subhnt">;
- defm RSUBHNB_ZZZ : sve2_int_addsub_narrow_high<0b110, "rsubhnb">;
- defm RSUBHNT_ZZZ : sve2_int_addsub_narrow_high<0b111, "rsubhnt">;
+ // SVE2 bitwise shift right narrow (top)
+ defm SQSHRUNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b000, "sqshrunt">;
+ defm SQRSHRUNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b001, "sqrshrunt">;
+ defm SHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b010, "shrnt">;
+ defm RSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b011, "rshrnt">;
+ defm SQSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b100, "sqshrnt">;
+ defm SQRSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b101, "sqrshrnt">;
+ defm UQSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b110, "uqshrnt">;
+ defm UQRSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b111, "uqrshrnt">;
- // SVE2 saturating extract narrow
- defm SQXTNB_ZZ : sve2_int_sat_extract_narrow<0b000, "sqxtnb">;
- defm SQXTNT_ZZ : sve2_int_sat_extract_narrow<0b001, "sqxtnt">;
- defm UQXTNB_ZZ : sve2_int_sat_extract_narrow<0b010, "uqxtnb">;
- defm UQXTNT_ZZ : sve2_int_sat_extract_narrow<0b011, "uqxtnt">;
- defm SQXTUNB_ZZ : sve2_int_sat_extract_narrow<0b100, "sqxtunb">;
- defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow<0b101, "sqxtunt">;
+ // SVE2 integer add/subtract narrow high part (bottom)
+ defm ADDHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b00, "addhnb">;
+ defm RADDHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b01, "raddhnb">;
+ defm SUBHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b10, "subhnb">;
+ defm RSUBHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b11, "rsubhnb">;
+ // SVE2 integer add/subtract narrow high part (top)
+ defm ADDHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b00, "addhnt">;
+ defm RADDHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b01, "raddhnt">;
+ defm SUBHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b10, "subhnt">;
+ defm RSUBHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b11, "rsubhnt">;
+
+ // SVE2 saturating extract narrow (bottom)
+ defm SQXTNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b00, "sqxtnb">;
+ defm UQXTNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b01, "uqxtnb">;
+ defm SQXTUNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b10, "sqxtunb">;
+
+ // SVE2 saturating extract narrow (top)
+ defm SQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b00, "sqxtnt">;
+ defm UQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b01, "uqxtnt">;
+ defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow_top<0b10, "sqxtunt">;
+
// SVE2 character match
defm MATCH_PPzZZ : sve2_char_match<0b0, "match">;
defm NMATCH_PPzZZ : sve2_char_match<0b1, "nmatch">;
// SVE2 bitwise exclusive-or interleaved
defm EORBT_ZZZ : sve2_bitwise_xor_interleaved<0b0, "eorbt">;
defm EORTB_ZZZ : sve2_bitwise_xor_interleaved<0b1, "eortb">;
// SVE2 bitwise shift left long
defm SSHLLB_ZZI : sve2_bitwise_shift_left_long<0b00, "sshllb">;
defm SSHLLT_ZZI : sve2_bitwise_shift_left_long<0b01, "sshllt">;
defm USHLLB_ZZI : sve2_bitwise_shift_left_long<0b10, "ushllb">;
defm USHLLT_ZZI : sve2_bitwise_shift_left_long<0b11, "ushllt">;
// SVE2 integer add/subtract interleaved long
defm SADDLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b00, "saddlbt">;
defm SSUBLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b10, "ssublbt">;
defm SSUBLTB_ZZZ : sve2_misc_int_addsub_long_interleaved<0b11, "ssubltb">;
// SVE2 histogram generation (segment)
def HISTSEG_ZZZ : sve2_hist_gen_segment<"histseg">;
// SVE2 histogram generation (vector)
defm HISTCNT_ZPzZZ : sve2_hist_gen_vector<"histcnt">;
+ // SVE2 floating-point base 2 logarithm as integer
+ defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb">;
+
// SVE2 floating-point convert precision
defm FCVTXNT_ZPmZ : sve2_fp_convert_down_odd_rounding<"fcvtxnt">;
defm FCVTNT_ZPmZ : sve2_fp_convert_down_narrow<"fcvtnt">;
defm FCVTLT_ZPmZ : sve2_fp_convert_up_long<"fcvtlt">;
+ def FCVTX_ZPmZ_DtoS : sve_fp_2op_p_zd<0b0001010, "fcvtx", ZPR64, ZPR32, ElementSizeD>;
// SVE2 floating-point pairwise operations
defm FADDP_ZPmZZ : sve2_fp_pairwise_pred<0b000, "faddp">;
defm FMAXNMP_ZPmZZ : sve2_fp_pairwise_pred<0b100, "fmaxnmp">;
defm FMINNMP_ZPmZZ : sve2_fp_pairwise_pred<0b101, "fminnmp">;
defm FMAXP_ZPmZZ : sve2_fp_pairwise_pred<0b110, "fmaxp">;
defm FMINP_ZPmZZ : sve2_fp_pairwise_pred<0b111, "fminp">;
// SVE2 floating-point multiply-add long (indexed)
def FMLALB_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b00, "fmlalb">;
def FMLALT_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b01, "fmlalt">;
def FMLSLB_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b10, "fmlslb">;
def FMLSLT_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b11, "fmlslt">;
// SVE2 floating-point multiply-add long
def FMLALB_ZZZ_SHH : sve2_fp_mla_long<0b00, "fmlalb">;
def FMLALT_ZZZ_SHH : sve2_fp_mla_long<0b01, "fmlalt">;
def FMLSLB_ZZZ_SHH : sve2_fp_mla_long<0b10, "fmlslb">;
def FMLSLT_ZZZ_SHH : sve2_fp_mla_long<0b11, "fmlslt">;
// SVE2 bitwise ternary operations
defm EOR3_ZZZZ_D : sve2_int_bitwise_ternary_op<0b000, "eor3">;
defm BCAX_ZZZZ_D : sve2_int_bitwise_ternary_op<0b010, "bcax">;
def BSL_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b001, "bsl">;
def BSL1N_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b011, "bsl1n">;
def BSL2N_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b101, "bsl2n">;
def NBSL_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b111, "nbsl">;
- // sve_int_rotate_imm
+ // SVE2 bitwise xor and rotate right by immediate
defm XAR_ZZZI : sve2_int_rotate_right_imm<"xar">;
// SVE2 extract vector (immediate offset, constructive)
def EXT_ZZI_B : sve2_int_perm_extract_i_cons<"ext">;
- // SVE floating-point convert precision
- def FCVTX_ZPmZ_DtoS : sve_fp_2op_p_zd<0b0001010, "fcvtx", ZPR64, ZPR32, ElementSizeD>;
+ // SVE2 non-temporal gather loads
+ defm LDNT1SB_ZZR_S : sve2_mem_gldnt_vs<0b00000, "ldnt1sb", Z_s, ZPR32>;
+ defm LDNT1B_ZZR_S : sve2_mem_gldnt_vs<0b00001, "ldnt1b", Z_s, ZPR32>;
+ defm LDNT1SH_ZZR_S : sve2_mem_gldnt_vs<0b00100, "ldnt1sh", Z_s, ZPR32>;
+ defm LDNT1H_ZZR_S : sve2_mem_gldnt_vs<0b00101, "ldnt1h", Z_s, ZPR32>;
+ defm LDNT1W_ZZR_S : sve2_mem_gldnt_vs<0b01001, "ldnt1w", Z_s, ZPR32>;
- // SVE floating-point convert to integer
- defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb">;
+ defm LDNT1SB_ZZR_D : sve2_mem_gldnt_vs<0b10000, "ldnt1sb", Z_d, ZPR64>;
+ defm LDNT1B_ZZR_D : sve2_mem_gldnt_vs<0b10010, "ldnt1b", Z_d, ZPR64>;
+ defm LDNT1SH_ZZR_D : sve2_mem_gldnt_vs<0b10100, "ldnt1sh", Z_d, ZPR64>;
+ defm LDNT1H_ZZR_D : sve2_mem_gldnt_vs<0b10110, "ldnt1h", Z_d, ZPR64>;
+ defm LDNT1SW_ZZR_D : sve2_mem_gldnt_vs<0b11000, "ldnt1sw", Z_d, ZPR64>;
+ defm LDNT1W_ZZR_D : sve2_mem_gldnt_vs<0b11010, "ldnt1w", Z_d, ZPR64>;
+ defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs<0b11110, "ldnt1d", Z_d, ZPR64>;
- // Non-temporal contiguous loads (vector + register)
- defm LDNT1SB_ZZR_S : sve2_mem_cldnt_vs<0b00000, "ldnt1sb", Z_s, ZPR32>;
- defm LDNT1B_ZZR_S : sve2_mem_cldnt_vs<0b00001, "ldnt1b", Z_s, ZPR32>;
- defm LDNT1SH_ZZR_S : sve2_mem_cldnt_vs<0b00100, "ldnt1sh", Z_s, ZPR32>;
- defm LDNT1H_ZZR_S : sve2_mem_cldnt_vs<0b00101, "ldnt1h", Z_s, ZPR32>;
- defm LDNT1W_ZZR_S : sve2_mem_cldnt_vs<0b01001, "ldnt1w", Z_s, ZPR32>;
-
- defm LDNT1SB_ZZR_D : sve2_mem_cldnt_vs<0b10000, "ldnt1sb", Z_d, ZPR64>;
- defm LDNT1B_ZZR_D : sve2_mem_cldnt_vs<0b10010, "ldnt1b", Z_d, ZPR64>;
- defm LDNT1SH_ZZR_D : sve2_mem_cldnt_vs<0b10100, "ldnt1sh", Z_d, ZPR64>;
- defm LDNT1H_ZZR_D : sve2_mem_cldnt_vs<0b10110, "ldnt1h", Z_d, ZPR64>;
- defm LDNT1SW_ZZR_D : sve2_mem_cldnt_vs<0b11000, "ldnt1sw", Z_d, ZPR64>;
- defm LDNT1W_ZZR_D : sve2_mem_cldnt_vs<0b11010, "ldnt1w", Z_d, ZPR64>;
- defm LDNT1D_ZZR_D : sve2_mem_cldnt_vs<0b11110, "ldnt1d", Z_d, ZPR64>;
-
// SVE2 vector splice (constructive)
defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice">;
- // Predicated shifts
- defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl">;
- defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl">;
- defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr">;
- defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr">;
- defm SQSHLU_ZPmI : sve_int_bin_pred_shift_imm_left< 0b1111, "sqshlu">;
+ // SVE2 non-temporal scatter stores
+ defm STNT1B_ZZR_S : sve2_mem_sstnt_vs<0b001, "stnt1b", Z_s, ZPR32>;
+ defm STNT1H_ZZR_S : sve2_mem_sstnt_vs<0b011, "stnt1h", Z_s, ZPR32>;
+ defm STNT1W_ZZR_S : sve2_mem_sstnt_vs<0b101, "stnt1w", Z_s, ZPR32>;
- // Non-temporal contiguous stores (vector + register)
- defm STNT1B_ZZR_S : sve2_mem_cstnt_vs<0b001, "stnt1b", Z_s, ZPR32>;
- defm STNT1H_ZZR_S : sve2_mem_cstnt_vs<0b011, "stnt1h", Z_s, ZPR32>;
- defm STNT1W_ZZR_S : sve2_mem_cstnt_vs<0b101, "stnt1w", Z_s, ZPR32>;
+ defm STNT1B_ZZR_D : sve2_mem_sstnt_vs<0b000, "stnt1b", Z_d, ZPR64>;
+ defm STNT1H_ZZR_D : sve2_mem_sstnt_vs<0b010, "stnt1h", Z_d, ZPR64>;
+ defm STNT1W_ZZR_D : sve2_mem_sstnt_vs<0b100, "stnt1w", Z_d, ZPR64>;
+ defm STNT1D_ZZR_D : sve2_mem_sstnt_vs<0b110, "stnt1d", Z_d, ZPR64>;
- defm STNT1B_ZZR_D : sve2_mem_cstnt_vs<0b000, "stnt1b", Z_d, ZPR64>;
- defm STNT1H_ZZR_D : sve2_mem_cstnt_vs<0b010, "stnt1h", Z_d, ZPR64>;
- defm STNT1W_ZZR_D : sve2_mem_cstnt_vs<0b100, "stnt1w", Z_d, ZPR64>;
- defm STNT1D_ZZR_D : sve2_mem_cstnt_vs<0b110, "stnt1d", Z_d, ZPR64>;
-
- // SVE table lookup (three sources)
+ // SVE2 table lookup (three sources)
defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl">;
defm TBX_ZZZ : sve2_int_perm_tbx<"tbx">;
- // SVE integer compare scalar count and limit
+ // SVE2 integer compare scalar count and limit
defm WHILEGE_PWW : sve_int_while4_rr<0b000, "whilege">;
defm WHILEGT_PWW : sve_int_while4_rr<0b001, "whilegt">;
defm WHILEHS_PWW : sve_int_while4_rr<0b100, "whilehs">;
defm WHILEHI_PWW : sve_int_while4_rr<0b101, "whilehi">;
defm WHILEGE_PXX : sve_int_while8_rr<0b000, "whilege">;
defm WHILEGT_PXX : sve_int_while8_rr<0b001, "whilegt">;
defm WHILEHS_PXX : sve_int_while8_rr<0b100, "whilehs">;
defm WHILEHI_PXX : sve_int_while8_rr<0b101, "whilehi">;
- // SVE pointer conflict compare
+ // SVE2 pointer conflict compare
defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr">;
defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw">;
}
// Every record defined inside this block is given the HasSVE2AES predicate.
let Predicates = [HasSVE2AES] in {
// SVE2 crypto destructive binary operations
// (byte-element AESE/AESD, both instantiated with ZPR8).
def AESE_ZZZ_B : sve2_crypto_des_bin_op<0b00, "aese", ZPR8>;
def AESD_ZZZ_B : sve2_crypto_des_bin_op<0b01, "aesd", ZPR8>;
// SVE2 crypto unary operations
def AESMC_ZZ_B : sve2_crypto_unary_op<0b0, "aesmc">;
def AESIMC_ZZ_B : sve2_crypto_unary_op<0b1, "aesimc">;
// PMULLB and PMULLT instructions which operate with 64-bit source and
// 128-bit destination elements are enabled with crypto extensions, similar
// to NEON PMULL2 instruction.
// These two records use ZPR128 for the first register operand class and
// ZPR64 for the remaining two.
def PMULLB_ZZZ_Q : sve2_wide_int_arith<0b00, 0b11010, "pmullb",
ZPR128, ZPR64, ZPR64>;
def PMULLT_ZZZ_Q : sve2_wide_int_arith<0b00, 0b11011, "pmullt",
ZPR128, ZPR64, ZPR64>;
}
// Every record defined inside this block is given the HasSVE2SM4 predicate.
// Both instructions are instantiated with the ZPR32 register class.
let Predicates = [HasSVE2SM4] in {
// SVE2 crypto constructive binary operations
def SM4EKEY_ZZZ_S : sve2_crypto_cons_bin_op<0b0, "sm4ekey", ZPR32>;
// SVE2 crypto destructive binary operations
def SM4E_ZZZ_S : sve2_crypto_des_bin_op<0b10, "sm4e", ZPR32>;
}
// Every record defined inside this block is given the HasSVE2SHA3 predicate.
let Predicates = [HasSVE2SHA3] in {
// SVE2 crypto constructive binary operations
// (RAX1 is instantiated with the ZPR64 register class).
def RAX1_ZZZ_D : sve2_crypto_cons_bin_op<0b1, "rax1", ZPR64>;
}
// Every record defined inside this block is given the HasSVE2BitPerm
// predicate. All three instructions come from the same sve2_misc_bitwise
// multiclass and differ only in the 4-bit opcode argument and mnemonic.
let Predicates = [HasSVE2BitPerm] in {
// SVE2 bitwise permute
defm BEXT_ZZZ : sve2_misc_bitwise<0b1100, "bext">;
defm BDEP_ZZZ : sve2_misc_bitwise<0b1101, "bdep">;
defm BGRP_ZZZ : sve2_misc_bitwise<0b1110, "bgrp">;
}
Index: vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (revision 351302)
+++ vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (revision 351303)
@@ -1,1002 +1,1015 @@
//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "AArch64ExpandImm.h"
#include "AArch64TargetTransformInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
#include <algorithm>
using namespace llvm;
#define DEBUG_TYPE "aarch64tti"
// Hidden boolean command-line option "enable-falkor-hwpf-unroll-fix", on by
// default (cl::init(true)).
// NOTE(review): presumably gates a Falkor hardware-prefetcher related
// unrolling adjustment elsewhere in this file -- confirm at the use site.
static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
cl::init(true), cl::Hidden);
bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
const Function *Callee) const {
const TargetMachine &TM = getTLI()->getTargetMachine();
const FeatureBitset &CallerBits =
TM.getSubtargetImpl(*Caller)->getFeatureBits();
const FeatureBitset &CalleeBits =
TM.getSubtargetImpl(*Callee)->getFeatureBits();
// Inline a callee if its target-features are a subset of the callers
// target-features.
return (CallerBits & CalleeBits) == CalleeBits;
}
/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
int AArch64TTIImpl::getIntImmCost(int64_t Val) {
// Check if the immediate can be encoded within an instruction.
if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
return 0;
if (Val < 0)
Val = ~Val;
// Calculate how many moves we will need to materialize this constant.
SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
AArch64_IMM::expandMOVImm(Val, 64, Insn);
return Insn.size();
}
/// Calculate the cost of materializing the given constant.
int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
if (BitSize == 0)
return ~0U;
// Sign-extend all constants to a multiple of 64-bit.
APInt ImmVal = Imm;
if (BitSize & 0x3f)
ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
// Split the constant into 64-bit chunks and calculate the cost for each
// chunk.
int Cost = 0;
for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
int64_t Val = Tmp.getSExtValue();
Cost += getIntImmCost(Val);
}
// We need at least one instruction to materialze the constant.
return std::max(1, Cost);
}
/// Compute the cost of the immediate \p Imm (of type \p Ty) when it appears
/// as operand number \p Idx of an instruction with the given \p Opcode.
int AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
                                  const APInt &Imm, Type *Ty) {
  assert(Ty->isIntegerTy());

  const unsigned NumBits = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return
  // TCC_Free here, so that constant hoisting will ignore this constant.
  if (NumBits == 0)
    return TTI::TCC_Free;

  // Operand position for which a sufficiently cheap immediate is reported as
  // free. It stays at the all-ones value when the opcode selects no position.
  unsigned FoldableIdx = ~0U;

  switch (Opcode) {
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr.
    return Idx == 0 ? 2 * TTI::TCC_Basic : static_cast<int>(TTI::TCC_Free);

  case Instruction::Store:
    FoldableIdx = 0;
    break;

  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
    FoldableIdx = 1;
    break;

  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    // Always return TCC_Free for the shift value of a shift instruction.
    if (Idx == 1)
      return TTI::TCC_Free;
    break;

  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    // These opcodes always fall through to the plain materialization cost.
    break;

  default:
    return TTI::TCC_Free;
  }

  const int FullCost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
  if (Idx != FoldableIdx)
    return FullCost;

  // At the selected operand position the immediate is free unless it costs
  // more than one basic instruction per 64-bit chunk.
  const int ChunkCount = (NumBits + 63) / 64;
  return FullCost <= ChunkCount * TTI::TCC_Basic
             ? static_cast<int>(TTI::TCC_Free)
             : FullCost;
}
/// Compute the cost of the immediate \p Imm (of type \p Ty) when it appears
/// as argument number \p Idx of a call to the intrinsic \p IID.
int AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
                                  const APInt &Imm, Type *Ty) {
  assert(Ty->isIntegerTy());

  const unsigned NumBits = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return
  // TCC_Free here, so that constant hoisting will ignore this constant.
  if (NumBits == 0)
    return TTI::TCC_Free;

  // Evaluated lazily: getSExtValue() is only reached once the width check
  // has passed, exactly as in a short-circuited inline condition.
  auto FitsInInt64 = [&Imm] {
    return Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue());
  };

  switch (IID) {
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow: {
    if (Idx != 1)
      break;
    // The second operand is free unless it costs more than one basic
    // instruction per 64-bit chunk.
    const int ChunkCount = (NumBits + 63) / 64;
    const int FullCost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
    return FullCost <= ChunkCount * TTI::TCC_Basic
               ? static_cast<int>(TTI::TCC_Free)
               : FullCost;
  }

  case Intrinsic::experimental_stackmap:
    if (Idx < 2 || FitsInInt64())
      return TTI::TCC_Free;
    break;

  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if (Idx < 4 || FitsInInt64())
      return TTI::TCC_Free;
    break;

  default:
    return TTI::TCC_Free;
  }

  return AArch64TTIImpl::getIntImmCost(Imm, Ty);
}
/// Report the population-count support level for integers of \p TyWidth bits:
/// PSK_FastHardware for 32- and 64-bit widths, PSK_Software for any other
/// width. \p TyWidth must be a power of two.
TargetTransformInfo::PopcntSupportKind
AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
  const bool IsFastWidth = TyWidth == 32 || TyWidth == 64;
  return IsFastWidth ? TTI::PSK_FastHardware : TTI::PSK_Software;
}
/// Decide whether an instruction with opcode \p Opcode, result type \p DstTy
/// and operands \p Args can be treated as a widening instruction, covering
/// both the "long" (e.g., usubl) and "wide" (e.g., usubw) forms.
///
/// Returns true only for a two-operand add or sub whose second operand is a
/// single-use sign- or zero-extend, and whose legalized source and
/// destination vector types have the same number of elements with the
/// destination elements exactly twice as wide as the source elements.
bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
                                           ArrayRef<const Value *> Args) {
  // Only vector results with elements of at least 16 bits qualify.
  if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16)
    return false;

  // Only add (UADDL(2), SADDL(2), UADDW(2), SADDW(2)) and sub (USUBL(2),
  // SSUBL(2), USUBW(2), SSUBW(2)) are considered.
  //
  // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we
  //       verify that their extending operands are eliminated during code
  //       generation.
  if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
    return false;

  // The second operand has to be a sign- or zero-extend with exactly one
  // user; an extend with further users may otherwise not be eliminated.
  if (Args.size() != 2)
    return false;
  const Value *SecondOp = Args[1];
  const bool IsExtend = isa<SExtInst>(SecondOp) || isa<ZExtInst>(SecondOp);
  if (!IsExtend || !SecondOp->hasOneUse())
    return false;
  auto *Ext = cast<CastInst>(SecondOp);

  // The legalized destination must still be a vector with unchanged element
  // width.
  auto DstLT = TLI->getTypeLegalizationCost(DL, DstTy);
  const unsigned DstEltBits = DstLT.second.getScalarSizeInBits();
  if (!DstLT.second.isVector() || DstEltBits != DstTy->getScalarSizeInBits())
    return false;

  // Build the source vector type from the extend's source element type and
  // the destination's element count, then apply the same legality check.
  Type *SrcTy = VectorType::get(Ext->getSrcTy()->getScalarType(),
                                DstTy->getVectorNumElements());
  auto SrcLT = TLI->getTypeLegalizationCost(DL, SrcTy);
  const unsigned SrcEltBits = SrcLT.second.getScalarSizeInBits();
  if (!SrcLT.second.isVector() || SrcEltBits != SrcTy->getScalarSizeInBits())
    return false;

  // Total element counts across all legalized parts.
  const unsigned TotalDstElts =
      DstLT.first * DstLT.second.getVectorNumElements();
  const unsigned TotalSrcElts =
      SrcLT.first * SrcLT.second.getVectorNumElements();

  // Same element count, and destination elements twice the source width.
  return TotalDstElts == TotalSrcElts && 2 * SrcEltBits == DstEltBits;
}
/// Estimate the cost of a cast with opcode \p Opcode from \p Src to \p Dst.
///
/// When the optional instruction \p I is given and has exactly one user that
/// is a widening instruction, the cast may be reported as free (cost 0).
/// Otherwise the cost comes from the conversion table below, keyed on the ISD
/// opcode and the simple destination/source value types. Anything not covered
/// by the table is delegated to BaseT::getCastInstrCost.
int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
// If the cast is observable, and it is used by a widening instruction (e.g.,
// uaddl, saddw, etc.), it may be free.
if (I && I->hasOneUse()) {
auto *SingleUser = cast<Instruction>(*I->user_begin());
SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
// If the cast is the second operand, it is free. We will generate either
// a "wide" or "long" version of the widening instruction.
if (I == SingleUser->getOperand(1))
return 0;
// If the cast is not the second operand, it will be free if it looks the
// same as the second operand. In this case, we will generate a "long"
// version of the widening instruction.
if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
if (I->getOpcode() == unsigned(Cast->getOpcode()) &&
cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
return 0;
}
}
EVT SrcTy = TLI->getValueType(DL, Src);
EVT DstTy = TLI->getValueType(DL, Dst);
// The table is keyed on simple value types only; anything else goes to the
// generic implementation.
if (!SrcTy.isSimple() || !DstTy.isSimple())
return BaseT::getCastInstrCost(Opcode, Dst, Src);
// Each entry is { ISD opcode, destination type, source type, cost }.
static const TypeConversionCostTblEntry
ConversionTbl[] = {
{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
{ ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
// The number of shll instructions for the extension.
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
// LowerVectorINT_TO_FP:
{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
// Complex: to v2f32
{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
// Complex: to v4f32
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 },
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
// Complex: to v8f32
{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
// Complex: to v16f32
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
// Complex: to v2f64
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
// LowerVectorFP_TO_INT
{ ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
{ ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
{ ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
{ ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
{ ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
// Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
{ ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
{ ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
{ ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 },
{ ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
{ ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
{ ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 },
// Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
{ ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
{ ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 },
{ ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
{ ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 },
// Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
{ ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
{ ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
{ ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 },
{ ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
{ ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
{ ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 },
};
if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
DstTy.getSimpleVT(),
SrcTy.getSimpleVT()))
return Entry->Cost;
// No table entry: fall back to the generic cost.
return BaseT::getCastInstrCost(Opcode, Dst, Src);
}
/// Estimate the combined cost of extracting element \p Index from a vector of
/// type \p VecTy and sign- or zero-extending it to \p Dst.
///
/// \p Opcode must be Instruction::SExt or Instruction::ZExt. The result is
/// the extract cost alone when the extend is considered free, and the extract
/// cost plus the regular cast cost otherwise.
int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                             VectorType *VecTy,
                                             unsigned Index) {
  // Make sure we were given a valid extend opcode.
  assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
         "Invalid opcode");

  // The value being extended is a vector element, so the extend's source
  // type is the vector's element type.
  auto *Src = VecTy->getElementType();

  // Sign- and zero-extends are for integer types only.
  assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");

  // Cost of the extract by itself; the extend (if any) is added below.
  const auto ExtractCost =
      getVectorInstrCost(Instruction::ExtractElement, VecTy, Index);

  // Fallback: extract cost plus the default cost of the extend.
  auto ExtractPlusExtend = [&] {
    return ExtractCost + getCastInstrCost(Opcode, Dst, Src);
  };

  // Legalize the types.
  auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy);
  auto DstVT = TLI->getValueType(DL, Dst);
  auto SrcVT = TLI->getValueType(DL, Src);

  // The extend can only be free when the legalized vector is still a vector
  // and the destination type is legal.
  if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
    return ExtractPlusExtend();

  // The destination should not be narrower than the element type.
  if (DstVT.getSizeInBits() < SrcVT.getSizeInBits())
    return ExtractPlusExtend();

  // For sign-extends, we only need a smov, which performs the extension
  // automatically.
  if (Opcode == Instruction::SExt)
    return ExtractCost;

  if (Opcode != Instruction::ZExt)
    llvm_unreachable("Opcode should be either SExt or ZExt");

  // For zero-extends, the extend is performed automatically by a umov unless
  // the destination type is i64 and the element type is i8 or i16.
  if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
    return ExtractCost;

  // The extend is not free; charge the default cost for it.
  return ExtractPlusExtend();
}
int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
unsigned Index) {
assert(Val->isVectorTy() && "This must be a vector type");
if (Index != -1U) {
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
// This type is legalized to a scalar type.
if (!LT.second.isVector())
return 0;
// The type may be split. Normalize the index to the new type.
unsigned Width = LT.second.getVectorNumElements();
Index = Index % Width;
// The element at index zero is already inside the vector.
if (Index == 0)
return 0;
}
// All other insert/extracts cost this much.
return ST->getVectorInsertExtractBaseCost();
}
/// Estimate the cost of an arithmetic instruction with opcode \p Opcode on
/// type \p Ty.
///
/// \p Opd1Info / \p Opd2Info and \p Opd1PropInfo / \p Opd2PropInfo describe the
/// kind and properties of the two operands; \p Args holds the operand values
/// and is only used for the widening-instruction check. Signed and unsigned
/// division get dedicated handling, ADD/MUL/XOR/OR/AND are costed per
/// legalized part, and every other opcode is delegated to
/// BaseT::getArithmeticInstrCost (plus any widening overhead).
int AArch64TTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
// If the instruction is a widening instruction (e.g., uaddl, saddw, etc.),
// add in the widening overhead specified by the sub-target. Since the
// extends feeding widening instructions are performed automatically, they
// aren't present in the generated code and have a zero cost. By adding a
// widening overhead here, we attach the total cost of the combined operation
// to the widening instruction.
int Cost = 0;
if (isWideningInstruction(Ty, Opcode, Args))
Cost += ST->getWideningBaseCost();
int ISD = TLI->InstructionOpcodeToISD(Opcode);
switch (ISD) {
default:
return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
Opd1PropInfo, Opd2PropInfo);
case ISD::SDIV:
if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
// On AArch64, scalar signed division by constants power-of-two are
// normally expanded to the sequence ADD + CMP + SELECT + SRA.
// The OperandValue properties may not be the same as those of the
// previous operation; conservatively assume OP_None.
// Note that the CMP step is costed below via Instruction::Sub.
Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
Cost += getArithmeticInstrCost(Instruction::Select, Ty, Opd1Info, Opd2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
Cost += getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, Opd2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
return Cost;
}
// Any other signed division shares the handling of unsigned division.
LLVM_FALLTHROUGH;
case ISD::UDIV:
if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) {
auto VT = TLI->getValueType(DL, Ty);
if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
// Vector signed division by constant are expanded to the
// sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
// to MULHS + SUB + SRL + ADD + SRL.
// NOTE(review): the legality check above queries ISD::MULHU while this
// comment names MULHS for both cases -- confirm which is intended.
int MulCost = getArithmeticInstrCost(Instruction::Mul, Ty, Opd1Info,
Opd2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
int AddCost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info,
Opd2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
int ShrCost = getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info,
Opd2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
// This return does not include the widening overhead accumulated in Cost.
return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
}
}
Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
Opd1PropInfo, Opd2PropInfo);
if (Ty->isVectorTy()) {
// On AArch64, vector divisions are not supported natively and are
// expanded into scalar divisions of each pair of elements.
Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, Opd1Info,
Opd2Info, Opd1PropInfo, Opd2PropInfo);
Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, Opd1Info,
Opd2Info, Opd1PropInfo, Opd2PropInfo);
// TODO: if one of the arguments is scalar, then it's not necessary to
// double the cost of handling the vector elements.
Cost += Cost;
}
return Cost;
case ISD::ADD:
case ISD::MUL:
case ISD::XOR:
case ISD::OR:
case ISD::AND:
// These nodes are marked as 'custom' for combining purposes only.
// We know that they are legal. See LowerAdd in ISelLowering.
// One instruction (plus any widening overhead) per legalized part.
return (Cost + 1) * LT.first;
}
}
/// Estimate the cost of computing the address of an access of type \p Ty.
///
/// Vector accesses that are not a constant-strided access below the merge
/// distance are charged a high cost; everything else (scalar accesses, a
/// null \p SE, or a small constant stride) costs 1.
int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
                                              const SCEV *Ptr) {
  // Address computations in vectorized code with non-consecutive addresses
  // will likely result in more instructions compared to scalar code where the
  // computation can more often be merged into the index mode. The resulting
  // extra micro-ops can significantly decrease throughput.
  const unsigned NumVectorInstToHideOverhead = 10;
  const int MaxMergeDistance = 64;

  // In many cases the address computation is not merged into the instruction
  // addressing mode, hence the baseline cost of 1.
  if (!Ty->isVectorTy() || !SE)
    return 1;
  if (BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
    return 1;

  return NumVectorInstToHideOverhead;
}
int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
Type *CondTy, const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
// We don't lower some vector selects well that are wider than the register
// width.
if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
// We would need this many instructions to hide the scalarization happening.
const int AmortizationCost = 20;
static const TypeConversionCostTblEntry
VectorSelectTbl[] = {
{ ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
{ ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
{ ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
{ ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
{ ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
{ ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
};
EVT SelCondTy = TLI->getValueType(DL, CondTy);
EVT SelValTy = TLI->getValueType(DL, ValTy);
if (SelCondTy.isSimple() && SelValTy.isSimple()) {
if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
SelCondTy.getSimpleVT(),
SelValTy.getSimpleVT()))
return Entry->Cost;
}
}
return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}
+AArch64TTIImpl::TTI::MemCmpExpansionOptions
+AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { // Builds the memcmp expansion options; IsZeroCmp is not read here.
+ TTI::MemCmpExpansionOptions Options;
+ Options.AllowOverlappingLoads = !ST->requiresStrictAlign(); // Overlapping loads are allowed only when strict alignment is not required.
+ Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); // The load budget comes from the target lowering and depends on OptSize.
+ Options.NumLoadsPerBlock = Options.MaxNumLoads; // The per-block limit is the same as the overall limit.
+ // TODO: Though vector loads usually perform well on AArch64, in some targets
+ // they may wake up the FP unit, which raises the power consumption. Perhaps
+ // they could be used with no holds barred (-O3).
+ Options.LoadSizes = {8, 4, 2, 1}; // Only these load sizes are offered (presumably in bytes -- confirm); see the TODO above.
+ return Options;
+}
+
/// Estimate the cost of a load or store (\p Opcode) of type \p Ty with the
/// given \p Alignment.
///
/// Misaligned 128-bit vector stores on subtargets where they are slow, and
/// small vectors of i8 elements, receive inflated costs. Everything else
/// costs the number of legalized parts.
int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
                                    unsigned Alignment, unsigned AddressSpace,
                                    const Instruction *I) {
  auto LT = TLI->getTypeLegalizationCost(DL, Ty);
  const bool IsStore = Opcode == Instruction::Store;

  const bool IsSlowMisalignedStore = ST->isMisaligned128StoreSlow() &&
                                     IsStore && LT.second.is128BitVector() &&
                                     Alignment < 16;
  if (IsSlowMisalignedStore) {
    // Unaligned stores are extremely inefficient. We don't split all
    // unaligned 128-bit stores because of the negative impact that has shown
    // in practice on inlined block copy code.
    // We make such stores expensive so that we will only vectorize if there
    // are 6 other instructions getting vectorized.
    const int AmortizationCost = 6;
    return LT.first * 2 * AmortizationCost;
  }

  // Only vectors of 8-bit integers need further special handling.
  if (!Ty->isVectorTy() || !Ty->getVectorElementType()->isIntegerTy(8))
    return LT.first;

  // Stores use a custom trunc store lowering, so v.4b should be profitable.
  // Loads are scalarized because there is no v.4b register and the elements
  // have to be promoted to v.2.
  const unsigned ProfitableNumElements = IsStore ? 4 : 8;
  const unsigned NumVecElts = Ty->getVectorNumElements();
  if (NumVecElts >= ProfitableNumElements)
    return LT.first;

  // We generate 2 instructions per vector element.
  const unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
  return NumVectorizableInstsToAmortize * NumVecElts * 2;
}
/// Estimate the cost of an interleaved load or store of \p VecTy with
/// interleave factor \p Factor.
///
/// Unmasked accesses whose factor is supported and whose per-member
/// sub-vector type is legal for interleaved access are costed as
/// Factor * (number of interleaved accesses). All other cases are delegated
/// to BaseT::getInterleavedMemoryOpCost.
int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                               unsigned Factor,
                                               ArrayRef<unsigned> Indices,
                                               unsigned Alignment,
                                               unsigned AddressSpace,
                                               bool UseMaskForCond,
                                               bool UseMaskForGaps) {
  assert(Factor >= 2 && "Invalid interleave factor");
  assert(isa<VectorType>(VecTy) && "Expect a vector type");

  const bool IsUnmasked = !UseMaskForCond && !UseMaskForGaps;
  if (IsUnmasked && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
    const unsigned NumElts = VecTy->getVectorNumElements();
    auto *MemberTy =
        VectorType::get(VecTy->getScalarType(), NumElts / Factor);

    // ldN/stN only support legal vector types of size 64 or 128 in bits.
    // Accesses having vector types that are a multiple of 128 bits can be
    // matched to more than one ldN/stN instruction.
    const bool SplitsEvenly = NumElts % Factor == 0;
    if (SplitsEvenly && TLI->isLegalInterleavedAccessType(MemberTy, DL))
      return Factor * TLI->getNumInterleavedAccesses(MemberTy, DL);
  }

  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace,
                                           UseMaskForCond, UseMaskForGaps);
}
/// Estimate the cost of keeping values of the types in \p Tys live across a
/// call. Only vector types that are exactly 128 bits wide contribute: each is
/// charged one store plus one load, both costed at alignment 128 in address
/// space 0.
int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
  int TotalCost = 0;
  for (auto *Ty : Tys) {
    const bool Is128BitVector =
        Ty->isVectorTy() &&
        Ty->getScalarSizeInBits() * Ty->getVectorNumElements() == 128;
    if (!Is128BitVector)
      continue;
    TotalCost += getMemoryOpCost(Instruction::Store, Ty, 128, 0) +
                 getMemoryOpCost(Instruction::Load, Ty, 128, 0);
  }
  return TotalCost;
}
/// Return the maximum interleave factor reported by the subtarget. The
/// vectorization factor \p VF is not used by this implementation.
unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
return ST->getMaxInterleaveFactor();
}
// On Falkor, too many strided loads inside a loop can exhaust the HW
// prefetcher resources. This helper counts the strided loads in the loop and
// lowers the unroller's MaxCount preference so that unrolling is unlikely to
// create too many of them.
static void
getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                              TargetTransformInfo::UnrollingPreferences &UP) {
  enum { MaxStridedLoads = 7 };

  // Counts loads in the loop whose address is loop-variant and is an affine
  // add-recurrence. Stops counting early once the count exceeds half of
  // MaxStridedLoads.
  auto CountStridedLoads = [](Loop *TheLoop, ScalarEvolution &SCEVInfo) {
    int NumStrided = 0;
    // FIXME? We could make this more precise by looking at the CFG and
    // e.g. not counting loads in each side of an if-then-else diamond.
    for (const auto Block : TheLoop->blocks()) {
      for (auto &Inst : *Block) {
        auto *Load = dyn_cast<LoadInst>(&Inst);
        if (!Load)
          continue;

        Value *Addr = Load->getPointerOperand();
        if (TheLoop->isLoopInvariant(Addr))
          continue;

        const auto *AddRec = dyn_cast<SCEVAddRecExpr>(SCEVInfo.getSCEV(Addr));
        if (!AddRec || !AddRec->isAffine())
          continue;

        // FIXME? We could take pairing of unrolled load copies into account
        // by looking at the AddRec, but we would probably have to limit this
        // to loops with no stores or other memory optimization barriers.
        ++NumStrided;
        // Past this point, seeing more strided loads won't make a
        // difference.
        if (NumStrided > MaxStridedLoads / 2)
          return NumStrided;
      }
    }
    return NumStrided;
  };

  const int StridedLoads = CountStridedLoads(L, SE);
  LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
                    << " strided loads\n");

  // Nothing to limit when the loop has no strided loads.
  if (StridedLoads == 0)
    return;

  // Pick the largest power of 2 unroll count that won't result in too many
  // strided loads.
  UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
  LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
                    << UP.MaxCount << '\n');
}
/// Fill in the unrolling preferences \p UP for loop \p L: start from the
/// generic preferences, double the partial threshold for nested loops, zero
/// the optimize-for-size partial threshold, and apply the Falkor-specific
/// limit when that fix is enabled.
void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                             TTI::UnrollingPreferences &UP) {
  // Enable partial unrolling and runtime unrolling.
  BaseT::getUnrollingPreferences(L, SE, UP);

  // An inner loop is more likely to be hot, and the runtime check can be
  // promoted out by the LICM pass so its overhead is lower; use a larger
  // threshold to unroll more such loops.
  const bool IsNestedLoop = L->getLoopDepth() > 1;
  if (IsNestedLoop)
    UP.PartialThreshold *= 2;

  // Disable partial & runtime unrolling on -Os.
  UP.PartialOptSizeThreshold = 0;

  const bool IsFalkor = ST->getProcFamily() == AArch64Subtarget::Falkor;
  if (IsFalkor && EnableFalkorHWPFUnrollFix)
    getFalkorUnrollingPreferences(L, SE, UP);
}
/// Produce a value of type \p ExpectedType that represents the data moved by
/// the NEON structured load/store intrinsic \p Inst, or nullptr if that is
/// not possible.
///
/// For ld2/ld3/ld4 the intrinsic itself is returned when its type matches.
/// For st2/st3/st4 a struct value is built, in front of \p Inst, from the
/// stored operands, provided \p ExpectedType is a struct whose element types
/// match them. Any other intrinsic yields nullptr.
Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
                                                         Type *ExpectedType) {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_ld4:
    if (Inst->getType() != ExpectedType)
      return nullptr;
    return Inst;

  case Intrinsic::aarch64_neon_st2:
  case Intrinsic::aarch64_neon_st3:
  case Intrinsic::aarch64_neon_st4: {
    // The expected result has to be a struct type.
    auto *StructTy = dyn_cast<StructType>(ExpectedType);
    if (!StructTy)
      return nullptr;

    // Every argument except the last one must line up with a struct element.
    const unsigned NumStored = Inst->getNumArgOperands() - 1;
    if (StructTy->getNumElements() != NumStored)
      return nullptr;
    for (unsigned Idx = 0; Idx != NumStored; ++Idx)
      if (Inst->getArgOperand(Idx)->getType() != StructTy->getElementType(Idx))
        return nullptr;

    // Assemble the struct from the stored operands, one insertvalue each.
    IRBuilder<> Builder(Inst);
    Value *Aggregate = UndefValue::get(ExpectedType);
    for (unsigned Idx = 0; Idx != NumStored; ++Idx)
      Aggregate =
          Builder.CreateInsertValue(Aggregate, Inst->getArgOperand(Idx), Idx);
    return Aggregate;
  }

  default:
    return nullptr;
  }
}
/// Describe the memory behaviour of the NEON structured load/store intrinsic
/// \p Inst in \p Info.
///
/// For ld2/ld3/ld4 and st2/st3/st4 this sets ReadMem, WriteMem, PtrVal and
/// MatchingId and returns true. For any other intrinsic \p Info is left
/// untouched and false is returned.
bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                        MemIntrinsicInfo &Info) {
  // Loads read through their first argument.
  auto DescribeLoad = [&](MemIntrinsicType Id) {
    Info.ReadMem = true;
    Info.WriteMem = false;
    Info.PtrVal = Inst->getArgOperand(0);
    Info.MatchingId = Id;
    return true;
  };
  // Stores write through their last argument.
  auto DescribeStore = [&](MemIntrinsicType Id) {
    Info.ReadMem = false;
    Info.WriteMem = true;
    Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1);
    Info.MatchingId = Id;
    return true;
  };

  switch (Inst->getIntrinsicID()) {
  case Intrinsic::aarch64_neon_ld2:
    return DescribeLoad(VECTOR_LDST_TWO_ELEMENTS);
  case Intrinsic::aarch64_neon_ld3:
    return DescribeLoad(VECTOR_LDST_THREE_ELEMENTS);
  case Intrinsic::aarch64_neon_ld4:
    return DescribeLoad(VECTOR_LDST_FOUR_ELEMENTS);
  case Intrinsic::aarch64_neon_st2:
    return DescribeStore(VECTOR_LDST_TWO_ELEMENTS);
  case Intrinsic::aarch64_neon_st3:
    return DescribeStore(VECTOR_LDST_THREE_ELEMENTS);
  case Intrinsic::aarch64_neon_st4:
    return DescribeStore(VECTOR_LDST_FOUR_ELEMENTS);
  default:
    return false;
  }
}
/// See if \p I should be considered for address type promotion. We check if \p
/// I is a sext with right type and used in memory accesses. If it used in a
/// "complex" getelementptr, we allow it to be promoted without finding other
/// sext instructions that sign extended the same initial value. A getelementptr
/// is considered as "complex" if it has more than 2 operands.
bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
bool Considerable = false;
AllowPromotionWithoutCommonHeader = false;
if (!isa<SExtInst>(&I))
return false;
Type *ConsideredSExtType =
Type::getInt64Ty(I.getParent()->getParent()->getContext());
if (I.getType() != ConsideredSExtType)
return false;
// See if the sext is the one with the right type and used in at least one
// GetElementPtrInst.
for (const User *U : I.users()) {
if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
Considerable = true;
// A getelementptr is considered as "complex" if it has more than 2
// operands. We will promote a SExt used in such complex GEP as we
// expect some computation to be merged if they are done on 64 bits.
if (GEPInst->getNumOperands() > 2) {
AllowPromotionWithoutCommonHeader = true;
break;
}
}
}
return Considerable;
}
/// Return the cache line size reported by the subtarget.
unsigned AArch64TTIImpl::getCacheLineSize() {
return ST->getCacheLineSize();
}
/// Return the prefetch distance reported by the subtarget.
unsigned AArch64TTIImpl::getPrefetchDistance() {
return ST->getPrefetchDistance();
}
/// Return the minimum prefetch stride reported by the subtarget.
unsigned AArch64TTIImpl::getMinPrefetchStride() {
return ST->getMinPrefetchStride();
}
/// Return the maximum number of prefetch iterations ahead reported by the
/// subtarget.
unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() {
return ST->getMaxPrefetchIterationsAhead();
}
/// Decide whether a reduction with opcode \p Opcode over the vector type
/// \p Ty should use a reduction intrinsic.
///
/// Add reductions qualify when the vector is at least 128 bits wide, ICmp
/// reductions additionally require elements narrower than 64 bits, and FCmp
/// reductions follow \p Flags.NoNaN. FAdd, FMul, And, Or, Xor and Mul never
/// qualify. Any other opcode is unreachable.
bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
                                           TTI::ReductionFlags Flags) const {
  assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type");

  const unsigned ScalarBits = Ty->getScalarSizeInBits();
  // Evaluated lazily so the element count is only queried where needed.
  auto IsAtLeast128Bits = [&] {
    return ScalarBits * Ty->getVectorNumElements() >= 128;
  };

  switch (Opcode) {
  case Instruction::Add:
    return IsAtLeast128Bits();
  case Instruction::ICmp:
    return (ScalarBits < 64) && IsAtLeast128Bits();
  case Instruction::FCmp:
    return Flags.NoNaN;
  case Instruction::FAdd:
  case Instruction::FMul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Mul:
    return false;
  default:
    llvm_unreachable("Unhandled reduction opcode");
  }
  return false;
}
int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
bool IsPairwiseForm) {
if (IsPairwiseForm)
return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm);
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
MVT MTy = LT.second;
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
// Horizontal adds can use the 'addv' instruction. We model the cost of these
// instructions as normal vector adds. This is the only arithmetic vector
// reduction operation for which we have an instruction.
static const CostTblEntry CostTblNoPairwise[]{
{ISD::ADD, MVT::v8i8, 1},
{ISD::ADD, MVT::v16i8, 1},
{ISD::ADD, MVT::v4i16, 1},
{ISD::ADD, MVT::v8i16, 1},
{ISD::ADD, MVT::v4i32, 1},
};
if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
return LT.first * Entry->Cost;
return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm);
}
/// Estimate the cost of a shuffle of kind \p Kind on the vector type \p Tp.
///
/// Broadcast, transpose, select and single-source permute shuffles are looked
/// up in the table below using the legalized type, and the table cost is
/// scaled by the number of legalized parts. Every other kind, and any type
/// without a table entry, is delegated to BaseT::getShuffleCost.
int AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) {
if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc) {
// Each entry is { shuffle kind, legalized vector type, cost }.
static const CostTblEntry ShuffleTbl[] = {
// Broadcast shuffle kinds can be performed with 'dup'.
{ TTI::SK_Broadcast, MVT::v8i8, 1 },
{ TTI::SK_Broadcast, MVT::v16i8, 1 },
{ TTI::SK_Broadcast, MVT::v4i16, 1 },
{ TTI::SK_Broadcast, MVT::v8i16, 1 },
{ TTI::SK_Broadcast, MVT::v2i32, 1 },
{ TTI::SK_Broadcast, MVT::v4i32, 1 },
{ TTI::SK_Broadcast, MVT::v2i64, 1 },
{ TTI::SK_Broadcast, MVT::v2f32, 1 },
{ TTI::SK_Broadcast, MVT::v4f32, 1 },
{ TTI::SK_Broadcast, MVT::v2f64, 1 },
// Transpose shuffle kinds can be performed with 'trn1/trn2' and
// 'zip1/zip2' instructions.
{ TTI::SK_Transpose, MVT::v8i8, 1 },
{ TTI::SK_Transpose, MVT::v16i8, 1 },
{ TTI::SK_Transpose, MVT::v4i16, 1 },
{ TTI::SK_Transpose, MVT::v8i16, 1 },
{ TTI::SK_Transpose, MVT::v2i32, 1 },
{ TTI::SK_Transpose, MVT::v4i32, 1 },
{ TTI::SK_Transpose, MVT::v2i64, 1 },
{ TTI::SK_Transpose, MVT::v2f32, 1 },
{ TTI::SK_Transpose, MVT::v4f32, 1 },
{ TTI::SK_Transpose, MVT::v2f64, 1 },
// Select shuffle kinds.
// TODO: handle vXi8/vXi16.
{ TTI::SK_Select, MVT::v2i32, 1 }, // mov.
{ TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar).
{ TTI::SK_Select, MVT::v2i64, 1 }, // mov.
{ TTI::SK_Select, MVT::v2f32, 1 }, // mov.
{ TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar).
{ TTI::SK_Select, MVT::v2f64, 1 }, // mov.
// PermuteSingleSrc shuffle kinds.
// TODO: handle vXi8/vXi16.
{ TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov.
{ TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case.
{ TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov.
{ TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov.
{ TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case.
{ TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov.
};
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
}
// Not one of the table-driven kinds, or no entry for the legalized type.
return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}
Index: vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64TargetTransformInfo.h (revision 351302)
+++ vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64TargetTransformInfo.h (revision 351303)
@@ -1,184 +1,187 @@
//===- AArch64TargetTransformInfo.h - AArch64 specific TTI ------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file defines a TargetTransformInfo::Concept conforming object specific to the
/// AArch64 target machine. It uses the target's detailed information to
/// provide more precise answers to certain TTI queries, while letting the
/// target independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64TARGETTRANSFORMINFO_H
#define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETTRANSFORMINFO_H
#include "AArch64.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Intrinsics.h"
#include <cstdint>
namespace llvm {
class APInt;
class Instruction;
class IntrinsicInst;
class Loop;
class SCEV;
class ScalarEvolution;
class Type;
class Value;
class VectorType;
/// AArch64 implementation of the TargetTransformInfo hooks. It derives from
/// BasicTTIImplBase parameterized on itself and caches the subtarget and
/// target-lowering objects for the function it was constructed for.
class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
using BaseT = BasicTTIImplBase<AArch64TTIImpl>;
using TTI = TargetTransformInfo;
// BaseT is a friend so that it can reach the private getST()/getTLI() below.
friend BaseT;
// Subtarget selected for the function passed to the constructor.
const AArch64Subtarget *ST;
// Target lowering obtained from ST.
const AArch64TargetLowering *TLI;
const AArch64Subtarget *getST() const { return ST; }
const AArch64TargetLowering *getTLI() const { return TLI; }
// Element-count categories for vector load/store memory intrinsics.
enum MemIntrinsicType {
VECTOR_LDST_TWO_ELEMENTS,
VECTOR_LDST_THREE_ELEMENTS,
VECTOR_LDST_FOUR_ELEMENTS
};
bool isWideningInstruction(Type *Ty, unsigned Opcode,
ArrayRef<const Value *> Args);
public:
/// Builds the TTI for function \p F: the data layout comes from F's parent
/// module, the subtarget from \p TM, and the lowering from that subtarget.
explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
TLI(ST->getTargetLowering()) {}
bool areInlineCompatible(const Function *Caller,
const Function *Callee) const;
/// \name Scalar TTI Implementations
/// @{
// Keep the base-class overloads visible alongside the ones declared here.
using BaseT::getIntImmCost;
int getIntImmCost(int64_t Val);
int getIntImmCost(const APInt &Imm, Type *Ty);
int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
Type *Ty);
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
/// @}
/// \name Vector TTI Implementations
/// @{
/// Always reports interleaved-access vectorization as enabled.
bool enableInterleavedAccessVectorization() { return true; }
/// Returns 32 for vector registers when the subtarget has NEON (0 otherwise),
/// and 31 for non-vector registers.
unsigned getNumberOfRegisters(bool Vector) {
if (Vector) {
if (ST->hasNEON())
return 32;
return 0;
}
return 31;
}
/// Returns 128 for vector registers when the subtarget has NEON (0
/// otherwise), and 64 for non-vector registers.
unsigned getRegisterBitWidth(bool Vector) const {
if (Vector) {
if (ST->hasNEON())
return 128;
return 0;
}
return 64;
}
/// Forwards to the subtarget's minimum vector register bit width.
unsigned getMinVectorRegisterBitWidth() {
return ST->getMinVectorRegisterBitWidth();
}
unsigned getMaxInterleaveFactor(unsigned VF);
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
const Instruction *I = nullptr);
int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
unsigned Index);
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
int getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
ArrayRef<const Value *> Args = ArrayRef<const Value *>());
int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr);
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
const Instruction *I = nullptr);
+ TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
+ bool IsZeroCmp) const;
+
int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
unsigned AddressSpace, const Instruction *I = nullptr);
int getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);
Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
Type *ExpectedType);
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info);
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, unsigned Alignment,
unsigned AddressSpace,
bool UseMaskForCond = false,
bool UseMaskForGaps = false);
bool
shouldConsiderAddressTypePromotion(const Instruction &I,
bool &AllowPromotionWithoutCommonHeader);
unsigned getCacheLineSize();
unsigned getPrefetchDistance();
unsigned getMinPrefetchStride();
unsigned getMaxPrefetchIterationsAhead();
/// Always returns false; the argument is not inspected.
bool shouldExpandReduction(const IntrinsicInst *II) const {
return false;
}
/// Returns the fixed value 2.
unsigned getGISelRematGlobalCost() const {
return 2;
}
bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const;
int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
bool IsPairwiseForm);
int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
/// @}
};
} // end namespace llvm
#endif // LLVM_LIB_TARGET_AARCH64_AARCH64TARGETTRANSFORMINFO_H
Index: vendor/llvm/dist-release_90/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
===================================================================
--- vendor/llvm/dist-release_90/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp (revision 351302)
+++ vendor/llvm/dist-release_90/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp (revision 351303)
@@ -1,5762 +1,5762 @@
//==- AArch64AsmParser.cpp - Parse AArch64 assembly to MCInst instructions -==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "MCTargetDesc/AArch64MCExpr.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "MCTargetDesc/AArch64TargetStreamer.h"
#include "TargetInfo/AArch64TargetInfo.h"
#include "AArch64InstrInfo.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCLinkerOptimizationHint.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCAsmParserExtension.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCTargetOptions.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/MC/MCValue.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/TargetParser.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cctype>
#include <cstdint>
#include <cstdio>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
using namespace llvm;
namespace {
// Category of a parsed or aliased register. Stored alongside the register
// number in both the .req alias map and register operands.
enum class RegKind {
Scalar,
NeonVector,
SVEDataVector,
SVEPredicateVector
};
// How a register operand is compared against a tied register.
// NOTE(review): presumably EqualsSuperReg/EqualsSubReg cover operands that are
// accepted as a different register class (e.g. GPR64as32 / GPR32as64) --
// confirm against regsEqual().
enum RegConstraintEqualityTy {
EqualsReg,
EqualsSuperReg,
EqualsSubReg
};
/// Target assembly parser for AArch64. Declares the operand/directive parsing
/// helpers, pulls in the TableGen-generated matcher declarations, and overrides
/// the MCTargetAsmParser entry points.
class AArch64AsmParser : public MCTargetAsmParser {
private:
  StringRef Mnemonic; ///< Instruction mnemonic.

  // Map of register aliases registered via the .req directive.
  StringMap<std::pair<RegKind, unsigned>> RegisterReqs;

  /// Summary of a MOVPRFX instruction: whether one is active, whether it is
  /// predicated, and (when applicable) its element size, destination register
  /// and governing predicate register.
  class PrefixInfo {
  public:
    /// Builds the prefix description for \p Inst. Any opcode other than the
    /// MOVPRFX variants listed below yields an inactive prefix.
    static PrefixInfo CreateFromInst(const MCInst &Inst, uint64_t TSFlags) {
      PrefixInfo Prefix;
      switch (Inst.getOpcode()) {
      case AArch64::MOVPRFX_ZZ:
        Prefix.Active = true;
        Prefix.Dst = Inst.getOperand(0).getReg();
        break;
      case AArch64::MOVPRFX_ZPmZ_B:
      case AArch64::MOVPRFX_ZPmZ_H:
      case AArch64::MOVPRFX_ZPmZ_S:
      case AArch64::MOVPRFX_ZPmZ_D:
        Prefix.Active = true;
        Prefix.Predicated = true;
        Prefix.ElementSize = TSFlags & AArch64::ElementSizeMask;
        assert(Prefix.ElementSize != AArch64::ElementSizeNone &&
               "No destructive element size set for movprfx");
        Prefix.Dst = Inst.getOperand(0).getReg();
        // Merging form: the predicate is operand 2.
        Prefix.Pg = Inst.getOperand(2).getReg();
        break;
      case AArch64::MOVPRFX_ZPzZ_B:
      case AArch64::MOVPRFX_ZPzZ_H:
      case AArch64::MOVPRFX_ZPzZ_S:
      case AArch64::MOVPRFX_ZPzZ_D:
        Prefix.Active = true;
        Prefix.Predicated = true;
        Prefix.ElementSize = TSFlags & AArch64::ElementSizeMask;
        assert(Prefix.ElementSize != AArch64::ElementSizeNone &&
               "No destructive element size set for movprfx");
        Prefix.Dst = Inst.getOperand(0).getReg();
        // Zeroing form: the predicate is operand 1.
        Prefix.Pg = Inst.getOperand(1).getReg();
        break;
      default:
        break;
      }

      return Prefix;
    }

    // Every member is initialized so that an inactive prefix never carries
    // indeterminate values: the object is copied by value (returned from
    // CreateFromInst, stored in NextPrefix) and getDstReg() has no guard.
    PrefixInfo()
        : Active(false), Predicated(false),
          ElementSize(AArch64::ElementSizeNone), Dst(0), Pg(0) {}

    bool isActive() const { return Active; }
    bool isPredicated() const { return Predicated; }
    unsigned getElementSize() const {
      assert(Predicated);
      return ElementSize;
    }
    unsigned getDstReg() const { return Dst; }
    unsigned getPgReg() const {
      assert(Predicated);
      return Pg;
    }

  private:
    bool Active;
    bool Predicated;
    unsigned ElementSize;
    unsigned Dst;
    unsigned Pg;
  } NextPrefix;

  /// Returns the streamer's target streamer, downcast to the AArch64 type.
  AArch64TargetStreamer &getTargetStreamer() {
    MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
    return static_cast<AArch64TargetStreamer &>(TS);
  }

  /// Location of the token the generic parser is currently looking at.
  SMLoc getLoc() const { return getParser().getTok().getLoc(); }

  bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands);
  void createSysAlias(uint16_t Encoding, OperandVector &Operands, SMLoc S);
  AArch64CC::CondCode parseCondCodeString(StringRef Cond);
  bool parseCondCode(OperandVector &Operands, bool invertCondCode);
  unsigned matchRegisterNameAlias(StringRef Name, RegKind Kind);
  bool parseRegister(OperandVector &Operands);
  bool parseSymbolicImmVal(const MCExpr *&ImmVal);
  bool parseNeonVectorList(OperandVector &Operands);
  bool parseOptionalMulOperand(OperandVector &Operands);
  bool parseOperand(OperandVector &Operands, bool isCondCode,
                    bool invertCondCode);

  bool showMatchError(SMLoc Loc, unsigned ErrCode, uint64_t ErrorInfo,
                      OperandVector &Operands);

  bool parseDirectiveArch(SMLoc L);
  bool parseDirectiveArchExtension(SMLoc L);
  bool parseDirectiveCPU(SMLoc L);
  bool parseDirectiveInst(SMLoc L);

  bool parseDirectiveTLSDescCall(SMLoc L);

  bool parseDirectiveLOH(StringRef LOH, SMLoc L);
  bool parseDirectiveLtorg(SMLoc L);

  bool parseDirectiveReq(StringRef Name, SMLoc L);
  bool parseDirectiveUnreq(SMLoc L);
  bool parseDirectiveCFINegateRAState();
  bool parseDirectiveCFIBKeyFrame();

  bool validateInstruction(MCInst &Inst, SMLoc &IDLoc,
                           SmallVectorImpl<SMLoc> &Loc);
  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                               OperandVector &Operands, MCStreamer &Out,
                               uint64_t &ErrorInfo,
                               bool MatchingInlineAsm) override;
/// @name Auto-generated Match Functions
/// {

#define GET_ASSEMBLER_HEADER
#include "AArch64GenAsmMatcher.inc"

  /// }

  OperandMatchResultTy tryParseScalarRegister(unsigned &Reg);
  OperandMatchResultTy tryParseVectorRegister(unsigned &Reg, StringRef &Kind,
                                              RegKind MatchKind);
  OperandMatchResultTy tryParseOptionalShiftExtend(OperandVector &Operands);
  OperandMatchResultTy tryParseBarrierOperand(OperandVector &Operands);
  OperandMatchResultTy tryParseMRSSystemRegister(OperandVector &Operands);
  OperandMatchResultTy tryParseSysReg(OperandVector &Operands);
  OperandMatchResultTy tryParseSysCROperand(OperandVector &Operands);
  template <bool IsSVEPrefetch = false>
  OperandMatchResultTy tryParsePrefetch(OperandVector &Operands);
  OperandMatchResultTy tryParsePSBHint(OperandVector &Operands);
  OperandMatchResultTy tryParseBTIHint(OperandVector &Operands);
  OperandMatchResultTy tryParseAdrpLabel(OperandVector &Operands);
  OperandMatchResultTy tryParseAdrLabel(OperandVector &Operands);
  template<bool AddFPZeroAsLiteral>
  OperandMatchResultTy tryParseFPImm(OperandVector &Operands);
  OperandMatchResultTy tryParseImmWithOptionalShift(OperandVector &Operands);
  OperandMatchResultTy tryParseGPR64sp0Operand(OperandVector &Operands);
  bool tryParseNeonVectorRegister(OperandVector &Operands);
  OperandMatchResultTy tryParseVectorIndex(OperandVector &Operands);
  OperandMatchResultTy tryParseGPRSeqPair(OperandVector &Operands);
  template <bool ParseShiftExtend,
            RegConstraintEqualityTy EqTy = RegConstraintEqualityTy::EqualsReg>
  OperandMatchResultTy tryParseGPROperand(OperandVector &Operands);
  template <bool ParseShiftExtend, bool ParseSuffix>
  OperandMatchResultTy tryParseSVEDataVector(OperandVector &Operands);
  OperandMatchResultTy tryParseSVEPredicateVector(OperandVector &Operands);
  template <RegKind VectorKind>
  OperandMatchResultTy tryParseVectorList(OperandVector &Operands,
                                          bool ExpectMatch = false);
  OperandMatchResultTy tryParseSVEPattern(OperandVector &Operands);

public:
  enum AArch64MatchResultTy {
    Match_InvalidSuffix = FIRST_TARGET_MATCH_RESULT_TY,
#define GET_OPERAND_DIAGNOSTIC_TYPES
#include "AArch64GenAsmMatcher.inc"
  };
  /// True when the target options name the "ilp32" ABI.
  bool IsILP32;

  /// Sets up the parser: records the ABI, attaches to \p Parser, makes sure
  /// the streamer has a target streamer, registers the data-directive
  /// aliases, and computes the available feature set from the subtarget.
  AArch64AsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
                   const MCInstrInfo &MII, const MCTargetOptions &Options)
    : MCTargetAsmParser(Options, STI, MII) {
    IsILP32 = Options.getABIName() == "ilp32";
    MCAsmParserExtension::Initialize(Parser);
    MCStreamer &S = getParser().getStreamer();
    // NOTE(review): the result of `new` is intentionally not stored here;
    // presumably the target streamer registers itself with S -- confirm.
    if (S.getTargetStreamer() == nullptr)
      new AArch64TargetStreamer(S);

    // Alias .hword/.word/.[dx]word to the target-independent
    // .2byte/.4byte/.8byte directives as they have the same form and
    // semantics:
    ///  ::= (.hword | .word | .dword | .xword ) [ expression (, expression)* ]
    Parser.addAliasForDirective(".hword", ".2byte");
    Parser.addAliasForDirective(".word", ".4byte");
    Parser.addAliasForDirective(".dword", ".8byte");
    Parser.addAliasForDirective(".xword", ".8byte");

    // Initialize the set of available features.
    setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
  }

  bool regsEqual(const MCParsedAsmOperand &Op1,
                 const MCParsedAsmOperand &Op2) const override;
  bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
                        SMLoc NameLoc, OperandVector &Operands) override;
  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
  bool ParseDirective(AsmToken DirectiveID) override;
  unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
                                      unsigned Kind) override;

  static bool classifySymbolRef(const MCExpr *Expr,
                                AArch64MCExpr::VariantKind &ELFRefKind,
                                MCSymbolRefExpr::VariantKind &DarwinRefKind,
                                int64_t &Addend);
};
/// AArch64Operand - Instances of this class represent a parsed AArch64 machine
/// instruction.
class AArch64Operand : public MCParsedAsmOperand {
private:
enum KindTy {
k_Immediate,
k_ShiftedImm,
k_CondCode,
k_Register,
k_VectorList,
k_VectorIndex,
k_Token,
k_SysReg,
k_SysCR,
k_Prefetch,
k_ShiftExtend,
k_FPImm,
k_Barrier,
k_PSBHint,
k_BTIHint,
} Kind;
SMLoc StartLoc, EndLoc;
struct TokOp {
const char *Data;
unsigned Length;
bool IsSuffix; // Is the operand actually a suffix on the mnemonic.
};
// Separate shift/extend operand.
struct ShiftExtendOp {
AArch64_AM::ShiftExtendType Type;
unsigned Amount;
bool HasExplicitAmount;
};
struct RegOp {
unsigned RegNum;
RegKind Kind;
int ElementWidth;
// The register may be allowed as a different register class,
// e.g. for GPR64as32 or GPR32as64.
RegConstraintEqualityTy EqualityTy;
// In some cases the shift/extend needs to be explicitly parsed together
// with the register, rather than as a separate operand. This is needed
// for addressing modes where the instruction as a whole dictates the
// scaling/extend, rather than specific bits in the instruction.
// By parsing them as a single operand, we avoid the need to pass an
// extra operand in all CodeGen patterns (because all operands need to
// have an associated value), and we avoid the need to update TableGen to
// accept operands that have no associated bits in the instruction.
//
// An added benefit of parsing them together is that the assembler
// can give a sensible diagnostic if the scaling is not correct.
//
// The default is 'lsl #0' (HasExplicitAmount = false) if no
// ShiftExtend is specified.
ShiftExtendOp ShiftExtend;
};
struct VectorListOp {
unsigned RegNum;
unsigned Count;
unsigned NumElements;
unsigned ElementWidth;
RegKind RegisterKind;
};
struct VectorIndexOp {
unsigned Val;
};
struct ImmOp {
const MCExpr *Val;
};
struct ShiftedImmOp {
const MCExpr *Val;
unsigned ShiftAmount;
};
struct CondCodeOp {
AArch64CC::CondCode Code;
};
struct FPImmOp {
uint64_t Val; // APFloat value bitcasted to uint64_t.
bool IsExact; // describes whether parsed value was exact.
};
struct BarrierOp {
const char *Data;
unsigned Length;
unsigned Val; // Not the enum since not all values have names.
};
struct SysRegOp {
const char *Data;
unsigned Length;
uint32_t MRSReg;
uint32_t MSRReg;
uint32_t PStateField;
};
struct SysCRImmOp {
unsigned Val;
};
struct PrefetchOp {
const char *Data;
unsigned Length;
unsigned Val;
};
struct PSBHintOp {
const char *Data;
unsigned Length;
unsigned Val;
};
struct BTIHintOp {
const char *Data;
unsigned Length;
unsigned Val;
};
struct ExtendOp {
unsigned Val;
};
union {
struct TokOp Tok;
struct RegOp Reg;
struct VectorListOp VectorList;
struct VectorIndexOp VectorIndex;
struct ImmOp Imm;
struct ShiftedImmOp ShiftedImm;
struct CondCodeOp CondCode;
struct FPImmOp FPImm;
struct BarrierOp Barrier;
struct SysRegOp SysReg;
struct SysCRImmOp SysCRImm;
struct PrefetchOp Prefetch;
struct PSBHintOp PSBHint;
struct BTIHintOp BTIHint;
struct ShiftExtendOp ShiftExtend;
};
// Keep the MCContext around as the MCExprs may need to be manipulated during
// the add<>Operands() calls.
MCContext &Ctx;
public:
AArch64Operand(KindTy K, MCContext &Ctx) : Kind(K), Ctx(Ctx) {}
/// Copy constructor. Copies the kind, the source locations and the context
/// reference, then copies only the union member selected by the kind.
/// Note that the MCParsedAsmOperand base is default-constructed here rather
/// than copy-constructed from \p o.
AArch64Operand(const AArch64Operand &o) : MCParsedAsmOperand(), Ctx(o.Ctx) {
Kind = o.Kind;
StartLoc = o.StartLoc;
EndLoc = o.EndLoc;
// One case per KindTy enumerator; each copies the matching union member.
switch (Kind) {
case k_Token:
Tok = o.Tok;
break;
case k_Immediate:
Imm = o.Imm;
break;
case k_ShiftedImm:
ShiftedImm = o.ShiftedImm;
break;
case k_CondCode:
CondCode = o.CondCode;
break;
case k_FPImm:
FPImm = o.FPImm;
break;
case k_Barrier:
Barrier = o.Barrier;
break;
case k_Register:
Reg = o.Reg;
break;
case k_VectorList:
VectorList = o.VectorList;
break;
case k_VectorIndex:
VectorIndex = o.VectorIndex;
break;
case k_SysReg:
SysReg = o.SysReg;
break;
case k_SysCR:
SysCRImm = o.SysCRImm;
break;
case k_Prefetch:
Prefetch = o.Prefetch;
break;
case k_PSBHint:
PSBHint = o.PSBHint;
break;
case k_BTIHint:
BTIHint = o.BTIHint;
break;
case k_ShiftExtend:
ShiftExtend = o.ShiftExtend;
break;
}
}
/// getStartLoc - Get the location of the first token of this operand.
SMLoc getStartLoc() const override { return StartLoc; }
/// getEndLoc - Get the location of the last token of this operand.
SMLoc getEndLoc() const override { return EndLoc; }
StringRef getToken() const {
assert(Kind == k_Token && "Invalid access!");
return StringRef(Tok.Data, Tok.Length);
}
bool isTokenSuffix() const {
assert(Kind == k_Token && "Invalid access!");
return Tok.IsSuffix;
}
const MCExpr *getImm() const {
assert(Kind == k_Immediate && "Invalid access!");
return Imm.Val;
}
const MCExpr *getShiftedImmVal() const {
assert(Kind == k_ShiftedImm && "Invalid access!");
return ShiftedImm.Val;
}
unsigned getShiftedImmShift() const {
assert(Kind == k_ShiftedImm && "Invalid access!");
return ShiftedImm.ShiftAmount;
}
AArch64CC::CondCode getCondCode() const {
assert(Kind == k_CondCode && "Invalid access!");
return CondCode.Code;
}
APFloat getFPImm() const {
assert (Kind == k_FPImm && "Invalid access!");
return APFloat(APFloat::IEEEdouble(), APInt(64, FPImm.Val, true));
}
bool getFPImmIsExact() const {
assert (Kind == k_FPImm && "Invalid access!");
return FPImm.IsExact;
}
unsigned getBarrier() const {
assert(Kind == k_Barrier && "Invalid access!");
return Barrier.Val;
}
StringRef getBarrierName() const {
assert(Kind == k_Barrier && "Invalid access!");
return StringRef(Barrier.Data, Barrier.Length);
}
unsigned getReg() const override {
assert(Kind == k_Register && "Invalid access!");
return Reg.RegNum;
}
RegConstraintEqualityTy getRegEqualityTy() const {
assert(Kind == k_Register && "Invalid access!");
return Reg.EqualityTy;
}
unsigned getVectorListStart() const {
assert(Kind == k_VectorList && "Invalid access!");
return VectorList.RegNum;
}
unsigned getVectorListCount() const {
assert(Kind == k_VectorList && "Invalid access!");
return VectorList.Count;
}
unsigned getVectorIndex() const {
assert(Kind == k_VectorIndex && "Invalid access!");
return VectorIndex.Val;
}
StringRef getSysReg() const {
assert(Kind == k_SysReg && "Invalid access!");
return StringRef(SysReg.Data, SysReg.Length);
}
unsigned getSysCR() const {
assert(Kind == k_SysCR && "Invalid access!");
return SysCRImm.Val;
}
unsigned getPrefetch() const {
assert(Kind == k_Prefetch && "Invalid access!");
return Prefetch.Val;
}
unsigned getPSBHint() const {
assert(Kind == k_PSBHint && "Invalid access!");
return PSBHint.Val;
}
StringRef getPSBHintName() const {
assert(Kind == k_PSBHint && "Invalid access!");
return StringRef(PSBHint.Data, PSBHint.Length);
}
unsigned getBTIHint() const {
assert(Kind == k_BTIHint && "Invalid access!");
return BTIHint.Val;
}
StringRef getBTIHintName() const {
assert(Kind == k_BTIHint && "Invalid access!");
return StringRef(BTIHint.Data, BTIHint.Length);
}
StringRef getPrefetchName() const {
assert(Kind == k_Prefetch && "Invalid access!");
return StringRef(Prefetch.Data, Prefetch.Length);
}
/// Shift/extend type of this operand. Valid for a standalone shift/extend
/// operand and for a register operand (which embeds one); any other kind is
/// an invalid access.
AArch64_AM::ShiftExtendType getShiftExtendType() const {
  switch (Kind) {
  case k_ShiftExtend:
    return ShiftExtend.Type;
  case k_Register:
    return Reg.ShiftExtend.Type;
  default:
    break;
  }
  llvm_unreachable("Invalid access!");
}
/// Shift/extend amount of this operand. Valid for a standalone shift/extend
/// operand and for a register operand (which embeds one); any other kind is
/// an invalid access.
unsigned getShiftExtendAmount() const {
  switch (Kind) {
  case k_ShiftExtend:
    return ShiftExtend.Amount;
  case k_Register:
    return Reg.ShiftExtend.Amount;
  default:
    break;
  }
  llvm_unreachable("Invalid access!");
}
/// Whether the shift/extend amount was written explicitly. Valid for a
/// standalone shift/extend operand and for a register operand (which embeds
/// one); any other kind is an invalid access.
bool hasShiftExtendAmount() const {
  switch (Kind) {
  case k_ShiftExtend:
    return ShiftExtend.HasExplicitAmount;
  case k_Register:
    return Reg.ShiftExtend.HasExplicitAmount;
  default:
    break;
  }
  llvm_unreachable("Invalid access!");
}
bool isImm() const override { return Kind == k_Immediate; }
bool isMem() const override { return false; }
bool isUImm6() const {
if (!isImm())
return false;
const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
if (!MCE)
return false;
int64_t Val = MCE->getValue();
return (Val >= 0 && Val < 64);
}
template <int Width> bool isSImm() const { return isSImmScaled<Width, 1>(); }
template <int Bits, int Scale> DiagnosticPredicate isSImmScaled() const {
return isImmScaled<Bits, Scale>(true);
}
template <int Bits, int Scale> DiagnosticPredicate isUImmScaled() const {
return isImmScaled<Bits, Scale>(false);
}
/// Classifies a constant immediate against a Bits-wide field scaled by Scale.
/// \param Signed  treat the field as two's-complement when true.
/// \returns NoMatch when the operand is not a constant immediate, Match when
///          the value lies in range and is a multiple of Scale, and NearMatch
///          otherwise.
template <int Bits, int Scale>
DiagnosticPredicate isImmScaled(bool Signed) const {
  const MCConstantExpr *ConstImm =
      isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
  if (!ConstImm)
    return DiagnosticPredicateTy::NoMatch;

  // A signed field spends one bit on the sign, leaving Bits - 1 for magnitude.
  const int64_t MagnitudeBits = Signed ? Bits - 1 : Bits;
  const int64_t Highest = ((int64_t(1) << MagnitudeBits) - 1) * Scale;
  const int64_t Lowest = Signed ? (int64_t(1) << MagnitudeBits) * -Scale : 0;

  const int64_t V = ConstImm->getValue();
  const bool InRange = V >= Lowest && V <= Highest;
  if (InRange && (V % Scale) == 0)
    return DiagnosticPredicateTy::Match;
  return DiagnosticPredicateTy::NearMatch;
}
/// Classifies the operand as an SVE pattern immediate.
/// \returns NoMatch when the operand is not a constant immediate, Match when
///          the value is in [0, 31], and NearMatch otherwise.
DiagnosticPredicate isSVEPattern() const {
  const MCConstantExpr *ConstImm =
      isImm() ? dyn_cast<MCConstantExpr>(getImm()) : nullptr;
  if (!ConstImm)
    return DiagnosticPredicateTy::NoMatch;

  const int64_t V = ConstImm->getValue();
  if (V < 0 || V >= 32)
    return DiagnosticPredicateTy::NearMatch;
  return DiagnosticPredicateTy::Match;
}
/// Decides whether a non-constant expression is acceptable as an unsigned
/// 12-bit offset, based on the relocation modifier it carries.
bool isSymbolicUImm12Offset(const MCExpr *Expr) const {
  AArch64MCExpr::VariantKind ELFRefKind;
  MCSymbolRefExpr::VariantKind DarwinRefKind;
  int64_t Addend;
  const bool Classified = AArch64AsmParser::classifySymbolRef(
      Expr, ELFRefKind, DarwinRefKind, Addend);

  // If we don't understand the expression, assume the best and
  // let the fixup and relocation code deal with it.
  if (!Classified)
    return true;

  const bool IsLow12Style =
      DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF ||
      ELFRefKind == AArch64MCExpr::VK_LO12 ||
      ELFRefKind == AArch64MCExpr::VK_GOT_LO12 ||
      ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 ||
      ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC ||
      ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 ||
      ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC ||
      ELFRefKind == AArch64MCExpr::VK_GOTTPREL_LO12_NC ||
      ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12 ||
      ELFRefKind == AArch64MCExpr::VK_SECREL_LO12 ||
      ELFRefKind == AArch64MCExpr::VK_SECREL_HI12;

  // Note that we don't range-check the addend. It's adjusted modulo page
  // size when converted, so there is no "out of range" condition when using
  // @pageoff.
  if (IsLow12Style)
    return true;

  // @gotpageoff/@tlvppageoff can only be used directly, not with an addend.
  const bool IsGotOrTlvpPageOff =
      DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF ||
      DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF;
  return IsGotOrTlvpPageOff && Addend == 0;
}
template <int Scale> bool isUImm12Offset() const {
if (!isImm())
return false;
const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
if (!MCE)
return isSymbolicUImm12Offset(getImm());
int64_t Val = MCE->getValue();
return (Val % Scale) == 0 && Val >= 0 && (Val / Scale) < 0x1000;
}
template <int N, int M>
bool isImmInRange() const {
if (!isImm())
return false;
const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
if (!MCE)
return false;
int64_t Val = MCE->getValue();
return (Val >= N && Val <= M);
}
// NOTE: Also used for isLogicalImmNot as anything that can be represented as
// a logical immediate can always be represented when inverted.
// Returns true when the operand is a constant immediate that fits in T (read
// either as signed or as unsigned) and AArch64_AM::isLogicalImmediate accepts
// its unsigned, T-width form for a register of sizeof(T) * 8 bits.
template <typename T>
bool isLogicalImm() const {
if (!isImm())
return false;
const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
if (!MCE)
return false;
int64_t Val = MCE->getValue();
// Narrow Val to T's width as signed and as unsigned, then widen each result
// back to 64 bits.
int64_t SVal = typename std::make_signed<T>::type(Val);
int64_t UVal = typename std::make_unsigned<T>::type(Val);
// Reject values that survive neither round trip, i.e. that do not fit in T
// under either interpretation.
if (Val != SVal && Val != UVal)
return false;
return AArch64_AM::isLogicalImmediate(UVal, sizeof(T) * 8);
}
bool isShiftedImm() const { return Kind == k_ShiftedImm; }
/// Splits the operand into an (immediate, shift) pair.
///  - A shifted immediate whose shift equals Width and whose value is constant
///    yields (value, Width).
///  - A plain constant immediate that is non-zero and has its low Width bits
///    clear yields (value >> Width, Width); any other plain constant yields
///    (value, 0).
///  - Everything else yields an empty Optional.
template <unsigned Width>
Optional<std::pair<int64_t, unsigned> > getShiftedVal() const {
  if (isShiftedImm() && Width == getShiftedImmShift()) {
    if (auto *ShiftedCE = dyn_cast<MCConstantExpr>(getShiftedImmVal()))
      return std::make_pair(ShiftedCE->getValue(), Width);
  }

  if (!isImm())
    return {};
  auto *PlainCE = dyn_cast<MCConstantExpr>(getImm());
  if (!PlainCE)
    return {};

  const int64_t Val = PlainCE->getValue();
  // Shifting out and back in must reproduce the value exactly.
  const bool LowBitsClear =
      (uint64_t(Val >> Width) << Width) == uint64_t(Val);
  if (Val != 0 && LowBitsClear)
    return std::make_pair(Val >> Width, Width);
  return std::make_pair(Val, 0u);
}
bool isAddSubImm() const {
if (!isShiftedImm() && !isImm())
return false;
const MCExpr *Expr;
// An ADD/SUB shifter is either 'lsl #0' or 'lsl #12'.
if (isShiftedImm()) {
unsigned Shift = ShiftedImm.ShiftAmount;
Expr = ShiftedImm.Val;
if (Shift != 0 && Shift != 12)
return false;
} else {
Expr = getImm();
}
AArch64MCExpr::VariantKind ELFRefKind;
MCSymbolRefExpr::VariantKind DarwinRefKind;
int64_t Addend;
if (AArch64AsmParser::classifySymbolRef(Expr, ELFRefKind,
DarwinRefKind, Addend)) {
return DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF
|| DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF
|| (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF && Addend == 0)
|| ELFRefKind == AArch64MCExpr::VK_LO12
|| ELFRefKind == AArch64MCExpr::VK_DTPREL_HI12
|| ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12
|| ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC
|| ELFRefKind == AArch64MCExpr::VK_TPREL_HI12
|| ELFRefKind == AArch64MCExpr::VK_TPREL_LO12
|| ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC
|| ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12
|| ELFRefKind == AArch64MCExpr::VK_SECREL_HI12
|| ELFRefKind == AArch64MCExpr::VK_SECREL_LO12;
}
// If it's a constant, it should be a real immediate in range.
if (auto ShiftedVal = getShiftedVal<12>())
return ShiftedVal->first >= 0 && ShiftedVal->first <= 0xfff;
// If it's an expression, we hope for the best and let the fixup/relocation
// code deal with it.
return true;
}
/// True when the operand is a constant ADD/SUB immediate whose 12-bit payload
/// (after an optional shift by 12) is negative with magnitude at most 0xfff.
bool isAddSubImmNeg() const {
  if (!isShiftedImm() && !isImm())
    return false;

  // Otherwise it should be a real negative immediate in range.
  // Compare against -0xfff directly instead of negating the value: negating
  // INT64_MIN would overflow.
  if (auto ShiftedVal = getShiftedVal<12>())
    return ShiftedVal->first < 0 && ShiftedVal->first >= -0xfff;

  return false;
}
// Signed value in the range -128 to +127. For element widths of
// 16 bits or higher it may also be a signed multiple of 256 in the
// range -32768 to +32512.
// For element-width of 8 bits a range of -128 to 255 is accepted,
// since a copy of a byte can be either signed/unsigned.
//
// Yields NoMatch for operands that are neither shifted immediates nor
// constant immediates, Match when the value is accepted, NearMatch otherwise.
template <typename T>
DiagnosticPredicate isSVECpyImm() const {
  const bool IsConstImm = isImm() && isa<MCConstantExpr>(getImm());
  if (!isShiftedImm() && !IsConstImm)
    return DiagnosticPredicateTy::NoMatch;

  const bool IsByte =
      std::is_same<int8_t, typename std::make_signed<T>::type>::value;

  auto Split = getShiftedVal<8>();
  if (!Split)
    return DiagnosticPredicateTy::NearMatch;

  // Byte elements cannot take a shifted immediate.
  if (IsByte && Split->second)
    return DiagnosticPredicateTy::NearMatch;

  if (AArch64_AM::isSVECpyImm<T>(uint64_t(Split->first) << Split->second))
    return DiagnosticPredicateTy::Match;
  return DiagnosticPredicateTy::NearMatch;
}
// Unsigned value in the range 0 to 255. For element widths of
// 16 bits or higher it may also be an unsigned multiple of 256 in the
// range 0 to 65280.
//
// Yields NoMatch for operands that are neither shifted immediates nor
// constant immediates, Match when the value is accepted, NearMatch otherwise.
template <typename T> DiagnosticPredicate isSVEAddSubImm() const {
  if (!isShiftedImm() && (!isImm() || !isa<MCConstantExpr>(getImm())))
    return DiagnosticPredicateTy::NoMatch;

  bool IsByte =
      std::is_same<int8_t, typename std::make_signed<T>::type>::value;
  // Shift in the unsigned domain, as isSVECpyImm does: the split value may be
  // negative, and left-shifting a negative signed value is undefined.
  if (auto ShiftedImm = getShiftedVal<8>())
    if (!(IsByte && ShiftedImm->second) &&
        AArch64_AM::isSVEAddSubImm<T>(uint64_t(ShiftedImm->first)
                                      << ShiftedImm->second))
      return DiagnosticPredicateTy::Match;

  return DiagnosticPredicateTy::NearMatch;
}
/// Match only when the operand is a logical immediate for T that is not also
/// accepted by isSVECpyImm<T>(); NoMatch in every other case.
template <typename T> DiagnosticPredicate isSVEPreferredLogicalImm() const {
  if (!isLogicalImm<T>())
    return DiagnosticPredicateTy::NoMatch;
  if (isSVECpyImm<T>())
    return DiagnosticPredicateTy::NoMatch;
  return DiagnosticPredicateTy::Match;
}
bool isCondCode() const { return Kind == k_CondCode; }
bool isSIMDImmType10() const {
if (!isImm())
return false;
const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
if (!MCE)
return false;
return AArch64_AM::isAdvSIMDModImmType10(MCE->getValue());
}
/// True when the operand can be an N-bit, word-scaled branch target.
/// Non-constant immediates are accepted as-is; constants must be 4-byte
/// aligned and lie within the signed N-bit range scaled by 4.
template<int N>
bool isBranchTarget() const {
  // N is a template constant, so check it at compile time rather than at run
  // time after the early returns.
  static_assert(N > 0, "Branch target immediate cannot be 0 bits!");
  if (!isImm())
    return false;
  const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
  if (!MCE)
    return true;
  int64_t Val = MCE->getValue();
  if (Val & 0x3)
    return false;
  // Do the range arithmetic in 64 bits so it cannot overflow `int`.
  const int64_t HalfRange = int64_t(1) << (N - 1);
  return Val >= -HalfRange * 4 && Val <= (HalfRange - 1) * 4;
}
/// True when the operand is an immediate that classifies as a symbol
/// reference with no Darwin modifier and an ELF modifier listed in
/// \p AllowedModifiers.
bool
isMovWSymbol(ArrayRef<AArch64MCExpr::VariantKind> AllowedModifiers) const {
  if (!isImm())
    return false;

  AArch64MCExpr::VariantKind ELFRefKind;
  MCSymbolRefExpr::VariantKind DarwinRefKind;
  int64_t Addend;
  const bool Classified = AArch64AsmParser::classifySymbolRef(
      getImm(), ELFRefKind, DarwinRefKind, Addend);
  if (!Classified || DarwinRefKind != MCSymbolRefExpr::VK_None)
    return false;

  for (const AArch64MCExpr::VariantKind Allowed : AllowedModifiers)
    if (Allowed == ELFRefKind)
      return true;
  return false;
}
bool isMovZSymbolG3() const {
return isMovWSymbol(AArch64MCExpr::VK_ABS_G3);
}
bool isMovZSymbolG2() const {
return isMovWSymbol({AArch64MCExpr::VK_ABS_G2, AArch64MCExpr::VK_ABS_G2_S,
AArch64MCExpr::VK_TPREL_G2,
AArch64MCExpr::VK_DTPREL_G2});
}
bool isMovZSymbolG1() const {
return isMovWSymbol({
AArch64MCExpr::VK_ABS_G1, AArch64MCExpr::VK_ABS_G1_S,
AArch64MCExpr::VK_GOTTPREL_G1, AArch64MCExpr::VK_TPREL_G1,
AArch64MCExpr::VK_DTPREL_G1,
});
}
bool isMovZSymbolG0() const {
return isMovWSymbol({AArch64MCExpr::VK_ABS_G0, AArch64MCExpr::VK_ABS_G0_S,
AArch64MCExpr::VK_TPREL_G0,
AArch64MCExpr::VK_DTPREL_G0});
}
bool isMovKSymbolG3() const {
return isMovWSymbol(AArch64MCExpr::VK_ABS_G3);
}
bool isMovKSymbolG2() const {
return isMovWSymbol(AArch64MCExpr::VK_ABS_G2_NC);
}
bool isMovKSymbolG1() const {
return isMovWSymbol({AArch64MCExpr::VK_ABS_G1_NC,
AArch64MCExpr::VK_TPREL_G1_NC,
AArch64MCExpr::VK_DTPREL_G1_NC});
}
bool isMovKSymbolG0() const {
return isMovWSymbol(
{AArch64MCExpr::VK_ABS_G0_NC, AArch64MCExpr::VK_GOTTPREL_G0_NC,
AArch64MCExpr::VK_TPREL_G0_NC, AArch64MCExpr::VK_DTPREL_G0_NC});
}
template<int RegWidth, int Shift>
bool isMOVZMovAlias() const {
if (!isImm()) return false;
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
if (!CE) return false;
uint64_t Value = CE->getValue();
return AArch64_AM::isMOVZMovAlias(Value, Shift, RegWidth);
}
template<int RegWidth, int Shift>
bool isMOVNMovAlias() const {
if (!isImm()) return false;
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
if (!CE) return false;
uint64_t Value = CE->getValue();
return AArch64_AM::isMOVNMovAlias(Value, Shift, RegWidth);
}
bool isFPImm() const {
return Kind == k_FPImm &&
AArch64_AM::getFP64Imm(getFPImm().bitcastToAPInt()) != -1;
}
/// True when the operand kind is k_Barrier.
bool isBarrier() const {
  return k_Barrier == Kind;
}
/// True when the operand kind is k_SysReg.
bool isSysReg() const {
  return k_SysReg == Kind;
}
/// True for a system-register operand whose MRSReg field is set
/// (-1U marks "not available").
bool isMRSSystemRegister() const {
  return isSysReg() && SysReg.MRSReg != -1U;
}
/// True for a system-register operand whose MSRReg field is set
/// (-1U marks "not available").
bool isMSRSystemRegister() const {
  return isSysReg() && SysReg.MSRReg != -1U;
}
/// True for a system-register operand whose PState field is one of PAN, DIT,
/// UAO or SSBS.
bool isSystemPStateFieldWithImm0_1() const {
  if (!isSysReg())
    return false;
  const auto Field = SysReg.PStateField;
  if (Field == AArch64PState::PAN)
    return true;
  if (Field == AArch64PState::DIT)
    return true;
  if (Field == AArch64PState::UAO)
    return true;
  return Field == AArch64PState::SSBS;
}
/// True for a system-register operand with a valid PState field that is not
/// already claimed by isSystemPStateFieldWithImm0_1.
bool isSystemPStateFieldWithImm0_15() const {
  if (!isSysReg())
    return false;
  if (isSystemPStateFieldWithImm0_1())
    return false;
  return SysReg.PStateField != -1U;
}
/// True when the operand kind is k_Register.
bool isReg() const override {
  return k_Register == Kind;
}
/// True for a register operand of kind RegKind::Scalar.
bool isScalarReg() const {
  if (Kind != k_Register)
    return false;
  return Reg.Kind == RegKind::Scalar;
}
/// True for a register operand of kind RegKind::NeonVector.
bool isNeonVectorReg() const {
  if (Kind != k_Register)
    return false;
  return Reg.Kind == RegKind::NeonVector;
}
/// True for a NEON vector register operand that is a member of the
/// FPR128_lo register class.
bool isNeonVectorRegLo() const {
  if (Kind != k_Register || Reg.Kind != RegKind::NeonVector)
    return false;
  const auto &LoClass = AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID];
  return LoClass.contains(Reg.RegNum);
}
template <unsigned Class> bool isSVEVectorReg() const {
RegKind RK;
switch (Class) {
case AArch64::ZPRRegClassID:
case AArch64::ZPR_3bRegClassID:
case AArch64::ZPR_4bRegClassID:
RK = RegKind::SVEDataVector;
break;
case AArch64::PPRRegClassID:
case AArch64::PPR_3bRegClassID:
RK = RegKind::SVEPredicateVector;
break;
default:
llvm_unreachable("Unsupport register class");
}
return (Kind == k_Register && Reg.Kind == RK) &&
AArch64MCRegisterClasses[Class].contains(getReg());
}
/// True for a scalar register operand that is a member of register class
/// \p Class.
template <unsigned Class> bool isFPRasZPR() const {
  if (Kind != k_Register || Reg.Kind != RegKind::Scalar)
    return false;
  return AArch64MCRegisterClasses[Class].contains(getReg());
}
/// Classifies the operand as an SVE predicate register of \p Class with the
/// given element width: NoMatch if it is not an SVE predicate register at
/// all, Match if class and width agree, NearMatch otherwise.
template <int ElementWidth, unsigned Class>
DiagnosticPredicate isSVEPredicateVectorRegOfWidth() const {
  const bool IsPredicateReg =
      Kind == k_Register && Reg.Kind == RegKind::SVEPredicateVector;
  if (!IsPredicateReg)
    return DiagnosticPredicateTy::NoMatch;

  if (!isSVEVectorReg<Class>() || Reg.ElementWidth != ElementWidth)
    return DiagnosticPredicateTy::NearMatch;

  return DiagnosticPredicateTy::Match;
}
/// Classifies the operand as an SVE data vector register of \p Class with the
/// given element width: NoMatch if it is not an SVE data vector register at
/// all, Match if class and width agree, NearMatch otherwise.
template <int ElementWidth, unsigned Class>
DiagnosticPredicate isSVEDataVectorRegOfWidth() const {
  const bool IsDataReg =
      Kind == k_Register && Reg.Kind == RegKind::SVEDataVector;
  if (!IsDataReg)
    return DiagnosticPredicateTy::NoMatch;

  if (!isSVEVectorReg<Class>() || Reg.ElementWidth != ElementWidth)
    return DiagnosticPredicateTy::NearMatch;

  return DiagnosticPredicateTy::Match;
}
/// Classifies the operand as an SVE data vector register (per
/// isSVEDataVectorRegOfWidth) that also carries the shift/extend described by
/// \p ShiftExtendTy and \p ShiftWidth. The expected shift amount is
/// Log2_32(ShiftWidth / 8).
///
/// Returns NoMatch when the register itself does not match, or in the
/// special UXTW/SXTW case described below; Match when both the shift amount
/// and the extend type agree; NearMatch otherwise.
template <int ElementWidth, unsigned Class,
AArch64_AM::ShiftExtendType ShiftExtendTy, int ShiftWidth,
bool ShiftWidthAlwaysSame>
DiagnosticPredicate isSVEDataVectorRegWithShiftExtend() const {
// The register part must match exactly before the modifier is considered.
auto VectorMatch = isSVEDataVectorRegOfWidth<ElementWidth, Class>();
if (!VectorMatch.isMatch())
return DiagnosticPredicateTy::NoMatch;
// Give a more specific diagnostic when the user has explicitly typed in
// a shift-amount that does not match what is expected, but for which
// there is also an unscaled addressing mode (e.g. sxtw/uxtw).
bool MatchShift = getShiftExtendAmount() == Log2_32(ShiftWidth / 8);
if (!MatchShift && (ShiftExtendTy == AArch64_AM::UXTW ||
ShiftExtendTy == AArch64_AM::SXTW) &&
!ShiftWidthAlwaysSame && hasShiftExtendAmount() && ShiftWidth == 8)
return DiagnosticPredicateTy::NoMatch;
// Full match requires both the amount and the extend type to agree.
if (MatchShift && ShiftExtendTy == getShiftExtendType())
return DiagnosticPredicateTy::Match;
return DiagnosticPredicateTy::NearMatch;
}
/// True for a scalar register operand that is a member of the GPR64 register
/// class.
bool isGPR32as64() const {
  if (Kind != k_Register || Reg.Kind != RegKind::Scalar)
    return false;
  const auto &GPR64Class = AArch64MCRegisterClasses[AArch64::GPR64RegClassID];
  return GPR64Class.contains(Reg.RegNum);
}
/// True for a scalar register operand that is a member of the GPR32 register
/// class.
bool isGPR64as32() const {
  if (Kind != k_Register || Reg.Kind != RegKind::Scalar)
    return false;
  const auto &GPR32Class = AArch64MCRegisterClasses[AArch64::GPR32RegClassID];
  return GPR32Class.contains(Reg.RegNum);
}
/// True for a scalar register operand that is a member of the WSeqPairsClass
/// register class.
bool isWSeqPair() const {
  if (Kind != k_Register || Reg.Kind != RegKind::Scalar)
    return false;
  const auto &PairClass =
      AArch64MCRegisterClasses[AArch64::WSeqPairsClassRegClassID];
  return PairClass.contains(Reg.RegNum);
}
/// True for a scalar register operand that is a member of the XSeqPairsClass
/// register class.
bool isXSeqPair() const {
  if (Kind != k_Register || Reg.Kind != RegKind::Scalar)
    return false;
  const auto &PairClass =
      AArch64MCRegisterClasses[AArch64::XSeqPairsClassRegClassID];
  return PairClass.contains(Reg.RegNum);
}
/// Classifies a constant immediate as a complex rotation: Match when the
/// value is at most 270 and leaves \p Remainder when divided by \p Angle,
/// NearMatch for any other constant, NoMatch for non-constant operands.
template<int64_t Angle, int64_t Remainder>
DiagnosticPredicate isComplexRotation() const {
  if (!isImm())
    return DiagnosticPredicateTy::NoMatch;

  const MCConstantExpr *Const = dyn_cast<MCConstantExpr>(getImm());
  if (!Const)
    return DiagnosticPredicateTy::NoMatch;

  // Unsigned on purpose: negative inputs become huge and fail the range test.
  uint64_t Rotation = Const->getValue();
  const bool OnGrid = Rotation % Angle == Remainder;
  const bool InRange = Rotation <= 270;
  if (OnGrid && InRange)
    return DiagnosticPredicateTy::Match;

  return DiagnosticPredicateTy::NearMatch;
}
/// True for a scalar register operand that is a member of register class
/// \p RegClassID.
template <unsigned RegClassID> bool isGPR64() const {
  if (Kind != k_Register || Reg.Kind != RegKind::Scalar)
    return false;
  return AArch64MCRegisterClasses[RegClassID].contains(getReg());
}
/// Classifies the operand as a scalar register of \p RegClassID carrying an
/// LSL by Log2_32(ExtWidth / 8): NoMatch if it is not a scalar register,
/// Match if class, shift type and amount all agree, NearMatch otherwise.
template <unsigned RegClassID, int ExtWidth>
DiagnosticPredicate isGPR64WithShiftExtend() const {
  const bool IsScalarReg = Kind == k_Register && Reg.Kind == RegKind::Scalar;
  if (!IsScalarReg)
    return DiagnosticPredicateTy::NoMatch;

  if (!isGPR64<RegClassID>())
    return DiagnosticPredicateTy::NearMatch;
  if (getShiftExtendType() != AArch64_AM::LSL)
    return DiagnosticPredicateTy::NearMatch;
  if (getShiftExtendAmount() != Log2_32(ExtWidth / 8))
    return DiagnosticPredicateTy::NearMatch;

  return DiagnosticPredicateTy::Match;
}
/// True for a vector list of \p NumRegs registers of kind \p VectorKind whose
/// element count is 0, i.e. no type was attached to the list itself
/// (presumably it comes from the instruction instead).
template <RegKind VectorKind, unsigned NumRegs>
bool isImplicitlyTypedVectorList() const {
  if (Kind != k_VectorList)
    return false;
  if (VectorList.Count != NumRegs)
    return false;
  if (VectorList.NumElements != 0)
    return false;
  return VectorList.RegisterKind == VectorKind;
}
/// True for a vector list whose register count, register kind, element width
/// and element count all equal the template arguments.
template <RegKind VectorKind, unsigned NumRegs, unsigned NumElements,
          unsigned ElementWidth>
bool isTypedVectorList() const {
  return Kind == k_VectorList &&
         VectorList.Count == NumRegs &&
         VectorList.RegisterKind == VectorKind &&
         VectorList.ElementWidth == ElementWidth &&
         VectorList.NumElements == NumElements;
}
/// Classifies the operand as a vector index in [Min, Max]: NoMatch if it is
/// not a vector index, Match if in range, NearMatch if out of range.
template <int Min, int Max>
DiagnosticPredicate isVectorIndex() const {
  if (Kind != k_VectorIndex)
    return DiagnosticPredicateTy::NoMatch;
  if (VectorIndex.Val < Min || VectorIndex.Val > Max)
    return DiagnosticPredicateTy::NearMatch;
  return DiagnosticPredicateTy::Match;
}
/// True when the operand kind is k_Token.
bool isToken() const override {
  return k_Token == Kind;
}
/// True when the operand is a token whose text equals \p Str.
bool isTokenEqual(StringRef Str) const {
  if (Kind != k_Token)
    return false;
  return getToken() == Str;
}
/// True when the operand kind is k_SysCR.
bool isSysCR() const {
  return k_SysCR == Kind;
}
/// True when the operand kind is k_Prefetch.
bool isPrefetch() const {
  return k_Prefetch == Kind;
}
/// True when the operand kind is k_PSBHint.
bool isPSBHint() const {
  return k_PSBHint == Kind;
}
/// True when the operand kind is k_BTIHint.
bool isBTIHint() const {
  return k_BTIHint == Kind;
}
/// True when the operand kind is k_ShiftExtend.
bool isShiftExtend() const {
  return k_ShiftExtend == Kind;
}
bool isShifter() const {
if (!isShiftExtend())
return false;
AArch64_AM::ShiftExtendType ST = getShiftExtendType();
return (ST == AArch64_AM::LSL || ST == AArch64_AM::LSR ||
ST == AArch64_AM::ASR || ST == AArch64_AM::ROR ||
ST == AArch64_AM::MSL);
}
/// Classifies an FP-immediate operand against the exact immediate identified
/// by \p ImmEnum: NoMatch if the operand is not an FP immediate, Match if it
/// is flagged exact and is bitwise equal to the table value, NearMatch
/// otherwise.
template <unsigned ImmEnum> DiagnosticPredicate isExactFPImm() const {
  if (Kind != k_FPImm)
    return DiagnosticPredicateTy::NoMatch;
  if (!getFPImmIsExact())
    return DiagnosticPredicateTy::NearMatch;

  // Fetch the descriptor for the requested immediate from the lookup table.
  auto *Desc = AArch64ExactFPImm::lookupExactFPImmByEnum(ImmEnum);
  assert(Desc && "Unknown enum value");

  // Turn its textual representation into a double-precision value.
  APFloat Expected(APFloat::IEEEdouble());
  const auto Status =
      Expected.convertFromString(Desc->Repr, APFloat::rmTowardZero);
  if (Status != APFloat::opOK)
    llvm_unreachable("FP immediate is not exact");

  if (getFPImm().bitwiseIsEqual(Expected))
    return DiagnosticPredicateTy::Match;
  return DiagnosticPredicateTy::NearMatch;
}
/// Two-candidate form: Match if the operand matches either \p ImmA or
/// \p ImmB; otherwise the (non-matching) result of the \p ImmB check is
/// returned.
template <unsigned ImmA, unsigned ImmB>
DiagnosticPredicate isExactFPImm() const {
  if (isExactFPImm<ImmA>())
    return DiagnosticPredicateTy::Match;

  DiagnosticPredicate Second = isExactFPImm<ImmB>();
  if (Second)
    return DiagnosticPredicateTy::Match;
  return Second;
}
bool isExtend() const {
if (!isShiftExtend())
return false;
AArch64_AM::ShiftExtendType ET = getShiftExtendType();
return (ET == AArch64_AM::UXTB || ET == AArch64_AM::SXTB ||
ET == AArch64_AM::UXTH || ET == AArch64_AM::SXTH ||
ET == AArch64_AM::UXTW || ET == AArch64_AM::SXTW ||
ET == AArch64_AM::UXTX || ET == AArch64_AM::SXTX ||
ET == AArch64_AM::LSL) &&
getShiftExtendAmount() <= 4;
}
bool isExtend64() const {
if (!isExtend())
return false;
// Make sure the extend expects a 32-bit source register.
AArch64_AM::ShiftExtendType ET = getShiftExtendType();
return ET == AArch64_AM::UXTB || ET == AArch64_AM::SXTB ||
ET == AArch64_AM::UXTH || ET == AArch64_AM::SXTH ||
ET == AArch64_AM::UXTW || ET == AArch64_AM::SXTW;
}
bool isExtendLSL64() const {
if (!isExtend())
return false;
AArch64_AM::ShiftExtendType ET = getShiftExtendType();
return (ET == AArch64_AM::UXTX || ET == AArch64_AM::SXTX ||
ET == AArch64_AM::LSL) &&
getShiftExtendAmount() <= 4;
}
template<int Width> bool isMemXExtend() const {
if (!isExtend())
return false;
AArch64_AM::ShiftExtendType ET = getShiftExtendType();
return (ET == AArch64_AM::LSL || ET == AArch64_AM::SXTX) &&
(getShiftExtendAmount() == Log2_32(Width / 8) ||
getShiftExtendAmount() == 0);
}
template<int Width> bool isMemWExtend() const {
if (!isExtend())
return false;
AArch64_AM::ShiftExtendType ET = getShiftExtendType();
return (ET == AArch64_AM::UXTW || ET == AArch64_AM::SXTW) &&
(getShiftExtendAmount() == Log2_32(Width / 8) ||
getShiftExtendAmount() == 0);
}
template <unsigned width>
bool isArithmeticShifter() const {
if (!isShifter())
return false;
// An arithmetic shifter is LSL, LSR, or ASR.
AArch64_AM::ShiftExtendType ST = getShiftExtendType();
return (ST == AArch64_AM::LSL || ST == AArch64_AM::LSR ||
ST == AArch64_AM::ASR) && getShiftExtendAmount() < width;
}
template <unsigned width>
bool isLogicalShifter() const {
if (!isShifter())
return false;
// A logical shifter is LSL, LSR, ASR or ROR.
AArch64_AM::ShiftExtendType ST = getShiftExtendType();
return (ST == AArch64_AM::LSL || ST == AArch64_AM::LSR ||
ST == AArch64_AM::ASR || ST == AArch64_AM::ROR) &&
getShiftExtendAmount() < width;
}
bool isMovImm32Shifter() const {
if (!isShifter())
return false;
// A MOVi shifter is LSL of 0, 16, 32, or 48.
AArch64_AM::ShiftExtendType ST = getShiftExtendType();
if (ST != AArch64_AM::LSL)
return false;
uint64_t Val = getShiftExtendAmount();
return (Val == 0 || Val == 16);
}
bool isMovImm64Shifter() const {
if (!isShifter())
return false;
// A MOVi shifter is LSL of 0 or 16.
AArch64_AM::ShiftExtendType ST = getShiftExtendType();
if (ST != AArch64_AM::LSL)
return false;
uint64_t Val = getShiftExtendAmount();
return (Val == 0 || Val == 16 || Val == 32 || Val == 48);
}
/// True for a logical vector shifter: LSL by 0, 8, 16 or 24.
bool isLogicalVecShifter() const {
  if (!isShifter())
    return false;

  const unsigned Amount = getShiftExtendAmount();
  if (getShiftExtendType() != AArch64_AM::LSL)
    return false;
  return Amount == 0 || Amount == 8 || Amount == 16 || Amount == 24;
}
/// True for a halfword logical vector shifter: a logical vector shifter
/// (per isLogicalVecShifter) restricted to LSL by 0 or 8.
bool isLogicalVecHalfWordShifter() const {
  if (!isLogicalVecShifter())
    return false;

  const unsigned Amount = getShiftExtendAmount();
  if (getShiftExtendType() != AArch64_AM::LSL)
    return false;
  return Amount == 0 || Amount == 8;
}
/// True for a move vector shifter: MSL by 8 or 16.
bool isMoveVecShifter() const {
  if (!isShiftExtend())
    return false;

  const unsigned Amount = getShiftExtendAmount();
  if (getShiftExtendType() != AArch64_AM::MSL)
    return false;
  return Amount == 8 || Amount == 16;
}
// Fallback unscaled operands serve the LDR/STR aliases that drop back to
// LDUR/STUR when an offset is illegal for the scaled form but legal for the
// unscaled one. To keep the matcher unambiguous, the operand must be a legal
// unscaled (signed 9-bit) offset and must NOT be a legal scaled offset.
template<int Width>
bool isSImm9OffsetFB() const {
  if (!isSImm<9>())
    return false;
  return !isUImm12Offset<Width / 8>();
}
bool isAdrpLabel() const {
// Validation was handled during parsing, so we just sanity check that
// something didn't go haywire.
if (!isImm())
return false;
if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val)) {
int64_t Val = CE->getValue();
int64_t Min = - (4096 * (1LL << (21 - 1)));
int64_t Max = 4096 * ((1LL << (21 - 1)) - 1);
return (Val % 4096) == 0 && Val >= Min && Val <= Max;
}
return true;
}
bool isAdrLabel() const {
// Validation was handled during parsing, so we just sanity check that
// something didn't go haywire.
if (!isImm())
return false;
if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val)) {
int64_t Val = CE->getValue();
int64_t Min = - (1LL << (21 - 1));
int64_t Max = ((1LL << (21 - 1)) - 1);
return Val >= Min && Val <= Max;
}
return true;
}
/// Appends \p Expr to \p Inst, preferring an immediate operand: a null
/// expression becomes immediate 0, a constant expression becomes its value,
/// and anything else is added as an expression operand.
void addExpr(MCInst &Inst, const MCExpr *Expr) const {
  if (!Expr) {
    Inst.addOperand(MCOperand::createImm(0));
    return;
  }
  if (const MCConstantExpr *Const = dyn_cast<MCConstantExpr>(Expr)) {
    Inst.addOperand(MCOperand::createImm(Const->getValue()));
    return;
  }
  Inst.addOperand(MCOperand::createExpr(Expr));
}
/// Appends this operand's register to \p Inst as a single register operand.
void addRegOperands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  const MCOperand RegOp = MCOperand::createReg(getReg());
  Inst.addOperand(RegOp);
}
/// Appends the GPR32 register that sits at this GPR64 register's encoding
/// value within the GPR32 register class.
void addGPR32as64Operands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  const unsigned Reg64 = getReg();
  assert(
      AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(Reg64));

  const MCRegisterInfo *RI = Ctx.getRegisterInfo();
  const auto Encoding = RI->getEncodingValue(Reg64);
  uint32_t Reg32 =
      RI->getRegClass(AArch64::GPR32RegClassID).getRegister(Encoding);
  Inst.addOperand(MCOperand::createReg(Reg32));
}
/// Appends the GPR64 register that sits at this GPR32 register's encoding
/// value within the GPR64 register class.
void addGPR64as32Operands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  const unsigned Reg32 = getReg();
  assert(
      AArch64MCRegisterClasses[AArch64::GPR32RegClassID].contains(Reg32));

  const MCRegisterInfo *RI = Ctx.getRegisterInfo();
  const auto Encoding = RI->getEncodingValue(Reg32);
  uint32_t Reg64 =
      RI->getRegClass(AArch64::GPR64RegClassID).getRegister(Encoding);
  Inst.addOperand(MCOperand::createReg(Reg64));
}
/// Appends the Z register with the same index as this scalar FP register.
/// \p Width selects which FP register bank (B/H/S/D/Q) the operand's register
/// number is relative to; the offset from that bank's register 0 is applied
/// to Z0.
template <int Width>
void addFPRasZPRRegOperands(MCInst &Inst, unsigned N) const {
  // Same operand-count contract as every other add*Operands helper.
  assert(N == 1 && "Invalid number of operands!");
  unsigned Base;
  switch (Width) {
  case 8:   Base = AArch64::B0; break;
  case 16:  Base = AArch64::H0; break;
  case 32:  Base = AArch64::S0; break;
  case 64:  Base = AArch64::D0; break;
  case 128: Base = AArch64::Q0; break;
  default:
    llvm_unreachable("Unsupported width");
  }
  Inst.addOperand(MCOperand::createReg(AArch64::Z0 + getReg() - Base));
}
/// Appends the D register with the same index as this operand's Q register
/// (the operand must be a member of FPR128).
void addVectorReg64Operands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  assert(
      AArch64MCRegisterClasses[AArch64::FPR128RegClassID].contains(getReg()));
  const unsigned DReg = AArch64::D0 + getReg() - AArch64::Q0;
  Inst.addOperand(MCOperand::createReg(DReg));
}
/// Appends this operand's register unchanged (it must be a member of
/// FPR128).
void addVectorReg128Operands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  assert(
      AArch64MCRegisterClasses[AArch64::FPR128RegClassID].contains(getReg()));
  const MCOperand RegOp = MCOperand::createReg(getReg());
  Inst.addOperand(RegOp);
}
/// Appends this operand's register unchanged as a single register operand.
void addVectorRegLoOperands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  const MCOperand RegOp = MCOperand::createReg(getReg());
  Inst.addOperand(RegOp);
}
/// Selects the register bank for addVectorListOperands. The numeric values
/// are used directly as the row index into that function's FirstRegs table.
enum VecListIndexType {
VecListIdx_DReg = 0, // row of D-register tuples
VecListIdx_QReg = 1, // row of Q-register tuples
VecListIdx_ZReg = 2, // row of Z-register tuples
};
/// Appends the register tuple that represents this vector list.
///
/// FirstRegs is indexed by register bank (\p RegTy) and then by tuple size:
/// column 0 holds the register the list start is expressed relative to, and
/// columns 1..4 hold the first tuple register for lists of that many
/// registers. The emitted register is that first tuple register advanced by
/// the list's start offset.
template <VecListIndexType RegTy, unsigned NumRegs>
void addVectorListOperands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  static const unsigned FirstRegs[][5] = {
    /* DReg */ { AArch64::Q0,
                 AArch64::D0,       AArch64::D0_D1,
                 AArch64::D0_D1_D2, AArch64::D0_D1_D2_D3 },
    /* QReg */ { AArch64::Q0,
                 AArch64::Q0,       AArch64::Q0_Q1,
                 AArch64::Q0_Q1_Q2, AArch64::Q0_Q1_Q2_Q3 },
    /* ZReg */ { AArch64::Z0,
                 AArch64::Z0,       AArch64::Z0_Z1,
                 AArch64::Z0_Z1_Z2, AArch64::Z0_Z1_Z2_Z3 }
  };

  // Every row has only five columns, so the bound applies to all register
  // banks (not just ZReg) and can be enforced at compile time.
  static_assert(NumRegs <= 4, "NumRegs must be <= 4");

  const unsigned Row = (unsigned)RegTy;
  unsigned FirstReg = FirstRegs[Row][NumRegs];
  Inst.addOperand(MCOperand::createReg(FirstReg + getVectorListStart() -
                                       FirstRegs[Row][0]));
}
/// Appends the vector index as an immediate operand.
void addVectorIndexOperands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  const MCOperand IndexOp = MCOperand::createImm(getVectorIndex());
  Inst.addOperand(IndexOp);
}
/// Appends a 0/1 immediate telling which of the two permitted exact FP
/// immediates this operand is: 1 if it matches \p ImmIs1, 0 otherwise.
template <unsigned ImmIs0, unsigned ImmIs1>
void addExactFPImmOperands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  assert(bool(isExactFPImm<ImmIs0, ImmIs1>()) && "Invalid operand");
  const bool IsSecond = bool(isExactFPImm<ImmIs1>());
  Inst.addOperand(MCOperand::createImm(IsSecond));
}
/// Appends the immediate expression via addExpr, which emits constants as
/// plain immediates and everything else as an expression operand.
void addImmOperands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  // The expression is added as-is; no addend adjustment happens here.
  const MCExpr *Expr = getImm();
  addExpr(Inst, Expr);
}
/// Appends an (immediate, shift) operand pair. In order of preference: the
/// pair produced by getShiftedVal<Shift>(); the stored value and shift of a
/// shifted-immediate operand; or the plain immediate with a shift of 0.
template <int Shift>
void addImmWithOptionalShiftOperands(MCInst &Inst, unsigned N) const {
  assert(N == 2 && "Invalid number of operands!");

  auto ShiftedVal = getShiftedVal<Shift>();
  if (ShiftedVal) {
    Inst.addOperand(MCOperand::createImm(ShiftedVal->first));
    Inst.addOperand(MCOperand::createImm(ShiftedVal->second));
    return;
  }

  if (isShiftedImm()) {
    addExpr(Inst, getShiftedImmVal());
    Inst.addOperand(MCOperand::createImm(getShiftedImmShift()));
    return;
  }

  addExpr(Inst, getImm());
  Inst.addOperand(MCOperand::createImm(0));
}
/// Appends the negated value and the shift from getShiftedVal<Shift>() as an
/// operand pair. The operand must yield a shifted value.
template <int Shift>
void addImmNegWithOptionalShiftOperands(MCInst &Inst, unsigned N) const {
  assert(N == 2 && "Invalid number of operands!");

  auto ShiftedVal = getShiftedVal<Shift>();
  if (!ShiftedVal)
    llvm_unreachable("Not a shifted negative immediate");

  Inst.addOperand(MCOperand::createImm(-ShiftedVal->first));
  Inst.addOperand(MCOperand::createImm(ShiftedVal->second));
}
/// Appends the condition code as an immediate operand.
void addCondCodeOperands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  const MCOperand CodeOp = MCOperand::createImm(getCondCode());
  Inst.addOperand(CodeOp);
}
/// Appends an ADRP label operand: a constant is emitted shifted right by 12
/// (the low page-offset bits are dropped); any other expression goes through
/// addExpr unchanged.
void addAdrpLabelOperands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  if (const MCConstantExpr *Const = dyn_cast<MCConstantExpr>(getImm())) {
    Inst.addOperand(MCOperand::createImm(Const->getValue() >> 12));
    return;
  }
  addExpr(Inst, getImm());
}
/// ADR labels are emitted exactly like plain immediates; forwards to
/// addImmOperands.
void addAdrLabelOperands(MCInst &Inst, unsigned N) const {
  return addImmOperands(Inst, N);
}
/// Appends an offset operand: a constant is emitted divided by \p Scale; any
/// other expression is emitted as an expression operand.
template<int Scale>
void addUImm12OffsetOperands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  if (const MCConstantExpr *Const = dyn_cast<MCConstantExpr>(getImm())) {
    Inst.addOperand(MCOperand::createImm(Const->getValue() / Scale));
    return;
  }
  Inst.addOperand(MCOperand::createExpr(getImm()));
}
/// Appends the value of the constant immediate (the operand must be a
/// constant expression).
void addUImm6Operands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  const auto Value = cast<MCConstantExpr>(getImm())->getValue();
  Inst.addOperand(MCOperand::createImm(Value));
}
/// Appends the constant immediate divided by \p Scale (the operand must be a
/// constant expression).
template <int Scale>
void addImmScaledOperands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  const MCConstantExpr *Const = cast<MCConstantExpr>(getImm());
  const auto Scaled = Const->getValue() / Scale;
  Inst.addOperand(MCOperand::createImm(Scaled));
}
/// Appends the logical-immediate encoding of the constant, after truncating
/// it to the unsigned counterpart of \p T; the register size passed to the
/// encoder is the bit width of \p T.
template <typename T>
void addLogicalImmOperands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  using UnsignedT = typename std::make_unsigned<T>::type;
  const MCConstantExpr *Const = cast<MCConstantExpr>(getImm());
  UnsignedT Bits = Const->getValue();
  uint64_t Encoded = AArch64_AM::encodeLogicalImmediate(Bits, sizeof(T) * 8);
  Inst.addOperand(MCOperand::createImm(Encoded));
}
/// Appends the logical-immediate encoding of the bitwise complement of the
/// constant, after truncating it to the unsigned counterpart of \p T; the
/// register size passed to the encoder is the bit width of \p T.
template <typename T>
void addLogicalImmNotOperands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  using UnsignedT = typename std::make_unsigned<T>::type;
  const MCConstantExpr *Const = cast<MCConstantExpr>(getImm());
  UnsignedT Bits = ~Const->getValue();
  uint64_t Encoded = AArch64_AM::encodeLogicalImmediate(Bits, sizeof(T) * 8);
  Inst.addOperand(MCOperand::createImm(Encoded));
}
/// Appends the AdvSIMD modified-immediate type 10 encoding of the constant
/// immediate.
void addSIMDImmType10Operands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  const MCConstantExpr *Const = cast<MCConstantExpr>(getImm());
  uint64_t Encoded = AArch64_AM::encodeAdvSIMDModImmType10(Const->getValue());
  Inst.addOperand(MCOperand::createImm(Encoded));
}
/// Appends a branch target. The encoding omits the two low bits, so a
/// constant is shifted right by 2; a label cannot be resolved yet and is
/// passed through addExpr untouched.
void addBranchTarget26Operands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  if (const MCConstantExpr *Const = dyn_cast<MCConstantExpr>(getImm()))
    Inst.addOperand(MCOperand::createImm(Const->getValue() >> 2));
  else
    addExpr(Inst, getImm());
}
/// Appends a PC-relative label. The encoding omits the two low bits, so a
/// constant is shifted right by 2; a label cannot be resolved yet and is
/// passed through addExpr untouched.
void addPCRelLabel19Operands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  if (const MCConstantExpr *Const = dyn_cast<MCConstantExpr>(getImm()))
    Inst.addOperand(MCOperand::createImm(Const->getValue() >> 2));
  else
    addExpr(Inst, getImm());
}
/// Appends a branch target. The encoding omits the two low bits, so a
/// constant is shifted right by 2; a label cannot be resolved yet and is
/// passed through addExpr untouched.
void addBranchTarget14Operands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  if (const MCConstantExpr *Const = dyn_cast<MCConstantExpr>(getImm()))
    Inst.addOperand(MCOperand::createImm(Const->getValue() >> 2));
  else
    addExpr(Inst, getImm());
}
/// Appends the getFP64Imm encoding of the FP immediate's bit pattern.
void addFPImmOperands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  const auto Encoded = AArch64_AM::getFP64Imm(getFPImm().bitcastToAPInt());
  Inst.addOperand(MCOperand::createImm(Encoded));
}
/// Appends the barrier value as an immediate operand.
void addBarrierOperands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  const MCOperand BarrierOp = MCOperand::createImm(getBarrier());
  Inst.addOperand(BarrierOp);
}
/// Appends the system register's MRSReg value as an immediate operand.
void addMRSSystemRegisterOperands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  const MCOperand RegOp = MCOperand::createImm(SysReg.MRSReg);
  Inst.addOperand(RegOp);
}
/// Appends the system register's MSRReg value as an immediate operand.
void addMSRSystemRegisterOperands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  const MCOperand RegOp = MCOperand::createImm(SysReg.MSRReg);
  Inst.addOperand(RegOp);
}
/// Appends the system register's PState field as an immediate operand.
void addSystemPStateFieldWithImm0_1Operands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  const MCOperand FieldOp = MCOperand::createImm(SysReg.PStateField);
  Inst.addOperand(FieldOp);
}
/// Appends the system register's PState field as an immediate operand.
void addSystemPStateFieldWithImm0_15Operands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  const MCOperand FieldOp = MCOperand::createImm(SysReg.PStateField);
  Inst.addOperand(FieldOp);
}
/// Appends the SysCR value as an immediate operand.
void addSysCROperands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  const MCOperand CROp = MCOperand::createImm(getSysCR());
  Inst.addOperand(CROp);
}
/// Appends the prefetch operation value as an immediate operand.
void addPrefetchOperands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  const MCOperand PrfOp = MCOperand::createImm(getPrefetch());
  Inst.addOperand(PrfOp);
}
/// Appends the PSB hint value as an immediate operand.
void addPSBHintOperands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  const MCOperand HintOp = MCOperand::createImm(getPSBHint());
  Inst.addOperand(HintOp);
}
/// Appends the BTI hint value as an immediate operand.
void addBTIHintOperands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  const MCOperand HintOp = MCOperand::createImm(getBTIHint());
  Inst.addOperand(HintOp);
}
/// Appends the shifter immediate built by getShifterImm from this operand's
/// shift type and amount.
void addShifterOperands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  const AArch64_AM::ShiftExtendType Ty = getShiftExtendType();
  unsigned Encoded = AArch64_AM::getShifterImm(Ty, getShiftExtendAmount());
  Inst.addOperand(MCOperand::createImm(Encoded));
}
/// Appends the arithmetic-extend immediate for this operand; an LSL is
/// encoded as UXTW.
void addExtendOperands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  const AArch64_AM::ShiftExtendType Parsed = getShiftExtendType();
  const AArch64_AM::ShiftExtendType Effective =
      Parsed == AArch64_AM::LSL ? AArch64_AM::UXTW : Parsed;
  unsigned Encoded =
      AArch64_AM::getArithExtendImm(Effective, getShiftExtendAmount());
  Inst.addOperand(MCOperand::createImm(Encoded));
}
/// Appends the arithmetic-extend immediate for this operand; an LSL is
/// encoded as UXTX.
void addExtend64Operands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  const AArch64_AM::ShiftExtendType Parsed = getShiftExtendType();
  const AArch64_AM::ShiftExtendType Effective =
      Parsed == AArch64_AM::LSL ? AArch64_AM::UXTX : Parsed;
  unsigned Encoded =
      AArch64_AM::getArithExtendImm(Effective, getShiftExtendAmount());
  Inst.addOperand(MCOperand::createImm(Encoded));
}
/// Appends two immediates describing a memory extend: whether it is signed
/// (SXTW or SXTX) and whether the shift amount is non-zero.
void addMemExtendOperands(MCInst &Inst, unsigned N) const {
  assert(N == 2 && "Invalid number of operands!");
  const AArch64_AM::ShiftExtendType Ty = getShiftExtendType();
  const bool Signed = Ty == AArch64_AM::SXTW || Ty == AArch64_AM::SXTX;
  const bool Shifted = getShiftExtendAmount() != 0;
  Inst.addOperand(MCOperand::createImm(Signed));
  Inst.addOperand(MCOperand::createImm(Shifted));
}
// 8-bit register-offset loads/stores have a shift of 0 in both their
// "DoShift" and "NoShift" forms, so the amount cannot tell them apart.
// The second operand therefore records whether the shift was written
// explicitly instead of whether it is non-zero.
void addMemExtend8Operands(MCInst &Inst, unsigned N) const {
  assert(N == 2 && "Invalid number of operands!");
  const AArch64_AM::ShiftExtendType Ty = getShiftExtendType();
  const bool Signed = Ty == AArch64_AM::SXTW || Ty == AArch64_AM::SXTX;
  Inst.addOperand(MCOperand::createImm(Signed));
  Inst.addOperand(MCOperand::createImm(hasShiftExtendAmount()));
}
/// Appends the 16-bit chunk of the constant located at bit offset \p Shift.
template<int Shift>
void addMOVZMovAliasOperands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  uint64_t Bits = cast<MCConstantExpr>(getImm())->getValue();
  const uint64_t Chunk = (Bits >> Shift) & 0xffff;
  Inst.addOperand(MCOperand::createImm(Chunk));
}
/// Appends the 16-bit chunk of the constant's bitwise complement located at
/// bit offset \p Shift.
template<int Shift>
void addMOVNMovAliasOperands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  uint64_t Bits = cast<MCConstantExpr>(getImm())->getValue();
  const uint64_t Chunk = (~Bits >> Shift) & 0xffff;
  Inst.addOperand(MCOperand::createImm(Chunk));
}
/// Appends the rotation constant divided by 90.
void addComplexRotationEvenOperands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  const MCConstantExpr *Const = cast<MCConstantExpr>(getImm());
  const auto Index = Const->getValue() / 90;
  Inst.addOperand(MCOperand::createImm(Index));
}
/// Appends (rotation - 90) / 180 for the rotation constant.
void addComplexRotationOddOperands(MCInst &Inst, unsigned N) const {
  assert(N == 1 && "Invalid number of operands!");
  const MCConstantExpr *Const = cast<MCConstantExpr>(getImm());
  const auto Index = (Const->getValue() - 90) / 180;
  Inst.addOperand(MCOperand::createImm(Index));
}
void print(raw_ostream &OS) const override;
/// Builds a token operand that refers to (does not copy) the characters of
/// \p Str. Start and end locations are both \p S.
static std::unique_ptr<AArch64Operand>
CreateToken(StringRef Str, bool IsSuffix, SMLoc S, MCContext &Ctx) {
  auto Result = make_unique<AArch64Operand>(k_Token, Ctx);
  Result->StartLoc = S;
  Result->EndLoc = S;
  Result->Tok.IsSuffix = IsSuffix;
  Result->Tok.Data = Str.data();
  Result->Tok.Length = Str.size();
  return Result;
}
/// Builds a register operand of the given kind with element width 0 and the
/// supplied (optional) equality constraint and shift/extend modifier.
static std::unique_ptr<AArch64Operand>
CreateReg(unsigned RegNum, RegKind Kind, SMLoc S, SMLoc E, MCContext &Ctx,
          RegConstraintEqualityTy EqTy = RegConstraintEqualityTy::EqualsReg,
          AArch64_AM::ShiftExtendType ExtTy = AArch64_AM::LSL,
          unsigned ShiftAmount = 0,
          unsigned HasExplicitAmount = false) {
  auto Result = make_unique<AArch64Operand>(k_Register, Ctx);
  Result->StartLoc = S;
  Result->EndLoc = E;

  // Register identity.
  Result->Reg.RegNum = RegNum;
  Result->Reg.Kind = Kind;
  Result->Reg.ElementWidth = 0;
  Result->Reg.EqualityTy = EqTy;

  // Attached shift/extend modifier.
  Result->Reg.ShiftExtend.Type = ExtTy;
  Result->Reg.ShiftExtend.Amount = ShiftAmount;
  Result->Reg.ShiftExtend.HasExplicitAmount = HasExplicitAmount;
  return Result;
}
/// Builds a vector register operand: a CreateReg result (EqualsReg
/// constraint) with its element width set to \p ElementWidth. \p Kind must
/// be one of the vector register kinds.
static std::unique_ptr<AArch64Operand>
CreateVectorReg(unsigned RegNum, RegKind Kind, unsigned ElementWidth,
                SMLoc S, SMLoc E, MCContext &Ctx,
                AArch64_AM::ShiftExtendType ExtTy = AArch64_AM::LSL,
                unsigned ShiftAmount = 0,
                unsigned HasExplicitAmount = false) {
  assert((Kind == RegKind::NeonVector || Kind == RegKind::SVEDataVector ||
          Kind == RegKind::SVEPredicateVector) &&
         "Invalid vector kind");
  auto Result = CreateReg(RegNum, Kind, S, E, Ctx, EqualsReg, ExtTy,
                          ShiftAmount, HasExplicitAmount);
  Result->Reg.ElementWidth = ElementWidth;
  return Result;
}
/// Builds a vector-list operand starting at \p RegNum and spanning \p Count
/// registers, recording the element count, element width and register kind.
static std::unique_ptr<AArch64Operand>
CreateVectorList(unsigned RegNum, unsigned Count, unsigned NumElements,
                 unsigned ElementWidth, RegKind RegisterKind, SMLoc S, SMLoc E,
                 MCContext &Ctx) {
  auto Result = make_unique<AArch64Operand>(k_VectorList, Ctx);
  Result->StartLoc = S;
  Result->EndLoc = E;
  Result->VectorList.RegisterKind = RegisterKind;
  Result->VectorList.RegNum = RegNum;
  Result->VectorList.Count = Count;
  Result->VectorList.ElementWidth = ElementWidth;
  Result->VectorList.NumElements = NumElements;
  return Result;
}
/// Builds a vector-index operand holding \p Idx.
static std::unique_ptr<AArch64Operand>
CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, MCContext &Ctx) {
  auto Result = make_unique<AArch64Operand>(k_VectorIndex, Ctx);
  Result->StartLoc = S;
  Result->EndLoc = E;
  Result->VectorIndex.Val = Idx;
  return Result;
}
/// Builds an immediate operand wrapping the expression \p Val.
static std::unique_ptr<AArch64Operand> CreateImm(const MCExpr *Val, SMLoc S,
                                                 SMLoc E, MCContext &Ctx) {
  auto Result = make_unique<AArch64Operand>(k_Immediate, Ctx);
  Result->StartLoc = S;
  Result->EndLoc = E;
  Result->Imm.Val = Val;
  return Result;
}
/// Builds a shifted-immediate operand from the expression \p Val and its
/// shift amount.
static std::unique_ptr<AArch64Operand> CreateShiftedImm(const MCExpr *Val,
                                                        unsigned ShiftAmount,
                                                        SMLoc S, SMLoc E,
                                                        MCContext &Ctx) {
  auto Result = make_unique<AArch64Operand>(k_ShiftedImm, Ctx);
  Result->StartLoc = S;
  Result->EndLoc = E;
  Result->ShiftedImm.Val = Val;
  Result->ShiftedImm.ShiftAmount = ShiftAmount;
  return Result;
}
/// Builds a condition-code operand holding \p Code.
static std::unique_ptr<AArch64Operand>
CreateCondCode(AArch64CC::CondCode Code, SMLoc S, SMLoc E, MCContext &Ctx) {
  auto Result = make_unique<AArch64Operand>(k_CondCode, Ctx);
  Result->StartLoc = S;
  Result->EndLoc = E;
  Result->CondCode.Code = Code;
  return Result;
}
/// Builds an FP-immediate operand that stores the bit pattern of \p Val and
/// whether it is exact. Start and end locations are both \p S.
static std::unique_ptr<AArch64Operand>
CreateFPImm(APFloat Val, bool IsExact, SMLoc S, MCContext &Ctx) {
  auto Result = make_unique<AArch64Operand>(k_FPImm, Ctx);
  Result->StartLoc = S;
  Result->EndLoc = S;
  Result->FPImm.IsExact = IsExact;
  Result->FPImm.Val = Val.bitcastToAPInt().getSExtValue();
  return Result;
}
/// Builds a barrier operand with value \p Val whose name refers to (does not
/// copy) the characters of \p Str. Start and end locations are both \p S.
static std::unique_ptr<AArch64Operand> CreateBarrier(unsigned Val,
                                                     StringRef Str,
                                                     SMLoc S,
                                                     MCContext &Ctx) {
  auto Result = make_unique<AArch64Operand>(k_Barrier, Ctx);
  Result->StartLoc = S;
  Result->EndLoc = S;
  Result->Barrier.Data = Str.data();
  Result->Barrier.Length = Str.size();
  Result->Barrier.Val = Val;
  return Result;
}
/// Builds a system-register operand whose name refers to (does not copy) the
/// characters of \p Str and which records the MRS, MSR and PState values.
/// Start and end locations are both \p S.
static std::unique_ptr<AArch64Operand> CreateSysReg(StringRef Str, SMLoc S,
                                                    uint32_t MRSReg,
                                                    uint32_t MSRReg,
                                                    uint32_t PStateField,
                                                    MCContext &Ctx) {
  auto Result = make_unique<AArch64Operand>(k_SysReg, Ctx);
  Result->StartLoc = S;
  Result->EndLoc = S;
  Result->SysReg.MRSReg = MRSReg;
  Result->SysReg.MSRReg = MSRReg;
  Result->SysReg.PStateField = PStateField;
  Result->SysReg.Data = Str.data();
  Result->SysReg.Length = Str.size();
  return Result;
}
/// Builds a SysCR operand holding \p Val.
static std::unique_ptr<AArch64Operand> CreateSysCR(unsigned Val, SMLoc S,
                                                   SMLoc E, MCContext &Ctx) {
  auto Result = make_unique<AArch64Operand>(k_SysCR, Ctx);
  Result->StartLoc = S;
  Result->EndLoc = E;
  Result->SysCRImm.Val = Val;
  return Result;
}
/// Builds a prefetch operand with value \p Val whose name refers to (does not
/// copy) the characters of \p Str. Start and end locations are both \p S.
static std::unique_ptr<AArch64Operand> CreatePrefetch(unsigned Val,
                                                      StringRef Str,
                                                      SMLoc S,
                                                      MCContext &Ctx) {
  auto Op = make_unique<AArch64Operand>(k_Prefetch, Ctx);
  Op->Prefetch.Val = Val;
  // Store the name in the Prefetch member itself; writing it through the
  // Barrier member only works while both union members share a layout.
  Op->Prefetch.Data = Str.data();
  Op->Prefetch.Length = Str.size();
  Op->StartLoc = S;
  Op->EndLoc = S;
  return Op;
}
/// Builds a PSB hint operand with value \p Val whose name refers to (does
/// not copy) the characters of \p Str. Start and end locations are both \p S.
static std::unique_ptr<AArch64Operand> CreatePSBHint(unsigned Val,
                                                     StringRef Str,
                                                     SMLoc S,
                                                     MCContext &Ctx) {
  auto Result = make_unique<AArch64Operand>(k_PSBHint, Ctx);
  Result->StartLoc = S;
  Result->EndLoc = S;
  Result->PSBHint.Data = Str.data();
  Result->PSBHint.Length = Str.size();
  Result->PSBHint.Val = Val;
  return Result;
}
/// Builds a BTI hint operand whose name refers to (does not copy) the
/// characters of \p Str. The stored value is (Val << 1) | 32. Start and end
/// locations are both \p S.
static std::unique_ptr<AArch64Operand> CreateBTIHint(unsigned Val,
                                                     StringRef Str,
                                                     SMLoc S,
                                                     MCContext &Ctx) {
  auto Result = make_unique<AArch64Operand>(k_BTIHint, Ctx);
  Result->StartLoc = S;
  Result->EndLoc = S;
  Result->BTIHint.Data = Str.data();
  Result->BTIHint.Length = Str.size();
  Result->BTIHint.Val = (Val << 1) | 32;
  return Result;
}
/// Builds a standalone shift/extend operand of type \p ShOp with amount
/// \p Val, recording whether the amount was written explicitly.
static std::unique_ptr<AArch64Operand>
CreateShiftExtend(AArch64_AM::ShiftExtendType ShOp, unsigned Val,
                  bool HasExplicitAmount, SMLoc S, SMLoc E, MCContext &Ctx) {
  auto Result = make_unique<AArch64Operand>(k_ShiftExtend, Ctx);
  Result->StartLoc = S;
  Result->EndLoc = E;
  Result->ShiftExtend.HasExplicitAmount = HasExplicitAmount;
  Result->ShiftExtend.Amount = Val;
  Result->ShiftExtend.Type = ShOp;
  return Result;
}
};
} // end anonymous namespace.
/// Writes a human-readable debug representation of this operand to \p OS.
/// The format depends on the operand kind. A register operand that carries a
/// shift/extend (non-zero or explicitly written amount) is followed by the
/// same text a standalone shift/extend operand would print.
void AArch64Operand::print(raw_ostream &OS) const {
  switch (Kind) {
  case k_FPImm:
    OS << "<fpimm " << getFPImm().bitcastToAPInt().getZExtValue();
    if (!getFPImmIsExact())
      OS << " (inexact)";
    OS << ">";
    break;
  case k_Barrier: {
    StringRef Name = getBarrierName();
    if (!Name.empty())
      OS << "<barrier " << Name << ">";
    else
      OS << "<barrier invalid #" << getBarrier() << ">";
    break;
  }
  case k_Immediate:
    OS << *getImm();
    break;
  case k_ShiftedImm: {
    unsigned Shift = getShiftedImmShift();
    OS << "<shiftedimm ";
    OS << *getShiftedImmVal();
    OS << ", lsl #" << AArch64_AM::getShiftValue(Shift) << ">";
    break;
  }
  case k_CondCode:
    OS << "<condcode " << getCondCode() << ">";
    break;
  case k_VectorList: {
    OS << "<vectorlist ";
    unsigned Reg = getVectorListStart();
    for (unsigned i = 0, e = getVectorListCount(); i != e; ++i)
      OS << Reg + i << " ";
    OS << ">";
    break;
  }
  case k_VectorIndex:
    OS << "<vectorindex " << getVectorIndex() << ">";
    break;
  case k_SysReg:
    OS << "<sysreg: " << getSysReg() << '>';
    break;
  case k_Token:
    OS << "'" << getToken() << "'";
    break;
  case k_SysCR:
    OS << "c" << getSysCR();
    break;
  case k_Prefetch: {
    StringRef Name = getPrefetchName();
    if (!Name.empty())
      OS << "<prfop " << Name << ">";
    else
      OS << "<prfop invalid #" << getPrefetch() << ">";
    break;
  }
  case k_PSBHint:
    OS << getPSBHintName();
    break;
  // k_BTIHint must not sit between k_Register and k_ShiftExtend: the register
  // case below deliberately falls through to print its shift/extend.
  case k_BTIHint:
    OS << getBTIHintName();
    break;
  case k_Register:
    OS << "<register " << getReg() << ">";
    if (!getShiftExtendAmount() && !hasShiftExtendAmount())
      break;
    // The register has a shift/extend attached; print it as well.
    LLVM_FALLTHROUGH;
  case k_ShiftExtend:
    OS << "<" << AArch64_AM::getShiftExtendName(getShiftExtendType()) << " #"
       << getShiftExtendAmount();
    if (!hasShiftExtendAmount())
      OS << "<imp>";
    OS << '>';
    break;
  }
}
/// @name Auto-generated Match Functions
/// {
static unsigned MatchRegisterName(StringRef Name);
/// }
/// Map a NEON vector register name ("v0".."v31", any letter case) to the
/// corresponding AArch64::Qn register. Returns 0 when \p Name is not such a
/// name.
static unsigned MatchNeonVectorRegName(StringRef Name) {
  static const struct {
    const char *Spelling;
    unsigned Reg;
  } NeonRegs[] = {
      {"v0", AArch64::Q0},   {"v1", AArch64::Q1},   {"v2", AArch64::Q2},
      {"v3", AArch64::Q3},   {"v4", AArch64::Q4},   {"v5", AArch64::Q5},
      {"v6", AArch64::Q6},   {"v7", AArch64::Q7},   {"v8", AArch64::Q8},
      {"v9", AArch64::Q9},   {"v10", AArch64::Q10}, {"v11", AArch64::Q11},
      {"v12", AArch64::Q12}, {"v13", AArch64::Q13}, {"v14", AArch64::Q14},
      {"v15", AArch64::Q15}, {"v16", AArch64::Q16}, {"v17", AArch64::Q17},
      {"v18", AArch64::Q18}, {"v19", AArch64::Q19}, {"v20", AArch64::Q20},
      {"v21", AArch64::Q21}, {"v22", AArch64::Q22}, {"v23", AArch64::Q23},
      {"v24", AArch64::Q24}, {"v25", AArch64::Q25}, {"v26", AArch64::Q26},
      {"v27", AArch64::Q27}, {"v28", AArch64::Q28}, {"v29", AArch64::Q29},
      {"v30", AArch64::Q30}, {"v31", AArch64::Q31},
  };

  // Names are matched case-insensitively by lowering the input once.
  const std::string Lowered = Name.lower();
  for (const auto &Entry : NeonRegs)
    if (Lowered == Entry.Spelling)
      return Entry.Reg;
  return 0;
}
/// Decode a vector kind suffix such as ".4s" into a pair of
/// (#elements, element-width). The set of accepted suffixes depends on
/// \p VectorKind (NEON vs. SVE data/predicate vectors); matching is
/// case-insensitive. A component that is implicit or unknown for an otherwise
/// valid suffix is reported as 0. An empty Optional is returned when \p Suffix
/// is not valid for \p VectorKind.
static Optional<std::pair<int, int>> parseVectorKind(StringRef Suffix,
                                                     RegKind VectorKind) {
  typedef std::pair<int, int> KindInfo;
  const KindInfo Invalid(-1, -1);
  const std::string Lowered = Suffix.lower();
  KindInfo Parsed = Invalid;

  if (VectorKind == RegKind::NeonVector) {
    Parsed =
        StringSwitch<KindInfo>(Lowered)
            .Case("", {0, 0})
            .Case(".1d", {1, 64})
            .Case(".1q", {1, 128})
            // '.2h' needed for fp16 scalar pairwise reductions
            .Case(".2h", {2, 16})
            .Case(".2s", {2, 32})
            .Case(".2d", {2, 64})
            // '.4b' is another special case for the ARMv8.2a dot product
            // operand
            .Case(".4b", {4, 8})
            .Case(".4h", {4, 16})
            .Case(".4s", {4, 32})
            .Case(".8b", {8, 8})
            .Case(".8h", {8, 16})
            .Case(".16b", {16, 8})
            // Accept the width neutral ones, too, for verbose syntax. If those
            // aren't used in the right places, the token operand won't match so
            // all will work out.
            .Case(".b", {0, 8})
            .Case(".h", {0, 16})
            .Case(".s", {0, 32})
            .Case(".d", {0, 64})
            .Default(Invalid);
  } else if (VectorKind == RegKind::SVEPredicateVector ||
             VectorKind == RegKind::SVEDataVector) {
    Parsed = StringSwitch<KindInfo>(Lowered)
                 .Case("", {0, 0})
                 .Case(".b", {0, 8})
                 .Case(".h", {0, 16})
                 .Case(".s", {0, 32})
                 .Case(".d", {0, 64})
                 .Case(".q", {0, 128})
                 .Default(Invalid);
  } else {
    llvm_unreachable("Unsupported RegKind");
  }

  if (Parsed == Invalid)
    return Optional<KindInfo>();
  return Optional<KindInfo>(Parsed);
}
/// Returns true when \p Suffix is a vector kind suffix that parseVectorKind
/// accepts for \p VectorKind.
static bool isValidVectorKind(StringRef Suffix, RegKind VectorKind) {
  const Optional<std::pair<int, int>> Parsed =
      parseVectorKind(Suffix, VectorKind);
  return Parsed.hasValue();
}
/// Map an SVE data vector register name ("z0".."z31", any letter case) to the
/// corresponding AArch64::Zn register. Returns 0 when \p Name is not such a
/// name.
static unsigned matchSVEDataVectorRegName(StringRef Name) {
  static const struct {
    const char *Spelling;
    unsigned Reg;
  } SVEDataRegs[] = {
      {"z0", AArch64::Z0},   {"z1", AArch64::Z1},   {"z2", AArch64::Z2},
      {"z3", AArch64::Z3},   {"z4", AArch64::Z4},   {"z5", AArch64::Z5},
      {"z6", AArch64::Z6},   {"z7", AArch64::Z7},   {"z8", AArch64::Z8},
      {"z9", AArch64::Z9},   {"z10", AArch64::Z10}, {"z11", AArch64::Z11},
      {"z12", AArch64::Z12}, {"z13", AArch64::Z13}, {"z14", AArch64::Z14},
      {"z15", AArch64::Z15}, {"z16", AArch64::Z16}, {"z17", AArch64::Z17},
      {"z18", AArch64::Z18}, {"z19", AArch64::Z19}, {"z20", AArch64::Z20},
      {"z21", AArch64::Z21}, {"z22", AArch64::Z22}, {"z23", AArch64::Z23},
      {"z24", AArch64::Z24}, {"z25", AArch64::Z25}, {"z26", AArch64::Z26},
      {"z27", AArch64::Z27}, {"z28", AArch64::Z28}, {"z29", AArch64::Z29},
      {"z30", AArch64::Z30}, {"z31", AArch64::Z31},
  };

  // Names are matched case-insensitively by lowering the input once.
  const std::string Lowered = Name.lower();
  for (const auto &Entry : SVEDataRegs)
    if (Lowered == Entry.Spelling)
      return Entry.Reg;
  return 0;
}
/// Map an SVE predicate register name ("p0".."p15", any letter case) to the
/// corresponding AArch64::Pn register. Returns 0 when \p Name is not such a
/// name.
static unsigned matchSVEPredicateVectorRegName(StringRef Name) {
  static const struct {
    const char *Spelling;
    unsigned Reg;
  } SVEPredRegs[] = {
      {"p0", AArch64::P0},   {"p1", AArch64::P1},   {"p2", AArch64::P2},
      {"p3", AArch64::P3},   {"p4", AArch64::P4},   {"p5", AArch64::P5},
      {"p6", AArch64::P6},   {"p7", AArch64::P7},   {"p8", AArch64::P8},
      {"p9", AArch64::P9},   {"p10", AArch64::P10}, {"p11", AArch64::P11},
      {"p12", AArch64::P12}, {"p13", AArch64::P13}, {"p14", AArch64::P14},
      {"p15", AArch64::P15},
  };

  // Names are matched case-insensitively by lowering the input once.
  const std::string Lowered = Name.lower();
  for (const auto &Entry : SVEPredRegs)
    if (Lowered == Entry.Spelling)
      return Entry.Reg;
  return 0;
}
bool AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
SMLoc &EndLoc) {
StartLoc = getLoc();
auto Res = tryParseScalarRegister(RegNo);
EndLoc = SMLoc::getFromPointer(getLoc().getPointer() - 1);
return Res != MatchOperand_Success;
}
// Resolve Name to a register number, accepting built-in register names, a few
// common aliases (fp, lr, x31, w31) and aliases previously defined by '.req'.
// The result is non-zero only when the register found is of the requested
// Kind; otherwise 0 is returned.
unsigned AArch64AsmParser::matchRegisterNameAlias(StringRef Name,
                                                  RegKind Kind) {
  // Built-in names are tried class by class. The first table that recognises
  // the name decides the outcome, even if the kind then turns out not to be
  // the requested one.
  if (unsigned SVEData = matchSVEDataVectorRegName(Name))
    return Kind == RegKind::SVEDataVector ? SVEData : 0;

  if (unsigned SVEPred = matchSVEPredicateVectorRegName(Name))
    return Kind == RegKind::SVEPredicateVector ? SVEPred : 0;

  if (unsigned Neon = MatchNeonVectorRegName(Name))
    return Kind == RegKind::NeonVector ? Neon : 0;

  // Anything the generated matcher knows is a scalar register.
  if (unsigned Scalar = MatchRegisterName(Name))
    return Kind == RegKind::Scalar ? Scalar : 0;

  // Register names are case insensitive, so the remaining lookups work on a
  // lower-cased copy. That is also how '.req' entries were stored when they
  // were passed in from MC/MCParser/AsmParser.
  const std::string Lowered = Name.lower();

  // Handle a few common aliases of registers.
  const unsigned CommonAlias = StringSwitch<unsigned>(Lowered)
                                   .Case("fp", AArch64::FP)
                                   .Case("lr", AArch64::LR)
                                   .Case("x31", AArch64::XZR)
                                   .Case("w31", AArch64::WZR)
                                   .Default(0);
  if (CommonAlias)
    return Kind == RegKind::Scalar ? CommonAlias : 0;

  // Finally, check for aliases registered via .req.
  auto Entry = RegisterReqs.find(Lowered);
  if (Entry == RegisterReqs.end())
    return 0;

  // Only report the alias if it names the right kind of register.
  return Kind == Entry->getValue().first ? Entry->getValue().second : 0;
}
/// tryParseScalarRegister - Try to parse a scalar register name. If the
/// current token is an Identifier naming a scalar register (or an alias of
/// one), the token is eaten and the register number is stored in \p RegNum.
/// Otherwise nothing is consumed and MatchOperand_NoMatch is returned.
OperandMatchResultTy
AArch64AsmParser::tryParseScalarRegister(unsigned &RegNum) {
  MCAsmParser &Parser = getParser();
  const AsmToken &Tok = Parser.getTok();
  if (!Tok.is(AsmToken::Identifier))
    return MatchOperand_NoMatch;

  const unsigned Matched =
      matchRegisterNameAlias(Tok.getString().lower(), RegKind::Scalar);
  if (!Matched)
    return MatchOperand_NoMatch;

  RegNum = Matched;
  Parser.Lex(); // Eat identifier token.
  return MatchOperand_Success;
}
/// tryParseSysCROperand - Try to parse a system instruction CR operand name,
/// i.e. an identifier of the form cN (or CN) with 0 <= N <= 15. On success a
/// SysCR operand is appended to \p Operands; anything else is diagnosed and
/// reported as a parse failure.
OperandMatchResultTy
AArch64AsmParser::tryParseSysCROperand(OperandVector &Operands) {
  MCAsmParser &Parser = getParser();
  const SMLoc Start = getLoc();
  const char *const Diag = "Expected cN operand where 0 <= N <= 15";

  if (!Parser.getTok().is(AsmToken::Identifier)) {
    Error(Start, Diag);
    return MatchOperand_ParseFail;
  }

  const StringRef Ident = Parser.getTok().getIdentifier();
  const bool HasPrefix = Ident[0] == 'c' || Ident[0] == 'C';
  if (!HasPrefix) {
    Error(Start, Diag);
    return MatchOperand_ParseFail;
  }

  // Everything after the leading 'c' must be a decimal number in range.
  uint32_t CRNum;
  const bool BadNum = Ident.drop_front().getAsInteger(10, CRNum);
  if (BadNum || CRNum > 15) {
    Error(Start, Diag);
    return MatchOperand_ParseFail;
  }

  Parser.Lex(); // Eat identifier token.
  Operands.push_back(
      AArch64Operand::CreateSysCR(CRNum, Start, getLoc(), getContext()));
  return MatchOperand_Success;
}
/// tryParsePrefetch - Try to parse a prefetch operand. The operand is either
/// an immediate (optionally preceded by '#') in the range [0, 31], or [0, 15]
/// when \p IsSVEPrefetch is set, or the name of a prefetch operation. On
/// success a Prefetch operand carrying both the encoding and, where one
/// exists, the name is appended to \p Operands.
template <bool IsSVEPrefetch>
OperandMatchResultTy
AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) {
  MCAsmParser &Parser = getParser();
  SMLoc S = getLoc();
  const AsmToken &Tok = Parser.getTok();

  // Name -> encoding, using the SVE or the base prefetch table.
  auto LookupByName = [](StringRef N) {
    if (IsSVEPrefetch) {
      if (auto Res = AArch64SVEPRFM::lookupSVEPRFMByName(N))
        return Optional<unsigned>(Res->Encoding);
    } else if (auto Res = AArch64PRFM::lookupPRFMByName(N))
      return Optional<unsigned>(Res->Encoding);
    return Optional<unsigned>();
  };

  // Encoding -> name, using the SVE or the base prefetch table.
  auto LookupByEncoding = [](unsigned E) {
    if (IsSVEPrefetch) {
      if (auto Res = AArch64SVEPRFM::lookupSVEPRFMByEncoding(E))
        return Optional<StringRef>(Res->Name);
    } else if (auto Res = AArch64PRFM::lookupPRFMByEncoding(E))
      return Optional<StringRef>(Res->Name);
    return Optional<StringRef>();
  };
  unsigned MaxVal = IsSVEPrefetch ? 15 : 31;

  // Either an identifier for named values or an immediate no larger than
  // MaxVal.
  // Eat optional hash.
  if (parseOptionalToken(AsmToken::Hash) ||
      Tok.is(AsmToken::Integer)) {
    const MCExpr *ImmVal;
    if (getParser().parseExpression(ImmVal))
      return MatchOperand_ParseFail;

    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
    if (!MCE) {
      TokError("immediate value expected for prefetch operand");
      return MatchOperand_ParseFail;
    }
    unsigned prfop = MCE->getValue();
    if (prfop > MaxVal) {
      TokError("prefetch operand out of range, [0," + utostr(MaxVal) +
               "] expected");
      return MatchOperand_ParseFail;
    }

    // An immediate without a named equivalent gets an empty name.
    auto PRFM = LookupByEncoding(MCE->getValue());
    Operands.push_back(AArch64Operand::CreatePrefetch(
        prfop, PRFM.getValueOr(""), S, getContext()));
    return MatchOperand_Success;
  }

  if (Tok.isNot(AsmToken::Identifier)) {
    TokError("prefetch hint expected");
    return MatchOperand_ParseFail;
  }

  auto PRFM = LookupByName(Tok.getString());
  if (!PRFM) {
    TokError("prefetch hint expected");
    return MatchOperand_ParseFail;
  }

  // Build the operand before advancing the lexer: Tok refers to the parser's
  // current token, so after Lex() its string would be that of the following
  // token rather than the prefetch name.
  Operands.push_back(AArch64Operand::CreatePrefetch(
      *PRFM, Tok.getString(), S, getContext()));
  Parser.Lex(); // Eat identifier token.
  return MatchOperand_Success;
}
/// tryParsePSBHint - Try to parse a PSB operand, mapped to Hint command. The
/// current token must be an identifier that lookupPSBByName recognises;
/// otherwise an error is emitted and MatchOperand_ParseFail is returned.
OperandMatchResultTy
AArch64AsmParser::tryParsePSBHint(OperandVector &Operands) {
  MCAsmParser &Parser = getParser();
  SMLoc S = getLoc();
  const AsmToken &Tok = Parser.getTok();
  if (Tok.isNot(AsmToken::Identifier)) {
    TokError("invalid operand for instruction");
    return MatchOperand_ParseFail;
  }

  auto PSB = AArch64PSBHint::lookupPSBByName(Tok.getString());
  if (!PSB) {
    TokError("invalid operand for instruction");
    return MatchOperand_ParseFail;
  }

  // Build the operand before advancing the lexer: Tok refers to the parser's
  // current token, so after Lex() its string would be that of the following
  // token rather than the hint name.
  Operands.push_back(AArch64Operand::CreatePSBHint(
      PSB->Encoding, Tok.getString(), S, getContext()));
  Parser.Lex(); // Eat identifier token.
  return MatchOperand_Success;
}
/// tryParseBTIHint - Try to parse a BTI operand, mapped to Hint command. The
/// current token must be an identifier that lookupBTIByName recognises;
/// otherwise an error is emitted and MatchOperand_ParseFail is returned.
OperandMatchResultTy
AArch64AsmParser::tryParseBTIHint(OperandVector &Operands) {
  MCAsmParser &Parser = getParser();
  SMLoc S = getLoc();
  const AsmToken &Tok = Parser.getTok();
  if (Tok.isNot(AsmToken::Identifier)) {
    TokError("invalid operand for instruction");
    return MatchOperand_ParseFail;
  }

  auto BTI = AArch64BTIHint::lookupBTIByName(Tok.getString());
  if (!BTI) {
    TokError("invalid operand for instruction");
    return MatchOperand_ParseFail;
  }

  // Build the operand before advancing the lexer: Tok refers to the parser's
  // current token, so after Lex() its string would be that of the following
  // token rather than the hint name.
  Operands.push_back(AArch64Operand::CreateBTIHint(
      BTI->Encoding, Tok.getString(), S, getContext()));
  Parser.Lex(); // Eat identifier token.
  return MatchOperand_Success;
}
/// tryParseAdrpLabel - Parse the source label operand of an ADRP instruction
/// and check that any relocation modifier attached to it is one that ADRP
/// accepts. On success an immediate operand is appended to \p Operands.
OperandMatchResultTy
AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) {
  MCAsmParser &Parser = getParser();
  const SMLoc Start = getLoc();

  // A leading '#' is optional.
  if (Parser.getTok().is(AsmToken::Hash))
    Parser.Lex(); // Eat hash token.

  const MCExpr *Target = nullptr;
  if (parseSymbolicImmVal(Target))
    return MatchOperand_ParseFail;

  AArch64MCExpr::VariantKind ELFRefKind;
  MCSymbolRefExpr::VariantKind DarwinRefKind;
  int64_t Addend;
  if (classifySymbolRef(Target, ELFRefKind, DarwinRefKind, Addend)) {
    const bool NoModifier = DarwinRefKind == MCSymbolRefExpr::VK_None &&
                            ELFRefKind == AArch64MCExpr::VK_INVALID;
    const bool IsGotStylePage =
        DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGE ||
        DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGE;
    const bool DarwinOk =
        DarwinRefKind == MCSymbolRefExpr::VK_PAGE || IsGotStylePage;
    const bool ELFOk = ELFRefKind == AArch64MCExpr::VK_ABS_PAGE_NC ||
                       ELFRefKind == AArch64MCExpr::VK_GOT_PAGE ||
                       ELFRefKind == AArch64MCExpr::VK_GOTTPREL_PAGE ||
                       ELFRefKind == AArch64MCExpr::VK_TLSDESC_PAGE;

    if (NoModifier) {
      // No modifier was specified at all; this is the syntax for an ELF basic
      // ADRP relocation (unfortunately).
      Target = AArch64MCExpr::create(Target, AArch64MCExpr::VK_ABS_PAGE,
                                     getContext());
    } else if (IsGotStylePage && Addend != 0) {
      Error(Start, "gotpage label reference not allowed an addend");
      return MatchOperand_ParseFail;
    } else if (!DarwinOk && !ELFOk) {
      // The operand must be an @page or @gotpage qualified symbolref.
      Error(Start, "page or gotpage label reference expected");
      return MatchOperand_ParseFail;
    }
  }

  // We have either a label reference possibly with addend or an immediate. The
  // addend is a raw value here. The linker will adjust it to only reference the
  // page.
  const SMLoc End = SMLoc::getFromPointer(getLoc().getPointer() - 1);
  Operands.push_back(
      AArch64Operand::CreateImm(Target, Start, End, getContext()));
  return MatchOperand_Success;
}
/// tryParseAdrLabel - Parse the source label operand of an ADR instruction.
/// A symbol reference may not carry any relocation modifier. Operands that
/// start with '[' are not handled here and yield MatchOperand_NoMatch.
OperandMatchResultTy
AArch64AsmParser::tryParseAdrLabel(OperandVector &Operands) {
  const SMLoc Start = getLoc();

  // Leave anything with a bracket to the default for SVE
  if (getParser().getTok().is(AsmToken::LBrac))
    return MatchOperand_NoMatch;

  // A leading '#' is optional.
  if (getParser().getTok().is(AsmToken::Hash))
    getParser().Lex(); // Eat hash token.

  const MCExpr *Target = nullptr;
  if (parseSymbolicImmVal(Target))
    return MatchOperand_ParseFail;

  AArch64MCExpr::VariantKind ELFRefKind;
  MCSymbolRefExpr::VariantKind DarwinRefKind;
  int64_t Addend;
  if (classifySymbolRef(Target, ELFRefKind, DarwinRefKind, Addend)) {
    const bool NoModifier = DarwinRefKind == MCSymbolRefExpr::VK_None &&
                            ELFRefKind == AArch64MCExpr::VK_INVALID;
    if (!NoModifier) {
      Error(Start, "unexpected adr label");
      return MatchOperand_ParseFail;
    }
    // No modifier was specified at all; this is the syntax for an ELF basic
    // ADR relocation (unfortunately).
    Target = AArch64MCExpr::create(Target, AArch64MCExpr::VK_ABS, getContext());
  }

  const SMLoc End = SMLoc::getFromPointer(getLoc().getPointer() - 1);
  Operands.push_back(
      AArch64Operand::CreateImm(Target, Start, End, getContext()));
  return MatchOperand_Success;
}
/// tryParseFPImm - Parse a floating point immediate operand, with optional
/// leading '#' and '-'. The value is either a real/integer literal or a "0x"
/// prefixed integer no larger than 255 that is expanded with getFPImmFloat.
/// When \p AddFPZeroAsLiteral is set, positive zero is emitted as the two
/// tokens "#0" and ".0" instead of an FP immediate operand.
template <bool AddFPZeroAsLiteral>
OperandMatchResultTy
AArch64AsmParser::tryParseFPImm(OperandVector &Operands) {
  MCAsmParser &Parser = getParser();
  const SMLoc Start = getLoc();

  const bool SawHash = parseOptionalToken(AsmToken::Hash);

  // Handle negation, as that still comes through as a separate token.
  const bool Negated = parseOptionalToken(AsmToken::Minus);

  const AsmToken &Tok = Parser.getTok();
  const bool IsInteger = Tok.is(AsmToken::Integer);
  if (!IsInteger && !Tok.is(AsmToken::Real)) {
    // Without a '#' this may simply be some other kind of operand.
    if (SawHash) {
      TokError("invalid floating point immediate");
      return MatchOperand_ParseFail;
    }
    return MatchOperand_NoMatch;
  }

  if (IsInteger && Tok.getString().startswith("0x")) {
    // Parse hexadecimal representation.
    if (Tok.getIntVal() > 255 || Negated) {
      TokError("encoded floating point value out of range");
      return MatchOperand_ParseFail;
    }

    APFloat Decoded((double)AArch64_AM::getFPImmFloat(Tok.getIntVal()));
    Operands.push_back(
        AArch64Operand::CreateFPImm(Decoded, true, Start, getContext()));
  } else {
    // Parse FP representation.
    APFloat Value(APFloat::IEEEdouble());
    auto Status =
        Value.convertFromString(Tok.getString(), APFloat::rmTowardZero);
    if (Negated)
      Value.changeSign();

    if (AddFPZeroAsLiteral && Value.isPosZero()) {
      Operands.push_back(
          AArch64Operand::CreateToken("#0", false, Start, getContext()));
      Operands.push_back(
          AArch64Operand::CreateToken(".0", false, Start, getContext()));
    } else {
      // The operand remembers whether the conversion was exact.
      const bool IsExact = Status == APFloat::opOK;
      Operands.push_back(
          AArch64Operand::CreateFPImm(Value, IsExact, Start, getContext()));
    }
  }

  Parser.Lex(); // Eat the token.
  return MatchOperand_Success;
}
/// tryParseImmWithOptionalShift - Parse an immediate operand, optionally
/// followed by a shift suffix, for example '#1, lsl #12'. A plain immediate
/// operand is produced when there is no suffix or the suffix is 'lsl #0';
/// otherwise a shifted-immediate operand is produced.
OperandMatchResultTy
AArch64AsmParser::tryParseImmWithOptionalShift(OperandVector &Operands) {
  MCAsmParser &Parser = getParser();
  const SMLoc Start = getLoc();

  // The operand must begin with '#' or directly with an integer; anything
  // else is not ours to parse.
  if (Parser.getTok().is(AsmToken::Hash)) {
    Parser.Lex(); // Eat '#'
  } else if (!Parser.getTok().is(AsmToken::Integer)) {
    return MatchOperand_NoMatch;
  }

  const MCExpr *Value = nullptr;
  if (parseSymbolicImmVal(Value))
    return MatchOperand_ParseFail;

  // No comma means no shift suffix: emit the bare immediate.
  if (!Parser.getTok().is(AsmToken::Comma)) {
    const SMLoc End = Parser.getTok().getLoc();
    Operands.push_back(
        AArch64Operand::CreateImm(Value, Start, End, getContext()));
    return MatchOperand_Success;
  }

  Parser.Lex(); // Eat ','

  // The optional operand must be "lsl #N" where N is non-negative.
  const char *const LslDiag = "only 'lsl #+N' valid after immediate";
  const bool IsLsl = Parser.getTok().is(AsmToken::Identifier) &&
                     Parser.getTok().getIdentifier().equals_lower("lsl");
  if (!IsLsl) {
    Error(Parser.getTok().getLoc(), LslDiag);
    return MatchOperand_ParseFail;
  }

  Parser.Lex(); // Eat 'lsl'

  parseOptionalToken(AsmToken::Hash);

  if (!Parser.getTok().is(AsmToken::Integer)) {
    Error(Parser.getTok().getLoc(), LslDiag);
    return MatchOperand_ParseFail;
  }

  const int64_t ShiftAmount = Parser.getTok().getIntVal();
  if (ShiftAmount < 0) {
    Error(Parser.getTok().getLoc(), "positive shift amount required");
    return MatchOperand_ParseFail;
  }
  Parser.Lex(); // Eat the number

  const SMLoc End = Parser.getTok().getLoc();
  if (ShiftAmount == 0 && Value != nullptr) {
    // Just in case the optional lsl #0 is used for immediates other than zero.
    Operands.push_back(
        AArch64Operand::CreateImm(Value, Start, End, getContext()));
  } else {
    Operands.push_back(AArch64Operand::CreateShiftedImm(
        Value, ShiftAmount, Start, End, getContext()));
  }
  return MatchOperand_Success;
}
/// parseCondCodeString - Translate a condition code name (any letter case)
/// into its AArch64CC value. When the subtarget has the SVE feature, a second
/// set of names is also accepted. AArch64CC::Invalid is returned for
/// unrecognised names.
AArch64CC::CondCode AArch64AsmParser::parseCondCodeString(StringRef Cond) {
  const std::string Lowered = Cond.lower();

  const AArch64CC::CondCode Base =
      StringSwitch<AArch64CC::CondCode>(Lowered)
          .Case("eq", AArch64CC::EQ)
          .Case("ne", AArch64CC::NE)
          .Case("cs", AArch64CC::HS)
          .Case("hs", AArch64CC::HS)
          .Case("cc", AArch64CC::LO)
          .Case("lo", AArch64CC::LO)
          .Case("mi", AArch64CC::MI)
          .Case("pl", AArch64CC::PL)
          .Case("vs", AArch64CC::VS)
          .Case("vc", AArch64CC::VC)
          .Case("hi", AArch64CC::HI)
          .Case("ls", AArch64CC::LS)
          .Case("ge", AArch64CC::GE)
          .Case("lt", AArch64CC::LT)
          .Case("gt", AArch64CC::GT)
          .Case("le", AArch64CC::LE)
          .Case("al", AArch64CC::AL)
          .Case("nv", AArch64CC::NV)
          .Default(AArch64CC::Invalid);
  if (Base != AArch64CC::Invalid)
    return Base;

  // The alternative names are only available with SVE.
  if (!getSTI().getFeatureBits()[AArch64::FeatureSVE])
    return AArch64CC::Invalid;

  return StringSwitch<AArch64CC::CondCode>(Lowered)
      .Case("none", AArch64CC::EQ)
      .Case("any", AArch64CC::NE)
      .Case("nlast", AArch64CC::HS)
      .Case("last", AArch64CC::LO)
      .Case("first", AArch64CC::MI)
      .Case("nfrst", AArch64CC::PL)
      .Case("pmore", AArch64CC::HI)
      .Case("plast", AArch64CC::LS)
      .Case("tcont", AArch64CC::GE)
      .Case("tstop", AArch64CC::LT)
      .Default(AArch64CC::Invalid);
}
/// parseCondCode - Parse a Condition Code operand from the current identifier
/// token and append it to \p Operands. When \p invertCondCode is set the
/// inverted condition is stored instead, and AL/NV are rejected. Returns the
/// result of TokError on failure and false on success.
bool AArch64AsmParser::parseCondCode(OperandVector &Operands,
                                     bool invertCondCode) {
  MCAsmParser &Parser = getParser();
  const SMLoc Start = getLoc();
  const AsmToken &Tok = Parser.getTok();
  assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");

  AArch64CC::CondCode Code = parseCondCodeString(Tok.getString());
  if (Code == AArch64CC::Invalid)
    return TokError("invalid condition code");
  Parser.Lex(); // Eat identifier token.

  if (invertCondCode) {
    const bool IsAlways = Code == AArch64CC::AL || Code == AArch64CC::NV;
    if (IsAlways)
      return TokError("condition codes AL and NV are invalid for this instruction");
    Code = AArch64CC::getInvertedCondCode(Code);
  }

  Operands.push_back(
      AArch64Operand::CreateCondCode(Code, Start, getLoc(), getContext()));
  return false;
}
/// tryParseOptionalShiftExtend - Some operands take an optional shift or
/// extend modifier. Parse it if present. The shift operations (lsl, lsr, asr,
/// ror, msl) require an amount; for the extend operations the amount may be
/// omitted, in which case 0 is recorded as an implicit amount.
OperandMatchResultTy
AArch64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) {
  MCAsmParser &Parser = getParser();
  const AsmToken &Tok = Parser.getTok();

  const std::string Spelling = Tok.getString().lower();
  const AArch64_AM::ShiftExtendType Op =
      StringSwitch<AArch64_AM::ShiftExtendType>(Spelling)
          .Case("lsl", AArch64_AM::LSL)
          .Case("lsr", AArch64_AM::LSR)
          .Case("asr", AArch64_AM::ASR)
          .Case("ror", AArch64_AM::ROR)
          .Case("msl", AArch64_AM::MSL)
          .Case("uxtb", AArch64_AM::UXTB)
          .Case("uxth", AArch64_AM::UXTH)
          .Case("uxtw", AArch64_AM::UXTW)
          .Case("uxtx", AArch64_AM::UXTX)
          .Case("sxtb", AArch64_AM::SXTB)
          .Case("sxth", AArch64_AM::SXTH)
          .Case("sxtw", AArch64_AM::SXTW)
          .Case("sxtx", AArch64_AM::SXTX)
          .Default(AArch64_AM::InvalidShiftExtend);
  if (Op == AArch64_AM::InvalidShiftExtend)
    return MatchOperand_NoMatch;

  // Record where the modifier starts before stepping past its name.
  const SMLoc Start = Tok.getLoc();
  Parser.Lex();

  const bool SawHash = parseOptionalToken(AsmToken::Hash);
  if (!SawHash && getLexer().isNot(AsmToken::Integer)) {
    switch (Op) {
    case AArch64_AM::LSL:
    case AArch64_AM::LSR:
    case AArch64_AM::ASR:
    case AArch64_AM::ROR:
    case AArch64_AM::MSL:
      // We expect a number here.
      TokError("expected #imm after shift specifier");
      return MatchOperand_ParseFail;
    default:
      break;
    }

    // "extend" type operations don't need an immediate, #0 is implicit.
    const SMLoc End = SMLoc::getFromPointer(getLoc().getPointer() - 1);
    Operands.push_back(AArch64Operand::CreateShiftExtend(Op, 0, false, Start,
                                                         End, getContext()));
    return MatchOperand_Success;
  }

  // Make sure we do actually have a number, identifier or a parenthesized
  // expression.
  const SMLoc AmountLoc = Parser.getTok().getLoc();
  const bool StartsExpression = Parser.getTok().is(AsmToken::Integer) ||
                                Parser.getTok().is(AsmToken::LParen) ||
                                Parser.getTok().is(AsmToken::Identifier);
  if (!StartsExpression) {
    Error(AmountLoc, "expected integer shift amount");
    return MatchOperand_ParseFail;
  }

  const MCExpr *AmountExpr;
  if (getParser().parseExpression(AmountExpr))
    return MatchOperand_ParseFail;

  const MCConstantExpr *Amount = dyn_cast<MCConstantExpr>(AmountExpr);
  if (!Amount) {
    Error(AmountLoc, "expected constant '#imm' after shift specifier");
    return MatchOperand_ParseFail;
  }

  const SMLoc End = SMLoc::getFromPointer(getLoc().getPointer() - 1);
  Operands.push_back(AArch64Operand::CreateShiftExtend(
      Op, Amount->getValue(), true, Start, End, getContext()));
  return MatchOperand_Success;
}
// Table pairing an architecture extension name with the set of subtarget
// features that name stands for. setRequiredFeatureString scans this table in
// order and reports the name of the first entry whose feature set overlaps
// the features it was asked about, so the order of entries affects which name
// is reported when several could match.
static const struct Extension {
// Extension name as spelled in assembly.
const char *Name;
// Feature bits associated with the name; empty for the unsupported entries
// at the end of the table.
const FeatureBitset Features;
} ExtensionMap[] = {
{"crc", {AArch64::FeatureCRC}},
{"sm4", {AArch64::FeatureSM4}},
{"sha3", {AArch64::FeatureSHA3}},
{"sha2", {AArch64::FeatureSHA2}},
{"aes", {AArch64::FeatureAES}},
{"crypto", {AArch64::FeatureCrypto}},
{"fp", {AArch64::FeatureFPARMv8}},
{"simd", {AArch64::FeatureNEON}},
{"ras", {AArch64::FeatureRAS}},
{"lse", {AArch64::FeatureLSE}},
{"predres", {AArch64::FeaturePredRes}},
{"ccdp", {AArch64::FeatureCacheDeepPersist}},
{"mte", {AArch64::FeatureMTE}},
{"tlb-rmi", {AArch64::FeatureTLB_RMI}},
{"pan-rwv", {AArch64::FeaturePAN_RWV}},
{"ccpp", {AArch64::FeatureCCPP}},
{"sve", {AArch64::FeatureSVE}},
{"sve2", {AArch64::FeatureSVE2}},
{"sve2-aes", {AArch64::FeatureSVE2AES}},
{"sve2-sm4", {AArch64::FeatureSVE2SM4}},
{"sve2-sha3", {AArch64::FeatureSVE2SHA3}},
- {"bitperm", {AArch64::FeatureSVE2BitPerm}},
+ {"sve2-bitperm", {AArch64::FeatureSVE2BitPerm}},
// FIXME: Unsupported extensions
{"pan", {}},
{"lor", {}},
{"rdma", {}},
{"profile", {}},
};
/// Append to \p Str a short name describing the features in \p FBS: the first
/// architecture version found among v8.1a..v8.5a (checked in ascending
/// order), otherwise the name of the first ExtensionMap entry that shares a
/// feature with \p FBS, otherwise "(unknown)".
static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) {
  if (FBS[AArch64::HasV8_1aOps]) {
    Str += "ARMv8.1a";
    return;
  }
  if (FBS[AArch64::HasV8_2aOps]) {
    Str += "ARMv8.2a";
    return;
  }
  if (FBS[AArch64::HasV8_3aOps]) {
    Str += "ARMv8.3a";
    return;
  }
  if (FBS[AArch64::HasV8_4aOps]) {
    Str += "ARMv8.4a";
    return;
  }
  if (FBS[AArch64::HasV8_5aOps]) {
    Str += "ARMv8.5a";
    return;
  }

  const char *Label = "(unknown)";
  for (const Extension &Ext : ExtensionMap) {
    // Use & in case multiple features are enabled
    if ((FBS & Ext.Features) != FeatureBitset()) {
      Label = Ext.Name;
      break;
    }
  }
  Str += Label;
}
void AArch64AsmParser::createSysAlias(uint16_t Encoding, OperandVector &Operands,
SMLoc S) {
const uint16_t Op2 = Encoding & 7;
const uint16_t Cm = (Encoding & 0x78) >> 3;
const uint16_t Cn = (Encoding & 0x780) >> 7;
const uint16_t Op1 = (Encoding & 0x3800) >> 11;
const MCExpr *Expr = MCConstantExpr::create(Op1, getContext());
Operands.push_back(
AArch64Operand::CreateImm(Expr, S, getLoc(), getContext()));
Operands.push_back(
AArch64Operand::CreateSysCR(Cn, S, getLoc(), getContext()));
Operands.push_back(
AArch64Operand::CreateSysCR(Cm, S, getLoc(), getContext()));
Expr = MCConstantExpr::create(Op2, getContext());
Operands.push_back(
AArch64Operand::CreateImm(Expr, S, getLoc(), getContext()));
}
/// parseSysAlias - The IC, DC, AT, and TLBI instructions are simple aliases for
/// the SYS instruction. Parse them specially so that we create a SYS MCInst.
/// The prediction restriction mnemonics (cfp, dvp, cpp) are handled the same
/// way.
///
/// \p Name is the mnemonic and \p NameLoc its location. A "sys" token followed
/// by the operands produced by createSysAlias, and the optional register
/// operand, are appended to \p Operands. Error paths return the result of
/// TokError (or true); false is returned on success.
bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
OperandVector &Operands) {
// Mnemonics containing a '.' are rejected outright.
if (Name.find('.') != StringRef::npos)
return TokError("invalid operand");
Mnemonic = Name;
// Whatever the alias, the instruction is emitted with the "sys" mnemonic.
Operands.push_back(
AArch64Operand::CreateToken("sys", false, NameLoc, getContext()));
MCAsmParser &Parser = getParser();
// NOTE: Tok is a reference to the parser's current token, so it refers to a
// different token after each Lex(). Op and S are copied out here, before any
// token is consumed.
const AsmToken &Tok = Parser.getTok();
StringRef Op = Tok.getString();
SMLoc S = Tok.getLoc();
// Each branch looks the operation name up in the table for its mnemonic,
// diagnoses an unknown name or one whose required features are missing, and
// otherwise expands the encoding into SYS operands. A mnemonic matching none
// of the branches adds no further operands here.
if (Mnemonic == "ic") {
const AArch64IC::IC *IC = AArch64IC::lookupICByName(Op);
if (!IC)
return TokError("invalid operand for IC instruction");
else if (!IC->haveFeatures(getSTI().getFeatureBits())) {
std::string Str("IC " + std::string(IC->Name) + " requires ");
setRequiredFeatureString(IC->getRequiredFeatures(), Str);
return TokError(Str.c_str());
}
createSysAlias(IC->Encoding, Operands, S);
} else if (Mnemonic == "dc") {
const AArch64DC::DC *DC = AArch64DC::lookupDCByName(Op);
if (!DC)
return TokError("invalid operand for DC instruction");
else if (!DC->haveFeatures(getSTI().getFeatureBits())) {
std::string Str("DC " + std::string(DC->Name) + " requires ");
setRequiredFeatureString(DC->getRequiredFeatures(), Str);
return TokError(Str.c_str());
}
createSysAlias(DC->Encoding, Operands, S);
} else if (Mnemonic == "at") {
const AArch64AT::AT *AT = AArch64AT::lookupATByName(Op);
if (!AT)
return TokError("invalid operand for AT instruction");
else if (!AT->haveFeatures(getSTI().getFeatureBits())) {
std::string Str("AT " + std::string(AT->Name) + " requires ");
setRequiredFeatureString(AT->getRequiredFeatures(), Str);
return TokError(Str.c_str());
}
createSysAlias(AT->Encoding, Operands, S);
} else if (Mnemonic == "tlbi") {
const AArch64TLBI::TLBI *TLBI = AArch64TLBI::lookupTLBIByName(Op);
if (!TLBI)
return TokError("invalid operand for TLBI instruction");
else if (!TLBI->haveFeatures(getSTI().getFeatureBits())) {
std::string Str("TLBI " + std::string(TLBI->Name) + " requires ");
setRequiredFeatureString(TLBI->getRequiredFeatures(), Str);
return TokError(Str.c_str());
}
createSysAlias(TLBI->Encoding, Operands, S);
} else if (Mnemonic == "cfp" || Mnemonic == "dvp" || Mnemonic == "cpp") {
const AArch64PRCTX::PRCTX *PRCTX = AArch64PRCTX::lookupPRCTXByName(Op);
if (!PRCTX)
return TokError("invalid operand for prediction restriction instruction");
else if (!PRCTX->haveFeatures(getSTI().getFeatureBits())) {
// NOTE(review): unlike the branches above, no space is inserted between the
// upper-cased mnemonic and the operation name in this message.
std::string Str(
Mnemonic.upper() + std::string(PRCTX->Name) + " requires ");
setRequiredFeatureString(PRCTX->getRequiredFeatures(), Str);
return TokError(Str.c_str());
}
// The table encoding lacks the low three (op2) bits; they are chosen by the
// mnemonic and merged in below.
uint16_t PRCTX_Op2 =
Mnemonic == "cfp" ? 4 :
Mnemonic == "dvp" ? 5 :
Mnemonic == "cpp" ? 7 :
0;
assert(PRCTX_Op2 && "Invalid mnemonic for prediction restriction instruction");
createSysAlias(PRCTX->Encoding << 3 | PRCTX_Op2 , Operands, S);
}
Parser.Lex(); // Eat operand.
// An operation takes a register unless its name contains "all" (compared
// case-insensitively).
bool ExpectRegister = (Op.lower().find("all") == StringRef::npos);
bool HasRegister = false;
// Check for the optional register operand.
if (parseOptionalToken(AsmToken::Comma)) {
// Tok now refers to the token following the comma.
if (Tok.isNot(AsmToken::Identifier) || parseRegister(Operands))
return TokError("expected register operand");
HasRegister = true;
}
if (ExpectRegister && !HasRegister)
return TokError("specified " + Mnemonic + " op requires a register");
else if (!ExpectRegister && HasRegister)
return TokError("specified " + Mnemonic + " op does not use a register");
if (parseToken(AsmToken::EndOfStatement, "unexpected token in argument list"))
return true;
return false;
}
/// Parse the operand of a barrier instruction: either an immediate in the
/// range [0, 15] (optionally preceded by '#') or a named option. For the
/// "isb" mnemonic the only accepted name is 'sy'; for "tsb" only 'csync' is
/// accepted and an immediate is not allowed. On success a Barrier operand is
/// appended to \p Operands.
OperandMatchResultTy
AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
  MCAsmParser &Parser = getParser();
  const AsmToken &Tok = Parser.getTok();
  const bool IsTSB = Mnemonic == "tsb";
  const bool IsISB = Mnemonic == "isb";

  if (IsTSB && Tok.isNot(AsmToken::Identifier)) {
    TokError("'csync' operand expected");
    return MatchOperand_ParseFail;
  }

  // Can be either a #imm style literal or an option name
  if (parseOptionalToken(AsmToken::Hash) || Tok.is(AsmToken::Integer)) {
    // Immediate operand.
    const MCExpr *ImmExpr;
    const SMLoc ExprLoc = getLoc();
    if (getParser().parseExpression(ImmExpr))
      return MatchOperand_ParseFail;

    const MCConstantExpr *Constant = dyn_cast<MCConstantExpr>(ImmExpr);
    if (!Constant) {
      Error(ExprLoc, "immediate value expected for barrier operand");
      return MatchOperand_ParseFail;
    }

    const int64_t Value = Constant->getValue();
    if (Value < 0 || Value > 15) {
      Error(ExprLoc, "barrier operand out of range");
      return MatchOperand_ParseFail;
    }

    // Attach the option's name when the encoding has one.
    auto Named = AArch64DB::lookupDBByEncoding(Value);
    Operands.push_back(AArch64Operand::CreateBarrier(
        Value, Named ? Named->Name : "", ExprLoc, getContext()));
    return MatchOperand_Success;
  }

  if (Tok.isNot(AsmToken::Identifier)) {
    TokError("invalid operand for instruction");
    return MatchOperand_ParseFail;
  }

  auto TSB = AArch64TSB::lookupTSBByName(Tok.getString());
  auto DB = AArch64DB::lookupDBByName(Tok.getString());

  if (IsISB && (!DB || DB->Encoding != AArch64DB::sy)) {
    // The only valid named option for ISB is 'sy'
    TokError("'sy' or #imm operand expected");
    return MatchOperand_ParseFail;
  }
  if (!IsISB && IsTSB && (!TSB || TSB->Encoding != AArch64TSB::csync)) {
    // The only valid named option for TSB is 'csync'
    TokError("'csync' operand expected");
    return MatchOperand_ParseFail;
  }
  if (!IsISB && !IsTSB && !DB && !TSB) {
    TokError("invalid barrier option name");
    return MatchOperand_ParseFail;
  }

  // The operand is built while Tok still refers to the option name; only
  // then is the token consumed.
  Operands.push_back(AArch64Operand::CreateBarrier(
      DB ? DB->Encoding : TSB->Encoding, Tok.getString(), getLoc(), getContext()));
  Parser.Lex(); // Consume the option

  return MatchOperand_Success;
}
/// Parse a system register operand from the current identifier token. The
/// resulting SysReg operand records the encoding to use when reading (MRS),
/// the encoding to use when writing (MSR) and the PState encoding, each set
/// to -1 when not applicable. Non-identifier tokens yield
/// MatchOperand_NoMatch.
OperandMatchResultTy
AArch64AsmParser::tryParseSysReg(OperandVector &Operands) {
  MCAsmParser &Parser = getParser();
  const AsmToken &Tok = Parser.getTok();

  if (!Tok.is(AsmToken::Identifier))
    return MatchOperand_NoMatch;

  int MRSReg, MSRReg;
  auto SysReg = AArch64SysReg::lookupSysRegByName(Tok.getString());
  const bool KnownAndEnabled =
      SysReg && SysReg->haveFeatures(getSTI().getFeatureBits());
  if (!KnownAndEnabled) {
    // Not a usable named register: try the generic register spelling and use
    // the result for both directions.
    MRSReg = MSRReg = AArch64SysReg::parseGenericRegister(Tok.getString());
  } else {
    MRSReg = SysReg->Readable ? SysReg->Encoding : -1;
    MSRReg = SysReg->Writeable ? SysReg->Encoding : -1;
  }

  unsigned PStateImm = -1;
  auto PState = AArch64PState::lookupPStateByName(Tok.getString());
  if (PState && PState->haveFeatures(getSTI().getFeatureBits()))
    PStateImm = PState->Encoding;

  // Build the operand while Tok still refers to the register name.
  Operands.push_back(
      AArch64Operand::CreateSysReg(Tok.getString(), getLoc(), MRSReg, MSRReg,
                                   PStateImm, getContext()));
  Parser.Lex(); // Eat identifier

  return MatchOperand_Success;
}
/// tryParseNeonVectorRegister - Parse a NEON vector register operand, with
/// an optional kind qualifier (added as a separate token operand) and an
/// optional vector index. Returns true when no register was parsed or when
/// parsing the index failed, false otherwise.
bool AArch64AsmParser::tryParseNeonVectorRegister(OperandVector &Operands) {
  if (!getParser().getTok().is(AsmToken::Identifier))
    return true;

  const SMLoc Start = getLoc();

  // Check for a vector register specifier first.
  StringRef Suffix;
  unsigned VecReg;
  if (tryParseVectorRegister(VecReg, Suffix, RegKind::NeonVector) !=
      MatchOperand_Success)
    return true;

  const Optional<std::pair<int, int>> Parsed =
      parseVectorKind(Suffix, RegKind::NeonVector);
  if (!Parsed.hasValue())
    return true;

  const unsigned ElementWidth = Parsed->second;
  Operands.push_back(
      AArch64Operand::CreateVectorReg(VecReg, RegKind::NeonVector, ElementWidth,
                                      Start, getLoc(), getContext()));

  // If there was an explicit qualifier, that goes on as a literal text
  // operand.
  if (!Suffix.empty())
    Operands.push_back(
        AArch64Operand::CreateToken(Suffix, false, Start, getContext()));

  const OperandMatchResultTy IndexStatus = tryParseVectorIndex(Operands);
  return IndexStatus == MatchOperand_ParseFail;
}
/// tryParseVectorIndex - Parse an optional "[<imm>]" vector index.
///
/// Returns MatchOperand_NoMatch when there is no '[' (or the expression
/// after it could not be parsed), MatchOperand_ParseFail when the index is
/// not a constant or the closing ']' is missing, and MatchOperand_Success
/// after pushing a VectorIndex operand.
OperandMatchResultTy
AArch64AsmParser::tryParseVectorIndex(OperandVector &Operands) {
  const SMLoc IndexStart = getLoc();

  // No opening bracket means there is no index at all.
  if (!parseOptionalToken(AsmToken::LBrac))
    return MatchOperand_NoMatch;

  const MCExpr *IndexExpr;
  if (getParser().parseExpression(IndexExpr))
    return MatchOperand_NoMatch;

  const auto *ConstIndex = dyn_cast<MCConstantExpr>(IndexExpr);
  if (!ConstIndex) {
    TokError("immediate value expected for vector index");
    return MatchOperand_ParseFail;
  }

  // Capture the end location before the closing bracket is consumed.
  const SMLoc IndexEnd = getLoc();
  if (parseToken(AsmToken::RBrac, "']' expected"))
    return MatchOperand_ParseFail;

  Operands.push_back(AArch64Operand::CreateVectorIndex(
      ConstIndex->getValue(), IndexStart, IndexEnd, getContext()));
  return MatchOperand_Success;
}
// tryParseVectorRegister - Try to parse a vector register name of kind
// MatchKind with an optional ".<kind>" specifier. On success the token is
// eaten, Reg receives the register number and Kind receives the specifier
// (including the leading '.') when one is present.
OperandMatchResultTy
AArch64AsmParser::tryParseVectorRegister(unsigned &Reg, StringRef &Kind,
                                         RegKind MatchKind) {
  MCAsmParser &Parser = getParser();
  const AsmToken &Tok = Parser.getTok();

  if (Tok.isNot(AsmToken::Identifier))
    return MatchOperand_NoMatch;

  // Split "<reg>[.<kind>]" at the first '.'.
  const StringRef FullName = Tok.getString();
  const size_t DotPos = FullName.find('.');
  const StringRef RegName = FullName.substr(0, DotPos);

  const unsigned Matched = matchRegisterNameAlias(RegName, MatchKind);
  if (!Matched)
    return MatchOperand_NoMatch;

  if (DotPos != StringRef::npos) {
    // Kind is written before it is validated, so it is visible to the caller
    // even on the failure path.
    Kind = FullName.substr(DotPos);
    if (!isValidVectorKind(Kind, MatchKind)) {
      TokError("invalid vector kind qualifier");
      return MatchOperand_ParseFail;
    }
  }

  Parser.Lex(); // Consume the register token.
  Reg = Matched;
  return MatchOperand_Success;
}
/// tryParseSVEPredicateVector - Parse an SVE predicate register operand,
/// optionally followed by a "/z" (zeroing) or "/m" (merging) qualifier.
///
/// Pushes the register operand and, when a qualifier is present, a "/" token
/// followed by a "z" or "m" token.
OperandMatchResultTy
AArch64AsmParser::tryParseSVEPredicateVector(OperandVector &Operands) {
  const SMLoc StartLoc = getLoc();

  // Start with the predicate register and its optional element-size suffix.
  StringRef Suffix;
  unsigned PredReg;
  const auto RegRes =
      tryParseVectorRegister(PredReg, Suffix, RegKind::SVEPredicateVector);
  if (RegRes != MatchOperand_Success)
    return RegRes;

  const auto ParsedKind = parseVectorKind(Suffix, RegKind::SVEPredicateVector);
  if (!ParsedKind)
    return MatchOperand_NoMatch;

  const unsigned Width = ParsedKind->second;
  Operands.push_back(AArch64Operand::CreateVectorReg(
      PredReg, RegKind::SVEPredicateVector, Width, StartLoc, getLoc(),
      getContext()));

  // A predicate without a trailing '/' is complete at this point.
  MCAsmParser &Parser = getParser();
  if (!Parser.getTok().is(AsmToken::Slash))
    return MatchOperand_Success;

  // A '/m' or '/z' qualifier cannot be combined with an element type suffix.
  if (!Suffix.empty()) {
    Error(StartLoc, "not expecting size suffix");
    return MatchOperand_ParseFail;
  }

  // Record the slash itself as a literal operand.
  Operands.push_back(
      AArch64Operand::CreateToken("/" , false, getLoc(), getContext()));
  Parser.Lex(); // Consume '/'.

  // The qualifier is matched case-insensitively.
  const std::string Qualifier = Parser.getTok().getString().lower();
  const bool IsZeroing = Qualifier == "z";
  if (!IsZeroing && Qualifier != "m") {
    Error(getLoc(), "expecting 'm' or 'z' predication");
    return MatchOperand_ParseFail;
  }

  // Emit the canonical lower-case spelling of the qualifier.
  const char *QualifierTok = IsZeroing ? "z" : "m";
  Operands.push_back(
      AArch64Operand::CreateToken(QualifierTok, false, getLoc(), getContext()));
  Parser.Lex(); // Consume the qualifier.

  return MatchOperand_Success;
}
/// parseRegister - Parse a register operand: a Neon vector register if
/// possible, otherwise a scalar register. Returns false when one of the two
/// parsers succeeded and true otherwise.
bool AArch64AsmParser::parseRegister(OperandVector &Operands) {
  // The scalar parser is only consulted when the Neon parser reports failure.
  return tryParseNeonVectorRegister(Operands) &&
         tryParseGPROperand<false>(Operands) != MatchOperand_Success;
}
bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) {
MCAsmParser &Parser = getParser();
bool HasELFModifier = false;
AArch64MCExpr::VariantKind RefKind;
if (parseOptionalToken(AsmToken::Colon)) {
HasELFModifier = true;
if (Parser.getTok().isNot(AsmToken::Identifier))
return TokError("expect relocation specifier in operand after ':'");
std::string LowerCase = Parser.getTok().getIdentifier().lower();
RefKind = StringSwitch<AArch64MCExpr::VariantKind>(LowerCase)
.Case("lo12", AArch64MCExpr::VK_LO12)
.Case("abs_g3", AArch64MCExpr::VK_ABS_G3)
.Case("abs_g2", AArch64MCExpr::VK_ABS_G2)
.Case("abs_g2_s", AArch64MCExpr::VK_ABS_G2_S)
.Case("abs_g2_nc", AArch64MCExpr::VK_ABS_G2_NC)
.Case("abs_g1", AArch64MCExpr::VK_ABS_G1)
.Case("abs_g1_s", AArch64MCExpr::VK_ABS_G1_S)
.Case("abs_g1_nc", AArch64MCExpr::VK_ABS_G1_NC)
.Case("abs_g0", AArch64MCExpr::VK_ABS_G0)
.Case("abs_g0_s", AArch64MCExpr::VK_ABS_G0_S)
.Case("abs_g0_nc", AArch64MCExpr::VK_ABS_G0_NC)
.Case("dtprel_g2", AArch64MCExpr::VK_DTPREL_G2)
.Case("dtprel_g1", AArch64MCExpr::VK_DTPREL_G1)
.Case("dtprel_g1_nc", AArch64MCExpr::VK_DTPREL_G1_NC)
.Case("dtprel_g0", AArch64MCExpr::VK_DTPREL_G0)
.Case("dtprel_g0_nc", AArch64MCExpr::VK_DTPREL_G0_NC)
.Case("dtprel_hi12", AArch64MCExpr::VK_DTPREL_HI12)
.Case("dtprel_lo12", AArch64MCExpr::VK_DTPREL_LO12)
.Case("dtprel_lo12_nc", AArch64MCExpr::VK_DTPREL_LO12_NC)
.Case("pg_hi21_nc", AArch64MCExpr::VK_ABS_PAGE_NC)
.Case("tprel_g2", AArch64MCExpr::VK_TPREL_G2)
.Case("tprel_g1", AArch64MCExpr::VK_TPREL_G1)
.Case("tprel_g1_nc", AArch64MCExpr::VK_TPREL_G1_NC)
.Case("tprel_g0", AArch64MCExpr::VK_TPREL_G0)
.Case("tprel_g0_nc", AArch64MCExpr::VK_TPREL_G0_NC)
.Case("tprel_hi12", AArch64MCExpr::VK_TPREL_HI12)
.Case("tprel_lo12", AArch64MCExpr::VK_TPREL_LO12)
.Case("tprel_lo12_nc", AArch64MCExpr::VK_TPREL_LO12_NC)
.Case("tlsdesc_lo12", AArch64MCExpr::VK_TLSDESC_LO12)
.Case("got", AArch64MCExpr::VK_GOT_PAGE)
.Case("got_lo12", AArch64MCExpr::VK_GOT_LO12)
.Case("gottprel", AArch64MCExpr::VK_GOTTPREL_PAGE)
.Case("gottprel_lo12", AArch64MCExpr::VK_GOTTPREL_LO12_NC)
.Case("gottprel_g1", AArch64MCExpr::VK_GOTTPREL_G1)
.Case("gottprel_g0_nc", AArch64MCExpr::VK_GOTTPREL_G0_NC)
.Case("tlsdesc", AArch64MCExpr::VK_TLSDESC_PAGE)
.Case("secrel_lo12", AArch64MCExpr::VK_SECREL_LO12)
.Case("secrel_hi12", AArch64MCExpr::VK_SECREL_HI12)
.Default(AArch64MCExpr::VK_INVALID);
if (RefKind == AArch64MCExpr::VK_INVALID)
return TokError("expect relocation specifier in operand after ':'");
Parser.Lex(); // Eat identifier
if (parseToken(AsmToken::Colon, "expect ':' after relocation specifier"))
return true;
}
if (getParser().parseExpression(ImmVal))
return true;
if (HasELFModifier)
ImmVal = AArch64MCExpr::create(ImmVal, RefKind, getContext());
return false;
}
/// tryParseVectorList - Parse a "{ ... }" list of vector registers of kind
/// VectorKind, written either as a range ("{v0.4s - v3.4s}") or as a
/// comma-separated sequence of consecutive registers.
///
/// When the first element is not a matching register the '{' is put back and
/// MatchOperand_NoMatch is returned (unless ExpectMatch turns that into an
/// error). On success a single VectorList operand is pushed.
template <RegKind VectorKind>
OperandMatchResultTy
AArch64AsmParser::tryParseVectorList(OperandVector &Operands,
                                     bool ExpectMatch) {
  MCAsmParser &Parser = getParser();
  if (Parser.getTok().isNot(AsmToken::LCurly))
    return MatchOperand_NoMatch;

  // Parses one register of the list and turns the various failure modes into
  // either a diagnostic (ParseFail) or a silent NoMatch.
  auto ParseOneReg = [this, &Parser](unsigned &Reg, StringRef &Kind, SMLoc Loc,
                                     bool NoMatchIsError) {
    // Copy the token up front; parsing may advance the lexer.
    const AsmToken FirstTok = Parser.getTok();
    auto Status = tryParseVectorRegister(Reg, Kind, VectorKind);

    if (Status == MatchOperand_Success) {
      if (!parseVectorKind(Kind, VectorKind))
        llvm_unreachable("Expected a valid vector kind");
      return Status;
    }

    const bool IsError = FirstTok.isNot(AsmToken::Identifier) ||
                         Status == MatchOperand_ParseFail ||
                         (Status == MatchOperand_NoMatch && NoMatchIsError);
    if (!IsError)
      return MatchOperand_NoMatch;

    Error(Loc, "vector register expected");
    return MatchOperand_ParseFail;
  };

  const SMLoc ListStart = getLoc();
  const AsmToken OpenBrace = Parser.getTok();
  Parser.Lex(); // Consume '{'.

  StringRef Kind;
  unsigned FirstReg;
  auto Status = ParseOneReg(FirstReg, Kind, getLoc(), ExpectMatch);

  // Restore the '{' on a plain mismatch so that a parser for another kind of
  // list operand (e.g. SVE vs. Neon) can have a go at it.
  if (Status == MatchOperand_NoMatch)
    Parser.getLexer().UnLex(OpenBrace);

  if (Status != MatchOperand_Success)
    return Status;

  // Parses a subsequent register (which must be present) and checks that its
  // kind suffix agrees with the first register's. RegLoc receives the
  // location at which the register started.
  auto ParseFollowingReg = [&](unsigned &Reg, SMLoc &RegLoc) {
    RegLoc = getLoc();
    StringRef NextKind;
    auto Res = ParseOneReg(Reg, NextKind, getLoc(), true);
    if (Res != MatchOperand_Success)
      return Res;

    if (Kind != NextKind) {
      Error(RegLoc, "mismatched register size suffix");
      return MatchOperand_ParseFail;
    }
    return MatchOperand_Success;
  };

  int64_t PrevReg = FirstReg;
  unsigned Count = 1;

  if (parseOptionalToken(AsmToken::Minus)) {
    // Range form: "<first> - <last>".
    SMLoc Loc;
    unsigned Reg;
    Status = ParseFollowingReg(Reg, Loc);
    if (Status != MatchOperand_Success)
      return Status;

    // Distance from the first to the last register, wrapping around at 32.
    unsigned Space = (PrevReg < Reg) ? (Reg - PrevReg) : (Reg + 32 - PrevReg);
    if (Space == 0 || Space > 3) {
      Error(Loc, "invalid number of vectors");
      return MatchOperand_ParseFail;
    }

    Count += Space;
  } else {
    // Sequence form: "<first>, <next>, ...".
    const MCRegisterInfo *RI = getContext().getRegisterInfo();
    while (parseOptionalToken(AsmToken::Comma)) {
      SMLoc Loc;
      unsigned Reg;
      Status = ParseFollowingReg(Reg, Loc);
      if (Status != MatchOperand_Success)
        return Status;

      // Each register must directly follow the previous one, with the
      // encoding wrapping from 31 back to 0.
      if (RI->getEncodingValue(Reg) !=
          (RI->getEncodingValue(PrevReg) + 1) % 32) {
        Error(Loc, "registers must be sequential");
        return MatchOperand_ParseFail;
      }

      PrevReg = Reg;
      ++Count;
    }
  }

  if (parseToken(AsmToken::RCurly, "'}' expected"))
    return MatchOperand_ParseFail;

  if (Count > 4) {
    Error(ListStart, "invalid number of vectors");
    return MatchOperand_ParseFail;
  }

  // Without a kind suffix both values stay zero.
  unsigned NumElements = 0;
  unsigned ElementWidth = 0;
  if (!Kind.empty()) {
    if (const auto &ParsedKind = parseVectorKind(Kind, VectorKind))
      std::tie(NumElements, ElementWidth) = *ParsedKind;
  }

  Operands.push_back(AArch64Operand::CreateVectorList(
      FirstReg, Count, NumElements, ElementWidth, VectorKind, ListStart,
      getLoc(), getContext()));

  return MatchOperand_Success;
}
/// parseNeonVectorList - Parse a vector list operand for AdvSIMD
/// instructions, together with an optional trailing vector index. Returns
/// true on failure and false on success.
bool AArch64AsmParser::parseNeonVectorList(OperandVector &Operands) {
  // A list is mandatory here, so anything other than success is a failure.
  if (tryParseVectorList<RegKind::NeonVector>(Operands, true) !=
      MatchOperand_Success)
    return true;

  // The index is optional; only a hard parse failure is reported.
  const OperandMatchResultTy IndexRes = tryParseVectorIndex(Operands);
  return IndexRes == MatchOperand_ParseFail;
}
/// tryParseGPR64sp0Operand - Parse a scalar register that may be followed by
/// an index of the form ", #0" (the '#' is optional). Any other index is
/// rejected. On success a plain scalar register operand is pushed.
OperandMatchResultTy
AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) {
  const SMLoc RegStart = getLoc();

  unsigned RegNum;
  const OperandMatchResultTy RegRes = tryParseScalarRegister(RegNum);
  if (RegRes != MatchOperand_Success)
    return RegRes;

  // Shared diagnostic for every malformed index.
  auto RejectIndex = [this] {
    Error(getLoc(), "index must be absent or #0");
    return MatchOperand_ParseFail;
  };

  // A comma introduces the index, which has to evaluate to constant zero.
  if (parseOptionalToken(AsmToken::Comma)) {
    parseOptionalToken(AsmToken::Hash);

    if (!getParser().getTok().is(AsmToken::Integer))
      return RejectIndex();

    const MCExpr *IndexExpr;
    if (getParser().parseExpression(IndexExpr))
      return RejectIndex();

    const auto *ConstIndex = dyn_cast<MCConstantExpr>(IndexExpr);
    if (!ConstIndex || ConstIndex->getValue() != 0)
      return RejectIndex();
  }

  // The index, if any, is not recorded; only the register is pushed.
  Operands.push_back(AArch64Operand::CreateReg(
      RegNum, RegKind::Scalar, RegStart, getLoc(), getContext()));
  return MatchOperand_Success;
}
/// tryParseGPROperand - Parse a scalar register operand. When
/// ParseShiftExtend is set and the register is followed by a comma, a
/// shift/extend is parsed as well and folded into the register operand.
/// EqTy is recorded on the operand as its register-equality constraint.
template <bool ParseShiftExtend, RegConstraintEqualityTy EqTy>
OperandMatchResultTy
AArch64AsmParser::tryParseGPROperand(OperandVector &Operands) {
  const SMLoc RegStart = getLoc();

  unsigned RegNum;
  const OperandMatchResultTy RegRes = tryParseScalarRegister(RegNum);
  if (RegRes != MatchOperand_Success)
    return RegRes;

  // Without a shift/extend to parse this is just a bare register.
  const bool HasShiftExtend =
      ParseShiftExtend && getParser().getTok().is(AsmToken::Comma);
  if (!HasShiftExtend) {
    Operands.push_back(AArch64Operand::CreateReg(
        RegNum, RegKind::Scalar, RegStart, getLoc(), getContext(), EqTy));
    return MatchOperand_Success;
  }

  getParser().Lex(); // Consume the comma.

  // Parse the shift/extend into a scratch vector so that its fields can be
  // merged into the register operand instead of becoming a separate operand.
  SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> Scratch;
  const OperandMatchResultTy ExtRes = tryParseOptionalShiftExtend(Scratch);
  if (ExtRes != MatchOperand_Success)
    return ExtRes;

  auto *ShiftExt = static_cast<AArch64Operand*>(Scratch.back().get());
  Operands.push_back(AArch64Operand::CreateReg(
      RegNum, RegKind::Scalar, RegStart, ShiftExt->getEndLoc(), getContext(),
      EqTy, ShiftExt->getShiftExtendType(), ShiftExt->getShiftExtendAmount(),
      ShiftExt->hasShiftExtendAmount()));

  return MatchOperand_Success;
}
/// parseOptionalMulOperand - Parse the "mul vl" decoration or the
/// "mul #<imm>" operand that follows the immediate of some SVE instructions.
///
/// The tokens are added as operands and must be present in the asm string of
/// the tablegen instruction.
///
/// Returns false on success. Returns true without consuming anything when the
/// current token is not "mul" followed by "vl" or '#'. If "mul #" is not
/// followed by a constant expression, a diagnostic is emitted and the result
/// of Error() is returned.
bool AArch64AsmParser::parseOptionalMulOperand(OperandVector &Operands) {
  MCAsmParser &Parser = getParser();

  // Look one token ahead to decide which of the two forms (if any) this is.
  const AsmToken NextTok = Parser.getLexer().peekTok();
  const bool NextIsVL = NextTok.getString().equals_lower("vl");
  const bool NextIsHash = NextTok.is(AsmToken::Hash);

  if (!Parser.getTok().getString().equals_lower("mul") ||
      !(NextIsVL || NextIsHash))
    return true;

  Operands.push_back(
      AArch64Operand::CreateToken("mul", false, getLoc(), getContext()));
  Parser.Lex(); // Eat the "mul"

  if (NextIsVL) {
    Operands.push_back(
        AArch64Operand::CreateToken("vl", false, getLoc(), getContext()));
    Parser.Lex(); // Eat the "vl"
    return false;
  }

  if (NextIsHash) {
    Parser.Lex(); // Eat the #
    SMLoc S = getLoc();

    // Parse immediate operand; only a constant expression is accepted.
    const MCExpr *ImmVal;
    if (!Parser.parseExpression(ImmVal))
      if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal)) {
        Operands.push_back(AArch64Operand::CreateImm(
            MCConstantExpr::create(MCE->getValue(), getContext()), S, getLoc(),
            getContext()));
        // This function returns bool (false == success), not an
        // OperandMatchResultTy.
        return false;
      }
  }

  return Error(getLoc(), "expected 'vl' or '#<imm>'");
}
/// parseOperand - Parse an AArch64 instruction operand. For now this parses
/// the operand regardless of the mnemonic.
///
/// A custom operand parser registered for the current mnemonic is tried
/// first; if none matches, the operand is parsed generically based on the
/// kind of the current token.
///
/// \param Operands       Operand list that parsed operands are appended to.
/// \param isCondCode     When set, an identifier is parsed as a condition
///                       code rather than as a register/expression.
/// \param invertCondCode Forwarded to parseCondCode().
/// \returns false on success, true on failure.
bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
                                    bool invertCondCode) {
  MCAsmParser &Parser = getParser();

  OperandMatchResultTy ResTy =
      MatchOperandParserImpl(Operands, Mnemonic, /*ParseForAllFeatures=*/ true);

  // Check if the current operand has a custom associated parser, if so, try to
  // custom parse the operand, or fallback to the general approach.
  if (ResTy == MatchOperand_Success)
    return false;
  // If there wasn't a custom match, try the generic matcher below. Otherwise,
  // there was a match, but an error occurred, in which case, just return that
  // the operand parsing failed.
  if (ResTy == MatchOperand_ParseFail)
    return true;

  // Nothing custom, so do general case parsing.
  SMLoc S, E;
  switch (getLexer().getKind()) {
  default: {
    // Anything not handled below is treated as a (possibly relocated)
    // immediate expression.
    SMLoc S = getLoc();
    const MCExpr *Expr;
    if (parseSymbolicImmVal(Expr))
      return Error(S, "invalid operand");

    SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
    Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext()));
    return false;
  }
  case AsmToken::LBrac: {
    SMLoc Loc = Parser.getTok().getLoc();
    Operands.push_back(AArch64Operand::CreateToken("[", false, Loc,
                                                   getContext()));
    Parser.Lex(); // Eat '['

    // There's no comma after a '[', so we can parse the next operand
    // immediately.
    return parseOperand(Operands, false, false);
  }
  case AsmToken::LCurly:
    return parseNeonVectorList(Operands);
  case AsmToken::Identifier: {
    // If we're expecting a Condition Code operand, then just parse that.
    if (isCondCode)
      return parseCondCode(Operands, invertCondCode);

    // If it's a register name, parse it.
    if (!parseRegister(Operands))
      return false;

    // See if this is a "mul vl" decoration or "mul #<int>" operand used
    // by SVE instructions.
    if (!parseOptionalMulOperand(Operands))
      return false;

    // This could be an optional "shift" or "extend" operand.
    OperandMatchResultTy GotShift = tryParseOptionalShiftExtend(Operands);
    // We can only continue if no tokens were eaten. Otherwise the result is
    // either Success or ParseFail; translate it explicitly to this
    // function's bool convention instead of relying on enumerator values.
    if (GotShift != MatchOperand_NoMatch)
      return GotShift == MatchOperand_ParseFail;

    // This was not a register so parse other operands that start with an
    // identifier (like labels) as expressions and create them as immediates.
    const MCExpr *IdVal;
    S = getLoc();
    if (getParser().parseExpression(IdVal))
      return true;
    E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
    Operands.push_back(AArch64Operand::CreateImm(IdVal, S, E, getContext()));
    return false;
  }
  case AsmToken::Integer:
  case AsmToken::Real:
  case AsmToken::Hash: {
    // #42 -> immediate.
    S = getLoc();

    parseOptionalToken(AsmToken::Hash);

    // Parse a negative sign
    bool isNegative = false;
    if (Parser.getTok().is(AsmToken::Minus)) {
      isNegative = true;
      // We need to consume this token only when we have a Real, otherwise
      // we let parseSymbolicImmVal take care of it
      if (Parser.getLexer().peekTok().is(AsmToken::Real))
        Parser.Lex();
    }

    // The only Real that should come through here is a literal #0.0 for
    // the fcmp[e] r, #0.0 instructions. They expect raw token operands,
    // so convert the value.
    const AsmToken &Tok = Parser.getTok();
    if (Tok.is(AsmToken::Real)) {
      APFloat RealVal(APFloat::IEEEdouble(), Tok.getString());
      uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
      if (Mnemonic != "fcmp" && Mnemonic != "fcmpe" && Mnemonic != "fcmeq" &&
          Mnemonic != "fcmge" && Mnemonic != "fcmgt" && Mnemonic != "fcmle" &&
          Mnemonic != "fcmlt" && Mnemonic != "fcmne")
        return TokError("unexpected floating point literal");
      else if (IntVal != 0 || isNegative)
        return TokError("expected floating-point constant #0.0");
      Parser.Lex(); // Eat the token.

      // The literal is represented as the two raw tokens "#0" and ".0".
      Operands.push_back(
          AArch64Operand::CreateToken("#0", false, S, getContext()));
      Operands.push_back(
          AArch64Operand::CreateToken(".0", false, S, getContext()));
      return false;
    }

    const MCExpr *ImmVal;
    if (parseSymbolicImmVal(ImmVal))
      return true;

    E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
    Operands.push_back(AArch64Operand::CreateImm(ImmVal, S, E, getContext()));
    return false;
  }
  case AsmToken::Equal: {
    SMLoc Loc = getLoc();
    if (Mnemonic != "ldr") // only parse for ldr pseudo (e.g. ldr r0, =val)
      return TokError("unexpected token in operand");
    Parser.Lex(); // Eat '='
    const MCExpr *SubExprVal;
    if (getParser().parseExpression(SubExprVal))
      return true;

    // Operands[0] is the mnemonic token, so Operands[1] is the destination.
    if (Operands.size() < 2 ||
        !static_cast<AArch64Operand &>(*Operands[1]).isScalarReg())
      return Error(Loc, "Only valid when first operand is register");

    bool IsXReg =
        AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
            Operands[1]->getReg());

    MCContext& Ctx = getContext();
    // NOTE(review): S is never assigned in this case, so the operands created
    // below carry a default-constructed start location - confirm intended.
    E = SMLoc::getFromPointer(Loc.getPointer() - 1);
    // If the op is an imm and can be fit into a mov, then replace ldr with mov.
    if (isa<MCConstantExpr>(SubExprVal)) {
      uint64_t Imm = (cast<MCConstantExpr>(SubExprVal))->getValue();
      uint32_t ShiftAmt = 0, MaxShiftAmt = IsXReg ? 48 : 16;
      // Strip whole 16-bit groups of trailing zeros into the shift amount.
      while(Imm > 0xFFFF && countTrailingZeros(Imm) >= 16) {
        ShiftAmt += 16;
        Imm >>= 16;
      }
      if (ShiftAmt <= MaxShiftAmt && Imm <= 0xFFFF) {
          Operands[0] = AArch64Operand::CreateToken("movz", false, Loc, Ctx);
          Operands.push_back(AArch64Operand::CreateImm(
                     MCConstantExpr::create(Imm, Ctx), S, E, Ctx));
        if (ShiftAmt)
          Operands.push_back(AArch64Operand::CreateShiftExtend(AArch64_AM::LSL,
                     ShiftAmt, true, S, E, Ctx));
        return false;
      }
      APInt Simm = APInt(64, Imm << ShiftAmt);
      // check if the immediate is an unsigned or signed 32-bit int for W regs
      if (!IsXReg && !(Simm.isIntN(32) || Simm.isSignedIntN(32)))
        return Error(Loc, "Immediate too large for register");
    }
    // If it is a label or an imm that cannot fit in a movz, put it into CP.
    const MCExpr *CPLoc =
        getTargetStreamer().addConstantPoolEntry(SubExprVal, IsXReg ? 8 : 4, Loc);
    Operands.push_back(AArch64Operand::CreateImm(CPLoc, S, E, Ctx));
    return false;
  }
  }
}
/// regsEqual - Check whether two register operands satisfy a tied-register
/// constraint. When both operands ask for plain register equality the generic
/// implementation is used; otherwise the operand that carries a sub/super
/// register constraint is converted (W<->X) before the comparison.
bool AArch64AsmParser::regsEqual(const MCParsedAsmOperand &Op1,
                                 const MCParsedAsmOperand &Op2) const {
  const auto &LHS = static_cast<const AArch64Operand&>(Op1);
  const auto &RHS = static_cast<const AArch64Operand&>(Op2);

  const auto LHSTy = LHS.getRegEqualityTy();
  if (LHSTy == RegConstraintEqualityTy::EqualsReg &&
      RHS.getRegEqualityTy() == RegConstraintEqualityTy::EqualsReg)
    return MCTargetAsmParser::regsEqual(Op1, Op2);

  assert(LHS.isScalarReg() && RHS.isScalarReg() &&
         "Testing equality of non-scalar registers not supported");

  // A constraint on the first operand takes precedence over one on the
  // second.
  if (LHSTy == RegConstraintEqualityTy::EqualsSuperReg)
    return getXRegFromWReg(Op1.getReg()) == Op2.getReg();
  if (LHSTy == RegConstraintEqualityTy::EqualsSubReg)
    return getWRegFromXReg(Op1.getReg()) == Op2.getReg();

  const auto RHSTy = RHS.getRegEqualityTy();
  if (RHSTy == RegConstraintEqualityTy::EqualsSuperReg)
    return getXRegFromWReg(Op2.getReg()) == Op1.getReg();
  if (RHSTy == RegConstraintEqualityTy::EqualsSubReg)
    return getWRegFromXReg(Op2.getReg()) == Op1.getReg();

  return false;
}
/// ParseInstruction - Parse an AArch64 instruction mnemonic followed by its
/// operands.
///
/// The mnemonic is split at '.' characters into tokens that are pushed onto
/// Operands, followed by the comma-separated operands up to the end of the
/// statement. Returns true on error (and also after handling a ".req"
/// directive, which is not an instruction), false on success.
bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info,
                                        StringRef Name, SMLoc NameLoc,
                                        OperandVector &Operands) {
  MCAsmParser &Parser = getParser();

  // Rewrite the dot-less conditional branch spellings to their "b.<cc>" form.
  Name = StringSwitch<StringRef>(Name.lower())
             .Case("beq", "b.eq")
             .Case("bne", "b.ne")
             .Case("bhs", "b.hs")
             .Case("bcs", "b.cs")
             .Case("blo", "b.lo")
             .Case("bcc", "b.cc")
             .Case("bmi", "b.mi")
             .Case("bpl", "b.pl")
             .Case("bvs", "b.vs")
             .Case("bvc", "b.vc")
             .Case("bhi", "b.hi")
             .Case("bls", "b.ls")
             .Case("bge", "b.ge")
             .Case("blt", "b.lt")
             .Case("bgt", "b.gt")
             .Case("ble", "b.le")
             .Case("bal", "b.al")
             .Case("bnv", "b.nv")
             .Default(Name);

  // "<name> .req <reg>" is an AArch64-specific directive rather than an
  // instruction. 'error' is always reported for it because the statement is
  // finished and there is no instruction left to match.
  const AsmToken &FirstTok = Parser.getTok();
  if (FirstTok.is(AsmToken::Identifier) &&
      FirstTok.getIdentifier() == ".req") {
    parseDirectiveReq(Name, NameLoc);
    return true;
  }

  // The part of the mnemonic before the first '.' is the leading token.
  size_t DotPos = Name.find('.');
  StringRef Piece = Name.slice(0, DotPos);

  // IC, DC, AT, TLBI and the prediction invalidation instructions are
  // aliases for the SYS instruction and are handled separately.
  const bool IsSysAlias = Piece == "ic" || Piece == "dc" || Piece == "at" ||
                          Piece == "tlbi" || Piece == "cfp" ||
                          Piece == "dvp" || Piece == "cpp";
  if (IsSysAlias)
    return parseSysAlias(Piece, NameLoc, Operands);

  Operands.push_back(
      AArch64Operand::CreateToken(Piece, false, NameLoc, getContext()));
  Mnemonic = Piece;

  // A branch mnemonic carries its condition code as the first suffix.
  if (Piece == "b" && DotPos != StringRef::npos) {
    const size_t SuffixStart = DotPos;
    DotPos = Name.find('.', SuffixStart + 1);
    Piece = Name.slice(SuffixStart + 1, DotPos);

    SMLoc CondLoc = SMLoc::getFromPointer(NameLoc.getPointer() +
                                          (Piece.data() - Name.data()));
    AArch64CC::CondCode CC = parseCondCodeString(Piece);
    if (CC == AArch64CC::Invalid)
      return Error(CondLoc, "invalid condition code");

    Operands.push_back(
        AArch64Operand::CreateToken(".", true, CondLoc, getContext()));
    Operands.push_back(
        AArch64Operand::CreateCondCode(CC, NameLoc, NameLoc, getContext()));
  }

  // Every remaining ".<suffix>" piece becomes a suffix token of its own
  // (the leading '.' is part of the token text).
  while (DotPos != StringRef::npos) {
    const size_t SuffixStart = DotPos;
    DotPos = Name.find('.', SuffixStart + 1);
    Piece = Name.slice(SuffixStart, DotPos);

    SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() +
                                            (Piece.data() - Name.data()) + 1);
    Operands.push_back(
        AArch64Operand::CreateToken(Piece, true, SuffixLoc, getContext()));
  }

  // Conditional compare/select instructions take a condition code as their
  // fourth operand, which needs to be parsed as such.
  const bool CondCodeIsFourth =
      Piece == "ccmp" || Piece == "ccmn" || Piece == "fccmp" ||
      Piece == "fccmpe" || Piece == "fcsel" || Piece == "csel" ||
      Piece == "csinc" || Piece == "csinv" || Piece == "csneg";

  // These instructions are aliases of some of the conditional select
  // instructions, with the condition code inverted in the aliased
  // instruction.
  //
  // FIXME: Is this the correct way to handle these? Or should the parser
  // generate the aliased instructions directly?
  const bool CondCodeIsSecond = Piece == "cset" || Piece == "csetm";
  const bool CondCodeIsThird =
      Piece == "cinc" || Piece == "cinv" || Piece == "cneg";

  // 1-based position of the condition code operand, or 0 if there is none.
  // At most one of the three flags can be set since they test one string.
  const unsigned CondCodePos =
      CondCodeIsFourth ? 4 : CondCodeIsThird ? 3 : CondCodeIsSecond ? 2 : 0;
  const bool InvertCondCode = CondCodeIsSecond || CondCodeIsThird;

  // Read the operands, if there are any.
  if (getLexer().isNot(AsmToken::EndOfStatement)) {
    for (unsigned N = 1;; ++N) {
      if (parseOperand(Operands, N == CondCodePos, InvertCondCode))
        return true;

      // Memory specifiers add two notional operands that are not separated
      // by commas:
      //  + a ']' that ends an address for load/store/prefetch, and
      //  + a '!' that marks a pre-indexed operation.
      //
      // Whether these tokens make sense in the given context is checked
      // elsewhere.
      SMLoc RBracLoc = Parser.getTok().getLoc();
      if (parseOptionalToken(AsmToken::RBrac))
        Operands.push_back(
            AArch64Operand::CreateToken("]", false, RBracLoc, getContext()));

      SMLoc ExclaimLoc = Parser.getTok().getLoc();
      if (parseOptionalToken(AsmToken::Exclaim))
        Operands.push_back(
            AArch64Operand::CreateToken("!", false, ExclaimLoc, getContext()));

      if (!parseOptionalToken(AsmToken::Comma))
        break;
    }
  }

  // Nothing but the end of the statement may follow the operands.
  return parseToken(AsmToken::EndOfStatement,
                    "unexpected token in argument list");
}
/// Returns true if Reg sits at the same offset from the start of the B, H,
/// S, D, Q or Z register bank as ZReg does from Z0, i.e. Reg is ZReg itself
/// or the same-numbered register of one of the other banks.
static inline bool isMatchingOrAlias(unsigned ZReg, unsigned Reg) {
  assert((ZReg >= AArch64::Z0) && (ZReg <= AArch64::Z31));

  // First register of every bank that is compared against the Z register.
  const unsigned BankBases[] = {AArch64::B0, AArch64::H0, AArch64::S0,
                                AArch64::D0, AArch64::Q0, AArch64::Z0};
  for (const unsigned Base : BankBases) {
    // Rebase Reg from its bank onto the Z bank and compare.
    if (ZReg == ((Reg - Base) + AArch64::Z0))
      return true;
  }
  return false;
}
// FIXME: This entire function is a giant hack to provide us with decent
// operand range validation/diagnostics until TableGen/MC can be extended
// to support autogeneration of this kind of validation.
bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc,
SmallVectorImpl<SMLoc> &Loc) {
const MCRegisterInfo *RI = getContext().getRegisterInfo();
const MCInstrDesc &MCID = MII.get(Inst.getOpcode());
// A prefix only applies to the instruction following it. Here we extract
// prefix information for the next instruction before validating the current
// one so that in the case of failure we don't erronously continue using the
// current prefix.
PrefixInfo Prefix = NextPrefix;
NextPrefix = PrefixInfo::CreateFromInst(Inst, MCID.TSFlags);
// Before validating the instruction in isolation we run through the rules
// applicable when it follows a prefix instruction.
// NOTE: brk & hlt can be prefixed but require no additional validation.
if (Prefix.isActive() &&
(Inst.getOpcode() != AArch64::BRK) &&
(Inst.getOpcode() != AArch64::HLT)) {
// Prefixed intructions must have a destructive operand.
if ((MCID.TSFlags & AArch64::DestructiveInstTypeMask) ==
AArch64::NotDestructive)
return Error(IDLoc, "instruction is unpredictable when following a"
" movprfx, suggest replacing movprfx with mov");
// Destination operands must match.
if (Inst.getOperand(0).getReg() != Prefix.getDstReg())
return Error(Loc[0], "instruction is unpredictable when following a"
" movprfx writing to a different destination");
// Destination operand must not be used in any other location.
for (unsigned i = 1; i < Inst.getNumOperands(); ++i) {
if (Inst.getOperand(i).isReg() &&
(MCID.getOperandConstraint(i, MCOI::TIED_TO) == -1) &&
isMatchingOrAlias(Prefix.getDstReg(), Inst.getOperand(i).getReg()))
return Error(Loc[0], "instruction is unpredictable when following a"
" movprfx and destination also used as non-destructive"
" source");
}
auto PPRRegClass = AArch64MCRegisterClasses[AArch64::PPRRegClassID];
if (Prefix.isPredicated()) {
int PgIdx = -1;
// Find the instructions general predicate.
for (unsigned i = 1; i < Inst.getNumOperands(); ++i)
if (Inst.getOperand(i).isReg() &&
PPRRegClass.contains(Inst.getOperand(i).getReg())) {
PgIdx = i;
break;
}
// Instruction must be predicated if the movprfx is predicated.
if (PgIdx == -1 ||
(MCID.TSFlags & AArch64::ElementSizeMask) == AArch64::ElementSizeNone)
return Error(IDLoc, "instruction is unpredictable when following a"
" predicated movprfx, suggest using unpredicated movprfx");
// Instruction must use same general predicate as the movprfx.
if (Inst.getOperand(PgIdx).getReg() != Prefix.getPgReg())
return Error(IDLoc, "instruction is unpredictable when following a"
" predicated movprfx using a different general predicate");
// Instruction element type must match the movprfx.
if ((MCID.TSFlags & AArch64::ElementSizeMask) != Prefix.getElementSize())
return Error(IDLoc, "instruction is unpredictable when following a"
" predicated movprfx with a different element size");
}
}
// Check for indexed addressing modes w/ the base register being the
// same as a destination/source register or pair load where
// the Rt == Rt2. All of those are undefined behaviour.
switch (Inst.getOpcode()) {
case AArch64::LDPSWpre:
case AArch64::LDPWpost:
case AArch64::LDPWpre:
case AArch64::LDPXpost:
case AArch64::LDPXpre: {
unsigned Rt = Inst.getOperand(1).getReg();
unsigned Rt2 = Inst.getOperand(2).getReg();
unsigned Rn = Inst.getOperand(3).getReg();
if (RI->isSubRegisterEq(Rn, Rt))
return Error(Loc[0], "unpredictable LDP instruction, writeback base "
"is also a destination");
if (RI->isSubRegisterEq(Rn, Rt2))
return Error(Loc[1], "unpredictable LDP instruction, writeback base "
"is also a destination");
LLVM_FALLTHROUGH;
}
case AArch64::LDPDi:
case AArch64::LDPQi:
case AArch64::LDPSi:
case AArch64::LDPSWi:
case AArch64::LDPWi:
case AArch64::LDPXi: {
unsigned Rt = Inst.getOperand(0).getReg();
unsigned Rt2 = Inst.getOperand(1).getReg();
if (Rt == Rt2)
return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt");
break;
}
case AArch64::LDPDpost:
case AArch64::LDPDpre:
case AArch64::LDPQpost:
case AArch64::LDPQpre:
case AArch64::LDPSpost:
case AArch64::LDPSpre:
case AArch64::LDPSWpost: {
unsigned Rt = Inst.getOperand(1).getReg();
unsigned Rt2 = Inst.getOperand(2).getReg();
if (Rt == Rt2)
return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt");
break;
}
case AArch64::STPDpost:
case AArch64::STPDpre:
case AArch64::STPQpost:
case AArch64::STPQpre:
case AArch64::STPSpost:
case AArch64::STPSpre:
case AArch64::STPWpost:
case AArch64::STPWpre:
case AArch64::STPXpost:
case AArch64::STPXpre: {
unsigned Rt = Inst.getOperand(1).getReg();
unsigned Rt2 = Inst.getOperand(2).getReg();
unsigned Rn = Inst.getOperand(3).getReg();
if (RI->isSubRegisterEq(Rn, Rt))
return Error(Loc[0], "unpredictable STP instruction, writeback base "
"is also a source");
if (RI->isSubRegisterEq(Rn, Rt2))
return Error(Loc[1], "unpredictable STP instruction, writeback base "
"is also a source");
break;
}
case AArch64::LDRBBpre:
case AArch64::LDRBpre:
case AArch64::LDRHHpre:
case AArch64::LDRHpre:
case AArch64::LDRSBWpre:
case AArch64::LDRSBXpre:
case AArch64::LDRSHWpre:
case AArch64::LDRSHXpre:
case AArch64::LDRSWpre:
case AArch64::LDRWpre:
case AArch64::LDRXpre:
case AArch64::LDRBBpost:
case AArch64::LDRBpost:
case AArch64::LDRHHpost:
case AArch64::LDRHpost:
case AArch64::LDRSBWpost:
case AArch64::LDRSBXpost:
case AArch64::LDRSHWpost:
case AArch64::LDRSHXpost:
case AArch64::LDRSWpost:
case AArch64::LDRWpost:
case AArch64::LDRXpost: {
unsigned Rt = Inst.getOperand(1).getReg();
unsigned Rn = Inst.getOperand(2).getReg();
if (RI->isSubRegisterEq(Rn, Rt))
return Error(Loc[0], "unpredictable LDR instruction, writeback base "
"is also a source");
break;
}
case AArch64::STRBBpost:
case AArch64::STRBpost:
case AArch64::STRHHpost:
case AArch64::STRHpost:
case AArch64::STRWpost:
case AArch64::STRXpost:
case AArch64::STRBBpre:
case AArch64::STRBpre:
case AArch64::STRHHpre:
case AArch64::STRHpre:
case AArch64::STRWpre:
case AArch64::STRXpre: {
unsigned Rt = Inst.getOperand(1).getReg();
unsigned Rn = Inst.getOperand(2).getReg();
if (RI->isSubRegisterEq(Rn, Rt))
return Error(Loc[0], "unpredictable STR instruction, writeback base "
"is also a source");
break;
}
case AArch64::STXRB:
case AArch64::STXRH:
case AArch64::STXRW:
case AArch64::STXRX:
case AArch64::STLXRB:
case AArch64::STLXRH:
case AArch64::STLXRW:
case AArch64::STLXRX: {
unsigned Rs = Inst.getOperand(0).getReg();
unsigned Rt = Inst.getOperand(1).getReg();
unsigned Rn = Inst.getOperand(2).getReg();
if (RI->isSubRegisterEq(Rt, Rs) ||
(RI->isSubRegisterEq(Rn, Rs) && Rn != AArch64::SP))
return Error(Loc[0],
"unpredictable STXR instruction, status is also a source");
break;
}
case AArch64::STXPW:
case AArch64::STXPX:
case AArch64::STLXPW:
case AArch64::STLXPX: {
unsigned Rs = Inst.getOperand(0).getReg();
unsigned Rt1 = Inst.getOperand(1).getReg();
unsigned Rt2 = Inst.getOperand(2).getReg();
unsigned Rn = Inst.getOperand(3).getReg();
if (RI->isSubRegisterEq(Rt1, Rs) || RI->isSubRegisterEq(Rt2, Rs) ||
(RI->isSubRegisterEq(Rn, Rs) && Rn != AArch64::SP))
return Error(Loc[0],
"unpredictable STXP instruction, status is also a source");
break;
}
}
// Now check immediate ranges. Separate from the above as there is overlap
// in the instructions being checked and this keeps the nested conditionals
// to a minimum.
switch (Inst.getOpcode()) {
case AArch64::ADDSWri:
case AArch64::ADDSXri:
case AArch64::ADDWri:
case AArch64::ADDXri:
case AArch64::SUBSWri:
case AArch64::SUBSXri:
case AArch64::SUBWri:
case AArch64::SUBXri: {
// Annoyingly we can't do this in the isAddSubImm predicate, so there is
// some slight duplication here.
if (Inst.getOperand(2).isExpr()) {
const MCExpr *Expr = Inst.getOperand(2).getExpr();
AArch64MCExpr::VariantKind ELFRefKind;
MCSymbolRefExpr::VariantKind DarwinRefKind;
int64_t Addend;
if (classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) {
// Only allow these with ADDXri.
if ((DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF ||
DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF) &&
Inst.getOpcode() == AArch64::ADDXri)
return false;
// Only allow these with ADDXri/ADDWri
if ((ELFRefKind == AArch64MCExpr::VK_LO12 ||
ELFRefKind == AArch64MCExpr::VK_DTPREL_HI12 ||
ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 ||
ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC ||
ELFRefKind == AArch64MCExpr::VK_TPREL_HI12 ||
ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 ||
ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC ||
ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12 ||
ELFRefKind == AArch64MCExpr::VK_SECREL_LO12 ||
ELFRefKind == AArch64MCExpr::VK_SECREL_HI12) &&
(Inst.getOpcode() == AArch64::ADDXri ||
Inst.getOpcode() == AArch64::ADDWri))
return false;
// Don't allow symbol refs in the immediate field otherwise
// Note: Loc.back() may be Loc[1] or Loc[2] depending on the number of
// operands of the original instruction (i.e. 'add w0, w1, borked' vs
// 'cmp w0, 'borked')
return Error(Loc.back(), "invalid immediate expression");
}
// We don't validate more complex expressions here
}
return false;
}
default:
return false;
}
}
// Forward declaration. showMatchError() below calls this with the mnemonic
// token and the set of available features, and appends the returned string to
// its "unrecognized instruction mnemonic" diagnostic. The definition is not
// part of this section of the file.
static std::string AArch64MnemonicSpellCheck(StringRef S,
const FeatureBitset &FBS,
unsigned VariantID = 0);
/// Report the diagnostic associated with a failed-match code.
///
/// \param Loc       Location the diagnostic is attached to.
/// \param ErrCode   Match_* code that selects the diagnostic text.
/// \param ErrorInfo Index into \p Operands; only read when \p ErrCode is
///                  Match_InvalidTiedOperand.
/// \param Operands  Parsed operand list; Operands[0] is read as the mnemonic
///                  token when \p ErrCode is Match_MnemonicFail.
/// \returns the result of the Error() call that emits the message.
bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode,
                                      uint64_t ErrorInfo,
                                      OperandVector &Operands) {
  // A tied-operand failure words its message according to the equality
  // constraint recorded on the operand that ErrorInfo points at.
  if (ErrCode == Match_InvalidTiedOperand) {
    const auto &TiedOp =
        static_cast<const AArch64Operand &>(*Operands[ErrorInfo]);
    switch (TiedOp.getRegEqualityTy()) {
    case RegConstraintEqualityTy::EqualsSubReg:
      return Error(Loc, "operand must be 64-bit form of destination register");
    case RegConstraintEqualityTy::EqualsSuperReg:
      return Error(Loc, "operand must be 32-bit form of destination register");
    case RegConstraintEqualityTy::EqualsReg:
      return Error(Loc, "operand must match destination register");
    }
    llvm_unreachable("Unknown RegConstraintEqualityTy");
  }

  // An unknown mnemonic gets the spell checker's suggestion appended.
  if (ErrCode == Match_MnemonicFail) {
    AArch64Operand &Mnemonic = static_cast<AArch64Operand &>(*Operands[0]);
    std::string Suggestion = AArch64MnemonicSpellCheck(
        Mnemonic.getToken(), ComputeAvailableFeatures(STI->getFeatureBits()));
    return Error(Loc, "unrecognized instruction mnemonic" + Suggestion);
  }

  // Every remaining code maps to one fixed message; pick it here and emit it
  // from the single Error() call after the switch.
  const char *Diag = nullptr;
  switch (ErrCode) {
  // --- Generic failures ---
  case Match_MissingFeature:
    Diag = "instruction requires a CPU feature not currently enabled";
    break;
  case Match_InvalidOperand:
    Diag = "invalid operand for instruction";
    break;
  case Match_InvalidSuffix:
    Diag = "invalid type suffix for instruction";
    break;
  case Match_InvalidCondCode:
    Diag = "expected AArch64 condition code";
    break;

  // --- Add/sub, logical and move operands ---
  case Match_AddSubRegExtendSmall:
    Diag = "expected '[su]xt[bhw]' with optional integer in range [0, 4]";
    break;
  case Match_AddSubRegExtendLarge:
    Diag =
        "expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4]";
    break;
  case Match_AddSubSecondSource:
    Diag =
        "expected compatible register, symbol or integer in range [0, 4095]";
    break;
  case Match_LogicalSecondSource:
    Diag = "expected compatible register or logical immediate";
    break;
  case Match_InvalidMovImm32Shift:
    Diag = "expected 'lsl' with optional integer 0 or 16";
    break;
  case Match_InvalidMovImm64Shift:
    Diag = "expected 'lsl' with optional integer 0, 16, 32 or 48";
    break;
  case Match_AddSubRegShift32:
    Diag =
        "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 31]";
    break;
  case Match_AddSubRegShift64:
    Diag =
        "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 63]";
    break;
  case Match_InvalidFPImm:
    Diag = "expected compatible register or floating-point constant";
    break;

  // --- Signed memory indices ---
  case Match_InvalidMemoryIndexedSImm6:
    Diag = "index must be an integer in range [-32, 31].";
    break;
  case Match_InvalidMemoryIndexedSImm5:
    Diag = "index must be an integer in range [-16, 15].";
    break;
  case Match_InvalidMemoryIndexed1SImm4:
    Diag = "index must be an integer in range [-8, 7].";
    break;
  case Match_InvalidMemoryIndexed2SImm4:
    Diag = "index must be a multiple of 2 in range [-16, 14].";
    break;
  case Match_InvalidMemoryIndexed3SImm4:
    Diag = "index must be a multiple of 3 in range [-24, 21].";
    break;
  case Match_InvalidMemoryIndexed4SImm4:
    Diag = "index must be a multiple of 4 in range [-32, 28].";
    break;
  case Match_InvalidMemoryIndexed16SImm4:
    Diag = "index must be a multiple of 16 in range [-128, 112].";
    break;
  case Match_InvalidMemoryIndexed1SImm6:
    Diag = "index must be an integer in range [-32, 31].";
    break;
  case Match_InvalidMemoryIndexedSImm8:
    Diag = "index must be an integer in range [-128, 127].";
    break;
  case Match_InvalidMemoryIndexedSImm9:
    Diag = "index must be an integer in range [-256, 255].";
    break;
  case Match_InvalidMemoryIndexed16SImm9:
    Diag = "index must be a multiple of 16 in range [-4096, 4080].";
    break;
  case Match_InvalidMemoryIndexed8SImm10:
    Diag = "index must be a multiple of 8 in range [-4096, 4088].";
    break;
  case Match_InvalidMemoryIndexed4SImm7:
    Diag = "index must be a multiple of 4 in range [-256, 252].";
    break;
  case Match_InvalidMemoryIndexed8SImm7:
    Diag = "index must be a multiple of 8 in range [-512, 504].";
    break;
  case Match_InvalidMemoryIndexed16SImm7:
    Diag = "index must be a multiple of 16 in range [-1024, 1008].";
    break;

  // --- Unsigned memory indices ---
  case Match_InvalidMemoryIndexed8UImm5:
    Diag = "index must be a multiple of 8 in range [0, 248].";
    break;
  case Match_InvalidMemoryIndexed4UImm5:
    Diag = "index must be a multiple of 4 in range [0, 124].";
    break;
  case Match_InvalidMemoryIndexed2UImm5:
    Diag = "index must be a multiple of 2 in range [0, 62].";
    break;
  case Match_InvalidMemoryIndexed8UImm6:
    Diag = "index must be a multiple of 8 in range [0, 504].";
    break;
  case Match_InvalidMemoryIndexed16UImm6:
    Diag = "index must be a multiple of 16 in range [0, 1008].";
    break;
  case Match_InvalidMemoryIndexed4UImm6:
    Diag = "index must be a multiple of 4 in range [0, 252].";
    break;
  case Match_InvalidMemoryIndexed2UImm6:
    Diag = "index must be a multiple of 2 in range [0, 126].";
    break;
  case Match_InvalidMemoryIndexed1UImm6:
    Diag = "index must be in range [0, 63].";
    break;

  // --- Memory operand extends ---
  case Match_InvalidMemoryWExtend8:
    Diag = "expected 'uxtw' or 'sxtw' with optional shift of #0";
    break;
  case Match_InvalidMemoryWExtend16:
    Diag = "expected 'uxtw' or 'sxtw' with optional shift of #0 or #1";
    break;
  case Match_InvalidMemoryWExtend32:
    Diag = "expected 'uxtw' or 'sxtw' with optional shift of #0 or #2";
    break;
  case Match_InvalidMemoryWExtend64:
    Diag = "expected 'uxtw' or 'sxtw' with optional shift of #0 or #3";
    break;
  case Match_InvalidMemoryWExtend128:
    Diag = "expected 'uxtw' or 'sxtw' with optional shift of #0 or #4";
    break;
  case Match_InvalidMemoryXExtend8:
    Diag = "expected 'lsl' or 'sxtx' with optional shift of #0";
    break;
  case Match_InvalidMemoryXExtend16:
    Diag = "expected 'lsl' or 'sxtx' with optional shift of #0 or #1";
    break;
  case Match_InvalidMemoryXExtend32:
    Diag = "expected 'lsl' or 'sxtx' with optional shift of #0 or #2";
    break;
  case Match_InvalidMemoryXExtend64:
    Diag = "expected 'lsl' or 'sxtx' with optional shift of #0 or #3";
    break;
  case Match_InvalidMemoryXExtend128:
    Diag = "expected 'lsl' or 'sxtx' with optional shift of #0 or #4";
    break;

  // --- Scaled memory indices ---
  case Match_InvalidMemoryIndexed1:
    Diag = "index must be an integer in range [0, 4095].";
    break;
  case Match_InvalidMemoryIndexed2:
    Diag = "index must be a multiple of 2 in range [0, 8190].";
    break;
  case Match_InvalidMemoryIndexed4:
    Diag = "index must be a multiple of 4 in range [0, 16380].";
    break;
  case Match_InvalidMemoryIndexed8:
    Diag = "index must be a multiple of 8 in range [0, 32760].";
    break;
  case Match_InvalidMemoryIndexed16:
    Diag = "index must be a multiple of 16 in range [0, 65520].";
    break;

  // --- Plain immediates ---
  case Match_InvalidImm0_1:
    Diag = "immediate must be an integer in range [0, 1].";
    break;
  case Match_InvalidImm0_7:
    Diag = "immediate must be an integer in range [0, 7].";
    break;
  case Match_InvalidImm0_15:
    Diag = "immediate must be an integer in range [0, 15].";
    break;
  case Match_InvalidImm0_31:
    Diag = "immediate must be an integer in range [0, 31].";
    break;
  case Match_InvalidImm0_63:
    Diag = "immediate must be an integer in range [0, 63].";
    break;
  case Match_InvalidImm0_127:
    Diag = "immediate must be an integer in range [0, 127].";
    break;
  case Match_InvalidImm0_255:
    Diag = "immediate must be an integer in range [0, 255].";
    break;
  case Match_InvalidImm0_65535:
    Diag = "immediate must be an integer in range [0, 65535].";
    break;
  case Match_InvalidImm1_8:
    Diag = "immediate must be an integer in range [1, 8].";
    break;
  case Match_InvalidImm1_16:
    Diag = "immediate must be an integer in range [1, 16].";
    break;
  case Match_InvalidImm1_32:
    Diag = "immediate must be an integer in range [1, 32].";
    break;
  case Match_InvalidImm1_64:
    Diag = "immediate must be an integer in range [1, 64].";
    break;

  // --- SVE add/sub and cpy immediates ---
  case Match_InvalidSVEAddSubImm8:
    Diag = "immediate must be an integer in range [0, 255]"
           " with a shift amount of 0";
    break;
  case Match_InvalidSVEAddSubImm16:
  case Match_InvalidSVEAddSubImm32:
  case Match_InvalidSVEAddSubImm64:
    Diag = "immediate must be an integer in range [0, 255] or a "
           "multiple of 256 in range [256, 65280]";
    break;
  case Match_InvalidSVECpyImm8:
    Diag = "immediate must be an integer in range [-128, 255]"
           " with a shift amount of 0";
    break;
  case Match_InvalidSVECpyImm16:
    Diag = "immediate must be an integer in range [-128, 127] or a "
           "multiple of 256 in range [-32768, 65280]";
    break;
  case Match_InvalidSVECpyImm32:
  case Match_InvalidSVECpyImm64:
    Diag = "immediate must be an integer in range [-128, 127] or a "
           "multiple of 256 in range [-32768, 32512]";
    break;

  // --- Vector lane indices ---
  case Match_InvalidIndexRange1_1:
    Diag = "expected lane specifier '[1]'";
    break;
  case Match_InvalidIndexRange0_15:
    Diag = "vector lane must be an integer in range [0, 15].";
    break;
  case Match_InvalidIndexRange0_7:
    Diag = "vector lane must be an integer in range [0, 7].";
    break;
  case Match_InvalidIndexRange0_3:
    Diag = "vector lane must be an integer in range [0, 3].";
    break;
  case Match_InvalidIndexRange0_1:
    Diag = "vector lane must be an integer in range [0, 1].";
    break;
  case Match_InvalidSVEIndexRange0_63:
    Diag = "vector lane must be an integer in range [0, 63].";
    break;
  case Match_InvalidSVEIndexRange0_31:
    Diag = "vector lane must be an integer in range [0, 31].";
    break;
  case Match_InvalidSVEIndexRange0_15:
    Diag = "vector lane must be an integer in range [0, 15].";
    break;
  case Match_InvalidSVEIndexRange0_7:
    Diag = "vector lane must be an integer in range [0, 7].";
    break;
  case Match_InvalidSVEIndexRange0_3:
    Diag = "vector lane must be an integer in range [0, 3].";
    break;

  // --- Labels, system registers, complex rotations ---
  case Match_InvalidLabel:
    Diag = "expected label or encodable integer pc offset";
    break;
  case Match_MRS:
    Diag = "expected readable system register";
    break;
  case Match_MSR:
    Diag = "expected writable system register or pstate";
    break;
  case Match_InvalidComplexRotationEven:
    Diag = "complex rotation must be 0, 90, 180 or 270.";
    break;
  case Match_InvalidComplexRotationOdd:
    Diag = "complex rotation must be 90 or 270.";
    break;

  // --- Shifted 64-bit general-purpose registers ---
  case Match_InvalidGPR64shifted8:
    Diag = "register must be x0..x30 or xzr, without shift";
    break;
  case Match_InvalidGPR64shifted16:
    Diag = "register must be x0..x30 or xzr, with required shift 'lsl #1'";
    break;
  case Match_InvalidGPR64shifted32:
    Diag = "register must be x0..x30 or xzr, with required shift 'lsl #2'";
    break;
  case Match_InvalidGPR64shifted64:
    Diag = "register must be x0..x30 or xzr, with required shift 'lsl #3'";
    break;
  case Match_InvalidGPR64NoXZRshifted8:
    Diag = "register must be x0..x30 without shift";
    break;
  case Match_InvalidGPR64NoXZRshifted16:
    Diag = "register must be x0..x30 with required shift 'lsl #1'";
    break;
  case Match_InvalidGPR64NoXZRshifted32:
    Diag = "register must be x0..x30 with required shift 'lsl #2'";
    break;
  case Match_InvalidGPR64NoXZRshifted64:
    Diag = "register must be x0..x30 with required shift 'lsl #3'";
    break;

  // --- SVE vector registers with shift/extend ---
  case Match_InvalidZPR32UXTW8:
  case Match_InvalidZPR32SXTW8:
    Diag = "invalid shift/extend specified, expected 'z[0..31].s, (uxtw|sxtw)'";
    break;
  case Match_InvalidZPR32UXTW16:
  case Match_InvalidZPR32SXTW16:
    Diag = "invalid shift/extend specified, expected 'z[0..31].s, (uxtw|sxtw) #1'";
    break;
  case Match_InvalidZPR32UXTW32:
  case Match_InvalidZPR32SXTW32:
    Diag = "invalid shift/extend specified, expected 'z[0..31].s, (uxtw|sxtw) #2'";
    break;
  case Match_InvalidZPR32UXTW64:
  case Match_InvalidZPR32SXTW64:
    Diag = "invalid shift/extend specified, expected 'z[0..31].s, (uxtw|sxtw) #3'";
    break;
  case Match_InvalidZPR64UXTW8:
  case Match_InvalidZPR64SXTW8:
    Diag = "invalid shift/extend specified, expected 'z[0..31].d, (uxtw|sxtw)'";
    break;
  case Match_InvalidZPR64UXTW16:
  case Match_InvalidZPR64SXTW16:
    Diag = "invalid shift/extend specified, expected 'z[0..31].d, (lsl|uxtw|sxtw) #1'";
    break;
  case Match_InvalidZPR64UXTW32:
  case Match_InvalidZPR64SXTW32:
    Diag = "invalid shift/extend specified, expected 'z[0..31].d, (lsl|uxtw|sxtw) #2'";
    break;
  case Match_InvalidZPR64UXTW64:
  case Match_InvalidZPR64SXTW64:
    Diag = "invalid shift/extend specified, expected 'z[0..31].d, (lsl|uxtw|sxtw) #3'";
    break;
  case Match_InvalidZPR32LSL8:
    Diag = "invalid shift/extend specified, expected 'z[0..31].s'";
    break;
  case Match_InvalidZPR32LSL16:
    Diag = "invalid shift/extend specified, expected 'z[0..31].s, lsl #1'";
    break;
  case Match_InvalidZPR32LSL32:
    Diag = "invalid shift/extend specified, expected 'z[0..31].s, lsl #2'";
    break;
  case Match_InvalidZPR32LSL64:
    Diag = "invalid shift/extend specified, expected 'z[0..31].s, lsl #3'";
    break;
  case Match_InvalidZPR64LSL8:
    Diag = "invalid shift/extend specified, expected 'z[0..31].d'";
    break;
  case Match_InvalidZPR64LSL16:
    Diag = "invalid shift/extend specified, expected 'z[0..31].d, lsl #1'";
    break;
  case Match_InvalidZPR64LSL32:
    Diag = "invalid shift/extend specified, expected 'z[0..31].d, lsl #2'";
    break;
  case Match_InvalidZPR64LSL64:
    Diag = "invalid shift/extend specified, expected 'z[0..31].d, lsl #3'";
    break;

  // --- SVE vector registers: element width and restricted ranges ---
  case Match_InvalidZPR0:
    Diag = "expected register without element width suffix";
    break;
  case Match_InvalidZPR8:
  case Match_InvalidZPR16:
  case Match_InvalidZPR32:
  case Match_InvalidZPR64:
  case Match_InvalidZPR128:
    Diag = "invalid element width";
    break;
  case Match_InvalidZPR_3b8:
    Diag = "Invalid restricted vector register, expected z0.b..z7.b";
    break;
  case Match_InvalidZPR_3b16:
    Diag = "Invalid restricted vector register, expected z0.h..z7.h";
    break;
  case Match_InvalidZPR_3b32:
    Diag = "Invalid restricted vector register, expected z0.s..z7.s";
    break;
  case Match_InvalidZPR_4b16:
    Diag = "Invalid restricted vector register, expected z0.h..z15.h";
    break;
  case Match_InvalidZPR_4b32:
    Diag = "Invalid restricted vector register, expected z0.s..z15.s";
    break;
  case Match_InvalidZPR_4b64:
    Diag = "Invalid restricted vector register, expected z0.d..z15.d";
    break;

  // --- SVE predicates ---
  case Match_InvalidSVEPattern:
    Diag = "invalid predicate pattern";
    break;
  case Match_InvalidSVEPredicateAnyReg:
  case Match_InvalidSVEPredicateBReg:
  case Match_InvalidSVEPredicateHReg:
  case Match_InvalidSVEPredicateSReg:
  case Match_InvalidSVEPredicateDReg:
    Diag = "invalid predicate register.";
    break;
  case Match_InvalidSVEPredicate3bAnyReg:
    Diag = "invalid restricted predicate register, expected p0..p7 (without element suffix)";
    break;
  case Match_InvalidSVEPredicate3bBReg:
    Diag = "invalid restricted predicate register, expected p0.b..p7.b";
    break;
  case Match_InvalidSVEPredicate3bHReg:
    Diag = "invalid restricted predicate register, expected p0.h..p7.h";
    break;
  case Match_InvalidSVEPredicate3bSReg:
    Diag = "invalid restricted predicate register, expected p0.s..p7.s";
    break;
  case Match_InvalidSVEPredicate3bDReg:
    Diag = "invalid restricted predicate register, expected p0.d..p7.d";
    break;

  // --- SVE exact floating-point immediates ---
  case Match_InvalidSVEExactFPImmOperandHalfOne:
    Diag = "Invalid floating point constant, expected 0.5 or 1.0.";
    break;
  case Match_InvalidSVEExactFPImmOperandHalfTwo:
    Diag = "Invalid floating point constant, expected 0.5 or 2.0.";
    break;
  case Match_InvalidSVEExactFPImmOperandZeroOne:
    Diag = "Invalid floating point constant, expected 0.0 or 1.0.";
    break;

  default:
    llvm_unreachable("unexpected error code!");
  }
  return Error(Loc, Diag);
}
// Forward declaration. MatchAndEmitInstruction() below calls this with the
// index of each set bit in the missing-feature set and appends the returned
// text to its "instruction requires:" diagnostic. The definition is not part
// of this section of the file.
static const char *getSubtargetFeatureName(uint64_t Val);
bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands,
MCStreamer &Out,
uint64_t &ErrorInfo,
bool MatchingInlineAsm) {
assert(!Operands.empty() && "Unexpect empty operand list!");
AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[0]);
assert(Op.isToken() && "Leading operand should always be a mnemonic!");
StringRef Tok = Op.getToken();
unsigned NumOperands = Operands.size();
if (NumOperands == 4 && Tok == "lsl") {
AArch64Operand &Op2 = static_cast<AArch64Operand &>(*Operands[2]);
AArch64Operand &Op3 = static_cast<AArch64Operand &>(*Operands[3]);
if (Op2.isScalarReg() && Op3.isImm()) {
const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3.getImm());
if (Op3CE) {
uint64_t Op3Val = Op3CE->getValue();
uint64_t NewOp3Val = 0;
uint64_t NewOp4Val = 0;
if (AArch64MCRegisterClasses[AArch64::GPR32allRegClassID].contains(
Op2.getReg())) {
NewOp3Val = (32 - Op3Val) & 0x1f;
NewOp4Val = 31 - Op3Val;
} else {
NewOp3Val = (64 - Op3Val) & 0x3f;
NewOp4Val = 63 - Op3Val;
}
const MCExpr *NewOp3 = MCConstantExpr::create(NewOp3Val, getContext());
const MCExpr *NewOp4 = MCConstantExpr::create(NewOp4Val, getContext());
Operands[0] = AArch64Operand::CreateToken(
"ubfm", false, Op.getStartLoc(), getContext());
Operands.push_back(AArch64Operand::CreateImm(
NewOp4, Op3.getStartLoc(), Op3.getEndLoc(), getContext()));
Operands[3] = AArch64Operand::CreateImm(NewOp3, Op3.getStartLoc(),
Op3.getEndLoc(), getContext());
}
}
} else if (NumOperands == 4 && Tok == "bfc") {
// FIXME: Horrible hack to handle BFC->BFM alias.
AArch64Operand &Op1 = static_cast<AArch64Operand &>(*Operands[1]);
AArch64Operand LSBOp = static_cast<AArch64Operand &>(*Operands[2]);
AArch64Operand WidthOp = static_cast<AArch64Operand &>(*Operands[3]);
if (Op1.isScalarReg() && LSBOp.isImm() && WidthOp.isImm()) {
const MCConstantExpr *LSBCE = dyn_cast<MCConstantExpr>(LSBOp.getImm());
const MCConstantExpr *WidthCE = dyn_cast<MCConstantExpr>(WidthOp.getImm());
if (LSBCE && WidthCE) {
uint64_t LSB = LSBCE->getValue();
uint64_t Width = WidthCE->getValue();
uint64_t RegWidth = 0;
if (AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
Op1.getReg()))
RegWidth = 64;
else
RegWidth = 32;
if (LSB >= RegWidth)
return Error(LSBOp.getStartLoc(),
"expected integer in range [0, 31]");
if (Width < 1 || Width > RegWidth)
return Error(WidthOp.getStartLoc(),
"expected integer in range [1, 32]");
uint64_t ImmR = 0;
if (RegWidth == 32)
ImmR = (32 - LSB) & 0x1f;
else
ImmR = (64 - LSB) & 0x3f;
uint64_t ImmS = Width - 1;
if (ImmR != 0 && ImmS >= ImmR)
return Error(WidthOp.getStartLoc(),
"requested insert overflows register");
const MCExpr *ImmRExpr = MCConstantExpr::create(ImmR, getContext());
const MCExpr *ImmSExpr = MCConstantExpr::create(ImmS, getContext());
Operands[0] = AArch64Operand::CreateToken(
"bfm", false, Op.getStartLoc(), getContext());
Operands[2] = AArch64Operand::CreateReg(
RegWidth == 32 ? AArch64::WZR : AArch64::XZR, RegKind::Scalar,
SMLoc(), SMLoc(), getContext());
Operands[3] = AArch64Operand::CreateImm(
ImmRExpr, LSBOp.getStartLoc(), LSBOp.getEndLoc(), getContext());
Operands.emplace_back(
AArch64Operand::CreateImm(ImmSExpr, WidthOp.getStartLoc(),
WidthOp.getEndLoc(), getContext()));
}
}
} else if (NumOperands == 5) {
// FIXME: Horrible hack to handle the BFI -> BFM, SBFIZ->SBFM, and
// UBFIZ -> UBFM aliases.
if (Tok == "bfi" || Tok == "sbfiz" || Tok == "ubfiz") {
AArch64Operand &Op1 = static_cast<AArch64Operand &>(*Operands[1]);
AArch64Operand &Op3 = static_cast<AArch64Operand &>(*Operands[3]);
AArch64Operand &Op4 = static_cast<AArch64Operand &>(*Operands[4]);
if (Op1.isScalarReg() && Op3.isImm() && Op4.isImm()) {
const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3.getImm());
const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4.getImm());
if (Op3CE && Op4CE) {
uint64_t Op3Val = Op3CE->getValue();
uint64_t Op4Val = Op4CE->getValue();
uint64_t RegWidth = 0;
if (AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
Op1.getReg()))
RegWidth = 64;
else
RegWidth = 32;
if (Op3Val >= RegWidth)
return Error(Op3.getStartLoc(),
"expected integer in range [0, 31]");
if (Op4Val < 1 || Op4Val > RegWidth)
return Error(Op4.getStartLoc(),
"expected integer in range [1, 32]");
uint64_t NewOp3Val = 0;
if (RegWidth == 32)
NewOp3Val = (32 - Op3Val) & 0x1f;
else
NewOp3Val = (64 - Op3Val) & 0x3f;
uint64_t NewOp4Val = Op4Val - 1;
if (NewOp3Val != 0 && NewOp4Val >= NewOp3Val)
return Error(Op4.getStartLoc(),
"requested insert overflows register");
const MCExpr *NewOp3 =
MCConstantExpr::create(NewOp3Val, getContext());
const MCExpr *NewOp4 =
MCConstantExpr::create(NewOp4Val, getContext());
Operands[3] = AArch64Operand::CreateImm(
NewOp3, Op3.getStartLoc(), Op3.getEndLoc(), getContext());
Operands[4] = AArch64Operand::CreateImm(
NewOp4, Op4.getStartLoc(), Op4.getEndLoc(), getContext());
if (Tok == "bfi")
Operands[0] = AArch64Operand::CreateToken(
"bfm", false, Op.getStartLoc(), getContext());
else if (Tok == "sbfiz")
Operands[0] = AArch64Operand::CreateToken(
"sbfm", false, Op.getStartLoc(), getContext());
else if (Tok == "ubfiz")
Operands[0] = AArch64Operand::CreateToken(
"ubfm", false, Op.getStartLoc(), getContext());
else
llvm_unreachable("No valid mnemonic for alias?");
}
}
// FIXME: Horrible hack to handle the BFXIL->BFM, SBFX->SBFM, and
// UBFX -> UBFM aliases.
} else if (NumOperands == 5 &&
(Tok == "bfxil" || Tok == "sbfx" || Tok == "ubfx")) {
AArch64Operand &Op1 = static_cast<AArch64Operand &>(*Operands[1]);
AArch64Operand &Op3 = static_cast<AArch64Operand &>(*Operands[3]);
AArch64Operand &Op4 = static_cast<AArch64Operand &>(*Operands[4]);
if (Op1.isScalarReg() && Op3.isImm() && Op4.isImm()) {
const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3.getImm());
const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4.getImm());
if (Op3CE && Op4CE) {
uint64_t Op3Val = Op3CE->getValue();
uint64_t Op4Val = Op4CE->getValue();
uint64_t RegWidth = 0;
if (AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
Op1.getReg()))
RegWidth = 64;
else
RegWidth = 32;
if (Op3Val >= RegWidth)
return Error(Op3.getStartLoc(),
"expected integer in range [0, 31]");
if (Op4Val < 1 || Op4Val > RegWidth)
return Error(Op4.getStartLoc(),
"expected integer in range [1, 32]");
uint64_t NewOp4Val = Op3Val + Op4Val - 1;
if (NewOp4Val >= RegWidth || NewOp4Val < Op3Val)
return Error(Op4.getStartLoc(),
"requested extract overflows register");
const MCExpr *NewOp4 =
MCConstantExpr::create(NewOp4Val, getContext());
Operands[4] = AArch64Operand::CreateImm(
NewOp4, Op4.getStartLoc(), Op4.getEndLoc(), getContext());
if (Tok == "bfxil")
Operands[0] = AArch64Operand::CreateToken(
"bfm", false, Op.getStartLoc(), getContext());
else if (Tok == "sbfx")
Operands[0] = AArch64Operand::CreateToken(
"sbfm", false, Op.getStartLoc(), getContext());
else if (Tok == "ubfx")
Operands[0] = AArch64Operand::CreateToken(
"ubfm", false, Op.getStartLoc(), getContext());
else
llvm_unreachable("No valid mnemonic for alias?");
}
}
}
}
// The Cyclone CPU and early successors didn't execute the zero-cycle zeroing
// instruction for FP registers correctly in some rare circumstances. Convert
// it to a safe instruction and warn (because silently changing someone's
// assembly is rude).
if (getSTI().getFeatureBits()[AArch64::FeatureZCZeroingFPWorkaround] &&
NumOperands == 4 && Tok == "movi") {
AArch64Operand &Op1 = static_cast<AArch64Operand &>(*Operands[1]);
AArch64Operand &Op2 = static_cast<AArch64Operand &>(*Operands[2]);
AArch64Operand &Op3 = static_cast<AArch64Operand &>(*Operands[3]);
if ((Op1.isToken() && Op2.isNeonVectorReg() && Op3.isImm()) ||
(Op1.isNeonVectorReg() && Op2.isToken() && Op3.isImm())) {
StringRef Suffix = Op1.isToken() ? Op1.getToken() : Op2.getToken();
if (Suffix.lower() == ".2d" &&
cast<MCConstantExpr>(Op3.getImm())->getValue() == 0) {
Warning(IDLoc, "instruction movi.2d with immediate #0 may not function"
" correctly on this CPU, converting to equivalent movi.16b");
// Switch the suffix to .16b.
unsigned Idx = Op1.isToken() ? 1 : 2;
Operands[Idx] = AArch64Operand::CreateToken(".16b", false, IDLoc,
getContext());
}
}
}
// FIXME: Horrible hack for sxtw and uxtw with Wn src and Xd dst operands.
// InstAlias can't quite handle this since the reg classes aren't
// subclasses.
if (NumOperands == 3 && (Tok == "sxtw" || Tok == "uxtw")) {
// The source register can be Wn here, but the matcher expects a
// GPR64. Twiddle it here if necessary.
AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[2]);
if (Op.isScalarReg()) {
unsigned Reg = getXRegFromWReg(Op.getReg());
Operands[2] = AArch64Operand::CreateReg(Reg, RegKind::Scalar,
Op.getStartLoc(), Op.getEndLoc(),
getContext());
}
}
// FIXME: Likewise for sxt[bh] with a Xd dst operand
else if (NumOperands == 3 && (Tok == "sxtb" || Tok == "sxth")) {
AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[1]);
if (Op.isScalarReg() &&
AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
Op.getReg())) {
// The source register can be Wn here, but the matcher expects a
// GPR64. Twiddle it here if necessary.
AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[2]);
if (Op.isScalarReg()) {
unsigned Reg = getXRegFromWReg(Op.getReg());
Operands[2] = AArch64Operand::CreateReg(Reg, RegKind::Scalar,
Op.getStartLoc(),
Op.getEndLoc(), getContext());
}
}
}
// FIXME: Likewise for uxt[bh] with a Xd dst operand
else if (NumOperands == 3 && (Tok == "uxtb" || Tok == "uxth")) {
AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[1]);
if (Op.isScalarReg() &&
AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
Op.getReg())) {
// The destination register can be Xd here, but the matcher expects a
// GPR32. Twiddle it here if necessary.
AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[1]);
if (Op.isScalarReg()) {
unsigned Reg = getWRegFromXReg(Op.getReg());
Operands[1] = AArch64Operand::CreateReg(Reg, RegKind::Scalar,
Op.getStartLoc(),
Op.getEndLoc(), getContext());
}
}
}
MCInst Inst;
FeatureBitset MissingFeatures;
// First try to match against the secondary set of tables containing the
// short-form NEON instructions (e.g. "fadd.2s v0, v1, v2").
unsigned MatchResult =
MatchInstructionImpl(Operands, Inst, ErrorInfo, MissingFeatures,
MatchingInlineAsm, 1);
// If that fails, try against the alternate table containing long-form NEON:
// "fadd v0.2s, v1.2s, v2.2s"
if (MatchResult != Match_Success) {
// But first, save the short-form match result: we can use it in case the
// long-form match also fails.
auto ShortFormNEONErrorInfo = ErrorInfo;
auto ShortFormNEONMatchResult = MatchResult;
auto ShortFormNEONMissingFeatures = MissingFeatures;
MatchResult =
MatchInstructionImpl(Operands, Inst, ErrorInfo, MissingFeatures,
MatchingInlineAsm, 0);
// Now, both matches failed, and the long-form match failed on the mnemonic
// suffix token operand. The short-form match failure is probably more
// relevant: use it instead.
if (MatchResult == Match_InvalidOperand && ErrorInfo == 1 &&
Operands.size() > 1 && ((AArch64Operand &)*Operands[1]).isToken() &&
((AArch64Operand &)*Operands[1]).isTokenSuffix()) {
MatchResult = ShortFormNEONMatchResult;
ErrorInfo = ShortFormNEONErrorInfo;
MissingFeatures = ShortFormNEONMissingFeatures;
}
}
switch (MatchResult) {
case Match_Success: {
// Perform range checking and other semantic validations
SmallVector<SMLoc, 8> OperandLocs;
NumOperands = Operands.size();
for (unsigned i = 1; i < NumOperands; ++i)
OperandLocs.push_back(Operands[i]->getStartLoc());
if (validateInstruction(Inst, IDLoc, OperandLocs))
return true;
Inst.setLoc(IDLoc);
Out.EmitInstruction(Inst, getSTI());
return false;
}
case Match_MissingFeature: {
assert(MissingFeatures.any() && "Unknown missing feature!");
// Special case the error message for the very common case where only
// a single subtarget feature is missing (neon, e.g.).
std::string Msg = "instruction requires:";
for (unsigned i = 0, e = MissingFeatures.size(); i != e; ++i) {
if (MissingFeatures[i]) {
Msg += " ";
Msg += getSubtargetFeatureName(i);
}
}
return Error(IDLoc, Msg);
}
case Match_MnemonicFail:
return showMatchError(IDLoc, MatchResult, ErrorInfo, Operands);
case Match_InvalidOperand: {
SMLoc ErrorLoc = IDLoc;
if (ErrorInfo != ~0ULL) {
if (ErrorInfo >= Operands.size())
return Error(IDLoc, "too few operands for instruction",
SMRange(IDLoc, getTok().getLoc()));
ErrorLoc = ((AArch64Operand &)*Operands[ErrorInfo]).getStartLoc();
if (ErrorLoc == SMLoc())
ErrorLoc = IDLoc;
}
// If the match failed on a suffix token operand, tweak the diagnostic
// accordingly.
if (((AArch64Operand &)*Operands[ErrorInfo]).isToken() &&
((AArch64Operand &)*Operands[ErrorInfo]).isTokenSuffix())
MatchResult = Match_InvalidSuffix;
return showMatchError(ErrorLoc, MatchResult, ErrorInfo, Operands);
}
case Match_InvalidTiedOperand:
case Match_InvalidMemoryIndexed1:
case Match_InvalidMemoryIndexed2:
case Match_InvalidMemoryIndexed4:
case Match_InvalidMemoryIndexed8:
case Match_InvalidMemoryIndexed16:
case Match_InvalidCondCode:
case Match_AddSubRegExtendSmall:
case Match_AddSubRegExtendLarge:
case Match_AddSubSecondSource:
case Match_LogicalSecondSource:
case Match_AddSubRegShift32:
case Match_AddSubRegShift64:
case Match_InvalidMovImm32Shift:
case Match_InvalidMovImm64Shift:
case Match_InvalidFPImm:
case Match_InvalidMemoryWExtend8:
case Match_InvalidMemoryWExtend16:
case Match_InvalidMemoryWExtend32:
case Match_InvalidMemoryWExtend64:
case Match_InvalidMemoryWExtend128:
case Match_InvalidMemoryXExtend8:
case Match_InvalidMemoryXExtend16:
case Match_InvalidMemoryXExtend32:
case Match_InvalidMemoryXExtend64:
case Match_InvalidMemoryXExtend128:
case Match_InvalidMemoryIndexed1SImm4:
case Match_InvalidMemoryIndexed2SImm4:
case Match_InvalidMemoryIndexed3SImm4:
case Match_InvalidMemoryIndexed4SImm4:
case Match_InvalidMemoryIndexed1SImm6:
case Match_InvalidMemoryIndexed16SImm4:
case Match_InvalidMemoryIndexed4SImm7:
case Match_InvalidMemoryIndexed8SImm7:
case Match_InvalidMemoryIndexed16SImm7:
case Match_InvalidMemoryIndexed8UImm5:
case Match_InvalidMemoryIndexed4UImm5:
case Match_InvalidMemoryIndexed2UImm5:
case Match_InvalidMemoryIndexed1UImm6:
case Match_InvalidMemoryIndexed2UImm6:
case Match_InvalidMemoryIndexed4UImm6:
case Match_InvalidMemoryIndexed8UImm6:
case Match_InvalidMemoryIndexed16UImm6:
case Match_InvalidMemoryIndexedSImm6:
case Match_InvalidMemoryIndexedSImm5:
case Match_InvalidMemoryIndexedSImm8:
case Match_InvalidMemoryIndexedSImm9:
case Match_InvalidMemoryIndexed16SImm9:
case Match_InvalidMemoryIndexed8SImm10:
case Match_InvalidImm0_1:
case Match_InvalidImm0_7:
case Match_InvalidImm0_15:
case Match_InvalidImm0_31:
case Match_InvalidImm0_63:
case Match_InvalidImm0_127:
case Match_InvalidImm0_255:
case Match_InvalidImm0_65535:
case Match_InvalidImm1_8:
case Match_InvalidImm1_16:
case Match_InvalidImm1_32:
case Match_InvalidImm1_64:
case Match_InvalidSVEAddSubImm8:
case Match_InvalidSVEAddSubImm16:
case Match_InvalidSVEAddSubImm32:
case Match_InvalidSVEAddSubImm64:
case Match_InvalidSVECpyImm8:
case Match_InvalidSVECpyImm16:
case Match_InvalidSVECpyImm32:
case Match_InvalidSVECpyImm64:
case Match_InvalidIndexRange1_1:
case Match_InvalidIndexRange0_15:
case Match_InvalidIndexRange0_7:
case Match_InvalidIndexRange0_3:
case Match_InvalidIndexRange0_1:
case Match_InvalidSVEIndexRange0_63:
case Match_InvalidSVEIndexRange0_31:
case Match_InvalidSVEIndexRange0_15:
case Match_InvalidSVEIndexRange0_7:
case Match_InvalidSVEIndexRange0_3:
case Match_InvalidLabel:
case Match_InvalidComplexRotationEven:
case Match_InvalidComplexRotationOdd:
case Match_InvalidGPR64shifted8:
case Match_InvalidGPR64shifted16:
case Match_InvalidGPR64shifted32:
case Match_InvalidGPR64shifted64:
case Match_InvalidGPR64NoXZRshifted8:
case Match_InvalidGPR64NoXZRshifted16:
case Match_InvalidGPR64NoXZRshifted32:
case Match_InvalidGPR64NoXZRshifted64:
case Match_InvalidZPR32UXTW8:
case Match_InvalidZPR32UXTW16:
case Match_InvalidZPR32UXTW32:
case Match_InvalidZPR32UXTW64:
case Match_InvalidZPR32SXTW8:
case Match_InvalidZPR32SXTW16:
case Match_InvalidZPR32SXTW32:
case Match_InvalidZPR32SXTW64:
case Match_InvalidZPR64UXTW8:
case Match_InvalidZPR64SXTW8:
case Match_InvalidZPR64UXTW16:
case Match_InvalidZPR64SXTW16:
case Match_InvalidZPR64UXTW32:
case Match_InvalidZPR64SXTW32:
case Match_InvalidZPR64UXTW64:
case Match_InvalidZPR64SXTW64:
case Match_InvalidZPR32LSL8:
case Match_InvalidZPR32LSL16:
case Match_InvalidZPR32LSL32:
case Match_InvalidZPR32LSL64:
case Match_InvalidZPR64LSL8:
case Match_InvalidZPR64LSL16:
case Match_InvalidZPR64LSL32:
case Match_InvalidZPR64LSL64:
case Match_InvalidZPR0:
case Match_InvalidZPR8:
case Match_InvalidZPR16:
case Match_InvalidZPR32:
case Match_InvalidZPR64:
case Match_InvalidZPR128:
case Match_InvalidZPR_3b8:
case Match_InvalidZPR_3b16:
case Match_InvalidZPR_3b32:
case Match_InvalidZPR_4b16:
case Match_InvalidZPR_4b32:
case Match_InvalidZPR_4b64:
case Match_InvalidSVEPredicateAnyReg:
case Match_InvalidSVEPattern:
case Match_InvalidSVEPredicateBReg:
case Match_InvalidSVEPredicateHReg:
case Match_InvalidSVEPredicateSReg:
case Match_InvalidSVEPredicateDReg:
case Match_InvalidSVEPredicate3bAnyReg:
case Match_InvalidSVEPredicate3bBReg:
case Match_InvalidSVEPredicate3bHReg:
case Match_InvalidSVEPredicate3bSReg:
case Match_InvalidSVEPredicate3bDReg:
case Match_InvalidSVEExactFPImmOperandHalfOne:
case Match_InvalidSVEExactFPImmOperandHalfTwo:
case Match_InvalidSVEExactFPImmOperandZeroOne:
case Match_MSR:
case Match_MRS: {
if (ErrorInfo >= Operands.size())
return Error(IDLoc, "too few operands for instruction", SMRange(IDLoc, (*Operands.back()).getEndLoc()));
// Any time we get here, there's nothing fancy to do. Just get the
// operand SMLoc and display the diagnostic.
SMLoc ErrorLoc = ((AArch64Operand &)*Operands[ErrorInfo]).getStartLoc();
if (ErrorLoc == SMLoc())
ErrorLoc = IDLoc;
return showMatchError(ErrorLoc, MatchResult, ErrorInfo, Operands);
}
}
llvm_unreachable("Implement any new match types added!");
}
/// Dispatch an AArch64-specific assembler directive to its handler.
///
/// Returns false whenever the directive name is recognised here (the value
/// returned by the individual handler is not propagated) and true when the
/// name is not one of the directives handled by this parser.
bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) {
  const MCObjectFileInfo::Environment Format =
      getContext().getObjectFileInfo()->getObjectFileType();
  const StringRef IDVal = DirectiveID.getIdentifier();
  const SMLoc Loc = DirectiveID.getLoc();

  if (IDVal == ".arch") {
    parseDirectiveArch(Loc);
    return false;
  }
  if (IDVal == ".cpu") {
    parseDirectiveCPU(Loc);
    return false;
  }
  if (IDVal == ".tlsdesccall") {
    parseDirectiveTLSDescCall(Loc);
    return false;
  }
  if (IDVal == ".ltorg" || IDVal == ".pool") {
    parseDirectiveLtorg(Loc);
    return false;
  }
  if (IDVal == ".unreq") {
    parseDirectiveUnreq(Loc);
    return false;
  }
  if (IDVal == ".inst") {
    parseDirectiveInst(Loc);
    return false;
  }
  if (IDVal == ".cfi_negate_ra_state") {
    parseDirectiveCFINegateRAState();
    return false;
  }
  if (IDVal == ".cfi_b_key_frame") {
    parseDirectiveCFIBKeyFrame();
    return false;
  }
  if (IDVal == ".arch_extension") {
    parseDirectiveArchExtension(Loc);
    return false;
  }

  // The linker-optimization-hint directive is only accepted for Mach-O.
  if (Format != MCObjectFileInfo::IsMachO)
    return true;
  if (IDVal != MCLOHDirectiveName())
    return true;
  parseDirectiveLOH(IDVal, Loc);
  return false;
}
/// Expand the umbrella "crypto" / "nocrypto" extension names into the
/// individual extensions they stand for on \p ArchKind, appending them to
/// \p RequestedExtensions.
///
/// "nocrypto" wins when both names are present. ARMv8.4-A and ARMv8.5-A get
/// sm4 and sha3 in addition to sha2 and aes; every other architecture kind
/// (including 'generic') maps to sha2 and aes only, because that was the
/// traditional meaning of crypto.
static void ExpandCryptoAEK(AArch64::ArchKind ArchKind,
                            SmallVector<StringRef, 4> &RequestedExtensions) {
  auto IsRequested = [&RequestedExtensions](StringRef Ext) {
    return std::find(RequestedExtensions.begin(), RequestedExtensions.end(),
                     Ext) != std::end(RequestedExtensions);
  };

  // Both lookups happen before anything is appended to the list.
  const bool Disable = IsRequested("nocrypto");
  const bool Enable = IsRequested("crypto");
  if (!Disable && !Enable)
    return;

  const bool HasSM4AndSHA3 = ArchKind == AArch64::ArchKind::ARMV8_4A ||
                             ArchKind == AArch64::ArchKind::ARMV8_5A;
  if (HasSM4AndSHA3) {
    RequestedExtensions.push_back(Disable ? "nosm4" : "sm4");
    RequestedExtensions.push_back(Disable ? "nosha3" : "sha3");
  }
  RequestedExtensions.push_back(Disable ? "nosha2" : "sha2");
  RequestedExtensions.push_back(Disable ? "noaes" : "aes");
}
/// parseDirectiveArch
///   ::= .arch token
///
/// Parses "<arch>[+ext[+ext...]]", resets the subtarget features to the
/// defaults of the named architecture and then applies each requested
/// extension ("no"-prefixed names disable the extension). Returns true on a
/// parse error (unknown architecture name or trailing tokens), false
/// otherwise. Extension names that are not present in ExtensionMap are
/// skipped without a diagnostic.
bool AArch64AsmParser::parseDirectiveArch(SMLoc L) {
  SMLoc ArchLoc = getLoc();

  StringRef Arch, ExtensionString;
  std::tie(Arch, ExtensionString) =
      getParser().parseStringToEndOfStatement().trim().split('+');

  AArch64::ArchKind ID = AArch64::parseArch(Arch);
  if (ID == AArch64::ArchKind::INVALID)
    return Error(ArchLoc, "unknown arch name");

  if (parseToken(AsmToken::EndOfStatement))
    return true;

  // Get the architecture and extension features.
  std::vector<StringRef> AArch64Features;
  AArch64::getArchFeatures(ID, AArch64Features);
  AArch64::getExtensionFeatures(AArch64::getDefaultExtensions("generic", ID),
                                AArch64Features);

  MCSubtargetInfo &STI = copySTI();
  std::vector<std::string> ArchFeatures(AArch64Features.begin(),
                                        AArch64Features.end());
  STI.setDefaultFeatures("generic",
                         join(ArchFeatures.begin(), ArchFeatures.end(), ","));

  SmallVector<StringRef, 4> RequestedExtensions;
  if (!ExtensionString.empty())
    ExtensionString.split(RequestedExtensions, '+');

  ExpandCryptoAEK(ID, RequestedExtensions);

  for (auto Name : RequestedExtensions) {
    bool EnableFeature = true;

    if (Name.startswith_lower("no")) {
      EnableFeature = false;
      Name = Name.substr(2);
    }

    for (const auto &Extension : ExtensionMap) {
      if (Extension.Name != Name)
        continue;

      if (Extension.Features.none())
        report_fatal_error("unsupported architectural extension: " + Name);

      // Read the feature bits afresh for every extension. The toggle mask
      // must only contain bits that really need to change; computing it from
      // a snapshot taken before the loop would flip a bit back when two
      // requested extensions overlap (e.g. "sha2" appearing twice after the
      // crypto expansion above).
      FeatureBitset CurrentFeatures = STI.getFeatureBits();
      FeatureBitset ToggleFeatures =
          EnableFeature ? (~CurrentFeatures & Extension.Features)
                        : (CurrentFeatures & Extension.Features);
      FeatureBitset AvailableFeatures =
          ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures));
      setAvailableFeatures(AvailableFeatures);
      break;
    }
  }
  return false;
}
/// parseDirectiveArchExtension
///   ::= .arch_extension [no]feature
///
/// Enables the named extension, or disables it when the name carries a "no"
/// prefix. Returns true (after reporting an error) for trailing tokens, for
/// an extension without feature bits, and for an unknown extension name.
bool AArch64AsmParser::parseDirectiveArchExtension(SMLoc L) {
  const SMLoc ExtLoc = getLoc();

  StringRef Name = getParser().parseStringToEndOfStatement().trim();
  if (parseToken(AsmToken::EndOfStatement,
                 "unexpected token in '.arch_extension' directive"))
    return true;

  const bool EnableFeature = !Name.startswith_lower("no");
  if (!EnableFeature)
    Name = Name.substr(2);

  MCSubtargetInfo &STI = copySTI();
  FeatureBitset CurrentFeatures = STI.getFeatureBits();
  for (const auto &Extension : ExtensionMap) {
    if (Extension.Name == Name) {
      if (Extension.Features.none())
        return Error(ExtLoc, "unsupported architectural extension: " + Name);

      // Only toggle the bits whose state actually has to change.
      FeatureBitset ToggleFeatures;
      if (EnableFeature)
        ToggleFeatures = ~CurrentFeatures & Extension.Features;
      else
        ToggleFeatures = CurrentFeatures & Extension.Features;

      FeatureBitset AvailableFeatures =
          ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures));
      setAvailableFeatures(AvailableFeatures);
      return false;
    }
  }

  return Error(ExtLoc, "unknown architectural extension: " + Name);
}
/// Return the source location that lies \p Offset characters after \p L.
static SMLoc incrementLoc(SMLoc L, int Offset) {
  const char *Advanced = L.getPointer() + Offset;
  return SMLoc::getFromPointer(Advanced);
}
/// parseDirectiveCPU
///   ::= .cpu id
///
/// Parses "<cpu>[+ext[+ext...]]", switches the subtarget to the named CPU and
/// applies each requested extension ("no"-prefixed names disable the
/// extension). Returns true only when trailing tokens follow the directive;
/// an unknown CPU name or extension is reported through Error() but the
/// function still returns false.
bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
  SMLoc CurLoc = getLoc();

  StringRef CPU, ExtensionString;
  std::tie(CPU, ExtensionString) =
      getParser().parseStringToEndOfStatement().trim().split('+');

  if (parseToken(AsmToken::EndOfStatement))
    return true;

  SmallVector<StringRef, 4> RequestedExtensions;
  if (!ExtensionString.empty())
    ExtensionString.split(RequestedExtensions, '+');

  // FIXME This is using tablegen data, but should be moved to ARMTargetParser
  // once that is tablegen'ed
  if (!getSTI().isCPUStringValid(CPU)) {
    Error(CurLoc, "unknown CPU name");
    return false;
  }

  MCSubtargetInfo &STI = copySTI();
  STI.setDefaultFeatures(CPU, "");
  CurLoc = incrementLoc(CurLoc, CPU.size());

  ExpandCryptoAEK(llvm::AArch64::getCPUArchKind(CPU), RequestedExtensions);

  for (auto Name : RequestedExtensions) {
    // Advance source location past '+'.
    CurLoc = incrementLoc(CurLoc, 1);

    // Remember the length as written: the "no" prefix is stripped below, but
    // the location bookkeeping has to skip the whole spelled name, otherwise
    // diagnostics for later extensions point two characters too early.
    const size_t SpelledLength = Name.size();

    bool EnableFeature = true;

    if (Name.startswith_lower("no")) {
      EnableFeature = false;
      Name = Name.substr(2);
    }

    bool FoundExtension = false;
    for (const auto &Extension : ExtensionMap) {
      if (Extension.Name != Name)
        continue;

      if (Extension.Features.none())
        report_fatal_error("unsupported architectural extension: " + Name);

      // Read the feature bits afresh for every extension. The toggle mask
      // must only contain bits that really need to change; computing it from
      // a snapshot taken before the loop would flip a bit back when two
      // requested extensions overlap (e.g. "sha2" appearing twice after the
      // crypto expansion above).
      FeatureBitset CurrentFeatures = STI.getFeatureBits();
      FeatureBitset ToggleFeatures =
          EnableFeature ? (~CurrentFeatures & Extension.Features)
                        : (CurrentFeatures & Extension.Features);
      FeatureBitset AvailableFeatures =
          ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures));
      setAvailableFeatures(AvailableFeatures);
      FoundExtension = true;

      break;
    }

    if (!FoundExtension)
      Error(CurLoc, "unsupported architectural extension");

    CurLoc = incrementLoc(CurLoc, SpelledLength);
  }
  return false;
}
/// parseDirectiveInst
///   ::= .inst opcode [, ...]
///
/// Each operand must be a constant expression; its value is handed to the
/// target streamer's emitInst(). Returns true on error.
bool AArch64AsmParser::parseDirectiveInst(SMLoc Loc) {
  if (getLexer().is(AsmToken::EndOfStatement))
    return Error(Loc, "expected expression following '.inst' directive");

  // Parses one opcode operand and emits it; returns true on failure.
  auto EmitOneOpcode = [&]() -> bool {
    const SMLoc ExprLoc = getLoc();
    const MCExpr *Expr;
    if (check(getParser().parseExpression(Expr), ExprLoc,
              "expected expression"))
      return true;

    const auto *Constant = dyn_cast_or_null<MCConstantExpr>(Expr);
    if (check(Constant == nullptr, ExprLoc, "expected constant expression"))
      return true;

    getTargetStreamer().emitInst(Constant->getValue());
    return false;
  };

  if (!parseMany(EmitOneOpcode))
    return false;
  return addErrorSuffix(" in '.inst' directive");
}
// parseDirectiveTLSDescCall:
//   ::= .tlsdesccall symbol
//
// Emits a TLSDESCCALL pseudo-instruction whose single operand is a
// VK_TLSDESC reference to the named symbol. Returns true on a parse error.
bool AArch64AsmParser::parseDirectiveTLSDescCall(SMLoc L) {
  StringRef SymbolName;
  if (check(getParser().parseIdentifier(SymbolName), L,
            "expected symbol after directive"))
    return true;
  if (parseToken(AsmToken::EndOfStatement))
    return true;

  auto &Ctx = getContext();
  MCSymbol *Symbol = Ctx.getOrCreateSymbol(SymbolName);
  const MCExpr *SymbolRef = MCSymbolRefExpr::create(Symbol, Ctx);
  const MCExpr *TLSDescRef =
      AArch64MCExpr::create(SymbolRef, AArch64MCExpr::VK_TLSDESC, Ctx);

  MCInst Call;
  Call.setOpcode(AArch64::TLSDESCCALL);
  Call.addOperand(MCOperand::createExpr(TLSDescRef));

  getParser().getStreamer().EmitInstruction(Call, getSTI());
  return false;
}
/// parseDirectiveLOH
///   ::= .loh <lohName | lohId> label1, ..., labelN
/// The number of arguments depends on the loh identifier.
///
/// The hint kind may be given by name or by number. Returns true on error
/// (unknown or out-of-range kind, missing label, or unexpected token).
bool AArch64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) {
  MCLOHType Kind;
  if (getParser().getTok().isNot(AsmToken::Identifier)) {
    if (getParser().getTok().isNot(AsmToken::Integer))
      return TokError("expected an identifier or a number in directive");
    // We successfully get a numeric value for the identifier.
    // Check if it is valid.
    int64_t Id = getParser().getTok().getIntVal();
    // Reject anything that does not fit in an unsigned value before asking
    // whether it names a valid kind; otherwise a value above UINT32_MAX would
    // skip validation and be truncated into a seemingly valid MCLOHType.
    if (Id < 0 || Id > static_cast<int64_t>(-1U) ||
        !isValidMCLOHType(static_cast<unsigned>(Id)))
      return TokError("invalid numeric identifier in directive");
    Kind = static_cast<MCLOHType>(Id);
  } else {
    StringRef Name = getTok().getIdentifier();
    // We successfully parse an identifier.
    // Check if it is a recognized one.
    int Id = MCLOHNameToId(Name);

    if (Id == -1)
      return TokError("invalid identifier in directive");
    Kind = static_cast<MCLOHType>(Id);
  }
  // Consume the identifier.
  Lex();
  // Get the number of arguments of this LOH.
  int NbArgs = MCLOHIdToNbArgs(Kind);

  assert(NbArgs != -1 && "Invalid number of arguments");

  SmallVector<MCSymbol *, 3> Args;
  for (int Idx = 0; Idx < NbArgs; ++Idx) {
    StringRef Name;
    if (getParser().parseIdentifier(Name))
      return TokError("expected identifier in directive");
    Args.push_back(getContext().getOrCreateSymbol(Name));

    // No separator is expected after the last label.
    if (Idx + 1 == NbArgs)
      break;
    if (parseToken(AsmToken::Comma,
                   "unexpected token in '" + Twine(IDVal) + "' directive"))
      return true;
  }
  if (parseToken(AsmToken::EndOfStatement,
                 "unexpected token in '" + Twine(IDVal) + "' directive"))
    return true;

  getStreamer().EmitLOHDirective(Kind, Args);
  return false;
}
/// parseDirectiveLtorg
///   ::= .ltorg | .pool
///
/// Emits the current constant pool through the target streamer. Returns true
/// if anything other than end-of-statement follows the directive.
bool AArch64AsmParser::parseDirectiveLtorg(SMLoc L) {
  const bool Failed =
      parseToken(AsmToken::EndOfStatement, "unexpected token in directive");
  if (!Failed)
    getTargetStreamer().emitCurrentConstantPool();
  return Failed;
}
/// parseDirectiveReq
///   ::= name .req registername
///
/// Records \p Name as an alias for the register that follows. The register is
/// tried as a scalar first, then as a NEON vector, an SVE data vector and an
/// SVE predicate vector, in that order; vector registers must be written
/// without a type suffix. Returns true on error.
bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
  getParser().Lex(); // Eat the '.req' token.
  const SMLoc SRegLoc = getLoc();

  unsigned RegNum;
  RegKind RegisterKind = RegKind::Scalar;
  OperandMatchResultTy ParseRes = tryParseScalarRegister(RegNum);

  // Vector register kinds to fall back to, with the diagnostic used when the
  // register carries a type suffix.
  struct VectorAttempt {
    RegKind Kind;
    const char *SuffixError;
  };
  const VectorAttempt Attempts[] = {
      {RegKind::NeonVector, "vector register without type specifier expected"},
      {RegKind::SVEDataVector,
       "sve vector register without type specifier expected"},
      {RegKind::SVEPredicateVector,
       "sve predicate register without type specifier expected"}};

  for (const VectorAttempt &Attempt : Attempts) {
    if (ParseRes == MatchOperand_Success)
      break;

    StringRef Kind;
    RegisterKind = Attempt.Kind;
    ParseRes = tryParseVectorRegister(RegNum, Kind, Attempt.Kind);

    if (ParseRes == MatchOperand_ParseFail)
      return true;
    if (ParseRes == MatchOperand_Success && !Kind.empty())
      return Error(SRegLoc, Attempt.SuffixError);
  }

  if (ParseRes != MatchOperand_Success)
    return Error(SRegLoc, "register name or alias expected");

  // Shouldn't be anything else.
  if (parseToken(AsmToken::EndOfStatement,
                 "unexpected input in .req directive"))
    return true;

  // insert() leaves an existing entry untouched, so a mismatch afterwards
  // means the alias was already bound to a different register.
  auto Alias = std::make_pair(RegisterKind, RegNum);
  auto Inserted = RegisterReqs.insert(std::make_pair(Name, Alias));
  if (Inserted.first->second != Alias)
    Warning(L, "ignoring redefinition of register alias '" + Name + "'");

  return false;
}
/// parseDirectiveUnreq
///   ::= .unreq registername
///
/// Removes the register alias named by the identifier (matched in lower
/// case). Returns true on error.
bool AArch64AsmParser::parseDirectiveUnreq(SMLoc L) {
  MCAsmParser &Parser = getParser();
  if (getTok().isNot(AsmToken::Identifier))
    return TokError("unexpected input in .unreq directive.");
  RegisterReqs.erase(Parser.getTok().getIdentifier().lower());
  Parser.Lex(); // Eat the identifier.
  // The suffix is appended directly to the pending error text, so it needs
  // its own leading space (as the '.inst' suffix has).
  if (parseToken(AsmToken::EndOfStatement))
    return addErrorSuffix(" in '.unreq' directive");
  return false;
}
/// parseDirectiveCFINegateRAState
///   ::= .cfi_negate_ra_state
///
/// Forwards the directive to the streamer. Returns true if anything other
/// than end-of-statement follows the directive.
bool AArch64AsmParser::parseDirectiveCFINegateRAState() {
  const bool Failed =
      parseToken(AsmToken::EndOfStatement, "unexpected token in directive");
  if (!Failed)
    getStreamer().EmitCFINegateRAState();
  return Failed;
}
/// parseDirectiveCFIBKeyFrame
///   ::= .cfi_b_key_frame
///
/// Forwards the directive to the streamer. Returns true if anything other
/// than end-of-statement follows the directive.
bool AArch64AsmParser::parseDirectiveCFIBKeyFrame() {
  const bool Failed = parseToken(AsmToken::EndOfStatement,
                                 "unexpected token in '.cfi_b_key_frame'");
  if (!Failed)
    getStreamer().EmitCFIBKeyFrame();
  return Failed;
}
bool
AArch64AsmParser::classifySymbolRef(const MCExpr *Expr,
AArch64MCExpr::VariantKind &ELFRefKind,
MCSymbolRefExpr::VariantKind &DarwinRefKind,
int64_t &Addend) {
ELFRefKind = AArch64MCExpr::VK_INVALID;
DarwinRefKind = MCSymbolRefExpr::VK_None;
Addend = 0;
if (const AArch64MCExpr *AE = dyn_cast<AArch64MCExpr>(Expr)) {
ELFRefKind = AE->getKind();
Expr = AE->getSubExpr();
}
const MCSymbolRefExpr *SE = dyn_cast<MCSymbolRefExpr>(Expr);
if (SE) {
// It's a simple symbol reference with no addend.
DarwinRefKind = SE->getKind();
return true;
}
// Check that it looks like a symbol + an addend
MCValue Res;
bool Relocatable = Expr->evaluateAsRelocatable(Res, nullptr, nullptr);
if (!Relocatable || Res.getSymB())
return false;
// Treat expressions with an ELFRefKind (like ":abs_g1:3", or
// ":abs_g1:x" where x is constant) as symbolic even if there is no symbol.
if (!Res.getSymA() && ELFRefKind == AArch64MCExpr::VK_INVALID)
return false;
if (Res.getSymA())
DarwinRefKind = Res.getSymA()->getKind();
Addend = Res.getConstant();
// It's some symbol reference + a constant addend, but really
// shouldn't use both Darwin and ELF syntax.
return ELFRefKind == AArch64MCExpr::VK_INVALID ||
DarwinRefKind == MCSymbolRefExpr::VK_None;
}
/// Force static initialization.
extern "C" void LLVMInitializeAArch64AsmParser() {
RegisterMCAsmParser<AArch64AsmParser> X(getTheAArch64leTarget());
RegisterMCAsmParser<AArch64AsmParser> Y(getTheAArch64beTarget());
RegisterMCAsmParser<AArch64AsmParser> Z(getTheARM64Target());
RegisterMCAsmParser<AArch64AsmParser> W(getTheARM64_32Target());
RegisterMCAsmParser<AArch64AsmParser> V(getTheAArch64_32Target());
}
#define GET_REGISTER_MATCHER
#define GET_SUBTARGET_FEATURE_NAME
#define GET_MATCHER_IMPLEMENTATION
#define GET_MNEMONIC_SPELL_CHECKER
#include "AArch64GenAsmMatcher.inc"
// Define this matcher function after the auto-generated include so we
// have the match class enum definitions.
//
// Accepts an operand for one of the literal-immediate token classes
// (MCK__35_<N>) only when it is a constant immediate equal to <N>; every
// other class or operand yields Match_InvalidOperand.
unsigned AArch64AsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
                                                      unsigned Kind) {
  // If the kind is a token for a literal immediate, check if our asm
  // operand matches. This is for InstAliases which have a fixed-value
  // immediate in the syntax.
  int64_t ExpectedVal;
  switch (Kind) {
  default:
    return Match_InvalidOperand;
  case MCK__35_0:  ExpectedVal = 0;  break;
  case MCK__35_1:  ExpectedVal = 1;  break;
  case MCK__35_12: ExpectedVal = 12; break;
  case MCK__35_16: ExpectedVal = 16; break;
  case MCK__35_2:  ExpectedVal = 2;  break;
  case MCK__35_24: ExpectedVal = 24; break;
  case MCK__35_3:  ExpectedVal = 3;  break;
  case MCK__35_32: ExpectedVal = 32; break;
  case MCK__35_4:  ExpectedVal = 4;  break;
  case MCK__35_48: ExpectedVal = 48; break;
  case MCK__35_6:  ExpectedVal = 6;  break;
  case MCK__35_64: ExpectedVal = 64; break;
  case MCK__35_8:  ExpectedVal = 8;  break;
  }

  AArch64Operand &Op = static_cast<AArch64Operand &>(AsmOp);
  if (Op.isImm()) {
    if (const auto *CE = dyn_cast<MCConstantExpr>(Op.getImm())) {
      if (CE->getValue() == ExpectedVal)
        return Match_Success;
    }
  }
  return Match_InvalidOperand;
}
/// Parse a consecutive even/odd pair of same-size general purpose registers
/// ("Xn, Xn+1" or "Wn, Wn+1") and push the matching sequential-pair super
/// register as a single scalar register operand.
///
/// Returns MatchOperand_Success on success and MatchOperand_ParseFail for
/// every failure.
OperandMatchResultTy
AArch64AsmParser::tryParseGPRSeqPair(OperandVector &Operands) {
  SMLoc S = getLoc();

  if (getParser().getTok().isNot(AsmToken::Identifier)) {
    Error(S, "expected register");
    return MatchOperand_ParseFail;
  }

  unsigned FirstReg;
  OperandMatchResultTy Res = tryParseScalarRegister(FirstReg);
  if (Res != MatchOperand_Success) {
    // A NoMatch result carries no diagnostic of its own; report one here so
    // that the ParseFail returned below is never silent.
    if (Res == MatchOperand_NoMatch)
      Error(S, "expected first even register of a "
               "consecutive same-size even/odd register pair");
    return MatchOperand_ParseFail;
  }

  const MCRegisterClass &WRegClass =
      AArch64MCRegisterClasses[AArch64::GPR32RegClassID];
  const MCRegisterClass &XRegClass =
      AArch64MCRegisterClasses[AArch64::GPR64RegClassID];

  bool isXReg = XRegClass.contains(FirstReg),
       isWReg = WRegClass.contains(FirstReg);
  if (!isXReg && !isWReg) {
    Error(S, "expected first even register of a "
             "consecutive same-size even/odd register pair");
    return MatchOperand_ParseFail;
  }

  const MCRegisterInfo *RI = getContext().getRegisterInfo();
  unsigned FirstEncoding = RI->getEncodingValue(FirstReg);

  // The first register of the pair must have an even encoding.
  if (FirstEncoding & 0x1) {
    Error(S, "expected first even register of a "
             "consecutive same-size even/odd register pair");
    return MatchOperand_ParseFail;
  }

  if (getParser().getTok().isNot(AsmToken::Comma)) {
    Error(getLoc(), "expected comma");
    return MatchOperand_ParseFail;
  }
  // Eat the comma
  getParser().Lex();

  SMLoc E = getLoc();
  unsigned SecondReg;
  Res = tryParseScalarRegister(SecondReg);
  if (Res != MatchOperand_Success) {
    // As above: make sure a NoMatch does not turn into a silent failure.
    if (Res == MatchOperand_NoMatch)
      Error(E, "expected second odd register of a "
               "consecutive same-size even/odd register pair");
    return MatchOperand_ParseFail;
  }

  // The second register must directly follow the first one and belong to the
  // same register class.
  if (RI->getEncodingValue(SecondReg) != FirstEncoding + 1 ||
      (isXReg && !XRegClass.contains(SecondReg)) ||
      (isWReg && !WRegClass.contains(SecondReg))) {
    Error(E, "expected second odd register of a "
             "consecutive same-size even/odd register pair");
    return MatchOperand_ParseFail;
  }

  unsigned Pair = 0;
  if (isXReg) {
    Pair = RI->getMatchingSuperReg(FirstReg, AArch64::sube64,
           &AArch64MCRegisterClasses[AArch64::XSeqPairsClassRegClassID]);
  } else {
    Pair = RI->getMatchingSuperReg(FirstReg, AArch64::sube32,
           &AArch64MCRegisterClasses[AArch64::WSeqPairsClassRegClassID]);
  }

  Operands.push_back(AArch64Operand::CreateReg(Pair, RegKind::Scalar, S,
                                               getLoc(), getContext()));

  return MatchOperand_Success;
}
/// Parse an SVE data vector register operand.
///
/// When \p ParseSuffix is set, a register without an element-type suffix is
/// not matched. When \p ParseShiftExtend is set and a comma follows the
/// register, a shift/extend modifier is parsed and attached to the register
/// operand; otherwise the plain register is pushed and an optional vector
/// index is parsed after it.
template <bool ParseShiftExtend, bool ParseSuffix>
OperandMatchResultTy
AArch64AsmParser::tryParseSVEDataVector(OperandVector &Operands) {
  const SMLoc StartLoc = getLoc();

  // Check for a SVE vector register specifier first.
  unsigned RegNum;
  StringRef Suffix;
  OperandMatchResultTy Res =
      tryParseVectorRegister(RegNum, Suffix, RegKind::SVEDataVector);
  if (Res != MatchOperand_Success)
    return Res;

  if (ParseSuffix && Suffix.empty())
    return MatchOperand_NoMatch;

  const auto &KindRes = parseVectorKind(Suffix, RegKind::SVEDataVector);
  if (!KindRes)
    return MatchOperand_NoMatch;
  const unsigned ElementWidth = KindRes->second;

  const bool HasShiftExtend =
      ParseShiftExtend && getParser().getTok().is(AsmToken::Comma);
  if (HasShiftExtend) {
    // Eat the comma
    getParser().Lex();

    // Match the shift
    SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> ExtOpnd;
    Res = tryParseOptionalShiftExtend(ExtOpnd);
    if (Res != MatchOperand_Success)
      return Res;

    auto *Ext = static_cast<AArch64Operand *>(ExtOpnd.back().get());
    Operands.push_back(AArch64Operand::CreateVectorReg(
        RegNum, RegKind::SVEDataVector, ElementWidth, StartLoc,
        Ext->getEndLoc(), getContext(), Ext->getShiftExtendType(),
        Ext->getShiftExtendAmount(), Ext->hasShiftExtendAmount()));
    return MatchOperand_Success;
  }

  // No shift/extend is the default.
  Operands.push_back(AArch64Operand::CreateVectorReg(
      RegNum, RegKind::SVEDataVector, ElementWidth, StartLoc, StartLoc,
      getContext()));

  // A vector index is optional here: only a hard parse failure is an error.
  const OperandMatchResultTy IndexRes = tryParseVectorIndex(Operands);
  return IndexRes == MatchOperand_ParseFail ? MatchOperand_ParseFail
                                            : MatchOperand_Success;
}
/// Parse an SVE predicate pattern, written either as "#<constant>" or as a
/// named pattern, and push it as an immediate operand.
///
/// Returns MatchOperand_NoMatch when the current token is neither '#' nor an
/// identifier, or when the identifier is not a known pattern name;
/// MatchOperand_ParseFail when the expression after '#' fails to parse or is
/// not constant.
OperandMatchResultTy
AArch64AsmParser::tryParseSVEPattern(OperandVector &Operands) {
  MCAsmParser &Parser = getParser();

  SMLoc SS = getLoc();
  const AsmToken &TokE = Parser.getTok();

  int64_t Pattern;
  if (TokE.is(AsmToken::Hash)) {
    Parser.Lex(); // Eat hash

    // Parse the immediate operand.
    const MCExpr *ImmVal;
    SS = getLoc();
    if (Parser.parseExpression(ImmVal))
      return MatchOperand_ParseFail;

    const auto *MCE = dyn_cast<MCConstantExpr>(ImmVal);
    if (!MCE)
      return MatchOperand_ParseFail;

    Pattern = MCE->getValue();
  } else if (TokE.is(AsmToken::Identifier)) {
    // Parse the pattern
    auto Pat = AArch64SVEPredPattern::lookupSVEPREDPATByName(TokE.getString());
    if (!Pat)
      return MatchOperand_NoMatch;

    Parser.Lex();
    Pattern = Pat->Encoding;
    assert(Pattern >= 0 && Pattern < 32);
  } else {
    return MatchOperand_NoMatch;
  }

  Operands.push_back(
      AArch64Operand::CreateImm(MCConstantExpr::create(Pattern, getContext()),
                                SS, getLoc(), getContext()));

  return MatchOperand_Success;
}
Index: vendor/llvm/dist-release_90/lib/Target/AArch64/SVEInstrFormats.td
===================================================================
--- vendor/llvm/dist-release_90/lib/Target/AArch64/SVEInstrFormats.td (revision 351302)
+++ vendor/llvm/dist-release_90/lib/Target/AArch64/SVEInstrFormats.td (revision 351303)
@@ -1,5716 +1,5833 @@
//=-- SVEInstrFormats.td - AArch64 SVE Instruction classes -*- tablegen -*--=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// AArch64 Scalable Vector Extension (SVE) Instruction Class Definitions.
//
//===----------------------------------------------------------------------===//
// Assembly operand class for SVE predicate patterns. Parsing is delegated to
// tryParseSVEPattern, validation to isSVEPattern, and the operand is rendered
// with addImmOperands. Match failures use the InvalidSVEPattern diagnostic.
def SVEPatternOperand : AsmOperandClass {
let Name = "SVEPattern";
let ParserMethod = "tryParseSVEPattern";
let PredicateMethod = "isSVEPattern";
let RenderMethod = "addImmOperands";
let DiagnosticType = "InvalidSVEPattern";
}
// i32 immediate operand for a predicate pattern. The leaf predicate accepts
// values below 32 (compared as unsigned); printing uses printSVEPattern and
// assembly parsing uses SVEPatternOperand.
def sve_pred_enum : Operand<i32>, ImmLeaf<i32, [{
return (((uint32_t)Imm) < 32);
}]> {
let PrintMethod = "printSVEPattern";
let ParserMatchClass = SVEPatternOperand;
}
// Assembly operand class for SVE prefetch operations. Parsing uses the
// tryParsePrefetch<true> instantiation, validation uses isPrefetch and
// rendering uses addPrefetchOperands.
def SVEPrefetchOperand : AsmOperandClass {
let Name = "SVEPrefetch";
let ParserMethod = "tryParsePrefetch<true>";
let PredicateMethod = "isPrefetch";
let RenderMethod = "addPrefetchOperands";
}
// i32 immediate operand for a prefetch operation. The leaf predicate accepts
// values up to and including 15 (compared as unsigned); printing uses
// printPrefetchOp<true>.
def sve_prfop : Operand<i32>, ImmLeaf<i32, [{
return (((uint32_t)Imm) <= 15);
}]> {
let PrintMethod = "printPrefetchOp<true>";
let ParserMatchClass = SVEPrefetchOperand;
}
// Assembly operand class for a logical immediate of the given element Width.
// The predicate and render methods are the int<Width>_t instantiations of
// isLogicalImm and addLogicalImmOperands; match failures use the
// LogicalSecondSource diagnostic.
class SVELogicalImmOperand<int Width> : AsmOperandClass {
let Name = "SVELogicalImm" # Width;
let DiagnosticType = "LogicalSecondSource";
let PredicateMethod = "isLogicalImm<int" # Width # "_t>";
let RenderMethod = "addLogicalImmOperands<int" # Width # "_t>";
}
// The three operands below differ only in element width (8, 16, 32). Each
// MCOperandPredicate rejects non-immediate operands, decodes the operand as a
// 64-bit logical immediate and accepts it when
// isSVEMaskOfIdenticalElements<intN_t> holds for the decoded value.
def sve_logical_imm8 : Operand<i64> {
let ParserMatchClass = SVELogicalImmOperand<8>;
let PrintMethod = "printLogicalImm<int8_t>";
let MCOperandPredicate = [{
if (!MCOp.isImm())
return false;
int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
return AArch64_AM::isSVEMaskOfIdenticalElements<int8_t>(Val);
}];
}
def sve_logical_imm16 : Operand<i64> {
let ParserMatchClass = SVELogicalImmOperand<16>;
let PrintMethod = "printLogicalImm<int16_t>";
let MCOperandPredicate = [{
if (!MCOp.isImm())
return false;
int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
return AArch64_AM::isSVEMaskOfIdenticalElements<int16_t>(Val);
}];
}
def sve_logical_imm32 : Operand<i64> {
let ParserMatchClass = SVELogicalImmOperand<32>;
let PrintMethod = "printLogicalImm<int32_t>";
let MCOperandPredicate = [{
if (!MCOp.isImm())
return false;
int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
return AArch64_AM::isSVEMaskOfIdenticalElements<int32_t>(Val);
}];
}
// Assembly operand class for a "preferred" logical immediate of the given
// element Width. Validation uses isSVEPreferredLogicalImm<int<Width>_t> and
// rendering uses addLogicalImmOperands<int<Width>_t>. No DiagnosticType is
// set on this class.
class SVEPreferredLogicalImmOperand<int Width> : AsmOperandClass {
let Name = "SVEPreferredLogicalImm" # Width;
let PredicateMethod = "isSVEPreferredLogicalImm<int" # Width # "_t>";
let RenderMethod = "addLogicalImmOperands<int" # Width # "_t>";
}
// The three operands below differ only in element width (16, 32, 64). Each
// MCOperandPredicate rejects non-immediate operands, decodes the operand as a
// 64-bit logical immediate and accepts it when the decoded value satisfies
// both isSVEMaskOfIdenticalElements<intN_t> and
// isSVEMoveMaskPreferredLogicalImmediate. Printing uses printSVELogicalImm.
def sve_preferred_logical_imm16 : Operand<i64> {
let ParserMatchClass = SVEPreferredLogicalImmOperand<16>;
let PrintMethod = "printSVELogicalImm<int16_t>";
let MCOperandPredicate = [{
if (!MCOp.isImm())
return false;
int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
return AArch64_AM::isSVEMaskOfIdenticalElements<int16_t>(Val) &&
AArch64_AM::isSVEMoveMaskPreferredLogicalImmediate(Val);
}];
}
def sve_preferred_logical_imm32 : Operand<i64> {
let ParserMatchClass = SVEPreferredLogicalImmOperand<32>;
let PrintMethod = "printSVELogicalImm<int32_t>";
let MCOperandPredicate = [{
if (!MCOp.isImm())
return false;
int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
return AArch64_AM::isSVEMaskOfIdenticalElements<int32_t>(Val) &&
AArch64_AM::isSVEMoveMaskPreferredLogicalImmediate(Val);
}];
}
def sve_preferred_logical_imm64 : Operand<i64> {
let ParserMatchClass = SVEPreferredLogicalImmOperand<64>;
let PrintMethod = "printSVELogicalImm<int64_t>";
let MCOperandPredicate = [{
if (!MCOp.isImm())
return false;
int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
return AArch64_AM::isSVEMaskOfIdenticalElements<int64_t>(Val) &&
AArch64_AM::isSVEMoveMaskPreferredLogicalImmediate(Val);
}];
}
// Assembly operand class named "SVELogicalImm<Width>Not". It validates with
// the same isLogicalImm<int<Width>_t> predicate as the plain logical
// immediate class but renders through addLogicalImmNotOperands<int<Width>_t>.
// Match failures use the LogicalSecondSource diagnostic.
class SVELogicalImmNotOperand<int Width> : AsmOperandClass {
let Name = "SVELogicalImm" # Width # "Not";
let DiagnosticType = "LogicalSecondSource";
let PredicateMethod = "isLogicalImm<int" # Width # "_t>";
let RenderMethod = "addLogicalImmNotOperands<int" # Width # "_t>";
}
// i64 operands bound to the class above for element widths 8, 16 and 32.
// They set only the parser match class (no print method or MC predicate).
def sve_logical_imm8_not : Operand<i64> {
let ParserMatchClass = SVELogicalImmNotOperand<8>;
}
def sve_logical_imm16_not : Operand<i64> {
let ParserMatchClass = SVELogicalImmNotOperand<16>;
}
def sve_logical_imm32_not : Operand<i64> {
let ParserMatchClass = SVELogicalImmNotOperand<32>;
}
// Assembly operand class for an immediate with an optional shift. The record
// is named "SVE<Infix>Imm<ElementWidth>" and its diagnostic is "Invalid"
// followed by that name. Parsing uses tryParseImmWithOptionalShift, rendering
// uses addImmWithOptionalShiftOperands<8>, and the validating predicate is
// supplied by the instantiation.
class SVEShiftedImmOperand<int ElementWidth, string Infix, string Predicate>
: AsmOperandClass {
let Name = "SVE" # Infix # "Imm" # ElementWidth;
let DiagnosticType = "Invalid" # Name;
let RenderMethod = "addImmWithOptionalShiftOperands<8>";
let ParserMethod = "tryParseImmWithOptionalShift";
let PredicateMethod = Predicate;
}
// "Cpy" instantiations for element widths 8/16/32/64, validated by
// isSVECpyImm<intN_t>.
def SVECpyImmOperand8 : SVEShiftedImmOperand<8, "Cpy", "isSVECpyImm<int8_t>">;
def SVECpyImmOperand16 : SVEShiftedImmOperand<16, "Cpy", "isSVECpyImm<int16_t>">;
def SVECpyImmOperand32 : SVEShiftedImmOperand<32, "Cpy", "isSVECpyImm<int32_t>">;
def SVECpyImmOperand64 : SVEShiftedImmOperand<64, "Cpy", "isSVECpyImm<int64_t>">;
// "AddSub" instantiations for element widths 8/16/32/64, validated by
// isSVEAddSubImm<intN_t>.
def SVEAddSubImmOperand8 : SVEShiftedImmOperand<8, "AddSub", "isSVEAddSubImm<int8_t>">;
def SVEAddSubImmOperand16 : SVEShiftedImmOperand<16, "AddSub", "isSVEAddSubImm<int16_t>">;
def SVEAddSubImmOperand32 : SVEShiftedImmOperand<32, "AddSub", "isSVEAddSubImm<int32_t>">;
def SVEAddSubImmOperand64 : SVEShiftedImmOperand<64, "AddSub", "isSVEAddSubImm<int64_t>">;
// i32 operand that is also an ImmLeaf, for an 8-bit immediate with an optional
// left shift.
//   ElementWidth - template argument of the DecodeImm8OptLsl decoder.
//   printType    - C++ type name used to instantiate printImm8OptLsl.
//   OpndClass    - asm operand class used by the parser.
//   Predicate    - ImmLeaf predicate code for instruction selection.
// Encoded with getImm8OptLsl. The operand is made of two i32 sub-operands.
// NOTE(review): presumably (value, shift) - confirm in getImm8OptLsl.
class imm8_opt_lsl<int ElementWidth, string printType,
AsmOperandClass OpndClass, code Predicate>
: Operand<i32>, ImmLeaf<i32, Predicate> {
let EncoderMethod = "getImm8OptLsl";
let DecoderMethod = "DecodeImm8OptLsl<" # ElementWidth # ">";
let PrintMethod = "printImm8OptLsl<" # printType # ">";
let ParserMatchClass = OpndClass;
let MIOperandInfo = (ops i32imm, i32imm);
}
// "Cpy" immediate operands for 8/16/32/64-bit elements. Each one prints with
// the signed intN_t type and uses AArch64_AM::isSVECpyImm<intN_t> as its
// ImmLeaf predicate.
def cpy_imm8_opt_lsl_i8 : imm8_opt_lsl<8, "int8_t", SVECpyImmOperand8, [{
return AArch64_AM::isSVECpyImm<int8_t>(Imm);
}]>;
def cpy_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "int16_t", SVECpyImmOperand16, [{
return AArch64_AM::isSVECpyImm<int16_t>(Imm);
}]>;
def cpy_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "int32_t", SVECpyImmOperand32, [{
return AArch64_AM::isSVECpyImm<int32_t>(Imm);
}]>;
def cpy_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "int64_t", SVECpyImmOperand64, [{
return AArch64_AM::isSVECpyImm<int64_t>(Imm);
}]>;
// "AddSub" immediate operands for 8/16/32/64-bit elements. Each one prints
// with the unsigned uintN_t type, while its ImmLeaf predicate is
// AArch64_AM::isSVEAddSubImm instantiated with the signed intN_t type.
def addsub_imm8_opt_lsl_i8 : imm8_opt_lsl<8, "uint8_t", SVEAddSubImmOperand8, [{
return AArch64_AM::isSVEAddSubImm<int8_t>(Imm);
}]>;
def addsub_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "uint16_t", SVEAddSubImmOperand16, [{
return AArch64_AM::isSVEAddSubImm<int16_t>(Imm);
}]>;
def addsub_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "uint32_t", SVEAddSubImmOperand32, [{
return AArch64_AM::isSVEAddSubImm<int32_t>(Imm);
}]>;
def addsub_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "uint64_t", SVEAddSubImmOperand64, [{
return AArch64_AM::isSVEAddSubImm<int64_t>(Imm);
}]>;
// Asm operand class for a floating-point immediate restricted to two values.
//   Suffix     - appended to "SVEExactFPImmOperand" to form the class name.
//   ValA, ValB - names of the two permitted values; both are passed as
//                template arguments to isExactFPImm and addExactFPImmOperands.
// Parsed with tryParseFPImm<false>; the diagnostic type is "Invalid" followed
// by the class name.
class SVEExactFPImm<string Suffix, string ValA, string ValB> : AsmOperandClass {
let Name = "SVEExactFPImmOperand" # Suffix;
let DiagnosticType = "Invalid" # Name;
let ParserMethod = "tryParseFPImm<false>";
let PredicateMethod = "isExactFPImm<" # ValA # ", " # ValB # ">";
let RenderMethod = "addExactFPImmOperands<" # ValA # ", " # ValB # ">";
}
// i32 operand wrapping SVEExactFPImm<Suffix, ValA, ValB> as its parser match
// class; printed with printExactFPImm<ValA, ValB>.
class SVEExactFPImmOperand<string Suffix, string ValA, string ValB> : Operand<i32> {
let PrintMethod = "printExactFPImm<" # ValA # ", " # ValB # ">";
let ParserMatchClass = SVEExactFPImm<Suffix, ValA, ValB>;
}
// Exact FP immediate operands for the value pairs (half, one), (half, two)
// and (zero, one), named through the AArch64ExactFPImm enumerators.
def sve_fpimm_half_one
: SVEExactFPImmOperand<"HalfOne", "AArch64ExactFPImm::half",
"AArch64ExactFPImm::one">;
def sve_fpimm_half_two
: SVEExactFPImmOperand<"HalfTwo", "AArch64ExactFPImm::half",
"AArch64ExactFPImm::two">;
def sve_fpimm_zero_one
: SVEExactFPImmOperand<"ZeroOne", "AArch64ExactFPImm::zero",
"AArch64ExactFPImm::one">;
// i32 immediate operand whose ImmLeaf predicate accepts values 1 through 16
// inclusive (compared as uint32_t). Parsed as Imm1_16Operand; encoded and
// decoded by the dedicated getSVEIncDecImm / DecodeSVEIncDecImm hooks.
def sve_incdec_imm : Operand<i32>, ImmLeaf<i32, [{
return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 17);
}]> {
let ParserMatchClass = Imm1_16Operand;
let EncoderMethod = "getSVEIncDecImm";
let DecoderMethod = "DecodeSVEIncDecImm";
}
//===----------------------------------------------------------------------===//
// SVE PTrue - These are used extensively throughout the pattern matching so
// it's important we define them first.
//===----------------------------------------------------------------------===//
// Encoding for an instruction that takes only a sve_pred_enum pattern operand
// and writes predicate register $Pd.
//   sz8_64 - element-size field, placed in Inst{23-22}.
//   opc    - 3-bit opcode; bits 2-1 go to Inst{18-17}, bit 0 to Inst{16}.
//   asm    - mnemonic.
//   pprty  - predicate register operand type of the destination.
class sve_int_ptrue<bits<2> sz8_64, bits<3> opc, string asm, PPRRegOp pprty>
: I<(outs pprty:$Pd), (ins sve_pred_enum:$pattern),
asm, "\t$Pd, $pattern",
"",
[]>, Sched<[]> {
bits<4> Pd;
bits<5> pattern;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21-19} = 0b011;
let Inst{18-17} = opc{2-1};
let Inst{16} = opc{0};
let Inst{15-10} = 0b111000;
let Inst{9-5} = pattern;
let Inst{4} = 0b0;
let Inst{3-0} = Pd;
// Variants with opc{0} set also define the NZCV flags.
let Defs = !if(!eq (opc{0}, 1), [NZCV], []);
}
// Instantiates the _B/_H/_S/_D element-size variants. For each one it also
// adds an alias that omits the pattern operand and supplies the constant
// 0b11111 in its place.
// NOTE(review): the trailing InstAlias integer is presumably the emit
// priority - confirm against the InstAlias definition.
multiclass sve_int_ptrue<bits<3> opc, string asm> {
def _B : sve_int_ptrue<0b00, opc, asm, PPR8>;
def _H : sve_int_ptrue<0b01, opc, asm, PPR16>;
def _S : sve_int_ptrue<0b10, opc, asm, PPR32>;
def _D : sve_int_ptrue<0b11, opc, asm, PPR64>;
def : InstAlias<asm # "\t$Pd",
(!cast<Instruction>(NAME # _B) PPR8:$Pd, 0b11111), 1>;
def : InstAlias<asm # "\t$Pd",
(!cast<Instruction>(NAME # _H) PPR16:$Pd, 0b11111), 1>;
def : InstAlias<asm # "\t$Pd",
(!cast<Instruction>(NAME # _S) PPR32:$Pd, 0b11111), 1>;
def : InstAlias<asm # "\t$Pd",
(!cast<Instruction>(NAME # _D) PPR64:$Pd, 0b11111), 1>;
}
// PTRUE and PTRUES instruction families, available only with HasSVE.
// The two differ in opc bit 0 (0b000 vs 0b001).
let Predicates = [HasSVE] in {
defm PTRUE : sve_int_ptrue<0b000, "ptrue">;
defm PTRUES : sve_int_ptrue<0b001, "ptrues">;
}
//===----------------------------------------------------------------------===//
// SVE Predicate Misc Group
//===----------------------------------------------------------------------===//
// Encoding for an instruction with no inputs that writes PPR8 register $Pd.
//   opc - 6-bit opcode split across Inst{23-22} (bits 5-4), Inst{18-16}
//         (bits 3-1) and Inst{9} (bit 0).
//   asm - mnemonic.
class sve_int_pfalse<bits<6> opc, string asm>
: I<(outs PPR8:$Pd), (ins),
asm, "\t$Pd",
"",
[]>, Sched<[]> {
bits<4> Pd;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = opc{5-4};
let Inst{21-19} = 0b011;
let Inst{18-16} = opc{3-1};
let Inst{15-10} = 0b111001;
let Inst{9} = opc{0};
let Inst{8-4} = 0b00000;
let Inst{3-0} = Pd;
}
// Encoding for an instruction that reads governing predicate $Pg and PPR8
// predicate $Pn, produces no register result, and defines NZCV.
//   opc - 6-bit opcode split across Inst{23-22} (bits 5-4), Inst{18-16}
//         (bits 3-1) and Inst{9} (bit 0).
//   asm - mnemonic.
class sve_int_ptest<bits<6> opc, string asm>
: I<(outs), (ins PPRAny:$Pg, PPR8:$Pn),
asm, "\t$Pg, $Pn",
"",
[]>, Sched<[]> {
bits<4> Pg;
bits<4> Pn;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = opc{5-4};
let Inst{21-19} = 0b010;
let Inst{18-16} = opc{3-1};
let Inst{15-14} = 0b11;
let Inst{13-10} = Pg;
let Inst{9} = opc{0};
let Inst{8-5} = Pn;
let Inst{4-0} = 0b00000;
let Defs = [NZCV];
}
// Shared encoding for destructive predicate instructions of the form
// "<asm> $Pdn, $Pg, $_Pdn". The result is tied to the $_Pdn source, and the
// instruction defines NZCV.
//   sz8_64 - element-size field, placed in Inst{23-22}.
//   opc    - 5-bit opcode; bits 4-2 go to Inst{18-16}, bits 1-0 to Inst{10-9}.
//   asm    - mnemonic.
//   pprty  - predicate register operand type of $Pdn / $_Pdn.
class sve_int_pfirst_next<bits<2> sz8_64, bits<5> opc, string asm,
PPRRegOp pprty>
: I<(outs pprty:$Pdn), (ins PPRAny:$Pg, pprty:$_Pdn),
asm, "\t$Pdn, $Pg, $_Pdn",
"",
[]>, Sched<[]> {
bits<4> Pdn;
bits<4> Pg;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21-19} = 0b011;
let Inst{18-16} = opc{4-2};
let Inst{15-11} = 0b11000;
let Inst{10-9} = opc{1-0};
let Inst{8-5} = Pg;
let Inst{4} = 0;
let Inst{3-0} = Pdn;
let Constraints = "$Pdn = $_Pdn";
let Defs = [NZCV];
}
// Single unsuffixed instantiation of sve_int_pfirst_next, using size field
// 0b01 together with a PPR8 operand.
// NOTE(review): sve_int_pnext pairs PPR8 with size 0b00 instead; presumably
// 0b01 is what this encoding requires - confirm against the architecture
// reference.
multiclass sve_int_pfirst<bits<5> opc, string asm> {
def : sve_int_pfirst_next<0b01, opc, asm, PPR8>;
}
// Instantiates sve_int_pfirst_next for each element size (_B/_H/_S/_D), with
// the matching size field and predicate register type.
multiclass sve_int_pnext<bits<5> opc, string asm> {
def _B : sve_int_pfirst_next<0b00, opc, asm, PPR8>;
def _H : sve_int_pfirst_next<0b01, opc, asm, PPR16>;
def _S : sve_int_pfirst_next<0b10, opc, asm, PPR32>;
def _D : sve_int_pfirst_next<0b11, opc, asm, PPR64>;
}
//===----------------------------------------------------------------------===//
// SVE Predicate Count Group
//===----------------------------------------------------------------------===//
// Encoding for a destructive scalar-register instruction driven by predicate
// $Pg. The result $Rdn is tied to the $_Rdn source.
//   sz8_64 - element-size field, placed in Inst{23-22}.
//   opc    - 5-bit opcode; bits 4-2 go to Inst{18-16}, bits 1-0 to Inst{10-9}.
//   asm    - mnemonic.
//   dty    - register operand type of the result $Rdn.
//   pprty  - predicate register operand type of $Pg.
//   sty    - register operand type of the tied source $_Rdn.
class sve_int_count_r<bits<2> sz8_64, bits<5> opc, string asm,
RegisterOperand dty, PPRRegOp pprty, RegisterOperand sty>
: I<(outs dty:$Rdn), (ins pprty:$Pg, sty:$_Rdn),
asm, "\t$Rdn, $Pg",
"",
[]>, Sched<[]> {
bits<5> Rdn;
bits<4> Pg;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21-19} = 0b101;
let Inst{18-16} = opc{4-2};
let Inst{15-11} = 0b10001;
let Inst{10-9} = opc{1-0};
let Inst{8-5} = Pg;
let Inst{4-0} = Rdn;
// Signed 32bit forms require their GPR operand printed.
// The AsmString below overrides the one passed to I<>: when opc bits 4, 2, 1
// and 0 are all zero, the tied source $_Rdn is printed as a third operand.
let AsmString = !if(!eq(opc{4,2-0}, 0b0000),
!strconcat(asm, "\t$Rdn, $Pg, $_Rdn"),
!strconcat(asm, "\t$Rdn, $Pg"));
let Constraints = "$Rdn = $_Rdn";
}
// _B/_H/_S/_D variants of sve_int_count_r with a GPR64z result and a
// GPR64as32 tied source.
multiclass sve_int_count_r_s32<bits<5> opc, string asm> {
def _B : sve_int_count_r<0b00, opc, asm, GPR64z, PPR8, GPR64as32>;
def _H : sve_int_count_r<0b01, opc, asm, GPR64z, PPR16, GPR64as32>;
def _S : sve_int_count_r<0b10, opc, asm, GPR64z, PPR32, GPR64as32>;
def _D : sve_int_count_r<0b11, opc, asm, GPR64z, PPR64, GPR64as32>;
}
// _B/_H/_S/_D variants of sve_int_count_r with GPR32z for both the result
// and the tied source.
multiclass sve_int_count_r_u32<bits<5> opc, string asm> {
def _B : sve_int_count_r<0b00, opc, asm, GPR32z, PPR8, GPR32z>;
def _H : sve_int_count_r<0b01, opc, asm, GPR32z, PPR16, GPR32z>;
def _S : sve_int_count_r<0b10, opc, asm, GPR32z, PPR32, GPR32z>;
def _D : sve_int_count_r<0b11, opc, asm, GPR32z, PPR64, GPR32z>;
}
// _B/_H/_S/_D variants of sve_int_count_r with GPR64z for both the result
// and the tied source.
multiclass sve_int_count_r_x64<bits<5> opc, string asm> {
def _B : sve_int_count_r<0b00, opc, asm, GPR64z, PPR8, GPR64z>;
def _H : sve_int_count_r<0b01, opc, asm, GPR64z, PPR16, GPR64z>;
def _S : sve_int_count_r<0b10, opc, asm, GPR64z, PPR32, GPR64z>;
def _D : sve_int_count_r<0b11, opc, asm, GPR64z, PPR64, GPR64z>;
}
// Encoding for a destructive vector instruction driven by a predicate operand
// placed in Inst{8-5}. The result $Zdn is tied to the $_Zdn source.
//   sz8_64 - element-size field, placed in Inst{23-22}.
//   opc    - 5-bit opcode; bits 4-2 go to Inst{18-16}, bits 1-0 to Inst{10-9}.
//   asm    - mnemonic.
//   zprty  - vector register operand type.
// This hunk replaces the PPRAny:$Pg operand with a typed pprty:$Pm operand.
class sve_int_count_v<bits<2> sz8_64, bits<5> opc, string asm,
- ZPRRegOp zprty>
-: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, PPRAny:$Pg),
- asm, "\t$Zdn, $Pg",
+ ZPRRegOp zprty, PPRRegOp pprty>
+: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, pprty:$Pm),
+ asm, "\t$Zdn, $Pm",
"",
[]>, Sched<[]> {
- bits<4> Pg;
+ bits<4> Pm;
bits<5> Zdn;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21-19} = 0b101;
let Inst{18-16} = opc{4-2};
let Inst{15-11} = 0b10000;
let Inst{10-9} = opc{1-0};
- let Inst{8-5} = Pg;
+ let Inst{8-5} = Pm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = Destructive;
let ElementSize = ElementSizeNone;
}
// Instantiates the _H/_S/_D variants (no _B variant is defined). The added
// lines pair each vector type with the same-width predicate type and add
// aliases that accept a PPRAny predicate.
// NOTE(review): the trailing 0 on each InstAlias is presumably an emit
// priority that makes the alias parse-only - confirm against the InstAlias
// definition.
multiclass sve_int_count_v<bits<5> opc, string asm> {
- def _H : sve_int_count_v<0b01, opc, asm, ZPR16>;
- def _S : sve_int_count_v<0b10, opc, asm, ZPR32>;
- def _D : sve_int_count_v<0b11, opc, asm, ZPR64>;
+ def _H : sve_int_count_v<0b01, opc, asm, ZPR16, PPR16>;
+ def _S : sve_int_count_v<0b10, opc, asm, ZPR32, PPR32>;
+ def _D : sve_int_count_v<0b11, opc, asm, ZPR64, PPR64>;
+
+ def : InstAlias<asm # "\t$Zdn, $Pm",
+ (!cast<Instruction>(NAME # "_H") ZPR16:$Zdn, PPRAny:$Pm), 0>;
+ def : InstAlias<asm # "\t$Zdn, $Pm",
+ (!cast<Instruction>(NAME # "_S") ZPR32:$Zdn, PPRAny:$Pm), 0>;
+ def : InstAlias<asm # "\t$Zdn, $Pm",
+ (!cast<Instruction>(NAME # "_D") ZPR64:$Zdn, PPRAny:$Pm), 0>;
}
// Encoding for an instruction that reads governing predicate $Pg and
// predicate $Pn and writes a GPR64 result $Rd.
//   sz8_64 - element-size field, placed in Inst{23-22}.
//   opc    - 4-bit opcode; bits 3-1 go to Inst{18-16}, bit 0 to Inst{9}.
//   asm    - mnemonic.
//   pprty  - predicate register operand type of $Pn.
class sve_int_pcount_pred<bits<2> sz8_64, bits<4> opc, string asm,
PPRRegOp pprty>
: I<(outs GPR64:$Rd), (ins PPRAny:$Pg, pprty:$Pn),
asm, "\t$Rd, $Pg, $Pn",
"",
[]>, Sched<[]> {
bits<4> Pg;
bits<4> Pn;
bits<5> Rd;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21-19} = 0b100;
let Inst{18-16} = opc{3-1};
let Inst{15-14} = 0b10;
let Inst{13-10} = Pg;
let Inst{9} = opc{0};
let Inst{8-5} = Pn;
let Inst{4-0} = Rd;
}
// Instantiates the _B/_H/_S/_D element-size variants.
multiclass sve_int_pcount_pred<bits<4> opc, string asm> {
def _B : sve_int_pcount_pred<0b00, opc, asm, PPR8>;
def _H : sve_int_pcount_pred<0b01, opc, asm, PPR16>;
def _S : sve_int_pcount_pred<0b10, opc, asm, PPR32>;
def _D : sve_int_pcount_pred<0b11, opc, asm, PPR64>;
}
//===----------------------------------------------------------------------===//
// SVE Element Count Group
//===----------------------------------------------------------------------===//
// Encoding for an instruction that writes a GPR64 result $Rd from a
// sve_pred_enum pattern and an sve_incdec_imm multiplier, printed as
// "<asm> $Rd, $pattern, mul $imm4".
//   opc - 3-bit opcode; bits 2-1 go to Inst{23-22}, bit 0 to Inst{10}.
//   asm - mnemonic.
class sve_int_count<bits<3> opc, string asm>
: I<(outs GPR64:$Rd), (ins sve_pred_enum:$pattern, sve_incdec_imm:$imm4),
asm, "\t$Rd, $pattern, mul $imm4",
"",
[]>, Sched<[]> {
bits<5> Rd;
bits<4> imm4;
bits<5> pattern;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = opc{2-1};
let Inst{21-20} = 0b10;
let Inst{19-16} = imm4;
let Inst{15-11} = 0b11100;
let Inst{10} = opc{0};
let Inst{9-5} = pattern;
let Inst{4-0} = Rd;
}
// Defines the instruction plus two shorthand aliases: one that omits the
// multiplier (supplying 1), and one that also omits the pattern (supplying
// 0b11111).
// NOTE(review): the trailing InstAlias integers (1, 2) are presumably emit
// priorities - confirm against the InstAlias definition.
multiclass sve_int_count<bits<3> opc, string asm> {
def NAME : sve_int_count<opc, asm>;
def : InstAlias<asm # "\t$Rd, $pattern",
(!cast<Instruction>(NAME) GPR64:$Rd, sve_pred_enum:$pattern, 1), 1>;
def : InstAlias<asm # "\t$Rd",
(!cast<Instruction>(NAME) GPR64:$Rd, 0b11111, 1), 2>;
}
// Encoding for a destructive vector instruction taking a sve_pred_enum
// pattern and an sve_incdec_imm multiplier. The result $Zdn is tied to $_Zdn.
//   opc   - 5-bit opcode split across Inst{23-22} (bits 4-3), Inst{20}
//           (bit 2) and Inst{11-10} (bits 1-0).
//   asm   - mnemonic.
//   zprty - vector register operand type.
class sve_int_countvlv<bits<5> opc, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, sve_pred_enum:$pattern, sve_incdec_imm:$imm4),
asm, "\t$Zdn, $pattern, mul $imm4",
"",
[]>, Sched<[]> {
bits<5> Zdn;
bits<5> pattern;
bits<4> imm4;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = opc{4-3};
let Inst{21} = 0b1;
let Inst{20} = opc{2};
let Inst{19-16} = imm4;
let Inst{15-12} = 0b1100;
let Inst{11-10} = opc{1-0};
let Inst{9-5} = pattern;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = Destructive;
let ElementSize = ElementSizeNone;
}
// Defines the instruction plus two shorthand aliases: one that omits the
// multiplier (supplying 1), and one that also omits the pattern (supplying
// 0b11111).
multiclass sve_int_countvlv<bits<5> opc, string asm, ZPRRegOp zprty> {
def NAME : sve_int_countvlv<opc, asm, zprty>;
def : InstAlias<asm # "\t$Zdn, $pattern",
(!cast<Instruction>(NAME) zprty:$Zdn, sve_pred_enum:$pattern, 1), 1>;
def : InstAlias<asm # "\t$Zdn",
(!cast<Instruction>(NAME) zprty:$Zdn, 0b11111, 1), 2>;
}
// Encoding for a destructive GPR64 instruction taking a sve_pred_enum pattern
// and an sve_incdec_imm multiplier. The result $Rdn is tied to $_Rdn.
//   opc - 3-bit opcode; bits 2-1 go to Inst{23-22}, bit 0 to Inst{10}.
//   asm - mnemonic.
class sve_int_pred_pattern_a<bits<3> opc, string asm>
: I<(outs GPR64:$Rdn), (ins GPR64:$_Rdn, sve_pred_enum:$pattern, sve_incdec_imm:$imm4),
asm, "\t$Rdn, $pattern, mul $imm4",
"",
[]>, Sched<[]> {
bits<5> Rdn;
bits<5> pattern;
bits<4> imm4;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = opc{2-1};
let Inst{21-20} = 0b11;
let Inst{19-16} = imm4;
let Inst{15-11} = 0b11100;
let Inst{10} = opc{0};
let Inst{9-5} = pattern;
let Inst{4-0} = Rdn;
let Constraints = "$Rdn = $_Rdn";
}
// Defines the instruction plus two shorthand aliases: one that omits the
// multiplier (supplying 1), and one that also omits the pattern (supplying
// 0b11111).
multiclass sve_int_pred_pattern_a<bits<3> opc, string asm> {
def NAME : sve_int_pred_pattern_a<opc, asm>;
def : InstAlias<asm # "\t$Rdn, $pattern",
(!cast<Instruction>(NAME) GPR64:$Rdn, sve_pred_enum:$pattern, 1), 1>;
def : InstAlias<asm # "\t$Rdn",
(!cast<Instruction>(NAME) GPR64:$Rdn, 0b11111, 1), 2>;
}
// Encoding for a destructive scalar instruction taking a sve_pred_enum
// pattern and an sve_incdec_imm multiplier, where the result and tied source
// may use different register operand types.
//   opc - 5-bit opcode split across Inst{23-22} (bits 4-3), Inst{20} (bit 2)
//         and Inst{11-10} (bits 1-0).
//   asm - mnemonic.
//   dt  - register operand type of the result $Rdn.
//   st  - register operand type of the tied source $_Rdn.
class sve_int_pred_pattern_b<bits<5> opc, string asm, RegisterOperand dt,
RegisterOperand st>
: I<(outs dt:$Rdn), (ins st:$_Rdn, sve_pred_enum:$pattern, sve_incdec_imm:$imm4),
asm, "\t$Rdn, $pattern, mul $imm4",
"",
[]>, Sched<[]> {
bits<5> Rdn;
bits<5> pattern;
bits<4> imm4;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = opc{4-3};
let Inst{21} = 0b1;
let Inst{20} = opc{2};
let Inst{19-16} = imm4;
let Inst{15-12} = 0b1111;
let Inst{11-10} = opc{1-0};
let Inst{9-5} = pattern;
let Inst{4-0} = Rdn;
// Signed 32bit forms require their GPR operand printed.
// The AsmString below overrides the one passed to I<>: when opc bits 2 and 0
// are both zero, the tied source $_Rdn is printed as the second operand.
let AsmString = !if(!eq(opc{2,0}, 0b00),
!strconcat(asm, "\t$Rdn, $_Rdn, $pattern, mul $imm4"),
!strconcat(asm, "\t$Rdn, $pattern, mul $imm4"));
let Constraints = "$Rdn = $_Rdn";
}
// sve_int_pred_pattern_b with a GPR64z result and a GPR64as32 tied source.
// Both registers appear in the alias syntax. The aliases omit the multiplier
// (supplying 1) and optionally the pattern (supplying 0b11111).
multiclass sve_int_pred_pattern_b_s32<bits<5> opc, string asm> {
def NAME : sve_int_pred_pattern_b<opc, asm, GPR64z, GPR64as32>;
def : InstAlias<asm # "\t$Rd, $Rn, $pattern",
(!cast<Instruction>(NAME) GPR64z:$Rd, GPR64as32:$Rn, sve_pred_enum:$pattern, 1), 1>;
def : InstAlias<asm # "\t$Rd, $Rn",
(!cast<Instruction>(NAME) GPR64z:$Rd, GPR64as32:$Rn, 0b11111, 1), 2>;
}
// sve_int_pred_pattern_b with GPR32z for both result and tied source. The
// aliases omit the multiplier (supplying 1) and optionally the pattern
// (supplying 0b11111).
multiclass sve_int_pred_pattern_b_u32<bits<5> opc, string asm> {
def NAME : sve_int_pred_pattern_b<opc, asm, GPR32z, GPR32z>;
def : InstAlias<asm # "\t$Rdn, $pattern",
(!cast<Instruction>(NAME) GPR32z:$Rdn, sve_pred_enum:$pattern, 1), 1>;
def : InstAlias<asm # "\t$Rdn",
(!cast<Instruction>(NAME) GPR32z:$Rdn, 0b11111, 1), 2>;
}
// sve_int_pred_pattern_b with GPR64z for both result and tied source. The
// aliases omit the multiplier (supplying 1) and optionally the pattern
// (supplying 0b11111).
multiclass sve_int_pred_pattern_b_x64<bits<5> opc, string asm> {
def NAME : sve_int_pred_pattern_b<opc, asm, GPR64z, GPR64z>;
def : InstAlias<asm # "\t$Rdn, $pattern",
(!cast<Instruction>(NAME) GPR64z:$Rdn, sve_pred_enum:$pattern, 1), 1>;
def : InstAlias<asm # "\t$Rdn",
(!cast<Instruction>(NAME) GPR64z:$Rdn, 0b11111, 1), 2>;
}
//===----------------------------------------------------------------------===//
// SVE Permute - Cross Lane Group
//===----------------------------------------------------------------------===//
// Encoding for an instruction that writes vector $Zd from scalar register $Rn.
//   sz8_64     - element-size field, placed in Inst{23-22}.
//   asm        - mnemonic.
//   zprty      - vector register operand type of the destination.
//   srcRegType - register class of the scalar source.
class sve_int_perm_dup_r<bits<2> sz8_64, string asm, ZPRRegOp zprty,
RegisterClass srcRegType>
: I<(outs zprty:$Zd), (ins srcRegType:$Rn),
asm, "\t$Zd, $Rn",
"",
[]>, Sched<[]> {
bits<5> Rn;
bits<5> Zd;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-10} = 0b100000001110;
let Inst{9-5} = Rn;
let Inst{4-0} = Zd;
}
// Instantiates the _B/_H/_S/_D variants (GPR32sp source for B/H/S, GPR64sp
// for D) and adds a "mov $Zd, $Rn" alias for each.
multiclass sve_int_perm_dup_r<string asm> {
def _B : sve_int_perm_dup_r<0b00, asm, ZPR8, GPR32sp>;
def _H : sve_int_perm_dup_r<0b01, asm, ZPR16, GPR32sp>;
def _S : sve_int_perm_dup_r<0b10, asm, ZPR32, GPR32sp>;
def _D : sve_int_perm_dup_r<0b11, asm, ZPR64, GPR64sp>;
def : InstAlias<"mov $Zd, $Rn",
(!cast<Instruction>(NAME # _B) ZPR8:$Zd, GPR32sp:$Rn), 1>;
def : InstAlias<"mov $Zd, $Rn",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, GPR32sp:$Rn), 1>;
def : InstAlias<"mov $Zd, $Rn",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, GPR32sp:$Rn), 1>;
def : InstAlias<"mov $Zd, $Rn",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, GPR64sp:$Rn), 1>;
}
// Encoding for an instruction that writes vector $Zd from an indexed element
// of vector $Zn, printed as "<asm> $Zd, $Zn$idx".
//   tsz     - 5-bit field placed in Inst{20-16}; its lowest set bit marks the
//             element size and the '?' bits above it are filled with index
//             bits by the multiclass.
//   immtype - operand type of the element index.
//   asm     - mnemonic.
//   zprty   - vector register operand type.
// Inst{23-22} is left unset here and assigned per element size in the
// multiclass.
class sve_int_perm_dup_i<bits<5> tsz, Operand immtype, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn, immtype:$idx),
asm, "\t$Zd, $Zn$idx",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<7> idx;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = {?,?}; // imm3h
let Inst{21} = 0b1;
let Inst{20-16} = tsz;
let Inst{15-10} = 0b001000;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
// Instantiates the _B/_H/_S/_D/_Q variants. Each variant puts the two high
// bits of its index into Inst{23-22} and the remaining low bits into the
// unset upper bits of the tsz field; the index uses 6/5/4/3/2 bits for
// B/H/S/D/Q. Two groups of "mov" aliases follow: an indexed form that mirrors
// the instruction, and a form taking an FPR-as-ZPR source with index 0.
multiclass sve_int_perm_dup_i<string asm> {
def _B : sve_int_perm_dup_i<{?,?,?,?,1}, sve_elm_idx_extdup_b, asm, ZPR8> {
let Inst{23-22} = idx{5-4};
let Inst{20-17} = idx{3-0};
}
def _H : sve_int_perm_dup_i<{?,?,?,1,0}, sve_elm_idx_extdup_h, asm, ZPR16> {
let Inst{23-22} = idx{4-3};
let Inst{20-18} = idx{2-0};
}
def _S : sve_int_perm_dup_i<{?,?,1,0,0}, sve_elm_idx_extdup_s, asm, ZPR32> {
let Inst{23-22} = idx{3-2};
let Inst{20-19} = idx{1-0};
}
def _D : sve_int_perm_dup_i<{?,1,0,0,0}, sve_elm_idx_extdup_d, asm, ZPR64> {
let Inst{23-22} = idx{2-1};
let Inst{20} = idx{0};
}
def _Q : sve_int_perm_dup_i<{1,0,0,0,0}, sve_elm_idx_extdup_q, asm, ZPR128> {
let Inst{23-22} = idx{1-0};
}
def : InstAlias<"mov $Zd, $Zn$idx",
(!cast<Instruction>(NAME # _B) ZPR8:$Zd, ZPR8:$Zn, sve_elm_idx_extdup_b:$idx), 1>;
def : InstAlias<"mov $Zd, $Zn$idx",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, ZPR16:$Zn, sve_elm_idx_extdup_h:$idx), 1>;
def : InstAlias<"mov $Zd, $Zn$idx",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, ZPR32:$Zn, sve_elm_idx_extdup_s:$idx), 1>;
def : InstAlias<"mov $Zd, $Zn$idx",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, ZPR64:$Zn, sve_elm_idx_extdup_d:$idx), 1>;
def : InstAlias<"mov $Zd, $Zn$idx",
(!cast<Instruction>(NAME # _Q) ZPR128:$Zd, ZPR128:$Zn, sve_elm_idx_extdup_q:$idx), 1>;
def : InstAlias<"mov $Zd, $Bn",
(!cast<Instruction>(NAME # _B) ZPR8:$Zd, FPR8asZPR:$Bn, 0), 2>;
def : InstAlias<"mov $Zd, $Hn",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, FPR16asZPR:$Hn, 0), 2>;
def : InstAlias<"mov $Zd, $Sn",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, FPR32asZPR:$Sn, 0), 2>;
def : InstAlias<"mov $Zd, $Dn",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, FPR64asZPR:$Dn, 0), 2>;
def : InstAlias<"mov $Zd, $Qn",
(!cast<Instruction>(NAME # _Q) ZPR128:$Zd, FPR128asZPR:$Qn, 0), 2>;
}
// Encoding for an instruction that writes vector $Zd from a vector-list
// operand $Zn and a vector operand $Zm.
//   sz8_64  - element-size field, placed in Inst{23-22}.
//   opc     - 2-bit opcode, placed in Inst{12-11}.
//   asm     - mnemonic.
//   zprty   - vector register operand type of $Zd and $Zm.
//   VecList - register operand type of the $Zn list.
class sve_int_perm_tbl<bits<2> sz8_64, bits<2> opc, string asm,
ZPRRegOp zprty, RegisterOperand VecList>
: I<(outs zprty:$Zd), (ins VecList:$Zn, zprty:$Zm),
asm, "\t$Zd, $Zn, $Zm",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-13} = 0b001;
let Inst{12-11} = opc;
let Inst{10} = 0b0;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
// Instantiates the _B/_H/_S/_D variants with opc 0b10 and the Z_b/Z_h/Z_s/Z_d
// list operand types. The aliases accept a plain ZPR register for $Zn in
// place of the list operand.
// NOTE(review): the trailing 0 on each InstAlias is presumably an emit
// priority that makes the alias parse-only - confirm against the InstAlias
// definition.
multiclass sve_int_perm_tbl<string asm> {
def _B : sve_int_perm_tbl<0b00, 0b10, asm, ZPR8, Z_b>;
def _H : sve_int_perm_tbl<0b01, 0b10, asm, ZPR16, Z_h>;
def _S : sve_int_perm_tbl<0b10, 0b10, asm, ZPR32, Z_s>;
def _D : sve_int_perm_tbl<0b11, 0b10, asm, ZPR64, Z_d>;
def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
(!cast<Instruction>(NAME # _B) ZPR8:$Zd, ZPR8:$Zn, ZPR8:$Zm), 0>;
def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, ZPR16:$Zn, ZPR16:$Zm), 0>;
def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, ZPR32:$Zn, ZPR32:$Zm), 0>;
def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, ZPR64:$Zn, ZPR64:$Zm), 0>;
}
// Instantiates sve_int_perm_tbl with opc 0b01 and the ZZ_b/ZZ_h/ZZ_s/ZZ_d
// list operand types for the _B/_H/_S/_D variants.
multiclass sve2_int_perm_tbl<string asm> {
def _B : sve_int_perm_tbl<0b00, 0b01, asm, ZPR8, ZZ_b>;
def _H : sve_int_perm_tbl<0b01, 0b01, asm, ZPR16, ZZ_h>;
def _S : sve_int_perm_tbl<0b10, 0b01, asm, ZPR32, ZZ_s>;
def _D : sve_int_perm_tbl<0b11, 0b01, asm, ZPR64, ZZ_d>;
}
// Encoding for an instruction that writes vector $Zd from vectors $Zn and $Zm.
//   sz8_64 - element-size field, placed in Inst{23-22}.
//   asm    - mnemonic.
//   zprty  - vector register operand type of all operands.
// This hunk adds a $_Zd input tied to the $Zd result, so the destination is
// also modelled as a source; the printed operands are unchanged.
class sve2_int_perm_tbx<bits<2> sz8_64, string asm, ZPRRegOp zprty>
-: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
+: I<(outs zprty:$Zd), (ins zprty:$_Zd, zprty:$Zn, zprty:$Zm),
asm, "\t$Zd, $Zn, $Zm",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-10} = 0b001011;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
}
// Instantiates the _B/_H/_S/_D element-size variants.
multiclass sve2_int_perm_tbx<string asm> {
def _B : sve2_int_perm_tbx<0b00, asm, ZPR8>;
def _H : sve2_int_perm_tbx<0b01, asm, ZPR16>;
def _S : sve2_int_perm_tbx<0b10, asm, ZPR32>;
def _D : sve2_int_perm_tbx<0b11, asm, ZPR64>;
}
// Encoding for a unary vector instruction: writes $Zd from $Zn.
//   sz8_64 - element-size field, placed in Inst{23-22}.
//   asm    - mnemonic.
//   zprty  - vector register operand type.
class sve_int_perm_reverse_z<bits<2> sz8_64, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn),
asm, "\t$Zd, $Zn",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-10} = 0b111000001110;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
// Instantiates the _B/_H/_S/_D element-size variants.
multiclass sve_int_perm_reverse_z<string asm> {
def _B : sve_int_perm_reverse_z<0b00, asm, ZPR8>;
def _H : sve_int_perm_reverse_z<0b01, asm, ZPR16>;
def _S : sve_int_perm_reverse_z<0b10, asm, ZPR32>;
def _D : sve_int_perm_reverse_z<0b11, asm, ZPR64>;
}
// Encoding for a unary predicate instruction: writes $Pd from $Pn.
//   sz8_64 - element-size field, placed in Inst{23-22}.
//   asm    - mnemonic.
//   pprty  - predicate register operand type.
class sve_int_perm_reverse_p<bits<2> sz8_64, string asm, PPRRegOp pprty>
: I<(outs pprty:$Pd), (ins pprty:$Pn),
asm, "\t$Pd, $Pn",
"",
[]>, Sched<[]> {
bits<4> Pd;
bits<4> Pn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-9} = 0b1101000100000;
let Inst{8-5} = Pn;
let Inst{4} = 0b0;
let Inst{3-0} = Pd;
}
// Instantiates the _B/_H/_S/_D element-size variants.
multiclass sve_int_perm_reverse_p<string asm> {
def _B : sve_int_perm_reverse_p<0b00, asm, PPR8>;
def _H : sve_int_perm_reverse_p<0b01, asm, PPR16>;
def _S : sve_int_perm_reverse_p<0b10, asm, PPR32>;
def _D : sve_int_perm_reverse_p<0b11, asm, PPR64>;
}
// Encoding for a unary vector instruction whose destination and source use
// different element types.
//   sz16_64 - destination element-size field, placed in Inst{23-22}.
//   opc     - 2-bit opcode, placed in Inst{17-16}.
//   asm     - mnemonic.
//   zprty1  - vector register operand type of the destination $Zd.
//   zprty2  - vector register operand type of the source $Zn.
class sve_int_perm_unpk<bits<2> sz16_64, bits<2> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zd), (ins zprty2:$Zn),
asm, "\t$Zd, $Zn",
"", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz16_64;
let Inst{21-18} = 0b1100;
let Inst{17-16} = opc;
let Inst{15-10} = 0b001110;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
// Instantiates the _H/_S/_D variants; in each, the source element type is
// half the width of the destination (ZPR8->ZPR16, ZPR16->ZPR32, ZPR32->ZPR64).
multiclass sve_int_perm_unpk<bits<2> opc, string asm> {
def _H : sve_int_perm_unpk<0b01, opc, asm, ZPR16, ZPR8>;
def _S : sve_int_perm_unpk<0b10, opc, asm, ZPR32, ZPR16>;
def _D : sve_int_perm_unpk<0b11, opc, asm, ZPR64, ZPR32>;
}
// Encoding for a destructive vector instruction taking a scalar register
// $Rm. The result $Zdn is tied to $_Zdn.
//   sz8_64     - element-size field, placed in Inst{23-22}.
//   asm        - mnemonic.
//   zprty      - vector register operand type.
//   srcRegType - register class of the scalar source $Rm.
class sve_int_perm_insrs<bits<2> sz8_64, string asm, ZPRRegOp zprty,
RegisterClass srcRegType>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, srcRegType:$Rm),
asm, "\t$Zdn, $Rm",
"",
[]>, Sched<[]> {
bits<5> Rm;
bits<5> Zdn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-10} = 0b100100001110;
let Inst{9-5} = Rm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = Destructive;
let ElementSize = ElementSizeNone;
}
// Instantiates the _B/_H/_S/_D variants (GPR32 source for B/H/S, GPR64 for D).
multiclass sve_int_perm_insrs<string asm> {
def _B : sve_int_perm_insrs<0b00, asm, ZPR8, GPR32>;
def _H : sve_int_perm_insrs<0b01, asm, ZPR16, GPR32>;
def _S : sve_int_perm_insrs<0b10, asm, ZPR32, GPR32>;
def _D : sve_int_perm_insrs<0b11, asm, ZPR64, GPR64>;
}
// Encoding for a destructive vector instruction taking a scalar FP/SIMD
// register $Vm. The result $Zdn is tied to $_Zdn.
//   sz8_64     - element-size field, placed in Inst{23-22}.
//   asm        - mnemonic.
//   zprty      - vector register operand type.
//   srcRegType - register class of the source $Vm.
class sve_int_perm_insrv<bits<2> sz8_64, string asm, ZPRRegOp zprty,
RegisterClass srcRegType>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, srcRegType:$Vm),
asm, "\t$Zdn, $Vm",
"",
[]>, Sched<[]> {
bits<5> Vm;
bits<5> Zdn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-10} = 0b110100001110;
let Inst{9-5} = Vm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = Destructive;
let ElementSize = ElementSizeNone;
}
// Instantiates the _B/_H/_S/_D variants with FPR8/FPR16/FPR32/FPR64 sources.
multiclass sve_int_perm_insrv<string asm> {
def _B : sve_int_perm_insrv<0b00, asm, ZPR8, FPR8>;
def _H : sve_int_perm_insrv<0b01, asm, ZPR16, FPR16>;
def _S : sve_int_perm_insrv<0b10, asm, ZPR32, FPR32>;
def _D : sve_int_perm_insrv<0b11, asm, ZPR64, FPR64>;
}
//===----------------------------------------------------------------------===//
// SVE Permute - Extract Group
//===----------------------------------------------------------------------===//
// Encoding for a destructive byte-vector instruction with an 8-bit immediate,
// printed as "<asm> $Zdn, $_Zdn, $Zm, $imm8". The result $Zdn is tied to
// $_Zdn.
//   asm - mnemonic.
// The immediate is split: bits 7-3 go to Inst{20-16}, bits 2-0 to Inst{12-10}.
class sve_int_perm_extract_i<string asm>
: I<(outs ZPR8:$Zdn), (ins ZPR8:$_Zdn, ZPR8:$Zm, imm0_255:$imm8),
asm, "\t$Zdn, $_Zdn, $Zm, $imm8",
"", []>, Sched<[]> {
bits<5> Zdn;
bits<5> Zm;
bits<8> imm8;
let Inst{31-21} = 0b00000101001;
let Inst{20-16} = imm8{7-3};
let Inst{15-13} = 0b000;
let Inst{12-10} = imm8{2-0};
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = Destructive;
let ElementSize = ElementSizeNone;
}
// Non-destructive counterpart of the extract encoding: writes ZPR8 $Zd from a
// ZZ_b vector-list operand $Zn and an 8-bit immediate.
//   asm - mnemonic.
// The immediate is split: bits 7-3 go to Inst{20-16}, bits 2-0 to Inst{12-10}.
class sve2_int_perm_extract_i_cons<string asm>
: I<(outs ZPR8:$Zd), (ins ZZ_b:$Zn, imm0_255:$imm8),
asm, "\t$Zd, $Zn, $imm8",
"", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<8> imm8;
let Inst{31-21} = 0b00000101011;
let Inst{20-16} = imm8{7-3};
let Inst{15-13} = 0b000;
let Inst{12-10} = imm8{2-0};
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
//===----------------------------------------------------------------------===//
// SVE Vector Select Group
//===----------------------------------------------------------------------===//
// Encoding for an instruction that writes vector $Zd from predicate $Pg and
// vectors $Zn and $Zm.
//   sz8_64 - element-size field, placed in Inst{23-22}.
//   asm    - mnemonic.
//   zprty  - vector register operand type.
class sve_int_sel_vvv<bits<2> sz8_64, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins PPRAny:$Pg, zprty:$Zn, zprty:$Zm),
asm, "\t$Zd, $Pg, $Zn, $Zm",
"",
[]>, Sched<[]> {
bits<4> Pg;
bits<5> Zd;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-14} = 0b11;
let Inst{13-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
// Instantiates the _B/_H/_S/_D variants and, for each, a
// "mov $Zd, $Pg/m, $Zn" alias that passes $Zd again as the $Zm operand.
multiclass sve_int_sel_vvv<string asm> {
def _B : sve_int_sel_vvv<0b00, asm, ZPR8>;
def _H : sve_int_sel_vvv<0b01, asm, ZPR16>;
def _S : sve_int_sel_vvv<0b10, asm, ZPR32>;
def _D : sve_int_sel_vvv<0b11, asm, ZPR64>;
def : InstAlias<"mov $Zd, $Pg/m, $Zn",
(!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPRAny:$Pg, ZPR8:$Zn, ZPR8:$Zd), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $Zn",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, ZPR16:$Zn, ZPR16:$Zd), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $Zn",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, ZPR32:$Zn, ZPR32:$Zd), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $Zn",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, ZPR64:$Zn, ZPR64:$Zd), 1>;
}
//===----------------------------------------------------------------------===//
// SVE Predicate Logical Operations Group
//===----------------------------------------------------------------------===//
// Encoding for a predicated instruction on PPR8 predicates: writes $Pd from
// governing predicate $Pg and sources $Pn and $Pm.
//   opc - 4-bit opcode split across Inst{23-22} (bits 3-2), Inst{9} (bit 1)
//         and Inst{4} (bit 0).
//   asm - mnemonic.
class sve_int_pred_log<bits<4> opc, string asm>
: I<(outs PPR8:$Pd), (ins PPRAny:$Pg, PPR8:$Pn, PPR8:$Pm),
asm, "\t$Pd, $Pg/z, $Pn, $Pm",
"",
[]>, Sched<[]> {
bits<4> Pd;
bits<4> Pg;
bits<4> Pm;
bits<4> Pn;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = opc{3-2};
let Inst{21-20} = 0b00;
let Inst{19-16} = Pm;
let Inst{15-14} = 0b01;
let Inst{13-10} = Pg;
let Inst{9} = opc{1};
let Inst{8-5} = Pn;
let Inst{4} = opc{0};
let Inst{3-0} = Pd;
// SEL has no predication qualifier.
// The AsmString below overrides the one passed to I<>: opc 0b0011 prints
// $Pg without the "/z" suffix; every other opcode keeps it.
let AsmString = !if(!eq(opc, 0b0011),
!strconcat(asm, "\t$Pd, $Pg, $Pn, $Pm"),
!strconcat(asm, "\t$Pd, $Pg/z, $Pn, $Pm"));
// Opcodes with bit 2 set also define the NZCV flags.
let Defs = !if(!eq (opc{2}, 1), [NZCV], []);
}
//===----------------------------------------------------------------------===//
// SVE Logical Mask Immediate Group
//===----------------------------------------------------------------------===//
// Encoding for a destructive vector instruction with a 13-bit logical
// immediate, defined on ZPR64 with a logical_imm64 operand. The result $Zdn
// is tied to $_Zdn. Decoding uses DecodeSVELogicalImmInstruction.
//   opc - 2-bit opcode, placed in Inst{23-22}.
//   asm - mnemonic.
class sve_int_log_imm<bits<2> opc, string asm>
: I<(outs ZPR64:$Zdn), (ins ZPR64:$_Zdn, logical_imm64:$imms13),
asm, "\t$Zdn, $_Zdn, $imms13",
"", []>, Sched<[]> {
bits<5> Zdn;
bits<13> imms13;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = opc;
let Inst{21-18} = 0b0000;
let Inst{17-5} = imms13;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DecoderMethod = "DecodeSVELogicalImmInstruction";
let DestructiveInstType = Destructive;
let ElementSize = ElementSizeNone;
}
// Defines the instruction plus two groups of aliases:
//   - the same mnemonic with ZPR8/ZPR16/ZPR32 operands and the matching
//     sve_logical_imm8/16/32 operand (trailing values 4, 3, 2);
//   - the `alias` mnemonic with the *_not immediate operands for all four
//     element sizes (trailing value 0).
// NOTE(review): the trailing InstAlias integers are presumably emit
// priorities, with 0 meaning parse-only - confirm against the InstAlias
// definition.
multiclass sve_int_log_imm<bits<2> opc, string asm, string alias> {
def NAME : sve_int_log_imm<opc, asm>;
def : InstAlias<asm # "\t$Zdn, $Zdn, $imm",
(!cast<Instruction>(NAME) ZPR8:$Zdn, sve_logical_imm8:$imm), 4>;
def : InstAlias<asm # "\t$Zdn, $Zdn, $imm",
(!cast<Instruction>(NAME) ZPR16:$Zdn, sve_logical_imm16:$imm), 3>;
def : InstAlias<asm # "\t$Zdn, $Zdn, $imm",
(!cast<Instruction>(NAME) ZPR32:$Zdn, sve_logical_imm32:$imm), 2>;
def : InstAlias<alias # "\t$Zdn, $Zdn, $imm",
(!cast<Instruction>(NAME) ZPR8:$Zdn, sve_logical_imm8_not:$imm), 0>;
def : InstAlias<alias # "\t$Zdn, $Zdn, $imm",
(!cast<Instruction>(NAME) ZPR16:$Zdn, sve_logical_imm16_not:$imm), 0>;
def : InstAlias<alias # "\t$Zdn, $Zdn, $imm",
(!cast<Instruction>(NAME) ZPR32:$Zdn, sve_logical_imm32_not:$imm), 0>;
def : InstAlias<alias # "\t$Zdn, $Zdn, $imm",
(!cast<Instruction>(NAME) ZPR64:$Zdn, logical_imm64_not:$imm), 0>;
}
// Encoding for an instruction that writes ZPR64 $Zd from a 13-bit logical
// immediate. Marked rematerializable; decoding uses
// DecodeSVELogicalImmInstruction.
//   asm - mnemonic.
class sve_int_dup_mask_imm<string asm>
: I<(outs ZPR64:$Zd), (ins logical_imm64:$imms),
asm, "\t$Zd, $imms",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<13> imms;
let Inst{31-18} = 0b00000101110000;
let Inst{17-5} = imms;
let Inst{4-0} = Zd;
let isReMaterializable = 1;
let DecoderMethod = "DecodeSVELogicalImmInstruction";
}
// Defines the instruction plus two groups of aliases:
//   - "dupm" with ZPR8/ZPR16/ZPR32 operands and the matching
//     sve_logical_imm8/16/32 operand (trailing values 4, 3, 2);
//   - "mov" with ZPR16/ZPR32/ZPR64 operands and the sve_preferred_logical_imm*
//     operands (trailing values 7, 6, 5).
// The "dupm" aliases hard-code the mnemonic rather than using `asm`.
multiclass sve_int_dup_mask_imm<string asm> {
def NAME : sve_int_dup_mask_imm<asm>;
def : InstAlias<"dupm $Zd, $imm",
(!cast<Instruction>(NAME) ZPR8:$Zd, sve_logical_imm8:$imm), 4>;
def : InstAlias<"dupm $Zd, $imm",
(!cast<Instruction>(NAME) ZPR16:$Zd, sve_logical_imm16:$imm), 3>;
def : InstAlias<"dupm $Zd, $imm",
(!cast<Instruction>(NAME) ZPR32:$Zd, sve_logical_imm32:$imm), 2>;
// All Zd.b forms have a CPY/DUP equivalent, hence no byte alias here.
def : InstAlias<"mov $Zd, $imm",
(!cast<Instruction>(NAME) ZPR16:$Zd, sve_preferred_logical_imm16:$imm), 7>;
def : InstAlias<"mov $Zd, $imm",
(!cast<Instruction>(NAME) ZPR32:$Zd, sve_preferred_logical_imm32:$imm), 6>;
def : InstAlias<"mov $Zd, $imm",
(!cast<Instruction>(NAME) ZPR64:$Zd, sve_preferred_logical_imm64:$imm), 5>;
}
//===----------------------------------------------------------------------===//
// SVE Integer Arithmetic - Unpredicated Group.
//===----------------------------------------------------------------------===//
// Encoding for an unpredicated, non-destructive binary vector instruction:
// writes $Zd from $Zn and $Zm.
//   sz8_64 - element-size field, placed in Inst{23-22}.
//   opc    - 3-bit opcode, placed in Inst{12-10}.
//   asm    - mnemonic.
//   zprty  - vector register operand type.
class sve_int_bin_cons_arit_0<bits<2> sz8_64, bits<3> opc, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
asm, "\t$Zd, $Zn, $Zm",
"", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-13} = 0b000;
let Inst{12-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
// Instantiates the _B/_H/_S/_D element-size variants.
multiclass sve_int_bin_cons_arit_0<bits<3> opc, string asm> {
def _B : sve_int_bin_cons_arit_0<0b00, opc, asm, ZPR8>;
def _H : sve_int_bin_cons_arit_0<0b01, opc, asm, ZPR16>;
def _S : sve_int_bin_cons_arit_0<0b10, opc, asm, ZPR32>;
def _D : sve_int_bin_cons_arit_0<0b11, opc, asm, ZPR64>;
}
//===----------------------------------------------------------------------===//
// SVE Floating Point Arithmetic - Predicated Group
//===----------------------------------------------------------------------===//
// Encoding for a predicated, destructive vector instruction with a one-bit
// immediate selector, printed as "<asm> $Zdn, $Pg/m, $_Zdn, $i1". The result
// $Zdn is tied to $_Zdn.
//   sz     - element-size field, placed in Inst{23-22}.
//   opc    - 3-bit opcode, placed in Inst{18-16}.
//   asm    - mnemonic.
//   zprty  - vector register operand type; also supplies ElementSize.
//   imm_ty - operand type of the immediate $i1.
// The governing predicate is a PPR3bAny operand encoded in three bits.
class sve_fp_2op_i_p_zds<bits<2> sz, bits<3> opc, string asm,
ZPRRegOp zprty,
Operand imm_ty>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, imm_ty:$i1),
asm, "\t$Zdn, $Pg/m, $_Zdn, $i1",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zdn;
bit i1;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = sz;
let Inst{21-19} = 0b011;
let Inst{18-16} = opc;
let Inst{15-13} = 0b100;
let Inst{12-10} = Pg;
let Inst{9-6} = 0b0000;
let Inst{5} = i1;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = Destructive;
let ElementSize = zprty.ElementSize;
}
// Instantiates the _H/_S/_D variants (no _B variant is defined).
multiclass sve_fp_2op_i_p_zds<bits<3> opc, string asm, Operand imm_ty> {
def _H : sve_fp_2op_i_p_zds<0b01, opc, asm, ZPR16, imm_ty>;
def _S : sve_fp_2op_i_p_zds<0b10, opc, asm, ZPR32, imm_ty>;
def _D : sve_fp_2op_i_p_zds<0b11, opc, asm, ZPR64, imm_ty>;
}
// Encoding for a predicated, destructive binary vector instruction, printed
// as "<asm> $Zdn, $Pg/m, $_Zdn, $Zm". The result $Zdn is tied to $_Zdn.
//   sz    - element-size field, placed in Inst{23-22}.
//   opc   - 4-bit opcode, placed in Inst{19-16}.
//   asm   - mnemonic.
//   zprty - vector register operand type; also supplies ElementSize.
// The governing predicate is a PPR3bAny operand encoded in three bits.
class sve_fp_2op_p_zds<bits<2> sz, bits<4> opc, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zdn;
bits<5> Zm;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = sz;
let Inst{21-20} = 0b00;
let Inst{19-16} = opc;
let Inst{15-13} = 0b100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = Destructive;
let ElementSize = zprty.ElementSize;
}
// Instantiates the _H/_S/_D variants (no _B variant is defined).
multiclass sve_fp_2op_p_zds<bits<4> opc, string asm> {
def _H : sve_fp_2op_p_zds<0b01, opc, asm, ZPR16>;
def _S : sve_fp_2op_p_zds<0b10, opc, asm, ZPR32>;
def _D : sve_fp_2op_p_zds<0b11, opc, asm, ZPR64>;
}
// Encoding for an unpredicated, destructive vector instruction with a 3-bit
// immediate, printed as "<asm> $Zdn, $_Zdn, $Zm, $imm3". The result $Zdn is
// tied to $_Zdn.
//   sz    - element-size field, placed in Inst{23-22}.
//   asm   - mnemonic.
//   zprty - vector register operand type.
class sve_fp_ftmad<bits<2> sz, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm, imm0_7:$imm3),
asm, "\t$Zdn, $_Zdn, $Zm, $imm3",
"",
[]>, Sched<[]> {
bits<5> Zdn;
bits<5> Zm;
bits<3> imm3;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = sz;
let Inst{21-19} = 0b010;
let Inst{18-16} = imm3;
let Inst{15-10} = 0b100000;
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = Destructive;
let ElementSize = ElementSizeNone;
}
// Instantiates the _H/_S/_D variants (no _B variant is defined).
multiclass sve_fp_ftmad<string asm> {
def _H : sve_fp_ftmad<0b01, asm, ZPR16>;
def _S : sve_fp_ftmad<0b10, asm, ZPR32>;
def _D : sve_fp_ftmad<0b11, asm, ZPR64>;
}
//===----------------------------------------------------------------------===//
// SVE Floating Point Arithmetic - Unpredicated Group
//===----------------------------------------------------------------------===//
// Encoding for an unpredicated three-register operation, assembled as
// "$Zd, $Zn, $Zm". No operand is tied: the destination is a separate register.
//   sz    - placed in Inst{23-22}.
//   opc   - 3-bit opcode placed in Inst{12-10}.
//   asm   - mnemonic string.
//   zprty - register operand used for $Zd, $Zn and $Zm.
class sve_fp_3op_u_zd<bits<2> sz, bits<3> opc, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
asm, "\t$Zd, $Zn, $Zm",
"", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = sz;
let Inst{21} = 0b0;
let Inst{20-16} = Zm;
let Inst{15-13} = 0b000;
let Inst{12-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
// Instantiates class sve_fp_3op_u_zd as _H/_S/_D with sz 0b01/0b10/0b11 and
// register operands ZPR16/ZPR32/ZPR64.
multiclass sve_fp_3op_u_zd<bits<3> opc, string asm> {
def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16>;
def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32>;
def _D : sve_fp_3op_u_zd<0b11, opc, asm, ZPR64>;
}
//===----------------------------------------------------------------------===//
// SVE Floating Point Fused Multiply-Add Group
//===----------------------------------------------------------------------===//
// Encoding for a predicated operation with two sources and a tied
// accumulator, assembled as "$Zda, $Pg/m, $Zn, $Zm".
//   sz    - placed in Inst{23-22}.
//   opc   - 2-bit opcode placed in Inst{14-13}.
//   asm   - mnemonic string.
//   zprty - register operand for all vector operands; supplies ElementSize.
// The output $Zda is tied to the input $_Zda (Inst{4-0}). Inst{15} is fixed
// to 0 in this form; the sibling class sve_fp_3op_p_zds_b fixes it to 1.
class sve_fp_3op_p_zds_a<bits<2> sz, bits<2> opc, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zda), (ins PPR3bAny:$Pg, zprty:$_Zda, zprty:$Zn, zprty:$Zm),
asm, "\t$Zda, $Pg/m, $Zn, $Zm",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zda;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = sz;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15} = 0b0;
let Inst{14-13} = opc;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = Destructive;
let ElementSize = zprty.ElementSize;
}
// Instantiates class sve_fp_3op_p_zds_a as _H/_S/_D with sz 0b01/0b10/0b11
// and register operands ZPR16/ZPR32/ZPR64.
multiclass sve_fp_3op_p_zds_a<bits<2> opc, string asm> {
def _H : sve_fp_3op_p_zds_a<0b01, opc, asm, ZPR16>;
def _S : sve_fp_3op_p_zds_a<0b10, opc, asm, ZPR32>;
def _D : sve_fp_3op_p_zds_a<0b11, opc, asm, ZPR64>;
}
// Encoding for a predicated operation whose tied register is $Zdn and whose
// other sources are $Zm and $Za, assembled as "$Zdn, $Pg/m, $Zm, $Za".
//   sz    - placed in Inst{23-22}.
//   opc   - 2-bit opcode placed in Inst{14-13}.
//   asm   - mnemonic string.
//   zprty - register operand for all vector operands; supplies ElementSize.
// Field placement differs from sve_fp_3op_p_zds_a: $Za occupies Inst{20-16},
// $Zm occupies Inst{9-5}, and Inst{15} is fixed to 1.
class sve_fp_3op_p_zds_b<bits<2> sz, bits<2> opc, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm, zprty:$Za),
asm, "\t$Zdn, $Pg/m, $Zm, $Za",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Za;
bits<5> Zdn;
bits<5> Zm;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = sz;
let Inst{21} = 0b1;
let Inst{20-16} = Za;
let Inst{15} = 0b1;
let Inst{14-13} = opc;
let Inst{12-10} = Pg;
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = Destructive;
let ElementSize = zprty.ElementSize;
}
// Instantiates class sve_fp_3op_p_zds_b as _H/_S/_D with sz 0b01/0b10/0b11
// and register operands ZPR16/ZPR32/ZPR64.
multiclass sve_fp_3op_p_zds_b<bits<2> opc, string asm> {
def _H : sve_fp_3op_p_zds_b<0b01, opc, asm, ZPR16>;
def _S : sve_fp_3op_p_zds_b<0b10, opc, asm, ZPR32>;
def _D : sve_fp_3op_p_zds_b<0b11, opc, asm, ZPR64>;
}
//===----------------------------------------------------------------------===//
// SVE Floating Point Multiply-Add - Indexed Group
//===----------------------------------------------------------------------===//
// Common encoding for an accumulating operation whose second source is an
// indexed element, assembled as "$Zda, $Zn, $Zm$iop".
//   sz     - placed in Inst{23-22}.
//   opc    - single bit placed in Inst{10}.
//   asm    - mnemonic string.
//   zprty1 - register operand for $Zda/$_Zda/$Zn.
//   zprty2 - register operand for the indexed source $Zm.
//   itype  - operand type of the index $iop.
// Inst{20-16} is deliberately not assigned here: each def in the multiclass of
// the same name declares Zm/iop with the appropriate widths and fills it.
class sve_fp_fma_by_indexed_elem<bits<2> sz, bit opc, string asm,
ZPRRegOp zprty1,
ZPRRegOp zprty2, Operand itype>
: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty1:$Zn, zprty2:$Zm, itype:$iop),
asm, "\t$Zda, $Zn, $Zm$iop", "", []>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
let Inst{31-24} = 0b01100100;
let Inst{23-22} = sz;
let Inst{21} = 0b1;
let Inst{15-11} = 0;
let Inst{10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = Destructive;
let ElementSize = ElementSizeNone;
}
// Instantiates class sve_fp_fma_by_indexed_elem as _H/_S/_D. Each def supplies
// the Zm and iop fields, whose widths and bit positions differ per variant:
//   _H: 3-bit Zm in Inst{18-16}, 3-bit iop split over Inst{22} and Inst{20-19}.
//   _S: 3-bit Zm in Inst{18-16}, 2-bit iop in Inst{20-19}.
//   _D: 4-bit Zm in Inst{19-16}, 1-bit iop in Inst{20}.
multiclass sve_fp_fma_by_indexed_elem<bit opc, string asm> {
// sz is {0, ?}: Inst{23} is 0 and Inst{22} is left unset by the class, then
// overridden below with the top index bit.
def _H : sve_fp_fma_by_indexed_elem<{0, ?}, opc, asm, ZPR16, ZPR3b16, VectorIndexH> {
bits<3> Zm;
bits<3> iop;
let Inst{22} = iop{2};
let Inst{20-19} = iop{1-0};
let Inst{18-16} = Zm;
}
def _S : sve_fp_fma_by_indexed_elem<0b10, opc, asm, ZPR32, ZPR3b32, VectorIndexS> {
bits<3> Zm;
bits<2> iop;
let Inst{20-19} = iop;
let Inst{18-16} = Zm;
}
def _D : sve_fp_fma_by_indexed_elem<0b11, opc, asm, ZPR64, ZPR4b64, VectorIndexD> {
bits<4> Zm;
bit iop;
let Inst{20} = iop;
let Inst{19-16} = Zm;
}
}
//===----------------------------------------------------------------------===//
// SVE Floating Point Multiply - Indexed Group
//===----------------------------------------------------------------------===//
// Common encoding for a non-destructive operation whose second source is an
// indexed element, assembled as "$Zd, $Zn, $Zm$iop".
//   sz     - placed in Inst{23-22}.
//   asm    - mnemonic string.
//   zprty  - register operand for $Zd and $Zn.
//   zprty2 - register operand for the indexed source $Zm.
//   itype  - operand type of the index $iop.
// Inst{20-16} is not assigned here; the multiclass of the same name fills it
// with Zm and iop per element size.
class sve_fp_fmul_by_indexed_elem<bits<2> sz, string asm, ZPRRegOp zprty,
ZPRRegOp zprty2, Operand itype>
: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty2:$Zm, itype:$iop),
asm, "\t$Zd, $Zn, $Zm$iop", "", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
let Inst{31-24} = 0b01100100;
let Inst{23-22} = sz;
let Inst{21} = 0b1;
let Inst{15-10} = 0b001000;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
// Instantiates class sve_fp_fmul_by_indexed_elem as _H/_S/_D. Each def
// supplies the Zm and iop fields:
//   _H: 3-bit Zm in Inst{18-16}, 3-bit iop split over Inst{22} and Inst{20-19}.
//   _S: 3-bit Zm in Inst{18-16}, 2-bit iop in Inst{20-19}.
//   _D: 4-bit Zm in Inst{19-16}, 1-bit iop in Inst{20}.
multiclass sve_fp_fmul_by_indexed_elem<string asm> {
// sz is {0, ?}: Inst{22} is left unset by the class and overridden below with
// the top index bit.
def _H : sve_fp_fmul_by_indexed_elem<{0, ?}, asm, ZPR16, ZPR3b16, VectorIndexH> {
bits<3> Zm;
bits<3> iop;
let Inst{22} = iop{2};
let Inst{20-19} = iop{1-0};
let Inst{18-16} = Zm;
}
def _S : sve_fp_fmul_by_indexed_elem<0b10, asm, ZPR32, ZPR3b32, VectorIndexS> {
bits<3> Zm;
bits<2> iop;
let Inst{20-19} = iop;
let Inst{18-16} = Zm;
}
def _D : sve_fp_fmul_by_indexed_elem<0b11, asm, ZPR64, ZPR4b64, VectorIndexD> {
bits<4> Zm;
bit iop;
let Inst{20} = iop;
let Inst{19-16} = Zm;
}
}
//===----------------------------------------------------------------------===//
// SVE Floating Point Complex Multiply-Add Group
//===----------------------------------------------------------------------===//
// Encoding for a predicated accumulating operation with a rotation operand,
// assembled as "$Zda, $Pg/m, $Zn, $Zm, $imm".
//   sz    - placed in Inst{23-22}.
//   asm   - mnemonic string.
//   zprty - register operand for all vector operands; supplies ElementSize.
// The 2-bit $imm (operand type complexrotateop) occupies Inst{14-13}. The
// output $Zda is tied to the input $_Zda.
class sve_fp_fcmla<bits<2> sz, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zda), (ins PPR3bAny:$Pg, zprty:$_Zda, zprty:$Zn, zprty:$Zm,
complexrotateop:$imm),
asm, "\t$Zda, $Pg/m, $Zn, $Zm, $imm",
"", []>, Sched<[]> {
bits<5> Zda;
bits<3> Pg;
bits<5> Zn;
bits<5> Zm;
bits<2> imm;
let Inst{31-24} = 0b01100100;
let Inst{23-22} = sz;
let Inst{21} = 0;
let Inst{20-16} = Zm;
let Inst{15} = 0;
let Inst{14-13} = imm;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = Destructive;
let ElementSize = zprty.ElementSize;
}
// Instantiates class sve_fp_fcmla as _H/_S/_D with sz 0b01/0b10/0b11 and
// register operands ZPR16/ZPR32/ZPR64.
multiclass sve_fp_fcmla<string asm> {
def _H : sve_fp_fcmla<0b01, asm, ZPR16>;
def _S : sve_fp_fcmla<0b10, asm, ZPR32>;
def _D : sve_fp_fcmla<0b11, asm, ZPR64>;
}
//===----------------------------------------------------------------------===//
// SVE Floating Point Complex Multiply-Add - Indexed Group
//===----------------------------------------------------------------------===//
// Common encoding for an unpredicated accumulating operation with an indexed
// source and a rotation operand, assembled as "$Zda, $Zn, $Zm$iop, $imm".
//   sz     - placed in Inst{23-22}.
//   asm    - mnemonic string.
//   zprty  - register operand for $Zda/$_Zda/$Zn.
//   zprty2 - register operand for the indexed source $Zm.
//   itype  - operand type of the index $iop.
// The 2-bit $imm (complexrotateop) occupies Inst{11-10}. Inst{20-16} is not
// assigned here; the multiclass of the same name fills it with Zm and iop.
class sve_fp_fcmla_by_indexed_elem<bits<2> sz, string asm,
ZPRRegOp zprty,
ZPRRegOp zprty2, Operand itype>
: I<(outs zprty:$Zda), (ins zprty:$_Zda, zprty:$Zn, zprty2:$Zm, itype:$iop,
complexrotateop:$imm),
asm, "\t$Zda, $Zn, $Zm$iop, $imm",
"", []>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
bits<2> imm;
let Inst{31-24} = 0b01100100;
let Inst{23-22} = sz;
let Inst{21} = 0b1;
let Inst{15-12} = 0b0001;
let Inst{11-10} = imm;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = Destructive;
let ElementSize = ElementSizeNone;
}
// Instantiates class sve_fp_fcmla_by_indexed_elem as _H and _S only.
// Note that sz and the index operand type do not follow the suffix the way
// neighbouring multiclasses do: _H passes sz 0b10 with VectorIndexS, and _S
// passes sz 0b11 with VectorIndexD.
//   _H: 3-bit Zm in Inst{18-16}, 2-bit iop in Inst{20-19}.
//   _S: 4-bit Zm in Inst{19-16}, 1-bit iop in Inst{20}.
multiclass sve_fp_fcmla_by_indexed_elem<string asm> {
def _H : sve_fp_fcmla_by_indexed_elem<0b10, asm, ZPR16, ZPR3b16, VectorIndexS> {
bits<3> Zm;
bits<2> iop;
let Inst{20-19} = iop;
let Inst{18-16} = Zm;
}
def _S : sve_fp_fcmla_by_indexed_elem<0b11, asm, ZPR32, ZPR4b32, VectorIndexD> {
bits<4> Zm;
bits<1> iop;
let Inst{20} = iop;
let Inst{19-16} = Zm;
}
}
//===----------------------------------------------------------------------===//
// SVE Floating Point Complex Addition Group
//===----------------------------------------------------------------------===//
// Encoding for a predicated two-source operation with a 1-bit rotation
// operand, assembled as "$Zdn, $Pg/m, $_Zdn, $Zm, $imm".
//   sz    - placed in Inst{23-22}.
//   asm   - mnemonic string.
//   zprty - register operand for all vector operands; supplies ElementSize.
// $imm (operand type complexrotateopodd) is encoded in the single bit
// Inst{16}. The output $Zdn is tied to the input $_Zdn.
class sve_fp_fcadd<bits<2> sz, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm,
complexrotateopodd:$imm),
asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm, $imm",
"",
[]>, Sched<[]> {
bits<5> Zdn;
bits<5> Zm;
bits<3> Pg;
bit imm;
let Inst{31-24} = 0b01100100;
let Inst{23-22} = sz;
let Inst{21-17} = 0;
let Inst{16} = imm;
let Inst{15-13} = 0b100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = Destructive;
let ElementSize = zprty.ElementSize;
}
// Instantiates class sve_fp_fcadd as _H/_S/_D with sz 0b01/0b10/0b11 and
// register operands ZPR16/ZPR32/ZPR64.
multiclass sve_fp_fcadd<string asm> {
def _H : sve_fp_fcadd<0b01, asm, ZPR16>;
def _S : sve_fp_fcadd<0b10, asm, ZPR32>;
def _D : sve_fp_fcadd<0b11, asm, ZPR64>;
}
//===----------------------------------------------------------------------===//
// SVE2 Floating Point Convert Group
//===----------------------------------------------------------------------===//
// Encoding for a predicated conversion between two register operand types,
// assembled as "$Zd, $Pg/m, $Zn".
//   opc    - 4-bit opcode, split: opc{3-2} -> Inst{23-22}, opc{1-0} -> Inst{17-16}.
//   asm    - mnemonic string.
//   zprty1 - register operand for the destination $Zd.
//   zprty2 - register operand for the source $Zn.
// NOTE(review): this span is a unified-diff hunk. The '-'/'+' lines below show
// the newer revision replacing the ins list with one that adds an input $_Zd
// of type zprty1 and adding a Constraints line that ties it to $Zd.
class sve2_fp_convert_precision<bits<4> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2>
-: I<(outs zprty1:$Zd), (ins PPR3bAny:$Pg, zprty2:$Zn),
+: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, PPR3bAny:$Pg, zprty2:$Zn),
asm, "\t$Zd, $Pg/m, $Zn",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<3> Pg;
let Inst{31-24} = 0b01100100;
let Inst{23-22} = opc{3-2};
let Inst{21-18} = 0b0010;
let Inst{17-16} = opc{1-0};
let Inst{15-13} = 0b101;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
}
// Two instantiations of sve2_fp_convert_precision whose destination operand
// type is narrower than the source:
//   _StoH: opc 0b1000, ZPR32 source -> ZPR16 destination.
//   _DtoS: opc 0b1110, ZPR64 source -> ZPR32 destination.
multiclass sve2_fp_convert_down_narrow<string asm> {
def _StoH : sve2_fp_convert_precision<0b1000, asm, ZPR16, ZPR32>;
def _DtoS : sve2_fp_convert_precision<0b1110, asm, ZPR32, ZPR64>;
}
// Two instantiations of sve2_fp_convert_precision whose destination operand
// type is wider than the source:
//   _HtoS: opc 0b1001, ZPR16 source -> ZPR32 destination.
//   _StoD: opc 0b1111, ZPR32 source -> ZPR64 destination.
multiclass sve2_fp_convert_up_long<string asm> {
def _HtoS : sve2_fp_convert_precision<0b1001, asm, ZPR32, ZPR16>;
def _StoD : sve2_fp_convert_precision<0b1111, asm, ZPR64, ZPR32>;
}
// Single instantiation of sve2_fp_convert_precision: _DtoS with opc 0b0010,
// ZPR64 source -> ZPR32 destination.
multiclass sve2_fp_convert_down_odd_rounding<string asm> {
def _DtoS : sve2_fp_convert_precision<0b0010, asm, ZPR32, ZPR64>;
}
//===----------------------------------------------------------------------===//
// SVE2 Floating Point Pairwise Group
//===----------------------------------------------------------------------===//
// Encoding for a predicated two-source operation, assembled as
// "$Zdn, $Pg/m, $_Zdn, $Zm".
//   sz    - placed in Inst{23-22}.
//   opc   - 3-bit opcode placed in Inst{18-16}.
//   asm   - mnemonic string.
//   zprty - register operand for all vector operands; supplies ElementSize.
// The output $Zdn is tied to the input $_Zdn.
class sve2_fp_pairwise_pred<bits<2> sz, bits<3> opc, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zm;
bits<5> Zdn;
let Inst{31-24} = 0b01100100;
let Inst{23-22} = sz;
let Inst{21-19} = 0b010;
let Inst{18-16} = opc;
let Inst{15-13} = 0b100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = Destructive;
let ElementSize = zprty.ElementSize;
}
// Instantiates class sve2_fp_pairwise_pred as _H/_S/_D with sz 0b01/0b10/0b11
// and register operands ZPR16/ZPR32/ZPR64.
multiclass sve2_fp_pairwise_pred<bits<3> opc, string asm> {
def _H : sve2_fp_pairwise_pred<0b01, opc, asm, ZPR16>;
def _S : sve2_fp_pairwise_pred<0b10, opc, asm, ZPR32>;
def _D : sve2_fp_pairwise_pred<0b11, opc, asm, ZPR64>;
}
//===----------------------------------------------------------------------===//
// SVE2 Floating Point Widening Multiply-Add - Indexed Group
//===----------------------------------------------------------------------===//
// Encoding for an accumulating operation with a ZPR32 accumulator, ZPR16
// sources and an indexed second source, assembled as "$Zda, $Zn, $Zm$iop".
//   opc - 2-bit opcode, split: opc{1} -> Inst{13}, opc{0} -> Inst{10}.
//   asm - mnemonic string.
// The 3-bit index is also split: iop{2-1} -> Inst{20-19}, iop{0} -> Inst{11}.
// $Zm is a 3-bit field (operand type ZPR3b16) in Inst{18-16}. The output $Zda
// is tied to the input $_Zda.
class sve2_fp_mla_long_by_indexed_elem<bits<2> opc, string asm>
: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm,
VectorIndexH:$iop),
asm, "\t$Zda, $Zn, $Zm$iop",
"",
[]>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
bits<3> Zm;
bits<3> iop;
let Inst{31-21} = 0b01100100101;
let Inst{20-19} = iop{2-1};
let Inst{18-16} = Zm;
let Inst{15-14} = 0b01;
let Inst{13} = opc{1};
let Inst{12} = 0b0;
let Inst{11} = iop{0};
let Inst{10} = opc{0};
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = Destructive;
let ElementSize = ElementSizeNone;
}
//===----------------------------------------------------------------------===//
// SVE2 Floating Point Widening Multiply-Add Group
//===----------------------------------------------------------------------===//
// Encoding for an accumulating operation with a ZPR32 accumulator and two
// ZPR16 sources, assembled as "$Zda, $Zn, $Zm".
//   opc - 2-bit opcode, split: opc{1} -> Inst{13}, opc{0} -> Inst{10}.
//   asm - mnemonic string.
// The output $Zda is tied to the input $_Zda.
class sve2_fp_mla_long<bits<2> opc, string asm>
: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR16:$Zm),
asm, "\t$Zda, $Zn, $Zm",
"",
[]>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
bits<5> Zm;
let Inst{31-21} = 0b01100100101;
let Inst{20-16} = Zm;
let Inst{15-14} = 0b10;
let Inst{13} = opc{1};
let Inst{12-11} = 0b00;
let Inst{10} = opc{0};
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = Destructive;
let ElementSize = ElementSizeNone;
}
//===----------------------------------------------------------------------===//
// SVE Stack Allocation Group
//===----------------------------------------------------------------------===//
// Encoding for a scalar operation on GPR64sp registers with a 6-bit
// immediate, assembled as "$Rd, $Rn, $imm6".
//   opc - single bit placed in Inst{22}.
//   asm - mnemonic string.
// $Rn occupies Inst{20-16}, $imm6 (operand type simm6_32b) Inst{10-5} and
// $Rd Inst{4-0}.
class sve_int_arith_vl<bit opc, string asm>
: I<(outs GPR64sp:$Rd), (ins GPR64sp:$Rn, simm6_32b:$imm6),
asm, "\t$Rd, $Rn, $imm6",
"",
[]>, Sched<[]> {
bits<5> Rd;
bits<5> Rn;
bits<6> imm6;
let Inst{31-23} = 0b000001000;
let Inst{22} = opc;
let Inst{21} = 0b1;
let Inst{20-16} = Rn;
let Inst{15-11} = 0b01010;
let Inst{10-5} = imm6;
let Inst{4-0} = Rd;
}
// Encoding for an operation that writes a GPR64 from a 6-bit immediate only,
// assembled as "$Rd, $imm6".
//   op   - single bit placed in Inst{22}.
//   opc2 - 5-bit field placed in Inst{20-16}.
//   asm  - mnemonic string.
class sve_int_read_vl_a<bit op, bits<5> opc2, string asm>
: I<(outs GPR64:$Rd), (ins simm6_32b:$imm6),
asm, "\t$Rd, $imm6",
"",
[]>, Sched<[]> {
bits<5> Rd;
bits<6> imm6;
let Inst{31-23} = 0b000001001;
let Inst{22} = op;
let Inst{21} = 0b1;
let Inst{20-16} = opc2{4-0};
let Inst{15-11} = 0b01010;
let Inst{10-5} = imm6;
let Inst{4-0} = Rd;
}
//===----------------------------------------------------------------------===//
// SVE Permute - In Lane Group
//===----------------------------------------------------------------------===//
// Encoding for an unpredicated, non-destructive three-register operation,
// assembled as "$Zd, $Zn, $Zm".
//   opc    - 3-bit opcode placed in Inst{12-10}.
//   sz8_64 - size field placed in Inst{23-22}.
//   asm    - mnemonic string.
//   zprty  - register operand for $Zd, $Zn and $Zm.
class sve_int_perm_bin_perm_zz<bits<3> opc, bits<2> sz8_64, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
asm, "\t$Zd, $Zn, $Zm",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-13} = 0b011;
let Inst{12-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
// Instantiates class sve_int_perm_bin_perm_zz as _B/_H/_S/_D with sz8_64
// 0b00/0b01/0b10/0b11 and register operands ZPR8/ZPR16/ZPR32/ZPR64.
multiclass sve_int_perm_bin_perm_zz<bits<3> opc, string asm> {
def _B : sve_int_perm_bin_perm_zz<opc, 0b00, asm, ZPR8>;
def _H : sve_int_perm_bin_perm_zz<opc, 0b01, asm, ZPR16>;
def _S : sve_int_perm_bin_perm_zz<opc, 0b10, asm, ZPR32>;
def _D : sve_int_perm_bin_perm_zz<opc, 0b11, asm, ZPR64>;
}
//===----------------------------------------------------------------------===//
// SVE Floating Point Unary Operations Group
//===----------------------------------------------------------------------===//
// Encoding for a predicated single-source operation with separately
// selectable input and output operand types, assembled as "$Zd, $Pg/m, $Zn".
//   opc       - 7-bit opcode, split: opc{6-5} -> Inst{23-22},
//               opc{4-0} -> Inst{20-16}.
//   asm       - mnemonic string.
//   i_zprtype - register operand for the source $Zn.
//   o_zprtype - register operand for the destination $Zd.
//   size      - value assigned to ElementSize.
// The output $Zd is tied to the input $_Zd.
// NOTE(review): the tied input $_Zd is declared with i_zprtype although it is
// constrained to the o_zprtype output; every instantiation visible in this
// chunk passes the same type for both, so confirm intent before relying on it.
class sve_fp_2op_p_zd<bits<7> opc, string asm, RegisterOperand i_zprtype,
RegisterOperand o_zprtype, ElementSizeEnum size>
: I<(outs o_zprtype:$Zd), (ins i_zprtype:$_Zd, PPR3bAny:$Pg, i_zprtype:$Zn),
asm, "\t$Zd, $Pg/m, $Zn",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zd;
bits<5> Zn;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = opc{6-5};
let Inst{21} = 0b0;
let Inst{20-16} = opc{4-0};
let Inst{15-13} = 0b101;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
let DestructiveInstType = Destructive;
let ElementSize = size;
}
// Instantiates sve_fp_2op_p_zd as _H/_S/_D. The 7-bit class opcode is built by
// concatenating a 2-bit prefix (0b01/0b10/0b11) with the 5-bit opc argument;
// input and output use the same operand type in each variant.
multiclass sve_fp_2op_p_zd_HSD<bits<5> opc, string asm> {
def _H : sve_fp_2op_p_zd<{ 0b01, opc }, asm, ZPR16, ZPR16, ElementSizeH>;
def _S : sve_fp_2op_p_zd<{ 0b10, opc }, asm, ZPR32, ZPR32, ElementSizeS>;
def _D : sve_fp_2op_p_zd<{ 0b11, opc }, asm, ZPR64, ZPR64, ElementSizeD>;
}
// Instantiates sve_fp_2op_p_zd as _H/_S/_D with fixed 7-bit opcodes
// 0b0011010/0b0011100/0b0011110; only the asm string is a parameter.
multiclass sve2_fp_flogb<string asm> {
def _H : sve_fp_2op_p_zd<0b0011010, asm, ZPR16, ZPR16, ElementSizeH>;
def _S : sve_fp_2op_p_zd<0b0011100, asm, ZPR32, ZPR32, ElementSizeS>;
def _D : sve_fp_2op_p_zd<0b0011110, asm, ZPR64, ZPR64, ElementSizeD>;
}
//===----------------------------------------------------------------------===//
// SVE Floating Point Unary Operations - Unpredicated Group
//===----------------------------------------------------------------------===//
// Encoding for an unpredicated, non-destructive single-source operation,
// assembled as "$Zd, $Zn".
//   sz    - placed in Inst{23-22}.
//   opc   - 3-bit opcode placed in Inst{18-16}.
//   asm   - mnemonic string.
//   zprty - register operand for $Zd and $Zn.
class sve_fp_2op_u_zd<bits<2> sz, bits<3> opc, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn),
asm, "\t$Zd, $Zn",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = sz;
let Inst{21-19} = 0b001;
let Inst{18-16} = opc;
let Inst{15-10} = 0b001100;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
// Instantiates class sve_fp_2op_u_zd as _H/_S/_D with sz 0b01/0b10/0b11 and
// register operands ZPR16/ZPR32/ZPR64.
multiclass sve_fp_2op_u_zd<bits<3> opc, string asm> {
def _H : sve_fp_2op_u_zd<0b01, opc, asm, ZPR16>;
def _S : sve_fp_2op_u_zd<0b10, opc, asm, ZPR32>;
def _D : sve_fp_2op_u_zd<0b11, opc, asm, ZPR64>;
}
//===----------------------------------------------------------------------===//
// SVE Integer Arithmetic - Binary Predicated Group
//===----------------------------------------------------------------------===//
// Shared encoding for predicated two-source integer operations, assembled as
// "$Zdn, $Pg/m, $_Zdn, $Zm".
//   sz8_64 - size field placed in Inst{23-22}.
//   fmt    - 2-bit sub-group selector placed in Inst{20-19}; each multiclass
//            built on this class passes a fixed value.
//   opc    - 3-bit opcode placed in Inst{18-16}.
//   asm    - mnemonic string.
//   zprty  - register operand for all vector operands; supplies ElementSize.
// The output $Zdn is tied to the input $_Zdn.
class sve_int_bin_pred_arit_log<bits<2> sz8_64, bits<2> fmt, bits<3> opc,
string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm", "", []>, Sched<[]> {
bits<3> Pg;
bits<5> Zdn;
bits<5> Zm;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b0;
let Inst{20-19} = fmt;
let Inst{18-16} = opc;
let Inst{15-13} = 0b000;
let Inst{12-10} = Pg;
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = Destructive;
let ElementSize = zprty.ElementSize;
}
// Instantiates sve_int_bin_pred_arit_log with fmt fixed to 0b11, as
// _B/_H/_S/_D (sz8_64 0b00..0b11, ZPR8..ZPR64).
multiclass sve_int_bin_pred_log<bits<3> opc, string asm> {
def _B : sve_int_bin_pred_arit_log<0b00, 0b11, opc, asm, ZPR8>;
def _H : sve_int_bin_pred_arit_log<0b01, 0b11, opc, asm, ZPR16>;
def _S : sve_int_bin_pred_arit_log<0b10, 0b11, opc, asm, ZPR32>;
def _D : sve_int_bin_pred_arit_log<0b11, 0b11, opc, asm, ZPR64>;
}
// Instantiates sve_int_bin_pred_arit_log with fmt fixed to 0b00, as
// _B/_H/_S/_D (sz8_64 0b00..0b11, ZPR8..ZPR64).
multiclass sve_int_bin_pred_arit_0<bits<3> opc, string asm> {
def _B : sve_int_bin_pred_arit_log<0b00, 0b00, opc, asm, ZPR8>;
def _H : sve_int_bin_pred_arit_log<0b01, 0b00, opc, asm, ZPR16>;
def _S : sve_int_bin_pred_arit_log<0b10, 0b00, opc, asm, ZPR32>;
def _D : sve_int_bin_pred_arit_log<0b11, 0b00, opc, asm, ZPR64>;
}
// Instantiates sve_int_bin_pred_arit_log with fmt fixed to 0b01, as
// _B/_H/_S/_D (sz8_64 0b00..0b11, ZPR8..ZPR64).
multiclass sve_int_bin_pred_arit_1<bits<3> opc, string asm> {
def _B : sve_int_bin_pred_arit_log<0b00, 0b01, opc, asm, ZPR8>;
def _H : sve_int_bin_pred_arit_log<0b01, 0b01, opc, asm, ZPR16>;
def _S : sve_int_bin_pred_arit_log<0b10, 0b01, opc, asm, ZPR32>;
def _D : sve_int_bin_pred_arit_log<0b11, 0b01, opc, asm, ZPR64>;
}
// Instantiates sve_int_bin_pred_arit_log with fmt fixed to 0b10, as
// _B/_H/_S/_D (sz8_64 0b00..0b11, ZPR8..ZPR64).
multiclass sve_int_bin_pred_arit_2<bits<3> opc, string asm> {
def _B : sve_int_bin_pred_arit_log<0b00, 0b10, opc, asm, ZPR8>;
def _H : sve_int_bin_pred_arit_log<0b01, 0b10, opc, asm, ZPR16>;
def _S : sve_int_bin_pred_arit_log<0b10, 0b10, opc, asm, ZPR32>;
def _D : sve_int_bin_pred_arit_log<0b11, 0b10, opc, asm, ZPR64>;
}
// Special case for divides which are not defined for 8b/16b elements.
// Same as sve_int_bin_pred_arit_2 (fmt fixed to 0b10) but only the _S and _D
// variants are instantiated.
multiclass sve_int_bin_pred_arit_2_div<bits<3> opc, string asm> {
def _S : sve_int_bin_pred_arit_log<0b10, 0b10, opc, asm, ZPR32>;
def _D : sve_int_bin_pred_arit_log<0b11, 0b10, opc, asm, ZPR64>;
}
//===----------------------------------------------------------------------===//
// SVE Integer Multiply-Add Group
//===----------------------------------------------------------------------===//
// Encoding for a predicated operation whose tied register is $Zdn and whose
// other sources are $Zm and $Za, assembled as "$Zdn, $Pg/m, $Zm, $Za".
//   sz8_64 - size field placed in Inst{23-22}.
//   opc    - single bit placed in Inst{13}.
//   asm    - mnemonic string.
//   zprty  - register operand for all vector operands; supplies ElementSize.
// $Zm occupies Inst{20-16} and $Za occupies Inst{9-5}; Inst{15-14} is 0b11.
// The output $Zdn is tied to the input $_Zdn.
class sve_int_mladdsub_vvv_pred<bits<2> sz8_64, bits<1> opc, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm, zprty:$Za),
asm, "\t$Zdn, $Pg/m, $Zm, $Za",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zdn;
bits<5> Za;
bits<5> Zm;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b0;
let Inst{20-16} = Zm;
let Inst{15-14} = 0b11;
let Inst{13} = opc;
let Inst{12-10} = Pg;
let Inst{9-5} = Za;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = Destructive;
let ElementSize = zprty.ElementSize;
}
// Instantiates class sve_int_mladdsub_vvv_pred as _B/_H/_S/_D with sz8_64
// 0b00/0b01/0b10/0b11 and register operands ZPR8/ZPR16/ZPR32/ZPR64.
multiclass sve_int_mladdsub_vvv_pred<bits<1> opc, string asm> {
def _B : sve_int_mladdsub_vvv_pred<0b00, opc, asm, ZPR8>;
def _H : sve_int_mladdsub_vvv_pred<0b01, opc, asm, ZPR16>;
def _S : sve_int_mladdsub_vvv_pred<0b10, opc, asm, ZPR32>;
def _D : sve_int_mladdsub_vvv_pred<0b11, opc, asm, ZPR64>;
}
// Encoding for a predicated operation with two sources and a tied
// accumulator, assembled as "$Zda, $Pg/m, $Zn, $Zm".
//   sz8_64 - size field placed in Inst{23-22}.
//   opc    - single bit placed in Inst{13}.
//   asm    - mnemonic string.
//   zprty  - register operand for all vector operands; supplies ElementSize.
// $Zm occupies Inst{20-16} and $Zn occupies Inst{9-5}; Inst{15-14} is 0b01.
// The output $Zda is tied to the input $_Zda.
class sve_int_mlas_vvv_pred<bits<2> sz8_64, bits<1> opc, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zda), (ins PPR3bAny:$Pg, zprty:$_Zda, zprty:$Zn, zprty:$Zm),
asm, "\t$Zda, $Pg/m, $Zn, $Zm",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zda;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b0;
let Inst{20-16} = Zm;
let Inst{15-14} = 0b01;
let Inst{13} = opc;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = Destructive;
let ElementSize = zprty.ElementSize;
}
// Instantiates class sve_int_mlas_vvv_pred as _B/_H/_S/_D with sz8_64
// 0b00/0b01/0b10/0b11 and register operands ZPR8/ZPR16/ZPR32/ZPR64.
multiclass sve_int_mlas_vvv_pred<bits<1> opc, string asm> {
def _B : sve_int_mlas_vvv_pred<0b00, opc, asm, ZPR8>;
def _H : sve_int_mlas_vvv_pred<0b01, opc, asm, ZPR16>;
def _S : sve_int_mlas_vvv_pred<0b10, opc, asm, ZPR32>;
def _D : sve_int_mlas_vvv_pred<0b11, opc, asm, ZPR64>;
}
//===----------------------------------------------------------------------===//
// SVE2 Integer Multiply-Add - Unpredicated Group
//===----------------------------------------------------------------------===//
// Encoding for an unpredicated accumulating operation, assembled as
// "$Zda, $Zn, $Zm".
//   sz     - placed in Inst{23-22}.
//   opc    - 5-bit opcode placed in Inst{14-10}.
//   asm    - mnemonic string.
//   zprty1 - register operand for the accumulator $Zda/$_Zda.
//   zprty2 - register operand for the sources $Zn and $Zm.
// The output $Zda is tied to the input $_Zda.
class sve2_int_mla<bits<2> sz, bits<5> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty2:$Zm),
asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
bits<5> Zm;
let Inst{31-24} = 0b01000100;
let Inst{23-22} = sz;
let Inst{21} = 0b0;
let Inst{20-16} = Zm;
let Inst{15} = 0b0;
let Inst{14-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = Destructive;
let ElementSize = ElementSizeNone;
}
// Instantiates class sve2_int_mla as _B/_H/_S/_D with accumulator and sources
// of the same operand type. The 5-bit class opcode is 0b1110 concatenated with
// the single bit S.
multiclass sve2_int_mla<bit S, string asm> {
def _B : sve2_int_mla<0b00, { 0b1110, S }, asm, ZPR8, ZPR8>;
def _H : sve2_int_mla<0b01, { 0b1110, S }, asm, ZPR16, ZPR16>;
def _S : sve2_int_mla<0b10, { 0b1110, S }, asm, ZPR32, ZPR32>;
def _D : sve2_int_mla<0b11, { 0b1110, S }, asm, ZPR64, ZPR64>;
}
// Instantiates class sve2_int_mla as _H/_S/_D where the accumulator operand
// type (ZPR16/ZPR32/ZPR64) differs from the source operand type
// (ZPR8/ZPR16/ZPR32). The 5-bit opc is passed through unchanged.
multiclass sve2_int_mla_long<bits<5> opc, string asm> {
def _H : sve2_int_mla<0b01, opc, asm, ZPR16, ZPR8>;
def _S : sve2_int_mla<0b10, opc, asm, ZPR32, ZPR16>;
def _D : sve2_int_mla<0b11, opc, asm, ZPR64, ZPR32>;
}
//===----------------------------------------------------------------------===//
// SVE2 Integer Multiply-Add - Indexed Group
//===----------------------------------------------------------------------===//
// Common encoding for an unpredicated accumulating operation whose second
// source is an indexed element, assembled as "$Zda, $Zn, $Zm$iop".
//   sz     - placed in Inst{23-22}.
//   opc    - 6-bit opcode placed in Inst{15-10}.
//   asm    - mnemonic string.
//   zprty1 - register operand for the accumulator $Zda/$_Zda.
//   zprty2 - register operand for $Zn.
//   zprty3 - register operand for the indexed source $Zm.
//   itype  - operand type of the index $iop.
// Inst{20-16} is not assigned here; each def that uses this class declares
// Zm/iop and fills it (and may override individual opc/sz bits).
class sve2_int_mla_by_indexed_elem<bits<2> sz, bits<6> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2,
ZPRRegOp zprty3, Operand itype>
: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty3:$Zm, itype:$iop),
asm, "\t$Zda, $Zn, $Zm$iop", "", []>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
let Inst{31-24} = 0b01000100;
let Inst{23-22} = sz;
let Inst{21} = 0b1;
let Inst{15-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = Destructive;
let ElementSize = ElementSizeNone;
}
// Instantiates class sve2_int_mla_by_indexed_elem as _H/_S/_D. The 6-bit class
// opcode is 0b000 concatenated with the 2-bit opc and the bit S.
//   _H: 3-bit Zm in Inst{18-16}, 3-bit iop split over Inst{22} and Inst{20-19}.
//   _S: 3-bit Zm in Inst{18-16}, 2-bit iop in Inst{20-19}.
//   _D: 4-bit Zm in Inst{19-16}, 1-bit iop in Inst{20}.
multiclass sve2_int_mla_by_indexed_elem<bits<2> opc, bit S, string asm> {
// sz is {0, ?}: Inst{22} is left unset by the class and overridden below with
// the top index bit.
def _H : sve2_int_mla_by_indexed_elem<{0, ?}, { 0b000, opc, S }, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexH> {
bits<3> Zm;
bits<3> iop;
let Inst{22} = iop{2};
let Inst{20-19} = iop{1-0};
let Inst{18-16} = Zm;
}
def _S : sve2_int_mla_by_indexed_elem<0b10, { 0b000, opc, S }, asm, ZPR32, ZPR32, ZPR3b32, VectorIndexS> {
bits<3> Zm;
bits<2> iop;
let Inst{20-19} = iop;
let Inst{18-16} = Zm;
}
def _D : sve2_int_mla_by_indexed_elem<0b11, { 0b000, opc, S }, asm, ZPR64, ZPR64, ZPR4b64, VectorIndexD> {
bits<4> Zm;
bit iop;
let Inst{20} = iop;
let Inst{19-16} = Zm;
}
}
//===----------------------------------------------------------------------===//
// SVE2 Integer Multiply-Add Long - Indexed Group
//===----------------------------------------------------------------------===//
// Instantiates class sve2_int_mla_by_indexed_elem as _S and _D with an
// accumulator operand type (ZPR32/ZPR64) that differs from the source operand
// type (ZPR16/ZPR32).
// The 6-bit class opcode is { opc{3}, 0b0, opc{2-1}, ?, opc{0} }, which maps
// to Inst{15-10}; the unset '?' position is Inst{11}, which each def overrides
// with the low index bit iop{0}.
//   _S: 3-bit Zm in Inst{18-16}, iop{2-1} in Inst{20-19}, iop{0} in Inst{11}.
//   _D: 4-bit Zm in Inst{19-16}, iop{1} in Inst{20}, iop{0} in Inst{11}.
multiclass sve2_int_mla_long_by_indexed_elem<bits<4> opc, string asm> {
def _S : sve2_int_mla_by_indexed_elem<0b10, { opc{3}, 0b0, opc{2-1}, ?, opc{0} },
asm, ZPR32, ZPR16, ZPR3b16, VectorIndexH> {
bits<3> Zm;
bits<3> iop;
let Inst{20-19} = iop{2-1};
let Inst{18-16} = Zm;
let Inst{11} = iop{0};
}
def _D : sve2_int_mla_by_indexed_elem<0b11, { opc{3}, 0b0, opc{2-1}, ?, opc{0} },
asm, ZPR64, ZPR32, ZPR4b32, VectorIndexS> {
bits<4> Zm;
bits<2> iop;
let Inst{20} = iop{1};
let Inst{19-16} = Zm;
let Inst{11} = iop{0};
}
}
//===----------------------------------------------------------------------===//
// SVE Integer Dot Product Group
//===----------------------------------------------------------------------===//
// Encoding for an unpredicated accumulating operation, assembled as
// "$Zda, $Zn, $Zm".
//   sz     - single bit placed in Inst{22}.
//   U      - single bit placed in Inst{10}.
//   asm    - mnemonic string.
//   zprty1 - register operand for the accumulator $Zda/$_Zda; supplies
//            ElementSize.
//   zprty2 - register operand for the sources $Zn and $Zm.
// The output $Zda is tied to the input $_Zda.
class sve_intx_dot<bit sz, bit U, string asm, ZPRRegOp zprty1,
ZPRRegOp zprty2>
: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty2:$Zm), asm,
"\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
bits<5> Zm;
let Inst{31-23} = 0b010001001;
let Inst{22} = sz;
let Inst{21} = 0;
let Inst{20-16} = Zm;
let Inst{15-11} = 0;
let Inst{10} = U;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = Destructive;
let ElementSize = zprty1.ElementSize;
}
// Instantiates class sve_intx_dot twice; the multiclass opc bit is passed as
// the class's U argument.
//   _S: sz 0, ZPR32 accumulator, ZPR8 sources.
//   _D: sz 1, ZPR64 accumulator, ZPR16 sources.
multiclass sve_intx_dot<bit opc, string asm> {
def _S : sve_intx_dot<0b0, opc, asm, ZPR32, ZPR8>;
def _D : sve_intx_dot<0b1, opc, asm, ZPR64, ZPR16>;
}
//===----------------------------------------------------------------------===//
// SVE Integer Dot Product Group - Indexed Group
//===----------------------------------------------------------------------===//
// Common encoding for an unpredicated accumulating operation whose second
// source is an indexed element, assembled as "$Zda, $Zn, $Zm$iop".
//   sz     - single bit placed in Inst{22}.
//   U      - single bit placed in Inst{10}.
//   asm    - mnemonic string.
//   zprty1 - register operand for the accumulator $Zda/$_Zda.
//   zprty2 - register operand for $Zn.
//   zprty3 - register operand for the indexed source $Zm.
//   itype  - operand type of the index $iop.
// Inst{20-16} is not assigned here; the multiclass of the same name fills it
// with Zm and iop.
class sve_intx_dot_by_indexed_elem<bit sz, bit U, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2,
ZPRRegOp zprty3, Operand itype>
: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty3:$Zm, itype:$iop),
asm, "\t$Zda, $Zn, $Zm$iop",
"", []>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
let Inst{31-23} = 0b010001001;
let Inst{22} = sz;
let Inst{21} = 0b1;
let Inst{15-11} = 0;
let Inst{10} = U;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = Destructive;
let ElementSize = ElementSizeNone;
}
// Instantiates class sve_intx_dot_by_indexed_elem twice; the multiclass opc
// bit is passed as the class's U argument.
//   _S: sz 0, ZPR32 accumulator, ZPR8 source, 3-bit Zm in Inst{18-16},
//       2-bit iop in Inst{20-19}.
//   _D: sz 1, ZPR64 accumulator, ZPR16 source, 4-bit Zm in Inst{19-16},
//       1-bit iop in Inst{20}.
multiclass sve_intx_dot_by_indexed_elem<bit opc, string asm> {
def _S : sve_intx_dot_by_indexed_elem<0b0, opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS> {
bits<2> iop;
bits<3> Zm;
let Inst{20-19} = iop;
let Inst{18-16} = Zm;
}
def _D : sve_intx_dot_by_indexed_elem<0b1, opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD> {
bits<1> iop;
bits<4> Zm;
let Inst{20} = iop;
let Inst{19-16} = Zm;
}
}
//===----------------------------------------------------------------------===//
// SVE2 Complex Integer Dot Product Group
//===----------------------------------------------------------------------===//
// Encoding for an unpredicated accumulating operation with a rotation operand,
// assembled as "$Zda, $Zn, $Zm, $rot".
//   sz     - placed in Inst{23-22}.
//   opc    - 4-bit opcode placed in Inst{15-12}.
//   asm    - mnemonic string.
//   zprty1 - register operand for the accumulator $Zda/$_Zda.
//   zprty2 - register operand for the sources $Zn and $Zm.
// The 2-bit $rot (operand type complexrotateop) occupies Inst{11-10}. The
// output $Zda is tied to the input $_Zda.
class sve2_complex_int_arith<bits<2> sz, bits<4> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty2:$Zm,
complexrotateop:$rot),
asm, "\t$Zda, $Zn, $Zm, $rot", "", []>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
bits<5> Zm;
bits<2> rot;
let Inst{31-24} = 0b01000100;
let Inst{23-22} = sz;
let Inst{21} = 0b0;
let Inst{20-16} = Zm;
let Inst{15-12} = opc;
let Inst{11-10} = rot;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = Destructive;
let ElementSize = ElementSizeNone;
}
// Instantiates sve2_complex_int_arith with opc fixed to 0b0001:
//   _S: sz 0b10, ZPR32 accumulator, ZPR8 sources.
//   _D: sz 0b11, ZPR64 accumulator, ZPR16 sources.
multiclass sve2_cintx_dot<string asm> {
def _S : sve2_complex_int_arith<0b10, 0b0001, asm, ZPR32, ZPR8>;
def _D : sve2_complex_int_arith<0b11, 0b0001, asm, ZPR64, ZPR16>;
}
//===----------------------------------------------------------------------===//
// SVE2 Complex Multiply-Add Group
//===----------------------------------------------------------------------===//
// Instantiates sve2_complex_int_arith as _B/_H/_S/_D with accumulator and
// sources of the same operand type. The 4-bit class opcode is 0b001
// concatenated with the single bit opc.
multiclass sve2_int_cmla<bit opc, string asm> {
def _B : sve2_complex_int_arith<0b00, { 0b001, opc }, asm, ZPR8, ZPR8>;
def _H : sve2_complex_int_arith<0b01, { 0b001, opc }, asm, ZPR16, ZPR16>;
def _S : sve2_complex_int_arith<0b10, { 0b001, opc }, asm, ZPR32, ZPR32>;
def _D : sve2_complex_int_arith<0b11, { 0b001, opc }, asm, ZPR64, ZPR64>;
}
//===----------------------------------------------------------------------===//
// SVE2 Complex Integer Dot Product - Indexed Group
//===----------------------------------------------------------------------===//
// Common encoding for an unpredicated accumulating operation with an indexed
// source and a rotation operand, assembled as "$Zda, $Zn, $Zm$iop, $rot".
//   sz     - placed in Inst{23-22}.
//   opc    - 4-bit opcode placed in Inst{15-12}.
//   asm    - mnemonic string.
//   zprty1 - register operand for the accumulator $Zda/$_Zda.
//   zprty2 - register operand for $Zn.
//   zprty3 - register operand for the indexed source $Zm.
//   itype  - operand type of the index $iop.
// The 2-bit $rot (complexrotateop) occupies Inst{11-10}. Inst{20-16} is not
// assigned here; each def that uses this class fills it with Zm and iop.
class sve2_complex_int_arith_indexed<bits<2> sz, bits<4> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2,
ZPRRegOp zprty3, Operand itype>
: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty3:$Zm, itype:$iop,
complexrotateop:$rot),
asm, "\t$Zda, $Zn, $Zm$iop, $rot", "", []>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
bits<2> rot;
let Inst{31-24} = 0b01000100;
let Inst{23-22} = sz;
let Inst{21} = 0b1;
let Inst{15-12} = opc;
let Inst{11-10} = rot;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = Destructive;
let ElementSize = ElementSizeNone;
}
// Instantiates sve2_complex_int_arith_indexed with opc fixed to 0b0100:
//   _S: sz 0b10, ZPR32 accumulator, ZPR8 source, 3-bit Zm in Inst{18-16},
//       2-bit iop in Inst{20-19}.
//   _D: sz 0b11, ZPR64 accumulator, ZPR16 source, 4-bit Zm in Inst{19-16},
//       1-bit iop in Inst{20}.
multiclass sve2_cintx_dot_by_indexed_elem<string asm> {
def _S : sve2_complex_int_arith_indexed<0b10, 0b0100, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS> {
bits<2> iop;
bits<3> Zm;
let Inst{20-19} = iop;
let Inst{18-16} = Zm;
}
def _D : sve2_complex_int_arith_indexed<0b11, 0b0100, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD> {
bit iop;
bits<4> Zm;
let Inst{20} = iop;
let Inst{19-16} = Zm;
}
}
//===----------------------------------------------------------------------===//
// SVE2 Complex Multiply-Add - Indexed Group
//===----------------------------------------------------------------------===//
// Instantiates sve2_complex_int_arith_indexed as _H and _S only, with the
// 4-bit class opcode formed as 0b011 concatenated with the bit opc.
// Note that sz and the index operand type do not follow the suffix: _H passes
// sz 0b10 with VectorIndexS, and _S passes sz 0b11 with VectorIndexD.
//   _H: 3-bit Zm in Inst{18-16}, 2-bit iop in Inst{20-19}.
//   _S: 4-bit Zm in Inst{19-16}, 1-bit iop in Inst{20}.
multiclass sve2_cmla_by_indexed_elem<bit opc, string asm> {
def _H : sve2_complex_int_arith_indexed<0b10, { 0b011, opc }, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexS> {
bits<2> iop;
bits<3> Zm;
let Inst{20-19} = iop;
let Inst{18-16} = Zm;
}
def _S : sve2_complex_int_arith_indexed<0b11, { 0b011, opc }, asm, ZPR32, ZPR32, ZPR4b32, VectorIndexD> {
bit iop;
bits<4> Zm;
let Inst{20} = iop;
let Inst{19-16} = Zm;
}
}
//===----------------------------------------------------------------------===//
// SVE2 Integer Multiply - Unpredicated Group
//===----------------------------------------------------------------------===//
// Encoding for an unpredicated, non-destructive three-register operation,
// assembled as "$Zd, $Zn, $Zm".
//   sz    - placed in Inst{23-22}.
//   opc   - 3-bit opcode placed in Inst{12-10}.
//   asm   - mnemonic string.
//   zprty - register operand for $Zd, $Zn and $Zm.
class sve2_int_mul<bits<2> sz, bits<3> opc, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-13} = 0b011;
let Inst{12-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
// Instantiates class sve2_int_mul as _B/_H/_S/_D with sz 0b00/0b01/0b10/0b11
// and register operands ZPR8/ZPR16/ZPR32/ZPR64.
multiclass sve2_int_mul<bits<3> opc, string asm> {
def _B : sve2_int_mul<0b00, opc, asm, ZPR8>;
def _H : sve2_int_mul<0b01, opc, asm, ZPR16>;
def _S : sve2_int_mul<0b10, opc, asm, ZPR32>;
def _D : sve2_int_mul<0b11, opc, asm, ZPR64>;
}
//===----------------------------------------------------------------------===//
// SVE2 Integer Multiply - Indexed Group
//===----------------------------------------------------------------------===//
// Common encoding for an unpredicated, non-destructive operation whose second
// source is an indexed element, assembled as "$Zd, $Zn, $Zm$iop".
//   sz     - placed in Inst{23-22}.
//   opc    - 4-bit opcode placed in Inst{13-10}.
//   asm    - mnemonic string.
//   zprty1 - register operand for the destination $Zd.
//   zprty2 - register operand for $Zn.
//   zprty3 - register operand for the indexed source $Zm.
//   itype  - operand type of the index $iop.
// Inst{20-16} is not assigned here; each def that uses this class fills it
// with Zm and iop (and may override individual opc/sz bits).
class sve2_int_mul_by_indexed_elem<bits<2> sz, bits<4> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2,
ZPRRegOp zprty3, Operand itype>
: I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty3:$Zm, itype:$iop),
asm, "\t$Zd, $Zn, $Zm$iop", "", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
let Inst{31-24} = 0b01000100;
let Inst{23-22} = sz;
let Inst{21} = 0b1;
let Inst{15-14} = 0b11;
let Inst{13-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
// Instantiates class sve2_int_mul_by_indexed_elem as _H/_S/_D with destination
// and first source of the same operand type; opc is passed through unchanged.
//   _H: 3-bit Zm in Inst{18-16}, 3-bit iop split over Inst{22} and Inst{20-19}.
//   _S: 3-bit Zm in Inst{18-16}, 2-bit iop in Inst{20-19}.
//   _D: 4-bit Zm in Inst{19-16}, 1-bit iop in Inst{20}.
multiclass sve2_int_mul_by_indexed_elem<bits<4> opc, string asm> {
// sz is {0, ?}: Inst{22} is left unset by the class and overridden below with
// the top index bit.
def _H : sve2_int_mul_by_indexed_elem<{0, ?}, opc, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexH> {
bits<3> Zm;
bits<3> iop;
let Inst{22} = iop{2};
let Inst{20-19} = iop{1-0};
let Inst{18-16} = Zm;
}
def _S : sve2_int_mul_by_indexed_elem<0b10, opc, asm, ZPR32, ZPR32, ZPR3b32, VectorIndexS> {
bits<3> Zm;
bits<2> iop;
let Inst{20-19} = iop;
let Inst{18-16} = Zm;
}
def _D : sve2_int_mul_by_indexed_elem<0b11, opc, asm, ZPR64, ZPR64, ZPR4b64, VectorIndexD> {
bits<4> Zm;
bit iop;
let Inst{20} = iop;
let Inst{19-16} = Zm;
}
}
// Instantiates class sve2_int_mul_by_indexed_elem as _S and _D with a
// destination operand type (ZPR32/ZPR64) that differs from the source operand
// type (ZPR16/ZPR32).
// The 4-bit class opcode is { opc{2-1}, ?, opc{0} }, which maps to
// Inst{13-10}; the unset '?' position is Inst{11}, which each def overrides
// with the low index bit iop{0}.
//   _S: 3-bit Zm in Inst{18-16}, iop{2-1} in Inst{20-19}, iop{0} in Inst{11}.
//   _D: 4-bit Zm in Inst{19-16}, iop{1} in Inst{20}, iop{0} in Inst{11}.
multiclass sve2_int_mul_long_by_indexed_elem<bits<3> opc, string asm> {
def _S : sve2_int_mul_by_indexed_elem<0b10, { opc{2-1}, ?, opc{0} }, asm,
ZPR32, ZPR16, ZPR3b16, VectorIndexH> {
bits<3> Zm;
bits<3> iop;
let Inst{20-19} = iop{2-1};
let Inst{18-16} = Zm;
let Inst{11} = iop{0};
}
def _D : sve2_int_mul_by_indexed_elem<0b11, { opc{2-1}, ?, opc{0} }, asm,
ZPR64, ZPR32, ZPR4b32, VectorIndexS> {
bits<4> Zm;
bits<2> iop;
let Inst{20} = iop{1};
let Inst{19-16} = Zm;
let Inst{11} = iop{0};
}
}
//===----------------------------------------------------------------------===//
// SVE2 Integer - Predicated Group
//===----------------------------------------------------------------------===//
// Encoding for a predicated two-source operation, assembled as
// "$Zdn, $Pg/m, $_Zdn, $Zm".
//   sz    - placed in Inst{23-22}.
//   opc   - 6-bit opcode, split: opc{5-1} -> Inst{20-16}, opc{0} -> Inst{13}.
//   asm   - mnemonic string.
//   zprty - register operand for all vector operands; supplies ElementSize.
// The output $Zdn is tied to the input $_Zdn.
class sve2_int_arith_pred<bits<2> sz, bits<6> opc, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm", "", []>, Sched<[]> {
bits<3> Pg;
bits<5> Zm;
bits<5> Zdn;
let Inst{31-24} = 0b01000100;
let Inst{23-22} = sz;
let Inst{21} = 0b0;
let Inst{20-16} = opc{5-1};
let Inst{15-14} = 0b10;
let Inst{13} = opc{0};
let Inst{12-10} = Pg;
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = Destructive;
let ElementSize = zprty.ElementSize;
}
// Instantiates class sve2_int_arith_pred as _B/_H/_S/_D with sz
// 0b00/0b01/0b10/0b11 and register operands ZPR8/ZPR16/ZPR32/ZPR64.
multiclass sve2_int_arith_pred<bits<6> opc, string asm> {
def _B : sve2_int_arith_pred<0b00, opc, asm, ZPR8>;
def _H : sve2_int_arith_pred<0b01, opc, asm, ZPR16>;
def _S : sve2_int_arith_pred<0b10, opc, asm, ZPR32>;
def _D : sve2_int_arith_pred<0b11, opc, asm, ZPR64>;
}
// Encoding for a predicated accumulating instruction "$Zda, $Pg/m, $Zn".
// The accumulator $Zda (type zprty1) is tied to the input $_Zda; the source
// $Zn has its own type zprty2. U is a single opcode bit at Inst{16}.
// ElementSize follows the accumulator type.
class sve2_int_sadd_long_accum_pairwise<bits<2> sz, bit U, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zda), (ins PPR3bAny:$Pg, zprty1:$_Zda, zprty2:$Zn),
asm, "\t$Zda, $Pg/m, $Zn", "", []>, Sched<[]> {
bits<3> Pg;
bits<5> Zn;
bits<5> Zda;
let Inst{31-24} = 0b01000100;
let Inst{23-22} = sz;
let Inst{21-17} = 0b00010;
let Inst{16} = U;
let Inst{15-13} = 0b101;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = Destructive;
let ElementSize = zprty1.ElementSize;
}
// Instantiates the _H, _S and _D variants; each pairs an accumulator type
// with the next narrower source type (ZPR16/ZPR8, ZPR32/ZPR16, ZPR64/ZPR32).
// There is no _B variant.
multiclass sve2_int_sadd_long_accum_pairwise<bit U, string asm> {
def _H : sve2_int_sadd_long_accum_pairwise<0b01, U, asm, ZPR16, ZPR8>;
def _S : sve2_int_sadd_long_accum_pairwise<0b10, U, asm, ZPR32, ZPR16>;
def _D : sve2_int_sadd_long_accum_pairwise<0b11, U, asm, ZPR64, ZPR32>;
}
// Encoding for a predicated unary vector instruction "$Zd, $Pg/m, $Zn".
// The output $Zd is tied to the extra input $_Zd. Q is a single bit at
// Inst{19}; the 2-bit opc goes to Inst{17-16}.
class sve2_int_un_pred_arit<bits<2> sz, bit Q, bits<2> opc,
string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, zprty:$Zn),
asm, "\t$Zd, $Pg/m, $Zn",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zd;
bits<5> Zn;
let Inst{31-24} = 0b01000100;
let Inst{23-22} = sz;
let Inst{21-20} = 0b00;
let Inst{19} = Q;
let Inst{18} = 0b0;
let Inst{17-16} = opc;
let Inst{15-13} = 0b101;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
let DestructiveInstType = Destructive;
let ElementSize = zprty.ElementSize;
}
// Instantiates only the _S (ZPR32) variant of class sve2_int_un_pred_arit.
// The 3-bit opc is split into Q = opc{2} and the 2-bit opcode opc{1-0}.
multiclass sve2_int_un_pred_arit_s<bits<3> opc, string asm> {
def _S : sve2_int_un_pred_arit<0b10, opc{2}, opc{1-0}, asm, ZPR32>;
}
// Instantiates class sve2_int_un_pred_arit for all four register types.
// The 3-bit opc is split into Q = opc{2} and the 2-bit opcode opc{1-0}.
multiclass sve2_int_un_pred_arit<bits<3> opc, string asm> {
def _B : sve2_int_un_pred_arit<0b00, opc{2}, opc{1-0}, asm, ZPR8>;
def _H : sve2_int_un_pred_arit<0b01, opc{2}, opc{1-0}, asm, ZPR16>;
def _S : sve2_int_un_pred_arit<0b10, opc{2}, opc{1-0}, asm, ZPR32>;
def _D : sve2_int_un_pred_arit<0b11, opc{2}, opc{1-0}, asm, ZPR64>;
}
//===----------------------------------------------------------------------===//
// SVE2 Widening Integer Arithmetic Group
//===----------------------------------------------------------------------===//
// Encoding for an unpredicated three-register instruction "$Zd, $Zn, $Zm"
// in which destination and both sources each have their own register type.
// No operand is tied. The 5-bit opc occupies Inst{14-10}.
class sve2_wide_int_arith<bits<2> sz, bits<5> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2, ZPRRegOp zprty3>
: I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty3:$Zm),
asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<5> Zm;
let Inst{31-24} = 0b01000101;
let Inst{23-22} = sz;
let Inst{21} = 0b0;
let Inst{20-16} = Zm;
let Inst{15} = 0b0;
let Inst{14-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
// "Long" variants: the destination type is one step wider than both source
// types (ZPR16 <- ZPR8, ZPR32 <- ZPR16, ZPR64 <- ZPR32).
multiclass sve2_wide_int_arith_long<bits<5> opc, string asm> {
def _H : sve2_wide_int_arith<0b01, opc, asm, ZPR16, ZPR8, ZPR8>;
def _S : sve2_wide_int_arith<0b10, opc, asm, ZPR32, ZPR16, ZPR16>;
def _D : sve2_wide_int_arith<0b11, opc, asm, ZPR64, ZPR32, ZPR32>;
}
// "Wide" variants: destination and first source share a type, the second
// source is one step narrower. The 3-bit opc is prefixed with 0b10 to form
// the 5-bit opcode expected by class sve2_wide_int_arith.
multiclass sve2_wide_int_arith_wide<bits<3> opc, string asm> {
def _H : sve2_wide_int_arith<0b01, { 0b10, opc }, asm, ZPR16, ZPR16, ZPR8>;
def _S : sve2_wide_int_arith<0b10, { 0b10, opc }, asm, ZPR32, ZPR32, ZPR16>;
def _D : sve2_wide_int_arith<0b11, { 0b10, opc }, asm, ZPR64, ZPR64, ZPR32>;
}
// Instantiates only _H and _D variants of class sve2_wide_int_arith (no _S
// variant is defined here). The 5-bit opcode is 0b1101 followed by the
// single opc bit.
multiclass sve2_pmul_long<bits<1> opc, string asm> {
def _H : sve2_wide_int_arith<0b01, {0b1101, opc}, asm, ZPR16, ZPR8, ZPR8>;
def _D : sve2_wide_int_arith<0b11, {0b1101, opc}, asm, ZPR64, ZPR32, ZPR32>;
}
//===----------------------------------------------------------------------===//
// SVE2 Misc Group
//===----------------------------------------------------------------------===//
// Encoding for an unpredicated three-register instruction "$Zd, $Zn, $Zm"
// whose two sources share one register type (zprty2) while the destination
// uses zprty1. No operand is tied. Inst{15-14} is fixed to 0b10 and the
// 4-bit opc occupies Inst{13-10}.
class sve2_misc<bits<2> sz, bits<4> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty2:$Zm),
asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<5> Zm;
let Inst{31-24} = 0b01000101;
let Inst{23-22} = sz;
let Inst{21} = 0b0;
let Inst{20-16} = Zm;
let Inst{15-14} = 0b10;
let Inst{13-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
// Instantiates class sve2_misc for all four register types, with destination
// and sources of the same type.
multiclass sve2_misc_bitwise<bits<4> opc, string asm> {
def _B : sve2_misc<0b00, opc, asm, ZPR8, ZPR8>;
def _H : sve2_misc<0b01, opc, asm, ZPR16, ZPR16>;
def _S : sve2_misc<0b10, opc, asm, ZPR32, ZPR32>;
def _D : sve2_misc<0b11, opc, asm, ZPR64, ZPR64>;
}
// Diff hunk (removed lines): the old multiclass built its variants on class
// sve2_misc with opcode { 0b010, opc } and only overrode DestructiveInstType
// and ElementSize; sve2_misc itself declares no tied input operand.
-multiclass sve2_bitwise_xor_interleaved<bit opc, string asm> {
- let DestructiveInstType = Destructive, ElementSize = ElementSizeNone in {
- def _B : sve2_misc<0b00, { 0b010, opc }, asm, ZPR8, ZPR8>;
- def _H : sve2_misc<0b01, { 0b010, opc }, asm, ZPR16, ZPR16>;
- def _S : sve2_misc<0b10, { 0b010, opc }, asm, ZPR32, ZPR32>;
- def _D : sve2_misc<0b11, { 0b010, opc }, asm, ZPR64, ZPR64>;
- }
-}
-
// Instantiates _H, _S and _D variants of class sve2_misc in which the
// destination type is one step wider than the source type. The 2-bit opc is
// prefixed with 0b00 to form the 4-bit opcode.
multiclass sve2_misc_int_addsub_long_interleaved<bits<2> opc, string asm> {
def _H : sve2_misc<0b01, { 0b00, opc }, asm, ZPR16, ZPR8>;
def _S : sve2_misc<0b10, { 0b00, opc }, asm, ZPR32, ZPR16>;
def _D : sve2_misc<0b11, { 0b00, opc }, asm, ZPR64, ZPR32>;
}
// Diff hunk (added lines): a dedicated class whose output $Zd is tied to an
// extra input $_Zd; the assembly string still prints only "$Zd, $Zn, $Zm".
// Inst{15-11} = 0b10010 with opc at Inst{10} is the same bit pattern that
// class sve2_misc produces for opcode { 0b010, opc }.
+class sve2_bitwise_xor_interleaved<bits<2> sz, bits<1> opc, string asm,
+ ZPRRegOp zprty1, ZPRRegOp zprty2>
+: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn, zprty2:$Zm),
+ asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ bits<5> Zm;
+ let Inst{31-24} = 0b01000101;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b0;
+ let Inst{20-16} = Zm;
+ let Inst{15-11} = 0b10010;
+ let Inst{10} = opc;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
+ let DestructiveInstType = Destructive;
+ let ElementSize = ElementSizeNone;
+}
+
// Diff hunk (added lines): instantiates the class sve2_bitwise_xor_interleaved
// for all four register types, destination and sources sharing one type.
+multiclass sve2_bitwise_xor_interleaved<bit opc, string asm> {
+ def _B : sve2_bitwise_xor_interleaved<0b00, opc, asm, ZPR8, ZPR8>;
+ def _H : sve2_bitwise_xor_interleaved<0b01, opc, asm, ZPR16, ZPR16>;
+ def _S : sve2_bitwise_xor_interleaved<0b10, opc, asm, ZPR32, ZPR32>;
+ def _D : sve2_bitwise_xor_interleaved<0b11, opc, asm, ZPR64, ZPR64>;
+}
+
// Encoding for an unpredicated shift-by-immediate "$Zd, $Zn, $imm" with
// distinct destination and source register types. The 3-bit tsz8_64 value is
// split between Inst{22} and Inst{20-19}; the low three immediate bits go to
// Inst{18-16}. The 2-bit opc occupies Inst{11-10}.
class sve2_bitwise_shift_left_long<bits<3> tsz8_64, bits<2> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2,
Operand immtype>
: I<(outs zprty1:$Zd), (ins zprty2:$Zn, immtype:$imm),
asm, "\t$Zd, $Zn, $imm",
"", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<5> imm;
let Inst{31-23} = 0b010001010;
let Inst{22} = tsz8_64{2};
let Inst{21} = 0b0;
let Inst{20-19} = tsz8_64{1-0};
let Inst{18-16} = imm{2-0}; // imm3
let Inst{15-12} = 0b1010;
let Inst{11-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
// Instantiates the _H, _S and _D variants. The '?' bits in each tsz8_64
// pattern are left unset by the class argument and are then overwritten with
// the upper immediate bits (imm{3} for _S, imm{4-3} for _D).
multiclass sve2_bitwise_shift_left_long<bits<2> opc, string asm> {
def _H : sve2_bitwise_shift_left_long<{0,0,1}, opc, asm,
ZPR16, ZPR8, vecshiftL8>;
def _S : sve2_bitwise_shift_left_long<{0,1,?}, opc, asm,
ZPR32, ZPR16, vecshiftL16> {
let Inst{19} = imm{3};
}
def _D : sve2_bitwise_shift_left_long<{1,?,?}, opc, asm,
ZPR64, ZPR32, vecshiftL32> {
let Inst{20-19} = imm{4-3};
}
}
//===----------------------------------------------------------------------===//
// SVE2 Accumulate Group
//===----------------------------------------------------------------------===//
// Diff hunk: renames class sve2_int_bin_cons_shift_imm to
// sve2_int_bin_shift_imm and adds an input $_Zd that is tied to the output
// $Zd through the new Constraints line. The encoding lines are unchanged
// context: the 4-bit tsz8_64 is split between Inst{23-22} and Inst{20-19},
// imm{2-0} goes to Inst{18-16}, and opc is the single bit Inst{10}.
-class sve2_int_bin_cons_shift_imm<bits<4> tsz8_64, bit opc, string asm,
- ZPRRegOp zprty, Operand immtype>
-: I<(outs zprty:$Zd), (ins zprty:$Zn, immtype:$imm),
+class sve2_int_bin_shift_imm<bits<4> tsz8_64, bit opc, string asm,
+ ZPRRegOp zprty, Operand immtype>
+: I<(outs zprty:$Zd), (ins zprty:$_Zd, zprty:$Zn, immtype:$imm),
asm, "\t$Zd, $Zn, $imm",
"", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<6> imm;
let Inst{31-24} = 0b01000101;
let Inst{23-22} = tsz8_64{3-2};
let Inst{21} = 0b0;
let Inst{20-19} = tsz8_64{1-0};
let Inst{18-16} = imm{2-0}; // imm3
let Inst{15-11} = 0b11110;
let Inst{10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
}
// Diff hunk: renames the multiclass and the class it instantiates (drops
// "_cons" from both names); arguments are otherwise unchanged. The variants
// use the vecshiftL* immediate operands, and the '?' bits of tsz8_64 are
// overwritten with the upper immediate bits in the per-variant bodies.
-multiclass sve2_int_bin_cons_shift_imm_left<bit opc, string asm> {
- def _B : sve2_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
- def _H : sve2_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
+multiclass sve2_int_bin_shift_imm_left<bit opc, string asm> {
+ def _B : sve2_int_bin_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
+ def _H : sve2_int_bin_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
let Inst{19} = imm{3};
}
- def _S : sve2_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
+ def _S : sve2_int_bin_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
let Inst{20-19} = imm{4-3};
}
- def _D : sve2_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
+ def _D : sve2_int_bin_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
}
}
// Diff hunk: renames the multiclass and the class it instantiates (drops
// "_cons" from both names); arguments are otherwise unchanged. The variants
// use the vecshiftR* immediate operands, and the '?' bits of tsz8_64 are
// overwritten with the upper immediate bits in the per-variant bodies.
-multiclass sve2_int_bin_cons_shift_imm_right<bit opc, string asm> {
- def _B : sve2_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
- def _H : sve2_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
+multiclass sve2_int_bin_shift_imm_right<bit opc, string asm> {
+ def _B : sve2_int_bin_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
+ def _H : sve2_int_bin_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
let Inst{19} = imm{3};
}
- def _S : sve2_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
+ def _S : sve2_int_bin_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
let Inst{20-19} = imm{4-3};
}
- def _D : sve2_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
+ def _D : sve2_int_bin_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
}
}
// Diff hunk: renames class sve2_int_bin_accum_cons_shift_imm to
// sve2_int_bin_accum_shift_imm; operands and encoding are unchanged context.
// The accumulator $Zda is tied to the input $_Zda; the 2-bit opc occupies
// Inst{11-10}.
-class sve2_int_bin_accum_cons_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
- ZPRRegOp zprty, Operand immtype>
+class sve2_int_bin_accum_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
+ ZPRRegOp zprty, Operand immtype>
: I<(outs zprty:$Zda), (ins zprty:$_Zda, zprty:$Zn, immtype:$imm),
asm, "\t$Zda, $Zn, $imm",
"", []>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
bits<6> imm;
let Inst{31-24} = 0b01000101;
let Inst{23-22} = tsz8_64{3-2};
let Inst{21} = 0b0;
let Inst{20-19} = tsz8_64{1-0};
let Inst{18-16} = imm{2-0}; // imm3
let Inst{15-12} = 0b1110;
let Inst{11-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = Destructive;
let ElementSize = ElementSizeNone;
}
// Diff hunk: renames the multiclass and the class it instantiates (drops
// "_cons" from both names); arguments are otherwise unchanged. The '?' bits
// of tsz8_64 are overwritten with the upper immediate bits per variant.
-multiclass sve2_int_bin_accum_cons_shift_imm_right<bits<2> opc, string asm> {
- def _B : sve2_int_bin_accum_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
- def _H : sve2_int_bin_accum_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
+multiclass sve2_int_bin_accum_shift_imm_right<bits<2> opc, string asm> {
+ def _B : sve2_int_bin_accum_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
+ def _H : sve2_int_bin_accum_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
let Inst{19} = imm{3};
}
- def _S : sve2_int_bin_accum_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
+ def _S : sve2_int_bin_accum_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
let Inst{20-19} = imm{4-3};
}
- def _D : sve2_int_bin_accum_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
+ def _D : sve2_int_bin_accum_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
}
}
// Encoding for a destructive instruction "$Zdn, $_Zdn, $Zm, $rot", where
// $Zdn is tied to $_Zdn and $rot is a complexrotateopodd operand encoded in
// the single bit Inst{10}. opc is the single bit Inst{16}.
class sve2_int_cadd<bits<2> sz, bit opc, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm, complexrotateopodd:$rot),
asm, "\t$Zdn, $_Zdn, $Zm, $rot", "", []>, Sched<[]> {
bits<5> Zdn;
bits<5> Zm;
bit rot;
let Inst{31-24} = 0b01000101;
let Inst{23-22} = sz;
let Inst{21-17} = 0b00000;
let Inst{16} = opc;
let Inst{15-11} = 0b11011;
let Inst{10} = rot;
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = Destructive;
let ElementSize = ElementSizeNone;
}
// Instantiates class sve2_int_cadd for all four register types.
multiclass sve2_int_cadd<bit opc, string asm> {
def _B : sve2_int_cadd<0b00, opc, asm, ZPR8>;
def _H : sve2_int_cadd<0b01, opc, asm, ZPR16>;
def _S : sve2_int_cadd<0b10, opc, asm, ZPR32>;
def _D : sve2_int_cadd<0b11, opc, asm, ZPR64>;
}
// Encoding for an unpredicated accumulating instruction "$Zda, $Zn, $Zm".
// The accumulator $Zda (type zprty1) is tied to the input $_Zda; both
// sources use zprty2. Inst{15-14} is fixed to 0b11 and the 4-bit opc
// occupies Inst{13-10}.
class sve2_int_absdiff_accum<bits<2> sz, bits<4> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty2:$Zm),
asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
bits<5> Zm;
let Inst{31-24} = 0b01000101;
let Inst{23-22} = sz;
let Inst{21} = 0b0;
let Inst{20-16} = Zm;
let Inst{15-14} = 0b11;
let Inst{13-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = Destructive;
let ElementSize = ElementSizeNone;
}
// Instantiates class sve2_int_absdiff_accum for all four register types with
// accumulator and sources of the same type. The single opc bit is prefixed
// with 0b111 to form the 4-bit opcode.
multiclass sve2_int_absdiff_accum<bit opc, string asm> {
def _B : sve2_int_absdiff_accum<0b00, { 0b111, opc }, asm, ZPR8, ZPR8>;
def _H : sve2_int_absdiff_accum<0b01, { 0b111, opc }, asm, ZPR16, ZPR16>;
def _S : sve2_int_absdiff_accum<0b10, { 0b111, opc }, asm, ZPR32, ZPR32>;
def _D : sve2_int_absdiff_accum<0b11, { 0b111, opc }, asm, ZPR64, ZPR64>;
}
// "Long" variants of class sve2_int_absdiff_accum: the accumulator type is
// one step wider than the source type. The 2-bit opc is prefixed with 0b00.
multiclass sve2_int_absdiff_accum_long<bits<2> opc, string asm> {
def _H : sve2_int_absdiff_accum<0b01, { 0b00, opc }, asm, ZPR16, ZPR8>;
def _S : sve2_int_absdiff_accum<0b10, { 0b00, opc }, asm, ZPR32, ZPR16>;
def _D : sve2_int_absdiff_accum<0b11, { 0b00, opc }, asm, ZPR64, ZPR32>;
}
// Reuses class sve2_int_absdiff_accum for _S and _D variants. Here the 2-bit
// sz argument is not a plain size code: its high bit carries opc{1} and its
// low bit selects the variant (0 for _S, 1 for _D). The 4-bit opcode is
// 0b010 followed by opc{0}.
multiclass sve2_int_addsub_long_carry<bits<2> opc, string asm> {
def _S : sve2_int_absdiff_accum<{ opc{1}, 0b0 }, { 0b010, opc{0} }, asm,
ZPR32, ZPR32>;
def _D : sve2_int_absdiff_accum<{ opc{1}, 0b1 }, { 0b010, opc{0} }, asm,
ZPR64, ZPR64>;
}
//===----------------------------------------------------------------------===//
// SVE2 Narrowing Group
//===----------------------------------------------------------------------===//
// Diff hunk: renames class sve2_int_bin_cons_shift_imm_narrow to
// sve2_int_bin_shift_imm_narrow_bottom and narrows opc from 4 to 3 bits.
// The opcode now occupies Inst{13-11} and Inst{10} is fixed to 0. This class
// has no tied input operand.
-class sve2_int_bin_cons_shift_imm_narrow<bits<3> tsz8_64, bits<4> opc,
- string asm, ZPRRegOp zprty1,
- ZPRRegOp zprty2, Operand immtype>
+class sve2_int_bin_shift_imm_narrow_bottom<bits<3> tsz8_64, bits<3> opc,
+ string asm, ZPRRegOp zprty1,
+ ZPRRegOp zprty2, Operand immtype>
: I<(outs zprty1:$Zd), (ins zprty2:$Zn, immtype:$imm),
asm, "\t$Zd, $Zn, $imm",
"", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<5> imm;
let Inst{31-23} = 0b010001010;
let Inst{22} = tsz8_64{2};
let Inst{21} = 0b1;
let Inst{20-19} = tsz8_64{1-0};
let Inst{18-16} = imm{2-0}; // imm3
let Inst{15-14} = 0b00;
- let Inst{13-10} = opc;
+ let Inst{13-11} = opc;
+ let Inst{10} = 0b0;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
// Diff hunk: renames the multiclass to
// sve2_int_bin_shift_imm_right_narrow_bottom, narrows opc from 4 to 3 bits
// and switches the instantiated class to sve2_int_bin_shift_imm_narrow_bottom.
// The destination type is one step narrower than the source type; '?' bits of
// tsz8_64 are overwritten with the upper immediate bits per variant.
-multiclass sve2_int_bin_cons_shift_imm_right_narrow<bits<4> opc, string asm> {
- def _B : sve2_int_bin_cons_shift_imm_narrow<{0,0,1}, opc, asm, ZPR8, ZPR16,
- vecshiftR8>;
- def _H : sve2_int_bin_cons_shift_imm_narrow<{0,1,?}, opc, asm, ZPR16, ZPR32,
- vecshiftR16> {
+multiclass sve2_int_bin_shift_imm_right_narrow_bottom<bits<3> opc, string asm> {
+ def _B : sve2_int_bin_shift_imm_narrow_bottom<{0,0,1}, opc, asm, ZPR8, ZPR16,
+ vecshiftR8>;
+ def _H : sve2_int_bin_shift_imm_narrow_bottom<{0,1,?}, opc, asm, ZPR16, ZPR32,
+ vecshiftR16> {
let Inst{19} = imm{3};
}
- def _S : sve2_int_bin_cons_shift_imm_narrow<{1,?,?}, opc, asm, ZPR32, ZPR64,
- vecshiftR32> {
+ def _S : sve2_int_bin_shift_imm_narrow_bottom<{1,?,?}, opc, asm, ZPR32, ZPR64,
+ vecshiftR32> {
let Inst{20-19} = imm{4-3};
}
}
// Diff hunk: the two '-' lines are the old header of class
// sve2_int_addsub_narrow_high. The '+' lines add a new class,
// sve2_int_bin_shift_imm_narrow_top, whose output $Zd is tied to an extra
// input $_Zd. Its 3-bit opc occupies Inst{13-11} and Inst{10} is fixed to 1.
-class sve2_int_addsub_narrow_high<bits<2> sz, bits<3> opc, string asm,
- ZPRRegOp zprty1, ZPRRegOp zprty2>
+class sve2_int_bin_shift_imm_narrow_top<bits<3> tsz8_64, bits<3> opc,
+ string asm, ZPRRegOp zprty1,
+ ZPRRegOp zprty2, Operand immtype>
+: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn, immtype:$imm),
+ asm, "\t$Zd, $Zn, $imm",
+ "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ bits<5> imm;
+ let Inst{31-23} = 0b010001010;
+ let Inst{22} = tsz8_64{2};
+ let Inst{21} = 0b1;
+ let Inst{20-19} = tsz8_64{1-0};
+ let Inst{18-16} = imm{2-0}; // imm3
+ let Inst{15-14} = 0b00;
+ let Inst{13-11} = opc;
+ let Inst{10} = 0b1;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
+}
+
// Diff hunk (added lines): instantiates _B, _H and _S variants of class
// sve2_int_bin_shift_imm_narrow_top with vecshiftR* immediates; the
// destination type is one step narrower than the source type. '?' bits of
// tsz8_64 are overwritten with the upper immediate bits per variant.
+multiclass sve2_int_bin_shift_imm_right_narrow_top<bits<3> opc, string asm> {
+ def _B : sve2_int_bin_shift_imm_narrow_top<{0,0,1}, opc, asm, ZPR8, ZPR16,
+ vecshiftR8>;
+ def _H : sve2_int_bin_shift_imm_narrow_top<{0,1,?}, opc, asm, ZPR16, ZPR32,
+ vecshiftR16> {
+ let Inst{19} = imm{3};
+ }
+ def _S : sve2_int_bin_shift_imm_narrow_top<{1,?,?}, opc, asm, ZPR32, ZPR64,
+ vecshiftR32> {
+ let Inst{20-19} = imm{4-3};
+ }
+}
+
// Diff hunk: new header for class sve2_int_addsub_narrow_high_bottom placed
// over an existing body. opc shrinks to 2 bits at Inst{12-11} and Inst{10}
// is now fixed to 0. This class has no tied input operand.
+class sve2_int_addsub_narrow_high_bottom<bits<2> sz, bits<2> opc, string asm,
+ ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty2:$Zm),
asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<5> Zm;
let Inst{31-24} = 0b01000101;
let Inst{23-22} = sz;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-13} = 0b011;
- let Inst{12-10} = opc; // S, R, T
+ let Inst{12-11} = opc; // S, R
+ let Inst{10} = 0b0; // Top
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
// Diff hunk: renames multiclass sve2_int_addsub_narrow_high to
// sve2_int_addsub_narrow_high_bottom, narrows opc from 3 to 2 bits and
// switches the instantiated class accordingly. Note the sz codes: _B uses
// 0b01, _H 0b10 and _S 0b11, with the destination one step narrower than the
// source type.
-multiclass sve2_int_addsub_narrow_high<bits<3> opc, string asm> {
- def _B : sve2_int_addsub_narrow_high<0b01, opc, asm, ZPR8, ZPR16>;
- def _H : sve2_int_addsub_narrow_high<0b10, opc, asm, ZPR16, ZPR32>;
- def _S : sve2_int_addsub_narrow_high<0b11, opc, asm, ZPR32, ZPR64>;
+multiclass sve2_int_addsub_narrow_high_bottom<bits<2> opc, string asm> {
+ def _B : sve2_int_addsub_narrow_high_bottom<0b01, opc, asm, ZPR8, ZPR16>;
+ def _H : sve2_int_addsub_narrow_high_bottom<0b10, opc, asm, ZPR16, ZPR32>;
+ def _S : sve2_int_addsub_narrow_high_bottom<0b11, opc, asm, ZPR32, ZPR64>;
}
// Diff hunk: the two '-' lines are the old header of class
// sve2_int_sat_extract_narrow. The '+' lines add a new class,
// sve2_int_addsub_narrow_high_top, whose output $Zd is tied to an extra input
// $_Zd. Its 2-bit opc occupies Inst{12-11} and Inst{10} is fixed to 1.
-class sve2_int_sat_extract_narrow<bits<3> tsz8_64, bits<3> opc, string asm,
- ZPRRegOp zprty1, ZPRRegOp zprty2>
+class sve2_int_addsub_narrow_high_top<bits<2> sz, bits<2> opc, string asm,
+ ZPRRegOp zprty1, ZPRRegOp zprty2>
+: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn, zprty2:$Zm),
+ asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ bits<5> Zm;
+ let Inst{31-24} = 0b01000101;
+ let Inst{23-22} = sz;
+ let Inst{21} = 0b1;
+ let Inst{20-16} = Zm;
+ let Inst{15-13} = 0b011;
+ let Inst{12-11} = opc; // S, R
+ let Inst{10} = 0b1; // Top
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
+}
+
// Diff hunk (added lines): instantiates _B, _H and _S variants of class
// sve2_int_addsub_narrow_high_top with sz = 0b01, 0b10, 0b11; the destination
// type is one step narrower than the source type.
+multiclass sve2_int_addsub_narrow_high_top<bits<2> opc, string asm> {
+ def _B : sve2_int_addsub_narrow_high_top<0b01, opc, asm, ZPR8, ZPR16>;
+ def _H : sve2_int_addsub_narrow_high_top<0b10, opc, asm, ZPR16, ZPR32>;
+ def _S : sve2_int_addsub_narrow_high_top<0b11, opc, asm, ZPR32, ZPR64>;
+}
+
// Diff hunk: new header for class sve2_int_sat_extract_narrow_bottom placed
// over an existing body. This is a two-register form "$Zd, $Zn" with no tied
// input. opc shrinks to 2 bits at Inst{12-11} and Inst{10} is fixed to 0.
+class sve2_int_sat_extract_narrow_bottom<bits<3> tsz8_64, bits<2> opc, string asm,
+ ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zd), (ins zprty2:$Zn),
asm, "\t$Zd, $Zn", "", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
let Inst{31-23} = 0b010001010;
let Inst{22} = tsz8_64{2};
let Inst{21} = 0b1;
let Inst{20-19} = tsz8_64{1-0};
let Inst{18-13} = 0b000010;
- let Inst{12-10} = opc;
+ let Inst{12-11} = opc;
+ let Inst{10} = 0b0;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
// Diff hunk: renames multiclass sve2_int_sat_extract_narrow to
// sve2_int_sat_extract_narrow_bottom, narrows opc from 3 to 2 bits and
// switches the instantiated class accordingly. tsz8_64 is a one-hot code
// (0b001, 0b010, 0b100) for _B, _H and _S.
-multiclass sve2_int_sat_extract_narrow<bits<3> opc, string asm> {
- def _B : sve2_int_sat_extract_narrow<0b001, opc, asm, ZPR8, ZPR16>;
- def _H : sve2_int_sat_extract_narrow<0b010, opc, asm, ZPR16, ZPR32>;
- def _S : sve2_int_sat_extract_narrow<0b100, opc, asm, ZPR32, ZPR64>;
+multiclass sve2_int_sat_extract_narrow_bottom<bits<2> opc, string asm> {
+ def _B : sve2_int_sat_extract_narrow_bottom<0b001, opc, asm, ZPR8, ZPR16>;
+ def _H : sve2_int_sat_extract_narrow_bottom<0b010, opc, asm, ZPR16, ZPR32>;
+ def _S : sve2_int_sat_extract_narrow_bottom<0b100, opc, asm, ZPR32, ZPR64>;
}
// Diff hunk (added lines): class sve2_int_sat_extract_narrow_top, a
// two-register form "$Zd, $Zn" whose output $Zd is tied to an extra input
// $_Zd. Its 2-bit opc occupies Inst{12-11} and Inst{10} is fixed to 1.
+class sve2_int_sat_extract_narrow_top<bits<3> tsz8_64, bits<2> opc, string asm,
+ ZPRRegOp zprty1, ZPRRegOp zprty2>
+: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn),
+ asm, "\t$Zd, $Zn", "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zn;
+ let Inst{31-23} = 0b010001010;
+ let Inst{22} = tsz8_64{2};
+ let Inst{21} = 0b1;
+ let Inst{20-19} = tsz8_64{1-0};
+ let Inst{18-13} = 0b000010;
+ let Inst{12-11} = opc;
+ let Inst{10} = 0b1;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
+}
+
// Diff hunk (added lines): instantiates _B, _H and _S variants of class
// sve2_int_sat_extract_narrow_top with one-hot tsz8_64 codes 0b001, 0b010
// and 0b100; the destination type is one step narrower than the source type.
+multiclass sve2_int_sat_extract_narrow_top<bits<2> opc, string asm> {
+ def _B : sve2_int_sat_extract_narrow_top<0b001, opc, asm, ZPR8, ZPR16>;
+ def _H : sve2_int_sat_extract_narrow_top<0b010, opc, asm, ZPR16, ZPR32>;
+ def _S : sve2_int_sat_extract_narrow_top<0b100, opc, asm, ZPR32, ZPR64>;
+}
+
//===----------------------------------------------------------------------===//
// SVE Integer Arithmetic - Unary Predicated Group
//===----------------------------------------------------------------------===//
// Encoding for a predicated unary vector instruction "$Zd, $Pg/m, $Zn" whose
// output $Zd is tied to the extra input $_Zd. The 4-bit opc is stored out of
// order: opc{0} goes to Inst{19} and opc{3-1} to Inst{18-16}.
class sve_int_un_pred_arit<bits<2> sz8_64, bits<4> opc,
string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, zprty:$Zn),
asm, "\t$Zd, $Pg/m, $Zn",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zd;
bits<5> Zn;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_64;
let Inst{21-20} = 0b01;
let Inst{19} = opc{0};
let Inst{18-16} = opc{3-1};
let Inst{15-13} = 0b101;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
let DestructiveInstType = Destructive;
let ElementSize = zprty.ElementSize;
}
// Instantiates class sve_int_un_pred_arit for all four register types. The
// 3-bit opc gets 0 appended as its lowest bit, so Inst{19} is 0.
multiclass sve_int_un_pred_arit_0<bits<3> opc, string asm> {
def _B : sve_int_un_pred_arit<0b00, { opc, 0b0 }, asm, ZPR8>;
def _H : sve_int_un_pred_arit<0b01, { opc, 0b0 }, asm, ZPR16>;
def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>;
def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
}
// Like sve_int_un_pred_arit_0 but without the _B variant: only _H, _S and _D
// are defined.
multiclass sve_int_un_pred_arit_0_h<bits<3> opc, string asm> {
def _H : sve_int_un_pred_arit<0b01, { opc, 0b0 }, asm, ZPR16>;
def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>;
def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
}
// Like sve_int_un_pred_arit_0 but restricted to the _S and _D variants.
multiclass sve_int_un_pred_arit_0_w<bits<3> opc, string asm> {
def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>;
def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
}
// Like sve_int_un_pred_arit_0 but restricted to the _D variant.
multiclass sve_int_un_pred_arit_0_d<bits<3> opc, string asm> {
def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
}
// Instantiates class sve_int_un_pred_arit for all four register types. The
// 3-bit opc gets 1 appended as its lowest bit, so Inst{19} is 1.
multiclass sve_int_un_pred_arit_1<bits<3> opc, string asm> {
def _B : sve_int_un_pred_arit<0b00, { opc, 0b1 }, asm, ZPR8>;
def _H : sve_int_un_pred_arit<0b01, { opc, 0b1 }, asm, ZPR16>;
def _S : sve_int_un_pred_arit<0b10, { opc, 0b1 }, asm, ZPR32>;
def _D : sve_int_un_pred_arit<0b11, { opc, 0b1 }, asm, ZPR64>;
}
// Like sve_int_un_pred_arit_1 but without the _B variant: only _H, _S and _D
// are defined.
multiclass sve_int_un_pred_arit_1_fp<bits<3> opc, string asm> {
def _H : sve_int_un_pred_arit<0b01, { opc, 0b1 }, asm, ZPR16>;
def _S : sve_int_un_pred_arit<0b10, { opc, 0b1 }, asm, ZPR32>;
def _D : sve_int_un_pred_arit<0b11, { opc, 0b1 }, asm, ZPR64>;
}
//===----------------------------------------------------------------------===//
// SVE Integer Wide Immediate - Unpredicated Group
//===----------------------------------------------------------------------===//
// Encoding for an unpredicated "$Zd, $imm" instruction with a 9-bit
// immediate field: imm{8} is stored in Inst{13} and imm{7-0} in Inst{12-5}.
// The instruction is marked rematerializable.
class sve_int_dup_imm<bits<2> sz8_64, string asm,
ZPRRegOp zprty, Operand immtype>
: I<(outs zprty:$Zd), (ins immtype:$imm),
asm, "\t$Zd, $imm",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<9> imm;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21-14} = 0b11100011;
let Inst{13} = imm{8}; // sh
let Inst{12-5} = imm{7-0}; // imm8
let Inst{4-0} = Zd;
let isReMaterializable = 1;
}
// Instantiates class sve_int_dup_imm for all four register types, each with
// its matching cpy_imm8_opt_lsl_* operand. Also defines a "mov $Zd, $imm"
// alias per variant, and "fmov $Zd, #0.0" aliases for _H, _S and _D that
// pass the literal operands 0, 0 in place of the immediate.
multiclass sve_int_dup_imm<string asm> {
def _B : sve_int_dup_imm<0b00, asm, ZPR8, cpy_imm8_opt_lsl_i8>;
def _H : sve_int_dup_imm<0b01, asm, ZPR16, cpy_imm8_opt_lsl_i16>;
def _S : sve_int_dup_imm<0b10, asm, ZPR32, cpy_imm8_opt_lsl_i32>;
def _D : sve_int_dup_imm<0b11, asm, ZPR64, cpy_imm8_opt_lsl_i64>;
def : InstAlias<"mov $Zd, $imm",
(!cast<Instruction>(NAME # _B) ZPR8:$Zd, cpy_imm8_opt_lsl_i8:$imm), 1>;
def : InstAlias<"mov $Zd, $imm",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, cpy_imm8_opt_lsl_i16:$imm), 1>;
def : InstAlias<"mov $Zd, $imm",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, cpy_imm8_opt_lsl_i32:$imm), 1>;
def : InstAlias<"mov $Zd, $imm",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, cpy_imm8_opt_lsl_i64:$imm), 1>;
def : InstAlias<"fmov $Zd, #0.0",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, 0, 0), 1>;
def : InstAlias<"fmov $Zd, #0.0",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, 0, 0), 1>;
def : InstAlias<"fmov $Zd, #0.0",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, 0, 0), 1>;
}
// Encoding for an unpredicated "$Zd, $imm8" instruction taking a
// floating-point immediate operand type; the 8-bit imm8 field occupies
// Inst{12-5} and Inst{13} is fixed to 0. The instruction is marked
// rematerializable.
class sve_int_dup_fpimm<bits<2> sz8_64, Operand fpimmtype,
string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins fpimmtype:$imm8),
asm, "\t$Zd, $imm8",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<8> imm8;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21-14} = 0b11100111;
let Inst{13} = 0b0;
let Inst{12-5} = imm8;
let Inst{4-0} = Zd;
let isReMaterializable = 1;
}
// Instantiates class sve_int_dup_fpimm for _H, _S and _D (fpimm16/32/64) and
// defines an "fmov $Zd, $imm8" alias for each variant.
multiclass sve_int_dup_fpimm<string asm> {
def _H : sve_int_dup_fpimm<0b01, fpimm16, asm, ZPR16>;
def _S : sve_int_dup_fpimm<0b10, fpimm32, asm, ZPR32>;
def _D : sve_int_dup_fpimm<0b11, fpimm64, asm, ZPR64>;
def : InstAlias<"fmov $Zd, $imm8",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, fpimm16:$imm8), 1>;
def : InstAlias<"fmov $Zd, $imm8",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, fpimm32:$imm8), 1>;
def : InstAlias<"fmov $Zd, $imm8",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, fpimm64:$imm8), 1>;
}
// Encoding for a destructive register/immediate instruction
// "$Zdn, $_Zdn, $imm" with $Zdn tied to $_Zdn. The immediate field is 9 bits:
// imm{8} is stored in Inst{13} and imm{7-0} in Inst{12-5}. The 3-bit opc
// occupies Inst{18-16}.
class sve_int_arith_imm0<bits<2> sz8_64, bits<3> opc, string asm,
ZPRRegOp zprty, Operand immtype>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, immtype:$imm),
asm, "\t$Zdn, $_Zdn, $imm",
"",
[]>, Sched<[]> {
bits<5> Zdn;
bits<9> imm;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21-19} = 0b100;
let Inst{18-16} = opc;
let Inst{15-14} = 0b11;
let Inst{13} = imm{8}; // sh
let Inst{12-5} = imm{7-0}; // imm8
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = Destructive;
let ElementSize = ElementSizeNone;
}
// Instantiates class sve_int_arith_imm0 for all four register types, each
// with its matching addsub_imm8_opt_lsl_* immediate operand.
multiclass sve_int_arith_imm0<bits<3> opc, string asm> {
def _B : sve_int_arith_imm0<0b00, opc, asm, ZPR8, addsub_imm8_opt_lsl_i8>;
def _H : sve_int_arith_imm0<0b01, opc, asm, ZPR16, addsub_imm8_opt_lsl_i16>;
def _S : sve_int_arith_imm0<0b10, opc, asm, ZPR32, addsub_imm8_opt_lsl_i32>;
def _D : sve_int_arith_imm0<0b11, opc, asm, ZPR64, addsub_imm8_opt_lsl_i64>;
}
// Encoding for a destructive register/immediate instruction
// "$Zdn, $_Zdn, $imm" with $Zdn tied to $_Zdn. Unlike sve_int_arith_imm0 the
// immediate is a plain 8-bit field at Inst{12-5}, and the 6-bit opc occupies
// Inst{21-16}.
class sve_int_arith_imm<bits<2> sz8_64, bits<6> opc, string asm,
ZPRRegOp zprty, Operand immtype>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, immtype:$imm),
asm, "\t$Zdn, $_Zdn, $imm",
"",
[]>, Sched<[]> {
bits<5> Zdn;
bits<8> imm;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21-16} = opc;
let Inst{15-13} = 0b110;
let Inst{12-5} = imm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = Destructive;
let ElementSize = ElementSizeNone;
}
// Instantiates class sve_int_arith_imm for all four register types using the
// caller-supplied immediate operand type. The 2-bit opc is prefixed with
// 0b1010 to form the 6-bit opcode.
multiclass sve_int_arith_imm1<bits<2> opc, string asm, Operand immtype> {
def _B : sve_int_arith_imm<0b00, { 0b1010, opc }, asm, ZPR8, immtype>;
def _H : sve_int_arith_imm<0b01, { 0b1010, opc }, asm, ZPR16, immtype>;
def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, immtype>;
def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, immtype>;
}
// Instantiates class sve_int_arith_imm for all four register types with the
// fixed opcode 0b110000 and the simm8 immediate operand.
multiclass sve_int_arith_imm2<string asm> {
def _B : sve_int_arith_imm<0b00, 0b110000, asm, ZPR8, simm8>;
def _H : sve_int_arith_imm<0b01, 0b110000, asm, ZPR16, simm8>;
def _S : sve_int_arith_imm<0b10, 0b110000, asm, ZPR32, simm8>;
def _D : sve_int_arith_imm<0b11, 0b110000, asm, ZPR64, simm8>;
}
//===----------------------------------------------------------------------===//
// SVE Bitwise Logical - Unpredicated Group
//===----------------------------------------------------------------------===//
// Encoding for an unpredicated three-register instruction "$Zd, $Zn, $Zm"
// defined only on ZPR64 operands. There is no separate size field: the 2-bit
// opc occupies Inst{23-22}.
class sve_int_bin_cons_log<bits<2> opc, string asm>
: I<(outs ZPR64:$Zd), (ins ZPR64:$Zn, ZPR64:$Zm),
asm, "\t$Zd, $Zn, $Zm",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = opc{1-0};
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-10} = 0b001100;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
// Defines a single instruction (NAME) from class sve_int_bin_cons_log, plus
// three aliases with the same mnemonic and operand string that accept ZPR8,
// ZPR16 and ZPR32 operands and map them onto that one instruction.
multiclass sve_int_bin_cons_log<bits<2> opc, string asm> {
def NAME : sve_int_bin_cons_log<opc, asm>;
def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
(!cast<Instruction>(NAME) ZPR8:$Zd, ZPR8:$Zn, ZPR8:$Zm), 1>;
def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
(!cast<Instruction>(NAME) ZPR16:$Zd, ZPR16:$Zn, ZPR16:$Zm), 1>;
def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
(!cast<Instruction>(NAME) ZPR32:$Zd, ZPR32:$Zn, ZPR32:$Zm), 1>;
}
// Encoding for a destructive three-source instruction
// "$Zdn, $_Zdn, $Zm, $Zk" defined only on ZPR64 operands, with $Zdn tied to
// $_Zdn. The 3-bit opc is split between Inst{23-22} (opc{2-1}) and Inst{10}
// (opc{0}); $Zk occupies Inst{9-5}.
class sve2_int_bitwise_ternary_op_d<bits<3> opc, string asm>
: I<(outs ZPR64:$Zdn), (ins ZPR64:$_Zdn, ZPR64:$Zm, ZPR64:$Zk),
asm, "\t$Zdn, $_Zdn, $Zm, $Zk",
"",
[]>, Sched<[]> {
bits<5> Zdn;
bits<5> Zk;
bits<5> Zm;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = opc{2-1};
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-11} = 0b00111;
let Inst{10} = opc{0};
let Inst{9-5} = Zk;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = Destructive;
let ElementSize = ElementSizeNone;
}
// Defines a single instruction (NAME) from class
// sve2_int_bitwise_ternary_op_d, plus three aliases that accept ZPR8, ZPR16
// and ZPR32 operands. The alias strings repeat $Zdn for the tied source, so
// the alias operand lists carry only $Zdn, $Zm and $Zk.
multiclass sve2_int_bitwise_ternary_op<bits<3> opc, string asm> {
def NAME : sve2_int_bitwise_ternary_op_d<opc, asm>;
def : InstAlias<asm # "\t$Zdn, $Zdn, $Zm, $Zk",
(!cast<Instruction>(NAME) ZPR8:$Zdn, ZPR8:$Zm, ZPR8:$Zk), 1>;
def : InstAlias<asm # "\t$Zdn, $Zdn, $Zm, $Zk",
(!cast<Instruction>(NAME) ZPR16:$Zdn, ZPR16:$Zm, ZPR16:$Zk), 1>;
def : InstAlias<asm # "\t$Zdn, $Zdn, $Zm, $Zk",
(!cast<Instruction>(NAME) ZPR32:$Zdn, ZPR32:$Zm, ZPR32:$Zk), 1>;
}
// Encoding for a destructive instruction "$Zdn, $_Zdn, $Zm, $imm" with $Zdn
// tied to $_Zdn. The 4-bit tsz8_64 value is split between Inst{23-22} and
// Inst{20-19}; the low three immediate bits go to Inst{18-16}.
class sve2_int_rotate_right_imm<bits<4> tsz8_64, string asm,
ZPRRegOp zprty, Operand immtype>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm, immtype:$imm),
asm, "\t$Zdn, $_Zdn, $Zm, $imm",
"",
[]>, Sched<[]> {
bits<5> Zdn;
bits<5> Zm;
bits<6> imm;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = tsz8_64{3-2};
let Inst{21} = 0b1;
let Inst{20-19} = tsz8_64{1-0};
let Inst{18-16} = imm{2-0}; // imm3
let Inst{15-10} = 0b001101;
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = Destructive;
let ElementSize = ElementSizeNone;
}
// Instantiates class sve2_int_rotate_right_imm for all four register types
// with vecshiftR* immediates. The '?' bits in each tsz8_64 pattern are left
// unset by the class argument and then overwritten with the upper immediate
// bits (imm{3}, imm{4-3}, and imm{5} plus imm{4-3}).
multiclass sve2_int_rotate_right_imm<string asm> {
def _B : sve2_int_rotate_right_imm<{0,0,0,1}, asm, ZPR8, vecshiftR8>;
def _H : sve2_int_rotate_right_imm<{0,0,1,?}, asm, ZPR16, vecshiftR16> {
let Inst{19} = imm{3};
}
def _S : sve2_int_rotate_right_imm<{0,1,?,?}, asm, ZPR32, vecshiftR32> {
let Inst{20-19} = imm{4-3};
}
def _D : sve2_int_rotate_right_imm<{1,?,?,?}, asm, ZPR64, vecshiftR64> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
}
}
//===----------------------------------------------------------------------===//
// SVE Integer Wide Immediate - Predicated Group
//===----------------------------------------------------------------------===//
// Encoding for a predicated "$Zd, $Pg/m, $imm8" instruction taking a
// floating-point immediate operand type, with $Zd tied to the input $_Zd.
// The predicate is a 4-bit PPRAny field at Inst{19-16}; imm8 occupies
// Inst{12-5}.
class sve_int_dup_fpimm_pred<bits<2> sz, Operand fpimmtype,
string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPRAny:$Pg, fpimmtype:$imm8),
asm, "\t$Zd, $Pg/m, $imm8",
"",
[]>, Sched<[]> {
bits<4> Pg;
bits<5> Zd;
bits<8> imm8;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz;
let Inst{21-20} = 0b01;
let Inst{19-16} = Pg;
let Inst{15-13} = 0b110;
let Inst{12-5} = imm8;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
let DestructiveInstType = Destructive;
let ElementSize = zprty.ElementSize;
}
// Instantiates class sve_int_dup_fpimm_pred for _H, _S and _D (fpimm16/32/64)
// and defines an "fmov $Zd, $Pg/m, $imm8" alias for each variant.
multiclass sve_int_dup_fpimm_pred<string asm> {
def _H : sve_int_dup_fpimm_pred<0b01, fpimm16, asm, ZPR16>;
def _S : sve_int_dup_fpimm_pred<0b10, fpimm32, asm, ZPR32>;
def _D : sve_int_dup_fpimm_pred<0b11, fpimm64, asm, ZPR64>;
def : InstAlias<"fmov $Zd, $Pg/m, $imm8",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, fpimm16:$imm8), 1>;
def : InstAlias<"fmov $Zd, $Pg/m, $imm8",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, fpimm32:$imm8), 1>;
def : InstAlias<"fmov $Zd, $Pg/m, $imm8",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, fpimm64:$imm8), 1>;
}
// Encoding for a predicated "$Zd, $Pg<qual>, $imm" instruction. The input
// operand list (iops) and the predicate qualifier text (pred_qual) are
// supplied by the instantiating multiclass, so this class sets no Constraints
// itself. m is a single bit at Inst{14}; the 9-bit immediate is split into
// imm{8} at Inst{13} and imm{7-0} at Inst{12-5}.
class sve_int_dup_imm_pred<bits<2> sz8_64, bit m, string asm,
ZPRRegOp zprty, string pred_qual, dag iops>
: I<(outs zprty:$Zd), iops,
asm, "\t$Zd, $Pg"#pred_qual#", $imm",
"", []>, Sched<[]> {
bits<5> Zd;
bits<4> Pg;
bits<9> imm;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-20} = 0b01;
let Inst{19-16} = Pg;
let Inst{15} = 0b0;
let Inst{14} = m;
let Inst{13} = imm{8}; // sh
let Inst{12-5} = imm{7-0}; // imm8
let Inst{4-0} = Zd;
let DestructiveInstType = Destructive;
let ElementSize = zprty.ElementSize;
}
// Instantiates the "/m" variants of class sve_int_dup_imm_pred (m = 1) for
// all four register types. Each variant takes an extra $_Zd input that the
// enclosing let ties to $Zd. Also defines "mov $Zd, $Pg/m, $imm" aliases, and
// "fmov $Zd, $Pg/m, #0.0" aliases for _H, _S and _D that pass the literal
// operands 0, 0 as the immediate. The fmov aliases pass 0 as the final
// InstAlias argument where the mov aliases pass 1.
multiclass sve_int_dup_imm_pred_merge<string asm> {
let Constraints = "$Zd = $_Zd" in {
def _B : sve_int_dup_imm_pred<0b00, 1, asm, ZPR8, "/m", (ins ZPR8:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm)>;
def _H : sve_int_dup_imm_pred<0b01, 1, asm, ZPR16, "/m", (ins ZPR16:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm)>;
def _S : sve_int_dup_imm_pred<0b10, 1, asm, ZPR32, "/m", (ins ZPR32:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm)>;
def _D : sve_int_dup_imm_pred<0b11, 1, asm, ZPR64, "/m", (ins ZPR64:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm)>;
}
def : InstAlias<"mov $Zd, $Pg/m, $imm",
(!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $imm",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $imm",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $imm",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm), 1>;
def : InstAlias<"fmov $Zd, $Pg/m, #0.0",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, 0, 0), 0>;
def : InstAlias<"fmov $Zd, $Pg/m, #0.0",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, 0, 0), 0>;
def : InstAlias<"fmov $Zd, $Pg/m, #0.0",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, 0, 0), 0>;
}
// Zeroing ("/z", m = 0) forms of sve_int_dup_imm_pred for _B/_H/_S/_D.
// Unlike the merging forms there is no tied $_Zd input. Adds a "mov" alias
// per element size.
multiclass sve_int_dup_imm_pred_zero<string asm> {
def _B : sve_int_dup_imm_pred<0b00, 0, asm, ZPR8, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm)>;
def _H : sve_int_dup_imm_pred<0b01, 0, asm, ZPR16, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm)>;
def _S : sve_int_dup_imm_pred<0b10, 0, asm, ZPR32, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm)>;
def _D : sve_int_dup_imm_pred<0b11, 0, asm, ZPR64, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm)>;
def : InstAlias<"mov $Zd, $Pg/z, $imm",
(!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm), 1>;
def : InstAlias<"mov $Zd, $Pg/z, $imm",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm), 1>;
def : InstAlias<"mov $Zd, $Pg/z, $imm",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm), 1>;
def : InstAlias<"mov $Zd, $Pg/z, $imm",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm), 1>;
}
//===----------------------------------------------------------------------===//
// SVE Integer Compare - Vectors Group
//===----------------------------------------------------------------------===//
// Encoding class for a predicated vector-vs-vector integer compare that writes
// a predicate register $Pd. $Pg is a 3-bit governing predicate printed with
// "/z". cmp_1 is placed in bit 14; opc is scattered over bits 15, 13 and 4.
// zprty1 and zprty2 may differ so the second source can use another element
// width. The instruction lists NZCV in Defs.
class sve_int_cmp<bit cmp_1, bits<2> sz8_64, bits<3> opc, string asm,
PPRRegOp pprty, ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty1:$Zn, zprty2:$Zm),
asm, "\t$Pd, $Pg/z, $Zn, $Zm",
"",
[]>, Sched<[]> {
bits<4> Pd;
bits<3> Pg;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b00100100;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b0;
let Inst{20-16} = Zm;
let Inst{15} = opc{2};
let Inst{14} = cmp_1;
let Inst{13} = opc{1};
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4} = opc{0};
let Inst{3-0} = Pd;
let Defs = [NZCV];
}
// Same-width compares with cmp_1 = 0: both vector sources use the element
// size of the variant (_B/_H/_S/_D).
multiclass sve_int_cmp_0<bits<3> opc, string asm> {
def _B : sve_int_cmp<0b0, 0b00, opc, asm, PPR8, ZPR8, ZPR8>;
def _H : sve_int_cmp<0b0, 0b01, opc, asm, PPR16, ZPR16, ZPR16>;
def _S : sve_int_cmp<0b0, 0b10, opc, asm, PPR32, ZPR32, ZPR32>;
def _D : sve_int_cmp<0b0, 0b11, opc, asm, PPR64, ZPR64, ZPR64>;
}
// Wide compares with cmp_1 = 0: the second source is always ZPR64, so only
// _B/_H/_S variants are defined.
multiclass sve_int_cmp_0_wide<bits<3> opc, string asm> {
def _B : sve_int_cmp<0b0, 0b00, opc, asm, PPR8, ZPR8, ZPR64>;
def _H : sve_int_cmp<0b0, 0b01, opc, asm, PPR16, ZPR16, ZPR64>;
def _S : sve_int_cmp<0b0, 0b10, opc, asm, PPR32, ZPR32, ZPR64>;
}
// Wide compares with cmp_1 = 1: identical operand shapes to
// sve_int_cmp_0_wide (second source is ZPR64), differing only in bit 14.
multiclass sve_int_cmp_1_wide<bits<3> opc, string asm> {
def _B : sve_int_cmp<0b1, 0b00, opc, asm, PPR8, ZPR8, ZPR64>;
def _H : sve_int_cmp<0b1, 0b01, opc, asm, PPR16, ZPR16, ZPR64>;
def _S : sve_int_cmp<0b1, 0b10, opc, asm, PPR32, ZPR32, ZPR64>;
}
//===----------------------------------------------------------------------===//
// SVE Integer Compare - Signed Immediate Group
//===----------------------------------------------------------------------===//
// Encoding class for a predicated compare of a vector against a 5-bit
// immediate (bits 20-16), producing predicate $Pd. opc is scattered over bits
// 15, 13 and 4, with bit 14 fixed to 0. The instruction lists NZCV in Defs.
class sve_int_scmp_vi<bits<2> sz8_64, bits<3> opc, string asm, PPRRegOp pprty,
ZPRRegOp zprty,
Operand immtype>
: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty:$Zn, immtype:$imm5),
asm, "\t$Pd, $Pg/z, $Zn, $imm5",
"",
[]>, Sched<[]> {
bits<4> Pd;
bits<3> Pg;
bits<5> Zn;
bits<5> imm5;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b0;
let Inst{20-16} = imm5;
let Inst{15} = opc{2};
let Inst{14} = 0b0;
let Inst{13} = opc{1};
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4} = opc{0};
let Inst{3-0} = Pd;
let Defs = [NZCV];
}
// Per-element-size instantiations of the signed-immediate compare. _B/_H/_S
// use the simm5_32b operand; _D uses simm5_64b.
multiclass sve_int_scmp_vi<bits<3> opc, string asm> {
def _B : sve_int_scmp_vi<0b00, opc, asm, PPR8, ZPR8, simm5_32b>;
def _H : sve_int_scmp_vi<0b01, opc, asm, PPR16, ZPR16, simm5_32b>;
def _S : sve_int_scmp_vi<0b10, opc, asm, PPR32, ZPR32, simm5_32b>;
def _D : sve_int_scmp_vi<0b11, opc, asm, PPR64, ZPR64, simm5_64b>;
}
//===----------------------------------------------------------------------===//
// SVE Integer Compare - Unsigned Immediate Group
//===----------------------------------------------------------------------===//
// Encoding class for a predicated compare of a vector against a 7-bit
// immediate (bits 20-14), producing predicate $Pd. The 2-bit opc is placed in
// bits 13 and 4. The instruction lists NZCV in Defs.
class sve_int_ucmp_vi<bits<2> sz8_64, bits<2> opc, string asm, PPRRegOp pprty,
ZPRRegOp zprty, Operand immtype>
: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty:$Zn, immtype:$imm7),
asm, "\t$Pd, $Pg/z, $Zn, $imm7",
"",
[]>, Sched<[]> {
bits<4> Pd;
bits<3> Pg;
bits<5> Zn;
bits<7> imm7;
let Inst{31-24} = 0b00100100;
let Inst{23-22} = sz8_64;
let Inst{21} = 1;
let Inst{20-14} = imm7;
let Inst{13} = opc{1};
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4} = opc{0};
let Inst{3-0} = Pd;
let Defs = [NZCV];
}
// Per-element-size instantiations of the unsigned-immediate compare; every
// variant uses the imm0_127 operand.
multiclass sve_int_ucmp_vi<bits<2> opc, string asm> {
def _B : sve_int_ucmp_vi<0b00, opc, asm, PPR8, ZPR8, imm0_127>;
def _H : sve_int_ucmp_vi<0b01, opc, asm, PPR16, ZPR16, imm0_127>;
def _S : sve_int_ucmp_vi<0b10, opc, asm, PPR32, ZPR32, imm0_127>;
def _D : sve_int_ucmp_vi<0b11, opc, asm, PPR64, ZPR64, imm0_127>;
}
//===----------------------------------------------------------------------===//
// SVE Integer Compare - Scalars Group
//===----------------------------------------------------------------------===//
// Encoding class for a scalar-vs-scalar compare with no register outputs:
// the only result is NZCV (listed in Defs). sz goes to bit 22, opc to bit 4,
// and both sources come from register class rt.
class sve_int_cterm<bit sz, bit opc, string asm, RegisterClass rt>
: I<(outs), (ins rt:$Rn, rt:$Rm),
asm, "\t$Rn, $Rm",
"",
[]>, Sched<[]> {
bits<5> Rm;
bits<5> Rn;
let Inst{31-23} = 0b001001011;
let Inst{22} = sz;
let Inst{21} = 0b1;
let Inst{20-16} = Rm;
let Inst{15-10} = 0b001000;
let Inst{9-5} = Rn;
let Inst{4} = opc;
let Inst{3-0} = 0b0000;
let Defs = [NZCV];
}
// Encoding class for an instruction that builds predicate $Pd from two scalar
// registers of class gprty. The 4-bit opc is split: opc{3-1} goes to bits
// 12-10 and opc{0} to bit 4. The instruction lists NZCV in Defs.
class sve_int_while_rr<bits<2> sz8_64, bits<4> opc, string asm,
RegisterClass gprty, PPRRegOp pprty>
: I<(outs pprty:$Pd), (ins gprty:$Rn, gprty:$Rm),
asm, "\t$Pd, $Rn, $Rm",
"", []>, Sched<[]> {
bits<4> Pd;
bits<5> Rm;
bits<5> Rn;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = Rm;
let Inst{15-13} = 0b000;
let Inst{12-10} = opc{3-1};
let Inst{9-5} = Rn;
let Inst{4} = opc{0};
let Inst{3-0} = Pd;
let Defs = [NZCV];
}
// GPR32-source variants of sve_int_while_rr: the 3-bit opc is prefixed with a
// 0 bit to form the class's 4-bit opc (so encoding bit 12 is 0).
multiclass sve_int_while4_rr<bits<3> opc, string asm> {
def _B : sve_int_while_rr<0b00, { 0, opc }, asm, GPR32, PPR8>;
def _H : sve_int_while_rr<0b01, { 0, opc }, asm, GPR32, PPR16>;
def _S : sve_int_while_rr<0b10, { 0, opc }, asm, GPR32, PPR32>;
def _D : sve_int_while_rr<0b11, { 0, opc }, asm, GPR32, PPR64>;
}
// GPR64-source variants of sve_int_while_rr: the 3-bit opc is prefixed with a
// 1 bit to form the class's 4-bit opc (so encoding bit 12 is 1).
multiclass sve_int_while8_rr<bits<3> opc, string asm> {
def _B : sve_int_while_rr<0b00, { 1, opc }, asm, GPR64, PPR8>;
def _H : sve_int_while_rr<0b01, { 1, opc }, asm, GPR64, PPR16>;
def _S : sve_int_while_rr<0b10, { 1, opc }, asm, GPR64, PPR32>;
def _D : sve_int_while_rr<0b11, { 1, opc }, asm, GPR64, PPR64>;
}
// Encoding class for an SVE2 instruction that builds predicate $Pd from two
// GPR64 sources. The single rw bit is placed in bit 4; bits 15-10 are fixed
// at 0b001100. The instruction lists NZCV in Defs.
class sve2_int_while_rr<bits<2> sz8_64, bits<1> rw, string asm,
PPRRegOp pprty>
: I<(outs pprty:$Pd), (ins GPR64:$Rn, GPR64:$Rm),
asm, "\t$Pd, $Rn, $Rm",
"", []>, Sched<[]> {
bits<4> Pd;
bits<5> Rm;
bits<5> Rn;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = Rm;
let Inst{15-10} = 0b001100;
let Inst{9-5} = Rn;
let Inst{4} = rw;
let Inst{3-0} = Pd;
let Defs = [NZCV];
}
// Per-element-size (_B/_H/_S/_D) instantiations of sve2_int_while_rr.
multiclass sve2_int_while_rr<bits<1> rw, string asm> {
def _B : sve2_int_while_rr<0b00, rw, asm, PPR8>;
def _H : sve2_int_while_rr<0b01, rw, asm, PPR16>;
def _S : sve2_int_while_rr<0b10, rw, asm, PPR32>;
def _D : sve2_int_while_rr<0b11, rw, asm, PPR64>;
}
//===----------------------------------------------------------------------===//
// SVE Floating Point Fast Reduction Group
//===----------------------------------------------------------------------===//
// Encoding class for a predicated floating-point reduction: reads vector $Zn
// under governing predicate $Pg and writes a scalar register $Vd of class
// dstRegClass. opc occupies bits 18-16.
class sve_fp_fast_red<bits<2> sz, bits<3> opc, string asm,
ZPRRegOp zprty, RegisterClass dstRegClass>
: I<(outs dstRegClass:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn),
asm, "\t$Vd, $Pg, $Zn",
"",
[]>, Sched<[]> {
bits<5> Zn;
bits<5> Vd;
bits<3> Pg;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = sz;
let Inst{21-19} = 0b000;
let Inst{18-16} = opc;
let Inst{15-13} = 0b001;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Vd;
}
// _H/_S/_D instantiations pairing ZPR16/32/64 sources with FPR16/32/64
// scalar destinations. No byte-element variant is defined.
multiclass sve_fp_fast_red<bits<3> opc, string asm> {
def _H : sve_fp_fast_red<0b01, opc, asm, ZPR16, FPR16>;
def _S : sve_fp_fast_red<0b10, opc, asm, ZPR32, FPR32>;
def _D : sve_fp_fast_red<0b11, opc, asm, ZPR64, FPR64>;
}
//===----------------------------------------------------------------------===//
// SVE Floating Point Accumulating Reduction Group
//===----------------------------------------------------------------------===//
// Encoding class for a predicated floating-point reduction that accumulates
// into a scalar: the scalar destination $Vdn is tied to the scalar input
// $_Vdn (see Constraints), and the vector source $Zm is encoded in bits 9-5.
class sve_fp_2op_p_vd<bits<2> sz, bits<3> opc, string asm,
ZPRRegOp zprty, RegisterClass dstRegClass>
: I<(outs dstRegClass:$Vdn), (ins PPR3bAny:$Pg, dstRegClass:$_Vdn, zprty:$Zm),
asm, "\t$Vdn, $Pg, $_Vdn, $Zm",
"",
[]>,
Sched<[]> {
bits<3> Pg;
bits<5> Vdn;
bits<5> Zm;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = sz;
let Inst{21-19} = 0b011;
let Inst{18-16} = opc;
let Inst{15-13} = 0b001;
let Inst{12-10} = Pg;
let Inst{9-5} = Zm;
let Inst{4-0} = Vdn;
let Constraints = "$Vdn = $_Vdn";
}
// _H/_S/_D instantiations pairing ZPR16/32/64 sources with FPR16/32/64
// scalar accumulators.
multiclass sve_fp_2op_p_vd<bits<3> opc, string asm> {
def _H : sve_fp_2op_p_vd<0b01, opc, asm, ZPR16, FPR16>;
def _S : sve_fp_2op_p_vd<0b10, opc, asm, ZPR32, FPR32>;
def _D : sve_fp_2op_p_vd<0b11, opc, asm, ZPR64, FPR64>;
}
//===----------------------------------------------------------------------===//
// SVE Floating Point Compare - Vectors Group
//===----------------------------------------------------------------------===//
// Encoding class for a predicated vector-vs-vector floating-point compare
// producing predicate $Pd. opc is scattered over bits 15, 13 and 4, with bit
// 14 fixed to 1. Unlike the integer compare classes in this file, no Defs
// list is set here.
class sve_fp_3op_p_pd<bits<2> sz, bits<3> opc, string asm, PPRRegOp pprty,
ZPRRegOp zprty>
: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty:$Zn, zprty:$Zm),
asm, "\t$Pd, $Pg/z, $Zn, $Zm",
"",
[]>, Sched<[]> {
bits<4> Pd;
bits<3> Pg;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = sz;
let Inst{21} = 0b0;
let Inst{20-16} = Zm;
let Inst{15} = opc{2};
let Inst{14} = 0b1;
let Inst{13} = opc{1};
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4} = opc{0};
let Inst{3-0} = Pd;
}
// _H/_S/_D instantiations of the vector-vs-vector floating-point compare.
multiclass sve_fp_3op_p_pd<bits<3> opc, string asm> {
def _H : sve_fp_3op_p_pd<0b01, opc, asm, PPR16, ZPR16>;
def _S : sve_fp_3op_p_pd<0b10, opc, asm, PPR32, ZPR32>;
def _D : sve_fp_3op_p_pd<0b11, opc, asm, PPR64, ZPR64>;
}
//===----------------------------------------------------------------------===//
// SVE Floating Point Compare - with Zero Group
//===----------------------------------------------------------------------===//
// Encoding class for a predicated floating-point compare against zero: the
// "#0.0" is a fixed part of the asm string, not an operand. opc{2-1} goes to
// bits 17-16 and opc{0} to bit 4.
class sve_fp_2op_p_pd<bits<2> sz, bits<3> opc, string asm, PPRRegOp pprty,
ZPRRegOp zprty>
: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty:$Zn),
asm, "\t$Pd, $Pg/z, $Zn, #0.0",
"",
[]>, Sched<[]> {
bits<4> Pd;
bits<3> Pg;
bits<5> Zn;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = sz;
let Inst{21-18} = 0b0100;
let Inst{17-16} = opc{2-1};
let Inst{15-13} = 0b001;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4} = opc{0};
let Inst{3-0} = Pd;
}
// _H/_S/_D instantiations of the floating-point compare-with-zero.
multiclass sve_fp_2op_p_pd<bits<3> opc, string asm> {
def _H : sve_fp_2op_p_pd<0b01, opc, asm, PPR16, ZPR16>;
def _S : sve_fp_2op_p_pd<0b10, opc, asm, PPR32, ZPR32>;
def _D : sve_fp_2op_p_pd<0b11, opc, asm, PPR64, ZPR64>;
}
//===----------------------------------------------------------------------===//
// SVE Index Generation Group
//===----------------------------------------------------------------------===//
// Index generation, immediate/immediate form: both sources are 5-bit
// immediates of type imm_ty. $imm5 is encoded in bits 9-5 and $imm5b in bits
// 20-16; bits 15-10 are fixed at 0b010000.
class sve_int_index_ii<bits<2> sz8_64, string asm, ZPRRegOp zprty,
Operand imm_ty>
: I<(outs zprty:$Zd), (ins imm_ty:$imm5, imm_ty:$imm5b),
asm, "\t$Zd, $imm5, $imm5b",
"", []>, Sched<[]> {
bits<5> Zd;
bits<5> imm5;
bits<5> imm5b;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = imm5b;
let Inst{15-10} = 0b010000;
let Inst{9-5} = imm5;
let Inst{4-0} = Zd;
}
// Per-element-size instantiations; _B/_H/_S use simm5_32b and _D uses
// simm5_64b for both immediates.
multiclass sve_int_index_ii<string asm> {
def _B : sve_int_index_ii<0b00, asm, ZPR8, simm5_32b>;
def _H : sve_int_index_ii<0b01, asm, ZPR16, simm5_32b>;
def _S : sve_int_index_ii<0b10, asm, ZPR32, simm5_32b>;
def _D : sve_int_index_ii<0b11, asm, ZPR64, simm5_64b>;
}
// Index generation, immediate/register form: first source is a 5-bit
// immediate (bits 9-5), second is scalar register $Rm (bits 20-16).
// Bits 15-10 are fixed at 0b010010.
class sve_int_index_ir<bits<2> sz8_64, string asm, ZPRRegOp zprty,
RegisterClass srcRegType, Operand imm_ty>
: I<(outs zprty:$Zd), (ins imm_ty:$imm5, srcRegType:$Rm),
asm, "\t$Zd, $imm5, $Rm",
"", []>, Sched<[]> {
bits<5> Rm;
bits<5> Zd;
bits<5> imm5;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = Rm;
let Inst{15-10} = 0b010010;
let Inst{9-5} = imm5;
let Inst{4-0} = Zd;
}
// Per-element-size instantiations; _B/_H/_S take a GPR32 with simm5_32b,
// _D takes a GPR64 with simm5_64b.
multiclass sve_int_index_ir<string asm> {
def _B : sve_int_index_ir<0b00, asm, ZPR8, GPR32, simm5_32b>;
def _H : sve_int_index_ir<0b01, asm, ZPR16, GPR32, simm5_32b>;
def _S : sve_int_index_ir<0b10, asm, ZPR32, GPR32, simm5_32b>;
def _D : sve_int_index_ir<0b11, asm, ZPR64, GPR64, simm5_64b>;
}
// Index generation, register/immediate form: first source is scalar register
// $Rn (bits 9-5), second is a 5-bit immediate (bits 20-16).
// Bits 15-10 are fixed at 0b010001.
class sve_int_index_ri<bits<2> sz8_64, string asm, ZPRRegOp zprty,
RegisterClass srcRegType, Operand imm_ty>
: I<(outs zprty:$Zd), (ins srcRegType:$Rn, imm_ty:$imm5),
asm, "\t$Zd, $Rn, $imm5",
"", []>, Sched<[]> {
bits<5> Rn;
bits<5> Zd;
bits<5> imm5;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = imm5;
let Inst{15-10} = 0b010001;
let Inst{9-5} = Rn;
let Inst{4-0} = Zd;
}
// Per-element-size instantiations; _B/_H/_S take a GPR32 with simm5_32b,
// _D takes a GPR64 with simm5_64b.
multiclass sve_int_index_ri<string asm> {
def _B : sve_int_index_ri<0b00, asm, ZPR8, GPR32, simm5_32b>;
def _H : sve_int_index_ri<0b01, asm, ZPR16, GPR32, simm5_32b>;
def _S : sve_int_index_ri<0b10, asm, ZPR32, GPR32, simm5_32b>;
def _D : sve_int_index_ri<0b11, asm, ZPR64, GPR64, simm5_64b>;
}
// Index generation, register/register form: both sources are scalar
// registers of class srcRegType ($Rn in bits 9-5, $Rm in bits 20-16).
// Bits 15-10 are fixed at 0b010011.
class sve_int_index_rr<bits<2> sz8_64, string asm, ZPRRegOp zprty,
RegisterClass srcRegType>
: I<(outs zprty:$Zd), (ins srcRegType:$Rn, srcRegType:$Rm),
asm, "\t$Zd, $Rn, $Rm",
"", []>, Sched<[]> {
bits<5> Zd;
bits<5> Rm;
bits<5> Rn;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = Rm;
let Inst{15-10} = 0b010011;
let Inst{9-5} = Rn;
let Inst{4-0} = Zd;
}
// Per-element-size instantiations; _B/_H/_S take GPR32 sources and _D takes
// GPR64 sources.
multiclass sve_int_index_rr<string asm> {
def _B : sve_int_index_rr<0b00, asm, ZPR8, GPR32>;
def _H : sve_int_index_rr<0b01, asm, ZPR16, GPR32>;
def _S : sve_int_index_rr<0b10, asm, ZPR32, GPR32>;
def _D : sve_int_index_rr<0b11, asm, ZPR64, GPR64>;
}
//
//===----------------------------------------------------------------------===//
// SVE Bitwise Shift - Predicated Group
//===----------------------------------------------------------------------===//
// Encoding class for a predicated, merging ("/m") shift of a vector by an
// immediate. The destination $Zdn is tied to the $_Zdn input. The 4-bit
// tsz8_64 field is split across bits 23-22 and 9-8, and imm{2-0} goes to bits
// 7-5. Instantiations may pass '?' bits in tsz8_64 and override those
// positions with higher bits of imm.
class sve_int_bin_pred_shift_imm<bits<4> tsz8_64, bits<4> opc, string asm,
ZPRRegOp zprty, Operand immtype,
ElementSizeEnum size>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, immtype:$imm),
asm, "\t$Zdn, $Pg/m, $_Zdn, $imm",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zdn;
bits<6> imm;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = tsz8_64{3-2};
let Inst{21-20} = 0b00;
let Inst{19-16} = opc;
let Inst{15-13} = 0b100;
let Inst{12-10} = Pg;
let Inst{9-8} = tsz8_64{1-0};
let Inst{7-5} = imm{2-0}; // imm3
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = Destructive;
let ElementSize = size;
}
// Left-shift instantiations using the vecshiftL8/16/32/64 operands. The
// leading fixed bits of tsz8_64 select the element size; each '?' position is
// overridden with an upper bit of imm (imm{3} for _H, imm{4-3} for _S, and
// imm{5} plus imm{4-3} for _D).
multiclass sve_int_bin_pred_shift_imm_left<bits<4> opc, string asm> {
def _B : sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8,
ElementSizeB>;
def _H : sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16,
ElementSizeH> {
let Inst{8} = imm{3};
}
def _S : sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32,
ElementSizeS> {
let Inst{9-8} = imm{4-3};
}
def _D : sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64,
ElementSizeD> {
let Inst{22} = imm{5};
let Inst{9-8} = imm{4-3};
}
}
// Right-shift instantiations using the vecshiftR8/16/32/64 operands. The bit
// layout is the same as in sve_int_bin_pred_shift_imm_left: '?' positions of
// tsz8_64 are overridden with the upper bits of imm.
multiclass sve_int_bin_pred_shift_imm_right<bits<4> opc, string asm> {
def _B : sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8,
ElementSizeB>;
def _H : sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16,
ElementSizeH> {
let Inst{8} = imm{3};
}
def _S : sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32,
ElementSizeS> {
let Inst{9-8} = imm{4-3};
}
def _D : sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64,
ElementSizeD> {
let Inst{22} = imm{5};
let Inst{9-8} = imm{4-3};
}
}
// Encoding class for a predicated, merging ("/m") shift of a vector by
// another vector $Zm. The destination $Zdn is tied to $_Zdn. The wide bit is
// placed in bit 19 and opc in bits 18-16; zprty2 lets the shift-amount vector
// use a different element width from the data vector.
class sve_int_bin_pred_shift<bits<2> sz8_64, bit wide, bits<3> opc,
string asm, ZPRRegOp zprty, ZPRRegOp zprty2>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty2:$Zm),
asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zdn;
bits<5> Zm;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_64;
let Inst{21-20} = 0b01;
let Inst{19} = wide;
let Inst{18-16} = opc;
let Inst{15-13} = 0b100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = Destructive;
let ElementSize = zprty.ElementSize;
}
// Same-width variants (wide = 0): the shift-amount vector has the same
// element size as the data vector.
multiclass sve_int_bin_pred_shift<bits<3> opc, string asm> {
def _B : sve_int_bin_pred_shift<0b00, 0b0, opc, asm, ZPR8, ZPR8>;
def _H : sve_int_bin_pred_shift<0b01, 0b0, opc, asm, ZPR16, ZPR16>;
def _S : sve_int_bin_pred_shift<0b10, 0b0, opc, asm, ZPR32, ZPR32>;
def _D : sve_int_bin_pred_shift<0b11, 0b0, opc, asm, ZPR64, ZPR64>;
}
// Wide variants (wide = 1): the shift-amount vector is always ZPR64, so only
// _B/_H/_S are defined.
multiclass sve_int_bin_pred_shift_wide<bits<3> opc, string asm> {
def _B : sve_int_bin_pred_shift<0b00, 0b1, opc, asm, ZPR8, ZPR64>;
def _H : sve_int_bin_pred_shift<0b01, 0b1, opc, asm, ZPR16, ZPR64>;
def _S : sve_int_bin_pred_shift<0b10, 0b1, opc, asm, ZPR32, ZPR64>;
}
//===----------------------------------------------------------------------===//
// SVE Shift - Unpredicated Group
//===----------------------------------------------------------------------===//
// Encoding class for an unpredicated shift of vector $Zn by a ZPR64 vector
// $Zm, writing a separate destination $Zd (no tied operand). opc occupies
// bits 11-10.
class sve_int_bin_cons_shift_wide<bits<2> sz8_64, bits<2> opc, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn, ZPR64:$Zm),
asm, "\t$Zd, $Zn, $Zm",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-12} = 0b1000;
let Inst{11-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
// _B/_H/_S instantiations of the unpredicated wide shift; there is no _D
// variant.
multiclass sve_int_bin_cons_shift_wide<bits<2> opc, string asm> {
def _B : sve_int_bin_cons_shift_wide<0b00, opc, asm, ZPR8>;
def _H : sve_int_bin_cons_shift_wide<0b01, opc, asm, ZPR16>;
def _S : sve_int_bin_cons_shift_wide<0b10, opc, asm, ZPR32>;
}
// Encoding class for an unpredicated shift of vector $Zn by an immediate,
// writing a separate destination $Zd. tsz8_64 is split across bits 23-22 and
// 20-19, and imm{2-0} goes to bits 18-16. Instantiations may pass '?' bits in
// tsz8_64 and override those positions with higher bits of imm.
class sve_int_bin_cons_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
ZPRRegOp zprty, Operand immtype>
: I<(outs zprty:$Zd), (ins zprty:$Zn, immtype:$imm),
asm, "\t$Zd, $Zn, $imm",
"", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<6> imm;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = tsz8_64{3-2};
let Inst{21} = 0b1;
let Inst{20-19} = tsz8_64{1-0};
let Inst{18-16} = imm{2-0}; // imm3
let Inst{15-12} = 0b1001;
let Inst{11-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
// Left-shift instantiations using the vecshiftL8/16/32/64 operands. Each '?'
// position of tsz8_64 is overridden with an upper bit of imm (imm{3} for _H,
// imm{4-3} for _S, and imm{5} plus imm{4-3} for _D).
multiclass sve_int_bin_cons_shift_imm_left<bits<2> opc, string asm> {
def _B : sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
let Inst{19} = imm{3};
}
def _S : sve_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
let Inst{20-19} = imm{4-3};
}
def _D : sve_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
}
}
// Right-shift instantiations using the vecshiftR8/16/32/64 operands; the bit
// layout matches sve_int_bin_cons_shift_imm_left.
multiclass sve_int_bin_cons_shift_imm_right<bits<2> opc, string asm> {
def _B : sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
let Inst{19} = imm{3};
}
def _S : sve_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
let Inst{20-19} = imm{4-3};
}
def _D : sve_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
}
}
//===----------------------------------------------------------------------===//
// SVE Memory - Store Group
//===----------------------------------------------------------------------===//
// Encoding class for a predicated store addressed as scalar base $Rn plus a
// 4-bit immediate printed with "mul vl". msz and esz occupy bits 24-23 and
// 22-21; bit 20 is 0. No outputs; mayStore is set.
class sve_mem_cst_si<bits<2> msz, bits<2> esz, string asm,
RegisterOperand VecList>
: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4),
asm, "\t$Zt, $Pg, [$Rn, $imm4, mul vl]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zt;
bits<4> imm4;
let Inst{31-25} = 0b1110010;
let Inst{24-23} = msz;
let Inst{22-21} = esz;
let Inst{20} = 0;
let Inst{19-16} = imm4;
let Inst{15-13} = 0b111;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayStore = 1;
}
// Defines the store under NAME with the list operand (listty) and adds three
// aliases: plain-register (zprty) spellings with and without the immediate,
// and a list-operand spelling that omits a zero immediate. Only the last one
// passes 1 as the final InstAlias argument.
multiclass sve_mem_cst_si<bits<2> msz, bits<2> esz, string asm,
RegisterOperand listty, ZPRRegOp zprty>
{
def NAME : sve_mem_cst_si<msz, esz, asm, listty>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $imm4, mul vl]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn]",
(!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
}
// Encoding class for a predicated store of a register list, addressed as
// scalar base $Rn plus a 4-bit immediate printed with "mul vl". nregs
// occupies bits 22-21 and bit 20 is 1. The immediate operand type is
// supplied by the instantiation. mayStore is set.
class sve_mem_est_si<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
string asm, Operand immtype>
: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, immtype:$imm4),
asm, "\t$Zt, $Pg, [$Rn, $imm4, mul vl]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zt;
bits<4> imm4;
let Inst{31-25} = 0b1110010;
let Inst{24-23} = sz;
let Inst{22-21} = nregs;
let Inst{20} = 1;
let Inst{19-16} = imm4;
let Inst{15-13} = 0b111;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayStore = 1;
}
// Defines the store under NAME and adds an alias that omits the immediate
// (spelled "[$Rn]") by passing 0 for it.
multiclass sve_mem_est_si<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
string asm, Operand immtype> {
def NAME : sve_mem_est_si<sz, nregs, VecList, asm, immtype>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn]",
(!cast<Instruction>(NAME) VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
}
// Encoding class for a predicated store of a register list, addressed as
// scalar base $Rn plus scalar index $Rm (bits 20-16). nregs occupies bits
// 22-21 and bits 15-13 are fixed at 0b011. mayStore is set.
class sve_mem_est_ss<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
string asm, RegisterOperand gprty>
: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
asm, "\t$Zt, $Pg, [$Rn, $Rm]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rm;
bits<5> Rn;
bits<5> Zt;
let Inst{31-25} = 0b1110010;
let Inst{24-23} = sz;
let Inst{22-21} = nregs;
let Inst{20-16} = Rm;
let Inst{15-13} = 0b011;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayStore = 1;
}
// Encoding class for a predicated store addressed as scalar base $Rn plus
// scalar index $Rm. The 4-bit dtype occupies bits 24-21 and bits 15-13 are
// fixed at 0b010. mayStore is set.
class sve_mem_cst_ss_base<bits<4> dtype, string asm,
RegisterOperand listty, RegisterOperand gprty>
: I<(outs), (ins listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
asm, "\t$Zt, $Pg, [$Rn, $Rm]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rm;
bits<5> Rn;
bits<5> Zt;
let Inst{31-25} = 0b1110010;
let Inst{24-21} = dtype;
let Inst{20-16} = Rm;
let Inst{15-13} = 0b010;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayStore = 1;
}
// Defines the scalar+scalar store under NAME with the list operand and adds
// an alias that accepts the plain-register (zprty) spelling of $Zt.
multiclass sve_mem_cst_ss<bits<4> dtype, string asm,
RegisterOperand listty, ZPRRegOp zprty,
RegisterOperand gprty> {
def NAME : sve_mem_cst_ss_base<dtype, asm, listty, gprty>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Rm]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
}
// Encoding class for a predicated store addressed as scalar base $Rn plus a
// 4-bit immediate printed with "mul vl". Same shape as sve_mem_cst_si except
// that bits 22-20 are fixed at 0b001. mayStore is set.
// NOTE(review): the "nt" in the name presumably means non-temporal — confirm.
class sve_mem_cstnt_si<bits<2> msz, string asm, RegisterOperand VecList>
: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4),
asm, "\t$Zt, $Pg, [$Rn, $imm4, mul vl]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zt;
bits<4> imm4;
let Inst{31-25} = 0b1110010;
let Inst{24-23} = msz;
let Inst{22-20} = 0b001;
let Inst{19-16} = imm4;
let Inst{15-13} = 0b111;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayStore = 1;
}
// Defines the store under NAME with the list operand and adds three aliases:
// plain-register (zprty) spellings without and with the immediate, and a
// list-operand spelling that omits a zero immediate. Only the last one passes
// 1 as the final InstAlias argument.
multiclass sve_mem_cstnt_si<bits<2> msz, string asm, RegisterOperand listty,
ZPRRegOp zprty> {
def NAME : sve_mem_cstnt_si<msz, asm, listty>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $imm4, mul vl]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn]",
(!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
}
// Encoding class for a predicated store addressed as scalar base $Rn plus
// scalar index $Rm. msz occupies bits 24-23, bits 22-21 are fixed at 0b00
// and bits 15-13 at 0b011. mayStore is set.
class sve_mem_cstnt_ss_base<bits<2> msz, string asm, RegisterOperand listty,
RegisterOperand gprty>
: I<(outs), (ins listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
asm, "\t$Zt, $Pg, [$Rn, $Rm]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rm;
bits<5> Rn;
bits<5> Zt;
let Inst{31-25} = 0b1110010;
let Inst{24-23} = msz;
let Inst{22-21} = 0b00;
let Inst{20-16} = Rm;
let Inst{15-13} = 0b011;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayStore = 1;
}
// Defines the scalar+scalar store under NAME with the list operand and adds
// an alias that accepts the plain-register (zprty) spelling of $Zt.
multiclass sve_mem_cstnt_ss<bits<2> msz, string asm, RegisterOperand listty,
ZPRRegOp zprty, RegisterOperand gprty> {
def NAME : sve_mem_cstnt_ss_base<msz, asm, listty, gprty>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Rm]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
}
-class sve2_mem_cstnt_vs_base<bits<3> opc, dag iops, string asm,
- RegisterOperand VecList>
-: I<(outs VecList:$Zt), iops,
+class sve2_mem_sstnt_vs_base<bits<3> opc, string asm,
+ RegisterOperand listty, ZPRRegOp zprty>
+: I<(outs), (ins listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm),
asm, "\t$Zt, $Pg, [$Zn, $Rm]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rm;
bits<5> Zn;
bits<5> Zt;
let Inst{31-25} = 0b1110010;
let Inst{24-22} = opc;
let Inst{21} = 0b0;
let Inst{20-16} = Rm;
let Inst{15-13} = 0b001;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zt;
let mayStore = 1;
}
-multiclass sve2_mem_cstnt_vs<bits<3> opc, string asm,
+multiclass sve2_mem_sstnt_vs<bits<3> opc, string asm,
RegisterOperand listty, ZPRRegOp zprty> {
- def _REAL : sve2_mem_cstnt_vs_base<opc, (ins PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm),
- asm, listty>;
+ def _REAL : sve2_mem_sstnt_vs_base<opc, asm, listty, zprty>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $Rm]",
(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>;
- def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $Rm]",
- (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>;
}
// Encoding class for a predicated store addressed as scalar base $Rn plus a
// vector of offsets $Zm (operand type zprext). opc occupies bits 24-22,
// scaled goes to bit 21 and xs to bit 14; bit 15 is 1 and bit 13 is 0.
// mayStore is set.
class sve_mem_sst_sv<bits<3> opc, bit xs, bit scaled, string asm,
RegisterOperand VecList, RegisterOperand zprext>
: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
asm, "\t$Zt, $Pg, [$Rn, $Zm]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zm;
bits<5> Zt;
let Inst{31-25} = 0b1110010;
let Inst{24-22} = opc;
let Inst{21} = scaled;
let Inst{20-16} = Zm;
let Inst{15} = 0b1;
let Inst{14} = xs;
let Inst{13} = 0;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayStore = 1;
}
// Scaled (scaled = 1) variants of sve_mem_sst_sv: _UXTW_SCALED uses xs = 0
// with uxtw_opnd, _SXTW_SCALED uses xs = 1 with sxtw_opnd. Each gets an alias
// that accepts the plain-register (zprty) spelling of $Zt.
multiclass sve_mem_sst_sv_32_scaled<bits<3> opc, string asm,
RegisterOperand listty,
ZPRRegOp zprty,
RegisterOperand sxtw_opnd,
RegisterOperand uxtw_opnd > {
def _UXTW_SCALED : sve_mem_sst_sv<opc, 0, 1, asm, listty, uxtw_opnd>;
def _SXTW_SCALED : sve_mem_sst_sv<opc, 1, 1, asm, listty, sxtw_opnd>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _UXTW_SCALED) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW_SCALED) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
}
// Unscaled (scaled = 0) variants of sve_mem_sst_sv: _UXTW uses xs = 0 with
// uxtw_opnd, _SXTW uses xs = 1 with sxtw_opnd. Each gets an alias that
// accepts the plain-register (zprty) spelling of $Zt.
multiclass sve_mem_sst_sv_32_unscaled<bits<3> opc, string asm,
RegisterOperand listty,
ZPRRegOp zprty,
RegisterOperand sxtw_opnd,
RegisterOperand uxtw_opnd> {
def _UXTW : sve_mem_sst_sv<opc, 0, 0, asm, listty, uxtw_opnd>;
def _SXTW : sve_mem_sst_sv<opc, 1, 0, asm, listty, sxtw_opnd>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _UXTW) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
}
// Encoding class for a predicated store of a Z_d operand, addressed as scalar
// base $Rn plus a vector of offsets $Zm. msz occupies bits 24-23, bit 22 is
// 0, scaled goes to bit 21 and bits 15-13 are fixed at 0b101. mayStore is set.
class sve_mem_sst_sv2<bits<2> msz, bit scaled, string asm,
RegisterOperand zprext>
: I<(outs), (ins Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
asm, "\t$Zt, $Pg, [$Rn, $Zm]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zm;
bits<5> Zt;
let Inst{31-25} = 0b1110010;
let Inst{24-23} = msz;
let Inst{22} = 0b0;
let Inst{21} = scaled;
let Inst{20-16} = Zm;
let Inst{15-13} = 0b101;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayStore = 1;
}
// Scaled (scaled = 1) instantiation of sve_mem_sst_sv2 with a caller-supplied
// offset operand type, plus an alias accepting ZPR64 for $Zt.
multiclass sve_mem_sst_sv_64_scaled<bits<2> msz, string asm,
RegisterOperand zprext> {
def "" : sve_mem_sst_sv2<msz, 1, asm, zprext>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
(!cast<Instruction>(NAME) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>;
}
// Unscaled (scaled = 0) instantiation of sve_mem_sst_sv2 using the
// ZPR64ExtLSL8 offset operand, plus an alias accepting ZPR64 for $Zt.
multiclass sve_mem_sst_sv_64_unscaled<bits<2> msz, string asm> {
def "" : sve_mem_sst_sv2<msz, 0, asm, ZPR64ExtLSL8>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
(!cast<Instruction>(NAME) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>;
}
// Encoding class for a predicated store addressed as a vector of bases $Zn
// plus a 5-bit immediate (bits 20-16). opc is split: opc{2-1} goes to bits
// 24-23 and opc{0} to bit 21, with bit 22 fixed to 1. mayStore is set.
class sve_mem_sst_vi<bits<3> opc, string asm, ZPRRegOp zprty,
RegisterOperand VecList, Operand imm_ty>
: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, zprty:$Zn, imm_ty:$imm5),
asm, "\t$Zt, $Pg, [$Zn, $imm5]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> imm5;
bits<5> Zn;
bits<5> Zt;
let Inst{31-25} = 0b1110010;
let Inst{24-23} = opc{2-1};
let Inst{22} = 0b1;
let Inst{21} = opc{0};
let Inst{20-16} = imm5;
let Inst{15-13} = 0b101;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zt;
let mayStore = 1;
}
// Defines the vector+immediate store as NAME_IMM and adds three aliases:
// plain-register (zprty) spellings without and with the immediate, and a
// list-operand spelling that omits a zero immediate. Only the last one passes
// 1 as the final InstAlias argument.
multiclass sve_mem_sst_vi_ptrs<bits<3> opc, string asm, RegisterOperand listty,
ZPRRegOp zprty, Operand imm_ty> {
def _IMM : sve_mem_sst_vi<opc, asm, zprty, listty, imm_ty>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
(!cast<Instruction>(NAME # _IMM) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $imm5]",
(!cast<Instruction>(NAME # _IMM) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, imm_ty:$imm5), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
(!cast<Instruction>(NAME # _IMM) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, 0), 1>;
}
// Encoding class for an unpredicated store of a whole vector register $Zt to
// [$Rn + imm9, mul vl]. The 9-bit immediate is split: imm9{8-3} goes to bits
// 21-16 and imm9{2-0} to bits 12-10. mayStore is set.
class sve_mem_z_spill<string asm>
: I<(outs), (ins ZPRAny:$Zt, GPR64sp:$Rn, simm9:$imm9),
asm, "\t$Zt, [$Rn, $imm9, mul vl]",
"",
[]>, Sched<[]> {
bits<5> Rn;
bits<5> Zt;
bits<9> imm9;
let Inst{31-22} = 0b1110010110;
let Inst{21-16} = imm9{8-3};
let Inst{15-13} = 0b010;
let Inst{12-10} = imm9{2-0};
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayStore = 1;
}
// Defines the vector-register store under NAME and adds an alias that omits
// the immediate (spelled "[$Rn]") by passing 0 for it.
multiclass sve_mem_z_spill<string asm> {
def NAME : sve_mem_z_spill<asm>;
def : InstAlias<asm # "\t$Zt, [$Rn]",
(!cast<Instruction>(NAME) ZPRAny:$Zt, GPR64sp:$Rn, 0), 1>;
}
// Encoding class for an unpredicated store of a predicate register $Pt to
// [$Rn + imm9, mul vl]. The immediate is split exactly as in sve_mem_z_spill
// (bits 21-16 and 12-10); bits 15-13 are 0b000 and bit 4 is 0 because $Pt
// only needs four bits. mayStore is set.
class sve_mem_p_spill<string asm>
: I<(outs), (ins PPRAny:$Pt, GPR64sp:$Rn, simm9:$imm9),
asm, "\t$Pt, [$Rn, $imm9, mul vl]",
"",
[]>, Sched<[]> {
bits<4> Pt;
bits<5> Rn;
bits<9> imm9;
let Inst{31-22} = 0b1110010110;
let Inst{21-16} = imm9{8-3};
let Inst{15-13} = 0b000;
let Inst{12-10} = imm9{2-0};
let Inst{9-5} = Rn;
let Inst{4} = 0b0;
let Inst{3-0} = Pt;
let mayStore = 1;
}
// Defines the predicate-register store under NAME and adds an alias that
// omits the immediate (spelled "[$Rn]") by passing 0 for it.
multiclass sve_mem_p_spill<string asm> {
def NAME : sve_mem_p_spill<asm>;
def : InstAlias<asm # "\t$Pt, [$Rn]",
(!cast<Instruction>(NAME) PPRAny:$Pt, GPR64sp:$Rn, 0), 1>;
}
//===----------------------------------------------------------------------===//
// SVE Permute - Predicates Group
//===----------------------------------------------------------------------===//
// Encoding class for a two-source predicate permute: $Pd is produced from
// predicates $Pn and $Pm, all of the same type pprty. opc occupies bits
// 12-10; bits 9 and 4 are zero because predicate registers use 4-bit fields.
class sve_int_perm_bin_perm_pp<bits<3> opc, bits<2> sz8_64, string asm,
PPRRegOp pprty>
: I<(outs pprty:$Pd), (ins pprty:$Pn, pprty:$Pm),
asm, "\t$Pd, $Pn, $Pm",
"",
[]>, Sched<[]> {
bits<4> Pd;
bits<4> Pm;
bits<4> Pn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-20} = 0b10;
let Inst{19-16} = Pm;
let Inst{15-13} = 0b010;
let Inst{12-10} = opc;
let Inst{9} = 0b0;
let Inst{8-5} = Pn;
let Inst{4} = 0b0;
let Inst{3-0} = Pd;
}
// Per-element-size (_B/_H/_S/_D) instantiations of the predicate permute.
multiclass sve_int_perm_bin_perm_pp<bits<3> opc, string asm> {
def _B : sve_int_perm_bin_perm_pp<opc, 0b00, asm, PPR8>;
def _H : sve_int_perm_bin_perm_pp<opc, 0b01, asm, PPR16>;
def _S : sve_int_perm_bin_perm_pp<opc, 0b10, asm, PPR32>;
def _D : sve_int_perm_bin_perm_pp<opc, 0b11, asm, PPR64>;
}
// Format with a PPR16 destination ($Pd) and a PPR8 source ($Pn). `opc` is a
// single selector bit placed at Inst{16}.
// NOTE(review): presumably the predicate unpack (lo/hi) pair -- confirm at the
// instantiation site.
class sve_int_perm_punpk<bit opc, string asm>
: I<(outs PPR16:$Pd), (ins PPR8:$Pn),
asm, "\t$Pd, $Pn",
"",
[]>, Sched<[]> {
bits<4> Pd;
bits<4> Pn;
let Inst{31-17} = 0b000001010011000;
let Inst{16} = opc;
let Inst{15-9} = 0b0100000;
let Inst{8-5} = Pn;
let Inst{4} = 0b0;
let Inst{3-0} = Pd;
}
// Predicated format producing a PPR8 result ($Pd) under a governing predicate
// printed as "$Pg/z". The instruction implicitly reads FFR.
//   s - encoded in Inst{22}; when 1 the instruction also implicitly defines
//       NZCV.
class sve_int_rdffr_pred<bit s, string asm>
: I<(outs PPR8:$Pd), (ins PPRAny:$Pg),
asm, "\t$Pd, $Pg/z",
"",
[]>, Sched<[]> {
bits<4> Pd;
bits<4> Pg;
let Inst{31-23} = 0b001001010;
let Inst{22} = s;
let Inst{21-9} = 0b0110001111000;
let Inst{8-5} = Pg;
let Inst{4} = 0;
let Inst{3-0} = Pd;
// Only the s = 1 variant writes the condition flags.
let Defs = !if(!eq (s, 1), [NZCV], []);
let Uses = [FFR];
}
// Unpredicated format producing a PPR8 result ($Pd) with no explicit inputs.
// Everything except the 4-bit destination field is a fixed encoding; the
// instruction implicitly reads FFR.
class sve_int_rdffr_unpred<string asm> : I<
(outs PPR8:$Pd), (ins),
asm, "\t$Pd",
"",
[]>, Sched<[]> {
bits<4> Pd;
let Inst{31-4} = 0b0010010100011001111100000000;
let Inst{3-0} = Pd;
let Uses = [FFR];
}
// Format with no explicit outputs and one PPR8 input ($Pn). FFR is listed as
// an implicit definition and the instruction is flagged hasSideEffects.
class sve_int_wrffr<string asm>
: I<(outs), (ins PPR8:$Pn),
asm, "\t$Pn",
"",
[]>, Sched<[]> {
bits<4> Pn;
let Inst{31-9} = 0b00100101001010001001000;
let Inst{8-5} = Pn;
let Inst{4-0} = 0b00000;
let hasSideEffects = 1;
let Defs = [FFR];
}
// Format with no operands at all: the full 32-bit encoding is fixed and the
// operand string is empty. FFR is listed as an implicit definition and the
// instruction is flagged hasSideEffects.
class sve_int_setffr<string asm>
: I<(outs), (ins),
asm, "",
"",
[]>, Sched<[]> {
let Inst{31-0} = 0b00100101001011001001000000000000;
let hasSideEffects = 1;
let Defs = [FFR];
}
//===----------------------------------------------------------------------===//
// SVE Permute Vector - Predicated Group
//===----------------------------------------------------------------------===//
// Predicated format whose scalar register $Rdn (class `rt`) is both source and
// destination: the input copy $_Rdn is tied to the output via Constraints.
// The vector source is $Zm and the governing predicate uses a 3-bit field.
//   sz8_64 - 2-bit size field, Inst{23-22}.
//   ab     - 1-bit variant selector, Inst{16}.
class sve_int_perm_clast_rz<bits<2> sz8_64, bit ab, string asm,
ZPRRegOp zprty, RegisterClass rt>
: I<(outs rt:$Rdn), (ins PPR3bAny:$Pg, rt:$_Rdn, zprty:$Zm),
asm, "\t$Rdn, $Pg, $_Rdn, $Zm",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rdn;
bits<5> Zm;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-17} = 0b11000;
let Inst{16} = ab;
let Inst{15-13} = 0b101;
let Inst{12-10} = Pg;
let Inst{9-5} = Zm;
let Inst{4-0} = Rdn;
let Constraints = "$Rdn = $_Rdn";
}
// Instantiates the four element sizes. The scalar register class is GPR32 for
// _B/_H/_S and GPR64 for _D.
multiclass sve_int_perm_clast_rz<bit ab, string asm> {
def _B : sve_int_perm_clast_rz<0b00, ab, asm, ZPR8, GPR32>;
def _H : sve_int_perm_clast_rz<0b01, ab, asm, ZPR16, GPR32>;
def _S : sve_int_perm_clast_rz<0b10, ab, asm, ZPR32, GPR32>;
def _D : sve_int_perm_clast_rz<0b11, ab, asm, ZPR64, GPR64>;
}
// Same operand shape as the scalar-GPR "clast" format, but the tied
// source/destination $Vdn uses register class `rt` supplied by the caller
// (the multiclass below passes FPR8..FPR64). Fixed opcode bits differ:
// Inst{21-17} = 0b10101 and Inst{15-13} = 0b100.
//   sz8_64 - 2-bit size field, Inst{23-22}.
//   ab     - 1-bit variant selector, Inst{16}.
class sve_int_perm_clast_vz<bits<2> sz8_64, bit ab, string asm,
ZPRRegOp zprty, RegisterClass rt>
: I<(outs rt:$Vdn), (ins PPR3bAny:$Pg, rt:$_Vdn, zprty:$Zm),
asm, "\t$Vdn, $Pg, $_Vdn, $Zm",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Vdn;
bits<5> Zm;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-17} = 0b10101;
let Inst{16} = ab;
let Inst{15-13} = 0b100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zm;
let Inst{4-0} = Vdn;
let Constraints = "$Vdn = $_Vdn";
}
// Instantiates the four element sizes, pairing ZPR8..ZPR64 with FPR8..FPR64.
multiclass sve_int_perm_clast_vz<bit ab, string asm> {
def _B : sve_int_perm_clast_vz<0b00, ab, asm, ZPR8, FPR8>;
def _H : sve_int_perm_clast_vz<0b01, ab, asm, ZPR16, FPR16>;
def _S : sve_int_perm_clast_vz<0b10, ab, asm, ZPR32, FPR32>;
def _D : sve_int_perm_clast_vz<0b11, ab, asm, ZPR64, FPR64>;
}
// Vector-destination variant of the "clast" formats: $Zdn is a vector register
// tied to the input $_Zdn, with a second vector source $Zm and a 3-bit
// governing predicate field. Also sets DestructiveInstType and ElementSize
// (both fields are declared outside this view).
//   sz8_64 - 2-bit size field, Inst{23-22}.
//   ab     - 1-bit variant selector, Inst{16}.
class sve_int_perm_clast_zz<bits<2> sz8_64, bit ab, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
asm, "\t$Zdn, $Pg, $_Zdn, $Zm",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zdn;
bits<5> Zm;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-17} = 0b10100;
let Inst{16} = ab;
let Inst{15-13} = 0b100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = Destructive;
let ElementSize = ElementSizeNone;
}
// Instantiates the _B/_H/_S/_D variants for ZPR8..ZPR64.
multiclass sve_int_perm_clast_zz<bit ab, string asm> {
def _B : sve_int_perm_clast_zz<0b00, ab, asm, ZPR8>;
def _H : sve_int_perm_clast_zz<0b01, ab, asm, ZPR16>;
def _S : sve_int_perm_clast_zz<0b10, ab, asm, ZPR32>;
def _D : sve_int_perm_clast_zz<0b11, ab, asm, ZPR64>;
}
// Predicated format producing a scalar result $Rd (class `resultRegType`) from
// a vector source $Zn. Unlike the "clast" formats there is no tied input.
//   sz8_64 - 2-bit size field, Inst{23-22}.
//   ab     - 1-bit variant selector, Inst{16}.
class sve_int_perm_last_r<bits<2> sz8_64, bit ab, string asm,
ZPRRegOp zprty, RegisterClass resultRegType>
: I<(outs resultRegType:$Rd), (ins PPR3bAny:$Pg, zprty:$Zn),
asm, "\t$Rd, $Pg, $Zn",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rd;
bits<5> Zn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-17} = 0b10000;
let Inst{16} = ab;
let Inst{15-13} = 0b101;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Rd;
}
// Instantiates the four element sizes. The result register class is GPR32 for
// _B/_H/_S and GPR64 for _D.
multiclass sve_int_perm_last_r<bit ab, string asm> {
def _B : sve_int_perm_last_r<0b00, ab, asm, ZPR8, GPR32>;
def _H : sve_int_perm_last_r<0b01, ab, asm, ZPR16, GPR32>;
def _S : sve_int_perm_last_r<0b10, ab, asm, ZPR32, GPR32>;
def _D : sve_int_perm_last_r<0b11, ab, asm, ZPR64, GPR64>;
}
// Same operand shape as the GPR-result "last" format, with the result $Vd in
// register class `dstRegtype` (the multiclass below passes FPR8..FPR64).
// Fixed opcode bits differ: Inst{21-17} = 0b10001 and Inst{15-13} = 0b100.
//   sz8_64 - 2-bit size field, Inst{23-22}.
//   ab     - 1-bit variant selector, Inst{16}.
class sve_int_perm_last_v<bits<2> sz8_64, bit ab, string asm,
ZPRRegOp zprty, RegisterClass dstRegtype>
: I<(outs dstRegtype:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn),
asm, "\t$Vd, $Pg, $Zn",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Vd;
bits<5> Zn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-17} = 0b10001;
let Inst{16} = ab;
let Inst{15-13} = 0b100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Vd;
}
// Instantiates the four element sizes, pairing ZPR8..ZPR64 with FPR8..FPR64.
multiclass sve_int_perm_last_v<bit ab, string asm> {
def _B : sve_int_perm_last_v<0b00, ab, asm, ZPR8, FPR8>;
def _H : sve_int_perm_last_v<0b01, ab, asm, ZPR16, FPR16>;
def _S : sve_int_perm_last_v<0b10, ab, asm, ZPR32, FPR32>;
def _D : sve_int_perm_last_v<0b11, ab, asm, ZPR64, FPR64>;
}
// Predicated two-vector format in which $Zdn is tied to the input $_Zdn and
// $Zm is the second source. Sets DestructiveInstType and ElementSize (both
// fields are declared outside this view).
//   sz8_64 - 2-bit size field, Inst{23-22}.
class sve_int_perm_splice<bits<2> sz8_64, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
asm, "\t$Zdn, $Pg, $_Zdn, $Zm",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zdn;
bits<5> Zm;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-13} = 0b101100100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = Destructive;
let ElementSize = ElementSizeNone;
}
// Instantiates the _B/_H/_S/_D variants for ZPR8..ZPR64.
multiclass sve_int_perm_splice<string asm> {
def _B : sve_int_perm_splice<0b00, asm, ZPR8>;
def _H : sve_int_perm_splice<0b01, asm, ZPR16>;
def _S : sve_int_perm_splice<0b10, asm, ZPR32>;
def _D : sve_int_perm_splice<0b11, asm, ZPR64>;
}
// Non-tied variant of the splice format: the destination $Zd is separate and
// the source $Zn is a register-list operand (`VecList`) encoded in a single
// 5-bit field. No Constraints are set. Differs from the tied form in
// Inst{21-13} (0b101101100).
//   sz8_64 - 2-bit size field, Inst{23-22}.
class sve2_int_perm_splice_cons<bits<2> sz8_64, string asm,
ZPRRegOp zprty, RegisterOperand VecList>
: I<(outs zprty:$Zd), (ins PPR3bAny:$Pg, VecList:$Zn),
asm, "\t$Zd, $Pg, $Zn",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zn;
bits<5> Zd;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-13} = 0b101101100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
// Instantiates the four element sizes, pairing ZPR8..ZPR64 with the list
// operands ZZ_b..ZZ_d.
multiclass sve2_int_perm_splice_cons<string asm> {
def _B : sve2_int_perm_splice_cons<0b00, asm, ZPR8, ZZ_b>;
def _H : sve2_int_perm_splice_cons<0b01, asm, ZPR16, ZZ_h>;
def _S : sve2_int_perm_splice_cons<0b10, asm, ZPR32, ZZ_s>;
def _D : sve2_int_perm_splice_cons<0b11, asm, ZPR64, ZZ_d>;
}
// Merging-predicated unary vector format ("$Pg/m"). The destination $Zd is
// tied to the extra input $_Zd, which does not appear in the assembly string.
// ElementSize is taken from the vector operand type.
//   sz8_64 - 2-bit size field, Inst{23-22}.
//   opc    - 2-bit operation selector, Inst{17-16}.
class sve_int_perm_rev<bits<2> sz8_64, bits<2> opc, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, zprty:$Zn),
asm, "\t$Zd, $Pg/m, $Zn",
"",
[]>, Sched<[]> {
bits<5> Zd;
bits<3> Pg;
bits<5> Zn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-18} = 0b1001;
let Inst{17-16} = opc;
let Inst{15-13} = 0b100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
let DestructiveInstType = Destructive;
let ElementSize = zprty.ElementSize;
}
// opc = 0b11 variants; defined for all four element sizes (_B/_H/_S/_D).
multiclass sve_int_perm_rev_rbit<string asm> {
def _B : sve_int_perm_rev<0b00, 0b11, asm, ZPR8>;
def _H : sve_int_perm_rev<0b01, 0b11, asm, ZPR16>;
def _S : sve_int_perm_rev<0b10, 0b11, asm, ZPR32>;
def _D : sve_int_perm_rev<0b11, 0b11, asm, ZPR64>;
}
// opc = 0b00 variants; defined only for _H, _S and _D (there is no _B form).
multiclass sve_int_perm_rev_revb<string asm> {
def _H : sve_int_perm_rev<0b01, 0b00, asm, ZPR16>;
def _S : sve_int_perm_rev<0b10, 0b00, asm, ZPR32>;
def _D : sve_int_perm_rev<0b11, 0b00, asm, ZPR64>;
}
// opc = 0b01 variants; defined only for _S and _D.
multiclass sve_int_perm_rev_revh<string asm> {
def _S : sve_int_perm_rev<0b10, 0b01, asm, ZPR32>;
def _D : sve_int_perm_rev<0b11, 0b01, asm, ZPR64>;
}
// opc = 0b10 variant; defined only for _D.
multiclass sve_int_perm_rev_revw<string asm> {
def _D : sve_int_perm_rev<0b11, 0b10, asm, ZPR64>;
}
// Merging-predicated format ("$Pg/m") that takes a scalar source $Rn of class
// `srcRegType` and produces a vector $Zd. $Zd is tied to the extra input $_Zd,
// which does not appear in the assembly string. ElementSize is taken from the
// vector operand type.
//   sz8_64 - 2-bit size field, Inst{23-22}.
class sve_int_perm_cpy_r<bits<2> sz8_64, string asm, ZPRRegOp zprty,
RegisterClass srcRegType>
: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, srcRegType:$Rn),
asm, "\t$Zd, $Pg/m, $Rn",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zd;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-13} = 0b101000101;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
let DestructiveInstType = Destructive;
let ElementSize = zprty.ElementSize;
}
// Instantiates the four element sizes (GPR32sp source for _B/_H/_S, GPR64sp
// for _D) and adds a "mov" alias with the same operands for each of them.
// NOTE(review): the trailing 1 on each InstAlias is presumably the emit flag
// that makes "mov" the printed form -- confirm against the InstAlias class.
multiclass sve_int_perm_cpy_r<string asm> {
def _B : sve_int_perm_cpy_r<0b00, asm, ZPR8, GPR32sp>;
def _H : sve_int_perm_cpy_r<0b01, asm, ZPR16, GPR32sp>;
def _S : sve_int_perm_cpy_r<0b10, asm, ZPR32, GPR32sp>;
def _D : sve_int_perm_cpy_r<0b11, asm, ZPR64, GPR64sp>;
def : InstAlias<"mov $Zd, $Pg/m, $Rn",
(!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPR3bAny:$Pg, GPR32sp:$Rn), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $Rn",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPR3bAny:$Pg, GPR32sp:$Rn), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $Rn",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPR3bAny:$Pg, GPR32sp:$Rn), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $Rn",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPR3bAny:$Pg, GPR64sp:$Rn), 1>;
}
// Same shape as the GPR-source copy format, with the scalar source $Vn in
// register class `srcRegtype` (the multiclass below passes FPR8..FPR64).
// Differs in Inst{21-13} (0b100000100). $Zd is tied to the hidden input $_Zd.
//   sz8_64 - 2-bit size field, Inst{23-22}.
class sve_int_perm_cpy_v<bits<2> sz8_64, string asm, ZPRRegOp zprty,
RegisterClass srcRegtype>
: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, srcRegtype:$Vn),
asm, "\t$Zd, $Pg/m, $Vn",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Vn;
bits<5> Zd;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-13} = 0b100000100;
let Inst{12-10} = Pg;
let Inst{9-5} = Vn;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
let DestructiveInstType = Destructive;
let ElementSize = zprty.ElementSize;
}
// Instantiates the four element sizes (FPR8..FPR64 sources) and adds a "mov"
// alias with the same operands for each of them.
// NOTE(review): the trailing 1 on each InstAlias is presumably the emit flag
// that makes "mov" the printed form -- confirm against the InstAlias class.
multiclass sve_int_perm_cpy_v<string asm> {
def _B : sve_int_perm_cpy_v<0b00, asm, ZPR8, FPR8>;
def _H : sve_int_perm_cpy_v<0b01, asm, ZPR16, FPR16>;
def _S : sve_int_perm_cpy_v<0b10, asm, ZPR32, FPR32>;
def _D : sve_int_perm_cpy_v<0b11, asm, ZPR64, FPR64>;
def : InstAlias<"mov $Zd, $Pg/m, $Vn",
(!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPR3bAny:$Pg, FPR8:$Vn), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $Vn",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPR3bAny:$Pg, FPR16:$Vn), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $Vn",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPR3bAny:$Pg, FPR32:$Vn), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $Vn",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPR3bAny:$Pg, FPR64:$Vn), 1>;
}
// Predicated unary vector format with a separate destination ($Zd) and source
// ($Zn); no tied operands. The size is a single bit `sz` at Inst{22}, with
// Inst{23} fixed to 1 as part of the Inst{31-23} constant.
class sve_int_perm_compact<bit sz, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zn),
asm, "\t$Zd, $Pg, $Zn",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zd;
bits<5> Zn;
let Inst{31-23} = 0b000001011;
let Inst{22} = sz;
let Inst{21-13} = 0b100001100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
// Only two sizes are defined: _S (sz = 0, ZPR32) and _D (sz = 1, ZPR64).
multiclass sve_int_perm_compact<string asm> {
def _S : sve_int_perm_compact<0b0, asm, ZPR32>;
def _D : sve_int_perm_compact<0b1, asm, ZPR64>;
}
//===----------------------------------------------------------------------===//
// SVE Memory - Contiguous Load Group
//===----------------------------------------------------------------------===//
// Load format addressed by a scalar base plus a 4-bit immediate printed with
// "mul vl", under a zeroing predicate ("$Pg/z"). Marked mayLoad.
//   dtype - 4-bit field at Inst{24-21}.
//   nf    - bit at Inst{20}; when 1 the instruction both uses and defines FFR.
class sve_mem_cld_si_base<bits<4> dtype, bit nf, string asm,
RegisterOperand VecList>
: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4),
asm, "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zt;
bits<4> imm4;
let Inst{31-25} = 0b1010010;
let Inst{24-21} = dtype;
let Inst{20} = nf;
let Inst{19-16} = imm4;
let Inst{15-13} = 0b101;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
// FFR is an implicit operand only for the nf = 1 variants.
let Uses = !if(!eq(nf, 1), [FFR], []);
let Defs = !if(!eq(nf, 1), [FFR], []);
}
// Defines the instruction as NAME # _REAL plus three aliases:
//   - plain `zprty` register for $Zt with the offset omitted (immediate 0),
//   - plain `zprty` register for $Zt with an explicit immediate,
//   - `listty` operand for $Zt with the offset omitted (immediate 0).
// The first two pass 0 as the final InstAlias argument, the last passes 1.
// NOTE(review): presumably that argument is the emit flag, so only the list
// form without an offset is used when printing -- confirm against InstAlias.
multiclass sve_mem_cld_si_base<bits<4> dtype, bit nf, string asm,
RegisterOperand listty, ZPRRegOp zprty> {
def _REAL : sve_mem_cld_si_base<dtype, nf, asm, listty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
}
// Thin wrapper over the scalar-plus-immediate load multiclass with nf fixed
// to 0, i.e. the variants that do not touch FFR.
multiclass sve_mem_cld_si<bits<4> dtype, string asm, RegisterOperand listty,
ZPRRegOp zprty>
: sve_mem_cld_si_base<dtype, 0, asm, listty, zprty>;
// Load format addressed by a scalar base plus a 4-bit immediate printed with
// "mul vl", under a zeroing predicate. Marked mayLoad; FFR is not referenced.
//   msz - 2-bit field at Inst{24-23}.
class sve_mem_cldnt_si_base<bits<2> msz, string asm, RegisterOperand VecList>
: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4),
asm, "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
"",
[]>, Sched<[]> {
bits<5> Zt;
bits<3> Pg;
bits<5> Rn;
bits<4> imm4;
let Inst{31-25} = 0b1010010;
let Inst{24-23} = msz;
let Inst{22-20} = 0b000;
let Inst{19-16} = imm4;
let Inst{15-13} = 0b111;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
}
// Defines the instruction under NAME plus three aliases: plain-register $Zt
// with and without an explicit immediate (final argument 0), and list-operand
// $Zt with the offset omitted and immediate 0 (final argument 1).
multiclass sve_mem_cldnt_si<bits<2> msz, string asm, RegisterOperand listty,
ZPRRegOp zprty> {
def NAME : sve_mem_cldnt_si_base<msz, asm, listty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
}
// Scalar-plus-scalar counterpart of the format above: the address is
// "[$Rn, $Rm]" where $Rm uses the caller-supplied operand type `gprty`.
// Marked mayLoad.
//   msz - 2-bit field at Inst{24-23}.
class sve_mem_cldnt_ss_base<bits<2> msz, string asm, RegisterOperand VecList,
RegisterOperand gprty>
: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
asm, "\t$Zt, $Pg/z, [$Rn, $Rm]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rm;
bits<5> Rn;
bits<5> Zt;
let Inst{31-25} = 0b1010010;
let Inst{24-23} = msz;
let Inst{22-21} = 0b00;
let Inst{20-16} = Rm;
let Inst{15-13} = 0b110;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
}
// Defines the instruction under NAME (with the list operand for $Zt) plus an
// alias that accepts a plain `zprty` register for $Zt.
multiclass sve_mem_cldnt_ss<bits<2> msz, string asm, RegisterOperand listty,
ZPRRegOp zprty, RegisterOperand gprty> {
def NAME : sve_mem_cldnt_ss_base<msz, asm, listty, gprty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Rm]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
}
// Load format addressed by a scalar base plus a simm4s16 immediate (printed
// without "mul vl"), under a zeroing predicate. Marked mayLoad.
//   sz - 2-bit field at Inst{24-23}.
class sve_mem_ldqr_si<bits<2> sz, string asm, RegisterOperand VecList>
: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s16:$imm4),
asm, "\t$Zt, $Pg/z, [$Rn, $imm4]", "", []>, Sched<[]> {
bits<5> Zt;
bits<5> Rn;
bits<3> Pg;
bits<4> imm4;
let Inst{31-25} = 0b1010010;
let Inst{24-23} = sz;
let Inst{22-20} = 0;
let Inst{19-16} = imm4;
let Inst{15-13} = 0b001;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
}
// Defines the instruction under NAME plus three aliases: list-operand $Zt with
// the offset omitted and immediate 0 (final argument 1), and plain-register
// $Zt with and without an explicit immediate (final argument 0).
multiclass sve_mem_ldqr_si<bits<2> sz, string asm, RegisterOperand listty,
ZPRRegOp zprty> {
def NAME : sve_mem_ldqr_si<sz, asm, listty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s16:$imm4), 0>;
}
// Scalar-plus-scalar counterpart of the format above: the address is
// "[$Rn, $Rm]" with $Rm of operand type `gprty`. Inst{15-13} is 0 here.
// Marked mayLoad.
//   sz - 2-bit field at Inst{24-23}.
class sve_mem_ldqr_ss<bits<2> sz, string asm, RegisterOperand VecList,
RegisterOperand gprty>
: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
asm, "\t$Zt, $Pg/z, [$Rn, $Rm]", "", []>, Sched<[]> {
bits<5> Zt;
bits<3> Pg;
bits<5> Rn;
bits<5> Rm;
let Inst{31-25} = 0b1010010;
let Inst{24-23} = sz;
let Inst{22-21} = 0;
let Inst{20-16} = Rm;
let Inst{15-13} = 0;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
}
// Defines the instruction under NAME (with the list operand for $Zt) plus an
// alias that accepts a plain `zprty` register for $Zt.
multiclass sve_mem_ldqr_ss<bits<2> sz, string asm, RegisterOperand listty,
ZPRRegOp zprty, RegisterOperand gprty> {
def NAME : sve_mem_ldqr_ss<sz, asm, listty, gprty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Rm]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
}
// Load format addressed by a scalar base plus a 6-bit immediate of operand
// type `immtype`, under a zeroing predicate. Marked mayLoad.
//   dtypeh - 2-bit field at Inst{24-23}.
//   dtypel - 2-bit field at Inst{14-13}.
class sve_mem_ld_dup<bits<2> dtypeh, bits<2> dtypel, string asm,
RegisterOperand VecList, Operand immtype>
: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, immtype:$imm6),
asm, "\t$Zt, $Pg/z, [$Rn, $imm6]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zt;
bits<6> imm6;
let Inst{31-25} = 0b1000010;
let Inst{24-23} = dtypeh;
let Inst{22} = 1;
let Inst{21-16} = imm6;
let Inst{15} = 0b1;
let Inst{14-13} = dtypel;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
}
// Defines the instruction under NAME plus three aliases: plain-register $Zt
// with and without an explicit immediate (final argument 0), and list-operand
// $Zt with the offset omitted and immediate 0 (final argument 1).
multiclass sve_mem_ld_dup<bits<2> dtypeh, bits<2> dtypel, string asm,
RegisterOperand zlistty, ZPRRegOp zprty, Operand immtype> {
def NAME : sve_mem_ld_dup<dtypeh, dtypel, asm, zlistty, immtype>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm6]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, immtype:$imm6), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME) zlistty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
}
// Scalar-plus-scalar load format under a zeroing predicate. The input operand
// list is supplied by the caller as the dag `iops`; the encoding expects it to
// provide $Pg, $Rn and $Rm. Marked mayLoad.
//   dtype - 4-bit field at Inst{24-21}.
//   ff    - bit at Inst{13}; when 1 the instruction both uses and defines FFR.
class sve_mem_cld_ss_base<bits<4> dtype, bit ff, dag iops, string asm,
RegisterOperand VecList>
: I<(outs VecList:$Zt), iops,
asm, "\t$Zt, $Pg/z, [$Rn, $Rm]",
"",
[]>, Sched<[]> {
bits<5> Zt;
bits<3> Pg;
bits<5> Rm;
bits<5> Rn;
let Inst{31-25} = 0b1010010;
let Inst{24-21} = dtype;
let Inst{20-16} = Rm;
let Inst{15-14} = 0b01;
let Inst{13} = ff;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
// FFR is an implicit operand only for the ff = 1 variants.
let Uses = !if(!eq(ff, 1), [FFR], []);
let Defs = !if(!eq(ff, 1), [FFR], []);
}
// ff = 0 instantiation. The instruction is defined with an empty suffix, so
// it is referenced as NAME by the alias, which accepts a plain `zprty`
// register for $Zt.
multiclass sve_mem_cld_ss<bits<4> dtype, string asm, RegisterOperand listty,
ZPRRegOp zprty, RegisterOperand gprty> {
def "" : sve_mem_cld_ss_base<dtype, 0, (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
asm, listty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Rm]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
}
// ff = 1 instantiation, defined as NAME # _REAL, plus three aliases:
//   - plain-register $Zt with an explicit $Rm,
//   - list-operand $Zt with the offset register omitted,
//   - plain-register $Zt with the offset register omitted.
// Both omitted-offset forms supply XZR for $Rm; only the list-operand one
// passes 1 as the final InstAlias argument.
multiclass sve_mem_cldff_ss<bits<4> dtype, string asm, RegisterOperand listty,
ZPRRegOp zprty, RegisterOperand gprty> {
def _REAL : sve_mem_cld_ss_base<dtype, 1, (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
asm, listty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Rm]",
(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, XZR), 1>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, XZR), 0>;
}
// Thin wrapper over the scalar-plus-immediate load multiclass with nf fixed
// to 1, i.e. the variants that use and define FFR.
multiclass sve_mem_cldnf_si<bits<4> dtype, string asm, RegisterOperand listty,
ZPRRegOp zprty>
: sve_mem_cld_si_base<dtype, 1, asm, listty, zprty>;
// Load format addressed by a scalar base plus a 4-bit immediate of operand
// type `immtype`, printed with "mul vl", under a zeroing predicate. Marked
// mayLoad.
//   sz    - 2-bit field at Inst{24-23}.
//   nregs - 2-bit field at Inst{22-21}.
// NOTE(review): `nregs` presumably encodes the register count of the
// `VecList` operand -- confirm at the instantiation site.
class sve_mem_eld_si<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
string asm, Operand immtype>
: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, immtype:$imm4),
asm, "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
"",
[]>, Sched<[]> {
bits<5> Zt;
bits<3> Pg;
bits<5> Rn;
bits<4> imm4;
let Inst{31-25} = 0b1010010;
let Inst{24-23} = sz;
let Inst{22-21} = nregs;
let Inst{20} = 0;
let Inst{19-16} = imm4;
let Inst{15-13} = 0b111;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
}
// Defines the instruction under NAME plus an alias for the form without an
// offset ("[$Rn]"), which supplies 0 for the immediate.
multiclass sve_mem_eld_si<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
string asm, Operand immtype> {
def NAME : sve_mem_eld_si<sz, nregs, VecList, asm, immtype>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME) VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
}
// Scalar-plus-scalar counterpart of the format above: the address is
// "[$Rn, $Rm]" with $Rm of operand type `gprty`. Inst{15-13} is 0b110 here.
// Marked mayLoad.
//   sz    - 2-bit field at Inst{24-23}.
//   nregs - 2-bit field at Inst{22-21}.
class sve_mem_eld_ss<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
string asm, RegisterOperand gprty>
: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
asm, "\t$Zt, $Pg/z, [$Rn, $Rm]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rm;
bits<5> Rn;
bits<5> Zt;
let Inst{31-25} = 0b1010010;
let Inst{24-23} = sz;
let Inst{22-21} = nregs;
let Inst{20-16} = Rm;
let Inst{15-13} = 0b110;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
}
//===----------------------------------------------------------------------===//
// SVE Memory - 32-bit Gather and Unsized Contiguous Group
//===----------------------------------------------------------------------===//
// bit xs is '1' if offsets are signed
// bit scaled is '1' if the offsets are scaled
// Load format with a scalar base ($Rn) and a vector of offsets ($Zm, operand
// type `zprext`), producing a Z_s result under a zeroing predicate. Marked
// mayLoad.
//   opc    - 4 bits, split across Inst{24-23} (opc{3-2}) and Inst{14-13}
//            (opc{1-0}); when opc{0} is 1 the instruction uses and defines FFR.
//   xs     - Inst{22}.
//   scaled - Inst{21}.
class sve_mem_32b_gld_sv<bits<4> opc, bit xs, bit scaled, string asm,
RegisterOperand zprext>
: I<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
asm, "\t$Zt, $Pg/z, [$Rn, $Zm]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zm;
bits<5> Zt;
let Inst{31-25} = 0b1000010;
let Inst{24-23} = opc{3-2};
let Inst{22} = xs;
let Inst{21} = scaled;
let Inst{20-16} = Zm;
let Inst{15} = 0b0;
let Inst{14-13} = opc{1-0};
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
// FFR is an implicit operand only when the low opcode bit is set.
let Defs = !if(!eq(opc{0}, 1), [FFR], []);
let Uses = !if(!eq(opc{0}, 1), [FFR], []);
}
// Scaled-offset variants (scaled = 1): one definition per extend kind
// (xs = 0 with `uxtw_opnd`, xs = 1 with `sxtw_opnd`), each with an alias that
// accepts a plain ZPR32 register for $Zt.
multiclass sve_mem_32b_gld_sv_32_scaled<bits<4> opc, string asm,
RegisterOperand sxtw_opnd,
RegisterOperand uxtw_opnd> {
def _UXTW_SCALED_REAL : sve_mem_32b_gld_sv<opc, 0, 1, asm, uxtw_opnd>;
def _SXTW_SCALED_REAL : sve_mem_32b_gld_sv<opc, 1, 1, asm, sxtw_opnd>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _UXTW_SCALED_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW_SCALED_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
}
// Unscaled-offset variants (scaled = 0): one definition per extend kind
// (xs = 0 with `uxtw_opnd`, xs = 1 with `sxtw_opnd`), each with an alias that
// accepts a plain ZPR32 register for $Zt.
multiclass sve_mem_32b_gld_vs_32_unscaled<bits<4> opc, string asm,
RegisterOperand sxtw_opnd,
RegisterOperand uxtw_opnd> {
def _UXTW_REAL : sve_mem_32b_gld_sv<opc, 0, 0, asm, uxtw_opnd>;
def _SXTW_REAL : sve_mem_32b_gld_sv<opc, 1, 0, asm, sxtw_opnd>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _UXTW_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
}
// Load format with a vector base ($Zn, ZPR32) and a 5-bit immediate of operand
// type `imm_ty`, producing a Z_s result under a zeroing predicate. Marked
// mayLoad.
//   opc - 4 bits, split across Inst{24-23} (opc{3-2}) and Inst{14-13}
//         (opc{1-0}); when opc{0} is 1 the instruction uses and defines FFR.
class sve_mem_32b_gld_vi<bits<4> opc, string asm, Operand imm_ty>
: I<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5),
asm, "\t$Zt, $Pg/z, [$Zn, $imm5]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zn;
bits<5> Zt;
bits<5> imm5;
let Inst{31-25} = 0b1000010;
let Inst{24-23} = opc{3-2};
let Inst{22-21} = 0b01;
let Inst{20-16} = imm5;
let Inst{15} = 0b1;
let Inst{14-13} = opc{1-0};
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zt;
let mayLoad = 1;
// FFR is an implicit operand only when the low opcode bit is set.
let Defs = !if(!eq(opc{0}, 1), [FFR], []);
let Uses = !if(!eq(opc{0}, 1), [FFR], []);
}
// Defines the instruction as NAME # _IMM_REAL plus three aliases: plain ZPR32
// $Zt with and without an explicit immediate (final argument 0), and Z_s $Zt
// with the immediate omitted and encoded as 0 (final argument 1).
multiclass sve_mem_32b_gld_vi_32_ptrs<bits<4> opc, string asm, Operand imm_ty> {
def _IMM_REAL : sve_mem_32b_gld_vi<opc, asm, imm_ty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
(!cast<Instruction>(NAME # _IMM_REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $imm5]",
(!cast<Instruction>(NAME # _IMM_REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
(!cast<Instruction>(NAME # _IMM_REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>;
}
// Format with no outputs taking a prefetch-operation operand ($prfop), a
// governing predicate, a scalar base and a simm6s1 immediate printed with
// "mul vl". It sets neither mayLoad nor mayStore; hasSideEffects = 1 is the
// only effect flag.
//   msz - 2-bit field at Inst{14-13}.
class sve_mem_prfm_si<bits<2> msz, string asm>
: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, simm6s1:$imm6),
asm, "\t$prfop, $Pg, [$Rn, $imm6, mul vl]",
"",
[]>, Sched<[]> {
bits<5> Rn;
bits<3> Pg;
bits<6> imm6;
bits<4> prfop;
let Inst{31-22} = 0b1000010111;
let Inst{21-16} = imm6;
let Inst{15} = 0b0;
let Inst{14-13} = msz;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4} = 0b0;
let Inst{3-0} = prfop;
let hasSideEffects = 1;
}
// Defines the instruction under NAME plus an alias for the form without an
// offset ("[$Rn]"), which supplies 0 for the immediate.
multiclass sve_mem_prfm_si<bits<2> msz, string asm> {
def NAME : sve_mem_prfm_si<msz, asm>;
def : InstAlias<asm # "\t$prfop, $Pg, [$Rn]",
(!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
}
// Scalar-plus-scalar prefetch format: no outputs, address "[$Rn, $Rm]" with
// $Rm of operand type `gprty`. Flagged hasSideEffects.
//   opc - 3 bits, split across Inst{24-23} (opc{2-1}) and Inst{14} (opc{0}).
class sve_mem_prfm_ss<bits<3> opc, string asm, RegisterOperand gprty>
: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
asm, "\t$prfop, $Pg, [$Rn, $Rm]",
"",
[]>, Sched<[]> {
bits<5> Rm;
bits<5> Rn;
bits<3> Pg;
bits<4> prfop;
let Inst{31-25} = 0b1000010;
let Inst{24-23} = opc{2-1};
let Inst{22-21} = 0b00;
let Inst{20-16} = Rm;
let Inst{15} = 0b1;
let Inst{14} = opc{0};
let Inst{13} = 0b0;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4} = 0b0;
let Inst{3-0} = prfop;
let hasSideEffects = 1;
}
// Prefetch format with a scalar base ($Rn) and a vector of offsets ($Zm,
// operand type `zprext`). No outputs; flagged hasSideEffects.
//   msz - 2-bit field at Inst{14-13}.
//   xs  - Inst{22}.
class sve_mem_32b_prfm_sv<bits<2> msz, bit xs, string asm,
RegisterOperand zprext>
: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
asm, "\t$prfop, $Pg, [$Rn, $Zm]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zm;
bits<4> prfop;
let Inst{31-23} = 0b100001000;
let Inst{22} = xs;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15} = 0b0;
let Inst{14-13} = msz;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4} = 0b0;
let Inst{3-0} = prfop;
let hasSideEffects = 1;
}
// One definition per extend kind: xs = 0 with `uxtw_opnd`, xs = 1 with
// `sxtw_opnd`. No aliases are added.
multiclass sve_mem_32b_prfm_sv_scaled<bits<2> msz, string asm,
RegisterOperand sxtw_opnd,
RegisterOperand uxtw_opnd> {
def _UXTW_SCALED : sve_mem_32b_prfm_sv<msz, 0, asm, uxtw_opnd>;
def _SXTW_SCALED : sve_mem_32b_prfm_sv<msz, 1, asm, sxtw_opnd>;
}
// Prefetch format with a vector base ($Zn, ZPR32) and a 5-bit immediate of
// operand type `imm_ty`. No outputs.
//   msz - 2-bit field at Inst{24-23}.
// The instruction has no outputs and sets neither mayLoad nor mayStore, so it
// is flagged hasSideEffects like every other prefetch format in this file
// (sve_mem_prfm_si, sve_mem_prfm_ss, sve_mem_32b_prfm_sv,
// sve_mem_64b_prfm_sv, sve_mem_64b_prfm_vi).
class sve_mem_32b_prfm_vi<bits<2> msz, string asm, Operand imm_ty>
: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5),
asm, "\t$prfop, $Pg, [$Zn, $imm5]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zn;
bits<5> imm5;
bits<4> prfop;
let Inst{31-25} = 0b1000010;
let Inst{24-23} = msz;
let Inst{22-21} = 0b00;
let Inst{20-16} = imm5;
let Inst{15-13} = 0b111;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4} = 0b0;
let Inst{3-0} = prfop;
let hasSideEffects = 1;
}
// Defines the instruction under NAME plus an alias for the form without an
// immediate ("[$Zn]"), which supplies 0 for it.
multiclass sve_mem_32b_prfm_vi<bits<2> msz, string asm, Operand imm_ty> {
def NAME : sve_mem_32b_prfm_vi<msz, asm, imm_ty>;
def : InstAlias<asm # "\t$prfop, $Pg, [$Zn]",
(!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>;
}
// Format producing a Z register ($Zt) from a base register ($Rn) and a simm9
// immediate printed with "mul vl". Marked mayLoad.
// NOTE(review): presumably the whole-vector-register load (LDR) -- confirm at
// the instantiation site.
class sve_mem_z_fill<string asm>
: I<(outs ZPRAny:$Zt), (ins GPR64sp:$Rn, simm9:$imm9),
asm, "\t$Zt, [$Rn, $imm9, mul vl]",
"",
[]>, Sched<[]> {
bits<5> Rn;
bits<5> Zt;
bits<9> imm9;
let Inst{31-22} = 0b1000010110;
// imm9 is split: the upper six bits go here, the low three bits in Inst{12-10}.
let Inst{21-16} = imm9{8-3};
let Inst{15-13} = 0b010;
let Inst{12-10} = imm9{2-0};
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
}
// Defines the instruction under NAME plus an alias for the form without an
// offset ("[$Rn]"), which supplies 0 for the immediate.
multiclass sve_mem_z_fill<string asm> {
def NAME : sve_mem_z_fill<asm>;
def : InstAlias<asm # "\t$Zt, [$Rn]",
(!cast<Instruction>(NAME) ZPRAny:$Zt, GPR64sp:$Rn, 0), 1>;
}
// Predicate-register counterpart of the Z-register fill format: produces a
// predicate ($Pt) from a base register and a simm9 immediate printed with
// "mul vl". Marked mayLoad. Differs from the Z form in Inst{15-13} (0b000)
// and in using a 4-bit register field with Inst{4} fixed to 0.
class sve_mem_p_fill<string asm>
: I<(outs PPRAny:$Pt), (ins GPR64sp:$Rn, simm9:$imm9),
asm, "\t$Pt, [$Rn, $imm9, mul vl]",
"",
[]>, Sched<[]> {
bits<4> Pt;
bits<5> Rn;
bits<9> imm9;
let Inst{31-22} = 0b1000010110;
// imm9 is split: the upper six bits go here, the low three bits in Inst{12-10}.
let Inst{21-16} = imm9{8-3};
let Inst{15-13} = 0b000;
let Inst{12-10} = imm9{2-0};
let Inst{9-5} = Rn;
let Inst{4} = 0b0;
let Inst{3-0} = Pt;
let mayLoad = 1;
}
// Defines the instruction under NAME plus an alias for the form without an
// offset ("[$Rn]"), which supplies 0 for the immediate.
multiclass sve_mem_p_fill<string asm> {
def NAME : sve_mem_p_fill<asm>;
def : InstAlias<asm # "\t$Pt, [$Rn]",
(!cast<Instruction>(NAME) PPRAny:$Pt, GPR64sp:$Rn, 0), 1>;
}
-class sve2_mem_cldnt_vs_base<bits<5> opc, dag iops, string asm,
+class sve2_mem_gldnt_vs_base<bits<5> opc, dag iops, string asm,
RegisterOperand VecList>
: I<(outs VecList:$Zt), iops,
asm, "\t$Zt, $Pg/z, [$Zn, $Rm]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rm;
bits<5> Zn;
bits<5> Zt;
let Inst{31} = 0b1;
let Inst{30} = opc{4};
let Inst{29-25} = 0b00010;
let Inst{24-23} = opc{3-2};
let Inst{22-21} = 0b00;
let Inst{20-16} = Rm;
let Inst{15} = 0b1;
let Inst{14-13} = opc{1-0};
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zt;
let mayLoad = 1;
}
-multiclass sve2_mem_cldnt_vs<bits<5> opc, string asm,
+multiclass sve2_mem_gldnt_vs<bits<5> opc, string asm,
RegisterOperand listty, ZPRRegOp zprty> {
- def _REAL : sve2_mem_cldnt_vs_base<opc, (ins PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm),
+ def _REAL : sve2_mem_gldnt_vs_base<opc, (ins PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm),
asm, listty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $Rm]",
(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>;
- def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $Rm]",
- (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>;
}
//===----------------------------------------------------------------------===//
// SVE Memory - 64-bit Gather Group
//===----------------------------------------------------------------------===//
// bit xs is '1' if offsets are signed
// bit scaled is '1' if the offsets are scaled
// bit lsl is '0' if the offsets are extended (uxtw/sxtw), '1' if shifted (lsl)
// Load format with a scalar base ($Rn) and a vector of offsets ($Zm, operand
// type `zprext`), producing a Z_d result under a zeroing predicate. Marked
// mayLoad.
//   opc    - 4 bits, split across Inst{24-23} (opc{3-2}) and Inst{14-13}
//            (opc{1-0}); when opc{0} is 1 the instruction uses and defines FFR.
//   xs     - Inst{22}.
//   scaled - Inst{21}.
//   lsl    - Inst{15}.
class sve_mem_64b_gld_sv<bits<4> opc, bit xs, bit scaled, bit lsl, string asm,
RegisterOperand zprext>
: I<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
asm, "\t$Zt, $Pg/z, [$Rn, $Zm]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zm;
bits<5> Zt;
let Inst{31-25} = 0b1100010;
let Inst{24-23} = opc{3-2};
let Inst{22} = xs;
let Inst{21} = scaled;
let Inst{20-16} = Zm;
let Inst{15} = lsl;
let Inst{14-13} = opc{1-0};
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
// FFR is an implicit operand only when the low opcode bit is set.
let Defs = !if(!eq(opc{0}, 1), [FFR], []);
let Uses = !if(!eq(opc{0}, 1), [FFR], []);
}
// Extended-offset (lsl = 0), scaled (scaled = 1) variants: one definition per
// extend kind (xs = 0 with `uxtw_opnd`, xs = 1 with `sxtw_opnd`), each with an
// alias that accepts a plain ZPR64 register for $Zt.
multiclass sve_mem_64b_gld_sv_32_scaled<bits<4> opc, string asm,
RegisterOperand sxtw_opnd,
RegisterOperand uxtw_opnd> {
def _UXTW_SCALED_REAL : sve_mem_64b_gld_sv<opc, 0, 1, 0, asm, uxtw_opnd>;
def _SXTW_SCALED_REAL : sve_mem_64b_gld_sv<opc, 1, 1, 0, asm, sxtw_opnd>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _UXTW_SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW_SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
}
// Extended-offset (lsl = 0), unscaled (scaled = 0) variants: one definition
// per extend kind (xs = 0 with `uxtw_opnd`, xs = 1 with `sxtw_opnd`), each
// with an alias that accepts a plain ZPR64 register for $Zt.
multiclass sve_mem_64b_gld_vs_32_unscaled<bits<4> opc, string asm,
RegisterOperand sxtw_opnd,
RegisterOperand uxtw_opnd> {
def _UXTW_REAL : sve_mem_64b_gld_sv<opc, 0, 0, 0, asm, uxtw_opnd>;
def _SXTW_REAL : sve_mem_64b_gld_sv<opc, 1, 0, 0, asm, sxtw_opnd>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _UXTW_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
}
// Shifted-offset (lsl = 1), scaled (scaled = 1) variant with xs = 1, defined
// as NAME # _SCALED_REAL, plus an alias that accepts a plain ZPR64 register
// for $Zt.
multiclass sve_mem_64b_gld_sv2_64_scaled<bits<4> opc, string asm,
RegisterOperand zprext> {
def _SCALED_REAL : sve_mem_64b_gld_sv<opc, 1, 1, 1, asm, zprext>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>;
}
// Shifted-offset (lsl = 1), unscaled (scaled = 0) variant with xs = 1 and the
// offset operand fixed to ZPR64ExtLSL8, defined as NAME # _REAL, plus an alias
// that accepts a plain ZPR64 register for $Zt.
multiclass sve_mem_64b_gld_vs2_64_unscaled<bits<4> opc, string asm> {
def _REAL : sve_mem_64b_gld_sv<opc, 1, 0, 1, asm, ZPR64ExtLSL8>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>;
}
// Load format with a vector base ($Zn, ZPR64) and a 5-bit immediate of operand
// type `imm_ty`, producing a Z_d result under a zeroing predicate. Marked
// mayLoad.
//   opc - 4 bits, split across Inst{24-23} (opc{3-2}) and Inst{14-13}
//         (opc{1-0}); when opc{0} is 1 the instruction uses and defines FFR.
class sve_mem_64b_gld_vi<bits<4> opc, string asm, Operand imm_ty>
: I<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5),
asm, "\t$Zt, $Pg/z, [$Zn, $imm5]",
"",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zn;
bits<5> Zt;
bits<5> imm5;
let Inst{31-25} = 0b1100010;
let Inst{24-23} = opc{3-2};
let Inst{22-21} = 0b01;
let Inst{20-16} = imm5;
let Inst{15} = 0b1;
let Inst{14-13} = opc{1-0};
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zt;
let mayLoad = 1;
// FFR is an implicit operand only when the low opcode bit is set.
let Defs = !if(!eq(opc{0}, 1), [FFR], []);
let Uses = !if(!eq(opc{0}, 1), [FFR], []);
}
// Defines the instruction as NAME # _IMM_REAL plus three aliases: plain ZPR64
// $Zt with and without an explicit immediate (final argument 0), and Z_d $Zt
// with the immediate omitted and encoded as 0 (final argument 1).
multiclass sve_mem_64b_gld_vi_64_ptrs<bits<4> opc, string asm, Operand imm_ty> {
def _IMM_REAL : sve_mem_64b_gld_vi<opc, asm, imm_ty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
(!cast<Instruction>(NAME # _IMM_REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $imm5]",
(!cast<Instruction>(NAME # _IMM_REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
(!cast<Instruction>(NAME # _IMM_REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>;
}
// Instruction format with no outputs that takes an sve_prfop operand, a
// governing predicate, a GPR64sp base $Rn and a vector offset $Zm of the
// caller-supplied operand type zprext; it is marked hasSideEffects.
// bit lsl (Inst{15}) is '0' if the offsets are extended (uxtw/sxtw), '1' if
// shifted (lsl). bit xs is placed in Inst{22} and msz in Inst{14-13}.
class sve_mem_64b_prfm_sv<bits<2> msz, bit xs, bit lsl, string asm,
RegisterOperand zprext>
: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
asm, "\t$prfop, $Pg, [$Rn, $Zm]",
"",
[]>, Sched<[]> {
// Encoded operand fields.
bits<3> Pg;
bits<5> Rn;
bits<5> Zm;
bits<4> prfop;
// Fixed bits and operand placement, most-significant field first.
let Inst{31-23} = 0b110001000;
let Inst{22} = xs;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15} = lsl;
let Inst{14-13} = msz;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4} = 0b0;
let Inst{3-0} = prfop;
// There is no output operand, so the record is flagged as side-effecting.
let hasSideEffects = 1;
}
// Defines the two extended-offset variants of sve_mem_64b_prfm_sv. Both pass
// lsl = 0; _UXTW_SCALED passes xs = 0 with uxtw_opnd and _SXTW_SCALED passes
// xs = 1 with sxtw_opnd.
multiclass sve_mem_64b_prfm_sv_ext_scaled<bits<2> msz, string asm,
RegisterOperand sxtw_opnd,
RegisterOperand uxtw_opnd> {
def _UXTW_SCALED : sve_mem_64b_prfm_sv<msz, 0, 0, asm, uxtw_opnd>;
def _SXTW_SCALED : sve_mem_64b_prfm_sv<msz, 1, 0, asm, sxtw_opnd>;
}
// Defines a single record named exactly NAME (no suffix): the
// sve_mem_64b_prfm_sv form with xs = 1 and lsl = 1 and the caller-supplied
// vector-offset operand zprext.
multiclass sve_mem_64b_prfm_sv_lsl_scaled<bits<2> msz, string asm,
RegisterOperand zprext> {
def NAME : sve_mem_64b_prfm_sv<msz, 1, 1, asm, zprext>;
}
// Instruction format with no outputs that takes an sve_prfop operand, a
// governing predicate, a ZPR64 vector base $Zn and an immediate $imm5 of the
// caller-supplied operand type imm_ty; it is marked hasSideEffects.
// msz is placed in Inst{24-23}.
class sve_mem_64b_prfm_vi<bits<2> msz, string asm, Operand imm_ty>
: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5),
asm, "\t$prfop, $Pg, [$Zn, $imm5]",
"",
[]>, Sched<[]> {
// Encoded operand fields.
bits<3> Pg;
bits<5> Zn;
bits<5> imm5;
bits<4> prfop;
// Fixed bits and operand placement, most-significant field first.
let Inst{31-25} = 0b1100010;
let Inst{24-23} = msz;
let Inst{22-21} = 0b00;
let Inst{20-16} = imm5;
let Inst{15-13} = 0b111;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4} = 0b0;
let Inst{3-0} = prfop;
// There is no output operand, so the record is flagged as side-effecting.
let hasSideEffects = 1;
}
// Multiclass with the same name as the sve_mem_64b_prfm_vi class: defines one
// record named exactly NAME from that class, plus an alias for the "[$Zn]"
// spelling in which the immediate operand is fixed to 0.
// NOTE(review): the trailing 1 is presumably InstAlias's emit flag — confirm.
multiclass sve_mem_64b_prfm_vi<bits<2> msz, string asm, Operand imm_ty> {
def NAME : sve_mem_64b_prfm_vi<msz, asm, imm_ty>;
def : InstAlias<asm # "\t$prfop, $Pg, [$Zn]",
(!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>;
}
//===----------------------------------------------------------------------===//
// SVE Compute Vector Address Group
//===----------------------------------------------------------------------===//
// Unpredicated format printed as "$Zd, [$Zn, $Zm]": destination and first
// source share the register operand type zprty, while the second source uses
// the caller-supplied extended/shifted operand type zprext.
// opc is placed in Inst{23-22} and msz in Inst{11-10}.
class sve_int_bin_cons_misc_0_a<bits<2> opc, bits<2> msz, string asm,
ZPRRegOp zprty, RegisterOperand zprext>
: I<(outs zprty:$Zd), (ins zprty:$Zn, zprext:$Zm),
asm, "\t$Zd, [$Zn, $Zm]",
"",
[]>, Sched<[]> {
// Encoded operand fields.
bits<5> Zd;
bits<5> Zn;
bits<5> Zm;
// Fixed bits and operand placement, most-significant field first.
let Inst{31-24} = 0b00000100;
let Inst{23-22} = opc;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-12} = 0b1010;
let Inst{11-10} = msz;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
// Four ZPR64 variants _0.._3 of sve_int_bin_cons_misc_0_a: msz runs
// 0b00..0b11 and is paired with the ZPR64ExtUXTW8/16/32/64 offset operand.
multiclass sve_int_bin_cons_misc_0_a_uxtw<bits<2> opc, string asm> {
def _0 : sve_int_bin_cons_misc_0_a<opc, 0b00, asm, ZPR64, ZPR64ExtUXTW8>;
def _1 : sve_int_bin_cons_misc_0_a<opc, 0b01, asm, ZPR64, ZPR64ExtUXTW16>;
def _2 : sve_int_bin_cons_misc_0_a<opc, 0b10, asm, ZPR64, ZPR64ExtUXTW32>;
def _3 : sve_int_bin_cons_misc_0_a<opc, 0b11, asm, ZPR64, ZPR64ExtUXTW64>;
}
// Four ZPR64 variants _0.._3 of sve_int_bin_cons_misc_0_a: msz runs
// 0b00..0b11 and is paired with the ZPR64ExtSXTW8/16/32/64 offset operand.
multiclass sve_int_bin_cons_misc_0_a_sxtw<bits<2> opc, string asm> {
def _0 : sve_int_bin_cons_misc_0_a<opc, 0b00, asm, ZPR64, ZPR64ExtSXTW8>;
def _1 : sve_int_bin_cons_misc_0_a<opc, 0b01, asm, ZPR64, ZPR64ExtSXTW16>;
def _2 : sve_int_bin_cons_misc_0_a<opc, 0b10, asm, ZPR64, ZPR64ExtSXTW32>;
def _3 : sve_int_bin_cons_misc_0_a<opc, 0b11, asm, ZPR64, ZPR64ExtSXTW64>;
}
// Four ZPR32 variants _0.._3 of sve_int_bin_cons_misc_0_a: msz runs
// 0b00..0b11 and is paired with the ZPR32ExtLSL8/16/32/64 offset operand.
multiclass sve_int_bin_cons_misc_0_a_32_lsl<bits<2> opc, string asm> {
def _0 : sve_int_bin_cons_misc_0_a<opc, 0b00, asm, ZPR32, ZPR32ExtLSL8>;
def _1 : sve_int_bin_cons_misc_0_a<opc, 0b01, asm, ZPR32, ZPR32ExtLSL16>;
def _2 : sve_int_bin_cons_misc_0_a<opc, 0b10, asm, ZPR32, ZPR32ExtLSL32>;
def _3 : sve_int_bin_cons_misc_0_a<opc, 0b11, asm, ZPR32, ZPR32ExtLSL64>;
}
// Four ZPR64 variants _0.._3 of sve_int_bin_cons_misc_0_a: msz runs
// 0b00..0b11 and is paired with the ZPR64ExtLSL8/16/32/64 offset operand.
multiclass sve_int_bin_cons_misc_0_a_64_lsl<bits<2> opc, string asm> {
def _0 : sve_int_bin_cons_misc_0_a<opc, 0b00, asm, ZPR64, ZPR64ExtLSL8>;
def _1 : sve_int_bin_cons_misc_0_a<opc, 0b01, asm, ZPR64, ZPR64ExtLSL16>;
def _2 : sve_int_bin_cons_misc_0_a<opc, 0b10, asm, ZPR64, ZPR64ExtLSL32>;
def _3 : sve_int_bin_cons_misc_0_a<opc, 0b11, asm, ZPR64, ZPR64ExtLSL64>;
}
//===----------------------------------------------------------------------===//
// SVE Integer Misc - Unpredicated Group
//===----------------------------------------------------------------------===//
// Unpredicated three-register format "$Zd, $Zn, $Zm" in which all operands
// share the register operand type zprty. sz is placed in Inst{23-22}.
class sve_int_bin_cons_misc_0_b<bits<2> sz, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
asm, "\t$Zd, $Zn, $Zm",
"",
[]>, Sched<[]> {
// Encoded operand fields.
bits<5> Zd;
bits<5> Zm;
bits<5> Zn;
// Fixed bits and operand placement, most-significant field first.
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-10} = 0b101100;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
// Multiclass with the same name as the class it instantiates: defines the
// _H, _S and _D variants (sz = 0b01, 0b10, 0b11 with ZPR16, ZPR32, ZPR64).
// No variant with sz = 0b00 is defined here.
multiclass sve_int_bin_cons_misc_0_b<string asm> {
def _H : sve_int_bin_cons_misc_0_b<0b01, asm, ZPR16>;
def _S : sve_int_bin_cons_misc_0_b<0b10, asm, ZPR32>;
def _D : sve_int_bin_cons_misc_0_b<0b11, asm, ZPR64>;
}
// Unpredicated two-register format "$Zd, $Zn" with both operands of type
// zprty. The 8-bit opc is scattered over three encoding fields:
// opc{7-6} -> Inst{23-22}, opc{5-1} -> Inst{20-16}, opc{0} -> Inst{10}.
class sve_int_bin_cons_misc_0_c<bits<8> opc, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn),
asm, "\t$Zd, $Zn",
"",
[]>, Sched<[]> {
// Encoded operand fields.
bits<5> Zd;
bits<5> Zn;
// Fixed bits and operand placement, most-significant field first.
let Inst{31-24} = 0b00000100;
let Inst{23-22} = opc{7-6};
let Inst{21} = 0b1;
let Inst{20-16} = opc{5-1};
let Inst{15-11} = 0b10111;
let Inst{10} = opc{0};
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
//===----------------------------------------------------------------------===//
// SVE Integer Reduction Group
//===----------------------------------------------------------------------===//
// Predicated format "$Vd, $Pg, $Zn": reads a vector $Zn of type zprty under
// predicate $Pg and writes a register $Vd of register class regtype.
// sz8_32 -> Inst{23-22}, fmt -> Inst{20-19}, opc -> Inst{18-16}.
class sve_int_reduce<bits<2> sz8_32, bits<2> fmt, bits<3> opc, string asm,
ZPRRegOp zprty, RegisterClass regtype>
: I<(outs regtype:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn),
asm, "\t$Vd, $Pg, $Zn",
"",
[]>, Sched<[]> {
// Encoded operand fields.
bits<3> Pg;
bits<5> Vd;
bits<5> Zn;
// Fixed bits and operand placement, most-significant field first.
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_32;
let Inst{21} = 0b0;
let Inst{20-19} = fmt;
let Inst{18-16} = opc;
let Inst{15-13} = 0b001;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Vd;
}
// sve_int_reduce variants with fmt = 0b00 for ZPR8/ZPR16/ZPR32 sources; every
// variant writes an FPR64 result. Unlike sve_int_reduce_0_uaddv, no _D variant
// is defined here.
multiclass sve_int_reduce_0_saddv<bits<3> opc, string asm> {
def _B : sve_int_reduce<0b00, 0b00, opc, asm, ZPR8, FPR64>;
def _H : sve_int_reduce<0b01, 0b00, opc, asm, ZPR16, FPR64>;
def _S : sve_int_reduce<0b10, 0b00, opc, asm, ZPR32, FPR64>;
}
// sve_int_reduce variants with fmt = 0b00 for ZPR8/ZPR16/ZPR32/ZPR64 sources;
// every variant writes an FPR64 result.
multiclass sve_int_reduce_0_uaddv<bits<3> opc, string asm> {
def _B : sve_int_reduce<0b00, 0b00, opc, asm, ZPR8, FPR64>;
def _H : sve_int_reduce<0b01, 0b00, opc, asm, ZPR16, FPR64>;
def _S : sve_int_reduce<0b10, 0b00, opc, asm, ZPR32, FPR64>;
def _D : sve_int_reduce<0b11, 0b00, opc, asm, ZPR64, FPR64>;
}
// sve_int_reduce variants with fmt = 0b01; the result register class matches
// the source element width (ZPR8 -> FPR8 ... ZPR64 -> FPR64).
multiclass sve_int_reduce_1<bits<3> opc, string asm> {
def _B : sve_int_reduce<0b00, 0b01, opc, asm, ZPR8, FPR8>;
def _H : sve_int_reduce<0b01, 0b01, opc, asm, ZPR16, FPR16>;
def _S : sve_int_reduce<0b10, 0b01, opc, asm, ZPR32, FPR32>;
def _D : sve_int_reduce<0b11, 0b01, opc, asm, ZPR64, FPR64>;
}
// sve_int_reduce variants with fmt = 0b11; the result register class matches
// the source element width (ZPR8 -> FPR8 ... ZPR64 -> FPR64).
multiclass sve_int_reduce_2<bits<3> opc, string asm> {
def _B : sve_int_reduce<0b00, 0b11, opc, asm, ZPR8, FPR8>;
def _H : sve_int_reduce<0b01, 0b11, opc, asm, ZPR16, FPR16>;
def _S : sve_int_reduce<0b10, 0b11, opc, asm, ZPR32, FPR32>;
def _D : sve_int_reduce<0b11, 0b11, opc, asm, ZPR64, FPR64>;
}
// Predicated format printed as "$Zd, $Pg<pg_suffix>, $Zn". The input operand
// list is supplied by the instantiating record through iops, and pg_suffix is
// spliced into the assembly string after $Pg.
// sz8_32 -> Inst{23-22}, opc -> Inst{18-16}.
class sve_int_movprfx_pred<bits<2> sz8_32, bits<3> opc, string asm,
ZPRRegOp zprty, string pg_suffix, dag iops>
: I<(outs zprty:$Zd), iops,
asm, "\t$Zd, $Pg"#pg_suffix#", $Zn",
"",
[]>, Sched<[]> {
// Encoded operand fields.
bits<3> Pg;
bits<5> Zd;
bits<5> Zn;
// Fixed bits and operand placement, most-significant field first.
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_32;
let Inst{21-19} = 0b010;
let Inst{18-16} = opc;
let Inst{15-13} = 0b001;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
// Element size is taken from the vector register operand type.
let ElementSize = zprty.ElementSize;
}
// _B/_H/_S/_D variants of sve_int_movprfx_pred printed with the "/m" predicate
// suffix. Each takes an extra input $_Zd that the enclosing Constraints string
// ties to the output $Zd.
multiclass sve_int_movprfx_pred_merge<bits<3> opc, string asm> {
let Constraints = "$Zd = $_Zd" in {
def _B : sve_int_movprfx_pred<0b00, opc, asm, ZPR8, "/m",
(ins ZPR8:$_Zd, PPR3bAny:$Pg, ZPR8:$Zn)>;
def _H : sve_int_movprfx_pred<0b01, opc, asm, ZPR16, "/m",
(ins ZPR16:$_Zd, PPR3bAny:$Pg, ZPR16:$Zn)>;
def _S : sve_int_movprfx_pred<0b10, opc, asm, ZPR32, "/m",
(ins ZPR32:$_Zd, PPR3bAny:$Pg, ZPR32:$Zn)>;
def _D : sve_int_movprfx_pred<0b11, opc, asm, ZPR64, "/m",
(ins ZPR64:$_Zd, PPR3bAny:$Pg, ZPR64:$Zn)>;
}
}
// _B/_H/_S/_D variants of sve_int_movprfx_pred printed with the "/z" predicate
// suffix. The inputs are only $Pg and $Zn; no operand is tied to the output.
multiclass sve_int_movprfx_pred_zero<bits<3> opc, string asm> {
def _B : sve_int_movprfx_pred<0b00, opc, asm, ZPR8, "/z",
(ins PPR3bAny:$Pg, ZPR8:$Zn)>;
def _H : sve_int_movprfx_pred<0b01, opc, asm, ZPR16, "/z",
(ins PPR3bAny:$Pg, ZPR16:$Zn)>;
def _S : sve_int_movprfx_pred<0b10, opc, asm, ZPR32, "/z",
(ins PPR3bAny:$Pg, ZPR32:$Zn)>;
def _D : sve_int_movprfx_pred<0b11, opc, asm, ZPR64, "/z",
(ins PPR3bAny:$Pg, ZPR64:$Zn)>;
}
//===----------------------------------------------------------------------===//
// SVE Propagate Break Group
//===----------------------------------------------------------------------===//
// Predicate format "$Pd, $Pg/z, $Pn, $Pm": PPR8 result from two PPR8 sources
// under governing predicate $Pg (PPRAny, 4-bit field).
// opc{1} -> Inst{22} and opc{0} -> Inst{4}.
class sve_int_brkp<bits<2> opc, string asm>
: I<(outs PPR8:$Pd), (ins PPRAny:$Pg, PPR8:$Pn, PPR8:$Pm),
asm, "\t$Pd, $Pg/z, $Pn, $Pm",
"",
[]>, Sched<[]> {
// Encoded operand fields.
bits<4> Pd;
bits<4> Pg;
bits<4> Pm;
bits<4> Pn;
// Fixed bits and operand placement, most-significant field first.
let Inst{31-24} = 0b00100101;
let Inst{23} = 0b0;
let Inst{22} = opc{1};
let Inst{21-20} = 0b00;
let Inst{19-16} = Pm;
let Inst{15-14} = 0b11;
let Inst{13-10} = Pg;
let Inst{9} = 0b0;
let Inst{8-5} = Pn;
let Inst{4} = opc{0};
let Inst{3-0} = Pd;
// Variants with opc{1} set are modelled as writing NZCV.
let Defs = !if(!eq (opc{1}, 1), [NZCV], []);
}
//===----------------------------------------------------------------------===//
// SVE Partition Break Group
//===----------------------------------------------------------------------===//
// Predicate format "$Pdm, $Pg/z, $Pn, $_Pdm": the PPR8 result $Pdm is tied to
// the input $_Pdm through Constraints, so only one Pdm field is encoded.
// Bit S is placed in Inst{22}.
class sve_int_brkn<bit S, string asm>
: I<(outs PPR8:$Pdm), (ins PPRAny:$Pg, PPR8:$Pn, PPR8:$_Pdm),
asm, "\t$Pdm, $Pg/z, $Pn, $_Pdm",
"",
[]>, Sched<[]> {
// Encoded operand fields.
bits<4> Pdm;
bits<4> Pg;
bits<4> Pn;
// Fixed bits and operand placement, most-significant field first.
let Inst{31-23} = 0b001001010;
let Inst{22} = S;
let Inst{21-14} = 0b01100001;
let Inst{13-10} = Pg;
let Inst{9} = 0b0;
let Inst{8-5} = Pn;
let Inst{4} = 0b0;
let Inst{3-0} = Pdm;
let Constraints = "$Pdm = $_Pdm";
// The S = 1 variant is modelled as writing NZCV.
let Defs = !if(!eq (S, 0b1), [NZCV], []);
}
// Predicate format printed as "$Pd, $Pg<suffix>, $Pn". The input operand list
// is supplied by the instantiating record through iops, and suffix is spliced
// into the assembly string after $Pg.
// opc{2-1} -> Inst{23-22} and opc{0} -> Inst{4}.
class sve_int_break<bits<3> opc, string asm, string suffix, dag iops>
: I<(outs PPR8:$Pd), iops,
asm, "\t$Pd, $Pg"#suffix#", $Pn",
"",
[]>, Sched<[]> {
// Encoded operand fields.
bits<4> Pd;
bits<4> Pg;
bits<4> Pn;
// Fixed bits and operand placement, most-significant field first.
let Inst{31-24} = 0b00100101;
let Inst{23-22} = opc{2-1};
let Inst{21-14} = 0b01000001;
let Inst{13-10} = Pg;
let Inst{9} = 0b0;
let Inst{8-5} = Pn;
let Inst{4} = opc{0};
let Inst{3-0} = Pd;
// Only when opc{0} is 1 is $Pd tied to an input named $_Pd, so iops must
// then contain $_Pd.
let Constraints = !if(!eq (opc{0}, 1), "$Pd = $_Pd", "");
// Variants with opc{1} set are modelled as writing NZCV.
let Defs = !if(!eq (opc{1}, 1), [NZCV], []);
}
// Defines a single record named exactly NAME: the sve_int_break form printed
// with the "/m" suffix and taking the extra PPR8:$_Pd input.
multiclass sve_int_break_m<bits<3> opc, string asm> {
def NAME : sve_int_break<opc, asm, "/m", (ins PPR8:$_Pd, PPRAny:$Pg, PPR8:$Pn)>;
}
// Defines a single record named exactly NAME: the sve_int_break form printed
// with the "/z" suffix, whose only inputs are $Pg and $Pn.
multiclass sve_int_break_z<bits<3> opc, string asm> {
def NAME : sve_int_break<opc, asm, "/z", (ins PPRAny:$Pg, PPR8:$Pn)>;
}
//===----------------------------------------------------------------------===//
// SVE2 String Processing Group
//===----------------------------------------------------------------------===//
// Predicated format "$Pd, $Pg/z, $Zn, $Zm": compares two vectors of type zprty
// under predicate $Pg and writes a predicate of type pprty; always modelled as
// writing NZCV. sz -> Inst{22} and opc -> Inst{4}.
class sve2_char_match<bit sz, bit opc, string asm,
PPRRegOp pprty, ZPRRegOp zprty>
: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty:$Zn, zprty:$Zm),
asm, "\t$Pd, $Pg/z, $Zn, $Zm",
"",
[]>, Sched<[]> {
// Encoded operand fields.
bits<4> Pd;
bits<3> Pg;
bits<5> Zm;
bits<5> Zn;
// Fixed bits and operand placement, most-significant field first.
let Inst{31-23} = 0b010001010;
let Inst{22} = sz;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-13} = 0b100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4} = opc;
let Inst{3-0} = Pd;
let Defs = [NZCV];
}
// Multiclass with the same name as the class it instantiates: defines the _B
// (sz = 0, PPR8/ZPR8) and _H (sz = 1, PPR16/ZPR16) variants only.
multiclass sve2_char_match<bit opc, string asm> {
def _B : sve2_char_match<0b0, opc, asm, PPR8, ZPR8>;
def _H : sve2_char_match<0b1, opc, asm, PPR16, ZPR16>;
}
//===----------------------------------------------------------------------===//
// SVE2 Histogram Computation - Segment Group
//===----------------------------------------------------------------------===//
// Unpredicated three-register format "$Zd, $Zn, $Zm" fixed to ZPR8 operands;
// the only template parameter is the mnemonic, so the encoding has no
// variable opcode bits.
class sve2_hist_gen_segment<string asm>
: I<(outs ZPR8:$Zd), (ins ZPR8:$Zn, ZPR8:$Zm),
asm, "\t$Zd, $Zn, $Zm",
"",
[]>, Sched<[]> {
// Encoded operand fields.
bits<5> Zd;
bits<5> Zn;
bits<5> Zm;
// Fixed bits and operand placement, most-significant field first.
let Inst{31-21} = 0b01000101001;
let Inst{20-16} = Zm;
let Inst{15-10} = 0b101000;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
//===----------------------------------------------------------------------===//
// SVE2 Histogram Computation - Vector Group
//===----------------------------------------------------------------------===//
// Predicated three-register format "$Zd, $Pg/z, $Zn, $Zm" in which all vector
// operands share the type zprty. sz is placed in Inst{22}.
class sve2_hist_gen_vector<bit sz, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zn, zprty:$Zm),
asm, "\t$Zd, $Pg/z, $Zn, $Zm",
"",
[]>, Sched<[]> {
// Encoded operand fields.
bits<5> Zd;
bits<5> Zn;
bits<3> Pg;
bits<5> Zm;
// Fixed bits and operand placement, most-significant field first.
let Inst{31-23} = 0b010001011;
let Inst{22} = sz;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-13} = 0b110;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
// Multiclass with the same name as the class it instantiates: defines the _S
// (sz = 0, ZPR32) and _D (sz = 1, ZPR64) variants only.
multiclass sve2_hist_gen_vector<string asm> {
def _S : sve2_hist_gen_vector<0b0, asm, ZPR32>;
def _D : sve2_hist_gen_vector<0b1, asm, ZPR64>;
}
//===----------------------------------------------------------------------===//
// SVE2 Crypto Extensions Group
//===----------------------------------------------------------------------===//
// Unpredicated, non-destructive three-register format "$Zd, $Zn, $Zm" with
// all operands of type zprty. The single opc bit is placed in Inst{10}.
class sve2_crypto_cons_bin_op<bit opc, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
asm, "\t$Zd, $Zn, $Zm",
"",
[]>, Sched<[]> {
// Encoded operand fields.
bits<5> Zd;
bits<5> Zn;
bits<5> Zm;
// Fixed bits and operand placement, most-significant field first.
let Inst{31-21} = 0b01000101001;
let Inst{20-16} = Zm;
let Inst{15-11} = 0b11110;
let Inst{10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
}
// Destructive two-source format "$Zdn, $_Zdn, $Zm": the result $Zdn is tied to
// the first input $_Zdn through Constraints, so only Zdn and Zm are encoded.
// opc{1} -> Inst{16} and opc{0} -> Inst{10}.
class sve2_crypto_des_bin_op<bits<2> opc, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm),
asm, "\t$Zdn, $_Zdn, $Zm",
"",
[]>, Sched<[]> {
// Encoded operand fields.
bits<5> Zdn;
bits<5> Zm;
// Fixed bits and operand placement, most-significant field first.
let Inst{31-17} = 0b010001010010001;
let Inst{16} = opc{1};
let Inst{15-11} = 0b11100;
let Inst{10} = opc{0};
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
}
// Destructive one-operand format "$Zdn, $_Zdn" fixed to ZPR8: the result $Zdn
// is tied to the input $_Zdn through Constraints, and Inst{9-5} is hard-wired
// to zero. The single opc bit is placed in Inst{10}.
class sve2_crypto_unary_op<bit opc, string asm>
: I<(outs ZPR8:$Zdn), (ins ZPR8:$_Zdn),
asm, "\t$Zdn, $_Zdn",
"",
[]>, Sched<[]> {
// Encoded operand field.
bits<5> Zdn;
// Fixed bits and operand placement, most-significant field first.
let Inst{31-11} = 0b010001010010000011100;
let Inst{10} = opc;
let Inst{9-5} = 0b00000;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
}
Index: vendor/llvm/dist-release_90/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- vendor/llvm/dist-release_90/lib/Target/ARM/ARMISelLowering.cpp (revision 351302)
+++ vendor/llvm/dist-release_90/lib/Target/ARM/ARMISelLowering.cpp (revision 351303)
@@ -1,15888 +1,15889 @@
//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that ARM uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#include "ARMISelLowering.h"
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMCallingConv.h"
#include "ARMConstantPoolValue.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMPerfectShuffle.h"
#include "ARMRegisterInfo.h"
#include "ARMSelectionDAGInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "Utils/ARMBaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <limits>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
using namespace llvm;
using namespace llvm::PatternMatch;
// Debug/statistic category name used by the STATISTIC counters below.
#define DEBUG_TYPE "arm-isel"
// Pass statistics; each counter carries its own description string.
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
STATISTIC(NumConstpoolPromoted,
"Number of constants with their storage promoted into constant pools");
// Hidden command-line switch "arm-interworking"; defaults to true.
static cl::opt<bool>
ARMInterworking("arm-interworking", cl::Hidden,
cl::desc("Enable / disable ARM interworking (for debugging only)"),
cl::init(true));
// Hidden command-line switch "arm-promote-constant"; defaults to false.
static cl::opt<bool> EnableConstpoolPromotion(
"arm-promote-constant", cl::Hidden,
cl::desc("Enable / disable promotion of unnamed_addr constants into "
"constant pools"),
cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
// Hidden option "arm-promote-constant-max-size": per-constant size limit,
// default 64.
static cl::opt<unsigned> ConstpoolPromotionMaxSize(
"arm-promote-constant-max-size", cl::Hidden,
cl::desc("Maximum size of constant to promote into a constant pool"),
cl::init(64));
// Hidden option "arm-promote-constant-max-total": limit on the combined size
// of all promoted constants, default 128.
static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
"arm-promote-constant-max-total", cl::Hidden,
cl::desc("Maximum size of ALL constants to promote into a constant pool"),
cl::init(128));
// The APCS parameter registers.
static const MCPhysReg GPRArgRegs[] = {
ARM::R0, ARM::R1, ARM::R2, ARM::R3
};
/// Install the operation actions shared by every NEON vector type \p VT.
///
/// Loads and stores of \p VT are promoted to \p PromotedLdStVT, and the
/// integer bitwise operations (AND/OR/XOR) are promoted to
/// \p PromotedBitwiseVT, unless \p VT already is the respective promoted type.
void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
                                       MVT PromotedBitwiseVT) {
  const MVT EltVT = VT.getVectorElementType();
  const bool IsIntegerVT = VT.isInteger();

  // Memory accesses go through the promoted load/store type.
  if (VT != PromotedLdStVT) {
    for (auto MemOpc : {ISD::LOAD, ISD::STORE}) {
      setOperationAction(MemOpc, VT, Promote);
      AddPromotedToType(MemOpc, VT, PromotedLdStVT);
    }
  }

  // SETCC is custom lowered for every element type except f64.
  if (EltVT != MVT::f64)
    setOperationAction(ISD::SETCC, VT, Custom);

  // Element access and vector construction/shuffling are custom lowered.
  for (auto Opc : {ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT,
                   ISD::BUILD_VECTOR, ISD::VECTOR_SHUFFLE})
    setOperationAction(Opc, VT, Custom);

  // Int <-> FP conversions are custom lowered for i32 elements and expanded
  // for every other element type.
  const LegalizeAction ConvAction = (EltVT == MVT::i32) ? Custom : Expand;
  for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT,
                   ISD::FP_TO_UINT})
    setOperationAction(Opc, VT, ConvAction);

  for (auto Opc : {ISD::CONCAT_VECTORS, ISD::EXTRACT_SUBVECTOR})
    setOperationAction(Opc, VT, Legal);

  for (auto Opc : {ISD::SELECT, ISD::SELECT_CC, ISD::VSELECT,
                   ISD::SIGN_EXTEND_INREG})
    setOperationAction(Opc, VT, Expand);

  if (IsIntegerVT) {
    for (auto ShiftOpc : {ISD::SHL, ISD::SRA, ISD::SRL})
      setOperationAction(ShiftOpc, VT, Custom);

    // Promote all bit-wise operations.
    if (VT != PromotedBitwiseVT) {
      for (auto BitOpc : {ISD::AND, ISD::OR, ISD::XOR}) {
        setOperationAction(BitOpc, VT, Promote);
        AddPromotedToType(BitOpc, VT, PromotedBitwiseVT);
      }
    }
  }

  // Neon does not support vector divide/remainder operations.
  for (auto Opc : {ISD::SDIV, ISD::UDIV, ISD::FDIV, ISD::SREM, ISD::UREM,
                   ISD::FREM})
    setOperationAction(Opc, VT, Expand);

  // ABS and integer min/max are legal for every non-floating-point type other
  // than v2i64 and v1i64.
  const bool IsExcludedI64Vec = (VT == MVT::v2i64) || (VT == MVT::v1i64);
  if (!VT.isFloatingPoint() && !IsExcludedI64Vec) {
    for (auto Opc : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
      setOperationAction(Opc, VT, Legal);
  }
}
/// Register \p VT in the DPR register class and apply the common NEON actions,
/// promoting loads/stores to f64 and integer bitwise operations to v2i32.
void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
  const MVT LdStVT = MVT::f64;
  const MVT BitwiseVT = MVT::v2i32;

  addRegisterClass(VT, &ARM::DPRRegClass);
  addTypeForNEON(VT, LdStVT, BitwiseVT);
}
/// Register \p VT in the DPair register class and apply the common NEON
/// actions, promoting loads/stores to v2f64 and integer bitwise operations to
/// v4i32.
void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
  const MVT LdStVT = MVT::v2f64;
  const MVT BitwiseVT = MVT::v4i32;

  addRegisterClass(VT, &ARM::DPairRegClass);
  addTypeForNEON(VT, LdStVT, BitwiseVT);
}
/// Mark every built-in operation on \p VT as Expand, then re-enable the few
/// trivial ones (bitcast, load, store, undef) as Legal.
void ARMTargetLowering::setAllExpand(MVT VT) {
  // Blanket default: nothing is natively supported.
  unsigned Opc = 0;
  while (Opc < ISD::BUILTIN_OP_END) {
    setOperationAction(Opc, VT, Expand);
    ++Opc;
  }

  // These really simple operations stay supported even on types whose actual
  // arithmetic has to be broken down into simpler operations or turned into
  // library calls. They must be set after the blanket Expand above.
  for (auto TrivialOpc : {ISD::BITCAST, ISD::LOAD, ISD::STORE, ISD::UNDEF})
    setOperationAction(TrivialOpc, VT, Legal);
}
void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
LegalizeAction Action) {
setLoadExtAction(ISD::EXTLOAD, From, To, Action);
setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
}
void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };
for (auto VT : IntTypes) {
addRegisterClass(VT, &ARM::QPRRegClass);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SMIN, VT, Legal);
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMIN, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::ABS, VT, Legal);
// No native support for these.
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
if (!HasMVEFP) {
setOperationAction(ISD::SINT_TO_FP, VT, Expand);
setOperationAction(ISD::UINT_TO_FP, VT, Expand);
setOperationAction(ISD::FP_TO_SINT, VT, Expand);
setOperationAction(ISD::FP_TO_UINT, VT, Expand);
}
}
const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
for (auto VT : FloatTypes) {
addRegisterClass(VT, &ARM::QPRRegClass);
if (!HasMVEFP)
setAllExpand(VT);
// These are legal or custom whether we have MVE.fp or not
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getVectorElementType(), Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
if (HasMVEFP) {
setOperationAction(ISD::FMINNUM, VT, Legal);
setOperationAction(ISD::FMAXNUM, VT, Legal);
setOperationAction(ISD::FROUND, VT, Legal);
// No native support for these.
setOperationAction(ISD::FDIV, VT, Expand);
setOperationAction(ISD::FREM, VT, Expand);
setOperationAction(ISD::FSQRT, VT, Expand);
setOperationAction(ISD::FSIN, VT, Expand);
setOperationAction(ISD::FCOS, VT, Expand);
setOperationAction(ISD::FPOW, VT, Expand);
setOperationAction(ISD::FLOG, VT, Expand);
setOperationAction(ISD::FLOG2, VT, Expand);
setOperationAction(ISD::FLOG10, VT, Expand);
setOperationAction(ISD::FEXP, VT, Expand);
setOperationAction(ISD::FEXP2, VT, Expand);
setOperationAction(ISD::FNEARBYINT, VT, Expand);
}
}
// We 'support' these types up to bitcast/load/store level, regardless of
// MVE integer-only / float support. Only doing FP data processing on the FP
// vector types is inhibited at integer-only level.
const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
for (auto VT : LongTypes) {
addRegisterClass(VT, &ARM::QPRRegClass);
setAllExpand(VT);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
}
// We can do bitwise operations on v2i64 vectors
setOperationAction(ISD::AND, MVT::v2i64, Legal);
setOperationAction(ISD::OR, MVT::v2i64, Legal);
setOperationAction(ISD::XOR, MVT::v2i64, Legal);
// It is legal to extload from v4i8 to v4i16 or v4i32.
addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);
// Some truncating stores are legal too.
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
}
ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
const ARMSubtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
RegInfo = Subtarget->getRegisterInfo();
Itins = Subtarget->getInstrItineraryData();
setBooleanContents(ZeroOrOneBooleanContent);
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() &&
!Subtarget->isTargetWatchOS()) {
bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard;
for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID)
setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID),
IsHFTarget ? CallingConv::ARM_AAPCS_VFP
: CallingConv::ARM_AAPCS);
}
if (Subtarget->isTargetMachO()) {
// Uses VFP for Thumb libfuncs if available.
if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
static const struct {
const RTLIB::Libcall Op;
const char * const Name;
const ISD::CondCode Cond;
} LibraryCalls[] = {
// Single-precision floating-point arithmetic.
{ RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
{ RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
{ RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
{ RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },
// Double-precision floating-point arithmetic.
{ RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
{ RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
{ RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
{ RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },
// Single-precision comparisons.
{ RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE },
{ RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE },
{ RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE },
{ RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE },
{ RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE },
{ RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE },
{ RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE },
{ RTLIB::O_F32, "__unordsf2vfp", ISD::SETEQ },
// Double-precision comparisons.
{ RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE },
{ RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE },
{ RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE },
{ RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE },
{ RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE },
{ RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE },
{ RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE },
{ RTLIB::O_F64, "__unorddf2vfp", ISD::SETEQ },
// Floating-point to integer conversions.
// i64 conversions are done via library routines even when generating VFP
// instructions, so use the same ones.
{ RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID },
{ RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
{ RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID },
{ RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },
// Conversions between floating types.
{ RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID },
{ RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID },
// Integer to floating-point conversions.
// i64 conversions are done via library routines even when generating VFP
// instructions, so use the same ones.
// FIXME: There appears to be some naming inconsistency in ARM libgcc:
// e.g., __floatunsidf vs. __floatunssidfvfp.
{ RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID },
{ RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
{ RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID },
{ RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
};
for (const auto &LC : LibraryCalls) {
setLibcallName(LC.Op, LC.Name);
if (LC.Cond != ISD::SETCC_INVALID)
setCmpLibcallCC(LC.Op, LC.Cond);
}
}
}
// These libcalls are not available in 32-bit.
setLibcallName(RTLIB::SHL_I128, nullptr);
setLibcallName(RTLIB::SRL_I128, nullptr);
setLibcallName(RTLIB::SRA_I128, nullptr);
// RTLIB
if (Subtarget->isAAPCS_ABI() &&
(Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
static const struct {
const RTLIB::Libcall Op;
const char * const Name;
const CallingConv::ID CC;
const ISD::CondCode Cond;
} LibraryCalls[] = {
// Double-precision floating-point arithmetic helper functions
// RTABI chapter 4.1.2, Table 2
{ RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
// Double-precision floating-point comparison helper functions
// RTABI chapter 4.1.2, Table 3
{ RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
{ RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
{ RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
{ RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
{ RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
{ RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
{ RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
{ RTLIB::O_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },
// Single-precision floating-point arithmetic helper functions
// RTABI chapter 4.1.2, Table 4
{ RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
// Single-precision floating-point comparison helper functions
// RTABI chapter 4.1.2, Table 5
{ RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
{ RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
{ RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
{ RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
{ RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
{ RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
{ RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
{ RTLIB::O_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },
// Floating-point to integer conversions.
// RTABI chapter 4.1.2, Table 6
{ RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
// Conversions between floating types.
// RTABI chapter 4.1.2, Table 7
{ RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
// Integer to floating-point conversions.
// RTABI chapter 4.1.2, Table 8
{ RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
// Long long helper functions
// RTABI chapter 4.2, Table 9
{ RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
// Integer division functions
// RTABI chapter 4.3.1
{ RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
};
for (const auto &LC : LibraryCalls) {
setLibcallName(LC.Op, LC.Name);
setLibcallCallingConv(LC.Op, LC.CC);
if (LC.Cond != ISD::SETCC_INVALID)
setCmpLibcallCC(LC.Op, LC.Cond);
}
// EABI dependent RTLIB
if (TM.Options.EABIVersion == EABI::EABI4 ||
TM.Options.EABIVersion == EABI::EABI5) {
static const struct {
const RTLIB::Libcall Op;
const char *const Name;
const CallingConv::ID CC;
const ISD::CondCode Cond;
} MemOpsLibraryCalls[] = {
// Memory operations
// RTABI chapter 4.3.4
{ RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
{ RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
};
for (const auto &LC : MemOpsLibraryCalls) {
setLibcallName(LC.Op, LC.Name);
setLibcallCallingConv(LC.Op, LC.CC);
if (LC.Cond != ISD::SETCC_INVALID)
setCmpLibcallCC(LC.Op, LC.Cond);
}
}
}
if (Subtarget->isTargetWindows()) {
static const struct {
const RTLIB::Libcall Op;
const char * const Name;
const CallingConv::ID CC;
} LibraryCalls[] = {
{ RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
{ RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
{ RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
{ RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
{ RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
{ RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
{ RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
{ RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
};
for (const auto &LC : LibraryCalls) {
setLibcallName(LC.Op, LC.Name);
setLibcallCallingConv(LC.Op, LC.CC);
}
}
// Use divmod compiler-rt calls for iOS 5.0 and later.
if (Subtarget->isTargetMachO() &&
!(Subtarget->isTargetIOS() &&
Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
}
// The half <-> float conversion functions are always soft-float on
// non-watchos platforms, but are needed for some targets which use a
// hard-float calling convention by default.
if (!Subtarget->isTargetWatchABI()) {
if (Subtarget->isAAPCS_ABI()) {
setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
} else {
setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
}
}
// In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
// a __gnu_ prefix (which is the default).
if (Subtarget->isTargetAEABI()) {
static const struct {
const RTLIB::Libcall Op;
const char * const Name;
const CallingConv::ID CC;
} LibraryCalls[] = {
{ RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS },
{ RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS },
{ RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS },
};
for (const auto &LC : LibraryCalls) {
setLibcallName(LC.Op, LC.Name);
setLibcallCallingConv(LC.Op, LC.CC);
}
}
if (Subtarget->isThumb1Only())
addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
else
addRegisterClass(MVT::i32, &ARM::GPRRegClass);
if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
Subtarget->hasFPRegs()) {
addRegisterClass(MVT::f32, &ARM::SPRRegClass);
addRegisterClass(MVT::f64, &ARM::DPRRegClass);
if (!Subtarget->hasVFP2Base())
setAllExpand(MVT::f32);
if (!Subtarget->hasFP64())
setAllExpand(MVT::f64);
}
if (Subtarget->hasFullFP16()) {
addRegisterClass(MVT::f16, &ARM::HPRRegClass);
setOperationAction(ISD::BITCAST, MVT::i16, Custom);
setOperationAction(ISD::BITCAST, MVT::i32, Custom);
setOperationAction(ISD::BITCAST, MVT::f16, Custom);
setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
}
for (MVT VT : MVT::vector_valuetypes()) {
for (MVT InnerVT : MVT::vector_valuetypes()) {
setTruncStoreAction(VT, InnerVT, Expand);
addAllExtLoads(VT, InnerVT, Expand);
}
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
setOperationAction(ISD::BSWAP, VT, Expand);
}
setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);
setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);
if (Subtarget->hasMVEIntegerOps())
addMVEVectorTypes(Subtarget->hasMVEFloatOps());
// Combine low-overhead loop intrinsics so that we can lower i1 types.
if (Subtarget->hasLOB())
setTargetDAGCombine(ISD::BRCOND);
if (Subtarget->hasNEON()) {
addDRTypeForNEON(MVT::v2f32);
addDRTypeForNEON(MVT::v8i8);
addDRTypeForNEON(MVT::v4i16);
addDRTypeForNEON(MVT::v2i32);
addDRTypeForNEON(MVT::v1i64);
addQRTypeForNEON(MVT::v4f32);
addQRTypeForNEON(MVT::v2f64);
addQRTypeForNEON(MVT::v16i8);
addQRTypeForNEON(MVT::v8i16);
addQRTypeForNEON(MVT::v4i32);
addQRTypeForNEON(MVT::v2i64);
if (Subtarget->hasFullFP16()) {
addQRTypeForNEON(MVT::v8f16);
addDRTypeForNEON(MVT::v4f16);
}
}
if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
// v2f64 is legal so that QR subregs can be extracted as f64 elements, but
// none of Neon, MVE or VFP supports any arithmetic operations on it.
setOperationAction(ISD::FADD, MVT::v2f64, Expand);
setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
// FIXME: Code duplication: FDIV and FREM are expanded always, see
// ARMTargetLowering::addTypeForNEON method for details.
setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
setOperationAction(ISD::FREM, MVT::v2f64, Expand);
// FIXME: Create unittest.
// In other words, find a case where "copysign" appears in the DAG with vector
// operands.
setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
// FIXME: Code duplication: SETCC has custom operation action, see
// ARMTargetLowering::addTypeForNEON method for details.
setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
// FIXME: Create unittest for FNEG and for FABS.
setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
setOperationAction(ISD::FABS, MVT::v2f64, Expand);
setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
// FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
setOperationAction(ISD::FMA, MVT::v2f64, Expand);
}
if (Subtarget->hasNEON()) {
// The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
// supported for v4f32.
setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);
// Likewise, expand the same math operations for v2f32.
setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);
// Neon does not support some operations on v1i64 and v2i64 types.
setOperationAction(ISD::MUL, MVT::v1i64, Expand);
// Custom handling for some quad-vector types to detect VMULL.
setOperationAction(ISD::MUL, MVT::v8i16, Custom);
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
// Custom handling for some vector types to avoid expensive expansions
setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
// Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
// a destination type that is wider than the source, and nor does
// it have a FP_TO_[SU]INT instruction with a narrower destination than
// source.
setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);
// NEON does not have single instruction CTPOP for vectors with element
// types wider than 8-bits. However, custom lowering can leverage the
// v8i8/v16i8 vcnt instruction.
setOperationAction(ISD::CTPOP, MVT::v2i32, Custom);
setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
setOperationAction(ISD::CTPOP, MVT::v4i16, Custom);
setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
setOperationAction(ISD::CTPOP, MVT::v1i64, Custom);
setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);
setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
// NEON does not have single instruction CTTZ for vectors.
setOperationAction(ISD::CTTZ, MVT::v8i8, Custom);
setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
// NEON only has FMA instructions as of VFP4.
if (!Subtarget->hasVFP4Base()) {
setOperationAction(ISD::FMA, MVT::v2f32, Expand);
setOperationAction(ISD::FMA, MVT::v4f32, Expand);
}
setTargetDAGCombine(ISD::INTRINSIC_VOID);
setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::SRA);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::FP_TO_SINT);
setTargetDAGCombine(ISD::FP_TO_UINT);
setTargetDAGCombine(ISD::FDIV);
setTargetDAGCombine(ISD::LOAD);
// It is legal to extload from v4i8 to v4i16 or v4i32.
for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
MVT::v2i32}) {
for (MVT VT : MVT::integer_vector_valuetypes()) {
setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal);
setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal);
setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
}
}
}
if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
setTargetDAGCombine(ISD::BUILD_VECTOR);
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
}
if (!Subtarget->hasFP64()) {
// When targeting a floating-point unit with only single-precision
// operations, f64 is legal for the few double-precision instructions which
// are present. However, no double-precision operations other than moves,
// loads and stores are provided by the hardware.
setOperationAction(ISD::FADD, MVT::f64, Expand);
setOperationAction(ISD::FSUB, MVT::f64, Expand);
setOperationAction(ISD::FMUL, MVT::f64, Expand);
setOperationAction(ISD::FMA, MVT::f64, Expand);
setOperationAction(ISD::FDIV, MVT::f64, Expand);
setOperationAction(ISD::FREM, MVT::f64, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
setOperationAction(ISD::FGETSIGN, MVT::f64, Expand);
setOperationAction(ISD::FNEG, MVT::f64, Expand);
setOperationAction(ISD::FABS, MVT::f64, Expand);
setOperationAction(ISD::FSQRT, MVT::f64, Expand);
setOperationAction(ISD::FSIN, MVT::f64, Expand);
setOperationAction(ISD::FCOS, MVT::f64, Expand);
setOperationAction(ISD::FPOW, MVT::f64, Expand);
setOperationAction(ISD::FLOG, MVT::f64, Expand);
setOperationAction(ISD::FLOG2, MVT::f64, Expand);
setOperationAction(ISD::FLOG10, MVT::f64, Expand);
setOperationAction(ISD::FEXP, MVT::f64, Expand);
setOperationAction(ISD::FEXP2, MVT::f64, Expand);
setOperationAction(ISD::FCEIL, MVT::f64, Expand);
setOperationAction(ISD::FTRUNC, MVT::f64, Expand);
setOperationAction(ISD::FRINT, MVT::f64, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
setOperationAction(ISD::FFLOOR, MVT::f64, Expand);
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
}
if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()){
setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
}
if (!Subtarget->hasFP16())
setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
if (!Subtarget->hasFP64())
setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
computeRegisterProperties(Subtarget->getRegisterInfo());
// ARM does not have floating-point extending loads.
for (MVT VT : MVT::fp_valuetypes()) {
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
}
// ... or truncating stores
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
// ARM does not have i1 sign extending load.
for (MVT VT : MVT::integer_valuetypes())
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
// ARM supports all 4 flavors of integer indexed load / store.
if (!Subtarget->isThumb1Only()) {
for (unsigned im = (unsigned)ISD::PRE_INC;
im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
setIndexedLoadAction(im, MVT::i1, Legal);
setIndexedLoadAction(im, MVT::i8, Legal);
setIndexedLoadAction(im, MVT::i16, Legal);
setIndexedLoadAction(im, MVT::i32, Legal);
setIndexedStoreAction(im, MVT::i1, Legal);
setIndexedStoreAction(im, MVT::i8, Legal);
setIndexedStoreAction(im, MVT::i16, Legal);
setIndexedStoreAction(im, MVT::i32, Legal);
}
} else {
// Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal);
setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal);
}
setOperationAction(ISD::SADDO, MVT::i32, Custom);
setOperationAction(ISD::UADDO, MVT::i32, Custom);
setOperationAction(ISD::SSUBO, MVT::i32, Custom);
setOperationAction(ISD::USUBO, MVT::i32, Custom);
setOperationAction(ISD::ADDCARRY, MVT::i32, Custom);
setOperationAction(ISD::SUBCARRY, MVT::i32, Custom);
// i64 operation support.
setOperationAction(ISD::MUL, MVT::i64, Expand);
setOperationAction(ISD::MULHU, MVT::i32, Expand);
if (Subtarget->isThumb1Only()) {
setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
}
if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
|| (Subtarget->isThumb2() && !Subtarget->hasDSP()))
setOperationAction(ISD::MULHS, MVT::i32, Expand);
setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
setOperationAction(ISD::SRL, MVT::i64, Custom);
setOperationAction(ISD::SRA, MVT::i64, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
// MVE lowers 64-bit shifts to lsll and lsrl,
// assuming that ISD::SRL and SRA of i64 are already marked custom.
if (Subtarget->hasMVEIntegerOps())
setOperationAction(ISD::SHL, MVT::i64, Custom);
// Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
if (Subtarget->isThumb1Only()) {
setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
}
if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
// ARM does not have ROTL.
setOperationAction(ISD::ROTL, MVT::i32, Expand);
for (MVT VT : MVT::vector_valuetypes()) {
setOperationAction(ISD::ROTL, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
}
setOperationAction(ISD::CTTZ, MVT::i32, Custom);
setOperationAction(ISD::CTPOP, MVT::i32, Expand);
if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
setOperationAction(ISD::CTLZ, MVT::i32, Expand);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, LibCall);
}
// @llvm.readcyclecounter requires the Performance Monitors extension.
// Default to the 0 expansion on unsupported platforms.
// FIXME: Technically there are older ARM CPUs that have
// implementation-specific ways of obtaining this information.
if (Subtarget->hasPerfMon())
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
// BSWAP is only available on ARMv6 and later.
if (!Subtarget->hasV6Ops())
setOperationAction(ISD::BSWAP, MVT::i32, Expand);
bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
: Subtarget->hasDivideInARMMode();
if (!hasDivide) {
// These are expanded into libcalls if the cpu doesn't have HW divider.
setOperationAction(ISD::SDIV, MVT::i32, LibCall);
setOperationAction(ISD::UDIV, MVT::i32, LibCall);
}
if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) {
setOperationAction(ISD::SDIV, MVT::i32, Custom);
setOperationAction(ISD::UDIV, MVT::i32, Custom);
setOperationAction(ISD::SDIV, MVT::i64, Custom);
setOperationAction(ISD::UDIV, MVT::i64, Custom);
}
setOperationAction(ISD::SREM, MVT::i32, Expand);
setOperationAction(ISD::UREM, MVT::i32, Expand);
// Register based DivRem for AEABI (RTABI 4.2)
if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
Subtarget->isTargetWindows()) {
setOperationAction(ISD::SREM, MVT::i64, Custom);
setOperationAction(ISD::UREM, MVT::i64, Custom);
HasStandaloneRem = false;
if (Subtarget->isTargetWindows()) {
const struct {
const RTLIB::Libcall Op;
const char * const Name;
const CallingConv::ID CC;
} LibraryCalls[] = {
{ RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS },
{ RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS },
{ RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS },
{ RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS },
{ RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS },
{ RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS },
{ RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS },
{ RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS },
};
for (const auto &LC : LibraryCalls) {
setLibcallName(LC.Op, LC.Name);
setLibcallCallingConv(LC.Op, LC.CC);
}
} else {
const struct {
const RTLIB::Libcall Op;
const char * const Name;
const CallingConv::ID CC;
} LibraryCalls[] = {
{ RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
{ RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
{ RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
{ RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS },
{ RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
{ RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
{ RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
{ RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS },
};
for (const auto &LC : LibraryCalls) {
setLibcallName(LC.Op, LC.Name);
setLibcallCallingConv(LC.Op, LC.CC);
}
}
setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
setOperationAction(ISD::SDIVREM, MVT::i64, Custom);
setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
} else {
setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
}
if (Subtarget->isTargetWindows() && Subtarget->getTargetTriple().isOSMSVCRT())
for (auto &VT : {MVT::f32, MVT::f64})
setOperationAction(ISD::FPOWI, VT, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
setOperationAction(ISD::TRAP, MVT::Other, Legal);
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
// Use the default implementation.
setOperationAction(ISD::VASTART, MVT::Other, Custom);
setOperationAction(ISD::VAARG, MVT::Other, Expand);
setOperationAction(ISD::VACOPY, MVT::Other, Expand);
setOperationAction(ISD::VAEND, MVT::Other, Expand);
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
if (Subtarget->isTargetWindows())
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
else
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
// ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
// the default expansion.
InsertFencesForAtomic = false;
if (Subtarget->hasAnyDataBarrier() &&
(!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
// ATOMIC_FENCE needs custom lowering; the others should have been expanded
// to ldrex/strex loops already.
setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
if (!Subtarget->isThumb() || !Subtarget->isMClass())
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
// On v8, we have particularly efficient implementations of atomic fences
// if they can be combined with nearby atomic loads and stores.
if (!Subtarget->hasAcquireRelease() ||
getTargetMachine().getOptLevel() == 0) {
// Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
InsertFencesForAtomic = true;
}
} else {
// If there's anything we can use as a barrier, go through custom lowering
// for ATOMIC_FENCE.
// If the target has DMB in Thumb mode, fences can be inserted.
if (Subtarget->hasDataBarrier())
InsertFencesForAtomic = true;
setOperationAction(ISD::ATOMIC_FENCE, MVT::Other,
Subtarget->hasAnyDataBarrier() ? Custom : Expand);
// Set them all for expansion, which will force libcalls.
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand);
setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand);
setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand);
setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand);
setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand);
setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand);
setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand);
setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
// Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
// Unordered/Monotonic case.
if (!InsertFencesForAtomic) {
setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
}
}
setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
// Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
if (!Subtarget->hasV6Ops()) {
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
}
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
!Subtarget->isThumb1Only()) {
// Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
// iff target supports vfp2.
setOperationAction(ISD::BITCAST, MVT::i64, Custom);
setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
}
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
if (Subtarget->useSjLjEH())
setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
setOperationAction(ISD::SETCC, MVT::i32, Expand);
setOperationAction(ISD::SETCC, MVT::f32, Expand);
setOperationAction(ISD::SETCC, MVT::f64, Expand);
setOperationAction(ISD::SELECT, MVT::i32, Custom);
setOperationAction(ISD::SELECT, MVT::f32, Custom);
setOperationAction(ISD::SELECT, MVT::f64, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
if (Subtarget->hasFullFP16()) {
setOperationAction(ISD::SETCC, MVT::f16, Expand);
setOperationAction(ISD::SELECT, MVT::f16, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
}
setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom);
setOperationAction(ISD::BRCOND, MVT::Other, Custom);
setOperationAction(ISD::BR_CC, MVT::i32, Custom);
if (Subtarget->hasFullFP16())
setOperationAction(ISD::BR_CC, MVT::f16, Custom);
setOperationAction(ISD::BR_CC, MVT::f32, Custom);
setOperationAction(ISD::BR_CC, MVT::f64, Custom);
setOperationAction(ISD::BR_JT, MVT::Other, Custom);
// We don't support sin/cos/fmod/copysign/pow
setOperationAction(ISD::FSIN, MVT::f64, Expand);
setOperationAction(ISD::FSIN, MVT::f32, Expand);
setOperationAction(ISD::FCOS, MVT::f32, Expand);
setOperationAction(ISD::FCOS, MVT::f64, Expand);
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
setOperationAction(ISD::FREM, MVT::f64, Expand);
setOperationAction(ISD::FREM, MVT::f32, Expand);
if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
!Subtarget->isThumb1Only()) {
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
}
setOperationAction(ISD::FPOW, MVT::f64, Expand);
setOperationAction(ISD::FPOW, MVT::f32, Expand);
if (!Subtarget->hasVFP4Base()) {
setOperationAction(ISD::FMA, MVT::f64, Expand);
setOperationAction(ISD::FMA, MVT::f32, Expand);
}
// Various VFP goodness
if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
// FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
}
// fp16 is a special v7 extension that adds f16 <-> f32 conversions.
if (!Subtarget->hasFP16()) {
setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
}
}
// Use __sincos_stret if available.
if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
}
// FP-ARMv8 implements a lot of rounding-like FP operations.
if (Subtarget->hasFPARMv8Base()) {
setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
setOperationAction(ISD::FCEIL, MVT::f32, Legal);
setOperationAction(ISD::FROUND, MVT::f32, Legal);
setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
setOperationAction(ISD::FRINT, MVT::f32, Legal);
setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
if (Subtarget->hasNEON()) {
setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
}
if (Subtarget->hasFP64()) {
setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
setOperationAction(ISD::FCEIL, MVT::f64, Legal);
setOperationAction(ISD::FROUND, MVT::f64, Legal);
setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
setOperationAction(ISD::FRINT, MVT::f64, Legal);
setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
}
}
// FP16 operations often need to be promoted in order to call lib functions
if (Subtarget->hasFullFP16()) {
setOperationAction(ISD::FREM, MVT::f16, Promote);
setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
setOperationAction(ISD::FSIN, MVT::f16, Promote);
setOperationAction(ISD::FCOS, MVT::f16, Promote);
setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
setOperationAction(ISD::FPOWI, MVT::f16, Promote);
setOperationAction(ISD::FPOW, MVT::f16, Promote);
setOperationAction(ISD::FEXP, MVT::f16, Promote);
setOperationAction(ISD::FEXP2, MVT::f16, Promote);
setOperationAction(ISD::FLOG, MVT::f16, Promote);
setOperationAction(ISD::FLOG10, MVT::f16, Promote);
setOperationAction(ISD::FLOG2, MVT::f16, Promote);
setOperationAction(ISD::FROUND, MVT::f16, Legal);
}
if (Subtarget->hasNEON()) {
// vmin and vmax aren't available in a scalar form, so we use
// a NEON instruction with an undef lane instead.
setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal);
setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal);
setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);
setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal);
if (Subtarget->hasFullFP16()) {
setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal);
setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal);
setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal);
setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal);
setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal);
setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal);
setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal);
setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal);
}
}
// We have target-specific dag combine patterns for the following nodes:
// ARMISD::VMOVRRD - No need to call setTargetDAGCombine
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::XOR);
if (Subtarget->hasV6Ops())
setTargetDAGCombine(ISD::SRL);
if (Subtarget->isThumb1Only())
setTargetDAGCombine(ISD::SHL);
setStackPointerRegisterToSaveRestore(ARM::SP);
if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
!Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
setSchedulingPreference(Sched::RegPressure);
else
setSchedulingPreference(Sched::Hybrid);
//// temporary - rewrite interface to use type
MaxStoresPerMemset = 8;
MaxStoresPerMemsetOptSize = 4;
MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
MaxStoresPerMemcpyOptSize = 2;
MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
MaxStoresPerMemmoveOptSize = 2;
// On ARM arguments smaller than 4 bytes are extended, so all arguments
// are at least 4 bytes aligned.
setMinStackArgumentAlignment(4);
// Prefer likely predicted branches to selects on out-of-order cores.
PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
setPrefLoopAlignment(Subtarget->getPrefLoopAlignment());
setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
if (Subtarget->isThumb() || Subtarget->isThumb2())
setTargetDAGCombine(ISD::ABS);
}
/// Forwards the subtarget's useSoftFloat() answer.
bool ARMTargetLowering::useSoftFloat() const {
  const bool SoftFloat = Subtarget->useSoftFloat();
  return SoftFloat;
}
// FIXME: It might make sense to define the representative register class as the
// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
// a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently,
// SPR's representative would be DPR_VFP2. This should work well if register
// pressure tracking were modified such that a register use would increment the
// pressure of the register class's representative and all of it's super
// classes' representatives transitively. We have not implemented this because
// of the difficulty prior to coalescing of modeling operand register classes
// due to the common occurrence of cross class copies and subregister insertions
// and extractions.
std::pair<const TargetRegisterClass *, uint8_t>
ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
MVT VT) const {
const TargetRegisterClass *RRC = nullptr;
uint8_t Cost = 1;
switch (VT.SimpleTy) {
default:
return TargetLowering::findRepresentativeClass(TRI, VT);
// Use DPR as representative register class for all floating point
// and vector types. Since there are 32 SPR registers and 32 DPR registers so
// the cost is 1 for both f32 and f64.
case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
RRC = &ARM::DPRRegClass;
// When NEON is used for SP, only half of the register file is available
// because operations that define both SP and DP results will be constrained
// to the VFP2 class (D0-D15). We currently model this constraint prior to
// coalescing by double-counting the SP regs. See the FIXME above.
if (Subtarget->useNEONForSinglePrecisionFP())
Cost = 2;
break;
case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
case MVT::v4f32: case MVT::v2f64:
RRC = &ARM::DPRRegClass;
Cost = 2;
break;
case MVT::v4i64:
RRC = &ARM::DPRRegClass;
Cost = 4;
break;
case MVT::v8i64:
RRC = &ARM::DPRRegClass;
Cost = 8;
break;
}
return std::make_pair(RRC, Cost);
}
/// Returns the printable name ("ARMISD::<node>") of an ARM-specific DAG node
/// opcode, or nullptr for ARMISD::FIRST_NUMBER.
const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
// Expands to one switch case that returns the node's qualified name. The
// string is generated from the enumerator so the two can never disagree.
#define ARM_NODE_NAME(NODE)                                                    \
  case ARMISD::NODE:                                                           \
    return "ARMISD::" #NODE;

  switch ((ARMISD::NodeType)Opcode) {
  case ARMISD::FIRST_NUMBER:
    break;
    ARM_NODE_NAME(Wrapper)
    ARM_NODE_NAME(WrapperPIC)
    ARM_NODE_NAME(WrapperJT)
    ARM_NODE_NAME(COPY_STRUCT_BYVAL)
    ARM_NODE_NAME(CALL)
    ARM_NODE_NAME(CALL_PRED)
    ARM_NODE_NAME(CALL_NOLINK)
    ARM_NODE_NAME(BRCOND)
    ARM_NODE_NAME(BR_JT)
    ARM_NODE_NAME(BR2_JT)
    ARM_NODE_NAME(RET_FLAG)
    ARM_NODE_NAME(INTRET_FLAG)
    ARM_NODE_NAME(PIC_ADD)
    ARM_NODE_NAME(CMP)
    ARM_NODE_NAME(CMN)
    ARM_NODE_NAME(CMPZ)
    ARM_NODE_NAME(CMPFP)
    ARM_NODE_NAME(CMPFPw0)
    ARM_NODE_NAME(BCC_i64)
    ARM_NODE_NAME(FMSTAT)
    ARM_NODE_NAME(CMOV)
    ARM_NODE_NAME(SUBS)
    ARM_NODE_NAME(SSAT)
    ARM_NODE_NAME(USAT)
    ARM_NODE_NAME(ASRL)
    ARM_NODE_NAME(LSRL)
    ARM_NODE_NAME(LSLL)
    ARM_NODE_NAME(SRL_FLAG)
    ARM_NODE_NAME(SRA_FLAG)
    ARM_NODE_NAME(RRX)
    ARM_NODE_NAME(ADDC)
    ARM_NODE_NAME(ADDE)
    ARM_NODE_NAME(SUBC)
    ARM_NODE_NAME(SUBE)
    ARM_NODE_NAME(VMOVRRD)
    ARM_NODE_NAME(VMOVDRR)
    ARM_NODE_NAME(VMOVhr)
    ARM_NODE_NAME(VMOVrh)
    ARM_NODE_NAME(VMOVSR)
    ARM_NODE_NAME(EH_SJLJ_SETJMP)
    ARM_NODE_NAME(EH_SJLJ_LONGJMP)
    ARM_NODE_NAME(EH_SJLJ_SETUP_DISPATCH)
    ARM_NODE_NAME(TC_RETURN)
    ARM_NODE_NAME(THREAD_POINTER)
    ARM_NODE_NAME(DYN_ALLOC)
    ARM_NODE_NAME(MEMBARRIER_MCR)
    ARM_NODE_NAME(PRELOAD)
    ARM_NODE_NAME(WIN__CHKSTK)
    ARM_NODE_NAME(WIN__DBZCHK)
    ARM_NODE_NAME(VCEQ)
    ARM_NODE_NAME(VCEQZ)
    ARM_NODE_NAME(VCGE)
    ARM_NODE_NAME(VCGEZ)
    ARM_NODE_NAME(VCLEZ)
    ARM_NODE_NAME(VCGEU)
    ARM_NODE_NAME(VCGT)
    ARM_NODE_NAME(VCGTZ)
    ARM_NODE_NAME(VCLTZ)
    ARM_NODE_NAME(VCGTU)
    ARM_NODE_NAME(VTST)
    ARM_NODE_NAME(VSHLs)
    ARM_NODE_NAME(VSHLu)
    ARM_NODE_NAME(VSHLIMM)
    ARM_NODE_NAME(VSHRsIMM)
    ARM_NODE_NAME(VSHRuIMM)
    ARM_NODE_NAME(VRSHRsIMM)
    ARM_NODE_NAME(VRSHRuIMM)
    ARM_NODE_NAME(VRSHRNIMM)
    ARM_NODE_NAME(VQSHLsIMM)
    ARM_NODE_NAME(VQSHLuIMM)
    ARM_NODE_NAME(VQSHLsuIMM)
    ARM_NODE_NAME(VQSHRNsIMM)
    ARM_NODE_NAME(VQSHRNuIMM)
    ARM_NODE_NAME(VQSHRNsuIMM)
    ARM_NODE_NAME(VQRSHRNsIMM)
    ARM_NODE_NAME(VQRSHRNuIMM)
    ARM_NODE_NAME(VQRSHRNsuIMM)
    ARM_NODE_NAME(VSLIIMM)
    ARM_NODE_NAME(VSRIIMM)
    ARM_NODE_NAME(VGETLANEu)
    ARM_NODE_NAME(VGETLANEs)
    ARM_NODE_NAME(VMOVIMM)
    ARM_NODE_NAME(VMVNIMM)
    ARM_NODE_NAME(VMOVFPIMM)
    ARM_NODE_NAME(VDUP)
    ARM_NODE_NAME(VDUPLANE)
    ARM_NODE_NAME(VEXT)
    ARM_NODE_NAME(VREV64)
    ARM_NODE_NAME(VREV32)
    ARM_NODE_NAME(VREV16)
    ARM_NODE_NAME(VZIP)
    ARM_NODE_NAME(VUZP)
    ARM_NODE_NAME(VTRN)
    ARM_NODE_NAME(VTBL1)
    ARM_NODE_NAME(VTBL2)
    ARM_NODE_NAME(VMULLs)
    ARM_NODE_NAME(VMULLu)
    ARM_NODE_NAME(UMAAL)
    ARM_NODE_NAME(UMLAL)
    ARM_NODE_NAME(SMLAL)
    ARM_NODE_NAME(SMLALBB)
    ARM_NODE_NAME(SMLALBT)
    ARM_NODE_NAME(SMLALTB)
    ARM_NODE_NAME(SMLALTT)
    ARM_NODE_NAME(SMULWB)
    ARM_NODE_NAME(SMULWT)
    ARM_NODE_NAME(SMLALD)
    ARM_NODE_NAME(SMLALDX)
    ARM_NODE_NAME(SMLSLD)
    ARM_NODE_NAME(SMLSLDX)
    ARM_NODE_NAME(SMMLAR)
    ARM_NODE_NAME(SMMLSR)
    ARM_NODE_NAME(BUILD_VECTOR)
    ARM_NODE_NAME(BFI)
    ARM_NODE_NAME(VORRIMM)
    ARM_NODE_NAME(VBICIMM)
    ARM_NODE_NAME(VBSL)
    ARM_NODE_NAME(MEMCPY)
    ARM_NODE_NAME(VLD1DUP)
    ARM_NODE_NAME(VLD2DUP)
    ARM_NODE_NAME(VLD3DUP)
    ARM_NODE_NAME(VLD4DUP)
    ARM_NODE_NAME(VLD1_UPD)
    ARM_NODE_NAME(VLD2_UPD)
    ARM_NODE_NAME(VLD3_UPD)
    ARM_NODE_NAME(VLD4_UPD)
    ARM_NODE_NAME(VLD2LN_UPD)
    ARM_NODE_NAME(VLD3LN_UPD)
    ARM_NODE_NAME(VLD4LN_UPD)
    ARM_NODE_NAME(VLD1DUP_UPD)
    ARM_NODE_NAME(VLD2DUP_UPD)
    ARM_NODE_NAME(VLD3DUP_UPD)
    ARM_NODE_NAME(VLD4DUP_UPD)
    ARM_NODE_NAME(VST1_UPD)
    ARM_NODE_NAME(VST2_UPD)
    ARM_NODE_NAME(VST3_UPD)
    ARM_NODE_NAME(VST4_UPD)
    ARM_NODE_NAME(VST2LN_UPD)
    ARM_NODE_NAME(VST3LN_UPD)
    ARM_NODE_NAME(VST4LN_UPD)
    ARM_NODE_NAME(WLS)
  }
#undef ARM_NODE_NAME

  // Only reached for FIRST_NUMBER or a value outside the enumerators above.
  return nullptr;
}
/// Returns the value type produced by a SETCC on operands of type \p VT:
/// vectors yield the same vector with integer elements, everything else
/// yields the pointer type for \p DL.
EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
                                          EVT VT) const {
  if (VT.isVector())
    return VT.changeVectorElementTypeToInteger();
  return getPointerTy(DL);
}
/// getRegClassFor - Return the register class that should be used for the
/// specified value type. \p isDivergent is ignored.
const TargetRegisterClass *
ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
  (void)isDivergent;

  // Map v4i64 to QQ registers but do not make the type legal. Similarly map
  // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
  // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
  // MVE Q registers.
  const bool HasVectorUnit =
      Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps();
  if (HasVectorUnit) {
    switch (VT.SimpleTy) {
    case MVT::v4i64:
      return &ARM::QQPRRegClass;
    case MVT::v8i64:
      return &ARM::QQQQPRRegClass;
    default:
      break;
    }
  }

  // Everything else uses the generic mapping.
  return TargetLowering::getRegClassFor(VT);
}
// memcpy, and other memory intrinsics, typically try to use LDM/STM if the
// source/dest is aligned and the copy size is large enough. We therefore want
// to align such objects passed to memory intrinsics.
//
// Returns true only when CI is a MemIntrinsic; in that case MinSize and
// PrefAlign are filled in. Otherwise both are left untouched.
bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
                                               unsigned &PrefAlign) const {
  if (isa<MemIntrinsic>(CI)) {
    MinSize = 8;
    // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
    // cycle faster than 4-byte aligned LDM.
    const bool PrefersWideAlign =
        Subtarget->hasV6Ops() && !Subtarget->isMClass();
    if (PrefersWideAlign)
      PrefAlign = 8;
    else
      PrefAlign = 4;
    return true;
  }
  return false;
}
// Create a fast isel object by delegating to ARM::createFastISel.
FastISel *
ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                  const TargetLibraryInfo *libInfo) const {
  FastISel *const Selector = ARM::createFastISel(funcInfo, libInfo);
  return Selector;
}
/// Chooses between register-pressure and ILP scheduling for node \p N.
Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
  const unsigned ResultCount = N->getNumValues();
  if (ResultCount == 0)
    return Sched::RegPressure;

  // A floating-point or vector result selects ILP. Glue and chain (Other)
  // results are not considered.
  for (unsigned Idx = 0; Idx < ResultCount; ++Idx) {
    const EVT ResultVT = N->getValueType(Idx);
    const bool IsGlueOrChain =
        ResultVT == MVT::Glue || ResultVT == MVT::Other;
    if (!IsGlueOrChain && (ResultVT.isFloatingPoint() || ResultVT.isVector()))
      return Sched::ILP;
  }

  if (!N->isMachineOpcode())
    return Sched::RegPressure;

  // Loads are scheduled for latency even if their instruction itinerary
  // is not available.
  const MCInstrDesc &Desc =
      Subtarget->getInstrInfo()->get(N->getMachineOpcode());
  if (Desc.getNumDefs() == 0)
    return Sched::RegPressure;

  // With itineraries available, a first operand cycle above 2 selects ILP.
  const bool SlowFirstDef =
      !Itins->isEmpty() &&
      Itins->getOperandCycle(Desc.getSchedClass(), 0) > 2;
  return SlowFirstDef ? Sched::ILP : Sched::RegPressure;
}
//===----------------------------------------------------------------------===//
// Lowering Code
//===----------------------------------------------------------------------===//
/// True when \p Op is an ISD::SRL whose shift amount is the constant 16.
static bool isSRL16(const SDValue &Op) {
  if (Op.getOpcode() == ISD::SRL) {
    const auto *ShiftAmt = dyn_cast<ConstantSDNode>(Op.getOperand(1));
    return ShiftAmt && ShiftAmt->getZExtValue() == 16;
  }
  return false;
}
/// True when \p Op is an ISD::SRA whose shift amount is the constant 16.
static bool isSRA16(const SDValue &Op) {
  if (Op.getOpcode() == ISD::SRA) {
    const auto *ShiftAmt = dyn_cast<ConstantSDNode>(Op.getOperand(1));
    return ShiftAmt && ShiftAmt->getZExtValue() == 16;
  }
  return false;
}
/// True when \p Op is an ISD::SHL whose shift amount is the constant 16.
static bool isSHL16(const SDValue &Op) {
  if (Op.getOpcode() == ISD::SHL) {
    const auto *ShiftAmt = dyn_cast<ConstantSDNode>(Op.getOperand(1));
    return ShiftAmt && ShiftAmt->getZExtValue() == 16;
  }
  return false;
}
// Check for a signed 16-bit value. SRA is special-cased because that keeps
// things simpler when also looking for SRAs that aren't sign extending a
// smaller value. Without the check, we'd need to take extra care with the
// checking order for some operations.
//
// An SRA-by-16 qualifies only if its input is a SHL-by-16; any other node
// qualifies when ComputeNumSignBits reports exactly 17 sign bits.
static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
  return isSRA16(Op) ? isSHL16(Op.getOperand(0))
                     : DAG.ComputeNumSignBits(Op) == 17;
}
/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC.
/// Any condition code not listed below is treated as unreachable.
static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
  switch (CC) {
  // Equality.
  case ISD::SETEQ:
    return ARMCC::EQ;
  case ISD::SETNE:
    return ARMCC::NE;

  // Signed orderings.
  case ISD::SETGT:
    return ARMCC::GT;
  case ISD::SETGE:
    return ARMCC::GE;
  case ISD::SETLT:
    return ARMCC::LT;
  case ISD::SETLE:
    return ARMCC::LE;

  // Unsigned orderings.
  case ISD::SETUGT:
    return ARMCC::HI;
  case ISD::SETUGE:
    return ARMCC::HS;
  case ISD::SETULT:
    return ARMCC::LO;
  case ISD::SETULE:
    return ARMCC::LS;

  default:
    break;
  }
  llvm_unreachable("Unknown condition code!");
}
/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
///
/// \p CondCode receives the primary ARM condition. \p CondCode2 receives a
/// second condition for the codes that need two (SETONE, SETUEQ) and is
/// ARMCC::AL otherwise. \p InvalidOnQNaN is cleared for the equality-style
/// codes (SETEQ/SETOEQ, SETONE, SETUEQ, SETNE/SETUNE) and set for the rest.
static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
                        ARMCC::CondCodes &CondCode2, bool &InvalidOnQNaN) {
  // Defaults; individual cases override them as needed.
  InvalidOnQNaN = true;
  CondCode2 = ARMCC::AL;

  switch (CC) {
  // Equality-style comparisons.
  case ISD::SETEQ:
  case ISD::SETOEQ:
    InvalidOnQNaN = false;
    CondCode = ARMCC::EQ;
    break;
  case ISD::SETNE:
  case ISD::SETUNE:
    InvalidOnQNaN = false;
    CondCode = ARMCC::NE;
    break;

  // Comparisons that need two ARM conditions.
  case ISD::SETONE:
    InvalidOnQNaN = false;
    CondCode = ARMCC::MI;
    CondCode2 = ARMCC::GT;
    break;
  case ISD::SETUEQ:
    InvalidOnQNaN = false;
    CondCode = ARMCC::EQ;
    CondCode2 = ARMCC::VS;
    break;

  // Greater-than family.
  case ISD::SETGT:
  case ISD::SETOGT:
    CondCode = ARMCC::GT;
    break;
  case ISD::SETGE:
  case ISD::SETOGE:
    CondCode = ARMCC::GE;
    break;
  case ISD::SETUGT:
    CondCode = ARMCC::HI;
    break;
  case ISD::SETUGE:
    CondCode = ARMCC::PL;
    break;

  // Less-than family.
  case ISD::SETOLT:
    CondCode = ARMCC::MI;
    break;
  case ISD::SETOLE:
    CondCode = ARMCC::LS;
    break;
  case ISD::SETLT:
  case ISD::SETULT:
    CondCode = ARMCC::LT;
    break;
  case ISD::SETLE:
  case ISD::SETULE:
    CondCode = ARMCC::LE;
    break;

  // Ordered / unordered tests.
  case ISD::SETO:
    CondCode = ARMCC::VC;
    break;
  case ISD::SETUO:
    CondCode = ARMCC::VS;
    break;

  default:
    llvm_unreachable("Unknown FP condition!");
  }
}
//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//
/// getEffectiveCallingConv - Get the effective calling convention, taking into
/// account presence of floating point hardware and calling convention
/// limitations, such as support for variadic functions.
///
/// Reports a fatal error for any convention not listed in the switch.
CallingConv::ID
ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
                                           bool isVarArg) const {
  // Condition shared by the C and fast conventions: the subtarget has VFP2,
  // is not Thumb1-only, and the call is not variadic. Evaluated lazily.
  const auto VFPConvUsable = [this, isVarArg] {
    return Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() &&
           !isVarArg;
  };

  switch (CC) {
  // These conventions are used exactly as requested.
  case CallingConv::ARM_AAPCS:
  case CallingConv::ARM_APCS:
  case CallingConv::GHC:
  case CallingConv::PreserveMost:
    return CC;

  // The VFP variant falls back to plain AAPCS for variadic calls.
  case CallingConv::ARM_AAPCS_VFP:
  case CallingConv::Swift:
    if (isVarArg)
      return CallingConv::ARM_AAPCS;
    return CallingConv::ARM_AAPCS_VFP;

  case CallingConv::C: {
    if (!Subtarget->isAAPCS_ABI())
      return CallingConv::ARM_APCS;
    const bool HardFloatABI =
        getTargetMachine().Options.FloatABIType == FloatABI::Hard;
    if (HardFloatABI && VFPConvUsable())
      return CallingConv::ARM_AAPCS_VFP;
    return CallingConv::ARM_AAPCS;
  }

  case CallingConv::Fast:
  case CallingConv::CXX_FAST_TLS: {
    const bool UseVFP = VFPConvUsable();
    if (!Subtarget->isAAPCS_ABI())
      return UseVFP ? CallingConv::Fast : CallingConv::ARM_APCS;
    return UseVFP ? CallingConv::ARM_AAPCS_VFP : CallingConv::ARM_AAPCS;
  }

  default:
    report_fatal_error("Unsupported calling convention");
  }
}
/// Selects the assignment function used for the arguments of a call.
CCAssignFn *ARMTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                 bool isVarArg) const {
  CCAssignFn *const ArgAssigner =
      CCAssignFnForNode(CC, /*Return=*/false, isVarArg);
  return ArgAssigner;
}
/// Selects the assignment function used for return values.
CCAssignFn *ARMTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                   bool isVarArg) const {
  CCAssignFn *const RetAssigner =
      CCAssignFnForNode(CC, /*Return=*/true, isVarArg);
  return RetAssigner;
}
/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
/// CallingConvention.
///
/// The convention is first normalised with getEffectiveCallingConv. \p Return
/// picks the return-value assigner over the argument assigner.
CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
                                                 bool Return,
                                                 bool isVarArg) const {
  const CallingConv::ID EffectiveCC = getEffectiveCallingConv(CC, isVarArg);

  switch (EffectiveCC) {
  case CallingConv::ARM_APCS:
    if (Return)
      return RetCC_ARM_APCS;
    return CC_ARM_APCS;

  // PreserveMost uses the same assigners as plain AAPCS.
  case CallingConv::ARM_AAPCS:
  case CallingConv::PreserveMost:
    if (Return)
      return RetCC_ARM_AAPCS;
    return CC_ARM_AAPCS;

  case CallingConv::ARM_AAPCS_VFP:
    if (Return)
      return RetCC_ARM_AAPCS_VFP;
    return CC_ARM_AAPCS_VFP;

  case CallingConv::Fast:
    if (Return)
      return RetFastCC_ARM_APCS;
    return FastCC_ARM_APCS;

  // GHC has its own argument assigner but returns values the APCS way.
  case CallingConv::GHC:
    if (Return)
      return RetCC_ARM_APCS;
    return CC_ARM_APCS_GHC;

  default:
    report_fatal_error("Unsupported calling convention");
  }
}
/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
/// One SDValue per returned value is appended to \p InVals. \p Chain and
/// \p InFlag are threaded through every register copy so the copies stay
/// ordered; the final chain is returned.
///
/// \param Ins          Descriptions of the values returned by the call.
/// \param isThisReturn When true, the first result is not copied from a
///                     register; \p ThisVal is pushed in its place.
/// \param ThisVal      Value to use for the first result when
///                     \p isThisReturn is set.
/// \returns The chain after the last register copy.
SDValue ARMTargetLowering::LowerCallResult(
SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
SDValue ThisVal) const {
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
// Copy all of the result registers out of their specified physreg.
// Note: the custom (f64 / v2f64) path below advances i itself, so one
// iteration may consume two or four consecutive locations.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
// VA is a copy; it is reassigned whenever the loop skips ahead.
CCValAssign VA = RVLocs[i];
// Pass 'this' value directly from the argument to return value, to avoid
// reg unit interference
if (i == 0 && isThisReturn) {
assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
"unexpected return calling convention register assignment");
InVals.push_back(ThisVal);
continue;
}
SDValue Val;
if (VA.needsCustom()) {
// Handle f64 or half of a v2f64.
// The value arrives as two i32 register copies from this location and
// the next one.
SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
InFlag);
Chain = Lo.getValue(1);
InFlag = Lo.getValue(2);
VA = RVLocs[++i]; // skip ahead to next loc
SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
InFlag);
Chain = Hi.getValue(1);
InFlag = Hi.getValue(2);
// On big-endian subtargets the two halves are exchanged before they are
// combined.
if (!Subtarget->isLittle())
std::swap (Lo, Hi);
Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
// VA now refers to the second location of the pair. For a v2f64, the f64
// just built becomes element 0 and a second register pair is read for
// element 1.
if (VA.getLocVT() == MVT::v2f64) {
SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
DAG.getConstant(0, dl, MVT::i32));
VA = RVLocs[++i]; // skip ahead to next loc
Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
Chain = Lo.getValue(1);
InFlag = Lo.getValue(2);
VA = RVLocs[++i]; // skip ahead to next loc
Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
Chain = Hi.getValue(1);
InFlag = Hi.getValue(2);
if (!Subtarget->isLittle())
std::swap (Lo, Hi);
Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
DAG.getConstant(1, dl, MVT::i32));
}
} else {
// Ordinary case: a single copy out of the assigned register, in the
// location's own type.
Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
InFlag);
Chain = Val.getValue(1);
InFlag = Val.getValue(2);
}
// Convert from the location type back to the value type if required. Only
// Full (no conversion) and BCvt (bitcast) are handled here.
switch (VA.getLocInfo()) {
default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::BCvt:
Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
break;
}
InVals.push_back(Val);
}
return Chain;
}
/// LowerMemOpCallTo - Store the argument to the stack.
///
/// The store address is \p StackPtr plus the memory offset recorded in
/// \p VA. \p Flags is accepted but not consulted here.
/// \returns The store node.
SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
                                            SDValue Arg, const SDLoc &dl,
                                            SelectionDAG &DAG,
                                            const CCValAssign &VA,
                                            ISD::ArgFlagsTy Flags) const {
  const unsigned SlotOffset = VA.getLocMemOffset();

  // Address of the outgoing argument slot.
  const SDValue SlotAddr =
      DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr,
                  DAG.getIntPtrConstant(SlotOffset, dl));

  const MachinePointerInfo SlotInfo =
      MachinePointerInfo::getStack(DAG.getMachineFunction(), SlotOffset);
  return DAG.getStore(Chain, dl, Arg, SlotAddr, SlotInfo);
}
void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
SDValue Chain, SDValue &Arg,
RegsToPassVector &RegsToPass,
CCValAssign &VA, CCValAssign &NextVA,
SDValue &StackPtr,
SmallVectorImpl<SDValue> &MemOpChains,
ISD::ArgFlagsTy Flags) const {
SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
DAG.getVTList(MVT::i32, MVT::i32), Arg);
unsigned id = Subtarget->isLittle() ? 0 : 1;
RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
if (NextVA.isRegLoc())
RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
else {
assert(NextVA.isMemLoc());
if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
getPointerTy(DAG.getDataLayout()));
MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id),
dl, DAG, NextVA,
Flags));
}
}
/// LowerCall - Lower a call into a callseq_start <-
/// ARMISD::CALL <- callseq_end chain. Also add input and output parameter
/// nodes.
SDValue
ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
SDLoc &dl = CLI.DL;
SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
bool &isTailCall = CLI.IsTailCall;
CallingConv::ID CallConv = CLI.CallConv;
bool doesNotRet = CLI.DoesNotReturn;
bool isVarArg = CLI.IsVarArg;
MachineFunction &MF = DAG.getMachineFunction();
bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
bool isThisReturn = false;
auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
bool PreferIndirect = false;
// Disable tail calls if they're not supported.
if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true")
isTailCall = false;
if (isa<GlobalAddressSDNode>(Callee)) {
// If we're optimizing for minimum size and the function is called three or
// more times in this block, we can improve codesize by calling indirectly
// as BLXr has a 16-bit encoding.
auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
if (CLI.CS) {
auto *BB = CLI.CS.getParent();
PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
count_if(GV->users(), [&BB](const User *U) {
return isa<Instruction>(U) &&
cast<Instruction>(U)->getParent() == BB;
}) > 2;
}
}
if (isTailCall) {
// Check if it's really possible to do a tail call.
isTailCall = IsEligibleForTailCallOptimization(
Callee, CallConv, isVarArg, isStructRet,
MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG,
PreferIndirect);
if (!isTailCall && CLI.CS && CLI.CS.isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
// We don't support GuaranteedTailCallOpt for ARM, only automatically
// detected sibcalls.
if (isTailCall)
++NumTailCalls;
}
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
if (isTailCall) {
// For tail calls, memory operands are available in our caller's stack.
NumBytes = 0;
} else {
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
}
SDValue StackPtr =
DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
RegsToPassVector RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
// Walk the register/memloc assignments, inserting copies/loads. In the case
// of tail call optimization, arguments are handled later.
for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
i != e;
++i, ++realArgIdx) {
CCValAssign &VA = ArgLocs[i];
SDValue Arg = OutVals[realArgIdx];
ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
bool isByVal = Flags.isByVal();
// Promote the value if needed.
switch (VA.getLocInfo()) {
default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
break;
case CCValAssign::ZExt:
Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
break;
case CCValAssign::AExt:
Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
break;
case CCValAssign::BCvt:
Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
break;
}
// f64 and v2f64 might be passed in i32 pairs and must be split into pieces
if (VA.needsCustom()) {
if (VA.getLocVT() == MVT::v2f64) {
SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
DAG.getConstant(0, dl, MVT::i32));
SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
DAG.getConstant(1, dl, MVT::i32));
PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass,
VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
VA = ArgLocs[++i]; // skip ahead to next loc
if (VA.isRegLoc()) {
PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass,
VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
} else {
assert(VA.isMemLoc());
MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1,
dl, DAG, VA, Flags));
}
} else {
PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
StackPtr, MemOpChains, Flags);
}
} else if (VA.isRegLoc()) {
if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
Outs[0].VT == MVT::i32) {
assert(VA.getLocVT() == MVT::i32 &&
"unexpected calling convention register assignment");
assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
"unexpected use of 'returned'");
isThisReturn = true;
}
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
} else if (isByVal) {
assert(VA.isMemLoc());
unsigned offset = 0;
// True if this byval aggregate will be split between registers
// and memory.
unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
if (CurByValIdx < ByValArgsCount) {
unsigned RegBegin, RegEnd;
CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
EVT PtrVT =
DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
unsigned int i, j;
for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
MachinePointerInfo(),
DAG.InferPtrAlignment(AddArg));
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(j, Load));
}
// If the parameter does not fit entirely in the register area, the
// "offset" value lets us compute the stack slot for the remaining part.
offset = RegEnd - RegBegin;
CCInfo.nextInRegsParam();
}
if (Flags.getByValSize() > 4*offset) {
auto PtrVT = getPointerTy(DAG.getDataLayout());
unsigned LocMemOffset = VA.getLocMemOffset();
SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff);
SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
MVT::i32);
SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl,
MVT::i32);
SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
Ops));
}
} else if (!isTailCall) {
assert(VA.isMemLoc());
MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
dl, DAG, VA, Flags));
}
}
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into the appropriate regs.
SDValue InFlag;
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
RegsToPass[i].second, InFlag);
InFlag = Chain.getValue(1);
}
// If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
// direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
// node so that legalize doesn't hack it.
bool isDirect = false;
const TargetMachine &TM = getTargetMachine();
const Module *Mod = MF.getFunction().getParent();
const GlobalValue *GV = nullptr;
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
GV = G->getGlobal();
bool isStub =
!TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO();
bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
bool isLocalARMFunc = false;
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
auto PtrVt = getPointerTy(DAG.getDataLayout());
if (Subtarget->genLongCalls()) {
assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
"long-calls codegen is not position independent!");
// Handle a global address or an external symbol. If it's not one of
// those, the target's already in a register, so we don't need to do
// anything extra.
if (isa<GlobalAddressSDNode>(Callee)) {
// Create a constant pool entry for the callee address
unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
ARMConstantPoolValue *CPV =
ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0);
// Get the address of the callee into a register
SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
Callee = DAG.getLoad(
PtrVt, dl, DAG.getEntryNode(), CPAddr,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
} else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
const char *Sym = S->getSymbol();
// Create a constant pool entry for the callee address
unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
ARMConstantPoolValue *CPV =
ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
ARMPCLabelIndex, 0);
// Get the address of the callee into a register
SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
Callee = DAG.getLoad(
PtrVt, dl, DAG.getEntryNode(), CPAddr,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
}
} else if (isa<GlobalAddressSDNode>(Callee)) {
if (!PreferIndirect) {
isDirect = true;
bool isDef = GV->isStrongDefinitionForLinker();
// ARM call to a local ARM function is predicable.
isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
// tBX takes a register source operand.
if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
Callee = DAG.getNode(
ARMISD::WrapperPIC, dl, PtrVt,
DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY));
Callee = DAG.getLoad(
PtrVt, dl, DAG.getEntryNode(), Callee,
MachinePointerInfo::getGOT(DAG.getMachineFunction()),
/* Alignment = */ 0, MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant);
} else if (Subtarget->isTargetCOFF()) {
assert(Subtarget->isTargetWindows() &&
"Windows is the only supported COFF target");
unsigned TargetFlags = GV->hasDLLImportStorageClass()
? ARMII::MO_DLLIMPORT
: ARMII::MO_NO_FLAG;
Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*offset=*/0,
TargetFlags);
if (GV->hasDLLImportStorageClass())
Callee =
DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
MachinePointerInfo::getGOT(DAG.getMachineFunction()));
} else {
Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0);
}
}
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
isDirect = true;
// tBX takes a register source operand.
const char *Sym = S->getSymbol();
if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
ARMConstantPoolValue *CPV =
ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
ARMPCLabelIndex, 4);
SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
Callee = DAG.getLoad(
PtrVt, dl, DAG.getEntryNode(), CPAddr,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
} else {
Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
}
}
// FIXME: handle tail calls differently.
unsigned CallOpc;
if (Subtarget->isThumb()) {
if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
CallOpc = ARMISD::CALL_NOLINK;
else
CallOpc = ARMISD::CALL;
} else {
if (!isDirect && !Subtarget->hasV5TOps())
CallOpc = ARMISD::CALL_NOLINK;
else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
// Emit regular call when code size is the priority
!Subtarget->hasMinSize())
// "mov lr, pc; b _foo" to avoid confusing the RSP
CallOpc = ARMISD::CALL_NOLINK;
else
CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
}
std::vector<SDValue> Ops;
Ops.push_back(Chain);
Ops.push_back(Callee);
// Add argument registers to the end of the list so that they are known live
// into the call.
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
Ops.push_back(DAG.getRegister(RegsToPass[i].first,
RegsToPass[i].second.getValueType()));
// Add a register mask operand representing the call-preserved registers.
if (!isTailCall) {
const uint32_t *Mask;
const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
if (isThisReturn) {
// For 'this' returns, use the R0-preserving mask if applicable
Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
if (!Mask) {
// Set isThisReturn to false if the calling convention is not one that
// allows 'returned' to be modeled in this way, so LowerCallResult does
// not try to pass 'this' straight through
isThisReturn = false;
Mask = ARI->getCallPreservedMask(MF, CallConv);
}
} else
Mask = ARI->getCallPreservedMask(MF, CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
}
if (InFlag.getNode())
Ops.push_back(InFlag);
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
if (isTailCall) {
MF.getFrameInfo().setHasTailCall();
return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
}
// Returns a chain and a flag for retval copy to use.
Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
InFlag = Chain.getValue(1);
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
if (!Ins.empty())
InFlag = Chain.getValue(1);
// Handle result values, copying them out of physregs into vregs that we
// return.
return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
InVals, isThisReturn,
isThisReturn ? OutVals[0] : SDValue());
}
/// HandleByVal - Assign argument registers to a byval parameter.
///
/// Claims the next free GPR argument register (skipping registers as needed
/// to honour \p Align), records the register range that carries the leading
/// part of the aggregate, and shrinks \p Size to the number of bytes that
/// remain in memory. When stack space has already been handed out and the
/// aggregate does not fit in the registers that are left, no range is
/// recorded; the remaining argument registers are consumed instead and
/// \p Size is left untouched.
void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
                                    unsigned Align) const {
  // Byval slots, like every other stack slot, are at least 4-byte aligned.
  Align = std::max(Align, 4U);

  unsigned Reg = State->AllocateReg(GPRArgRegs);
  if (Reg == 0)
    return;

  // Burn registers until the first one used satisfies the alignment, counted
  // in registers relative to R4.
  const unsigned AlignInRegs = Align / 4;
  for (unsigned Skip = (ARM::R4 - Reg) % AlignInRegs; Skip != 0; --Skip)
    Reg = State->AllocateReg(GPRArgRegs);
  if (Reg == 0)
    return;

  // Number of bytes that fit in the registers from Reg up to (not including)
  // R4.
  const unsigned Excess = 4 * (ARM::R4 - Reg);

  // Special case: NSAA != SP and the parameter is larger than all remaining
  // GPRs. The parameter cannot be split then; it goes to the stack as a
  // whole, and NCRN must become R4, so every remaining register is wasted.
  if (State->getNextStackOffset() != 0 && Size > Excess) {
    while (State->AllocateReg(GPRArgRegs) != 0) {
    }
    return;
  }

  // The byval range starts at the register obtained above. If the parameter
  // fits in [Reg, R4) the range ends right after its last register; otherwise
  // the parameter is split between registers and stack and the range ends at
  // R4.
  const unsigned RangeBegin = Reg;
  const unsigned RangeEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
  State->addInRegsParamInfo(RangeBegin, RangeEnd);

  // Reg itself is already allocated; claim the rest of the range.
  for (unsigned Next = Reg + 1; Next != RangeEnd; ++Next)
    State->AllocateReg(GPRArgRegs);

  // Only the part that did not land in registers stays in memory; a
  // parameter that fits entirely in registers has a memory size of zero.
  Size = std::max<int>(Size - Excess, 0);
}
/// MatchingStackOffset - Return true when the outgoing stack argument \p Arg
/// is already available in a fixed stack object of the caller whose offset
/// equals \p Offset and whose size equals the argument's size in bytes.
static bool MatchingStackOffset(SDValue Arg, unsigned Offset,
                                ISD::ArgFlagsTy Flags, MachineFrameInfo &MFI,
                                const MachineRegisterInfo *MRI,
                                const TargetInstrInfo *TII) {
  // Sentinel meaning "no frame index found yet".
  int FI = std::numeric_limits<int>::max();

  if (Arg.getOpcode() == ISD::CopyFromReg) {
    // The value comes from a register: it must be a virtual register whose
    // definition is a load from a stack slot, and it must not be byval.
    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
    if (!TargetRegisterInfo::isVirtualRegister(VR))
      return false;
    MachineInstr *Def = MRI->getVRegDef(VR);
    if (!Def || Flags.isByVal())
      return false;
    if (!TII->isLoadFromStackSlot(*Def, FI))
      return false;
  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
    // ByVal argument is passed in as a pointer but it's now being
    // dereferenced. e.g.
    // define @foo(%struct.X* %A) {
    //   tail call @bar(%struct.X* byval %A)
    // }
    if (Flags.isByVal())
      return false;
    // The load has to address a frame index directly.
    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ld->getBasePtr());
    if (!FINode)
      return false;
    FI = FINode->getIndex();
  } else {
    return false;
  }

  assert(FI != std::numeric_limits<int>::max());
  if (!MFI.isFixedObjectIndex(FI))
    return false;

  const unsigned Bytes = Arg.getValueSizeInBits() / 8;
  return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI);
}
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
///
/// The function is a sequence of independent rejections: it returns false as
/// soon as one check fails and true only when every check passes. Outs and
/// OutVals describe the outgoing arguments, Ins the values the call returns.
bool ARMTargetLowering::IsEligibleForTailCallOptimization(
SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
bool isCalleeStructRet, bool isCallerStructRet,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG,
const bool isIndirect) const {
MachineFunction &MF = DAG.getMachineFunction();
const Function &CallerF = MF.getFunction();
CallingConv::ID CallerCC = CallerF.getCallingConv();
// Callers are expected to have checked subtarget support already.
assert(Subtarget->supportsTailCall());
// Indirect tail calls cannot be optimized for Thumb1 if the args
// to the call take up r0-r3. The reason is that there are no legal registers
// left to hold the pointer to the function to be called.
if (Subtarget->isThumb1Only() && Outs.size() >= 4 &&
(!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect))
return false;
// Look for obvious safe cases to perform tail call optimization that do not
// require ABI changes. This is what gcc calls sibcall.
// Exception-handling functions need a special set of instructions to indicate
// a return to the hardware. Tail-calling another function would probably
// break this.
if (CallerF.hasFnAttribute("interrupt"))
return false;
// Also avoid sibcall optimization if either caller or callee uses struct
// return semantics.
if (isCalleeStructRet || isCallerStructRet)
return false;
// Externally-defined functions with weak linkage should not be
// tail-called on ARM when the OS does not support dynamic
// pre-emption of symbols, as the AAELF spec requires normal calls
// to undefined weak functions to be replaced with a NOP or jump to the
// next instruction. The behaviour of branch instructions in this
// situation (as used for tail calls) is implementation-defined, so we
// cannot rely on the linker replacing the tail call with a return.
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
const GlobalValue *GV = G->getGlobal();
const Triple &TT = getTargetMachine().getTargetTriple();
if (GV->hasExternalWeakLinkage() &&
(!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
return false;
}
// Check that the call results are passed in the same way.
LLVMContext &C = *DAG.getContext();
if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
CCAssignFnForReturn(CalleeCC, isVarArg),
CCAssignFnForReturn(CallerCC, isVarArg)))
return false;
// The callee has to preserve all registers the caller needs to preserve.
// The masks only need comparing when the two calling conventions differ.
const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
if (CalleeCC != CallerCC) {
const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
return false;
}
// If Caller's vararg or byval argument has been split between registers and
// stack, do not perform tail call, since part of the argument is in caller's
// local frame.
const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
if (AFI_Caller->getArgRegsSaveSize())
return false;
// If the callee takes no arguments then go on to check the results of the
// call.
if (!Outs.empty()) {
// Check if stack adjustment is needed. For now, do not do this if any
// argument is passed on the stack.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
if (CCInfo.getNextStackOffset()) {
// Check if the arguments are already laid out in the right way as
// the caller's fixed stack objects.
MachineFrameInfo &MFI = MF.getFrameInfo();
const MachineRegisterInfo *MRI = &MF.getRegInfo();
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
// i indexes ArgLocs and is additionally advanced inside the body for
// locations that need custom handling; realArgIdx indexes Outs/OutVals
// and advances exactly once per iteration.
for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
i != e;
++i, ++realArgIdx) {
CCValAssign &VA = ArgLocs[i];
EVT RegVT = VA.getLocVT();
SDValue Arg = OutVals[realArgIdx];
ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
if (VA.getLocInfo() == CCValAssign::Indirect)
return false;
if (VA.needsCustom()) {
// f64 and vector types are split into multiple registers or
// register/stack-slot combinations. The types will not match
// the registers; give up on memory f64 refs until we figure
// out what to do about this.
if (!VA.isRegLoc())
return false;
if (!ArgLocs[++i].isRegLoc())
return false;
// A v2f64 occupies two further locations beyond the two checked above.
if (RegVT == MVT::v2f64) {
if (!ArgLocs[++i].isRegLoc())
return false;
if (!ArgLocs[++i].isRegLoc())
return false;
}
} else if (!VA.isRegLoc()) {
// A plain memory argument must already sit at the matching offset in
// one of the caller's fixed stack objects.
if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
MFI, MRI, TII))
return false;
}
}
}
// Checked for every call with arguments, whether or not any of them is
// passed on the stack.
const MachineRegisterInfo &MRI = MF.getRegInfo();
if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
return false;
}
return true;
}
/// CanLowerReturn - Run the return-value calling-convention assignment for
/// \p Outs on a scratch CCState and report whether CCState::CheckReturn
/// accepts it.
bool ARMTargetLowering::CanLowerReturn(
    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
  // Scratch storage for the locations CheckReturn assigns.
  SmallVector<CCValAssign, 16> RetLocs;
  CCState RetCCState(CallConv, isVarArg, MF, RetLocs, Context);
  const bool Accepted =
      RetCCState.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
  return Accepted;
}
/// LowerInterruptReturn - Build the ARMISD::INTRET_FLAG node for a function
/// carrying the "interrupt" attribute. The LR offset chosen from the
/// attribute's value is inserted into \p RetOps at index 1. Any value other
/// than "", IRQ, FIQ, ABORT, SWI or UNDEF is a fatal error.
static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
                                    const SDLoc &DL, SelectionDAG &DAG) {
  StringRef IntKind = DAG.getMachineFunction()
                          .getFunction()
                          .getFnAttribute("interrupt")
                          .getValueAsString();

  // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
  // version of the "preferred return address". These offsets affect the return
  // instruction if this is a return from PL1 without hypervisor extensions.
  //    IRQ/FIQ: +4     "subs pc, lr, #4"
  //    SWI:     0      "subs pc, lr, #0"
  //    ABORT:   +4     "subs pc, lr, #4"
  //    UNDEF:   +4/+2  "subs pc, lr, #0"
  // UNDEF varies depending on where the exception came from ARM or Thumb
  // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
  const bool OffsetByFour = IntKind.empty() || IntKind == "IRQ" ||
                            IntKind == "FIQ" || IntKind == "ABORT";
  const bool OffsetByZero = IntKind == "SWI" || IntKind == "UNDEF";
  if (!OffsetByFour && !OffsetByZero)
    report_fatal_error("Unsupported interrupt attribute. If present, value "
                       "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");

  const int64_t LROffset = OffsetByFour ? 4 : 0;
  RetOps.insert(RetOps.begin() + 1,
                DAG.getConstant(LROffset, DL, MVT::i32, false));
  return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps);
}
/// LowerReturn - Lower an outgoing return. Each return value is assigned a
/// register by the return calling convention and copied there with a
/// CopyToReg node; the copies are linked through their glue results. The
/// function returns an ARMISD::RET_FLAG node, or the node built by
/// LowerInterruptReturn for "interrupt" functions on non-M-class CPUs.
SDValue
ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &dl, SelectionDAG &DAG) const {
// CCValAssign - represent the assignment of the return value to a location.
SmallVector<CCValAssign, 16> RVLocs;
// CCState - Info about the registers and stack slots.
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
// Analyze outgoing return values.
CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
SDValue Flag;
SmallVector<SDValue, 4> RetOps;
RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
bool isLittleEndian = Subtarget->isLittle();
MachineFunction &MF = DAG.getMachineFunction();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
// Record how many return locations were assigned.
AFI->setReturnRegsCount(RVLocs.size());
// Copy the result values into the output registers.
// i indexes RVLocs and is additionally advanced inside the body for values
// split across several locations; realRVLocIdx indexes OutVals and advances
// exactly once per iteration.
for (unsigned i = 0, realRVLocIdx = 0;
i != RVLocs.size();
++i, ++realRVLocIdx) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
SDValue Arg = OutVals[realRVLocIdx];
bool ReturnF16 = false;
if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) {
// Half-precision return values can be returned like this:
//
// t11 f16 = fadd ...
// t12: i16 = bitcast t11
// t13: i32 = zero_extend t12
// t14: f32 = bitcast t13 <~~~~~~~ Arg
//
// to avoid code generation for bitcasts, we simply set Arg to the node
// that produces the f16 value, t11 in this case.
//
if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
SDValue ZE = Arg.getOperand(0);
if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
SDValue BC = ZE.getOperand(0);
if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
Arg = BC.getOperand(0);
ReturnF16 = true;
}
}
}
}
switch (VA.getLocInfo()) {
default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::BCvt:
// The f16 shortcut above already bypassed the bitcasts, so no new one is
// added in that case.
if (!ReturnF16)
Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
break;
}
if (VA.needsCustom()) {
// NOTE(review): VA is a reference to RVLocs[i], so each
// "VA = RVLocs[++i]" below copy-assigns the next location over the element
// VA was bound to rather than rebinding VA. Later reads of VA see the new
// location's values, which is what the code relies on.
if (VA.getLocVT() == MVT::v2f64) {
// Extract the first half and return it in two registers.
SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
DAG.getConstant(0, dl, MVT::i32));
SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
DAG.getVTList(MVT::i32, MVT::i32), Half);
Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
HalfGPRs.getValue(isLittleEndian ? 0 : 1),
Flag);
Flag = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
VA = RVLocs[++i]; // skip ahead to next loc
Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
HalfGPRs.getValue(isLittleEndian ? 1 : 0),
Flag);
Flag = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
VA = RVLocs[++i]; // skip ahead to next loc
// Extract the 2nd half and fall through to handle it as an f64 value.
Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
DAG.getConstant(1, dl, MVT::i32));
}
// Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
// available.
SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
DAG.getVTList(MVT::i32, MVT::i32), Arg);
// Which VMOVRRD result goes into the first register depends on endianness.
Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
fmrrd.getValue(isLittleEndian ? 0 : 1),
Flag);
Flag = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
VA = RVLocs[++i]; // skip ahead to next loc
// The glue update and register operand for this last copy are added by the
// common code after the if/else.
Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
fmrrd.getValue(isLittleEndian ? 1 : 0),
Flag);
} else
Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
// Guarantee that all emitted copies are
// stuck together, avoiding something bad.
Flag = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(VA.getLocReg(),
ReturnF16 ? MVT::f16 : VA.getLocVT()));
}
// Append the callee-saved registers that are handled via copy, if any; the
// list is terminated by a zero entry.
const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
const MCPhysReg *I =
TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
if (I) {
for (; *I; ++I) {
if (ARM::GPRRegClass.contains(*I))
RetOps.push_back(DAG.getRegister(*I, MVT::i32));
else if (ARM::DPRRegClass.contains(*I))
RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
else
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
}
}
// Update chain and glue.
RetOps[0] = Chain;
if (Flag.getNode())
RetOps.push_back(Flag);
// CPUs which aren't M-class use a special sequence to return from
// exceptions (roughly, any instruction setting pc and cpsr simultaneously,
// though we use "subs pc, lr, #N").
//
// M-class CPUs actually use a normal return sequence with a special
// (hardware-provided) value in LR, so the normal code path works.
if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
!Subtarget->isMClass()) {
if (Subtarget->isThumb1Only())
report_fatal_error("interrupt attribute is not supported in Thumb1");
return LowerInterruptReturn(RetOps, dl, DAG);
}
return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps);
}
/// isUsedByReturnOnly - Return true when the single result of \p N is
/// consumed only by register copies that in turn feed nothing but return
/// nodes. The user of \p N may be a CopyToReg, an ARMISD::VMOVRRD whose users
/// are CopyToReg nodes, or a BITCAST with a single CopyToReg user. On success
/// \p Chain is replaced with the chain operand of the first copy; on failure
/// it is left unchanged.
bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
    return false;

  // If a copy has a glue operand, we conservatively assume it isn't safe to
  // perform a tail call.
  auto HasTrailingGlue = [](SDNode *Node) {
    return Node->getOperand(Node->getNumOperands() - 1).getValueType() ==
           MVT::Glue;
  };

  SDValue NewChain = Chain;
  SDNode *Copy = *N->use_begin();
  switch (Copy->getOpcode()) {
  case ISD::CopyToReg:
    if (HasTrailingGlue(Copy))
      return false;
    NewChain = Copy->getOperand(0);
    break;

  case ARMISD::VMOVRRD: {
    // f64 returned in a pair of GPRs.
    SDNode *VMov = Copy;
    SmallPtrSet<SDNode *, 2> Copies;
    for (SDNode::use_iterator It = VMov->use_begin(), End = VMov->use_end();
         It != End; ++It) {
      if (It->getOpcode() != ISD::CopyToReg)
        return false;
      Copies.insert(*It);
    }
    if (Copies.size() > 2)
      return false;

    for (SDNode::use_iterator It = VMov->use_begin(), End = VMov->use_end();
         It != End; ++It) {
      SDValue UseChain = It->getOperand(0);
      if (Copies.count(UseChain.getNode())) {
        // Chained on the other copy: this is the second CopyToReg.
        Copy = *It;
        continue;
      }
      // Otherwise this is the first CopyToReg, at the top of the chain.
      if (HasTrailingGlue(*It))
        return false;
      NewChain = UseChain;
    }
    break;
  }

  case ISD::BITCAST:
    // f32 returned in a single GPR.
    if (!Copy->hasOneUse())
      return false;
    Copy = *Copy->use_begin();
    if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
      return false;
    if (HasTrailingGlue(Copy))
      return false;
    NewChain = Copy->getOperand(0);
    break;

  default:
    return false;
  }

  // Every user of the (last) copy must be a return node, and there has to be
  // at least one.
  bool SawReturn = false;
  for (SDNode::use_iterator It = Copy->use_begin(), End = Copy->use_end();
       It != End; ++It) {
    const unsigned UserOpc = It->getOpcode();
    if (UserOpc != ARMISD::RET_FLAG && UserOpc != ARMISD::INTRET_FLAG)
      return false;
    SawReturn = true;
  }
  if (!SawReturn)
    return false;

  Chain = NewChain;
  return true;
}
bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
if (!Subtarget->supportsTailCall())
return false;
auto Attr =
CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
if (!CI->isTailCall() || Attr.getValueAsString() == "true")
return false;
return true;
}
// A 64-bit value is being written, so it is first split into two 32-bit
// halves; the rebuilt WRITE_REGISTER node receives the low half followed by
// the high half after the two leading operands of the original node.
static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  SDValue WriteValue = Op->getOperand(2);
  // This function is only supposed to be called for i64 type argument.
  assert(WriteValue.getValueType() == MVT::i64
         && "LowerWRITE_REGISTER called for non-i64 type argument.");

  // Element 0 is the low 32 bits, element 1 the high 32 bits.
  auto ExtractHalf = [&](unsigned Index) {
    return DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
                       DAG.getConstant(Index, DL, MVT::i32));
  };
  SDValue Lo = ExtractHalf(0);
  SDValue Hi = ExtractHalf(1);

  SDValue NewOps[] = {Op->getOperand(0), Op->getOperand(1), Lo, Hi};
  return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, NewOps);
}
// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing mode. These wrapped nodes will be selected
// into MOVi.
//
// LowerConstantPool returns the wrapped target constant pool node for Op,
// except when generating execute-only code, where the constant is promoted
// to a new internal global variable and lowered as a global address instead.
SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
                                             SelectionDAG &DAG) const {
  EVT PtrVT = Op.getValueType();
  // FIXME there is no actual debug info here
  SDLoc dl(Op);
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
  SDValue Res;

  // When generating execute-only code Constant Pools must be promoted to the
  // global data section. It's a bit ugly that we can't share them across basic
  // blocks, but this way we guarantee that execute-only behaves correctly with
  // position-independent addressing modes.
  if (Subtarget->genExecuteOnly()) {
    // NOTE(review): unlike the path below, this one reads getConstVal()
    // without checking isMachineConstantPoolEntry() first - confirm machine
    // constant pool entries cannot reach here.
    auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
    auto T = const_cast<Type*>(CP->getType());
    auto C = const_cast<Constant*>(CP->getConstVal());
    auto M = const_cast<Module*>(DAG.getMachineFunction().
                                 getFunction().getParent());
    // The new global is inserted into module M by the constructor; its name
    // is built from the private prefix, the function number and a fresh PIC
    // label id.
    auto GV = new GlobalVariable(
                    *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
                    Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +
                    Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +
                    Twine(AFI->createPICLabelUId())
                  );
    // GV is a freshly created GlobalVariable, so it converts to the
    // GlobalValue pointer expected here without a checked cast.
    SDValue GA = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
    return LowerGlobalAddress(GA, DAG);
  }

  if (CP->isMachineConstantPoolEntry())
    Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
                                    CP->getAlignment());
  else
    Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
                                    CP->getAlignment());
  return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
}
/// getJumpTableEncoding - The ARM lowering always reports the
/// MachineJumpTableInfo::EK_Inline jump table encoding.
unsigned ARMTargetLowering::getJumpTableEncoding() const {
  const unsigned Encoding = MachineJumpTableInfo::EK_Inline;
  return Encoding;
}
/// LowerBlockAddress - Materialize a block address by loading it from a
/// constant pool entry. For position-independent or ROPI code the entry is
/// created with a PIC label and a PC adjustment (4 for Thumb, 8 otherwise),
/// and the loaded value is combined with that label by an ARMISD::PIC_ADD.
SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
                                             SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
  SDLoc DL(Op);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
  const bool NeedsPICFixup = isPositionIndependent() || Subtarget->isROPI();

  // Create the constant pool entry holding the block address.
  unsigned PICLabelId = 0;
  SDValue PoolAddr;
  if (NeedsPICFixup) {
    const unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
    PICLabelId = FuncInfo->createPICLabelUId();
    ARMConstantPoolValue *PoolVal = ARMConstantPoolConstant::Create(
        BA, PICLabelId, ARMCP::CPBlockAddress, PCAdj);
    PoolAddr = DAG.getTargetConstantPool(PoolVal, PtrVT, 4);
  } else {
    PoolAddr = DAG.getTargetConstantPool(BA, PtrVT, 4);
  }

  // Load the entry through the usual wrapper node.
  PoolAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, PoolAddr);
  SDValue Loaded = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), PoolAddr,
                               MachinePointerInfo::getConstantPool(MF));
  if (NeedsPICFixup) {
    SDValue PICLabel = DAG.getConstant(PICLabelId, DL, MVT::i32);
    return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Loaded, PICLabel);
  }
  return Loaded;
}
/// Lower a TLS address reference for Darwin into the sequence of loads and
/// calls that computes the variable's address, and return the SDValue holding
/// the final result.
///
/// Darwin has a single TLS scheme, and it has to cope with the fully general
/// situation in the worst case:
///    + an "extern __thread" declaration,
///    + defined in a possibly unknown dynamic library.
///
/// Every __thread variable has a [3 x i32] descriptor from which the runtime
/// calculates the address. The compiler only needs to know about the first
/// word: a function pointer that must be called with the address of the
/// entire descriptor in "r0".
///
/// Because the descriptor may live in a different unit, accessing it follows
/// the usual ARM rules. A commonly produced sequence is:
///
///     movw rT1, :lower16:_var$non_lazy_ptr
///     movt rT1, :upper16:_var$non_lazy_ptr
///     ldr r0, [rT1]
///     ldr rT2, [r0]
///     blx rT2
///     [...address now in r0...]
SDValue
ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin() &&
         "This function expects a Darwin target");
  SDLoc DL(Op);
  MachineFunction &MF = DAG.getMachineFunction();

  // Step one: get the address of the actual global symbol, which is where
  // the TLS descriptor lives.
  SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);

  // The descriptor's first entry is the function pointer to call in order to
  // obtain the variable's address.
  SDValue Chain = DAG.getEntryNode();
  SDValue TLVGetter = DAG.getLoad(
      MVT::i32, DL, Chain, DescAddr, MachinePointerInfo::getGOT(MF),
      /* Alignment = */ 4,
      MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant);
  Chain = TLVGetter.getValue(1);

  // A call is about to be emitted, so the frame info is told that this
  // function adjusts the stack.
  MF.getFrameInfo().setAdjustsStack(true);

  // TLS calls preserve all registers except those that absolutely must be
  // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not
  // be silly).
  auto TRI =
      getTargetMachine().getSubtargetImpl(MF.getFunction())->getRegisterInfo();
  auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
  const uint32_t *PreservedMask = ARI->getTLSCallPreservedMask(MF);

  // Finally, make the call. This is a degenerate version of a normal ARM call
  // node: r0 takes the address of the descriptor and returns the address of
  // the variable in this thread.
  Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
  Chain =
      DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
                  Chain, TLVGetter, DAG.getRegister(ARM::R0, MVT::i32),
                  DAG.getRegisterMask(PreservedMask), Chain.getValue(1));
  SDValue CallGlue = Chain.getValue(1);
  return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, CallGlue);
}
/// LowerGlobalTLSAddressWindows - Compute the address of a thread-local
/// variable on Windows: read the TEB, load the TLS array pointer stored at
/// offset 0x2c in it, index that array with _tls_index scaled by 4 to find
/// this thread's TLS data area, and add the variable's section-relative
/// offset loaded from the constant pool.
SDValue
ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");

  SDValue Chain = DAG.getEntryNode();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(Op);

  // Load the current TEB (thread environment block)
  SDValue MRCOps[] = {Chain,
                      DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32),
                      DAG.getConstant(15, DL, MVT::i32),
                      DAG.getConstant(0, DL, MVT::i32),
                      DAG.getConstant(13, DL, MVT::i32),
                      DAG.getConstant(0, DL, MVT::i32),
                      DAG.getConstant(2, DL, MVT::i32)};
  SDValue ReadTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
                                DAG.getVTList(MVT::i32, MVT::Other), MRCOps);
  SDValue TEB = ReadTEB.getValue(0);
  Chain = ReadTEB.getValue(1);

  // Load the ThreadLocalStoragePointer from the TEB
  // A pointer to the TLS array is located at offset 0x2c from the TEB.
  SDValue TLSArrayAddr =
      DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
  SDValue TLSArray =
      DAG.getLoad(PtrVT, DL, Chain, TLSArrayAddr, MachinePointerInfo());

  // The pointer to the thread's TLS data area is at the TLS Index scaled by 4
  // offset into the TLSArray.

  // Load the TLS index from the C runtime
  SDValue TLSIndexSym =
      DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
  SDValue TLSIndexAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndexSym);
  SDValue TLSIndex =
      DAG.getLoad(PtrVT, DL, Chain, TLSIndexAddr, MachinePointerInfo());

  SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
                             DAG.getConstant(2, DL, MVT::i32));
  SDValue SlotAddr = DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot);
  SDValue TLS = DAG.getLoad(PtrVT, DL, Chain, SlotAddr, MachinePointerInfo());

  // Get the offset of the start of the .tls section (section base)
  const auto *GA = cast<GlobalAddressSDNode>(Op);
  auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
  SDValue OffsetAddr = DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
                                   DAG.getTargetConstantPool(CPV, PtrVT, 4));
  SDValue Offset = DAG.getLoad(
      PtrVT, DL, Chain, OffsetAddr,
      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));

  return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
}
// "General dynamic" TLS lowering for ISD::GlobalTLSAddress: build the
// PC-relative TLSGD argument from the constant pool and hand it to
// __tls_get_addr, whose return value is the variable's address.
SDValue
ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
                                                 SelectionDAG &DAG) const {
  SDLoc dl(GA);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // PC adjustment used for the PC-relative entry: 4 in Thumb, 8 otherwise.
  const unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
  const unsigned ARMPCLabelIndex = AFI->createPICLabelUId();

  // Constant-pool entry describing the TLSGD reference to the global.
  ARMConstantPoolValue *CPV =
      ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
                                      ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
  SDValue PoolEntry = DAG.getTargetConstantPool(CPV, PtrVT, 4);
  SDValue PoolAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, PoolEntry);
  SDValue Loaded =
      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), PoolAddr,
                  MachinePointerInfo::getConstantPool(MF));
  SDValue Chain = Loaded.getValue(1);

  // Add the PC using the PIC label allocated above.
  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
  SDValue Argument =
      DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Loaded, PICLabel);

  // Single i32 argument for the call to __tls_get_addr.
  Type *Int32Ty = Type::getInt32Ty(*DAG.getContext());
  ArgListEntry Entry;
  Entry.Node = Argument;
  Entry.Ty = Int32Ty;
  ArgListTy Args;
  Args.push_back(Entry);

  // FIXME: is there useful debug info available here?
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
      CallingConv::C, Int32Ty,
      DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));

  // LowerCallTo yields a pair; its first member is returned as the address.
  return LowerCallTo(CLI).first;
}
// Lowering of ISD::GlobalTLSAddress for the "initial exec" and "local exec"
// models. Both compute ThreadPointer + Offset; they differ only in how the
// offset is obtained.
SDValue
ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
                                        SelectionDAG &DAG,
                                        TLSModel::Model model) const {
  const GlobalValue *GV = GA->getGlobal();
  SDLoc dl(GA);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Chain = DAG.getEntryNode();

  // The thread pointer is needed by both models.
  SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);

  // Wrap a constant-pool entry and load its contents on the given chain.
  auto LoadPoolEntry = [&](ARMConstantPoolValue *Entry, SDValue InChain) {
    SDValue Addr = DAG.getTargetConstantPool(Entry, PtrVT, 4);
    Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
    return DAG.getLoad(
        PtrVT, dl, InChain, Addr,
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  };

  SDValue Offset;
  if (model != TLSModel::InitialExec) {
    // Local exec: the TPOFF constant-pool entry holds the offset directly.
    assert(model == TLSModel::LocalExec);
    ARMConstantPoolValue *CPV =
        ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
    Offset = LoadPoolEntry(CPV, Chain);
  } else {
    // Initial exec: load the PC-relative GOTTPOFF entry, add the PC, then
    // load the offset through the resulting address.
    MachineFunction &MF = DAG.getMachineFunction();
    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
    const unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
    const unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
    ARMConstantPoolValue *CPV =
        ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue,
                                        PCAdj, ARMCP::GOTTPOFF, true);
    Offset = LoadPoolEntry(CPV, Chain);
    Chain = Offset.getValue(1);

    SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
    Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);

    Offset = DAG.getLoad(
        PtrVT, dl, Chain, Offset,
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  }

  // Address of the variable = thread pointer + offset of the variable.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}
/// Entry point for lowering a thread-local global address. Dispatches to the
/// emulated-TLS, Darwin, or Windows lowering when applicable; otherwise picks
/// an ELF lowering based on the TLS model chosen for the global.
SDValue
ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);

  // Emulated TLS takes precedence over any platform-specific scheme.
  if (DAG.getTarget().useEmulatedTLS())
    return LowerToTLSEmulatedModel(GA, DAG);

  if (Subtarget->isTargetDarwin())
    return LowerGlobalTLSAddressDarwin(Op, DAG);

  if (Subtarget->isTargetWindows())
    return LowerGlobalTLSAddressWindows(Op, DAG);

  // TODO: implement the "local dynamic" model
  assert(Subtarget->isTargetELF() && "Only ELF implemented here");

  const TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());

  // Local dynamic currently shares the general dynamic lowering.
  if (Model == TLSModel::GeneralDynamic || Model == TLSModel::LocalDynamic)
    return LowerToTLSGeneralDynamicModel(GA, DAG);

  if (Model == TLSModel::InitialExec || Model == TLSModel::LocalExec)
    return LowerToTLSExecModels(GA, DAG, Model);

  llvm_unreachable("bogus TLS model");
}
/// Return true if all users of V are within function F, looking through
/// ConstantExprs.
static bool allUsersAreInFunction(const Value *V, const Function *F) {
SmallVector<const User*,4> Worklist;
for (auto *U : V->users())
Worklist.push_back(U);
while (!Worklist.empty()) {
auto *U = Worklist.pop_back_val();
if (isa<ConstantExpr>(U)) {
for (auto *UU : U->users())
Worklist.push_back(UU);
continue;
}
auto *I = dyn_cast<Instruction>(U);
if (!I || I->getParent()->getParent() != F)
return false;
}
return true;
}
/// Try to emit the initializer of \p GV directly into the constant pool
/// instead of referencing the global, saving one indirection.
///
/// Returns the wrapped constant-pool address on success, or an empty SDValue
/// when the global is not eligible for promotion.
static SDValue promoteToConstantPool(const ARMTargetLowering *TLI,
                                     const GlobalValue *GV, SelectionDAG &DAG,
                                     EVT PtrVT, const SDLoc &dl) {
  // Inlining a small constant, unnamed-address global into the pool is a win
  // when the constant is used by a single function (no duplication needed) or
  // when duplicating it would not grow the code (constant of at most 4 bytes).
  MachineFunction &MF = DAG.getMachineFunction();
  const Function &F = MF.getFunction();

  // The decision to inline must be idempotent and independent of the use
  // site: once a variable is inlined at one use it is inlined at all of them
  // (reusing the pool entry). Fast-isel is unaware of this optimization, so
  // give up when it is enabled; otherwise we might never emit the GV here
  // while fast-isel generated code still requires it.
  if (!EnableConstpoolPromotion || MF.getTarget().Options.EnableFastISel)
    return SDValue();

  auto *GVar = dyn_cast<GlobalVariable>(GV);
  if (!GVar)
    return SDValue();
  const bool Eligible = GVar->hasInitializer() && GVar->isConstant() &&
                        GVar->hasGlobalUnnamedAddr() &&
                        GVar->hasLocalLinkage();
  if (!Eligible)
    return SDValue();

  // Inlining a value with relocations would move those relocations from
  // .data to .text, which position-independent code does not allow.
  auto *Init = GVar->getInitializer();
  const bool PICLike =
      TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI();
  if (PICLike && Init->needsRelocation())
    return SDValue();

  // The constant islands pass only handles alignments <= 4 bytes and cannot
  // pad constants on its own. So: no alignment above 4, and the size must be
  // a multiple of 4 or be paddable to one. For simplicity only strings are
  // padded.
  auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
  const auto &Layout = DAG.getDataLayout();
  unsigned Size = Layout.getTypeAllocSize(Init->getType());
  unsigned Align = Layout.getPreferredAlignment(GVar);
  const unsigned Tail = Size % 4;
  const bool NeedsPadding = Tail != 0;
  const unsigned PadBytes = NeedsPadding ? 4 - Tail : 0;
  const bool PaddingPossible =
      !NeedsPadding || (CDAInit && CDAInit->isString());
  if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize ||
      Size == 0)
    return SDValue();

  const unsigned PaddedSize = Size + PadBytes;
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // Keep the pool from growing too much, or the ConstantIslands pass may not
  // converge. If this global has not been promoted yet (it can have several
  // uses) and promoting it would enlarge the pool (Size > 4), require that
  // the total increase stays below ConstpoolPromotionMaxTotal.
  if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4) {
    if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
        ConstpoolPromotionMaxTotal)
      return SDValue();
  }

  // Promotion is only valid when every user lives in this one function; the
  // constant cannot be cloned in general. LLVM IR unnamed_addr permits
  // merging constants, not cloning them.
  //
  // Cloning could be allowed if all uses in the current function were proven
  // not to care about the address (e.g. printf format strings), but that is
  // not implemented.
  if (!allUsersAreInFunction(GVar, &F))
    return SDValue();

  // Committed to inlining: rebuild the string with trailing zero bytes when
  // padding is required.
  if (NeedsPadding) {
    StringRef S = CDAInit->getAsString();
    SmallVector<uint8_t,16> V(S.size());
    std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
    for (unsigned I = 0; I != PadBytes; ++I)
      V.push_back(0);
    Init = ConstantDataArray::get(*DAG.getContext(), V);
  }

  auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
  SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, /*Align=*/4);

  // Record the promotion (and its size cost) only the first time.
  if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
    AFI->markGlobalAsPromotedToConstantPool(GVar);
    AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() +
                                      PaddedSize - 4);
  }

  ++NumConstpoolPromoted;
  return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
}
bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const {
if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
if (!(GV = GA->getBaseObject()))
return false;
if (const auto *V = dyn_cast<GlobalVariable>(GV))
return V->isConstant();
return isa<Function>(GV);
}
/// Lower a global address by delegating to the lowering that matches the
/// object format of the target triple (COFF, ELF or MachO).
SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
                                              SelectionDAG &DAG) const {
  const auto Format = Subtarget->getTargetTriple().getObjectFormat();

  if (Format == Triple::COFF)
    return LowerGlobalAddressWindows(Op, DAG);
  if (Format == Triple::ELF)
    return LowerGlobalAddressELF(Op, DAG);
  if (Format == Triple::MachO)
    return LowerGlobalAddressDarwin(Op, DAG);

  llvm_unreachable("unknown object format");
}
/// Lower a global address for ELF targets. Tries, in order: constant-pool
/// promotion, PIC (optionally via the GOT), ROPI PC-relative, RWPI
/// SB-relative, and finally an absolute address via movw/movt or the
/// literal pool.
SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDLoc dl(Op);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  const TargetMachine &TM = getTargetMachine();
  const bool IsRO = isReadOnly(GV);
  const bool IsDSOLocal = TM.shouldAssumeDSOLocal(*GV->getParent(), GV);

  // Only attempt constant-pool promotion when not generating an
  // execute-only (XO) text section.
  if (IsDSOLocal && !Subtarget->genExecuteOnly()) {
    if (SDValue Promoted = promoteToConstantPool(this, GV, DAG, PtrVT, dl))
      return Promoted;
  }

  if (isPositionIndependent()) {
    // Non-DSO-local symbols are reached through a GOT entry.
    const bool ViaGOT = !IsDSOLocal;
    SDValue Sym = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
                                             ViaGOT ? ARMII::MO_GOT : 0);
    SDValue Addr = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, Sym);
    if (!ViaGOT)
      return Addr;
    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Addr,
                       MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  }

  if (Subtarget->isROPI() && IsRO) {
    // Read-only data under ROPI: PC-relative.
    SDValue Sym = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
    return DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, Sym);
  }

  if (Subtarget->isRWPI() && !IsRO) {
    // Writable data under RWPI: SB-relative, with the base taken from R9.
    SDValue RelAddr;
    if (!Subtarget->useMovt()) {
      // No movt: fetch the SB-relative offset from the literal pool.
      ARMConstantPoolValue *CPV =
          ARMConstantPoolConstant::Create(GV, ARMCP::SBREL);
      SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
      RelAddr = DAG.getLoad(
          PtrVT, dl, DAG.getEntryNode(), CPAddr,
          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
    } else {
      ++NumMovwMovt;
      SDValue Sym =
          DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);
      RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, Sym);
    }
    SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
    return DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr);
  }

  // With T2 ops the address can be materialized directly with a movt/movw
  // pair, which is always cheaper.
  if (Subtarget->useMovt()) {
    ++NumMovwMovt;
    // FIXME: Once remat is capable of dealing with instructions with register
    // operands, expand this into two nodes.
    SDValue Sym = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
    return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, Sym);
  }

  // Otherwise load the address out of the constant pool.
  SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
  CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
  return DAG.getLoad(
      PtrVT, dl, DAG.getEntryNode(), CPAddr,
      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
}
/// Lower a global address for Darwin (MachO) targets: wrap the non-lazy
/// target global address, and add a load when the symbol is indirect.
SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
                                                    SelectionDAG &DAG) const {
  assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
         "ROPI/RWPI not currently supported for Darwin");

  SDLoc dl(Op);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();

  if (Subtarget->useMovt())
    ++NumMovwMovt;

  // FIXME: Once remat is capable of dealing with instructions with register
  // operands, expand this into multiple nodes
  const unsigned WrapperOpc =
      isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper;

  SDValue Sym =
      DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
  SDValue Addr = DAG.getNode(WrapperOpc, dl, PtrVT, Sym);

  // Direct symbols are done; indirect ones need one more load.
  if (!Subtarget->isGVIndirectSymbol(GV))
    return Addr;

  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Addr,
                     MachinePointerInfo::getGOT(DAG.getMachineFunction()));
}
/// Lower a global address for Windows (COFF) targets. The address is always
/// wrapped as a movw/movt-style node; dllimport and COFF-stub references are
/// additionally loaded through.
SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
                                                     SelectionDAG &DAG) const {
  assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
  assert(Subtarget->useMovt() &&
         "Windows on ARM expects to use movw/movt");
  assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
         "ROPI/RWPI not currently supported for Windows");

  const TargetMachine &TM = getTargetMachine();
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();

  // dllimport wins; otherwise anything not assumed DSO-local goes through a
  // COFF stub; everything else carries no flag.
  const ARMII::TOF TargetFlags =
      GV->hasDLLImportStorageClass()
          ? ARMII::MO_DLLIMPORT
          : (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)
                 ? ARMII::MO_COFFSTUB
                 : ARMII::MO_NO_FLAG);

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(Op);

  ++NumMovwMovt;

  // FIXME: Once remat is capable of dealing with instructions with register
  // operands, expand this into two nodes.
  SDValue Sym =
      DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0, TargetFlags);
  SDValue Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, Sym);

  if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));

  return Result;
}
/// Lower the SJLJ setjmp node to ARMISD::EH_SJLJ_SETJMP, forwarding the first
/// two operands of \p Op and appending a constant-zero i32 operand. The
/// result has an i32 value and a chain.
SDValue
ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
  SDVTList ResultTys = DAG.getVTList(MVT::i32, MVT::Other);
  return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, ResultTys, Op.getOperand(0),
                     Op.getOperand(1), Zero);
}
/// Lower the SJLJ longjmp node to ARMISD::EH_SJLJ_LONGJMP, forwarding the
/// first two operands of \p Op and appending a constant-zero i32 operand.
SDValue
ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
  return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
                     Op.getOperand(1), Zero);
}
/// Lower the SJLJ setup-dispatch node to ARMISD::EH_SJLJ_SETUP_DISPATCH,
/// keeping only the first operand of \p Op.
SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
                                                       SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue First = Op.getOperand(0);
  return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other, First);
}
/// Custom-lower selected chainless intrinsics. Operand 0 of \p Op is the
/// intrinsic ID; an empty SDValue is returned for intrinsics that are not
/// custom lowered here.
SDValue
ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
                                          const ARMSubtarget *Subtarget) const {
  const unsigned IntNo =
      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  SDLoc dl(Op);

  // Common shape of the two-operand cases: a node of the given opcode with
  // the intrinsic's result type, applied to operands 1 and 2.
  auto EmitBinary = [&](unsigned Opcode) {
    return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
                       Op.getOperand(2));
  };

  switch (IntNo) {
  case Intrinsic::thread_pointer:
    return DAG.getNode(ARMISD::THREAD_POINTER, dl,
                       getPointerTy(DAG.getDataLayout()));

  case Intrinsic::eh_sjlj_lsda: {
    MachineFunction &MF = DAG.getMachineFunction();
    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
    const unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
    EVT PtrVT = getPointerTy(DAG.getDataLayout());
    const bool IsPIC = isPositionIndependent();

    // A PC adjustment is only applied in position-independent mode:
    // 4 for Thumb, 8 otherwise.
    unsigned PCAdj = 0;
    if (IsPIC)
      PCAdj = Subtarget->isThumb() ? 4 : 8;

    ARMConstantPoolValue *CPV =
        ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex,
                                        ARMCP::CPLSDA, PCAdj);
    SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
    SDValue Result = DAG.getLoad(
        PtrVT, dl, DAG.getEntryNode(), CPAddr,
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
    if (!IsPIC)
      return Result;

    SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
    return DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
  }

  case Intrinsic::arm_neon_vabs:
    return DAG.getNode(ISD::ABS, dl, Op.getValueType(), Op.getOperand(1));

  case Intrinsic::arm_neon_vmulls:
  case Intrinsic::arm_neon_vmullu:
    return EmitBinary(IntNo == Intrinsic::arm_neon_vmulls ? ARMISD::VMULLs
                                                          : ARMISD::VMULLu);

  case Intrinsic::arm_neon_vminnm:
  case Intrinsic::arm_neon_vmaxnm:
    return EmitBinary(IntNo == Intrinsic::arm_neon_vminnm ? ISD::FMINNUM
                                                          : ISD::FMAXNUM);

  case Intrinsic::arm_neon_vminu:
  case Intrinsic::arm_neon_vmaxu:
    // Floating-point results are not custom lowered for the unsigned forms.
    if (Op.getValueType().isFloatingPoint())
      return SDValue();
    return EmitBinary(IntNo == Intrinsic::arm_neon_vminu ? ISD::UMIN
                                                         : ISD::UMAX);

  case Intrinsic::arm_neon_vmins:
  case Intrinsic::arm_neon_vmaxs: {
    // v{min,max}s is overloaded between signed integers and floats.
    const bool IsMin = IntNo == Intrinsic::arm_neon_vmins;
    if (Op.getValueType().isFloatingPoint())
      return EmitBinary(IsMin ? ISD::FMINIMUM : ISD::FMAXIMUM);
    return EmitBinary(IsMin ? ISD::SMIN : ISD::SMAX);
  }

  case Intrinsic::arm_neon_vtbl1:
    return EmitBinary(ARMISD::VTBL1);

  case Intrinsic::arm_neon_vtbl2:
    return DAG.getNode(ARMISD::VTBL2, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));

  default:
    return SDValue(); // Don't custom lower most intrinsics.
  }
}
/// Lower ISD::ATOMIC_FENCE. Single-thread fences are returned unchanged;
/// otherwise a DMB intrinsic is emitted, or an MCR-based barrier on
/// subtargets without a data barrier instruction.
static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
                                 const ARMSubtarget *Subtarget) {
  SDLoc dl(Op);
  SDValue Chain = Op.getOperand(0);

  // Operand 2 carries the synchronization scope.
  const auto Scope = static_cast<SyncScope::ID>(
      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
  if (Scope == SyncScope::SingleThread)
    return Op;

  if (!Subtarget->hasDataBarrier()) {
    // Some ARMv6 cpus can support data barriers with an mcr instruction.
    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
    // here.
    assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
           "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
    return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Chain,
                       DAG.getConstant(0, dl, MVT::i32));
  }

  // Operand 1 carries the atomic ordering.
  const auto Ordering = static_cast<AtomicOrdering>(
      cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());

  ARM_MB::MemBOpt Domain;
  if (Subtarget->isMClass()) {
    // Only a full system barrier exists in the M-class architectures.
    Domain = ARM_MB::SY;
  } else if (Subtarget->preferISHSTBarriers() &&
             Ordering == AtomicOrdering::Release) {
    // Swift happens to implement ISHST barriers in a way that's compatible
    // with Release semantics but weaker than ISH so we'd be fools not to use
    // it. Beware: other processors probably don't!
    Domain = ARM_MB::ISHST;
  } else {
    Domain = ARM_MB::ISH;
  }

  return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Chain,
                     DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
                     DAG.getConstant(Domain, dl, MVT::i32));
}
/// Lower a prefetch node to ARMISD::PRELOAD. When the subtarget has no
/// suitable preload instruction the prefetch is dropped and only the chain
/// (operand 0) is returned.
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
                             const ARMSubtarget *Subtarget) {
  SDValue Chain = Op.getOperand(0);

  // ARM pre v5TE and Thumb1 does not have preload instructions.
  const bool HasPreload =
      Subtarget->isThumb2() ||
      (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps());
  if (!HasPreload)
    return Chain; // Just preserve the chain.

  SDLoc dl(Op);

  // isRead is the inverse of bit 0 of operand 2.
  const uint64_t RWOperand =
      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
  unsigned isRead = (RWOperand & 1) ? 0 : 1;

  // Prefetch-for-write needs PLDW: ARMv7 with MP extension has PLDW.
  if (!isRead && !(Subtarget->hasV7Ops() && Subtarget->hasMPExtension()))
    return Chain;

  unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
  if (Subtarget->isThumb()) {
    // Thumb uses the inverted sense for both bits.
    isRead = ~isRead & 1;
    isData = ~isData & 1;
  }

  SDValue ReadFlag = DAG.getConstant(isRead, dl, MVT::i32);
  SDValue DataFlag = DAG.getConstant(isData, dl, MVT::i32);
  return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Chain, Op.getOperand(1),
                     ReadFlag, DataFlag);
}
/// Lower va_start: store the address of the function's VarArgsFrameIndex
/// slot into the memory location given by operand 1, chained on operand 0.
static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
  SDLoc dl(Op);
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());

  // Frame index node for the var-args save area.
  SDValue VarArgsSlot =
      DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);

  // Operand 2 holds the IR value describing the destination memory.
  const Value *DestValue = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();

  return DAG.getStore(Op.getOperand(0), dl, VarArgsSlot, Op.getOperand(1),
                      MachinePointerInfo(DestValue));
}
/// Reassemble an f64 formal argument from its two i32 halves. The first half
/// always arrives in the register described by \p VA; the second half is
/// either in the register or in the stack slot described by \p NextVA. The
/// halves are combined with ARMISD::VMOVDRR, swapped on big-endian targets.
SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
                                                CCValAssign &NextVA,
                                                SDValue &Root,
                                                SelectionDAG &DAG,
                                                const SDLoc &dl) const {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  const TargetRegisterClass *RC = AFI->isThumb1OnlyFunction()
                                      ? &ARM::tGPRRegClass
                                      : &ARM::GPRRegClass;

  // Transform the argument stored in a physical register into a virtual one.
  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
  SDValue FirstHalf = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);

  SDValue SecondHalf;
  if (!NextVA.isMemLoc()) {
    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
    SecondHalf = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
  } else {
    // The second half was passed on the stack: create a 4-byte fixed object
    // at its offset and load from it.
    MachineFrameInfo &MFI = MF.getFrameInfo();
    int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);
    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
    SecondHalf = DAG.getLoad(MVT::i32, dl, Root, FIN,
                             MachinePointerInfo::getFixedStack(MF, FI));
  }

  if (Subtarget->isLittle())
    return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, FirstHalf, SecondHalf);
  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, SecondHalf, FirstHalf);
}
// Spill argument GPRs to the stack, next to the data our caller provided.
// The remaining GPRs contain either the start of the variable-argument data
// or the start of an aggregate passed by value (usually byval). For a
// variadic function the va_list pointer begins with these values; otherwise
// this puts back together a (byval) structure that was split between
// registers and memory.
// Return: The frame index registers were stored into.
int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
                                      const SDLoc &dl, SDValue &Chain,
                                      const Value *OrigArg,
                                      unsigned InRegsParamRecordIdx,
                                      int ArgOffset, unsigned ArgSize) const {
  // Two use-cases are currently possible:
  // Case #1. Non-var-args function, reaching the first byval parameter.
  //          HandleByVal has set the first unallocated register as the first
  //          byval register and consumed all remaining registers; here the
  //          stack frame is initialized with "store-reg" instructions.
  // Case #2. Var-args function without byval parameters.
  //          Likewise: consume all remaining unallocated registers and
  //          initialize the stack frame.
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // Determine the half-open register range [RBegin, REnd) to spill.
  unsigned RBegin, REnd;
  if (InRegsParamRecordIdx >= CCInfo.getInRegsParamsCount()) {
    // No in-regs record: use everything from the first unallocated argument
    // GPR up to (not including) R4.
    unsigned FirstFreeIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
    RBegin = FirstFreeIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[FirstFreeIdx];
    REnd = ARM::R4;
  } else {
    CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
  }

  // When there is something to spill, the slot starts 4 bytes below the
  // incoming stack arguments for each register from RBegin up to R4.
  if (REnd != RBegin)
    ArgOffset = -4 * (ARM::R4 - RBegin);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
  SDValue StoreAddr = DAG.getFrameIndex(FrameIndex, PtrVT);

  const TargetRegisterClass *RC = &ARM::GPRRegClass;
  if (AFI->isThumb1OnlyFunction())
    RC = &ARM::tGPRRegClass;

  // One store per register, at consecutive 4-byte slots.
  SmallVector<SDValue, 4> Stores;
  unsigned SlotIdx = 0;
  for (unsigned Reg = RBegin; Reg < REnd; ++Reg, ++SlotIdx) {
    unsigned VReg = MF.addLiveIn(Reg, RC);
    SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
    SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, StoreAddr,
                                 MachinePointerInfo(OrigArg, 4 * SlotIdx));
    Stores.push_back(Store);
    StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr,
                            DAG.getConstant(4, dl, PtrVT));
  }

  // Merge the stores into the chain handed back to the caller.
  if (!Stores.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);

  return FrameIndex;
}
// Set up the stack area that the va_list pointer will start from, and record
// its frame index in the function info.
void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
                                             const SDLoc &dl, SDValue &Chain,
                                             unsigned ArgOffset,
                                             unsigned TotalArgRegsSaveSize,
                                             bool ForceMutable) const {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // Store any remaining integer argument registers to their stack spots so
  // they can be loaded by dereferencing the result of va_next. If no
  // registers need storing, the slot simply points just past the last
  // argument passed on the stack.
  const unsigned RecordIdx = CCInfo.getInRegsParamsCount();
  const unsigned StackOffset = CCInfo.getNextStackOffset();
  const unsigned SaveSize = std::max(4U, TotalArgRegsSaveSize);

  int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr, RecordIdx,
                                  StackOffset, SaveSize);
  AFI->setVarArgsFrameIndex(FrameIndex);
}
/// Lower the incoming (formal) arguments of the current function.
///
/// Each entry of \p Ins is assigned a location by the calling convention.
/// Register arguments become live-in copies, stack arguments become loads
/// from fixed frame objects, and byval arguments are reassembled on the stack
/// via StoreByValRegs. One value per argument is appended to \p InVals.
/// For variadic functions that use va_start, the remaining argument
/// registers are saved as well.
///
/// \returns the chain, which may have been updated by the byval / varargs
/// register stores.
SDValue ARMTargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
// NOTE(review): ArgValues is declared but not referenced again in this
// function.
SmallVector<SDValue, 16> ArgValues;
SDValue ArgValue;
// CurOrigArg / CurArgIdx track the IR argument that corresponds to the
// location currently being processed (used for byval pointer info).
Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
unsigned CurArgIdx = 0;
// Initially ArgRegsSaveSize is zero.
// Then we increase this value each time we meet byval parameter.
// We also increase this value in case of varargs function.
AFI->setArgRegsSaveSize(0);
// Calculate the amount of stack space that we need to allocate to store
// byval and variadic arguments that are passed in registers.
// We need to know this before we allocate the first byval or variadic
// argument, as they will be allocated a stack slot below the CFA (Canonical
// Frame Address, the stack pointer at entry to the function).
unsigned ArgRegBegin = ARM::R4;
// First pass: find the lowest register used by any in-register byval
// parameter. Stops as soon as all in-regs records have been consumed.
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
break;
CCValAssign &VA = ArgLocs[i];
unsigned Index = VA.getValNo();
ISD::ArgFlagsTy Flags = Ins[Index].Flags;
if (!Flags.isByVal())
continue;
assert(VA.isMemLoc() && "unexpected byval pointer in reg");
unsigned RBegin, REnd;
CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
ArgRegBegin = std::min(ArgRegBegin, RBegin);
CCInfo.nextInRegsParam();
}
// Reset the in-regs record cursor so the main loop below can walk the
// byval records again from the start.
CCInfo.rewindByValRegsInfo();
// ValNo of the last memory argument handled; used to process an Ins[] entry
// only once even when it maps to several ArgLocs entries.
int lastInsIndex = -1;
if (isVarArg && MFI.hasVAStart()) {
// For va_start users, the save area also has to cover every argument GPR
// from the first unallocated one onwards.
unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
if (RegIdx != array_lengthof(GPRArgRegs))
ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
}
// 4 bytes for each register in [ArgRegBegin, R4).
unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
auto PtrVT = getPointerTy(DAG.getDataLayout());
// Main pass: materialize a value for every argument location.
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
if (Ins[VA.getValNo()].isOrigArg()) {
std::advance(CurOrigArg,
Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
}
// Arguments stored in registers.
if (VA.isRegLoc()) {
EVT RegVT = VA.getLocVT();
if (VA.needsCustom()) {
// f64 and vector types are split up into multiple registers or
// combinations of registers and stack slots.
if (VA.getLocVT() == MVT::v2f64) {
// v2f64: built from two f64 halves; each GetF64FormalArgument call
// consumes one extra ArgLocs entry via ++i.
SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i],
Chain, DAG, dl);
// NOTE(review): VA is a reference to an ArgLocs element, so this
// assignment copies the next entry into that element rather than
// rebinding the reference - confirm this is intended.
VA = ArgLocs[++i]; // skip ahead to next loc
SDValue ArgValue2;
if (VA.isMemLoc()) {
// Second f64 half was passed entirely on the stack.
int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN,
MachinePointerInfo::getFixedStack(
DAG.getMachineFunction(), FI));
} else {
ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
Chain, DAG, dl);
}
// Assemble the vector: start from undef and insert elements 0 and 1.
ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
ArgValue, ArgValue1,
DAG.getIntPtrConstant(0, dl));
ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
ArgValue, ArgValue2,
DAG.getIntPtrConstant(1, dl));
} else
ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
} else {
// Plain register argument: pick the register class from the location
// type.
const TargetRegisterClass *RC;
if (RegVT == MVT::f16)
RC = &ARM::HPRRegClass;
else if (RegVT == MVT::f32)
RC = &ARM::SPRRegClass;
else if (RegVT == MVT::f64 || RegVT == MVT::v4f16)
RC = &ARM::DPRRegClass;
else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16)
RC = &ARM::QPRRegClass;
else if (RegVT == MVT::i32)
RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
: &ARM::GPRRegClass;
else
llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
// Transform the arguments in physical registers into virtual ones.
unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
}
// If this is an 8 or 16-bit value, it is really passed promoted
// to 32 bits. Insert an assert[sz]ext to capture this, then
// truncate to the right size.
switch (VA.getLocInfo()) {
default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::BCvt:
ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
break;
case CCValAssign::SExt:
ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
DAG.getValueType(VA.getValVT()));
ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
break;
case CCValAssign::ZExt:
ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
DAG.getValueType(VA.getValVT()));
ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
break;
}
InVals.push_back(ArgValue);
} else { // !VA.isRegLoc(): the argument lives in memory
// sanity check
assert(VA.isMemLoc());
assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
int index = VA.getValNo();
// Some Ins[] entries become multiple ArgLoc[] entries.
// Process them only once.
if (index != lastInsIndex)
{
ISD::ArgFlagsTy Flags = Ins[index].Flags;
// FIXME: For now, all byval parameter objects are marked mutable.
// This can be changed with more analysis.
// In case of tail call optimization mark all arguments mutable.
// Since they could be overwritten by lowering of arguments in case of
// a tail call.
if (Flags.isByVal()) {
assert(Ins[index].isOrigArg() &&
"Byval arguments cannot be implicit");
// Spill the registers belonging to this byval record and hand back the
// frame index of the reassembled object as the argument value.
unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
int FrameIndex = StoreByValRegs(
CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
VA.getLocMemOffset(), Flags.getByValSize());
InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
CCInfo.nextInRegsParam();
} else {
// Ordinary stack argument: fixed object sized from the location type.
unsigned FIOffset = VA.getLocMemOffset();
int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
FIOffset, true);
// Create load nodes to retrieve arguments from the stack.
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
MachinePointerInfo::getFixedStack(
DAG.getMachineFunction(), FI)));
}
lastInsIndex = index;
}
}
}
// varargs
if (isVarArg && MFI.hasVAStart())
VarArgStyleRegisters(CCInfo, DAG, dl, Chain,
CCInfo.getNextStackOffset(),
TotalArgRegsSaveSize);
// Record how many bytes of stack the incoming arguments occupy.
AFI->setArgumentStackSize(CCInfo.getNextStackOffset());
return Chain;
}
/// isFloatingPointZero - Return true if \p Op is known to be +0.0.
///
/// Three shapes are recognised:
///  - a ConstantFPSDNode whose value is positive zero;
///  - an extending or non-extending load whose address operand is an
///    ARMISD::Wrapper around a constant-pool entry holding a positive-zero
///    ConstantFP;
///  - an f64 BITCAST of an ARMISD::VMOVIMM whose first operand is the
///    constant zero (the form created by LowerConstantFP()).
static bool isFloatingPointZero(SDValue Op) {
  if (auto *FPConst = dyn_cast<ConstantFPSDNode>(Op))
    return FPConst->getValueAPF().isPosZero();

  SDNode *Node = Op.getNode();
  if (ISD::isEXTLoad(Node) || ISD::isNON_EXTLoad(Node)) {
    // Maybe this has already been legalized into the constant pool?
    SDValue Address = Op.getOperand(1);
    if (Address.getOpcode() != ARMISD::Wrapper)
      return false;
    auto *PoolEntry = dyn_cast<ConstantPoolSDNode>(Address.getOperand(0));
    if (!PoolEntry)
      return false;
    const auto *PoolFP = dyn_cast<ConstantFP>(PoolEntry->getConstVal());
    return PoolFP && PoolFP->getValueAPF().isPosZero();
  }

  // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
  // created by LowerConstantFP().
  if (Op->getOpcode() != ISD::BITCAST || Op->getValueType(0) != MVT::f64)
    return false;
  SDValue Source = Op->getOperand(0);
  return Source->getOpcode() == ARMISD::VMOVIMM &&
         isNullConstant(Source->getOperand(0));
}
/// Returns appropriate ARM CMP (cmp) and corresponding condition code for
/// the given operands.
///
/// Before the compare node is built, the operands and the condition may be
/// rewritten:
///  - a constant RHS that is not a legal compare immediate is moved by one
///    (and the condition adjusted to match) when the neighbouring value is a
///    legal immediate;
///  - if RHS is not a constant, LHS is a shift operation and RHS is not, the
///    operands are swapped (with the condition swapped accordingly);
///  - GE / LT against a constant zero are replaced by PL / MI.
///
/// \param LHS   Left operand of the comparison. Taken by value, so rewrites
///              made here are not visible to the caller.
/// \param RHS   Right operand of the comparison. Taken by value.
/// \param CC    The ISD condition code to implement. Taken by value.
/// \param ARMcc [out] Set to an i32 constant holding the selected ARMCC code.
/// \param DAG   The DAG in which new nodes are created.
/// \param dl    Debug location attached to the new nodes.
/// \returns an ARMISD::CMPZ node when the final condition is EQ or NE,
///          otherwise an ARMISD::CMP node; either way the result type is
///          MVT::Glue.
SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
SDValue &ARMcc, SelectionDAG &DAG,
const SDLoc &dl) const {
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
unsigned C = RHSC->getZExtValue();
if (!isLegalICmpImmediate((int32_t)C)) {
// Constant does not fit, try adjusting it by one.
switch (CC) {
default: break;
// Signed: x < C  <=>  x <= C-1 and x >= C  <=>  x > C-1, except when C is
// the minimum signed value (C-1 would wrap).
case ISD::SETLT:
case ISD::SETGE:
if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
RHS = DAG.getConstant(C - 1, dl, MVT::i32);
}
break;
// Unsigned: same rewrite, except when C is 0 (C-1 would wrap).
case ISD::SETULT:
case ISD::SETUGE:
if (C != 0 && isLegalICmpImmediate(C-1)) {
CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
RHS = DAG.getConstant(C - 1, dl, MVT::i32);
}
break;
// Signed: x <= C  <=>  x < C+1 and x > C  <=>  x >= C+1, except when C is
// the maximum signed value (C+1 would wrap).
case ISD::SETLE:
case ISD::SETGT:
if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
RHS = DAG.getConstant(C + 1, dl, MVT::i32);
}
break;
// Unsigned: same rewrite, except when C is the maximum unsigned value.
case ISD::SETULE:
case ISD::SETUGT:
if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
RHS = DAG.getConstant(C + 1, dl, MVT::i32);
}
break;
}
}
} else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) &&
(ARM_AM::getShiftOpcForNode(RHS.getOpcode()) == ARM_AM::no_shift)) {
// In ARM and Thumb-2, the compare instructions can shift their second
// operand.
// Move the shift into the second-operand position and swap the condition
// so the comparison keeps its meaning.
CC = ISD::getSetCCSwappedOperands(CC);
std::swap(LHS, RHS);
}
ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
// If the RHS is a constant zero then the V (overflow) flag will never be
// set. This can allow us to simplify GE to PL or LT to MI, which can be
// simpler for other passes (like the peephole optimiser) to deal with.
if (isNullConstant(RHS)) {
switch (CondCode) {
default: break;
case ARMCC::GE:
CondCode = ARMCC::PL;
break;
case ARMCC::LT:
CondCode = ARMCC::MI;
break;
}
}
// Pick the compare node kind from the final condition code.
ARMISD::NodeType CompareType;
switch (CondCode) {
default:
CompareType = ARMISD::CMP;
break;
case ARMCC::EQ:
case ARMCC::NE:
// Uses only Z Flag
CompareType = ARMISD::CMPZ;
break;
}
ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
}
/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
///
/// When \p RHS is recognised as +0.0 an ARMISD::CMPFPw0 node is built from
/// \p LHS alone; otherwise an ARMISD::CMPFP node is built from both operands.
/// The \p InvalidOnQNaN flag is passed to either node as an i32 constant, and
/// the compare is wrapped in an ARMISD::FMSTAT node, which is returned.
SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
                                     SelectionDAG &DAG, const SDLoc &dl,
                                     bool InvalidOnQNaN) const {
  assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
  SDValue QNaNFlag = DAG.getConstant(InvalidOnQNaN, dl, MVT::i32);
  SDValue Compare =
      isFloatingPointZero(RHS)
          ? DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS, QNaNFlag)
          : DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS, QNaNFlag);
  return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Compare);
}
/// duplicateCmp - Glue values can have only one use, so this function
/// builds a second copy of a comparison node.
///
/// \p Cmp must be an ARMISD::CMP, ARMISD::CMPZ, or an ARMISD::FMSTAT whose
/// operand is an ARMISD::CMPFP or ARMISD::CMPFPw0; anything else trips an
/// assertion. The copy reuses the operands of the original node.
SDValue
ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
  SDLoc DL(Cmp);
  const unsigned CmpOpc = Cmp.getOpcode();

  // Integer compares are rebuilt directly from their two operands.
  if (CmpOpc == ARMISD::CMP || CmpOpc == ARMISD::CMPZ)
    return DAG.getNode(CmpOpc, DL, MVT::Glue, Cmp.getOperand(0),
                       Cmp.getOperand(1));

  // Otherwise this must be FMSTAT(fp-compare): rebuild the inner compare,
  // then wrap it in a fresh FMSTAT.
  assert(CmpOpc == ARMISD::FMSTAT && "unexpected comparison operation");
  SDValue FPCompare = Cmp.getOperand(0);
  const unsigned FPOpc = FPCompare.getOpcode();
  SDValue Clone;
  if (FPOpc == ARMISD::CMPFP) {
    Clone = DAG.getNode(FPOpc, DL, MVT::Glue, FPCompare.getOperand(0),
                        FPCompare.getOperand(1), FPCompare.getOperand(2));
  } else {
    assert(FPOpc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
    Clone = DAG.getNode(FPOpc, DL, MVT::Glue, FPCompare.getOperand(0),
                        FPCompare.getOperand(1));
  }
  return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Clone);
}
// This function returns three things: the arithmetic computation itself
// (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The
// comparison and the condition code define the case in which the arithmetic
// computation *does not* overflow.
//
// Op must be one of SADDO, UADDO, SSUBO, USUBO, UMULO or SMULO with an i32
// result; any other opcode reaches llvm_unreachable and any other type trips
// the assertion. The pair returned is (Value, OverflowCmp); ARMcc is written
// through the reference parameter as an i32 constant.
std::pair<SDValue, SDValue>
ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
SDValue &ARMcc) const {
assert(Op.getValueType() == MVT::i32 && "Unsupported value type");
SDValue Value, OverflowCmp;
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
SDLoc dl(Op);
// FIXME: We are currently always generating CMPs because we don't support
// generating CMN through the backend. This is not as good as the natural
// CMP case because it causes a register dependency and cannot be folded
// later.
switch (Op.getOpcode()) {
default:
llvm_unreachable("Unknown overflow instruction!");
case ISD::SADDO:
// Signed add: compare the sum against LHS and test with VC.
ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
break;
case ISD::UADDO:
// Unsigned add: compare the sum against LHS and test with HS.
ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
// We use ADDC here to correspond to its use in LowerUnsignedALUO.
// We do not use it in the USUBO case as Value may not be used.
Value = DAG.getNode(ARMISD::ADDC, dl,
DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)
.getValue(0);
OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
break;
case ISD::SSUBO:
// Signed subtract: compare the original operands and test with VC.
ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
break;
case ISD::USUBO:
// Unsigned subtract: compare the original operands and test with HS.
ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
break;
case ISD::UMULO:
// We generate a UMUL_LOHI and then check if the high word is 0.
ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
Value = DAG.getNode(ISD::UMUL_LOHI, dl,
DAG.getVTList(Op.getValueType(), Op.getValueType()),
LHS, RHS);
OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
DAG.getConstant(0, dl, MVT::i32));
Value = Value.getValue(0); // We only want the low 32 bits for the result.
break;
case ISD::SMULO:
// We generate a SMUL_LOHI and then check if all the bits of the high word
// are the same as the sign bit of the low word.
// The reference value is the low word arithmetically shifted right by 31.
ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
Value = DAG.getNode(ISD::SMUL_LOHI, dl,
DAG.getVTList(Op.getValueType(), Op.getValueType()),
LHS, RHS);
OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
DAG.getNode(ISD::SRA, dl, Op.getValueType(),
Value.getValue(0),
DAG.getConstant(31, dl, MVT::i32)));
Value = Value.getValue(0); // We only want the low 32 bits for the result.
break;
} // switch (...)
return std::make_pair(Value, OverflowCmp);
}
/// Lowers an overflow-reporting arithmetic node into a MERGE_VALUES of the
/// arithmetic result and an overflow flag.
///
/// The arithmetic value, compare and "no overflow" condition come from
/// getARMXALUOOp(); the flag is an ARMISD::CMOV over the constants 1 and 0
/// driven by that condition. Returns an empty SDValue when the result type
/// is not yet legal, so that legalization can expand the node instead.
SDValue
ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const {
  const EVT ResultVT = Op.getValueType();
  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(ResultVT))
    return SDValue();

  SDValue ARMcc;
  const std::pair<SDValue, SDValue> ArithAndCmp =
      getARMXALUOOp(Op, DAG, ARMcc);
  SDValue Arith = ArithAndCmp.first;
  SDValue NoOverflowCmp = ArithAndCmp.second;

  SDValue CPSR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDLoc dl(Op);
  // We use 0 and 1 as false and true values.
  SDValue One = DAG.getConstant(1, dl, MVT::i32);
  SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
  SDValue OverflowFlag = DAG.getNode(ARMISD::CMOV, dl, ResultVT, One, Zero,
                                     ARMcc, CPSR, NoOverflowCmp);

  SDVTList ResultTypes = DAG.getVTList(ResultVT, MVT::i32);
  return DAG.getNode(ISD::MERGE_VALUES, dl, ResultTypes, Arith, OverflowFlag);
}
/// Turns a boolean carry value into a carry flag.
///
/// Builds ARMISD::SUBC BoolCarry, 1 and returns its second result (the i32
/// flags value), discarding the arithmetic result.
static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry,
                                              SelectionDAG &DAG) {
  SDLoc DL(BoolCarry);
  const EVT BoolVT = BoolCarry.getValueType();
  // This converts the boolean value carry into the carry flag by doing
  // ARMISD::SUBC Carry, 1
  SDValue SubWithFlags = DAG.getNode(ARMISD::SUBC, DL,
                                     DAG.getVTList(BoolVT, MVT::i32),
                                     BoolCarry,
                                     DAG.getConstant(1, DL, BoolVT));
  return SubWithFlags.getValue(1);
}
/// Turns a carry flag into a boolean carry value of type \p VT.
///
/// Builds ARMISD::ADDE 0, 0, Flags with result types (VT, i32) and returns
/// that node.
static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT,
                                              SelectionDAG &DAG) {
  SDLoc DL(Flags);
  // Now convert the carry flag into a boolean carry. We do this
  // using ARMISD::ADDE 0, 0, Carry
  SDVTList ResultTypes = DAG.getVTList(VT, MVT::i32);
  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
  return DAG.getNode(ARMISD::ADDE, DL, ResultTypes, Zero, Zero, Flags);
}
/// Lowers UADDO / USUBO into a flag-producing ARMISD::ADDC / ARMISD::SUBC
/// plus a boolean overflow value, merged with ISD::MERGE_VALUES.
///
/// Returns an empty SDValue when the result type is not yet legal, so that
/// legalization can expand the node instead. Any opcode other than UADDO or
/// USUBO is unreachable.
SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
                                             SelectionDAG &DAG) const {
  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
    return SDValue();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);

  const unsigned Opcode = Op.getOpcode();
  if (Opcode != ISD::UADDO && Opcode != ISD::USUBO)
    llvm_unreachable("Unknown overflow instruction!");
  const bool IsSubtract = Opcode == ISD::USUBO;

  SDValue Arith = IsSubtract ? DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS)
                             : DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS);
  // Convert the carry flag into a boolean value.
  SDValue OverflowBit =
      ConvertCarryFlagToBooleanCarry(Arith.getValue(1), VT, DAG);
  if (IsSubtract) {
    // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow
    // value. So compute 1 - C.
    OverflowBit = DAG.getNode(ISD::SUB, dl, MVT::i32,
                              DAG.getConstant(1, dl, MVT::i32), OverflowBit);
  }
  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Arith, OverflowBit);
}
/// Lowers an ISD::SELECT node.
///
/// Three strategies are tried in order:
///  1. If the condition is the overflow result (result number 1) of
///     SADDO/UADDO/SSUBO/USUBO, select directly on the compare produced by
///     getARMXALUOOp(). An empty SDValue is returned if that operation's
///     value type is not legal.
///  2. If the condition is a single-use ARMISD::CMOV between the constants
///     0 and 1, fold the select into a CMOV on a duplicate of that compare.
///  3. Otherwise mask the condition with 1 and emit a SELECT_CC against 0.
SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Cond = Op.getOperand(0);
  SDValue SelectTrue = Op.getOperand(1);
  SDValue SelectFalse = Op.getOperand(2);

  const unsigned CondOpc = Cond.getOpcode();
  const bool IsAddSubOverflow =
      CondOpc == ISD::SADDO || CondOpc == ISD::UADDO ||
      CondOpc == ISD::SSUBO || CondOpc == ISD::USUBO;
  if (Cond.getResNo() == 1 && IsAddSubOverflow) {
    if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
      return SDValue();

    SDValue ARMcc;
    SDValue NoOverflowCmp = getARMXALUOOp(Cond, DAG, ARMcc).second;
    SDValue CPSR = DAG.getRegister(ARM::CPSR, MVT::i32);
    const EVT ResultVT = Op.getValueType();
    return getCMOV(dl, ResultVT, SelectTrue, SelectFalse, ARMcc, CPSR,
                   NoOverflowCmp, DAG);
  }

  // Convert:
  //
  //   (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
  //   (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
  //
  if (CondOpc == ARMISD::CMOV && Cond.hasOneUse()) {
    const auto *FirstConst = dyn_cast<ConstantSDNode>(Cond.getOperand(0));
    const auto *SecondConst = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
    if (FirstConst && SecondConst) {
      const unsigned FirstBits = FirstConst->getZExtValue();
      const unsigned SecondBits = SecondConst->getZExtValue();

      // Both stay empty unless one of the two 0/1 patterns matches.
      SDValue NewFirst;
      SDValue NewSecond;
      if (FirstBits == 1 && SecondBits == 0) {
        NewFirst = SelectTrue;
        NewSecond = SelectFalse;
      } else if (FirstBits == 0 && SecondBits == 1) {
        NewFirst = SelectFalse;
        NewSecond = SelectTrue;
      }

      if (NewFirst.getNode() && NewSecond.getNode()) {
        const EVT ResultVT = Op.getValueType();
        SDValue ARMcc = Cond.getOperand(2);
        SDValue CPSR = Cond.getOperand(3);
        SDValue CmpCopy = duplicateCmp(Cond.getOperand(4), DAG);
        assert(NewFirst.getValueType() == ResultVT);
        return getCMOV(dl, ResultVT, NewFirst, NewSecond, ARMcc, CPSR,
                       CmpCopy, DAG);
      }
    }
  }

  // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
  // undefined bits before doing a full-word comparison with zero.
  const EVT CondVT = Cond.getValueType();
  SDValue One = DAG.getConstant(1, dl, CondVT);
  SDValue MaskedCond = DAG.getNode(ISD::AND, dl, CondVT, Cond, One);
  SDValue Zero = DAG.getConstant(0, dl, CondVT);
  return DAG.getSelectCC(dl, MaskedCond, Zero, SelectTrue, SelectFalse,
                         ISD::SETNE);
}
/// Maps an ISD condition code onto the condition codes assigned here (GE,
/// GT, VS, EQ), reporting which operand swaps are needed to preserve the
/// meaning.
///
/// \param CC         The ISD condition code to implement.
/// \param CondCode   [in,out] Overwritten with the chosen ARM condition when
///                   \p CC is handled; left untouched otherwise.
/// \param swpCmpOps  [in,out] Set/toggled when the compare operands must be
///                   swapped.
/// \param swpVselOps [in,out] Set/toggled when the select operands must be
///                   swapped (i.e. the condition is negated).
static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
                                 bool &swpCmpOps, bool &swpVselOps) {
  // Start by selecting the GE condition code for opcodes that return true for
  // 'equality', and GT for opcodes that return false for 'equality'.
  switch (CC) {
  case ISD::SETUGE:
  case ISD::SETOGE:
  case ISD::SETOLE:
  case ISD::SETULE:
  case ISD::SETGE:
  case ISD::SETLE:
    CondCode = ARMCC::GE;
    break;
  case ISD::SETUGT:
  case ISD::SETOGT:
  case ISD::SETOLT:
  case ISD::SETULT:
  case ISD::SETGT:
  case ISD::SETLT:
    CondCode = ARMCC::GT;
    break;
  default:
    break;
  }

  // Since we are constrained to GE/GT, if the opcode contains 'less', we need
  // to swap the compare operands.
  switch (CC) {
  case ISD::SETOLE:
  case ISD::SETULE:
  case ISD::SETOLT:
  case ISD::SETULT:
  case ISD::SETLE:
  case ISD::SETLT:
    swpCmpOps = true;
    break;
  default:
    break;
  }

  // Both GT and GE are ordered comparisons, and return false for 'unordered'.
  // If we have an unordered opcode, we need to swap the operands to the VSEL
  // instruction (effectively negating the condition).
  //
  // This also has the effect of swapping which one of 'less' or 'greater'
  // returns true, so we also swap the compare operands. It also switches
  // whether we return true for 'equality', so we compensate by picking the
  // opposite condition code to our original choice.
  switch (CC) {
  case ISD::SETULE:
  case ISD::SETULT:
  case ISD::SETUGE:
  case ISD::SETUGT:
    swpCmpOps = !swpCmpOps;
    swpVselOps = !swpVselOps;
    CondCode = (CondCode == ARMCC::GT) ? ARMCC::GE : ARMCC::GT;
    break;
  default:
    break;
  }

  // 'ordered' is 'anything but unordered', so use the VS condition code and
  // swap the VSEL operands.
  if (CC == ISD::SETO) {
    CondCode = ARMCC::VS;
    swpVselOps = true;
  }

  // 'unordered or not equal' is 'anything but equal', so use the EQ condition
  // code and swap the VSEL operands. Also do this if we don't care about the
  // unordered case.
  if (CC == ISD::SETUNE || CC == ISD::SETNE) {
    CondCode = ARMCC::EQ;
    swpVselOps = true;
  }
}
/// Builds an ARMISD::CMOV over (FalseVal, TrueVal, ARMcc, CCR, Cmp).
///
/// When \p VT is f64 and the subtarget lacks FP64, both inputs are split
/// into two i32 parts with ARMISD::VMOVRRD, each part gets its own i32 CMOV
/// (the second one on a duplicate of \p Cmp, since a glue value can only be
/// used once), and the parts are recombined with ARMISD::VMOVDRR.
SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
                                   SDValue TrueVal, SDValue ARMcc, SDValue CCR,
                                   SDValue Cmp, SelectionDAG &DAG) const {
  // Common case: a single conditional move of the requested type.
  if (Subtarget->hasFP64() || VT != MVT::f64)
    return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,
                       Cmp);

  // f64 without FP64: operate on the two 32-bit parts separately.
  SDValue FalseParts = DAG.getNode(ARMISD::VMOVRRD, dl,
                                   DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
  SDValue TrueParts = DAG.getNode(ARMISD::VMOVRRD, dl,
                                  DAG.getVTList(MVT::i32, MVT::i32), TrueVal);

  SDValue FirstPart =
      DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseParts.getValue(0),
                  TrueParts.getValue(0), ARMcc, CCR, Cmp);
  SDValue CmpCopy = duplicateCmp(Cmp, DAG);
  SDValue SecondPart =
      DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseParts.getValue(1),
                  TrueParts.getValue(1), ARMcc, CCR, CmpCopy);
  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, FirstPart, SecondPart);
}
/// Returns true if \p CC is ISD::SETGT or ISD::SETGE.
static bool isGTorGE(ISD::CondCode CC) {
  switch (CC) {
  case ISD::SETGT:
  case ISD::SETGE:
    return true;
  default:
    return false;
  }
}
/// Returns true if \p CC is ISD::SETLT or ISD::SETLE.
static bool isLTorLE(ISD::CondCode CC) {
  switch (CC) {
  case ISD::SETLT:
  case ISD::SETLE:
    return true;
  default:
    return false;
  }
}
// See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating,
// i.e. it clamps the variable from below to the constant K.
// All of these conditions (and their <= and >= counterparts) will do:
//   x < k ? k : x
//   x > k ? x : k
//   k < x ? x : k
//   k > x ? k : x
static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
                            const SDValue TrueVal, const SDValue FalseVal,
                            const ISD::CondCode CC, const SDValue K) {
  // "greater" forms: k > x ? k : x   and   x > k ? x : k
  if (isGTorGE(CC))
    return (K == LHS && K == TrueVal) || (K == RHS && K == FalseVal);
  // "less" forms: x < k ? k : x   and   k < x ? x : k
  if (isLTorLE(CC))
    return (K == RHS && K == TrueVal) || (K == LHS && K == FalseVal);
  return false;
}
// Similar to isLowerSaturate(), but checks for upper-saturating conditions,
// i.e. ones that clamp the variable from above to the constant K.
// All of these conditions (and their <= and >= counterparts) will do:
//   x > k ? k : x
//   x < k ? x : k
//   k < x ? k : x
//   k > x ? x : k
static bool isUpperSaturate(const SDValue LHS, const SDValue RHS,
                            const SDValue TrueVal, const SDValue FalseVal,
                            const ISD::CondCode CC, const SDValue K) {
  // "greater" forms: x > k ? k : x   and   k > x ? x : k
  if (isGTorGE(CC))
    return (K == RHS && K == TrueVal) || (K == LHS && K == FalseVal);
  // "less" forms: k < x ? k : x   and   x < k ? x : k
  if (isLTorLE(CC))
    return (K == LHS && K == TrueVal) || (K == RHS && K == FalseVal);
  return false;
}
// Check if two chained conditionals could be converted into SSAT or USAT.
//
// SSAT can replace a set of two conditional selectors that bound a number to an
// interval of type [~k, k] when k + 1 is a power of 2. Here are some examples:
//
// x < ~k ? ~k : (x > k ? k : x)
// x < ~k ? ~k : (x < k ? x : k)
// x > ~k ? (x > k ? k : x) : ~k
// x < k ? (x < ~k ? ~k : x) : k
// etc.
//
// USAT works similarly to SSAT but bounds on the interval [0, k] where k + 1 is
// a power of 2.
//
// It returns true if the conversion can be done, false otherwise.
// Additionally, the variable is returned in parameter V, the constant in K and
// usat is set to true if the conditional represents an unsigned saturation.
// V, K and usat are only written when the function returns true.
//
// Op is read as a five-operand select: operands 0 and 1 are compared using the
// condition code in operand 4, and operands 2 and 3 are the true and false
// values. NOTE(review): presumably callers only pass ISD::SELECT_CC nodes;
// this is not checked here for the outer node.
static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
uint64_t &K, bool &usat) {
SDValue LHS1 = Op.getOperand(0);
SDValue RHS1 = Op.getOperand(1);
SDValue TrueVal1 = Op.getOperand(2);
SDValue FalseVal1 = Op.getOperand(3);
ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();
// The inner conditional is whichever selected value of the outer one is not
// a constant; it must itself be a SELECT_CC.
const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
if (Op2.getOpcode() != ISD::SELECT_CC)
return false;
SDValue LHS2 = Op2.getOperand(0);
SDValue RHS2 = Op2.getOperand(1);
SDValue TrueVal2 = Op2.getOperand(2);
SDValue FalseVal2 = Op2.getOperand(3);
ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();
// Find out which are the constants and which are the variables
// in each conditional
// K1 / K2 point at the constant comparison operand of the outer / inner
// conditional, or are null when neither comparison operand is a constant.
SDValue *K1 = isa<ConstantSDNode>(LHS1) ? &LHS1 : isa<ConstantSDNode>(RHS1)
? &RHS1
: nullptr;
SDValue *K2 = isa<ConstantSDNode>(LHS2) ? &LHS2 : isa<ConstantSDNode>(RHS2)
? &RHS2
: nullptr;
// K2Tmp is the constant among the inner selected values, V1Tmp / V2Tmp the
// non-constant comparison operands, and V2 the non-constant inner selected
// value.
SDValue K2Tmp = isa<ConstantSDNode>(TrueVal2) ? TrueVal2 : FalseVal2;
SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1;
SDValue V2Tmp = (K2 && *K2 == LHS2) ? RHS2 : LHS2;
SDValue V2 = (K2Tmp == TrueVal2) ? FalseVal2 : TrueVal2;
// We must detect cases where the original operations worked with 16- or
// 8-bit values. In such case, V2Tmp != V2 because the comparison operations
// must work with sign-extended values but the select operations return
// the original non-extended value.
SDValue V2TmpReg = V2Tmp;
if (V2Tmp->getOpcode() == ISD::SIGN_EXTEND_INREG)
V2TmpReg = V2Tmp->getOperand(0);
// Check that the registers and the constants have the correct values
// in both conditionals
if (!K1 || !K2 || *K1 == Op2 || *K2 != K2Tmp || V1Tmp != V2Tmp ||
V2TmpReg != V2)
return false;
// Figure out which conditional is saturating the lower/upper bound.
const SDValue *LowerCheckOp =
isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
? &Op
: isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2)
? &Op2
: nullptr;
const SDValue *UpperCheckOp =
isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
? &Op
: isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2)
? &Op2
: nullptr;
// One conditional must clamp from below and the other from above.
if (!UpperCheckOp || !LowerCheckOp || LowerCheckOp == UpperCheckOp)
return false;
// Check that the constant in the lower-bound check is
// the opposite of the constant in the upper-bound check
// in 1's complement.
int64_t Val1 = cast<ConstantSDNode>(*K1)->getSExtValue();
int64_t Val2 = cast<ConstantSDNode>(*K2)->getSExtValue();
int64_t PosVal = std::max(Val1, Val2);
int64_t NegVal = std::min(Val1, Val2);
// The larger constant must belong to the upper-bound check, and it must be
// one less than a power of two.
if (((Val1 > Val2 && UpperCheckOp == &Op) ||
(Val1 < Val2 && UpperCheckOp == &Op2)) &&
isPowerOf2_64(PosVal + 1)) {
// Handle the difference between USAT (unsigned) and SSAT (signed) saturation
if (Val1 == ~Val2)
usat = false;
else if (NegVal == 0)
usat = true;
else
return false;
V = V2;
K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive
return true;
}
return false;
}
// Check if a condition of the type x < k ? k : x can be converted into a
// bit operation instead of conditional moves.
// Currently this is allowed given:
// - The conditions and values match up
// - k is 0 or -1 (all ones)
// This function will not check the last condition, thats up to the caller
// It returns true if the transformation can be made, and in such case
// returns x in V, and k in SatK.
static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
SDValue &SatK)
{
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
SDValue TrueVal = Op.getOperand(2);
SDValue FalseVal = Op.getOperand(3);
SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS)
? &RHS
: nullptr;
// No constant operation in comparison, early out
if (!K)
return false;
SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
V = (KTmp == TrueVal) ? FalseVal : TrueVal;
SDValue VTmp = (K && *K == LHS) ? RHS : LHS;
// If the constant on left and right side, or variable on left and right,
// does not match, early out
if (*K != KTmp || V != VTmp)
return false;
if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
SatK = *K;
return true;
}
return false;
}
/// Returns true if \p VT is a floating-point type the current subtarget
/// lacks the corresponding feature for: f32 needs VFP2Base, f64 needs FP64,
/// f16 needs FullFP16. Every other type is reported as supported.
bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
  bool Supported = true;
  if (VT == MVT::f32)
    Supported = Subtarget->hasVFP2Base();
  else if (VT == MVT::f64)
    Supported = Subtarget->hasFP64();
  else if (VT == MVT::f16)
    Supported = Subtarget->hasFullFP16();
  return !Supported;
}
/// Lowers an ISD::SELECT_CC node (operands: LHS, RHS, TrueVal, FalseVal,
/// condition code).
///
/// In order, this tries:
///  1. folding two chained saturating selects into ARMISD::SSAT / USAT (only
///     on non-Thumb v6+ or Thumb-2);
///  2. rewriting an i32 lower-saturating select against 0 or -1 as a
///     shift plus AND/OR;
///  3. the generic path: soften unsupported floating-point comparisons, then
///     emit an integer compare + CMOV when the compared type is i32, or a VFP
///     compare + CMOV otherwise (with a second CMOV when the FP condition
///     needs two ARM condition codes).
SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc dl(Op);
// Try to convert two saturating conditional selects into a single SSAT or
// USAT.
SDValue SatValue;
uint64_t SatConstant;
bool SatUSat;
if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) &&
isSaturatingConditional(Op, SatValue, SatConstant, SatUSat)) {
// The node's constant operand is the number of trailing one bits in the
// upper bound.
if (SatUSat)
return DAG.getNode(ARMISD::USAT, dl, VT, SatValue,
DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
else
return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue,
DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
}
// Try to convert expressions of the form x < k ? k : x (and similar forms)
// into more efficient bit operations, which is possible when k is 0 or -1
// On ARM and Thumb-2 which have flexible operand 2 this will result in
// single instructions. On Thumb the shift and the bit operation will be two
// instructions.
// Only allow this transformation on full-width (32-bit) operations
SDValue LowerSatConstant;
if (VT == MVT::i32 &&
isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
// ShiftV is x arithmetically shifted right by 31: all ones when x is
// negative, zero otherwise.
SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
DAG.getConstant(31, dl, VT));
if (isNullConstant(LowerSatConstant)) {
// k == 0: x & ~(x >> 31)
SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
DAG.getAllOnesConstant(dl, VT));
return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
} else if (isAllOnesConstant(LowerSatConstant))
// k == -1: x | (x >> 31)
return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
}
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
SDValue TrueVal = Op.getOperand(2);
SDValue FalseVal = Op.getOperand(3);
if (isUnsupportedFloatingType(LHS.getValueType())) {
DAG.getTargetLoweringInfo().softenSetCCOperands(
DAG, LHS.getValueType(), LHS, RHS, CC, dl);
// If softenSetCCOperands only returned one value, we should compare it to
// zero.
if (!RHS.getNode()) {
RHS = DAG.getConstant(0, dl, LHS.getValueType());
CC = ISD::SETNE;
}
}
if (LHS.getValueType() == MVT::i32) {
// Try to generate VSEL on ARMv8.
// The VSEL instruction can't use all the usual ARM condition
// codes: it only has two bits to select the condition code, so it's
// constrained to use only GE, GT, VS and EQ.
//
// To implement all the various ISD::SETXXX opcodes, we sometimes need to
// swap the operands of the previous compare instruction (effectively
// inverting the compare condition, swapping 'less' and 'greater') and
// sometimes need to swap the operands to the VSEL (which inverts the
// condition in the sense of firing whenever the previous condition didn't)
if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
TrueVal.getValueType() == MVT::f32 ||
TrueVal.getValueType() == MVT::f64)) {
ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
// LT, LE, VC and NE are handled by inverting the condition and swapping
// the selected values.
if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
CC = ISD::getSetCCInverse(CC, true);
std::swap(TrueVal, FalseVal);
}
}
SDValue ARMcc;
SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
// Choose GE over PL, which vsel does not support
if (cast<ConstantSDNode>(ARMcc)->getZExtValue() == ARMCC::PL)
ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
}
// Floating-point comparison from here on.
ARMCC::CondCodes CondCode, CondCode2;
bool InvalidOnQNaN;
FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN);
// Normalize the fp compare. If RHS is zero we prefer to keep it there so we
// match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
// must use VSEL (limited condition codes), due to not having conditional f16
// moves.
if (Subtarget->hasFPARMv8Base() &&
!(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&
(TrueVal.getValueType() == MVT::f16 ||
TrueVal.getValueType() == MVT::f32 ||
TrueVal.getValueType() == MVT::f64)) {
bool swpCmpOps = false;
bool swpVselOps = false;
checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
// Only apply the swaps when the resulting condition is one of the four
// VSEL-compatible codes.
if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
if (swpCmpOps)
std::swap(LHS, RHS);
if (swpVselOps)
std::swap(TrueVal, FalseVal);
}
}
SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN);
SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
// A second condition code (anything other than AL) requires a second CMOV
// layered on top of the first result.
if (CondCode2 != ARMCC::AL) {
SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
// FIXME: Needs another CMP because flag can have but one use.
SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN);
Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);
}
return Result;
}
/// canChangeToInt - Given the fp compare operand, return true if it is suitable
/// to morph to an integer compare sequence.
///
/// The operand qualifies when it has exactly one use, produces at least one
/// value, is either f32 or the subtarget reports slow FP branches, and is
/// either a recognised +0.0 or a normal load. \p SeenZero is set to true
/// (and never cleared) when the operand is a recognised +0.0.
static bool canChangeToInt(SDValue Op, bool &SeenZero,
                           const ARMSubtarget *Subtarget) {
  SDNode *Node = Op.getNode();
  // With more than one use the value would have to be moved from fp to
  // integer registers.
  if (!Node->hasOneUse() || Node->getNumValues() == 0)
    return false;

  // f32 case is generally profitable. f64 case only makes sense when vcmpe +
  // vmrs are very slow, e.g. cortex-a8.
  const bool IsF32 = Op.getValueType() == MVT::f32;
  if (!IsF32 && !Subtarget->isFPBrccSlow())
    return false;

  if (!isFloatingPointZero(Op))
    return ISD::isNormalLoad(Node);

  SeenZero = true;
  return true;
}
/// Produces an i32 value for an fp compare operand.
///
/// A recognised +0.0 becomes the i32 constant 0; a load is re-issued as an
/// i32 load from the same chain, base pointer, pointer info, alignment and
/// memory-operand flags. Any other operand is unreachable.
static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  if (isFloatingPointZero(Op))
    return DAG.getConstant(0, DL, MVT::i32);

  auto *Load = dyn_cast<LoadSDNode>(Op);
  if (!Load)
    llvm_unreachable("Unknown VFP cmp argument!");

  return DAG.getLoad(MVT::i32, DL, Load->getChain(), Load->getBasePtr(),
                     Load->getPointerInfo(), Load->getAlignment(),
                     Load->getMemOperand()->getFlags());
}
/// Produces two i32 values for an fp compare operand.
///
/// A recognised +0.0 yields two i32 zero constants. A load is re-issued as
/// two i32 loads: \p RetVal1 from the original base pointer and \p RetVal2
/// from base + 4, the latter with alignment MinAlign(original, 4). Any other
/// operand is unreachable.
static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
                           SDValue &RetVal1, SDValue &RetVal2) {
  SDLoc dl(Op);

  if (isFloatingPointZero(Op)) {
    RetVal1 = DAG.getConstant(0, dl, MVT::i32);
    RetVal2 = DAG.getConstant(0, dl, MVT::i32);
    return;
  }

  auto *Load = dyn_cast<LoadSDNode>(Op);
  if (!Load)
    llvm_unreachable("Unknown VFP cmp argument!");

  // First word: same address as the original load.
  SDValue BasePtr = Load->getBasePtr();
  RetVal1 = DAG.getLoad(MVT::i32, dl, Load->getChain(), BasePtr,
                        Load->getPointerInfo(), Load->getAlignment(),
                        Load->getMemOperand()->getFlags());

  // Second word: four bytes further on.
  const EVT PtrVT = BasePtr.getValueType();
  const unsigned SecondAlign = MinAlign(Load->getAlignment(), 4);
  SDValue SecondPtr = DAG.getNode(ISD::ADD, dl, PtrVT, BasePtr,
                                  DAG.getConstant(4, dl, PtrVT));
  RetVal2 = DAG.getLoad(MVT::i32, dl, Load->getChain(), SecondPtr,
                        Load->getPointerInfo().getWithOffset(4), SecondAlign,
                        Load->getMemOperand()->getFlags());
}
/// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
/// f32 and even f64 comparisons to integer ones.
///
/// Op is read as a five-operand branch: chain, condition code, LHS, RHS and
/// destination. When both compare operands pass canChangeToInt() and at least
/// one of them is a recognised +0.0, an integer-compare branch is returned;
/// otherwise an empty SDValue is returned.
///
/// NOTE(review): this function does not itself check the unsafe-fp-math
/// setting or that the condition is an equality test; presumably the caller
/// guarantees both - confirm.
SDValue
ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
SDValue LHS = Op.getOperand(2);
SDValue RHS = Op.getOperand(3);
SDValue Dest = Op.getOperand(4);
SDLoc dl(Op);
bool LHSSeenZero = false;
bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
bool RHSSeenZero = false;
bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
// If unsafe fp math optimization is enabled and there are no other uses of
// the CMP operands, and the condition code is EQ or NE, we can optimize it
// to an integer comparison.
if (CC == ISD::SETOEQ)
CC = ISD::SETEQ;
else if (CC == ISD::SETUNE)
CC = ISD::SETNE;
// The mask clears bit 31 of a 32-bit word (the sign bit of an f32), so that
// +0.0 and -0.0 compare equal as integers.
SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
SDValue ARMcc;
if (LHS.getValueType() == MVT::f32) {
// f32: compare the masked 32-bit patterns with an ordinary integer compare.
LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
bitcastf32Toi32(LHS, DAG), Mask);
RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
bitcastf32Toi32(RHS, DAG), Mask);
SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
Chain, Dest, ARMcc, CCR, Cmp);
}
// Otherwise: split each operand into two i32 words and branch on the
// 64-bit comparison node.
SDValue LHS1, LHS2;
SDValue RHS1, RHS2;
expandf64Toi32(LHS, DAG, LHS1, LHS2);
expandf64Toi32(RHS, DAG, RHS1, RHS2);
// Only the second word (the one loaded from offset 4) is masked.
// NOTE(review): this assumes the sign bit lives in that word, i.e. a
// little-endian layout - confirm behaviour for big-endian targets.
LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops);
}
return SDValue();
}
/// Lowers an ISD::BRCOND whose condition is the overflow result of an
/// overflow-reporting arithmetic node.
///
/// Handles the overflow result (result number 1) of SADDO/UADDO/SSUBO/USUBO,
/// and of SMULO/UMULO when the subtarget is not Thumb1-only. The branch is
/// emitted on the opposite of the "no overflow" condition supplied by
/// getARMXALUOOp(). Returns an empty SDValue for any other condition, or when
/// the arithmetic type is not legal.
SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Cond = Op.getOperand(1);
  SDValue Dest = Op.getOperand(2);

  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
  // instruction.
  bool IsFoldableOverflowOp = false;
  switch (Cond.getOpcode()) {
  case ISD::SADDO:
  case ISD::UADDO:
  case ISD::SSUBO:
  case ISD::USUBO:
    IsFoldableOverflowOp = true;
    break;
  case ISD::SMULO:
  case ISD::UMULO:
    IsFoldableOverflowOp = !Subtarget->isThumb1Only();
    break;
  default:
    break;
  }
  if (Cond.getResNo() != 1 || !IsFoldableOverflowOp)
    return SDValue();

  // Only lower legal XALUO ops.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
    return SDValue();

  // The actual operation with overflow check.
  SDValue ARMcc;
  SDValue NoOverflowCmp = getARMXALUOOp(Cond, DAG, ARMcc).second;

  // Reverse the condition code.
  const ARMCC::CondCodes NoOverflowCC = static_cast<ARMCC::CondCodes>(
      cast<const ConstantSDNode>(ARMcc)->getZExtValue());
  const ARMCC::CondCodes OverflowCC =
      ARMCC::getOppositeCondition(NoOverflowCC);
  ARMcc = DAG.getConstant(OverflowCC, SDLoc(ARMcc), MVT::i32);

  SDValue CPSR = DAG.getRegister(ARM::CPSR, MVT::i32);
  return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CPSR,
                     NoOverflowCmp);
}
/// Custom-lower ISD::BR_CC. In order, this handles:
///   1. operands of an unsupported floating-point type (softened first),
///   2. a comparison of an overflow result against 0 or 1,
///   3. plain i32 comparisons,
///   4. (in)equality FP comparisons under UnsafeFPMath,
///   5. general VFP comparisons, which may need two conditional branches.
SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
  SDValue InChain = Op.getOperand(0);
  ISD::CondCode Pred = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue CmpL = Op.getOperand(2);
  SDValue CmpR = Op.getOperand(3);
  SDValue Target = Op.getOperand(4);
  SDLoc DL(Op);

  if (isUnsupportedFloatingType(CmpL.getValueType())) {
    DAG.getTargetLoweringInfo().softenSetCCOperands(
        DAG, CmpL.getValueType(), CmpL, CmpR, Pred, DL);

    // If softenSetCCOperands only returned one value, we should compare it
    // to zero.
    if (!CmpR.getNode()) {
      CmpR = DAG.getConstant(0, DL, CmpL.getValueType());
      Pred = ISD::SETNE;
    }
  }

  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
  // instruction.
  const unsigned LHSOpc = CmpL.getOpcode();
  const bool IsMulOverflow = LHSOpc == ISD::SMULO || LHSOpc == ISD::UMULO;
  const bool OptimizeMul = IsMulOverflow && !Subtarget->isThumb1Only();
  const bool IsAddSubOverflow = LHSOpc == ISD::SADDO ||
                                LHSOpc == ISD::UADDO ||
                                LHSOpc == ISD::SSUBO || LHSOpc == ISD::USUBO;
  const bool IsEqualityPred = Pred == ISD::SETEQ || Pred == ISD::SETNE;

  if (CmpL.getResNo() == 1 && (isOneConstant(CmpR) || isNullConstant(CmpR)) &&
      (IsAddSubOverflow || OptimizeMul) && IsEqualityPred) {
    // Only lower legal XALUO ops.
    if (!DAG.getTargetLoweringInfo().isTypeLegal(CmpL->getValueType(0)))
      return SDValue();

    // The actual operation with overflow check.
    SDValue ARMcc;
    SDValue Value, OverflowCmp;
    std::tie(Value, OverflowCmp) =
        getARMXALUOOp(CmpL.getValue(0), DAG, ARMcc);

    // "flag != 1" and "flag == 0" both need the opposite condition.
    if ((Pred == ISD::SETNE) != isOneConstant(CmpR)) {
      const auto OverflowCC = static_cast<ARMCC::CondCodes>(
          cast<const ConstantSDNode>(ARMcc)->getZExtValue());
      ARMcc = DAG.getConstant(ARMCC::getOppositeCondition(OverflowCC),
                              SDLoc(ARMcc), MVT::i32);
    }

    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
    return DAG.getNode(ARMISD::BRCOND, DL, MVT::Other, InChain, Target, ARMcc,
                       CCR, OverflowCmp);
  }

  // Integer compare-and-branch.
  if (CmpL.getValueType() == MVT::i32) {
    SDValue ARMcc;
    SDValue IntCmp = getARMCmp(CmpL, CmpR, Pred, ARMcc, DAG, DL);
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
    return DAG.getNode(ARMISD::BRCOND, DL, MVT::Other, InChain, Target, ARMcc,
                       CCR, IntCmp);
  }

  // With unsafe FP math, (in)equality tests get a chance at the VFP branch
  // optimisation; fall through if it declines.
  const bool IsFPEqualityPred = Pred == ISD::SETEQ || Pred == ISD::SETOEQ ||
                                Pred == ISD::SETNE || Pred == ISD::SETUNE;
  if (getTargetMachine().Options.UnsafeFPMath && IsFPEqualityPred) {
    if (SDValue Optimized = OptimizeVFPBrcond(Op, DAG))
      return Optimized;
  }

  // General floating-point case: translate the predicate into one or two ARM
  // condition codes and branch on the VFP comparison.
  ARMCC::CondCodes PrimaryCC, SecondaryCC;
  bool InvalidOnQNaN;
  FPCCToARMCC(Pred, PrimaryCC, SecondaryCC, InvalidOnQNaN);

  SDValue ARMcc = DAG.getConstant(PrimaryCC, DL, MVT::i32);
  SDValue FPCmp = getVFPCmp(CmpL, CmpR, DAG, DL, InvalidOnQNaN);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDVTList BrVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue FirstOps[] = { InChain, Target, ARMcc, CCR, FPCmp };
  SDValue Branch = DAG.getNode(ARMISD::BRCOND, DL, BrVTs, FirstOps);

  // A second branch, glued to the first, covers the other condition code when
  // one is required.
  if (SecondaryCC != ARMCC::AL) {
    ARMcc = DAG.getConstant(SecondaryCC, DL, MVT::i32);
    SDValue SecondOps[] = { Branch, Target, ARMcc, CCR, Branch.getValue(1) };
    Branch = DAG.getNode(ARMISD::BRCOND, DL, BrVTs, SecondOps);
  }
  return Branch;
}
/// Custom-lower ISD::BR_JT. The entry address is computed as
/// Table + Index * 4; how it is used depends on the subtarget and on whether
/// position-independent (or ROPI) code is being generated.
SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
  SDValue InChain = Op.getOperand(0);
  SDValue TableOp = Op.getOperand(1);
  SDValue RawIndex = Op.getOperand(2);
  SDLoc DL(Op);

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  JumpTableSDNode *JTNode = cast<JumpTableSDNode>(TableOp);
  SDValue JTI = DAG.getTargetJumpTable(JTNode->getIndex(), PtrVT);
  SDValue TableBase = DAG.getNode(ARMISD::WrapperJT, DL, MVT::i32, JTI);
  SDValue ByteOffset = DAG.getNode(ISD::MUL, DL, PtrVT, RawIndex,
                                   DAG.getConstant(4, DL, PtrVT));
  SDValue EntryAddr = DAG.getNode(ISD::ADD, DL, PtrVT, TableBase, ByteOffset);

  if (Subtarget->isThumb2() ||
      (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
    // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the
    // jump table which does another jump to the destination. This also makes
    // it easier to translate it to TBB / TBH later (Thumb2 only).
    // FIXME: This might not work if the function is extremely large.
    return DAG.getNode(ARMISD::BR2_JT, DL, MVT::Other, InChain, EntryAddr,
                       RawIndex, JTI);
  }

  if (isPositionIndependent() || Subtarget->isROPI()) {
    // The loaded i32 entry is added to the table base to form the branch
    // target.
    SDValue Entry =
        DAG.getLoad(EVT(MVT::i32), DL, InChain, EntryAddr,
                    MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
    SDValue LoadChain = Entry.getValue(1);
    SDValue Dest = DAG.getNode(ISD::ADD, DL, PtrVT, TableBase, Entry);
    return DAG.getNode(ARMISD::BR_JT, DL, MVT::Other, LoadChain, Dest, JTI);
  }

  // Otherwise the loaded pointer-sized entry is used as the target directly.
  SDValue Dest =
      DAG.getLoad(PtrVT, DL, InChain, EntryAddr,
                  MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
  SDValue LoadChain = Dest.getValue(1);
  return DAG.getNode(ARMISD::BR_JT, DL, MVT::Other, LoadChain, Dest, JTI);
}
/// Lower a vector FP_TO_SINT / FP_TO_UINT.
///  - i32 result elements: returned unchanged when the source elements are
///    f32, otherwise the operation is unrolled.
///  - v4i16 / v8i16 results: converted at the integer type matching the
///    source vector and then truncated to the requested type.
///  - anything else is unrolled.
static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
  const EVT ResVT = Op.getValueType();
  SDLoc DL(Op);
  const EVT SrcVT = Op.getOperand(0).getValueType();

  if (ResVT.getVectorElementType() == MVT::i32) {
    if (SrcVT.getVectorElementType() == MVT::f32)
      return Op;
    return DAG.UnrollVectorOp(Op.getNode());
  }

  const bool HasFullFP16 =
      static_cast<const ARMSubtarget &>(DAG.getSubtarget()).hasFullFP16();

  // Integer vector type in which the conversion itself is performed.
  EVT ConvVT;
  if (SrcVT == MVT::v4f32) {
    ConvVT = MVT::v4i32;
  } else if (HasFullFP16 && SrcVT == MVT::v4f16) {
    ConvVT = MVT::v4i16;
  } else if (HasFullFP16 && SrcVT == MVT::v8f16) {
    ConvVT = MVT::v8i16;
  } else {
    llvm_unreachable("Invalid type for custom lowering!");
  }

  const bool IsNarrowResult = ResVT == MVT::v4i16 || ResVT == MVT::v8i16;
  if (!IsNarrowResult)
    return DAG.UnrollVectorOp(Op.getNode());

  SDValue Converted =
      DAG.getNode(Op.getOpcode(), DL, ConvVT, Op.getOperand(0));
  return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Converted);
}
/// Custom-lower FP_TO_SINT / FP_TO_UINT. Vector results are delegated to
/// LowerVectorFP_TO_INT; a scalar source of an unsupported floating-point type
/// becomes a runtime library call; everything else is returned unchanged.
SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
  if (Op.getValueType().isVector())
    return LowerVectorFP_TO_INT(Op, DAG);

  if (!isUnsupportedFloatingType(Op.getOperand(0).getValueType()))
    return Op;

  // Pick the signed or unsigned conversion routine for this type pair.
  const RTLIB::Libcall LC =
      Op.getOpcode() == ISD::FP_TO_SINT
          ? RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(),
                               Op.getValueType())
          : RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(),
                               Op.getValueType());

  return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
                     /*isSigned*/ false, SDLoc(Op)).first;
}
/// Lower a vector SINT_TO_FP / UINT_TO_FP.
///  - i32 source elements: returned unchanged when the result elements are
///    f32, otherwise the operation is unrolled.
///  - v4i16 / v8i16 sources: sign- or zero-extended to the integer type
///    matching the result vector, then converted.
///  - result types without a matching integer type are unrolled.
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
  const EVT ResVT = Op.getValueType();
  SDLoc DL(Op);

  if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
    if (ResVT.getVectorElementType() == MVT::f32)
      return Op;
    return DAG.UnrollVectorOp(Op.getNode());
  }

  assert((Op.getOperand(0).getValueType() == MVT::v4i16 ||
          Op.getOperand(0).getValueType() == MVT::v8i16) &&
         "Invalid type for custom lowering!");

  const bool HasFullFP16 =
      static_cast<const ARMSubtarget &>(DAG.getSubtarget()).hasFullFP16();

  // Integer vector type the source is extended to before converting.
  EVT ExtVT;
  if (ResVT == MVT::v4f32) {
    ExtVT = MVT::v4i32;
  } else if (HasFullFP16 && ResVT == MVT::v4f16) {
    ExtVT = MVT::v4i16;
  } else if (HasFullFP16 && ResVT == MVT::v8f16) {
    ExtVT = MVT::v8i16;
  } else {
    return DAG.UnrollVectorOp(Op.getNode());
  }

  // The extension kind follows the signedness of the conversion.
  const unsigned ConvOpc = Op.getOpcode();
  unsigned ExtOpc;
  switch (ConvOpc) {
  case ISD::SINT_TO_FP:
    ExtOpc = ISD::SIGN_EXTEND;
    break;
  case ISD::UINT_TO_FP:
    ExtOpc = ISD::ZERO_EXTEND;
    break;
  default:
    llvm_unreachable("Invalid opcode!");
  }

  SDValue Extended = DAG.getNode(ExtOpc, DL, ExtVT, Op.getOperand(0));
  return DAG.getNode(ConvOpc, DL, ResVT, Extended);
}
/// Custom-lower SINT_TO_FP / UINT_TO_FP. Vector results are delegated to
/// LowerVectorINT_TO_FP; a scalar result of an unsupported floating-point type
/// becomes a runtime library call; everything else is returned unchanged.
SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
  const EVT ResVT = Op.getValueType();
  if (ResVT.isVector())
    return LowerVectorINT_TO_FP(Op, DAG);

  if (!isUnsupportedFloatingType(ResVT))
    return Op;

  // Pick the signed or unsigned conversion routine for this type pair.
  const RTLIB::Libcall LC =
      Op.getOpcode() == ISD::SINT_TO_FP
          ? RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
                               Op.getValueType())
          : RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
                               Op.getValueType());

  return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
                     /*isSigned*/ false, SDLoc(Op)).first;
}
/// Custom-lower FCOPYSIGN: produce operand 0 with its sign bit replaced by the
/// sign bit of operand 1. Two strategies are used:
///  - a NEON path that selects bits through a mask: (Tmp1 & Mask) | (Tmp0 & ~Mask);
///  - an integer path that masks and ORs the 32-bit word holding the sign.
/// The result type (VT) and the type of the sign source (SrcVT) may differ.
SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
// Implement fcopysign with a fabs and a conditional fneg.
SDValue Tmp0 = Op.getOperand(0);
SDValue Tmp1 = Op.getOperand(1);
SDLoc dl(Op);
EVT VT = Op.getValueType();
EVT SrcVT = Tmp1.getValueType();
// The NEON path is skipped when operand 0 is produced by a BITCAST or a
// VMOVDRR node, even if NEON is available.
// NOTE(review): presumably because such a value already lives in core
// registers -- confirm.
bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
Tmp0.getOpcode() == ARMISD::VMOVDRR;
bool UseNEON = !InGPR && Subtarget->hasNEON();
if (UseNEON) {
// Use VBSL to copy the sign bit.
// NOTE(review): the modified-immediate (0x6, 0x80) presumably encodes
// 0x80000000 in each 32-bit lane -- confirm against createNEONModImm.
unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80);
SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
// All bitwise work is done in v2i32 for an f32 result and v1i64 otherwise.
EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
// For an f64 result the mask is shifted left by 32 within the 64-bit lane;
// for an f32 result operand 0 is placed into a v2f32 vector instead.
if (VT == MVT::f64)
Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
DAG.getConstant(32, dl, MVT::i32));
else /*if (VT == MVT::f32)*/
Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
// Line the sign source up with the mask when source and result widths differ:
// an f32 source feeding an f64 result is shifted left by 32, and a non-f32
// source feeding an f32 result is shifted right by 32.
if (SrcVT == MVT::f32) {
Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
if (VT == MVT::f64)
Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
DAG.getConstant(32, dl, MVT::i32));
} else if (VT == MVT::f32)
Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64,
DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
DAG.getConstant(32, dl, MVT::i32));
Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
// Build the complement of the mask by XOR-ing it with a v8i8 VMOVIMM constant
// (named AllOnes here).
SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff),
dl, MVT::i32);
AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
// Res = (Tmp1 & Mask) | (Tmp0 & ~Mask)
SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
// Convert the vector result back to the scalar result type: element 0 of the
// v2f32 view for f32, a plain bitcast for f64.
if (VT == MVT::f32) {
Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
DAG.getConstant(0, dl, MVT::i32));
} else {
Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
}
return Res;
}
// Integer path.
// Bitcast operand 1 to i32.
// For an f64 sign source only result #1 of VMOVRRD is used.
if (SrcVT == MVT::f64)
Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
Tmp1).getValue(1);
Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
// Or in the signbit with integer operations.
// Mask1 keeps only bit 31; Mask2 keeps everything except bit 31.
SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
if (VT == MVT::f32) {
Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
}
// f64: Or the high part with signbit and then combine two parts.
Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
Tmp0);
SDValue Lo = Tmp0.getValue(0);
SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
}
/// Custom-lower ISD::RETURNADDR. The function is first marked as having its
/// return address taken. Depth 0 yields a copy of LR (registered as a
/// live-in); a non-zero depth loads from the frame address of that depth
/// plus 4.
SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
  MachineFunction &MF = DAG.getMachineFunction();
  MF.getFrameInfo().setReturnAddressIsTaken(true);

  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
    return SDValue();

  const EVT PtrVT = Op.getValueType();
  SDLoc DL(Op);
  const unsigned Depth =
      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

  if (Depth == 0) {
    // Return LR, which contains the return address. Mark it an implicit
    // live-in.
    unsigned LRVReg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
    return DAG.getCopyFromReg(DAG.getEntryNode(), DL, LRVReg, PtrVT);
  }

  // Outer frames: the value is loaded from [frame address + 4].
  SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
  SDValue SlotOffset = DAG.getConstant(4, DL, MVT::i32);
  return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(),
                     DAG.getNode(ISD::ADD, DL, PtrVT, FrameAddr, SlotOffset),
                     MachinePointerInfo());
}
/// Custom-lower ISD::FRAMEADDR. Starts from a copy of the frame register and
/// performs one load through the current value per requested depth level.
/// The function is marked as having its frame address taken.
SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
  const ARMBaseRegisterInfo &ARI =
      *static_cast<const ARMBaseRegisterInfo *>(RegInfo);
  MachineFunction &MF = DAG.getMachineFunction();
  MF.getFrameInfo().setFrameAddressIsTaken(true);

  const EVT PtrVT = Op.getValueType();
  SDLoc DL(Op); // FIXME probably not meaningful
  const unsigned Depth =
      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  const unsigned FrameReg = ARI.getFrameRegister(MF);

  SDValue Addr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, FrameReg, PtrVT);
  // Each level dereferences the current address to reach the next frame.
  for (unsigned Level = 0; Level != Depth; ++Level)
    Addr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Addr,
                       MachinePointerInfo());
  return Addr;
}
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
unsigned ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT,
SelectionDAG &DAG) const {
unsigned Reg = StringSwitch<unsigned>(RegName)
.Case("sp", ARM::SP)
.Default(0);
if (Reg)
return Reg;
report_fatal_error(Twine("Invalid register name \""
+ StringRef(RegName) + "\"."));
}
/// Expand a READ_REGISTER that produces an i64: re-issue it as a
/// READ_REGISTER yielding two i32 values (plus a chain) and combine those
/// with BUILD_PAIR.
/// Two entries are appended to \p Results: the rebuilt i64 value, followed by
/// operand 0 of the new READ_REGISTER node.
static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                SelectionDAG &DAG) {
  SDLoc Loc(N);

  // This function is only supposed to be called for i64 type destination.
  assert(N->getValueType(0) == MVT::i64
         && "ExpandREAD_REGISTER called for non-i64 type result.");

  SDVTList SplitVTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
  SDValue SplitRead = DAG.getNode(ISD::READ_REGISTER, Loc, SplitVTs,
                                  N->getOperand(0), N->getOperand(1));

  SDValue Combined = DAG.getNode(ISD::BUILD_PAIR, Loc, MVT::i64,
                                 SplitRead.getValue(0), SplitRead.getValue(1));
  Results.push_back(Combined);
  Results.push_back(SplitRead.getOperand(0));
}
/// \p BC is a bitcast that is about to be turned into a VMOVDRR.
/// When the destination type of \p BC is a vector and the bitcast source is
/// an EXTRACT_VECTOR_ELT with a constant index and a single use, the pair can
/// be replaced so that everything stays on the vector register bank:
///
///   vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
///     vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
///
/// \return The node that would replace \p BC if the combine is possible, an
/// empty SDValue otherwise.
static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
                                                SelectionDAG &DAG) {
  SDValue Src = BC->getOperand(0);
  const EVT DstVT = BC->getValueType(0);

  // If the destination type is not a vector, there is not much point in
  // forcing everything onto the vector bank.
  if (!DstVT.isVector())
    return SDValue();
  // The only vector instruction that can produce a scalar from a vector is
  // EXTRACT_VECTOR_ELT (the source type is i64, since the bitcast was about
  // to become a VMOVDRR).
  if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return SDValue();
  // The combine is only done when the extract has a single use.
  if (!Src.hasOneUse())
    return SDValue();

  // If the index is not constant, we will introduce an additional multiply
  // that will stick. Give up in that case.
  auto *IdxC = dyn_cast<ConstantSDNode>(Src.getOperand(1));
  if (!IdxC)
    return SDValue();

  // Scale the element index by the number of destination elements, using the
  // same bit width as the original index constant.
  const unsigned DstNumElt = DstVT.getVectorNumElements();
  const APInt &OldIdx = IdxC->getAPIntValue();
  APInt ScaledIdx(OldIdx.getBitWidth(), DstNumElt);
  ScaledIdx *= OldIdx;

  // Give up when the index constant is wider than 32 bits (this tests the
  // bit width of the constant, not its value).
  if (ScaledIdx.getBitWidth() > 32)
    return SDValue();

  SDLoc DL(Src);
  SDValue Vec = Src.getOperand(0);
  const unsigned WideNumElts =
      Vec.getValueType().getVectorNumElements() * DstNumElt;
  EVT WideVT =
      EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), WideNumElts);
  SDValue Recast = DAG.getNode(ISD::BITCAST, DL, WideVT, Vec);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DstVT, Recast,
                     DAG.getConstant(ScaledIdx.getZExtValue(), DL, MVT::i32));
}
/// ExpandBITCAST - If the target supports VFP, this function is called to
/// expand a bit convert where either the source or destination type is i64 to
/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
/// operand type is illegal (e.g., v2f32 for a target that doesn't support
/// vectors), since the legalizer won't know what to do with that.
///
/// In addition to the i64 cases, three half-precision related patterns are
/// handled first: f32 -> i32 feeding ARMISD::VMOVhr, i16 -> f16 (becomes
/// VMOVhr), and f16 -> i16 feeding a zero-extended return value (becomes
/// VMOVrh).
///
/// Returns the replacement value, or an empty SDValue when nothing is done.
static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
const ARMSubtarget *Subtarget) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDLoc dl(N);
SDValue Op = N->getOperand(0);
// This function is only supposed to be called for i64 types, either as the
// source or destination of the bit convert.
EVT SrcVT = Op.getValueType();
EVT DstVT = N->getValueType(0);
const bool HasFullFP16 = Subtarget->hasFullFP16();
if (SrcVT == MVT::f32 && DstVT == MVT::i32) {
// FullFP16: half values are passed in S-registers, and we don't
// need any of the bitcast and moves:
//
// t2: f32,ch = CopyFromReg t0, Register:f32 %0
// t5: i32 = bitcast t2
// t18: f16 = ARMISD::VMOVhr t5
// NOTE(review): HasFullFP16 is not tested on this path, and the
// Op.getValueType() check repeats the SrcVT == f32 test above.
if (Op.getOpcode() != ISD::CopyFromReg ||
Op.getValueType() != MVT::f32)
return SDValue();
// Only the first entry of N's use list is inspected.
// NOTE(review): use_begin() is dereferenced without checking that N has any
// uses -- presumably guaranteed by the caller; confirm.
auto Move = N->use_begin();
if (Move->getOpcode() != ARMISD::VMOVhr)
return SDValue();
// Re-create the CopyFromReg with an f16 result and redirect the users of the
// VMOVhr node to it.
SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) };
SDValue Copy = DAG.getNode(ISD::CopyFromReg, SDLoc(Op), MVT::f16, Ops);
DAG.ReplaceAllUsesWith(*Move, &Copy);
return Copy;
}
if (SrcVT == MVT::i16 && DstVT == MVT::f16) {
if (!HasFullFP16)
return SDValue();
// SoftFP: read half-precision arguments:
//
// t2: i32,ch = ...
// t7: i16 = truncate t2 <~~~~ Op
// t8: f16 = bitcast t7 <~~~~ N
//
// The opcode of Op is not inspected here; only the type of its first operand
// is required to be i32.
if (Op.getOperand(0).getValueType() == MVT::i32)
return DAG.getNode(ARMISD::VMOVhr, SDLoc(Op),
MVT::f16, Op.getOperand(0));
return SDValue();
}
// Half-precision return values
if (SrcVT == MVT::f16 && DstVT == MVT::i16) {
if (!HasFullFP16)
return SDValue();
//
// t11: f16 = fadd t8, t10
// t12: i16 = bitcast t11 <~~~ SDNode N
// t13: i32 = zero_extend t12
// t16: ch,glue = CopyToReg t0, Register:i32 %r0, t13
// t17: ch = ARMISD::RET_FLAG t16, Register:i32 %r0, t16:1
//
// transform this into:
//
// t20: i32 = ARMISD::VMOVrh t11
// t16: ch,glue = CopyToReg t0, Register:i32 %r0, t20
//
// N must have exactly one use, and that use must be an i32 ZERO_EXTEND.
auto ZeroExtend = N->use_begin();
if (N->use_size() != 1 || ZeroExtend->getOpcode() != ISD::ZERO_EXTEND ||
ZeroExtend->getValueType(0) != MVT::i32)
return SDValue();
// NOTE(review): only the first use of the ZERO_EXTEND and the first use of
// the CopyToReg are examined, and neither use list is checked for being
// empty before it is dereferenced -- confirm this is guaranteed here.
auto Copy = ZeroExtend->use_begin();
if (Copy->getOpcode() == ISD::CopyToReg &&
Copy->use_begin()->getOpcode() == ARMISD::RET_FLAG) {
SDValue Cvt = DAG.getNode(ARMISD::VMOVrh, SDLoc(Op), MVT::i32, Op);
DAG.ReplaceAllUsesWith(*ZeroExtend, &Cvt);
return Cvt;
}
return SDValue();
}
// From here on, only bitcasts with an i64 source or destination are handled.
if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
return SDValue();
// Turn i64->f64 into VMOVDRR.
if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
// Do not force values to GPRs (this is what VMOVDRR does for the inputs)
// if we can combine the bitcast with its source.
if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
return Val;
// Split the i64 into its two i32 elements (element 0 and element 1) and
// rebuild an f64 from them; a final bitcast yields the requested DstVT.
SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
DAG.getConstant(0, dl, MVT::i32));
SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
DAG.getConstant(1, dl, MVT::i32));
return DAG.getNode(ISD::BITCAST, dl, DstVT,
DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
}
// Turn f64->i64 into VMOVRRD.
if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
SDValue Cvt;
// On big-endian targets a multi-element source vector goes through VREV64
// before being split.
if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
SrcVT.getVectorNumElements() > 1)
Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
DAG.getVTList(MVT::i32, MVT::i32),
DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
else
Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
DAG.getVTList(MVT::i32, MVT::i32), Op);
// Merge the pieces into a single i64 value.
return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
}
return SDValue();
}
/// getZeroVector - Build an all-zero vector of type \p VT.
/// Zero vectors are used to represent vector negation and in those cases
/// will be implemented with the NEON VNEG instruction. However, VNEG does
/// not support i64 elements, so sometimes the zero vectors will need to be
/// explicitly constructed. Regardless, the zero is always materialised as a
/// canonical VMOV (v4i32 for 128-bit vectors, v2i32 otherwise) and then
/// bitcast to \p VT.
static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
  assert(VT.isVector() && "Expected a vector type");

  // The canonical modified immediate encoding of a zero vector is....0!
  SDValue ZeroImm = DAG.getTargetConstant(0, dl, MVT::i32);

  EVT CanonVT = MVT::v2i32;
  if (VT.is128BitVector())
    CanonVT = MVT::v4i32;

  SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, CanonVT, ZeroImm);
  return DAG.getNode(ISD::BITCAST, dl, VT, ZeroVec);
}
/// LowerShiftRightParts - Lower SRA_PARTS and SRL_PARTS, which return two
/// i32 values and take a 2 x i32 value to shift plus a shift amount.
///
/// Operands are (Lo, Hi, ShAmt). For each result half two candidates are
/// built -- one for a "small" shift and one for a "big" shift -- and an
/// ARMISD::CMOV keyed on the comparison (ShAmt - VTBits) >= 0 picks one.
/// NOTE(review): which CMOV operand is chosen when the condition holds is not
/// visible here; from the names, presumably the *BigShift value -- confirm.
SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
EVT VT = Op.getValueType();
unsigned VTBits = VT.getSizeInBits();
SDLoc dl(Op);
SDValue ShOpLo = Op.getOperand(0);
SDValue ShOpHi = Op.getOperand(1);
SDValue ShAmt = Op.getOperand(2);
SDValue ARMcc;
SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
// Arithmetic shift for SRA_PARTS, logical shift for SRL_PARTS.
unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
// RevShAmt = VTBits - ShAmt
SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
// ExtraShAmt = ShAmt - VTBits
SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
DAG.getConstant(VTBits, dl, MVT::i32));
SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
// Low half, small shift: (Lo >> ShAmt) | (Hi << RevShAmt).
SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
// Low half, big shift: Hi shifted right by ExtraShAmt.
SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
// getARMCmp also fills in ARMcc, which the following CMOV consumes.
SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
ISD::SETGE, ARMcc, DAG, dl);
SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift,
ARMcc, CCR, CmpLo);
// High half, small shift: Hi shifted right by ShAmt.
SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
// High half, big shift: Hi >> (VTBits - 1) for the arithmetic form, zero for
// the logical form.
SDValue HiBigShift = Opc == ISD::SRA
? DAG.getNode(Opc, dl, VT, ShOpHi,
DAG.getConstant(VTBits - 1, dl, VT))
: DAG.getConstant(0, dl, VT);
// The same comparison is built a second time for the high half; ARMcc is
// written again by this call.
SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
ISD::SETGE, ARMcc, DAG, dl);
SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
ARMcc, CCR, CmpHi);
// Return both halves as a single multi-result value: (Lo, Hi).
SDValue Ops[2] = { Lo, Hi };
return DAG.getMergeValues(Ops, dl);
}
/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
/// i32 values and take a 2 x i32 value to shift plus a shift amount.
///
/// Operands are (Lo, Hi, ShAmt). For each result half a "small" shift and a
/// "big" shift candidate are built, and an ARMISD::CMOV keyed on the
/// comparison (ShAmt - VTBits) >= 0 picks one.
/// NOTE(review): which CMOV operand is chosen when the condition holds is not
/// visible here; from the names, presumably the big-shift value -- confirm.
SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
EVT VT = Op.getValueType();
unsigned VTBits = VT.getSizeInBits();
SDLoc dl(Op);
SDValue ShOpLo = Op.getOperand(0);
SDValue ShOpHi = Op.getOperand(1);
SDValue ShAmt = Op.getOperand(2);
SDValue ARMcc;
SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
assert(Op.getOpcode() == ISD::SHL_PARTS);
// RevShAmt = VTBits - ShAmt
SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
// High half, small shift: (Lo >> RevShAmt) | (Hi << ShAmt).
SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
// ExtraShAmt = ShAmt - VTBits
SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
DAG.getConstant(VTBits, dl, MVT::i32));
// High half, big shift: Lo shifted left by ExtraShAmt.
SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
// getARMCmp also fills in ARMcc, which the following CMOV consumes.
SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
ISD::SETGE, ARMcc, DAG, dl);
SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
ARMcc, CCR, CmpHi);
// The same comparison is built a second time for the low half; ARMcc is
// written again by this call.
SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
ISD::SETGE, ARMcc, DAG, dl);
// Low half: Lo << ShAmt for a small shift, the constant 0 as the alternative.
SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift,
DAG.getConstant(0, dl, VT), ARMcc, CCR, CmpLo);
// Return both halves as a single multi-result value: (Lo, Hi).
SDValue Ops[2] = { Lo, Hi };
return DAG.getMergeValues(Ops, dl);
}
/// Lower FLT_ROUNDS_ by reading the FPSCR through the arm_get_fpscr intrinsic
/// and remapping its rounding-mode field.
SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                                            SelectionDAG &DAG) const {
  // The rounding mode is in bits 23:22 of the FPSCR.
  // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3,
  // 3->0. The formula we use to implement this is
  //   (((FPSCR + 1 << 22) >> 22) & 3)
  // so that the shift + and get folded into a bitfield extract.
  SDLoc DL(Op);

  SDValue ReadOps[] = { DAG.getEntryNode(),
                        DAG.getConstant(Intrinsic::arm_get_fpscr, DL,
                                        MVT::i32) };
  SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, MVT::i32, ReadOps);

  // Add 1 at bit 22, i.e. to the rounding-mode field.
  SDValue Bias = DAG.getConstant(1U << 22, DL, MVT::i32);
  SDValue Biased = DAG.getNode(ISD::ADD, DL, MVT::i32, FPSCR, Bias);

  // Move the field down to bit 0 ...
  SDValue FieldPos = DAG.getConstant(22, DL, MVT::i32);
  SDValue Shifted = DAG.getNode(ISD::SRL, DL, MVT::i32, Biased, FieldPos);

  // ... and keep only its two bits.
  SDValue FieldMask = DAG.getConstant(3, DL, MVT::i32);
  return DAG.getNode(ISD::AND, DL, MVT::i32, Shifted, FieldMask);
}
/// Lower CTTZ / CTTZ_ZERO_UNDEF.
///  - Scalars: ctlz(bitreverse(x)) when the subtarget has V6T2 ops, otherwise
///    an empty SDValue is returned.
///  - Vectors (NEON asserted): isolate the least significant set bit with
///    x & -x, then either (width - 1) - ctlz(lsb) for i16/i32 elements of
///    CTTZ_ZERO_UNDEF, or ctpop(lsb - 1) for everything else.
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
                         const ARMSubtarget *ST) {
  SDLoc DL(N);
  const EVT VT = N->getValueType(0);

  if (!VT.isVector()) {
    if (!ST->hasV6T2Ops())
      return SDValue();

    SDValue Reversed = DAG.getNode(ISD::BITREVERSE, DL, VT, N->getOperand(0));
    return DAG.getNode(ISD::CTLZ, DL, VT, Reversed);
  }

  assert(ST->hasNEON());

  // Compute the least significant set bit: LSB = X & -X
  SDValue Input = N->getOperand(0);
  SDValue Zero = getZeroVector(VT, DAG, DL);
  SDValue Negated = DAG.getNode(ISD::SUB, DL, VT, Zero, Input);
  SDValue LSB = DAG.getNode(ISD::AND, DL, VT, Input, Negated);

  const EVT EltVT = VT.getVectorElementType();
  const bool IsHalfOrWord = EltVT == MVT::i16 || EltVT == MVT::i32;

  if (IsHalfOrWord && N->getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
    // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
    const unsigned NumBits = EltVT.getSizeInBits();
    SDValue WidthMinus1 =
        DAG.getNode(ARMISD::VMOVIMM, DL, VT,
                    DAG.getTargetConstant(NumBits - 1, DL, EltVT));
    SDValue LeadingZeros = DAG.getNode(ISD::CTLZ, DL, VT, LSB);
    return DAG.getNode(ISD::SUB, DL, VT, WidthMinus1, LeadingZeros);
  }

  // Compute with: cttz(x) = ctpop(lsb - 1)
  // i8 elements take the generic subtract-one form below.
  SDValue LSBMinus1;
  if (EltVT == MVT::i64) {
    // Load constant 0xffff'ffff'ffff'ffff to register and add it, which
    // subtracts one.
    SDValue AllOnes =
        DAG.getNode(ARMISD::VMOVIMM, DL, VT,
                    DAG.getTargetConstant(0x1eff, DL, MVT::i32));
    LSBMinus1 = DAG.getNode(ISD::ADD, DL, VT, LSB, AllOnes);
  } else {
    SDValue One = DAG.getNode(ARMISD::VMOVIMM, DL, VT,
                              DAG.getTargetConstant(1, DL, EltVT));
    LSBMinus1 = DAG.getNode(ISD::SUB, DL, VT, LSB, One);
  }
  return DAG.getNode(ISD::CTPOP, DL, VT, LSBMinus1);
}
/// Lower a vector CTPOP whose elements are wider than 8 bits: count bits per
/// byte on the v8i8 / v16i8 view of the operand, then repeatedly combine
/// neighbouring lanes with the arm_neon_vpaddlu intrinsic, doubling the
/// element width each step until it matches the requested type.
static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
                          const ARMSubtarget *ST) {
  const EVT VT = N->getValueType(0);
  SDLoc DL(N);

  assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
  assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
          VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
         "Unexpected type for custom ctpop lowering");

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  const bool Is64Bit = VT.is64BitVector();

  // Per-byte population count.
  const EVT ByteVT = Is64Bit ? MVT::v8i8 : MVT::v16i8;
  SDValue Acc = DAG.getBitcast(ByteVT, N->getOperand(0));
  Acc = DAG.getNode(ISD::CTPOP, DL, ByteVT, Acc);

  // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
  unsigned EltBits = 8;
  unsigned Lanes = Is64Bit ? 8 : 16;
  while (EltBits != VT.getScalarSizeInBits()) {
    SDValue PairAddOps[] = {
        DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
                        TLI.getPointerTy(DAG.getDataLayout())),
        Acc};

    // Each step halves the lane count and doubles the lane width.
    EltBits *= 2;
    Lanes /= 2;
    const MVT WideVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), Lanes);
    Acc = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WideVT, PairAddOps);
  }
  return Acc;
}
/// getVShiftImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift operation, where all the elements of the
/// build_vector must have the same constant integer value.
/// On success the sign-extended splat value is stored in \p Cnt and true is
/// returned; \p Cnt is left untouched otherwise.
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
  // Ignore bit_converts.
  while (Op.getOpcode() == ISD::BITCAST)
    Op = Op.getOperand(0);

  auto *BuildVec = dyn_cast<BuildVectorSDNode>(Op.getNode());
  if (!BuildVec)
    return false;

  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (!BuildVec->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
                                 HasAnyUndefs, ElementBits))
    return false;

  // The splat must not be wider than one element.
  if (SplatBitSize > ElementBits)
    return false;

  Cnt = SplatBits.getSExtValue();
  return true;
}
/// isVShiftLImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift left operation. The splat value, returned in
/// \p Cnt, must satisfy:
///   0 <= Value < ElementBits for a left shift; or
///   0 <= Value <= ElementBits for a long left shift.
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");

  const int64_t ElementBits = VT.getScalarSizeInBits();
  if (!getVShiftImm(Op, ElementBits, Cnt))
    return false;

  if (Cnt < 0)
    return false;
  // The long form additionally accepts a shift by the full element width.
  return isLong ? Cnt <= ElementBits : Cnt < ElementBits;
}
/// isVShiftRImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift right operation. For a shift opcode, the value
/// is positive, but for an intrinsic the value count must be negative. The
/// absolute value must be in the range:
///   1 <= |Value| <= ElementBits for a right shift; or
///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
/// For the intrinsic form, \p Cnt is negated before returning true, so the
/// caller always receives a positive shift amount on success.
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
                         int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");

  const int64_t ElementBits = VT.getScalarSizeInBits();
  if (!getVShiftImm(Op, ElementBits, Cnt))
    return false;

  const int64_t MaxShift = isNarrow ? ElementBits / 2 : ElementBits;

  if (!isIntrinsic)
    return Cnt >= 1 && Cnt <= MaxShift;

  if (Cnt < -MaxShift || Cnt > -1)
    return false;

  Cnt = -Cnt;
  return true;
}
/// Lower a vector SHL / SRA / SRL. Non-vector types are left alone (an empty
/// SDValue is returned).
///
/// We essentially have two forms here. Shift by an immediate and shift by a
/// vector register (there are also shift by a gpr, but that is just handled
/// with a tablegen pattern). We cannot easily match shift by an immediate in
/// tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM.
/// For shifting by a vector, we don't have VSHR, only VSHL (which can be
/// signed or unsigned, and a negative shift indicates a shift right).
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
                          const ARMSubtarget *ST) {
  const EVT VT = N->getValueType(0);
  SDLoc DL(N);
  int64_t Imm;

  if (!VT.isVector())
    return SDValue();

  SDValue Value = N->getOperand(0);
  SDValue Amount = N->getOperand(1);

  if (N->getOpcode() == ISD::SHL) {
    if (!isVShiftLImm(Amount, VT, false, Imm))
      return DAG.getNode(ARMISD::VSHLu, DL, VT, Value, Amount);
    return DAG.getNode(ARMISD::VSHLIMM, DL, VT, Value,
                       DAG.getConstant(Imm, DL, MVT::i32));
  }

  assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
         "unexpected vector shift opcode");
  const bool IsArithmetic = N->getOpcode() == ISD::SRA;

  if (isVShiftRImm(Amount, VT, false, false, Imm)) {
    const unsigned ImmOpc =
        IsArithmetic ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM;
    return DAG.getNode(ImmOpc, DL, VT, Value,
                       DAG.getConstant(Imm, DL, MVT::i32));
  }

  // Other right shifts we don't have operations for (we use a shift left by a
  // negative number).
  const EVT AmountVT = Amount.getValueType();
  SDValue NegatedAmount = DAG.getNode(
      ISD::SUB, DL, AmountVT, getZeroVector(AmountVT, DAG, DL), Amount);
  const unsigned RegOpc = IsArithmetic ? ARMISD::VSHLs : ARMISD::VSHLu;
  return DAG.getNode(RegOpc, DL, VT, Value, NegatedAmount);
}
/// Custom lowering for scalar 64-bit shifts (ISD::SHL, ISD::SRL, ISD::SRA).
///
/// Returns an empty SDValue when this routine declines to handle the node:
///  - the result type is not i64;
///  - with MVE integer ops, the shift amount is a constant greater than 32;
///  - without MVE integer ops, the node is not an SRL/SRA by the constant 1,
///    or the subtarget is Thumb1-only.
/// In every other case the operand is split into two i32 halves, shifted, and
/// reassembled with ISD::BUILD_PAIR.
static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
const ARMSubtarget *ST) {
EVT VT = N->getValueType(0);
SDLoc dl(N);
// We can get here for a node like i32 = ISD::SHL i32, i64
if (VT != MVT::i64)
return SDValue();
assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA ||
N->getOpcode() == ISD::SHL) &&
"Unknown shift to lower!");
unsigned ShOpc = N->getOpcode();
if (ST->hasMVEIntegerOps()) {
// MVE path: emit a single two-result LSLL/LSRL/ASRL node operating on the
// (Lo, Hi) register pair. LSLL is the default and covers ISD::SHL.
SDValue ShAmt = N->getOperand(1);
unsigned ShPartsOpc = ARMISD::LSLL;
ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt);
// If the shift amount is greater than 32 then do the default optimisation
if (Con && Con->getZExtValue() > 32)
return SDValue();
// Extract the lower 32 bits of the shift amount if it's an i64
if (ShAmt->getValueType(0) == MVT::i64)
ShAmt = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, ShAmt,
DAG.getConstant(0, dl, MVT::i32));
if (ShOpc == ISD::SRL) {
if (!Con)
// There is no t2LSRLr instruction so negate and perform an lsll if the
// shift amount is in a register, emulating a right shift.
ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
DAG.getConstant(0, dl, MVT::i32), ShAmt);
else
// Else generate an lsrl on the immediate shift amount
ShPartsOpc = ARMISD::LSRL;
} else if (ShOpc == ISD::SRA)
ShPartsOpc = ARMISD::ASRL;
// Lower 32 bits of the destination/source
SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
DAG.getConstant(0, dl, MVT::i32));
// Upper 32 bits of the destination/source
SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
DAG.getConstant(1, dl, MVT::i32));
// Generate the shift operation as computed above
Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi,
ShAmt);
// The upper 32 bits come from the second return value of lsll
Hi = SDValue(Lo.getNode(), 1);
return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
}
// Non-MVE path.
// We only lower SRA, SRL of 1 here, all others use generic lowering.
if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL)
return SDValue();
// If we are in thumb mode, we don't have RRX.
if (ST->isThumb1Only())
return SDValue();
// Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr.
SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
DAG.getConstant(0, dl, MVT::i32));
SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
DAG.getConstant(1, dl, MVT::i32));
// First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
// captures the result into a carry flag.
unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG;
Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi);
// The low part is an ARMISD::RRX operand, which shifts the carry in.
// Hi.getValue(1) is the glue result of the flag-setting shift built above.
Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
// Merge the pieces into a single i64 value.
return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
}
/// Lower a vector SETCC node to ARM vector compare nodes.
///
/// Operands 0 and 1 of \p Op are the values compared and operand 2 is the
/// condition code. The comparison is computed in CmpVT (the operand type with
/// its element type changed to integer), then sign-extended or truncated to
/// the node's result type. Conditions without a direct compare opcode are
/// expressed by swapping the operands (Swap), inverting the result (Invert),
/// or OR-ing two compares together.
///
/// Returns an empty SDValue for 64-bit element comparisons other than integer
/// SETEQ/SETNE.
static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
SDValue TmpOp0, TmpOp1;
bool Invert = false;
bool Swap = false;
unsigned Opc = 0;
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue CC = Op.getOperand(2);
EVT CmpVT = Op0.getValueType().changeVectorElementTypeToInteger();
EVT VT = Op.getValueType();
ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
SDLoc dl(Op);
if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
(SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
// Special-case integer 64-bit equality comparisons. They aren't legal,
// but they can be lowered with a few vector instructions.
// Both operands are reinterpreted as vectors of i32 with twice as many
// elements and compared for equality; AND-ing the result with its VREV64
// makes a 64-bit lane all-ones only when both of its 32-bit halves matched.
unsigned CmpElements = CmpVT.getVectorNumElements() * 2;
EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements);
SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0);
SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1);
SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1,
DAG.getCondCode(ISD::SETEQ));
SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp);
SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed);
Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged);
// SETNE is the bitwise complement of the SETEQ mask.
if (SetCCOpcode == ISD::SETNE)
Merged = DAG.getNOT(dl, Merged, CmpVT);
Merged = DAG.getSExtOrTrunc(Merged, dl, VT);
return Merged;
}
if (CmpVT.getVectorElementType() == MVT::i64)
// 64-bit comparisons are not legal in general.
return SDValue();
if (Op1.getValueType().isFloatingPoint()) {
// Floating-point comparisons. Cases deliberately fall through so that a
// condition sets Swap/Invert and then shares the opcode of the next case.
switch (SetCCOpcode) {
default: llvm_unreachable("Illegal FP comparison");
case ISD::SETUNE:
case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH;
case ISD::SETOEQ:
case ISD::SETEQ: Opc = ARMISD::VCEQ; break;
case ISD::SETOLT:
case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETOGT:
case ISD::SETGT: Opc = ARMISD::VCGT; break;
case ISD::SETOLE:
case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETOGE:
case ISD::SETGE: Opc = ARMISD::VCGE; break;
case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break;
case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break;
case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH;
case ISD::SETONE:
// Expand this to (OLT | OGT).
// Op0/Op1 are replaced by the two compares; the final node is their OR.
TmpOp0 = Op0;
TmpOp1 = Op1;
Opc = ISD::OR;
Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0);
Op1 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp0, TmpOp1);
break;
case ISD::SETUO:
Invert = true;
LLVM_FALLTHROUGH;
case ISD::SETO:
// Expand this to (OLT | OGE).
TmpOp0 = Op0;
TmpOp1 = Op1;
Opc = ISD::OR;
Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0);
Op1 = DAG.getNode(ARMISD::VCGE, dl, CmpVT, TmpOp0, TmpOp1);
break;
}
} else {
// Integer comparisons.
switch (SetCCOpcode) {
default: llvm_unreachable("Illegal integer comparison");
case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH;
case ISD::SETEQ: Opc = ARMISD::VCEQ; break;
case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETGT: Opc = ARMISD::VCGT; break;
case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETGE: Opc = ARMISD::VCGE; break;
case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETUGT: Opc = ARMISD::VCGTU; break;
case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETUGE: Opc = ARMISD::VCGEU; break;
}
// Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
if (Opc == ARMISD::VCEQ) {
// AndOp is whichever operand is compared against the all-zeros vector; it
// stays null when neither operand is an all-zeros build vector.
SDValue AndOp;
if (ISD::isBuildVectorAllZeros(Op1.getNode()))
AndOp = Op0;
else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
AndOp = Op1;
// Ignore bitconvert.
if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
AndOp = AndOp.getOperand(0);
if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
Opc = ARMISD::VTST;
Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
// The equality sense is flipped relative to VCEQ, so toggle the pending
// inversion rather than overwrite it.
Invert = !Invert;
}
}
}
if (Swap)
std::swap(Op0, Op1);
// If one of the operands is a constant vector zero, attempt to fold the
// comparison to a specialized compare-against-zero form.
SDValue SingleOp;
if (ISD::isBuildVectorAllZeros(Op1.getNode()))
SingleOp = Op0;
else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
// Zero is on the left-hand side: (0 >= x) becomes (x <= 0) and (0 > x)
// becomes (x < 0). Other opcodes are left unchanged.
if (Opc == ARMISD::VCGE)
Opc = ARMISD::VCLEZ;
else if (Opc == ARMISD::VCGT)
Opc = ARMISD::VCLTZ;
SingleOp = Op1;
}
SDValue Result;
if (SingleOp.getNode()) {
switch (Opc) {
case ARMISD::VCEQ:
Result = DAG.getNode(ARMISD::VCEQZ, dl, CmpVT, SingleOp); break;
case ARMISD::VCGE:
Result = DAG.getNode(ARMISD::VCGEZ, dl, CmpVT, SingleOp); break;
case ARMISD::VCLEZ:
Result = DAG.getNode(ARMISD::VCLEZ, dl, CmpVT, SingleOp); break;
case ARMISD::VCGT:
Result = DAG.getNode(ARMISD::VCGTZ, dl, CmpVT, SingleOp); break;
case ARMISD::VCLTZ:
Result = DAG.getNode(ARMISD::VCLTZ, dl, CmpVT, SingleOp); break;
default:
// No compare-against-zero form for this opcode; emit the two-operand node.
Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1);
}
} else {
Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1);
}
Result = DAG.getSExtOrTrunc(Result, dl, VT);
if (Invert)
Result = DAG.getNOT(dl, Result, VT);
return Result;
}
/// Lower a SETCCCARRY node (operands: LHS, RHS, incoming borrow, condition
/// code) to an ARMISD::SUBE whose flag result drives an ARMISD::CMOV that
/// selects between the constants 0 and 1.
static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  SDValue Lhs = Op.getOperand(0);
  SDValue Rhs = Op.getOperand(1);
  SDValue CarryIn = Op.getOperand(2);
  SDValue CondOp = Op.getOperand(3);

  assert(Lhs.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");

  // The incoming value is a borrow (ISD::SUBCARRY convention) while
  // ARMISD::SUBE consumes a carry, so compute 1 - borrow first.
  SDValue One = DAG.getConstant(1, DL, MVT::i32);
  CarryIn = DAG.getNode(ISD::SUB, DL, MVT::i32, One, CarryIn);

  // Turn the boolean carry value into the carry flag.
  CarryIn = ConvertBooleanCarryToCarryFlag(CarryIn, DAG);

  // Subtract with carry; result #1 of the node carries the flags.
  SDVTList SubVTs = DAG.getVTList(Lhs.getValueType(), MVT::i32);
  SDValue Sub = DAG.getNode(ARMISD::SUBE, DL, SubVTs, Lhs, Rhs, CarryIn);

  // Operands of the conditional move that materialises the boolean result.
  SDValue FalseVal = DAG.getConstant(0, DL, MVT::i32);
  SDValue TrueVal = DAG.getConstant(1, DL, MVT::i32);
  ISD::CondCode CC = cast<CondCodeSDNode>(CondOp)->get();
  SDValue ARMcc = DAG.getConstant(IntCCToARMCC(CC), DL, MVT::i32);
  SDValue CPSR = DAG.getRegister(ARM::CPSR, MVT::i32);

  // Copy the flags produced by the subtraction into CPSR and pass the
  // resulting glue to the CMOV.
  SDValue FlagCopy = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR,
                                      Sub.getValue(1), SDValue());
  return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FalseVal, TrueVal,
                     ARMcc, CPSR, FlagCopy.getValue(1));
}
/// isNEONModifiedImm - Check if the specified splat value corresponds to a
/// valid vector constant for a NEON or MVE instruction with a "modified
/// immediate" operand (e.g., VMOV). If so, return the encoded value.
///
/// \param SplatBits    The splat value.
/// \param SplatUndef   Mask of undefined bits in the splat; in the 0xff-tail
///                     and 64-bit encodings undefined bits are treated as set.
/// \param SplatBitSize Width of the splat in bits (8, 16, 32 or 64). A zero
///                     splat is always handled as a 32-bit splat.
/// \param VT           Output: the vector type matching the chosen encoding.
///                     Assigned in the 8-bit case once \p type is accepted,
///                     unconditionally in the 16- and 32-bit cases (even when
///                     no encoding is found), and in the 64-bit case only on
///                     success.
/// \param is128Bits    Selects the 128-bit rather than the 64-bit vector type
///                     for \p VT.
/// \param type         Instruction class the immediate is for; restricts which
///                     encodings are accepted.
/// \returns an i32 target constant holding the result of
///          ARM_AM::createNEONModImm, or an empty SDValue if the splat cannot
///          be encoded.
static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
                                 unsigned SplatBitSize, SelectionDAG &DAG,
                                 const SDLoc &dl, EVT &VT, bool is128Bits,
                                 NEONModImmType type) {
  unsigned OpCmode, Imm;

  // SplatBitSize is set to the smallest size that splats the vector, so a
  // zero vector will always have SplatBitSize == 8. However, NEON modified
  // immediate instructions others than VMOV do not support the 8-bit encoding
  // of a zero vector, and the default encoding of zero is supposed to be the
  // 32-bit version.
  if (SplatBits == 0)
    SplatBitSize = 32;

  switch (SplatBitSize) {
  case 8:
    if (type != VMOVModImm)
      return SDValue();
    // Any 1-byte value is OK. Op=0, Cmode=1110.
    assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
    OpCmode = 0xe;
    Imm = SplatBits;
    VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
    break;

  case 16:
    // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
    VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
    if ((SplatBits & ~0xff) == 0) {
      // Value = 0x00nn: Op=x, Cmode=100x.
      OpCmode = 0x8;
      Imm = SplatBits;
      break;
    }
    if ((SplatBits & ~0xff00) == 0) {
      // Value = 0xnn00: Op=x, Cmode=101x.
      OpCmode = 0xa;
      Imm = SplatBits >> 8;
      break;
    }
    return SDValue();

  case 32:
    // NEON's 32-bit VMOV supports splat values where:
    // * only one byte is nonzero, or
    // * the least significant byte is 0xff and the second byte is nonzero, or
    // * the least significant 2 bytes are 0xff and the third is nonzero.
    VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
    if ((SplatBits & ~0xff) == 0) {
      // Value = 0x000000nn: Op=x, Cmode=000x.
      OpCmode = 0;
      Imm = SplatBits;
      break;
    }
    if ((SplatBits & ~0xff00) == 0) {
      // Value = 0x0000nn00: Op=x, Cmode=001x.
      OpCmode = 0x2;
      Imm = SplatBits >> 8;
      break;
    }
    if ((SplatBits & ~0xff0000) == 0) {
      // Value = 0x00nn0000: Op=x, Cmode=010x.
      OpCmode = 0x4;
      Imm = SplatBits >> 16;
      break;
    }
    if ((SplatBits & ~0xff000000) == 0) {
      // Value = 0xnn000000: Op=x, Cmode=011x.
      OpCmode = 0x6;
      Imm = SplatBits >> 24;
      break;
    }

    // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
    if (type == OtherModImm) return SDValue();

    if ((SplatBits & ~0xffff) == 0 &&
        ((SplatBits | SplatUndef) & 0xff) == 0xff) {
      // Value = 0x0000nnff: Op=x, Cmode=1100.
      OpCmode = 0xc;
      Imm = SplatBits >> 8;
      break;
    }

    // cmode == 0b1101 is not supported for MVE VMVN
    if (type == MVEVMVNModImm)
      return SDValue();

    if ((SplatBits & ~0xffffff) == 0 &&
        ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
      // Value = 0x00nnffff: Op=x, Cmode=1101.
      OpCmode = 0xd;
      Imm = SplatBits >> 16;
      break;
    }

    // Note: there are a few 32-bit splat values (specifically: 00ffff00,
    // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
    // VMOV.I32. A (very) minor optimization would be to replicate the value
    // and fall through here to test for a valid 64-bit splat. But, then the
    // caller would also need to check and handle the change in size.
    return SDValue();

  case 64: {
    if (type != VMOVModImm)
      return SDValue();
    // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
    // Build an 8-bit immediate with one bit per byte: the bit is set when the
    // byte is 0xff (counting undefined bits as set) and clear when it is 0.
    // Any other byte value cannot be encoded.
    uint64_t BitMask = 0xff;
    unsigned ImmMask = 1;
    Imm = 0;
    for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
      if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
        Imm |= ImmMask;
      } else if ((SplatBits & BitMask) != 0) {
        return SDValue();
      }
      BitMask <<= 8;
      ImmMask <<= 1;
    }

    if (DAG.getDataLayout().isBigEndian())
      // swap higher and lower 32 bit word
      Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4);

    // Op=1, Cmode=1110.
    OpCmode = 0x1e;
    VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
    break;
  }

  default:
    llvm_unreachable("unexpected size for isNEONModifiedImm");
  }

  unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm);
  return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
}
/// Custom lowering for a floating-point constant node.
///
/// Strategies are tried in order; the first that applies wins:
///  1. Under execute-only code generation, keep the node if isFPImmLegal
///     accepts it, otherwise build the value from integer constants
///     (VMOVDRR for f64, VMOVSR for f32).
///  2. If the value has a VFP immediate encoding, keep the node, or for f32
///     with NEON single-precision enabled emit VMOVFPIMM + lane extract.
///  3. With NEON, try a 32-bit VMOV immediate, then a 32-bit VMVN immediate.
/// Returns \p Op itself when the constant can stay as-is, and an empty SDValue
/// when none of the strategies apply.
SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *ST) const {
EVT VT = Op.getValueType();
bool IsDouble = (VT == MVT::f64);
ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
const APFloat &FPVal = CFP->getValueAPF();
// Prevent floating-point constants from using literal loads
// when execute-only is enabled.
if (ST->genExecuteOnly()) {
// If we can represent the constant as an immediate, don't lower it
if (isFPImmLegal(FPVal, VT))
return Op;
// Otherwise, construct as integer, and move to float register
APInt INTVal = FPVal.bitcastToAPInt();
SDLoc DL(CFP);
switch (VT.getSimpleVT().SimpleTy) {
default:
llvm_unreachable("Unknown floating point type!");
break;
case MVT::f64: {
// Split the 64-bit pattern into two i32 halves; the halves are exchanged
// when the subtarget is not little-endian.
SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
if (!ST->isLittle())
std::swap(Lo, Hi);
return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
}
case MVT::f32:
return DAG.getNode(ARMISD::VMOVSR, DL, VT,
DAG.getConstant(INTVal, DL, MVT::i32));
}
}
if (!ST->hasVFP3Base())
return SDValue();
// Use the default (constant pool) lowering for double constants when we have
// an SP-only FPU
// NOTE(review): this test reads the Subtarget member while the rest of the
// function uses the ST parameter - presumably the same object; confirm.
if (IsDouble && !Subtarget->hasFP64())
return SDValue();
// Try splatting with a VMOV.f32...
// getFP64Imm/getFP32Imm yield -1 when the value has no immediate encoding.
int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
if (ImmVal != -1) {
if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
// We have code in place to select a valid ConstantFP already, no need to
// do any mangling.
return Op;
}
// It's a float and we are trying to use NEON operations where
// possible. Lower it to a splat followed by an extract.
SDLoc DL(Op);
SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
NewVal);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
DAG.getConstant(0, DL, MVT::i32));
}
// The rest of our options are NEON only, make sure that's allowed before
// proceeding..
if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
return SDValue();
EVT VMovVT;
uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
// It wouldn't really be worth bothering for doubles except for one very
// important value, which does happen to match: 0.0. So make sure we don't do
// anything stupid.
// A double is only accepted when its two 32-bit halves are identical, since
// only a 32-bit splat is attempted below.
if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
return SDValue();
// Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
VMovVT, false, VMOVModImm);
if (NewVal != SDValue()) {
SDLoc DL(Op);
SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
NewVal);
if (IsDouble)
return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
// It's a float: cast and extract a vector element.
SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
VecConstant);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
DAG.getConstant(0, DL, MVT::i32));
}
// Finally, try a VMVN.i32
// The immediate is the bitwise complement of the low 32 bits of the value.
NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
false, VMVNModImm);
if (NewVal != SDValue()) {
SDLoc DL(Op);
SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
if (IsDouble)
return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
// It's a float: cast and extract a vector element.
SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
VecConstant);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
DAG.getConstant(0, DL, MVT::i32));
}
return SDValue();
}
// Check whether a VEXT instruction can implement the shuffle mask when both
// vector sources of the shuffle are the same vector. On success Imm holds the
// index of the first element; Imm is also written when a later index fails
// the check.
static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
  const unsigned NumElts = VT.getVectorNumElements();

  // An UNDEF leading index gives us no starting point, so give up.
  if (M[0] < 0)
    return false;

  // For a VEXT the immediate is the first index, and every following index
  // must be the next element in sequence.
  Imm = M[0];
  unsigned Want = Imm;
  for (unsigned Lane = 1; Lane < NumElts; ++Lane) {
    // Step to the next element, wrapping back to zero at the end of the
    // (single) source vector.
    Want = (Want + 1 == NumElts) ? 0 : Want + 1;

    const int Src = M[Lane];
    if (Src < 0)
      continue; // UNDEF matches anything.
    if (static_cast<unsigned>(Src) != Want)
      return false;
  }
  return true;
}
// Check whether a VEXT instruction can implement a two-source shuffle mask.
// On success Imm is the extraction index and ReverseVEXT tells the caller
// whether the two source operands must be swapped.
static bool isVEXTMask(ArrayRef<int> M, EVT VT,
                       bool &ReverseVEXT, unsigned &Imm) {
  const unsigned NumElts = VT.getVectorNumElements();
  const unsigned WrapPoint = NumElts * 2;
  ReverseVEXT = false;

  // An UNDEF leading index gives us no starting point, so give up.
  if (M[0] < 0)
    return false;

  // For a VEXT the immediate is the first index, and every following index
  // must be the next element in sequence.
  Imm = M[0];
  unsigned Want = Imm;
  for (unsigned Lane = 1; Lane < NumElts; ++Lane) {
    // Running off the end of the concatenated sources can still be a VEXT,
    // provided the sources are exchanged.
    if (++Want == WrapPoint) {
      Want = 0;
      ReverseVEXT = true;
    }

    const int Src = M[Lane];
    if (Src >= 0 && static_cast<unsigned>(Src) != Want)
      return false; // A defined index that breaks the sequence.
  }

  // With swapped sources the index is relative to the other vector.
  if (ReverseVEXT)
    Imm -= NumElts;
  return true;
}
/// isVREVMask - Return true if the shuffle mask reverses the order of the
/// elements inside every BlockSize-bit block of the vector, i.e. it matches a
/// VREV instruction of that block size.
static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
  assert((BlockSize==16 || BlockSize==32 || BlockSize==64) &&
         "Only possible block sizes for VREV are: 16, 32, 64");

  const unsigned EltBits = VT.getScalarSizeInBits();
  if (EltBits == 64)
    return false;

  const unsigned NumElts = VT.getVectorNumElements();

  // The first index names the last element of the first block, which fixes
  // the block length. When it is UNDEF, optimistically assume the requested
  // block size.
  const unsigned BlockElts = M[0] < 0 ? BlockSize / EltBits : M[0] + 1;

  if (BlockSize <= EltBits || BlockSize != BlockElts * EltBits)
    return false;

  for (unsigned Lane = 0; Lane < NumElts; ++Lane) {
    const int Src = M[Lane];
    if (Src < 0)
      continue; // UNDEF matches anything.

    // Mirror the lane within its own block.
    const unsigned Offset = Lane % BlockElts;
    const unsigned Want = (Lane - Offset) + (BlockElts - 1 - Offset);
    if (static_cast<unsigned>(Src) != Want)
      return false;
  }
  return true;
}
// Return true if the shuffle can be done with a table lookup. Only <8 x i8>
// shuffles are handled; an out-of-range index simply yields 0 in the result,
// so any mask with 8 entries is acceptable.
static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
  if (VT != MVT::v8i8)
    return false;
  return M.size() == 8;
}
// Pick the WhichResult value for the part of Mask that starts at Index. When
// the mask is twice as long as the vector (both results in one mask) the
// answer is the number of the half containing Index; otherwise it is 0 if the
// entry at Index is 0 and 1 for any other entry.
static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask,
                               unsigned Index) {
  const bool HoldsBothResults = Mask.size() == Elements * 2;
  if (!HoldsBothResults)
    return Mask[Index] != 0;
  return Index / Elements;
}
// Return true if the shuffle mask describes a vector transpose (VTRN): each
// consecutive pair of mask entries must name the same lane in the first and
// the second source vector, with the lane advancing by 2 per pair.
// e.g. for v1,v2 of type v4i32 a valid mask is [0, 4, 2, 6]:
//   v1={a,b,c,d}, v2={e,f,g,h} => shufflevector v1, v2, mask = {a,e,c,g}
//
// A transpose yields two results and can appear in two forms:
//   result1 = shufflevector v1, v2, result1_shuffle_mask
//   result2 = shufflevector v1, v2, result2_shuffle_mask
// where the masks are as long as v1/v2 and WhichResult reports which of the
// two results the mask produces, or:
//   results = shufflevector v1, v2, shuffle_mask
// where a mask twice as long as v1/v2 yields both results at once. In that
// form each half of the mask is checked like a single-result mask and
// WhichResult is always 0 on success.
static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  if (VT.getScalarSizeInBits() == 64)
    return false;

  const unsigned NumElts = VT.getVectorNumElements();
  const bool BothResults = M.size() == NumElts * 2;
  if (M.size() != NumElts && !BothResults)
    return false;

  // Walk the mask one vector-length at a time; a double-length mask gets its
  // lower and upper parts checked with their own WhichResult.
  // FIXME: A mask with only even values will be rejected in case the first
  // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
  // M[0] is used to determine WhichResult
  for (unsigned Base = 0; Base < M.size(); Base += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, Base);
    for (unsigned Lane = 0; Lane < NumElts; Lane += 2) {
      // First entry of the pair: lane taken from the first source.
      const unsigned Lo = Base + Lane;
      if (M[Lo] >= 0 && static_cast<unsigned>(M[Lo]) != Lane + WhichResult)
        return false;
      // Second entry of the pair: same lane of the second source.
      const unsigned Hi = Lo + 1;
      if (M[Hi] >= 0 &&
          static_cast<unsigned>(M[Hi]) != Lane + NumElts + WhichResult)
        return false;
    }
  }

  if (BothResults)
    WhichResult = 0;
  return true;
}
/// isVTRN_v_undef_Mask - Variant of isVTRNMask for the canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". Both entries of a
/// pair name the same lane of the single source, so the mask looks like
/// <0, 0, 2, 2> rather than <0, 4, 2, 6>.
static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
  if (VT.getScalarSizeInBits() == 64)
    return false;

  const unsigned NumElts = VT.getVectorNumElements();
  const bool BothResults = M.size() == NumElts * 2;
  if (M.size() != NumElts && !BothResults)
    return false;

  for (unsigned Base = 0; Base < M.size(); Base += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, Base);
    for (unsigned Lane = 0; Lane < NumElts; Lane += 2) {
      const unsigned Want = Lane + WhichResult;
      const unsigned Lo = Base + Lane;
      if (M[Lo] >= 0 && static_cast<unsigned>(M[Lo]) != Want)
        return false;
      const unsigned Hi = Lo + 1;
      if (M[Hi] >= 0 && static_cast<unsigned>(M[Hi]) != Want)
        return false;
    }
  }

  if (BothResults)
    WhichResult = 0;
  return true;
}
// Return true if the shuffle mask describes a vector unzip (VUZP): the mask
// entries must advance in steps of 2 and be either all even or all odd.
// e.g. for v1,v2 of type v4i32 a valid mask is [0, 2, 4, 6]:
//   v1={a,b,c,d}, v2={e,f,g,h} => shufflevector v1, v2, mask = {a,c,e,g}
// Single-result and double-length masks are treated as in isVTRNMask.
static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  const unsigned EltBits = VT.getScalarSizeInBits();
  if (EltBits == 64)
    return false;

  const unsigned NumElts = VT.getVectorNumElements();
  const bool BothResults = M.size() == NumElts * 2;
  if (M.size() != NumElts && !BothResults)
    return false;

  for (unsigned Base = 0; Base < M.size(); Base += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, Base);
    for (unsigned Lane = 0; Lane < NumElts; ++Lane) {
      const int Src = M[Base + Lane];
      if (Src < 0)
        continue; // UNDEF matches anything.
      if (static_cast<unsigned>(Src) != 2 * Lane + WhichResult)
        return false;
    }
  }

  if (BothResults)
    WhichResult = 0;

  // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  const bool IsVTRNAlias = VT.is64BitVector() && EltBits == 32;
  return !IsVTRNAlias;
}
/// isVUZP_v_undef_Mask - Variant of isVUZPMask for the canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". Each half of the
/// result repeats the same stride-2 sequence, so the mask looks like
/// <0, 2, 0, 2> rather than <0, 2, 4, 6>.
static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
  const unsigned EltBits = VT.getScalarSizeInBits();
  if (EltBits == 64)
    return false;

  const unsigned NumElts = VT.getVectorNumElements();
  const bool BothResults = M.size() == NumElts * 2;
  if (M.size() != NumElts && !BothResults)
    return false;

  const unsigned HalfElts = NumElts / 2;
  for (unsigned Base = 0; Base < M.size(); Base += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, Base);
    // Every half restarts the sequence WhichResult, WhichResult + 2, ...
    for (unsigned HalfStart = 0; HalfStart < NumElts; HalfStart += HalfElts) {
      unsigned Want = WhichResult;
      for (unsigned Pos = 0; Pos < HalfElts; ++Pos, Want += 2) {
        const int Src = M[Base + HalfStart + Pos];
        if (Src >= 0 && static_cast<unsigned>(Src) != Want)
          return false;
      }
    }
  }

  if (BothResults)
    WhichResult = 0;

  // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  const bool IsVTRNAlias = VT.is64BitVector() && EltBits == 32;
  return !IsVTRNAlias;
}
// Return true if the shuffle mask describes a vector zip (VZIP): each
// consecutive pair of mask entries must name the same lane in the first and
// the second source vector, with the lane advancing by 1 per pair.
// e.g. for v1,v2 of type v4i32 a valid mask is [0, 4, 1, 5]:
//   v1={a,b,c,d}, v2={e,f,g,h} => shufflevector v1, v2, mask = {a,e,b,f}
// Single-result and double-length masks are treated as in isVTRNMask.
static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  const unsigned EltBits = VT.getScalarSizeInBits();
  if (EltBits == 64)
    return false;

  const unsigned NumElts = VT.getVectorNumElements();
  const bool BothResults = M.size() == NumElts * 2;
  if (M.size() != NumElts && !BothResults)
    return false;

  for (unsigned Base = 0; Base < M.size(); Base += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, Base);
    // The second result starts at the middle lane of the sources.
    unsigned Want = WhichResult * NumElts / 2;
    for (unsigned Lane = 0; Lane < NumElts; Lane += 2, ++Want) {
      const unsigned Lo = Base + Lane;
      if (M[Lo] >= 0 && static_cast<unsigned>(M[Lo]) != Want)
        return false;
      const unsigned Hi = Lo + 1;
      if (M[Hi] >= 0 && static_cast<unsigned>(M[Hi]) != Want + NumElts)
        return false;
    }
  }

  if (BothResults)
    WhichResult = 0;

  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  const bool IsVTRNAlias = VT.is64BitVector() && EltBits == 32;
  return !IsVTRNAlias;
}
/// isVZIP_v_undef_Mask - Variant of isVZIPMask for the canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". Both entries of a
/// pair name the same lane of the single source, so the mask looks like
/// <0, 0, 1, 1> rather than <0, 4, 1, 5>.
static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
  const unsigned EltBits = VT.getScalarSizeInBits();
  if (EltBits == 64)
    return false;

  const unsigned NumElts = VT.getVectorNumElements();
  const bool BothResults = M.size() == NumElts * 2;
  if (M.size() != NumElts && !BothResults)
    return false;

  for (unsigned Base = 0; Base < M.size(); Base += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, Base);
    // The second result starts at the middle lane of the source.
    unsigned Want = WhichResult * NumElts / 2;
    for (unsigned Lane = 0; Lane < NumElts; Lane += 2, ++Want) {
      const unsigned Lo = Base + Lane;
      if (M[Lo] >= 0 && static_cast<unsigned>(M[Lo]) != Want)
        return false;
      const unsigned Hi = Lo + 1;
      if (M[Hi] >= 0 && static_cast<unsigned>(M[Hi]) != Want)
        return false;
    }
  }

  if (BothResults)
    WhichResult = 0;

  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  const bool IsVTRNAlias = VT.is64BitVector() && EltBits == 32;
  return !IsVTRNAlias;
}
/// Classify \p ShuffleMask as one of the NEON two-result shuffles (VTRN, VUZP,
/// VZIP). Returns the matching ARMISD opcode, or 0 when the mask is none of
/// them. The two-source forms are tried first, then the "v, undef" forms;
/// \p isV_UNDEF reports which family the match (if any) came from.
static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
                                           unsigned &WhichResult,
                                           bool &isV_UNDEF) {
  struct Candidate {
    bool (*Matches)(ArrayRef<int>, EVT, unsigned &);
    unsigned Opcode;
    bool SingleSource;
  };
  // Order matters: it is the order in which the predicates are evaluated.
  const Candidate Candidates[] = {
      {isVTRNMask, ARMISD::VTRN, false},
      {isVUZPMask, ARMISD::VUZP, false},
      {isVZIPMask, ARMISD::VZIP, false},
      {isVTRN_v_undef_Mask, ARMISD::VTRN, true},
      {isVUZP_v_undef_Mask, ARMISD::VUZP, true},
      {isVZIP_v_undef_Mask, ARMISD::VZIP, true},
  };

  isV_UNDEF = false;
  for (const Candidate &C : Candidates) {
    isV_UNDEF = C.SingleSource;
    if (C.Matches(ShuffleMask, VT, WhichResult))
      return C.Opcode;
  }
  return 0;
}
/// \return true if the mask reverses the element order of a whole vector,
/// e.g. <15, ..., 3, -1, 1, 0>. UNDEF (negative) indices match any position.
static bool isReverseMask(ArrayRef<int> M, EVT VT) {
  const unsigned NumElts = VT.getVectorNumElements();

  // The mask must cover exactly one vector.
  if (M.size() != NumElts)
    return false;

  for (unsigned Lane = 0; Lane != NumElts; ++Lane) {
    const int Src = M[Lane];
    if (Src < 0)
      continue;
    const int Mirrored = static_cast<int>(NumElts - 1 - Lane);
    if (Src != Mirrored)
      return false;
  }
  return true;
}
// Return N as an i32 constant SDValue if it is an integer constant that can
// be moved into a register with a single instruction (it will become a MOV
// instruction); otherwise return a null SDValue.
static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
                                     const ARMSubtarget *ST, const SDLoc &dl) {
  ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N);
  if (!Const)
    return SDValue();

  const uint64_t Val = Const->getZExtValue();

  // Thumb1 accepts the value or its complement when it fits in 8 bits; other
  // modes accept whatever getSOImmVal can encode, again for the value or its
  // complement.
  const bool Encodable =
      ST->isThumb1Only()
          ? (Val <= 255 || ~Val <= 255)
          : (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1);
  if (!Encodable)
    return SDValue();

  return DAG.getConstant(Val, dl, MVT::i32);
}
// If this is a case we can't handle, return null and let the default
// expansion code take care of it.
SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *ST) const {
BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
SDLoc dl(Op);
EVT VT = Op.getValueType();
APInt SplatBits, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
if (SplatUndef.isAllOnesValue())
return DAG.getUNDEF(VT);
if ((ST->hasNEON() && SplatBitSize <= 64) ||
(ST->hasMVEIntegerOps() && SplatBitSize <= 32)) {
// Check if an immediate VMOV works.
EVT VmovVT;
SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
SplatUndef.getZExtValue(), SplatBitSize,
DAG, dl, VmovVT, VT.is128BitVector(),
VMOVModImm);
if (Val.getNode()) {
SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
}
// Try an immediate VMVN.
uint64_t NegatedImm = (~SplatBits).getZExtValue();
Val = isNEONModifiedImm(
NegatedImm, SplatUndef.getZExtValue(), SplatBitSize,
DAG, dl, VmovVT, VT.is128BitVector(),
ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
if (Val.getNode()) {
SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
}
// Use vmov.f32 to materialize other v2f32 and v4f32 splats.
if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
int ImmVal = ARM_AM::getFP32Imm(SplatBits);
if (ImmVal != -1) {
SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
}
}
}
}
// Scan through the operands to see if only one value is used.
//
// As an optimisation, even if more than one value is used it may be more
// profitable to splat with one value then change some lanes.
//
// Heuristically we decide to do this if the vector has a "dominant" value,
// defined as splatted to more than half of the lanes.
unsigned NumElts = VT.getVectorNumElements();
bool isOnlyLowElement = true;
bool usesOnlyOneValue = true;
bool hasDominantValue = false;
bool isConstant = true;
// Map of the number of times a particular SDValue appears in the
// element list.
DenseMap<SDValue, unsigned> ValueCounts;
SDValue Value;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
if (V.isUndef())
continue;
if (i > 0)
isOnlyLowElement = false;
if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
isConstant = false;
ValueCounts.insert(std::make_pair(V, 0));
unsigned &Count = ValueCounts[V];
// Is this value dominant? (takes up more than half of the lanes)
if (++Count > (NumElts / 2)) {
hasDominantValue = true;
Value = V;
}
}
if (ValueCounts.size() != 1)
usesOnlyOneValue = false;
if (!Value.getNode() && !ValueCounts.empty())
Value = ValueCounts.begin()->first;
if (ValueCounts.empty())
return DAG.getUNDEF(VT);
// Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
// Keep going if we are hitting this case.
if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
unsigned EltSize = VT.getScalarSizeInBits();
// Use VDUP for non-constant splats. For f32 constant splats, reduce to
// i32 and try again.
if (hasDominantValue && EltSize <= 32) {
if (!isConstant) {
SDValue N;
// If we are VDUPing a value that comes directly from a vector, that will
// cause an unnecessary move to and from a GPR, where instead we could
// just use VDUPLANE. We can only do this if the lane being extracted
// is at a constant index, as the VDUP from lane instructions only have
// constant-index forms.
ConstantSDNode *constIndex;
if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
(constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
// We need to create a new undef vector to use for the VDUPLANE if the
// size of the vector from which we get the value is different than the
// size of the vector that we need to create. We will insert the element
// such that the register coalescer will remove unnecessary copies.
if (VT != Value->getOperand(0).getValueType()) {
unsigned index = constIndex->getAPIntValue().getLimitedValue() %
VT.getVectorNumElements();
N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
Value, DAG.getConstant(index, dl, MVT::i32)),
DAG.getConstant(index, dl, MVT::i32));
} else
N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
Value->getOperand(0), Value->getOperand(1));
} else
N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
if (!usesOnlyOneValue) {
// The dominant value was splatted as 'N', but we now have to insert
// all differing elements.
for (unsigned I = 0; I < NumElts; ++I) {
if (Op.getOperand(I) == Value)
continue;
SmallVector<SDValue, 3> Ops;
Ops.push_back(N);
Ops.push_back(Op.getOperand(I));
Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
}
}
return N;
}
if (VT.getVectorElementType().isFloatingPoint()) {
SmallVector<SDValue, 8> Ops;
MVT FVT = VT.getVectorElementType().getSimpleVT();
assert(FVT == MVT::f32 || FVT == MVT::f16);
MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16;
for (unsigned i = 0; i < NumElts; ++i)
Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT,
Op.getOperand(i)));
EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts);
SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
Val = LowerBUILD_VECTOR(Val, DAG, ST);
if (Val.getNode())
return DAG.getNode(ISD::BITCAST, dl, VT, Val);
}
if (usesOnlyOneValue) {
SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
if (isConstant && Val.getNode())
return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
}
}
// If all elements are constants and the case above didn't get hit, fall back
// to the default expansion, which will generate a load from the constant
// pool.
if (isConstant)
return SDValue();
// Empirical tests suggest this is rarely worth it for vectors of length <= 2.
if (NumElts >= 4) {
SDValue shuffle = ReconstructShuffle(Op, DAG);
if (shuffle != SDValue())
return shuffle;
}
if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
// If we haven't found an efficient lowering, try splitting a 128-bit vector
// into two 64-bit vectors; we might discover a better way to lower it.
SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
EVT ExtVT = VT.getVectorElementType();
EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
SDValue Lower =
DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2));
if (Lower.getOpcode() == ISD::BUILD_VECTOR)
Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
SDValue Upper = DAG.getBuildVector(
HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2));
if (Upper.getOpcode() == ISD::BUILD_VECTOR)
Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
if (Lower && Upper)
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
}
// Vectors with 32- or 64-bit elements can be built by directly assigning
// the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands
// will be legalized.
if (EltSize >= 32) {
// Do the expansion with floating-point types, since that is what the VFP
// registers are defined to use, and since i64 is not legal.
EVT EltVT = EVT::getFloatingPointVT(EltSize);
EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
SmallVector<SDValue, 8> Ops;
for (unsigned i = 0; i < NumElts; ++i)
Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
return DAG.getNode(ISD::BITCAST, dl, VT, Val);
}
// If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
// know the default expansion would otherwise fall back on something even
// worse. For a vector with one or two non-undef values, that's
// scalar_to_vector for the elements followed by a shuffle (provided the
// shuffle is valid for the target) and materialization element by element
// on the stack followed by a load for everything else.
if (!isConstant && !usesOnlyOneValue) {
SDValue Vec = DAG.getUNDEF(VT);
for (unsigned i = 0 ; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
if (V.isUndef())
continue;
SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
}
return Vec;
}
return SDValue();
}
// Try to express a BUILD_VECTOR as a vector shuffle, possibly in combination
// with VEXTs.
//
// Every non-undef operand of Op must be an EXTRACT_VECTOR_ELT with a constant
// lane index, and at most two distinct source vectors may be involved.
// Sources whose total width is half or twice that of the result are padded
// with undef or narrowed (EXTRACT_SUBVECTOR / VEXT) respectively, and sources
// with a different element type are bitcast to the smallest element type seen.
//
// Returns the shuffle bitcast to Op's value type, or an empty SDValue when the
// pattern does not apply or isShuffleMaskLegal rejects the resulting mask.
SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
SDLoc dl(Op);
EVT VT = Op.getValueType();
unsigned NumElts = VT.getVectorNumElements();
// Book-keeping for one vector that feeds the BUILD_VECTOR.
struct ShuffleSourceInfo {
// The vector the elements are originally extracted from.
SDValue Vec;
// Smallest and largest lane index extracted from Vec; filled in while the
// operands are scanned.
unsigned MinElt = std::numeric_limits<unsigned>::max();
unsigned MaxElt = 0;
// We may insert some combination of BITCASTs and VEXT nodes to force Vec to
// be compatible with the shuffle we intend to construct. As a result
// ShuffleVec will be some sliding window into the original Vec.
SDValue ShuffleVec;
// Code should guarantee that element i in Vec starts at element "WindowBase
// + i * WindowScale in ShuffleVec".
int WindowBase = 0;
int WindowScale = 1;
ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}
// Lets llvm::find() locate a source by its original vector.
bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
};
// First gather all vectors used as an immediate source for this BUILD_VECTOR
// node.
SmallVector<ShuffleSourceInfo, 2> Sources;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
if (V.isUndef())
continue;
else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
// A shuffle can only come from building a vector from various
// elements of other vectors.
return SDValue();
} else if (!isa<ConstantSDNode>(V.getOperand(1))) {
// Furthermore, shuffles require a constant mask, whereas extractelts
// accept variable indices.
return SDValue();
}
// Add this element source to the list if it's not already there.
SDValue SourceVec = V.getOperand(0);
auto Source = llvm::find(Sources, SourceVec);
if (Source == Sources.end())
Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
// Update the minimum and maximum lane number seen.
unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
Source->MinElt = std::min(Source->MinElt, EltNo);
Source->MaxElt = std::max(Source->MaxElt, EltNo);
}
// Currently only do something sane when at most two source vectors
// are involved.
if (Sources.size() > 2)
return SDValue();
// Find out the smallest element size among result and two sources, and use
// it as element size to build the shuffle_vector.
EVT SmallestEltTy = VT.getVectorElementType();
for (auto &Source : Sources) {
EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
if (SrcEltTy.bitsLT(SmallestEltTy))
SmallestEltTy = SrcEltTy;
}
// Number of shuffle lanes that make up one lane of the result.
unsigned ResMultiplier =
VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
// From here on NumElts counts lanes of the shuffle type, not of VT.
NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
// If the source vector is too wide or too narrow, we may nevertheless be able
// to construct a compatible shuffle either by concatenating it with UNDEF or
// extracting a suitable range of elements.
for (auto &Src : Sources) {
EVT SrcVT = Src.ShuffleVec.getValueType();
if (SrcVT.getSizeInBits() == VT.getSizeInBits())
continue;
// This stage of the search produces a source with the same element type as
// the original, but with a total width matching the BUILD_VECTOR output.
EVT EltVT = SrcVT.getVectorElementType();
unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
// Only a source of exactly half the result width can be padded.
if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits())
return SDValue();
// We can pad out the smaller vector for free, so if it's part of a
// shuffle...
Src.ShuffleVec =
DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
DAG.getUNDEF(Src.ShuffleVec.getValueType()));
continue;
}
// Only a source of exactly twice the result width can be narrowed.
if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits())
return SDValue();
if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
// Span too large for a VEXT to cope
return SDValue();
}
if (Src.MinElt >= NumSrcElts) {
// The extraction can just take the second half
Src.ShuffleVec =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
DAG.getConstant(NumSrcElts, dl, MVT::i32));
// Lane i of Vec is now lane i - NumSrcElts of ShuffleVec.
Src.WindowBase = -NumSrcElts;
} else if (Src.MaxElt < NumSrcElts) {
// The extraction can just take the first half
Src.ShuffleVec =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
DAG.getConstant(0, dl, MVT::i32));
} else {
// An actual VEXT is needed
SDValue VEXTSrc1 =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
DAG.getConstant(0, dl, MVT::i32));
SDValue VEXTSrc2 =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
DAG.getConstant(NumSrcElts, dl, MVT::i32));
Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
VEXTSrc2,
DAG.getConstant(Src.MinElt, dl, MVT::i32));
// The window now starts at the smallest lane that is actually used.
Src.WindowBase = -Src.MinElt;
}
}
// Another possible incompatibility occurs from the vector element types. We
// can fix this by bitcasting the source vectors to the same type we intend
// for the shuffle.
for (auto &Src : Sources) {
EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
if (SrcEltTy == SmallestEltTy)
continue;
assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
// Each original lane now covers WindowScale shuffle lanes, so the window
// offset has to be rescaled to the new lane size as well.
Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
Src.WindowBase *= Src.WindowScale;
}
// Final sanity check before we try to actually produce a shuffle.
LLVM_DEBUG(for (auto Src
: Sources)
assert(Src.ShuffleVec.getValueType() == ShuffleVT););
// The stars all align, our next step is to produce the mask for the shuffle.
// Lanes that are never written stay -1.
SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
SDValue Entry = Op.getOperand(i);
if (Entry.isUndef())
continue;
auto Src = llvm::find(Sources, Entry.getOperand(0));
int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
// EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
// trunc. So only std::min(SrcBits, DestBits) actually get defined in this
// segment.
EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
int BitsDefined = std::min(OrigEltTy.getSizeInBits(),
VT.getScalarSizeInBits());
int LanesDefined = BitsDefined / BitsPerShuffleLane;
// This source is expected to fill ResMultiplier lanes of the final shuffle,
// starting at the appropriate offset.
int *LaneMask = &Mask[i * ResMultiplier];
int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
// Mask indices for the second source are offset by NumElts, the number of
// lanes of the first shuffle operand.
ExtractBase += NumElts * (Src - Sources.begin());
for (int j = 0; j < LanesDefined; ++j)
LaneMask[j] = ExtractBase + j;
}
// Final check before we try to produce nonsense...
if (!isShuffleMaskLegal(Mask, ShuffleVT))
return SDValue();
// We can't handle more than two sources. This should have already
// been checked before this point.
assert(Sources.size() <= 2 && "Too many sources!");
// Operands without a corresponding source stay undef.
SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
for (unsigned i = 0; i < Sources.size(); ++i)
ShuffleOps[i] = Sources[i].ShuffleVec;
SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
ShuffleOps[1], Mask);
return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
}
/// Operation selectors stored in PerfectShuffleTable entries (extracted with
/// "(PFEntry >> 26) & 0x0F"). Users subtract the first member of the VDUP,
/// VEXT, VUZP, VZIP and VTRN groups from an opcode, so the members of each
/// group must stay consecutive; the values are therefore spelled out.
enum ShuffleOpCodes {
  OP_COPY  = 0,  // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
  OP_VREV  = 1,
  OP_VDUP0 = 2,
  OP_VDUP1 = 3,
  OP_VDUP2 = 4,
  OP_VDUP3 = 5,
  OP_VEXT1 = 6,
  OP_VEXT2 = 7,
  OP_VEXT3 = 8,
  OP_VUZPL = 9,  // VUZP, left result
  OP_VUZPR = 10, // VUZP, right result
  OP_VZIPL = 11, // VZIP, left result
  OP_VZIPR = 12, // VZIP, right result
  OP_VTRNL = 13, // VTRN, left result
  OP_VTRNR = 14  // VTRN, right result
};
/// Return true if the operation encoded in the perfect-shuffle table entry
/// \p PFEntry is a copy, a VREV or one of the four VDUP variants.
static bool isLegalMVEShuffleOp(unsigned PFEntry) {
  // The operation code is the 4-bit field starting at bit 26.
  const unsigned Opcode = (PFEntry >> 26) & 0x0F;
  return Opcode == OP_COPY || Opcode == OP_VREV || Opcode == OP_VDUP0 ||
         Opcode == OP_VDUP1 || Opcode == OP_VDUP2 || Opcode == OP_VDUP3;
}
/// isShuffleMaskLegal - Targets can use this to indicate that they only
/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
/// are assumed to be legal.
///
/// \param M  shuffle mask; negative entries denote undef lanes.
/// \param VT vector type the shuffle produces.
bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
  // Four-lane 64/128-bit vectors are looked up in the perfect shuffle table.
  if (VT.getVectorNumElements() == 4 &&
      (VT.is128BitVector() || VT.is64BitVector())) {
    // The table index is the mask read as a four-digit base-9 number, where
    // the digit 8 stands for an undef lane.
    unsigned PFTableIndex = 0;
    for (unsigned Lane = 0; Lane != 4; ++Lane) {
      const unsigned Digit = M[Lane] < 0 ? 8 : M[Lane];
      PFTableIndex = PFTableIndex * 9 + Digit;
    }
    const unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
    const unsigned Cost = PFEntry >> 30;
    if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry)))
      return true;
  }

  // Masks accepted regardless of NEON availability.
  if (VT.getScalarSizeInBits() >= 32 ||
      ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
      isVREVMask(M, VT, 64) ||
      isVREVMask(M, VT, 32) ||
      isVREVMask(M, VT, 16))
    return true;

  // Everything that remains requires NEON.
  if (!Subtarget->hasNEON())
    return false;

  // Out-parameters of the mask predicates; the values are not needed here.
  bool IsReversedVEXT, IsSingleInput;
  unsigned VEXTImm, ResultNo;
  if (isVEXTMask(M, VT, IsReversedVEXT, VEXTImm) ||
      isVTBLMask(M, VT) ||
      isNEONTwoResultShuffleMask(M, VT, ResultNo, IsSingleInput))
    return true;

  return (VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT);
}
/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
/// the specified operations to build the shuffle.
///
/// The entry's two operand ids are themselves table indices and are expanded
/// recursively (left first, then right) until an OP_COPY entry hands back
/// \p LHS or \p RHS.
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
                                      SDValue RHS, SelectionDAG &DAG,
                                      const SDLoc &dl) {
  // Entry layout: 4-bit opcode at bit 26, 13-bit left operand id at bit 13,
  // 13-bit right operand id at bit 0.
  const unsigned IDMask = (1 << 13) - 1;
  const unsigned Opcode = (PFEntry >> 26) & 0x0F;
  const unsigned LeftID = (PFEntry >> 13) & IDMask;
  const unsigned RightID = PFEntry & IDMask;

  if (Opcode == OP_COPY) {
    // The id is the base-9 encoding of <0,1,2,3> for the first input and of
    // <4,5,6,7> for the second.
    if (LeftID == (1*9+2)*9+3)
      return LHS;
    assert(LeftID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
    return RHS;
  }

  SDValue OpLHS =
      GeneratePerfectShuffle(PerfectShuffleTable[LeftID], LHS, RHS, DAG, dl);
  SDValue OpRHS =
      GeneratePerfectShuffle(PerfectShuffleTable[RightID], LHS, RHS, DAG, dl);
  EVT VT = OpLHS.getValueType();

  // Emit a two-result permute node and select one of its results.
  auto EmitTwoResult = [&](unsigned NodeOpc, unsigned ResNo) -> SDValue {
    return DAG.getNode(NodeOpc, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS)
        .getValue(ResNo);
  };

  switch (Opcode) {
  case OP_VREV: {
    // VREV divides the vector in half and swaps within the half, so the
    // node to use depends on the element width.
    EVT EltVT = VT.getVectorElementType();
    if (EltVT == MVT::i32 || EltVT == MVT::f32)
      return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
    // vrev <4 x i16> -> VREV32
    if (EltVT == MVT::i16)
      return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
    // vrev <4 x i8> -> VREV16
    assert(EltVT == MVT::i8);
    return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
  }
  case OP_VDUP0:
  case OP_VDUP1:
  case OP_VDUP2:
  case OP_VDUP3: {
    // The distance from OP_VDUP0 is the lane to duplicate.
    const unsigned Lane = Opcode - OP_VDUP0;
    return DAG.getNode(ARMISD::VDUPLANE, dl, VT, OpLHS,
                       DAG.getConstant(Lane, dl, MVT::i32));
  }
  case OP_VEXT1:
  case OP_VEXT2:
  case OP_VEXT3: {
    // OP_VEXT1..3 extract starting at element 1..3.
    const unsigned Start = Opcode - OP_VEXT1 + 1;
    return DAG.getNode(ARMISD::VEXT, dl, VT, OpLHS, OpRHS,
                       DAG.getConstant(Start, dl, MVT::i32));
  }
  case OP_VUZPL:
  case OP_VUZPR:
    return EmitTwoResult(ARMISD::VUZP, Opcode - OP_VUZPL);
  case OP_VZIPL:
  case OP_VZIPR:
    return EmitTwoResult(ARMISD::VZIP, Opcode - OP_VZIPL);
  case OP_VTRNL:
  case OP_VTRNR:
    return EmitTwoResult(ARMISD::VTRN, Opcode - OP_VTRNL);
  default:
    llvm_unreachable("Unknown shuffle opcode!");
  }
}
/// Lower a v8i8 shuffle to a VTBL table lookup: VTBL1 when the second shuffle
/// operand is undef, VTBL2 otherwise. The shuffle mask is materialised as a
/// v8i8 BUILD_VECTOR and passed as the last operand.
static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
                                       ArrayRef<int> ShuffleMask,
                                       SelectionDAG &DAG) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  SDLoc DL(Op);

  // One i32 constant per mask entry.
  SmallVector<SDValue, 8> Indices;
  for (int MaskElt : ShuffleMask)
    Indices.push_back(DAG.getConstant(MaskElt, DL, MVT::i32));
  SDValue IndexVec = DAG.getBuildVector(MVT::v8i8, DL, Indices);

  if (V2.getNode()->isUndef())
    return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1, IndexVec);
  return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2, IndexVec);
}
/// Lower a lane-reversing shuffle of a v16i8 or v8i16 vector as a VREV64
/// followed by a VEXT of the result with itself.
static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
                                                      SelectionDAG &DAG) {
  SDLoc DL(Op);
  SDValue Src = Op.getOperand(0);
  EVT VT = Src.getValueType();
  assert((VT == MVT::v8i16 || VT == MVT::v16i8) &&
         "Expect an v8i16/v16i8 type");

  // Step 1: VREV64 reverses the lanes within each double word.
  SDValue Reversed = DAG.getNode(ARMISD::VREV64, DL, VT, Src);

  // Step 2: extract the first double word into the top half and the last
  // double word into the bottom half, i.e. a VEXT by half the lane count
  // (8 lanes for v16i8, 4 for v8i16).
  const unsigned HalfLanes = (VT == MVT::v16i8) ? 8 : 4;
  return DAG.getNode(ARMISD::VEXT, DL, VT, Reversed, Reversed,
                     DAG.getConstant(HalfLanes, DL, MVT::i32));
}
/// Custom lowering for VECTOR_SHUFFLE.
///
/// For element sizes up to 32 bits the following are tried in order: VDUP /
/// VDUPLANE for splats, VEXT, VREV64/32/16, a single-operand VEXT, and the
/// two-result NEON shuffles (directly, then through a CONCAT_VECTORS first
/// operand). After that, 4-element shuffles are synthesized from the
/// PerfectShuffleTable, shuffles with elements of 32 bits or more become an
/// ARMISD::BUILD_VECTOR of extracted lanes, and with NEON full reversals of
/// v8i16/v16i8 and arbitrary v8i8 shuffles get dedicated lowerings.
///
/// Returns the lowered node, or an empty SDValue if none of the above applies.
static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *ST) {
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
SDLoc dl(Op);
EVT VT = Op.getValueType();
ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
// Convert shuffles that are directly supported on NEON to target-specific
// DAG nodes, instead of keeping them as shuffles and matching them again
// during code selection. This is more efficient and avoids the possibility
// of inconsistencies between legalization and selection.
// FIXME: floating-point vectors should be canonicalized to integer vectors
// of the same type so that they get CSEd properly.
ArrayRef<int> ShuffleMask = SVN->getMask();
unsigned EltSize = VT.getScalarSizeInBits();
if (EltSize <= 32) {
if (SVN->isSplat()) {
int Lane = SVN->getSplatIndex();
// If this is undef splat, generate it via "just" vdup, if possible.
if (Lane == -1) Lane = 0;
// Test if V1 is a SCALAR_TO_VECTOR.
if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
}
// Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
// (and probably will turn into a SCALAR_TO_VECTOR once legalization
// reaches it).
if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
!isa<ConstantSDNode>(V1.getOperand(0))) {
// Equivalent means: every operand except the first is undef.
bool IsScalarToVector = true;
for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
if (!V1.getOperand(i).isUndef()) {
IsScalarToVector = false;
break;
}
if (IsScalarToVector)
return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
}
// General splat: duplicate the selected lane of V1.
return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
DAG.getConstant(Lane, dl, MVT::i32));
}
bool ReverseVEXT = false;
unsigned Imm = 0;
if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
// A reversed match means the operands have to be exchanged.
if (ReverseVEXT)
std::swap(V1, V2);
return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
DAG.getConstant(Imm, dl, MVT::i32));
}
if (isVREVMask(ShuffleMask, VT, 64))
return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
if (isVREVMask(ShuffleMask, VT, 32))
return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
if (isVREVMask(ShuffleMask, VT, 16))
return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
// With an undef second operand, V1 is used for both VEXT inputs.
if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
DAG.getConstant(Imm, dl, MVT::i32));
}
// Check for Neon shuffles that modify both input vectors in place.
// If both results are used, i.e., if there are two shuffles with the same
// source operands and with masks corresponding to both results of one of
// these operations, DAG memoization will ensure that a single node is
// used for both shuffles.
unsigned WhichResult = 0;
bool isV_UNDEF = false;
if (ST->hasNEON()) {
if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
ShuffleMask, VT, WhichResult, isV_UNDEF)) {
if (isV_UNDEF)
V2 = V1;
return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
.getValue(WhichResult);
}
}
// Also check for these shuffles through CONCAT_VECTORS: we canonicalize
// shuffles that produce a result larger than their operands with:
// shuffle(concat(v1, undef), concat(v2, undef))
// ->
// shuffle(concat(v1, v2), undef)
// because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
//
// This is useful in the general case, but there are special cases where
// native shuffles produce larger results: the two-result ops.
//
// Look through the concat when lowering them:
// shuffle(concat(v1, v2), undef)
// ->
// concat(VZIP(v1, v2):0, :1)
//
if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
SDValue SubV1 = V1->getOperand(0);
SDValue SubV2 = V1->getOperand(1);
EVT SubVT = SubV1.getValueType();
// We expect these to have been canonicalized to -1.
assert(llvm::all_of(ShuffleMask, [&](int i) {
return i < (int)VT.getVectorNumElements();
}) && "Unexpected shuffle index into UNDEF operand!");
// The mask is matched against the type of the concatenated halves.
if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
if (isV_UNDEF)
SubV2 = SubV1;
assert((WhichResult == 0) &&
"In-place shuffle of concat can only have one result!");
SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
SubV1, SubV2);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
Res.getValue(1));
}
}
}
// If the shuffle is not directly supported and it has 4 elements, use
// the PerfectShuffle-generated table to synthesize it from other shuffles.
unsigned NumElts = VT.getVectorNumElements();
if (NumElts == 4) {
// Each mask entry becomes one base-9 digit; 8 encodes an undef lane.
unsigned PFIndexes[4];
for (unsigned i = 0; i != 4; ++i) {
if (ShuffleMask[i] < 0)
PFIndexes[i] = 8;
else
PFIndexes[i] = ShuffleMask[i];
}
// Compute the index in the perfect shuffle table.
unsigned PFTableIndex =
PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
unsigned Cost = (PFEntry >> 30);
if (Cost <= 4) {
if (ST->hasNEON())
return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
else if (isLegalMVEShuffleOp(PFEntry)) {
// Without NEON the entry is only used when it and both of its operand
// entries pass isLegalMVEShuffleOp.
unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
unsigned PFEntryLHS = PerfectShuffleTable[LHSID];
unsigned PFEntryRHS = PerfectShuffleTable[RHSID];
if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS))
return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
}
}
}
// Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
if (EltSize >= 32) {
// Do the expansion with floating-point types, since that is what the VFP
// registers are defined to use, and since i64 is not legal.
EVT EltVT = EVT::getFloatingPointVT(EltSize);
EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
SmallVector<SDValue, 8> Ops;
for (unsigned i = 0; i < NumElts; ++i) {
if (ShuffleMask[i] < 0)
Ops.push_back(DAG.getUNDEF(EltVT));
else
// Mask values below NumElts select from V1, the others from V2. The lane
// within the chosen operand is taken from the low bits of the mask value,
// which presumes NumElts is a power of two.
Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
ShuffleMask[i] < (int)NumElts ? V1 : V2,
DAG.getConstant(ShuffleMask[i] & (NumElts-1),
dl, MVT::i32)));
}
SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
return DAG.getNode(ISD::BITCAST, dl, VT, Val);
}
if (ST->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT))
return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG);
if (ST->hasNEON() && VT == MVT::v8i8)
if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
return NewOp;
// No custom lowering found.
return SDValue();
}
/// Custom lowering for INSERT_VECTOR_ELT. Returns an empty SDValue for a
/// non-constant lane index, the node itself when nothing needs to change, and
/// an integer-typed re-expression of the insertion when the element type
/// would otherwise be promoted as a float.
SDValue ARMTargetLowering::
LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
  // INSERT_VECTOR_ELT is legal only for immediate indexes.
  SDValue Lane = Op.getOperand(2);
  if (!isa<ConstantSDNode>(Lane))
    return SDValue();

  SDValue Elt = Op.getOperand(1);
  EVT EltVT = Elt.getValueType();
  const bool PromotesFloat = getTypeAction(*DAG.getContext(), EltVT) ==
                             TargetLowering::TypePromoteFloat;
  if (!PromotesFloat)
    return Op;

  // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
  // but the type system will try to do that if we don't intervene.
  // Reinterpret any such vector-element insertion as one with the
  // corresponding integer types.
  SDLoc dl(Op);
  EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits());
  assert(getTypeAction(*DAG.getContext(), IEltVT) !=
         TargetLowering::TypePromoteFloat);

  SDValue VecIn = Op.getOperand(0);
  EVT VecVT = VecIn.getValueType();
  EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT,
                                VecVT.getVectorNumElements());

  // Bitcast element and vector to integer, insert, and bitcast back.
  SDValue IntElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt);
  SDValue IntVec = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn);
  SDValue Inserted =
      DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT, IntVec, IntElt, Lane);
  return DAG.getNode(ISD::BITCAST, dl, VecVT, Inserted);
}
/// Custom lowering for EXTRACT_VECTOR_ELT. Returns an empty SDValue for a
/// non-constant lane index; an i32 extract from a vector with elements
/// narrower than 32 bits becomes ARMISD::VGETLANEu; anything else is returned
/// unchanged.
static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
  // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
  SDValue Index = Op.getOperand(1);
  if (!isa<ConstantSDNode>(Index))
    return SDValue();

  SDValue Src = Op.getOperand(0);
  const bool WidensToI32 =
      Op.getValueType() == MVT::i32 && Src.getScalarValueSizeInBits() < 32;
  if (!WidensToI32)
    return Op;

  SDLoc dl(Op);
  return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Src, Index);
}
/// Lower a two-operand CONCAT_VECTORS producing a 128-bit vector by inserting
/// each non-undef operand, bitcast to f64, into a v2f64 value and bitcasting
/// the result back to the original type.
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
  // The only time a CONCAT_VECTORS operation can have legal types is when
  // two 64-bit vectors are concatenated to a 128-bit vector.
  assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
         "unexpected CONCAT_VECTORS");
  SDLoc dl(Op);
  SDValue Wide = DAG.getUNDEF(MVT::v2f64);

  // Operand 0 goes to lane 0, operand 1 to lane 1; undef halves are skipped
  // so the corresponding lane stays undef.
  for (unsigned Half = 0; Half != 2; ++Half) {
    SDValue Part = Op.getOperand(Half);
    if (Part.isUndef())
      continue;
    Wide = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Wide,
                       DAG.getNode(ISD::BITCAST, dl, MVT::f64, Part),
                       DAG.getIntPtrConstant(Half, dl));
  }
  return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Wide);
}
/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
/// element has been zero/sign-extended, depending on the isSigned parameter,
/// from an integer type half its size.
static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
                                   bool isSigned) {
  EVT VT = N->getValueType(0);

  // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
  if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
    SDNode *BVN = N->getOperand(0).getNode();
    if (BVN->getValueType(0) != MVT::v4i32 ||
        BVN->getOpcode() != ISD::BUILD_VECTOR)
      return false;

    // Each i64 element is a (low, high) pair of i32 operands whose order
    // depends on endianness.
    const unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
    const unsigned HiElt = 1 - LoElt;
    for (unsigned Base = 0; Base != 4; Base += 2) {
      auto *Lo = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt + Base));
      auto *Hi = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt + Base));
      if (!Lo || !Hi)
        return false;
      // Signed: the high word must equal the low value shifted right by 32.
      // Unsigned: the high word must be zero.
      const bool IsExtended =
          isSigned ? Hi->getSExtValue() == Lo->getSExtValue() >> 32
                   : Hi->isNullValue();
      if (!IsExtended)
        return false;
    }
    return true;
  }

  if (N->getOpcode() != ISD::BUILD_VECTOR)
    return false;

  // Every operand must be a constant that fits in half the element width.
  for (unsigned Idx = 0, NumOps = N->getNumOperands(); Idx != NumOps; ++Idx) {
    auto *C = dyn_cast<ConstantSDNode>(N->getOperand(Idx).getNode());
    if (!C)
      return false;
    const unsigned HalfSize = VT.getScalarSizeInBits() / 2;
    const bool Fits = isSigned ? isIntN(HalfSize, C->getSExtValue())
                               : isUIntN(HalfSize, C->getZExtValue());
    if (!Fits)
      return false;
  }
  return true;
}
/// isSignExtended - Check if a node is a vector value that is sign-extended
/// or a constant BUILD_VECTOR with sign-extended elements.
static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
  return N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N) ||
         isExtendedBUILD_VECTOR(N, DAG, true);
}
/// isZeroExtended - Check if a node is a vector value that is zero-extended
/// or a constant BUILD_VECTOR with zero-extended elements.
static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
  return N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N) ||
         isExtendedBUILD_VECTOR(N, DAG, false);
}
/// Return the vector type with the same number of elements as \p OrigVT whose
/// total size is 64 bits. Types that are already 64 bits or wider are
/// returned unchanged; of the narrower ones only v2i8, v2i16 and v4i8 are
/// handled.
static EVT getExtensionTo64Bits(const EVT &OrigVT) {
  if (OrigVT.getSizeInBits() >= 64)
    return OrigVT;

  assert(OrigVT.isSimple() && "Expecting a simple value type");
  const MVT::SimpleValueType NarrowTy = OrigVT.getSimpleVT().SimpleTy;

  // Two-element vectors widen to v2i32, four-element vectors to v4i16.
  if (NarrowTy == MVT::v2i8 || NarrowTy == MVT::v2i16)
    return MVT::v2i32;
  if (NarrowTy == MVT::v4i8)
    return MVT::v4i16;
  llvm_unreachable("Unexpected Vector Type");
}
/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
/// We insert the required extension here to get the vector to fill a D register.
///
/// \param N         the value before it was extended.
/// \param OrigTy    type of \p N.
/// \param ExtTy     type \p N was extended to; expected to be 128 bits wide.
/// \param ExtOpcode opcode of the extension node to create when needed.
static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
                                            const EVT &OrigTy,
                                            const EVT &ExtTy,
                                            unsigned ExtOpcode) {
  assert(ExtTy.is128BitVector() && "Unexpected extension size");

  // A value that already occupies 64 bits or more is used unchanged.
  const bool AlreadyWideEnough = OrigTy.getSizeInBits() >= 64;
  if (AlreadyWideEnough)
    return N;

  // Otherwise extend it so that it is 64 bits in total and can be used as an
  // operand for VMULL.
  const EVT WideVT = getExtensionTo64Bits(OrigTy);
  SDLoc dl(N);
  return DAG.getNode(ExtOpcode, dl, WideVT, N);
}
/// SkipLoadExtensionForVMULL - return a load of the original vector size that
/// does not do any sign/zero extension. If the original vector is less
/// than 64 bits, an appropriate extension will be added after the load to
/// reach a total size of 64 bits. We have to add the extension separately
/// because ARM does not have a sign/zero extending load for vectors.
static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
  const EVT MemVT = LD->getMemoryVT();
  const EVT ExtendedTy = getExtensionTo64Bits(MemVT);

  // Properties shared by both replacement loads.
  SDLoc dl(LD);
  SDValue Chain = LD->getChain();
  SDValue BasePtr = LD->getBasePtr();
  const auto &PtrInfo = LD->getPointerInfo();
  const auto Alignment = LD->getAlignment();
  const auto Flags = LD->getMemOperand()->getFlags();

  // The load already has the right type.
  if (ExtendedTy == MemVT)
    return DAG.getLoad(MemVT, dl, Chain, BasePtr, PtrInfo, Alignment, Flags);

  // We need to create a zextload/sextload. We cannot just create a load
  // followed by a zext/sext node because LowerMUL is also run during normal
  // operation legalization where we can't create illegal types.
  return DAG.getExtLoad(LD->getExtensionType(), dl, ExtendedTy, Chain,
                        BasePtr, PtrInfo, MemVT, Alignment, Flags);
}
/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
/// extending load, or BUILD_VECTOR with extended elements, return the
/// unextended value. The unextended vector should be 64 bits so that it can
/// be used as an operand to a VMULL instruction. If the original vector size
/// before extension is less than 64 bits we add an extension to resize
/// the vector to 64 bits.
static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
  const unsigned Opc = N->getOpcode();

  // Explicit extension node: peel it off, re-extending to 64 bits if needed.
  if (Opc == ISD::SIGN_EXTEND || Opc == ISD::ZERO_EXTEND) {
    SDValue Src = N->getOperand(0);
    return AddRequiredExtensionForVMULL(Src, DAG, Src->getValueType(0),
                                        N->getValueType(0), Opc);
  }

  // Extending load: replace it with a narrower load plus an explicit
  // extension, and redirect the users of the old load.
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
           "Expected extending load");

    SDValue NarrowLoad = SkipLoadExtensionForVMULL(LD, DAG);
    // Users of the old chain result now depend on the new load's chain.
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NarrowLoad.getValue(1));

    // Users of the old loaded value now see the re-extended narrow load.
    const unsigned ExtOpc =
        ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    SDValue Reextended = DAG.getNode(ExtOpc, SDLoc(NarrowLoad),
                                     LD->getValueType(0), NarrowLoad);
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), Reextended);

    return NarrowLoad;
  }

  // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
  // have been legalized as a BITCAST from v4i32.
  if (Opc == ISD::BITCAST) {
    SDNode *BVN = N->getOperand(0).getNode();
    assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
           BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
    // Keep only the low i32 word of each i64 element.
    const unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
    return DAG.getBuildVector(
        MVT::v2i32, SDLoc(N),
        {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
  }

  // Construct a new BUILD_VECTOR with elements truncated to half the size.
  assert(Opc == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
  EVT VT = N->getValueType(0);
  const unsigned HalfBits = VT.getScalarSizeInBits() / 2;
  const unsigned NumLanes = VT.getVectorNumElements();
  MVT NarrowEltVT = MVT::getIntegerVT(HalfBits);

  SDLoc dl(N);
  SmallVector<SDValue, 8> Lanes;
  for (unsigned Idx = 0; Idx != NumLanes; ++Idx) {
    const APInt &Value =
        cast<ConstantSDNode>(N->getOperand(Idx))->getAPIntValue();
    // Element types smaller than 32 bits are not legal, so use i32 elements.
    // The values are implicitly truncated so sext vs. zext doesn't matter.
    Lanes.push_back(DAG.getConstant(Value.zextOrTrunc(32), dl, MVT::i32));
  }
  return DAG.getBuildVector(MVT::getVectorVT(NarrowEltVT, NumLanes), dl, Lanes);
}
/// Return true if \p N is an ADD or SUB whose two operands each have a single
/// use and are both accepted by isSignExtended().
static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
  const unsigned Opc = N->getOpcode();
  if (Opc != ISD::ADD && Opc != ISD::SUB)
    return false;

  SDNode *LHS = N->getOperand(0).getNode();
  SDNode *RHS = N->getOperand(1).getNode();
  if (!LHS->hasOneUse() || !RHS->hasOneUse())
    return false;

  return isSignExtended(LHS, DAG) && isSignExtended(RHS, DAG);
}
/// Return true if \p N is an ADD or SUB whose two operands each have a single
/// use and are both accepted by isZeroExtended().
static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
  const unsigned Opc = N->getOpcode();
  if (Opc != ISD::ADD && Opc != ISD::SUB)
    return false;

  SDNode *LHS = N->getOperand(0).getNode();
  SDNode *RHS = N->getOperand(1).getNode();
  if (!LHS->hasOneUse() || !RHS->hasOneUse())
    return false;

  return isZeroExtended(LHS, DAG) && isZeroExtended(RHS, DAG);
}
/// Custom-lower a 128-bit integer vector ISD::MUL.
///
/// Multiplications are only custom-lowered for 128-bit vectors so that a
/// widening multiply (VMULL) can be recognised. When both operands are
/// extended the same way the node becomes a single VMULLs/VMULLu. When one
/// operand is an add/sub of two extended values and the other is extended,
/// the multiply is distributed over the add/sub. Otherwise v2i64 returns an
/// empty SDValue (it is not legal and must be expanded) and every other type
/// is returned unchanged.
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  assert(VT.is128BitVector() && VT.isInteger() &&
         "unexpected type for custom-lowering ISD::MUL");

  SDNode *LHS = Op.getOperand(0).getNode();
  SDNode *RHS = Op.getOperand(1).getNode();

  unsigned MullOpc = 0;      // 0 means "no VMULL form found".
  bool SplitIntoMLA = false; // True for the distributed (A +/- B) * C form.

  const bool LHSIsSExt = isSignExtended(LHS, DAG);
  const bool RHSIsSExt = isSignExtended(RHS, DAG);
  if (LHSIsSExt && RHSIsSExt) {
    MullOpc = ARMISD::VMULLs;
  } else {
    const bool LHSIsZExt = isZeroExtended(LHS, DAG);
    const bool RHSIsZExt = isZeroExtended(RHS, DAG);
    if (LHSIsZExt && RHSIsZExt) {
      MullOpc = ARMISD::VMULLu;
    } else if (RHSIsSExt || RHSIsZExt) {
      // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
      // into (s/zext A * s/zext C) + (s/zext B * s/zext C).
      if (RHSIsSExt && isAddSubSExt(LHS, DAG)) {
        MullOpc = ARMISD::VMULLs;
        SplitIntoMLA = true;
      } else if (RHSIsZExt && isAddSubZExt(LHS, DAG)) {
        MullOpc = ARMISD::VMULLu;
        SplitIntoMLA = true;
      } else if (LHSIsZExt && isAddSubZExt(RHS, DAG)) {
        // Same pattern with the operands the other way round; normalise so
        // the add/sub is always on the left.
        std::swap(LHS, RHS);
        MullOpc = ARMISD::VMULLu;
        SplitIntoMLA = true;
      }
    }

    if (MullOpc == 0) {
      // v2i64 is not legal: return an empty value so it gets expanded.
      if (VT == MVT::v2i64)
        return SDValue();
      // Other vector multiplications are legal as they are.
      return Op;
    }
  }

  // Legalize to a VMULL instruction. The right-hand operand is stripped of
  // its extension first in both forms.
  SDLoc DL(Op);
  SDValue NarrowRHS = SkipExtensionForVMULL(RHS, DAG);
  if (!SplitIntoMLA) {
    SDValue NarrowLHS = SkipExtensionForVMULL(LHS, DAG);
    assert(NarrowLHS.getValueType().is64BitVector() &&
           NarrowRHS.getValueType().is64BitVector() &&
           "unexpected types for extended operands to VMULL");
    return DAG.getNode(MullOpc, DL, VT, NarrowLHS, NarrowRHS);
  }

  // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
  // isel lowering to take advantage of no-stall back to back vmul + vmla.
  //   vmull q0, d4, d6
  //   vmlal q0, d5, d6
  // is faster than
  //   vaddl q0, d4, d5
  //   vmovl q1, d6
  //   vmul  q0, q0, q1
  SDValue AddendA = SkipExtensionForVMULL(LHS->getOperand(0).getNode(), DAG);
  SDValue AddendB = SkipExtensionForVMULL(LHS->getOperand(1).getNode(), DAG);
  EVT NarrowVT = NarrowRHS.getValueType();
  return DAG.getNode(LHS->getOpcode(), DL, VT,
                     DAG.getNode(MullOpc, DL, VT,
                                 DAG.getNode(ISD::BITCAST, DL, NarrowVT, AddendA),
                                 NarrowRHS),
                     DAG.getNode(MullOpc, DL, VT,
                                 DAG.getNode(ISD::BITCAST, DL, NarrowVT, AddendB),
                                 NarrowRHS));
}
/// Emit a signed division of two 4-element vectors whose lanes hold i8-range
/// values, using a floating-point reciprocal estimate. Both operands are
/// sign-extended to v4i32 and converted to v4f32; the result is returned as
/// v4i16.
static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,
                              SelectionDAG &DAG) {
  // TODO: Should this propagate fast-math-flags?

  // Convert to float:
  //   float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
  //   float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
  SDValue NumWide = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
  SDValue DenWide = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
  SDValue NumF = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, NumWide);
  SDValue DenF = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, DenWide);

  // Get the reciprocal estimate:
  //   float4 recip = vrecpeq_f32(yf);
  SDValue RecpeID = DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32);
  SDValue Recip =
      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, RecpeID, DenF);

  // Because char has a smaller range than uchar, we can get away without any
  // Newton steps. This requires a weird bias of 0xb000, however (again, this
  // has been exhaustively tested):
  //   float4 result = as_float4(as_int4(xf*recip) + 0xb000);
  SDValue Quot = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, NumF, Recip);
  SDValue QuotBits = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Quot);
  SDValue Bias = DAG.getConstant(0xb000, dl, MVT::v4i32);
  SDValue BiasedBits = DAG.getNode(ISD::ADD, dl, MVT::v4i32, QuotBits, Bias);
  SDValue BiasedF = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, BiasedBits);

  // Convert back to short.
  SDValue AsInt = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, BiasedF);
  return DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, AsInt);
}
/// Emit a signed division of two 4-element vectors with i16-range lanes using
/// a floating-point reciprocal estimate plus one refinement step. Both
/// operands are sign-extended to v4i32 and converted to v4f32; the result is
/// returned as v4i16.
static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
                               SelectionDAG &DAG) {
  // TODO: Should this propagate fast-math-flags?

  // Convert to float:
  //   float4 yf = vcvt_f32_s32(vmovl_s16(y));
  //   float4 xf = vcvt_f32_s32(vmovl_s16(x));
  SDValue NumWide = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
  SDValue DenWide = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
  SDValue NumF = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, NumWide);
  SDValue DenF = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, DenWide);

  // Use the reciprocal estimate and one refinement step:
  //   float4 recip = vrecpeq_f32(yf);
  //   recip *= vrecpsq_f32(yf, recip);
  SDValue RecpeID = DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32);
  SDValue Estimate =
      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, RecpeID, DenF);
  SDValue RecpsID = DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32);
  SDValue Step = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, RecpsID,
                             DenF, Estimate);
  SDValue Recip = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, Step, Estimate);

  // Because short has a smaller range than ushort, we can get away with only
  // a single Newton step. This requires a weird bias of 0x89, however (again,
  // this has been exhaustively tested):
  //   float4 result = as_float4(as_int4(xf*recip) + 0x89);
  SDValue Quot = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, NumF, Recip);
  SDValue QuotBits = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Quot);
  SDValue Bias = DAG.getConstant(0x89, dl, MVT::v4i32);
  SDValue BiasedBits = DAG.getNode(ISD::ADD, dl, MVT::v4i32, QuotBits, Bias);
  SDValue BiasedF = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, BiasedBits);

  // Convert back to integer and return:
  //   return vmovn_s32(vcvt_s32_f32(result));
  SDValue AsInt = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, BiasedF);
  return DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, AsInt);
}
/// Custom-lower a v4i16 or v8i8 ISD::SDIV.
///
/// v4i16 is handed straight to LowerSDIV_v4i16. v8i8 is sign-extended to
/// v8i16, split into two v4i16 halves that are each divided with
/// LowerSDIV_v4i8, then concatenated and truncated back to v8i8.
static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
         "unexpected type for custom-lowering ISD::SDIV");

  SDLoc dl(Op);
  SDValue Num = Op.getOperand(0);
  SDValue Den = Op.getOperand(1);

  if (VT != MVT::v8i8)
    return LowerSDIV_v4i16(Num, Den, dl, DAG);

  // Widen both operands so each half becomes a v4i16.
  SDValue NumWide = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, Num);
  SDValue DenWide = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, Den);

  // High halves (elements 4..7) first, then low halves (elements 0..3).
  SDValue NumHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, NumWide,
                              DAG.getIntPtrConstant(4, dl));
  SDValue DenHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, DenWide,
                              DAG.getIntPtrConstant(4, dl));
  SDValue NumLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, NumWide,
                              DAG.getIntPtrConstant(0, dl));
  SDValue DenLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, DenWide,
                              DAG.getIntPtrConstant(0, dl));

  SDValue QuotLo = LowerSDIV_v4i8(NumLo, DenLo, dl, DAG); // v4i16
  SDValue QuotHi = LowerSDIV_v4i8(NumHi, DenHi, dl, DAG); // v4i16

  // Glue the halves back together and narrow to the original type.
  SDValue Joined =
      DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, QuotLo, QuotHi);
  Joined = LowerCONCAT_VECTORS(Joined, DAG);
  return DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, Joined);
}
/// Custom-lower a v4i16 or v8i8 ISD::UDIV.
///
/// v8i8 is zero-extended to v8i16, split into two v4i16 halves that are each
/// divided with LowerSDIV_v4i16, concatenated, and narrowed to v8i8 with the
/// arm_neon_vqmovnsu intrinsic. v4i16 is divided in floating point using a
/// reciprocal estimate with two refinement steps.
static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
  // TODO: Should this propagate fast-math-flags?
  EVT VT = Op.getValueType();
  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
         "unexpected type for custom-lowering ISD::UDIV");

  SDLoc dl(Op);
  SDValue Num = Op.getOperand(0);
  SDValue Den = Op.getOperand(1);

  if (VT == MVT::v8i8) {
    SDValue NumWide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, Num);
    SDValue DenWide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, Den);

    // High halves (elements 4..7) first, then low halves (elements 0..3).
    SDValue NumHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16,
                                NumWide, DAG.getIntPtrConstant(4, dl));
    SDValue DenHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16,
                                DenWide, DAG.getIntPtrConstant(4, dl));
    SDValue NumLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16,
                                NumWide, DAG.getIntPtrConstant(0, dl));
    SDValue DenLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16,
                                DenWide, DAG.getIntPtrConstant(0, dl));

    SDValue QuotLo = LowerSDIV_v4i16(NumLo, DenLo, dl, DAG); // v4i16
    SDValue QuotHi = LowerSDIV_v4i16(NumHi, DenHi, dl, DAG); // v4i16

    SDValue Joined =
        DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, QuotLo, QuotHi);
    Joined = LowerCONCAT_VECTORS(Joined, DAG);

    SDValue NarrowID =
        DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl, MVT::i32);
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8, NarrowID,
                       Joined);
  }

  // v4i16 udiv ... Convert to float:
  //   float4 yf = vcvt_f32_s32(vmovl_u16(y));
  //   float4 xf = vcvt_f32_s32(vmovl_u16(x));
  SDValue NumWide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, Num);
  SDValue DenWide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, Den);
  SDValue NumF = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, NumWide);
  SDValue DenF = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, DenWide);

  // One refinement step: recip *= vrecpsq_f32(yf, recip).
  auto Refine = [&](SDValue Current) {
    SDValue Step = DAG.getNode(
        ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
        DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), DenF,
        Current);
    return DAG.getNode(ISD::FMUL, dl, MVT::v4f32, Step, Current);
  };

  // Use the reciprocal estimate and two refinement steps:
  //   float4 recip = vrecpeq_f32(yf);
  //   recip *= vrecpsq_f32(yf, recip);
  //   recip *= vrecpsq_f32(yf, recip);
  SDValue RecpeID = DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32);
  SDValue Recip =
      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, RecpeID, DenF);
  Recip = Refine(Recip);
  Recip = Refine(Recip);

  // Simply multiplying by the reciprocal estimate can leave us a few ulps
  // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
  // and that it will never cause us to return an answer too large):
  //   float4 result = as_float4(as_int4(xf*recip) + 2);
  SDValue Quot = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, NumF, Recip);
  SDValue QuotBits = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Quot);
  SDValue Bias = DAG.getConstant(2, dl, MVT::v4i32);
  SDValue BiasedBits = DAG.getNode(ISD::ADD, dl, MVT::v4i32, QuotBits, Bias);
  SDValue BiasedF = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, BiasedBits);

  // Convert back to integer and return:
  //   return vmovn_u32(vcvt_s32_f32(result));
  SDValue AsInt = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, BiasedF);
  return DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, AsInt);
}
/// Lower ISD::ADDCARRY / ISD::SUBCARRY to ARMISD::ADDE / ARMISD::SUBE.
///
/// The incoming boolean carry is turned into the carry flag, the flag-consuming
/// add/subtract is emitted, and the resulting flag is turned back into a
/// boolean. For subtraction the boolean is inverted (1 - C) on the way in and
/// on the way out, because ARMISD::SUBE works with a carry while ISD::SUBCARRY
/// works with a borrow.
static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
  SDNode *Node = Op.getNode();
  EVT ResVT = Node->getValueType(0);
  SDVTList FlagVTs = DAG.getVTList(ResVT, MVT::i32);
  SDLoc DL(Op);

  const bool IsSub = Op.getOpcode() != ISD::ADDCARRY;

  SDValue CarryIn = Op.getOperand(2);
  if (IsSub) {
    // ARMISD::SUBE expects a carry, not a borrow like ISD::SUBCARRY, so the
    // incoming value has to be inverted first.
    CarryIn = DAG.getNode(ISD::SUB, DL, MVT::i32,
                          DAG.getConstant(1, DL, MVT::i32), CarryIn);
  }

  // Turn the boolean carry into the carry flag.
  SDValue Flag = ConvertBooleanCarryToCarryFlag(CarryIn, DAG);

  // Do the arithmetic proper, consuming the flag we just produced.
  SDValue Result =
      DAG.getNode(IsSub ? ARMISD::SUBE : ARMISD::ADDE, DL, FlagVTs,
                  Op.getOperand(0), Op.getOperand(1), Flag);

  // Turn the outgoing carry flag back into a boolean value.
  SDValue CarryOut =
      ConvertCarryFlagToBooleanCarry(Result.getValue(1), ResVT, DAG);
  if (IsSub) {
    // The carry produced by ARMISD::SUBE is not the borrow ISD::SUBCARRY
    // promises, so compute 1 - C.
    CarryOut = DAG.getNode(ISD::SUB, DL, MVT::i32,
                           DAG.getConstant(1, DL, MVT::i32), CarryOut);
  }

  // Return both values.
  return DAG.getNode(ISD::MERGE_VALUES, DL, Node->getVTList(), Result,
                     CarryOut);
}
/// Lower ISD::FSINCOS on Darwin to a call to the __sincos_stret style libcall.
///
/// Under the APCS ABI the {sin, cos} pair comes back through an sret stack
/// slot, which is then loaded field by field and merged into the two results.
/// Otherwise the call result is returned directly.
SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin());

  // For iOS, we want to call an alternative entry point: __sincos_stret,
  // return values are passed via sret.
  SDLoc dl(Op);
  SDValue Operand = Op.getOperand(0);
  EVT OperandVT = Operand.getValueType();
  Type *OperandTy = OperandVT.getTypeForEVT(*DAG.getContext());
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // Pair of floats / doubles used to pass the result.
  Type *RetTy = StructType::get(OperandTy, OperandTy);
  auto &DL = DAG.getDataLayout();

  ArgListTy CallArgs;
  const bool UseSRet = Subtarget->isAPCS_ABI();
  SDValue SRetSlot;
  if (UseSRet) {
    // Create the stack object that receives the result pair and pass its
    // address as the leading sret argument.
    const uint64_t SlotSize = DL.getTypeAllocSize(RetTy);
    const unsigned SlotAlign = DL.getPrefTypeAlignment(RetTy);
    int SlotFI = MFI.CreateStackObject(SlotSize, SlotAlign, false);
    SRetSlot = DAG.getFrameIndex(SlotFI, TLI.getPointerTy(DL));

    ArgListEntry SRetArg;
    SRetArg.Node = SRetSlot;
    SRetArg.Ty = RetTy->getPointerTo();
    SRetArg.IsSExt = false;
    SRetArg.IsZExt = false;
    SRetArg.IsSRet = true;
    CallArgs.push_back(SRetArg);

    // With sret the call itself returns nothing.
    RetTy = Type::getVoidTy(*DAG.getContext());
  }

  ArgListEntry ValueArg;
  ValueArg.Node = Operand;
  ValueArg.Ty = OperandTy;
  ValueArg.IsSExt = false;
  ValueArg.IsZExt = false;
  CallArgs.push_back(ValueArg);

  RTLIB::Libcall LC = RTLIB::SINCOS_STRET_F32;
  if (OperandVT == MVT::f64)
    LC = RTLIB::SINCOS_STRET_F64;
  const char *LibcallName = getLibcallName(LC);
  CallingConv::ID CC = getLibcallCallingConv(LC);
  SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(DAG.getEntryNode())
      .setCallee(CC, RetTy, Callee, std::move(CallArgs))
      .setDiscardResult(UseSRet);
  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);

  if (!UseSRet)
    return CallResult.first;

  // Load the sin field from the start of the slot, chained on the call.
  SDValue SinVal = DAG.getLoad(OperandVT, dl, CallResult.second, SRetSlot,
                               MachinePointerInfo());

  // Address of cos field: one store-size past the sin field.
  SDValue CosOffset = DAG.getIntPtrConstant(OperandVT.getStoreSize(), dl);
  SDValue CosAddr = DAG.getNode(ISD::ADD, dl, PtrVT, SRetSlot, CosOffset);
  SDValue CosVal = DAG.getLoad(OperandVT, dl, SinVal.getValue(1), CosAddr,
                               MachinePointerInfo());

  SDVTList ResultVTs = DAG.getVTList(OperandVT, OperandVT);
  return DAG.getNode(ISD::MERGE_VALUES, dl, ResultVTs, SinVal.getValue(0),
                     CosVal.getValue(0));
}
/// Emit a call to the Windows runtime division helper for an i32 or i64
/// divide.
///
/// The helper is __rt_sdiv / __rt_udiv for i32 and __rt_sdiv64 / __rt_udiv64
/// for i64, chosen by \p Signed. Operand 1 of \p Op is passed before operand
/// 0. The call is chained on \p Chain and its result value is returned.
SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
                                                  bool Signed,
                                                  SDValue &Chain) const {
  EVT VT = Op.getValueType();
  assert((VT == MVT::i32 || VT == MVT::i64) &&
         "unexpected type for custom lowering DIV");
  SDLoc dl(Op);

  const auto &DL = DAG.getDataLayout();
  const auto &TLI = DAG.getTargetLoweringInfo();

  // Pick the helper by width first, then by signedness.
  const char *Name = nullptr;
  if (VT == MVT::i32)
    Name = Signed ? "__rt_sdiv" : "__rt_udiv";
  else
    Name = Signed ? "__rt_sdiv64" : "__rt_udiv64";

  SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL));

  // The helper receives operand 1 first and operand 0 second.
  ARMTargetLowering::ArgListTy CallArgs;
  const unsigned OperandOrder[] = {1, 0};
  for (unsigned OpIdx : OperandOrder) {
    ArgListEntry E;
    E.Node = Op.getOperand(OpIdx);
    E.Ty = E.Node.getValueType().getTypeForEVT(*DAG.getContext());
    CallArgs.push_back(E);
  }

  CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(Chain)
      .setCallee(CallingConv::ARM_AAPCS_VFP,
                 VT.getTypeForEVT(*DAG.getContext()), ES,
                 std::move(CallArgs));

  return LowerCallTo(CLI).first;
}
// Code size optimisation hook for signed division by a power of two.
//
// Returning the original SDIV node tells DAGCombiner to keep the divide
// instruction; returning an empty node lets DAGCombine expand the SDIV into a
// sequence of instructions instead.
SDValue
ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
                                 SelectionDAG &DAG,
                                 SmallVectorImpl<SDNode *> &Created) const {
  // TODO: Support SREM
  if (N->getOpcode() != ISD::SDIV)
    return SDValue();

  const auto &ST = static_cast<const ARMSubtarget&>(DAG.getSubtarget());
  const bool InThumb = ST.isThumb();
  const bool MinSize = ST.hasMinSize();
  const bool HasDivide =
      InThumb ? ST.hasDivideInThumbMode() : ST.hasDivideInARMMode();

  // Don't touch vector types; rewriting this may lead to scalarizing
  // the int divs.
  if (N->getOperand(0).getValueType().isVector())
    return SDValue();

  // Keeping the divide only pays off when optimising for minimum size and,
  // in both ARM and Thumb mode, when hardware divide is available.
  if (!MinSize || !HasDivide)
    return SDValue();

  // ARM mode is a bit simpler than Thumb: large power-of-2 immediates can be
  // materialised with one mov, so no further checks are required.
  if (!InThumb)
    return SDValue(N, 0);

  // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV, and
  // thus lose the code size benefit of a 2-byte MOVS. TargetTransformInfo's
  // 'getIntImmCodeSizeCost' does exactly this check, but fetching TTI here is
  // not worth the trouble.
  return Divisor.sgt(128) ? SDValue() : SDValue(N, 0);
}
/// Lower an i32 divide on Windows: emit an ARMISD::WIN__DBZCHK on the divisor
/// (operand 1), chained on the entry node, then call the runtime division
/// helper chained on that check.
SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
                                            bool Signed) const {
  assert(Op.getValueType() == MVT::i32 &&
         "unexpected type for custom lowering DIV");
  SDLoc dl(Op);

  SDValue Divisor = Op.getOperand(1);
  SDValue ZeroCheck = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
                                  DAG.getEntryNode(), Divisor);

  return LowerWindowsDIVLibCall(Op, DAG, Signed, ZeroCheck);
}
/// Build an ARMISD::WIN__DBZCHK node for the denominator (operand 1) of \p N,
/// chained on \p InChain. An i32 denominator is checked directly; otherwise
/// its two i32 halves are OR-ed together and the combined value is checked.
static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) {
  SDLoc DL(N);
  SDValue Denominator = N->getOperand(1);

  if (N->getValueType(0) == MVT::i32)
    return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
                       Denominator);

  // Split into elements 0 and 1; the OR of the two is zero only when the
  // whole value is zero.
  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
  SDValue LoHalf =
      DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Denominator, Zero);
  SDValue One = DAG.getConstant(1, DL, MVT::i32);
  SDValue HiHalf =
      DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Denominator, One);
  SDValue Combined = DAG.getNode(ISD::OR, DL, MVT::i32, LoHalf, HiHalf);

  return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Combined);
}
/// Expand an i64 divide on Windows: check the denominator for zero, call the
/// 64-bit runtime division helper, and append the low and high i32 halves of
/// the result (in that order) to \p Results.
void ARMTargetLowering::ExpandDIV_Windows(
    SDValue Op, SelectionDAG &DAG, bool Signed,
    SmallVectorImpl<SDValue> &Results) const {
  const auto &DL = DAG.getDataLayout();
  const auto &TLI = DAG.getTargetLoweringInfo();

  assert(Op.getValueType() == MVT::i64 &&
         "unexpected type for custom lowering DIV");
  SDLoc dl(Op);

  SDValue ZeroCheck =
      WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());
  SDValue Quotient = LowerWindowsDIVLibCall(Op, DAG, Signed, ZeroCheck);

  // Low word: plain truncation of the 64-bit result.
  SDValue LoWord = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Quotient);

  // High word: shift right by 32, then truncate.
  SDValue ShiftAmt = DAG.getConstant(32, dl, TLI.getPointerTy(DL));
  SDValue Shifted = DAG.getNode(ISD::SRL, dl, MVT::i64, Quotient, ShiftAmt);
  SDValue HiWord = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Shifted);

  Results.push_back(LoWord);
  Results.push_back(HiWord);
}
/// Custom-lower an atomic load/store. Monotonic accesses are legal for all
/// targets and are returned unchanged. Anything stronger yields an empty
/// SDValue, because acquire/release load/store is not legal for targets
/// without a dmb or equivalent available.
static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
  const bool StrongerThanMonotonic =
      isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering());
  return StrongerThanMonotonic ? SDValue() : Op;
}
/// Replace a READCYCLECOUNTER node with an arm_mrc intrinsic read.
///
/// Two values are appended to \p Results: an i64 built as a pair from the
/// 32-bit counter and a constant 0, and the output chain of the intrinsic.
static void ReplaceREADCYCLECOUNTER(SDNode *N,
                                    SmallVectorImpl<SDValue> &Results,
                                    SelectionDAG &DAG,
                                    const ARMSubtarget *Subtarget) {
  SDLoc DL(N);
  auto I32 = [&](uint64_t Val) { return DAG.getConstant(Val, DL, MVT::i32); };

  // Under Power Management extensions, the cycle-count is:
  //    mrc p15, #0, <Rt>, c9, c13, #0
  SDValue MrcOps[] = {
      N->getOperand(0), // Chain
      I32(Intrinsic::arm_mrc),
      I32(15),
      I32(0),
      I32(9),
      I32(13),
      I32(0),
  };
  SDValue Counter = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
                                DAG.getVTList(MVT::i32, MVT::Other), MrcOps);

  // The counter is only 32 bits wide; pair it with a constant 0 to form the
  // i64 result.
  SDValue ZeroWord = I32(0);
  SDValue Wide =
      DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Counter, ZeroWord);

  Results.push_back(Wide);
  Results.push_back(Counter.getValue(1));
}
/// Pack the two 32-bit halves of \p V into a GPRPair REG_SEQUENCE.
///
/// The low half goes to gsub_0 and the high half to gsub_1; on big-endian
/// targets the two halves are swapped.
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
  SDLoc dl(V.getNode());

  SDValue First = DAG.getAnyExtOrTrunc(V, dl, MVT::i32);

  SDValue ShiftAmt = DAG.getConstant(32, dl, MVT::i32);
  SDValue Shifted = DAG.getNode(ISD::SRL, dl, MVT::i64, V, ShiftAmt);
  SDValue Second = DAG.getAnyExtOrTrunc(Shifted, dl, MVT::i32);

  if (DAG.getDataLayout().isBigEndian())
    std::swap (First, Second);

  SDValue PairClass =
      DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
  SDValue Sub0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
  SDValue Sub1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);

  const SDValue SeqOps[] = { PairClass, First, Sub0, Second, Sub1 };
  SDNode *Seq = DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl,
                                   MVT::Untyped, SeqOps);
  return SDValue(Seq, 0);
}
/// Replace a 64-bit ATOMIC_CMP_SWAP with an ARM::CMP_SWAP_64 machine node.
///
/// Operands 2 and 3 of \p N are each packed into a GPR pair. Three values are
/// appended to \p Results: two i32 sub-registers extracted from the result
/// pair (gsub_0 then gsub_1, swapped on big-endian) and the output chain.
static void ReplaceCMP_SWAP_64Results(SDNode *N,
                                      SmallVectorImpl<SDValue> & Results,
                                      SelectionDAG &DAG) {
  assert(N->getValueType(0) == MVT::i64 &&
         "AtomicCmpSwap on types less than 64 should be legal");

  // Machine node operands: operand 1, the two packed 64-bit values, and
  // finally operand 0 of the original node.
  SDValue PackedOp2 = createGPRPairNode(DAG, N->getOperand(2));
  SDValue PackedOp3 = createGPRPairNode(DAG, N->getOperand(3));
  SDValue SwapOps[] = {N->getOperand(1), PackedOp2, PackedOp3,
                       N->getOperand(0)};

  SDNode *Swap = DAG.getMachineNode(
      ARM::CMP_SWAP_64, SDLoc(N),
      DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), SwapOps);

  // Carry the memory operand over to the new machine node.
  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
  DAG.setNodeMemRefs(cast<MachineSDNode>(Swap), {MMO});

  // Which sub-register is pushed first depends on endianness.
  const bool BigEndian = DAG.getDataLayout().isBigEndian();
  const int FirstSub = BigEndian ? ARM::gsub_1 : ARM::gsub_0;
  const int SecondSub = BigEndian ? ARM::gsub_0 : ARM::gsub_1;

  Results.push_back(DAG.getTargetExtractSubreg(FirstSub, SDLoc(N), MVT::i32,
                                               SDValue(Swap, 0)));
  Results.push_back(DAG.getTargetExtractSubreg(SecondSub, SDLoc(N), MVT::i32,
                                               SDValue(Swap, 0)));
  Results.push_back(SDValue(Swap, 2));
}
/// Lower ISD::FPOWI for MSVCRT targets to a call to powf / pow.
///
/// The integer exponent (operand 1) is converted to the floating-point type
/// of the base with SINT_TO_FP and both values are passed to the libcall. The
/// call is emitted as a tail call when the node is in tail-call position and
/// the function's return type matches the libcall's.
static SDValue LowerFPOWI(SDValue Op, const ARMSubtarget &Subtarget,
                          SelectionDAG &DAG) {
  const auto &TLI = DAG.getTargetLoweringInfo();

  assert(Subtarget.getTargetTriple().isOSMSVCRT() &&
         "Custom lowering is MSVCRT specific!");

  SDLoc dl(Op);
  SDValue Base = Op.getOperand(0);
  MVT BaseVT = Base->getSimpleValueType(0);
  SDValue Exponent = DAG.getNode(ISD::SINT_TO_FP, dl, BaseVT, Op.getOperand(1));
  SDValue Callee = DAG.getExternalSymbol(BaseVT == MVT::f32 ? "powf" : "pow",
                                         TLI.getPointerTy(DAG.getDataLayout()));

  // Both arguments are described the same way: the value, its IR type, and
  // the zero-extend flag set.
  TargetLowering::ArgListTy CallArgs;
  auto PushArg = [&](SDValue V) {
    TargetLowering::ArgListEntry E;
    E.Node = V;
    E.Ty = V.getValueType().getTypeForEVT(*DAG.getContext());
    E.IsZExt = true;
    CallArgs.push_back(E);
  };
  PushArg(Base);
  PushArg(Exponent);

  Type *LCRTy = Base.getValueType().getTypeForEVT(*DAG.getContext());

  // The in-chain to the call is the entry node. If we are emitting a tail
  // call, the chain is replaced by the one isInTailCallPosition hands back,
  // which differs when the node has a non-entry input chain.
  SDValue InChain = DAG.getEntryNode();
  SDValue TCChain = InChain;

  const Function &F = DAG.getMachineFunction().getFunction();
  bool IsTC = TLI.isInTailCallPosition(DAG, Op.getNode(), TCChain) &&
              F.getReturnType() == LCRTy;
  if (IsTC)
    InChain = TCChain;

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(InChain)
      .setCallee(CallingConv::ARM_AAPCS_VFP, LCRTy, Callee,
                 std::move(CallArgs))
      .setTailCall(IsTC);
  std::pair<SDValue, SDValue> CI = TLI.LowerCallTo(CLI);

  // A null chain in the result means a tail call was emitted; return the DAG
  // root in that case, and the call's value otherwise.
  if (CI.second.getNode())
    return CI.first;
  return DAG.getRoot();
}
/// Dispatch a node marked for custom lowering to the matching Lower* / Expand*
/// routine, keyed on its opcode. Opcodes without a handler hit
/// llvm_unreachable.
SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());

  switch (Op.getOpcode()) {
  // Registers, addresses and constants.
  case ISD::WRITE_REGISTER:
    return LowerWRITE_REGISTER(Op, DAG);
  case ISD::ConstantPool:
    return LowerConstantPool(Op, DAG);
  case ISD::BlockAddress:
    return LowerBlockAddress(Op, DAG);
  case ISD::GlobalAddress:
    return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress:
    return LowerGlobalTLSAddress(Op, DAG);

  // Selects and branches.
  case ISD::SELECT:
    return LowerSELECT(Op, DAG);
  case ISD::SELECT_CC:
    return LowerSELECT_CC(Op, DAG);
  case ISD::BRCOND:
    return LowerBRCOND(Op, DAG);
  case ISD::BR_CC:
    return LowerBR_CC(Op, DAG);
  case ISD::BR_JT:
    return LowerBR_JT(Op, DAG);

  case ISD::VASTART:
    return LowerVASTART(Op, DAG);
  case ISD::ATOMIC_FENCE:
    return LowerATOMIC_FENCE(Op, DAG, Subtarget);
  case ISD::PREFETCH:
    return LowerPREFETCH(Op, DAG, Subtarget);

  // Integer <-> floating-point conversions.
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
    return LowerINT_TO_FP(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return LowerFP_TO_INT(Op, DAG);
  case ISD::FCOPYSIGN:
    return LowerFCOPYSIGN(Op, DAG);

  // Frame queries and SjLj exception handling.
  case ISD::RETURNADDR:
    return LowerRETURNADDR(Op, DAG);
  case ISD::FRAMEADDR:
    return LowerFRAMEADDR(Op, DAG);
  case ISD::EH_SJLJ_SETJMP:
    return LowerEH_SJLJ_SETJMP(Op, DAG);
  case ISD::EH_SJLJ_LONGJMP:
    return LowerEH_SJLJ_LONGJMP(Op, DAG);
  case ISD::EH_SJLJ_SETUP_DISPATCH:
    return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);

  case ISD::INTRINSIC_WO_CHAIN:
    return LowerINTRINSIC_WO_CHAIN(Op, DAG, Subtarget);
  case ISD::BITCAST:
    return ExpandBITCAST(Op.getNode(), DAG, Subtarget);

  // Shifts and remainders.
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    return LowerShift(Op.getNode(), DAG, Subtarget);
  case ISD::SREM:
  case ISD::UREM:
    return LowerREM(Op.getNode(), DAG);
  case ISD::SHL_PARTS:
    return LowerShiftLeftParts(Op, DAG);
  case ISD::SRL_PARTS:
  case ISD::SRA_PARTS:
    return LowerShiftRightParts(Op, DAG);

  // Bit counting.
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF:
    return LowerCTTZ(Op.getNode(), DAG, Subtarget);
  case ISD::CTPOP:
    return LowerCTPOP(Op.getNode(), DAG, Subtarget);

  // Comparisons.
  case ISD::SETCC:
    return LowerVSETCC(Op, DAG);
  case ISD::SETCCCARRY:
    return LowerSETCCCARRY(Op, DAG);

  // Constants and vector construction.
  case ISD::ConstantFP:
    return LowerConstantFP(Op, DAG, Subtarget);
  case ISD::BUILD_VECTOR:
    return LowerBUILD_VECTOR(Op, DAG, Subtarget);
  case ISD::VECTOR_SHUFFLE:
    return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
  case ISD::INSERT_VECTOR_ELT:
    return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT:
    return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::CONCAT_VECTORS:
    return LowerCONCAT_VECTORS(Op, DAG);

  case ISD::FLT_ROUNDS_:
    return LowerFLT_ROUNDS_(Op, DAG);

  // Multiplication and division. On Windows, non-vector divides go through
  // the runtime helper; everything else uses the vector lowering.
  case ISD::MUL:
    return LowerMUL(Op, DAG);
  case ISD::SDIV: {
    if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
      return LowerDIV_Windows(Op, DAG, /* Signed */ true);
    return LowerSDIV(Op, DAG);
  }
  case ISD::UDIV: {
    if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
      return LowerDIV_Windows(Op, DAG, /* Signed */ false);
    return LowerUDIV(Op, DAG);
  }

  // Carry and overflow arithmetic.
  case ISD::ADDCARRY:
  case ISD::SUBCARRY:
    return LowerADDSUBCARRY(Op, DAG);
  case ISD::SADDO:
  case ISD::SSUBO:
    return LowerSignedALUO(Op, DAG);
  case ISD::UADDO:
  case ISD::USUBO:
    return LowerUnsignedALUO(Op, DAG);

  case ISD::ATOMIC_LOAD:
  case ISD::ATOMIC_STORE:
    return LowerAtomicLoadStore(Op, DAG);
  case ISD::FSINCOS:
    return LowerFSINCOS(Op, DAG);
  case ISD::SDIVREM:
  case ISD::UDIVREM:
    return LowerDivRem(Op, DAG);

  // Only custom-lowered on Windows; any other target reaching this is a bug.
  case ISD::DYNAMIC_STACKALLOC: {
    if (!Subtarget->isTargetWindows())
      llvm_unreachable("Don't know how to custom lower this!");
    return LowerDYNAMIC_STACKALLOC(Op, DAG);
  }

  case ISD::FP_ROUND:
    return LowerFP_ROUND(Op, DAG);
  case ISD::FP_EXTEND:
    return LowerFP_EXTEND(Op, DAG);
  case ISD::FPOWI:
    return LowerFPOWI(Op, *Subtarget, DAG);

  // Returns an empty value: there is no custom lowering for the check node.
  case ARMISD::WIN__DBZCHK:
    return SDValue();

  default:
    llvm_unreachable("Don't know how to custom lower this!");
  }
}
/// Replace one of the arm_smlald / arm_smlaldx / arm_smlsld / arm_smlsldx
/// intrinsics with the matching ARMISD node.
///
/// Operand 3 is split into two i32 elements which are passed, after operands
/// 1 and 2, to the new node; its two i32 results are appended to \p Results.
/// Any other intrinsic is left alone and nothing is appended.
static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                 SelectionDAG &DAG) {
  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();

  unsigned Opc = 0;
  switch (IntNo) {
  case Intrinsic::arm_smlald:
    Opc = ARMISD::SMLALD;
    break;
  case Intrinsic::arm_smlaldx:
    Opc = ARMISD::SMLALDX;
    break;
  case Intrinsic::arm_smlsld:
    Opc = ARMISD::SMLSLD;
    break;
  case Intrinsic::arm_smlsldx:
    Opc = ARMISD::SMLSLDX;
    break;
  default:
    return;
  }

  SDLoc dl(N);

  // Split operand 3 into elements 0 and 1.
  SDValue Wide = N->getOperand(3);
  SDValue LoIdx = DAG.getConstant(0, dl, MVT::i32);
  SDValue LoHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Wide, LoIdx);
  SDValue HiIdx = DAG.getConstant(1, dl, MVT::i32);
  SDValue HiHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Wide, HiIdx);

  SDValue Long = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::i32),
                             N->getOperand(1), N->getOperand(2), LoHalf,
                             HiHalf);

  Results.push_back(Long.getValue(0));
  Results.push_back(Long.getValue(1));
}
/// ReplaceNodeResults - Replace the results of a node that has an illegal
/// result type with new values built by custom code.
///
/// Handlers either append their replacement values to \p Results themselves,
/// or produce a single value that is appended here when it is non-null.
void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue> &Results,
                                           SelectionDAG &DAG) const {
  // Single replacement value, for the handlers that return one.
  SDValue Single;

  switch (N->getOpcode()) {
  // Handlers that fill in Results on their own.
  case ISD::READ_REGISTER:
    ExpandREAD_REGISTER(N, Results, DAG);
    break;
  case ISD::SDIVREM:
  case ISD::UDIVREM: {
    SDValue Res = LowerDivRem(SDValue(N, 0), DAG);
    assert(Res.getNumOperands() == 2 && "DivRem needs two values");
    Results.push_back(Res.getValue(0));
    Results.push_back(Res.getValue(1));
    return;
  }
  case ISD::READCYCLECOUNTER:
    ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
    return;
  case ISD::UDIV:
  case ISD::SDIV:
    assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
    ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
                      Results);
    return;
  case ISD::ATOMIC_CMP_SWAP:
    ReplaceCMP_SWAP_64Results(N, Results, DAG);
    return;
  case ISD::INTRINSIC_WO_CHAIN:
    ReplaceLongIntrinsic(N, Results, DAG);
    return;
  case ISD::ABS:
    lowerABS(N, Results, DAG);
    return;

  // Handlers that return one value (possibly null).
  case ISD::BITCAST:
    Single = ExpandBITCAST(N, DAG, Subtarget);
    break;
  case ISD::SRL:
  case ISD::SRA:
  case ISD::SHL:
    Single = Expand64BitShift(N, DAG, Subtarget);
    break;
  case ISD::SREM:
  case ISD::UREM:
    Single = LowerREM(N, DAG);
    break;

  default:
    llvm_unreachable("Don't know how to custom expand this!");
  }

  if (Single.getNode())
    Results.push_back(Single);
}
//===----------------------------------------------------------------------===//
// ARM Scheduler Hooks
//===----------------------------------------------------------------------===//
/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
/// registers the function context.
///
/// The instructions are inserted into \p MBB immediately before \p MI. They
/// load the address of \p DispatchBB from a new constant-pool entry, make it
/// PC-relative-resolved via a PICADD, and store it into the stack object
/// \p FI at offset 36. In both Thumb variants the low bit of the address is
/// set before it is stored. Three instruction sequences exist: Thumb2,
/// Thumb1 and ARM.
void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
MachineBasicBlock *MBB,
MachineBasicBlock *DispatchBB,
int FI) const {
assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
"ROPI/RWPI not currently supported with SjLj");
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
DebugLoc dl = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
MachineConstantPool *MCP = MF->getConstantPool();
ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
const Function &F = MF->getFunction();
bool isThumb = Subtarget->isThumb();
bool isThumb2 = Subtarget->isThumb2();
// Constant-pool entry referring to the dispatch block, tied to a fresh PIC
// label. The PC adjustment is 4 in Thumb/Thumb2 and 8 in ARM mode.
unsigned PCLabelId = AFI->createPICLabelUId();
unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
ARMConstantPoolValue *CPV =
ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj);
unsigned CPI = MCP->getConstantPoolIndex(CPV, 4);
// Thumb uses the tGPR register class for the temporaries, ARM uses GPR.
const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
: &ARM::GPRRegClass;
// Grab constant pool and fixed stack memory operands.
MachineMemOperand *CPMMO =
MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
MachineMemOperand::MOLoad, 4, 4);
MachineMemOperand *FIMMOSt =
MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),
MachineMemOperand::MOStore, 4, 4);
// Load the address of the dispatch MBB into the jump buffer.
// NOTE(review): the pseudo-assembly comments in the three branches below show
// a store to "#+4 ; &jbuf[1]", but the immediate actually emitted is 36 in
// every branch — confirm which one the comments should describe.
if (isThumb2) {
// Incoming value: jbuf
// ldr.n r5, LCPI1_1
// orr r5, r5, #1
// add r5, pc
// str r5, [$jbuf, #+4] ; &jbuf[1]
unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
.addConstantPoolIndex(CPI)
.addMemOperand(CPMMO)
.add(predOps(ARMCC::AL));
// Set the low bit because of thumb mode.
unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
.addReg(NewVReg1, RegState::Kill)
.addImm(0x01)
.add(predOps(ARMCC::AL))
.add(condCodeOp());
unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
.addReg(NewVReg2, RegState::Kill)
.addImm(PCLabelId);
BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
.addReg(NewVReg3, RegState::Kill)
.addFrameIndex(FI)
.addImm(36) // &jbuf[1] :: pc
.addMemOperand(FIMMOSt)
.add(predOps(ARMCC::AL));
} else if (isThumb) {
// Incoming value: jbuf
// ldr.n r1, LCPI1_4
// add r1, pc
// mov r2, #1
// orrs r1, r2
// add r2, $jbuf, #+4 ; &jbuf[1]
// str r1, [r2]
unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
.addConstantPoolIndex(CPI)
.addMemOperand(CPMMO)
.add(predOps(ARMCC::AL));
unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
.addReg(NewVReg1, RegState::Kill)
.addImm(PCLabelId);
// Set the low bit because of thumb mode.
unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
.addReg(ARM::CPSR, RegState::Define)
.addImm(1)
.add(predOps(ARMCC::AL));
unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
.addReg(ARM::CPSR, RegState::Define)
.addReg(NewVReg2, RegState::Kill)
.addReg(NewVReg3, RegState::Kill)
.add(predOps(ARMCC::AL));
// Materialise the destination address (frame object + 36) in a register,
// then store through it with a zero offset.
unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
.addFrameIndex(FI)
.addImm(36); // &jbuf[1] :: pc
BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
.addReg(NewVReg4, RegState::Kill)
.addReg(NewVReg5, RegState::Kill)
.addImm(0)
.addMemOperand(FIMMOSt)
.add(predOps(ARMCC::AL));
} else {
// Incoming value: jbuf
// ldr r1, LCPI1_1
// add r1, pc, r1
// str r1, [$jbuf, #+4] ; &jbuf[1]
unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
.addConstantPoolIndex(CPI)
.addImm(0)
.addMemOperand(CPMMO)
.add(predOps(ARMCC::AL));
unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
.addReg(NewVReg1, RegState::Kill)
.addImm(PCLabelId)
.add(predOps(ARMCC::AL));
BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
.addReg(NewVReg2, RegState::Kill)
.addFrameIndex(FI)
.addImm(36) // &jbuf[1] :: pc
.addMemOperand(FIMMOSt)
.add(predOps(ARMCC::AL));
}
}
/// Build the SjLj exception dispatch code for the function containing \p MI.
///
/// The function (1) maps call-site numbers to their landing pads, (2) creates
/// an inline jump table over those landing pads, (3) creates a dispatch block,
/// a continuation block and a trap block, (4) emits ISA-specific code that
/// loads a value from the function-context slot, range-checks it against the
/// number of landing pads and jumps through the table, (5) redirects every
/// invoke block to the new dispatch block, and (6) erases \p MI.
///
/// \param MI  The pseudo instruction being expanded; erased on return.
/// \param MBB The block that currently contains \p MI.
void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
MachineBasicBlock *MBB) const {
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
DebugLoc dl = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
MachineFrameInfo &MFI = MF->getFrameInfo();
int FI = MFI.getFunctionContextIndex();
// Register class for every virtual register created below.
const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
: &ARM::GPRnopcRegClass;
// Get a mapping of the call site numbers to all of the landing pads they're
// associated with.
DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad;
unsigned MaxCSNum = 0;
for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E;
++BB) {
if (!BB->isEHPad()) continue;
// FIXME: We should assert that the EH_LABEL is the first MI in the landing
// pad.
for (MachineBasicBlock::iterator
II = BB->begin(), IE = BB->end(); II != IE; ++II) {
if (!II->isEHLabel()) continue;
MCSymbol *Sym = II->getOperand(0).getMCSymbol();
if (!MF->hasCallSiteLandingPad(Sym)) continue;
SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
for (SmallVectorImpl<unsigned>::iterator
CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end();
CSI != CSE; ++CSI) {
CallSiteNumToLPad[*CSI].push_back(&*BB);
MaxCSNum = std::max(MaxCSNum, *CSI);
}
// Only the first EH label that has call-site info is used per block.
break;
}
}
// Get an ordered list of the machine basic blocks for the jump table.
std::vector<MachineBasicBlock*> LPadList;
SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
LPadList.reserve(CallSiteNumToLPad.size());
// Call-site numbers are walked from 1 up to the largest one seen, so LPadList
// ends up ordered by call-site number. The predecessors of every landing pad
// are remembered as the invoke blocks to rewrite later.
for (unsigned I = 1; I <= MaxCSNum; ++I) {
SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
for (SmallVectorImpl<MachineBasicBlock*>::iterator
II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) {
LPadList.push_back(*II);
InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end());
}
}
assert(!LPadList.empty() &&
"No landing pad destinations for the dispatch jump table!");
// Create the jump table and associated information.
MachineJumpTableInfo *JTI =
MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
unsigned MJTI = JTI->createJumpTableIndex(LPadList);
// Create the MBBs for the dispatch code.
// Shove the dispatch's address into the return slot in the function context.
MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
DispatchBB->setIsEHPad();
// TrapBB holds a single trap instruction and is the target of the
// out-of-range branch emitted at the end of DispatchBB.
MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
unsigned trap_opcode;
if (Subtarget->isThumb())
trap_opcode = ARM::tTRAP;
else
trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;
BuildMI(TrapBB, dl, TII->get(trap_opcode));
DispatchBB->addSuccessor(TrapBB);
// DispContBB receives the jump-table lookup and the indirect branch.
MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
DispatchBB->addSuccessor(DispContBB);
// Insert the new MBBs at the end of the function.
MF->insert(MF->end(), DispatchBB);
MF->insert(MF->end(), DispContBB);
MF->insert(MF->end(), TrapBB);
// Insert code into the entry block that creates and registers the function
// context.
SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
// Memory operand for the volatile load from the function-context slot that
// each ISA variant below issues first.
MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*MF, FI),
MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4);
MachineInstrBuilder MIB;
MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
// Add a register mask with no preserved registers. This results in all
// registers being marked as clobbered. This can't work if the dispatch block
// is in a Thumb1 function and is linked with ARM code which uses the FP
// registers, as there is no way to preserve the FP registers in Thumb1 mode.
MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF));
bool IsPositionIndependent = isPositionIndependent();
unsigned NumLPads = LPadList.size();
// Each of the three variants below follows the same outline:
//   DispatchBB:  load a value from the function context, compare it with
//                NumLPads, and branch to TrapBB on condition HI.
//   DispContBB:  form the jump-table address and branch through the table.
if (Subtarget->isThumb2()) {
// --- Thumb2 ---
unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
.addFrameIndex(FI)
.addImm(4)
.addMemOperand(FIMMOLd)
.add(predOps(ARMCC::AL));
if (NumLPads < 256) {
// LPadList.size() is the same value as NumLPads (initialized from it above).
BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
.addReg(NewVReg1)
.addImm(LPadList.size())
.add(predOps(ARMCC::AL));
} else {
// Materialize NumLPads in a register: low half with MOVi16, and the high
// half with MOVTi16 only when it is non-zero.
unsigned VReg1 = MRI->createVirtualRegister(TRC);
BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
.addImm(NumLPads & 0xFFFF)
.add(predOps(ARMCC::AL));
unsigned VReg2 = VReg1;
if ((NumLPads & 0xFFFF0000) != 0) {
VReg2 = MRI->createVirtualRegister(TRC);
BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
.addReg(VReg1)
.addImm(NumLPads >> 16)
.add(predOps(ARMCC::AL));
}
BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
.addReg(NewVReg1)
.addReg(VReg2)
.add(predOps(ARMCC::AL));
}
BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
.addMBB(TrapBB)
.addImm(ARMCC::HI)
.addReg(ARM::CPSR);
unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
.addJumpTableIndex(MJTI)
.add(predOps(ARMCC::AL));
// NewVReg4 = jump-table address + (loaded value << 2).
unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
.addReg(NewVReg3, RegState::Kill)
.addReg(NewVReg1)
.addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
.add(predOps(ARMCC::AL))
.add(condCodeOp());
BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
.addReg(NewVReg4, RegState::Kill)
.addReg(NewVReg1)
.addJumpTableIndex(MJTI);
} else if (Subtarget->isThumb()) {
// --- Thumb (not Thumb2) ---
unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
// NOTE(review): the immediate here is 1 whereas the other two variants use 4;
// presumably tLDRspi takes a word-scaled offset -- confirm.
BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
.addFrameIndex(FI)
.addImm(1)
.addMemOperand(FIMMOLd)
.add(predOps(ARMCC::AL));
if (NumLPads < 256) {
BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
.addReg(NewVReg1)
.addImm(NumLPads)
.add(predOps(ARMCC::AL));
} else {
// NumLPads is placed in the constant pool and loaded for the compare.
MachineConstantPool *ConstantPool = MF->getConstantPool();
Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
// MachineConstantPool wants an explicit alignment.
unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
if (Align == 0)
Align = MF->getDataLayout().getTypeAllocSize(C->getType());
unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
unsigned VReg1 = MRI->createVirtualRegister(TRC);
BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
.addReg(VReg1, RegState::Define)
.addConstantPoolIndex(Idx)
.add(predOps(ARMCC::AL));
BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
.addReg(NewVReg1)
.addReg(VReg1)
.add(predOps(ARMCC::AL));
}
BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
.addMBB(TrapBB)
.addImm(ARMCC::HI)
.addReg(ARM::CPSR);
// NewVReg2 = loaded value << 2.
unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
.addReg(ARM::CPSR, RegState::Define)
.addReg(NewVReg1)
.addImm(2)
.add(predOps(ARMCC::AL));
unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
.addJumpTableIndex(MJTI)
.add(predOps(ARMCC::AL));
// NewVReg4 = NewVReg2 + jump-table address; the table entry is loaded from it.
unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
.addReg(ARM::CPSR, RegState::Define)
.addReg(NewVReg2, RegState::Kill)
.addReg(NewVReg3)
.add(predOps(ARMCC::AL));
MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
.addReg(NewVReg4, RegState::Kill)
.addImm(0)
.addMemOperand(JTMMOLd)
.add(predOps(ARMCC::AL));
// When position independent, the jump-table address is added to the loaded
// entry before branching.
unsigned NewVReg6 = NewVReg5;
if (IsPositionIndependent) {
NewVReg6 = MRI->createVirtualRegister(TRC);
BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
.addReg(ARM::CPSR, RegState::Define)
.addReg(NewVReg5, RegState::Kill)
.addReg(NewVReg3)
.add(predOps(ARMCC::AL));
}
BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
.addReg(NewVReg6, RegState::Kill)
.addJumpTableIndex(MJTI);
} else {
// --- ARM ---
unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
.addFrameIndex(FI)
.addImm(4)
.addMemOperand(FIMMOLd)
.add(predOps(ARMCC::AL));
if (NumLPads < 256) {
BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
.addReg(NewVReg1)
.addImm(NumLPads)
.add(predOps(ARMCC::AL));
} else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
// NOTE(review): if isUInt<16>(NumLPads) means the value fits in 16 bits, the
// upper half is zero and the MOVTi16 branch below looks unreachable -- confirm.
unsigned VReg1 = MRI->createVirtualRegister(TRC);
BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
.addImm(NumLPads & 0xFFFF)
.add(predOps(ARMCC::AL));
unsigned VReg2 = VReg1;
if ((NumLPads & 0xFFFF0000) != 0) {
VReg2 = MRI->createVirtualRegister(TRC);
BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
.addReg(VReg1)
.addImm(NumLPads >> 16)
.add(predOps(ARMCC::AL));
}
BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
.addReg(NewVReg1)
.addReg(VReg2)
.add(predOps(ARMCC::AL));
} else {
// Fall back to loading NumLPads from the constant pool.
MachineConstantPool *ConstantPool = MF->getConstantPool();
Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
// MachineConstantPool wants an explicit alignment.
unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
if (Align == 0)
Align = MF->getDataLayout().getTypeAllocSize(C->getType());
unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
unsigned VReg1 = MRI->createVirtualRegister(TRC);
BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
.addReg(VReg1, RegState::Define)
.addConstantPoolIndex(Idx)
.addImm(0)
.add(predOps(ARMCC::AL));
BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
.addReg(NewVReg1)
.addReg(VReg1, RegState::Kill)
.add(predOps(ARMCC::AL));
}
BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
.addMBB(TrapBB)
.addImm(ARMCC::HI)
.addReg(ARM::CPSR);
// NewVReg3 = loaded value << 2.
unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
.addReg(NewVReg1)
.addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
.add(predOps(ARMCC::AL))
.add(condCodeOp());
unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
.addJumpTableIndex(MJTI)
.add(predOps(ARMCC::AL));
MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
.addReg(NewVReg3, RegState::Kill)
.addReg(NewVReg4)
.addImm(0)
.addMemOperand(JTMMOLd)
.add(predOps(ARMCC::AL));
// The position-independent form also passes the jump-table address to the
// branch pseudo; the other form branches on the loaded entry alone.
if (IsPositionIndependent) {
BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
.addReg(NewVReg5, RegState::Kill)
.addReg(NewVReg4)
.addJumpTableIndex(MJTI);
} else {
BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
.addReg(NewVReg5, RegState::Kill)
.addJumpTableIndex(MJTI);
}
}
// Add the jump table entries as successors to the MBB.
// SeenMBBs ensures a landing pad listed more than once is added only once.
SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
for (std::vector<MachineBasicBlock*>::iterator
I = LPadList.begin(), E = LPadList.end(); I != E; ++I) {
MachineBasicBlock *CurMBB = *I;
if (SeenMBBs.insert(CurMBB).second)
DispContBB->addSuccessor(CurMBB);
}
// N.B. the order the invoke BBs are processed in doesn't matter here.
const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
SmallVector<MachineBasicBlock*, 64> MBBLPads;
for (MachineBasicBlock *BB : InvokeBBs) {
// Remove the landing pad successor from the invoke block and replace it
// with the new dispatch block.
// The successor list is copied first because it is modified while iterating.
SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(),
BB->succ_end());
while (!Successors.empty()) {
MachineBasicBlock *SMBB = Successors.pop_back_val();
if (SMBB->isEHPad()) {
BB->removeSuccessor(SMBB);
MBBLPads.push_back(SMBB);
}
}
BB->addSuccessor(DispatchBB, BranchProbability::getZero());
BB->normalizeSuccProbs();
// Find the invoke call and mark all of the callee-saved registers as
// 'implicit defined' so that they're spilled. This prevents code from
// moving instructions to before the EH block, where they will never be
// executed.
// The block is scanned backwards and only the last call in it is updated.
for (MachineBasicBlock::reverse_iterator
II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
if (!II->isCall()) continue;
// Registers already mentioned as operands of the call are not added again.
DenseMap<unsigned, bool> DefRegs;
for (MachineInstr::mop_iterator
OI = II->operands_begin(), OE = II->operands_end();
OI != OE; ++OI) {
if (!OI->isReg()) continue;
DefRegs[OI->getReg()] = true;
}
MachineInstrBuilder MIB(*MF, &*II);
// SavedRegs is a zero-terminated list; registers outside the classes checked
// for the current ISA mode are skipped.
for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
unsigned Reg = SavedRegs[i];
if (Subtarget->isThumb2() &&
!ARM::tGPRRegClass.contains(Reg) &&
!ARM::hGPRRegClass.contains(Reg))
continue;
if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
continue;
if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
continue;
if (!DefRegs[Reg])
MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
}
break;
}
}
// Mark all former landing pads as non-landing pads. The dispatch is the only
// landing pad now.
for (SmallVectorImpl<MachineBasicBlock*>::iterator
I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I)
(*I)->setIsEHPad(false);
// The instruction is gone now.
MI.eraseFromParent();
}
/// Return the first successor of \p MBB that is not \p Succ. It is a
/// programming error to call this on a block that has no such successor.
static MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB,
                                    MachineBasicBlock *Succ) {
  for (MachineBasicBlock *Candidate : MBB->successors()) {
    if (Candidate == Succ)
      continue;
    return Candidate;
  }
  llvm_unreachable("Expecting a BB with two successors!");
}
/// Map a load width in bytes to the opcode used to perform it.
///
/// Widths of 8 bytes and above select a NEON load regardless of the Thumb
/// flags; otherwise \p IsThumb1 is consulted before \p IsThumb2, and the ARM
/// opcodes are used when neither is set. Returns 0 if no opcode exists for
/// the requested width.
static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
  // NEON-sized accesses.
  if (LdSize >= 8) {
    switch (LdSize) {
    case 16:
      return ARM::VLD1q32wb_fixed;
    case 8:
      return ARM::VLD1d32wb_fixed;
    default:
      return 0;
    }
  }

  if (IsThumb1) {
    switch (LdSize) {
    case 4:
      return ARM::tLDRi;
    case 2:
      return ARM::tLDRHi;
    case 1:
      return ARM::tLDRBi;
    default:
      return 0;
    }
  }

  if (IsThumb2) {
    switch (LdSize) {
    case 4:
      return ARM::t2LDR_POST;
    case 2:
      return ARM::t2LDRH_POST;
    case 1:
      return ARM::t2LDRB_POST;
    default:
      return 0;
    }
  }

  // ARM mode.
  switch (LdSize) {
  case 4:
    return ARM::LDR_POST_IMM;
  case 2:
    return ARM::LDRH_POST;
  case 1:
    return ARM::LDRB_POST_IMM;
  default:
    return 0;
  }
}
/// Map a store width in bytes to the opcode used to perform it.
///
/// Widths of 8 bytes and above select a NEON store regardless of the Thumb
/// flags; otherwise \p IsThumb1 is consulted before \p IsThumb2, and the ARM
/// opcodes are used when neither is set. Returns 0 if no opcode exists for
/// the requested width.
static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
  // NEON-sized accesses.
  if (StSize >= 8) {
    switch (StSize) {
    case 16:
      return ARM::VST1q32wb_fixed;
    case 8:
      return ARM::VST1d32wb_fixed;
    default:
      return 0;
    }
  }

  if (IsThumb1) {
    switch (StSize) {
    case 4:
      return ARM::tSTRi;
    case 2:
      return ARM::tSTRHi;
    case 1:
      return ARM::tSTRBi;
    default:
      return 0;
    }
  }

  if (IsThumb2) {
    switch (StSize) {
    case 4:
      return ARM::t2STR_POST;
    case 2:
      return ARM::t2STRH_POST;
    case 1:
      return ARM::t2STRB_POST;
    default:
      return 0;
    }
  }

  // ARM mode.
  switch (StSize) {
  case 4:
    return ARM::STR_POST_IMM;
  case 2:
    return ARM::STRH_POST;
  case 1:
    return ARM::STRB_POST_IMM;
  default:
    return 0;
  }
}
/// Insert, before \p Pos in \p BB, a load of \p LdSize bytes from \p AddrIn
/// into \p Data, leaving the advanced address in \p AddrOut.
///
/// NEON-sized, Thumb2 and ARM loads use a single writeback instruction.
/// Scalar Thumb1 loads are emitted as a plain load followed by a separate add
/// of \p LdSize to the address.
static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
                       const TargetInstrInfo *TII, const DebugLoc &dl,
                       unsigned LdSize, unsigned Data, unsigned AddrIn,
                       unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
  const unsigned Opcode = getLdOpcode(LdSize, IsThumb1, IsThumb2);
  assert(Opcode != 0 && "Should have a load opcode");

  const bool IsVector = LdSize >= 8;

  // Scalar Thumb1: load, then bump the address with a second instruction.
  if (!IsVector && IsThumb1) {
    BuildMI(*BB, Pos, dl, TII->get(Opcode), Data)
        .addReg(AddrIn)
        .addImm(0)
        .add(predOps(ARMCC::AL));
    BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
        .add(t1CondCodeOp())
        .addReg(AddrIn)
        .addImm(LdSize)
        .add(predOps(ARMCC::AL));
    return;
  }

  // Every remaining form defines Data and AddrOut and reads AddrIn; only the
  // trailing offset operands differ.
  MachineInstrBuilder Load = BuildMI(*BB, Pos, dl, TII->get(Opcode), Data)
                                 .addReg(AddrOut, RegState::Define)
                                 .addReg(AddrIn);
  if (IsVector) {
    Load.addImm(0);
  } else if (IsThumb2) {
    Load.addImm(LdSize);
  } else { // arm
    Load.addReg(0).addImm(LdSize);
  }
  Load.add(predOps(ARMCC::AL));
}
/// Insert, before \p Pos in \p BB, a store of the \p StSize bytes held in
/// \p Data to \p AddrIn, leaving the advanced address in \p AddrOut.
///
/// NEON-sized, Thumb2 and ARM stores use a single writeback instruction.
/// Scalar Thumb1 stores are emitted as a plain store followed by a separate
/// add of \p StSize to the address.
static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
                       const TargetInstrInfo *TII, const DebugLoc &dl,
                       unsigned StSize, unsigned Data, unsigned AddrIn,
                       unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
  const unsigned Opcode = getStOpcode(StSize, IsThumb1, IsThumb2);
  assert(Opcode != 0 && "Should have a store opcode");

  // NEON store: address operands come first, then the data register.
  if (StSize >= 8) {
    BuildMI(*BB, Pos, dl, TII->get(Opcode), AddrOut)
        .addReg(AddrIn)
        .addImm(0)
        .addReg(Data)
        .add(predOps(ARMCC::AL));
    return;
  }

  // Scalar Thumb1: store, then bump the address with a second instruction.
  if (IsThumb1) {
    BuildMI(*BB, Pos, dl, TII->get(Opcode))
        .addReg(Data)
        .addReg(AddrIn)
        .addImm(0)
        .add(predOps(ARMCC::AL));
    BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
        .add(t1CondCodeOp())
        .addReg(AddrIn)
        .addImm(StSize)
        .add(predOps(ARMCC::AL));
    return;
  }

  // Thumb2 and ARM share everything except ARM's extra (zero) register
  // operand ahead of the immediate.
  MachineInstrBuilder Store = BuildMI(*BB, Pos, dl, TII->get(Opcode), AddrOut)
                                  .addReg(Data)
                                  .addReg(AddrIn);
  if (!IsThumb2)
    Store.addReg(0);
  Store.addImm(StSize).add(predOps(ARMCC::AL));
}
/// Expand the struct-byval copy pseudo \p MI into explicit loads and stores.
///
/// The copy unit is chosen from the alignment operand (1, 2, 4, or 8/16 when
/// NEON may be used). Copies no larger than
/// Subtarget->getMaxInlineSizeThreshold() are fully unrolled in \p BB and
/// \p BB is returned. Larger copies become a new loop block followed by a new
/// exit block that receives the rest of \p BB; the exit block is returned.
/// Bytes left over after the unit-sized copies are moved one byte at a time.
/// \p MI is erased in both cases.
MachineBasicBlock *
ARMTargetLowering::EmitStructByval(MachineInstr &MI,
MachineBasicBlock *BB) const {
// This pseudo instruction has 4 operands: dst, src, size, alignment
// We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
// Otherwise, we will generate unrolled scalar copies.
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
// Position just after BB; the loop and exit blocks are inserted here.
MachineFunction::iterator It = ++BB->getIterator();
unsigned dest = MI.getOperand(0).getReg();
unsigned src = MI.getOperand(1).getReg();
unsigned SizeVal = MI.getOperand(2).getImm();
unsigned Align = MI.getOperand(3).getImm();
DebugLoc dl = MI.getDebugLoc();
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
unsigned UnitSize = 0;
const TargetRegisterClass *TRC = nullptr;
const TargetRegisterClass *VecTRC = nullptr;
bool IsThumb1 = Subtarget->isThumb1Only();
bool IsThumb2 = Subtarget->isThumb2();
bool IsThumb = Subtarget->isThumb();
// Pick the copy unit: odd alignment -> bytes, alignment with bit 1 set ->
// halfwords, otherwise words or (when allowed) NEON registers.
if (Align & 1) {
UnitSize = 1;
} else if (Align & 2) {
UnitSize = 2;
} else {
// Check whether we can use NEON instructions.
if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
Subtarget->hasNEON()) {
if ((Align % 16 == 0) && SizeVal >= 16)
UnitSize = 16;
else if ((Align % 8 == 0) && SizeVal >= 8)
UnitSize = 8;
}
// Can't use NEON instructions.
if (UnitSize == 0)
UnitSize = 4;
}
// Select the correct opcode and register class for unit size load/store
bool IsNeon = UnitSize >= 8;
TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
if (IsNeon)
VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
: UnitSize == 8 ? &ARM::DPRRegClass
: nullptr;
// LoopSize is the part of the copy done in UnitSize steps; BytesLeft is the
// remainder, copied byte by byte.
unsigned BytesLeft = SizeVal % UnitSize;
unsigned LoopSize = SizeVal - BytesLeft;
if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
// Use LDR and STR to copy.
// [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
// [destOut] = STR_POST(scratch, destIn, UnitSize)
unsigned srcIn = src;
unsigned destIn = dest;
for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
unsigned srcOut = MRI.createVirtualRegister(TRC);
unsigned destOut = MRI.createVirtualRegister(TRC);
unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
IsThumb1, IsThumb2);
emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
IsThumb1, IsThumb2);
// Chain the updated addresses into the next copy.
srcIn = srcOut;
destIn = destOut;
}
// Handle the leftover bytes with LDRB and STRB.
// [scratch, srcOut] = LDRB_POST(srcIn, 1)
// [destOut] = STRB_POST(scratch, destIn, 1)
for (unsigned i = 0; i < BytesLeft; i++) {
unsigned srcOut = MRI.createVirtualRegister(TRC);
unsigned destOut = MRI.createVirtualRegister(TRC);
unsigned scratch = MRI.createVirtualRegister(TRC);
emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
IsThumb1, IsThumb2);
emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
IsThumb1, IsThumb2);
srcIn = srcOut;
destIn = destOut;
}
MI.eraseFromParent(); // The instruction is gone now.
return BB;
}
// Expand the pseudo op to a loop.
// thisMBB:
// ...
// movw varEnd, # --> with thumb2
// movt varEnd, #
// ldrcp varEnd, idx --> without thumb2
// fallthrough --> loopMBB
// loopMBB:
// PHI varPhi, varEnd, varLoop
// PHI srcPhi, src, srcLoop
// PHI destPhi, dst, destLoop
// [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
// [destLoop] = STR_POST(scratch, destPhi, UnitSize)
// subs varLoop, varPhi, #UnitSize
// bne loopMBB
// fallthrough --> exitMBB
// exitMBB:
// epilogue to handle left-over bytes
// [scratch, srcOut] = LDRB_POST(srcLoop, 1)
// [destOut] = STRB_POST(scratch, destLoop, 1)
MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MF->insert(It, loopMBB);
MF->insert(It, exitMBB);
// Transfer the remainder of BB and its successor edges to exitMBB.
exitMBB->splice(exitMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
// Load an immediate to varEnd.
unsigned varEnd = MRI.createVirtualRegister(TRC);
if (Subtarget->useMovt()) {
// The low half goes straight into varEnd unless a MOVT is also needed, in
// which case a temporary holds the low half and MOVT produces varEnd.
unsigned Vtmp = varEnd;
if ((LoopSize & 0xFFFF0000) != 0)
Vtmp = MRI.createVirtualRegister(TRC);
BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16), Vtmp)
.addImm(LoopSize & 0xFFFF)
.add(predOps(ARMCC::AL));
if ((LoopSize & 0xFFFF0000) != 0)
BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16), varEnd)
.addReg(Vtmp)
.addImm(LoopSize >> 16)
.add(predOps(ARMCC::AL));
} else {
// Without MOVT, LoopSize is loaded from the constant pool.
MachineConstantPool *ConstantPool = MF->getConstantPool();
Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
// MachineConstantPool wants an explicit alignment.
// Note: this Align is a new local that shadows the operand-derived Align
// declared at the top of the function.
unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
if (Align == 0)
Align = MF->getDataLayout().getTypeAllocSize(C->getType());
unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
MachineMemOperand *CPMMO =
MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
MachineMemOperand::MOLoad, 4, 4);
if (IsThumb)
BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
.addReg(varEnd, RegState::Define)
.addConstantPoolIndex(Idx)
.add(predOps(ARMCC::AL))
.addMemOperand(CPMMO);
else
BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
.addReg(varEnd, RegState::Define)
.addConstantPoolIndex(Idx)
.addImm(0)
.add(predOps(ARMCC::AL))
.addMemOperand(CPMMO);
}
BB->addSuccessor(loopMBB);
// Generate the loop body:
// varPhi = PHI(varLoop, varEnd)
// srcPhi = PHI(srcLoop, src)
// destPhi = PHI(destLoop, dst)
MachineBasicBlock *entryBB = BB;
BB = loopMBB;
unsigned varLoop = MRI.createVirtualRegister(TRC);
unsigned varPhi = MRI.createVirtualRegister(TRC);
unsigned srcLoop = MRI.createVirtualRegister(TRC);
unsigned srcPhi = MRI.createVirtualRegister(TRC);
unsigned destLoop = MRI.createVirtualRegister(TRC);
unsigned destPhi = MRI.createVirtualRegister(TRC);
// Each PHI takes the loop-carried value from loopMBB and the initial value
// from entryBB.
BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
.addReg(varLoop).addMBB(loopMBB)
.addReg(varEnd).addMBB(entryBB);
BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
.addReg(srcLoop).addMBB(loopMBB)
.addReg(src).addMBB(entryBB);
BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
.addReg(destLoop).addMBB(loopMBB)
.addReg(dest).addMBB(entryBB);
// [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
// [destLoop] = STR_POST(scratch, destPhi, UnitSize)
unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
IsThumb1, IsThumb2);
emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
IsThumb1, IsThumb2);
// Decrement loop variable by UnitSize.
if (IsThumb1) {
BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
.add(t1CondCodeOp())
.addReg(varPhi)
.addImm(UnitSize)
.add(predOps(ARMCC::AL));
} else {
MachineInstrBuilder MIB =
BuildMI(*BB, BB->end(), dl,
TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
MIB.addReg(varPhi)
.addImm(UnitSize)
.add(predOps(ARMCC::AL))
.add(condCodeOp());
// Turn operand 5 into a definition of CPSR so the subtraction sets the flags
// read by the conditional branch below. (Operand 5 is presumably the
// operand just added by condCodeOp() -- confirm against the opcode layout.)
MIB->getOperand(5).setReg(ARM::CPSR);
MIB->getOperand(5).setIsDef(true);
}
// Branch back to the loop head while the NE condition holds.
BuildMI(*BB, BB->end(), dl,
TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
.addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
// loopMBB can loop back to loopMBB or fall through to exitMBB.
BB->addSuccessor(loopMBB);
BB->addSuccessor(exitMBB);
// Add epilogue to handle BytesLeft.
BB = exitMBB;
// The byte copies are inserted ahead of the instructions spliced into exitMBB.
auto StartOfExit = exitMBB->begin();
// [scratch, srcOut] = LDRB_POST(srcLoop, 1)
// [destOut] = STRB_POST(scratch, destLoop, 1)
unsigned srcIn = srcLoop;
unsigned destIn = destLoop;
for (unsigned i = 0; i < BytesLeft; i++) {
unsigned srcOut = MRI.createVirtualRegister(TRC);
unsigned destOut = MRI.createVirtualRegister(TRC);
unsigned scratch = MRI.createVirtualRegister(TRC);
emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
IsThumb1, IsThumb2);
emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
IsThumb1, IsThumb2);
srcIn = srcOut;
destIn = destOut;
}
MI.eraseFromParent(); // The instruction is gone now.
return BB;
}
/// Expand the Windows stack-probe pseudo \p MI: emit a call to __chkstk
/// (direct for the small, medium and kernel code models, through a register
/// for the large one), then subtract R4 from SP, and erase \p MI. Returns
/// \p MBB unchanged as the block to continue in.
MachineBasicBlock *
ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
                                       MachineBasicBlock *MBB) const {
  const TargetMachine &TM = getTargetMachine();
  const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  assert(Subtarget->isTargetWindows() &&
         "__chkstk is only supported on Windows");
  assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");

  // __chkstk receives the number of words to allocate in R4 and hands back
  // the stack adjustment, in bytes, in R4. Apart from the obvious lr it does
  // not clobber any other register.
  //
  // IP could technically be clobbered, but the call itself never touches it:
  // Windows on ARM is pure Thumb-2, so no interworking veneer is expected
  // from the linker. Every module also carries its own copy of __chkstk, so
  // no import thunk is involved either.
  //
  // Some linkers may insert a trampoline for out-of-range calls (common given
  // the 32M Thumb branch range). Building with -mcmodel=large selects the
  // long-call sequence below instead, which avoids a trampoline that could
  // clobber IP.

  // Both call forms end with the same implicit operands: R4 consumed and
  // redefined, R12 and CPSR defined but dead.
  auto AddProbeRegs = [](const MachineInstrBuilder &Call) {
    Call.addReg(ARM::R4, RegState::Implicit | RegState::Kill)
        .addReg(ARM::R4, RegState::Implicit | RegState::Define)
        .addReg(ARM::R12,
                RegState::Implicit | RegState::Define | RegState::Dead)
        .addReg(ARM::CPSR,
                RegState::Implicit | RegState::Define | RegState::Dead);
  };

  switch (TM.getCodeModel()) {
  case CodeModel::Tiny:
    llvm_unreachable("Tiny code model not available on ARM.");
  case CodeModel::Small:
  case CodeModel::Medium:
  case CodeModel::Kernel:
    // Direct branch-and-link to the symbol.
    AddProbeRegs(BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
                     .add(predOps(ARMCC::AL))
                     .addExternalSymbol("__chkstk"));
    break;
  case CodeModel::Large: {
    // Materialize the address in a register and call through it.
    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
    unsigned Target = MRI.createVirtualRegister(&ARM::rGPRRegClass);
    BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Target)
        .addExternalSymbol("__chkstk");
    AddProbeRegs(BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr))
                     .add(predOps(ARMCC::AL))
                     .addReg(Target, RegState::Kill));
    break;
  }
  }

  // SP -= R4, tagged as frame setup.
  BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
      .addReg(ARM::SP, RegState::Kill)
      .addReg(ARM::R4, RegState::Kill)
      .setMIFlags(MachineInstr::FrameSetup)
      .add(predOps(ARMCC::AL))
      .add(condCodeOp());

  MI.eraseFromParent();
  return MBB;
}
/// Expand the divide-by-zero check pseudo \p MI. The instructions after
/// \p MI move into a new continuation block, a new trap block holding
/// t__brkdiv0 is appended to the function, and \p MBB ends with a compare of
/// operand 0 against zero plus a branch to the trap block on EQ. \p MI is
/// erased and the continuation block is returned.
MachineBasicBlock *
ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
                                       MachineBasicBlock *MBB) const {
  DebugLoc Loc = MI.getDebugLoc();
  MachineFunction *Fn = MBB->getParent();
  const TargetInstrInfo *InstrInfo = Subtarget->getInstrInfo();

  // Split MBB after MI: everything that follows, together with the outgoing
  // edges, is handed over to the continuation block.
  MachineBasicBlock *Continue = Fn->CreateMachineBasicBlock();
  MachineFunction::iterator AfterMBB = std::next(MBB->getIterator());
  Fn->insert(AfterMBB, Continue);
  MachineBasicBlock::iterator FirstMoved =
      std::next(MachineBasicBlock::iterator(MI));
  Continue->splice(Continue->begin(), MBB, FirstMoved, MBB->end());
  Continue->transferSuccessorsAndUpdatePHIs(MBB);
  MBB->addSuccessor(Continue);

  // The trap block goes at the very end of the function.
  MachineBasicBlock *Trap = Fn->CreateMachineBasicBlock();
  BuildMI(Trap, Loc, InstrInfo->get(ARM::t__brkdiv0));
  Fn->push_back(Trap);
  MBB->addSuccessor(Trap);

  // cmp <operand 0>, #0 ; beq Trap
  const unsigned Checked = MI.getOperand(0).getReg();
  BuildMI(*MBB, MI, Loc, InstrInfo->get(ARM::tCMPi8))
      .addReg(Checked)
      .addImm(0)
      .add(predOps(ARMCC::AL));
  BuildMI(*MBB, MI, Loc, InstrInfo->get(ARM::t2Bcc))
      .addMBB(Trap)
      .addImm(ARMCC::EQ)
      .addReg(ARM::CPSR);

  MI.eraseFromParent();
  return Continue;
}
// ISel may leave the CPSR operand of SelectItr without a kill marker when
// CPSR had several uses and it could not tell which one was last. Decide
// whether SelectItr is in fact the last reader of CPSR; if so, add the kill
// marker to it. Returns true when the marker was added and false when CPSR
// is still read later in the block or is live into a successor.
static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr,
                                   MachineBasicBlock* BB,
                                   const TargetRegisterInfo* TRI) {
  // Walk the instructions after SelectItr until CPSR is read or redefined.
  MachineBasicBlock::iterator Cursor = std::next(SelectItr);
  const MachineBasicBlock::iterator BlockEnd = BB->end();
  while (Cursor != BlockEnd) {
    if (Cursor->readsRegister(ARM::CPSR))
      return false; // A later reader exists, so no kill on SelectItr.
    if (Cursor->definesRegister(ARM::CPSR))
      break; // Redefined before any read: the kill flag belongs here.
    ++Cursor;
  }

  // Nothing in the rest of the block touched CPSR, so it is only dead if no
  // successor has it as a live-in.
  if (Cursor == BlockEnd) {
    for (MachineBasicBlock *Successor : BB->successors()) {
      if (Successor->isLiveIn(ARM::CPSR))
        return false;
    }
  }

  // Either a redefinition was found or CPSR is not live out of the block.
  SelectItr->addRegisterKilled(ARM::CPSR, TRI);
  return true;
}
/// Expand the pseudo instruction \p MI, which lives in \p BB, into real
/// machine instructions, dispatching on MI's opcode.
///
/// Depending on the opcode the expansion either rewrites MI in place
/// (setDesc), replaces it with newly built instructions, splits \p BB into a
/// diamond of blocks joined by a PHI, or forwards to a dedicated helper
/// (EmitSjLjDispatchBlock, EmitStructByval, EmitLowered__chkstk,
/// EmitLowered__dbzchk).
///
/// \returns \p BB when the block is not split; the newly created sink block
/// (which received every instruction that followed MI) for tMOVCCr_pseudo and
/// ABS/t2ABS; otherwise whatever the delegated helper returns.  An opcode that
/// is not handled prints MI to errs() and hits llvm_unreachable.
MachineBasicBlock *
ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
DebugLoc dl = MI.getDebugLoc();
bool isThumb2 = Subtarget->isThumb2();
switch (MI.getOpcode()) {
default: {
// Dump the offending instruction before aborting so the failure is
// diagnosable.
MI.print(errs());
llvm_unreachable("Unexpected instr type to insert");
}
// Thumb1 post-indexed loads are really just single-register LDMs.
case ARM::tLDR_postidx: {
// Copy operand 1 before building the replacement; it is re-added as the
// first operand (Rn_wb) of the tLDMIA_UPD.
MachineOperand Def(MI.getOperand(1));
BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
.add(Def) // Rn_wb
.add(MI.getOperand(2)) // Rn
.add(MI.getOperand(3)) // PredImm
.add(MI.getOperand(4)) // PredReg
.add(MI.getOperand(0)) // Rt
.cloneMemRefs(MI);
MI.eraseFromParent();
return BB;
}
// The Thumb2 pre-indexed stores have the same MI operands, they just
// define them differently in the .td files from the isel patterns, so
// they need pseudos.
// Only the instruction descriptor is swapped; the operands stay as they are.
case ARM::t2STR_preidx:
MI.setDesc(TII->get(ARM::t2STR_PRE));
return BB;
case ARM::t2STRB_preidx:
MI.setDesc(TII->get(ARM::t2STRB_PRE));
return BB;
case ARM::t2STRH_preidx:
MI.setDesc(TII->get(ARM::t2STRH_PRE));
return BB;
case ARM::STRi_preidx:
case ARM::STRBi_preidx: {
unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
: ARM::STRB_PRE_IMM;
// Decode the offset.
// Operand 4 packs both the add/sub direction (getAM2Op) and the offset
// magnitude (getAM2Offset); a subtract is re-expressed by negating the
// magnitude.
unsigned Offset = MI.getOperand(4).getImm();
bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
Offset = ARM_AM::getAM2Offset(Offset);
if (isSub)
Offset = -Offset;
// NOTE(review): only the first memory operand is carried over, and
// memoperands_begin() is dereferenced without an emptiness check --
// presumably these pseudos always have exactly one; confirm.
MachineMemOperand *MMO = *MI.memoperands_begin();
BuildMI(*BB, MI, dl, TII->get(NewOpc))
.add(MI.getOperand(0)) // Rn_wb
.add(MI.getOperand(1)) // Rt
.add(MI.getOperand(2)) // Rn
.addImm(Offset) // offset (skip GPR==zero_reg)
.add(MI.getOperand(5)) // pred
.add(MI.getOperand(6))
.addMemOperand(MMO);
MI.eraseFromParent();
return BB;
}
case ARM::STRr_preidx:
case ARM::STRBr_preidx:
case ARM::STRH_preidx: {
// Same operand list as the pseudo; only the opcode changes.
unsigned NewOpc;
switch (MI.getOpcode()) {
default: llvm_unreachable("unexpected opcode!");
case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
}
MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
for (unsigned i = 0; i < MI.getNumOperands(); ++i)
MIB.add(MI.getOperand(i));
MI.eraseFromParent();
return BB;
}
case ARM::tMOVCCr_pseudo: {
// To "insert" a SELECT_CC instruction, we actually have to insert the
// diamond control-flow pattern. The incoming instruction knows the
// destination vreg to set, the condition code register to branch on, the
// true/false values to select between, and a branch opcode to use.
const BasicBlock *LLVM_BB = BB->getBasicBlock();
MachineFunction::iterator It = ++BB->getIterator();
// thisMBB:
// ...
// TrueVal = ...
// cmpTY ccX, r1, r2
// bCC copy1MBB
// fallthrough --> copy0MBB
MachineBasicBlock *thisMBB = BB;
MachineFunction *F = BB->getParent();
MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
// Both new blocks are placed directly after BB, copy0MBB first.
F->insert(It, copy0MBB);
F->insert(It, sinkMBB);
// Check whether CPSR is live past the tMOVCCr_pseudo.
// If it is, both new blocks must list CPSR as a live-in.
const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
if (!MI.killsRegister(ARM::CPSR) &&
!checkAndUpdateCPSRKill(MI, thisMBB, TRI)) {
copy0MBB->addLiveIn(ARM::CPSR);
sinkMBB->addLiveIn(ARM::CPSR);
}
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
BB->addSuccessor(copy0MBB);
BB->addSuccessor(sinkMBB);
// Conditional branch to sinkMBB using the condition code (operand 3) and
// condition register (operand 4) taken from the pseudo.
BuildMI(BB, dl, TII->get(ARM::tBcc))
.addMBB(sinkMBB)
.addImm(MI.getOperand(3).getImm())
.addReg(MI.getOperand(4).getReg());
// copy0MBB:
// %FalseValue = ...
// # fallthrough to sinkMBB
BB = copy0MBB;
// Update machine-CFG edges
BB->addSuccessor(sinkMBB);
// sinkMBB:
// %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
// ...
BB = sinkMBB;
BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
.addReg(MI.getOperand(1).getReg())
.addMBB(copy0MBB)
.addReg(MI.getOperand(2).getReg())
.addMBB(thisMBB);
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
case ARM::BCCi64:
case ARM::BCCZi64: {
// If there is an unconditional branch to the other successor, remove it.
// (Everything after MI up to the end of the block is erased.)
BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());
// Compare both parts that make up the double comparison separately for
// equality.
// The first compare is built with predOps(ARMCC::AL); the second carries
// ARMCC::EQ / CPSR in the same operand slots, so presumably it only
// executes when the first compare found equality.
bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
unsigned LHS1 = MI.getOperand(1).getReg();
unsigned LHS2 = MI.getOperand(2).getReg();
if (RHSisZero) {
BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
.addReg(LHS1)
.addImm(0)
.add(predOps(ARMCC::AL));
BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
.addReg(LHS2).addImm(0)
.addImm(ARMCC::EQ).addReg(ARM::CPSR);
} else {
unsigned RHS1 = MI.getOperand(3).getReg();
unsigned RHS2 = MI.getOperand(4).getReg();
BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
.addReg(LHS1)
.addReg(RHS1)
.add(predOps(ARMCC::AL));
BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
.addReg(LHS2).addReg(RHS2)
.addImm(ARMCC::EQ).addReg(ARM::CPSR);
}
// The destination block is operand 3 for the compare-with-zero form and
// operand 5 otherwise.  The emitted branch always tests EQ, so an NE
// request is handled by swapping the taken and not-taken targets.
MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
if (MI.getOperand(0).getImm() == ARMCC::NE)
std::swap(destMBB, exitMBB);
BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
.addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
if (isThumb2)
BuildMI(BB, dl, TII->get(ARM::t2B))
.addMBB(exitMBB)
.add(predOps(ARMCC::AL));
else
BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
// The setjmp pseudos are left untouched here.
case ARM::Int_eh_sjlj_setjmp:
case ARM::Int_eh_sjlj_setjmp_nofp:
case ARM::tInt_eh_sjlj_setjmp:
case ARM::t2Int_eh_sjlj_setjmp:
case ARM::t2Int_eh_sjlj_setjmp_nofp:
return BB;
case ARM::Int_eh_sjlj_setup_dispatch:
EmitSjLjDispatchBlock(MI, BB);
return BB;
case ARM::ABS:
case ARM::t2ABS: {
// To insert an ABS instruction, we have to insert the
// diamond control-flow pattern. The incoming instruction knows the
// source vreg to test against 0 and the destination vreg to set.
// It transforms
// V1 = ABS V0
// into
// CMP V0, 0
// BCC (branch to SinkBB with the opposite condition of MI)
// RSBBB: V2 = RSBri V0, 0 (reached only when the branch is not taken)
// SinkBB: V1 = PHI(V2 from RSBBB, V0 from BB)
const BasicBlock *LLVM_BB = BB->getBasicBlock();
MachineFunction::iterator BBI = ++BB->getIterator();
MachineFunction *Fn = BB->getParent();
MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB);
Fn->insert(BBI, RSBBB);
Fn->insert(BBI, SinkBB);
unsigned int ABSSrcReg = MI.getOperand(1).getReg();
unsigned int ABSDstReg = MI.getOperand(0).getReg();
bool ABSSrcKIll = MI.getOperand(1).isKill();
// NOTE(review): this shadows the function-level isThumb2 with the same
// value.
bool isThumb2 = Subtarget->isThumb2();
MachineRegisterInfo &MRI = Fn->getRegInfo();
// In Thumb mode S must not be specified if source register is the SP or
// PC and if destination register is the SP, so restrict register class
unsigned NewRsbDstReg =
MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);
// Transfer the remainder of BB and its successor edges to sinkMBB.
SinkBB->splice(SinkBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
SinkBB->transferSuccessorsAndUpdatePHIs(BB);
BB->addSuccessor(RSBBB);
BB->addSuccessor(SinkBB);
// fall through to SinkMBB
RSBBB->addSuccessor(SinkBB);
// insert a cmp at the end of BB
BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
.addReg(ABSSrcReg)
.addImm(0)
.add(predOps(ARMCC::AL));
// insert a bcc with opposite CC to ARMCC::MI at the end of BB
BuildMI(BB, dl,
TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
.addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR);
// insert rsbri in RSBBB
// Note: BCC and rsbri will be converted into predicated rsbmi
// by if-conversion pass
// The source's kill flag, if it had one, moves onto the RSB's use.
BuildMI(*RSBBB, RSBBB->begin(), dl,
TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
.addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0)
.addImm(0)
.add(predOps(ARMCC::AL))
.add(condCodeOp());
// insert PHI in SinkBB,
// reuse ABSDstReg to not change uses of ABS instruction
BuildMI(*SinkBB, SinkBB->begin(), dl,
TII->get(ARM::PHI), ABSDstReg)
.addReg(NewRsbDstReg).addMBB(RSBBB)
.addReg(ABSSrcReg).addMBB(BB);
// remove ABS instruction
MI.eraseFromParent();
// return last added BB
return SinkBB;
}
case ARM::COPY_STRUCT_BYVAL_I32:
// Count the expansion in the NumLoopByVals counter, then delegate.
++NumLoopByVals;
return EmitStructByval(MI, BB);
case ARM::WIN__CHKSTK:
return EmitLowered__chkstk(MI, BB);
case ARM::WIN__DBZCHK:
return EmitLowered__dbzchk(MI, BB);
}
}
/// Attaches vregs to MEMCPY that it will use as scratch registers
/// when it is expanded into LDM/STM.  This is done as a post-isel lowering
/// instead of as a custom inserter because we need the use list from the
/// SDNode.
///
/// \param Subtarget  Selects the scratch register class: tGPR when
///                   isThumb1Only() is true, GPR otherwise.
/// \param MI         The MEMCPY instruction.  Operands 0 and 1 may be marked
///                   dead, and operand 4 holds the number of scratch
///                   registers to append.
/// \param Node       The SDNode \p MI was selected from; queried for uses of
///                   its result values 0 and 1.
static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
                                    MachineInstr &MI, const SDNode *Node) {
  const bool isThumb1 = Subtarget->isThumb1Only();

  MachineFunction *MF = MI.getParent()->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineInstrBuilder MIB(*MF, MI);

  // If the new dst/src is unused mark it as dead.
  if (!Node->hasAnyUseOfValue(0))
    MI.getOperand(0).setIsDead(true);
  if (!Node->hasAnyUseOfValue(1))
    MI.getOperand(1).setIsDead(true);

  // Read the scratch count once, before any operand is appended; the loop
  // below grows MI's operand list but never changes operand 4.
  const int64_t NumScratchRegs = MI.getOperand(4).getImm();

  // The MEMCPY both defines and kills the scratch registers.
  for (int64_t I = 0; I != NumScratchRegs; ++I) {
    unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
                                                         : &ARM::GPRRegClass);
    MIB.addReg(TmpReg, RegState::Define | RegState::Dead);
  }
}
/// Fix up \p MI after instruction selection, using information from the
/// SDNode \p Node.
///
/// Two things happen here:
///  * A MEMCPY gets its scratch registers attached (attachMEMCPYScratchRegs)
///    and nothing else is done.
///  * For instructions with an optional cc_out operand, the implicit CPSR def
///    that MI carries beyond its descriptor's operands is removed and, when
///    the flags are needed (or always on Thumb1), the optional operand is set
///    to CPSR instead.  Flag-setting pseudo opcodes are first renamed through
///    convertAddSubFlagsOpcode.
void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const {
if (MI.getOpcode() == ARM::MEMCPY) {
attachMEMCPYScratchRegs(Subtarget, MI, Node);
return;
}
const MCInstrDesc *MCID = &MI.getDesc();
// Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
// RSC. Coming out of isel, they have an implicit CPSR def, but the optional
// operand is still set to noreg. If needed, set the optional operand's
// register to CPSR, and remove the redundant implicit def.
//
// e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR).
// Rename pseudo opcodes.
// A zero result from convertAddSubFlagsOpcode means there is nothing to
// rename and MI keeps its current descriptor.
unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
// Index of the optional cc_out operand; assigned on every path below.
unsigned ccOutIdx;
if (NewOpc) {
const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
MCID = &TII->get(NewOpc);
assert(MCID->getNumOperands() ==
MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()
&& "converted opcode should be the same except for cc_out"
" (and, on Thumb1, pred)");
MI.setDesc(*MCID);
// Add the optional cc_out operand
MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
// On Thumb1, move all input operands to the end, then add the predicate
if (Subtarget->isThumb1Only()) {
// Each iteration re-appends a copy of operand 1 and removes the original,
// rotating the operands that follow operand 0.
for (unsigned c = MCID->getNumOperands() - 4; c--;) {
MI.addOperand(MI.getOperand(1));
MI.RemoveOperand(1);
}
// Restore the ties
for (unsigned i = MI.getNumOperands(); i--;) {
const MachineOperand& op = MI.getOperand(i);
if (op.isReg() && op.isUse()) {
int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);
if (DefIdx != -1)
MI.tieOperands(DefIdx, i);
}
}
// Append the predicate: condition ARMCC::AL with no predicate register.
MI.addOperand(MachineOperand::CreateImm(ARMCC::AL));
MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));
// After the rotation cc_out sits at index 1.
ccOutIdx = 1;
} else
ccOutIdx = MCID->getNumOperands() - 1;
} else
ccOutIdx = MCID->getNumOperands() - 1;
// Any ARM instruction that sets the 's' bit should specify an optional
// "cc_out" operand in the last operand position.
if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) {
assert(!NewOpc && "Optional cc_out operand required");
return;
}
// Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
// since we already have an optional CPSR def.
// Only operands past the descriptor's declared operand count are scanned.
bool definesCPSR = false;
bool deadCPSR = false;
for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
++i) {
const MachineOperand &MO = MI.getOperand(i);
if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
definesCPSR = true;
if (MO.isDead())
deadCPSR = true;
// Removing the operand invalidates the loop bounds, hence the break.
MI.RemoveOperand(i);
break;
}
}
if (!definesCPSR) {
assert(!NewOpc && "Optional cc_out operand required");
return;
}
assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
if (deadCPSR) {
assert(!MI.getOperand(ccOutIdx).getReg() &&
"expect uninitialized optional cc_out operand");
// Thumb1 instructions must have the S bit even if the CPSR is dead.
if (!Subtarget->isThumb1Only())
return;
}
// If this instruction was defined with an optional CPSR def and its dag node
// had a live implicit CPSR def, then activate the optional CPSR def.
MachineOperand &MO = MI.getOperand(ccOutIdx);
MO.setReg(ARM::CPSR);
MO.setIsDef(true);
}
//===----------------------------------------------------------------------===//
// ARM Optimization Hooks
//===----------------------------------------------------------------------===//
// Tests N against a single constant selected by AllOnes: the all-ones
// constant when AllOnes is set, the null constant otherwise.
static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
  if (AllOnes)
    return isAllOnesConstant(N);
  return isNullConstant(N);
}
// Returns true when N evaluates, under some condition, to the identity
// constant requested by AllOnes (0 when AllOnes is false, -1 when it is true).
//
// Recognized shapes, where cc is an i1 value:
//
//   (select cc 0, y)   [AllOnes=0]
//   (select cc y, 0)   [AllOnes=0]
//   (zext cc)          [AllOnes=0]
//   (sext cc)          [AllOnes=0/1]
//   (select cc -1, y)  [AllOnes=1]
//   (select cc y, -1)  [AllOnes=1]
//
// On success:
//   CC      is the condition,
//   Invert  is true when N is the identity constant for a *false* CC,
//   OtherOp is the value N takes in the remaining case.
static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
                                       SDValue &CC, bool &Invert,
                                       SDValue &OtherOp,
                                       SelectionDAG &DAG) {
  const unsigned Opc = N->getOpcode();

  if (Opc == ISD::SELECT) {
    CC = N->getOperand(0);
    SDValue TrueArm = N->getOperand(1);
    SDValue FalseArm = N->getOperand(2);
    // The true arm is tried first.
    if (isZeroOrAllOnes(TrueArm, AllOnes)) {
      Invert = false;
      OtherOp = FalseArm;
      return true;
    }
    if (isZeroOrAllOnes(FalseArm, AllOnes)) {
      Invert = true;
      OtherOp = TrueArm;
      return true;
    }
    return false;
  }

  const bool IsZExt = Opc == ISD::ZERO_EXTEND;
  if (!IsZExt && Opc != ISD::SIGN_EXTEND)
    return false;

  // (zext cc) can never be the all ones value.
  if (IsZExt && AllOnes)
    return false;

  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  CC = N->getOperand(0);
  // Only an i1 produced directly by a SETCC is accepted as the condition.
  if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
    return false;

  Invert = !AllOnes;
  if (AllOnes) {
    // Looking for -1 means N is a sext; its other value is 0.
    OtherOp = DAG.getConstant(0, dl, VT);
  } else if (IsZExt) {
    // Looking for 0 in a zext: the other value is 1.
    OtherOp = DAG.getConstant(1, dl, VT);
  } else {
    // Looking for 0 in a sext: the other value is all ones.
    OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl,
                              VT);
  }
  return true;
}
// Fold a select with a constant arm into the binary operator that uses it:
//
//   (add (select cc, 0, c), x)  -> (select cc, x, (add, x, c))
//   (sub x, (select cc, 0, c))  -> (select cc, x, (sub, x, c))
//   (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))  [AllOnes=1]
//   (or  (select cc, 0, c), x)  -> (select cc, x, (or, x, c))
//   (xor (select cc, 0, c), x)  -> (select cc, x, (xor, x, c))
//
// Nothing is done unless the select has a constant arm that is null, or all
// ones when AllOnes is set.
//
// Extensions from i1 are matched as well:
//
//   (add (zext cc), x) -> (select cc (add x, 1), x)
//   (add (sext cc), x) -> (select cc (add x, -1), x)
//
// These transformations eventually create predicated instructions.
//
// @param N       The node to transform.
// @param Slct    The N operand that is a select.
// @param OtherOp The other N operand (x above).
// @param DCI     Context.
// @param AllOnes Require the select constant to be all ones instead of null.
// @returns The new node, or SDValue() on failure.
static
SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
                            TargetLowering::DAGCombinerInfo &DCI,
                            bool AllOnes = false) {
  SelectionDAG &DAG = DCI.DAG;

  SDValue Cond;
  SDValue VariableArm;
  bool IdentityWhenFalse;
  if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, Cond,
                                  IdentityWhenFalse, VariableArm, DAG))
    return SDValue();

  const EVT VT = N->getValueType(0);
  const SDLoc DL(N);

  // Where Slct is the identity constant the operator just yields OtherOp;
  // everywhere else it has to be evaluated against the select's other arm.
  SDValue Applied = DAG.getNode(N->getOpcode(), DL, VT, OtherOp, VariableArm);

  // By default the identity sits on the true side of the condition; the flag
  // reported by the matcher flips that.
  if (IdentityWhenFalse)
    return DAG.getNode(ISD::SELECT, DL, VT, Cond, Applied, OtherOp);
  return DAG.getNode(ISD::SELECT, DL, VT, Cond, OtherOp, Applied);
}
// Runs combineSelectAndUse on a commutative operator N, trying operand 0 as
// the select first and operand 1 second.  An operand is only considered when
// its node has a single use.  Returns SDValue() if neither attempt succeeds.
static
SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
                                       TargetLowering::DAGCombinerInfo &DCI) {
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  if (LHS.getNode()->hasOneUse()) {
    SDValue Folded = combineSelectAndUse(N, LHS, RHS, DCI, AllOnes);
    if (Folded)
      return Folded;
  }

  if (RHS.getNode()->hasOneUse()) {
    SDValue Folded = combineSelectAndUse(N, RHS, LHS, DCI, AllOnes);
    if (Folded)
      return Folded;
  }

  return SDValue();
}
// True for an ARMISD::VUZP node, and for an ARMISD::VTRN node whose first
// result is v2i32 ("VUZP" on i32 is an alias for VTRN).
static bool IsVUZPShuffleNode(SDNode *N) {
  const unsigned Opc = N->getOpcode();
  return Opc == ARMISD::VUZP ||
         (Opc == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32);
}
// Turns ADD(VUZP.0, VUZP.1) -- an add of two different results of one unzip
// node -- into an arm_neon_vpadd intrinsic applied to the unzip's two inputs.
// Only 64-bit vector adds are handled.  Returns SDValue() when the pattern
// does not match.
static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  SDNode *Unzip = N0.getNode();

  // Both operands must come from the same unzip node but be distinct results
  // of it.
  const bool IsUnzipPair =
      IsVUZPShuffleNode(Unzip) && Unzip == N1.getNode() && N0 != N1;
  if (!IsUnzipPair)
    return SDValue();

  // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
  EVT VT = N->getValueType(0);
  if (!VT.is64BitVector())
    return SDValue();

  // Emit the intrinsic call: intrinsic ID followed by the unzip's operands.
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDLoc dl(N);
  SDValue Ops[] = {
      DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
                      TLI.getPointerTy(DAG.getDataLayout())),
      Unzip->getOperand(0),
      Unzip->getOperand(1)};
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
}
// Turns ADD(EXT(VUZP.0), EXT(VUZP.1)), where EXT is the same sign- or
// zero-extension on both sides, into an arm_neon_vpaddls / arm_neon_vpaddlu
// intrinsic applied to the concatenation of the unzip's two inputs.
// Returns SDValue() when the pattern does not match.
static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
  // Both operands have to be extensions of the same kind.
  const unsigned ExtOpc = N0.getOpcode();
  if (ExtOpc != N1.getOpcode() ||
      (ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND))
    return SDValue();

  // The extended values must be two different results of one unzip node.
  SDValue Src0 = N0.getOperand(0);
  SDValue Src1 = N1.getOperand(0);
  if (!IsVUZPShuffleNode(Src0.getNode()) ||
      Src0.getNode() != Src1.getNode() || Src0 == Src1)
    return SDValue();

  // We only recognize Q register paddl here; this can't be reached until
  // after type legalization.
  if (!Src0.getValueType().is64BitVector() ||
      !N0.getValueType().is128BitVector())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDLoc dl(N);
  EVT VT = N->getValueType(0);

  // vpaddl.sN for sign extension, vpaddl.uN for zero extension.
  const unsigned IntrinsicID = ExtOpc == ISD::SIGN_EXTEND
                                   ? Intrinsic::arm_neon_vpaddls
                                   : Intrinsic::arm_neon_vpaddlu;
  SDValue IntrinsicOp =
      DAG.getConstant(IntrinsicID, dl, TLI.getPointerTy(DAG.getDataLayout()));

  // Glue the unzip's two inputs back into one vector with the narrow element
  // type and twice as many lanes as the result.
  EVT NarrowEltVT = Src0.getValueType().getVectorElementType();
  unsigned ResultLanes = VT.getVectorNumElements();
  EVT WholeVT =
      EVT::getVectorVT(*DAG.getContext(), NarrowEltVT, ResultLanes * 2);
  SDValue Whole = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), WholeVT,
                              Src0.getOperand(0), Src0.getOperand(1));

  SDValue Ops[] = {IntrinsicOp, Whole};
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
}
// FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
// an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
// much easier to match.
//
// Matches ADD(BUILD_VECTOR, BUILD_VECTOR) where lane i of the first operand
// extracts element 2*i and lane i of the second extracts element 2*i+1 of one
// common source vector, and rewrites it as an arm_neon_vpaddls intrinsic on
// that vector followed by an any-extend or truncate to the add's type.
// Returns SDValue() when the pattern does not match.
static SDValue
AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const ARMSubtarget *Subtarget) {
  // Only after legalization and only with NEON.
  if (DCI.isBeforeLegalize() || !Subtarget->hasNEON())
    return SDValue();
  // Both addends must be BUILD_VECTORs.
  if (N0.getOpcode() != ISD::BUILD_VECTOR ||
      N1.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();

  // Check output type since VPADDL operand elements can only be 8, 16, or 32.
  EVT VT = N->getValueType(0);
  if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
    return SDValue();

  // The first lane of N0 names the vector every extract must read from.
  if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return SDValue();
  SDValue Vec = N0->getOperand(0)->getOperand(0);
  SDNode *SrcNode = Vec.getNode();

  // Lane by lane: N0 must extract the even element, N1 the odd element that
  // follows it, both with constant indices and both from SrcNode.
  unsigned ExpectedEven = 0;
  for (unsigned Lane = 0, NumLanes = N0->getNumOperands(); Lane != NumLanes;
       ++Lane, ExpectedEven += 2) {
    if (N0->getOperand(Lane)->getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        N1->getOperand(Lane)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    SDValue EvenElt = N0->getOperand(Lane);
    SDValue OddElt = N1->getOperand(Lane);

    // Same source vector on both sides.
    if (SrcNode != EvenElt->getOperand(0).getNode() ||
        SrcNode != OddElt->getOperand(0).getNode())
      return SDValue();

    // Constant indices forming the pair (2*Lane, 2*Lane + 1).
    ConstantSDNode *EvenIdx = dyn_cast<ConstantSDNode>(EvenElt->getOperand(1));
    ConstantSDNode *OddIdx = dyn_cast<ConstantSDNode>(OddElt->getOperand(1));
    if (!EvenIdx || !OddIdx)
      return SDValue();
    if (EvenIdx->getZExtValue() != ExpectedEven ||
        OddIdx->getZExtValue() != ExpectedEven + 1)
      return SDValue();
  }

  // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
  // we're using the entire input vector, otherwise there's a size/legality
  // mismatch somewhere.
  EVT SrcVT = Vec.getValueType();
  if (ExpectedEven != SrcVT.getVectorNumElements() ||
      SrcVT.getVectorElementType() == VT.getVectorElementType())
    return SDValue();

  // Build the VPADDL intrinsic: intrinsic ID, then the source vector.
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDLoc dl(N);
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
                                TLI.getPointerTy(DAG.getDataLayout())));
  Ops.push_back(Vec);

  // The intrinsic's result has the add's lane count with elements twice as
  // wide as the source's.
  MVT WideEltVT;
  switch (SrcVT.getVectorElementType().getSimpleVT().SimpleTy) {
  case MVT::i8:
    WideEltVT = MVT::i16;
    break;
  case MVT::i16:
    WideEltVT = MVT::i32;
    break;
  case MVT::i32:
    WideEltVT = MVT::i64;
    break;
  default:
    llvm_unreachable("Invalid vector element type for padd optimization.");
  }
  MVT widenType = MVT::getVectorVT(WideEltVT, VT.getVectorNumElements());

  SDValue Pairwise = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);

  // Bring the result to the add's type: extend if the add is wider than the
  // intrinsic result, truncate otherwise.
  unsigned AdjustOpc =
      VT.bitsGT(Pairwise.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
  return DAG.getNode(AdjustOpc, dl, VT, Pairwise);
}
// Returns V itself when it is an ISD::UMUL_LOHI or ISD::SMUL_LOHI value, and
// an empty SDValue otherwise.
static SDValue findMUL_LOHI(SDValue V) {
  switch (V->getOpcode()) {
  case ISD::UMUL_LOHI:
  case ISD::SMUL_LOHI:
    return V;
  default:
    return SDValue();
  }
}
// Matches a 64-bit accumulate of a 16x16-bit signed product,
//
//   AddcNode: (addc (mul a, b), lo)
//   AddeNode: (adde (sra (mul a, b), 31), hi)
//
// and replaces the pair with one of ARMISD::SMLALBB / SMLALBT / SMLALTB /
// SMLALTT, chosen by whether each multiplicand passes isS16 or isSRA16.
// Requires DSP on Thumb and v5TE otherwise.
//
// Returns SDValue(AddcNode, 0) after rewriting the uses, or SDValue() when
// nothing matched.
static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        const ARMSubtarget *Subtarget) {
  const bool HasSMLALxy = Subtarget->isThumb() ? Subtarget->hasDSP()
                                               : Subtarget->hasV5TEOps();
  if (!HasSMLALxy)
    return SDValue();

  // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
  // accumulates the product into a 64-bit value. The 16-bit values will
  // be sign extended somehow or SRA'd into 32-bit values
  // (addc (adde (mul 16bit, 16bit), lo), hi)

  // The low add: one operand is the MUL, the other the low accumulator.
  SDValue Mul = AddcNode->getOperand(0);
  SDValue Lo = AddcNode->getOperand(1);
  if (Mul.getOpcode() != ISD::MUL) {
    std::swap(Mul, Lo);
    if (Mul.getOpcode() != ISD::MUL)
      return SDValue();
  }

  // The high add: one operand is an SRA, the other the high accumulator.
  SDValue SRA = AddeNode->getOperand(0);
  SDValue Hi = AddeNode->getOperand(1);
  if (SRA.getOpcode() != ISD::SRA) {
    std::swap(SRA, Hi);
    if (SRA.getOpcode() != ISD::SRA)
      return SDValue();
  }

  // The SRA must shift that very MUL right by the constant 31.
  auto *ShiftAmt = dyn_cast<ConstantSDNode>(SRA.getOperand(1));
  if (!ShiftAmt || ShiftAmt->getZExtValue() != 31)
    return SDValue();
  if (SRA.getOperand(0) != Mul)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(AddcNode);

  // Choose the variant from how each multiplicand was produced.  An operand
  // matched through isSRA16 contributes the value feeding its shift.
  SDValue MulLHS = Mul.getOperand(0);
  SDValue MulRHS = Mul.getOperand(1);
  unsigned Opcode = 0;
  SDValue Op0;
  SDValue Op1;
  if (isS16(MulLHS, DAG) && isS16(MulRHS, DAG)) {
    Opcode = ARMISD::SMLALBB;
    Op0 = MulLHS;
    Op1 = MulRHS;
  } else if (isS16(MulLHS, DAG) && isSRA16(MulRHS)) {
    Opcode = ARMISD::SMLALBT;
    Op0 = MulLHS;
    Op1 = MulRHS.getOperand(0);
  } else if (isSRA16(MulLHS) && isS16(MulRHS, DAG)) {
    Opcode = ARMISD::SMLALTB;
    Op0 = MulLHS.getOperand(0);
    Op1 = MulRHS;
  } else if (isSRA16(MulLHS) && isSRA16(MulRHS)) {
    Opcode = ARMISD::SMLALTT;
    Op0 = MulLHS.getOperand(0);
    Op1 = MulRHS.getOperand(0);
  }

  if (!Op0 || !Op1)
    return SDValue();

  SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
                              Op0, Op1, Lo, Hi);

  // Redirect users of the two adds to the new node: result 0 replaces the
  // low add, result 1 the high add.
  SDNode *Merged = SMLAL.getNode();
  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(Merged, 0));
  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(Merged, 1));

  // Return original node to notify the driver to stop replacing.
  return SDValue(AddcNode, 0);
}
/// Try to merge an ARMISD::ADDE/SUBE (\p AddeSubeNode), the ADDC/SUBC that
/// feeds its operand 2, and the S/UMUL_LOHI both of them consume into a single
/// multiply-accumulate node.
///
/// \returns
///  * SDValue() when the pattern does not match;
///  * SDValue(AddeSubeNode, 0) after the uses of the add nodes have been
///    redirected to a new SMLAL/UMLAL (or SMMLAR/SMMLSR) node;
///  * the result of AddCombineTo64BitSMLAL16 when the node is an ADDE and
///    neither ADDC operand is a S/UMUL_LOHI.
static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
// Look for multiply add opportunities.
// The pattern is a ISD::UMUL_LOHI followed by two add nodes, where
// each add nodes consumes a value from ISD::UMUL_LOHI and there is
// a glue link from the first add to the second add.
// If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
// a S/UMLAL instruction.
//                  UMUL_LOHI
//                 / :lo    \ :hi
//                V          \          [no multiline comment]
//    loAdd ->  ADDC         |
//                 \ :carry /
//                  V      V
//                    ADDE   <- hiAdd
//
// In the special case where only the higher part of a signed result is used
// and the add to the low part of the result of ISD::SMUL_LOHI adds or subtracts
// a constant with the exact value of 0x80000000, we recognize we are dealing
// with a "rounded multiply and add" (or subtract) and transform it into
// either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively.
assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
"Expect an ADDE or SUBE");
assert(AddeSubeNode->getNumOperands() == 3 &&
AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
"ADDE node has the wrong inputs");
// Check that we are chained to the right ADDC or SUBC node.
// (ADDE must be fed by ADDC, SUBE by SUBC.)
SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
(AddeSubeNode->getOpcode() == ARMISD::SUBE &&
AddcSubcNode->getOpcode() != ARMISD::SUBC))
return SDValue();
SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);
// Check if the two operands are from the same mul_lohi node.
// Bail out if they are: both operands being results of one node cannot form
// the expected shape.
if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
return SDValue();
assert(AddcSubcNode->getNumValues() == 2 &&
AddcSubcNode->getValueType(0) == MVT::i32 &&
"Expect ADDC with two result values. First: i32");
// Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
// maybe a SMLAL which multiplies two 16-bit values.
if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);
// Check for the triangle shape.
SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);
// Make sure that the ADDE/SUBE operands are not coming from the same node.
if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
return SDValue();
// Find the MUL_LOHI node walking up ADDE/SUBE's operands.
// Operand 0 is preferred; operand 1 is only inspected when operand 0 is not
// a MUL_LOHI.
bool IsLeftOperandMUL = false;
SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
if (MULOp == SDValue())
MULOp = findMUL_LOHI(AddeSubeOp1);
else
IsLeftOperandMUL = true;
if (MULOp == SDValue())
return SDValue();
// Figure out the right opcode.
unsigned Opc = MULOp->getOpcode();
unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
// Figure out the high and low input values to the MLAL node.
// These point at the local SDValue copies declared above, so they stay valid
// for the rest of the function.
SDValue *HiAddSub = nullptr;
SDValue *LoMul = nullptr;
SDValue *LowAddSub = nullptr;
// Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
return SDValue();
// The high addend is whichever ADDE/SUBE operand is not the multiply.
if (IsLeftOperandMUL)
HiAddSub = &AddeSubeOp1;
else
HiAddSub = &AddeSubeOp0;
// Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node
// whose low result is fed to the ADDC/SUBC we are checking.
if (AddcSubcOp0 == MULOp.getValue(0)) {
LoMul = &AddcSubcOp0;
LowAddSub = &AddcSubcOp1;
}
if (AddcSubcOp1 == MULOp.getValue(0)) {
LoMul = &AddcSubcOp1;
LowAddSub = &AddcSubcOp0;
}
if (!LoMul)
return SDValue();
// If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
// the replacement below will create a cycle.
if (AddcSubcNode == HiAddSub->getNode() ||
AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
return SDValue();
// Create the merged node.
SelectionDAG &DAG = DCI.DAG;
// Start building operand list.
// The first two operands are the multiplicands of the MUL_LOHI.
SmallVector<SDValue, 8> Ops;
Ops.push_back(LoMul->getOperand(0));
Ops.push_back(LoMul->getOperand(1));
// Check whether we can use SMMLAR or SMMLSR instead. For this to be
// the case, we must be doing signed multiplication and only use the higher
// part of the result of the MLAL, furthermore the LowAddSub must be a constant
// addition or subtraction with the value of 0x80000000.
if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
LowAddSub->getNode()->getOpcode() == ISD::Constant &&
static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
0x80000000) {
// The rounded form takes only the high addend as its third operand and
// produces a single i32 result.
Ops.push_back(*HiAddSub);
if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
FinalOpc = ARMISD::SMMLSR;
} else {
FinalOpc = ARMISD::SMMLAR;
}
SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
return SDValue(AddeSubeNode, 0);
} else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
// SMMLS is generated during instruction selection and the rest of this
// function can not handle the case where AddcSubcNode is a SUBC.
return SDValue();
// Finish building the operand list for {U/S}MLAL
// Operand order: multiplicand, multiplicand, low addend, high addend.
Ops.push_back(*LowAddSub);
Ops.push_back(*HiAddSub);
SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
DAG.getVTList(MVT::i32, MVT::i32), Ops);
// Replace the ADDs' nodes uses by the MLA node's values.
// Result 1 of the MLAL stands in for the ADDE, result 0 for the ADDC.
SDValue HiMLALResult(MLALNode.getNode(), 1);
DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
SDValue LoMLALResult(MLALNode.getNode(), 0);
DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
// Return original node to notify the driver to stop replacing.
return SDValue(AddeSubeNode, 0);
}
// Tries to form an ARMISD::UMAAL from an ADDE/ADDC pair that accumulates onto
// an already-built ARMISD::UMLAL whose operand 3 is zero.
//
// UMAAL is similar to UMLAL except that it adds two unsigned values.  This
// runs ahead of the generic MLAL combine, to which it defers when the
// subtarget lacks v6 or DSP, or when the ADDC does not consume a UMLAL.
// The opposite shape, a UMLAL using Addc/Adde as the addend, is handled in
// PerformUMLALCombine.
//
// Returns SDValue(AddeNode, 0) after rewriting the uses, SDValue() when the
// pattern is rejected, or the result of AddCombineTo64bitMLAL when deferring.
static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
    return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);

  // Operand 2 of the ADDE has to come from an ADDC.
  SDNode *AddcNode = AddeNode->getOperand(2).getNode();
  if (AddcNode->getOpcode() != ARMISD::ADDC)
    return SDValue();

  // Find the UMLAL among the ADDC's operands; the other operand becomes the
  // extra addend.  Operand 0 is checked before operand 1.
  SDValue AddcLHS = AddcNode->getOperand(0);
  SDValue AddcRHS = AddcNode->getOperand(1);
  SDNode *UmlalNode = nullptr;
  SDValue AddHi;
  if (AddcLHS.getOpcode() == ARMISD::UMLAL) {
    UmlalNode = AddcLHS.getNode();
    AddHi = AddcRHS;
  } else if (AddcRHS.getOpcode() == ARMISD::UMLAL) {
    UmlalNode = AddcRHS.getNode();
    AddHi = AddcLHS;
  } else {
    return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
  }

  // The UMLAL's own operand 3 must be zero.
  if (!isNullConstant(UmlalNode->getOperand(3)))
    return SDValue();

  // The ADDE must add that same UMLAL and zero, in either operand order.
  const bool ZeroThenUmlal = isNullConstant(AddeNode->getOperand(0)) &&
                             AddeNode->getOperand(1).getNode() == UmlalNode;
  const bool UmlalThenZero = !ZeroThenUmlal &&
                             AddeNode->getOperand(0).getNode() == UmlalNode &&
                             isNullConstant(AddeNode->getOperand(1));
  if (!ZeroThenUmlal && !UmlalThenZero)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
                    UmlalNode->getOperand(2), AddHi };
  SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
                              DAG.getVTList(MVT::i32, MVT::i32), Ops);

  // Redirect users of the adds: result 1 replaces the ADDE, result 0 the
  // ADDC.
  SDNode *Merged = UMAAL.getNode();
  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(Merged, 1));
  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(Merged, 0));

  // Return original node to notify the driver to stop replacing.
  return SDValue(AddeNode, 0);
}
/// Fold an ARMISD::UMLAL whose operands 2 and 3 are an ADDC/ADDE pair into an
/// ARMISD::UMAAL.
///
/// The match requires a v6 subtarget with DSP, operand 2 to be an ADDC,
/// operand 3 to be an ADDE with both addends zero, and that ADDE's operand 2
/// to be the very same ADDC.
///
/// \returns the new UMAAL node, or an empty SDValue when the pattern does not
/// match.
static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG,
                                   const ARMSubtarget *Subtarget) {
  if (!(Subtarget->hasV6Ops() && Subtarget->hasDSP()))
    return SDValue();

  SDNode *Addc = N->getOperand(2).getNode();
  SDNode *Adde = N->getOperand(3).getNode();

  // Opcodes are verified before any operand of the ADDE is inspected.
  if (Addc->getOpcode() != ARMISD::ADDC || Adde->getOpcode() != ARMISD::ADDE)
    return SDValue();

  // Both addends of the ADDE must be zero.
  if (!isNullConstant(Adde->getOperand(0)) ||
      !isNullConstant(Adde->getOperand(1)))
    return SDValue();

  // The ADDE has to take its third operand from this ADDC.
  if (Adde->getOperand(2).getNode() != Addc)
    return SDValue();

  SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Addc->getOperand(0),
                   Addc->getOperand(1)};
  return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
                     DAG.getVTList(MVT::i32, MVT::i32), Ops);
}
/// Target-specific combines for ARMISD::ADDC and ARMISD::SUBC.
///
/// Two rewrites are attempted:
///  * (SUBC (ADDE 0, 0, C), 1) -> C: result 0 of N is kept and result 1 is
///    replaced by operand 2 of the ADDE.
///  * On Thumb1 only, a negative constant right-hand side (other than the
///    minimum int value) is negated and the opcode is flipped between ADDC
///    and SUBC.
///
/// \returns the replacement value, or an empty SDValue if nothing applied.
static SDValue PerformAddcSubcCombine(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
  SelectionDAG &DAG = DCI.DAG;

  if (N->getOpcode() == ARMISD::SUBC) {
    // (SUBC (ADDE 0, 0, C), 1) -> C
    SDValue Minuend = N->getOperand(0);
    SDValue Subtrahend = N->getOperand(1);
    const bool IsCarryMaterialization =
        Minuend->getOpcode() == ARMISD::ADDE &&
        isNullConstant(Minuend->getOperand(0)) &&
        isNullConstant(Minuend->getOperand(1)) && isOneConstant(Subtrahend);
    if (IsCarryMaterialization)
      return DCI.CombineTo(N, SDValue(N, 0), Minuend->getOperand(2));
  }

  if (!Subtarget->isThumb1Only())
    return SDValue();

  auto *RHSC = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!RHSC)
    return SDValue();

  // Only strictly negative immediates whose negation is representable are
  // rewritten.
  int32_t Imm = RHSC->getSExtValue();
  if (Imm >= 0 || Imm <= std::numeric_limits<int>::min())
    return SDValue();

  SDLoc DL(N);
  SDValue NegatedRHS = DAG.getConstant(-Imm, DL, MVT::i32);
  unsigned FlippedOpc = ARMISD::ADDC;
  if (N->getOpcode() == ARMISD::ADDC)
    FlippedOpc = ARMISD::SUBC;
  return DAG.getNode(FlippedOpc, DL, N->getVTList(), N->getOperand(0),
                     NegatedRHS);
}
/// Target-specific combines for ARMISD::ADDE and ARMISD::SUBE.
///
/// On Thumb1, a negative constant right-hand side is replaced by its bitwise
/// complement and the opcode is flipped between ADDE and SUBE. On other
/// subtargets, a node whose operand 1 is an ISD::SMUL_LOHI is handed to
/// AddCombineTo64bitMLAL.
///
/// \returns the replacement value, or an empty SDValue if nothing applied.
static SDValue PerformAddeSubeCombine(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
  if (!Subtarget->isThumb1Only()) {
    if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI)
      return AddCombineTo64bitMLAL(N, DCI, Subtarget);
    return SDValue();
  }

  auto *RHSC = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!RHSC)
    return SDValue();

  int64_t Imm = RHSC->getSExtValue();
  if (Imm >= 0)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  // The with-carry-in form matches bitwise not instead of the negation.
  // Effectively, the inverse interpretation of the carry flag already
  // accounts for part of the negation.
  SDValue InvertedRHS = DAG.getConstant(~Imm, DL, MVT::i32);
  unsigned FlippedOpc = ARMISD::ADDE;
  if (N->getOpcode() == ARMISD::ADDE)
    FlippedOpc = ARMISD::SUBE;
  return DAG.getNode(FlippedOpc, DL, N->getVTList(), N->getOperand(0),
                     InvertedRHS, N->getOperand(2));
}
/// Expand N through TargetLowering::expandABS when its opcode is not legal for
/// its result type.
///
/// \returns the expanded value, or an empty SDValue when the operation is
/// already legal or the expansion fails.
static SDValue PerformABSCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // Nothing to do when the node can be selected as-is.
  if (TLI.isOperationLegal(N->getOpcode(), N->getValueType(0)))
    return SDValue();

  SDValue Expanded;
  return TLI.expandABS(N, Expanded, DAG) ? Expanded : SDValue();
}
/// PerformADDECombine - Entry point for ARMISD::ADDE combines.
///
/// Thumb1 targets are routed to PerformAddeSubeCombine, since only ARM and
/// Thumb2 support UMLAL/SMLAL. All other targets try, once legalization has
/// run, to turn ARMISD::ADDC/ARMISD::ADDE pairs into MLAL or UMAAL nodes via
/// AddCombineTo64bitUMAAL.
static SDValue PerformADDECombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
  const bool IsThumb1 = Subtarget->isThumb1Only();
  if (IsThumb1)
    return PerformAddeSubeCombine(N, DCI, Subtarget);

  // Only perform the checks after legalize when the pattern is available.
  if (!DCI.isBeforeLegalize())
    return AddCombineTo64bitUMAAL(N, DCI, Subtarget);

  return SDValue();
}
/// PerformADDCombineWithOperands - Run the ADD combines for one particular
/// operand order (N0, N1). PerformADDCombine calls this first with the default
/// operand order and, if that yields nothing, again with the operands
/// commuted.
///
/// The attempts are made in order: vpadd, vpaddl (from VUZP), vpaddl (from
/// BUILD_VECTOR) and finally the select fold, which is only tried when N0 has
/// a single use.
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                                             TargetLowering::DAGCombinerInfo &DCI,
                                             const ARMSubtarget *Subtarget){
  // Attempt to create vpadd for this add.
  SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget);

  // Attempt to create vpaddl for this add.
  if (!Result)
    Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget);
  if (!Result)
    Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI, Subtarget);

  // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
  if (!Result && N0.getNode()->hasOneUse())
    Result = combineSelectAndUse(N, N0, N1, DCI);

  if (Result)
    return Result;
  return SDValue();
}
bool
ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
CombineLevel Level) const {
if (Level == BeforeLegalizeTypes)
return true;
if (N->getOpcode() != ISD::SHL)
return true;
if (Subtarget->isThumb1Only()) {
// Avoid making expensive immediates by commuting shifts. (This logic
// only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
// for free.)
if (N->getOpcode() != ISD::SHL)
return true;
SDValue N1 = N->getOperand(0);
if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
return true;
if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
if (Const->getAPIntValue().ult(256))
return false;
if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
Const->getAPIntValue().sgt(-256))
return false;
}
return true;
}
// Turn off commute-with-shift transform after legalization, so it doesn't
// conflict with PerformSHLSimplify. (We could try to detect when
// PerformSHLSimplify would trigger more precisely, but it isn't
// really necessary.)
return false;
}
/// Report whether a pair of constant shifts should be folded into a mask.
/// This is always the case outside Thumb1; on Thumb1 it is only the case
/// before type legalization.
bool ARMTargetLowering::shouldFoldConstantShiftPairToMask(
    const SDNode *N, CombineLevel Level) const {
  return !Subtarget->isThumb1Only() || Level == BeforeLegalizeTypes;
}
/// Report whether the "increment of add" form is preferred over the
/// "sub of not" form for values of type VT.
///
/// With NEON the answer is yes for scalar integers only. Without NEON it is
/// yes everywhere, except that Thumb1 limits it to scalar sizes of at most 32
/// bits.
bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
  if (Subtarget->hasNEON())
    return VT.isScalarInteger();
  if (Subtarget->isThumb1Only())
    return VT.getScalarSizeInBits() <= 32;
  return true;
}
/// Undo the generic combiner's distribution of a shift over a binary
/// operation when the users of N can perform the shift themselves.
///
/// N is expected to be (op (shl x, c2), c1 << c2) with op one of ADD, OR, XOR
/// or AND. If every user of N is an operation that can take a shifted
/// register operand, and both c1 and c2 are cheap immediates, the node is
/// rebuilt as (shl (op x, c1), c2).
///
/// \returns the rebuilt value, or an empty SDValue when the rewrite is not
/// applicable.
static SDValue PerformSHLSimplify(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *ST) {
  // Allow the generic combiner to identify potential bswaps.
  if (DCI.isBeforeLegalize())
    return SDValue();

  // DAG combiner will fold:
  // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
  // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
  // Other code patterns that can also be modified have the following form:
  // b + ((a << 1) | 510)
  // b + ((a << 1) & 510)
  // b + ((a << 1) ^ 510)
  // b + ((a << 1) + 510)

  // Many instructions can perform the shift for free, but it requires both
  // the operands to be registers. If c1 << c2 is too large, a mov immediate
  // instruction will be needed. So, unfold back to the original pattern if:
  // - c1 and c2 are small enough that they don't require mov imms.
  // - the user(s) of the node can perform an shl

  // No shifted operands for 16-bit instructions.
  if (ST->isThumb() && ST->isThumb1Only())
    return SDValue();

  // Check that all the users could perform the shl themselves.
  for (auto U : N->uses()) {
    switch(U->getOpcode()) {
    default:
      return SDValue();
    case ISD::SUB:
    case ISD::ADD:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR:
    case ISD::SETCC:
    case ARMISD::CMP:
      // Check that the user isn't already using a constant because there
      // aren't any instructions that support an immediate operand and a
      // shifted operand.
      if (isa<ConstantSDNode>(U->getOperand(0)) ||
          isa<ConstantSDNode>(U->getOperand(1)))
        return SDValue();

      // Check that it's not already using a shift.
      if (U->getOperand(0).getOpcode() == ISD::SHL ||
          U->getOperand(1).getOpcode() == ISD::SHL)
        return SDValue();
      break;
    }
  }

  if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
      N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
    return SDValue();

  if (N->getOperand(0).getOpcode() != ISD::SHL)
    return SDValue();

  SDValue SHL = N->getOperand(0);

  auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
  auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
  if (!C1ShlC2 || !C2)
    return SDValue();

  APInt C2Int = C2->getAPIntValue();
  APInt C1Int = C1ShlC2->getAPIntValue();

  // A shift amount that is not smaller than the bit width would make the
  // unsigned subtraction below wrap around and hand an out-of-range bit count
  // to getHighBitsSet/lshrInPlace, so give up on such shifts.
  unsigned C2Width = C2Int.getBitWidth();
  if (C2Int.uge(C2Width))
    return SDValue();
  uint64_t C2Value = C2Int.getZExtValue();

  // Check that performing a lshr will not lose any information.
  APInt Mask = APInt::getHighBitsSet(C2Width, C2Width - C2Value);
  if ((C1Int & Mask) != C1Int)
    return SDValue();

  // Shift the first constant.
  C1Int.lshrInPlace(C2Int);

  // The immediates are encoded as an 8-bit value that can be rotated.
  auto LargeImm = [](const APInt &Imm) {
    unsigned Zeros = Imm.countLeadingZeros() + Imm.countTrailingZeros();
    return Imm.getBitWidth() - Zeros > 8;
  };

  if (LargeImm(C1Int) || LargeImm(C2Int))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  SDValue X = SHL.getOperand(0);
  SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
                              DAG.getConstant(C1Int, dl, MVT::i32));
  // Shift left to compensate for the lshr of C1Int.
  SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));

  LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
             SHL.dump(); N->dump());
  LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
  return Res;
}
/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
///
/// Tries PerformSHLSimplify first, then the operand-order-sensitive combines
/// with the operands as given and, failing that, with them commuted.
static SDValue PerformADDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  // Only works one way, because it needs an immediate operand.
  if (SDValue Simplified = PerformSHLSimplify(N, DCI, Subtarget))
    return Simplified;

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // First try with the default operand order.
  SDValue Combined = PerformADDCombineWithOperands(N, LHS, RHS, DCI, Subtarget);
  if (Combined)
    return Combined;

  // If that didn't work, try again with the operands commuted.
  return PerformADDCombineWithOperands(N, RHS, LHS, DCI, Subtarget);
}
/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
///
/// The only rewrite attempted is the select fold on the subtrahend, and only
/// when the subtrahend has a single use.
static SDValue PerformSUBCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Minuend = N->getOperand(0);
  SDValue Subtrahend = N->getOperand(1);

  if (!Subtrahend.getNode()->hasOneUse())
    return SDValue();

  // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
  if (SDValue Folded = combineSelectAndUse(N, Subtrahend, Minuend, DCI))
    return Folded;

  return SDValue();
}
/// PerformVMULCombine
/// Distribute (A + B) * C to (A * C) + (B * C) so that the special multiplier
/// accumulator forwarding can be used:
///   vmul d3, d0, d2
///   vmla d3, d1, d2
/// is faster than
///   vadd d3, d0, d1
///   vmul d3, d3, d2
///
/// The rewrite is skipped for (A + B) * (A + B), because there
///   vadd d2, d0, d1
///   vmul d3, d0, d2
///   vmla d3, d1, d2
/// is slower than
///   vadd d2, d0, d1
///   vmul d3, d2, d2
///
/// Only applies when the subtarget reports VMLx forwarding and one of the
/// multiplicands is an ADD, SUB, FADD or FSUB.
static SDValue PerformVMULCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasVMLxForwarding())
    return SDValue();

  auto IsAddOrSub = [](unsigned Opc) {
    return Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::FADD ||
           Opc == ISD::FSUB;
  };

  SelectionDAG &DAG = DCI.DAG;
  SDValue AddSub = N->getOperand(0);
  SDValue Factor = N->getOperand(1);

  // Make AddSub the add/sub operand, preferring operand 0 when both qualify.
  unsigned Opcode = AddSub.getOpcode();
  if (!IsAddOrSub(Opcode)) {
    Opcode = Factor.getOpcode();
    if (!IsAddOrSub(Opcode))
      return SDValue();
    std::swap(AddSub, Factor);
  }

  // Squaring a sum is cheaper left as it is.
  if (AddSub == Factor)
    return SDValue();

  EVT VT = N->getValueType(0);
  SDLoc DL(N);
  SDValue TermA = AddSub->getOperand(0);
  SDValue TermB = AddSub->getOperand(1);
  return DAG.getNode(Opcode, DL, VT,
                     DAG.getNode(ISD::MUL, DL, VT, TermA, Factor),
                     DAG.getNode(ISD::MUL, DL, VT, TermB, Factor));
}
/// Target-specific dag combine for ISD::MUL.
///
/// 64- and 128-bit vector multiplies are forwarded to PerformVMULCombine. An
/// i32 multiply by a constant of the form +/-(2^N +/- 1) << S is rewritten as
/// a shift combined with an add or sub (plus a final shift by S when S is
/// non-zero). Nothing is done on Thumb1, before legalization, or when called
/// by the legalizer.
///
/// The scalar replacement is installed through DCI.CombineTo, so the function
/// itself returns an empty SDValue in that case.
static SDValue PerformMULCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  SelectionDAG &DAG = DCI.DAG;

  if (Subtarget->isThumb1Only() || DCI.isBeforeLegalize() ||
      DCI.isCalledByLegalizer())
    return SDValue();

  EVT VT = N->getValueType(0);
  if (VT.is64BitVector() || VT.is128BitVector())
    return PerformVMULCombine(N, DCI, Subtarget);
  if (VT != MVT::i32)
    return SDValue();

  auto *MulC = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!MulC)
    return SDValue();

  // Strip the trailing zero bits off the multiplier; they are re-applied as a
  // final left shift.
  int64_t MulAmt = MulC->getSExtValue();
  const unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt) & (32 - 1);
  MulAmt >>= ShiftAmt;

  SDValue V = N->getOperand(0);
  SDLoc DL(N);

  // Builds (shl V, Amt).
  auto ShlV = [&](unsigned Amt) {
    return DAG.getNode(ISD::SHL, DL, VT, V,
                       DAG.getConstant(Amt, DL, MVT::i32));
  };

  SDValue Res;
  if (MulAmt >= 0) {
    if (isPowerOf2_32(MulAmt - 1)) {
      // (mul x, 2^N + 1) => (add (shl x, N), x)
      SDValue Shifted = ShlV(Log2_32(MulAmt - 1));
      Res = DAG.getNode(ISD::ADD, DL, VT, V, Shifted);
    } else if (isPowerOf2_32(MulAmt + 1)) {
      // (mul x, 2^N - 1) => (sub (shl x, N), x)
      SDValue Shifted = ShlV(Log2_32(MulAmt + 1));
      Res = DAG.getNode(ISD::SUB, DL, VT, Shifted, V);
    } else {
      return SDValue();
    }
  } else {
    uint64_t MulAmtAbs = -MulAmt;
    if (isPowerOf2_32(MulAmtAbs + 1)) {
      // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
      SDValue Shifted = ShlV(Log2_32(MulAmtAbs + 1));
      Res = DAG.getNode(ISD::SUB, DL, VT, V, Shifted);
    } else if (isPowerOf2_32(MulAmtAbs - 1)) {
      // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
      SDValue Shifted = ShlV(Log2_32(MulAmtAbs - 1));
      SDValue Sum = DAG.getNode(ISD::ADD, DL, VT, V, Shifted);
      Res = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, MVT::i32),
                        Sum);
    } else {
      return SDValue();
    }
  }

  if (ShiftAmt != 0) {
    SDValue ShiftC = DAG.getConstant(ShiftAmt, DL, MVT::i32);
    Res = DAG.getNode(ISD::SHL, DL, VT, Res, ShiftC);
  }

  // Do not add new nodes to DAG combiner worklist.
  DCI.CombineTo(N, Res, false);
  return SDValue();
}
/// Rewrite "(and (shl x, c2), c1)" or "(and (srl x, c2), c1)" as a pair of
/// shifts, to avoid materializing the mask c1.
///
/// Applies to i32 only, after legalization, when the shift has a single use,
/// a constant amount in [1, 31], and the mask is not 255 or 65535 (those are
/// left alone so that uxtb/uxth can still be formed).
///
/// \returns the outer shift of the replacement pair, or an empty SDValue if
/// no pattern matched.
static SDValue CombineANDShift(SDNode *N,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const ARMSubtarget *Subtarget) {
  // Allow DAGCombine to pattern-match before we touch the canonical form.
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  if (N->getValueType(0) != MVT::i32)
    return SDValue();

  auto *MaskC = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!MaskC)
    return SDValue();

  uint32_t C1 = static_cast<uint32_t>(MaskC->getZExtValue());
  // Don't transform uxtb/uxth.
  if (C1 == 255 || C1 == 65535)
    return SDValue();

  SDNode *Shift = N->getOperand(0).getNode();
  if (!Shift->hasOneUse())
    return SDValue();

  const unsigned ShiftOpc = Shift->getOpcode();
  if (ShiftOpc != ISD::SHL && ShiftOpc != ISD::SRL)
    return SDValue();
  const bool LeftShift = ShiftOpc == ISD::SHL;

  auto *ShAmtC = dyn_cast<ConstantSDNode>(Shift->getOperand(1));
  if (!ShAmtC)
    return SDValue();

  const uint32_t C2 = static_cast<uint32_t>(ShAmtC->getZExtValue());
  if (!C2 || C2 >= 32)
    return SDValue();

  // Clear irrelevant bits in the mask.
  C1 &= LeftShift ? (-1U << C2) : (-1U >> C2);

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  // Builds (OuterOpc (InnerOpc x, InnerAmt), OuterAmt) on the shifted value x.
  auto EmitShiftPair = [&](unsigned InnerOpc, uint32_t InnerAmt,
                           unsigned OuterOpc, uint32_t OuterAmt) {
    SDValue Inner = DAG.getNode(InnerOpc, DL, MVT::i32, Shift->getOperand(0),
                                DAG.getConstant(InnerAmt, DL, MVT::i32));
    return DAG.getNode(OuterOpc, DL, MVT::i32, Inner,
                       DAG.getConstant(OuterAmt, DL, MVT::i32));
  };

  // We have a pattern of the form "(and (shl x, c2) c1)" or
  // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
  // transform to a pair of shifts, to save materializing c1.
  // FIXME: Use demanded bits?
  // FIXME: Handle other patterns of known/demanded bits.
  if (LeftShift) {
    // Left shift, then mask off trailing bits.
    if (isMask_32(~C1)) {
      const uint32_t C3 = countTrailingZeros(C1);
      if (C2 < C3)
        return EmitShiftPair(ISD::SRL, C3 - C2, ISD::SHL, C3);
    }

    // Left shift, then mask off leading bits.
    if (isShiftedMask_32(C1)) {
      const uint32_t Trailing = countTrailingZeros(C1);
      const uint32_t C3 = countLeadingZeros(C1);
      if (Trailing == C2 && C2 + C3 < 32)
        return EmitShiftPair(ISD::SHL, C2 + C3, ISD::SRL, C3);
    }
  } else {
    // Right shift, then mask off leading bits.
    if (isMask_32(C1)) {
      const uint32_t C3 = countLeadingZeros(C1);
      if (C2 < C3)
        return EmitShiftPair(ISD::SHL, C3 - C2, ISD::SRL, C3);
    }

    // Right shift, then mask off trailing bits.
    if (isShiftedMask_32(C1)) {
      const uint32_t Leading = countLeadingZeros(C1);
      const uint32_t C3 = countTrailingZeros(C1);
      if (Leading == C2 && C2 + C3 < 32)
        return EmitShiftPair(ISD::SRL, C2 + C3, ISD::SHL, C3);
    }
  }

  // FIXME: Transform "(and (shl x, c2) c1)" ->
  // "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate than
  // c1.
  return SDValue();
}
/// Target-specific dag combine xforms for ISD::AND.
///
/// For legal types this tries, in order: an immediate-form VBIC when operand
/// 1 is a constant splat (NEON only); on non-Thumb1 targets the select fold
/// and PerformSHLSimplify; on Thumb1 targets CombineANDShift.
static SDValue PerformANDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  EVT VT = N->getValueType(0);

  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  // Attempt to use immediate-form VBIC
  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  auto *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
  if (BVN && Subtarget->hasNEON() &&
      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs) &&
      SplatBitSize <= 64) {
    // VBIC clears bits, so the immediate to encode is the inverted splat.
    EVT VbicVT;
    SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(),
                                    SplatUndef.getZExtValue(), SplatBitSize,
                                    DAG, dl, VbicVT, VT.is128BitVector(),
                                    OtherModImm);
    if (Val.getNode()) {
      SDValue Input = DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
      SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
      return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
    }
  }

  if (Subtarget->isThumb1Only()) {
    if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
      return Result;
  } else {
    // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
    if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
      return Result;

    if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
      return Result;
  }

  return SDValue();
}
// Try combining OR nodes to SMULWB, SMULWT.
//
// Matches (or (srl (smul_lohi a, b):0, 16), (shl (smul_lohi a, b):1, 16)),
// with the two shifts in either operand order, and replaces the OR by
// ARMISD::SMULWB or ARMISD::SMULWT depending on how the 16-bit multiplicand
// is recognised (isS16 or isSRA16). Returns SDValue(OR, 0) after replacing
// its uses, or an empty SDValue when the pattern does not match.
static SDValue PerformORCombineToSMULWBT(SDNode *OR,
                                         TargetLowering::DAGCombinerInfo &DCI,
                                         const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasV6Ops())
    return SDValue();
  if (Subtarget->isThumb() &&
      !(Subtarget->hasThumb2() && Subtarget->hasDSP()))
    return SDValue();

  // Accept the two shifts in either order.
  SDValue SRL = OR->getOperand(0);
  SDValue SHL = OR->getOperand(1);
  if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL)
    std::swap(SRL, SHL);
  if (!isSRL16(SRL) || !isSHL16(SHL))
    return SDValue();

  // The first operands to the shifts need to be the two results from the
  // same smul_lohi node.
  const SDValue LoInput = SRL.getOperand(0);
  const SDValue HiInput = SHL.getOperand(0);
  if (LoInput.getNode() != HiInput.getNode() ||
      LoInput.getOpcode() != ISD::SMUL_LOHI)
    return SDValue();

  SDNode *SMULLOHI = LoInput.getNode();
  if (LoInput != SDValue(SMULLOHI, 0) || HiInput != SDValue(SMULLOHI, 1))
    return SDValue();

  // Now we have:
  // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
  // For SMUL[B|T] smul_lohi will take a 32-bit and a 16-bit arguments.
  // For SMULWB the 16-bit value will be sign extended somehow.
  // For SMULWT only the SRA is required.
  // Check both sides of SMUL_LOHI
  SelectionDAG &DAG = DCI.DAG;
  SDValue OpS16 = SMULLOHI->getOperand(0);
  SDValue OpS32 = SMULLOHI->getOperand(1);
  if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
    OpS16 = OpS32;
    OpS32 = SMULLOHI->getOperand(0);
  }

  SDLoc dl(OR);
  unsigned Opcode = 0;
  if (isS16(OpS16, DAG)) {
    Opcode = ARMISD::SMULWB;
  } else if (isSRA16(OpS16)) {
    // Use the value feeding the SRA as the 16-bit multiplicand.
    Opcode = ARMISD::SMULWT;
    OpS16 = OpS16->getOperand(0);
  } else {
    return SDValue();
  }

  SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
  DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
  return SDValue(OR, 0);
}
/// Try to turn an i32 OR whose operand 0 is "(and A, mask)" into an
/// ARMISD::BFI (bitfield insert).
///
/// Operand 0 of N is accessed as a two-operand node without checking its
/// opcode here; the visible caller only invokes this when operand 0 is an
/// ISD::AND with a single use.
///
/// On success the replacement is installed with DCI.CombineTo and
/// SDValue(N, 0) is returned; otherwise an empty SDValue is returned.
static SDValue PerformORCombineToBFI(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
// BFI is only available on V6T2+
if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
return SDValue();
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);
// 1) or (and A, mask), val => ARMbfi A, val, mask
// iff (val & ~mask) == val
//
// 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
// 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
// && mask == ~mask2
// 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
// && ~mask == mask2
// (i.e., copy a bitfield value into another bitfield of the same width)
if (VT != MVT::i32)
return SDValue();
SDValue N00 = N0.getOperand(0);
// The value and the mask need to be constants so we can verify this is
// actually a bitfield set. If the mask is 0xffff, we can do better
// via a movt instruction, so don't use BFI in that case.
SDValue MaskOp = N0.getOperand(1);
ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
if (!MaskC)
return SDValue();
unsigned Mask = MaskC->getZExtValue();
if (Mask == 0xffff)
return SDValue();
SDValue Res;
// Case (1): or (and A, mask), val => ARMbfi A, val, mask
ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
if (N1C) {
unsigned Val = N1C->getZExtValue();
// Give up unless every set bit of val lies in the bits cleared by mask.
if ((Val & ~Mask) != Val)
return SDValue();
if (ARM::isBitFieldInvertedMask(Mask)) {
// Move the value down so that it starts at bit 0, as BFI takes it.
Val >>= countTrailingZeros(~Mask);
Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
DAG.getConstant(Val, DL, MVT::i32),
DAG.getConstant(Mask, DL, MVT::i32));
DCI.CombineTo(N, Res, false);
// Return value from the original node to inform the combiner that N is
// now dead.
return SDValue(N, 0);
}
// Otherwise fall through to the case (3) check below.
} else if (N1.getOpcode() == ISD::AND) {
// case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
if (!N11C)
return SDValue();
unsigned Mask2 = N11C->getZExtValue();
// Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
// as is to match.
if (ARM::isBitFieldInvertedMask(Mask) &&
(Mask == ~Mask2)) {
// The pack halfword instruction works better for masks that fit it,
// so use that when it's available.
if (Subtarget->hasDSP() &&
(Mask == 0xffff || Mask == 0xffff0000))
return SDValue();
// 2a
unsigned amt = countTrailingZeros(Mask2);
Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
DAG.getConstant(amt, DL, MVT::i32));
Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
DAG.getConstant(Mask, DL, MVT::i32));
DCI.CombineTo(N, Res, false);
// Return value from the original node to inform the combiner that N is
// now dead.
return SDValue(N, 0);
} else if (ARM::isBitFieldInvertedMask(~Mask) &&
(~Mask == Mask2)) {
// The pack halfword instruction works better for masks that fit it,
// so use that when it's available.
if (Subtarget->hasDSP() &&
(Mask2 == 0xffff || Mask2 == 0xffff0000))
return SDValue();
// 2b
unsigned lsb = countTrailingZeros(Mask);
Res = DAG.getNode(ISD::SRL, DL, VT, N00,
DAG.getConstant(lsb, DL, MVT::i32));
Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
DAG.getConstant(Mask2, DL, MVT::i32));
DCI.CombineTo(N, Res, false);
// Return value from the original node to inform the combiner that N is
// now dead.
return SDValue(N, 0);
}
// Neither 2a nor 2b matched: fall through to the case (3) check below.
}
if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
ARM::isBitFieldInvertedMask(~Mask)) {
// Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
// where lsb(mask) == #shamt and masked bits of B are known zero.
SDValue ShAmt = N00.getOperand(1);
unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
unsigned LSB = countTrailingZeros(Mask);
if (ShAmtC != LSB)
return SDValue();
Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
DAG.getConstant(~Mask, DL, MVT::i32));
DCI.CombineTo(N, Res, false);
// Return value from the original node to inform the combiner that N is
// now dead.
return SDValue(N, 0);
}
return SDValue();
}
/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
///
/// For legal types this tries, in order: an immediate-form VORR when operand
/// 1 is a constant splat (NEON only); on non-Thumb1 targets the select fold
/// and the SMULWB/SMULWT match; a VBSL for an OR of two ANDs with inverse
/// constant splat masks (NEON vectors); a BFI when operand 0 is a single-use
/// AND; and finally PerformSHLSimplify.
static SDValue PerformORCombine(SDNode *N,
                                TargetLowering::DAGCombinerInfo &DCI,
                                const ARMSubtarget *Subtarget) {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  EVT VT = N->getValueType(0);

  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  // Attempt to use immediate-form VORR
  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  auto *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
  if (BVN && Subtarget->hasNEON() &&
      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs) &&
      SplatBitSize <= 64) {
    EVT VorrVT;
    SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
                                    SplatUndef.getZExtValue(), SplatBitSize,
                                    DAG, dl, VorrVT, VT.is128BitVector(),
                                    OtherModImm);
    if (Val.getNode()) {
      SDValue Input = DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
      SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
      return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
    }
  }

  if (!Subtarget->isThumb1Only()) {
    // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
    if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
      return Result;
    if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
      return Result;
  }

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
  if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
      DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
    // The code below optimizes (or (and X, Y), Z).
    // The AND operand needs to have a single user to make these optimizations
    // profitable.
    if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
      return SDValue();

    APInt MaskUndef;
    unsigned MaskBitSize;
    bool MaskHasUndefs;
    APInt SplatBits0, SplatBits1;
    auto *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
    auto *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));

    // Ensure that the second operand of both ands are constant splats
    // without undefs, that the bit width of the constants are the same, and
    // that the splat arguments are logical inverses as per the pattern we
    // are trying to simplify.
    if (BVN0 &&
        BVN0->isConstantSplat(SplatBits0, MaskUndef, MaskBitSize,
                              MaskHasUndefs) &&
        !MaskHasUndefs && BVN1 &&
        BVN1->isConstantSplat(SplatBits1, MaskUndef, MaskBitSize,
                              MaskHasUndefs) &&
        !MaskHasUndefs &&
        SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
        SplatBits0 == ~SplatBits1) {
      // Canonicalize the vector type to make instruction selection
      // simpler.
      EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
      SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
                                   N0->getOperand(1),
                                   N0->getOperand(0),
                                   N1->getOperand(0));
      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
    }
  }

  // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
  // reasonable.
  if (N0.getOpcode() == ISD::AND && N0.hasOneUse())
    if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
      return Res;

  if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
    return Result;

  return SDValue();
}
/// Target-specific dag combine xforms for ISD::XOR.
///
/// Only legal types on non-Thumb1 targets are handled: the select fold is
/// tried first, then PerformSHLSimplify.
static SDValue PerformXORCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);

  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  if (Subtarget->isThumb1Only())
    return SDValue();

  // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
  if (SDValue Folded = combineSelectAndUseCommutative(N, false, DCI))
    return Folded;

  if (SDValue Simplified = PerformSHLSimplify(N, DCI, Subtarget))
    return Simplified;

  return SDValue();
}
// ParseBFI - given a BFI node N, return its "from" value (Rn) and fill in the
// two masks: ToMask receives the bits written in "to" (Rd), i.e. the
// complement of the BFI's mask operand, and FromMask the equally many
// consecutive bits read from "from". When "from" is a right shift by a
// constant, the shift is looked through: FromMask is moved up by the shift
// amount and the shift's input is returned instead.
static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
  assert(N->getOpcode() == ARMISD::BFI);

  ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue();
  FromMask =
      APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation());

  SDValue From = N->getOperand(1);
  if (From->getOpcode() != ISD::SRL)
    return From;

  // If the Base came from a SHR #C, we can deduce that it is really testing
  // bit #C in the base of the SHR.
  auto *ShiftC = dyn_cast<ConstantSDNode>(From->getOperand(1));
  if (!ShiftC)
    return From;

  APInt Shift = ShiftC->getAPIntValue();
  assert(Shift.getLimitedValue() < 32 && "Shift too large!");
  FromMask <<= Shift.getLimitedValue(31);
  return From->getOperand(0);
}
// Given that A and B each contain one contiguous run of set bits, report
// whether B's run ends directly below the start of A's run, i.e. whether
// A | B is A's run concatenated with B's run.
//
// Neither A nor B must be zero.
static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
  const unsigned LowestSetBitOfA = A.countTrailingZeros();
  const unsigned HighestSetBitOfB =
      B.getBitWidth() - B.countLeadingZeros() - 1;
  return HighestSetBitOfB + 1 == LowestSetBitOfA;
}
// Given a BFI in N, walk down the chain of BFIs reachable through operand 0
// and return one that N can be merged with: it must read from the same "from"
// value, and its bit ranges must sit directly next to N's in both source and
// destination. Returns an empty SDValue if there is none.
static SDValue FindBFIToCombineWith(SDNode *N) {
  APInt ToMask, FromMask;
  SDValue From = ParseBFI(N, ToMask, FromMask);

  // Now check for a compatible BFI to merge with. We can pass through BFIs
  // that aren't compatible, but not if they set the same bit in their
  // destination as we do (or that of any BFI we're going to combine with).
  APInt SeenToMask = ToMask;
  for (SDValue V = N->getOperand(0); V.getOpcode() == ARMISD::BFI;
       V = V.getOperand(0)) {
    APInt CandToMask, CandFromMask;
    SDValue CandFrom = ParseBFI(V.getNode(), CandToMask, CandFromMask);

    // A BFI with a different base is simply stepped over.
    if (CandFrom == From) {
      // Do the written bits conflict with any we've seen so far?
      if ((CandToMask & SeenToMask).getBoolValue())
        // Conflicting bits - bail out because going further is unsafe.
        return SDValue();

      // Are the new bits contiguous when combined with the old bits?
      const bool CandBelow = BitsProperlyConcatenate(ToMask, CandToMask) &&
                             BitsProperlyConcatenate(FromMask, CandFromMask);
      if (CandBelow)
        return V;
      const bool CandAbove = BitsProperlyConcatenate(CandToMask, ToMask) &&
                             BitsProperlyConcatenate(CandFromMask, FromMask);
      if (CandAbove)
        return V;
    }

    // We've seen a write to some bits, so track it and keep going.
    SeenToMask |= CandToMask;
  }

  return SDValue();
}
/// Target-specific dag combine xforms for ARMISD::BFI.
///
/// Two rewrites are attempted:
///  * (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) when the AND only
///    clears bits the BFI does not read.
///  * A BFI whose operand 0 is itself a BFI is merged with a compatible BFI
///    further down the chain (found by FindBFIToCombineWith) into one BFI.
///
/// Returns the new BFI node, or an empty SDValue if nothing applied.
static SDValue PerformBFICombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
SDValue N1 = N->getOperand(1);
if (N1.getOpcode() == ISD::AND) {
// (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
// the bits being cleared by the AND are not demanded by the BFI.
ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
if (!N11C)
return SDValue();
unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
// Position and width of the field written by the BFI, derived from the
// complement of its mask operand.
unsigned LSB = countTrailingZeros(~InvMask);
unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB;
assert(Width <
static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
"undefined behavior");
// Mask covers the low Width bits, i.e. the bits of the inserted value
// that the BFI reads.
unsigned Mask = (1u << Width) - 1;
unsigned Mask2 = N11C->getZExtValue();
// The AND is redundant when it keeps every bit covered by Mask.
if ((Mask & (~Mask2)) == 0)
return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
N->getOperand(0), N1.getOperand(0),
N->getOperand(2));
} else if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
// We have a BFI of a BFI. Walk up the BFI chain to see how long it goes.
// Keep track of any consecutive bits set that all come from the same base
// value. We can combine these together into a single BFI.
SDValue CombineBFI = FindBFIToCombineWith(N);
if (CombineBFI == SDValue())
return SDValue();
// We've found a BFI.
APInt ToMask1, FromMask1;
SDValue From1 = ParseBFI(N, ToMask1, FromMask1);
APInt ToMask2, FromMask2;
SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
assert(From1 == From2);
(void)From2;
// First, unlink CombineBFI.
// NOTE: this must happen before N->getOperand(0) is read below; if
// CombineBFI is N's direct operand, the replacement changes that operand.
DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0));
// Then create a new BFI, combining the two together.
APInt NewFromMask = FromMask1 | FromMask2;
APInt NewToMask = ToMask1 | ToMask2;
EVT VT = N->getValueType(0);
SDLoc dl(N);
// If the combined source field does not start at bit 0, shift the source
// down so that it does.
if (NewFromMask[0] == 0)
From1 = DCI.DAG.getNode(
ISD::SRL, dl, VT, From1,
DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT));
return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1,
DCI.DAG.getConstant(~NewToMask, dl, VT));
}
return SDValue();
}
/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
/// ARMISD::VMOVRRD.
///
/// Folds:
///   vmovrrd(vmovdrr x, y) -> x, y          (only when the subtarget has FP64)
///   vmovrrd(load f64)     -> (load i32), (load i32)
/// The second fold applies only to a normal, non-volatile, single-use f64 load
/// whose address operand is a FrameIndex.
static SDValue PerformVMOVRRDCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {
  SDValue Src = N->getOperand(0);
  if (Src.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
    return DCI.CombineTo(N, Src.getOperand(0), Src.getOperand(1));

  SDNode *SrcNode = Src.getNode();
  if (!ISD::isNormalLoad(SrcNode) || !SrcNode->hasOneUse())
    return SDValue();
  if (SrcNode->getValueType(0) != MVT::f64 ||
      SrcNode->getOperand(1).getOpcode() != ISD::FrameIndex)
    return SDValue();

  // TODO: Should this be done for non-FrameIndex operands?
  LoadSDNode *LD = cast<LoadSDNode>(SrcNode);
  if (LD->isVolatile())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(LD);
  SDValue Base = LD->getBasePtr();

  // Word at the original address.
  SDValue Res0 =
      DAG.getLoad(MVT::i32, DL, LD->getChain(), Base, LD->getPointerInfo(),
                  LD->getAlignment(), LD->getMemOperand()->getFlags());

  // Word four bytes further on; its alignment is capped at 4.
  SDValue Base4 = DAG.getNode(ISD::ADD, DL, MVT::i32, Base,
                              DAG.getConstant(4, DL, MVT::i32));
  SDValue Res1 = DAG.getLoad(MVT::i32, DL, LD->getChain(), Base4,
                             LD->getPointerInfo().getWithOffset(4),
                             std::min(4U, LD->getAlignment()),
                             LD->getMemOperand()->getFlags());

  // Users of the old load's chain now hang off the second new load.
  DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), Res1.getValue(1));

  // On big-endian targets the two words map to the results the other way
  // round.
  if (DAG.getDataLayout().isBigEndian())
    std::swap(Res0, Res1);

  return DCI.CombineTo(N, Res0, Res1);
}
/// PerformVMOVDRRCombine - Target-specific dag combine xforms for
/// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands.
///
/// Recognises N = vmovrrd(X); vmovdrr(N:0, N:1) and replaces it with
/// bit_convert(X). One BITCAST on each operand is looked through.
static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
  // Peel at most one BITCAST off an operand.
  auto SkipOneBitcast = [](SDValue V) {
    return V.getOpcode() == ISD::BITCAST ? V.getOperand(0) : V;
  };
  SDValue Lo = SkipOneBitcast(N->getOperand(0));
  SDValue Hi = SkipOneBitcast(N->getOperand(1));

  // Both operands must be results 0 and 1 (in that order) of one VMOVRRD.
  if (Lo.getOpcode() != ARMISD::VMOVRRD || Lo.getNode() != Hi.getNode())
    return SDValue();
  if (Lo.getResNo() != 0 || Hi.getResNo() != 1)
    return SDValue();

  return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0),
                     Lo.getOperand(0));
}
/// hasNormalLoadOperand - Return true if at least one of the operands of the
/// BUILD_VECTOR node \p N is a normal, non-volatile load. When that is the
/// case it is profitable to bitcast an i64 vector to f64 elements, because the
/// value can then be loaded directly into a VFP register.
///
/// The number of operands inspected is the element count of N's result type.
static bool hasNormalLoadOperand(SDNode *N) {
  for (unsigned Idx = 0, End = N->getValueType(0).getVectorNumElements();
       Idx != End; ++Idx) {
    SDNode *Src = N->getOperand(Idx).getNode();
    if (!ISD::isNormalLoad(Src))
      continue;
    if (!cast<LoadSDNode>(Src)->isVolatile())
      return true;
  }
  return false;
}
/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
/// ISD::BUILD_VECTOR.
///
/// First tries the VMOVDRR fold on two-operand nodes:
///   build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X)
/// VMOVRRD is introduced when legalizing i64 types and forces the i64 value
/// into a pair of GPRs; if that value ends up in a vector the VMOVRRD has to
/// be undone.
///
/// Otherwise, an i64-element BUILD_VECTOR with at least one normal load
/// operand is rebuilt with f64 elements so that type legalization does not
/// split the elements into i32 halves.
static SDValue PerformBUILD_VECTORCombine(SDNode *N,
                                          TargetLowering::DAGCombinerInfo &DCI,
                                          const ARMSubtarget *Subtarget) {
  SelectionDAG &DAG = DCI.DAG;

  if (N->getNumOperands() == 2) {
    SDValue Folded = PerformVMOVDRRCombine(N, DAG);
    if (Folded)
      return Folded;
  }

  const EVT VT = N->getValueType(0);
  const bool HasI64Elts = VT.getVectorElementType() == MVT::i64;
  if (!HasI64Elts || !hasNormalLoadOperand(N))
    return SDValue();

  SDLoc dl(N);
  const unsigned NumElts = VT.getVectorNumElements();

  SmallVector<SDValue, 8> F64Elts;
  for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
    F64Elts.push_back(
        DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(Idx)));
    // Queue the new bitcast so the DAGCombiner gets a chance to fold it.
    DCI.AddToWorklist(F64Elts.back().getNode());
  }

  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
  SDValue FloatBV = DAG.getBuildVector(FloatVT, dl, F64Elts);
  return DAG.getNode(ISD::BITCAST, dl, VT, FloatBV);
}
/// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
///
/// ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR, and
/// at that point bitcasts from integer to float may have been inserted. If
/// those bitcasts survived DAGCombine, the node is re-expressed as a chain of
/// integer INSERT_VECTOR_ELTs so that floating point types are not forced.
///
/// The rewrite is only done when all of the following hold:
///   - the elements are 32 bits wide (64-bit elements are not legal);
///   - the node has exactly one use, and that use is a BITCAST to a
///     non-floating-point type;
///   - more than half of the "relevant" operands are bitcasts from i32, where
///     undef and constant operands are not counted as relevant;
///   - the matching i32 vector type is legal.
static SDValue
PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  const EVT VT = N->getValueType(0);
  const EVT EltVT = VT.getVectorElementType();

  // Element width and single-use requirements.
  if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
    return SDValue();

  // By construction, the input type must be float.
  assert(EltVT == MVT::f32 && "Unexpected type!");

  // The single user has to be a bitcast to an integer type.
  SDNode *OnlyUser = *N->use_begin();
  if (OnlyUser->getOpcode() != ISD::BITCAST ||
      OnlyUser->getValueType(0).isFloatingPoint())
    return SDValue();

  // Profitability: count operands that are bitcasts from i32 against the
  // operands that are not folded statically (undef / constants).
  const unsigned NumElts = VT.getVectorNumElements();
  unsigned NumFromI32 = 0;
  unsigned NumRelevant = NumElts;
  for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
    SDValue Elt = N->getOperand(Idx);
    if (Elt->getOpcode() == ISD::BITCAST) {
      // Assume only a bitcast from i32 will go away.
      if (Elt->getOperand(0).getValueType() == MVT::i32)
        ++NumFromI32;
      continue;
    }
    // Constants are cast at compile time, so they are not relevant.
    if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
      --NumRelevant;
  }
  if (NumFromI32 <= NumRelevant / 2)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;

  // The integer vector type we would switch to must be legal.
  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VecVT))
    return SDValue();

  // Combine:
  //   ARMISD::BUILD_VECTOR E1, E2, ..., EN.
  // => BITCAST INSERT_VECTOR_ELT
  //      (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
  //      (BITCAST EN), N.
  SDValue Result = DAG.getUNDEF(VecVT);
  SDLoc dl(N);
  for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
    SDValue Elt = N->getOperand(Idx);
    if (Elt.isUndef())
      continue;

    const bool IsCastFromI32 =
        Elt.getOpcode() == ISD::BITCAST &&
        Elt->getOperand(0).getValueType() == MVT::i32;
    if (IsCastFromI32) {
      // Use the i32 value underneath the bitcast directly.
      Elt = Elt.getOperand(0);
    } else {
      Elt = DAG.getNode(ISD::BITCAST, SDLoc(Elt), MVT::i32, Elt);
      // Queue the new bitcast so the DAGCombiner gets a chance to fold it.
      DCI.AddToWorklist(Elt.getNode());
    }

    SDValue Lane = DAG.getConstant(Idx, dl, MVT::i32);
    Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Result, Elt, Lane);
  }

  Result = DAG.getNode(ISD::BITCAST, dl, VT, Result);
  // Queue the final bitcast as well.
  DCI.AddToWorklist(Result.getNode());
  return Result;
}
/// PerformInsertEltCombine - Target-specific dag combine xforms for
/// ISD::INSERT_VECTOR_ELT.
///
/// When a normal, non-volatile i64 load is inserted into an i64-element
/// vector, redo the insertion on the f64 view of the vector. Otherwise the
/// i64 value would be legalized into a pair of i32 values.
static SDValue PerformInsertEltCombine(SDNode *N,
                                       TargetLowering::DAGCombinerInfo &DCI) {
  const EVT VT = N->getValueType(0);
  if (VT.getVectorElementType() != MVT::i64)
    return SDValue();

  SDNode *Inserted = N->getOperand(1).getNode();
  if (!ISD::isNormalLoad(Inserted))
    return SDValue();
  if (cast<LoadSDNode>(Inserted)->isVolatile())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
                                 VT.getVectorNumElements());

  SDValue VecAsF64 = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
  SDValue EltAsF64 = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
  // Queue both bitcasts so the DAGCombiner gets a chance to fold them.
  DCI.AddToWorklist(VecAsF64.getNode());
  DCI.AddToWorklist(EltAsF64.getNode());

  SDValue Inserted64 = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
                                   VecAsF64, EltAsF64, N->getOperand(2));
  return DAG.getNode(ISD::BITCAST, dl, VT, Inserted64);
}
/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
/// ISD::VECTOR_SHUFFLE.
///
/// The LLVM shufflevector instruction lets the mask length differ from the
/// operand length, ISD::VECTOR_SHUFFLE does not. During translation, short
/// operands are therefore widened by concatenating them with undef vectors.
/// For NEON it is better to put two double-register operands into a single
/// quad-register vector instead:
///   shuffle(concat(v1, undef), concat(v2, undef)) ->
///   shuffle(concat(v1, v2), undef)
static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // Both inputs must be two-operand CONCAT_VECTORS ...
  const bool BothConcats = LHS.getOpcode() == ISD::CONCAT_VECTORS &&
                           RHS.getOpcode() == ISD::CONCAT_VECTORS;
  if (!BothConcats || LHS.getNumOperands() != 2 || RHS.getNumOperands() != 2)
    return SDValue();

  // ... whose upper halves are undef.
  SDValue LHSUpper = LHS.getOperand(1);
  SDValue RHSUpper = RHS.getOperand(1);
  if (!(LHSUpper.isUndef() && RHSUpper.isUndef()))
    return SDValue();

  // Skip the transformation if any of the types are illegal.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT VT = N->getValueType(0);
  if (!TLI.isTypeLegal(VT) || !TLI.isTypeLegal(LHSUpper.getValueType()) ||
      !TLI.isTypeLegal(RHSUpper.getValueType()))
    return SDValue();

  SDValue Packed = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
                               LHS.getOperand(0), RHS.getOperand(0));

  // Translate the shuffle mask. Indices into the low half of the first input
  // are unchanged; indices into the low half of the second input now refer to
  // the upper half of the packed vector; everything else becomes undef (-1).
  const unsigned NumElts = VT.getVectorNumElements();
  const unsigned HalfElts = NumElts / 2;
  ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(N);

  SmallVector<int, 16> PackedMask;
  for (unsigned Lane = 0; Lane != NumElts; ++Lane) {
    const int OldIdx = Shuffle->getMaskElt(Lane);
    int NewIdx = -1;
    if (OldIdx < (int)HalfElts) {
      NewIdx = OldIdx;
    } else if (OldIdx >= (int)NumElts &&
               OldIdx < (int)(NumElts + HalfElts)) {
      NewIdx = HalfElts + OldIdx - NumElts;
    }
    PackedMask.push_back(NewIdx);
  }

  return DAG.getVectorShuffle(VT, SDLoc(N), Packed, DAG.getUNDEF(VT),
                              PackedMask);
}
/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
/// NEON load/store intrinsics, and generic vector load/stores, to merge
/// base address updates.
/// For generic load/stores, the memory type is assumed to be a vector.
/// The caller is assumed to have checked legality.
///
/// \param N   The load/store node (an intrinsic, an ARMISD::VLDxDUP node, or a
///            generic ISD::LOAD / ISD::STORE); it is cast to MemSDNode.
/// \param DCI Combiner state; replacements are made through DCI.CombineTo.
/// \returns Always an empty SDValue. When a fold succeeds, both \p N and the
///          folded ADD are replaced in place via DCI.CombineTo.
static SDValue CombineBaseUpdate(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
const bool isStore = N->getOpcode() == ISD::STORE;
// The address is operand 2 for intrinsics and stores, operand 1 otherwise.
const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
SDValue Addr = N->getOperand(AddrOpIdx);
MemSDNode *MemN = cast<MemSDNode>(N);
SDLoc dl(N);
// Search for a use of the address operand that is an increment.
for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
SDNode *User = *UI;
// Only ADDs that use the same result number of Addr's node are candidates.
if (User->getOpcode() != ISD::ADD ||
UI.getUse().getResNo() != Addr.getResNo())
continue;
// Check that the add is independent of the load/store. Otherwise, folding
// it would create a cycle. We can avoid searching through Addr as it's a
// predecessor to both.
SmallPtrSet<const SDNode *, 32> Visited;
SmallVector<const SDNode *, 16> Worklist;
Visited.insert(Addr.getNode());
Worklist.push_back(N);
Worklist.push_back(User);
if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
SDNode::hasPredecessorHelper(User, Visited, Worklist))
continue;
// Find the new opcode for the updating load/store.
bool isLoadOp = true;
bool isLaneOp = false;
unsigned NewOpc = 0;
unsigned NumVecs = 0;
if (isIntrinsic) {
// For intrinsic nodes, operand 1 is the intrinsic ID.
unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
switch (IntNo) {
default: llvm_unreachable("unexpected intrinsic for Neon base update");
case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD;
NumVecs = 1; break;
case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD;
NumVecs = 2; break;
case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD;
NumVecs = 3; break;
case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD;
NumVecs = 4; break;
case Intrinsic::arm_neon_vld2dup:
case Intrinsic::arm_neon_vld3dup:
case Intrinsic::arm_neon_vld4dup:
// TODO: Support updating VLDxDUP nodes. For now, we just skip
// combining base updates for such intrinsics.
continue;
case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
NumVecs = 2; isLaneOp = true; break;
case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
NumVecs = 3; isLaneOp = true; break;
case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
NumVecs = 4; isLaneOp = true; break;
case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD;
NumVecs = 1; isLoadOp = false; break;
case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD;
NumVecs = 2; isLoadOp = false; break;
case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD;
NumVecs = 3; isLoadOp = false; break;
case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD;
NumVecs = 4; isLoadOp = false; break;
case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
NumVecs = 2; isLoadOp = false; isLaneOp = true; break;
case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
NumVecs = 3; isLoadOp = false; isLaneOp = true; break;
case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
NumVecs = 4; isLoadOp = false; isLaneOp = true; break;
}
} else {
// Non-intrinsic nodes: the VLDxDUP cases are treated as lane operations;
// the generic LOAD/STORE cases reset isLaneOp below.
isLaneOp = true;
switch (N->getOpcode()) {
default: llvm_unreachable("unexpected opcode for Neon base update");
case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break;
case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
case ISD::LOAD: NewOpc = ARMISD::VLD1_UPD;
NumVecs = 1; isLaneOp = false; break;
case ISD::STORE: NewOpc = ARMISD::VST1_UPD;
NumVecs = 1; isLaneOp = false; isLoadOp = false; break;
}
}
// Find the size of memory referenced by the load/store.
EVT VecTy;
if (isLoadOp) {
VecTy = N->getValueType(0);
} else if (isIntrinsic) {
VecTy = N->getOperand(AddrOpIdx+1).getValueType();
} else {
assert(isStore && "Node has to be a load, a store, or an intrinsic!");
VecTy = N->getOperand(1).getValueType();
}
unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
// Lane operations only touch one element of each vector.
if (isLaneOp)
NumBytes /= VecTy.getVectorNumElements();
// If the increment is a constant, it must match the memory ref size.
// The increment is whichever ADD operand is not the address itself.
SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) {
// VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
// separate instructions that make it harder to use a non-constant update.
continue;
}
// OK, we found an ADD we can fold into the base update.
// Now, create a _UPD node, taking care of not breaking alignment.
EVT AlignedVecTy = VecTy;
unsigned Alignment = MemN->getAlignment();
// If this is a less-than-standard-aligned load/store, change the type to
// match the standard alignment.
// The alignment is overlooked when selecting _UPD variants; and it's
// easier to introduce bitcasts here than fix that.
// There are 3 ways to get to this base-update combine:
// - intrinsics: they are assumed to be properly aligned (to the standard
// alignment of the memory type), so we don't need to do anything.
// - ARMISD::VLDx nodes: they are only generated from the aforementioned
// intrinsics, so, likewise, there's nothing to do.
// - generic load/store instructions: the alignment is specified as an
// explicit operand, rather than implicitly as the standard alignment
// of the memory type (like the intrinsics). We need to change the
// memory type to match the explicit alignment. That way, we don't
// generate non-standard-aligned ARMISD::VLDx nodes.
if (isa<LSBaseSDNode>(N)) {
if (Alignment == 0)
Alignment = 1;
if (Alignment < VecTy.getScalarSizeInBits() / 8) {
// Re-express the vector with integer elements as wide as the alignment,
// keeping the total byte size unchanged.
MVT EltTy = MVT::getIntegerVT(Alignment * 8);
assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
assert(!isLaneOp && "Unexpected generic load/store lane.");
unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
}
// Don't set an explicit alignment on regular load/stores that we want
// to transform to VLD/VST 1_UPD nodes.
// This matches the behavior of regular load/stores, which only get an
// explicit alignment if the MMO alignment is larger than the standard
// alignment of the memory type.
// Intrinsics, however, always get an explicit alignment, set to the
// alignment of the MMO.
Alignment = 1;
}
// Create the new updating load/store node.
// First, create an SDVTList for the new updating node's results.
// Layout: NumResultVecs vector results, then the i32 updated address, then
// the chain. NumVecs is at most 4 above, so 6 slots are enough.
EVT Tys[6];
unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
unsigned n;
for (n = 0; n < NumResultVecs; ++n)
Tys[n] = AlignedVecTy;
Tys[n++] = MVT::i32;
Tys[n] = MVT::Other;
SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));
// Then, gather the new node's operands.
SmallVector<SDValue, 8> Ops;
Ops.push_back(N->getOperand(0)); // incoming chain
Ops.push_back(N->getOperand(AddrOpIdx));
Ops.push_back(Inc);
if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
// Try to match the intrinsic's signature
Ops.push_back(StN->getValue());
} else {
// Loads (and of course intrinsics) match the intrinsics' signature,
// so just add all but the alignment operand.
for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i)
Ops.push_back(N->getOperand(i));
}
// For all node types, the alignment operand is always the last one.
Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));
// If this is a non-standard-aligned STORE, the penultimate operand is the
// stored value. Bitcast it to the aligned type.
if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
SDValue &StVal = Ops[Ops.size()-2];
StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
}
// Lane operations access a single element, so that is their memory type.
EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
MemN->getMemOperand());
// Update the uses.
SmallVector<SDValue, 5> NewResults;
for (unsigned i = 0; i < NumResultVecs; ++i)
NewResults.push_back(SDValue(UpdN.getNode(), i));
// If this is an non-standard-aligned LOAD, the first result is the loaded
// value. Bitcast it to the expected result type.
if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
SDValue &LdVal = NewResults[0];
LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
}
NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
DCI.CombineTo(N, NewResults);
// The ADD is replaced by the updated-address result of the new node.
DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
// Only one increment is folded per call.
break;
}
return SDValue();
}
/// Try to fold a base-address update into the load/store node \p N, but only
/// once legalization is complete: nothing is done before legalization or when
/// invoked by the legalizer.
static SDValue PerformVLDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  const bool LegalizationPending =
      DCI.isBeforeLegalize() || DCI.isCalledByLegalizer();
  return LegalizationPending ? SDValue() : CombineBaseUpdate(N, DCI);
}
/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
/// return true. Returns false, without modifying the DAG, otherwise.
static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);

  // vldN-dup instructions only support 64-bit vectors for N > 1.
  if (!VT.is64BitVector())
    return false;

  // The VDUPLANE source has to be a vldN-lane intrinsic.
  SDNode *VLD = N->getOperand(0).getNode();
  if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
    return false;

  unsigned NumVecs = 0;
  unsigned NewOpc = 0;
  unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
  switch (IntNo) {
  case Intrinsic::arm_neon_vld2lane:
    NumVecs = 2;
    NewOpc = ARMISD::VLD2DUP;
    break;
  case Intrinsic::arm_neon_vld3lane:
    NumVecs = 3;
    NewOpc = ARMISD::VLD3DUP;
    break;
  case Intrinsic::arm_neon_vld4lane:
    NumVecs = 4;
    NewOpc = ARMISD::VLD4DUP;
    break;
  default:
    return false;
  }

  // Every non-chain use of the vldN-lane must be a VDUPLANE of the very lane
  // that was loaded.
  unsigned LoadedLane =
      cast<ConstantSDNode>(VLD->getOperand(NumVecs + 3))->getZExtValue();
  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
       UI != UE; ++UI) {
    // Result number NumVecs is the chain; ignore uses of it.
    if (UI.getUse().getResNo() == NumVecs)
      continue;
    SDNode *User = *UI;
    if (User->getOpcode() != ARMISD::VDUPLANE)
      return false;
    if (LoadedLane !=
        cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
      return false;
  }

  // Create the vldN-dup node: NumVecs vector results followed by the chain.
  EVT Tys[5];
  for (unsigned Idx = 0; Idx < NumVecs; ++Idx)
    Tys[Idx] = VT;
  Tys[NumVecs] = MVT::Other;
  SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs + 1));
  SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
  MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
  SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, Ops,
                                           VLDMemInt->getMemoryVT(),
                                           VLDMemInt->getMemOperand());

  // Redirect each VDUPLANE user to the matching vldN-dup result.
  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
       UI != UE; ++UI) {
    unsigned ResNo = UI.getUse().getResNo();
    // Ignore uses of the chain result.
    if (ResNo == NumVecs)
      continue;
    SDNode *User = *UI;
    DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
  }

  // Now the vldN-lane intrinsic is dead except for its chain result.
  // Replace all of its results, chain included, with those of the new node.
  std::vector<SDValue> VLDDupResults;
  for (unsigned ResIdx = 0; ResIdx < NumVecs; ++ResIdx)
    VLDDupResults.push_back(SDValue(VLDDup.getNode(), ResIdx));
  VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
  DCI.CombineTo(VLD, VLDDupResults);

  return true;
}
/// PerformVDUPLANECombine - Target-specific dag combine xforms for
/// ARMISD::VDUPLANE.
///
/// Two things are tried:
///  - if the source is a vldN-lane (N > 1) intrinsic whose other uses are all
///    VDUPLANEs, the group is turned into a vldN-dup (see CombineVLDDUP);
///  - if the source, looking through bitcasts, is already a VMOVIMM or VMVNIMM
///    splat whose element size is not larger than the VDUPLANE's, the VDUPLANE
///    is redundant and is replaced by a bitcast of the splat.
static SDValue PerformVDUPLANECombine(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI) {
  if (CombineVLDDUP(N, DCI))
    return SDValue(N, 0);

  // Look through bit_converts for now; element sizes are checked below.
  SDValue Splat = N->getOperand(0);
  while (Splat.getOpcode() == ISD::BITCAST)
    Splat = Splat.getOperand(0);

  const unsigned SplatOpc = Splat.getOpcode();
  if (SplatOpc != ARMISD::VMOVIMM && SplatOpc != ARMISD::VMVNIMM)
    return SDValue();

  // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
  unsigned EltSize = Splat.getScalarValueSizeInBits();
  // The canonical VMOV for a zero vector uses a 32-bit element size, so a
  // zero immediate is treated as having 8-bit elements.
  unsigned Imm = cast<ConstantSDNode>(Splat.getOperand(0))->getZExtValue();
  unsigned EltBits;
  if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0)
    EltSize = 8;

  EVT VT = N->getValueType(0);
  if (EltSize > VT.getScalarSizeInBits())
    return SDValue();

  return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Splat);
}
/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
///
/// Matches VDUP(LOAD) -> VLD1DUP on NEON subtargets. The pattern is matched
/// here rather than during isel because the transform is only legal for
/// unindexed loads. The load must have a single use and its memory type must
/// equal the element type of the VDUP result.
static SDValue PerformVDUPCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();

  SDValue Src = N->getOperand(0);
  LoadSDNode *LD = dyn_cast<LoadSDNode>(Src.getNode());
  if (!LD || !Src.hasOneUse() || !LD->isUnindexed())
    return SDValue();

  const EVT ResVT = N->getValueType(0);
  if (LD->getMemoryVT() != ResVT.getVectorElementType())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);

  // Operands: chain, address, alignment.
  SDValue Ops[] = { LD->getOperand(0), LD->getOperand(1),
                    DAG.getConstant(LD->getAlignment(), dl, MVT::i32) };
  SDVTList SDTys = DAG.getVTList(ResVT, MVT::Other);
  SDValue VLDDup =
      DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, dl, SDTys, Ops,
                              LD->getMemoryVT(), LD->getMemOperand());

  // Users of the old load's chain now follow the new node's chain.
  DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
  return VLDDup;
}
/// Target-specific dag combine for ISD::LOAD: a normal load of a legal vector
/// type is handed to CombineBaseUpdate so that it can become a VLD1_UPD.
static SDValue PerformLOADCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI) {
  const EVT VT = N->getValueType(0);

  if (!ISD::isNormalLoad(N) || !VT.isVector())
    return SDValue();
  if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  return CombineBaseUpdate(N, DCI);
}
/// PerformSTORECombine - Target-specific dag combine xforms for
/// ISD::STORE.
///
/// Volatile stores are left alone. Otherwise the following are tried, in
/// order:
///  - a truncating vector store is turned into a shuffle that packs the
///    truncated elements together, followed by one or more integer stores;
///  - a normal store of a single-use ARMISD::VMOVDRR is split into two i32
///    stores;
///  - a normal store of an i64 EXTRACT_VECTOR_ELT is redone through the f64
///    view of the vector;
///  - a normal store of a legal vector type is handed to CombineBaseUpdate.
///
/// Whenever one store is split into several, each piece carries pointer info
/// and an alignment that describe the address it really writes to.
///
/// \returns the replacement node, or an empty SDValue if nothing was done.
static SDValue PerformSTORECombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI) {
  StoreSDNode *St = cast<StoreSDNode>(N);
  if (St->isVolatile())
    return SDValue();

  // Optimize trunc store (of multiple scalars) to shuffle and store. First,
  // pack all of the elements in one place. Next, store to memory in fewer
  // chunks.
  SDValue StVal = St->getValue();
  EVT VT = StVal.getValueType();
  if (St->isTruncatingStore() && VT.isVector()) {
    SelectionDAG &DAG = DCI.DAG;
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    EVT StVT = St->getMemoryVT();
    unsigned NumElems = VT.getVectorNumElements();
    assert(StVT != VT && "Cannot truncate to the same type");
    unsigned FromEltSz = VT.getScalarSizeInBits();
    unsigned ToEltSz = StVT.getScalarSizeInBits();

    // From, To sizes and ElemCount must be pow of two
    if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue();

    // We are going to use the original vector elt for storing.
    // Accumulated smaller vector elements must be a multiple of the store size.
    if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue();

    unsigned SizeRatio = FromEltSz / ToEltSz;
    assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());

    // Create a type on which we perform the shuffle.
    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
                                     NumElems*SizeRatio);
    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

    SDLoc DL(St);
    SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
    // Pick, for every source element, the narrow lane that holds its low
    // bits; which lane that is depends on endianness.
    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i < NumElems; ++i)
      ShuffleVec[i] = DAG.getDataLayout().isBigEndian()
                          ? (i + 1) * SizeRatio - 1
                          : i * SizeRatio;

    // Can't shuffle using an illegal type.
    if (!TLI.isTypeLegal(WideVecVT)) return SDValue();

    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
                                         DAG.getUNDEF(WideVec.getValueType()),
                                         ShuffleVec);
    // At this point all of the data is stored at the bottom of the
    // register. We now need to save it to mem.

    // Find the largest store unit
    MVT StoreType = MVT::i8;
    for (MVT Tp : MVT::integer_valuetypes()) {
      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
        StoreType = Tp;
    }
    // Didn't find a legal store type.
    if (!TLI.isTypeLegal(StoreType))
      return SDValue();

    // Bitcast the original vector into a vector of store-size units
    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
            StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
    SmallVector<SDValue, 8> Chains;
    unsigned StoreBytes = StoreType.getSizeInBits() / 8;
    SDValue Increment = DAG.getConstant(StoreBytes, DL,
                                        TLI.getPointerTy(DAG.getDataLayout()));
    SDValue BasePtr = St->getBasePtr();

    // Perform one or more big stores into memory.
    unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits();
    for (unsigned I = 0; I < E; I++) {
      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
                                   StoreType, ShuffWide,
                                   DAG.getIntPtrConstant(I, DL));
      // Chunk I is written I * StoreBytes past the original address, so its
      // pointer info has to be shifted accordingly. Past the first chunk the
      // address is only known to be aligned to the smaller of the original
      // alignment and the chunk size.
      unsigned ChunkAlign =
          I == 0 ? St->getAlignment()
                 : std::min(St->getAlignment(), StoreBytes);
      SDValue Ch = DAG.getStore(
          St->getChain(), DL, SubVec, BasePtr,
          St->getPointerInfo().getWithOffset(I * StoreBytes), ChunkAlign,
          St->getMemOperand()->getFlags());
      BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
                            Increment);
      Chains.push_back(Ch);
    }
    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
  }

  if (!ISD::isNormalStore(St))
    return SDValue();

  // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
  // ARM stores of arguments in the same cache line.
  if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
      StVal.getNode()->hasOneUse()) {
    SelectionDAG &DAG = DCI.DAG;
    bool isBigEndian = DAG.getDataLayout().isBigEndian();
    SDLoc DL(St);
    SDValue BasePtr = St->getBasePtr();
    SDValue NewST1 = DAG.getStore(
        St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
        BasePtr, St->getPointerInfo(), St->getAlignment(),
        St->getMemOperand()->getFlags());

    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
                                    DAG.getConstant(4, DL, MVT::i32));
    // The second word lives at base + 4: shift the pointer info to match and
    // cap the alignment at 4. Halving the alignment instead would yield 0 for
    // a 1-byte-aligned store, and 0 does not mean "unaligned".
    return DAG.getStore(NewST1.getValue(0), DL,
                        StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
                        OffsetPtr, St->getPointerInfo().getWithOffset(4),
                        std::min(4U, St->getAlignment()),
                        St->getMemOperand()->getFlags());
  }

  if (StVal.getValueType() == MVT::i64 &&
      StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    // Bitcast an i64 store extracted from a vector to f64.
    // Otherwise, the i64 value will be legalized to a pair of i32 values.
    SelectionDAG &DAG = DCI.DAG;
    SDLoc dl(StVal);
    SDValue IntVec = StVal.getOperand(0);
    EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
                                   IntVec.getValueType().getVectorNumElements());
    SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
    SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                                 Vec, StVal.getOperand(1));
    dl = SDLoc(N);
    SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
    // Make the DAGCombiner fold the bitcasts.
    DCI.AddToWorklist(Vec.getNode());
    DCI.AddToWorklist(ExtElt.getNode());
    DCI.AddToWorklist(V.getNode());
    return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
                        St->getPointerInfo(), St->getAlignment(),
                        St->getMemOperand()->getFlags(), St->getAAInfo());
  }

  // If this is a legal vector store, try to combine it into a VST1_UPD.
  if (ISD::isNormalStore(N) && VT.isVector() &&
      DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return CombineBaseUpdate(N, DCI);

  return SDValue();
}
/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
/// can replace combinations of VMUL and VCVT (floating-point to integer)
/// when the VMUL has a constant operand that is a power of 2.
///
/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
///  vmul.f32        d16, d17, d16
///  vcvt.s32.f32    d16, d16
/// becomes:
///  vcvt.s32.f32    d16, d16, #3
///
/// Only f32 sources with 2 or 4 lanes and integer results of at most 32 bits
/// are handled; narrower integer results get an extra TRUNCATE.
static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();

  // The converted value must be a simple-typed vector FMUL ...
  SDValue Mul = N->getOperand(0);
  if (!Mul.getValueType().isVector() || !Mul.getValueType().isSimple() ||
      Mul.getOpcode() != ISD::FMUL)
    return SDValue();

  // ... whose second operand is a build-vector.
  BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Mul->getOperand(1));
  if (!BV)
    return SDValue();

  MVT FloatTy = Mul.getSimpleValueType().getVectorElementType();
  uint32_t FloatBits = FloatTy.getSizeInBits();
  MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
  uint32_t IntBits = IntTy.getSizeInBits();
  unsigned NumLanes = Mul.getValueType().getVectorNumElements();

  // These instructions only exist converting from f32 to i32. We can handle
  // smaller integers by generating an extra truncate, but larger ones would
  // be lossy. We also can't handle anything other than 2 or 4 lanes, since
  // these instructions only support v2i32/v4i32 types.
  const bool LaneCountOK = NumLanes == 2 || NumLanes == 4;
  if (FloatBits != 32 || IntBits > 32 || !LaneCountOK)
    return SDValue();

  // The multiplier must be a splat of 2^C with C in [1, 32].
  BitVector UndefElements;
  int32_t Log2C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
  if (Log2C == -1 || Log2C == 0 || Log2C > 32)
    return SDValue();

  SDLoc dl(N);
  unsigned IntrinsicOpcode = N->getOpcode() == ISD::FP_TO_SINT
                                 ? Intrinsic::arm_neon_vcvtfp2fxs
                                 : Intrinsic::arm_neon_vcvtfp2fxu;
  SDValue FixConv = DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
      DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Mul->getOperand(0),
      DAG.getConstant(Log2C, dl, MVT::i32));

  // Narrow the i32 lanes if the original conversion produced smaller ones.
  if (IntBits < FloatBits)
    FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);

  return FixConv;
}
/// PerformVDIVCombine - Fold an integer-to-float conversion followed by an
/// FDIV whose divisor is a power-of-two splat constant into the NEON
/// fixed-point-to-float VCVT intrinsic.
///
/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
///    vcvt.f32.s32    d16, d16
///    vdiv.f32        d16, d16, d17
/// becomes:
///    vcvt.f32.s32    d16, d16, #3
///
/// \p N is the FDIV node. Returns the replacement value, or an empty SDValue
/// when the pattern does not match.
static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();

  // The dividend must be a [SU]INT_TO_FP and the result a simple vector type.
  SDValue Conv = N->getOperand(0);
  const unsigned ConvOpc = Conv.getNode()->getOpcode();
  const bool IsIntToFP =
      ConvOpc == ISD::SINT_TO_FP || ConvOpc == ISD::UINT_TO_FP;
  if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
      !IsIntToFP)
    return SDValue();

  // The divisor (operand 1) must be a BUILD_VECTOR.
  auto *Splat = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
  if (!Splat)
    return SDValue();

  const MVT FPEltTy = N->getSimpleValueType(0).getVectorElementType();
  const uint32_t FPEltBits = FPEltTy.getSizeInBits();
  const MVT IntEltTy =
      Conv.getOperand(0).getSimpleValueType().getVectorElementType();
  const uint32_t IntEltBits = IntEltTy.getSizeInBits();
  const unsigned Lanes = Conv.getValueType().getVectorNumElements();

  // These instructions only exist converting from i32 to f32. Narrower
  // integer inputs are handled with an extra extend, but wider ones would be
  // lossy. Only 2 or 4 lanes are supported because the instructions only
  // operate on v2i32/v4i32.
  const bool LanesOK = Lanes == 2 || Lanes == 4;
  if (FPEltBits != 32 || IntEltBits > 32 || !LanesOK)
    return SDValue();

  // The divisor must be a splat of 2^FracBits with FracBits in [1, 32].
  BitVector Undefs;
  const int32_t FracBits = Splat->getConstantFPSplatPow2ToLog2Int(&Undefs, 33);
  if (FracBits == -1 || FracBits == 0 || FracBits > 32)
    return SDValue();

  SDLoc dl(N);
  const bool Signed = ConvOpc == ISD::SINT_TO_FP;

  // Widen narrow integer lanes to 32 bits, matching the signedness of the
  // original conversion.
  SDValue IntInput = Conv.getOperand(0);
  if (IntEltBits < FPEltBits) {
    const unsigned ExtOpc = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    IntInput = DAG.getNode(ExtOpc, dl, Lanes == 2 ? MVT::v2i32 : MVT::v4i32,
                           IntInput);
  }

  unsigned IntrinsicID = Intrinsic::arm_neon_vcvtfxu2fp;
  if (Signed)
    IntrinsicID = Intrinsic::arm_neon_vcvtfxs2fp;

  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Conv.getValueType(),
                     DAG.getConstant(IntrinsicID, dl, MVT::i32), IntInput,
                     DAG.getConstant(FracBits, dl, MVT::i32));
}
/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
///
/// Rewrites the NEON vector shift intrinsics whose shift-count operand is a
/// valid immediate into the matching ARMISD immediate-shift node. Returns an
/// empty SDValue for every other intrinsic, and for the non-saturating-unsigned
/// / non-narrowing shifts whose count is not a usable immediate.
static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
  const unsigned IntNo =
      cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();

  switch (IntNo) {
  // Vector shifts: check for immediate versions and lower them.
  // Note: This is done during DAG combining instead of DAG legalizing because
  // the build_vectors for 64-bit vector element shift counts are generally
  // not legal, and it is hard to see their values after they get legalized to
  // loads from a constant pool.
  case Intrinsic::arm_neon_vshifts:
  case Intrinsic::arm_neon_vshiftu:
  case Intrinsic::arm_neon_vrshifts:
  case Intrinsic::arm_neon_vrshiftu:
  case Intrinsic::arm_neon_vrshiftn:
  case Intrinsic::arm_neon_vqshifts:
  case Intrinsic::arm_neon_vqshiftu:
  case Intrinsic::arm_neon_vqshiftsu:
  case Intrinsic::arm_neon_vqshiftns:
  case Intrinsic::arm_neon_vqshiftnu:
  case Intrinsic::arm_neon_vqshiftnsu:
  case Intrinsic::arm_neon_vqrshiftns:
  case Intrinsic::arm_neon_vqrshiftnu:
  case Intrinsic::arm_neon_vqrshiftnsu: {
    EVT VT = N->getOperand(1).getValueType();
    SDValue ShiftAmt = N->getOperand(2);
    int64_t Imm;
    unsigned Opc = 0;

    // Validate the shift count for each intrinsic family and choose the
    // immediate-form opcode in the same step.
    switch (IntNo) {
    case Intrinsic::arm_neon_vshifts:
    case Intrinsic::arm_neon_vshiftu:
      // A plain shift may be either a left or a right immediate shift.
      if (isVShiftLImm(ShiftAmt, VT, false, Imm))
        Opc = ARMISD::VSHLIMM;
      else if (isVShiftRImm(ShiftAmt, VT, false, true, Imm))
        Opc = IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM
                                                   : ARMISD::VSHRuIMM;
      else
        return SDValue();
      break;

    case Intrinsic::arm_neon_vrshifts:
    case Intrinsic::arm_neon_vrshiftu:
      if (!isVShiftRImm(ShiftAmt, VT, false, true, Imm))
        return SDValue();
      Opc = IntNo == Intrinsic::arm_neon_vrshifts ? ARMISD::VRSHRsIMM
                                                  : ARMISD::VRSHRuIMM;
      break;

    case Intrinsic::arm_neon_vqshifts:
    case Intrinsic::arm_neon_vqshiftu:
      if (!isVShiftLImm(ShiftAmt, VT, false, Imm))
        return SDValue();
      Opc = IntNo == Intrinsic::arm_neon_vqshifts ? ARMISD::VQSHLsIMM
                                                  : ARMISD::VQSHLuIMM;
      break;

    case Intrinsic::arm_neon_vqshiftsu:
      if (!isVShiftLImm(ShiftAmt, VT, false, Imm))
        llvm_unreachable("invalid shift count for vqshlu intrinsic");
      Opc = ARMISD::VQSHLsuIMM;
      break;

    case Intrinsic::arm_neon_vrshiftn:
    case Intrinsic::arm_neon_vqshiftns:
    case Intrinsic::arm_neon_vqshiftnu:
    case Intrinsic::arm_neon_vqshiftnsu:
    case Intrinsic::arm_neon_vqrshiftns:
    case Intrinsic::arm_neon_vqrshiftnu:
    case Intrinsic::arm_neon_vqrshiftnsu:
      // Narrowing shifts require an immediate right shift.
      if (!isVShiftRImm(ShiftAmt, VT, true, true, Imm))
        llvm_unreachable("invalid shift count for narrowing vector shift "
                         "intrinsic");
      switch (IntNo) {
      case Intrinsic::arm_neon_vrshiftn:
        Opc = ARMISD::VRSHRNIMM;
        break;
      case Intrinsic::arm_neon_vqshiftns:
        Opc = ARMISD::VQSHRNsIMM;
        break;
      case Intrinsic::arm_neon_vqshiftnu:
        Opc = ARMISD::VQSHRNuIMM;
        break;
      case Intrinsic::arm_neon_vqshiftnsu:
        Opc = ARMISD::VQSHRNsuIMM;
        break;
      case Intrinsic::arm_neon_vqrshiftns:
        Opc = ARMISD::VQRSHRNsIMM;
        break;
      case Intrinsic::arm_neon_vqrshiftnu:
        Opc = ARMISD::VQRSHRNuIMM;
        break;
      case Intrinsic::arm_neon_vqrshiftnsu:
        Opc = ARMISD::VQRSHRNsuIMM;
        break;
      }
      break;

    default:
      llvm_unreachable("unhandled vector shift");
    }

    SDLoc dl(N);
    return DAG.getNode(Opc, dl, N->getValueType(0), N->getOperand(1),
                       DAG.getConstant(Imm, dl, MVT::i32));
  }

  case Intrinsic::arm_neon_vshiftins: {
    // Shift-and-insert: the count (operand 3) selects VSLI or VSRI.
    EVT VT = N->getOperand(1).getValueType();
    SDValue ShiftAmt = N->getOperand(3);
    int64_t Imm;
    unsigned Opc = 0;

    if (isVShiftLImm(ShiftAmt, VT, false, Imm))
      Opc = ARMISD::VSLIIMM;
    else if (isVShiftRImm(ShiftAmt, VT, false, true, Imm))
      Opc = ARMISD::VSRIIMM;
    else
      llvm_unreachable("invalid shift count for vsli/vsri intrinsic");

    SDLoc dl(N);
    return DAG.getNode(Opc, dl, N->getValueType(0), N->getOperand(1),
                       N->getOperand(2), DAG.getConstant(Imm, dl, MVT::i32));
  }

  case Intrinsic::arm_neon_vqrshifts:
  case Intrinsic::arm_neon_vqrshiftu:
    // No immediate versions of these to check for.
    break;

  default:
    // Don't do anything for most intrinsics.
    break;
  }

  return SDValue();
}
/// PerformShiftCombine - Target-specific combining for ISD::SHL, ISD::SRA and
/// ISD::SRL.
///
/// Handles two scalar i32 special cases first, then checks for immediate
/// versions of legal vector shifts and lowers them. As with the vector shift
/// intrinsics, the vector part is done during DAG combining instead of DAG
/// legalizing because the build_vectors for 64-bit vector element shift counts
/// are generally not legal, and it is hard to see their values after they get
/// legalized to loads from a constant pool.
static SDValue PerformShiftCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const ARMSubtarget *ST) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  const unsigned Opc = N->getOpcode();

  if (Opc == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) {
    // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high
    // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16.
    SDValue Amt = N->getOperand(1);
    if (auto *AmtC = dyn_cast<ConstantSDNode>(Amt)) {
      SDValue Src = N->getOperand(0);
      if (AmtC->getZExtValue() == 16 && Src.getOpcode() == ISD::BSWAP &&
          DAG.MaskedValueIsZero(Src.getOperand(0),
                                APInt::getHighBitsSet(32, 16)))
        return DAG.getNode(ISD::ROTR, SDLoc(N), VT, Src, Amt);
    }
  }

  if (ST->isThumb1Only() && Opc == ISD::SHL && VT == MVT::i32 &&
      N->getOperand(0)->getOpcode() == ISD::AND &&
      N->getOperand(0)->hasOneUse()) {
    if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
      return SDValue();

    // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
    // usually show up because instcombine prefers to canonicalize it to
    // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
    // out of GEP lowering in some cases.
    SDValue AndOp = N->getOperand(0);

    auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!ShiftC)
      return SDValue();
    const uint32_t ShiftAmt = static_cast<uint32_t>(ShiftC->getZExtValue());

    auto *MaskC = dyn_cast<ConstantSDNode>(AndOp->getOperand(1));
    if (!MaskC)
      return SDValue();
    const uint32_t AndMask = static_cast<uint32_t>(MaskC->getZExtValue());

    // Don't transform uxtb/uxth.
    if (AndMask == 255 || AndMask == 65535)
      return SDValue();

    if (isMask_32(AndMask)) {
      // Replace the mask with a shift-left that discards the masked-off high
      // bits followed by a shift-right that leaves the net left shift.
      const uint32_t HighZeros = countLeadingZeros(AndMask);
      if (HighZeros > ShiftAmt) {
        SDLoc DL(N);
        SDValue Up = DAG.getNode(ISD::SHL, DL, MVT::i32, AndOp->getOperand(0),
                                 DAG.getConstant(HighZeros, DL, MVT::i32));
        return DAG.getNode(ISD::SRL, DL, MVT::i32, Up,
                           DAG.getConstant(HighZeros - ShiftAmt, DL, MVT::i32));
      }
    }
  }

  // Nothing to be done for scalar shifts.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!VT.isVector() || !TLI.isTypeLegal(VT))
    return SDValue();
  if (ST->hasMVEIntegerOps() && VT == MVT::v2i64)
    return SDValue();

  int64_t Imm;
  if (Opc == ISD::SHL) {
    if (!isVShiftLImm(N->getOperand(1), VT, false, Imm))
      return SDValue();
    SDLoc dl(N);
    return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
                       DAG.getConstant(Imm, dl, MVT::i32));
  }

  if (Opc == ISD::SRA || Opc == ISD::SRL) {
    if (!isVShiftRImm(N->getOperand(1), VT, false, false, Imm))
      return SDValue();
    const unsigned ImmOpc =
        Opc == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM;
    SDLoc dl(N);
    return DAG.getNode(ImmOpc, dl, VT, N->getOperand(0),
                       DAG.getConstant(Imm, dl, MVT::i32));
  }

  llvm_unreachable("unexpected shift opcode");
}
/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
///
/// Folds an extension to i32 of a constant-lane extract from a legal vector
/// with i8 or i16 elements into a single VGETLANEs / VGETLANEu node.
static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
                                    const ARMSubtarget *ST) {
  SDValue Extract = N->getOperand(0);

  // Check for sign- and zero-extensions of vector extract operations of 8-
  // and 16-bit vector elements. NEON supports these directly. They are
  // handled during DAG combining because type legalization will promote them
  // to 32-bit types and it is messy to recognize the operations after that.
  if (!ST->hasNEON() || Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return SDValue();

  SDValue SrcVec = Extract.getOperand(0);
  SDValue LaneIdx = Extract.getOperand(1);
  EVT ResVT = N->getValueType(0);
  EVT EltVT = Extract.getValueType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  const bool NarrowElt = EltVT == MVT::i8 || EltVT == MVT::i16;
  if (!(ResVT == MVT::i32) || !NarrowElt ||
      !TLI.isTypeLegal(SrcVec.getValueType()) ||
      !isa<ConstantSDNode>(LaneIdx))
    return SDValue();

  unsigned LaneOpc = 0;
  switch (N->getOpcode()) {
  case ISD::SIGN_EXTEND:
    LaneOpc = ARMISD::VGETLANEs;
    break;
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    LaneOpc = ARMISD::VGETLANEu;
    break;
  default:
    llvm_unreachable("unexpected opcode");
  }
  return DAG.getNode(LaneOpc, SDLoc(N), ResVT, SrcVec, LaneIdx);
}
/// If \p V is a ConstantSDNode whose value is a power of two, return a pointer
/// to that value (owned by the node); otherwise return nullptr.
static const APInt *isPowerOf2Constant(SDValue V) {
  if (auto *CN = dyn_cast<ConstantSDNode>(V)) {
    const APInt &Val = CN->getAPIntValue();
    if (Val.isPowerOf2())
      return &Val;
  }
  return nullptr;
}
/// Try to rewrite a CMOV that conditionally ORs a small constant into a value
/// as a chain of BFI nodes. Returns the new value, or an empty SDValue when
/// the pattern does not match or is not considered profitable.
SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV,
                                                   SelectionDAG &DAG) const {
  // If we have a CMOV, OR and AND combination such as:
  //   if (x & CN)
  //     y |= CM;
  //
  // And:
  //   * CN is a single bit;
  //   * All bits covered by CM are known zero in y
  //
  // Then we can convert this into a sequence of BFI instructions. This will
  // always be a win if CM is a single bit, will always be no worse than the
  // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
  // three bits (due to the extra IT instruction).
  SDValue PlainVal = CMOV->getOperand(0);
  SDValue OredVal = CMOV->getOperand(1);
  auto *CondNode = cast<ConstantSDNode>(CMOV->getOperand(2));
  auto Cond = CondNode->getAPIntValue().getLimitedValue();
  SDValue Cmp = CMOV->getOperand(4);

  // The compare must be against zero.
  if (!isNullConstant(Cmp->getOperand(1)))
    return SDValue();
  assert(Cmp->getOpcode() == ARMISD::CMPZ);

  // The compared value must be (and x, single-bit-constant).
  SDValue Test = Cmp->getOperand(0);
  if (Test->getOpcode() != ISD::AND)
    return SDValue();
  const APInt *TestBit = isPowerOf2Constant(Test->getOperand(1));
  if (!TestBit)
    return SDValue();
  SDValue X = Test->getOperand(0);

  if (Cond == ARMCC::EQ) {
    // We're performing an "equal to zero" compare. Swap the operands so we
    // canonicalize on a "not equal to zero" compare.
    std::swap(PlainVal, OredVal);
  } else {
    assert(Cond == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
  }

  // One arm must be (or y, constant) and the other arm must be y itself.
  if (OredVal->getOpcode() != ISD::OR)
    return SDValue();
  auto *OrConst = dyn_cast<ConstantSDNode>(OredVal->getOperand(1));
  if (!OrConst)
    return SDValue();
  SDValue Y = OredVal->getOperand(0);
  if (PlainVal != Y)
    return SDValue();

  // Now, is it profitable to continue?
  APInt OrBits = OrConst->getAPIntValue();
  const unsigned MaxSetBits = Subtarget->isThumb() ? 3 : 2;
  if (OrBits.countPopulation() > MaxSetBits)
    return SDValue();

  // Lastly, can we determine that the bits defined by OrBits
  // are zero in Y?
  KnownBits YBits = DAG.computeKnownBits(Y);
  if ((OrBits & YBits.Zero) != OrBits)
    return SDValue();

  // OK, we can do the combine.
  SDValue Result = Y;
  SDLoc dl(X);
  EVT VT = X.getValueType();

  // Move the tested bit of X down to bit 0 so BFI can insert it.
  const unsigned SrcBit = TestBit->logBase2();
  if (SrcBit != 0)
    X = DAG.getNode(ISD::SRL, dl, VT, X, DAG.getConstant(SrcBit, dl, VT));

  // Emit one BFI per set bit of the OR constant.
  for (unsigned DstBit = 0, End = OrBits.getActiveBits(); DstBit != End;
       ++DstBit) {
    if (!OrBits[DstBit])
      continue;
    APInt InsertMask(VT.getSizeInBits(), 0);
    InsertMask.setBit(DstBit);
    // Confusingly, the operand is an *inverted* mask.
    Result = DAG.getNode(ARMISD::BFI, dl, VT, Result, X,
                         DAG.getConstant(~InsertMask, dl, VT));
  }
  return Result;
}
/// Combine a BRCOND whose condition is derived from the
/// test_set_loop_iterations intrinsic into an ARMISD::WLS node. Returns an
/// empty SDValue when the condition has a different shape or comes from a
/// different intrinsic.
static SDValue PerformHWLoopCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const ARMSubtarget *ST) {
  // The condition is either the chained intrinsic itself, or an XOR / SETCC
  // whose first operand is the chained intrinsic (the second operand is then
  // asserted to be the constant 1).
  SDValue Cond = N->getOperand(1);
  const unsigned CondOpc = Cond->getOpcode();
  const bool IsXorOrSetCC = CondOpc == ISD::XOR || CondOpc == ISD::SETCC;

  SDValue Intrin;
  if (IsXorOrSetCC &&
      Cond->getOperand(0)->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
    assert((isa<ConstantSDNode>(Cond->getOperand(1)) &&
            cast<ConstantSDNode>(Cond->getOperand(1))->isOne()) &&
           "Expected to compare against 1");
    Intrin = Cond->getOperand(0);
  } else if (Cond->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
    Intrin = Cond;
  } else {
    return SDValue();
  }

  const unsigned IntrinID =
      cast<ConstantSDNode>(Intrin.getOperand(1))->getZExtValue();
  if (IntrinID != Intrinsic::test_set_loop_iterations)
    return SDValue();

  SDLoc dl(Intrin);
  // TODO: Once we start supporting tail predication, we can add another
  // operand to WLS for the number of elements processed in a vector loop.
  SDValue WLSOps[] = {
      N->getOperand(0),     // chain
      Intrin.getOperand(2), // element count passed to the intrinsic
      N->getOperand(2)      // branch target
  };
  SDValue WLS = DCI.DAG.getNode(ARMISD::WLS, dl, MVT::Other, WLSOps);

  // Route users of the intrinsic's chain result to its incoming chain.
  DCI.DAG.ReplaceAllUsesOfValueWith(Intrin.getValue(1), Intrin.getOperand(0));
  return WLS;
}
/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
///
/// Folds a branch on the materialized boolean of another comparison back into
/// a branch on that comparison directly:
///   (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0))
///   -> (brcond Chain BB CC CPSR Cmp)
SDValue
ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
  // Only CMPZ-based branches are of interest.
  SDValue Cmp = N->getOperand(4);
  if (Cmp.getOpcode() != ARMISD::CMPZ)
    return SDValue();

  SDValue CmpLHS = Cmp.getOperand(0);
  SDValue CmpRHS = Cmp.getOperand(1);
  const auto CondCode = static_cast<ARMCC::CondCodes>(
      cast<ConstantSDNode>(N->getOperand(2))->getZExtValue());

  // Only the NE case with a single-use (and ..., ...) is handled.
  if (CondCode != ARMCC::NE || CmpLHS.getOpcode() != ISD::AND ||
      !CmpLHS->hasOneUse())
    return SDValue();

  // The AND's first operand must be a single-use CMOV.
  SDValue Select = CmpLHS->getOperand(0);
  if (Select->getOpcode() != ARMISD::CMOV || !Select->hasOneUse())
    return SDValue();

  // True when V is a constant node holding exactly Expected.
  const auto IsConstant = [](SDValue V, uint64_t Expected) {
    auto *C = dyn_cast<ConstantSDNode>(V);
    return C && C->getZExtValue() == Expected;
  };

  // Require (cmov 0 1 ...), an AND mask of 1, and a compare against 0.
  if (!IsConstant(Select->getOperand(0), 0) ||
      !IsConstant(Select->getOperand(1), 1) ||
      !IsConstant(CmpLHS->getOperand(1), 1) || !IsConstant(CmpRHS, 0))
    return SDValue();

  // Branch on the inner CMOV's condition, CPSR and compare instead.
  return DAG.getNode(ARMISD::BRCOND, SDLoc(N), N->getValueType(0),
                     N->getOperand(0), N->getOperand(1), Select->getOperand(2),
                     Select->getOperand(3), Select->getOperand(4));
}
/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
///
/// Only CMOVs whose flag operand (operand 4) is an ARMISD::CMPZ are handled.
/// The function runs several independent stages in order; a later stage may
/// overwrite the candidate result `Res` produced by an earlier one:
///   1. BFI conversion via PerformCMOVToBFICombine (returns immediately).
///   2. Removal of a redundant move when one arm equals the compared RHS.
///   3. Folding of a CMOV-of-CMOV boolean (returns immediately).
///   4. Integer-only: branch-free materialization of comparison results.
///   5. Thumb1-only: USUBO/SUBCARRY form when the true value is a power of 2.
/// Finally, if a replacement was built, known-zero information about the
/// original node is re-attached with an AssertZext.
///
/// Returns the replacement value, or an empty SDValue if nothing applied.
SDValue
ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
SDValue Cmp = N->getOperand(4);
if (Cmp.getOpcode() != ARMISD::CMPZ)
// Only looking at EQ and NE cases.
return SDValue();
EVT VT = N->getValueType(0);
SDLoc dl(N);
SDValue LHS = Cmp.getOperand(0);
SDValue RHS = Cmp.getOperand(1);
SDValue FalseVal = N->getOperand(0);
SDValue TrueVal = N->getOperand(1);
SDValue ARMcc = N->getOperand(2);
ARMCC::CondCodes CC =
(ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
// BFI is only available on V6T2+.
if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
SDValue R = PerformCMOVToBFICombine(N, DAG);
if (R)
return R;
}
// Simplify
// mov r1, r0
// cmp r1, x
// mov r0, y
// moveq r0, x
// to
// cmp r0, x
// movne r0, y
//
// mov r1, r0
// cmp r1, x
// mov r0, x
// movne r0, y
// to
// cmp r0, x
// movne r0, y
/// FIXME: Turn this into a target neutral optimization?
SDValue Res;
if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
// The false arm equals the compared RHS, so LHS can stand in for it.
Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc,
N->getOperand(3), Cmp);
} else if (CC == ARMCC::EQ && TrueVal == RHS) {
// NOTE: this local deliberately shadows the outer ARMcc; it is passed to
// getARMCmp (presumably as an output for the new condition operand).
SDValue ARMcc;
SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc,
N->getOperand(3), NewCmp);
}
// (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0))
// -> (cmov F T CC CPSR Cmp)
if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) {
auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0));
auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
if ((LHS0C && LHS0C->getZExtValue() == 0) &&
(LHS1C && LHS1C->getZExtValue() == 1) &&
(RHSC && RHSC->getZExtValue() == 0)) {
return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
LHS->getOperand(2), LHS->getOperand(3),
LHS->getOperand(4));
}
}
// The remaining combines are integer-only. Note that for a non-integer VT
// this returns an empty value even if Res was set above.
if (!VT.isInteger())
return SDValue();
// Materialize a boolean comparison for integers so we can avoid branching.
if (isNullConstant(FalseVal)) {
if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
// If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it
// right 5 bits will make that 32 be 1, otherwise it will be 0.
// CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
DAG.getConstant(5, dl, MVT::i32));
} else {
// CMOV 0, 1, ==, (CMPZ x, y) ->
// (ADDCARRY (SUB x, y), t:0, t:1)
// where t = (SUBCARRY 0, (SUB x, y), 0)
//
// The SUBCARRY computes 0 - (x - y) and this will give a borrow when
// x != y. In other words, a carry C == 1 when x == y, C == 0
// otherwise.
// The final ADDCARRY computes
// x - y + (0 - (x - y)) + C == C
SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
// FalseVal is the constant 0 here (checked by isNullConstant above).
SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
// ISD::SUBCARRY returns a borrow but we want the carry here
// actually.
SDValue Carry =
DAG.getNode(ISD::SUB, dl, MVT::i32,
DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry);
}
} else if (CC == ARMCC::NE && !isNullConstant(RHS) &&
(!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
// This seems pointless but will allow us to combine it further below.
// CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
SDValue Sub =
DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
Sub.getValue(1), SDValue());
Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
N->getOperand(3), CPSRGlue.getValue(1));
// Record the new false arm so the Thumb1 check below can match it.
FalseVal = Sub;
}
} else if (isNullConstant(TrueVal)) {
if (CC == ARMCC::EQ && !isNullConstant(RHS) &&
(!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
// This seems pointless but will allow us to combine it further below
// Note that we change == for != as this is the dual for the case above.
// CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
SDValue Sub =
DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
Sub.getValue(1), SDValue());
Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
DAG.getConstant(ARMCC::NE, dl, MVT::i32),
N->getOperand(3), CPSRGlue.getValue(1));
// Only FalseVal is updated here; CC and TrueVal keep their original
// values for the Thumb1 check below.
FalseVal = Sub;
}
}
// On Thumb1, the DAG above may be further combined if z is a power of 2
// (z == 2 ^ K).
// CMOV (SUBS x, y), z, !=, (SUBS x, y):1 ->
// t1 = (USUBO (SUB x, y), 1)
// t2 = (SUBCARRY (SUB x, y), t1:0, t1:1)
// Result = if K != 0 then (SHL t2:0, K) else t2:0
//
// This also handles the special case of comparing against zero; it's
// essentially, the same pattern, except there's no SUBS:
// CMOV x, z, !=, (CMPZ x, 0) ->
// t1 = (USUBO x, 1)
// t2 = (SUBCARRY x, t1:0, t1:1)
// Result = if K != 0 then (SHL t2:0, K) else t2:0
const APInt *TrueConst;
if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
((FalseVal.getOpcode() == ARMISD::SUBS &&
FalseVal.getOperand(0) == LHS && FalseVal.getOperand(1) == RHS) ||
(FalseVal == LHS && isNullConstant(RHS))) &&
(TrueConst = isPowerOf2Constant(TrueVal))) {
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
unsigned ShiftAmount = TrueConst->logBase2();
// For z != 1, compute with 1 and shift the result left by K afterwards.
if (ShiftAmount)
TrueVal = DAG.getConstant(1, dl, VT);
SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc, Subc.getValue(1));
if (ShiftAmount)
Res = DAG.getNode(ISD::SHL, dl, VT, Res,
DAG.getConstant(ShiftAmount, dl, MVT::i32));
}
// If a replacement was built, carry over what is known about the original
// node's zero bits when they match exactly an i1, i8 or i16 zero-extension.
if (Res.getNode()) {
KnownBits Known = DAG.computeKnownBits(SDValue(N,0));
// Capture demanded bits information that would be otherwise lost.
if (Known.Zero == 0xfffffffe)
Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
DAG.getValueType(MVT::i1));
else if (Known.Zero == 0xffffff00)
Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
DAG.getValueType(MVT::i8));
else if (Known.Zero == 0xffff0000)
Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
DAG.getValueType(MVT::i16));
}
return Res;
}
/// Entry point for ARM target-specific DAG combines: dispatches on the opcode
/// of \p N to the matching Perform*Combine helper. For the 16-bit multiply
/// nodes (SMULWB/SMULWT/SMLAL*) it only asks SimplifyDemandedBits to narrow
/// the operands to the halfword that is actually read. Returns an empty
/// SDValue when no helper produced a replacement.
SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  switch (N->getOpcode()) {
  case ISD::ABS:
    return PerformABSCombine(N, DCI, Subtarget);
  case ARMISD::ADDE:
    return PerformADDECombine(N, DCI, Subtarget);
  case ARMISD::UMLAL:
    return PerformUMLALCombine(N, DCI.DAG, Subtarget);
  case ISD::ADD:
    return PerformADDCombine(N, DCI, Subtarget);
  case ISD::SUB:
    return PerformSUBCombine(N, DCI);
  case ISD::MUL:
    return PerformMULCombine(N, DCI, Subtarget);
  case ISD::OR:
    return PerformORCombine(N, DCI, Subtarget);
  case ISD::XOR:
    return PerformXORCombine(N, DCI, Subtarget);
  case ISD::AND:
    return PerformANDCombine(N, DCI, Subtarget);
  case ISD::BRCOND:
    return PerformHWLoopCombine(N, DCI, Subtarget);
  case ARMISD::ADDC:
  case ARMISD::SUBC:
    return PerformAddcSubcCombine(N, DCI, Subtarget);
  case ARMISD::SUBE:
    return PerformAddeSubeCombine(N, DCI, Subtarget);
  case ARMISD::BFI:
    return PerformBFICombine(N, DCI);
  case ARMISD::VMOVRRD:
    return PerformVMOVRRDCombine(N, DCI, Subtarget);
  case ARMISD::VMOVDRR:
    return PerformVMOVDRRCombine(N, DCI.DAG);
  case ISD::STORE:
    return PerformSTORECombine(N, DCI);
  case ISD::BUILD_VECTOR:
    return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
  case ISD::INSERT_VECTOR_ELT:
    return PerformInsertEltCombine(N, DCI);
  case ISD::VECTOR_SHUFFLE:
    return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
  case ARMISD::VDUPLANE:
    return PerformVDUPLANECombine(N, DCI);
  case ARMISD::VDUP:
    return PerformVDUPCombine(N, DCI, Subtarget);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return PerformVCVTCombine(N, DCI.DAG, Subtarget);
  case ISD::FDIV:
    return PerformVDIVCombine(N, DCI.DAG, Subtarget);
  case ISD::INTRINSIC_WO_CHAIN:
    return PerformIntrinsicCombine(N, DCI.DAG);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:
    return PerformShiftCombine(N, DCI, Subtarget);
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    return PerformExtendCombine(N, DCI.DAG, Subtarget);
  case ARMISD::CMOV:
    return PerformCMOVCombine(N, DCI.DAG);
  case ARMISD::BRCOND:
    return PerformBRCONDCombine(N, DCI.DAG);
  case ISD::LOAD:
    return PerformLOADCombine(N, DCI);
  case ARMISD::VLD1DUP:
  case ARMISD::VLD2DUP:
  case ARMISD::VLD3DUP:
  case ARMISD::VLD4DUP:
    return PerformVLDCombine(N, DCI);
  case ARMISD::BUILD_VECTOR:
    return PerformARMBUILD_VECTORCombine(N, DCI);

  // Halfword multiplies: only 16 bits of (some of) the operands are read, so
  // let SimplifyDemandedBits shrink whatever feeds them. The second operand
  // is only attempted when the first attempt changed nothing.
  case ARMISD::SMULWB: {
    // Operand 1 contributes its bottom halfword.
    const unsigned Width = N->getValueType(0).getSizeInBits();
    const APInt Bottom16 = APInt::getLowBitsSet(Width, 16);
    if (SimplifyDemandedBits(N->getOperand(1), Bottom16, DCI))
      return SDValue();
    break;
  }
  case ARMISD::SMULWT: {
    // Operand 1 contributes its top halfword.
    const unsigned Width = N->getValueType(0).getSizeInBits();
    const APInt Top16 = APInt::getHighBitsSet(Width, 16);
    if (SimplifyDemandedBits(N->getOperand(1), Top16, DCI))
      return SDValue();
    break;
  }
  case ARMISD::SMLALBB: {
    // Bottom halfword of both multiplicands.
    const unsigned Width = N->getValueType(0).getSizeInBits();
    const APInt Bottom16 = APInt::getLowBitsSet(Width, 16);
    if (SimplifyDemandedBits(N->getOperand(0), Bottom16, DCI))
      return SDValue();
    if (SimplifyDemandedBits(N->getOperand(1), Bottom16, DCI))
      return SDValue();
    break;
  }
  case ARMISD::SMLALBT: {
    // Bottom halfword of operand 0, top halfword of operand 1.
    const unsigned Width0 = N->getOperand(0).getValueType().getSizeInBits();
    const APInt Bottom16 = APInt::getLowBitsSet(Width0, 16);
    const unsigned Width1 = N->getOperand(1).getValueType().getSizeInBits();
    const APInt Top16 = APInt::getHighBitsSet(Width1, 16);
    if (SimplifyDemandedBits(N->getOperand(0), Bottom16, DCI))
      return SDValue();
    if (SimplifyDemandedBits(N->getOperand(1), Top16, DCI))
      return SDValue();
    break;
  }
  case ARMISD::SMLALTB: {
    // Top halfword of operand 0, bottom halfword of operand 1.
    const unsigned Width0 = N->getOperand(0).getValueType().getSizeInBits();
    const APInt Top16 = APInt::getHighBitsSet(Width0, 16);
    const unsigned Width1 = N->getOperand(1).getValueType().getSizeInBits();
    const APInt Bottom16 = APInt::getLowBitsSet(Width1, 16);
    if (SimplifyDemandedBits(N->getOperand(0), Top16, DCI))
      return SDValue();
    if (SimplifyDemandedBits(N->getOperand(1), Bottom16, DCI))
      return SDValue();
    break;
  }
  case ARMISD::SMLALTT: {
    // Top halfword of both multiplicands.
    const unsigned Width = N->getValueType(0).getSizeInBits();
    const APInt Top16 = APInt::getHighBitsSet(Width, 16);
    if (SimplifyDemandedBits(N->getOperand(0), Top16, DCI))
      return SDValue();
    if (SimplifyDemandedBits(N->getOperand(1), Top16, DCI))
      return SDValue();
    break;
  }

  case ISD::INTRINSIC_VOID:
  case ISD::INTRINSIC_W_CHAIN: {
    // Chained intrinsics carry their ID in operand 1; only the NEON
    // structured load/store intrinsics are combined.
    const unsigned IntNo =
        cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
    switch (IntNo) {
    case Intrinsic::arm_neon_vld1:
    case Intrinsic::arm_neon_vld1x2:
    case Intrinsic::arm_neon_vld1x3:
    case Intrinsic::arm_neon_vld1x4:
    case Intrinsic::arm_neon_vld2:
    case Intrinsic::arm_neon_vld3:
    case Intrinsic::arm_neon_vld4:
    case Intrinsic::arm_neon_vld2lane:
    case Intrinsic::arm_neon_vld3lane:
    case Intrinsic::arm_neon_vld4lane:
    case Intrinsic::arm_neon_vld2dup:
    case Intrinsic::arm_neon_vld3dup:
    case Intrinsic::arm_neon_vld4dup:
    case Intrinsic::arm_neon_vst1:
    case Intrinsic::arm_neon_vst1x2:
    case Intrinsic::arm_neon_vst1x3:
    case Intrinsic::arm_neon_vst1x4:
    case Intrinsic::arm_neon_vst2:
    case Intrinsic::arm_neon_vst3:
    case Intrinsic::arm_neon_vst4:
    case Intrinsic::arm_neon_vst2lane:
    case Intrinsic::arm_neon_vst3lane:
    case Intrinsic::arm_neon_vst4lane:
      return PerformVLDCombine(N, DCI);
    default:
      break;
    }
    break;
  }

  default:
    break;
  }
  return SDValue();
}
/// Report whether operation \p Opc on type \p VT should be turned into its
/// integer equivalent: true only for f32 loads and stores.
bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
                                                          EVT VT) const {
  if (!(VT == MVT::f32))
    return false;
  return Opc == ISD::LOAD || Opc == ISD::STORE;
}
/// Decide whether a memory access of type \p VT may be performed at the given
/// \p Alignment even though it is below the type's natural alignment.
///
/// \param VT        type being loaded or stored; non-simple types are rejected.
/// \param Alignment alignment of the access; only consulted for big-endian
///                  MVE vectors.
/// \param Fast      optional out-parameter, written only when the function
///                  returns true, telling whether the access is also fast.
/// \returns true if the misaligned access is permitted.
bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
                                                       unsigned Alignment,
                                                       MachineMemOperand::Flags,
                                                       bool *Fast) const {
  // Depends what it gets converted into if the type is weird.
  if (!VT.isSimple())
    return false;

  // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
  const bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
  const auto Ty = VT.getSimpleVT().SimpleTy;

  switch (Ty) {
  case MVT::i8:
  case MVT::i16:
  case MVT::i32:
    // Unaligned access can use (for example) LDRB, LDRH, LDR
    if (AllowsUnaligned) {
      if (Fast)
        *Fast = Subtarget->hasV7Ops();
      return true;
    }
    break;
  case MVT::f64:
  case MVT::v2f64:
    // For any little-endian targets with neon, we can support unaligned ld/st
    // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
    // A big-endian target may also explicitly support unaligned accesses
    if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
      if (Fast)
        *Fast = true;
      return true;
    }
    break;
  default:
    break;
  }

  // Everything below is specific to MVE.
  if (!Subtarget->hasMVEIntegerOps())
    return false;

  switch (Ty) {
  case MVT::v16i8:
  case MVT::v8i16:
  case MVT::v8f16:
  case MVT::v4i32:
  case MVT::v4f32:
  case MVT::v2i64:
  case MVT::v2f64:
  // These are for truncated stores
  case MVT::v4i8:
  case MVT::v8i8:
  case MVT::v4i16:
    break;
  default:
    return false;
  }

  bool Permitted = false;
  if (Subtarget->isLittle()) {
    // In little-endian MVE, the store instructions VSTRB.U8,
    // VSTRH.U16 and VSTRW.U32 all store the vector register in
    // exactly the same format, and differ only in the range of
    // their immediate offset field and the required alignment.
    //
    // In particular, VSTRB.U8 can store a vector at byte alignment.
    // So at this stage we can simply say that loads/stores of all
    // 128-bit wide vector types are permitted at any alignment,
    // because we know at least _one_ instruction can manage that.
    //
    // Later on we might find that some of those loads are better
    // generated as VLDRW.U32 if alignment permits, to take
    // advantage of the larger immediate range. But for the moment,
    // all that matters is that if we don't lower the load then
    // _some_ instruction can handle it.
    Permitted = true;
  } else {
    // In big-endian MVE, those instructions aren't so similar
    // after all, because they reorder the bytes of the vector
    // differently. So this time we can only store a particular
    // kind of vector if its alignment is at least the element
    // type. And we can't store vectors of i64 or f64 at all
    // without having to do some postprocessing, because there's
    // no VSTRD.U64.
    switch (Ty) {
    case MVT::v16i8:
      Permitted = true;
      break;
    case MVT::v8i16:
    case MVT::v8f16:
      Permitted = Alignment >= 2;
      break;
    case MVT::v4i32:
    case MVT::v4f32:
      Permitted = Alignment >= 4;
      break;
    default:
      break;
    }
  }

  if (!Permitted)
    return false;
  if (Fast)
    *Fast = true;
  return true;
}
/// Return true when both alignments are compatible with \p AlignCheck.  An
/// alignment of zero is always accepted; any other value must be an exact
/// multiple of \p AlignCheck.
static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
                       unsigned AlignCheck) {
  // The source alignment is examined first, then the destination.
  if (SrcAlign != 0 && SrcAlign % AlignCheck != 0)
    return false;
  if (DstAlign != 0 && DstAlign % AlignCheck != 0)
    return false;
  return true;
}
/// Pick a value type for expanding a memory operation of \p Size bytes.
/// Returns MVT::v2f64 or MVT::f64 when the NEON path below applies, and
/// MVT::Other otherwise so that the target-independent logic decides.
EVT ARMTargetLowering::getOptimalMemOpType(
    uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
    bool ZeroMemset, bool MemcpyStrSrc,
    const AttributeList &FuncAttributes) const {
  // NEON is only considered for non-memset operations or zeroing memsets, on
  // subtargets that have it, and never under the NoImplicitFloat attribute.
  const bool NeonCandidate =
      (!IsMemset || ZeroMemset) && Subtarget->hasNEON() &&
      !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat);

  if (NeonCandidate) {
    bool Fast;

    // 16 bytes or more: use a Q-register sized type when both sides are
    // 16-byte aligned or the misaligned access is allowed and fast.
    // (memOpAlign treats its two alignment arguments symmetrically.)
    if (Size >= 16) {
      if (memOpAlign(SrcAlign, DstAlign, 16))
        return MVT::v2f64;
      if (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1,
                                         MachineMemOperand::MONone, &Fast) &&
          Fast)
        return MVT::v2f64;
    }

    // 8 bytes or more: same test for a D-register sized type.
    if (Size >= 8) {
      if (memOpAlign(SrcAlign, DstAlign, 8))
        return MVT::f64;
      if (allowsMisalignedMemoryAccesses(MVT::f64, 0, 1,
                                         MachineMemOperand::MONone, &Fast) &&
          Fast)
        return MVT::f64;
    }
  }

  // Let the target-independent logic figure it out.
  return MVT::Other;
}
// A 64-bit integer is held as a high and a low part in two different
// registers, so truncating it is free: the low register is simply used as is.
bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
  const bool BothIntegers = SrcTy->isIntegerTy() && DstTy->isIntegerTy();
  if (!BothIntegers)
    return false;

  // Only a 64-bit source narrowed to a 32-bit destination is reported free.
  const unsigned FromBits = SrcTy->getPrimitiveSizeInBits();
  const unsigned ToBits = DstTy->getPrimitiveSizeInBits();
  if (FromBits != 64)
    return false;
  return ToBits == 32;
}
/// EVT flavour of isTruncateFree: only a scalar integer truncation from 64 to
/// 32 bits is reported as free.
bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
  // Vector types never qualify.
  if (SrcVT.isVector() || DstVT.isVector())
    return false;
  // Both sides have to be integer types.
  if (!SrcVT.isInteger() || !DstVT.isInteger())
    return false;

  const unsigned FromBits = SrcVT.getSizeInBits();
  const unsigned ToBits = DstVT.getSizeInBits();
  if (FromBits != 64)
    return false;
  return ToBits == 32;
}
/// Report whether zero-extending \p Val to \p VT2 is free.  That is the case
/// only when \p Val is a load node producing a simple i1, i8 or i16 value and
/// \p VT2 is a simple integer type.
bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  if (Val.getOpcode() != ISD::LOAD)
    return false;

  const EVT LoadedVT = Val.getValueType();
  const bool BothSimpleInts = LoadedVT.isSimple() && LoadedVT.isInteger() &&
                              VT2.isSimple() && VT2.isInteger();
  if (!BothSimpleInts)
    return false;

  // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
  const auto LoadedTy = LoadedVT.getSimpleVT().SimpleTy;
  return LoadedTy == MVT::i1 || LoadedTy == MVT::i8 || LoadedTy == MVT::i16;
}
/// Report whether negating a value of type \p VT is free.  Only the simple
/// type f16 can qualify, and only when the subtarget has FullFP16.
bool ARMTargetLowering::isFNegFree(EVT VT) const {
  // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
  // negate values directly (fneg is free). So, we don't want to let the DAG
  // combiner rewrite fneg into xors and some other instructions.  For f16 and
  // FullFP16 argument passing, some bitcast nodes may be introduced,
  // triggering this DAG combine rewrite, so we are avoiding that with this.
  if (VT.isSimple() && VT.getSimpleVT().SimpleTy == MVT::f16)
    return Subtarget->hasFullFP16();
  return false;
}
/// Check if Ext1 and Ext2 are both zero/sign extends, each of which doubles
/// the bitwidth of the (scalar) element type of its operand.
static bool areExtractExts(Value *Ext1, Value *Ext2) {
  // True when the extend's result elements are twice as wide as its input's.
  auto DoublesWidth = [](Instruction *Ext) {
    const auto ResultBits = Ext->getType()->getScalarSizeInBits();
    const auto InputBits = Ext->getOperand(0)->getType()->getScalarSizeInBits();
    return ResultBits == 2 * InputBits;
  };

  // Both values must match as extends before either is cast to Instruction.
  const bool BothAreExts = match(Ext1, m_ZExtOrSExt(m_Value())) &&
                           match(Ext2, m_ZExtOrSExt(m_Value()));
  if (!BothAreExts)
    return false;

  return DoublesWidth(cast<Instruction>(Ext1)) &&
         DoublesWidth(cast<Instruction>(Ext2));
}
/// Check if sinking \p I's operands to I's basic block is profitable, because
/// the operands can be folded into a target instruction, e.g.
/// sext/zext can be folded into vsubl.
///
/// On success the uses of both operands of \p I are appended to \p Ops.
bool ARMTargetLowering::shouldSinkOperands(Instruction *I,
                                           SmallVectorImpl<Use *> &Ops) const {
  // Only vector instructions on NEON subtargets are considered.
  if (!Subtarget->hasNEON() || !I->getType()->isVectorTy())
    return false;

  // Only add and sub are handled.
  const unsigned Opcode = I->getOpcode();
  if (Opcode != Instruction::Sub && Opcode != Instruction::Add)
    return false;

  // Both operands have to be width-doubling extends.
  if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
    return false;

  Ops.push_back(&I->getOperandUse(0));
  Ops.push_back(&I->getOperandUse(1));
  return true;
}
/// Decide whether forming an extending load for \p ExtVal is desirable.
/// Returns false for illegal result types and when the single user is an
/// ADD, SUB, SHL or VSHLIMM node.
bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
  if (!isTypeLegal(ExtVal.getValueType()))
    return false;

  // Don't create a loadext if we can fold the extension into a wide/long
  // instruction.
  // If there's more than one user instruction, the loadext is desirable no
  // matter what.  There can be two uses by the same instruction.
  const bool HasSingleUser =
      !ExtVal->use_empty() &&
      ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode());
  if (!HasSingleUser)
    return true;

  switch ((*ExtVal->use_begin())->getOpcode()) {
  case ISD::ADD:
  case ISD::SUB:
  case ISD::SHL:
  case ARMISD::VSHLIMM:
    return false;
  default:
    return true;
  }
}
/// Report whether truncating \p Ty1 to \p Ty2 is acceptable for a tail call.
/// Both types must be integers and \p Ty1 must be a legal type.
bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
  const bool BothIntegers = Ty1->isIntegerTy() && Ty2->isIntegerTy();
  if (!BothIntegers || !isTypeLegal(EVT::getEVT(Ty1)))
    return false;

  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");

  // Assuming the caller doesn't have a zeroext or signext return parameter,
  // truncation all the way down to i1 is valid.
  return true;
}
/// Return the cost of the scaling factor used in addressing mode \p AM for a
/// value of type \p Ty, or -1 when the addressing mode is not legal.
int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL,
                                            const AddrMode &AM, Type *Ty,
                                            unsigned AS) const {
  if (!isLegalAddressingMode(DL, AM, Ty, AS))
    return -1;

  // Without FPAO every legal mode costs the same.
  if (!Subtarget->hasFPAO())
    return 0;

  // positive offsets execute faster
  return AM.Scale < 0 ? 1 : 0;
}
/// Return true if \p V is usable as a Thumb1 load/store offset for type
/// \p VT: non-negative, a multiple of the access scale, and fitting in five
/// bits once divided by that scale.
static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
  if (V < 0)
    return false;

  // Scale is 1 for i1/i8, 2 for i16 and 4 for everything else.
  unsigned Scale;
  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::i1:
  case MVT::i8:
    Scale = 1;
    break;
  case MVT::i16:
    Scale = 2;
    break;
  default:
    // On thumb1 we load most things (i32, i64, floats, etc) with a LDR
    Scale = 4;
    break;
  }

  // The offset has to be a multiple of the scale.
  const bool Misaligned = (V & (Scale - 1)) != 0;
  if (Misaligned)
    return false;
  return isUInt<5>(V / Scale);
}
/// Return true if \p V is usable as a Thumb2 load/store offset for a value of
/// type \p VT on \p Subtarget.
static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
                                      const ARMSubtarget *Subtarget) {
  // Only integer and floating-point types are considered.
  if (!VT.isInteger() && !VT.isFloatingPoint())
    return false;

  if (VT.isVector()) {
    // Vector offsets are rejected when NEON is available.
    if (Subtarget->hasNEON())
      return false;
    // Floating-point vectors need MVE float ops when MVE integer ops exist.
    if (VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&
        !Subtarget->hasMVEFloatOps())
      return false;
  }

  // From here on work with the magnitude and remember the sign.
  const bool IsNeg = V < 0;
  if (IsNeg)
    V = -V;

  unsigned NumBytes = std::max(VT.getSizeInBits() / 8, 1U);

  // MVE: size * imm7
  if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
    switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {
    case MVT::i8:
      return isUInt<7>(V);
    case MVT::i16:
    case MVT::f16:
      return isShiftedUInt<7,1>(V);
    case MVT::i32:
    case MVT::f32:
      return isShiftedUInt<7,2>(V);
    default:
      return false;
    }
  }

  // half VLDR: 2 * imm8
  if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16())
    return isShiftedUInt<8, 1>(V);

  // VLDR and LDRD: 4 * imm8
  if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8)
    return isShiftedUInt<8, 2>(V);

  // + imm12 or - imm8
  const bool IsWordOrSmaller = NumBytes == 1 || NumBytes == 2 || NumBytes == 4;
  if (!IsWordOrSmaller)
    return false;
  return IsNeg ? isUInt<8>(V) : isUInt<12>(V);
}
/// isLegalAddressImmediate - Return true if the integer value can be used
/// as the offset of the target addressing mode for load / store of the
/// given type.
static bool isLegalAddressImmediate(int64_t V, EVT VT,
                                    const ARMSubtarget *Subtarget) {
  // A zero offset is always fine.
  if (V == 0)
    return true;

  if (!VT.isSimple())
    return false;

  // Thumb modes have their own rules.
  if (Subtarget->isThumb1Only())
    return isLegalT1AddressImmediate(V, VT);
  if (Subtarget->isThumb2())
    return isLegalT2AddressImmediate(V, VT, Subtarget);

  // ARM mode: only the magnitude of the offset matters.
  const int64_t Magnitude = V < 0 ? -V : V;
  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::i1:
  case MVT::i8:
  case MVT::i32:
    // +- imm12
    return isUInt<12>(Magnitude);
  case MVT::i16:
    // +- imm8
    return isUInt<8>(Magnitude);
  case MVT::f32:
  case MVT::f64:
    // FIXME: NEON?
    return Subtarget->hasVFP2Base() && isShiftedUInt<8, 2>(Magnitude);
  default:
    return false;
  }
}
/// Return true if the scaled-register part of \p AM is legal in Thumb2 for a
/// value of type \p VT.  Negative scales are always rejected.
bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
                                                      EVT VT) const {
  const int Scale = AM.Scale;
  if (Scale < 0)
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::i1:
  case MVT::i8:
  case MVT::i16:
  case MVT::i32: {
    if (Scale == 1)
      return true;
    // r + r << imm
    const int EvenScale = Scale & ~1;
    return EvenScale == 2 || EvenScale == 4 || EvenScale == 8;
  }
  case MVT::i64:
    // FIXME: What are we trying to model here? ldrd doesn't have an r + r
    // version in Thumb mode.
    // Accept r + r, and r * 2 (this can be lowered to r + r) when there is
    // no base register.
    return Scale == 1 || (!AM.HasBaseReg && Scale == 2);
  case MVT::isVoid:
    // Note, we allow "void" uses (basically, uses that aren't loads or
    // stores), because arm allows folding a scale into many arithmetic
    // operations.  This should be made more precise and revisited later.
    // Allow r << imm, but the imm has to be a multiple of two.
    return (Scale & 1) == 0 && isPowerOf2_32(Scale);
  default:
    return false;
  }
}
/// Return true if the scaled-register part of \p AM is legal in Thumb1.
/// \p VT does not influence the answer.
bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM,
                                                      EVT VT) const {
  const int Factor = AM.Scale;

  // Negative scales are not supported in Thumb1.
  if (Factor < 0)
    return false;

  // Thumb1 addressing modes do not support register scaling excepting the
  // following cases:
  // 1. Scale == 1 means no scaling.
  if (Factor == 1)
    return true;
  // 2. Scale == 2 this can be lowered to r + r if there is no base register.
  return !AM.HasBaseReg && Factor == 2;
}
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                              const AddrMode &AM, Type *Ty,
                                              unsigned AS, Instruction *I) const {
  EVT VT = getValueType(DL, Ty, true);

  // The immediate offset has to be encodable for this type.
  if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
    return false;

  // Can never fold addr of global into load/store.
  if (AM.BaseGV)
    return false;

  // no scale reg, must be "r+i" or "r", or "i".
  if (AM.Scale == 0)
    return true;

  // ARM doesn't support any R+R*scale+imm addr modes.
  if (AM.BaseOffs)
    return false;

  if (!VT.isSimple())
    return false;

  // The Thumb modes have dedicated checks.
  if (Subtarget->isThumb1Only())
    return isLegalT1ScaledAddressingMode(AM, VT);
  if (Subtarget->isThumb2())
    return isLegalT2ScaledAddressingMode(AM, VT);

  // ARM mode.
  const int Scale = AM.Scale;
  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::i1:
  case MVT::i8:
  case MVT::i32: {
    // The sign of the scale is irrelevant for these types.
    const int AbsScale = Scale < 0 ? -Scale : Scale;
    if (AbsScale == 1)
      return true;
    // r + r << imm
    return isPowerOf2_32(AbsScale & ~1);
  }
  case MVT::i16:
  case MVT::i64:
    // r +/- r
    if (Scale == 1 || (AM.HasBaseReg && Scale == -1))
      return true;
    // r * 2 (this can be lowered to r + r).
    return !AM.HasBaseReg && Scale == 2;
  case MVT::isVoid:
    // Note, we allow "void" uses (basically, uses that aren't loads or
    // stores), because arm allows folding a scale into many arithmetic
    // operations.  This should be made more precise and revisited later.
    // Allow r << imm, but the imm has to be a multiple of two.
    return (Scale & 1) == 0 && isPowerOf2_32(Scale);
  default:
    return false;
  }
}
/// isLegalICmpImmediate - Return true if the specified immediate is legal
/// icmp immediate, that is the target has icmp instructions which can compare
/// a register against the immediate without having to materialize the
/// immediate into a register.
bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
  // Thumb2 and ARM modes can use cmn for negative immediates, so both the
  // 32-bit value and its negation are tried.
  const uint32_t Value = (uint32_t)Imm;
  const uint32_t Negated = -Value;

  if (!Subtarget->isThumb()) {
    if (ARM_AM::getSOImmVal(Value) != -1)
      return true;
    return ARM_AM::getSOImmVal(Negated) != -1;
  }

  if (Subtarget->isThumb2()) {
    if (ARM_AM::getT2SOImmVal(Value) != -1)
      return true;
    return ARM_AM::getT2SOImmVal(Negated) != -1;
  }

  // Thumb1 doesn't have cmn, and only 8-bit immediates.
  return Imm >= 0 && Imm <= 255;
}
/// isLegalAddImmediate - Return true if the specified immediate is a legal add
/// *or sub* immediate, that is the target has add or sub instructions which can
/// add a register with the immediate without having to materialize the
/// immediate into a register.
bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
  // Same encoding for add/sub, just flip the sign.
  const int64_t Magnitude = std::abs(Imm);

  const bool InArmMode = !Subtarget->isThumb();
  if (InArmMode)
    return ARM_AM::getSOImmVal(Magnitude) != -1;

  if (Subtarget->isThumb2())
    return ARM_AM::getT2SOImmVal(Magnitude) != -1;

  // Thumb1 only has 8-bit unsigned immediate.
  return Magnitude >= 0 && Magnitude <= 255;
}
/// Split the ADD/SUB node \p Ptr into \p Base and \p Offset for an ARM-mode
/// indexed load/store of type \p VT, setting \p isInc to whether the offset
/// is added.  Returns false when \p Ptr is not an ADD/SUB or \p VT is not one
/// of the handled integer types.
static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
                                      bool isSEXTLoad, SDValue &Base,
                                      SDValue &Offset, bool &isInc,
                                      SelectionDAG &DAG) {
  const unsigned PtrOpc = Ptr->getOpcode();
  if (PtrOpc != ISD::ADD && PtrOpc != ISD::SUB)
    return false;
  const bool PtrIsAdd = PtrOpc == ISD::ADD;

  // i16 and sign-extending i8/i1 accesses use AddressingMode 3; i32 and the
  // remaining i8/i1 accesses use AddressingMode 2.
  const bool IsByteOrBit = VT == MVT::i8 || VT == MVT::i1;
  const bool UsesMode3 = VT == MVT::i16 || (IsByteOrBit && isSEXTLoad);
  const bool UsesMode2 = !UsesMode3 && (VT == MVT::i32 || IsByteOrBit);

  // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
  if (!UsesMode3 && !UsesMode2)
    return false;

  // A small negative constant offset becomes a decrement by its magnitude.
  // The magnitude must stay below 256 for mode 3 and below 0x1000 for mode 2.
  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
    const int RHSC = (int)RHS->getZExtValue();
    const int Limit = UsesMode3 ? 256 : 0x1000;
    if (RHSC < 0 && RHSC > -Limit) {
      assert(Ptr->getOpcode() == ISD::ADD);
      isInc = false;
      Base = Ptr->getOperand(0);
      Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
      return true;
    }
  }

  isInc = PtrIsAdd;

  // Mode 2 additions: if the first operand is a shift, it becomes the offset
  // and the second operand becomes the base.
  if (UsesMode2 && PtrIsAdd) {
    const ARM_AM::ShiftOpc ShOpcVal =
        ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
    if (ShOpcVal != ARM_AM::no_shift) {
      Base = Ptr->getOperand(1);
      Offset = Ptr->getOperand(0);
      return true;
    }
  }

  Base = Ptr->getOperand(0);
  Offset = Ptr->getOperand(1);
  return true;
}
/// Split the ADD/SUB node \p Ptr into \p Base and a constant \p Offset for a
/// Thumb2 indexed load/store.  Only non-zero constant offsets whose magnitude
/// is below 0x100 are accepted; \p isInc reports whether the offset is added.
static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
                                     bool isSEXTLoad, SDValue &Base,
                                     SDValue &Offset, bool &isInc,
                                     SelectionDAG &DAG) {
  const unsigned PtrOpc = Ptr->getOpcode();
  if (PtrOpc != ISD::ADD && PtrOpc != ISD::SUB)
    return false;

  Base = Ptr->getOperand(0);

  // The offset has to be a constant.
  ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1));
  if (!RHS)
    return false;

  const int RHSC = (int)RHS->getZExtValue();

  // 8 bits: a negative constant is turned into a decrement by its magnitude.
  if (RHSC < 0 && RHSC > -0x100) {
    assert(Ptr->getOpcode() == ISD::ADD);
    isInc = false;
    Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
    return true;
  }

  // 8 bit, no zero: direction follows the node's opcode.
  if (RHSC > 0 && RHSC < 0x100) {
    isInc = Ptr->getOpcode() == ISD::ADD;
    Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
    return true;
  }

  return false;
}
/// getPreIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if the node's address
/// can be legally represented as pre-indexed load / store address.
bool
ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
                                             SDValue &Offset,
                                             ISD::MemIndexedMode &AM,
                                             SelectionDAG &DAG) const {
  // No pre-indexed form is produced for Thumb1-only subtargets.
  if (Subtarget->isThumb1Only())
    return false;

  // Only load and store nodes are handled; pull out their address and
  // memory type.
  SDValue Addr;
  EVT MemVT;
  bool SignExtends = false;
  if (LoadSDNode *Load = dyn_cast<LoadSDNode>(N)) {
    Addr = Load->getBasePtr();
    MemVT = Load->getMemoryVT();
    SignExtends = Load->getExtensionType() == ISD::SEXTLOAD;
  } else if (StoreSDNode *Store = dyn_cast<StoreSDNode>(N)) {
    Addr = Store->getBasePtr();
    MemVT = Store->getMemoryVT();
  } else {
    return false;
  }

  // Decompose the address with the helper matching the instruction set.
  bool Increments;
  const bool Matched =
      Subtarget->isThumb2()
          ? getT2IndexedAddressParts(Addr.getNode(), MemVT, SignExtends, Base,
                                     Offset, Increments, DAG)
          : getARMIndexedAddressParts(Addr.getNode(), MemVT, SignExtends, Base,
                                      Offset, Increments, DAG);
  if (!Matched)
    return false;

  AM = Increments ? ISD::PRE_INC : ISD::PRE_DEC;
  return true;
}
/// getPostIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if this node can be
/// combined with a load / store to form a post-indexed load / store.
///
/// \p N is the load or store node and \p Op is the ADD/SUB node that updates
/// its address. On success \p Base, \p Offset and \p AM are filled in.
bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
SDValue &Base,
SDValue &Offset,
ISD::MemIndexedMode &AM,
SelectionDAG &DAG) const {
EVT VT;
SDValue Ptr;
// isNonExt is assigned on both the load and the store path below; any other
// node kind returns before it is read.
bool isSEXTLoad = false, isNonExt;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
VT = LD->getMemoryVT();
Ptr = LD->getBasePtr();
isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
VT = ST->getMemoryVT();
Ptr = ST->getBasePtr();
isNonExt = !ST->isTruncatingStore();
} else
return false;
if (Subtarget->isThumb1Only()) {
// Thumb-1 can do a limited post-inc load or store as an updating LDM. It
// must be non-extending/truncating, i32, with an offset of 4.
assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
if (Op->getOpcode() != ISD::ADD || !isNonExt)
return false;
// The increment has to be the constant 4.
auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
if (!RHS || RHS->getZExtValue() != 4)
return false;
Offset = Op->getOperand(1);
Base = Op->getOperand(0);
AM = ISD::POST_INC;
return true;
}
// Thumb2 and ARM mode: decompose the update node with the matching helper.
bool isInc;
bool isLegal = false;
if (Subtarget->isThumb2())
isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
isInc, DAG);
else
isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
isInc, DAG);
if (!isLegal)
return false;
// The base found by the helper must be the pointer the memory node uses.
if (Ptr != Base) {
// Swap base ptr and offset to catch more post-index load / store when
// it's legal. In Thumb2 mode, offset must be an immediate.
if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
!Subtarget->isThumb2())
std::swap(Base, Offset);
// Post-indexed load / store update the base pointer.
if (Ptr != Base)
return false;
}
AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
return true;
}
/// Compute known-zero / known-one bits of \p Op into \p Known for the
/// ARM-specific opcodes handled below. \p Known is reset on entry, and
/// opcodes without a case here leave it in that reset state.
void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
KnownBits &Known,
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const {
unsigned BitWidth = Known.getBitWidth();
Known.resetAll();
switch (Op.getOpcode()) {
default: break;
case ARMISD::ADDC:
case ARMISD::ADDE:
case ARMISD::SUBC:
case ARMISD::SUBE:
// Special cases when we convert a carry to a boolean.
// Only result number 0 of the node is examined.
if (Op.getResNo() == 0) {
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
// (ADDE 0, 0, C) will give us a single bit.
if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
isNullConstant(RHS)) {
// Every bit above bit 0 is known to be zero.
Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
return;
}
}
break;
case ARMISD::CMOV: {
// Bits are known zero/one if known on the LHS and RHS.
Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
// Nothing known about the first operand: skip analysing the second.
if (Known.isUnknown())
return;
KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
Known.Zero &= KnownRHS.Zero;
Known.One &= KnownRHS.One;
return;
}
case ISD::INTRINSIC_W_CHAIN: {
// Operand 1 is read as the intrinsic ID. Every path through the inner
// switch returns, so control never continues into the BFI case below.
ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
switch (IntID) {
default: return;
case Intrinsic::arm_ldaex:
case Intrinsic::arm_ldrex: {
// Bits above the scalar width of the memory type are known zero.
EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
unsigned MemBits = VT.getScalarSizeInBits();
Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
return;
}
}
}
case ARMISD::BFI: {
// Conservatively, we can recurse down the first operand
// and just mask out all affected bits.
Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
// The operand to BFI is already a mask suitable for removing the bits it
// sets.
ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2));
const APInt &Mask = CI->getAPIntValue();
Known.Zero &= Mask;
Known.One &= Mask;
return;
}
case ARMISD::VGETLANEs:
case ARMISD::VGETLANEu: {
const SDValue &SrcSV = Op.getOperand(0);
EVT VecVT = SrcSV.getValueType();
assert(VecVT.isVector() && "VGETLANE expected a vector type");
const unsigned NumSrcElts = VecVT.getVectorNumElements();
ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());
assert(Pos->getAPIntValue().ult(NumSrcElts) &&
"VGETLANE index out of bounds");
unsigned Idx = Pos->getZExtValue();
// Only the extracted lane is demanded from the source vector.
APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);
EVT VT = Op.getValueType();
const unsigned DstSz = VT.getScalarSizeInBits();
const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
// SrcSz is only used in the asserts below; the cast silences unused
// variable warnings in builds without assertions.
(void)SrcSz;
assert(SrcSz == Known.getBitWidth());
assert(DstSz > SrcSz);
// Widen the lane's known bits to the result width: sign-extend for
// VGETLANEs, zero-extend for VGETLANEu.
if (Op.getOpcode() == ARMISD::VGETLANEs)
Known = Known.sext(DstSz);
else {
Known = Known.zext(DstSz, true /* extended bits are known zero */);
}
assert(DstSz == Known.getBitWidth());
break;
}
}
}
/// Try to replace the constant mask of a scalar AND node \p Op with a
/// different mask that agrees with it on every demanded bit.
///
/// \p DemandedAPInt gives the bits of the result that are demanded.
/// Returns false when no change is made here; otherwise returns true or the
/// result of TLO.CombineTo, as the individual paths below show.
bool
ARMTargetLowering::targetShrinkDemandedConstant(SDValue Op,
const APInt &DemandedAPInt,
TargetLoweringOpt &TLO) const {
// Delay optimization, so we don't have to deal with illegal types, or block
// optimizations.
if (!TLO.LegalOps)
return false;
// Only optimize AND for now.
if (Op.getOpcode() != ISD::AND)
return false;
EVT VT = Op.getValueType();
// Ignore vectors.
if (VT.isVector())
return false;
assert(VT == MVT::i32 && "Unexpected integer type");
// Make sure the RHS really is a constant.
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
if (!C)
return false;
unsigned Mask = C->getZExtValue();
unsigned Demanded = DemandedAPInt.getZExtValue();
// ShrunkMask keeps only the demanded mask bits; ExpandedMask additionally
// sets every bit that is not demanded. Any mask between the two is
// equivalent on the demanded bits.
unsigned ShrunkMask = Mask & Demanded;
unsigned ExpandedMask = Mask | ~Demanded;
// If the mask is all zeros, let the target-independent code replace the
// result with zero.
if (ShrunkMask == 0)
return false;
// If the mask is all ones, erase the AND. (Currently, the target-independent
// code won't do this, so we have to do it explicitly to avoid an infinite
// loop in obscure cases.)
if (ExpandedMask == ~0U)
return TLO.CombineTo(Op, Op.getOperand(0));
// A candidate is legal when it contains every bit of ShrunkMask and no bit
// outside ExpandedMask.
auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
};
// Rebuild the AND with NewMask unless it already equals the current mask,
// in which case true is returned without touching the DAG.
auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
if (NewMask == Mask)
return true;
SDLoc DL(Op);
SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
return TLO.CombineTo(Op, NewOp);
};
// The candidates below are tried in order of preference; the first legal
// one wins.
// Prefer uxtb mask.
if (IsLegalMask(0xFF))
return UseMask(0xFF);
// Prefer uxth mask.
if (IsLegalMask(0xFFFF))
return UseMask(0xFFFF);
// [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
// FIXME: Prefer a contiguous sequence of bits for other optimizations.
if (ShrunkMask < 256)
return UseMask(ShrunkMask);
// [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
// FIXME: Prefer a contiguous sequence of bits for other optimizations.
if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
return UseMask(ExpandedMask);
// Potential improvements:
//
// We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
// We could try to prefer Thumb1 immediates which can be lowered to a
// two-instruction sequence.
// We could try to recognize more legal ARM/Thumb2 immediates here.
return false;
}
//===----------------------------------------------------------------------===//
// ARM Inline Assembly Support
//===----------------------------------------------------------------------===//
/// Recognise an inline asm call that consists solely of "rev $0, $1" with
/// constraints starting "=l,l" and a 32-bit integer result, and lower it to a
/// byte swap.  Returns false when the call is left alone.
bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
  // Looking for "rev" which is V6+.
  if (!Subtarget->hasV6Ops())
    return false;

  InlineAsm *Asm = cast<InlineAsm>(CI->getCalledValue());
  std::string Text = Asm->getAsmString();

  // Split into statements; only a single statement is handled.
  SmallVector<StringRef, 4> Pieces;
  SplitString(Text, Pieces, ";\n");
  if (Pieces.size() != 1)
    return false;

  // Split that statement into its tokens.
  Text = Pieces[0];
  Pieces.clear();
  SplitString(Text, Pieces, " \t,");

  // rev $0, $1
  const bool IsRev =
      Pieces.size() == 3 &&
      Pieces[0] == "rev" && Pieces[1] == "$0" && Pieces[2] == "$1" &&
      Asm->getConstraintString().compare(0, 4, "=l,l") == 0;
  if (!IsRev)
    return false;

  // Only a 32-bit integer result is lowered.
  IntegerType *ResultTy = dyn_cast<IntegerType>(CI->getType());
  if (!ResultTy || ResultTy->getBitWidth() != 32)
    return false;
  return IntrinsicLowering::LowerToByteSwap(CI);
}
/// Choose the concrete constraint ("r" or "w") used in place of an "X"
/// constraint for an operand of type \p ConstraintVT.
const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
  // At this point, we have to lower this constraint to something else, so we
  // lower it to an "r" or "w". However, by doing this we will force the result
  // to be in register, while the X constraint is much more permissive.
  //
  // Although we are correct (we are free to emit anything, without
  // constraints), we might break use cases that would expect us to be more
  // efficient and emit something else.
  if (!Subtarget->hasVFP2Base())
    return "r";

  if (ConstraintVT.isFloatingPoint())
    return "w";

  // 64-bit and 128-bit vectors also get "w" when NEON is available.
  if (ConstraintVT.isVector() && Subtarget->hasNEON()) {
    const auto Bits = ConstraintVT.getSizeInBits();
    if (Bits == 64 || Bits == 128)
      return "w";
  }

  return "r";
}
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
///
/// One-letter and two-letter constraints are classified here; anything not
/// recognised is forwarded to TargetLowering::getConstraintType.
ARMTargetLowering::ConstraintType
ARMTargetLowering::getConstraintType(StringRef Constraint) const {
- if (Constraint.size() == 1) {
+ unsigned S = Constraint.size();
+ if (S == 1) {
switch (Constraint[0]) {
default: break;
case 'l': return C_RegisterClass;
case 'w': return C_RegisterClass;
case 'h': return C_RegisterClass;
case 'x': return C_RegisterClass;
case 't': return C_RegisterClass;
- case 'j': return C_Other; // Constant for movw.
- // An address with a single base register. Due to the way we
- // currently handle addresses it is the same as an 'r' memory constraint.
+ case 'j': return C_Immediate; // Constant for movw.
+ // An address with a single base register. Due to the way we
+ // currently handle addresses it is the same as an 'r' memory constraint.
case 'Q': return C_Memory;
}
- } else if (Constraint.size() == 2) {
+ } else if (S == 2) {
switch (Constraint[0]) {
default: break;
case 'T': return C_RegisterClass;
// All 'U+' constraints are addresses.
case 'U': return C_Memory;
}
}
return TargetLowering::getConstraintType(Constraint);
}
/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
ARMTargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
  Value *Operand = info.CallOperandVal;
  if (!Operand)
    return CW_Default;

  Type *OperandTy = Operand->getType();

  // Look at the constraint type.
  switch (*constraint) {
  case 'l':
    // Integer operands only; the weight depends on whether we are in Thumb.
    if (!OperandTy->isIntegerTy())
      return CW_Invalid;
    return Subtarget->isThumb() ? CW_SpecificReg : CW_Register;
  case 'w':
    // Floating-point operands only.
    return OperandTy->isFloatingPointTy() ? CW_Register : CW_Invalid;
  default:
    return TargetLowering::getSingleConstraintMatchWeight(info, constraint);
  }
}
// (register number, register class) pair returned for a constraint; the
// class-only results below use register number 0.
using RCPair = std::pair<unsigned, const TargetRegisterClass *>;
/// Map an inline asm register constraint to a register / register class for
/// a value of type \p VT.
///
/// One-letter and two-letter ARM constraints are handled here. Whenever a
/// case does not produce a result it breaks out of the switch, after which
/// the "{cc}" name is checked and finally the generic
/// TargetLowering::getRegForInlineAsmConstraint is consulted.
RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
switch (Constraint.size()) {
case 1:
// GCC ARM Constraint Letters
switch (Constraint[0]) {
case 'l': // Low regs or general regs.
if (Subtarget->isThumb())
return RCPair(0U, &ARM::tGPRRegClass);
return RCPair(0U, &ARM::GPRRegClass);
case 'h': // High regs or no regs.
if (Subtarget->isThumb())
return RCPair(0U, &ARM::hGPRRegClass);
break;
case 'r':
if (Subtarget->isThumb1Only())
return RCPair(0U, &ARM::tGPRRegClass);
return RCPair(0U, &ARM::GPRRegClass);
// 'w': the register class is chosen from the value type: f32, or any
// 64-bit or 128-bit type. MVT::Other and all other sizes fall back.
case 'w':
if (VT == MVT::Other)
break;
if (VT == MVT::f32)
return RCPair(0U, &ARM::SPRRegClass);
if (VT.getSizeInBits() == 64)
return RCPair(0U, &ARM::DPRRegClass);
if (VT.getSizeInBits() == 128)
return RCPair(0U, &ARM::QPRRegClass);
break;
// 'x': same type dispatch as 'w', using the *_8 register classes.
case 'x':
if (VT == MVT::Other)
break;
if (VT == MVT::f32)
return RCPair(0U, &ARM::SPR_8RegClass);
if (VT.getSizeInBits() == 64)
return RCPair(0U, &ARM::DPR_8RegClass);
if (VT.getSizeInBits() == 128)
return RCPair(0U, &ARM::QPR_8RegClass);
break;
// 't': like 'w' but i32 is accepted alongside f32, and the wider types
// use the *_VFP2 register classes.
case 't':
if (VT == MVT::Other)
break;
if (VT == MVT::f32 || VT == MVT::i32)
return RCPair(0U, &ARM::SPRRegClass);
if (VT.getSizeInBits() == 64)
return RCPair(0U, &ARM::DPR_VFP2RegClass);
if (VT.getSizeInBits() == 128)
return RCPair(0U, &ARM::QPR_VFP2RegClass);
break;
}
break;
case 2:
// Two-letter constraints: only "Te" and "To" are handled here.
if (Constraint[0] == 'T') {
switch (Constraint[1]) {
default:
break;
case 'e':
return RCPair(0U, &ARM::tGPREvenRegClass);
case 'o':
return RCPair(0U, &ARM::tGPROddRegClass);
}
}
break;
default:
break;
}
// "{cc}" (compared case-insensitively) names CPSR.
if (StringRef("{cc}").equals_lower(Constraint))
return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector.  If it is invalid, don't add anything to Ops.
///
/// The single-letter immediate constraints 'j' and 'I' through 'O' are
/// validated here: the operand must be a ConstantSDNode whose sign-extended
/// value fits in an int and passes the per-letter range check, in which case
/// it is pushed onto \p Ops as a target constant.  Every other single-letter
/// constraint is forwarded to TargetLowering::LowerAsmOperandForConstraint.
/// Constraints that are not exactly one character long add nothing.
void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     std::string &Constraint,
                                                     std::vector<SDValue>&Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result;

  // Currently only support length 1 constraints.
  if (Constraint.length() != 1) return;

  char ConstraintLetter = Constraint[0];
  switch (ConstraintLetter) {
  default: break;
  case 'j':
  case 'I': case 'J': case 'K': case 'L':
  case 'M': case 'N': case 'O':
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
    if (!C)
      return;

    int64_t CVal64 = C->getSExtValue();
    int CVal = (int) CVal64;
    // None of these constraints allow values larger than 32 bits.  Check
    // that the value fits in an int.
    if (CVal != CVal64)
      return;

    // In the inner switch, `break` means the value is accepted and `return`
    // means it is rejected (nothing is added to Ops).
    switch (ConstraintLetter) {
      case 'j':
        // Constant suitable for movw, must be between 0 and
        // 65535.
        if (Subtarget->hasV6T2Ops())
          if (CVal >= 0 && CVal <= 65535)
            break;
        return;
      case 'I':
        if (Subtarget->isThumb1Only()) {
          // This must be a constant between 0 and 255, for ADD
          // immediates.
          if (CVal >= 0 && CVal <= 255)
            break;
        } else if (Subtarget->isThumb2()) {
          // A constant that can be used as an immediate value in a
          // data-processing instruction.
          if (ARM_AM::getT2SOImmVal(CVal) != -1)
            break;
        } else {
          // A constant that can be used as an immediate value in a
          // data-processing instruction.
          if (ARM_AM::getSOImmVal(CVal) != -1)
            break;
        }
        return;

      case 'J':
        if (Subtarget->isThumb1Only()) {
          // This must be a constant between -255 and -1, for negated ADD
          // immediates. This can be used in GCC with an "n" modifier that
          // prints the negated value, for use with SUB instructions. It is
          // not useful otherwise but is implemented for compatibility.
          if (CVal >= -255 && CVal <= -1)
            break;
        } else {
          // This must be a constant between -4095 and 4095. It is not clear
          // what this constraint is intended for. Implemented for
          // compatibility with GCC.
          if (CVal >= -4095 && CVal <= 4095)
            break;
        }
        return;

      case 'K':
        if (Subtarget->isThumb1Only()) {
          // A 32-bit value where only one byte has a nonzero value. Exclude
          // zero to match GCC. This constraint is used by GCC internally for
          // constants that can be loaded with a move/shift combination.
          // It is not useful otherwise but is implemented for compatibility.
          if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
            break;
        } else if (Subtarget->isThumb2()) {
          // A constant whose bitwise inverse can be used as an immediate
          // value in a data-processing instruction. This can be used in GCC
          // with a "B" modifier that prints the inverted value, for use with
          // BIC and MVN instructions. It is not useful otherwise but is
          // implemented for compatibility.
          if (ARM_AM::getT2SOImmVal(~CVal) != -1)
            break;
        } else {
          // A constant whose bitwise inverse can be used as an immediate
          // value in a data-processing instruction. This can be used in GCC
          // with a "B" modifier that prints the inverted value, for use with
          // BIC and MVN instructions. It is not useful otherwise but is
          // implemented for compatibility.
          if (ARM_AM::getSOImmVal(~CVal) != -1)
            break;
        }
        return;

      case 'L':
        if (Subtarget->isThumb1Only()) {
          // This must be a constant between -7 and 7,
          // for 3-operand ADD/SUB immediate instructions.  Both ends of the
          // range are inclusive, so 7 itself has to be accepted.
          if (CVal >= -7 && CVal <= 7)
            break;
        } else if (Subtarget->isThumb2()) {
          // A constant whose negation can be used as an immediate value in a
          // data-processing instruction. This can be used in GCC with an "n"
          // modifier that prints the negated value, for use with SUB
          // instructions. It is not useful otherwise but is implemented for
          // compatibility.
          if (ARM_AM::getT2SOImmVal(-CVal) != -1)
            break;
        } else {
          // A constant whose negation can be used as an immediate value in a
          // data-processing instruction. This can be used in GCC with an "n"
          // modifier that prints the negated value, for use with SUB
          // instructions. It is not useful otherwise but is implemented for
          // compatibility.
          if (ARM_AM::getSOImmVal(-CVal) != -1)
            break;
        }
        return;

      case 'M':
        if (Subtarget->isThumb1Only()) {
          // This must be a multiple of 4 between 0 and 1020, for
          // ADD sp + immediate.
          if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
            break;
        } else {
          // A power of two or a constant between 0 and 32.  This is used in
          // GCC for the shift amount on shifted register operands, but it is
          // useful in general for any shift amounts.
          if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
            break;
        }
        return;

      case 'N':
        if (Subtarget->isThumb()) {  // FIXME thumb2
          // This must be a constant between 0 and 31, for shift amounts.
          if (CVal >= 0 && CVal <= 31)
            break;
        }
        return;

      case 'O':
        if (Subtarget->isThumb()) {  // FIXME thumb2
          // This must be a multiple of 4 between -508 and 508, for
          // ADD/SUB sp = sp + immediate.
          if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
            break;
        }
        return;
    }
    Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType());
    break;
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
/// Pick the combined divide/remainder runtime library call for node \p N.
///
/// The signed variant is chosen for SDIVREM and SREM nodes, the unsigned one
/// for UDIVREM and UREM nodes. \p SVT selects the operand width (i8, i16, i32
/// or i64); any other width is treated as unreachable.
static RTLIB::Libcall getDivRemLibcall(const SDNode *N,
                                       MVT::SimpleValueType SVT) {
  const unsigned Opc = N->getOpcode();
  assert((Opc == ISD::SDIVREM || Opc == ISD::UDIVREM || Opc == ISD::SREM ||
          Opc == ISD::UREM) &&
         "Unhandled Opcode in getDivRemLibcall");
  const bool Signed = (Opc == ISD::SDIVREM || Opc == ISD::SREM);

  switch (SVT) {
  case MVT::i8:
    return Signed ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8;
  case MVT::i16:
    return Signed ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16;
  case MVT::i32:
    return Signed ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32;
  case MVT::i64:
    return Signed ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64;
  default:
    llvm_unreachable("Unexpected request for libcall!");
  }
}
/// Build the argument list for a divide/remainder library call from the
/// operands of \p N.
///
/// Every operand becomes one argument, flagged sign-extended for SDIVREM/SREM
/// and zero-extended for UDIVREM/UREM. On Windows targets the first two
/// arguments are exchanged.
static TargetLowering::ArgListTy getDivRemArgList(
    const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
  const unsigned Opc = N->getOpcode();
  assert((Opc == ISD::SDIVREM || Opc == ISD::UDIVREM || Opc == ISD::SREM ||
          Opc == ISD::UREM) &&
         "Unhandled Opcode in getDivRemArgList");
  const bool Signed = (Opc == ISD::SDIVREM || Opc == ISD::SREM);

  TargetLowering::ArgListTy Args;
  for (SDValue Operand : N->op_values()) {
    TargetLowering::ArgListEntry Arg;
    Arg.Node = Operand;
    Arg.Ty = Operand.getValueType().getTypeForEVT(*Context);
    Arg.IsSExt = Signed;
    Arg.IsZExt = !Signed;
    Args.push_back(Arg);
  }

  // Windows expects the first two arguments in the opposite order.
  if (Subtarget->isTargetWindows() && Args.size() >= 2)
    std::swap(Args[0], Args[1]);
  return Args;
}
/// Lower an SDIVREM/UDIVREM node.
///
/// When the subtarget has a hardware divider for the current instruction set
/// and the result type is i32, the node is expanded inline into a
/// {div, rem} pair. Otherwise a call to the combined divide/remainder
/// library routine is emitted and the call's result value is returned.
SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
  assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
          Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
          Subtarget->isTargetWindows()) &&
         "Register-based DivRem lowering only");
  const unsigned Opc = Op->getOpcode();
  assert((Opc == ISD::SDIVREM || Opc == ISD::UDIVREM) &&
         "Invalid opcode for Div/Rem lowering");
  const bool Signed = (Opc == ISD::SDIVREM);
  EVT VT = Op->getValueType(0);
  Type *Ty = VT.getTypeForEVT(*DAG.getContext());
  SDLoc dl(Op);

  // With a hardware divider, compute
  //   div = a / b
  //   rem = a - b * div
  // and return {div, rem}. This should be lowered into UDIV/SDIV + MLS later
  // on.
  bool HWDivide;
  if (Subtarget->isThumb())
    HWDivide = Subtarget->hasDivideInThumbMode();
  else
    HWDivide = Subtarget->hasDivideInARMMode();

  if (HWDivide && VT == MVT::i32) {
    const SDValue LHS = Op->getOperand(0);
    const SDValue RHS = Op->getOperand(1);
    SDValue Quot =
        DAG.getNode(Signed ? ISD::SDIV : ISD::UDIV, dl, VT, LHS, RHS);
    SDValue Prod = DAG.getNode(ISD::MUL, dl, VT, Quot, RHS);
    SDValue Remainder = DAG.getNode(ISD::SUB, dl, VT, LHS, Prod);
    SDValue Pair[2] = {Quot, Remainder};
    return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Pair);
  }

  // No usable divider: call the runtime helper, which returns both results
  // as a two-element struct.
  RTLIB::Libcall LC =
      getDivRemLibcall(Op.getNode(), VT.getSimpleVT().SimpleTy);
  SDValue Chain = DAG.getEntryNode();
  TargetLowering::ArgListTy Args =
      getDivRemArgList(Op.getNode(), DAG.getContext(), Subtarget);
  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
                                         getPointerTy(DAG.getDataLayout()));
  Type *RetTy = StructType::get(Ty, Ty);

  // Windows additionally gets a check on the denominator ahead of the call.
  if (Subtarget->isTargetWindows())
    Chain = WinDBZCheckDenominator(DAG, Op.getNode(), Chain);

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl);
  CLI.setChain(Chain);
  CLI.setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args));
  CLI.setInRegister();
  CLI.setSExtResult(Signed);
  CLI.setZExtResult(!Signed);

  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
  return CallInfo.first;
}
// Lower SREM/UREM through the combined divmod runtime helpers and keep only
// the remainder half of the returned pair.
// See RTABI section 4.2/4.3.
SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
  LLVMContext &Ctx = *DAG.getContext();
  const MVT::SimpleValueType SVT = N->getValueType(0).getSimpleVT().SimpleTy;

  // The helper returns {div, rem}, both of the node's integer type.
  Type *EltTy;
  switch (SVT) {
  case MVT::i8:
    EltTy = Type::getInt8Ty(Ctx);
    break;
  case MVT::i16:
    EltTy = Type::getInt16Ty(Ctx);
    break;
  case MVT::i32:
    EltTy = Type::getInt32Ty(Ctx);
    break;
  case MVT::i64:
    EltTy = Type::getInt64Ty(Ctx);
    break;
  default:
    llvm_unreachable("Unexpected request for libcall!");
  }
  Type *RetTy = StructType::get(Ctx, {EltTy, EltTy});

  RTLIB::Libcall LC = getDivRemLibcall(N, SVT);
  SDValue Chain = DAG.getEntryNode();
  TargetLowering::ArgListTy Args =
      getDivRemArgList(N, DAG.getContext(), Subtarget);
  const bool Signed = (N->getOpcode() == ISD::SREM);
  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
                                         getPointerTy(DAG.getDataLayout()));

  // Windows additionally gets a check on the denominator ahead of the call.
  if (Subtarget->isTargetWindows())
    Chain = WinDBZCheckDenominator(DAG, N, Chain);

  // Lower the call itself.
  CallLoweringInfo CLI(DAG);
  CLI.setChain(Chain);
  CLI.setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args));
  CLI.setSExtResult(Signed);
  CLI.setZExtResult(!Signed);
  CLI.setDebugLoc(SDLoc(N));
  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);

  // Operand 0 of the result node is the quotient, operand 1 the remainder.
  SDNode *Merged = CallResult.first.getNode();
  assert(Merged->getNumOperands() == 2 && "divmod should return two operands");
  return Merged->getOperand(1);
}
/// Lower a dynamic stack allocation on Windows.
///
/// If the function carries the "no-stack-arg-probe" attribute, SP is adjusted
/// directly (subtract the size, then mask to the requested alignment when it
/// is non-zero). Otherwise the size in 4-byte words is passed in R4 to a
/// WIN__CHKSTK node and the new SP is read back afterwards.
/// Returns the merged pair {new SP, chain}.
SDValue
ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget->isTargetWindows() && "unsupported target platform");
  SDLoc DL(Op);

  // Operand 0 is the incoming chain, operand 1 the allocation size.
  SDValue Chain = Op.getOperand(0);
  SDValue Size = Op.getOperand(1);

  const bool SkipProbe =
      DAG.getMachineFunction().getFunction().hasFnAttribute(
          "no-stack-arg-probe");
  if (SkipProbe) {
    unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
    SDValue OldSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
    Chain = OldSP.getValue(1);
    SDValue AdjustedSP = DAG.getNode(ISD::SUB, DL, MVT::i32, OldSP, Size);
    if (Align) {
      // AND with -Align to clear the low bits of the new stack pointer.
      SDValue Mask = DAG.getConstant(-(uint64_t)Align, DL, MVT::i32);
      AdjustedSP = DAG.getNode(ISD::AND, DL, MVT::i32, AdjustedSP.getValue(0),
                               Mask);
    }
    Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, AdjustedSP);
    SDValue Results[2] = {AdjustedSP, Chain};
    return DAG.getMergeValues(Results, DL);
  }

  // Probed path: the WIN__CHKSTK node takes the size in words via R4.
  SDValue WordCount = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
                                  DAG.getConstant(2, DL, MVT::i32));
  SDValue Glue;
  Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, WordCount, Glue);
  Glue = Chain.getValue(1);

  SDVTList ChkStkVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, ChkStkVTs, Chain, Glue);

  SDValue ProbedSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
  Chain = ProbedSP.getValue(1);
  SDValue Results[2] = {ProbedSP, Chain};
  return DAG.getMergeValues(Results, DL);
}
/// Custom-lower FP_EXTEND in at most two steps: 16 -> 32 bits first (when the
/// source is 16 bits wide), then 32 -> 64 bits (when the destination is 64
/// bits wide). Each step uses an FP_EXTEND node when the subtarget feature
/// for it is present (hasFP16 / hasFP64) and a library call otherwise.
SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
  SDValue Val = Op.getOperand(0);
  const unsigned DstSz = Op.getValueType().getSizeInBits();
  const unsigned SrcSz = Val.getValueType().getSizeInBits();
  assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
         "Unexpected type for custom-lowering FP_EXTEND");
  assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
         "With both FP DP and 16, any FP conversion is legal!");
  assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
         "With FP16, 16 to 32 conversion is legal!");

  SDLoc Loc(Op);

  // Step 1: widen a 16-bit source to 32 bits.
  if (SrcSz == 16) {
    if (Subtarget->hasFP16()) {
      // Instruction from 16 -> 32.
      Val = DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Val);
    } else {
      // Library call from 16 -> 32.
      RTLIB::Libcall HalfToSingle = RTLIB::getFPEXT(MVT::f16, MVT::f32);
      assert(HalfToSingle != RTLIB::UNKNOWN_LIBCALL &&
             "Unexpected type for custom-lowering FP_EXTEND");
      Val = makeLibCall(DAG, HalfToSingle, MVT::f32, Val, /*isSigned*/ false,
                        Loc)
                .first;
    }
  }

  // A 32-bit destination is finished at this point.
  if (DstSz != 64)
    return Val;

  // Step 2: Val is 32 bits wide here; widen it to 64 bits.
  if (Subtarget->hasFP64())
    return DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f64, Val);

  RTLIB::Libcall SingleToDouble = RTLIB::getFPEXT(MVT::f32, MVT::f64);
  assert(SingleToDouble != RTLIB::UNKNOWN_LIBCALL &&
         "Unexpected type for custom-lowering FP_EXTEND");
  return makeLibCall(DAG, SingleToDouble, MVT::f64, Val, /*isSigned*/ false,
                     Loc)
      .first;
}
/// Custom-lower FP_ROUND. A 32-bit source on a subtarget with FP16 is
/// returned unchanged; every other combination becomes a call to the
/// rounding library routine for the source/destination type pair.
SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(0);
  EVT SrcVT = Src.getValueType();
  EVT DstVT = Op.getValueType();
  const unsigned SrcSz = SrcVT.getSizeInBits();
  const unsigned DstSz = DstVT.getSizeInBits();
  (void)DstSz; // Only read by the assertion below.
  assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 &&
         "Unexpected type for custom-lowering FP_ROUND");
  assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
         "With both FP DP and 16, any FP conversion is legal!");

  SDLoc Loc(Op);

  // 32 -> 16 is handled by an instruction when FP16 is available.
  const bool NativeSingleToHalf = (SrcSz == 32) && Subtarget->hasFP16();
  if (NativeSingleToHalf)
    return Op;

  // Library call for 32 -> 16 and for 64 -> [32, 16].
  RTLIB::Libcall RoundCall = RTLIB::getFPROUND(SrcVT, DstVT);
  assert(RoundCall != RTLIB::UNKNOWN_LIBCALL &&
         "Unexpected type for custom-lowering FP_ROUND");
  return makeLibCall(DAG, RoundCall, DstVT, Src, /*isSigned*/ false, Loc)
      .first;
}
/// Expand an i64 ABS into operations on its two i32 halves.
///
/// With S = Hi >>s 31, the expansion computes (X + S) ^ S per half, carrying
/// from the low addition into the high one. The low half is appended to
/// \p Results first, then the high half. Nothing is appended when ADDCARRY or
/// UADDO on i32 is neither legal nor custom.
void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                 SelectionDAG &DAG) const {
  assert(N->getValueType(0) == MVT::i64 && "Unexpected type (!= i64) on ABS.");
  MVT HalfT = MVT::i32;
  SDLoc dl(N);

  const bool CanAddWithCarry = isOperationLegalOrCustom(ISD::ADDCARRY, HalfT) &&
                               isOperationLegalOrCustom(ISD::UADDO, HalfT);
  if (!CanAddWithCarry)
    return;

  unsigned HalfBits = HalfT.getScalarSizeInBits();
  SDVTList SumAndCarry = DAG.getVTList(HalfT, MVT::i1);

  // Split the operand into its low and high halves.
  SDValue LoIn = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
                             DAG.getConstant(0, dl, HalfT));
  SDValue HiIn = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
                             DAG.getConstant(1, dl, HalfT));

  // Arithmetic shift of the high half by (width - 1).
  SDValue Sign = DAG.getNode(
      ISD::SRA, dl, HalfT, HiIn,
      DAG.getConstant(HalfBits - 1, dl,
                      getShiftAmountTy(HalfT, DAG.getDataLayout())));

  // Add Sign to both halves; result #1 of the low add feeds the high add.
  SDValue LoSum = DAG.getNode(ISD::UADDO, dl, SumAndCarry, Sign, LoIn);
  SDValue HiSum = DAG.getNode(ISD::ADDCARRY, dl, SumAndCarry, Sign, HiIn,
                              LoSum.getValue(1));

  // XOR both sums with Sign.
  SDValue HiOut = DAG.getNode(ISD::XOR, dl, HalfT, Sign, HiSum);
  SDValue LoOut = DAG.getNode(ISD::XOR, dl, HalfT, Sign, LoSum);

  Results.push_back(LoOut);
  Results.push_back(HiOut);
}
/// Report whether an offset may be folded into a global address node.
/// Always returns false; \p GA is not inspected.
bool
ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
// The ARM target isn't yet aware of offsets.
return false;
}
/// Return true if \p v is not 0xffffffff and its bitwise complement is
/// accepted by isShiftedMask_32.
bool ARM::isBitFieldInvertedMask(unsigned v) {
  // There can be 1's on either or both "outsides"; all the "inside" bits
  // must be 0's. The all-ones value is rejected up front.
  const bool AllOnes = (v == 0xffffffff);
  return !AllOnes && isShiftedMask_32(~v);
}
/// isFPImmLegal - Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
///
/// Requires VFP3 base support. f16 additionally requires full FP16 and f64
/// requires FP64; each type is then checked against its immediate encoder.
bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                     bool ForCodeSize) const {
  if (Subtarget->hasVFP3Base()) {
    if (VT == MVT::f16 && Subtarget->hasFullFP16())
      return ARM_AM::getFP16Imm(Imm) != -1;
    if (VT == MVT::f32)
      return ARM_AM::getFP32Imm(Imm) != -1;
    if (VT == MVT::f64 && Subtarget->hasFP64())
      return ARM_AM::getFP64Imm(Imm) != -1;
  }
  return false;
}
/// getTgtMemIntrinsic - Describe the memory access performed by the NEON
/// load/store intrinsics and the exclusive load/store intrinsics handled
/// below by filling in \p Info. Returns true when \p Intrinsic is one of
/// them and false otherwise. For the vldN/vstN forms the alignment is read
/// from the call's last argument.
bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &I,
                                           MachineFunction &MF,
                                           unsigned Intrinsic) const {
  // Data layout of the module that owns the called function.
  auto ModuleDL = [&I]() -> const DataLayout & {
    return I.getCalledFunction()->getParent()->getDataLayout();
  };
  // A vector of NumElts i64 elements, used as a conservative memory type.
  auto I64Vector = [&I](unsigned NumElts) {
    return EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
  };
  // The call's final argument.
  auto LastArg = [&I]() -> Value * {
    return I.getArgOperand(I.getNumArgOperands() - 1);
  };
  // Size of the call's result, in 64-bit units.
  auto LoadedUnits = [&]() -> uint64_t {
    return ModuleDL().getTypeSizeInBits(I.getType()) / 64;
  };
  // Total size, in 64-bit units, of the vector arguments that follow the
  // first argument; counting stops at the first non-vector argument.
  auto StoredUnits = [&]() -> unsigned {
    const DataLayout &DL = ModuleDL();
    unsigned NumElts = 0;
    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE;
         ++ArgI) {
      Type *ArgTy = I.getArgOperand(ArgI)->getType();
      if (!ArgTy->isVectorTy())
        break;
      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
    }
    return NumElts;
  };

  switch (Intrinsic) {
  case Intrinsic::arm_neon_vld1:
  case Intrinsic::arm_neon_vld2:
  case Intrinsic::arm_neon_vld3:
  case Intrinsic::arm_neon_vld4:
  case Intrinsic::arm_neon_vld2lane:
  case Intrinsic::arm_neon_vld3lane:
  case Intrinsic::arm_neon_vld4lane:
  case Intrinsic::arm_neon_vld2dup:
  case Intrinsic::arm_neon_vld3dup:
  case Intrinsic::arm_neon_vld4dup:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // Conservatively cover the entire set of vectors loaded.
    Info.memVT = I64Vector(LoadedUnits());
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = cast<ConstantInt>(LastArg())->getZExtValue();
    // Volatile loads with NEON intrinsics are not supported.
    Info.flags = MachineMemOperand::MOLoad;
    return true;

  case Intrinsic::arm_neon_vld1x2:
  case Intrinsic::arm_neon_vld1x3:
  case Intrinsic::arm_neon_vld1x4:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // Conservatively cover the entire set of vectors loaded.
    Info.memVT = I64Vector(LoadedUnits());
    // The pointer is the last argument for these forms.
    Info.ptrVal = LastArg();
    Info.offset = 0;
    Info.align = 0;
    // Volatile loads with NEON intrinsics are not supported.
    Info.flags = MachineMemOperand::MOLoad;
    return true;

  case Intrinsic::arm_neon_vst1:
  case Intrinsic::arm_neon_vst2:
  case Intrinsic::arm_neon_vst3:
  case Intrinsic::arm_neon_vst4:
  case Intrinsic::arm_neon_vst2lane:
  case Intrinsic::arm_neon_vst3lane:
  case Intrinsic::arm_neon_vst4lane:
    Info.opc = ISD::INTRINSIC_VOID;
    // Conservatively cover the entire set of vectors stored.
    Info.memVT = I64Vector(StoredUnits());
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = cast<ConstantInt>(LastArg())->getZExtValue();
    // Volatile stores with NEON intrinsics are not supported.
    Info.flags = MachineMemOperand::MOStore;
    return true;

  case Intrinsic::arm_neon_vst1x2:
  case Intrinsic::arm_neon_vst1x3:
  case Intrinsic::arm_neon_vst1x4:
    Info.opc = ISD::INTRINSIC_VOID;
    // Conservatively cover the entire set of vectors stored.
    Info.memVT = I64Vector(StoredUnits());
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = 0;
    // Volatile stores with NEON intrinsics are not supported.
    Info.flags = MachineMemOperand::MOStore;
    return true;

  case Intrinsic::arm_ldaex:
  case Intrinsic::arm_ldrex: {
    // The accessed type is the pointee type of argument 0.
    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
    Type *AccessTy = PtrTy->getElementType();
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(AccessTy);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = ModuleDL().getABITypeAlignment(AccessTy);
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
    return true;
  }

  case Intrinsic::arm_stlex:
  case Intrinsic::arm_strex: {
    // The accessed type is the pointee type of argument 1.
    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
    Type *AccessTy = PtrTy->getElementType();
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(AccessTy);
    Info.ptrVal = I.getArgOperand(1);
    Info.offset = 0;
    Info.align = ModuleDL().getABITypeAlignment(AccessTy);
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
    return true;
  }

  case Intrinsic::arm_stlexd:
  case Intrinsic::arm_strexd:
    // 64-bit exclusive store: the pointer is argument 2.
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i64;
    Info.ptrVal = I.getArgOperand(2);
    Info.offset = 0;
    Info.align = 8;
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
    return true;

  case Intrinsic::arm_ldaexd:
  case Intrinsic::arm_ldrexd:
    // 64-bit exclusive load: the pointer is argument 0.
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i64;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = 8;
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
    return true;

  default:
    break;
  }

  return false;
}
/// Returns true if it is beneficial to convert a load of a constant to just
/// the constant itself. That is the case for integer types with a non-zero
/// primitive size of at most 32 bits; \p Imm itself is not inspected.
bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                          Type *Ty) const {
  assert(Ty->isIntegerTy());

  const unsigned Width = Ty->getPrimitiveSizeInBits();
  return Width != 0 && Width <= 32;
}
/// Return true if EXTRACT_SUBVECTOR is legal or custom for \p ResVT and
/// \p Index is either 0 or the number of elements in \p ResVT.
bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                                unsigned Index) const {
  return isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT) &&
         (Index == 0 || Index == ResVT.getVectorNumElements());
}
/// Emit a data memory barrier for \p Domain at the builder's insertion point
/// and return the created call.
///
/// Subtargets with a data barrier get the arm_dmb intrinsic (M-class always
/// uses the SY domain). Without one, ARM-mode v6 falls back to an arm_mcr
/// call; any other subtarget reaching this point is unreachable.
Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
                                        ARM_MB::MemBOpt Domain) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();

  if (Subtarget->hasDataBarrier()) {
    Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
    // Only a full system barrier exists in the M-class architectures.
    if (Subtarget->isMClass())
      Domain = ARM_MB::SY;
    Constant *CDomain = Builder.getInt32(Domain);
    return Builder.CreateCall(DMB, CDomain);
  }

  // The target has no DMB, so see what fallback we can use. Thumb1 and
  // pre-v6 ARM mode use a libcall for atomic accesses instead of barriers
  // and should never get here.
  if (!Subtarget->hasV6Ops() || Subtarget->isThumb())
    llvm_unreachable("makeDMB on a target so old that it has no barriers");

  // Some ARMv6 cpus can support data barriers with an mcr instruction.
  Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
  Value *McrArgs[6] = {Builder.getInt32(15), Builder.getInt32(0),
                       Builder.getInt32(0),  Builder.getInt32(7),
                       Builder.getInt32(10), Builder.getInt32(5)};
  return Builder.CreateCall(MCR, McrArgs);
}
// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
//
// Emit the barrier, if any, that must precede an atomic access with ordering
// Ord. Returns nullptr when no barrier is needed.
Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
                                                 Instruction *Inst,
                                                 AtomicOrdering Ord) const {
  switch (Ord) {
  case AtomicOrdering::Unordered:
  case AtomicOrdering::NotAtomic:
    llvm_unreachable("Invalid fence: unordered/non-atomic");

  case AtomicOrdering::Acquire:
  case AtomicOrdering::Monotonic:
    return nullptr; // Nothing to do

  case AtomicOrdering::SequentiallyConsistent:
    // Only instructions with an atomic store need the leading barrier.
    if (!Inst->hasAtomicStore())
      return nullptr; // Nothing to do
    LLVM_FALLTHROUGH;
  case AtomicOrdering::AcquireRelease:
  case AtomicOrdering::Release: {
    // FIXME: add a comment with a link to documentation justifying the
    // choice of ISHST when the subtarget prefers it.
    const ARM_MB::MemBOpt Domain =
        Subtarget->preferISHSTBarriers() ? ARM_MB::ISHST : ARM_MB::ISH;
    return makeDMB(Builder, Domain);
  }
  }
  llvm_unreachable("Unknown fence ordering in emitLeadingFence");
}
/// Emit the barrier, if any, that must follow an atomic access with ordering
/// \p Ord. Acquire, acquire-release and sequentially consistent orderings get
/// an ISH barrier; monotonic and release get none (nullptr).
Instruction *ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
                                                  Instruction *Inst,
                                                  AtomicOrdering Ord) const {
  switch (Ord) {
  case AtomicOrdering::SequentiallyConsistent:
  case AtomicOrdering::AcquireRelease:
  case AtomicOrdering::Acquire:
    return makeDMB(Builder, ARM_MB::ISH);

  case AtomicOrdering::Release:
  case AtomicOrdering::Monotonic:
    return nullptr; // Nothing to do

  case AtomicOrdering::Unordered:
  case AtomicOrdering::NotAtomic:
    llvm_unreachable("Invalid fence: unordered/not-atomic");
  }
  llvm_unreachable("Unknown fence ordering in emitTrailingFence");
}
// Accesses narrower than 64 bits are already atomic, and wider ones are left
// to the default libcall. That leaves exactly the 64-bit case to expand here,
// except on M-class cores, which lack ldrexd/strexd.
bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  Type *StoredTy = SI->getValueOperand()->getType();
  const unsigned Bits = StoredTy->getPrimitiveSizeInBits();
  const bool Is64Bit = (Bits == 64);
  return Is64Bit && !Subtarget->isMClass();
}
// Accesses narrower than 64 bits are already atomic, and wider ones are left
// to the default libcall. That leaves exactly the 64-bit case to expand here
// (as load-linked only), except on M-class cores, which lack ldrexd/strexd.
// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
// guarantee, see DDI0406C ARM architecture reference manual,
// sections A8.8.72-74 LDRD)
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
  const unsigned Bits = LI->getType()->getPrimitiveSizeInBits();
  if (Bits == 64 && !Subtarget->isMClass())
    return AtomicExpansionKind::LLOnly;
  return AtomicExpansionKind::None;
}
// For the real atomic operations, ldrex/strex cover up to 32 bits, and up to
// 64 bits on the non-M profiles. Floating-point operations always go through
// a cmpxchg loop.
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  if (AI->isFloatingPointOperation())
    return AtomicExpansionKind::CmpXChg;

  const unsigned Bits = AI->getType()->getPrimitiveSizeInBits();
  const bool HasLLSC = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
  const unsigned MaxBits = Subtarget->isMClass() ? 32U : 64U;

  if (Bits <= MaxBits && HasLLSC)
    return AtomicExpansionKind::LLSC;
  return AtomicExpansionKind::None;
}
/// Decide how a cmpxchg is expanded: as an LL/SC loop when the optimization
/// level is non-zero and the subtarget is either not Thumb or has v8-M
/// baseline ops, and not at all otherwise.
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
  // implement cmpxchg without spilling. If the address being exchanged is
  // also on the stack and close enough to the spill slot, the monitor can
  // end up always cleared, so the atomic operation never succeeds. At -O0 a
  // late-expanded pseudo-inst is needed instead.
  const bool Optimizing = getTargetMachine().getOptLevel() != 0;
  const bool HasAtomicCmpXchg =
      !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();

  return (Optimizing && HasAtomicCmpXchg) ? AtomicExpansionKind::LLSC
                                          : AtomicExpansionKind::None;
}
/// Report whether fences should be inserted around atomic instructions.
/// Returns the InsertFencesForAtomic member; \p I is not inspected.
bool ARMTargetLowering::shouldInsertFencesForAtomic(
const Instruction *I) const {
return InsertFencesForAtomic;
}
// The load-stack-guard node has so far only been implemented for MachO, so
// it is used exactly on MachO targets.
bool ARMTargetLowering::useLoadStackGuardNode() const {
  const bool IsMachO = Subtarget->isTargetMachO();
  return IsMachO;
}
/// Insert the stack-protector declarations into \p M.
///
/// For the Windows MSVC environment this declares the `__security_cookie`
/// global and the `__security_check_cookie` function, marking the latter's
/// first parameter InReg. Every other environment defers to the base class.
void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
  const bool IsMSVC = Subtarget->getTargetTriple().isWindowsMSVCEnvironment();
  if (!IsMSVC) {
    TargetLowering::insertSSPDeclarations(M);
    return;
  }

  LLVMContext &Ctx = M.getContext();

  // MSVC CRT has a global variable holding security cookie.
  M.getOrInsertGlobal("__security_cookie", Type::getInt8PtrTy(Ctx));

  // MSVC CRT has a function to validate security cookie.
  FunctionCallee CheckCookie =
      M.getOrInsertFunction("__security_check_cookie", Type::getVoidTy(Ctx),
                            Type::getInt8PtrTy(Ctx));
  if (Function *CheckFn = dyn_cast<Function>(CheckCookie.getCallee()))
    CheckFn->addAttribute(1, Attribute::AttrKind::InReg);
}
/// Return the stack guard value for \p M: the `__security_cookie` global
/// variable for the Windows MSVC environment, the base-class choice
/// otherwise.
Value *ARMTargetLowering::getSDagStackGuard(const Module &M) const {
  if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return TargetLowering::getSDagStackGuard(M);

  // MSVC CRT has a global variable holding security cookie.
  return M.getGlobalVariable("__security_cookie");
}
/// Return the stack guard check function for \p M: `__security_check_cookie`
/// for the Windows MSVC environment, the base-class choice otherwise.
Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const {
  if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return TargetLowering::getSSPStackGuardCheck(M);

  // MSVC CRT has a function to validate security cookie.
  return M.getFunction("__security_check_cookie");
}
/// Return true, and set \p Cost to 0, when a store can be combined with an
/// extract from \p VectorTy at index \p Idx. \p Cost is left untouched when
/// the answer is false.
bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
                                                  unsigned &Cost) const {
  // Reject, in this order:
  //  - subtargets without NEON, where vector types are not natively
  //    supported;
  //  - floating point element types: FP and vector values share a register
  //    file, and staying with float leaves more freedom in the addressing
  //    mode;
  //  - indices unknown at compile time, which are very expensive to lower
  //    and cannot be combined with the store.
  if (!Subtarget->hasNEON() || VectorTy->isFPOrFPVectorTy() ||
      !isa<ConstantInt>(Idx))
    return false;

  assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
  const unsigned Bits = cast<VectorType>(VectorTy)->getBitWidth();

  // Only vectors that fit perfectly in a D or Q register qualify.
  if (Bits != 64 && Bits != 128)
    return false;

  Cost = 0;
  return true;
}
/// Speculating cttz is reported as cheap exactly when the subtarget has
/// v6T2 ops.
bool ARMTargetLowering::isCheapToSpeculateCttz() const {
  const bool HasV6T2 = Subtarget->hasV6T2Ops();
  return HasV6T2;
}
/// Speculating ctlz is reported as cheap exactly when the subtarget has
/// v6T2 ops.
bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
  const bool HasV6T2 = Subtarget->hasV6T2Ops();
  return HasV6T2;
}
/// Shifts are expanded unless the subtarget reports min-size; neither \p DAG
/// nor \p N is inspected.
bool ARMTargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const {
  const bool MinSize = Subtarget->hasMinSize();
  return !MinSize;
}
/// Emit a load-linked of the value \p Addr points to and return the loaded
/// value. The acquire forms of the intrinsics (ldaex/ldaexd) are used when
/// \p Ord is acquire or stronger, the plain forms (ldrex/ldrexd) otherwise.
Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                                         AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
  const bool Acquire = isAcquireOrStronger(Ord);

  // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
  // intrinsic must return {i32, i32} and the two halves have to be
  // recombined into a single i64 here.
  if (ValTy->getPrimitiveSizeInBits() == 64) {
    Intrinsic::ID PairID =
        Acquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
    Function *LoadPair = Intrinsic::getDeclaration(M, PairID);

    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    Value *Halves = Builder.CreateCall(LoadPair, Addr, "lohi");

    Value *Lo = Builder.CreateExtractValue(Halves, 0, "lo");
    Value *Hi = Builder.CreateExtractValue(Halves, 1, "hi");
    // On big-endian subtargets the two results trade roles.
    if (!Subtarget->isLittle())
      std::swap(Lo, Hi);

    Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
    Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
    Value *HiShifted = Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32));
    return Builder.CreateOr(Lo, HiShifted, "val64");
  }

  // All other widths: the intrinsic is overloaded on the pointer type, and
  // its result is truncated or bitcast to the pointee type.
  Type *OverloadTys[] = {Addr->getType()};
  Intrinsic::ID LoadID = Acquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
  Function *LoadFn = Intrinsic::getDeclaration(M, LoadID, OverloadTys);
  Value *Loaded = Builder.CreateCall(LoadFn, Addr);
  return Builder.CreateTruncOrBitCast(Loaded, ValTy);
}
/// Emit a call to the arm_clrex intrinsic at the builder's insertion point.
/// Nothing is emitted on subtargets without v7 ops.
void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
    IRBuilder<> &Builder) const {
  if (Subtarget->hasV7Ops()) {
    Module *M = Builder.GetInsertBlock()->getParent()->getParent();
    Function *ClearExclusive =
        Intrinsic::getDeclaration(M, Intrinsic::arm_clrex);
    Builder.CreateCall(ClearExclusive);
  }
}
/// Emit a store-conditional of \p Val to \p Addr and return the intrinsic
/// call. The release forms of the intrinsics (stlex/stlexd) are used when
/// \p Ord is release or stronger, the plain forms (strex/strexd) otherwise.
Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
                                               Value *Addr,
                                               AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  const bool Release = isReleaseOrStronger(Ord);

  // Since the intrinsics must have legal type, the i64 intrinsics take two
  // parameters: "i32, i32". Val must be marshalled into that form before the
  // call.
  if (Val->getType()->getPrimitiveSizeInBits() == 64) {
    Intrinsic::ID PairID =
        Release ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
    Function *StorePair = Intrinsic::getDeclaration(M, PairID);
    Type *Int32Ty = Type::getInt32Ty(M->getContext());

    Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
    Value *Shifted = Builder.CreateLShr(Val, 32);
    Value *Hi = Builder.CreateTrunc(Shifted, Int32Ty, "hi");
    // On big-endian subtargets the two arguments trade roles.
    if (!Subtarget->isLittle())
      std::swap(Lo, Hi);

    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    return Builder.CreateCall(StorePair, {Lo, Hi, Addr});
  }

  // All other widths: the intrinsic is overloaded on the pointer type, and
  // the value is zero-extended or bitcast to its first parameter type.
  Intrinsic::ID StoreID = Release ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
  Type *OverloadTys[] = {Addr->getType()};
  Function *StoreFn = Intrinsic::getDeclaration(M, StoreID, OverloadTys);
  Value *Widened = Builder.CreateZExtOrBitCast(
      Val, StoreFn->getFunctionType()->getParamType(0));
  return Builder.CreateCall(StoreFn, {Widened, Addr});
}
/// Loops are aligned under opt-size exactly on M-class subtargets.
bool ARMTargetLowering::alignLoopsWithOptSize() const {
  const bool IsMClass = Subtarget->isMClass();
  return IsMClass;
}
/// A helper function for determining the number of interleaved accesses we
/// will generate when lowering accesses of the given type: the size of
/// \p VecTy in bits divided by 128, rounded up.
unsigned
ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
                                             const DataLayout &DL) const {
  const auto SizeInBits = DL.getTypeSizeInBits(VecTy);
  return (SizeInBits + 127) / 128;
}
/// Return true if \p VecTy can be used for an interleaved access: it must not
/// have half-precision elements, must have at least two elements of 8, 16 or
/// 32 bits each, and must be 64 bits or a multiple of 128 bits in total.
bool ARMTargetLowering::isLegalInterleavedAccessType(
    VectorType *VecTy, const DataLayout &DL) const {
  Type *EltTy = VecTy->getElementType();

  // Reject f16 elements. Even though an i16 vldN could be used, the f16
  // vectors cannot be held and would end up being converted via f32.
  if (EltTy->isHalfTy())
    return false;

  // More than one element is required.
  if (VecTy->getNumElements() < 2)
    return false;

  // The element size must be legal.
  const unsigned EltBits = DL.getTypeSizeInBits(EltTy);
  const bool LegalElt = (EltBits == 8 || EltBits == 16 || EltBits == 32);
  if (!LegalElt)
    return false;

  // The total size must be 64 or a multiple of 128 bits. Types larger than
  // 128 will be split into multiple interleaved accesses.
  const unsigned VecBits = DL.getTypeSizeInBits(VecTy);
  return VecBits == 64 || VecBits % 128 == 0;
}
/// Lower an interleaved load into a vldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
///
/// Into:
/// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
/// %vec0 = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
/// %vec1 = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
///
/// \param LI       The wide load feeding the shufflevectors.
/// \param Shuffles The de-interleaving shufflevectors; must be non-empty.
/// \param Indices  For each entry of \p Shuffles, the index of the vldN result
///                 member that replaces it; must have the same size as
///                 \p Shuffles.
/// \param Factor   The interleave factor; selects vld2/vld3/vld4.
/// \returns false (emitting nothing) when NEON is unavailable or the shuffle
/// result type is not a legal interleaved access type; true after all uses of
/// the shufflevectors have been replaced. The shufflevectors and the original
/// load are not erased here.
bool ARMTargetLowering::lowerInterleavedLoad(
LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
assert(!Shuffles.empty() && "Empty shufflevector input");
assert(Shuffles.size() == Indices.size() &&
"Unmatched number of shufflevectors and indices");
// All shuffles are treated as producing the same type; the first one is used
// as the representative.
VectorType *VecTy = Shuffles[0]->getType();
Type *EltTy = VecTy->getVectorElementType();
const DataLayout &DL = LI->getModule()->getDataLayout();
// Skip if we do not have NEON and skip illegal vector types. We can
// "legalize" wide vector types into multiple interleaved accesses as long as
// the vector types are divisible by 128.
if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL))
return false;
unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
// A pointer vector can not be the return type of the ldN intrinsics. Need to
// load integer vectors first and then convert to pointer vectors.
if (EltTy->isPointerTy())
VecTy =
VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
// New instructions are inserted immediately before the original load.
IRBuilder<> Builder(LI);
// The base address of the load.
Value *BaseAddr = LI->getPointerOperand();
if (NumLoads > 1) {
// If we're going to generate more than one load, reset the sub-vector type
// to something legal.
VecTy = VectorType::get(VecTy->getVectorElementType(),
VecTy->getVectorNumElements() / NumLoads);
// We will compute the pointer operand of each load from the original base
// address using GEPs. Cast the base address to a pointer to the scalar
// element type.
BaseAddr = Builder.CreateBitCast(
BaseAddr, VecTy->getVectorElementType()->getPointerTo(
LI->getPointerAddressSpace()));
}
assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
// The intrinsic is overloaded on the (sub-)vector type and the pointer type.
Type *Tys[] = {VecTy, Int8Ptr};
// Indexed by Factor - 2.
static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
Intrinsic::arm_neon_vld3,
Intrinsic::arm_neon_vld4};
Function *VldnFunc =
Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
// Holds sub-vectors extracted from the load intrinsic return values. The
// sub-vectors are associated with the shufflevector instructions they will
// replace.
DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
// If we're generating more than one load, compute the base address of
// subsequent loads as an offset from the previous.
if (LoadCount > 0)
BaseAddr =
Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr,
VecTy->getVectorNumElements() * Factor);
// Operands of the vldN call: the i8* address and the original load's
// alignment as an i32 constant.
SmallVector<Value *, 2> Ops;
Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
Ops.push_back(Builder.getInt32(LI->getAlignment()));
CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
// Replace uses of each shufflevector with the corresponding vector loaded
// by ldN.
for (unsigned i = 0; i < Shuffles.size(); i++) {
ShuffleVectorInst *SV = Shuffles[i];
unsigned Index = Indices[i];
Value *SubVec = Builder.CreateExtractValue(VldN, Index);
// Convert the integer vector to pointer vector if the element is pointer.
if (EltTy->isPointerTy())
SubVec = Builder.CreateIntToPtr(
SubVec, VectorType::get(SV->getType()->getVectorElementType(),
VecTy->getVectorNumElements()));
SubVecs[SV].push_back(SubVec);
}
}
// Replace uses of the shufflevector instructions with the sub-vectors
// returned by the load intrinsic. If a shufflevector instruction is
// associated with more than one sub-vector, those sub-vectors will be
// concatenated into a single wide vector.
for (ShuffleVectorInst *SVI : Shuffles) {
auto &SubVec = SubVecs[SVI];
auto *WideVec =
SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
SVI->replaceAllUsesWith(WideVec);
}
return true;
}
/// Lower an interleaved store into a vstN intrinsic.
///
/// E.g. Lower an interleaved store (Factor = 3):
/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
/// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
///
/// Into:
/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> %v1, <0, 1, 2, 3>
/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> %v1, <4, 5, 6, 7>
/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> %v1, <8, 9, 10, 11>
/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
///
/// Note that the new shufflevectors will be removed and we'll only generate one
/// vst3 instruction in CodeGen.
///
/// Example for a more general valid mask (Factor 3). Lower:
/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
/// store <12 x i32> %i.vec, <12 x i32>* %ptr
///
/// Into:
/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> %v1, <4, 5, 6, 7>
/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> %v1, <32, 33, 34, 35>
/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> %v1, <16, 17, 18, 19>
/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
///
/// \param SI     The store of the interleaved vector.
/// \param SVI    The shufflevector producing the interleaved vector; its
///               element count must be a multiple of \p Factor.
/// \param Factor The interleave factor; selects vst2/vst3/vst4.
/// \returns false (emitting nothing) when NEON is unavailable or the per-lane
/// sub-vector type is not a legal interleaved access type; true once the vstN
/// call(s) have been emitted before \p SI. \p SI itself is not erased here.
bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
                                              ShuffleVectorInst *SVI,
                                              unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  VectorType *VecTy = SVI->getType();
  assert(VecTy->getVectorNumElements() % Factor == 0 &&
         "Invalid interleaved store");

  // Number of elements each of the Factor interleaved lanes contributes.
  unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
  Type *EltTy = VecTy->getVectorElementType();
  VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);

  const DataLayout &DL = SI->getModule()->getDataLayout();

  // Skip if we do not have NEON and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
    return false;

  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);

  Value *Op0 = SVI->getOperand(0);
  Value *Op1 = SVI->getOperand(1);
  // New instructions are inserted immediately before the original store.
  IRBuilder<> Builder(SI);

  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
  // vectors to integer vectors.
  if (EltTy->isPointerTy()) {
    Type *IntTy = DL.getIntPtrType(EltTy);

    // Convert to the corresponding integer vector.
    Type *IntVecTy =
        VectorType::get(IntTy, Op0->getType()->getVectorNumElements());
    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);

    SubVecTy = VectorType::get(IntTy, LaneLen);
  }

  // The base address of the store.
  Value *BaseAddr = SI->getPointerOperand();

  if (NumStores > 1) {
    // If we're going to generate more than one store, reset the lane length
    // and sub-vector type to something legal.
    LaneLen /= NumStores;
    SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);

    // We will compute the pointer operand of each store from the original base
    // address using GEPs. Cast the base address to a pointer to the scalar
    // element type.
    BaseAddr = Builder.CreateBitCast(
        BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
                      SI->getPointerAddressSpace()));
  }

  assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");

  auto Mask = SVI->getShuffleMask();

  Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
  // The intrinsic is overloaded on the pointer type and the sub-vector type.
  Type *Tys[] = {Int8Ptr, SubVecTy};
  // Indexed by Factor - 2.
  static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
                                             Intrinsic::arm_neon_vst3,
                                             Intrinsic::arm_neon_vst4};
  // The declaration does not depend on the store index, so look it up once.
  Function *VstNFunc =
      Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);

  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
    // If we're generating more than one store, we compute the base address of
    // subsequent stores as an offset from the previous.
    if (StoreCount > 0)
      BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(),
                                            BaseAddr, LaneLen * Factor);

    SmallVector<Value *, 6> Ops;
    Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));

    // First mask position covered by this store: each store consumes
    // LaneLen * Factor consecutive elements of the interleaved vector.
    const unsigned ChunkBase = StoreCount * LaneLen * Factor;

    // Split the shufflevector operands into sub vectors for the new vstN call.
    for (unsigned i = 0; i < Factor; i++) {
      unsigned IdxI = ChunkBase + i;
      if (Mask[IdxI] >= 0) {
        Ops.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
      } else {
        // The first element of lane i is undef. Recover the lane's start index
        // from its first defined element: element j of lane i lives at mask
        // position ChunkBase + j * Factor + i and holds StartMask + j.
        unsigned StartMask = 0;
        for (unsigned j = 1; j < LaneLen; j++) {
          unsigned IdxJ = ChunkBase + j * Factor + i;
          if (Mask[IdxJ] >= 0) {
            StartMask = Mask[IdxJ] - j;
            break;
          }
        }
        // Note: If all elements in a chunk are undefs, StartMask=0!
        // Note: Filling undef gaps with random elements is ok, since
        // those elements were being written anyway (with undefs).
        // In the case of all undefs we're defaulting to using elems from 0
        // Note: StartMask cannot be negative, it's checked in
        // isReInterleaveMask
        Ops.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
      }
    }

    // The last vstN operand is the original store's alignment as an i32.
    Ops.push_back(Builder.getInt32(SI->getAlignment()));
    Builder.CreateCall(VstNFunc, Ops);
  }
  return true;
}
/// Base element kind recorded by isHomogeneousAggregate while it walks a type.
/// HA_UNKNOWN is the initial state before any leaf element has been seen.
enum HABaseType {
// No leaf element classified yet.
HA_UNKNOWN = 0,
// Leaf elements are 'float'.
HA_FLOAT,
// Leaf elements are 'double'.
HA_DOUBLE,
// Leaf elements are vectors whose bit width is 64.
HA_VECT64,
// Leaf elements are vectors whose bit width is 128.
HA_VECT128
};
/// Walk \p Ty and decide whether it is a homogeneous aggregate.
///
/// \param Ty      The type to classify.
/// \param Base    In/out: the leaf kind seen so far (HA_UNKNOWN if none yet).
///                Updated when the first leaf element is classified.
/// \param Members In/out: running count of leaf elements; struct and array
///                cases add to it, scalar and vector leaves set it to 1.
/// \returns true if every leaf agrees with \p Base and the resulting member
/// count is between 1 and 4 inclusive.
static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
                                   uint64_t &Members) {
  // Shared handling for the two scalar floating-point leaf kinds.
  auto AcceptScalarLeaf = [&Base, &Members](HABaseType Kind) {
    if (Base != HA_UNKNOWN && Base != Kind)
      return false;
    Members = 1;
    Base = Kind;
    return true;
  };

  if (auto *StructTy = dyn_cast<StructType>(Ty)) {
    // Sum the leaf counts of every field; any non-conforming field fails.
    const unsigned NumFields = StructTy->getNumElements();
    for (unsigned Field = 0; Field != NumFields; ++Field) {
      uint64_t FieldMembers = 0;
      if (!isHomogeneousAggregate(StructTy->getElementType(Field), Base,
                                  FieldMembers))
        return false;
      Members += FieldMembers;
    }
  } else if (auto *ArrayTy = dyn_cast<ArrayType>(Ty)) {
    // Classify the element type once and scale by the array length.
    uint64_t ElementMembers = 0;
    if (!isHomogeneousAggregate(ArrayTy->getElementType(), Base,
                                ElementMembers))
      return false;
    Members += ElementMembers * ArrayTy->getNumElements();
  } else if (Ty->isFloatTy()) {
    if (!AcceptScalarLeaf(HA_FLOAT))
      return false;
  } else if (Ty->isDoubleTy()) {
    if (!AcceptScalarLeaf(HA_DOUBLE))
      return false;
  } else if (auto *VecTy = dyn_cast<VectorType>(Ty)) {
    Members = 1;
    const unsigned Bits = VecTy->getBitWidth();
    switch (Base) {
    case HA_FLOAT:
    case HA_DOUBLE:
      // A vector cannot join an aggregate of scalar floats/doubles.
      return false;
    case HA_VECT64:
      return Bits == 64;
    case HA_VECT128:
      return Bits == 128;
    case HA_UNKNOWN:
      // First leaf seen: only 64- and 128-bit vectors are accepted.
      if (Bits == 64) {
        Base = HA_VECT64;
        return true;
      }
      if (Bits == 128) {
        Base = HA_VECT128;
        return true;
      }
      return false;
    }
  }

  return (Members > 0 && Members <= 4);
}
/// Return the alignment to use for an argument of type \p ArgTy.
///
/// Non-vector types get their ABI alignment from \p DL. Vector types get the
/// smaller of their ABI alignment and the stack alignment.
unsigned
ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy,
                                                 DataLayout DL) const {
  const unsigned ABIAlign = DL.getABITypeAlignment(ArgTy);
  if (ArgTy->isVectorTy()) {
    // Avoid over-aligning vector parameters. It would require realigning the
    // stack and waste space for no real benefit.
    return std::min(ABIAlign, DL.getStackAlignment());
  }
  return ABIAlign;
}
/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
/// passing according to AAPCS rules.
///
/// Always returns false unless the effective calling convention for
/// (\p CallConv, \p isVarArg) is ARM_AAPCS_VFP. The integer-array check
/// accepts any array whose element type is an integer type.
bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
    Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
  const bool IsAAPCSVFP =
      getEffectiveCallingConv(CallConv, isVarArg) == CallingConv::ARM_AAPCS_VFP;
  if (!IsAAPCSVFP)
    return false;

  HABaseType Base = HA_UNKNOWN;
  uint64_t Members = 0;
  bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
  LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());

  if (IsHA)
    return true;
  return Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
}
/// Register in which the exception pointer is returned: ARM::NoRegister when
/// the subtarget uses SjLj exception handling, ARM::R0 otherwise.
/// \p PersonalityFn is not consulted.
unsigned ARMTargetLowering::getExceptionPointerRegister(
    const Constant *PersonalityFn) const {
  // Platforms which do not use SjLj EH may return values in these registers
  // via the personality function.
  if (Subtarget->useSjLjEH())
    return ARM::NoRegister;
  return ARM::R0;
}
/// Register in which the exception selector is returned: ARM::NoRegister when
/// the subtarget uses SjLj exception handling, ARM::R1 otherwise.
/// \p PersonalityFn is not consulted.
unsigned ARMTargetLowering::getExceptionSelectorRegister(
    const Constant *PersonalityFn) const {
  // Platforms which do not use SjLj EH may return values in these registers
  // via the personality function.
  if (Subtarget->useSjLjEH())
    return ARM::NoRegister;
  return ARM::R1;
}
/// Mark the function containing \p Entry as using split callee-saved-register
/// handling by setting IsSplitCSR in its ARMFunctionInfo.
void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  auto *FuncInfo = Entry->getParent()->getInfo<ARMFunctionInfo>();
  // Update IsSplitCSR in ARMFunctionInfo.
  FuncInfo->setIsSplitCSR(true);
}
/// For every callee-saved register that is handled via copy, save it into a
/// fresh virtual register at the top of \p Entry and restore it before the
/// first terminator of each block in \p Exits.
///
/// Does nothing when the register info reports no such registers. Only GPR
/// and DPR registers are expected in the list.
void ARMTargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const ARMBaseRegisterInfo *RegInfo = Subtarget->getRegisterInfo();
  const MCPhysReg *CSRList =
      RegInfo->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (CSRList == nullptr)
    return;

  const TargetInstrInfo *InstrInfo = Subtarget->getInstrInfo();
  MachineRegisterInfo *RegState = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator InsertPt = Entry->begin();

  // The list is terminated by a zero entry.
  for (const MCPhysReg *Cursor = CSRList; *Cursor != 0; ++Cursor) {
    const MCPhysReg PhysReg = *Cursor;

    const TargetRegisterClass *RegClass = nullptr;
    if (ARM::GPRRegClass.contains(PhysReg)) {
      RegClass = &ARM::GPRRegClass;
    } else if (ARM::DPRRegClass.contains(PhysReg)) {
      RegClass = &ARM::DPRRegClass;
    } else {
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }

    unsigned SavedVReg = RegState->createVirtualRegister(RegClass);

    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction().hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(PhysReg);
    BuildMI(*Entry, InsertPt, DebugLoc(), InstrInfo->get(TargetOpcode::COPY),
            SavedVReg)
        .addReg(PhysReg);

    // Insert the copy-back instructions right before the terminator.
    for (MachineBasicBlock *ExitBlock : Exits) {
      BuildMI(*ExitBlock, ExitBlock->getFirstTerminator(), DebugLoc(),
              InstrInfo->get(TargetOpcode::COPY), PhysReg)
          .addReg(SavedVReg);
    }
  }
}
/// Compute the maximum call frame size for \p MF, then run the generic
/// TargetLoweringBase finalization.
void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const {
  auto &FrameInfo = MF.getFrameInfo();
  FrameInfo.computeMaxCallFrameSize(MF);
  TargetLoweringBase::finalizeLowering(MF);
}
Index: vendor/llvm/dist-release_90/lib/Target/ARM/ARMInstrThumb.td
===================================================================
--- vendor/llvm/dist-release_90/lib/Target/ARM/ARMInstrThumb.td (revision 351302)
+++ vendor/llvm/dist-release_90/lib/Target/ARM/ARMInstrThumb.td (revision 351303)
@@ -1,1736 +1,1737 @@
//===-- ARMInstrThumb.td - Thumb support for ARM -----------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file describes the Thumb instruction set.
//
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// Thumb specific DAG Nodes.
//
// Transform for imm_sr: the value 32 is emitted as the target constant 0;
// every other value is emitted unchanged.
def imm_sr_XFORM: SDNodeXForm<imm, [{
unsigned Imm = N->getZExtValue();
return CurDAG->getTargetConstant((Imm == 32 ? 0 : Imm), SDLoc(N), MVT::i32);
}]>;
// Assembler operand class for imm_sr, instantiated with the bounds <1,32>.
def ThumbSRImmAsmOperand: ImmAsmOperand<1,32> { let Name = "ImmThumbSR"; }
// Immediate operand that matches values in the range [1, 32] and is rewritten
// by imm_sr_XFORM when selected.
def imm_sr : Operand<i32>, PatLeaf<(imm), [{
uint64_t Imm = N->getZExtValue();
return Imm > 0 && Imm <= 32;
}], imm_sr_XFORM> {
let PrintMethod = "printThumbSRImm";
let ParserMatchClass = ThumbSRImmAsmOperand;
}
// Matches an i32 immediate whose negation, taken as a 32-bit unsigned value,
// is less than 8. Uses imm_neg_XFORM (defined elsewhere) as its transform.
def imm0_7_neg : PatLeaf<(i32 imm), [{
return (uint32_t)-N->getZExtValue() < 8;
}], imm_neg_XFORM>;
def ThumbModImmNeg1_7AsmOperand : AsmOperandClass { let Name = "ThumbModImmNeg1_7"; }
// Matches an immediate whose unsigned negation lies strictly between 0 and 8,
// i.e. is one of 1..7.
def mod_imm1_7_neg : Operand<i32>, PatLeaf<(imm), [{
unsigned Value = -(unsigned)N->getZExtValue();
return 0 < Value && Value < 8;
}], imm_neg_XFORM> {
let ParserMatchClass = ThumbModImmNeg1_7AsmOperand;
}
def ThumbModImmNeg8_255AsmOperand : AsmOperandClass { let Name = "ThumbModImmNeg8_255"; }
// Matches an immediate whose unsigned negation lies strictly between 7 and
// 256, i.e. is one of 8..255.
def mod_imm8_255_neg : Operand<i32>, PatLeaf<(imm), [{
unsigned Value = -(unsigned)N->getZExtValue();
return 7 < Value && Value < 256;
}], imm_neg_XFORM> {
let ParserMatchClass = ThumbModImmNeg8_255AsmOperand;
}
// Matches an i32 immediate whose 32-bit bitwise complement is less than 256.
// No transform is attached.
def imm0_255_comp : PatLeaf<(i32 imm), [{
return ~((uint32_t)N->getZExtValue()) < 256;
}]>;
// Matches an i32 immediate whose negation, truncated to unsigned, is in
// [8, 256). Same accepted range as mod_imm8_255_neg, but this is a pattern
// leaf only, with no Operand or parser class.
def imm8_255_neg : PatLeaf<(i32 imm), [{
unsigned Val = -N->getZExtValue();
return Val >= 8 && Val < 256;
}], imm_neg_XFORM>;
// Break imm's up into two pieces: an immediate + a left shift. This uses
// thumb_immshifted to match and thumb_immshifted_val and thumb_immshifted_shamt
// to get the val/shift pieces.
// The predicate and both transforms delegate to ARM_AM helpers (defined
// elsewhere), passing the immediate truncated to 'unsigned'.
def thumb_immshifted : PatLeaf<(imm), [{
return ARM_AM::isThumbImmShiftedVal((unsigned)N->getZExtValue());
}]>;
// Emits the result of ARM_AM::getThumbImmNonShiftedVal as an i32 target
// constant.
def thumb_immshifted_val : SDNodeXForm<imm, [{
unsigned V = ARM_AM::getThumbImmNonShiftedVal((unsigned)N->getZExtValue());
return CurDAG->getTargetConstant(V, SDLoc(N), MVT::i32);
}]>;
// Emits the result of ARM_AM::getThumbImmValShift as an i32 target constant.
def thumb_immshifted_shamt : SDNodeXForm<imm, [{
unsigned V = ARM_AM::getThumbImmValShift((unsigned)N->getZExtValue());
return CurDAG->getTargetConstant(V, SDLoc(N), MVT::i32);
}]>;
// Matches i32 immediates in the range [256, 510].
def imm256_510 : ImmLeaf<i32, [{
return Imm >= 256 && Imm < 511;
}]>;
// Emits the immediate minus 255 as an i32 target constant. The name suggests
// it is paired with imm256_510; the pairing itself is not visible here.
def thumb_imm256_510_addend : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue() - 255, SDLoc(N), MVT::i32);
}]>;
// Scaled 4 immediate.
// The three operands below differ only in their parser match class; the range
// checks themselves live in the assembler's predicate methods (not visible
// here). Names suggest ranges 0..1020 and 0..508 in multiples of 4 -- TODO
// confirm against the asm parser.
def t_imm0_1020s4_asmoperand: AsmOperandClass { let Name = "Imm0_1020s4"; }
def t_imm0_1020s4 : Operand<i32> {
let PrintMethod = "printThumbS4ImmOperand";
let ParserMatchClass = t_imm0_1020s4_asmoperand;
let OperandType = "OPERAND_IMMEDIATE";
}
def t_imm0_508s4_asmoperand: AsmOperandClass { let Name = "Imm0_508s4"; }
def t_imm0_508s4 : Operand<i32> {
let PrintMethod = "printThumbS4ImmOperand";
let ParserMatchClass = t_imm0_508s4_asmoperand;
let OperandType = "OPERAND_IMMEDIATE";
}
// Alias use only, so no printer is necessary.
def t_imm0_508s4_neg_asmoperand: AsmOperandClass { let Name = "Imm0_508s4Neg"; }
def t_imm0_508s4_neg : Operand<i32> {
let ParserMatchClass = t_imm0_508s4_neg_asmoperand;
let OperandType = "OPERAND_IMMEDIATE";
}
// Define Thumb specific addressing modes.
// unsigned 8-bit, 2-scaled memory offset
// The assembler validates this class with the isUnsignedOffset<8, 2>
// predicate; it is declared as a class and instantiated once just below.
class OperandUnsignedOffset_b8s2 : AsmOperandClass {
let Name = "UnsignedOffset_b8s2";
let PredicateMethod = "isUnsignedOffset<8, 2>";
}
def UnsignedOffset_b8s2 : OperandUnsignedOffset_b8s2;
// thumb style PC relative operand. signed, 8 bits magnitude,
// two bits shift. can be represented as either [pc, #imm], #imm,
// or relocatable expression...
// Only the class name is set here; no predicate or render method is
// overridden.
def ThumbMemPC : AsmOperandClass {
let Name = "ThumbMemPC";
}
// PC-relative operands. Every def inside this block gets
// OperandType = "OPERAND_PCREL" from the enclosing 'let'.
let OperandType = "OPERAND_PCREL" in {
// Branch target with custom encoder/decoder and no parser match class.
def t_brtarget : Operand<OtherVT> {
let EncoderMethod = "getThumbBRTargetOpValue";
let DecoderMethod = "DecodeThumbBROperand";
}
// ADR instruction labels.
def t_adrlabel : Operand<i32> {
let EncoderMethod = "getThumbAdrLabelOpValue";
let PrintMethod = "printAdrLabelOperand<2>";
let ParserMatchClass = UnsignedOffset_b8s2;
}
// NOTE(review): the inner OperandType assignment repeats the value already
// supplied by the enclosing 'let'.
def thumb_br_target : Operand<OtherVT> {
let ParserMatchClass = ThumbBranchTarget;
let EncoderMethod = "getThumbBranchTargetOpValue";
let OperandType = "OPERAND_PCREL";
}
// Target operand for BL.
def thumb_bl_target : Operand<i32> {
let ParserMatchClass = ThumbBranchTarget;
let EncoderMethod = "getThumbBLTargetOpValue";
let DecoderMethod = "DecodeThumbBLTargetOperand";
}
// Target for BLX *from* thumb mode.
// Unlike the other targets here it is parsed with ARMBranchTarget.
def thumb_blx_target : Operand<i32> {
let ParserMatchClass = ARMBranchTarget;
let EncoderMethod = "getThumbBLXTargetOpValue";
let DecoderMethod = "DecodeThumbBLXOffset";
}
// Target operand for conditional branches.
def thumb_bcc_target : Operand<OtherVT> {
let ParserMatchClass = ThumbBranchTarget;
let EncoderMethod = "getThumbBCCTargetOpValue";
let DecoderMethod = "DecodeThumbBCCTargetOperand";
}
// Target operand whose encoder/decoder names refer to compare-and-branch.
def thumb_cb_target : Operand<OtherVT> {
let ParserMatchClass = ThumbBranchTarget;
let EncoderMethod = "getThumbCBTargetOpValue";
let DecoderMethod = "DecodeThumbCmpBROperand";
}
// t_addrmode_pc := <label> => pc + imm8 * 4
//
def t_addrmode_pc : MemOperand {
let EncoderMethod = "getAddrModePCOpValue";
let DecoderMethod = "DecodeThumbAddrModePC";
let PrintMethod = "printThumbLdrLabelOperand";
let ParserMatchClass = ThumbMemPC;
}
}
// t_addrmode_rr := reg + reg
//
// All reg+reg modes in this group share one asm operand class (MemThumbRR),
// the same encoder/printer/decoder methods and the same (tGPR base, tGPR
// offset) operand layout. They differ only in the ComplexPattern selection
// function.
def t_addrmode_rr_asm_operand : AsmOperandClass { let Name = "MemThumbRR"; }
def t_addrmode_rr : MemOperand,
ComplexPattern<i32, 2, "SelectThumbAddrModeRR", []> {
let EncoderMethod = "getThumbAddrModeRegRegOpValue";
let PrintMethod = "printThumbAddrModeRROperand";
let DecoderMethod = "DecodeThumbAddrModeRR";
let ParserMatchClass = t_addrmode_rr_asm_operand;
let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
}
// t_addrmode_rr_sext := reg + reg
//
// This is similar to t_addrmode_rr, but uses different heuristics for
// ldrsb/ldrsh.
def t_addrmode_rr_sext : MemOperand,
ComplexPattern<i32, 2, "SelectThumbAddrModeRRSext", []> {
let EncoderMethod = "getThumbAddrModeRegRegOpValue";
let PrintMethod = "printThumbAddrModeRROperand";
let DecoderMethod = "DecodeThumbAddrModeRR";
let ParserMatchClass = t_addrmode_rr_asm_operand;
let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
}
// t_addrmode_rrs := reg + reg
//
// We use separate scaled versions because the Select* functions need
// to explicitly check for a matching constant and return false here so that
// the reg+imm forms will match instead. This is a horrible way to do that,
// as it forces tight coupling between the methods, but it's how selectiondag
// currently works.
// Selected through SelectThumbAddrModeRI5S1.
def t_addrmode_rrs1 : MemOperand,
ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S1", []> {
let EncoderMethod = "getThumbAddrModeRegRegOpValue";
let PrintMethod = "printThumbAddrModeRROperand";
let DecoderMethod = "DecodeThumbAddrModeRR";
let ParserMatchClass = t_addrmode_rr_asm_operand;
let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
}
// Selected through SelectThumbAddrModeRI5S2.
def t_addrmode_rrs2 : MemOperand,
ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S2", []> {
let EncoderMethod = "getThumbAddrModeRegRegOpValue";
let DecoderMethod = "DecodeThumbAddrModeRR";
let PrintMethod = "printThumbAddrModeRROperand";
let ParserMatchClass = t_addrmode_rr_asm_operand;
let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
}
// Selected through SelectThumbAddrModeRI5S4.
def t_addrmode_rrs4 : MemOperand,
ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S4", []> {
let EncoderMethod = "getThumbAddrModeRegRegOpValue";
let DecoderMethod = "DecodeThumbAddrModeRR";
let PrintMethod = "printThumbAddrModeRROperand";
let ParserMatchClass = t_addrmode_rr_asm_operand;
let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
}
// t_addrmode_is4 := reg + imm5 * 4
//
// The three reg+imm5 modes (is4/is2/is1) share the encoder and decoder
// methods; each has its own selection function, print method and asm operand
// class. All use a (tGPR base, i32imm offset) operand layout.
def t_addrmode_is4_asm_operand : AsmOperandClass { let Name = "MemThumbRIs4"; }
def t_addrmode_is4 : MemOperand,
ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S4", []> {
let EncoderMethod = "getAddrModeISOpValue";
let DecoderMethod = "DecodeThumbAddrModeIS";
let PrintMethod = "printThumbAddrModeImm5S4Operand";
let ParserMatchClass = t_addrmode_is4_asm_operand;
let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm);
}
// t_addrmode_is2 := reg + imm5 * 2
//
def t_addrmode_is2_asm_operand : AsmOperandClass { let Name = "MemThumbRIs2"; }
def t_addrmode_is2 : MemOperand,
ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S2", []> {
let EncoderMethod = "getAddrModeISOpValue";
let DecoderMethod = "DecodeThumbAddrModeIS";
let PrintMethod = "printThumbAddrModeImm5S2Operand";
let ParserMatchClass = t_addrmode_is2_asm_operand;
let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm);
}
// t_addrmode_is1 := reg + imm5
//
def t_addrmode_is1_asm_operand : AsmOperandClass { let Name = "MemThumbRIs1"; }
def t_addrmode_is1 : MemOperand,
ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S1", []> {
let EncoderMethod = "getAddrModeISOpValue";
let DecoderMethod = "DecodeThumbAddrModeIS";
let PrintMethod = "printThumbAddrModeImm5S1Operand";
let ParserMatchClass = t_addrmode_is1_asm_operand;
let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm);
}
// t_addrmode_sp := sp + imm8 * 4
//
// FIXME: This really shouldn't have an explicit SP operand at all. It should
// be implicit, just like in the instruction encoding itself.
// The base operand is declared as GPR (not tGPR) here, unlike the reg+imm5
// modes above.
def t_addrmode_sp_asm_operand : AsmOperandClass { let Name = "MemThumbSPI"; }
def t_addrmode_sp : MemOperand,
ComplexPattern<i32, 2, "SelectThumbAddrModeSP", []> {
let EncoderMethod = "getAddrModeThumbSPOpValue";
let DecoderMethod = "DecodeThumbAddrModeSP";
let PrintMethod = "printThumbAddrModeSPOperand";
let ParserMatchClass = t_addrmode_sp_asm_operand;
let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
}
// Inspects parent to determine whether an or instruction can be implemented as
// an add (i.e. whether we know overflow won't occur in the add).
// SDNPWantParent makes the parent node available to SelectAddLikeOr; the
// pattern produces a single i32 operand.
def AddLikeOrOp : ComplexPattern<i32, 1, "SelectAddLikeOr", [],
[SDNPWantParent]>;
// Pattern to exclude immediates from matching
// Accepts any i32 GPR value whose node is not a ConstantSDNode.
def non_imm32 : PatLeaf<(i32 GPR), [{ return !isa<ConstantSDNode>(N); }]>;
//===----------------------------------------------------------------------===//
// Miscellaneous Instructions.
//
// FIXME: Marking these as hasSideEffects is necessary to prevent machine DCE
// from removing one half of the matched pairs. That breaks PEI, which assumes
// these will always be in pairs, and asserts if it finds otherwise. Better way?
// Both pseudos define and use SP and are limited to Thumb1-only targets.
let Defs = [SP], Uses = [SP], hasSideEffects = 1 in {
// Selected for ARMcallseq_end; carries two immediate amounts.
def tADJCALLSTACKUP :
PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), NoItinerary,
[(ARMcallseq_end imm:$amt1, imm:$amt2)]>,
Requires<[IsThumb, IsThumb1Only]>;
// Selected for ARMcallseq_start; carries two immediate amounts.
def tADJCALLSTACKDOWN :
PseudoInst<(outs), (ins i32imm:$amt, i32imm:$amt2), NoItinerary,
[(ARMcallseq_start imm:$amt, imm:$amt2)]>,
Requires<[IsThumb, IsThumb1Only]>;
}
// Encoding helper on top of T1Encoding<0b101111>: fixes Inst{9-8} to 0b11 and
// places the 8-bit 'opc' in Inst{7-0}.
class T1SystemEncoding<bits<8> opc>
: T1Encoding<0b101111> {
let Inst{9-8} = 0b11;
let Inst{7-0} = opc;
}
// Generic hint instruction. The 4-bit immediate overrides Inst{7-4} of the
// T1SystemEncoding<0x00> base, leaving Inst{3-0} as zero.
def tHINT : T1pI<(outs), (ins imm0_15:$imm), NoItinerary, "hint", "\t$imm",
[(int_arm_hint imm0_15:$imm)]>,
T1SystemEncoding<0x00>,
Requires<[IsThumb, HasV6M]> {
bits<4> imm;
let Inst{7-4} = imm;
}
// Note: When EmitPriority == 1, the alias will be used for printing
// Aliases created through this class require [IsThumb, HasV6M].
class tHintAlias<string Asm, dag Result, bit EmitPriority = 0> : tInstAlias<Asm, Result, EmitPriority> {
let Predicates = [IsThumb, HasV6M];
}
// Named hints, mapped to tHINT with immediates 0..4.
def : tHintAlias<"nop$p", (tHINT 0, pred:$p), 1>; // A8.6.110
def : tHintAlias<"yield$p", (tHINT 1, pred:$p), 1>; // A8.6.410
def : tHintAlias<"wfe$p", (tHINT 2, pred:$p), 1>; // A8.6.408
def : tHintAlias<"wfi$p", (tHINT 3, pred:$p), 1>; // A8.6.409
def : tHintAlias<"sev$p", (tHINT 4, pred:$p), 1>; // A8.6.157
// sevl (hint 5) uses tInstAlias directly so that it can carry its own
// predicate list, [IsThumb2, HasV8], instead of the tHintAlias one.
def : tInstAlias<"sevl$p", (tHINT 5, pred:$p), 1> {
let Predicates = [IsThumb2, HasV8];
}
// The imm operand $val can be used by a debugger to store more information
// about the breakpoint.
// Encoding: T1Encoding<0b101111> with Inst{9-8} = 0b10 and the 8-bit value in
// Inst{7-0}.
def tBKPT : T1I<(outs), (ins imm0_255:$val), NoItinerary, "bkpt\t$val",
[]>,
T1Encoding<0b101111> {
let Inst{9-8} = 0b10;
// A8.6.22
bits<8> val;
let Inst{7-0} = val;
}
// default immediate for breakpoint mnemonic
def : InstAlias<"bkpt", (tBKPT 0), 0>, Requires<[IsThumb]>;
// Halt with a 6-bit immediate in Inst{5-0}; requires HasV8.
def tHLT : T1I<(outs), (ins imm0_63:$val), NoItinerary, "hlt\t$val",
[]>, T1Encoding<0b101110>, Requires<[IsThumb, HasV8]> {
let Inst{9-6} = 0b1010;
bits<6> val;
let Inst{5-0} = val;
}
// Set endianness. Not available on M-class and marked deprecated via
// Deprecated<HasV8Ops>. The single operand bit lands in Inst{3}.
def tSETEND : T1I<(outs), (ins setend_op:$end), NoItinerary, "setend\t$end",
[]>, T1Encoding<0b101101>, Requires<[IsThumb, IsNotMClass]>, Deprecated<HasV8Ops> {
bits<1> end;
// A8.6.156
let Inst{9-5} = 0b10010;
let Inst{4} = 1;
let Inst{3} = end;
let Inst{2-0} = 0b000;
}
// Change Processor State is a system instruction -- for disassembly only.
// imod goes to Inst{4}, iflags to Inst{2-0}; Inst{3} is fixed to 0. Decoding
// is handled by the custom DecodeThumbCPS method.
def tCPS : T1I<(outs), (ins imod_op:$imod, iflags_op:$iflags),
NoItinerary, "cps$imod $iflags", []>,
T1Misc<0b0110011> {
// A8.6.38 & B6.1.1
bit imod;
bits<3> iflags;
let Inst{4} = imod;
let Inst{3} = 0;
let Inst{2-0} = iflags;
let DecoderMethod = "DecodeThumbCPS";
}
// For both thumb1 and thumb2.
// Selected for ARMpic_add. Code-gen only (no assembly string) and not
// duplicable. The Rm field is hard-wired to 0b1111 (pc) and the destination
// occupies Inst{2-0}.
let isNotDuplicable = 1, isCodeGenOnly = 1 in
def tPICADD : TIt<(outs GPR:$dst), (ins GPR:$lhs, pclabel:$cp), IIC_iALUr, "",
[(set GPR:$dst, (ARMpic_add GPR:$lhs, imm:$cp))]>,
T1Special<{0,0,?,?}>, Sched<[WriteALU]> {
// A8.6.6
bits<3> dst;
let Inst{6-3} = 0b1111; // Rm = pc
let Inst{2-0} = dst;
}
// ADD <Rd>, sp, #<imm8>
// FIXME: This should not be marked as having side effects, and it should be
// rematerializable. Clearing the side effect bit causes miscompilations,
// probably because the instruction can be moved around.
// No selection pattern is attached; dst is encoded in Inst{10-8} and the
// immediate in Inst{7-0}.
def tADDrSPi : T1pI<(outs tGPR:$dst), (ins GPRsp:$sp, t_imm0_1020s4:$imm),
IIC_iALUi, "add", "\t$dst, $sp, $imm", []>,
T1Encoding<{1,0,1,0,1,?}>, Sched<[WriteALU]> {
// A6.2 & A8.6.8
bits<3> dst;
bits<8> imm;
let Inst{10-8} = dst;
let Inst{7-0} = imm;
let DecoderMethod = "DecodeThumbAddSpecialReg";
}
// Thumb1 frame lowering is rather fragile, we hope to be able to use
// tADDrSPi, but we may need to insert a sequence that clobbers CPSR.
// Hence this pseudo is declared as defining CPSR.
def tADDframe : PseudoInst<(outs tGPR:$dst), (ins i32imm:$base, i32imm:$offset),
NoItinerary, []>,
Requires<[IsThumb, IsThumb1Only]> {
let Defs = [CPSR];
}
// ADD sp, sp, #<imm7>
// The 7-bit immediate field occupies Inst{6-0}.
def tADDspi : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, t_imm0_508s4:$imm),
IIC_iALUi, "add", "\t$Rdn, $imm", []>,
T1Misc<{0,0,0,0,0,?,?}>, Sched<[WriteALU]> {
// A6.2.5 & A8.6.8
bits<7> imm;
let Inst{6-0} = imm;
let DecoderMethod = "DecodeThumbAddSPImm";
}
// SUB sp, sp, #<imm7>
// FIXME: The encoding and the ASM string don't match up.
// Shares the DecodeThumbAddSPImm decoder method with tADDspi; only the
// T1Misc opcode bits differ.
def tSUBspi : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, t_imm0_508s4:$imm),
IIC_iALUi, "sub", "\t$Rdn, $imm", []>,
T1Misc<{0,0,0,0,1,?,?}>, Sched<[WriteALU]> {
// A6.2.5 & A8.6.214
bits<7> imm;
let Inst{6-0} = imm;
let DecoderMethod = "DecodeThumbAddSPImm";
}
// An "add sp" written with an immediate accepted by the t_imm0_508s4_neg
// operand class is assembled as tSUBspi.
def : tInstSubst<"add${p} sp, $imm",
(tSUBspi SP, t_imm0_508s4_neg:$imm, pred:$p)>;
def : tInstSubst<"add${p} sp, sp, $imm",
(tSUBspi SP, t_imm0_508s4_neg:$imm, pred:$p)>;
// Can optionally specify SP as a three operand instruction.
def : tInstAlias<"add${p} sp, sp, $imm",
(tADDspi SP, t_imm0_508s4:$imm, pred:$p)>;
def : tInstAlias<"sub${p} sp, sp, $imm",
(tSUBspi SP, t_imm0_508s4:$imm, pred:$p)>;
// ADD <Rm>, sp
// The 4-bit Rdn is split across Inst{7} (high bit) and Inst{2-0}; Inst{6-3}
// is fixed to 0b1101.
def tADDrSP : T1pI<(outs GPR:$Rdn), (ins GPRsp:$sp, GPR:$Rn), IIC_iALUr,
"add", "\t$Rdn, $sp, $Rn", []>,
T1Special<{0,0,?,?}>, Sched<[WriteALU]> {
// A8.6.9 Encoding T1
bits<4> Rdn;
let Inst{7} = Rdn{3};
let Inst{6-3} = 0b1101;
let Inst{2-0} = Rdn{2-0};
let DecoderMethod = "DecodeThumbAddSPReg";
}
// ADD sp, <Rm>
// Rm occupies Inst{6-3}; Inst{7} is fixed to 1 and Inst{2-0} to 0b101.
def tADDspr : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, GPR:$Rm), IIC_iALUr,
"add", "\t$Rdn, $Rm", []>,
T1Special<{0,0,?,?}>, Sched<[WriteALU]> {
// A8.6.9 Encoding T2
bits<4> Rm;
let Inst{7} = 1;
let Inst{6-3} = Rm;
let Inst{2-0} = 0b101;
let DecoderMethod = "DecodeThumbAddSPReg";
}
//===----------------------------------------------------------------------===//
// Control Flow Instructions.
//
// Indirect branches
let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
// Branch and exchange. Rm is in Inst{6-3}; Inst{2-0} is fixed to 0b000 and
// flagged in the Unpredictable mask.
def tBX : TI<(outs), (ins GPR:$Rm, pred:$p), IIC_Br, "bx${p}\t$Rm", []>,
T1Special<{1,1,0,?}>, Sched<[WriteBr]> {
// A6.2.3 & A8.6.25
bits<4> Rm;
let Inst{6-3} = Rm;
let Inst{2-0} = 0b000;
let Unpredictable{2-0} = 0b111;
}
// Variant requiring Has8MSecExt. Same layout as tBX but with
// Inst{2-0} = 0b100, and only bits 1-0 are in the Unpredictable mask.
def tBXNS : TI<(outs), (ins GPR:$Rm, pred:$p), IIC_Br, "bxns${p}\t$Rm", []>,
Requires<[IsThumb, Has8MSecExt]>,
T1Special<{1,1,0,?}>, Sched<[WriteBr]> {
bits<4> Rm;
let Inst{6-3} = Rm;
let Inst{2-0} = 0b100;
let Unpredictable{1-0} = 0b11;
}
}
// Return pseudos; both expand to tBX and are given a size of 2.
let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
// Selected for ARMretflag; expands to "tBX LR".
def tBX_RET : tPseudoExpand<(outs), (ins pred:$p), 2, IIC_Br,
[(ARMretflag)], (tBX LR, pred:$p)>, Sched<[WriteBr]>;
// Alternative return instruction used by vararg functions.
def tBX_RET_vararg : tPseudoExpand<(outs), (ins tGPR:$Rm, pred:$p),
2, IIC_Br, [],
(tBX GPR:$Rm, pred:$p)>, Sched<[WriteBr]>;
}
// All calls clobber the non-callee saved registers. SP is marked as a use to
// prevent stack-pointer assignments that appear immediately before calls from
// potentially appearing dead.
let isCall = 1,
Defs = [LR], Uses = [SP] in {
// Also used for Thumb2
// Direct call selected for ARMcall on a global address. The 24-bit target
// field is scattered over the instruction: func{23} -> Inst{26},
// func{22} -> Inst{13}, func{21} -> Inst{11}, func{20-11} -> Inst{25-16} and
// func{10-0} -> Inst{10-0}.
def tBL : TIx2<0b11110, 0b11, 1,
(outs), (ins pred:$p, thumb_bl_target:$func), IIC_Br,
"bl${p}\t$func",
[(ARMcall tglobaladdr:$func)]>,
Requires<[IsThumb]>, Sched<[WriteBrL]> {
bits<24> func;
let Inst{26} = func{23};
let Inst{25-16} = func{20-11};
let Inst{13} = func{22};
let Inst{11} = func{21};
let Inst{10-0} = func{10-0};
}
// ARMv5T and above, also used for Thumb2
// Same field scattering as tBL, except that Inst{0} is forced to 0. No
// selection pattern; not available on M-class.
def tBLXi : TIx2<0b11110, 0b11, 0,
(outs), (ins pred:$p, thumb_blx_target:$func), IIC_Br,
"blx${p}\t$func", []>,
Requires<[IsThumb, HasV5T, IsNotMClass]>, Sched<[WriteBrL]> {
bits<24> func;
let Inst{26} = func{23};
let Inst{25-16} = func{20-11};
let Inst{13} = func{22};
let Inst{11} = func{21};
let Inst{10-1} = func{10-1};
let Inst{0} = 0; // func{0} is assumed zero
}
// Also used for Thumb2
// Indirect call selected for ARMcall on a register.
def tBLXr : TI<(outs), (ins pred:$p, GPR:$func), IIC_Br,
"blx${p}\t$func",
[(ARMcall GPR:$func)]>,
Requires<[IsThumb, HasV5T]>,
T1Special<{1,1,1,?}>, Sched<[WriteBrL]> { // A6.2.3 & A8.6.24;
bits<4> func;
let Inst{6-3} = func;
let Inst{2-0} = 0b000;
}
// ARMv8-M Security Extensions
// Same layout as tBLXr but with Inst{2-0} = 0b100; the operand class is
// GPRnopc rather than GPR.
def tBLXNSr : TI<(outs), (ins pred:$p, GPRnopc:$func), IIC_Br,
"blxns${p}\t$func", []>,
Requires<[IsThumb, Has8MSecExt]>,
T1Special<{1,1,1,?}>, Sched<[WriteBrL]> {
bits<4> func;
let Inst{6-3} = func;
let Inst{2-0} = 0b100;
let Unpredictable{1-0} = 0b11;
}
// ARMv4T
// Pseudo selected for ARMcall_nolink on Thumb1-only targets; declared with a
// size of 4.
def tBX_CALL : tPseudoInst<(outs), (ins tGPR:$func),
4, IIC_Br,
[(ARMcall_nolink tGPR:$func)]>,
Requires<[IsThumb, IsThumb1Only]>, Sched<[WriteBr]>;
}
// Unconditional branches and the Thumb1 jump-table branch.
let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
// Unconditional branch selected for 'br'; the 11-bit target is in
// Inst{10-0}. Assembly operands are converted by cvtThumbBranches.
let isPredicable = 1 in
def tB : T1pI<(outs), (ins t_brtarget:$target), IIC_Br,
"b", "\t$target", [(br bb:$target)]>,
T1Encoding<{1,1,1,0,0,?}>, Sched<[WriteBr]> {
bits<11> target;
let Inst{10-0} = target;
let AsmMatchConverter = "cvtThumbBranches";
}
// Far jump
// Just a pseudo for a tBL instruction. Needed to let regalloc know about
// the clobber of LR.
let Defs = [LR] in
def tBfar : tPseudoExpand<(outs), (ins thumb_bl_target:$target, pred:$p),
4, IIC_Br, [],
(tBL pred:$p, thumb_bl_target:$target)>,
Sched<[WriteBrTbl]>;
// Jump-table branch pseudo selected for ARMbrjt on Thumb1-only targets. The
// pseudo is declared with size 0 and then overrides Size to 2 in its body.
def tBR_JTr : tPseudoInst<(outs),
(ins tGPR:$target, i32imm:$jt),
0, IIC_Br,
[(ARMbrjt tGPR:$target, tjumptable:$jt)]>,
Sched<[WriteBrTbl]> {
let Size = 2;
+ let isNotDuplicable = 1;
list<Predicate> Predicates = [IsThumb, IsThumb1Only];
}
}
// FIXME: should be able to write a pattern for ARMBrcond, but can't use
// a two-value operand where a dag node expects two operands. :(
// Conditional branch. The 4-bit predicate is encoded in Inst{11-8} and the
// 8-bit target in Inst{7-0}. The selection pattern is left commented out.
let isBranch = 1, isTerminator = 1 in
def tBcc : T1I<(outs), (ins thumb_bcc_target:$target, pred:$p), IIC_Br,
"b${p}\t$target",
[/*(ARMbrcond bb:$target, imm:$cc)*/]>,
T1BranchCond<{1,1,0,1}>, Sched<[WriteBr]> {
bits<4> p;
bits<8> target;
let Inst{11-8} = p;
let Inst{7-0} = target;
let AsmMatchConverter = "cvtThumbBranches";
}
// Tail calls
// Tail-call pseudos. Every def in this scope is flagged isCall, isTerminator,
// isReturn and isBarrier, and both defs declare a use of SP.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
// IOS versions.
let Uses = [SP] in {
// Register tail call: expands to tBX on $dst with the fixed operands
// (ops 14, zero_reg).
def tTAILJMPr : tPseudoExpand<(outs), (ins tcGPR:$dst),
4, IIC_Br, [],
(tBX GPR:$dst, (ops 14, zero_reg))>,
Requires<[IsThumb]>, Sched<[WriteBr]>;
}
// tTAILJMPd: MachO version uses a Thumb2 branch (no Thumb1 tail calls
// on MachO), so it's in ARMInstrThumb2.td.
// Non-MachO version:
let Uses = [SP] in {
// Direct tail call: expands to tB with the same target and predicate.
def tTAILJMPdND : tPseudoExpand<(outs),
(ins t_brtarget:$dst, pred:$p),
4, IIC_Br, [],
(tB t_brtarget:$dst, pred:$p)>,
Requires<[IsThumb, IsNotMachO]>, Sched<[WriteBr]>;
}
}
// A8.6.218 Supervisor Call (Software Interrupt)
// A8.6.16 B: Encoding T1
// If Inst{11-8} == 0b1111 then SEE SVC
// SVC with an 8-bit immediate. Encoding: Inst{15-12} = 0b1101,
// Inst{11-8} = 0b1111, Inst{7-0} = imm. Flagged as a call that uses SP.
let isCall = 1, Uses = [SP] in
def tSVC : T1pI<(outs), (ins imm0_255:$imm), IIC_Br,
"svc", "\t$imm", []>, Encoding16, Sched<[WriteBr]> {
bits<8> imm;
let Inst{15-12} = 0b1101;
let Inst{11-8} = 0b1111;
let Inst{7-0} = imm;
}
// The assembler uses 0xDEFE for a trap instruction.
// Selected for the `trap` node; the whole encoding is the constant 0xdefe.
// Flagged as a barrier and a terminator.
let isBarrier = 1, isTerminator = 1 in
def tTRAP : TI<(outs), (ins), IIC_Br,
"trap", [(trap)]>, Encoding16, Sched<[WriteBr]> {
let Inst = 0xdefe;
}
//===----------------------------------------------------------------------===//
// Load Store Instructions.
//
// PC-relative loads need to be matched first as constant pool accesses need to
// always be PC-relative. We do this using AddedComplexity, as the pattern is
// simpler than the patterns of the other load instructions.
// PC-relative word load, selected for a load of (ARMWrapper tconstpool).
// Rt is encoded in Inst{10-8} and the 8-bit address field in Inst{7-0}.
// Flagged foldable-as-load and rematerializable, with AddedComplexity = 10.
let canFoldAsLoad = 1, isReMaterializable = 1, AddedComplexity = 10 in
def tLDRpci : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i,
"ldr", "\t$Rt, $addr",
[(set tGPR:$Rt, (load (ARMWrapper tconstpool:$addr)))]>,
T1Encoding<{0,1,0,0,1,?}>, Sched<[WriteLd]> {
// A6.2 & A8.6.59
bits<3> Rt;
bits<8> addr;
let Inst{10-8} = Rt;
let Inst{7-0} = addr;
}
// SP-relative loads should be matched before standard immediate-offset loads as
// it means we avoid having to move SP to another register.
// SP-relative word load, selected for (load t_addrmode_sp). Rt is encoded in
// Inst{10-8} and the 8-bit address field in Inst{7-0}.
let canFoldAsLoad = 1 in
def tLDRspi : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_sp:$addr), IIC_iLoad_i,
"ldr", "\t$Rt, $addr",
[(set tGPR:$Rt, (load t_addrmode_sp:$addr))]>,
T1LdStSP<{1,?,?}>, Sched<[WriteLd]> {
bits<3> Rt;
bits<8> addr;
let Inst{10-8} = Rt;
let Inst{7-0} = addr;
}
// Loads: reg/reg and reg/imm5
// Each instantiation defines two load instructions that share the mnemonic
// `asm`, the addressing mode `am` and the load fragment `opnode`:
//   <name>i - immediate-offset form (T1pILdStEncodeImm, opcode imm_opc,
//             address operand AddrMode_i, itinerary itin_i)
//   <name>r - register-offset form (T1pILdStEncode, opcode reg_opc,
//             address operand AddrMode_r, itinerary itin_r)
// Both write the loaded value to tGPR:$Rt.
let canFoldAsLoad = 1, isReMaterializable = 1 in
multiclass thumb_ld_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc,
Operand AddrMode_r, Operand AddrMode_i,
AddrMode am, InstrItinClass itin_r,
InstrItinClass itin_i, string asm,
PatFrag opnode> {
// Immediate-offset loads should be matched before register-offset loads as
// when the offset is a constant it's simpler to first check if it fits in the
// immediate offset field then fall back to register-offset if it doesn't.
def i : // reg/imm5
T1pILdStEncodeImm<imm_opc, 1 /* Load */,
(outs tGPR:$Rt), (ins AddrMode_i:$addr),
am, itin_i, asm, "\t$Rt, $addr",
[(set tGPR:$Rt, (opnode AddrMode_i:$addr))]>;
// Register-offset loads are matched last.
def r : // reg/reg
T1pILdStEncode<reg_opc,
(outs tGPR:$Rt), (ins AddrMode_r:$addr),
am, itin_r, asm, "\t$Rt, $addr",
[(set tGPR:$Rt, (opnode AddrMode_r:$addr))]>;
}
// Stores: reg/reg and reg/imm5
// Store counterpart of the load multiclass: each instantiation defines
//   <name>i - immediate-offset store (T1pILdStEncodeImm, opcode imm_opc)
//   <name>r - register-offset store (T1pILdStEncode, opcode reg_opc)
// Both take the stored value in tGPR:$Rt, have no outputs, and select
// `opnode` applied to ($Rt, address).
multiclass thumb_st_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc,
Operand AddrMode_r, Operand AddrMode_i,
AddrMode am, InstrItinClass itin_r,
InstrItinClass itin_i, string asm,
PatFrag opnode> {
def i : // reg/imm5
T1pILdStEncodeImm<imm_opc, 0 /* Store */,
(outs), (ins tGPR:$Rt, AddrMode_i:$addr),
am, itin_i, asm, "\t$Rt, $addr",
[(opnode tGPR:$Rt, AddrMode_i:$addr)]>;
def r : // reg/reg
T1pILdStEncode<reg_opc,
(outs), (ins tGPR:$Rt, AddrMode_r:$addr),
am, itin_r, asm, "\t$Rt, $addr",
[(opnode tGPR:$Rt, AddrMode_r:$addr)]>;
}
// A8.6.57 & A8.6.60
// Word loads (tLDRi / tLDRr): `load` fragment, immediate form uses
// t_addrmode_is4, register form uses t_addrmode_rr.
defm tLDR : thumb_ld_rr_ri_enc<0b100, 0b0110, t_addrmode_rr,
t_addrmode_is4, AddrModeT1_4,
IIC_iLoad_r, IIC_iLoad_i, "ldr",
load>, Sched<[WriteLd]>;
// A8.6.64 & A8.6.61
// Zero-extending byte loads (tLDRBi / tLDRBr): `zextloadi8` fragment,
// immediate form uses t_addrmode_is1.
defm tLDRB : thumb_ld_rr_ri_enc<0b110, 0b0111, t_addrmode_rr,
t_addrmode_is1, AddrModeT1_1,
IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrb",
zextloadi8>, Sched<[WriteLd]>;
// A8.6.76 & A8.6.73
// Zero-extending halfword loads (tLDRHi / tLDRHr): `zextloadi16` fragment,
// immediate form uses t_addrmode_is2.
defm tLDRH : thumb_ld_rr_ri_enc<0b101, 0b1000, t_addrmode_rr,
t_addrmode_is2, AddrModeT1_2,
IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrh",
zextloadi16>, Sched<[WriteLd]>;
// Sign-extending byte load. Only a register-offset form is defined here
// (t_addrmode_rr_sext); it is selected for sextloadi8 with AddedComplexity 10.
let AddedComplexity = 10 in
def tLDRSB : // A8.6.80
T1pILdStEncode<0b011, (outs tGPR:$Rt), (ins t_addrmode_rr_sext:$addr),
AddrModeT1_1, IIC_iLoad_bh_r,
"ldrsb", "\t$Rt, $addr",
[(set tGPR:$Rt, (sextloadi8 t_addrmode_rr_sext:$addr))]>, Sched<[WriteLd]>;
// Sign-extending halfword load. Only a register-offset form is defined here
// (t_addrmode_rr_sext); it is selected for sextloadi16 with AddedComplexity 10.
let AddedComplexity = 10 in
def tLDRSH : // A8.6.84
T1pILdStEncode<0b111, (outs tGPR:$Rt), (ins t_addrmode_rr_sext:$addr),
AddrModeT1_2, IIC_iLoad_bh_r,
"ldrsh", "\t$Rt, $addr",
[(set tGPR:$Rt, (sextloadi16 t_addrmode_rr_sext:$addr))]>, Sched<[WriteLd]>;
// SP-relative word store, selected for (store $Rt, t_addrmode_sp). Rt is
// encoded in Inst{10-8} and the 8-bit address field in Inst{7-0}.
def tSTRspi : T1pIs<(outs), (ins tGPR:$Rt, t_addrmode_sp:$addr), IIC_iStore_i,
"str", "\t$Rt, $addr",
[(store tGPR:$Rt, t_addrmode_sp:$addr)]>,
T1LdStSP<{0,?,?}>, Sched<[WriteST]> {
bits<3> Rt;
bits<8> addr;
let Inst{10-8} = Rt;
let Inst{7-0} = addr;
}
// A8.6.194 & A8.6.192
// Word stores (tSTRi / tSTRr): `store` fragment, immediate form uses
// t_addrmode_is4.
defm tSTR : thumb_st_rr_ri_enc<0b000, 0b0110, t_addrmode_rr,
t_addrmode_is4, AddrModeT1_4,
IIC_iStore_r, IIC_iStore_i, "str",
store>, Sched<[WriteST]>;
// A8.6.197 & A8.6.195
// Byte stores (tSTRBi / tSTRBr): `truncstorei8` fragment, immediate form uses
// t_addrmode_is1.
defm tSTRB : thumb_st_rr_ri_enc<0b010, 0b0111, t_addrmode_rr,
t_addrmode_is1, AddrModeT1_1,
IIC_iStore_bh_r, IIC_iStore_bh_i, "strb",
truncstorei8>, Sched<[WriteST]>;
// A8.6.207 & A8.6.205
// Halfword stores (tSTRHi / tSTRHr): `truncstorei16` fragment, immediate form
// uses t_addrmode_is2.
defm tSTRH : thumb_st_rr_ri_enc<0b001, 0b1000, t_addrmode_rr,
t_addrmode_is2, AddrModeT1_2,
IIC_iStore_bh_r, IIC_iStore_bh_i, "strh",
truncstorei16>, Sched<[WriteST]>;
//===----------------------------------------------------------------------===//
// Load / store multiple Instructions.
//
// These require base address to be written back or one of the loaded regs.
// Load/store-multiple definitions; all are marked hasSideEffects = 0 and none
// carries a selection pattern.
let hasSideEffects = 0 in {
// tLDMIA: base register in Inst{10-8}, 8-bit register list in Inst{7-0}. The
// variadic operands are treated as defs (variadicOpsAreDefs).
let mayLoad = 1, hasExtraDefRegAllocReq = 1, variadicOpsAreDefs = 1 in
def tLDMIA : T1I<(outs), (ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops),
IIC_iLoad_m, "ldm${p}\t$Rn, $regs", []>, T1Encoding<{1,1,0,0,1,?}> {
bits<3> Rn;
bits<8> regs;
let Inst{10-8} = Rn;
let Inst{7-0} = regs;
}
// Writeback version is just a pseudo, as there's no encoding difference.
// Writeback happens iff the base register is not in the destination register
// list.
// tLDMIA_UPD adds a $wb output tied to $Rn ("$Rn = $wb") and expands to
// tLDMIA with the same base, predicate and register list.
let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
def tLDMIA_UPD :
InstTemplate<AddrModeNone, 0, IndexModeNone, Pseudo, GenericDomain,
"$Rn = $wb", IIC_iLoad_mu>,
PseudoInstExpansion<(tLDMIA tGPR:$Rn, pred:$p, reglist:$regs)> {
let Size = 2;
let OutOperandList = (outs tGPR:$wb);
let InOperandList = (ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops);
let Pattern = [];
let isCodeGenOnly = 1;
let isPseudo = 1;
list<Predicate> Predicates = [IsThumb];
}
// There is no non-writeback version of STM for Thumb.
// tSTMIA_UPD: real instruction with $wb tied to $Rn; base register in
// Inst{10-8}, 8-bit register list in Inst{7-0}.
let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
def tSTMIA_UPD : Thumb1I<(outs tGPR:$wb),
(ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops),
AddrModeNone, 2, IIC_iStore_mu,
"stm${p}\t$Rn!, $regs", "$Rn = $wb", []>,
T1Encoding<{1,1,0,0,0,?}> {
bits<3> Rn;
bits<8> regs;
let Inst{10-8} = Rn;
let Inst{7-0} = regs;
}
} // hasSideEffects
// Assembler alias: accept the "ldm $Rn!, $regs" spelling and map it to tLDMIA
// on Thumb1-only subtargets.
def : InstAlias<"ldm${p} $Rn!, $regs",
(tLDMIA tGPR:$Rn, pred:$p, reglist:$regs), 0>,
Requires<[IsThumb, IsThumb1Only]>;
// POP: reads and writes SP; the variadic operands are defs. The register list
// is carried as 16 bits, of which bit 15 goes to Inst{8} and bits 7-0 to
// Inst{7-0}.
// NOTE(review): bit 15 presumably corresponds to PC -- confirm.
let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1,
variadicOpsAreDefs = 1 in
def tPOP : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops),
IIC_iPop,
"pop${p}\t$regs", []>,
T1Misc<{1,1,0,?,?,?,?}>, Sched<[WriteLd]> {
bits<16> regs;
let Inst{8} = regs{15};
let Inst{7-0} = regs{7-0};
}
// PUSH: reads and writes SP. The register list is carried as 16 bits, of
// which bit 14 goes to Inst{8} and bits 7-0 to Inst{7-0}.
// NOTE(review): bit 14 presumably corresponds to LR -- confirm.
let mayStore = 1, Uses = [SP], Defs = [SP], hasExtraSrcRegAllocReq = 1 in
def tPUSH : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops),
IIC_iStore_m,
"push${p}\t$regs", []>,
T1Misc<{0,1,0,?,?,?,?}>, Sched<[WriteST]> {
bits<16> regs;
let Inst{8} = regs{14};
let Inst{7-0} = regs{7-0};
}
//===----------------------------------------------------------------------===//
// Arithmetic Instructions.
//
// Helper classes for encoding T1pI patterns:
// T1pI + T1DataProcessing<opA> with two 3-bit register fields:
// Rm in Inst{5-3}, Rn in Inst{2-0}.
class T1pIDPEncode<bits<4> opA, dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
: T1pI<oops, iops, itin, opc, asm, pattern>,
T1DataProcessing<opA> {
bits<3> Rm;
bits<3> Rn;
let Inst{5-3} = Rm;
let Inst{2-0} = Rn;
}
// T1pI + T1Misc<opA> with two 3-bit register fields:
// Rm in Inst{5-3}, Rd in Inst{2-0}.
class T1pIMiscEncode<bits<7> opA, dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
: T1pI<oops, iops, itin, opc, asm, pattern>,
T1Misc<opA> {
bits<3> Rm;
bits<3> Rd;
let Inst{5-3} = Rm;
let Inst{2-0} = Rd;
}
// Helper classes for encoding T1sI patterns:
// T1sI + T1DataProcessing<opA> with two 3-bit register fields:
// Rn in Inst{5-3}, Rd in Inst{2-0}.
class T1sIDPEncode<bits<4> opA, dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
: T1sI<oops, iops, itin, opc, asm, pattern>,
T1DataProcessing<opA> {
bits<3> Rd;
bits<3> Rn;
let Inst{5-3} = Rn;
let Inst{2-0} = Rd;
}
// T1sI + T1General<opA> with three 3-bit register fields:
// Rm in Inst{8-6}, Rn in Inst{5-3}, Rd in Inst{2-0}.
class T1sIGenEncode<bits<5> opA, dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
: T1sI<oops, iops, itin, opc, asm, pattern>,
T1General<opA> {
bits<3> Rm;
bits<3> Rn;
bits<3> Rd;
let Inst{8-6} = Rm;
let Inst{5-3} = Rn;
let Inst{2-0} = Rd;
}
// T1sI + T1General<opA> with two 3-bit register fields: Rm in Inst{5-3},
// Rd in Inst{2-0}. The immediate field is not encoded here; users of this
// class place it themselves.
class T1sIGenEncodeImm<bits<5> opA, dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
: T1sI<oops, iops, itin, opc, asm, pattern>,
T1General<opA> {
bits<3> Rd;
bits<3> Rm;
let Inst{5-3} = Rm;
let Inst{2-0} = Rd;
}
// Helper classes for encoding T1sIt patterns:
// T1sIt + T1DataProcessing<opA> with two 3-bit register fields:
// Rm in Inst{5-3}, Rdn in Inst{2-0}.
class T1sItDPEncode<bits<4> opA, dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
: T1sIt<oops, iops, itin, opc, asm, pattern>,
T1DataProcessing<opA> {
bits<3> Rdn;
bits<3> Rm;
let Inst{5-3} = Rm;
let Inst{2-0} = Rdn;
}
// T1sIt + T1General<opA> with a 3-bit Rdn in Inst{10-8} and an 8-bit
// immediate in Inst{7-0}.
class T1sItGenEncodeImm<bits<5> opA, dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
: T1sIt<oops, iops, itin, opc, asm, pattern>,
T1General<opA> {
bits<3> Rdn;
bits<8> imm8;
let Inst{10-8} = Rdn;
let Inst{7-0} = imm8;
}
// Add-family instructions; every def in this scope is flagged isAdd.
let isAdd = 1 in {
// Add with carry register
// Reads CPSR; has no selection pattern of its own.
let isCommutable = 1, Uses = [CPSR] in
def tADC : // A8.6.2
T1sItDPEncode<0b0101, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), IIC_iALUr,
"adc", "\t$Rdn, $Rm",
[]>, Sched<[WriteALU]>;
// Add immediate
// tADDi3: three-operand form; the 3-bit immediate is encoded in Inst{8-6}.
def tADDi3 : // A8.6.4 T1
T1sIGenEncodeImm<0b01110, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3),
IIC_iALUi,
"add", "\t$Rd, $Rm, $imm3",
[(set tGPR:$Rd, (add tGPR:$Rm, imm0_7:$imm3))]>,
Sched<[WriteALU]> {
bits<3> imm3;
let Inst{8-6} = imm3;
}
// tADDi8: two-operand form; the operand is declared imm0_255 but the
// selection pattern only matches imm8_255.
def tADDi8 : // A8.6.4 T2
T1sItGenEncodeImm<{1,1,0,?,?}, (outs tGPR:$Rdn),
(ins tGPR:$Rn, imm0_255:$imm8), IIC_iALUi,
"add", "\t$Rdn, $imm8",
[(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255:$imm8))]>,
Sched<[WriteALU]>;
// Add register
let isCommutable = 1 in
def tADDrr : // A8.6.6 T1
T1sIGenEncode<0b01100, (outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm),
IIC_iALUr,
"add", "\t$Rd, $Rn, $Rm",
[(set tGPR:$Rd, (add tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
/// Similar to the above except these set the 's' bit so the
/// instruction modifies the CPSR register.
///
/// These opcodes will be converted to the real non-S opcodes by
/// AdjustInstrPostInstrSelection after giving them an optional CPSR operand.
let hasPostISelHook = 1, Defs = [CPSR] in {
// tADCS: selected for ARMadde; both reads and defines CPSR.
let isCommutable = 1, Uses = [CPSR] in
def tADCS : tPseudoInst<(outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
2, IIC_iALUr,
[(set tGPR:$Rdn, CPSR, (ARMadde tGPR:$Rn, tGPR:$Rm,
CPSR))]>,
Requires<[IsThumb1Only]>,
Sched<[WriteALU]>;
// tADDSi3 / tADDSi8 / tADDSrr: selected for ARMaddc with an imm0_7
// immediate, an imm8_255 immediate, or a register operand respectively.
def tADDSi3 : tPseudoInst<(outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3),
2, IIC_iALUi,
[(set tGPR:$Rd, CPSR, (ARMaddc tGPR:$Rm,
imm0_7:$imm3))]>,
Requires<[IsThumb1Only]>,
Sched<[WriteALU]>;
def tADDSi8 : tPseudoInst<(outs tGPR:$Rdn), (ins tGPR:$Rn, imm0_255:$imm8),
2, IIC_iALUi,
[(set tGPR:$Rdn, CPSR, (ARMaddc tGPR:$Rn,
imm8_255:$imm8))]>,
Requires<[IsThumb1Only]>,
Sched<[WriteALU]>;
let isCommutable = 1 in
def tADDSrr : tPseudoInst<(outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm),
2, IIC_iALUr,
[(set tGPR:$Rd, CPSR, (ARMaddc tGPR:$Rn,
tGPR:$Rm))]>,
Requires<[IsThumb1Only]>,
Sched<[WriteALU]>;
}
// tADDhirr: add on full GPR operands with no selection pattern. Rdn is
// 4 bits, split as Rdn{3} -> Inst{7} and Rdn{2-0} -> Inst{2-0}; the 4-bit Rm
// is encoded in Inst{6-3}.
let hasSideEffects = 0 in
def tADDhirr : T1pIt<(outs GPR:$Rdn), (ins GPR:$Rn, GPR:$Rm), IIC_iALUr,
"add", "\t$Rdn, $Rm", []>,
T1Special<{0,0,?,?}>, Sched<[WriteALU]> {
// A8.6.6 T2
bits<4> Rdn;
bits<4> Rm;
let Inst{7} = Rdn{3};
let Inst{6-3} = Rm;
let Inst{2-0} = Rdn{2-0};
}
}
// Thumb has more flexible short encodings for ADD than ORR, so use those where
// possible.
// `or` of an AddLikeOrOp value is selected to the corresponding ADD form
// (imm0_7 -> tADDi3, imm8_255 -> tADDi8, register -> tADDrr).
def : T1Pat<(or AddLikeOrOp:$Rn, imm0_7:$imm), (tADDi3 $Rn, imm0_7:$imm)>;
def : T1Pat<(or AddLikeOrOp:$Rn, imm8_255:$imm), (tADDi8 $Rn, imm8_255:$imm)>;
def : T1Pat<(or AddLikeOrOp:$Rn, tGPR:$Rm), (tADDrr $Rn, $Rm)>;
// Two-operand "add Rdn, Rm" spelling maps to tADDrr with Rdn used as both
// destination and first source.
def : tInstAlias <"add${s}${p} $Rdn, $Rm",
(tADDrr tGPR:$Rdn,s_cc_out:$s, tGPR:$Rdn, tGPR:$Rm, pred:$p)>;
// "sub" spellings whose immediate matches the *_neg operand classes are
// substituted with the corresponding ADD instruction.
def : tInstSubst<"sub${s}${p} $rd, $rn, $imm",
(tADDi3 tGPR:$rd, s_cc_out:$s, tGPR:$rn, mod_imm1_7_neg:$imm, pred:$p)>;
def : tInstSubst<"sub${s}${p} $rdn, $imm",
(tADDi8 tGPR:$rdn, s_cc_out:$s, mod_imm8_255_neg:$imm, pred:$p)>;
// AND register
// Two-operand AND (Rdn, Rm), commutable; selected for (and $Rn, $Rm).
let isCommutable = 1 in
def tAND : // A8.6.12
T1sItDPEncode<0b0000, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
IIC_iBITr,
"and", "\t$Rdn, $Rm",
[(set tGPR:$Rdn, (and tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
// ASR immediate
// Arithmetic shift right by an imm_sr immediate; selected for `sra`. The
// 5-bit immediate is encoded in Inst{10-6}.
def tASRri : // A8.6.14
T1sIGenEncodeImm<{0,1,0,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm_sr:$imm5),
IIC_iMOVsi,
"asr", "\t$Rd, $Rm, $imm5",
[(set tGPR:$Rd, (sra tGPR:$Rm, (i32 imm_sr:$imm5)))]>,
Sched<[WriteALU]> {
bits<5> imm5;
let Inst{10-6} = imm5;
}
// ASR register
// Two-operand arithmetic shift right by register; selected for
// (sra $Rn, $Rm).
def tASRrr : // A8.6.15
T1sItDPEncode<0b0100, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
IIC_iMOVsr,
"asr", "\t$Rdn, $Rm",
[(set tGPR:$Rdn, (sra tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
// BIC register
// Two-operand bit clear; selected for (and $Rn, (not $Rm)).
def tBIC : // A8.6.20
T1sItDPEncode<0b1110, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
IIC_iBITr,
"bic", "\t$Rdn, $Rm",
[(set tGPR:$Rdn, (and tGPR:$Rn, (not tGPR:$Rm)))]>,
Sched<[WriteALU]>;
// CMN register
// Compare-negative. Only the ARMcmpZ form (tCMNz) is enabled; the general
// ARMcmp form is kept commented out per the FIXME below.
let isCompare = 1, Defs = [CPSR] in {
//FIXME: Disable CMN, as CCodes are backwards from compare expectations
// Compare-to-zero still works out, just not the relationals
//def tCMN : // A8.6.33
// T1pIDPEncode<0b1011, (outs), (ins tGPR:$lhs, tGPR:$rhs),
// IIC_iCMPr,
// "cmn", "\t$lhs, $rhs",
// [(ARMcmp tGPR:$lhs, (ineg tGPR:$rhs))]>;
// Selected for (ARMcmpZ $Rn, (ineg $Rm)).
def tCMNz : // A8.6.33
T1pIDPEncode<0b1011, (outs), (ins tGPR:$Rn, tGPR:$Rm),
IIC_iCMPr,
"cmn", "\t$Rn, $Rm",
[(ARMcmpZ tGPR:$Rn, (ineg tGPR:$Rm))]>, Sched<[WriteCMP]>;
} // isCompare = 1, Defs = [CPSR]
// CMP immediate
// Compare instructions; every def in this scope is a compare that defines
// CPSR.
let isCompare = 1, Defs = [CPSR] in {
// tCMPi8: compare against an 8-bit immediate. Rn in Inst{10-8}, imm8 in
// Inst{7-0}.
def tCMPi8 : T1pI<(outs), (ins tGPR:$Rn, imm0_255:$imm8), IIC_iCMPi,
"cmp", "\t$Rn, $imm8",
[(ARMcmp tGPR:$Rn, imm0_255:$imm8)]>,
T1General<{1,0,1,?,?}>, Sched<[WriteCMP]> {
// A8.6.35
bits<3> Rn;
bits<8> imm8;
let Inst{10-8} = Rn;
let Inst{7-0} = imm8;
}
// CMP register
def tCMPr : // A8.6.36 T1
T1pIDPEncode<0b1010, (outs), (ins tGPR:$Rn, tGPR:$Rm),
IIC_iCMPr,
"cmp", "\t$Rn, $Rm",
[(ARMcmp tGPR:$Rn, tGPR:$Rm)]>, Sched<[WriteCMP]>;
// tCMPhir: compare on full GPR operands, no selection pattern. Rn{3} goes
// to Inst{7}, Rm to Inst{6-3}, Rn{2-0} to Inst{2-0}.
def tCMPhir : T1pI<(outs), (ins GPR:$Rn, GPR:$Rm), IIC_iCMPr,
"cmp", "\t$Rn, $Rm", []>,
T1Special<{0,1,?,?}>, Sched<[WriteCMP]> {
// A8.6.36 T2
bits<4> Rm;
bits<4> Rn;
let Inst{7} = Rn{3};
let Inst{6-3} = Rm;
let Inst{2-0} = Rn{2-0};
}
} // isCompare = 1, Defs = [CPSR]
// XOR register
// Two-operand exclusive OR, commutable; selected for (xor $Rn, $Rm).
let isCommutable = 1 in
def tEOR : // A8.6.45
T1sItDPEncode<0b0001, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
IIC_iBITr,
"eor", "\t$Rdn, $Rm",
[(set tGPR:$Rdn, (xor tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
// LSL immediate
// Logical shift left by immediate; selected for `shl`. The operand is
// declared imm0_31 while the pattern matches a plain `imm`; the 5-bit
// immediate is encoded in Inst{10-6}.
def tLSLri : // A8.6.88
T1sIGenEncodeImm<{0,0,0,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_31:$imm5),
IIC_iMOVsi,
"lsl", "\t$Rd, $Rm, $imm5",
[(set tGPR:$Rd, (shl tGPR:$Rm, (i32 imm:$imm5)))]>,
Sched<[WriteALU]> {
bits<5> imm5;
let Inst{10-6} = imm5;
}
// LSL register
// Two-operand logical shift left by register; selected for (shl $Rn, $Rm).
def tLSLrr : // A8.6.89
T1sItDPEncode<0b0010, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
IIC_iMOVsr,
"lsl", "\t$Rdn, $Rm",
[(set tGPR:$Rdn, (shl tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
// LSR immediate
// Logical shift right by an imm_sr immediate; selected for `srl`. The 5-bit
// immediate is encoded in Inst{10-6}.
def tLSRri : // A8.6.90
T1sIGenEncodeImm<{0,0,1,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm_sr:$imm5),
IIC_iMOVsi,
"lsr", "\t$Rd, $Rm, $imm5",
[(set tGPR:$Rd, (srl tGPR:$Rm, (i32 imm_sr:$imm5)))]>,
Sched<[WriteALU]> {
bits<5> imm5;
let Inst{10-6} = imm5;
}
// LSR register
// Two-operand logical shift right by register; selected for (srl $Rn, $Rm).
def tLSRrr : // A8.6.91
T1sItDPEncode<0b0011, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
IIC_iMOVsr,
"lsr", "\t$Rdn, $Rm",
[(set tGPR:$Rdn, (srl tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
// Move immediate
// Materializes an 8-bit immediate (imm0_255) into a tGPR; flagged isMoveImm.
// Rd is encoded in Inst{10-8} and imm8 in Inst{7-0}.
let isMoveImm = 1 in
def tMOVi8 : T1sI<(outs tGPR:$Rd), (ins imm0_255:$imm8), IIC_iMOVi,
"mov", "\t$Rd, $imm8",
[(set tGPR:$Rd, imm0_255:$imm8)]>,
T1General<{1,0,0,?,?}>, Sched<[WriteALU]> {
// A8.6.96
bits<3> Rd;
bits<8> imm8;
let Inst{10-8} = Rd;
let Inst{7-0} = imm8;
}
// Because we have an explicit tMOVSr below, we need an alias to handle
// the immediate "movs" form here. Blech.
// "movs Rdn, imm" maps to tMOVi8 with CPSR as the cc_out operand and the
// fixed predicate operands (14, 0).
def : tInstAlias <"movs $Rdn, $imm",
(tMOVi8 tGPR:$Rdn, CPSR, imm0_255:$imm, 14, 0)>;
// A7-73: MOV(2) - mov setting flag.
// Register-to-register moves; both defs are flagged isMoveReg with no side
// effects and carry no selection pattern.
let hasSideEffects = 0, isMoveReg = 1 in {
// tMOVr: predicable move on full GPR operands. Rd{3} goes to Inst{7}, Rm to
// Inst{6-3}, Rd{2-0} to Inst{2-0}.
def tMOVr : Thumb1pI<(outs GPR:$Rd), (ins GPR:$Rm), AddrModeNone,
2, IIC_iMOVr,
"mov", "\t$Rd, $Rm", "", []>,
T1Special<{1,0,?,?}>, Sched<[WriteALU]> {
// A8.6.97
bits<4> Rd;
bits<4> Rm;
let Inst{7} = Rd{3};
let Inst{6-3} = Rm;
let Inst{2-0} = Rd{2-0};
}
// tMOVSr: "movs" on tGPR operands, defines CPSR. Inst{15-6} is all zeros,
// Rm in Inst{5-3}, Rd in Inst{2-0}.
let Defs = [CPSR] in
def tMOVSr : T1I<(outs tGPR:$Rd), (ins tGPR:$Rm), IIC_iMOVr,
"movs\t$Rd, $Rm", []>, Encoding16, Sched<[WriteALU]> {
// A8.6.97
bits<3> Rd;
bits<3> Rm;
let Inst{15-6} = 0b0000000000;
let Inst{5-3} = Rm;
let Inst{2-0} = Rd;
}
} // hasSideEffects
// Multiply register
// Multiply, commutable; selected for (mul $Rn, $Rm). Printed with three
// operands, but $Rm is tied to $Rd ("$Rm = $Rd"), so only Rn (Inst{5-3}) and
// Rd (Inst{2-0}) are encoded.
let isCommutable = 1 in
def tMUL : // A8.6.105 T1
Thumb1sI<(outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm), AddrModeNone, 2,
IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm", "$Rm = $Rd",
[(set tGPR:$Rd, (mul tGPR:$Rn, tGPR:$Rm))]>,
T1DataProcessing<0b1101>, Sched<[WriteMUL32, ReadMUL, ReadMUL]> {
bits<3> Rd;
bits<3> Rn;
let Inst{5-3} = Rn;
let Inst{2-0} = Rd;
let AsmMatchConverter = "cvtThumbMultiply";
}
// Two-operand "mul Rdm, Rn" spelling maps to tMUL.
def :tInstAlias<"mul${s}${p} $Rdm, $Rn", (tMUL tGPR:$Rdm, s_cc_out:$s, tGPR:$Rn,
pred:$p)>;
// Move inverse register
// Bitwise NOT of a register; selected for (not $Rn).
def tMVN : // A8.6.107
T1sIDPEncode<0b1111, (outs tGPR:$Rd), (ins tGPR:$Rn), IIC_iMVNr,
"mvn", "\t$Rd, $Rn",
[(set tGPR:$Rd, (not tGPR:$Rn))]>, Sched<[WriteALU]>;
// Bitwise or register
// Two-operand bitwise OR, commutable; selected for (or $Rn, $Rm).
let isCommutable = 1 in
def tORR : // A8.6.114
T1sItDPEncode<0b1100, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
IIC_iBITr,
"orr", "\t$Rdn, $Rm",
[(set tGPR:$Rdn, (or tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
// Swaps
// Selected for (bswap $Rm); requires HasV6 on Thumb1-only subtargets.
def tREV : // A8.6.134
T1pIMiscEncode<{1,0,1,0,0,0,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
IIC_iUNAr,
"rev", "\t$Rd, $Rm",
[(set tGPR:$Rd, (bswap tGPR:$Rm))]>,
Requires<[IsThumb, IsThumb1Only, HasV6]>, Sched<[WriteALU]>;
// Selected for (rotr (bswap $Rm), 16); requires HasV6 on Thumb1-only
// subtargets.
def tREV16 : // A8.6.135
T1pIMiscEncode<{1,0,1,0,0,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
IIC_iUNAr,
"rev16", "\t$Rd, $Rm",
[(set tGPR:$Rd, (rotr (bswap tGPR:$Rm), (i32 16)))]>,
Requires<[IsThumb, IsThumb1Only, HasV6]>, Sched<[WriteALU]>;
// Selected for (sra (bswap $Rm), 16); requires HasV6 on Thumb1-only
// subtargets.
def tREVSH : // A8.6.136
T1pIMiscEncode<{1,0,1,0,1,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
IIC_iUNAr,
"revsh", "\t$Rd, $Rm",
[(set tGPR:$Rd, (sra (bswap tGPR:$Rm), (i32 16)))]>,
Requires<[IsThumb, IsThumb1Only, HasV6]>, Sched<[WriteALU]>;
// Rotate right register
// Two-operand rotate right by register; selected for (rotr $Rn, $Rm).
def tROR : // A8.6.139
T1sItDPEncode<0b0111, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
IIC_iMOVsr,
"ror", "\t$Rdn, $Rm",
[(set tGPR:$Rdn, (rotr tGPR:$Rn, tGPR:$Rm))]>,
Sched<[WriteALU]>;
// Negate register
// Selected for (ineg $Rn); printed as "rsb Rd, Rn, #0" with the #0 baked
// into the assembly string.
def tRSB : // A8.6.141
T1sIDPEncode<0b1001, (outs tGPR:$Rd), (ins tGPR:$Rn),
IIC_iALUi,
"rsb", "\t$Rd, $Rn, #0",
[(set tGPR:$Rd, (ineg tGPR:$Rn))]>, Sched<[WriteALU]>;
// Subtract with carry register
// Two-operand subtract with carry; reads CPSR and has no selection pattern
// of its own.
let Uses = [CPSR] in
def tSBC : // A8.6.151
T1sItDPEncode<0b0110, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
IIC_iALUr,
"sbc", "\t$Rdn, $Rm",
[]>,
Sched<[WriteALU]>;
// Subtract immediate
// Three-operand subtract of a 3-bit immediate. Selected for an `add` whose
// constant matches imm0_7_neg; the 3-bit immediate is encoded in Inst{8-6}.
def tSUBi3 : // A8.6.210 T1
T1sIGenEncodeImm<0b01111, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3),
IIC_iALUi,
"sub", "\t$Rd, $Rm, $imm3",
[(set tGPR:$Rd, (add tGPR:$Rm, imm0_7_neg:$imm3))]>,
Sched<[WriteALU]> {
bits<3> imm3;
let Inst{8-6} = imm3;
}
// Two-operand subtract of an 8-bit immediate. Selected for an `add` whose
// constant matches imm8_255_neg.
def tSUBi8 : // A8.6.210 T2
T1sItGenEncodeImm<{1,1,1,?,?}, (outs tGPR:$Rdn),
(ins tGPR:$Rn, imm0_255:$imm8), IIC_iALUi,
"sub", "\t$Rdn, $imm8",
[(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255_neg:$imm8))]>,
Sched<[WriteALU]>;
// "add" spellings whose immediate matches the *_neg operand classes are
// substituted with the corresponding SUB instruction.
def : tInstSubst<"add${s}${p} $rd, $rn, $imm",
(tSUBi3 tGPR:$rd, s_cc_out:$s, tGPR:$rn, mod_imm1_7_neg:$imm, pred:$p)>;
def : tInstSubst<"add${s}${p} $rdn, $imm",
(tSUBi8 tGPR:$rdn, s_cc_out:$s, mod_imm8_255_neg:$imm, pred:$p)>;
// Subtract register
// Three-operand register subtract; selected for (sub $Rn, $Rm).
def tSUBrr : // A8.6.212
T1sIGenEncode<0b01101, (outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm),
IIC_iALUr,
"sub", "\t$Rd, $Rn, $Rm",
[(set tGPR:$Rd, (sub tGPR:$Rn, tGPR:$Rm))]>,
Sched<[WriteALU]>;
// Two-operand "sub Rdn, Rm" spelling maps to tSUBrr with Rdn used as both
// destination and first source.
def : tInstAlias <"sub${s}${p} $Rdn, $Rm",
(tSUBrr tGPR:$Rdn,s_cc_out:$s, tGPR:$Rdn, tGPR:$Rm, pred:$p)>;
/// Similar to the above except these set the 's' bit so the
/// instruction modifies the CPSR register.
///
/// These opcodes will be converted to the real non-S opcodes by
/// AdjustInstrPostInstrSelection after giving them an optional CPSR operand.
// Flag-setting subtract pseudos for Thumb1-only subtargets. Every def here
// defines CPSR and is marked hasPostISelHook.
let hasPostISelHook = 1, Defs = [CPSR] in {
// tSBCS: selected for ARMsube; also reads CPSR.
let Uses = [CPSR] in
def tSBCS : tPseudoInst<(outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
2, IIC_iALUr,
[(set tGPR:$Rdn, CPSR, (ARMsube tGPR:$Rn, tGPR:$Rm,
CPSR))]>,
Requires<[IsThumb1Only]>,
Sched<[WriteALU]>;
// tSUBSi3 / tSUBSi8 / tSUBSrr: selected for ARMsubc with an imm0_7
// immediate, an imm8_255 immediate, or a register operand respectively.
def tSUBSi3 : tPseudoInst<(outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3),
2, IIC_iALUi,
[(set tGPR:$Rd, CPSR, (ARMsubc tGPR:$Rm,
imm0_7:$imm3))]>,
Requires<[IsThumb1Only]>,
Sched<[WriteALU]>;
def tSUBSi8 : tPseudoInst<(outs tGPR:$Rdn), (ins tGPR:$Rn, imm0_255:$imm8),
2, IIC_iALUi,
[(set tGPR:$Rdn, CPSR, (ARMsubc tGPR:$Rn,
imm8_255:$imm8))]>,
Requires<[IsThumb1Only]>,
Sched<[WriteALU]>;
def tSUBSrr : tPseudoInst<(outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm),
2, IIC_iALUr,
[(set tGPR:$Rd, CPSR, (ARMsubc tGPR:$Rn,
tGPR:$Rm))]>,
Requires<[IsThumb1Only]>,
Sched<[WriteALU]>;
// tRSBS: selected for (ARMsubc 0, $Rn).
def tRSBS : tPseudoInst<(outs tGPR:$Rd), (ins tGPR:$Rn),
2, IIC_iALUr,
[(set tGPR:$Rd, CPSR, (ARMsubc 0, tGPR:$Rn))]>,
Requires<[IsThumb1Only]>,
Sched<[WriteALU]>;
}
// ARMsubs is selected to the flag-setting subtract pseudos: register ->
// tSUBSrr, imm0_7 -> tSUBSi3, imm0_255 -> tSUBSi8.
def : T1Pat<(ARMsubs tGPR:$Rn, tGPR:$Rm), (tSUBSrr $Rn, $Rm)>;
def : T1Pat<(ARMsubs tGPR:$Rn, imm0_7:$imm3), (tSUBSi3 $Rn, imm0_7:$imm3)>;
def : T1Pat<(ARMsubs tGPR:$Rn, imm0_255:$imm8), (tSUBSi8 $Rn, imm0_255:$imm8)>;
// Sign-extend byte
// Selected for (sext_inreg $Rm, i8); requires HasV6 on Thumb1-only
// subtargets.
def tSXTB : // A8.6.222
T1pIMiscEncode<{0,0,1,0,0,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
IIC_iUNAr,
"sxtb", "\t$Rd, $Rm",
[(set tGPR:$Rd, (sext_inreg tGPR:$Rm, i8))]>,
Requires<[IsThumb, IsThumb1Only, HasV6]>,
Sched<[WriteALU]>;
// Sign-extend short
// Selected for (sext_inreg $Rm, i16); requires HasV6 on Thumb1-only
// subtargets.
def tSXTH : // A8.6.224
T1pIMiscEncode<{0,0,1,0,0,0,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
IIC_iUNAr,
"sxth", "\t$Rd, $Rm",
[(set tGPR:$Rd, (sext_inreg tGPR:$Rm, i16))]>,
Requires<[IsThumb, IsThumb1Only, HasV6]>,
Sched<[WriteALU]>;
// Test
// Commutable compare that defines CPSR; selected for
// (ARMcmpZ (and_su $Rn, $Rm), 0).
let isCompare = 1, isCommutable = 1, Defs = [CPSR] in
def tTST : // A8.6.230
T1pIDPEncode<0b1000, (outs), (ins tGPR:$Rn, tGPR:$Rm), IIC_iTSTr,
"tst", "\t$Rn, $Rm",
[(ARMcmpZ (and_su tGPR:$Rn, tGPR:$Rm), 0)]>,
Sched<[WriteALU]>;
// A8.8.247 UDF - Undefined (Encoding T1)
// Selected for int_arm_undefined with an 8-bit immediate. Encoding:
// Inst{15-12} = 0b1101, Inst{11-8} = 0b1110, Inst{7-0} = imm8.
def tUDF : TI<(outs), (ins imm0_255:$imm8), IIC_Br, "udf\t$imm8",
[(int_arm_undefined imm0_255:$imm8)]>, Encoding16 {
bits<8> imm8;
let Inst{15-12} = 0b1101;
let Inst{11-8} = 0b1110;
let Inst{7-0} = imm8;
}
// debugtrap lowering: (tBKPT 0) when HasV5T, otherwise (tUDF 254).
def : Pat<(debugtrap), (tBKPT 0)>, Requires<[IsThumb, HasV5T]>;
def : Pat<(debugtrap), (tUDF 254)>, Requires<[IsThumb, NoV5T]>;
// Windows-only terminator selected for (int_arm_undefined 249); the whole
// encoding is the constant 0xdef9.
def t__brkdiv0 : TI<(outs), (ins), IIC_Br, "__brkdiv0",
[(int_arm_undefined 249)]>, Encoding16,
Requires<[IsThumb, IsWindows]> {
let Inst = 0xdef9;
let isTerminator = 1;
}
// Zero-extend byte
// Selected for (and $Rm, 0xFF); requires HasV6 on Thumb1-only subtargets.
def tUXTB : // A8.6.262
T1pIMiscEncode<{0,0,1,0,1,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
IIC_iUNAr,
"uxtb", "\t$Rd, $Rm",
[(set tGPR:$Rd, (and tGPR:$Rm, 0xFF))]>,
Requires<[IsThumb, IsThumb1Only, HasV6]>,
Sched<[WriteALU]>;
// Zero-extend short
// Selected for (and $Rm, 0xFFFF); requires HasV6 on Thumb1-only subtargets.
def tUXTH : // A8.6.264
T1pIMiscEncode<{0,0,1,0,1,0,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
IIC_iUNAr,
"uxth", "\t$Rd, $Rm",
[(set tGPR:$Rd, (and tGPR:$Rm, 0xFFFF))]>,
Requires<[IsThumb, IsThumb1Only, HasV6]>, Sched<[WriteALU]>;
// Conditional move tMOVCCr - Used to implement the Thumb SELECT_CC operation.
// Expanded after instruction selection into a branch sequence.
// Pseudo selected for (ARMcmov $false, $true, cmovpred); it relies on a
// custom inserter rather than a direct encoding.
let usesCustomInserter = 1 in // Expanded after instruction selection.
def tMOVCCr_pseudo :
PseudoInst<(outs tGPR:$dst), (ins tGPR:$false, tGPR:$true, cmovpred:$p),
NoItinerary,
[(set tGPR:$dst, (ARMcmov tGPR:$false, tGPR:$true, cmovpred:$p))]>;
// tLEApcrel - Load a pc-relative address into a register without offending the
// assembler.
// ADR with no selection pattern. Rd is encoded in Inst{10-8} and the 8-bit
// label field in Inst{7-0}; decoding goes through DecodeThumbAddSpecialReg.
def tADR : T1I<(outs tGPR:$Rd), (ins t_adrlabel:$addr, pred:$p),
IIC_iALUi, "adr{$p}\t$Rd, $addr", []>,
T1Encoding<{1,0,1,0,0,?}>, Sched<[WriteALU]> {
bits<3> Rd;
bits<8> addr;
let Inst{10-8} = Rd;
let Inst{7-0} = addr;
let DecoderMethod = "DecodeThumbAddSpecialReg";
}
// Address-materialization pseudo taking a label and predicate; flagged
// rematerializable with no side effects. No pattern is attached here.
let hasSideEffects = 0, isReMaterializable = 1 in
def tLEApcrel : tPseudoInst<(outs tGPR:$Rd), (ins i32imm:$label, pred:$p),
2, IIC_iALUi, []>, Sched<[WriteALU]>;
// Jump-table variant of the address-materialization pseudo. Unlike tLEApcrel
// it is marked hasSideEffects = 1 and is not flagged rematerializable.
let hasSideEffects = 1 in
def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd),
(ins i32imm:$label, pred:$p),
2, IIC_iALUi, []>, Sched<[WriteALU]>;
// Thumb-1 doesn't have the TBB or TBH instructions, but we can synthesize them
// and make use of the same compressed jump table format as Thumb-2.
// Table-branch pseudos: both are 2-byte indirect branches that terminate the
// block and act as barriers. Operands are the base register, the index
// register, the jump table and a PC label.
let Size = 2, isBranch = 1, isTerminator = 1, isBarrier = 1,
- isIndirectBranch = 1 in {
+ isIndirectBranch = 1, isNotDuplicable = 1 in {
def tTBB_JT : tPseudoInst<(outs),
(ins tGPRwithpc:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0,
IIC_Br, []>, Sched<[WriteBr]>;
def tTBH_JT : tPseudoInst<(outs),
(ins tGPRwithpc:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0,
IIC_Br, []>, Sched<[WriteBr]>;
}
//===----------------------------------------------------------------------===//
// TLS Instructions
//
// __aeabi_read_tp preserves the registers r1-r3.
// This is a pseudo inst so that we can get the encoding right,
// complete with fixup for the aeabi_read_tp function.
// Call pseudo selected for ARMthread_pointer; the result is produced in R0.
// Declared to define R0, R12, LR and CPSR and to use SP.
let isCall = 1, Defs = [R0, R12, LR, CPSR], Uses = [SP] in
def tTPsoft : tPseudoInst<(outs), (ins), 4, IIC_Br,
[(set R0, ARMthread_pointer)]>,
Sched<[WriteBr]>;
//===----------------------------------------------------------------------===//
// SJLJ Exception handling intrinsics
//
// eh_sjlj_setjmp() is an instruction sequence to store the return address and
// save #0 in R0 for the non-longjmp case. Since by its nature we may be coming
// from some other function to get here, and we're using the stack frame for the
// containing function to save/restore registers, we can't keep anything live in
// regs across the eh_sjlj_setjmp(), else it will almost certainly have been
// tromped upon when we get here from a longjmp(). We force everything out of
// registers except for our own input by listing the relevant registers in
// Defs. By doing so, we also cause the prologue/epilogue code to actively
// preserve all of the callee-saved registers, which is exactly what we want.
// $val is a scratch register for our use.
// Selected for ARMeh_sjlj_setjmp with the result in R0. Declares R0-R7, R12
// and CPSR as defs; codegen-only barrier with side effects, expanded through
// a custom inserter.
let Defs = [ R0, R1, R2, R3, R4, R5, R6, R7, R12, CPSR ],
hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
usesCustomInserter = 1 in
def tInt_eh_sjlj_setjmp : ThumbXI<(outs),(ins tGPR:$src, tGPR:$val),
AddrModeNone, 0, NoItinerary, "","",
[(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>;
// FIXME: Non-IOS version(s)
// Non-Windows longjmp pseudo selected for ARMeh_sjlj_longjmp on tGPR
// operands. Codegen-only terminator/barrier that defines R7, LR and SP.
let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, isCodeGenOnly = 1,
Defs = [ R7, LR, SP ] in
def tInt_eh_sjlj_longjmp : XI<(outs), (ins tGPR:$src, tGPR:$scratch),
AddrModeNone, 0, IndexModeNone,
Pseudo, NoItinerary, "", "",
[(ARMeh_sjlj_longjmp tGPR:$src, tGPR:$scratch)]>,
Requires<[IsThumb,IsNotWindows]>;
// (Windows is Thumb2-only)
// Windows longjmp pseudo selected for ARMeh_sjlj_longjmp on full GPR
// operands. Codegen-only terminator/barrier that defines R11, LR and SP.
let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, isCodeGenOnly = 1,
Defs = [ R11, LR, SP ] in
def tInt_WIN_eh_sjlj_longjmp
: XI<(outs), (ins GPR:$src, GPR:$scratch), AddrModeNone, 0, IndexModeNone,
Pseudo, NoItinerary, "", "", [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>,
Requires<[IsThumb,IsWindows]>;
//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
//
// Comparisons
// ARMcmpZ reuses the ordinary compare instructions: 8-bit immediate ->
// tCMPi8, register -> tCMPr.
def : T1Pat<(ARMcmpZ tGPR:$Rn, imm0_255:$imm8),
(tCMPi8 tGPR:$Rn, imm0_255:$imm8)>;
def : T1Pat<(ARMcmpZ tGPR:$Rn, tGPR:$Rm),
(tCMPr tGPR:$Rn, tGPR:$Rm)>;
// Bswap 16 with load/store
// (srl (bswap x), 16) fused with a halfword memory access: for loads, do the
// halfword load and then tREV16 the result; for truncating halfword stores,
// tREV16 the value and then store it. Both immediate- and register-offset
// addressing forms are covered.
def : T1Pat<(srl (bswap (extloadi16 t_addrmode_is2:$addr)), (i32 16)),
(tREV16 (tLDRHi t_addrmode_is2:$addr))>;
def : T1Pat<(srl (bswap (extloadi16 t_addrmode_rr:$addr)), (i32 16)),
(tREV16 (tLDRHr t_addrmode_rr:$addr))>;
def : T1Pat<(truncstorei16 (srl (bswap tGPR:$Rn), (i32 16)),
t_addrmode_is2:$addr),
(tSTRHi(tREV16 tGPR:$Rn), t_addrmode_is2:$addr)>;
def : T1Pat<(truncstorei16 (srl (bswap tGPR:$Rn), (i32 16)),
t_addrmode_rr:$addr),
(tSTRHr (tREV16 tGPR:$Rn), t_addrmode_rr:$addr)>;
// ConstantPool
// The address of a constant-pool entry is materialized with tLEApcrel.
def : T1Pat<(ARMWrapper tconstpool :$dst), (tLEApcrel tconstpool :$dst)>;
// GlobalAddress
// Pseudo selected for (ARMWrapperPIC tglobaladdr) when DontUseMovtInPic
// holds.
def tLDRLIT_ga_pcrel : PseudoInst<(outs tGPR:$dst), (ins i32imm:$addr),
IIC_iLoadiALU,
[(set tGPR:$dst,
(ARMWrapperPIC tglobaladdr:$addr))]>,
Requires<[IsThumb, DontUseMovtInPic]>;
// Pseudo selected for (ARMWrapper tglobaladdr) when DontUseMovt holds.
def tLDRLIT_ga_abs : PseudoInst<(outs tGPR:$dst), (ins i32imm:$src),
IIC_iLoad_i,
[(set tGPR:$dst,
(ARMWrapper tglobaladdr:$src))]>,
Requires<[IsThumb, DontUseMovt]>;
// TLS globals
// TLS global addresses reuse the literal-load pseudos: ARMWrapperPIC ->
// tLDRLIT_ga_pcrel, ARMWrapper -> tLDRLIT_ga_abs, under the same predicates
// as the non-TLS forms.
def : Pat<(ARMWrapperPIC tglobaltlsaddr:$addr),
(tLDRLIT_ga_pcrel tglobaltlsaddr:$addr)>,
Requires<[IsThumb, DontUseMovtInPic]>;
def : Pat<(ARMWrapper tglobaltlsaddr:$addr),
(tLDRLIT_ga_abs tglobaltlsaddr:$addr)>,
Requires<[IsThumb, DontUseMovt]>;
// JumpTable
// The address of a jump table is materialized with tLEApcrelJT.
def : T1Pat<(ARMWrapperJT tjumptable:$dst),
(tLEApcrelJT tjumptable:$dst)>;
// Direct calls
// A call to an external symbol is selected to tBL.
def : T1Pat<(ARMcall texternalsym:$func), (tBL texternalsym:$func)>,
Requires<[IsThumb]>;
// zextload i1 -> zextload i8
// zextloadi1 is selected to the byte loads (tLDRBi / tLDRBr).
def : T1Pat<(zextloadi1 t_addrmode_is1:$addr),
(tLDRBi t_addrmode_is1:$addr)>;
def : T1Pat<(zextloadi1 t_addrmode_rr:$addr),
(tLDRBr t_addrmode_rr:$addr)>;
// extload from the stack -> word load from the stack, as it avoids having to
// materialize the base in a separate register. This only works when a word
// load puts the byte/halfword value in the same place in the register that the
// byte/halfword load would, i.e. when little-endian.
// i1/i8/i16 any-extending loads from an SP-relative address are selected to
// the word load tLDRspi; restricted to little-endian Thumb1-only subtargets.
def : T1Pat<(extloadi1 t_addrmode_sp:$addr), (tLDRspi t_addrmode_sp:$addr)>,
Requires<[IsThumb, IsThumb1Only, IsLE]>;
def : T1Pat<(extloadi8 t_addrmode_sp:$addr), (tLDRspi t_addrmode_sp:$addr)>,
Requires<[IsThumb, IsThumb1Only, IsLE]>;
def : T1Pat<(extloadi16 t_addrmode_sp:$addr), (tLDRspi t_addrmode_sp:$addr)>,
Requires<[IsThumb, IsThumb1Only, IsLE]>;
// extload -> zextload
// Any-extending loads are selected to the byte/halfword loads: i1 and i8 ->
// tLDRBi / tLDRBr, i16 -> tLDRHi / tLDRHr.
def : T1Pat<(extloadi1 t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>;
def : T1Pat<(extloadi1 t_addrmode_rr:$addr), (tLDRBr t_addrmode_rr:$addr)>;
def : T1Pat<(extloadi8 t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>;
def : T1Pat<(extloadi8 t_addrmode_rr:$addr), (tLDRBr t_addrmode_rr:$addr)>;
def : T1Pat<(extloadi16 t_addrmode_is2:$addr), (tLDRHi t_addrmode_is2:$addr)>;
def : T1Pat<(extloadi16 t_addrmode_rr:$addr), (tLDRHr t_addrmode_rr:$addr)>;
// post-inc loads and stores
// post-inc LDR -> LDM r0!, {r1}. The way operands are laid out in LDMs is
// different to how ISel expects them for a post-inc load, so use a pseudo
// and expand it just after ISel.
// Post-increment load pseudo with no selection pattern. Outputs the loaded
// value ($Rt) and the written-back base ($Rn_wb); $Rn_wb is tied to $Rn and
// marked earlyclobber. Expanded through a custom inserter.
let usesCustomInserter = 1, mayLoad =1,
Constraints = "$Rn = $Rn_wb,@earlyclobber $Rn_wb" in
def tLDR_postidx: tPseudoInst<(outs tGPR:$Rt, tGPR:$Rn_wb),
(ins tGPR:$Rn, pred:$p),
4, IIC_iStore_ru,
[]>;
// post-inc STR -> STM r0!, {r1}. The layout of this (because it doesn't def
// multiple registers) is the same in ISel as MachineInstr, so there's no need
// for a pseudo.
// Only a post-increment of exactly 4 is matched by this pattern.
def : T1Pat<(post_store tGPR:$Rt, tGPR:$Rn, 4),
(tSTMIA_UPD tGPR:$Rn, tGPR:$Rt)>;
// If it's impossible to use [r,r] address mode for sextload, select to
// ldr{b|h} + sxt{b|h} instead.
// These four patterns require HasV6 because they use tSXTB / tSXTH.
def : T1Pat<(sextloadi8 t_addrmode_is1:$addr),
(tSXTB (tLDRBi t_addrmode_is1:$addr))>,
Requires<[IsThumb, IsThumb1Only, HasV6]>;
def : T1Pat<(sextloadi8 t_addrmode_rr:$addr),
(tSXTB (tLDRBr t_addrmode_rr:$addr))>,
Requires<[IsThumb, IsThumb1Only, HasV6]>;
def : T1Pat<(sextloadi16 t_addrmode_is2:$addr),
(tSXTH (tLDRHi t_addrmode_is2:$addr))>,
Requires<[IsThumb, IsThumb1Only, HasV6]>;
def : T1Pat<(sextloadi16 t_addrmode_rr:$addr),
(tSXTH (tLDRHr t_addrmode_rr:$addr))>,
Requires<[IsThumb, IsThumb1Only, HasV6]>;
// The same four sextloads without any HasV6 requirement: the loaded value is
// sign-extended with a shift pair instead (lsl then asr, by 24 for a byte and
// by 16 for a halfword).
def : T1Pat<(sextloadi8 t_addrmode_is1:$addr),
(tASRri (tLSLri (tLDRBi t_addrmode_is1:$addr), 24), 24)>;
def : T1Pat<(sextloadi8 t_addrmode_rr:$addr),
(tASRri (tLSLri (tLDRBr t_addrmode_rr:$addr), 24), 24)>;
def : T1Pat<(sextloadi16 t_addrmode_is2:$addr),
(tASRri (tLSLri (tLDRHi t_addrmode_is2:$addr), 16), 16)>;
def : T1Pat<(sextloadi16 t_addrmode_rr:$addr),
(tASRri (tLSLri (tLDRHr t_addrmode_rr:$addr), 16), 16)>;
// Atomic loads of 8, 16 and 32 bits select the ordinary byte, halfword and
// word load instructions, one pattern per address operand kind.
def : T1Pat<(atomic_load_8 t_addrmode_is1:$src),
(tLDRBi t_addrmode_is1:$src)>;
def : T1Pat<(atomic_load_8 t_addrmode_rr:$src),
(tLDRBr t_addrmode_rr:$src)>;
def : T1Pat<(atomic_load_16 t_addrmode_is2:$src),
(tLDRHi t_addrmode_is2:$src)>;
def : T1Pat<(atomic_load_16 t_addrmode_rr:$src),
(tLDRHr t_addrmode_rr:$src)>;
def : T1Pat<(atomic_load_32 t_addrmode_is4:$src),
(tLDRi t_addrmode_is4:$src)>;
def : T1Pat<(atomic_load_32 t_addrmode_rr:$src),
(tLDRr t_addrmode_rr:$src)>;
// Atomic stores likewise select the ordinary store instructions. Note the
// operand order: the DAG node lists the pointer first, the instruction lists
// the stored value first.
def : T1Pat<(atomic_store_8 t_addrmode_is1:$ptr, tGPR:$val),
(tSTRBi tGPR:$val, t_addrmode_is1:$ptr)>;
def : T1Pat<(atomic_store_8 t_addrmode_rr:$ptr, tGPR:$val),
(tSTRBr tGPR:$val, t_addrmode_rr:$ptr)>;
def : T1Pat<(atomic_store_16 t_addrmode_is2:$ptr, tGPR:$val),
(tSTRHi tGPR:$val, t_addrmode_is2:$ptr)>;
def : T1Pat<(atomic_store_16 t_addrmode_rr:$ptr, tGPR:$val),
(tSTRHr tGPR:$val, t_addrmode_rr:$ptr)>;
def : T1Pat<(atomic_store_32 t_addrmode_is4:$ptr, tGPR:$val),
(tSTRi tGPR:$val, t_addrmode_is4:$ptr)>;
def : T1Pat<(atomic_store_32 t_addrmode_rr:$ptr, tGPR:$val),
(tSTRr tGPR:$val, t_addrmode_rr:$ptr)>;
// Large immediate handling.
// Two piece imms.
// thumb_immshifted: tMOVi8 of the value part followed by tLSLri by the shift
// amount part (both parts are produced by the transforms named below).
def : T1Pat<(i32 thumb_immshifted:$src),
(tLSLri (tMOVi8 (thumb_immshifted_val imm:$src)),
(thumb_immshifted_shamt imm:$src))>;
// imm0_255_comp: tMOVi8 of the imm_not_XFORM'ed immediate followed by tMVN.
def : T1Pat<(i32 imm0_255_comp:$src),
(tMVN (tMOVi8 (imm_not_XFORM imm:$src)))>;
// imm256_510: tMOVi8 of 255 followed by tADDi8 of the remaining addend.
def : T1Pat<(i32 imm256_510:$src),
(tADDi8 (tMOVi8 255),
(thumb_imm256_510_addend imm:$src))>;
// Pseudo instruction that combines ldr from constpool and add pc. This should
// be expanded into two instructions late to allow if-conversion and
// scheduling.
// It matches an ARMpic_add whose first operand is a load from a wrapped
// constant-pool address, is rematerializable, and is only available on
// Thumb1-only subtargets.
let isReMaterializable = 1 in
def tLDRpci_pic : PseudoInst<(outs tGPR:$dst), (ins i32imm:$addr, pclabel:$cp),
NoItinerary,
[(set tGPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)),
imm:$cp))]>,
Requires<[IsThumb, IsThumb1Only]>;
// Pseudo-instruction for merged POP and return.
// FIXME: remove when we have a way to mark an MI with these properties.
// Expands to a plain tPOP; the pseudo exists to carry the return, terminator
// and barrier flags set in the `let` below.
let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
hasExtraDefRegAllocReq = 1 in
def tPOP_RET : tPseudoExpand<(outs), (ins pred:$p, reglist:$regs, variable_ops),
2, IIC_iPop_Br, [],
(tPOP pred:$p, reglist:$regs)>, Sched<[WriteBrL]>;
// Indirect branch using "mov pc, $Rm"
// Matches the generic `brind` node and expands to tMOVr with PC as the
// destination register.
let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
def tBRIND : tPseudoExpand<(outs), (ins GPR:$Rm, pred:$p),
2, IIC_Br, [(brind GPR:$Rm)],
(tMOVr PC, GPR:$Rm, pred:$p)>, Sched<[WriteBr]>;
}
// In Thumb1, "nop" is encoded as a "mov r8, r8". Technically, the bf00
// encoding is available on ARMv6K, but we don't differentiate that finely.
def : InstAlias<"nop", (tMOVr R8, R8, 14, 0), 0>, Requires<[IsThumb, IsThumb1Only]>;
// "neg" is an alias for "rsb rd, rn, #0"
def : tInstAlias<"neg${s}${p} $Rd, $Rm",
(tRSB tGPR:$Rd, s_cc_out:$s, tGPR:$Rm, pred:$p)>;
// Implied destination operand forms for shifts.
// The two-operand assembly form reuses $Rdm as both destination and source.
def : tInstAlias<"lsl${s}${p} $Rdm, $imm",
(tLSLri tGPR:$Rdm, cc_out:$s, tGPR:$Rdm, imm0_31:$imm, pred:$p)>;
def : tInstAlias<"lsr${s}${p} $Rdm, $imm",
(tLSRri tGPR:$Rdm, cc_out:$s, tGPR:$Rdm, imm_sr:$imm, pred:$p)>;
def : tInstAlias<"asr${s}${p} $Rdm, $imm",
(tASRri tGPR:$Rdm, cc_out:$s, tGPR:$Rdm, imm_sr:$imm, pred:$p)>;
// Pseudo instruction ldr Rt, =immediate
// Assembler-only pseudo: it has an assembly string and operands but no
// selection pattern.
def tLDRConstPool
: tAsmPseudo<"ldr${p} $Rt, $immediate",
(ins tGPR:$Rt, const_pool_asm_imm:$immediate, pred:$p)>;
Index: vendor/llvm/dist-release_90/lib/Target/AVR/AVRISelLowering.cpp
===================================================================
--- vendor/llvm/dist-release_90/lib/Target/AVR/AVRISelLowering.cpp (revision 351302)
+++ vendor/llvm/dist-release_90/lib/Target/AVR/AVRISelLowering.cpp (revision 351303)
@@ -1,2049 +1,2049 @@
//===-- AVRISelLowering.cpp - AVR DAG Lowering Implementation -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that AVR uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#include "AVRISelLowering.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/ErrorHandling.h"
#include "AVR.h"
#include "AVRMachineFunctionInfo.h"
#include "AVRSubtarget.h"
#include "AVRTargetMachine.h"
#include "MCTargetDesc/AVRMCTargetDesc.h"
namespace llvm {
/// Configures instruction selection lowering for AVR.
///
/// The constructor associates i8 and i16 with their register classes, then
/// records for each ISD operation / value type pair whether it is legal,
/// promoted, expanded or custom-lowered (custom entries are dispatched from
/// LowerOperation() and ReplaceNodeResults()). Finally it sets the runtime
/// library symbol names and calling conventions used for division/remainder.
///
/// \param TM  target machine, forwarded to the TargetLowering base class.
/// \param STI subtarget; queried for register info and hardware multiply.
AVRTargetLowering::AVRTargetLowering(const AVRTargetMachine &TM,
const AVRSubtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
// Set up the register classes.
addRegisterClass(MVT::i8, &AVR::GPR8RegClass);
addRegisterClass(MVT::i16, &AVR::DREGSRegClass);
// Compute derived properties from the register classes.
computeRegisterProperties(Subtarget.getRegisterInfo());
// Scalar and vector booleans are both represented as 0 or 1.
setBooleanContents(ZeroOrOneBooleanContent);
setBooleanVectorContents(ZeroOrOneBooleanContent);
setSchedulingPreference(Sched::RegPressure);
setStackPointerRegisterToSaveRestore(AVR::SP);
setSupportsUnalignedAtomics(true);
// Global and block addresses are custom-lowered (wrapped in AVRISD::WRAPPER
// by LowerGlobalAddress / LowerBlockAddress).
setOperationAction(ISD::GlobalAddress, MVT::i16, Custom);
setOperationAction(ISD::BlockAddress, MVT::i16, Custom);
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i8, Expand);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i16, Expand);
// For every integer result type and every extension kind: extending loads
// from i1 are promoted, extending loads from i8 are expanded.
for (MVT VT : MVT::integer_valuetypes()) {
for (auto N : {ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}) {
setLoadExtAction(N, VT, MVT::i1, Promote);
setLoadExtAction(N, VT, MVT::i8, Expand);
}
}
setTruncStoreAction(MVT::i16, MVT::i8, Expand);
// Carry-producing and carry-consuming add/sub are legal for all integer
// types.
for (MVT VT : MVT::integer_valuetypes()) {
setOperationAction(ISD::ADDC, VT, Legal);
setOperationAction(ISD::SUBC, VT, Legal);
setOperationAction(ISD::ADDE, VT, Legal);
setOperationAction(ISD::SUBE, VT, Legal);
}
// sub (x, imm) gets canonicalized to add (x, -imm), so for illegal types
// revert into a sub since we don't have an add with immediate instruction.
// (The conversion itself is done in ReplaceNodeResults().)
setOperationAction(ISD::ADD, MVT::i32, Custom);
setOperationAction(ISD::ADD, MVT::i64, Custom);
// our shift instructions are only able to shift 1 bit at a time, so handle
// this in a custom way.
setOperationAction(ISD::SRA, MVT::i8, Custom);
setOperationAction(ISD::SHL, MVT::i8, Custom);
setOperationAction(ISD::SRL, MVT::i8, Custom);
setOperationAction(ISD::SRA, MVT::i16, Custom);
setOperationAction(ISD::SHL, MVT::i16, Custom);
setOperationAction(ISD::SRL, MVT::i16, Custom);
setOperationAction(ISD::SHL_PARTS, MVT::i16, Expand);
setOperationAction(ISD::SRA_PARTS, MVT::i16, Expand);
setOperationAction(ISD::SRL_PARTS, MVT::i16, Expand);
// 8-bit rotates are custom-lowered, 16-bit rotates are expanded.
setOperationAction(ISD::ROTL, MVT::i8, Custom);
setOperationAction(ISD::ROTL, MVT::i16, Expand);
setOperationAction(ISD::ROTR, MVT::i8, Custom);
setOperationAction(ISD::ROTR, MVT::i16, Expand);
// Conditional branches and comparisons are custom-lowered for i8..i64;
// SELECT_CC is custom only for i8/i16 and expanded for wider types.
setOperationAction(ISD::BR_CC, MVT::i8, Custom);
setOperationAction(ISD::BR_CC, MVT::i16, Custom);
setOperationAction(ISD::BR_CC, MVT::i32, Custom);
setOperationAction(ISD::BR_CC, MVT::i64, Custom);
setOperationAction(ISD::BRCOND, MVT::Other, Expand);
setOperationAction(ISD::SELECT_CC, MVT::i8, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i16, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
setOperationAction(ISD::SETCC, MVT::i8, Custom);
setOperationAction(ISD::SETCC, MVT::i16, Custom);
setOperationAction(ISD::SETCC, MVT::i32, Custom);
setOperationAction(ISD::SETCC, MVT::i64, Custom);
setOperationAction(ISD::SELECT, MVT::i8, Expand);
setOperationAction(ISD::SELECT, MVT::i16, Expand);
setOperationAction(ISD::BSWAP, MVT::i16, Expand);
// Add support for postincrement and predecrement load/stores.
setIndexedLoadAction(ISD::POST_INC, MVT::i8, Legal);
setIndexedLoadAction(ISD::POST_INC, MVT::i16, Legal);
setIndexedLoadAction(ISD::PRE_DEC, MVT::i8, Legal);
setIndexedLoadAction(ISD::PRE_DEC, MVT::i16, Legal);
setIndexedStoreAction(ISD::POST_INC, MVT::i8, Legal);
setIndexedStoreAction(ISD::POST_INC, MVT::i16, Legal);
setIndexedStoreAction(ISD::PRE_DEC, MVT::i8, Legal);
setIndexedStoreAction(ISD::PRE_DEC, MVT::i16, Legal);
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
// Of the variadic-argument nodes only VASTART is custom-lowered.
setOperationAction(ISD::VASTART, MVT::Other, Custom);
setOperationAction(ISD::VAEND, MVT::Other, Expand);
setOperationAction(ISD::VAARG, MVT::Other, Expand);
setOperationAction(ISD::VACOPY, MVT::Other, Expand);
// Atomic operations which must be lowered to rtlib calls
for (MVT VT : MVT::integer_valuetypes()) {
setOperationAction(ISD::ATOMIC_SWAP, VT, Expand);
setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Expand);
setOperationAction(ISD::ATOMIC_LOAD_NAND, VT, Expand);
setOperationAction(ISD::ATOMIC_LOAD_MAX, VT, Expand);
setOperationAction(ISD::ATOMIC_LOAD_MIN, VT, Expand);
setOperationAction(ISD::ATOMIC_LOAD_UMAX, VT, Expand);
setOperationAction(ISD::ATOMIC_LOAD_UMIN, VT, Expand);
}
// Division/remainder
setOperationAction(ISD::UDIV, MVT::i8, Expand);
setOperationAction(ISD::UDIV, MVT::i16, Expand);
setOperationAction(ISD::UREM, MVT::i8, Expand);
setOperationAction(ISD::UREM, MVT::i16, Expand);
setOperationAction(ISD::SDIV, MVT::i8, Expand);
setOperationAction(ISD::SDIV, MVT::i16, Expand);
setOperationAction(ISD::SREM, MVT::i8, Expand);
setOperationAction(ISD::SREM, MVT::i16, Expand);
// Make division and modulus custom
// (the combined DIVREM nodes are turned into libcalls by LowerDivRem()).
for (MVT VT : MVT::integer_valuetypes()) {
setOperationAction(ISD::UDIVREM, VT, Custom);
setOperationAction(ISD::SDIVREM, VT, Custom);
}
// Do not use MUL. The AVR instructions are closer to SMUL_LOHI &co.
setOperationAction(ISD::MUL, MVT::i8, Expand);
setOperationAction(ISD::MUL, MVT::i16, Expand);
// Expand 16 bit multiplications.
setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand);
setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand);
// Expand multiplications to libcalls when there is
// no hardware MUL.
if (!Subtarget.supportsMultiplication()) {
setOperationAction(ISD::SMUL_LOHI, MVT::i8, Expand);
setOperationAction(ISD::UMUL_LOHI, MVT::i8, Expand);
}
for (MVT VT : MVT::integer_valuetypes()) {
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
}
// Bit-counting operations are expanded for every integer type.
for (MVT VT : MVT::integer_valuetypes()) {
setOperationAction(ISD::CTPOP, VT, Expand);
setOperationAction(ISD::CTLZ, VT, Expand);
setOperationAction(ISD::CTTZ, VT, Expand);
}
for (MVT VT : MVT::integer_valuetypes()) {
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
// TODO: The generated code is pretty poor. Investigate using the
// same "shift and subtract with carry" trick that we do for
// extending 8-bit to 16-bit. This may require infrastructure
// improvements in how we treat 16-bit "registers" to be feasible.
}
// Division rtlib functions (not supported)
// Clearing the name leaves only the combined divmod libcalls below.
setLibcallName(RTLIB::SDIV_I8, nullptr);
setLibcallName(RTLIB::SDIV_I16, nullptr);
setLibcallName(RTLIB::SDIV_I32, nullptr);
setLibcallName(RTLIB::SDIV_I64, nullptr);
setLibcallName(RTLIB::SDIV_I128, nullptr);
setLibcallName(RTLIB::UDIV_I8, nullptr);
setLibcallName(RTLIB::UDIV_I16, nullptr);
setLibcallName(RTLIB::UDIV_I32, nullptr);
setLibcallName(RTLIB::UDIV_I64, nullptr);
setLibcallName(RTLIB::UDIV_I128, nullptr);
// Modulus rtlib functions (not supported)
setLibcallName(RTLIB::SREM_I8, nullptr);
setLibcallName(RTLIB::SREM_I16, nullptr);
setLibcallName(RTLIB::SREM_I32, nullptr);
setLibcallName(RTLIB::SREM_I64, nullptr);
setLibcallName(RTLIB::SREM_I128, nullptr);
setLibcallName(RTLIB::UREM_I8, nullptr);
setLibcallName(RTLIB::UREM_I16, nullptr);
setLibcallName(RTLIB::UREM_I32, nullptr);
setLibcallName(RTLIB::UREM_I64, nullptr);
setLibcallName(RTLIB::UREM_I128, nullptr);
// Division and modulus rtlib functions
setLibcallName(RTLIB::SDIVREM_I8, "__divmodqi4");
setLibcallName(RTLIB::SDIVREM_I16, "__divmodhi4");
setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
setLibcallName(RTLIB::SDIVREM_I64, "__divmoddi4");
setLibcallName(RTLIB::SDIVREM_I128, "__divmodti4");
setLibcallName(RTLIB::UDIVREM_I8, "__udivmodqi4");
setLibcallName(RTLIB::UDIVREM_I16, "__udivmodhi4");
setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
setLibcallName(RTLIB::UDIVREM_I64, "__udivmoddi4");
setLibcallName(RTLIB::UDIVREM_I128, "__udivmodti4");
// Several of the runtime library functions use a special calling conv
// (only the 8- and 16-bit divmod helpers are switched to AVR_BUILTIN here).
setLibcallCallingConv(RTLIB::SDIVREM_I8, CallingConv::AVR_BUILTIN);
setLibcallCallingConv(RTLIB::SDIVREM_I16, CallingConv::AVR_BUILTIN);
setLibcallCallingConv(RTLIB::UDIVREM_I8, CallingConv::AVR_BUILTIN);
setLibcallCallingConv(RTLIB::UDIVREM_I16, CallingConv::AVR_BUILTIN);
// Trigonometric rtlib functions
setLibcallName(RTLIB::SIN_F32, "sin");
setLibcallName(RTLIB::COS_F32, "cos");
setMinFunctionAlignment(1);
// NOTE(review): UINT_MAX as the minimum entry count presumably prevents jump
// tables from ever being formed (BR_JT is also expanded above) -- confirm.
setMinimumJumpTableEntries(UINT_MAX);
}
/// Returns the printable name of the AVR-specific SelectionDAG node
/// \p Opcode, or nullptr when the opcode is not one of the AVRISD nodes listed
/// here.
///
/// ROLLOOP and RORLOOP are included because LowerShifts() creates them for
/// rotates by a non-constant amount; without an entry those nodes would get
/// no name from this function.
const char *AVRTargetLowering::getTargetNodeName(unsigned Opcode) const {
// Expands to `case AVRISD::X: return "X"`.
#define NODE(name)                                                             \
  case AVRISD::name:                                                           \
    return #name

  switch (Opcode) {
  default:
    return nullptr;
    NODE(RET_FLAG);
    NODE(RETI_FLAG);
    NODE(CALL);
    NODE(WRAPPER);
    NODE(LSL);
    NODE(LSR);
    NODE(ROL);
    NODE(ROR);
    NODE(ASR);
    NODE(LSLLOOP);
    NODE(LSRLOOP);
    NODE(ROLLOOP);
    NODE(RORLOOP);
    NODE(ASRLOOP);
    NODE(BRCOND);
    NODE(CMP);
    NODE(CMPC);
    NODE(TST);
    NODE(SELECT_CC);
#undef NODE
  }
}
/// Reports the value type produced by a SETCC: always i8 for scalars.
/// Vector operand types are not supported and trip the assertion.
EVT AVRTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
                                          EVT VT) const {
  assert(!VT.isVector() && "No AVR SetCC type for vectors!");

  const MVT BoolTy = MVT::i8;
  return BoolTy;
}
/// Custom-lowers SHL/SRL/SRA/ROTL/ROTR.
///
/// A shift by a non-constant amount becomes the matching AVRISD *LOOP node
/// taking (value, amount). A shift by a constant N becomes a chain of N
/// single-step AVRISD nodes, each consuming the previous result.
SDValue AVRTargetLowering::LowerShifts(SDValue Op, SelectionDAG &DAG) const {
  //:TODO: this function has to be completely rewritten to produce optimal
  // code, for now it's producing very long but correct code.
  const SDNode *Node = Op.getNode();
  EVT ResTy = Op.getValueType();
  SDLoc Loc(Node);
  const unsigned ShiftKind = Op.getOpcode();

  // Variable shift amount: pick the loop pseudo for this kind of shift.
  if (!isa<ConstantSDNode>(Node->getOperand(1))) {
    unsigned LoopOpc;
    switch (ShiftKind) {
    case ISD::SHL:
      LoopOpc = AVRISD::LSLLOOP;
      break;
    case ISD::SRL:
      LoopOpc = AVRISD::LSRLOOP;
      break;
    case ISD::ROTL:
      LoopOpc = AVRISD::ROLLOOP;
      break;
    case ISD::ROTR:
      LoopOpc = AVRISD::RORLOOP;
      break;
    case ISD::SRA:
      LoopOpc = AVRISD::ASRLOOP;
      break;
    default:
      llvm_unreachable("Invalid shift opcode!");
    }
    return DAG.getNode(LoopOpc, Loc, ResTy, Node->getOperand(0),
                       Node->getOperand(1));
  }

  // Constant shift amount: repeat the one-bit operation that many times.
  const uint64_t Steps =
      cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
  SDValue Shifted = Node->getOperand(0);

  unsigned StepOpc;
  switch (ShiftKind) {
  case ISD::SHL:
    StepOpc = AVRISD::LSL;
    break;
  case ISD::SRL:
    StepOpc = AVRISD::LSR;
    break;
  case ISD::SRA:
    StepOpc = AVRISD::ASR;
    break;
  case ISD::ROTL:
    StepOpc = AVRISD::ROL;
    break;
  case ISD::ROTR:
    StepOpc = AVRISD::ROR;
    break;
  default:
    llvm_unreachable("Invalid shift opcode");
  }

  for (uint64_t Remaining = Steps; Remaining != 0; --Remaining)
    Shifted = DAG.getNode(StepOpc, Loc, ResTy, Shifted);

  return Shifted;
}
/// Lowers SDIVREM / UDIVREM to a call of the matching runtime library
/// routine.
///
/// The libcall is chosen from the result type (i8 to i128) and signedness.
/// Every operand of the node is passed as an argument, sign- or zero-extended
/// according to signedness, and the call is declared to return a two-element
/// struct of the operand type. The first element of the LowerCallTo() result
/// pair is returned.
SDValue AVRTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
  const unsigned DivRemOpc = Op->getOpcode();
  assert((DivRemOpc == ISD::SDIVREM || DivRemOpc == ISD::UDIVREM) &&
         "Invalid opcode for Div/Rem lowering");
  const bool Signed = DivRemOpc == ISD::SDIVREM;

  EVT ResVT = Op->getValueType(0);
  Type *ElemTy = ResVT.getTypeForEVT(*DAG.getContext());

  RTLIB::Libcall Call;
  switch (ResVT.getSimpleVT().SimpleTy) {
  case MVT::i8:
    Call = Signed ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8;
    break;
  case MVT::i16:
    Call = Signed ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16;
    break;
  case MVT::i32:
    Call = Signed ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32;
    break;
  case MVT::i64:
    Call = Signed ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64;
    break;
  case MVT::i128:
    Call = Signed ? RTLIB::SDIVREM_I128 : RTLIB::UDIVREM_I128;
    break;
  default:
    llvm_unreachable("Unexpected request for libcall!");
  }

  SDValue EntryChain = DAG.getEntryNode();

  // One argument per operand of the DIVREM node.
  TargetLowering::ArgListTy CallArgs;
  for (const SDValue &Operand : Op->op_values()) {
    TargetLowering::ArgListEntry Arg;
    Arg.Node = Operand;
    Arg.Ty = Operand.getValueType().getTypeForEVT(*DAG.getContext());
    Arg.IsSExt = Signed;
    Arg.IsZExt = !Signed;
    CallArgs.push_back(Arg);
  }

  SDValue Target = DAG.getExternalSymbol(getLibcallName(Call),
                                         getPointerTy(DAG.getDataLayout()));

  // The routine returns a pair of values of the operand type.
  Type *PairTy = StructType::get(ElemTy, ElemTy);

  SDLoc Loc(Op);
  TargetLowering::CallLoweringInfo CallInfo(DAG);
  CallInfo.setDebugLoc(Loc)
      .setChain(EntryChain)
      .setLibCallee(getLibcallCallingConv(Call), PairTy, Target,
                    std::move(CallArgs))
      .setInRegister()
      .setSExtResult(Signed)
      .setZExtResult(!Signed);

  std::pair<SDValue, SDValue> Lowered = LowerCallTo(CallInfo);
  return Lowered.first;
}
/// Lowers a GlobalAddress node to a TargetGlobalAddress (with the constant
/// offset folded in) wrapped in an AVRISD::WRAPPER node.
SDValue AVRTargetLowering::LowerGlobalAddress(SDValue Op,
                                              SelectionDAG &DAG) const {
  // Bind by reference: a plain `auto` here would copy the whole DataLayout
  // object on every call.
  const DataLayout &DL = DAG.getDataLayout();

  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();

  // Create the TargetGlobalAddress node, folding in the constant offset.
  SDValue Result =
      DAG.getTargetGlobalAddress(GV, SDLoc(Op), getPointerTy(DL), Offset);
  return DAG.getNode(AVRISD::WRAPPER, SDLoc(Op), getPointerTy(DL), Result);
}
/// Lowers a BlockAddress node to a TargetBlockAddress wrapped in an
/// AVRISD::WRAPPER node.
SDValue AVRTargetLowering::LowerBlockAddress(SDValue Op,
                                             SelectionDAG &DAG) const {
  // Bind by reference: a plain `auto` here would copy the whole DataLayout
  // object on every call.
  const DataLayout &DL = DAG.getDataLayout();

  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
  SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(DL));
  return DAG.getNode(AVRISD::WRAPPER, SDLoc(Op), getPointerTy(DL), Result);
}
/// Maps a DAG integer condition code onto the AVR condition code.
///
/// Only the six codes listed below are handled; any other condition code is
/// a programming error and hits llvm_unreachable.
static AVRCC::CondCodes intCCToAVRCC(ISD::CondCode CC) {
  switch (CC) {
  // Equality.
  case ISD::SETEQ:
    return AVRCC::COND_EQ;
  case ISD::SETNE:
    return AVRCC::COND_NE;
  // Signed ordering.
  case ISD::SETGE:
    return AVRCC::COND_GE;
  case ISD::SETLT:
    return AVRCC::COND_LT;
  // Unsigned ordering.
  case ISD::SETUGE:
    return AVRCC::COND_SH;
  case ISD::SETULT:
    return AVRCC::COND_LO;
  default:
    llvm_unreachable("Unknown condition code!");
  }
}
/// Returns appropriate AVR CMP/CMPC nodes and corresponding condition code for
/// the given operands.
///
/// The condition code is first rewritten into one that intCCToAVRCC() can
/// translate (EQ, NE, GE, LT, UGE, ULT), swapping operands or adjusting a
/// constant where needed. Sign tests against 0 / -1 are turned into a TST of
/// the most significant byte instead of a compare. i32 and i64 operands are
/// split into i16 pieces compared with one CMP followed by CMPC nodes.
///
/// \param LHS, RHS comparison operands (taken by value; modified locally).
/// \param CC       the DAG condition code to implement.
/// \param AVRcc    [out] i8 constant holding the AVR condition code to test.
/// \param DAG      the DAG in which the new nodes are created.
/// \param DL       debug location for the new nodes.
/// \returns the glue-typed CMP/CMPC/TST node.
SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
SDValue &AVRcc, SelectionDAG &DAG,
SDLoc DL) const {
SDValue Cmp;
EVT VT = LHS.getValueType();
// Set when the comparison is implemented as a sign test (TST) instead of a
// compare; AVRcc is then assigned in the switch below.
bool UseTest = false;
switch (CC) {
default:
break;
case ISD::SETLE: {
// Swap operands and reverse the branching condition.
std::swap(LHS, RHS);
CC = ISD::SETGE;
break;
}
case ISD::SETGT: {
if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
switch (C->getSExtValue()) {
case -1: {
// When doing lhs > -1 use a tst instruction on the top part of lhs
// and use brpl instead of using a chain of cp/cpc.
UseTest = true;
AVRcc = DAG.getConstant(AVRCC::COND_PL, DL, MVT::i8);
break;
}
case 0: {
// Turn lhs > 0 into 0 < lhs since 0 can be materialized with
// __zero_reg__ in lhs.
RHS = LHS;
LHS = DAG.getConstant(0, DL, VT);
CC = ISD::SETLT;
break;
}
default: {
// Turn lhs > rhs with rhs constant into lhs >= rhs+1, this allows
// us to fold the constant into the cmp instruction.
// NOTE(review): if the constant is the largest signed value of VT, the
// +1 wraps and the rewritten comparison is no longer equivalent --
// confirm such constants cannot reach this point.
RHS = DAG.getConstant(C->getSExtValue() + 1, DL, VT);
CC = ISD::SETGE;
break;
}
}
break;
}
// Swap operands and reverse the branching condition.
std::swap(LHS, RHS);
CC = ISD::SETLT;
break;
}
case ISD::SETLT: {
if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
switch (C->getSExtValue()) {
case 1: {
// Turn lhs < 1 into 0 >= lhs since 0 can be materialized with
// __zero_reg__ in lhs.
RHS = LHS;
LHS = DAG.getConstant(0, DL, VT);
CC = ISD::SETGE;
break;
}
case 0: {
// When doing lhs < 0 use a tst instruction on the top part of lhs
// and use brmi instead of using a chain of cp/cpc.
UseTest = true;
AVRcc = DAG.getConstant(AVRCC::COND_MI, DL, MVT::i8);
break;
}
}
}
// Any other SETLT is used as is.
break;
}
case ISD::SETULE: {
// Swap operands and reverse the branching condition.
std::swap(LHS, RHS);
CC = ISD::SETUGE;
break;
}
case ISD::SETUGT: {
// Turn lhs > rhs with rhs constant into lhs >= rhs+1, this allows us to
// fold the constant into the cmp instruction.
// NOTE(review): for an all-ones constant the +1 wraps to 0, which would
// turn the comparison into "lhs >= 0 (unsigned)" -- confirm such constants
// cannot reach this point.
if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
RHS = DAG.getConstant(C->getSExtValue() + 1, DL, VT);
CC = ISD::SETUGE;
break;
}
// Swap operands and reverse the branching condition.
std::swap(LHS, RHS);
CC = ISD::SETULT;
break;
}
}
// Expand 32 and 64 bit comparisons with custom CMP and CMPC nodes instead of
// using the default and/or/xor expansion code which is much longer.
if (VT == MVT::i32) {
// Split both operands into i16 halves (element 0 and element 1).
SDValue LHSlo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, LHS,
DAG.getIntPtrConstant(0, DL));
SDValue LHShi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, LHS,
DAG.getIntPtrConstant(1, DL));
SDValue RHSlo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, RHS,
DAG.getIntPtrConstant(0, DL));
SDValue RHShi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, RHS,
DAG.getIntPtrConstant(1, DL));
if (UseTest) {
// When using tst we only care about the highest part.
SDValue Top = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i8, LHShi,
DAG.getIntPtrConstant(1, DL));
Cmp = DAG.getNode(AVRISD::TST, DL, MVT::Glue, Top);
} else {
// CMP on element 0, then CMPC on element 1 glued to the first compare.
Cmp = DAG.getNode(AVRISD::CMP, DL, MVT::Glue, LHSlo, RHSlo);
Cmp = DAG.getNode(AVRISD::CMPC, DL, MVT::Glue, LHShi, RHShi, Cmp);
}
} else if (VT == MVT::i64) {
// Split each operand into two i32 halves, then each half into two i16
// pieces, giving pieces 0..3 in extraction order.
SDValue LHS_0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS,
DAG.getIntPtrConstant(0, DL));
SDValue LHS_1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS,
DAG.getIntPtrConstant(1, DL));
SDValue LHS0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, LHS_0,
DAG.getIntPtrConstant(0, DL));
SDValue LHS1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, LHS_0,
DAG.getIntPtrConstant(1, DL));
SDValue LHS2 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, LHS_1,
DAG.getIntPtrConstant(0, DL));
SDValue LHS3 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, LHS_1,
DAG.getIntPtrConstant(1, DL));
SDValue RHS_0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, RHS,
DAG.getIntPtrConstant(0, DL));
SDValue RHS_1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, RHS,
DAG.getIntPtrConstant(1, DL));
SDValue RHS0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, RHS_0,
DAG.getIntPtrConstant(0, DL));
SDValue RHS1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, RHS_0,
DAG.getIntPtrConstant(1, DL));
SDValue RHS2 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, RHS_1,
DAG.getIntPtrConstant(0, DL));
SDValue RHS3 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, RHS_1,
DAG.getIntPtrConstant(1, DL));
if (UseTest) {
// When using tst we only care about the highest part.
SDValue Top = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i8, LHS3,
DAG.getIntPtrConstant(1, DL));
Cmp = DAG.getNode(AVRISD::TST, DL, MVT::Glue, Top);
} else {
// CMP on piece 0, then a CMPC per remaining piece, each glued to the
// previous node.
Cmp = DAG.getNode(AVRISD::CMP, DL, MVT::Glue, LHS0, RHS0);
Cmp = DAG.getNode(AVRISD::CMPC, DL, MVT::Glue, LHS1, RHS1, Cmp);
Cmp = DAG.getNode(AVRISD::CMPC, DL, MVT::Glue, LHS2, RHS2, Cmp);
Cmp = DAG.getNode(AVRISD::CMPC, DL, MVT::Glue, LHS3, RHS3, Cmp);
}
} else if (VT == MVT::i8 || VT == MVT::i16) {
if (UseTest) {
// When using tst we only care about the highest part.
// For i8 that is the operand itself; for i16 it is element 1.
Cmp = DAG.getNode(AVRISD::TST, DL, MVT::Glue,
(VT == MVT::i8)
? LHS
: DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i8,
LHS, DAG.getIntPtrConstant(1, DL)));
} else {
Cmp = DAG.getNode(AVRISD::CMP, DL, MVT::Glue, LHS, RHS);
}
} else {
llvm_unreachable("Invalid comparison size");
}
// When using a test instruction AVRcc is already set.
if (!UseTest) {
AVRcc = DAG.getConstant(intCCToAVRCC(CC), DL, MVT::i8);
}
return Cmp;
}
/// Lowers BR_CC into an AVRISD::BRCOND fed by the compare produced by
/// getAVRCmp().
///
/// BR_CC operands, in order: chain, condition code, lhs, rhs, destination.
SDValue AVRTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc Loc(Op);
  const ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(1))->get();

  // getAVRCmp() fills in the AVR condition code and returns the glue value.
  SDValue AVRCond;
  SDValue Flags =
      getAVRCmp(Op.getOperand(2), Op.getOperand(3), Cond, AVRCond, DAG, Loc);

  return DAG.getNode(AVRISD::BRCOND, Loc, MVT::Other, Op.getOperand(0),
                     Op.getOperand(4), AVRCond, Flags);
}
/// Lowers SELECT_CC into an AVRISD::SELECT_CC fed by the compare produced by
/// getAVRCmp().
///
/// SELECT_CC operands, in order: lhs, rhs, true value, false value, condition
/// code. The new node yields the selected value plus a glue result.
SDValue AVRTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc Loc(Op);
  const ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(4))->get();

  SDValue AVRCond;
  SDValue Flags =
      getAVRCmp(Op.getOperand(0), Op.getOperand(1), Cond, AVRCond, DAG, Loc);

  SDVTList ResultTys = DAG.getVTList(Op.getValueType(), MVT::Glue);
  SDValue SelectOps[] = {Op.getOperand(2), Op.getOperand(3), AVRCond, Flags};
  return DAG.getNode(AVRISD::SELECT_CC, Loc, ResultTys, SelectOps);
}
/// Lowers SETCC into an AVRISD::SELECT_CC that picks the constant 1 when the
/// condition holds and 0 otherwise.
///
/// SETCC operands, in order: lhs, rhs, condition code.
SDValue AVRTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc Loc(Op);
  const ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(2))->get();

  SDValue AVRCond;
  SDValue Flags =
      getAVRCmp(Op.getOperand(0), Op.getOperand(1), Cond, AVRCond, DAG, Loc);

  // Materialise the two possible results in the node's own value type.
  SDValue One = DAG.getConstant(1, Loc, Op.getValueType());
  SDValue Zero = DAG.getConstant(0, Loc, Op.getValueType());

  SDVTList ResultTys = DAG.getVTList(Op.getValueType(), MVT::Glue);
  SDValue SelectOps[] = {One, Zero, AVRCond, Flags};
  return DAG.getNode(AVRISD::SELECT_CC, Loc, ResultTys, SelectOps);
}
/// Lowers VASTART to a store of the varargs frame-index address.
///
/// VASTART operands used here: 0 = chain, 1 = destination pointer,
/// 2 = source value describing the destination memory.
SDValue AVRTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
  const MachineFunction &MF = DAG.getMachineFunction();
  const AVRMachineFunctionInfo *AFI = MF.getInfo<AVRMachineFunctionInfo>();
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  // Bind by reference: a plain `auto` here would copy the whole DataLayout
  // object on every call.
  const DataLayout &DL = DAG.getDataLayout();
  SDLoc dl(Op);

  // Vastart just stores the address of the VarArgsFrameIndex slot into the
  // memory location argument.
  SDValue FI = DAG.getFrameIndex(AFI->getVarArgsFrameIndex(), getPointerTy(DL));

  return DAG.getStore(Op.getOperand(0), dl, FI, Op.getOperand(1),
                      MachinePointerInfo(SV), 0);
}
/// Entry point for custom lowering: forwards \p Op to the Lower* routine that
/// handles its opcode. Reaching this function with any other opcode is a
/// programming error.
SDValue AVRTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  const unsigned Opc = Op.getOpcode();

  switch (Opc) {
  // Addresses.
  case ISD::GlobalAddress:
    return LowerGlobalAddress(Op, DAG);
  case ISD::BlockAddress:
    return LowerBlockAddress(Op, DAG);

  // Shifts and rotates.
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::ROTL:
  case ISD::ROTR:
    return LowerShifts(Op, DAG);

  // Comparisons and conditional control flow.
  case ISD::BR_CC:
    return LowerBR_CC(Op, DAG);
  case ISD::SELECT_CC:
    return LowerSELECT_CC(Op, DAG);
  case ISD::SETCC:
    return LowerSETCC(Op, DAG);

  // Combined division/remainder.
  case ISD::SDIVREM:
  case ISD::UDIVREM:
    return LowerDivRem(Op, DAG);

  case ISD::VASTART:
    return LowerVASTART(Op, DAG);

  default:
    llvm_unreachable("Don't know how to custom lower this!");
  }

  return SDValue();
}
/// Produces replacement values for a node whose result type is illegal.
///
/// An ADD with a constant right operand is rewritten as a SUB of the negated
/// constant; an ADD with any other right operand produces no replacement.
/// Every other opcode is passed to LowerOperation() and all values of its
/// result are appended to \p Results.
void AVRTargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue> &Results,
                                           SelectionDAG &DAG) const {
  SDLoc Loc(N);

  if (N->getOpcode() == ISD::ADD) {
    // Convert add (x, imm) into sub (x, -imm).
    const ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!Imm)
      return;

    SDValue NegImm =
        DAG.getConstant(-Imm->getAPIntValue(), Loc, Imm->getValueType(0));
    Results.push_back(DAG.getNode(ISD::SUB, Loc, N->getValueType(0),
                                  N->getOperand(0), NegImm));
    return;
  }

  SDValue Lowered = LowerOperation(SDValue(N, 0), DAG);
  const unsigned NumVals = Lowered->getNumValues();
  for (unsigned Idx = 0; Idx != NumVals; ++Idx)
    Results.push_back(Lowered.getValue(Idx));
}
/// Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
///
/// Accepted forms:
///  - a bare global address (no base register, no scale, zero offset);
///  - base register plus a constant offset whose magnitude fits in 6 bits,
///    with no global and no scale.
/// Pointer-typed accesses in the program-memory address space are rejected
/// unless they are the bare-global form above.
bool AVRTargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                              const AddrMode &AM, Type *Ty,
                                              unsigned AS, Instruction *I) const {
  const int64_t Offs = AM.BaseOffs;

  // Allow absolute addresses.
  if (AM.BaseGV && !AM.HasBaseReg && AM.Scale == 0 && Offs == 0) {
    return true;
  }

  // Flash memory instructions only allow zero offsets.
  // NOTE(review): this rejects every remaining mode for such accesses,
  // including a plain base register with zero offset -- confirm intended.
  if (isa<PointerType>(Ty) && AS == AVR::ProgramMemory) {
    return false;
  }

  // Allow reg+<6bit> offset. The magnitude is computed in unsigned arithmetic
  // so that negating the most negative int64_t value is well defined instead
  // of being signed-overflow undefined behaviour.
  const uint64_t Magnitude = Offs < 0 ? -static_cast<uint64_t>(Offs)
                                      : static_cast<uint64_t>(Offs);

  return !AM.BaseGV && AM.HasBaseReg && AM.Scale == 0 && isUInt<6>(Magnitude);
}
/// Decides whether the address of load/store \p N can be expressed as a
/// pre-decrement access.
///
/// This succeeds only for a non-extending load or a store, outside program
/// memory, of an i8 or i16 value whose address is an ADD/SUB of a constant
/// that amounts to -1 (i8) or -2 (i16). On success \p Base, \p Offset and
/// \p AM are filled in and true is returned; otherwise false is returned.
bool AVRTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
                                                  SDValue &Offset,
                                                  ISD::MemIndexedMode &AM,
                                                  SelectionDAG &DAG) const {
  EVT MemVT;
  const SDNode *Addr;
  SDLoc Loc(N);

  if (const LoadSDNode *Load = dyn_cast<LoadSDNode>(N)) {
    MemVT = Load->getMemoryVT();
    Addr = Load->getBasePtr().getNode();
    if (Load->getExtensionType() != ISD::NON_EXTLOAD)
      return false;
    if (AVR::isProgramMemoryAccess(Load))
      return false;
  } else if (const StoreSDNode *Store = dyn_cast<StoreSDNode>(N)) {
    MemVT = Store->getMemoryVT();
    Addr = Store->getBasePtr().getNode();
    if (AVR::isProgramMemoryAccess(Store))
      return false;
  } else {
    return false;
  }

  // Only byte and word accesses qualify.
  const bool IsByte = MemVT == MVT::i8;
  const bool IsWord = MemVT == MVT::i16;
  if (!IsByte && !IsWord)
    return false;

  // The address must be computed as base +/- something.
  const unsigned AddrOpc = Addr->getOpcode();
  if (AddrOpc != ISD::ADD && AddrOpc != ISD::SUB)
    return false;

  const ConstantSDNode *Step = dyn_cast<ConstantSDNode>(Addr->getOperand(1));
  if (!Step)
    return false;

  // Normalise to a signed delta: a SUB of c is an ADD of -c.
  int Delta = Step->getSExtValue();
  if (AddrOpc == ISD::SUB)
    Delta = -Delta;

  // The decrement has to equal the access size.
  const int Wanted = IsWord ? -2 : -1;
  if (Delta != Wanted)
    return false;

  Base = Addr->getOperand(0);
  Offset = DAG.getConstant(Delta, Loc, MVT::i8);
  AM = ISD::PRE_DEC;
  return true;
}
/// Decides whether the address update \p Op can be folded into load/store
/// \p N as a post-increment access.
///
/// This succeeds only for a non-extending load, or a store outside program
/// memory, of an i8 or i16 value where \p Op is an ADD/SUB of a constant that
/// amounts to +1 (i8) or +2 (i16). On success \p Base, \p Offset and \p AM
/// are filled in and true is returned; otherwise false is returned.
bool AVRTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
                                                   SDValue &Base,
                                                   SDValue &Offset,
                                                   ISD::MemIndexedMode &AM,
                                                   SelectionDAG &DAG) const {
  EVT MemVT;
  SDLoc Loc(N);

  if (const LoadSDNode *Load = dyn_cast<LoadSDNode>(N)) {
    MemVT = Load->getMemoryVT();
    if (Load->getExtensionType() != ISD::NON_EXTLOAD)
      return false;
  } else if (const StoreSDNode *Store = dyn_cast<StoreSDNode>(N)) {
    MemVT = Store->getMemoryVT();
    if (AVR::isProgramMemoryAccess(Store))
      return false;
  } else {
    return false;
  }

  // Only byte and word accesses qualify.
  const bool IsByte = MemVT == MVT::i8;
  const bool IsWord = MemVT == MVT::i16;
  if (!IsByte && !IsWord)
    return false;

  // The update must be base +/- something.
  const unsigned UpdateOpc = Op->getOpcode();
  if (UpdateOpc != ISD::ADD && UpdateOpc != ISD::SUB)
    return false;

  const ConstantSDNode *Step = dyn_cast<ConstantSDNode>(Op->getOperand(1));
  if (!Step)
    return false;

  // Normalise to a signed delta: a SUB of c is an ADD of -c.
  int Delta = Step->getSExtValue();
  if (UpdateOpc == ISD::SUB)
    Delta = -Delta;

  // The increment has to equal the access size.
  const int Wanted = IsWord ? 2 : 1;
  if (Delta != Wanted)
    return false;

  Base = Op->getOperand(0);
  Offset = DAG.getConstant(Delta, Loc, MVT::i8);
  AM = ISD::POST_INC;
  return true;
}
/// Offsets may always be folded into a global address on AVR; the node
/// \p GA is not inspected.
bool AVRTargetLowering::isOffsetFoldingLegal(
    const GlobalAddressSDNode *GA) const {
  const bool AlwaysLegal = true;
  return AlwaysLegal;
}
//===----------------------------------------------------------------------===//
// Formal Arguments Calling Convention Implementation
//===----------------------------------------------------------------------===//
#include "AVRGenCallingConv.inc"
/// Appends to \p Out, for every original argument in \p Ins, the number of
/// 16-bit pieces needed to hold it (its size in bytes, rounded up to a
/// multiple of two, divided by two). Entries with a non-zero PartOffset are
/// skipped so that each argument is counted once.
static void parseFunctionArgs(const SmallVectorImpl<ISD::InputArg> &Ins,
                              SmallVectorImpl<unsigned> &Out) {
  for (const ISD::InputArg &Piece : Ins) {
    // Only the entry at part offset 0 contributes a count.
    if (!(Piece.PartOffset > 0)) {
      const unsigned Bytes = (Piece.ArgVT.getSizeInBits() + 7) / 8;
      const unsigned Words = (Bytes + 1) / 2;
      Out.push_back(Words);
    }
  }
}
/// With no prototype available for an external symbol, argument boundaries
/// are recovered from the part offsets alone: consecutive entries of \p In
/// whose PartOffset equals the running sum of the preceding store sizes are
/// treated as one argument, and the number of entries in each such run is
/// appended to \p Out.
static void parseExternFuncCallArgs(const SmallVectorImpl<ISD::OutputArg> &In,
                                    SmallVectorImpl<unsigned> &Out) {
  const unsigned NumParts = In.size();
  unsigned Idx = 0;

  while (Idx != NumParts) {
    unsigned Pieces = 0;
    // Walk the run that starts at part offset 0.
    for (unsigned Expected = 0;
         (Idx != NumParts) && (In[Idx].PartOffset == Expected);
         ++Idx, ++Pieces) {
      Expected += In[Idx].VT.getStoreSize();
    }
    Out.push_back(Pieces);
  }
}
/// Return the name of the function called by \p CLI.
///
/// The callee has to be either an external symbol node or a global address
/// node; any other kind of callee is treated as unreachable.
static StringRef getFunctionName(TargetLowering::CallLoweringInfo &CLI) {
  SDValue Target = CLI.Callee;

  if (const auto *Symbol = dyn_cast<ExternalSymbolSDNode>(Target))
    return Symbol->getSymbol();

  if (const auto *Global = dyn_cast<GlobalAddressSDNode>(Target))
    return Global->getGlobal()->getName();

  llvm_unreachable("don't know how to get the name for this callee");
}
/// Analyze incoming and outgoing function arguments. We need custom C++ code
/// to handle special constraints in the ABI like reversing the order of the
/// pieces of split arguments. In addition, all pieces of a certain argument
/// have to be passed either using registers or the stack but never mixing both.
///
/// \p Outs is read when \p IsCall is true, \p Ins otherwise. The resulting
/// locations are added through \p CCInfo. \p CLI and \p CallConv are not used
/// by this function, and \p F is only checked for non-null in the
/// formal-argument case.
static void analyzeStandardArguments(TargetLowering::CallLoweringInfo *CLI,
const Function *F, const DataLayout *TD,
const SmallVectorImpl<ISD::OutputArg> *Outs,
const SmallVectorImpl<ISD::InputArg> *Ins,
CallingConv::ID CallConv,
SmallVectorImpl<CCValAssign> &ArgLocs,
CCState &CCInfo, bool IsCall, bool IsVarArg) {
// Candidate registers for 8-bit pieces and for 16-bit pieces. Both tables
// have the same number of entries.
static const MCPhysReg RegList8[] = {AVR::R24, AVR::R22, AVR::R20,
AVR::R18, AVR::R16, AVR::R14,
AVR::R12, AVR::R10, AVR::R8};
static const MCPhysReg RegList16[] = {AVR::R25R24, AVR::R23R22, AVR::R21R20,
AVR::R19R18, AVR::R17R16, AVR::R15R14,
AVR::R13R12, AVR::R11R10, AVR::R9R8};
if (IsVarArg) {
// Variadic functions do not need all the analysis below.
if (IsCall) {
CCInfo.AnalyzeCallOperands(*Outs, ArgCC_AVR_Vararg);
} else {
CCInfo.AnalyzeFormalArguments(*Ins, ArgCC_AVR_Vararg);
}
return;
}
// Fill in the Args array which will contain original argument sizes.
SmallVector<unsigned, 8> Args;
if (IsCall) {
parseExternFuncCallArgs(*Outs, Args);
} else {
assert(F != nullptr && "function should not be null");
parseFunctionArgs(*Ins, Args);
}
unsigned RegsLeft = array_lengthof(RegList8), ValNo = 0;
// Becomes true once an argument has been assigned to the stack; from then on
// every following argument is passed on the stack as well.
bool UsesStack = false;
// `pos` is the index of the first piece of the current argument.
for (unsigned i = 0, pos = 0, e = Args.size(); i != e; ++i) {
unsigned Size = Args[i];
// If we have a zero-sized argument, don't attempt to lower it.
// AVR-GCC does not support zero-sized arguments and so we need not
// worry about ABI compatibility.
if (Size == 0) continue;
MVT LocVT = (IsCall) ? (*Outs)[pos].VT : (*Ins)[pos].VT;
// If we have plenty of regs to pass the whole argument do it.
if (!UsesStack && (Size <= RegsLeft)) {
const MCPhysReg *RegList = (LocVT == MVT::i16) ? RegList16 : RegList8;
for (unsigned j = 0; j != Size; ++j) {
// The length of RegList8 is used for either table.
unsigned Reg = CCInfo.AllocateReg(
ArrayRef<MCPhysReg>(RegList, array_lengthof(RegList8)));
CCInfo.addLoc(
CCValAssign::getReg(ValNo++, LocVT, Reg, LocVT, CCValAssign::Full));
--RegsLeft;
}
// Reverse the order of the pieces to agree with the "big endian" format
// required in the calling convention ABI.
std::reverse(ArgLocs.begin() + pos, ArgLocs.begin() + pos + Size);
} else {
// Pass the rest of arguments using the stack.
UsesStack = true;
for (unsigned j = 0; j != Size; ++j) {
unsigned Offset = CCInfo.AllocateStack(
TD->getTypeAllocSize(EVT(LocVT).getTypeForEVT(CCInfo.getContext())),
TD->getABITypeAlignment(
EVT(LocVT).getTypeForEVT(CCInfo.getContext())));
CCInfo.addLoc(CCValAssign::getMem(ValNo++, LocVT, Offset, LocVT,
CCValAssign::Full));
}
}
pos += Size;
}
}
/// Analyze the arguments of a call that uses the AVR builtin calling
/// convention.
///
/// Callees whose name starts with "__udivmod" or "__divmod" are analyzed with
/// ArgCC_AVR_BUILTIN_DIV; every other callee is forwarded to
/// analyzeStandardArguments with the same parameters.
static void analyzeBuiltinArguments(TargetLowering::CallLoweringInfo &CLI,
                                    const Function *F, const DataLayout *TD,
                                    const SmallVectorImpl<ISD::OutputArg> *Outs,
                                    const SmallVectorImpl<ISD::InputArg> *Ins,
                                    CallingConv::ID CallConv,
                                    SmallVectorImpl<CCValAssign> &ArgLocs,
                                    CCState &CCInfo, bool IsCall, bool IsVarArg) {
  const StringRef CalleeName = getFunctionName(CLI);
  const bool IsDivMod = CalleeName.startswith("__udivmod") ||
                        CalleeName.startswith("__divmod");

  if (!IsDivMod) {
    analyzeStandardArguments(&CLI, F, TD, Outs, Ins, CallConv, ArgLocs, CCInfo,
                             IsCall, IsVarArg);
    return;
  }

  CCInfo.AnalyzeCallOperands(*Outs, ArgCC_AVR_BUILTIN_DIV);
}
/// Dispatch argument analysis according to the calling convention.
///
/// CallingConv::AVR_BUILTIN goes through analyzeBuiltinArguments (which
/// dereferences \p CLI); every other convention goes through
/// analyzeStandardArguments.
static void analyzeArguments(TargetLowering::CallLoweringInfo *CLI,
                             const Function *F, const DataLayout *TD,
                             const SmallVectorImpl<ISD::OutputArg> *Outs,
                             const SmallVectorImpl<ISD::InputArg> *Ins,
                             CallingConv::ID CallConv,
                             SmallVectorImpl<CCValAssign> &ArgLocs,
                             CCState &CCInfo, bool IsCall, bool IsVarArg) {
  if (CallConv == CallingConv::AVR_BUILTIN) {
    analyzeBuiltinArguments(*CLI, F, TD, Outs, Ins, CallConv, ArgLocs, CCInfo,
                            IsCall, IsVarArg);
    return;
  }

  analyzeStandardArguments(CLI, F, TD, Outs, Ins, CallConv, ArgLocs, CCInfo,
                           IsCall, IsVarArg);
}
/// Lower the incoming (formal) arguments of the current function.
///
/// A location is assigned to every argument through analyzeArguments. For a
/// register location the register is added as a live-in and copied into a
/// value; for a memory location a fixed frame object is created and loaded
/// from. One value per location is appended to \p InVals. For variadic
/// functions the frame index of the first variadic slot is recorded in the
/// AVRMachineFunctionInfo. The incoming \p Chain is returned unchanged.
SDValue AVRTargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
auto DL = DAG.getDataLayout();
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
// No call lowering info and no outgoing arguments exist here, so null is
// passed for both.
analyzeArguments(nullptr, &MF.getFunction(), &DL, 0, &Ins, CallConv, ArgLocs, CCInfo,
false, isVarArg);
SDValue ArgValue;
for (CCValAssign &VA : ArgLocs) {
// Arguments stored on registers.
if (VA.isRegLoc()) {
EVT RegVT = VA.getLocVT();
// Pick the register class matching the location type (i8 or i16 only).
const TargetRegisterClass *RC;
if (RegVT == MVT::i8) {
RC = &AVR::GPR8RegClass;
} else if (RegVT == MVT::i16) {
RC = &AVR::DREGSRegClass;
} else {
llvm_unreachable("Unknown argument type!");
}
unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
// :NOTE: Clang should not promote any i8 into i16 but for safety the
// following code will handle zexts or sexts generated by other
// front ends. Otherwise:
// If this is an 8 bit value, it is really passed promoted
// to 16 bits. Insert an assert[sz]ext to capture this, then
// truncate to the right size.
switch (VA.getLocInfo()) {
default:
llvm_unreachable("Unknown loc info!");
case CCValAssign::Full:
break;
case CCValAssign::BCvt:
ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
break;
case CCValAssign::SExt:
ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
DAG.getValueType(VA.getValVT()));
ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
break;
case CCValAssign::ZExt:
ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
DAG.getValueType(VA.getValVT()));
ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
break;
}
InVals.push_back(ArgValue);
} else {
// Sanity check.
assert(VA.isMemLoc());
EVT LocVT = VA.getLocVT();
// Create the frame index object for this incoming parameter.
int FI = MFI.CreateFixedObject(LocVT.getSizeInBits() / 8,
VA.getLocMemOffset(), true);
// Create the SelectionDAG nodes corresponding to a load
// from this parameter.
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DL));
InVals.push_back(DAG.getLoad(LocVT, dl, Chain, FIN,
MachinePointerInfo::getFixedStack(MF, FI),
0));
}
}
// If the function takes variable number of arguments, make a frame index for
// the start of the first vararg value... for expansion of llvm.va_start.
if (isVarArg) {
unsigned StackSize = CCInfo.getNextStackOffset();
AVRMachineFunctionInfo *AFI = MF.getInfo<AVRMachineFunctionInfo>();
AFI->setVarArgsFrameIndex(MFI.CreateFixedObject(2, StackSize, true));
}
return Chain;
}
//===----------------------------------------------------------------------===//
// Call Calling Convention Implementation
//===----------------------------------------------------------------------===//
/// Lower an outgoing call described by \p CLI.
///
/// Locations are assigned to the outgoing arguments, register arguments are
/// copied into their registers, stack arguments are stored relative to SP in
/// reverse order, and the AVRISD::CALL node is emitted between
/// CALLSEQ_START / CALLSEQ_END. The call results are then lowered through
/// LowerCallResult, which appends them to \p InVals and returns the chain.
SDValue AVRTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                     SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG = CLI.DAG;
  SDLoc &DL = CLI.DL;
  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  bool &isTailCall = CLI.IsTailCall;
  CallingConv::ID CallConv = CLI.CallConv;
  bool isVarArg = CLI.IsVarArg;

  MachineFunction &MF = DAG.getMachineFunction();

  // AVR does not yet support tail call optimization.
  isTailCall = false;

  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());

  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
  // node so that legalize doesn't hack it.
  const Function *F = nullptr;
  if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    const GlobalValue *GV = G->getGlobal();
    // The global behind a direct call is not necessarily a Function (it can
    // be an alias, for instance). Use dyn_cast so that such callees leave F
    // null instead of tripping the assertion inside cast<>. The call path of
    // the argument analysis does not dereference F.
    F = dyn_cast<Function>(GV);
    Callee =
        DAG.getTargetGlobalAddress(GV, DL, getPointerTy(DAG.getDataLayout()));
  } else if (const ExternalSymbolSDNode *ES =
                 dyn_cast<ExternalSymbolSDNode>(Callee)) {
    Callee = DAG.getTargetExternalSymbol(ES->getSymbol(),
                                         getPointerTy(DAG.getDataLayout()));
  }

  // There are no incoming arguments when lowering a call.
  analyzeArguments(&CLI, F, &DAG.getDataLayout(), &Outs, nullptr, CallConv,
                   ArgLocs, CCInfo, true, isVarArg);

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();

  Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);

  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;

  // First, walk the register assignments, inserting copies.
  unsigned AI, AE;
  bool HasStackArgs = false;
  for (AI = 0, AE = ArgLocs.size(); AI != AE; ++AI) {
    CCValAssign &VA = ArgLocs[AI];
    EVT RegVT = VA.getLocVT();
    SDValue Arg = OutVals[AI];

    // Promote the value if needed. With Clang this should not happen.
    switch (VA.getLocInfo()) {
    default:
      llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full:
      break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, RegVT, Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, RegVT, Arg);
      break;
    case CCValAssign::AExt:
      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, RegVT, Arg);
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, DL, RegVT, Arg);
      break;
    }

    // Stop when we encounter a stack argument, we need to process them
    // in reverse order in the loop below.
    if (VA.isMemLoc()) {
      HasStackArgs = true;
      break;
    }

    // Arguments that can be passed on registers must be kept in the RegsToPass
    // vector.
    RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
  }

  // Second, stack arguments have to be walked in reverse order by inserting
  // chained stores, this ensures their order is not changed by the scheduler
  // and that the push instruction sequence generated is correct, otherwise they
  // can be freely intermixed.
  if (HasStackArgs) {
    // AI is the index of the first stack argument found above; iterate from
    // the last location down to it.
    for (AE = AI, AI = ArgLocs.size(); AI != AE; --AI) {
      unsigned Loc = AI - 1;
      CCValAssign &VA = ArgLocs[Loc];
      SDValue Arg = OutVals[Loc];

      assert(VA.isMemLoc());

      // SP points to one stack slot further so add one to adjust it.
      SDValue PtrOff = DAG.getNode(
          ISD::ADD, DL, getPointerTy(DAG.getDataLayout()),
          DAG.getRegister(AVR::SP, getPointerTy(DAG.getDataLayout())),
          DAG.getIntPtrConstant(VA.getLocMemOffset() + 1, DL));

      Chain =
          DAG.getStore(Chain, DL, Arg, PtrOff,
                       MachinePointerInfo::getStack(MF, VA.getLocMemOffset()),
                       0);
    }
  }

  // Build a sequence of copy-to-reg nodes chained together with token chain and
  // flag operands which copy the outgoing args into registers. The InFlag is
  // necessary since all emitted instructions must be stuck together.
  SDValue InFlag;
  for (auto Reg : RegsToPass) {
    Chain = DAG.getCopyToReg(Chain, DL, Reg.first, Reg.second, InFlag);
    InFlag = Chain.getValue(1);
  }

  // Returns a chain & a flag for retval copy to use.
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(Chain);
  Ops.push_back(Callee);

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (auto Reg : RegsToPass) {
    Ops.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
  }

  // Add a register mask operand representing the call-preserved registers.
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const uint32_t *Mask =
      TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");
  Ops.push_back(DAG.getRegisterMask(Mask));

  if (InFlag.getNode()) {
    Ops.push_back(InFlag);
  }

  Chain = DAG.getNode(AVRISD::CALL, DL, NodeTys, Ops);
  InFlag = Chain.getValue(1);

  // Create the CALLSEQ_END node.
  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
                             DAG.getIntPtrConstant(0, DL, true), InFlag, DL);

  if (!Ins.empty()) {
    InFlag = Chain.getValue(1);
  }

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, DL, DAG,
                         InVals);
}
/// Copy the values returned by a call out of their physical registers.
///
/// Return locations are computed with the assignment function for
/// \p CallConv. One copy-from-register node is emitted per location, glued to
/// the previous one through \p InFlag, and each copied value is appended to
/// \p InVals. The updated chain is returned.
SDValue AVRTargetLowering::LowerCallResult(
    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG,
    SmallVectorImpl<SDValue> &InVals) const {
  // Assign a location to every returned value, using the assignment function
  // that belongs to the calling convention.
  SmallVector<CCValAssign, 16> ResultLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ResultLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv));

  // Split return values are reversed to get the "big endian" format required
  // by the calling convention ABI; the builtin convention is exempt.
  const bool IsBuiltin = (CallConv == CallingConv::AVR_BUILTIN);
  if (!IsBuiltin && ResultLocs.size() > 1)
    std::reverse(ResultLocs.begin(), ResultLocs.end());

  // Copy every result out of its physical register.
  for (const CCValAssign &Loc : ResultLocs) {
    SDValue Copy = DAG.getCopyFromReg(Chain, dl, Loc.getLocReg(),
                                      Loc.getValVT(), InFlag);
    Chain = Copy.getValue(1);  // Result 1 is the chain.
    InFlag = Copy.getValue(2); // Result 2 is the glue.
    InVals.push_back(Copy.getValue(0));
  }

  return Chain;
}
//===----------------------------------------------------------------------===//
// Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//
/// Select the return-value assignment function for calling convention \p CC:
/// RetCC_AVR_BUILTIN for CallingConv::AVR_BUILTIN, RetCC_AVR otherwise.
CCAssignFn *AVRTargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
  if (CC == CallingConv::AVR_BUILTIN)
    return RetCC_AVR_BUILTIN;
  return RetCC_AVR;
}
/// Check whether the return values described by \p Outs can be lowered, by
/// running CCState::CheckReturn with the assignment function selected for
/// \p CallConv.
bool
AVRTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
                                  MachineFunction &MF, bool isVarArg,
                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
                                  LLVMContext &Context) const
{
  SmallVector<CCValAssign, 16> ReturnLocs;
  CCState State(CallConv, isVarArg, MF, ReturnLocs, Context);
  return State.CheckReturn(Outs, CCAssignFnForReturn(CallConv));
}
/// Lower a function return.
///
/// The values in \p OutVals are copied into the registers chosen by the
/// return calling convention, glued together, and followed by a return node
/// (RETI_FLAG for the AVR_INTR / AVR_SIGNAL conventions, RET_FLAG otherwise).
/// For functions carrying the naked attribute only the chain of copies is
/// returned and no return node is created.
SDValue
AVRTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               const SDLoc &dl, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();

  // Compute where each returned value lives.
  SmallVector<CCValAssign, 16> ReturnLocs;
  CCState CCInfo(CallConv, isVarArg, MF, ReturnLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv));

  // Split return values are reversed to get the "big endian" format required
  // by the calling convention ABI.
  const unsigned NumLocs = ReturnLocs.size();
  if (NumLocs > 1)
    std::reverse(ReturnLocs.begin(), ReturnLocs.end());

  // Operand 0 is reserved for the chain and is refreshed further down.
  SmallVector<SDValue, 4> RetOps(1, Chain);
  SDValue Glue;

  // Copy the result values into the output registers.
  for (unsigned Idx = 0; Idx != NumLocs; ++Idx) {
    CCValAssign &Loc = ReturnLocs[Idx];
    assert(Loc.isRegLoc() && "Can only return in registers!");

    Chain = DAG.getCopyToReg(Chain, dl, Loc.getLocReg(), OutVals[Idx], Glue);
    // Keep all emitted copies stuck together through the glue value.
    Glue = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(Loc.getLocReg(), Loc.getLocVT()));
  }

  // Naked functions get no ret/reti instruction.
  const bool IsNaked = MF.getFunction().getAttributes().hasAttribute(
      AttributeList::FunctionIndex, Attribute::Naked);
  if (IsNaked)
    return Chain;

  const bool IsInterruptCC = CallConv == CallingConv::AVR_INTR ||
                             CallConv == CallingConv::AVR_SIGNAL;
  const unsigned RetOpc = IsInterruptCC ? AVRISD::RETI_FLAG : AVRISD::RET_FLAG;

  RetOps[0] = Chain; // Update chain.
  if (Glue.getNode())
    RetOps.push_back(Glue);

  return DAG.getNode(RetOpc, dl, MVT::Other, RetOps);
}
//===----------------------------------------------------------------------===//
// Custom Inserters
//===----------------------------------------------------------------------===//
/// Expand a pseudo shift / rotate instruction with a register shift amount
/// into a loop that applies a single-step shift once per iteration.
///
/// Two new blocks are created after \p BB: LoopBB (the loop body) and RemBB
/// (the instructions that followed \p MI). \p MI is erased and RemBB is
/// returned.
MachineBasicBlock *AVRTargetLowering::insertShift(MachineInstr &MI,
MachineBasicBlock *BB) const {
unsigned Opc;
const TargetRegisterClass *RC;
// Set for opcodes that take the shifted register as both source operands.
bool HasRepeatedOperand = false;
MachineFunction *F = BB->getParent();
MachineRegisterInfo &RI = F->getRegInfo();
const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
DebugLoc dl = MI.getDebugLoc();
// Map the pseudo onto the single-step instruction and its register class.
switch (MI.getOpcode()) {
default:
llvm_unreachable("Invalid shift opcode!");
case AVR::Lsl8:
Opc = AVR::ADDRdRr; // LSL is an alias of ADD Rd, Rd
RC = &AVR::GPR8RegClass;
HasRepeatedOperand = true;
break;
case AVR::Lsl16:
Opc = AVR::LSLWRd;
RC = &AVR::DREGSRegClass;
break;
case AVR::Asr8:
Opc = AVR::ASRRd;
RC = &AVR::GPR8RegClass;
break;
case AVR::Asr16:
Opc = AVR::ASRWRd;
RC = &AVR::DREGSRegClass;
break;
case AVR::Lsr8:
Opc = AVR::LSRRd;
RC = &AVR::GPR8RegClass;
break;
case AVR::Lsr16:
Opc = AVR::LSRWRd;
RC = &AVR::DREGSRegClass;
break;
case AVR::Rol8:
Opc = AVR::ADCRdRr; // ROL is an alias of ADC Rd, Rd
RC = &AVR::GPR8RegClass;
HasRepeatedOperand = true;
break;
case AVR::Rol16:
Opc = AVR::ROLWRd;
RC = &AVR::DREGSRegClass;
break;
case AVR::Ror8:
Opc = AVR::RORRd;
RC = &AVR::GPR8RegClass;
break;
case AVR::Ror16:
Opc = AVR::RORWRd;
RC = &AVR::DREGSRegClass;
break;
}
const BasicBlock *LLVM_BB = BB->getBasicBlock();
// Position I just after BB so that the new blocks are inserted behind it.
MachineFunction::iterator I;
for (I = BB->getIterator(); I != F->end() && &(*I) != BB; ++I);
if (I != F->end()) ++I;
// Create loop block.
MachineBasicBlock *LoopBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *RemBB = F->CreateMachineBasicBlock(LLVM_BB);
F->insert(I, LoopBB);
F->insert(I, RemBB);
// Update machine-CFG edges by transferring all successors of the current
// block to the block containing instructions after shift.
RemBB->splice(RemBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)),
BB->end());
RemBB->transferSuccessorsAndUpdatePHIs(BB);
// Add edges BB => LoopBB => RemBB, BB => RemBB, LoopBB => LoopBB.
BB->addSuccessor(LoopBB);
BB->addSuccessor(RemBB);
LoopBB->addSuccessor(RemBB);
LoopBB->addSuccessor(LoopBB);
unsigned ShiftAmtReg = RI.createVirtualRegister(&AVR::LD8RegClass);
unsigned ShiftAmtReg2 = RI.createVirtualRegister(&AVR::LD8RegClass);
unsigned ShiftReg = RI.createVirtualRegister(RC);
unsigned ShiftReg2 = RI.createVirtualRegister(RC);
unsigned ShiftAmtSrcReg = MI.getOperand(2).getReg();
unsigned SrcReg = MI.getOperand(1).getReg();
unsigned DstReg = MI.getOperand(0).getReg();
// BB:
// cpi N, 0
// breq RemBB
BuildMI(BB, dl, TII.get(AVR::CPIRdK)).addReg(ShiftAmtSrcReg).addImm(0);
BuildMI(BB, dl, TII.get(AVR::BREQk)).addMBB(RemBB);
// LoopBB:
// ShiftReg = phi [%SrcReg, BB], [%ShiftReg2, LoopBB]
// ShiftAmt = phi [%N, BB], [%ShiftAmt2, LoopBB]
// ShiftReg2 = shift ShiftReg
// ShiftAmt2 = ShiftAmt - 1;
BuildMI(LoopBB, dl, TII.get(AVR::PHI), ShiftReg)
.addReg(SrcReg)
.addMBB(BB)
.addReg(ShiftReg2)
.addMBB(LoopBB);
BuildMI(LoopBB, dl, TII.get(AVR::PHI), ShiftAmtReg)
.addReg(ShiftAmtSrcReg)
.addMBB(BB)
.addReg(ShiftAmtReg2)
.addMBB(LoopBB);
auto ShiftMI = BuildMI(LoopBB, dl, TII.get(Opc), ShiftReg2).addReg(ShiftReg);
if (HasRepeatedOperand)
ShiftMI.addReg(ShiftReg);
BuildMI(LoopBB, dl, TII.get(AVR::SUBIRdK), ShiftAmtReg2)
.addReg(ShiftAmtReg)
.addImm(1);
BuildMI(LoopBB, dl, TII.get(AVR::BRNEk)).addMBB(LoopBB);
// RemBB:
// DestReg = phi [%SrcReg, BB], [%ShiftReg2, LoopBB]
BuildMI(*RemBB, RemBB->begin(), dl, TII.get(AVR::PHI), DstReg)
.addReg(SrcReg)
.addMBB(BB)
.addReg(ShiftReg2)
.addMBB(LoopBB);
MI.eraseFromParent(); // The pseudo instruction is gone now.
return RemBB;
}
/// Return true when the instruction at \p I is a COPY whose source register
/// (operand 1) is R0 or R1.
static bool isCopyMulResult(MachineBasicBlock::iterator const &I) {
  if (I->getOpcode() != AVR::COPY)
    return false;

  const unsigned Source = I->getOperand(1).getReg();
  return Source == AVR::R0 || Source == AVR::R1;
}
// The mul instructions wreak havoc on our zero_reg R1. We need to clear it
// after the result has been evacuated. This is probably not the best way to do
// it, but it works for now.
//
// An `eor r1, r1` is inserted after \p MI and after up to two directly
// following copies out of R0 / R1. \p BB is returned unchanged.
MachineBasicBlock *AVRTargetLowering::insertMul(MachineInstr &MI,
                                                MachineBasicBlock *BB) const {
  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
  MachineBasicBlock::iterator I(MI);
  ++I; // in any case insert *after* the mul instruction

  // Step over at most two result copies. The end-of-block test keeps the
  // iterator from being dereferenced when the mul (or one of its copies) is
  // the last instruction in the block.
  for (unsigned Skipped = 0;
       Skipped != 2 && I != BB->end() && isCopyMulResult(I); ++Skipped)
    ++I;

  BuildMI(*BB, I, MI.getDebugLoc(), TII.get(AVR::EORRdRr), AVR::R1)
      .addReg(AVR::R1)
      .addReg(AVR::R1);
  return BB;
}
/// Expand instructions that need a custom inserter.
///
/// Pseudo shifts / rotates are handed to insertShift and MULRdRr / MULSRdRr
/// to insertMul. The only other opcodes accepted are Select8 and Select16,
/// which are expanded into a diamond control-flow pattern ending in a PHI;
/// in that case the block holding the PHI is returned.
MachineBasicBlock *
AVRTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const {
int Opc = MI.getOpcode();
// Pseudo shift instructions with a non constant shift amount are expanded
// into a loop.
switch (Opc) {
case AVR::Lsl8:
case AVR::Lsl16:
case AVR::Lsr8:
case AVR::Lsr16:
case AVR::Rol8:
case AVR::Rol16:
case AVR::Ror8:
case AVR::Ror16:
case AVR::Asr8:
case AVR::Asr16:
return insertShift(MI, MBB);
case AVR::MULRdRr:
case AVR::MULSRdRr:
return insertMul(MI, MBB);
}
assert((Opc == AVR::Select16 || Opc == AVR::Select8) &&
"Unexpected instr type to insert");
const AVRInstrInfo &TII = (const AVRInstrInfo &)*MI.getParent()
->getParent()
->getSubtarget()
.getInstrInfo();
DebugLoc dl = MI.getDebugLoc();
// To "insert" a SELECT instruction, we insert the diamond
// control-flow pattern. The incoming instruction knows the
// destination vreg to set, the condition code register to branch
// on, the true/false values to select between, and a branch opcode
// to use.
MachineFunction *MF = MBB->getParent();
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
MachineBasicBlock *FallThrough = MBB->getFallThrough();
// If the current basic block falls through to another basic block,
// we must insert an unconditional branch to the fallthrough destination
// if we are to insert basic blocks at the prior fallthrough point.
if (FallThrough != nullptr) {
BuildMI(MBB, dl, TII.get(AVR::RJMPk)).addMBB(FallThrough);
}
MachineBasicBlock *trueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *falseMBB = MF->CreateMachineBasicBlock(LLVM_BB);
// Find the position right after MBB and insert both new blocks there.
MachineFunction::iterator I;
for (I = MF->begin(); I != MF->end() && &(*I) != MBB; ++I);
if (I != MF->end()) ++I;
MF->insert(I, trueMBB);
MF->insert(I, falseMBB);
// Transfer remaining instructions and all successors of the current
// block to the block which will contain the Phi node for the
// select.
trueMBB->splice(trueMBB->begin(), MBB,
std::next(MachineBasicBlock::iterator(MI)), MBB->end());
trueMBB->transferSuccessorsAndUpdatePHIs(MBB);
// Operand 3 of the select pseudo carries the condition code.
AVRCC::CondCodes CC = (AVRCC::CondCodes)MI.getOperand(3).getImm();
BuildMI(MBB, dl, TII.getBrCond(CC)).addMBB(trueMBB);
BuildMI(MBB, dl, TII.get(AVR::RJMPk)).addMBB(falseMBB);
MBB->addSuccessor(falseMBB);
MBB->addSuccessor(trueMBB);
// Unconditionally flow back to the true block
BuildMI(falseMBB, dl, TII.get(AVR::RJMPk)).addMBB(trueMBB);
falseMBB->addSuccessor(trueMBB);
// Set up the Phi node to determine where we came from
BuildMI(*trueMBB, trueMBB->begin(), dl, TII.get(AVR::PHI), MI.getOperand(0).getReg())
.addReg(MI.getOperand(1).getReg())
.addMBB(MBB)
.addReg(MI.getOperand(2).getReg())
.addMBB(falseMBB) ;
MI.eraseFromParent(); // The pseudo instruction is gone now.
return trueMBB;
}
//===----------------------------------------------------------------------===//
// Inline Asm Support
//===----------------------------------------------------------------------===//
/// Classify a single-letter inline-asm constraint. Anything that is not
/// classified here is passed to TargetLowering::getConstraintType.
AVRTargetLowering::ConstraintType
AVRTargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
// See http://www.nongnu.org/avr-libc/user-manual/inline_asm.html
switch (Constraint[0]) {
+ default:
+ break;
case 'a': // Simple upper registers
case 'b': // Base pointer registers pairs
case 'd': // Upper register
case 'l': // Lower registers
case 'e': // Pointer register pairs
case 'q': // Stack pointer register
case 'r': // Any register
case 'w': // Special upper register pairs
return C_RegisterClass;
case 't': // Temporary register
case 'x': case 'X': // Pointer register pair X
case 'y': case 'Y': // Pointer register pair Y
case 'z': case 'Z': // Pointer register pair Z
return C_Register;
case 'Q': // A memory address based on Y or Z pointer with displacement.
return C_Memory;
case 'G': // Floating point constant
case 'I': // 6-bit positive integer constant
case 'J': // 6-bit negative integer constant
case 'K': // Integer constant (Range: 2)
case 'L': // Integer constant (Range: 0)
case 'M': // 8-bit integer constant
case 'N': // Integer constant (Range: -1)
case 'O': // Integer constant (Range: 8, 16, 24)
case 'P': // Integer constant (Range: 1)
case 'R': // Integer constant (Range: -6 to 5)
- return C_Other;
- default:
- break;
+ return C_Immediate;
}
}
return TargetLowering::getConstraintType(Constraint);
}
/// Map an inline-asm memory constraint code to its InlineAsm constraint ID.
///
/// Only the first character of \p ConstraintCode is inspected: 'Q' maps to
/// InlineAsm::Constraint_Q and everything else is left to the generic
/// implementation. The original author noted that it is unclear whether this
/// is the right thing to do.
unsigned
AVRTargetLowering::getInlineAsmMemConstraint(StringRef ConstraintCode) const {
  if (ConstraintCode[0] == 'Q')
    return InlineAsm::Constraint_Q;

  return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
}
/// Compute how well the operand described by \p info matches the
/// single-letter constraint \p constraint points at.
///
/// Register constraints yield CW_Register or CW_SpecificReg, 'Q' yields
/// CW_Memory, and the constant constraints yield CW_Constant only when the
/// operand is a constant inside the range of the letter (CW_Invalid
/// otherwise). Unknown letters are delegated to the generic implementation.
AVRTargetLowering::ConstraintWeight
AVRTargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  Value *Operand = info.CallOperandVal;

  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
  // (this behaviour has been copied from the ARM backend)
  if (!Operand)
    return CW_Default;

  // Every integer constraint below needs the operand as a ConstantInt.
  const auto *IntConst = dyn_cast<ConstantInt>(Operand);

  // A constant constraint either matches as a constant or is invalid.
  auto ConstantIf = [](bool InRange) {
    return InRange ? CW_Constant : CW_Invalid;
  };

  switch (*constraint) {
  case 'd':
  case 'l':
  case 'r':
    return CW_Register;

  case 'a':
  case 'b':
  case 'e':
  case 'q':
  case 't':
  case 'w':
  case 'x': case 'X':
  case 'y': case 'Y':
  case 'z': case 'Z':
    return CW_SpecificReg;

  case 'Q':
    return CW_Memory;

  case 'G': { // Floating point zero.
    const auto *FPConst = dyn_cast<ConstantFP>(Operand);
    return ConstantIf(FPConst && FPConst->isZero());
  }

  case 'I': // 0..63
    return ConstantIf(IntConst && isUInt<6>(IntConst->getZExtValue()));

  case 'J': { // -63..0
    if (!IntConst)
      return CW_Invalid;
    const int64_t Signed = IntConst->getSExtValue();
    return ConstantIf(Signed >= -63 && Signed <= 0);
  }

  case 'K': // 2
    return ConstantIf(IntConst && IntConst->getZExtValue() == 2);

  case 'L': // 0
    return ConstantIf(IntConst && IntConst->getZExtValue() == 0);

  case 'M': // 0..255
    return ConstantIf(IntConst && isUInt<8>(IntConst->getZExtValue()));

  case 'N': // -1
    return ConstantIf(IntConst && IntConst->getSExtValue() == -1);

  case 'O': { // 8, 16, 24
    if (!IntConst)
      return CW_Invalid;
    const uint64_t Unsigned = IntConst->getZExtValue();
    return ConstantIf(Unsigned == 8 || Unsigned == 16 || Unsigned == 24);
  }

  case 'P': // 1
    return ConstantIf(IntConst && IntConst->getZExtValue() == 1);

  case 'R': { // -6..5
    if (!IntConst)
      return CW_Invalid;
    const int64_t Signed = IntConst->getSExtValue();
    return ConstantIf(Signed >= -6 && Signed <= 5);
  }

  default:
    return TargetLowering::getSingleConstraintMatchWeight(info, constraint);
  }
}
std::pair<unsigned, const TargetRegisterClass *>
AVRTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
MVT VT) const {
// We only support i8 and i16.
//
//:FIXME: remove this assert for now since it gets sometimes executed
// assert((VT == MVT::i16 || VT == MVT::i8) && "Wrong operand type.");
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'a': // Simple upper registers r16..r23.
return std::make_pair(0U, &AVR::LD8loRegClass);
case 'b': // Base pointer registers: y, z.
return std::make_pair(0U, &AVR::PTRDISPREGSRegClass);
case 'd': // Upper registers r16..r31.
return std::make_pair(0U, &AVR::LD8RegClass);
case 'l': // Lower registers r0..r15.
return std::make_pair(0U, &AVR::GPR8loRegClass);
case 'e': // Pointer register pairs: x, y, z.
return std::make_pair(0U, &AVR::PTRREGSRegClass);
case 'q': // Stack pointer register: SPH:SPL.
return std::make_pair(0U, &AVR::GPRSPRegClass);
case 'r': // Any register: r0..r31.
if (VT == MVT::i8)
return std::make_pair(0U, &AVR::GPR8RegClass);
assert(VT == MVT::i16 && "inline asm constraint too large");
return std::make_pair(0U, &AVR::DREGSRegClass);
case 't': // Temporary register: r0.
return std::make_pair(unsigned(AVR::R0), &AVR::GPR8RegClass);
case 'w': // Special upper register pairs: r24, r26, r28, r30.
return std::make_pair(0U, &AVR::IWREGSRegClass);
case 'x': // Pointer register pair X: r27:r26.
case 'X':
return std::make_pair(unsigned(AVR::R27R26), &AVR::PTRREGSRegClass);
case 'y': // Pointer register pair Y: r29:r28.
case 'Y':
return std::make_pair(unsigned(AVR::R29R28), &AVR::PTRREGSRegClass);
case 'z': // Pointer register pair Z: r31:r30.
case 'Z':
return std::make_pair(unsigned(AVR::R31R30), &AVR::PTRREGSRegClass);
default:
break;
}
}
return TargetLowering::getRegForInlineAsmConstraint(
Subtarget.getRegisterInfo(), Constraint, VT);
}
/// Lower the inline-asm operand \p Op for the immediate constraints of the
/// AVR backend.
///
/// For a single-letter constant constraint the operand must be a constant in
/// the range of that letter; it is then appended to \p Ops as a target
/// constant. If the operand is not a matching constant, nothing is appended.
/// Letters that are not handled here are delegated to the generic
/// implementation, and constraints longer than one character are ignored.
void AVRTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     std::string &Constraint,
                                                     std::vector<SDValue> &Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Lowered; // Stays empty unless a constraint below matches.
  SDLoc DL(Op);
  EVT Ty = Op.getValueType();

  // Currently only support length 1 constraints.
  if (Constraint.length() != 1)
    return;

  const char Letter = Constraint[0];
  switch (Letter) {
  // Deal with integers first:
  case 'I':
  case 'J':
  case 'K':
  case 'L':
  case 'M':
  case 'N':
  case 'O':
  case 'P':
  case 'R': {
    const auto *IntNode = dyn_cast<ConstantSDNode>(Op);
    if (!IntNode)
      return;

    const int64_t Signed = IntNode->getSExtValue();
    const uint64_t Unsigned = IntNode->getZExtValue();

    // First decide whether the value is inside the range of the letter and
    // which representation (signed or unsigned) is emitted.
    bool InRange = false;
    uint64_t Immediate = Unsigned;
    switch (Letter) {
    case 'I': // 0..63
      InRange = isUInt<6>(Unsigned);
      break;
    case 'J': // -63..0
      InRange = Signed >= -63 && Signed <= 0;
      Immediate = Signed;
      break;
    case 'K': // 2
      InRange = Unsigned == 2;
      break;
    case 'L': // 0
      InRange = Unsigned == 0;
      break;
    case 'M': // 0..255
      InRange = isUInt<8>(Unsigned);
      // i8 type may be printed as a negative number,
      // e.g. 254 would be printed as -2,
      // so we force it to i16 at least.
      if (InRange && Ty.getSimpleVT() == MVT::i8)
        Ty = MVT::i16;
      break;
    case 'N': // -1
      InRange = Signed == -1;
      Immediate = Signed;
      break;
    case 'O': // 8, 16, 24
      InRange = Unsigned == 8 || Unsigned == 16 || Unsigned == 24;
      break;
    case 'P': // 1
      InRange = Unsigned == 1;
      break;
    case 'R': // -6..5
      InRange = Signed >= -6 && Signed <= 5;
      Immediate = Signed;
      break;
    }

    if (!InRange)
      return;
    Lowered = DAG.getTargetConstant(Immediate, DL, Ty);
    break;
  }

  case 'G': {
    const auto *FPNode = dyn_cast<ConstantFPSDNode>(Op);
    if (!FPNode || !FPNode->isZero())
      return;
    // Soften float to i8 0
    Lowered = DAG.getTargetConstant(0, DL, MVT::i8);
    break;
  }

  default:
    break;
  }

  if (!Lowered.getNode()) {
    TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
    return;
  }

  Ops.push_back(Lowered);
}
unsigned AVRTargetLowering::getRegisterByName(const char *RegName,
EVT VT,
SelectionDAG &DAG) const {
unsigned Reg;
if (VT == MVT::i8) {
Reg = StringSwitch<unsigned>(RegName)
.Case("r0", AVR::R0).Case("r1", AVR::R1).Case("r2", AVR::R2)
.Case("r3", AVR::R3).Case("r4", AVR::R4).Case("r5", AVR::R5)
.Case("r6", AVR::R6).Case("r7", AVR::R7).Case("r8", AVR::R8)
.Case("r9", AVR::R9).Case("r10", AVR::R10).Case("r11", AVR::R11)
.Case("r12", AVR::R12).Case("r13", AVR::R13).Case("r14", AVR::R14)
.Case("r15", AVR::R15).Case("r16", AVR::R16).Case("r17", AVR::R17)
.Case("r18", AVR::R18).Case("r19", AVR::R19).Case("r20", AVR::R20)
.Case("r21", AVR::R21).Case("r22", AVR::R22).Case("r23", AVR::R23)
.Case("r24", AVR::R24).Case("r25", AVR::R25).Case("r26", AVR::R26)
.Case("r27", AVR::R27).Case("r28", AVR::R28).Case("r29", AVR::R29)
.Case("r30", AVR::R30).Case("r31", AVR::R31)
.Case("X", AVR::R27R26).Case("Y", AVR::R29R28).Case("Z", AVR::R31R30)
.Default(0);
} else {
Reg = StringSwitch<unsigned>(RegName)
.Case("r0", AVR::R1R0).Case("r2", AVR::R3R2)
.Case("r4", AVR::R5R4).Case("r6", AVR::R7R6)
.Case("r8", AVR::R9R8).Case("r10", AVR::R11R10)
.Case("r12", AVR::R13R12).Case("r14", AVR::R15R14)
.Case("r16", AVR::R17R16).Case("r18", AVR::R19R18)
.Case("r20", AVR::R21R20).Case("r22", AVR::R23R22)
.Case("r24", AVR::R25R24).Case("r26", AVR::R27R26)
.Case("r28", AVR::R29R28).Case("r30", AVR::R31R30)
.Case("X", AVR::R27R26).Case("Y", AVR::R29R28).Case("Z", AVR::R31R30)
.Default(0);
}
if (Reg)
return Reg;
report_fatal_error("Invalid register name global variable");
}
} // end of namespace llvm
Index: vendor/llvm/dist-release_90/lib/Target/BPF/BPFAbstractMemberAccess.cpp
===================================================================
--- vendor/llvm/dist-release_90/lib/Target/BPF/BPFAbstractMemberAccess.cpp (revision 351302)
+++ vendor/llvm/dist-release_90/lib/Target/BPF/BPFAbstractMemberAccess.cpp (revision 351303)
@@ -1,482 +1,480 @@
//===------ BPFAbstractMemberAccess.cpp - Abstracting Member Accesses -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass abstracts struct/union member accesses in order to support
// compile-once run-everywhere (CO-RE). The CO-RE intends to compile the program
// which can run on different kernels. In particular, if bpf program tries to
// access a particular kernel data structure member, the details of the
// intermediate member access will be remembered so bpf loader can do
// necessary adjustment right before program loading.
//
// For example,
//
// struct s {
// int a;
// int b;
// };
// struct t {
// struct s c;
// int d;
// };
// struct t e;
//
// For the member access e.c.b, the compiler will generate code
// &e + 4
//
// The compile-once run-everywhere instead generates the following code
// r = 4
// &e + r
// The "4" in "r = 4" can be changed based on a particular kernel version.
// For example, on a particular kernel version, if struct s is changed to
//
// struct s {
// int new_field;
// int a;
// int b;
// }
//
// By repeating the member access on the host, the bpf loader can
// adjust "r = 4" as "r = 8".
//
// This feature relies on the following three intrinsic calls:
// addr = preserve_array_access_index(base, dimension, index)
// addr = preserve_union_access_index(base, di_index)
// !llvm.preserve.access.index <union_ditype>
// addr = preserve_struct_access_index(base, gep_index, di_index)
// !llvm.preserve.access.index <struct_ditype>
//
//===----------------------------------------------------------------------===//
#include "BPF.h"
#include "BPFCORE.h"
#include "BPFTargetMachine.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#define DEBUG_TYPE "bpf-abstract-member-access"
// Out-of-line definitions of the string constants declared on
// BPFCoreSharedInfo.
namespace llvm {
// Attribute name attached to the globals this pass creates for relocatable
// member accesses (see transformGEPChain).
const std::string BPFCoreSharedInfo::AmaAttr = "btf_ama";
// NOTE(review): presumably the ELF section name used for patchable externs;
// it is not referenced in this file -- confirm against its users.
const std::string BPFCoreSharedInfo::PatchableExtSecName =
".BPF.patchable_externs";
} // namespace llvm
using namespace llvm;
namespace {
// Module pass that rewrites chains of preserve_*_access_index intrinsic
// calls into loads of per-access global variables, and afterwards replaces
// every remaining intrinsic call with a plain GEP (or with its base pointer
// for the union variant).
class BPFAbstractMemberAccess final : public ModulePass {
StringRef getPassName() const override {
return "BPF Abstract Member Access";
}
bool runOnModule(Module &M) override;
public:
static char ID;
BPFAbstractMemberAccess() : ModulePass(ID) {}
private:
// Kinds of preserve_*_access_index intrinsics recognised by the pass.
enum : uint32_t {
BPFPreserveArrayAI = 1,
BPFPreserveUnionAI = 2,
BPFPreserveStructAI = 3,
};
// Globals already created by transformGEPChain, keyed by access key, so
// that identical accesses share one global.
std::map<std::string, GlobalVariable *> GEPGlobals;
// A map to link preserve_*_access_index intrinsic calls.
// Maps a call to the (call, kind) pair of the intrinsic feeding it.
std::map<CallInst *, std::pair<CallInst *, uint32_t>> AIChain;
// A map to hold all the base preserve_*_access_index intrinsic calls.
// The base call is not an input of any other preserve_*_access_index
// intrinsics.
std::map<CallInst *, uint32_t> BaseAICalls;
bool doTransformation(Module &M);
// The trace* helpers follow the users of a call / bitcast / GEP to fill
// AIChain and BaseAICalls.
void traceAICall(CallInst *Call, uint32_t Kind);
void traceBitCast(BitCastInst *BitCast, CallInst *Parent, uint32_t Kind);
void traceGEP(GetElementPtrInst *GEP, CallInst *Parent, uint32_t Kind);
void collectAICallChains(Module &M, Function &F);
bool IsPreserveDIAccessIndexCall(const CallInst *Call, uint32_t &Kind);
bool removePreserveAccessIndexIntrinsic(Module &M);
void replaceWithGEP(std::vector<CallInst *> &CallList,
uint32_t NumOfZerosIndex, uint32_t DIIndex);
- Value *computeBaseAndAccessStr(CallInst *Call, std::string &AccessStr,
- std::string &AccessKey, uint32_t Kind,
- MDNode *&TypeMeta);
+ Value *computeBaseAndAccessKey(CallInst *Call, std::string &AccessKey,
+ uint32_t Kind, MDNode *&TypeMeta);
bool getAccessIndex(const Value *IndexValue, uint64_t &AccessIndex);
bool transformGEPChain(Module &M, CallInst *Call, uint32_t Kind);
};
} // End anonymous namespace
// Storage for the pass identifier handed to the ModulePass base class.
char BPFAbstractMemberAccess::ID = 0;
// Register the pass under the DEBUG_TYPE name.
INITIALIZE_PASS(BPFAbstractMemberAccess, DEBUG_TYPE,
"abstracting struct/union member accessees", false, false)
// Factory: returns a newly allocated instance of the pass.
ModulePass *llvm::createBPFAbstractMemberAccess() {
return new BPFAbstractMemberAccess();
}
/// Pass entry point.  Returns true when the module was modified.
bool BPFAbstractMemberAccess::runOnModule(Module &M) {
  LLVM_DEBUG(dbgs() << "********** Abstract Member Accesses **********\n");

  // The transformation is only attempted on modules that carry debug info.
  const bool HasDebugInfo = !empty(M.debug_compile_units());
  return HasDebugInfo && doTransformation(M);
}
/// Decide whether \p Call invokes one of the preserve_*_access_index
/// intrinsics.  On a match \p Kind receives the corresponding BPFPreserve*AI
/// value and true is returned; otherwise \p Kind is left untouched and the
/// result is false.  A null \p Call is accepted and yields false.
bool BPFAbstractMemberAccess::IsPreserveDIAccessIndexCall(const CallInst *Call,
                                                          uint32_t &Kind) {
  if (Call == nullptr)
    return false;

  const auto *Callee = dyn_cast<GlobalValue>(Call->getCalledValue());
  if (Callee == nullptr)
    return false;

  // Name prefix of each recognised intrinsic and the kind it stands for.
  const struct {
    const char *Prefix;
    uint32_t AIKind;
  } KnownIntrinsics[] = {
      {"llvm.preserve.array.access.index", BPFPreserveArrayAI},
      {"llvm.preserve.union.access.index", BPFPreserveUnionAI},
      {"llvm.preserve.struct.access.index", BPFPreserveStructAI},
  };

  for (const auto &Entry : KnownIntrinsics) {
    if (Callee->getName().startswith(Entry.Prefix)) {
      Kind = Entry.AIKind;
      return true;
    }
  }
  return false;
}
/// Replace every intrinsic call in \p CallList with an equivalent inbounds
/// GEP on the call's first operand, then erase the call.
///
/// \param DimensionIndex operand index holding the number of leading zero
///        indices; 0 means "use a single leading zero".
/// \param GEPIndex operand index holding the final GEP index.
void BPFAbstractMemberAccess::replaceWithGEP(std::vector<CallInst *> &CallList,
                                             uint32_t DimensionIndex,
                                             uint32_t GEPIndex) {
  for (auto Call : CallList) {
    const uint32_t NumZeros =
        DimensionIndex > 0
            ? cast<ConstantInt>(Call->getArgOperand(DimensionIndex))
                  ->getZExtValue()
            : 1;

    Value *Zero =
        ConstantInt::get(Type::getInt32Ty(Call->getParent()->getContext()), 0);

    // Index list: NumZeros zeros followed by the real index.
    SmallVector<Value *, 4> Indices;
    Indices.append(NumZeros, Zero);
    Indices.push_back(Call->getArgOperand(GEPIndex));

    auto *NewGEP = GetElementPtrInst::CreateInBounds(Call->getArgOperand(0),
                                                     Indices, "", Call);
    Call->replaceAllUsesWith(NewGEP);
    Call->eraseFromParent();
  }
}
/// Strip every preserve_*_access_index intrinsic call from \p M.
///
/// The calls are lowered as follows:
///   . addr = preserve_array_access_index(base, dimension, index)
///       becomes  addr = GEP(base, <dimension zeros>, index)
///   . addr = preserve_union_access_index(base, di_index)
///       becomes  addr = base, i.e. all uses of "addr" are replaced by "base"
///   . addr = preserve_struct_access_index(base, gep_index, di_index)
///       becomes  addr = GEP(base, 0, gep_index)
///
/// Returns true when at least one intrinsic call was found.
bool BPFAbstractMemberAccess::removePreserveAccessIndexIntrinsic(Module &M) {
  std::vector<CallInst *> ArrayCalls;
  std::vector<CallInst *> UnionCalls;
  std::vector<CallInst *> StructCalls;
  bool Found = false;

  // Collect first; the calls are erased only after the walk is finished.
  for (Function &F : M) {
    for (auto &BB : F) {
      for (auto &I : BB) {
        uint32_t Kind;
        auto *Call = dyn_cast<CallInst>(&I);
        if (!IsPreserveDIAccessIndexCall(Call, Kind))
          continue;

        Found = true;
        switch (Kind) {
        case BPFPreserveArrayAI:
          ArrayCalls.push_back(Call);
          break;
        case BPFPreserveUnionAI:
          UnionCalls.push_back(Call);
          break;
        default:
          StructCalls.push_back(Call);
          break;
        }
      }
    }
  }

  replaceWithGEP(ArrayCalls, 1, 2);
  replaceWithGEP(StructCalls, 0, 1);
  for (auto Call : UnionCalls) {
    Call->replaceAllUsesWith(Call->getArgOperand(0));
    Call->eraseFromParent();
  }

  return Found;
}
/// Follow the users of the intrinsic call \p Call (of kind \p Kind).
///
/// A user that is itself a preserve_*_access_index call is linked to \p Call
/// in AIChain and traced recursively.  Bitcasts and all-zero-index GEPs are
/// looked through.  Any other call, or a GEP with a non-zero index, marks
/// \p Call as a base call in BaseAICalls.
/// NOTE(review): users of any other instruction kind are ignored here and do
/// not mark \p Call as a base call -- confirm that this is intended.
void BPFAbstractMemberAccess::traceAICall(CallInst *Call, uint32_t Kind) {
  for (User *U : Call->users()) {
    auto *Inst = dyn_cast<Instruction>(U);
    if (Inst == nullptr)
      continue;

    if (auto *Cast = dyn_cast<BitCastInst>(Inst)) {
      traceBitCast(Cast, Call, Kind);
      continue;
    }

    if (auto *UserCall = dyn_cast<CallInst>(Inst)) {
      uint32_t UserKind;
      if (!IsPreserveDIAccessIndexCall(UserCall, UserKind)) {
        BaseAICalls[Call] = Kind;
        continue;
      }
      AIChain[UserCall] = std::make_pair(Call, Kind);
      traceAICall(UserCall, UserKind);
      continue;
    }

    if (auto *UserGEP = dyn_cast<GetElementPtrInst>(Inst)) {
      if (UserGEP->hasAllZeroIndices())
        traceGEP(UserGEP, Call, Kind);
      else
        BaseAICalls[Call] = Kind;
    }
  }
}
/// Follow the users of \p BitCast, a cast reached from the intrinsic call
/// \p Parent (of kind \p Kind).
///
/// The handling mirrors traceAICall, with \p Parent taking the role of the
/// originating call: chained intrinsic calls are linked to \p Parent, casts
/// and all-zero-index GEPs are looked through, and other calls or GEPs with
/// a non-zero index mark \p Parent as a base call.
/// NOTE(review): users of any other instruction kind are ignored -- confirm.
void BPFAbstractMemberAccess::traceBitCast(BitCastInst *BitCast,
                                           CallInst *Parent, uint32_t Kind) {
  for (User *U : BitCast->users()) {
    auto *Inst = dyn_cast<Instruction>(U);
    if (Inst == nullptr)
      continue;

    if (auto *Cast = dyn_cast<BitCastInst>(Inst)) {
      traceBitCast(Cast, Parent, Kind);
      continue;
    }

    if (auto *UserCall = dyn_cast<CallInst>(Inst)) {
      uint32_t UserKind;
      if (!IsPreserveDIAccessIndexCall(UserCall, UserKind)) {
        BaseAICalls[Parent] = Kind;
        continue;
      }
      AIChain[UserCall] = std::make_pair(Parent, Kind);
      traceAICall(UserCall, UserKind);
      continue;
    }

    if (auto *UserGEP = dyn_cast<GetElementPtrInst>(Inst)) {
      if (UserGEP->hasAllZeroIndices())
        traceGEP(UserGEP, Parent, Kind);
      else
        BaseAICalls[Parent] = Kind;
    }
  }
}
/// Follow the users of \p GEP, an all-zero-index GEP reached from the
/// intrinsic call \p Parent (of kind \p Kind).
///
/// The handling mirrors traceAICall, with \p Parent taking the role of the
/// originating call: chained intrinsic calls are linked to \p Parent, casts
/// and all-zero-index GEPs are looked through, and other calls or GEPs with
/// a non-zero index mark \p Parent as a base call.
/// NOTE(review): users of any other instruction kind are ignored -- confirm.
void BPFAbstractMemberAccess::traceGEP(GetElementPtrInst *GEP, CallInst *Parent,
                                       uint32_t Kind) {
  for (User *U : GEP->users()) {
    auto *Inst = dyn_cast<Instruction>(U);
    if (Inst == nullptr)
      continue;

    if (auto *Cast = dyn_cast<BitCastInst>(Inst)) {
      traceBitCast(Cast, Parent, Kind);
      continue;
    }

    if (auto *UserCall = dyn_cast<CallInst>(Inst)) {
      uint32_t UserKind;
      if (!IsPreserveDIAccessIndexCall(UserCall, UserKind)) {
        BaseAICalls[Parent] = Kind;
        continue;
      }
      AIChain[UserCall] = std::make_pair(Parent, Kind);
      traceAICall(UserCall, UserKind);
      continue;
    }

    if (auto *UserGEP = dyn_cast<GetElementPtrInst>(Inst)) {
      if (UserGEP->hasAllZeroIndices())
        traceGEP(UserGEP, Parent, Kind);
      else
        BaseAICalls[Parent] = Kind;
    }
  }
}
/// Rebuild AIChain and BaseAICalls for the function \p F.
///
/// Both maps are cleared first.  Every preserve_*_access_index call in \p F
/// that has not already been linked into a chain is then used as a starting
/// point for traceAICall.  \p M is not used.
void BPFAbstractMemberAccess::collectAICallChains(Module &M, Function &F) {
  AIChain.clear();
  BaseAICalls.clear();

  for (auto &BB : F) {
    for (auto &I : BB) {
      uint32_t Kind;
      auto *Call = dyn_cast<CallInst>(&I);
      if (!IsPreserveDIAccessIndexCall(Call, Kind))
        continue;
      // Already reached while tracing an earlier call.
      if (AIChain.find(Call) != AIChain.end())
        continue;
      traceAICall(Call, Kind);
    }
  }
}
/// Get access index from the preserve_*_access_index intrinsic calls.
bool BPFAbstractMemberAccess::getAccessIndex(const Value *IndexValue,
uint64_t &AccessIndex) {
const ConstantInt *CV = dyn_cast<ConstantInt>(IndexValue);
if (!CV)
return false;
AccessIndex = CV->getValue().getZExtValue();
return true;
}
/// Compute the base of the whole preserve_*_access_index chains, i.e., the base
/// pointer of the first preserve_*_access_index call, and construct the access
/// string, which will be the name of a global variable.
///
/// The chain is walked backwards through AIChain, starting at \p Call (of
/// kind \p Kind).  Returns nullptr when a union/struct call lacks DIType
/// metadata, when an access index is not constant, when no named type was
/// seen, or when more than one index precedes the first named type.
/// \p TypeMeta receives the metadata of the earliest call in the chain that
/// has a non-empty type name.
-Value *BPFAbstractMemberAccess::computeBaseAndAccessStr(CallInst *Call,
- std::string &AccessStr,
+Value *BPFAbstractMemberAccess::computeBaseAndAccessKey(CallInst *Call,
std::string &AccessKey,
uint32_t Kind,
MDNode *&TypeMeta) {
Value *Base = nullptr;
// Indices are collected from the last call of the chain towards the first.
std::vector<uint64_t> AccessIndices;
uint64_t TypeNameIndex = 0;
std::string LastTypeName;
while (Call) {
// Base of original corresponding GEP
Base = Call->getArgOperand(0);
// Type Name
std::string TypeName;
// Only assigned for union/struct calls; it is read below only when
// TypeName is non-empty, which requires that same branch.
MDNode *MDN;
if (Kind == BPFPreserveUnionAI || Kind == BPFPreserveStructAI) {
MDN = Call->getMetadata(LLVMContext::MD_preserve_access_index);
if (!MDN)
return nullptr;
DIType *Ty = dyn_cast<DIType>(MDN);
if (!Ty)
return nullptr;
TypeName = Ty->getName();
}
// Access Index
uint64_t AccessIndex;
// The union intrinsic carries its index in operand 1, the array and struct
// intrinsics in operand 2.
uint32_t ArgIndex = (Kind == BPFPreserveUnionAI) ? 1 : 2;
if (!getAccessIndex(Call->getArgOperand(ArgIndex), AccessIndex))
return nullptr;
AccessIndices.push_back(AccessIndex);
if (TypeName.size()) {
TypeNameIndex = AccessIndices.size() - 1;
LastTypeName = TypeName;
TypeMeta = MDN;
}
// Step to the call feeding this one.  Kind must be read before Call is
// overwritten.  For a call without an AIChain entry, operator[] inserts a
// default (null, 0) entry, which ends the loop.
Kind = AIChain[Call].second;
Call = AIChain[Call].first;
}
// The initial type name is required.
// FIXME: if the initial type access is an array index, e.g.,
// &a[3].b.c, only one dimensional array is supported.
if (!LastTypeName.size() || AccessIndices.size() > TypeNameIndex + 2)
return nullptr;
- // Construct the type string AccessStr.
+ // Construct the type string AccessKey.
// Prepending reverses the collection order, so the resulting string lists
// the indices from the first call of the chain to the last.
for (unsigned I = 0; I < AccessIndices.size(); ++I)
- AccessStr = std::to_string(AccessIndices[I]) + ":" + AccessStr;
+ AccessKey = std::to_string(AccessIndices[I]) + ":" + AccessKey;
// When the named type belongs to the first call of the chain, a leading
// "0:" is added.
if (TypeNameIndex == AccessIndices.size() - 1)
- AccessStr = "0:" + AccessStr;
+ AccessKey = "0:" + AccessKey;
// Access key is the type name + access string, uniquely identifying
// one kernel memory access.
- AccessKey = LastTypeName + ":" + AccessStr;
+ AccessKey = LastTypeName + ":" + AccessKey;
return Base;
}
/// Call/Kind is the base preserve_*_access_index() call. Attempts to do
/// transformation to a chain of relocable GEPs.
///
/// Returns false, leaving the IR untouched, when no base/access key can be
/// computed for the chain; otherwise \p Call is replaced and erased and true
/// is returned.
bool BPFAbstractMemberAccess::transformGEPChain(Module &M, CallInst *Call,
uint32_t Kind) {
- std::string AccessStr, AccessKey;
+ std::string AccessKey;
MDNode *TypeMeta = nullptr;
Value *Base =
- computeBaseAndAccessStr(Call, AccessStr, AccessKey, Kind, TypeMeta);
+ computeBaseAndAccessKey(Call, AccessKey, Kind, TypeMeta);
if (!Base)
return false;
// Do the transformation
// For any original GEP Call and Base %2 like
// %4 = bitcast %struct.net_device** %dev1 to i64*
// it is transformed to:
// %6 = load __BTF_0:sk_buff:0:0:2:0:
// %7 = bitcast %struct.sk_buff* %2 to i8*
// %8 = getelementptr i8, i8* %7, %6
// %9 = bitcast i8* %8 to i64*
// using %9 instead of %4
// The original Call inst is removed.
BasicBlock *BB = Call->getParent();
GlobalVariable *GV;
// One external i64 global is created per distinct access key and cached in
// GEPGlobals for reuse.
if (GEPGlobals.find(AccessKey) == GEPGlobals.end()) {
GV = new GlobalVariable(M, Type::getInt64Ty(BB->getContext()), false,
- GlobalVariable::ExternalLinkage, NULL, AccessStr);
+ GlobalVariable::ExternalLinkage, NULL, AccessKey);
GV->addAttribute(BPFCoreSharedInfo::AmaAttr);
// Set the metadata (debuginfo types) for the global.
if (TypeMeta)
GV->setMetadata(LLVMContext::MD_preserve_access_index, TypeMeta);
GEPGlobals[AccessKey] = GV;
} else {
GV = GEPGlobals[AccessKey];
}
// All new instructions are inserted immediately before Call.
// Load the global variable.
auto *LDInst = new LoadInst(Type::getInt64Ty(BB->getContext()), GV);
BB->getInstList().insert(Call->getIterator(), LDInst);
// Generate a BitCast
auto *BCInst = new BitCastInst(Base, Type::getInt8PtrTy(BB->getContext()));
BB->getInstList().insert(Call->getIterator(), BCInst);
// Generate a GetElementPtr
auto *GEP = GetElementPtrInst::Create(Type::getInt8Ty(BB->getContext()),
BCInst, LDInst);
BB->getInstList().insert(Call->getIterator(), GEP);
// Generate a BitCast
auto *BCInst2 = new BitCastInst(GEP, Call->getType());
BB->getInstList().insert(Call->getIterator(), BCInst2);
Call->replaceAllUsesWith(BCInst2);
Call->eraseFromParent();
return true;
}
/// Run the whole transformation on \p M.
///
/// For each function the preserve_*_access_index call chains are collected
/// and every base call is handed to transformGEPChain.  Afterwards all
/// remaining intrinsic calls are removed.  Returns true when either step
/// changed the module.
bool BPFAbstractMemberAccess::doTransformation(Module &M) {
  bool Transformed = false;

  for (Function &F : M) {
    // The collected chains describe GEP-like access patterns.
    collectAICallChains(M, F);

    for (auto &BaseCall : BaseAICalls) {
      if (transformGEPChain(M, BaseCall.first, BaseCall.second))
        Transformed = true;
    }
  }

  const bool Removed = removePreserveAccessIndexIntrinsic(M);
  return Removed || Transformed;
}
Index: vendor/llvm/dist-release_90/lib/Target/BPF/BTFDebug.cpp
===================================================================
--- vendor/llvm/dist-release_90/lib/Target/BPF/BTFDebug.cpp (revision 351302)
+++ vendor/llvm/dist-release_90/lib/Target/BPF/BTFDebug.cpp (revision 351303)
@@ -1,1300 +1,1326 @@
//===- BTFDebug.cpp - BTF Generator ---------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains support for writing BTF debug info.
//
//===----------------------------------------------------------------------===//
#include "BTFDebug.h"
#include "BPF.h"
#include "BPFCORE.h"
#include "MCTargetDesc/BPFMCTargetDesc.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/LineIterator.h"
using namespace llvm;
// Printable names of the BTF kinds.  Each HANDLE_BTF_KIND(ID, NAME) entry
// pulled in from BTF.def expands to the string "BTF_KIND_<NAME>".  The table
// is indexed by a type's Kind in BTFTypeBase::emitType.
static const char *BTFKindStr[] = {
#define HANDLE_BTF_KIND(ID, NAME) "BTF_KIND_" #NAME,
#include "BTF.def"
};
// Peel typedef, const, volatile and restrict wrappers off Ty and return the
// first type underneath that is not one of those derived types.
// NOTE(review): Ty is passed to dyn_cast on every iteration, so this relies
// on neither Ty nor any stripped base type being null -- confirm for callers.
+static const DIType * stripQualifiers(const DIType *Ty) {
+ while (const auto *DTy = dyn_cast<DIDerivedType>(Ty)) {
+ unsigned Tag = DTy->getTag();
+ if (Tag != dwarf::DW_TAG_typedef && Tag != dwarf::DW_TAG_const_type &&
+ Tag != dwarf::DW_TAG_volatile_type && Tag != dwarf::DW_TAG_restrict_type)
+ break;
+ Ty = DTy->getBaseType();
+ }
+
+ return Ty;
+}
+
/// Emit the three 4-byte fields common to every BTF type record: name
/// offset, info word and size, each preceded by an assembly comment where
/// one is added below.
void BTFTypeBase::emitType(MCStreamer &OS) {
  // Comment naming the kind and the type id of the record.
  std::string KindComment(BTFKindStr[Kind]);
  KindComment += "(id = ";
  KindComment += std::to_string(Id);
  KindComment += ")";
  OS.AddComment(KindComment);
  OS.EmitIntValue(BTFType.NameOff, 4);

  // The info word is annotated with its hexadecimal value.
  OS.AddComment("0x" + Twine::utohexstr(BTFType.Info));
  OS.EmitIntValue(BTFType.Info, 4);

  OS.EmitIntValue(BTFType.Size, 4);
}
/// Build the BTF entry for a derived type.  \p Tag selects the BTF kind:
/// pointer, const, volatile, typedef or restrict.  Any other tag is
/// unreachable.
BTFTypeDerived::BTFTypeDerived(const DIDerivedType *DTy, unsigned Tag,
                               bool NeedsFixup)
    : DTy(DTy), NeedsFixup(NeedsFixup) {
  if (Tag == dwarf::DW_TAG_pointer_type)
    Kind = BTF::BTF_KIND_PTR;
  else if (Tag == dwarf::DW_TAG_const_type)
    Kind = BTF::BTF_KIND_CONST;
  else if (Tag == dwarf::DW_TAG_volatile_type)
    Kind = BTF::BTF_KIND_VOLATILE;
  else if (Tag == dwarf::DW_TAG_typedef)
    Kind = BTF::BTF_KIND_TYPEDEF;
  else if (Tag == dwarf::DW_TAG_restrict_type)
    Kind = BTF::BTF_KIND_RESTRICT;
  else
    llvm_unreachable("Unknown DIDerivedType Tag");

  // The kind is stored starting at bit 24 of the info word.
  BTFType.Info = Kind << 24;
}
/// Fill in the name offset and, unless a fixup is pending, the referenced
/// type id.  Only the first call has any effect.
void BTFTypeDerived::completeType(BTFDebug &BDebug) {
  if (IsCompleted)
    return;
  IsCompleted = true;

  BTFType.NameOff = BDebug.addString(DTy->getName());

  // With a pending fixup the type id is not resolved here.
  if (NeedsFixup)
    return;

  if (const DIType *BaseTy = DTy->getBaseType()) {
    BTFType.Type = BDebug.getTypeId(BaseTy);
    return;
  }

  // The base type for PTR/CONST/VOLATILE could be void.
  assert((Kind == BTF::BTF_KIND_PTR || Kind == BTF::BTF_KIND_CONST ||
          Kind == BTF::BTF_KIND_VOLATILE) &&
         "Invalid null basetype");
  BTFType.Type = 0;
}
/// A derived type consists of the common BTF type record only.
void BTFTypeDerived::emitType(MCStreamer &OS) {
  BTFTypeBase::emitType(OS);
}
/// Overwrite the referenced type id of this entry with \p PointeeType.
void BTFTypeDerived::setPointeeType(uint32_t PointeeType) {
  BTFType.Type = PointeeType;
}
/// Build the BTF entry for a struct/union forward declaration.  Bit 31 of
/// the info word is set for a union; the referenced type id is always 0.
BTFTypeFwd::BTFTypeFwd(StringRef Name, bool IsUnion) : Name(Name) {
  Kind = BTF::BTF_KIND_FWD;
  BTFType.Type = 0;
  BTFType.Info = (IsUnion << 31) | (Kind << 24);
}
/// Register the declaration's name in the string table.  Only the first
/// call has any effect.
void BTFTypeFwd::completeType(BTFDebug &BDebug) {
  if (!IsCompleted) {
    IsCompleted = true;
    BTFType.NameOff = BDebug.addString(Name);
  }
}
/// A forward declaration consists of the common BTF type record only.
void BTFTypeFwd::emitType(MCStreamer &OS) {
  BTFTypeBase::emitType(OS);
}
/// Build the BTF entry for an integer type.
///
/// \p Encoding is the DWARF base-type encoding; only boolean, signed,
/// signed char, unsigned and unsigned char are handled, anything else is
/// unreachable.  The extra int word packs the BTF encoding, the bit offset
/// and the bit size.
BTFTypeInt::BTFTypeInt(uint32_t Encoding, uint32_t SizeInBits,
                       uint32_t OffsetInBits, StringRef TypeName)
    : Name(TypeName) {
  // Translate IR int encoding to BTF int encoding.
  uint8_t BTFEncoding;
  if (Encoding == dwarf::DW_ATE_boolean)
    BTFEncoding = BTF::INT_BOOL;
  else if (Encoding == dwarf::DW_ATE_signed ||
           Encoding == dwarf::DW_ATE_signed_char)
    BTFEncoding = BTF::INT_SIGNED;
  else if (Encoding == dwarf::DW_ATE_unsigned ||
           Encoding == dwarf::DW_ATE_unsigned_char)
    BTFEncoding = 0;
  else
    llvm_unreachable("Unknown BTFTypeInt Encoding");

  Kind = BTF::BTF_KIND_INT;
  BTFType.Info = Kind << 24;
  BTFType.Size = roundupToBytes(SizeInBits);
  IntVal = (BTFEncoding << 24) | (OffsetInBits << 16) | SizeInBits;
}
/// Register the integer type's name in the string table.  Only the first
/// call has any effect.
void BTFTypeInt::completeType(BTFDebug &BDebug) {
  if (!IsCompleted) {
    IsCompleted = true;
    BTFType.NameOff = BDebug.addString(Name);
  }
}
/// Emit the common record followed by the 4-byte int description word,
/// annotated with its hexadecimal value.
void BTFTypeInt::emitType(MCStreamer &OS) {
  BTFTypeBase::emitType(OS);

  const std::string HexComment = "0x" + Twine::utohexstr(IntVal);
  OS.AddComment(HexComment);
  OS.EmitIntValue(IntVal, 4);
}
/// Build the BTF entry for an enumeration with \p VLen enumerators.  The
/// size is the enum's bit size rounded up to bytes.
BTFTypeEnum::BTFTypeEnum(const DICompositeType *ETy, uint32_t VLen) : ETy(ETy) {
  Kind = BTF::BTF_KIND_ENUM;
  BTFType.Size = roundupToBytes(ETy->getSizeInBits());
  BTFType.Info = (Kind << 24) | VLen;
}
/// Register the enum name and all enumerator names in the string table and
/// record each enumerator's value.  Only the first call has any effect.
void BTFTypeEnum::completeType(BTFDebug &BDebug) {
  if (IsCompleted)
    return;
  IsCompleted = true;

  BTFType.NameOff = BDebug.addString(ETy->getName());

  DINodeArray Enumerators = ETy->getElements();
  for (const auto Node : Enumerators) {
    const auto *Enumerator = cast<DIEnumerator>(Node);

    struct BTF::BTFEnum Entry;
    Entry.NameOff = BDebug.addString(Enumerator->getName());
    // BTF enum value is 32bit, enforce it.
    Entry.Val = static_cast<uint32_t>(Enumerator->getValue());
    EnumValues.push_back(Entry);
  }
}
/// Emit the common record followed by one (name offset, value) pair of
/// 4-byte words per enumerator.
void BTFTypeEnum::emitType(MCStreamer &OS) {
  BTFTypeBase::emitType(OS);

  for (const auto &Entry : EnumValues) {
    OS.EmitIntValue(Entry.NameOff, 4);
    OS.EmitIntValue(Entry.Val, 4);
  }
}
// Build the BTF entry for one array dimension.  The new revision also
// records Ty, the element type, in ElemTyNoQual; completeType resolves it
// to a type id later and falls back to ElemTypeId when Ty is null.
-BTFTypeArray::BTFTypeArray(uint32_t ElemTypeId, uint32_t ElemSize,
- uint32_t NumElems)
- : ElemSize(ElemSize) {
+BTFTypeArray::BTFTypeArray(const DIType *Ty, uint32_t ElemTypeId,
+ uint32_t ElemSize, uint32_t NumElems)
+ : ElemTyNoQual(Ty), ElemSize(ElemSize) {
Kind = BTF::BTF_KIND_ARRAY;
// Array entries have no name and a zero size field.
BTFType.NameOff = 0;
BTFType.Info = Kind << 24;
BTFType.Size = 0;
ArrayInfo.ElemType = ElemTypeId;
ArrayInfo.Nelems = NumElems;
}
/// Represent a BTF array.
/// Fills in the index type id and the element type id reported by
/// getLocInfo.  Only the first call has any effect.
void BTFTypeArray::completeType(BTFDebug &BDebug) {
if (IsCompleted)
return;
IsCompleted = true;
// The IR does not really have a type for the index.
// A special type for array index should have been
// created during initial type traversal. Just
// retrieve that type id.
ArrayInfo.IndexType = BDebug.getArrayIndexTypeId();
+
// Use the id of the recorded element type when there is one, otherwise
// the element type id given at construction.
+ ElemTypeNoQual = ElemTyNoQual ? BDebug.getTypeId(ElemTyNoQual)
+ : ArrayInfo.ElemType;
}
/// Emit the common record followed by the three 4-byte array fields:
/// element type id, index type id and element count.
void BTFTypeArray::emitType(MCStreamer &OS) {
  BTFTypeBase::emitType(OS);

  const auto &Info = ArrayInfo;
  OS.EmitIntValue(Info.ElemType, 4);
  OS.EmitIntValue(Info.IndexType, 4);
  OS.EmitIntValue(Info.Nelems, 4);
}
// Report where element number Loc lives: LocOffset receives Loc times the
// element size, ElementTypeId the element type id (in the new revision the
// id computed by completeType into ElemTypeNoQual).
void BTFTypeArray::getLocInfo(uint32_t Loc, uint32_t &LocOffset,
uint32_t &ElementTypeId) {
- ElementTypeId = ArrayInfo.ElemType;
+ ElementTypeId = ElemTypeNoQual;
LocOffset = Loc * ElemSize;
}
/// Build the BTF entry for a struct (\p IsStruct true) or a union with
/// \p Vlen members.  Bit 31 of the info word records whether any member is
/// a bitfield.
BTFTypeStruct::BTFTypeStruct(const DICompositeType *STy, bool IsStruct,
                             bool HasBitField, uint32_t Vlen)
    : STy(STy), HasBitField(HasBitField) {
  if (IsStruct)
    Kind = BTF::BTF_KIND_STRUCT;
  else
    Kind = BTF::BTF_KIND_UNION;

  BTFType.Size = roundupToBytes(STy->getSizeInBits());
  BTFType.Info = (HasBitField << 31) | (Kind << 24) | Vlen;
}
// Register the struct/union name and describe every member: name offset,
// bit offset and type id.  Only the first call has any effect.
void BTFTypeStruct::completeType(BTFDebug &BDebug) {
if (IsCompleted)
return;
IsCompleted = true;
BTFType.NameOff = BDebug.addString(STy->getName());
// Add struct/union members.
const DINodeArray Elements = STy->getElements();
for (const auto *Element : Elements) {
struct BTF::BTFMember BTFMember;
const auto *DDTy = cast<DIDerivedType>(Element);
BTFMember.NameOff = BDebug.addString(DDTy->getName());
// When the aggregate has bitfields, the member's bitfield size (0 for a
// non-bitfield member) is stored in the top 8 bits of Offset.
if (HasBitField) {
uint8_t BitFieldSize = DDTy->isBitField() ? DDTy->getSizeInBits() : 0;
BTFMember.Offset = BitFieldSize << 24 | DDTy->getOffsetInBits();
} else {
BTFMember.Offset = DDTy->getOffsetInBits();
}
- BTFMember.Type = BDebug.getTypeId(DDTy->getBaseType());
+ const auto *BaseTy = DDTy->getBaseType();
+ BTFMember.Type = BDebug.getTypeId(BaseTy);
// Also remember the id of the member type with typedef/const/volatile/
// restrict wrappers stripped; getMemberInfo reports that id.
+ MemberTypeNoQual.push_back(BDebug.getTypeId(stripQualifiers(BaseTy)));
Members.push_back(BTFMember);
}
}
/// Emit the common record followed by three 4-byte words per member: name
/// offset, type id and offset.  The offset word is annotated with its
/// hexadecimal value.
void BTFTypeStruct::emitType(MCStreamer &OS) {
  BTFTypeBase::emitType(OS);

  for (const auto &Field : Members) {
    OS.EmitIntValue(Field.NameOff, 4);
    OS.EmitIntValue(Field.Type, 4);

    const std::string OffsetComment = "0x" + Twine::utohexstr(Field.Offset);
    OS.AddComment(OffsetComment);
    OS.EmitIntValue(Field.Offset, 4);
  }
}
/// Return the name of the underlying struct/union debug-info type.
std::string BTFTypeStruct::getName() {
  return STy->getName();
}
// Report the type id and offset of member number Loc.  In the new revision
// the type id comes from MemberTypeNoQual.  When the aggregate has
// bitfields, only the low 24 bits of the stored offset are reported.
void BTFTypeStruct::getMemberInfo(uint32_t Loc, uint32_t &MemberOffset,
uint32_t &MemberType) {
- MemberType = Members[Loc].Type;
+ MemberType = MemberTypeNoQual[Loc];
MemberOffset =
HasBitField ? Members[Loc].Offset & 0xffffff : Members[Loc].Offset;
}
/// Return the size of the struct/union in bytes (bit size divided by 8,
/// rounding down).
uint32_t BTFTypeStruct::getStructSize() {
  return STy->getSizeInBits() / 8;
}
/// A function prototype entry describes either a subprogram or the pointee
/// of a function pointer.  An empty function name means "pointee of a
/// function pointer", in which case the argument names are empty as well;
/// for a subprogram the argument names are valid.
///
/// \param VLen number of parameters, stored in the low bits of the info word.
BTFTypeFuncProto::BTFTypeFuncProto(
    const DISubroutineType *STy, uint32_t VLen,
    const std::unordered_map<uint32_t, StringRef> &FuncArgNames)
    : STy(STy), FuncArgNames(FuncArgNames) {
  Kind = BTF::BTF_KIND_FUNC_PROTO;
  BTFType.Info = VLen | (Kind << 24);
}
/// Resolve the return type id and build the parameter list.  Element 0 of
/// the subroutine's type array is the return type; a null return type is
/// encoded as type id 0.  Only the first call has any effect.
void BTFTypeFuncProto::completeType(BTFDebug &BDebug) {
  if (IsCompleted)
    return;
  IsCompleted = true;

  DITypeRefArray Elements = STy->getTypeArray();

  if (auto RetType = Elements[0])
    BTFType.Type = BDebug.getTypeId(RetType);
  else
    BTFType.Type = 0;
  BTFType.NameOff = 0;

  // For null parameter which is typically the last one
  // to represent the vararg, encode the NameOff/Type to be 0.
  const unsigned NumElements = Elements.size();
  for (unsigned Idx = 1; Idx < NumElements; ++Idx) {
    struct BTF::BTFParam Param;
    Param.NameOff = 0;
    Param.Type = 0;

    if (auto ArgType = Elements[Idx]) {
      Param.NameOff = BDebug.addString(FuncArgNames[Idx]);
      Param.Type = BDebug.getTypeId(ArgType);
    }
    Parameters.push_back(Param);
  }
}
/// Emit the common record followed by one (name offset, type id) pair of
/// 4-byte words per parameter.
void BTFTypeFuncProto::emitType(MCStreamer &OS) {
  BTFTypeBase::emitType(OS);

  for (const auto &Arg : Parameters) {
    OS.EmitIntValue(Arg.NameOff, 4);
    OS.EmitIntValue(Arg.Type, 4);
  }
}
/// Build the BTF entry for a function named \p FuncName whose prototype is
/// the type with id \p ProtoTypeId.
BTFTypeFunc::BTFTypeFunc(StringRef FuncName, uint32_t ProtoTypeId)
    : Name(FuncName) {
  Kind = BTF::BTF_KIND_FUNC;
  BTFType.Type = ProtoTypeId;
  BTFType.Info = Kind << 24;
}
/// Register the function name in the string table.  Only the first call has
/// any effect.
void BTFTypeFunc::completeType(BTFDebug &BDebug) {
  if (!IsCompleted) {
    IsCompleted = true;
    BTFType.NameOff = BDebug.addString(Name);
  }
}
/// A function entry consists of the common BTF type record only.
void BTFTypeFunc::emitType(MCStreamer &OS) {
  BTFTypeBase::emitType(OS);
}
/// Build the BTF entry for a variable named \p VarName of type \p TypeId.
/// \p VarInfo is stored as is and emitted after the common record.
BTFKindVar::BTFKindVar(StringRef VarName, uint32_t TypeId, uint32_t VarInfo)
    : Name(VarName) {
  Kind = BTF::BTF_KIND_VAR;
  Info = VarInfo;
  BTFType.Type = TypeId;
  BTFType.Info = Kind << 24;
}
/// Register the variable name in the string table.  Unlike most other
/// entries, this one has no IsCompleted guard.
void BTFKindVar::completeType(BTFDebug &BDebug) {
  const auto NameOffset = BDebug.addString(Name);
  BTFType.NameOff = NameOffset;
}
/// Emit the common record followed by the 4-byte variable info word.
void BTFKindVar::emitType(MCStreamer &OS) {
  BTFTypeBase::emitType(OS);

  // Trailing word: the value given to the constructor as VarInfo.
  OS.EmitIntValue(Info, 4);
}
/// Build the BTF entry for the data section named \p SecName.  The variable
/// count is merged into the info word later, by completeType.
BTFKindDataSec::BTFKindDataSec(AsmPrinter *AsmPrt, std::string SecName)
    : Asm(AsmPrt), Name(SecName) {
  Kind = BTF::BTF_KIND_DATASEC;
  BTFType.Size = 0;
  BTFType.Info = Kind << 24;
}
/// Register the section name in the string table and OR the number of
/// recorded variables into the info word.
void BTFKindDataSec::completeType(BTFDebug &BDebug) {
  const auto NameOffset = BDebug.addString(Name);
  BTFType.NameOff = NameOffset;

  const auto NumVars = Vars.size();
  BTFType.Info |= NumVars;
}
/// Emit the common record followed by three 4-byte fields per variable:
/// tuple element 0 as an integer, element 1 as a label reference and
/// element 2 as an integer.
void BTFKindDataSec::emitType(MCStreamer &OS) {
  BTFTypeBase::emitType(OS);

  for (const auto &Var : Vars) {
    const auto &First = std::get<0>(Var);
    const auto &Label = std::get<1>(Var);
    const auto &Last = std::get<2>(Var);

    OS.EmitIntValue(First, 4);
    Asm->EmitLabelReference(Label, 4);
    OS.EmitIntValue(Last, 4);
  }
}
/// Return the offset of \p S in the string table, appending it first when
/// it is not present yet.  Every stored string occupies its length plus one
/// byte.  The lookup is a linear scan over all stored strings.
uint32_t BTFStringTable::addString(StringRef S) {
  // Reuse an existing entry when the string is already stored.
  for (auto It = OffsetToIdMap.begin(), End = OffsetToIdMap.end(); It != End;
       ++It) {
    if (Table[It->second] == S)
      return It->first;
  }

  // Not found: append at the current end of the table.
  const uint32_t Offset = Size;
  const auto Index = Table.size();
  Table.push_back(S);
  OffsetToIdMap[Offset] = Index;
  Size += S.size() + 1;
  return Offset;
}
/// Create the BTF debug handler for the asm printer \p AP.
BTFDebug::BTFDebug(AsmPrinter *AP)
    : DebugHandlerBase(AP),
      OS(*Asm->OutStreamer),
      SkipInstruction(false),
      LineInfoGenerated(false),
      SecNameOff(0),
      ArrayIndexTypeId(0),
      MapDefNotCollected(true) {
  // Seed the string table with its first entry.
  // NOTE(review): presumably this reserves offset 0 for the empty string --
  // confirm.
  addString("\0");
}
/// Append \p TypeEntry to the type table, give it the next id (ids start at
/// 1), and remember that \p Ty maps to that id.  Returns the id.
uint32_t BTFDebug::addType(std::unique_ptr<BTFTypeBase> TypeEntry,
                           const DIType *Ty) {
  const auto NextId = TypeEntries.size() + 1;
  TypeEntry->setId(NextId);

  const uint32_t Id = TypeEntry->getId();
  TypeEntries.push_back(std::move(TypeEntry));
  DIToIdMap[Ty] = Id;
  return Id;
}
/// Append \p TypeEntry to the type table and give it the next id (ids start
/// at 1).  No debug-info type is associated with the entry.  Returns the id.
uint32_t BTFDebug::addType(std::unique_ptr<BTFTypeBase> TypeEntry) {
  const auto NextId = TypeEntries.size() + 1;
  TypeEntry->setId(NextId);

  const uint32_t Id = TypeEntry->getId();
  TypeEntries.push_back(std::move(TypeEntry));
  return Id;
}
/// Add a BTF int entry for the basic type \p BTy and store its id in
/// \p TypeId.  Basic types with any other encoding are skipped and
/// \p TypeId is left untouched.
void BTFDebug::visitBasicType(const DIBasicType *BTy, uint32_t &TypeId) {
  // Only int types are supported in BTF.
  const uint32_t Encoding = BTy->getEncoding();
  switch (Encoding) {
  case dwarf::DW_ATE_boolean:
  case dwarf::DW_ATE_signed:
  case dwarf::DW_ATE_signed_char:
  case dwarf::DW_ATE_unsigned:
  case dwarf::DW_ATE_unsigned_char:
    break;
  default:
    return;
  }

  // The new entry is recorded in DIToIdMap so that other types can refer
  // to it.
  auto IntEntry = llvm::make_unique<BTFTypeInt>(
      Encoding, BTy->getSizeInBits(), BTy->getOffsetInBits(), BTy->getName());
  TypeId = addType(std::move(IntEntry), BTy);
}
/// Add a function prototype entry for \p STy, store its id in \p TypeId and
/// visit the return type and all parameter types.  Prototypes with more
/// than BTF::MAX_VLEN parameters are skipped entirely.
void BTFDebug::visitSubroutineType(
    const DISubroutineType *STy, bool ForSubprog,
    const std::unordered_map<uint32_t, StringRef> &FuncArgNames,
    uint32_t &TypeId) {
  DITypeRefArray Elements = STy->getTypeArray();

  // The type array holds the return type followed by the parameters.
  const uint32_t VLen = Elements.size() - 1;
  if (VLen > BTF::MAX_VLEN)
    return;

  // Subprogram has a valid non-zero-length name, and the pointee of
  // a function pointer has an empty name. The subprogram type will
  // not be added to DIToIdMap as it should not be referenced by
  // any other types.
  auto ProtoEntry =
      llvm::make_unique<BTFTypeFuncProto>(STy, VLen, FuncArgNames);
  TypeId = ForSubprog ? addType(std::move(ProtoEntry))       // For subprogram
                      : addType(std::move(ProtoEntry), STy); // For func ptr

  // Visit return type and func arg types.
  for (const auto Element : Elements)
    visitTypeEntry(Element);
}
/// Add a struct (\p IsStruct true) or union entry for \p CTy, store its id
/// in \p TypeId and visit every member.  Aggregates with more than
/// BTF::MAX_VLEN members are skipped entirely.
void BTFDebug::visitStructType(const DICompositeType *CTy, bool IsStruct,
                               uint32_t &TypeId) {
  const DINodeArray Elements = CTy->getElements();
  const uint32_t VLen = Elements.size();
  if (VLen > BTF::MAX_VLEN)
    return;

  // Find out whether any member is a bitfield; stop at the first one.
  bool HasBitField = false;
  for (const auto *Element : Elements) {
    if (cast<DIDerivedType>(Element)->isBitField()) {
      HasBitField = true;
      break;
    }
  }

  auto StructEntry =
      llvm::make_unique<BTFTypeStruct>(CTy, IsStruct, HasBitField, VLen);
  // Keep a raw pointer to the entry; ownership moves to the type table.
  StructTypes.push_back(StructEntry.get());
  TypeId = addType(std::move(StructEntry), CTy);

  // Visit all struct members.
  for (const auto *Element : Elements) {
    const auto *Member = cast<DIDerivedType>(Element);
    visitTypeEntry(Member);
  }
}
// Add BTF array entries for CTy and store the id of the outermost
// dimension in TypeId.  One entry is created per subrange, walking the
// dimensions from the last to the first, each entry using the previous one
// as its element type.  Also creates the shared array index type on first
// use.
void BTFDebug::visitArrayType(const DICompositeType *CTy, uint32_t &TypeId) {
// Visit array element type.
uint32_t ElemTypeId, ElemSize;
const DIType *ElemType = CTy->getBaseType();
visitTypeEntry(ElemType, ElemTypeId, false, false);
+
+ // Strip qualifiers from element type to get accurate element size.
+ ElemType = stripQualifiers(ElemType);
ElemSize = ElemType->getSizeInBits() >> 3;
// An array whose total size is zero gets a single entry with zero element
// size and zero elements.
if (!CTy->getSizeInBits()) {
- auto TypeEntry = llvm::make_unique<BTFTypeArray>(ElemTypeId, 0, 0);
+ auto TypeEntry = llvm::make_unique<BTFTypeArray>(ElemType, ElemTypeId, 0, 0);
ArrayTypes.push_back(TypeEntry.get());
ElemTypeId = addType(std::move(TypeEntry), CTy);
} else {
// Visit array dimensions.
DINodeArray Elements = CTy->getElements();
for (int I = Elements.size() - 1; I >= 0; --I) {
if (auto *Element = dyn_cast_or_null<DINode>(Elements[I]))
if (Element->getTag() == dwarf::DW_TAG_subrange_type) {
const DISubrange *SR = cast<DISubrange>(Element);
// NOTE(review): CI is dereferenced without a null check although it
// comes from a dyn_cast -- confirm a non-constant count cannot reach
// this point.
auto *CI = SR->getCount().dyn_cast<ConstantInt *>();
int64_t Count = CI->getSExtValue();
+ const DIType *ArrayElemTy = (I == 0) ? ElemType : nullptr;
auto TypeEntry =
- llvm::make_unique<BTFTypeArray>(ElemTypeId, ElemSize, Count);
+ llvm::make_unique<BTFTypeArray>(ArrayElemTy, ElemTypeId,
+ ElemSize, Count);
ArrayTypes.push_back(TypeEntry.get());
// Only the entry for dimension 0 is associated with CTy in DIToIdMap.
if (I == 0)
ElemTypeId = addType(std::move(TypeEntry), CTy);
else
ElemTypeId = addType(std::move(TypeEntry));
// The next (outer) dimension's element is this whole dimension.
ElemSize = ElemSize * Count;
}
}
}
// The array TypeId is the type id of the outermost dimension.
TypeId = ElemTypeId;
// The IR does not have a type for array index while BTF wants one.
// So create an array index type if there is none.
if (!ArrayIndexTypeId) {
auto TypeEntry = llvm::make_unique<BTFTypeInt>(dwarf::DW_ATE_unsigned, 32,
0, "__ARRAY_SIZE_TYPE__");
ArrayIndexTypeId = addType(std::move(TypeEntry));
}
}
void BTFDebug::visitEnumType(const DICompositeType *CTy, uint32_t &TypeId) {
DINodeArray Elements = CTy->getElements();
uint32_t VLen = Elements.size();
if (VLen > BTF::MAX_VLEN)
return;
auto TypeEntry = llvm::make_unique<BTFTypeEnum>(CTy, VLen);
TypeId = addType(std::move(TypeEntry), CTy);
// No need to visit base type as BTF does not encode it.
}
/// Handle structure/union forward declarations.
void BTFDebug::visitFwdDeclType(const DICompositeType *CTy, bool IsUnion,
uint32_t &TypeId) {
auto TypeEntry = llvm::make_unique<BTFTypeFwd>(CTy->getName(), IsUnion);
TypeId = addType(std::move(TypeEntry), CTy);
}
/// Dispatch a composite type to the struct/union, forward-declaration, array
/// or enumeration visitor according to its DWARF tag. Any other tag is
/// ignored and \p TypeId is left untouched.
void BTFDebug::visitCompositeType(const DICompositeType *CTy,
                                  uint32_t &TypeId) {
  switch (CTy->getTag()) {
  case dwarf::DW_TAG_structure_type:
  case dwarf::DW_TAG_union_type: {
    const bool IsUnion = CTy->getTag() == dwarf::DW_TAG_union_type;
    // A forward declaration has no members, so it gets its own entry kind.
    if (CTy->isForwardDecl())
      visitFwdDeclType(CTy, IsUnion, TypeId);
    else
      visitStructType(CTy, !IsUnion, TypeId);
    break;
  }
  case dwarf::DW_TAG_array_type:
    visitArrayType(CTy, TypeId);
    break;
  case dwarf::DW_TAG_enumeration_type:
    visitEnumType(CTy, TypeId);
    break;
  default:
    break;
  }
}
/// Handle pointer, typedef, const, volatile, restrict and member types.
void BTFDebug::visitDerivedType(const DIDerivedType *DTy, uint32_t &TypeId,
bool CheckPointer, bool SeenPointer) {
unsigned Tag = DTy->getTag();
/// Try to avoid chasing pointees, esp. structure pointees which may
/// unnecessary bring in a lot of types.
if (CheckPointer && !SeenPointer) {
SeenPointer = Tag == dwarf::DW_TAG_pointer_type;
}
if (CheckPointer && SeenPointer) {
const DIType *Base = DTy->getBaseType();
if (Base) {
if (const auto *CTy = dyn_cast<DICompositeType>(Base)) {
auto CTag = CTy->getTag();
if ((CTag == dwarf::DW_TAG_structure_type ||
CTag == dwarf::DW_TAG_union_type) &&
!CTy->isForwardDecl()) {
/// Find a candidate, generate a fixup. Later on the struct/union
/// pointee type will be replaced with either a real type or
/// a forward declaration.
auto TypeEntry = llvm::make_unique<BTFTypeDerived>(DTy, Tag, true);
auto &Fixup = FixupDerivedTypes[CTy->getName()];
Fixup.first = CTag == dwarf::DW_TAG_union_type;
Fixup.second.push_back(TypeEntry.get());
TypeId = addType(std::move(TypeEntry), DTy);
return;
}
}
}
}
if (Tag == dwarf::DW_TAG_pointer_type || Tag == dwarf::DW_TAG_typedef ||
Tag == dwarf::DW_TAG_const_type || Tag == dwarf::DW_TAG_volatile_type ||
Tag == dwarf::DW_TAG_restrict_type) {
auto TypeEntry = llvm::make_unique<BTFTypeDerived>(DTy, Tag, false);
TypeId = addType(std::move(TypeEntry), DTy);
} else if (Tag != dwarf::DW_TAG_member) {
return;
}
// Visit base type of pointer, typedef, const, volatile, restrict or
// struct/union member.
uint32_t TempTypeId = 0;
if (Tag == dwarf::DW_TAG_member)
visitTypeEntry(DTy->getBaseType(), TempTypeId, true, false);
else
visitTypeEntry(DTy->getBaseType(), TempTypeId, CheckPointer, SeenPointer);
}
/// Visit \p Ty and produce its BTF type id in \p TypeId.
///
/// A null \p Ty yields id 0. A type that is already present in DIToIdMap
/// yields its recorded id without being visited again. Otherwise the type is
/// dispatched to the visitor matching its debug-info class.
///
/// \param CheckPointer Forwarded to visitDerivedType().
/// \param SeenPointer  Forwarded to visitDerivedType().
void BTFDebug::visitTypeEntry(const DIType *Ty, uint32_t &TypeId,
                              bool CheckPointer, bool SeenPointer) {
  // Handle null explicitly so that no null key is ever inserted into
  // DIToIdMap (operator[] on a missing key would default-insert it).
  if (!Ty) {
    TypeId = 0;
    return;
  }

  // A single lookup serves both the membership test and the id fetch.
  auto Known = DIToIdMap.find(Ty);
  if (Known != DIToIdMap.end()) {
    TypeId = Known->second;
    return;
  }

  if (const auto *BTy = dyn_cast<DIBasicType>(Ty))
    visitBasicType(BTy, TypeId);
  else if (const auto *STy = dyn_cast<DISubroutineType>(Ty))
    // A subroutine type reached from here is not for a subprogram, so it
    // carries no argument names.
    visitSubroutineType(STy, false, std::unordered_map<uint32_t, StringRef>(),
                        TypeId);
  else if (const auto *CTy = dyn_cast<DICompositeType>(Ty))
    visitCompositeType(CTy, TypeId);
  else if (const auto *DTy = dyn_cast<DIDerivedType>(Ty))
    visitDerivedType(DTy, TypeId, CheckPointer, SeenPointer);
  else
    llvm_unreachable("Unknown DIType");
}
/// Convenience overload: visit \p Ty with pointer checking disabled and
/// discard the resulting type id.
void BTFDebug::visitTypeEntry(const DIType *Ty) {
  uint32_t DiscardedTypeId = 0;
  visitTypeEntry(Ty, DiscardedTypeId, /*CheckPointer=*/false,
                 /*SeenPointer=*/false);
}
/// Visit the type of a map definition.
///
/// A null \p Ty yields id 0 and an already-registered type yields its
/// recorded id. Otherwise only a non-forward-declared structure type is
/// handled: a BTFTypeStruct entry is created for it and the base type of
/// every member is visited. For anything else \p TypeId is left untouched.
void BTFDebug::visitMapDefType(const DIType *Ty, uint32_t &TypeId) {
  // Handle null explicitly so that no null key is ever inserted into
  // DIToIdMap (operator[] on a missing key would default-insert it).
  if (!Ty) {
    TypeId = 0;
    return;
  }

  // A single lookup serves both the membership test and the id fetch.
  auto Known = DIToIdMap.find(Ty);
  if (Known != DIToIdMap.end()) {
    TypeId = Known->second;
    return;
  }

  // MapDef type is a struct type.
  const auto *CTy = dyn_cast<DICompositeType>(Ty);
  if (!CTy)
    return;
  if (CTy->getTag() != dwarf::DW_TAG_structure_type || CTy->isForwardDecl())
    return;

  // Record this type.
  const DINodeArray Elements = CTy->getElements();
  bool HasBitField = false;
  for (const auto *Element : Elements) {
    if (cast<DIDerivedType>(Element)->isBitField()) {
      HasBitField = true;
      break;
    }
  }

  auto TypeEntry =
      llvm::make_unique<BTFTypeStruct>(CTy, true, HasBitField, Elements.size());
  // Keep the raw pointer before ownership moves into addType().
  StructTypes.push_back(TypeEntry.get());
  TypeId = addType(std::move(TypeEntry), CTy);

  // Visit the base type of every member through the one-argument
  // visitTypeEntry overload, i.e. without pointer checking.
  for (const auto *Element : Elements) {
    const auto *MemberType = cast<DIDerivedType>(Element);
    visitTypeEntry(MemberType->getBaseType());
  }
}
/// Read the source lines of the file that defines \p SP and cache them in
/// FileContent, keyed by file name.
///
/// The lines come from the source text embedded in the debug info when it
/// is present, otherwise from the file on disk. When neither is available
/// the cached entry holds only the empty placeholder for line 0.
///
/// \returns the key under which the content is cached: directory + "/" +
///          filename when the filename does not start with "/" and a
///          directory is recorded, otherwise the filename as is.
std::string BTFDebug::populateFileContent(const DISubprogram *SP) {
  auto File = SP->getFile();
  std::string FileName;

  if (!File->getFilename().startswith("/") && File->getDirectory().size())
    FileName = File->getDirectory().str() + "/" + File->getFilename().str();
  else
    FileName = File->getFilename();

  // No need to populate the contents if it has been populated already.
  if (FileContent.find(FileName) != FileContent.end())
    return FileName;

  std::vector<std::string> Content;
  std::string Line;
  Content.push_back(Line); // Line 0 for empty string

  std::unique_ptr<MemoryBuffer> Buf;
  auto Source = File->getSource();
  if (Source)
    Buf = MemoryBuffer::getMemBufferCopy(*Source);
  else if (ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr =
               MemoryBuffer::getFile(FileName))
    Buf = std::move(*BufOrErr);

  if (Buf)
    for (line_iterator I(*Buf, false), E; I != E; ++I)
      Content.push_back(*I);

  // Content is not used again, so move it into the cache rather than
  // copying every line.
  FileContent[FileName] = std::move(Content);
  return FileName;
}
void BTFDebug::constructLineInfo(const DISubprogram *SP, MCSymbol *Label,
uint32_t Line, uint32_t Column) {
std::string FileName = populateFileContent(SP);
BTFLineInfo LineInfo;
LineInfo.Label = Label;
LineInfo.FileNameOff = addString(FileName);
// If file content is not available, let LineOff = 0.
if (Line < FileContent[FileName].size())
LineInfo.LineOff = addString(FileContent[FileName][Line]);
else
LineInfo.LineOff = 0;
LineInfo.LineNum = Line;
LineInfo.ColumnNum = Column;
LineInfoTable[SecNameOff].push_back(LineInfo);
}
/// Emit the header prefix that emitBTFSection() and emitBTFExtSection()
/// both start with: a 2-byte BTF::MAGIC, a 1-byte BTF::VERSION and a single
/// zero byte.
void BTFDebug::emitCommonHeader() {
  // The magic value is also attached as a hex comment.
  OS.AddComment("0x" + Twine::utohexstr(BTF::MAGIC));
  OS.EmitIntValue(BTF::MAGIC, /*Size=*/2);
  OS.EmitIntValue(BTF::VERSION, /*Size=*/1);
  OS.EmitIntValue(0, /*Size=*/1);
}
void BTFDebug::emitBTFSection() {
// Do not emit section if no types and only "" string.
if (!TypeEntries.size() && StringTable.getSize() == 1)
return;
MCContext &Ctx = OS.getContext();
OS.SwitchSection(Ctx.getELFSection(".BTF", ELF::SHT_PROGBITS, 0));
// Emit header.
emitCommonHeader();
OS.EmitIntValue(BTF::HeaderSize, 4);
uint32_t TypeLen = 0, StrLen;
for (const auto &TypeEntry : TypeEntries)
TypeLen += TypeEntry->getSize();
StrLen = StringTable.getSize();
OS.EmitIntValue(0, 4);
OS.EmitIntValue(TypeLen, 4);
OS.EmitIntValue(TypeLen, 4);
OS.EmitIntValue(StrLen, 4);
// Emit type table.
for (const auto &TypeEntry : TypeEntries)
TypeEntry->emitType(OS);
// Emit string table.
uint32_t StringOffset = 0;
for (const auto &S : StringTable.getTable()) {
OS.AddComment("string offset=" + std::to_string(StringOffset));
OS.EmitBytes(S);
OS.EmitBytes(StringRef("\0", 1));
StringOffset += S.size() + 1;
}
}
/// Emit the ".BTF.ext" section: the header, the FuncInfo and LineInfo
/// tables, and the OffsetReloc and ExternReloc tables when they are
/// non-empty.
void BTFDebug::emitBTFExtSection() {
  // Do not emit the section if every table is empty.
  const bool NothingToEmit = FuncInfoTable.empty() && LineInfoTable.empty() &&
                             OffsetRelocTable.empty() &&
                             ExternRelocTable.empty();
  if (NothingToEmit)
    return;

  MCContext &Ctx = OS.getContext();
  OS.SwitchSection(Ctx.getELFSection(".BTF.ext", ELF::SHT_PROGBITS, 0));

  // Header.
  emitCommonHeader();
  OS.EmitIntValue(BTF::ExtHeaderSize, 4);

  // FuncInfo/LineInfo lengths start at 4 to account for the record-size
  // word that precedes each of those tables.
  uint32_t FuncBytes = 4;
  for (const auto &Sec : FuncInfoTable) {
    FuncBytes += BTF::SecFuncInfoSize;
    FuncBytes += Sec.second.size() * BTF::BPFFuncInfoSize;
  }

  uint32_t LineBytes = 4;
  for (const auto &Sec : LineInfoTable) {
    LineBytes += BTF::SecLineInfoSize;
    LineBytes += Sec.second.size() * BTF::BPFLineInfoSize;
  }

  // The relocation tables are optional: their record-size word is counted
  // only when the table has content.
  uint32_t OffsetRelocBytes = 0;
  for (const auto &Sec : OffsetRelocTable) {
    OffsetRelocBytes += BTF::SecOffsetRelocSize;
    OffsetRelocBytes += Sec.second.size() * BTF::BPFOffsetRelocSize;
  }
  if (OffsetRelocBytes)
    OffsetRelocBytes += 4;

  uint32_t ExternRelocBytes = 0;
  for (const auto &Sec : ExternRelocTable) {
    ExternRelocBytes += BTF::SecExternRelocSize;
    ExternRelocBytes += Sec.second.size() * BTF::BPFExternRelocSize;
  }
  if (ExternRelocBytes)
    ExternRelocBytes += 4;

  // Eight 32-bit words. They read as (offset, length) pairs for the four
  // tables, each offset being the sum of the preceding lengths.
  OS.EmitIntValue(0, 4);
  OS.EmitIntValue(FuncBytes, 4);
  OS.EmitIntValue(FuncBytes, 4);
  OS.EmitIntValue(LineBytes, 4);
  OS.EmitIntValue(FuncBytes + LineBytes, 4);
  OS.EmitIntValue(OffsetRelocBytes, 4);
  OS.EmitIntValue(FuncBytes + LineBytes + OffsetRelocBytes, 4);
  OS.EmitIntValue(ExternRelocBytes, 4);

  // func_info table.
  OS.AddComment("FuncInfo");
  OS.EmitIntValue(BTF::BPFFuncInfoSize, 4);
  for (const auto &Sec : FuncInfoTable) {
    OS.AddComment("FuncInfo section string offset=" +
                  std::to_string(Sec.first));
    OS.EmitIntValue(Sec.first, 4);
    OS.EmitIntValue(Sec.second.size(), 4);
    for (const auto &Func : Sec.second) {
      Asm->EmitLabelReference(Func.Label, 4);
      OS.EmitIntValue(Func.TypeId, 4);
    }
  }

  // line_info table.
  OS.AddComment("LineInfo");
  OS.EmitIntValue(BTF::BPFLineInfoSize, 4);
  for (const auto &Sec : LineInfoTable) {
    OS.AddComment("LineInfo section string offset=" +
                  std::to_string(Sec.first));
    OS.EmitIntValue(Sec.first, 4);
    OS.EmitIntValue(Sec.second.size(), 4);
    for (const auto &LineRec : Sec.second) {
      Asm->EmitLabelReference(LineRec.Label, 4);
      OS.EmitIntValue(LineRec.FileNameOff, 4);
      OS.EmitIntValue(LineRec.LineOff, 4);
      OS.AddComment("Line " + std::to_string(LineRec.LineNum) + " Col " +
                    std::to_string(LineRec.ColumnNum));
      // Line number in the bits above bit 10, column in the low 10 bits.
      OS.EmitIntValue((LineRec.LineNum << 10) | LineRec.ColumnNum, 4);
    }
  }

  // Offset relocation table.
  if (OffsetRelocBytes) {
    OS.AddComment("OffsetReloc");
    OS.EmitIntValue(BTF::BPFOffsetRelocSize, 4);
    for (const auto &Sec : OffsetRelocTable) {
      OS.AddComment("Offset reloc section string offset=" +
                    std::to_string(Sec.first));
      OS.EmitIntValue(Sec.first, 4);
      OS.EmitIntValue(Sec.second.size(), 4);
      for (const auto &Reloc : Sec.second) {
        Asm->EmitLabelReference(Reloc.Label, 4);
        OS.EmitIntValue(Reloc.TypeID, 4);
        OS.EmitIntValue(Reloc.OffsetNameOff, 4);
      }
    }
  }

  // Extern relocation table.
  if (ExternRelocBytes) {
    OS.AddComment("ExternReloc");
    OS.EmitIntValue(BTF::BPFExternRelocSize, 4);
    for (const auto &Sec : ExternRelocTable) {
      OS.AddComment("Extern reloc section string offset=" +
                    std::to_string(Sec.first));
      OS.EmitIntValue(Sec.first, 4);
      OS.EmitIntValue(Sec.second.size(), 4);
      for (const auto &Reloc : Sec.second) {
        Asm->EmitLabelReference(Reloc.Label, 4);
        OS.EmitIntValue(Reloc.ExternNameOff, 4);
      }
    }
  }
}
void BTFDebug::beginFunctionImpl(const MachineFunction *MF) {
auto *SP = MF->getFunction().getSubprogram();
auto *Unit = SP->getUnit();
if (Unit->getEmissionKind() == DICompileUnit::NoDebug) {
SkipInstruction = true;
return;
}
SkipInstruction = false;
// Collect MapDef types. Map definition needs to collect
// pointee types. Do it first. Otherwise, for the following
// case:
// struct m { ...};
// struct t {
// struct m *key;
// };
// foo(struct t *arg);
//
// struct mapdef {
// ...
// struct m *key;
// ...
// } __attribute__((section(".maps"))) hash_map;
//
// If subroutine foo is traversed first, a type chain
// "ptr->struct m(fwd)" will be created and later on
// when traversing mapdef, since "ptr->struct m" exists,
// the traversal of "struct m" will be omitted.
if (MapDefNotCollected) {
processGlobals(true);
MapDefNotCollected = false;
}
// Collect all types locally referenced in this function.
// Use RetainedNodes so we can collect all argument names
// even if the argument is not used.
std::unordered_map<uint32_t, StringRef> FuncArgNames;
for (const DINode *DN : SP->getRetainedNodes()) {
if (const auto *DV = dyn_cast<DILocalVariable>(DN)) {
// Collect function arguments for subprogram func type.
uint32_t Arg = DV->getArg();
if (Arg) {
visitTypeEntry(DV->getType());
FuncArgNames[Arg] = DV->getName();
}
}
}
// Construct subprogram func proto type.
uint32_t ProtoTypeId;
visitSubroutineType(SP->getType(), true, FuncArgNames, ProtoTypeId);
// Construct subprogram func type
auto FuncTypeEntry =
llvm::make_unique<BTFTypeFunc>(SP->getName(), ProtoTypeId);
uint32_t FuncTypeId = addType(std::move(FuncTypeEntry));
for (const auto &TypeEntry : TypeEntries)
TypeEntry->completeType(*this);
// Construct funcinfo and the first lineinfo for the function.
MCSymbol *FuncLabel = Asm->getFunctionBegin();
BTFFuncInfo FuncInfo;
FuncInfo.Label = FuncLabel;
FuncInfo.TypeId = FuncTypeId;
if (FuncLabel->isInSection()) {
MCSection &Section = FuncLabel->getSection();
const MCSectionELF *SectionELF = dyn_cast<MCSectionELF>(&Section);
assert(SectionELF && "Null section for Function Label");
SecNameOff = addString(SectionELF->getSectionName());
} else {
SecNameOff = addString(".text");
}
FuncInfoTable[SecNameOff].push_back(FuncInfo);
}
/// Reset the per-function state (section name offset, line-info flag and
/// skip flag) once a function is finished. \p MF is not used.
void BTFDebug::endFunctionImpl(const MachineFunction *MF) {
  SecNameOff = 0;
  LineInfoGenerated = false;
  SkipInstruction = false;
}
/// On-demand populate struct types as requested from abstract member
/// accessing.
unsigned BTFDebug::populateStructType(const DIType *Ty) {
unsigned Id;
visitTypeEntry(Ty, Id, false, false);
for (const auto &TypeEntry : TypeEntries)
TypeEntry->completeType(*this);
return Id;
}
// Find struct/array debuginfo types given a type id.
void BTFDebug::setTypeFromId(uint32_t TypeId, BTFTypeStruct **PrevStructType,
BTFTypeArray **PrevArrayType) {
for (const auto &StructType : StructTypes) {
if (StructType->getId() == TypeId) {
*PrevStructType = StructType;
return;
}
}
for (const auto &ArrayType : ArrayTypes) {
if (ArrayType->getId() == TypeId) {
*PrevArrayType = ArrayType;
return;
}
}
}
/// Generate a struct member offset relocation.
///
/// Populates the BTF types for \p RootTy, walks the colon-separated integer
/// indices of the access pattern through struct members and array elements
/// to accumulate a byte offset, stores that offset in AccessOffsets and
/// appends a BTFOffsetReloc labelled \p ORSym to the current section's
/// OffsetRelocTable bucket. \p MI is not used in this function.
void BTFDebug::generateOffsetReloc(const MachineInstr *MI,
const MCSymbol *ORSym, DIType *RootTy,
StringRef AccessPattern) {
BTFTypeStruct *PrevStructType = nullptr;
BTFTypeArray *PrevArrayType = nullptr;
unsigned RootId = populateStructType(RootTy);
setTypeFromId(RootId, &PrevStructType, &PrevArrayType);
// NOTE(review): PrevStructType is dereferenced unconditionally here; this
// assumes RootId always resolves to a struct entry -- confirm.
unsigned RootTySize = PrevStructType->getStructSize();
+ StringRef IndexPattern = AccessPattern.substr(AccessPattern.find_first_of(':') + 1);
BTFOffsetReloc OffsetReloc;
OffsetReloc.Label = ORSym;
// The stored string drops the final character of the pattern (presumably a
// trailing ':', since the loop below only acts on ':'-terminated indices).
- OffsetReloc.OffsetNameOff = addString(AccessPattern.drop_back());
+ OffsetReloc.OffsetNameOff = addString(IndexPattern.drop_back());
OffsetReloc.TypeID = RootId;
// [Start, End) delimits the index currently being scanned.
uint32_t Start = 0, End = 0, Offset = 0;
bool FirstAccess = true;
- for (auto C : AccessPattern) {
+ for (auto C : IndexPattern) {
if (C != ':') {
End++;
} else {
- std::string SubStr = AccessPattern.substr(Start, End - Start);
+ std::string SubStr = IndexPattern.substr(Start, End - Start);
int Loc = std::stoi(SubStr);
if (FirstAccess) {
// The first index scales by the size of the root struct.
Offset = Loc * RootTySize;
FirstAccess = false;
} else if (PrevStructType) {
uint32_t MemberOffset, MemberTypeId;
PrevStructType->getMemberInfo(Loc, MemberOffset, MemberTypeId);
// MemberOffset is shifted right by 3 (bits to bytes, presumably).
Offset += MemberOffset >> 3;
PrevStructType = nullptr;
setTypeFromId(MemberTypeId, &PrevStructType, &PrevArrayType);
} else if (PrevArrayType) {
uint32_t LocOffset, ElementTypeId;
PrevArrayType->getLocInfo(Loc, LocOffset, ElementTypeId);
Offset += LocOffset;
PrevArrayType = nullptr;
setTypeFromId(ElementTypeId, &PrevStructType, &PrevArrayType);
+ } else {
+ llvm_unreachable("Internal Error: BTF offset relocation type traversal error");
}
+
// Skip past the ':' and start scanning the next index.
Start = End + 1;
End = Start;
}
}
- AccessOffsets[RootTy->getName().str() + ":" + AccessPattern.str()] = Offset;
+ AccessOffsets[AccessPattern.str()] = Offset;
OffsetRelocTable[SecNameOff].push_back(OffsetReloc);
}
void BTFDebug::processLDimm64(const MachineInstr *MI) {
// If the insn is an LD_imm64, the following two cases
// will generate an .BTF.ext record.
//
// If the insn is "r2 = LD_imm64 @__BTF_...",
// add this insn into the .BTF.ext OffsetReloc subsection.
// Relocation looks like:
// . SecName:
// . InstOffset
// . TypeID
// . OffSetNameOff
// Later, the insn is replaced with "r2 = <offset>"
// where "<offset>" equals to the offset based on current
// type definitions.
//
// If the insn is "r2 = LD_imm64 @VAR" and VAR is
// a patchable external global, add this insn into the .BTF.ext
// ExternReloc subsection.
// Relocation looks like:
// . SecName:
// . InstOffset
// . ExternNameOff
// Later, the insn is replaced with "r2 = <value>" or
// "LD_imm64 r2, <value>" where "<value>" = 0.
// check whether this is a candidate or not
const MachineOperand &MO = MI->getOperand(1);
if (MO.isGlobal()) {
const GlobalValue *GVal = MO.getGlobal();
auto *GVar = dyn_cast<GlobalVariable>(GVal);
if (GVar && GVar->hasAttribute(BPFCoreSharedInfo::AmaAttr)) {
MCSymbol *ORSym = OS.getContext().createTempSymbol();
OS.EmitLabel(ORSym);
MDNode *MDN = GVar->getMetadata(LLVMContext::MD_preserve_access_index);
DIType *Ty = dyn_cast<DIType>(MDN);
generateOffsetReloc(MI, ORSym, Ty, GVar->getName());
} else if (GVar && !GVar->hasInitializer() && GVar->hasExternalLinkage() &&
GVar->getSection() == BPFCoreSharedInfo::PatchableExtSecName) {
MCSymbol *ORSym = OS.getContext().createTempSymbol();
OS.EmitLabel(ORSym);
BTFExternReloc ExternReloc;
ExternReloc.Label = ORSym;
ExternReloc.ExternNameOff = addString(GVar->getName());
ExternRelocTable[SecNameOff].push_back(ExternReloc);
}
}
}
/// Per-instruction hook: hand LD_imm64 instructions to processLDimm64() and
/// emit a line-info record whenever the debug location changes.
void BTFDebug::beginInstruction(const MachineInstr *MI) {
  DebugHandlerBase::beginInstruction(MI);

  const bool Ignored = SkipInstruction || MI->isMetaInstruction() ||
                       MI->getFlag(MachineInstr::FrameSetup);
  if (Ignored)
    return;

  if (MI->isInlineAsm()) {
    // Count the leading register definitions; the asm string is the operand
    // right after them.
    unsigned NumDefs = 0;
    while (MI->getOperand(NumDefs).isReg() && MI->getOperand(NumDefs).isDef())
      ++NumDefs;

    // Skip this inline asm instruction if the asm string is empty.
    const char *AsmStr = MI->getOperand(NumDefs).getSymbolName();
    if (AsmStr[0] == 0)
      return;
  }

  if (MI->getOpcode() == BPF::LD_imm64)
    processLDimm64(MI);

  // Skip this instruction if it has no DebugLoc or if its DebugLoc is the
  // same as the previous instruction's.
  const DebugLoc &DL = MI->getDebugLoc();
  if (!DL || PrevInstLoc == DL) {
    // If no LineInfo has been generated yet for this function, construct
    // one from the function signature, labelled with the function begin.
    if (LineInfoGenerated == false) {
      auto *Subprog = MI->getMF()->getFunction().getSubprogram();
      MCSymbol *FuncLabel = Asm->getFunctionBegin();
      constructLineInfo(Subprog, FuncLabel, Subprog->getLine(), 0);
      LineInfoGenerated = true;
    }
    return;
  }

  // Create a temporary label to remember the insn for the lineinfo.
  MCSymbol *LineSym = OS.getContext().createTempSymbol();
  OS.EmitLabel(LineSym);

  // Construct the lineinfo from the scope's subprogram and the location.
  auto ScopeSP = DL.get()->getScope()->getSubprogram();
  constructLineInfo(ScopeSP, LineSym, DL.getLine(), DL.getCol());

  LineInfoGenerated = true;
  PrevInstLoc = DL;
}
/// Collect the types referenced by module globals and create a BTFKindVar
/// plus data-section record for each supported global.
///
/// \param ProcessingMapDef When true only globals whose section name starts
///        with ".maps" are processed; when false only the other globals are.
void BTFDebug::processGlobals(bool ProcessingMapDef) {
  const Module *M = MMI->getModule();
  for (const GlobalVariable &Global : M->globals()) {
    // Ignore external globals for now.
    if (!Global.hasInitializer() && Global.hasExternalLinkage())
      continue;

    // Decide the section name: the explicit section if there is one,
    // otherwise the readonly, bss or data section.
    StringRef SecName;
    if (Global.hasSection())
      SecName = Global.getSection();
    else if (Global.isConstant())
      SecName = ".rodata";
    else
      SecName = Global.getInitializer()->isZeroValue() ? ".bss" : ".data";

    const bool IsMapSection = SecName.startswith(".maps");
    if (ProcessingMapDef != IsMapSection)
      continue;

    // Only the first debug-info expression of the global is consulted.
    SmallVector<DIGlobalVariableExpression *, 1> GVs;
    Global.getDebugInfo(GVs);
    uint32_t GVTypeId = 0;
    if (!GVs.empty()) {
      auto *GVE = GVs.front();
      if (IsMapSection)
        visitMapDefType(GVE->getVariable()->getType(), GVTypeId);
      else
        visitTypeEntry(GVE->getVariable()->getType(), GVTypeId, false, false);
    }

    // Only support the following globals:
    //   . static variables
    //   . non-static global variables with section attributes
    // Essentially this means:
    //   . .bss/.data/.rodata DataSec entities only contain static data
    //   . Other DataSec entities contain static or initialized global data.
    //     Initialized global data are mostly used for finding map key/value
    //     type ids. Whether a DataSec is readonly or not can be found from
    //     the corresponding ELF section flags.
    const auto Linkage = Global.getLinkage();
    const bool IsInternal = Linkage == GlobalValue::InternalLinkage;
    const bool IsSectionedExternal =
        Linkage == GlobalValue::ExternalLinkage && Global.hasSection();
    if (!IsInternal && !IsSectionedExternal)
      continue;

    const uint32_t GVarInfo = Linkage == GlobalValue::ExternalLinkage
                                  ? BTF::VAR_GLOBAL_ALLOCATED
                                  : BTF::VAR_STATIC;

    auto VarEntry =
        llvm::make_unique<BTFKindVar>(Global.getName(), GVTypeId, GVarInfo);
    const uint32_t VarId = addType(std::move(VarEntry));

    // Find or create the DataSec for this section name.
    const std::string SecKey = SecName.str();
    auto SecIt = DataSecEntries.find(SecKey);
    if (SecIt == DataSecEntries.end())
      SecIt = DataSecEntries
                  .emplace(SecKey,
                           llvm::make_unique<BTFKindDataSec>(Asm, SecKey))
                  .first;

    // Calculate the symbol size from the module's data layout.
    const DataLayout &DL = Global.getParent()->getDataLayout();
    const uint32_t Size =
        DL.getTypeAllocSize(Global.getType()->getElementType());

    SecIt->second->addVar(VarId, Asm->getSymbol(&Global), Size);
  }
}
/// Emit proper patchable instructions.
///
/// Only LD_imm64 instructions whose operand 1 is a global variable are
/// considered. \p OutMI is filled in and true is returned when the global
/// either carries the AmaAttr attribute or is a patchable external; in every
/// other case \p OutMI is left untouched and false is returned.
bool BTFDebug::InstLower(const MachineInstr *MI, MCInst &OutMI) {
if (MI->getOpcode() == BPF::LD_imm64) {
const MachineOperand &MO = MI->getOperand(1);
if (MO.isGlobal()) {
const GlobalValue *GVal = MO.getGlobal();
auto *GVar = dyn_cast<GlobalVariable>(GVal);
if (GVar && GVar->hasAttribute(BPFCoreSharedInfo::AmaAttr)) {
MDNode *MDN = GVar->getMetadata(LLVMContext::MD_preserve_access_index);
DIType *Ty = dyn_cast<DIType>(MDN);
// NOTE(review): TypeName is only consumed by the removed ('-') lookup
// below; with the added ('+') line it appears to be unused -- confirm.
std::string TypeName = Ty->getName();
// The immediate is the offset stored in AccessOffsets for this global.
- int64_t Imm = AccessOffsets[TypeName + ":" + GVar->getName().str()];
+ int64_t Imm = AccessOffsets[GVar->getName().str()];
// Emit "mov ri, <imm>" for abstract member accesses.
OutMI.setOpcode(BPF::MOV_ri);
OutMI.addOperand(MCOperand::createReg(MI->getOperand(0).getReg()));
OutMI.addOperand(MCOperand::createImm(Imm));
return true;
} else if (GVar && !GVar->hasInitializer() &&
GVar->hasExternalLinkage() &&
GVar->getSection() == BPFCoreSharedInfo::PatchableExtSecName) {
const IntegerType *IntTy = dyn_cast<IntegerType>(GVar->getValueType());
assert(IntTy);
// For patchable externals, emit "LD_imm64, ri, 0" if the external
// variable is 64bit width, emit "mov ri, 0" otherwise.
if (IntTy->getBitWidth() == 64)
OutMI.setOpcode(BPF::LD_imm64);
else
OutMI.setOpcode(BPF::MOV_ri);
OutMI.addOperand(MCOperand::createReg(MI->getOperand(0).getReg()));
OutMI.addOperand(MCOperand::createImm(0));
return true;
}
}
}
return false;
}
void BTFDebug::endModule() {
// Collect MapDef globals if not collected yet.
if (MapDefNotCollected) {
processGlobals(true);
MapDefNotCollected = false;
}
// Collect global types/variables except MapDef globals.
processGlobals(false);
for (auto &DataSec : DataSecEntries)
addType(std::move(DataSec.second));
// Fixups
for (auto &Fixup : FixupDerivedTypes) {
StringRef TypeName = Fixup.first;
bool IsUnion = Fixup.second.first;
// Search through struct types
uint32_t StructTypeId = 0;
for (const auto &StructType : StructTypes) {
if (StructType->getName() == TypeName) {
StructTypeId = StructType->getId();
break;
}
}
if (StructTypeId == 0) {
auto FwdTypeEntry = llvm::make_unique<BTFTypeFwd>(TypeName, IsUnion);
StructTypeId = addType(std::move(FwdTypeEntry));
}
for (auto &DType : Fixup.second.second) {
DType->setPointeeType(StructTypeId);
}
}
// Complete BTF type cross refereences.
for (const auto &TypeEntry : TypeEntries)
TypeEntry->completeType(*this);
// Emit BTF sections.
emitBTFSection();
emitBTFExtSection();
}
Index: vendor/llvm/dist-release_90/lib/Target/BPF/BTFDebug.h
===================================================================
--- vendor/llvm/dist-release_90/lib/Target/BPF/BTFDebug.h (revision 351302)
+++ vendor/llvm/dist-release_90/lib/Target/BPF/BTFDebug.h (revision 351303)
@@ -1,371 +1,375 @@
//===- BTFDebug.h -----------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains support for writing BTF debug info.
///
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_BPF_BTFDEBUG_H
#define LLVM_LIB_TARGET_BPF_BTFDEBUG_H
#include "llvm/ADT/StringMap.h"
#include "llvm/CodeGen/DebugHandlerBase.h"
#include <unordered_map>
#include "BTF.h"
namespace llvm {
class AsmPrinter;
class BTFDebug;
class DIType;
class MCStreamer;
class MCSymbol;
class MachineFunction;
/// The base class for BTF type generation.
///
/// Holds the state shared by every BTF type entry (kind, id, completion flag
/// and the common type record) and declares the virtual interface that the
/// concrete entry classes implement.
class BTFTypeBase {
protected:
  uint8_t Kind;
  bool IsCompleted;
  uint32_t Id;
  struct BTF::CommonType BTFType;

public:
  /// Kind and Id start at 0 so that neither is ever read with an
  /// indeterminate value before a derived constructor / setId() assigns it.
  BTFTypeBase() : Kind(0), IsCompleted(false), Id(0) {}
  virtual ~BTFTypeBase() = default;

  void setId(uint32_t Id) { this->Id = Id; }
  uint32_t getId() const { return Id; }

  /// Round a bit count up to whole bytes.
  uint32_t roundupToBytes(uint32_t NumBits) { return (NumBits + 7) >> 3; }

  /// Get the size of this BTF type entry.
  virtual uint32_t getSize() { return BTF::CommonTypeSize; }

  /// Complete BTF type generation after all related DebugInfo types
  /// have been visited so their BTF type ids are available
  /// for cross reference.
  virtual void completeType(BTFDebug &BDebug) {}

  /// Emit types for this BTF type entry.
  virtual void emitType(MCStreamer &OS);
};
/// Handle several derived types include pointer, const,
/// volatile, typedef and restrict.
class BTFTypeDerived : public BTFTypeBase {
const DIDerivedType *DTy;
bool NeedsFixup;
public:
BTFTypeDerived(const DIDerivedType *Ty, unsigned Tag, bool NeedsFixup);
void completeType(BTFDebug &BDebug);
void emitType(MCStreamer &OS);
void setPointeeType(uint32_t PointeeType);
};
/// Handle struct or union forward declaration.
class BTFTypeFwd : public BTFTypeBase {
StringRef Name;
public:
BTFTypeFwd(StringRef Name, bool IsUnion);
void completeType(BTFDebug &BDebug);
void emitType(MCStreamer &OS);
};
/// Handle int type.
class BTFTypeInt : public BTFTypeBase {
StringRef Name;
uint32_t IntVal; ///< Encoding, offset, bits
public:
BTFTypeInt(uint32_t Encoding, uint32_t SizeInBits, uint32_t OffsetInBits,
StringRef TypeName);
uint32_t getSize() { return BTFTypeBase::getSize() + sizeof(uint32_t); }
void completeType(BTFDebug &BDebug);
void emitType(MCStreamer &OS);
};
/// Handle enumerate type.
class BTFTypeEnum : public BTFTypeBase {
const DICompositeType *ETy;
std::vector<struct BTF::BTFEnum> EnumValues;
public:
BTFTypeEnum(const DICompositeType *ETy, uint32_t NumValues);
uint32_t getSize() {
return BTFTypeBase::getSize() + EnumValues.size() * BTF::BTFEnumSize;
}
void completeType(BTFDebug &BDebug);
void emitType(MCStreamer &OS);
};
/// Handle array type.
/// One entry describes a single array dimension; BTFDebug::visitArrayType
/// creates one entry per subrange of a multi-dimensional array.
class BTFTypeArray : public BTFTypeBase {
+ const DIType *ElemTyNoQual;
uint32_t ElemSize;
struct BTF::BTFArray ArrayInfo;
+ uint32_t ElemTypeNoQual;
public:
- BTFTypeArray(uint32_t ElemTypeId, uint32_t ElemSize, uint32_t NumElems);
+ BTFTypeArray(const DIType *Ty, uint32_t ElemTypeId,
+ uint32_t ElemSize, uint32_t NumElems);
// Common type record plus one BTFArray record.
uint32_t getSize() { return BTFTypeBase::getSize() + BTF::BTFArraySize; }
void completeType(BTFDebug &BDebug);
void emitType(MCStreamer &OS);
// Outputs the offset and the element type id for index Loc; used by
// BTFDebug::generateOffsetReloc when walking an access pattern.
void getLocInfo(uint32_t Loc, uint32_t &LocOffset, uint32_t &ElementTypeId);
};
/// Handle struct/union type.
class BTFTypeStruct : public BTFTypeBase {
const DICompositeType *STy;
bool HasBitField;
std::vector<struct BTF::BTFMember> Members;
+ std::vector<uint32_t> MemberTypeNoQual;
public:
BTFTypeStruct(const DICompositeType *STy, bool IsStruct, bool HasBitField,
uint32_t NumMembers);
// Common type record plus one BTFMember record per collected member.
uint32_t getSize() {
return BTFTypeBase::getSize() + Members.size() * BTF::BTFMemberSize;
}
void completeType(BTFDebug &BDebug);
void emitType(MCStreamer &OS);
// BTFDebug::endModule compares this name against pending fixup names.
std::string getName();
// Outputs the offset and the type id of member Loc; the caller in
// BTFDebug::generateOffsetReloc shifts Offset right by 3 before using it.
void getMemberInfo(uint32_t Loc, uint32_t &Offset, uint32_t &MemberType);
uint32_t getStructSize();
};
/// Handle function pointer.
class BTFTypeFuncProto : public BTFTypeBase {
const DISubroutineType *STy;
std::unordered_map<uint32_t, StringRef> FuncArgNames;
std::vector<struct BTF::BTFParam> Parameters;
public:
BTFTypeFuncProto(const DISubroutineType *STy, uint32_t NumParams,
const std::unordered_map<uint32_t, StringRef> &FuncArgNames);
uint32_t getSize() {
return BTFTypeBase::getSize() + Parameters.size() * BTF::BTFParamSize;
}
void completeType(BTFDebug &BDebug);
void emitType(MCStreamer &OS);
};
/// Handle subprogram
class BTFTypeFunc : public BTFTypeBase {
StringRef Name;
public:
BTFTypeFunc(StringRef FuncName, uint32_t ProtoTypeId);
uint32_t getSize() { return BTFTypeBase::getSize(); }
void completeType(BTFDebug &BDebug);
void emitType(MCStreamer &OS);
};
/// Handle variable instances
class BTFKindVar : public BTFTypeBase {
StringRef Name;
uint32_t Info;
public:
BTFKindVar(StringRef VarName, uint32_t TypeId, uint32_t VarInfo);
uint32_t getSize() { return BTFTypeBase::getSize() + 4; }
void completeType(BTFDebug &BDebug);
void emitType(MCStreamer &OS);
};
/// Handle data sections
class BTFKindDataSec : public BTFTypeBase {
AsmPrinter *Asm;
std::string Name;
std::vector<std::tuple<uint32_t, const MCSymbol *, uint32_t>> Vars;
public:
BTFKindDataSec(AsmPrinter *AsmPrt, std::string SecName);
uint32_t getSize() {
return BTFTypeBase::getSize() + BTF::BTFDataSecVarSize * Vars.size();
}
void addVar(uint32_t Id, const MCSymbol *Sym, uint32_t Size) {
Vars.push_back(std::make_tuple(Id, Sym, Size));
}
std::string getName() { return Name; }
void completeType(BTFDebug &BDebug);
void emitType(MCStreamer &OS);
};
/// The string table that BTFDebug::emitBTFSection() writes out.
class BTFStringTable {
  /// Total size of the string table in bytes.
  uint32_t Size;
  /// Maps a string table offset to the index of the corresponding entry in
  /// Table. It is used to avoid putting duplicated strings in the table.
  std::unordered_map<uint32_t, uint32_t> OffsetToIdMap;
  /// The strings themselves, in table order.
  std::vector<std::string> Table;

public:
  BTFStringTable() : Size(0) {}

  /// Current size of the table in bytes.
  uint32_t getSize() { return Size; }
  /// Direct access to the stored strings.
  std::vector<std::string> &getTable() { return Table; }
  /// Add a string to the string table and return its offset in the table.
  uint32_t addString(StringRef S);
};
/// Pairs a function's symbol with the id of its BTF type.
struct BTFFuncInfo {
  /// Func MCSymbol.
  const MCSymbol *Label;
  /// Type id referring to the .BTF type section.
  uint32_t TypeId;
};
/// One line-info record of the .BTF.ext section.
struct BTFLineInfo {
  /// MCSymbol identifying the insn for the lineinfo.
  MCSymbol *Label;
  /// File name offset in the .BTF string table.
  uint32_t FileNameOff;
  /// Line text offset in the .BTF string table.
  uint32_t LineOff;
  /// The line number.
  uint32_t LineNum;
  /// The column number.
  uint32_t ColumnNum;
};
/// Represent one offset relocation.
///
/// Plain aggregate; BTFDebug keeps vectors of these in OffsetRelocTable.
struct BTFOffsetReloc {
const MCSymbol *Label; ///< MCSymbol identifying insn for the reloc
uint32_t TypeID; ///< Type ID
uint32_t OffsetNameOff; ///< The string to traverse types
                        ///< (NOTE(review): the "Off" suffix suggests this is
                        ///< an offset of that string, presumably into the
                        ///< .BTF string table - confirm).
};
/// Represent one extern relocation.
///
/// Plain aggregate; BTFDebug keeps vectors of these in ExternRelocTable.
struct BTFExternReloc {
const MCSymbol *Label; ///< MCSymbol identifying insn for the reloc
uint32_t ExternNameOff; ///< The extern variable name
                        ///< (NOTE(review): the "Off" suffix suggests this is
                        ///< an offset of that name, presumably into the
                        ///< .BTF string table - confirm).
};
/// Collect and emit BTF information.
///
/// A DebugHandlerBase that accumulates type entries, a string table and
/// per-section func/line/relocation records, and declares the routines that
/// emit the .BTF and .BTF.ext sections. Only the small accessors are defined
/// inline here; everything else is defined out of line.
class BTFDebug : public DebugHandlerBase {
MCStreamer &OS;
bool SkipInstruction;
bool LineInfoGenerated;
uint32_t SecNameOff;
// Id of the special array index type; getArrayIndexTypeId() asserts that it
// is non-zero before returning it.
uint32_t ArrayIndexTypeId;
bool MapDefNotCollected;
// Backing store for addString().
BTFStringTable StringTable;
std::vector<std::unique_ptr<BTFTypeBase>> TypeEntries;
// DIType -> type id; read by getTypeId().
std::unordered_map<const DIType *, uint32_t> DIToIdMap;
// The four tables below are keyed by a uint32_t (NOTE(review): presumably the
// section name offset, cf. SecNameOff - confirm against the definitions).
std::map<uint32_t, std::vector<BTFFuncInfo>> FuncInfoTable;
std::map<uint32_t, std::vector<BTFLineInfo>> LineInfoTable;
std::map<uint32_t, std::vector<BTFOffsetReloc>> OffsetRelocTable;
std::map<uint32_t, std::vector<BTFExternReloc>> ExternRelocTable;
StringMap<std::vector<std::string>> FileContent;
std::map<std::string, std::unique_ptr<BTFKindDataSec>> DataSecEntries;
std::vector<BTFTypeStruct *> StructTypes;
std::vector<BTFTypeArray *> ArrayTypes;
std::map<std::string, int64_t> AccessOffsets;
std::map<StringRef, std::pair<bool, std::vector<BTFTypeDerived *>>>
FixupDerivedTypes;
/// Add types to TypeEntries.
/// @{
/// Add types to TypeEntries and DIToIdMap.
uint32_t addType(std::unique_ptr<BTFTypeBase> TypeEntry, const DIType *Ty);
/// Add types to TypeEntries only and return type id.
uint32_t addType(std::unique_ptr<BTFTypeBase> TypeEntry);
/// @}
/// IR type visiting functions.
/// @{
void visitTypeEntry(const DIType *Ty);
void visitTypeEntry(const DIType *Ty, uint32_t &TypeId, bool CheckPointer,
bool SeenPointer);
void visitBasicType(const DIBasicType *BTy, uint32_t &TypeId);
void visitSubroutineType(
const DISubroutineType *STy, bool ForSubprog,
const std::unordered_map<uint32_t, StringRef> &FuncArgNames,
uint32_t &TypeId);
void visitFwdDeclType(const DICompositeType *CTy, bool IsUnion,
uint32_t &TypeId);
void visitCompositeType(const DICompositeType *CTy, uint32_t &TypeId);
void visitStructType(const DICompositeType *STy, bool IsStruct,
uint32_t &TypeId);
void visitArrayType(const DICompositeType *ATy, uint32_t &TypeId);
void visitEnumType(const DICompositeType *ETy, uint32_t &TypeId);
void visitDerivedType(const DIDerivedType *DTy, uint32_t &TypeId,
bool CheckPointer, bool SeenPointer);
void visitMapDefType(const DIType *Ty, uint32_t &TypeId);
/// @}
/// Get the file content for the subprogram. Certain lines of the file
/// later may be put into string table and referenced by line info.
std::string populateFileContent(const DISubprogram *SP);
/// Construct a line info.
void constructLineInfo(const DISubprogram *SP, MCSymbol *Label, uint32_t Line,
uint32_t Column);
/// Generate types and variables for globals.
void processGlobals(bool ProcessingMapDef);
/// Generate one offset relocation record.
void generateOffsetReloc(const MachineInstr *MI, const MCSymbol *ORSym,
DIType *RootTy, StringRef AccessPattern);
/// Set the to-be-traversed Struct/Array Type based on TypeId.
void setTypeFromId(uint32_t TypeId, BTFTypeStruct **PrevStructType,
BTFTypeArray **PrevArrayType);
/// Populating unprocessed struct type.
unsigned populateStructType(const DIType *Ty);
/// Process LD_imm64 instructions.
void processLDimm64(const MachineInstr *MI);
/// Emit common header of .BTF and .BTF.ext sections.
void emitCommonHeader();
/// Emit the .BTF section.
void emitBTFSection();
/// Emit the .BTF.ext section.
void emitBTFExtSection();
protected:
/// Gather pre-function debug information.
void beginFunctionImpl(const MachineFunction *MF) override;
/// Post process after all instructions in this function are processed.
void endFunctionImpl(const MachineFunction *MF) override;
public:
BTFDebug(AsmPrinter *AP);
/// NOTE(review): undocumented in this header. From the signature it takes a
/// MachineInstr and may fill in OutMI, reporting something through the bool
/// result - confirm the exact contract against the out-of-line definition.
bool InstLower(const MachineInstr *MI, MCInst &OutMI);
/// Get the special array index type id.
/// Asserts that the id has already been set to a non-zero value.
uint32_t getArrayIndexTypeId() {
assert(ArrayIndexTypeId);
return ArrayIndexTypeId;
}
/// Add string to the string table.
/// Forwards to BTFStringTable::addString and returns its result.
size_t addString(StringRef S) { return StringTable.addString(S); }
/// Get the type id for a particular DIType.
/// Asserts that Ty is non-null and already present in DIToIdMap. Note that
/// the final lookup uses operator[], so in a build without assertions an
/// unknown Ty is inserted with a value-initialised id instead of failing.
uint32_t getTypeId(const DIType *Ty) {
assert(Ty && "Invalid null Type");
assert(DIToIdMap.find(Ty) != DIToIdMap.end() &&
"DIType not added in the BDIToIdMap");
return DIToIdMap[Ty];
}
/// Intentionally a no-op override.
void setSymbolSize(const MCSymbol *Symbol, uint64_t Size) override {}
/// Process beginning of an instruction.
void beginInstruction(const MachineInstr *MI) override;
/// Complete all the types and emit the BTF sections.
void endModule() override;
};
} // end namespace llvm
#endif
Index: vendor/llvm/dist-release_90/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
===================================================================
--- vendor/llvm/dist-release_90/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp (revision 351302)
+++ vendor/llvm/dist-release_90/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp (revision 351303)
@@ -1,1775 +1,1793 @@
//===-- RISCVAsmParser.cpp - Parse RISCV assembly to MCInst instructions --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/RISCVAsmBackend.h"
#include "MCTargetDesc/RISCVMCExpr.h"
#include "MCTargetDesc/RISCVMCTargetDesc.h"
#include "MCTargetDesc/RISCVTargetStreamer.h"
#include "TargetInfo/RISCVTargetInfo.h"
#include "Utils/RISCVBaseInfo.h"
#include "Utils/RISCVMatInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetRegistry.h"
#include <limits>
using namespace llvm;
// Include the auto-generated portion of the compress emitter.
#define GEN_COMPRESS_INSTR
#include "RISCVGenCompressInstEmitter.inc"
namespace {
struct RISCVOperand;
/// Target assembly parser for RISC-V. Declares the MCTargetAsmParser
/// overrides, the operand parsers, the pseudo-instruction expansion helpers
/// and a small stack of saved feature bit sets (see pushFeatureBits /
/// popFeatureBits). Most members are defined out of line; the matcher
/// interface is pulled in from the TableGen-generated RISCVGenAsmMatcher.inc.
class RISCVAsmParser : public MCTargetAsmParser {
// Feature bit sets saved by pushFeatureBits() and restored, last-in
// first-out, by popFeatureBits().
SmallVector<FeatureBitset, 4> FeatureBitStack;
// Location of the token the underlying parser is currently looking at.
SMLoc getLoc() const { return getParser().getTok().getLoc(); }
// True when the current subtarget has RISCV::Feature64Bit.
bool isRV64() const { return getSTI().hasFeature(RISCV::Feature64Bit); }
// True when the current subtarget has RISCV::FeatureRV32E.
bool isRV32E() const { return getSTI().hasFeature(RISCV::FeatureRV32E); }
// Returns the streamer's target streamer, downcast to RISCVTargetStreamer.
// The target streamer pointer is dereferenced without a null check.
RISCVTargetStreamer &getTargetStreamer() {
MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
return static_cast<RISCVTargetStreamer &>(TS);
}
unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
unsigned Kind) override;
// Reports an "out of range" error for operand number ErrorInfo, appending
// " [Lower, Upper]" to Msg. The default for Msg is supplied on the
// out-of-line definition, not on this declaration.
bool generateImmOutOfRangeError(OperandVector &Operands, uint64_t ErrorInfo,
int64_t Lower, int64_t Upper, Twine Msg);
bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands, MCStreamer &Out,
uint64_t &ErrorInfo,
bool MatchingInlineAsm) override;
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) override;
bool ParseDirective(AsmToken DirectiveID) override;
// Helper to actually emit an instruction to the MCStreamer. Also, when
// possible, compression of the instruction is performed.
void emitToStreamer(MCStreamer &S, const MCInst &Inst);
// Helper to emit a combination of LUI, ADDI(W), and SLLI instructions that
// synthesize the desired immediate value into the destination register.
void emitLoadImm(unsigned DestReg, int64_t Value, MCStreamer &Out);
// Helper to emit a combination of AUIPC and SecondOpcode. Used to implement
// helpers such as emitLoadLocalAddress and emitLoadAddress.
void emitAuipcInstPair(MCOperand DestReg, MCOperand TmpReg,
const MCExpr *Symbol, RISCVMCExpr::VariantKind VKHi,
unsigned SecondOpcode, SMLoc IDLoc, MCStreamer &Out);
// Helper to emit pseudo instruction "lla" used in PC-rel addressing.
void emitLoadLocalAddress(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out);
// Helper to emit pseudo instruction "la" used in GOT/PC-rel addressing.
void emitLoadAddress(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out);
// Helper to emit pseudo instruction "la.tls.ie" used in initial-exec TLS
// addressing.
void emitLoadTLSIEAddress(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out);
// Helper to emit pseudo instruction "la.tls.gd" used in global-dynamic TLS
// addressing.
void emitLoadTLSGDAddress(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out);
// Helper to emit pseudo load/store instruction with a symbol.
void emitLoadStoreSymbol(MCInst &Inst, unsigned Opcode, SMLoc IDLoc,
MCStreamer &Out, bool HasTmpReg);
// Checks that a PseudoAddTPRel is using x4/tp in its second input operand.
// Enforcing this using a restricted register class for the second input
// operand of PseudoAddTPRel results in a poor diagnostic due to the fact
// 'add' is an overloaded mnemonic.
bool checkPseudoAddTPRel(MCInst &Inst, OperandVector &Operands);
/// Helper for processing MC instructions that have been successfully matched
/// by MatchAndEmitInstruction. Modifications to the emitted instructions,
/// like the expansion of pseudo instructions (e.g., "li"), can be performed
/// in this method.
bool processInstruction(MCInst &Inst, SMLoc IDLoc, OperandVector &Operands,
MCStreamer &Out);
// Auto-generated instruction matching functions
#define GET_ASSEMBLER_HEADER
#include "RISCVGenAsmMatcher.inc"
OperandMatchResultTy parseCSRSystemRegister(OperandVector &Operands);
OperandMatchResultTy parseImmediate(OperandVector &Operands);
OperandMatchResultTy parseRegister(OperandVector &Operands,
bool AllowParens = false);
OperandMatchResultTy parseMemOpBaseReg(OperandVector &Operands);
OperandMatchResultTy parseOperandWithModifier(OperandVector &Operands);
OperandMatchResultTy parseBareSymbol(OperandVector &Operands);
OperandMatchResultTy parseCallSymbol(OperandVector &Operands);
OperandMatchResultTy parseJALOffset(OperandVector &Operands);
bool parseOperand(OperandVector &Operands, StringRef Mnemonic);
bool parseDirectiveOption();
// Turn Feature on. Does nothing if the bit is already set; otherwise toggles
// FeatureString on a fresh copy of the subtarget info and recomputes the
// available features from the result.
void setFeatureBits(uint64_t Feature, StringRef FeatureString) {
if (!(getSTI().getFeatureBits()[Feature])) {
MCSubtargetInfo &STI = copySTI();
setAvailableFeatures(
ComputeAvailableFeatures(STI.ToggleFeature(FeatureString)));
}
}
// Turn Feature off. Mirror image of setFeatureBits: only toggles when the
// bit is currently set.
void clearFeatureBits(uint64_t Feature, StringRef FeatureString) {
if (getSTI().getFeatureBits()[Feature]) {
MCSubtargetInfo &STI = copySTI();
setAvailableFeatures(
ComputeAvailableFeatures(STI.ToggleFeature(FeatureString)));
}
}
// Save the current feature bits on FeatureBitStack.
void pushFeatureBits() {
FeatureBitStack.push_back(getSTI().getFeatureBits());
}
// Restore the most recently pushed feature bits. Returns true (failure) when
// the stack is empty and nothing was restored, false after a successful
// restore.
bool popFeatureBits() {
if (FeatureBitStack.empty())
return true;
FeatureBitset FeatureBits = FeatureBitStack.pop_back_val();
copySTI().setFeatureBits(FeatureBits);
setAvailableFeatures(ComputeAvailableFeatures(FeatureBits));
return false;
}
public:
// Target-specific match results; the enumerators after Match_Dummy come from
// the TableGen-generated operand diagnostic list.
enum RISCVMatchResultTy {
Match_Dummy = FIRST_TARGET_MATCH_RESULT_TY,
#define GET_OPERAND_DIAGNOSTIC_TYPES
#include "RISCVGenAsmMatcher.inc"
#undef GET_OPERAND_DIAGNOSTIC_TYPES
};
static bool classifySymbolRef(const MCExpr *Expr,
RISCVMCExpr::VariantKind &Kind,
int64_t &Addend);
// Registers the .half/.hword/.word/.dword directive aliases with the generic
// parser and initialises the available features from STI's feature bits.
RISCVAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
const MCInstrInfo &MII, const MCTargetOptions &Options)
: MCTargetAsmParser(Options, STI, MII) {
Parser.addAliasForDirective(".half", ".2byte");
Parser.addAliasForDirective(".hword", ".2byte");
Parser.addAliasForDirective(".word", ".4byte");
Parser.addAliasForDirective(".dword", ".8byte");
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
}
};
/// RISCVOperand - Instances of this class represent a parsed machine
/// instruction operand.
///
/// An operand is one of four kinds (token, register, immediate expression or
/// system register); the payload lives in an anonymous union selected by
/// Kind. The is*() predicates are what the generated matcher uses to decide
/// whether an operand fits an operand class, and the add*Operands() methods
/// append the operand to an MCInst once a match has been chosen.
struct RISCVOperand : public MCParsedAsmOperand {

  enum KindTy {
    Token,
    Register,
    Immediate,
    SystemRegister
  } Kind;

  bool IsRV64;

  struct RegOp {
    unsigned RegNum;
  };

  struct ImmOp {
    const MCExpr *Val;
  };

  struct SysRegOp {
    const char *Data;
    unsigned Length;
    unsigned Encoding;
    // FIXME: Add the Encoding parsed fields as needed for checks,
    // e.g.: read/write or user/supervisor/machine privileges.
  };

  SMLoc StartLoc, EndLoc;
  union {
    StringRef Tok;
    RegOp Reg;
    ImmOp Imm;
    struct SysRegOp SysReg;
  };

  RISCVOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}

public:
  /// Copy the common fields and whichever union member is active for o.Kind.
  RISCVOperand(const RISCVOperand &o) : MCParsedAsmOperand() {
    Kind = o.Kind;
    IsRV64 = o.IsRV64;
    StartLoc = o.StartLoc;
    EndLoc = o.EndLoc;
    switch (Kind) {
    case Register:
      Reg = o.Reg;
      break;
    case Immediate:
      Imm = o.Imm;
      break;
    case Token:
      Tok = o.Tok;
      break;
    case SystemRegister:
      SysReg = o.SysReg;
      break;
    }
  }

  bool isToken() const override { return Kind == Token; }
  bool isReg() const override { return Kind == Register; }
  bool isImm() const override { return Kind == Immediate; }
  bool isMem() const override { return false; }
  bool isSystemRegister() const { return Kind == SystemRegister; }

  /// Try to fold \p Expr to a constant.
  ///
  /// For a RISCVMCExpr, \p VK receives its kind and the result is whatever
  /// evaluateAsConstant reports. For an MCConstantExpr, \p VK is set to
  /// VK_RISCV_None and \p Imm to the value. For any other expression the
  /// function returns false and leaves BOTH outputs untouched, so callers
  /// must initialise them before the call.
  static bool evaluateConstantImm(const MCExpr *Expr, int64_t &Imm,
                                  RISCVMCExpr::VariantKind &VK) {
    if (auto *RE = dyn_cast<RISCVMCExpr>(Expr)) {
      VK = RE->getKind();
      return RE->evaluateAsConstant(Imm);
    }

    if (auto CE = dyn_cast<MCConstantExpr>(Expr)) {
      VK = RISCVMCExpr::VK_RISCV_None;
      Imm = CE->getValue();
      return true;
    }

    return false;
  }

  /// Shared front half of the simple immediate predicates: true iff this
  /// operand is an immediate that folds to a constant carrying no modifier,
  /// in which case \p Imm holds the value.
  bool getBareConstantImm(int64_t &Imm) const {
    if (!isImm())
      return false;
    RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
    return evaluateConstantImm(getImm(), Imm, VK) &&
           VK == RISCVMCExpr::VK_RISCV_None;
  }

  // True if operand is a symbol with no modifiers, or a constant with no
  // modifiers and isShiftedInt<N-1, 1>(Op).
  template <int N> bool isBareSimmNLsb0() const {
    if (!isImm())
      return false;
    int64_t Imm = 0;
    RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
    bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
    bool IsValid;
    if (!IsConstantImm)
      IsValid = RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm);
    else
      IsValid = isShiftedInt<N - 1, 1>(Imm);
    return IsValid && VK == RISCVMCExpr::VK_RISCV_None;
  }

  // Predicate methods for AsmOperands defined in RISCVInstrInfo.td

  bool isBareSymbol() const {
    int64_t Imm = 0;
    RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
    // Must be of 'immediate' type but not a constant.
    if (!isImm() || evaluateConstantImm(getImm(), Imm, VK))
      return false;
    return RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm) &&
           VK == RISCVMCExpr::VK_RISCV_None;
  }

  bool isCallSymbol() const {
    int64_t Imm = 0;
    RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
    // Must be of 'immediate' type but not a constant.
    if (!isImm() || evaluateConstantImm(getImm(), Imm, VK))
      return false;
    return RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm) &&
           (VK == RISCVMCExpr::VK_RISCV_CALL ||
            VK == RISCVMCExpr::VK_RISCV_CALL_PLT);
  }

  bool isTPRelAddSymbol() const {
    int64_t Imm = 0;
    RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
    // Must be of 'immediate' type but not a constant.
    if (!isImm() || evaluateConstantImm(getImm(), Imm, VK))
      return false;
    return RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm) &&
           VK == RISCVMCExpr::VK_RISCV_TPREL_ADD;
  }

  bool isCSRSystemRegister() const { return isSystemRegister(); }

  /// Return true if the operand is a valid for the fence instruction e.g.
  /// ('iorw').
  bool isFenceArg() const {
    if (!isImm())
      return false;
    const MCExpr *Val = getImm();
    auto *SVal = dyn_cast<MCSymbolRefExpr>(Val);
    if (!SVal || SVal->getKind() != MCSymbolRefExpr::VK_None)
      return false;

    StringRef Str = SVal->getSymbol().getName();
    // Letters must be unique, taken from 'iorw', and in ascending order. This
    // holds as long as each individual character is one of 'iorw' and is
    // greater than the previous character.
    char Prev = '\0';
    for (char c : Str) {
      if (c != 'i' && c != 'o' && c != 'r' && c != 'w')
        return false;
      if (c <= Prev)
        return false;
      Prev = c;
    }
    return true;
  }

  /// Return true if the operand is a valid floating point rounding mode.
  bool isFRMArg() const {
    if (!isImm())
      return false;
    const MCExpr *Val = getImm();
    auto *SVal = dyn_cast<MCSymbolRefExpr>(Val);
    if (!SVal || SVal->getKind() != MCSymbolRefExpr::VK_None)
      return false;

    StringRef Str = SVal->getSymbol().getName();

    return RISCVFPRndMode::stringToRoundingMode(Str) != RISCVFPRndMode::Invalid;
  }

  bool isImmXLenLI() const {
    if (!isImm())
      return false;
    // Both outputs must be initialised: evaluateConstantImm leaves them
    // untouched for expressions it does not understand (e.g. a bare symbol),
    // and VK is inspected below regardless of the call's result.
    int64_t Imm = 0;
    RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
    bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
    if (VK == RISCVMCExpr::VK_RISCV_LO || VK == RISCVMCExpr::VK_RISCV_PCREL_LO)
      return true;
    // Given only Imm, ensuring that the actually specified constant is either
    // a signed or unsigned 64-bit number is unfortunately impossible.
    bool IsInRange = isRV64() ? true : isInt<32>(Imm) || isUInt<32>(Imm);
    return IsConstantImm && IsInRange && VK == RISCVMCExpr::VK_RISCV_None;
  }

  bool isUImmLog2XLen() const {
    int64_t Imm = 0;
    if (!getBareConstantImm(Imm))
      return false;
    return (isRV64() && isUInt<6>(Imm)) || isUInt<5>(Imm);
  }

  bool isUImmLog2XLenNonZero() const {
    int64_t Imm = 0;
    if (!getBareConstantImm(Imm))
      return false;
    if (Imm == 0)
      return false;
    return (isRV64() && isUInt<6>(Imm)) || isUInt<5>(Imm);
  }

  bool isUImm5() const {
    int64_t Imm = 0;
    return getBareConstantImm(Imm) && isUInt<5>(Imm);
  }

  bool isUImm5NonZero() const {
    int64_t Imm = 0;
    return getBareConstantImm(Imm) && isUInt<5>(Imm) && (Imm != 0);
  }

  bool isSImm6() const {
    int64_t Imm = 0;
    return getBareConstantImm(Imm) && isInt<6>(Imm);
  }

  bool isSImm6NonZero() const {
    int64_t Imm = 0;
    return getBareConstantImm(Imm) && isInt<6>(Imm) && (Imm != 0);
  }

  bool isCLUIImm() const {
    int64_t Imm = 0;
    return getBareConstantImm(Imm) && (Imm != 0) &&
           (isUInt<5>(Imm) || (Imm >= 0xfffe0 && Imm <= 0xfffff));
  }

  bool isUImm7Lsb00() const {
    int64_t Imm = 0;
    return getBareConstantImm(Imm) && isShiftedUInt<5, 2>(Imm);
  }

  bool isUImm8Lsb00() const {
    int64_t Imm = 0;
    return getBareConstantImm(Imm) && isShiftedUInt<6, 2>(Imm);
  }

  bool isUImm8Lsb000() const {
    int64_t Imm = 0;
    return getBareConstantImm(Imm) && isShiftedUInt<5, 3>(Imm);
  }

  bool isSImm9Lsb0() const { return isBareSimmNLsb0<9>(); }

  bool isUImm9Lsb000() const {
    int64_t Imm = 0;
    return getBareConstantImm(Imm) && isShiftedUInt<6, 3>(Imm);
  }

  bool isUImm10Lsb00NonZero() const {
    int64_t Imm = 0;
    return getBareConstantImm(Imm) && isShiftedUInt<8, 2>(Imm) && (Imm != 0);
  }

  bool isSImm12() const {
    if (!isImm())
      return false;
    int64_t Imm = 0;
    RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
    bool IsValid;
    bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
    if (!IsConstantImm)
      IsValid = RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm);
    else
      IsValid = isInt<12>(Imm);
    return IsValid && ((IsConstantImm && VK == RISCVMCExpr::VK_RISCV_None) ||
                       VK == RISCVMCExpr::VK_RISCV_LO ||
                       VK == RISCVMCExpr::VK_RISCV_PCREL_LO ||
                       VK == RISCVMCExpr::VK_RISCV_TPREL_LO);
  }

  bool isSImm12Lsb0() const { return isBareSimmNLsb0<12>(); }

  bool isSImm13Lsb0() const { return isBareSimmNLsb0<13>(); }

  bool isSImm10Lsb0000NonZero() const {
    int64_t Imm = 0;
    return getBareConstantImm(Imm) && (Imm != 0) && isShiftedInt<6, 4>(Imm);
  }

  bool isUImm20LUI() const {
    if (!isImm())
      return false;
    int64_t Imm = 0;
    RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
    bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
    if (!IsConstantImm) {
      bool IsValid = RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm);
      return IsValid && (VK == RISCVMCExpr::VK_RISCV_HI ||
                         VK == RISCVMCExpr::VK_RISCV_TPREL_HI);
    }
    return isUInt<20>(Imm) && (VK == RISCVMCExpr::VK_RISCV_None ||
                               VK == RISCVMCExpr::VK_RISCV_HI ||
                               VK == RISCVMCExpr::VK_RISCV_TPREL_HI);
  }

  bool isUImm20AUIPC() const {
    if (!isImm())
      return false;
    int64_t Imm = 0;
    RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
    bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
    if (!IsConstantImm) {
      bool IsValid = RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm);
      return IsValid && (VK == RISCVMCExpr::VK_RISCV_PCREL_HI ||
                         VK == RISCVMCExpr::VK_RISCV_GOT_HI ||
                         VK == RISCVMCExpr::VK_RISCV_TLS_GOT_HI ||
                         VK == RISCVMCExpr::VK_RISCV_TLS_GD_HI);
    }
    return isUInt<20>(Imm) && (VK == RISCVMCExpr::VK_RISCV_None ||
                               VK == RISCVMCExpr::VK_RISCV_PCREL_HI ||
                               VK == RISCVMCExpr::VK_RISCV_GOT_HI ||
                               VK == RISCVMCExpr::VK_RISCV_TLS_GOT_HI ||
                               VK == RISCVMCExpr::VK_RISCV_TLS_GD_HI);
  }

  bool isSImm21Lsb0JAL() const { return isBareSimmNLsb0<21>(); }

  /// getStartLoc - Gets location of the first token of this operand
  SMLoc getStartLoc() const override { return StartLoc; }
  /// getEndLoc - Gets location of the last token of this operand
  SMLoc getEndLoc() const override { return EndLoc; }
  /// True if this operand is for an RV64 instruction
  bool isRV64() const { return IsRV64; }

  unsigned getReg() const override {
    assert(Kind == Register && "Invalid type access!");
    return Reg.RegNum;
  }

  StringRef getSysReg() const {
    assert(Kind == SystemRegister && "Invalid access!");
    return StringRef(SysReg.Data, SysReg.Length);
  }

  const MCExpr *getImm() const {
    assert(Kind == Immediate && "Invalid type access!");
    return Imm.Val;
  }

  StringRef getToken() const {
    assert(Kind == Token && "Invalid type access!");
    return Tok;
  }

  void print(raw_ostream &OS) const override {
    switch (Kind) {
    case Immediate:
      OS << *getImm();
      break;
    case Register:
      OS << "<register x";
      OS << getReg() << ">";
      break;
    case Token:
      OS << "'" << getToken() << "'";
      break;
    case SystemRegister:
      OS << "<sysreg: " << getSysReg() << '>';
      break;
    }
  }

  /// Build a Token operand; start and end location are both \p S.
  static std::unique_ptr<RISCVOperand> createToken(StringRef Str, SMLoc S,
                                                   bool IsRV64) {
    auto Op = make_unique<RISCVOperand>(Token);
    Op->Tok = Str;
    Op->StartLoc = S;
    Op->EndLoc = S;
    Op->IsRV64 = IsRV64;
    return Op;
  }

  /// Build a Register operand spanning [S, E].
  static std::unique_ptr<RISCVOperand> createReg(unsigned RegNo, SMLoc S,
                                                 SMLoc E, bool IsRV64) {
    auto Op = make_unique<RISCVOperand>(Register);
    Op->Reg.RegNum = RegNo;
    Op->StartLoc = S;
    Op->EndLoc = E;
    Op->IsRV64 = IsRV64;
    return Op;
  }

  /// Build an Immediate operand spanning [S, E].
  static std::unique_ptr<RISCVOperand> createImm(const MCExpr *Val, SMLoc S,
                                                 SMLoc E, bool IsRV64) {
    auto Op = make_unique<RISCVOperand>(Immediate);
    Op->Imm.Val = Val;
    Op->StartLoc = S;
    Op->EndLoc = E;
    Op->IsRV64 = IsRV64;
    return Op;
  }

  /// Build a SystemRegister operand. Only the start location is assigned;
  /// EndLoc is not set here. The operand stores a pointer into \p Str's
  /// buffer, so that buffer must outlive the operand.
  static std::unique_ptr<RISCVOperand>
  createSysReg(StringRef Str, SMLoc S, unsigned Encoding, bool IsRV64) {
    auto Op = make_unique<RISCVOperand>(SystemRegister);
    Op->SysReg.Data = Str.data();
    Op->SysReg.Length = Str.size();
    Op->SysReg.Encoding = Encoding;
    Op->StartLoc = S;
    Op->IsRV64 = IsRV64;
    return Op;
  }

  /// Append \p Expr to \p Inst: as a plain immediate operand when it folds to
  /// a constant, otherwise as an expression operand.
  void addExpr(MCInst &Inst, const MCExpr *Expr) const {
    assert(Expr && "Expr shouldn't be null!");
    int64_t Imm = 0;
    RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
    bool IsConstant = evaluateConstantImm(Expr, Imm, VK);

    if (IsConstant)
      Inst.addOperand(MCOperand::createImm(Imm));
    else
      Inst.addOperand(MCOperand::createExpr(Expr));
  }

  // Used by the TableGen Code
  void addRegOperands(MCInst &Inst, unsigned N) const {
    assert(N == 1 && "Invalid number of operands!");
    Inst.addOperand(MCOperand::createReg(getReg()));
  }

  void addImmOperands(MCInst &Inst, unsigned N) const {
    assert(N == 1 && "Invalid number of operands!");
    addExpr(Inst, getImm());
  }

  void addFenceArgOperands(MCInst &Inst, unsigned N) const {
    assert(N == 1 && "Invalid number of operands!");
    // isFenceArg has validated the operand, meaning this cast is safe
    auto SE = cast<MCSymbolRefExpr>(getImm());

    unsigned Imm = 0;
    for (char c : SE->getSymbol().getName()) {
      switch (c) {
      default:
        llvm_unreachable("FenceArg must contain only [iorw]");
      case 'i': Imm |= RISCVFenceField::I; break;
      case 'o': Imm |= RISCVFenceField::O; break;
      case 'r': Imm |= RISCVFenceField::R; break;
      case 'w': Imm |= RISCVFenceField::W; break;
      }
    }
    Inst.addOperand(MCOperand::createImm(Imm));
  }

  void addCSRSystemRegisterOperands(MCInst &Inst, unsigned N) const {
    assert(N == 1 && "Invalid number of operands!");
    Inst.addOperand(MCOperand::createImm(SysReg.Encoding));
  }

  // Returns the rounding mode represented by this RISCVOperand. Should only
  // be called after checking isFRMArg.
  RISCVFPRndMode::RoundingMode getRoundingMode() const {
    // isFRMArg has validated the operand, meaning this cast is safe.
    auto SE = cast<MCSymbolRefExpr>(getImm());
    RISCVFPRndMode::RoundingMode FRM =
        RISCVFPRndMode::stringToRoundingMode(SE->getSymbol().getName());
    assert(FRM != RISCVFPRndMode::Invalid && "Invalid rounding mode");
    return FRM;
  }

  void addFRMArgOperands(MCInst &Inst, unsigned N) const {
    assert(N == 1 && "Invalid number of operands!");
    Inst.addOperand(MCOperand::createImm(getRoundingMode()));
  }
};
} // end anonymous namespace.
#define GET_REGISTER_MATCHER
#define GET_MATCHER_IMPLEMENTATION
#include "RISCVGenAsmMatcher.inc"
// Return the matching FPR64 register for the given FPR32.
// FIXME: Ideally this function could be removed in favour of using
// information from TableGen.
unsigned convertFPR32ToFPR64(unsigned Reg) {
switch (Reg) {
default:
llvm_unreachable("Not a recognised FPR32 register");
case RISCV::F0_32: return RISCV::F0_64;
case RISCV::F1_32: return RISCV::F1_64;
case RISCV::F2_32: return RISCV::F2_64;
case RISCV::F3_32: return RISCV::F3_64;
case RISCV::F4_32: return RISCV::F4_64;
case RISCV::F5_32: return RISCV::F5_64;
case RISCV::F6_32: return RISCV::F6_64;
case RISCV::F7_32: return RISCV::F7_64;
case RISCV::F8_32: return RISCV::F8_64;
case RISCV::F9_32: return RISCV::F9_64;
case RISCV::F10_32: return RISCV::F10_64;
case RISCV::F11_32: return RISCV::F11_64;
case RISCV::F12_32: return RISCV::F12_64;
case RISCV::F13_32: return RISCV::F13_64;
case RISCV::F14_32: return RISCV::F14_64;
case RISCV::F15_32: return RISCV::F15_64;
case RISCV::F16_32: return RISCV::F16_64;
case RISCV::F17_32: return RISCV::F17_64;
case RISCV::F18_32: return RISCV::F18_64;
case RISCV::F19_32: return RISCV::F19_64;
case RISCV::F20_32: return RISCV::F20_64;
case RISCV::F21_32: return RISCV::F21_64;
case RISCV::F22_32: return RISCV::F22_64;
case RISCV::F23_32: return RISCV::F23_64;
case RISCV::F24_32: return RISCV::F24_64;
case RISCV::F25_32: return RISCV::F25_64;
case RISCV::F26_32: return RISCV::F26_64;
case RISCV::F27_32: return RISCV::F27_64;
case RISCV::F28_32: return RISCV::F28_64;
case RISCV::F29_32: return RISCV::F29_64;
case RISCV::F30_32: return RISCV::F30_64;
case RISCV::F31_32: return RISCV::F31_64;
}
}
// Last-chance operand check for operand class Kind. The only case accepted
// here is a register operand holding an FPR32 (or FPR32C) register where an
// FPR64 (or FPR64C) is expected; the operand is then rewritten in place to
// the matching 64-bit register. Everything else is reported as invalid.
unsigned RISCVAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
                                                    unsigned Kind) {
  auto &Operand = static_cast<RISCVOperand &>(AsmOp);
  if (!Operand.isReg())
    return Match_InvalidOperand;

  const unsigned RegNo = Operand.getReg();
  // As the parser couldn't differentiate an FPR32 from an FPR64, coerce the
  // register from FPR32 to FPR64 or FPR32C to FPR64C if necessary.
  const bool WidenFPR32 =
      Kind == MCK_FPR64 &&
      RISCVMCRegisterClasses[RISCV::FPR32RegClassID].contains(RegNo);
  const bool WidenFPR32C =
      Kind == MCK_FPR64C &&
      RISCVMCRegisterClasses[RISCV::FPR32CRegClassID].contains(RegNo);
  if (!WidenFPR32 && !WidenFPR32C)
    return Match_InvalidOperand;

  Operand.Reg.RegNum = convertFPR32ToFPR64(RegNo);
  return Match_Success;
}
// Emit "<Msg> [Lower, Upper]" at the start location of operand number
// ErrorInfo and return the result of Error(). ErrorInfo must be a valid index
// into Operands; no bounds check is made here.
bool RISCVAsmParser::generateImmOutOfRangeError(
    OperandVector &Operands, uint64_t ErrorInfo, int64_t Lower, int64_t Upper,
    Twine Msg = "immediate must be an integer in the range") {
  const auto &BadOperand = static_cast<RISCVOperand &>(*Operands[ErrorInfo]);
  const SMLoc Where = BadOperand.getStartLoc();
  return Error(Where, Msg + " [" + Twine(Lower) + ", " + Twine(Upper) + "]");
}
/// Matches the parsed operands against the instruction tables and either
/// emits the resulting instruction or reports a diagnostic.
///
/// \param IDLoc             location of the mnemonic; used as the fallback
///                          diagnostic location.
/// \param Opcode            not used by this implementation.
/// \param Operands          parsed operands, mnemonic token first.
/// \param Out               streamer that receives the matched instruction.
/// \param ErrorInfo         filled in by MatchInstructionImpl; used here as
///                          the index of the offending operand.
/// \param MatchingInlineAsm forwarded to MatchInstructionImpl.
/// \returns the result of processInstruction() on a successful match,
///          otherwise the result of Error() for the diagnostic emitted.
bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                             OperandVector &Operands,
                                             MCStreamer &Out,
                                             uint64_t &ErrorInfo,
                                             bool MatchingInlineAsm) {
  MCInst Inst;

  auto Result =
      MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm);
  switch (Result) {
  default:
    break;
  case Match_Success:
    return processInstruction(Inst, IDLoc, Operands, Out);
  case Match_MissingFeature:
    return Error(IDLoc, "instruction use requires an option to be enabled");
  case Match_MnemonicFail:
    return Error(IDLoc, "unrecognized instruction mnemonic");
  case Match_InvalidOperand: {
    SMLoc ErrorLoc = IDLoc;
    // ErrorInfo is a uint64_t, so the "no operand information" sentinel has
    // to be the 64-bit all-ones value. Comparing against ~0U (which
    // zero-extends to 0xFFFFFFFF) would never recognise it and would
    // misreport the failure as "too few operands".
    // NOTE(review): assumes the generated matcher uses ~0ULL as the sentinel.
    if (ErrorInfo != ~0ULL) {
      if (ErrorInfo >= Operands.size())
        return Error(ErrorLoc, "too few operands for instruction");

      ErrorLoc = static_cast<RISCVOperand &>(*Operands[ErrorInfo]).getStartLoc();
      if (ErrorLoc == SMLoc())
        ErrorLoc = IDLoc;
    }
    return Error(ErrorLoc, "invalid operand for instruction");
  }
  }

  // Handle the case when the error message is of specific type
  // other than the generic Match_InvalidOperand, and the
  // corresponding operand is missing.
  if (Result > FIRST_TARGET_MATCH_RESULT_TY) {
    SMLoc ErrorLoc = IDLoc;
    // Any index past the end (including an all-ones sentinel) is reported
    // here, so every Operands[ErrorInfo] lookup below is in range.
    if (ErrorInfo >= Operands.size())
      return Error(ErrorLoc, "too few operands for instruction");
  }

  switch (Result) {
  default:
    break;
  case Match_InvalidImmXLenLI:
    if (isRV64()) {
      SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
      return Error(ErrorLoc, "operand must be a constant 64-bit integer");
    }
    return generateImmOutOfRangeError(Operands, ErrorInfo,
                                      std::numeric_limits<int32_t>::min(),
                                      std::numeric_limits<uint32_t>::max());
  case Match_InvalidUImmLog2XLen:
    if (isRV64())
      return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 6) - 1);
    return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 5) - 1);
  case Match_InvalidUImmLog2XLenNonZero:
    if (isRV64())
      return generateImmOutOfRangeError(Operands, ErrorInfo, 1, (1 << 6) - 1);
    return generateImmOutOfRangeError(Operands, ErrorInfo, 1, (1 << 5) - 1);
  case Match_InvalidUImm5:
    return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 5) - 1);
  case Match_InvalidSImm6:
    return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 5),
                                      (1 << 5) - 1);
  case Match_InvalidSImm6NonZero:
    return generateImmOutOfRangeError(
        Operands, ErrorInfo, -(1 << 5), (1 << 5) - 1,
        "immediate must be non-zero in the range");
  case Match_InvalidCLUIImm:
    return generateImmOutOfRangeError(
        Operands, ErrorInfo, 1, (1 << 5) - 1,
        "immediate must be in [0xfffe0, 0xfffff] or");
  case Match_InvalidUImm7Lsb00:
    return generateImmOutOfRangeError(
        Operands, ErrorInfo, 0, (1 << 7) - 4,
        "immediate must be a multiple of 4 bytes in the range");
  case Match_InvalidUImm8Lsb00:
    return generateImmOutOfRangeError(
        Operands, ErrorInfo, 0, (1 << 8) - 4,
        "immediate must be a multiple of 4 bytes in the range");
  case Match_InvalidUImm8Lsb000:
    return generateImmOutOfRangeError(
        Operands, ErrorInfo, 0, (1 << 8) - 8,
        "immediate must be a multiple of 8 bytes in the range");
  case Match_InvalidSImm9Lsb0:
    return generateImmOutOfRangeError(
        Operands, ErrorInfo, -(1 << 8), (1 << 8) - 2,
        "immediate must be a multiple of 2 bytes in the range");
  case Match_InvalidUImm9Lsb000:
    return generateImmOutOfRangeError(
        Operands, ErrorInfo, 0, (1 << 9) - 8,
        "immediate must be a multiple of 8 bytes in the range");
  case Match_InvalidUImm10Lsb00NonZero:
    return generateImmOutOfRangeError(
        Operands, ErrorInfo, 4, (1 << 10) - 4,
        "immediate must be a multiple of 4 bytes in the range");
  case Match_InvalidSImm10Lsb0000NonZero:
    return generateImmOutOfRangeError(
        Operands, ErrorInfo, -(1 << 9), (1 << 9) - 16,
        "immediate must be a multiple of 16 bytes and non-zero in the range");
  case Match_InvalidSImm12:
    return generateImmOutOfRangeError(
        Operands, ErrorInfo, -(1 << 11), (1 << 11) - 1,
        "operand must be a symbol with %lo/%pcrel_lo/%tprel_lo modifier or an "
        "integer in the range");
  case Match_InvalidSImm12Lsb0:
    return generateImmOutOfRangeError(
        Operands, ErrorInfo, -(1 << 11), (1 << 11) - 2,
        "immediate must be a multiple of 2 bytes in the range");
  case Match_InvalidSImm13Lsb0:
    return generateImmOutOfRangeError(
        Operands, ErrorInfo, -(1 << 12), (1 << 12) - 2,
        "immediate must be a multiple of 2 bytes in the range");
  case Match_InvalidUImm20LUI:
    return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 20) - 1,
                                      "operand must be a symbol with "
                                      "%hi/%tprel_hi modifier or an integer in "
                                      "the range");
  case Match_InvalidUImm20AUIPC:
    return generateImmOutOfRangeError(
        Operands, ErrorInfo, 0, (1 << 20) - 1,
        "operand must be a symbol with a "
        "%pcrel_hi/%got_pcrel_hi/%tls_ie_pcrel_hi/%tls_gd_pcrel_hi modifier or "
        "an integer in the range");
  case Match_InvalidSImm21Lsb0JAL:
    return generateImmOutOfRangeError(
        Operands, ErrorInfo, -(1 << 20), (1 << 20) - 2,
        "immediate must be a multiple of 2 bytes in the range");
  case Match_InvalidCSRSystemRegister: {
    return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 12) - 1,
                                      "operand must be a valid system register "
                                      "name or an integer in the range");
  }
  case Match_InvalidFenceArg: {
    SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
    return Error(
        ErrorLoc,
        "operand must be formed of letters selected in-order from 'iorw'");
  }
  case Match_InvalidFRMArg: {
    SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
    return Error(
        ErrorLoc,
        "operand must be a valid floating point rounding mode mnemonic");
  }
  case Match_InvalidBareSymbol: {
    SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
    return Error(ErrorLoc, "operand must be a bare symbol name");
  }
  case Match_InvalidCallSymbol: {
    SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
    return Error(ErrorLoc, "operand must be a bare symbol name");
  }
  case Match_InvalidTPRelAddSymbol: {
    SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
    return Error(ErrorLoc, "operand must be a symbol with %tprel_add modifier");
  }
  }

  llvm_unreachable("Unknown match type detected!");
}
// Resolves Name to a register number, trying the default register names
// first and the alternative ABI names second, and stores the result in RegNo.
// Returns true on failure, in which case RegNo is 0. When IsRV32E is set,
// registers x16-x31 are treated as if they did not exist.
static bool matchRegisterNameHelper(bool IsRV32E, unsigned &RegNo,
                                    StringRef Name) {
  unsigned Found = MatchRegisterName(Name);
  if (!Found)
    Found = MatchRegisterAltName(Name);

  const bool IsUpperGPR = Found >= RISCV::X16 && Found <= RISCV::X31;
  if (IsRV32E && IsUpperGPR)
    Found = 0;

  RegNo = Found;
  return Found == 0;
}
bool RISCVAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
SMLoc &EndLoc) {
const AsmToken &Tok = getParser().getTok();
StartLoc = Tok.getLoc();
EndLoc = Tok.getEndLoc();
RegNo = 0;
StringRef Name = getLexer().getTok().getIdentifier();
if (matchRegisterNameHelper(isRV32E(), RegNo, Name))
return Error(StartLoc, "invalid register name");
getParser().Lex(); // Eat identifier token.
return false;
}
/// Tries to parse a register operand, optionally wrapped in parentheses, and
/// appends the resulting operand(s) to \p Operands.
///
/// \param AllowParens when true, a "(reg)" form is accepted and produces
///                    "(" / register / ")" operands.
/// \returns MatchOperand_Success when a register was parsed, otherwise
///          MatchOperand_NoMatch. On NoMatch a previously consumed '(' is
///          pushed back onto the lexer.
OperandMatchResultTy RISCVAsmParser::parseRegister(OperandVector &Operands,
bool AllowParens) {
SMLoc FirstS = getLoc();
bool HadParens = false;
AsmToken LParen;
// If this is an LParen and a parenthesised register name is allowed, parse it
// atomically.
if (AllowParens && getLexer().is(AsmToken::LParen)) {
// Look ahead two tokens and only commit to the parenthesised form when the
// second peeked token is ')'.
AsmToken Buf[2];
size_t ReadCount = getLexer().peekTokens(Buf);
if (ReadCount == 2 && Buf[1].getKind() == AsmToken::RParen) {
HadParens = true;
// Remember the '(' token so it can be un-lexed if no register follows.
LParen = getParser().getTok();
getParser().Lex(); // Eat '('
}
}
switch (getLexer().getKind()) {
default:
// Not an identifier: restore the '(' (if eaten) and report no match.
if (HadParens)
getLexer().UnLex(LParen);
return MatchOperand_NoMatch;
case AsmToken::Identifier:
StringRef Name = getLexer().getTok().getIdentifier();
unsigned RegNo;
// The helper's return value is ignored; RegNo == 0 signals failure.
matchRegisterNameHelper(isRV32E(), RegNo, Name);
if (RegNo == 0) {
if (HadParens)
getLexer().UnLex(LParen);
return MatchOperand_NoMatch;
}
// The '(' token operand is only emitted once the register is known good.
if (HadParens)
Operands.push_back(RISCVOperand::createToken("(", FirstS, isRV64()));
SMLoc S = getLoc();
SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1);
getLexer().Lex();
Operands.push_back(RISCVOperand::createReg(RegNo, S, E, isRV64()));
}
if (HadParens) {
getParser().Lex(); // Eat ')'
Operands.push_back(RISCVOperand::createToken(")", getLoc(), isRV64()));
}
return MatchOperand_Success;
}
/// Parses a CSR operand given either as an expression that folds to a 12-bit
/// unsigned constant or as a system register name, and appends a SysReg
/// operand to \p Operands.
///
/// \returns MatchOperand_Success when an operand was added,
///          MatchOperand_ParseFail after emitting a diagnostic, or
///          MatchOperand_NoMatch for token kinds this parser does not handle.
OperandMatchResultTy
RISCVAsmParser::parseCSRSystemRegister(OperandVector &Operands) {
SMLoc S = getLoc();
const MCExpr *Res;
switch (getLexer().getKind()) {
default:
return MatchOperand_NoMatch;
// Token kinds that start a numeric expression.
case AsmToken::LParen:
case AsmToken::Minus:
case AsmToken::Plus:
case AsmToken::Exclaim:
case AsmToken::Tilde:
case AsmToken::Integer:
case AsmToken::String: {
if (getParser().parseExpression(Res))
return MatchOperand_ParseFail;
// Only expressions that are constants are usable as an encoding.
auto *CE = dyn_cast<MCConstantExpr>(Res);
if (CE) {
int64_t Imm = CE->getValue();
if (isUInt<12>(Imm)) {
auto SysReg = RISCVSysReg::lookupSysRegByEncoding(Imm);
// Accept an immediate representing a named or un-named Sys Reg
// if the range is valid, regardless of the required features.
Operands.push_back(RISCVOperand::createSysReg(
SysReg ? SysReg->Name : "", S, Imm, isRV64()));
return MatchOperand_Success;
}
}
// Non-constant or out-of-range value.
Twine Msg = "immediate must be an integer in the range";
Error(S, Msg + " [" + Twine(0) + ", " + Twine((1 << 12) - 1) + "]");
return MatchOperand_ParseFail;
}
case AsmToken::Identifier: {
StringRef Identifier;
if (getParser().parseIdentifier(Identifier))
return MatchOperand_ParseFail;
auto SysReg = RISCVSysReg::lookupSysRegByName(Identifier);
// Accept a named Sys Reg if the required features are present.
if (SysReg) {
if (!SysReg->haveRequiredFeatures(getSTI().getFeatureBits())) {
Error(S, "system register use requires an option to be enabled");
return MatchOperand_ParseFail;
}
Operands.push_back(RISCVOperand::createSysReg(
Identifier, S, SysReg->Encoding, isRV64()));
return MatchOperand_Success;
}
// The identifier is not a known system register name.
Twine Msg = "operand must be a valid system register name "
"or an integer in the range";
Error(S, Msg + " [" + Twine(0) + ", " + Twine((1 << 12) - 1) + "]");
return MatchOperand_ParseFail;
}
case AsmToken::Percent: {
// Discard operand with modifier.
Twine Msg = "immediate must be an integer in the range";
Error(S, Msg + " [" + Twine(0) + ", " + Twine((1 << 12) - 1) + "]");
return MatchOperand_ParseFail;
}
}
// Every case above returns; this satisfies compilers that cannot prove it.
return MatchOperand_NoMatch;
}
/// Parses an immediate operand and appends it to \p Operands.
///
/// A leading '%' is delegated to parseOperandWithModifier(); tokens that can
/// begin an expression are parsed as a generic expression.
/// \returns MatchOperand_Success, MatchOperand_ParseFail when expression
///          parsing fails, or MatchOperand_NoMatch for any other token kind.
OperandMatchResultTy RISCVAsmParser::parseImmediate(OperandVector &Operands) {
  SMLoc Start = getLoc();
  SMLoc End = SMLoc::getFromPointer(Start.getPointer() - 1);

  switch (getLexer().getKind()) {
  case AsmToken::Percent:
    return parseOperandWithModifier(Operands);
  case AsmToken::LParen:
  case AsmToken::Dot:
  case AsmToken::Minus:
  case AsmToken::Plus:
  case AsmToken::Exclaim:
  case AsmToken::Tilde:
  case AsmToken::Integer:
  case AsmToken::String:
  case AsmToken::Identifier: {
    const MCExpr *Expr;
    if (getParser().parseExpression(Expr))
      return MatchOperand_ParseFail;
    Operands.push_back(RISCVOperand::createImm(Expr, Start, End, isRV64()));
    return MatchOperand_Success;
  }
  default:
    return MatchOperand_NoMatch;
  }
}
/// Parses an operand of the form `%modifier(expression)` and appends the
/// wrapped expression to \p Operands as an immediate.
///
/// \returns MatchOperand_Success on success; MatchOperand_ParseFail when any
///          piece of the form is missing or the modifier name is unknown
///          (a diagnostic is emitted for each of those cases handled here).
OperandMatchResultTy
RISCVAsmParser::parseOperandWithModifier(OperandVector &Operands) {
  SMLoc Start = getLoc();
  SMLoc End = SMLoc::getFromPointer(Start.getPointer() - 1);

  if (getLexer().isNot(AsmToken::Percent)) {
    Error(getLoc(), "expected '%' for operand modifier");
    return MatchOperand_ParseFail;
  }
  getParser().Lex(); // Consume '%'.

  if (getLexer().isNot(AsmToken::Identifier)) {
    Error(getLoc(), "expected valid identifier for operand modifier");
    return MatchOperand_ParseFail;
  }
  StringRef ModifierName = getParser().getTok().getIdentifier();
  RISCVMCExpr::VariantKind Variant =
      RISCVMCExpr::getVariantKindForName(ModifierName);
  if (Variant == RISCVMCExpr::VK_RISCV_Invalid) {
    Error(getLoc(), "unrecognized operand modifier");
    return MatchOperand_ParseFail;
  }
  getParser().Lex(); // Consume the modifier name.

  if (getLexer().isNot(AsmToken::LParen)) {
    Error(getLoc(), "expected '('");
    return MatchOperand_ParseFail;
  }
  getParser().Lex(); // Consume '('.

  // parseParenExpression also updates End.
  const MCExpr *Inner;
  if (getParser().parseParenExpression(Inner, End))
    return MatchOperand_ParseFail;

  const MCExpr *Wrapped = RISCVMCExpr::create(Inner, Variant, getContext());
  Operands.push_back(RISCVOperand::createImm(Wrapped, Start, End, isRV64()));
  return MatchOperand_Success;
}
/// Parses a bare symbol operand (an identifier without a modifier),
/// optionally followed by "+ expr" or "- expr", and appends it to
/// \p Operands as an immediate.
///
/// \returns MatchOperand_NoMatch when the current token is not an identifier
///          or names a variable whose value is not a symbol reference;
///          MatchOperand_ParseFail on a parse error or an "@plt" suffix;
///          MatchOperand_Success otherwise.
OperandMatchResultTy RISCVAsmParser::parseBareSymbol(OperandVector &Operands) {
SMLoc S = getLoc();
SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1);
const MCExpr *Res;
if (getLexer().getKind() != AsmToken::Identifier)
return MatchOperand_NoMatch;
StringRef Identifier;
// Keep a copy of the token so it can be pushed back further down.
AsmToken Tok = getLexer().getTok();
if (getParser().parseIdentifier(Identifier))
return MatchOperand_ParseFail;
// "@plt" is rejected here.
if (Identifier.consume_back("@plt")) {
Error(getLoc(), "'@plt' operand not valid for instruction");
return MatchOperand_ParseFail;
}
MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier);
if (Sym->isVariable()) {
// For a variable symbol, use its value, but only if that value is itself a
// symbol reference.
const MCExpr *V = Sym->getVariableValue(/*SetUsed=*/false);
if (!isa<MCSymbolRefExpr>(V)) {
getLexer().UnLex(Tok); // Put back if it's not a bare symbol.
return MatchOperand_NoMatch;
}
Res = V;
} else
Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
+
+ MCBinaryExpr::Opcode Opcode;
+ switch (getLexer().getKind()) {
+ default:
+ Operands.push_back(RISCVOperand::createImm(Res, S, E, isRV64()));
+ return MatchOperand_Success;
+ case AsmToken::Plus:
+ Opcode = MCBinaryExpr::Add;
+ break;
+ case AsmToken::Minus:
+ Opcode = MCBinaryExpr::Sub;
+ break;
+ }
+
+ const MCExpr *Expr;
+ if (getParser().parseExpression(Expr))
+ return MatchOperand_ParseFail;
+ Res = MCBinaryExpr::create(Opcode, Res, Expr, getContext());
Operands.push_back(RISCVOperand::createImm(Res, S, E, isRV64()));
return MatchOperand_Success;
}
/// Parses the symbol operand of a call and appends it to \p Operands wrapped
/// in a VK_RISCV_CALL expression (VK_RISCV_CALL_PLT when the name ends in
/// "@plt").
///
/// \returns MatchOperand_NoMatch unless the current token is an identifier
///          immediately followed by end-of-statement; MatchOperand_ParseFail
///          if the identifier cannot be parsed; MatchOperand_Success
///          otherwise.
OperandMatchResultTy RISCVAsmParser::parseCallSymbol(OperandVector &Operands) {
  SMLoc Start = getLoc();
  SMLoc End = SMLoc::getFromPointer(Start.getPointer() - 1);

  // The symbol must be the last thing in the statement; this keeps the
  // register in `call rd, foo` from being parsed as a call symbol.
  if (getLexer().getKind() != AsmToken::Identifier ||
      getLexer().peekTok().getKind() != AsmToken::EndOfStatement)
    return MatchOperand_NoMatch;

  StringRef SymName;
  if (getParser().parseIdentifier(SymName))
    return MatchOperand_ParseFail;

  // consume_back strips the suffix from SymName when it is present.
  const bool ViaPLT = SymName.consume_back("@plt");
  RISCVMCExpr::VariantKind CallKind =
      ViaPLT ? RISCVMCExpr::VK_RISCV_CALL_PLT : RISCVMCExpr::VK_RISCV_CALL;

  MCSymbol *Callee = getContext().getOrCreateSymbol(SymName);
  const MCExpr *SymRef =
      MCSymbolRefExpr::create(Callee, MCSymbolRefExpr::VK_None, getContext());
  const MCExpr *CallExpr = RISCVMCExpr::create(SymRef, CallKind, getContext());
  Operands.push_back(RISCVOperand::createImm(CallExpr, Start, End, isRV64()));
  return MatchOperand_Success;
}
/// Parses the offset operand of the single-operand `jal` form.
///
/// \returns MatchOperand_NoMatch when the operand is an identifier followed
///          by a comma; otherwise whatever parseImmediate() returns.
OperandMatchResultTy RISCVAsmParser::parseJALOffset(OperandVector &Operands) {
  // Both `jal foo` and `jal ra, foo` are accepted. While parsing the second
  // form this function is invoked for `ra` in an attempt to match the
  // single-operand alias, and it has to fail there. Parsing with
  // parseImmediate and then checking for a trailing comma is not an option
  // because the lexer state cannot be rewound, so the decision is made up
  // front by peeking: an identifier followed by a comma is never an offset.
  const bool IdentifierThenComma = getLexer().is(AsmToken::Identifier) &&
                                   getLexer().peekTok().is(AsmToken::Comma);
  if (IdentifierThenComma)
    return MatchOperand_NoMatch;

  return parseImmediate(Operands);
}
/// Parses a memory base register written as `(reg)` and appends the "(",
/// register and ")" operands to \p Operands.
///
/// \returns MatchOperand_Success on success; MatchOperand_ParseFail (after a
///          diagnostic) when '(', the register or ')' is missing.
OperandMatchResultTy
RISCVAsmParser::parseMemOpBaseReg(OperandVector &Operands) {
  if (!getLexer().is(AsmToken::LParen)) {
    Error(getLoc(), "expected '('");
    return MatchOperand_ParseFail;
  }
  getParser().Lex(); // Consume '('.
  Operands.push_back(RISCVOperand::createToken("(", getLoc(), isRV64()));

  const OperandMatchResultTy RegResult = parseRegister(Operands);
  if (RegResult != MatchOperand_Success) {
    Error(getLoc(), "expected register");
    return MatchOperand_ParseFail;
  }

  if (!getLexer().is(AsmToken::RParen)) {
    Error(getLoc(), "expected ')'");
    return MatchOperand_ParseFail;
  }
  getParser().Lex(); // Consume ')'.
  Operands.push_back(RISCVOperand::createToken(")", getLoc(), isRV64()));

  return MatchOperand_Success;
}
/// Parses one operand starting at the current token and appends it to
/// \p Operands. Returns false when an operand was parsed and true otherwise.
///
/// Parsers are tried in order: the mnemonic's custom operand parser, a
/// (possibly parenthesised) register, then an immediate optionally followed
/// by a `(reg)` memory base.
bool RISCVAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
  // A custom parser gets the first go; only "no match" falls through to the
  // generic handling below.
  switch (MatchOperandParserImpl(Operands, Mnemonic,
                                 /*ParseForAllFeatures=*/true)) {
  case MatchOperand_Success:
    return false;
  case MatchOperand_ParseFail:
    return true;
  default:
    break;
  }

  // Register?
  if (parseRegister(Operands, true) == MatchOperand_Success)
    return false;

  // Immediate? If not, every option is exhausted.
  if (parseImmediate(Operands) != MatchOperand_Success) {
    Error(getLoc(), "unknown operand");
    return true;
  }

  // An immediate may be followed by a memory base register.
  if (!getLexer().is(AsmToken::LParen))
    return false;
  return parseMemOpBaseReg(Operands) != MatchOperand_Success;
}
/// Parses one instruction statement into \p Operands.
///
/// \param Info     not used by this implementation.
/// \param Name     the instruction mnemonic; becomes the first operand.
/// \param NameLoc  source location of the mnemonic.
/// \param Operands receives the mnemonic token followed by the parsed
///                 operands.
/// \returns false on success; true when an operand fails to parse or
///          unexpected tokens follow the operand list.
bool RISCVAsmParser::ParseInstruction(ParseInstructionInfo &Info,
                                      StringRef Name, SMLoc NameLoc,
                                      OperandVector &Operands) {
  // Ensure that if the instruction occurs when relaxation is enabled,
  // relocations are forced for the file. Ideally this would be done when there
  // is enough information to reliably determine if the instruction itself may
  // cause relaxations. Unfortunately instruction processing stage occurs in the
  // same pass as relocation emission, so it's too late to set a 'sticky bit'
  // for the entire file.
  if (getSTI().getFeatureBits()[RISCV::FeatureRelax]) {
    auto *Assembler = getTargetStreamer().getStreamer().getAssemblerPtr();
    if (Assembler != nullptr) {
      RISCVAsmBackend &MAB =
          static_cast<RISCVAsmBackend &>(Assembler->getBackend());
      MAB.setForceRelocs();
    }
  }

  // First operand is token for instruction
  Operands.push_back(RISCVOperand::createToken(Name, NameLoc, isRV64()));

  // If there are no more operands, then finish
  if (getLexer().is(AsmToken::EndOfStatement))
    return false;

  // Parse first operand
  if (parseOperand(Operands, Name))
    return true;

  // Parse until end of statement, consuming commas between operands
  while (getLexer().is(AsmToken::Comma)) {
    // Consume comma token
    getLexer().Lex();

    // Parse next operand
    if (parseOperand(Operands, Name))
      return true;
  }

  // Anything left over is an error; skip it before reporting.
  if (getLexer().isNot(AsmToken::EndOfStatement)) {
    SMLoc Loc = getLexer().getLoc();
    getParser().eatToEndOfStatement();
    return Error(Loc, "unexpected token");
  }

  getParser().Lex(); // Consume the EndOfStatement.
  return false;
}
bool RISCVAsmParser::classifySymbolRef(const MCExpr *Expr,
RISCVMCExpr::VariantKind &Kind,
int64_t &Addend) {
Kind = RISCVMCExpr::VK_RISCV_None;
Addend = 0;
if (const RISCVMCExpr *RE = dyn_cast<RISCVMCExpr>(Expr)) {
Kind = RE->getKind();
Expr = RE->getSubExpr();
}
// It's a simple symbol reference or constant with no addend.
if (isa<MCConstantExpr>(Expr) || isa<MCSymbolRefExpr>(Expr))
return true;
const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(Expr);
if (!BE)
return false;
if (!isa<MCSymbolRefExpr>(BE->getLHS()))
return false;
if (BE->getOpcode() != MCBinaryExpr::Add &&
BE->getOpcode() != MCBinaryExpr::Sub)
return false;
// We are able to support the subtraction of two symbol references
if (BE->getOpcode() == MCBinaryExpr::Sub &&
isa<MCSymbolRefExpr>(BE->getRHS()))
return true;
// See if the addend is a constant, otherwise there's more going
// on here than we can deal with.
auto AddendExpr = dyn_cast<MCConstantExpr>(BE->getRHS());
if (!AddendExpr)
return false;
Addend = AddendExpr->getValue();
if (BE->getOpcode() == MCBinaryExpr::Sub)
Addend = -Addend;
// It's some symbol reference + a constant addend
return Kind != RISCVMCExpr::VK_RISCV_Invalid;
}
/// Handles target-specific directives; only ".option" is recognised here.
bool RISCVAsmParser::ParseDirective(AsmToken DirectiveID) {
  // Returning true hands an unrecognised directive back to the generic
  // parser. For ".option" the result of parseDirectiveOption() is returned
  // unchanged.
  if (DirectiveID.getString() != ".option")
    return true;

  return parseDirectiveOption();
}
/// Parses the argument of an ".option" directive and applies it.
///
/// Recognised options are push, pop, rvc, norvc, relax and norelax. For each
/// one the matching directive is emitted through the target streamer first,
/// then the option token is consumed, then trailing tokens are rejected, and
/// only after that is the parser's feature state updated.
///
/// \returns false when the option was handled or when an unknown option was
///          skipped with a warning; otherwise the result of Error() for the
///          diagnostic emitted.
bool RISCVAsmParser::parseDirectiveOption() {
MCAsmParser &Parser = getParser();
// Get the option token.
AsmToken Tok = Parser.getTok();
// At the moment only identifiers are supported.
if (Tok.isNot(AsmToken::Identifier))
return Error(Parser.getTok().getLoc(),
"unexpected token, expected identifier");
StringRef Option = Tok.getIdentifier();
// push: save the current feature bits.
if (Option == "push") {
getTargetStreamer().emitDirectiveOptionPush();
Parser.Lex();
if (Parser.getTok().isNot(AsmToken::EndOfStatement))
return Error(Parser.getTok().getLoc(),
"unexpected token, expected end of statement");
pushFeatureBits();
return false;
}
// pop: restore previously pushed feature bits; a true result from
// popFeatureBits() is reported as a pop without a matching push.
if (Option == "pop") {
SMLoc StartLoc = Parser.getTok().getLoc();
getTargetStreamer().emitDirectiveOptionPop();
Parser.Lex();
if (Parser.getTok().isNot(AsmToken::EndOfStatement))
return Error(Parser.getTok().getLoc(),
"unexpected token, expected end of statement");
if (popFeatureBits())
return Error(StartLoc, ".option pop with no .option push");
return false;
}
// rvc / norvc: set or clear the "c" feature.
if (Option == "rvc") {
getTargetStreamer().emitDirectiveOptionRVC();
Parser.Lex();
if (Parser.getTok().isNot(AsmToken::EndOfStatement))
return Error(Parser.getTok().getLoc(),
"unexpected token, expected end of statement");
setFeatureBits(RISCV::FeatureStdExtC, "c");
return false;
}
if (Option == "norvc") {
getTargetStreamer().emitDirectiveOptionNoRVC();
Parser.Lex();
if (Parser.getTok().isNot(AsmToken::EndOfStatement))
return Error(Parser.getTok().getLoc(),
"unexpected token, expected end of statement");
clearFeatureBits(RISCV::FeatureStdExtC, "c");
return false;
}
// relax / norelax: set or clear the "relax" feature.
if (Option == "relax") {
getTargetStreamer().emitDirectiveOptionRelax();
Parser.Lex();
if (Parser.getTok().isNot(AsmToken::EndOfStatement))
return Error(Parser.getTok().getLoc(),
"unexpected token, expected end of statement");
setFeatureBits(RISCV::FeatureRelax, "relax");
return false;
}
if (Option == "norelax") {
getTargetStreamer().emitDirectiveOptionNoRelax();
Parser.Lex();
if (Parser.getTok().isNot(AsmToken::EndOfStatement))
return Error(Parser.getTok().getLoc(),
"unexpected token, expected end of statement");
clearFeatureBits(RISCV::FeatureRelax, "relax");
return false;
}
// Unknown option.
// This is only a warning: the rest of the statement is skipped and the
// directive is still reported as handled.
Warning(Parser.getTok().getLoc(),
"unknown option, expected 'push', 'pop', 'rvc', 'norvc', 'relax' or "
"'norelax'");
Parser.eatToEndOfStatement();
return false;
}
/// Emits \p Inst to \p S, substituting the instruction produced by
/// compressInst() whenever compressInst() reports success.
void RISCVAsmParser::emitToStreamer(MCStreamer &S, const MCInst &Inst) {
  MCInst Compressed;
  const bool DidCompress =
      compressInst(Compressed, Inst, getSTI(), S.getContext());
  // The replacement inherits the original's source location.
  Compressed.setLoc(Inst.getLoc());

  if (DidCompress)
    S.EmitInstruction(Compressed, getSTI());
  else
    S.EmitInstruction(Inst, getSTI());
}
void RISCVAsmParser::emitLoadImm(unsigned DestReg, int64_t Value,
MCStreamer &Out) {
RISCVMatInt::InstSeq Seq;
RISCVMatInt::generateInstSeq(Value, isRV64(), Seq);
unsigned SrcReg = RISCV::X0;
for (RISCVMatInt::Inst &Inst : Seq) {
if (Inst.Opc == RISCV::LUI) {
emitToStreamer(
Out, MCInstBuilder(RISCV::LUI).addReg(DestReg).addImm(Inst.Imm));
} else {
emitToStreamer(
Out, MCInstBuilder(Inst.Opc).addReg(DestReg).addReg(SrcReg).addImm(
Inst.Imm));
}
// Only the first instruction has X0 as its source.
SrcReg = DestReg;
}
}
/// Emits a labelled AUIPC followed by a second instruction that refers back
/// to the label through %pcrel_lo:
///   TmpLabel: AUIPC TmpReg, VKHi(symbol)
///             OP DestReg, TmpReg, %pcrel_lo(TmpLabel)
///
/// \param DestReg      destination operand of the second instruction.
/// \param TmpReg       register written by AUIPC and read by the second
///                     instruction.
/// \param Symbol       expression wrapped with \p VKHi for the AUIPC.
/// \param VKHi         variant kind for the AUIPC operand.
/// \param SecondOpcode opcode of the second instruction.
/// \param IDLoc        not used by this implementation.
/// \param Out          streamer that receives the label and instructions.
void RISCVAsmParser::emitAuipcInstPair(MCOperand DestReg, MCOperand TmpReg,
                                       const MCExpr *Symbol,
                                       RISCVMCExpr::VariantKind VKHi,
                                       unsigned SecondOpcode, SMLoc IDLoc,
                                       MCStreamer &Out) {
  MCContext &Ctx = getContext();

  // Label the AUIPC so that the low part can reference it.
  MCSymbol *HiLabel = Ctx.createTempSymbol(
      "pcrel_hi", /* AlwaysAddSuffix */ true, /* CanBeUnnamed */ false);
  Out.EmitLabel(HiLabel);

  const RISCVMCExpr *HiExpr = RISCVMCExpr::create(Symbol, VKHi, Ctx);
  MCInstBuilder Auipc(RISCV::AUIPC);
  Auipc.addOperand(TmpReg).addExpr(HiExpr);
  emitToStreamer(Out, Auipc);

  const MCExpr *LabelRef = MCSymbolRefExpr::create(HiLabel, Ctx);
  const MCExpr *LoExpr =
      RISCVMCExpr::create(LabelRef, RISCVMCExpr::VK_RISCV_PCREL_LO, Ctx);
  MCInstBuilder Second(SecondOpcode);
  Second.addOperand(DestReg).addOperand(TmpReg).addExpr(LoExpr);
  emitToStreamer(Out, Second);
}
/// Expands the "lla" (load local address) pseudo-instruction, used for
/// PC-relative addressing of local symbols.
///
///   lla rdest, symbol
/// becomes
///   TmpLabel: AUIPC rdest, %pcrel_hi(symbol)
///             ADDI rdest, rdest, %pcrel_lo(TmpLabel)
void RISCVAsmParser::emitLoadLocalAddress(MCInst &Inst, SMLoc IDLoc,
                                          MCStreamer &Out) {
  // Operand 0 is the destination register, operand 1 the symbol expression.
  MCOperand Dest = Inst.getOperand(0);
  const MCExpr *Target = Inst.getOperand(1).getExpr();

  // The destination doubles as the temporary register.
  emitAuipcInstPair(Dest, Dest, Target, RISCVMCExpr::VK_RISCV_PCREL_HI,
                    RISCV::ADDI, IDLoc, Out);
}
/// Expands the "la" (load address) pseudo-instruction, used for PC-relative
/// and GOT-indirect addressing of global symbols.
///
///   la rdest, symbol
/// becomes, for non-PIC,
///   TmpLabel: AUIPC rdest, %pcrel_hi(symbol)
///             ADDI rdest, rdest, %pcrel_lo(TmpLabel)
/// or, for PIC,
///   TmpLabel: AUIPC rdest, %got_pcrel_hi(symbol)
///             Lx rdest, %pcrel_lo(TmpLabel)(rdest)
void RISCVAsmParser::emitLoadAddress(MCInst &Inst, SMLoc IDLoc,
                                     MCStreamer &Out) {
  MCOperand Dest = Inst.getOperand(0);
  const MCExpr *Target = Inst.getOperand(1).getExpr();

  // FIXME: Should check .option (no)pic when implemented
  const bool IsPIC = getContext().getObjectFileInfo()->isPositionIndependent();

  unsigned LowOpcode;
  RISCVMCExpr::VariantKind HiKind;
  if (!IsPIC) {
    LowOpcode = RISCV::ADDI;
    HiKind = RISCVMCExpr::VK_RISCV_PCREL_HI;
  } else {
    // The load width follows XLEN.
    LowOpcode = isRV64() ? RISCV::LD : RISCV::LW;
    HiKind = RISCVMCExpr::VK_RISCV_GOT_HI;
  }

  emitAuipcInstPair(Dest, Dest, Target, HiKind, LowOpcode, IDLoc, Out);
}
/// Expands the "la.tls.ie" pseudo-instruction, used for initial-exec TLS
/// model addressing of global symbols.
///
///   la.tls.ie rdest, symbol
/// becomes
///   TmpLabel: AUIPC rdest, %tls_ie_pcrel_hi(symbol)
///             Lx rdest, %pcrel_lo(TmpLabel)(rdest)
void RISCVAsmParser::emitLoadTLSIEAddress(MCInst &Inst, SMLoc IDLoc,
                                          MCStreamer &Out) {
  MCOperand Dest = Inst.getOperand(0);
  const MCExpr *Target = Inst.getOperand(1).getExpr();

  // The load width follows XLEN.
  unsigned LoadOpcode = RISCV::LW;
  if (isRV64())
    LoadOpcode = RISCV::LD;

  emitAuipcInstPair(Dest, Dest, Target, RISCVMCExpr::VK_RISCV_TLS_GOT_HI,
                    LoadOpcode, IDLoc, Out);
}
/// Expands the "la.tls.gd" pseudo-instruction, used for global-dynamic TLS
/// model addressing of global symbols.
///
///   la.tls.gd rdest, symbol
/// becomes
///   TmpLabel: AUIPC rdest, %tls_gd_pcrel_hi(symbol)
///             ADDI rdest, rdest, %pcrel_lo(TmpLabel)
void RISCVAsmParser::emitLoadTLSGDAddress(MCInst &Inst, SMLoc IDLoc,
                                          MCStreamer &Out) {
  // Operand 0 is the destination register, operand 1 the symbol expression.
  MCOperand Dest = Inst.getOperand(0);
  const MCExpr *Target = Inst.getOperand(1).getExpr();

  emitAuipcInstPair(Dest, Dest, Target, RISCVMCExpr::VK_RISCV_TLS_GD_HI,
                    RISCV::ADDI, IDLoc, Out);
}
/// Expands a load/store pseudo-instruction that addresses a symbol
/// PC-relatively:
///   TmpLabel: AUIPC tmp, %pcrel_hi(symbol)
///             [S|L]X rd, %pcrel_lo(TmpLabel)(tmp)
///
/// \param Inst      the pseudo-instruction being expanded.
/// \param Opcode    opcode of the real load/store to emit.
/// \param HasTmpReg when true the operands are (rd, tmp, symbol); when false
///                  they are (rd, symbol) and rd is also used as tmp.
void RISCVAsmParser::emitLoadStoreSymbol(MCInst &Inst, unsigned Opcode,
                                         SMLoc IDLoc, MCStreamer &Out,
                                         bool HasTmpReg) {
  // The symbol always follows the temporary register operand.
  const unsigned TmpIdx = HasTmpReg ? 1 : 0;
  const unsigned SymIdx = TmpIdx + 1;

  MCOperand Dest = Inst.getOperand(0);
  MCOperand Tmp = Inst.getOperand(TmpIdx);
  const MCExpr *Target = Inst.getOperand(SymIdx).getExpr();

  emitAuipcInstPair(Dest, Tmp, Target, RISCVMCExpr::VK_RISCV_PCREL_HI, Opcode,
                    IDLoc, Out);
}
/// Checks that the register in operand 2 of a PseudoAddTPRel is tp (x4).
///
/// \returns false when the operand is x4; otherwise the result of Error()
///          for a diagnostic placed at the start of parsed operand 3.
bool RISCVAsmParser::checkPseudoAddTPRel(MCInst &Inst,
                                         OperandVector &Operands) {
  assert(Inst.getOpcode() == RISCV::PseudoAddTPRel && "Invalid instruction");
  assert(Inst.getOperand(2).isReg() && "Unexpected second operand kind");

  if (Inst.getOperand(2).getReg() == RISCV::X4)
    return false;

  auto &Offender = static_cast<RISCVOperand &>(*Operands[3]);
  return Error(Offender.getStartLoc(),
               "the second input operand must be tp/x4 when using "
               "%tprel_add modifier");
}
/// Post-processes a successfully matched instruction: pseudo-instructions
/// handled here are expanded into real instructions, everything else is
/// emitted as-is via emitToStreamer().
///
/// \param Inst     the matched instruction; its location is set to \p IDLoc.
/// \param IDLoc    location of the mnemonic.
/// \param Operands the parsed operands, used for diagnostics.
/// \param Out      streamer that receives the emitted instruction(s).
/// \returns true only when checkPseudoAddTPRel() reports an error; false
///          otherwise.
bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
OperandVector &Operands,
MCStreamer &Out) {
Inst.setLoc(IDLoc);
switch (Inst.getOpcode()) {
default:
break;
case RISCV::PseudoLI: {
unsigned Reg = Inst.getOperand(0).getReg();
const MCOperand &Op1 = Inst.getOperand(1);
if (Op1.isExpr()) {
// We must have li reg, %lo(sym) or li reg, %pcrel_lo(sym) or similar.
// Just convert to an addi. This allows compatibility with gas.
emitToStreamer(Out, MCInstBuilder(RISCV::ADDI)
.addReg(Reg)
.addReg(RISCV::X0)
.addExpr(Op1.getExpr()));
return false;
}
int64_t Imm = Inst.getOperand(1).getImm();
// On RV32 the immediate here can either be a signed or an unsigned
// 32-bit number. Sign extension has to be performed to ensure that Imm
// represents the expected signed 64-bit number.
if (!isRV64())
Imm = SignExtend64<32>(Imm);
emitLoadImm(Reg, Imm, Out);
return false;
}
// Address-materialising pseudos.
case RISCV::PseudoLLA:
emitLoadLocalAddress(Inst, IDLoc, Out);
return false;
case RISCV::PseudoLA:
emitLoadAddress(Inst, IDLoc, Out);
return false;
case RISCV::PseudoLA_TLS_IE:
emitLoadTLSIEAddress(Inst, IDLoc, Out);
return false;
case RISCV::PseudoLA_TLS_GD:
emitLoadTLSGDAddress(Inst, IDLoc, Out);
return false;
// Symbol loads without a separate temporary register operand.
case RISCV::PseudoLB:
emitLoadStoreSymbol(Inst, RISCV::LB, IDLoc, Out, /*HasTmpReg=*/false);
return false;
case RISCV::PseudoLBU:
emitLoadStoreSymbol(Inst, RISCV::LBU, IDLoc, Out, /*HasTmpReg=*/false);
return false;
case RISCV::PseudoLH:
emitLoadStoreSymbol(Inst, RISCV::LH, IDLoc, Out, /*HasTmpReg=*/false);
return false;
case RISCV::PseudoLHU:
emitLoadStoreSymbol(Inst, RISCV::LHU, IDLoc, Out, /*HasTmpReg=*/false);
return false;
case RISCV::PseudoLW:
emitLoadStoreSymbol(Inst, RISCV::LW, IDLoc, Out, /*HasTmpReg=*/false);
return false;
case RISCV::PseudoLWU:
emitLoadStoreSymbol(Inst, RISCV::LWU, IDLoc, Out, /*HasTmpReg=*/false);
return false;
case RISCV::PseudoLD:
emitLoadStoreSymbol(Inst, RISCV::LD, IDLoc, Out, /*HasTmpReg=*/false);
return false;
// FP loads and all stores carry an explicit temporary register operand.
case RISCV::PseudoFLW:
emitLoadStoreSymbol(Inst, RISCV::FLW, IDLoc, Out, /*HasTmpReg=*/true);
return false;
case RISCV::PseudoFLD:
emitLoadStoreSymbol(Inst, RISCV::FLD, IDLoc, Out, /*HasTmpReg=*/true);
return false;
case RISCV::PseudoSB:
emitLoadStoreSymbol(Inst, RISCV::SB, IDLoc, Out, /*HasTmpReg=*/true);
return false;
case RISCV::PseudoSH:
emitLoadStoreSymbol(Inst, RISCV::SH, IDLoc, Out, /*HasTmpReg=*/true);
return false;
case RISCV::PseudoSW:
emitLoadStoreSymbol(Inst, RISCV::SW, IDLoc, Out, /*HasTmpReg=*/true);
return false;
case RISCV::PseudoSD:
emitLoadStoreSymbol(Inst, RISCV::SD, IDLoc, Out, /*HasTmpReg=*/true);
return false;
case RISCV::PseudoFSW:
emitLoadStoreSymbol(Inst, RISCV::FSW, IDLoc, Out, /*HasTmpReg=*/true);
return false;
case RISCV::PseudoFSD:
emitLoadStoreSymbol(Inst, RISCV::FSD, IDLoc, Out, /*HasTmpReg=*/true);
return false;
// Validated only; on success it is emitted by the common path below.
case RISCV::PseudoAddTPRel:
if (checkPseudoAddTPRel(Inst, Operands))
return true;
break;
}
// Anything not expanded above is emitted unchanged.
emitToStreamer(Out, Inst);
return false;
}
extern "C" void LLVMInitializeRISCVAsmParser() {
RegisterMCAsmParser<RISCVAsmParser> X(getTheRISCV32Target());
RegisterMCAsmParser<RISCVAsmParser> Y(getTheRISCV64Target());
}
Index: vendor/llvm/dist-release_90/lib/Target/RISCV/RISCVFrameLowering.cpp
===================================================================
--- vendor/llvm/dist-release_90/lib/Target/RISCV/RISCVFrameLowering.cpp (revision 351302)
+++ vendor/llvm/dist-release_90/lib/Target/RISCV/RISCVFrameLowering.cpp (revision 351303)
@@ -1,364 +1,408 @@
//===-- RISCVFrameLowering.cpp - RISCV Frame Information ------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the RISCV implementation of TargetFrameLowering class.
//
//===----------------------------------------------------------------------===//
#include "RISCVFrameLowering.h"
#include "RISCVMachineFunctionInfo.h"
#include "RISCVSubtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/MC/MCDwarf.h"
using namespace llvm;
bool RISCVFrameLowering::hasFP(const MachineFunction &MF) const {
const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
const MachineFrameInfo &MFI = MF.getFrameInfo();
return MF.getTarget().Options.DisableFramePointerElim(MF) ||
RegInfo->needsStackRealignment(MF) || MFI.hasVarSizedObjects() ||
MFI.isFrameAddressTaken();
}
// Determines the size of the frame and maximum call frame size.
// Both values are read from MF's MachineFrameInfo, aligned to the stack
// alignment chosen below, and written back to it.
void RISCVFrameLowering::determineFrameLayout(MachineFunction &MF) const {
MachineFrameInfo &MFI = MF.getFrameInfo();
const RISCVRegisterInfo *RI = STI.getRegisterInfo();
// Get the number of bytes to allocate from the FrameInfo.
uint64_t FrameSize = MFI.getStackSize();
// Get the alignment.
// When realignment is needed, the larger of the default stack alignment and
// the frame's maximum alignment is used, and the frame grows by the
// difference between the two.
- uint64_t StackAlign = RI->needsStackRealignment(MF) ? MFI.getMaxAlignment()
- : getStackAlignment();
+ unsigned StackAlign = getStackAlignment();
+ if (RI->needsStackRealignment(MF)) {
+ unsigned MaxStackAlign = std::max(StackAlign, MFI.getMaxAlignment());
+ FrameSize += (MaxStackAlign - StackAlign);
+ StackAlign = MaxStackAlign;
+ }
+ // Set Max Call Frame Size
+ uint64_t MaxCallSize = alignTo(MFI.getMaxCallFrameSize(), StackAlign);
+ MFI.setMaxCallFrameSize(MaxCallSize);
+
// Make sure the frame is aligned.
FrameSize = alignTo(FrameSize, StackAlign);
// Update frame info.
MFI.setStackSize(FrameSize);
}
// Emits code before MBBI that sets DestReg to SrcReg + Val, tagging every
// instruction built here with Flag.
//  - Nothing is emitted when the two registers match and Val is zero.
//  - A Val that fits a signed 12-bit field becomes a single ADDI.
//  - A Val that fits in 32 bits is materialised with movImm32 into a fresh
//    virtual GPR and combined with ADD, or with SUB on the negated value when
//    Val is negative.
//  - Anything wider is a fatal error.
void RISCVFrameLowering::adjustReg(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MBBI,
                                   const DebugLoc &DL, unsigned DestReg,
                                   unsigned SrcReg, int64_t Val,
                                   MachineInstr::MIFlag Flag) const {
  MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
  const RISCVInstrInfo *InstrInfo = STI.getInstrInfo();

  // Identity adjustment: no code required.
  if (Val == 0 && DestReg == SrcReg)
    return;

  // Small immediates fold directly into ADDI.
  if (isInt<12>(Val)) {
    BuildMI(MBB, MBBI, DL, InstrInfo->get(RISCV::ADDI), DestReg)
        .addReg(SrcReg)
        .addImm(Val)
        .setMIFlag(Flag);
    return;
  }

  if (!isInt<32>(Val))
    report_fatal_error("adjustReg cannot yet handle adjustments >32 bits");

  // Work with the magnitude and pick the matching register-register opcode.
  const bool Negative = Val < 0;
  const unsigned Opc = Negative ? RISCV::SUB : RISCV::ADD;
  if (Negative)
    Val = -Val;

  unsigned Scratch = RegInfo.createVirtualRegister(&RISCV::GPRRegClass);
  InstrInfo->movImm32(MBB, MBBI, DL, Scratch, Val, Flag);
  BuildMI(MBB, MBBI, DL, InstrInfo->get(Opc), DestReg)
      .addReg(SrcReg)
      .addReg(Scratch, RegState::Kill)
      .setMIFlag(Flag);
}
// Frame pointer register. Always X8; the subtarget argument is not consulted.
static unsigned getFPReg(const RISCVSubtarget &STI) {
  return RISCV::X8;
}
// Stack pointer register. Always X2; the subtarget argument is not consulted.
static unsigned getSPReg(const RISCVSubtarget &STI) {
  return RISCV::X2;
}
// Emits the function prologue at the start of MBB, which must be the first
// block of MF (asserted below).
//
// Steps, in order:
//  1. (added lines) Abort if the function both needs stack realignment and has
//     variable sized objects.
//  2. Recompute the frame layout and read back the final stack size.
//  3. Return early when the size is zero and the function does not adjust the
//     stack.
//  4. Subtract the stack size from SP and emit the matching CFA offset
//     directive.
//  5. Step over one instruction per callee-saved register, then emit a
//     .cfi_offset directive for each of them.
//  6. If a frame pointer is in use, set FP to SP + StackSize minus the varargs
//     save area size, emit the CFA definition for FP, and (added lines)
//     realign SP when required.
void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
MachineFrameInfo &MFI = MF.getFrameInfo();
auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
const RISCVRegisterInfo *RI = STI.getRegisterInfo();
const RISCVInstrInfo *TII = STI.getInstrInfo();
MachineBasicBlock::iterator MBBI = MBB.begin();
+ if (RI->needsStackRealignment(MF) && MFI.hasVarSizedObjects()) {
+ report_fatal_error(
+ "RISC-V backend can't currently handle functions that need stack "
+ "realignment and have variable sized objects");
+ }
+
unsigned FPReg = getFPReg(STI);
unsigned SPReg = getSPReg(STI);
// Debug location must be unknown since the first debug location is used
// to determine the end of the prologue.
DebugLoc DL;
// Determine the correct frame layout
determineFrameLayout(MF);
// FIXME (note copied from Lanai): This appears to be overallocating. Needs
// investigation. Get the number of bytes to allocate from the FrameInfo.
uint64_t StackSize = MFI.getStackSize();
// Early exit if there is no need to allocate on the stack
if (StackSize == 0 && !MFI.adjustsStack())
return;
// Allocate space on the stack if necessary.
adjustReg(MBB, MBBI, DL, SPReg, SPReg, -StackSize, MachineInstr::FrameSetup);
// Emit ".cfi_def_cfa_offset StackSize"
unsigned CFIIndex = MF.addFrameInst(
MCCFIInstruction::createDefCfaOffset(nullptr, -StackSize));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
// The frame pointer is callee-saved, and code has been generated for us to
// save it to the stack. We need to skip over the storing of callee-saved
// registers as the frame pointer must be modified after it has been saved
// to the stack, not before.
// FIXME: assumes exactly one instruction is used to save each callee-saved
// register.
const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
std::advance(MBBI, CSI.size());
// Iterate over list of callee-saved registers and emit .cfi_offset
// directives.
for (const auto &Entry : CSI) {
int64_t Offset = MFI.getObjectOffset(Entry.getFrameIdx());
unsigned Reg = Entry.getReg();
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
nullptr, RI->getDwarfRegNum(Reg, true), Offset));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
}
// Generate new FP.
if (hasFP(MF)) {
adjustReg(MBB, MBBI, DL, FPReg, SPReg,
StackSize - RVFI->getVarArgsSaveSize(), MachineInstr::FrameSetup);
// Emit ".cfi_def_cfa $fp, 0"
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa(
nullptr, RI->getDwarfRegNum(FPReg, true), 0));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
+
+ // Realign Stack
+ const RISCVRegisterInfo *RI = STI.getRegisterInfo();
+ if (RI->needsStackRealignment(MF)) {
+ unsigned MaxAlignment = MFI.getMaxAlignment();
+
+ const RISCVInstrInfo *TII = STI.getInstrInfo();
+ if (isInt<12>(-(int)MaxAlignment)) {
+ BuildMI(MBB, MBBI, DL, TII->get(RISCV::ANDI), SPReg)
+ .addReg(SPReg)
+ .addImm(-(int)MaxAlignment);
+ } else {
+ unsigned ShiftAmount = countTrailingZeros(MaxAlignment);
+ unsigned VR =
+ MF.getRegInfo().createVirtualRegister(&RISCV::GPRRegClass);
+ BuildMI(MBB, MBBI, DL, TII->get(RISCV::SRLI), VR)
+ .addReg(SPReg)
+ .addImm(ShiftAmount);
+ BuildMI(MBB, MBBI, DL, TII->get(RISCV::SLLI), SPReg)
+ .addReg(VR)
+ .addImm(ShiftAmount);
+ }
+ }
}
}
// Emits the function epilogue in MBB, working backwards from the block's last
// non-debug instruction.
//
// Steps, in order:
//  1. Locate the point just before the callee-saved register restores.
//  2. If SP was changed after the prologue (stack realignment or variable
//     sized objects), recompute SP from FP at that point.
//  3. If a frame pointer is in use, find the load that restores FP and emit a
//     CFA definition relative to SP immediately after it.
//  4. Emit a .cfi_restore directive for every callee-saved register.
//  5. Add the stack size back to SP and emit ".cfi_def_cfa_offset 0".
void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
const RISCVRegisterInfo *RI = STI.getRegisterInfo();
MachineFrameInfo &MFI = MF.getFrameInfo();
auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
DebugLoc DL = MBBI->getDebugLoc();
const RISCVInstrInfo *TII = STI.getInstrInfo();
unsigned FPReg = getFPReg(STI);
unsigned SPReg = getSPReg(STI);
// Skip to before the restores of callee-saved registers
// FIXME: assumes exactly one instruction is used to restore each
// callee-saved register.
auto LastFrameDestroy = std::prev(MBBI, MFI.getCalleeSavedInfo().size());
uint64_t StackSize = MFI.getStackSize();
// Distance from SP to FP, mirroring the value the prologue adds to SP when
// it sets up FP.
uint64_t FPOffset = StackSize - RVFI->getVarArgsSaveSize();
// Restore the stack pointer using the value of the frame pointer. Only
// necessary if the stack pointer was modified, meaning the stack size is
// unknown.
if (RI->needsStackRealignment(MF) || MFI.hasVarSizedObjects()) {
assert(hasFP(MF) && "frame pointer should not have been eliminated");
adjustReg(MBB, LastFrameDestroy, DL, SPReg, FPReg, -FPOffset,
MachineInstr::FrameDestroy);
}
if (hasFP(MF)) {
// To find the instruction restoring FP from stack.
// Note: I is a reference to LastFrameDestroy, so this loop advances
// LastFrameDestroy itself; it is not read again after the loop.
for (auto &I = LastFrameDestroy; I != MBBI; ++I) {
if (I->mayLoad() && I->getOperand(0).isReg()) {
unsigned DestReg = I->getOperand(0).getReg();
if (DestReg == FPReg) {
// If there is frame pointer, after restoring $fp registers, we
// need adjust CFA to ($sp - FPOffset).
// Emit ".cfi_def_cfa $sp, -FPOffset"
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa(
nullptr, RI->getDwarfRegNum(SPReg, true), -FPOffset));
BuildMI(MBB, std::next(I), DL,
TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
break;
}
}
}
}
// Add CFI directives for callee-saved registers.
const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
// Iterate over list of callee-saved registers and emit .cfi_restore
// directives.
for (const auto &Entry : CSI) {
unsigned Reg = Entry.getReg();
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createRestore(
nullptr, RI->getDwarfRegNum(Reg, true)));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
}
// Deallocate stack
adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackSize, MachineInstr::FrameDestroy);
// After restoring $sp, we need to adjust CFA to $(sp + 0)
// Emit ".cfi_def_cfa_offset 0"
unsigned CFIIndex =
MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 0));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
}
// Computes how to address frame index FI: stores the base register in
// FrameReg and returns the offset from that register.
//
// The starting offset is the object's offset minus the local area offset plus
// the frame's offset adjustment. Then:
//  - callee-saved slots (FI between the first and last entries of the
//    callee-saved list) are addressed from X2 with the stack size added;
//  - (added lines) when the stack needs realignment, every other object is
//    addressed from X2 with the stack size added as well;
//  - otherwise the register returned by getFrameRegister is used, adding the
//    varargs save area size when a frame pointer exists and the stack size
//    when it does not.
int RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF,
int FI,
unsigned &FrameReg) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
const auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
// Callee-saved registers should be referenced relative to the stack
// pointer (positive offset), otherwise use the frame pointer (negative
// offset).
const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
// Defaults describe an empty range (Min > Max), so no FI matches when there
// are no callee-saved registers.
int MinCSFI = 0;
int MaxCSFI = -1;
int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea() +
MFI.getOffsetAdjustment();
if (CSI.size()) {
MinCSFI = CSI[0].getFrameIdx();
MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
}
if (FI >= MinCSFI && FI <= MaxCSFI) {
+ FrameReg = RISCV::X2;
+ Offset += MF.getFrameInfo().getStackSize();
+ } else if (RI->needsStackRealignment(MF)) {
+ assert(!MFI.hasVarSizedObjects() &&
+ "Unexpected combination of stack realignment and varsized objects");
+ // If the stack was realigned, the frame pointer is set in order to allow
+ // SP to be restored, but we still access stack objects using SP.
FrameReg = RISCV::X2;
Offset += MF.getFrameInfo().getStackSize();
} else {
FrameReg = RI->getFrameRegister(MF);
if (hasFP(MF))
Offset += RVFI->getVarArgsSaveSize();
else
Offset += MF.getFrameInfo().getStackSize();
}
return Offset;
}
// Extends the default callee-saved set computed by the base class.
//  - RA (X1) and FP (X8) are always added when the function uses a frame
//    pointer.
//  - For a function carrying the "interrupt" attribute that also contains
//    calls, RA, t0-t6 and a0-a7 are added regardless of use. If the F or D
//    extension is present, every FPR32/FPR64 register found in the function's
//    callee-saved register list is added too.
void RISCVFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                              BitVector &SavedRegs,
                                              RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);

  if (hasFP(MF)) {
    SavedRegs.set(RISCV::X1);
    SavedRegs.set(RISCV::X8);
  }

  // Everything below only concerns interrupt handlers that make calls.
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  if (!MF.getFunction().hasFnAttribute("interrupt") || !FrameInfo.hasCalls())
    return;

  // Zero-terminated list of integer registers that are saved unconditionally.
  static const MCPhysReg ForcedGPRs[] = {
      RISCV::X1,                                      /* ra */
      RISCV::X5,  RISCV::X6,  RISCV::X7,              /* t0-t2 */
      RISCV::X10, RISCV::X11,                         /* a0-a1 */
      RISCV::X12, RISCV::X13, RISCV::X14,             /* a2-a7 */
      RISCV::X15, RISCV::X16, RISCV::X17,
      RISCV::X28, RISCV::X29, RISCV::X30, RISCV::X31, /* t3-t6 */
      0};
  for (const MCPhysReg *R = ForcedGPRs; *R; ++R)
    SavedRegs.set(*R);

  if (!MF.getSubtarget<RISCVSubtarget>().hasStdExtD() &&
      !MF.getSubtarget<RISCVSubtarget>().hasStdExtF())
    return;

  // If interrupt is enabled, this list contains all FP registers.
  for (const MCPhysReg *R = MF.getRegInfo().getCalleeSavedRegs(); *R; ++R) {
    const bool IsFPReg = RISCV::FPR32RegClass.contains(*R) ||
                         RISCV::FPR64RegClass.contains(*R);
    if (IsFPReg)
      SavedRegs.set(*R);
  }
}
// Reserves an emergency spill slot for the register scavenger when the
// estimated stack size does not fit a signed 11-bit field. The slot is sized
// and aligned for a GPR spill and registered with RS.
void RISCVFrameLowering::processFunctionBeforeFrameFinalized(
    MachineFunction &MF, RegScavenger *RS) const {
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  // estimateStackSize has been observed to under-estimate the final stack
  // size, so leave some wiggle-room by testing whether the estimate is
  // representable in an 11-bit signed field rather than a 12-bit one.
  // FIXME: It may be possible to craft a function with a small stack that
  // still needs an emergency spill slot for branch relaxation. This case
  // would currently be missed.
  if (isInt<11>(FrameInfo.estimateStackSize(MF)))
    return;

  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  const TargetRegisterClass *GPR = &RISCV::GPRRegClass;
  const int ScavengeSlot = FrameInfo.CreateStackObject(
      TRI->getSpillSize(*GPR), TRI->getSpillAlignment(*GPR), false);
  RS->addScavengingFrameIndex(ScavengeSlot);
}
// Not preserve stack space within prologue for outgoing variables when the
// function contains variable size objects and let eliminateCallFramePseudoInstr
// preserve stack space for it.
bool RISCVFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
return !MF.getFrameInfo().hasVarSizedObjects();
}
// Removes an ADJCALLSTACKDOWN / ADJCALLSTACKUP pseudo instruction and returns
// the iterator produced by erasing it. When the call frame is not reserved in
// the prologue and the pseudo carries a non-zero amount, an explicit stack
// pointer adjustment is emitted in its place first.
MachineBasicBlock::iterator RISCVFrameLowering::eliminateCallFramePseudoInstr(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MI) const {
  unsigned StackPtr = RISCV::X2;
  DebugLoc Loc = MI->getDebugLoc();

  // With a reserved call frame the pseudo carries no work; just drop it.
  if (hasReservedCallFrame(MF))
    return MBB.erase(MI);

  // Space has not been reserved for the call frame (this happens with
  // variable length stack allocations such as alloca, where the prologue
  // cannot allocate room for outgoing arguments), so the pseudo must become a
  // real stack pointer adjustment.
  int64_t Delta = MI->getOperand(0).getImm();
  if (Delta != 0) {
    // Ensure the stack remains aligned after adjustment.
    Delta = alignSPAdjust(Delta);
    const bool GrowsStack = MI->getOpcode() == RISCV::ADJCALLSTACKDOWN;
    adjustReg(MBB, MI, Loc, StackPtr, StackPtr, GrowsStack ? -Delta : Delta,
              MachineInstr::NoFlags);
  }
  return MBB.erase(MI);
}
Index: vendor/llvm/dist-release_90/lib/Target/RISCV/RISCVISelLowering.cpp
===================================================================
--- vendor/llvm/dist-release_90/lib/Target/RISCV/RISCVISelLowering.cpp (revision 351302)
+++ vendor/llvm/dist-release_90/lib/Target/RISCV/RISCVISelLowering.cpp (revision 351303)
@@ -1,2621 +1,2648 @@
//===-- RISCVISelLowering.cpp - RISCV DAG Lowering Implementation --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that RISCV uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#include "RISCVISelLowering.h"
#include "RISCV.h"
#include "RISCVMachineFunctionInfo.h"
#include "RISCVRegisterInfo.h"
#include "RISCVSubtarget.h"
#include "RISCVTargetMachine.h"
#include "Utils/RISCVMatInt.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
// NOTE(review): DEBUG_TYPE is defined ahead of the STATISTIC use below;
// presumably the macro picks it up as the statistic's group name -- confirm
// against llvm/ADT/Statistic.h.
#define DEBUG_TYPE "riscv-lower"
// Declares the NumTailCalls counter, described as "Number of tail calls".
STATISTIC(NumTailCalls, "Number of tail calls");
// Configures instruction selection lowering for the given subtarget: rejects
// unsupported configurations, registers the legal register classes, and then
// declares per-operation legalisation actions (Expand / Custom / Promote /
// Legal) together with atomic, boolean, alignment and jump table settings.
RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
const RISCVSubtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
// RV32E is rejected outright.
if (Subtarget.isRV32E())
report_fatal_error("Codegen not yet implemented for RV32E");
RISCVABI::ABI ABI = Subtarget.getTargetABI();
assert(ABI != RISCVABI::ABI_Unknown && "Improperly initialised target ABI");
// Only the ABIs listed here are accepted; any other value is a fatal error.
switch (ABI) {
default:
report_fatal_error("Don't know how to lower this ABI");
case RISCVABI::ABI_ILP32:
case RISCVABI::ABI_ILP32F:
case RISCVABI::ABI_ILP32D:
case RISCVABI::ABI_LP64:
case RISCVABI::ABI_LP64F:
case RISCVABI::ABI_LP64D:
break;
}
MVT XLenVT = Subtarget.getXLenVT();
// Set up the register classes.
// f32 / f64 register classes are only added with the F / D extension.
addRegisterClass(XLenVT, &RISCV::GPRRegClass);
if (Subtarget.hasStdExtF())
addRegisterClass(MVT::f32, &RISCV::FPR32RegClass);
if (Subtarget.hasStdExtD())
addRegisterClass(MVT::f64, &RISCV::FPR64RegClass);
// Compute derived properties from the register classes.
computeRegisterProperties(STI.getRegisterInfo());
setStackPointerRegisterToSaveRestore(RISCV::X2);
// All extending loads from i1 are promoted.
for (auto N : {ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD})
setLoadExtAction(N, XLenVT, MVT::i1, Promote);
// TODO: add all necessary setOperationAction calls.
setOperationAction(ISD::DYNAMIC_STACKALLOC, XLenVT, Expand);
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
setOperationAction(ISD::BR_CC, XLenVT, Expand);
setOperationAction(ISD::SELECT, XLenVT, Custom);
setOperationAction(ISD::SELECT_CC, XLenVT, Expand);
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
setOperationAction(ISD::VASTART, MVT::Other, Custom);
setOperationAction(ISD::VAARG, MVT::Other, Expand);
setOperationAction(ISD::VACOPY, MVT::Other, Expand);
setOperationAction(ISD::VAEND, MVT::Other, Expand);
for (auto VT : {MVT::i1, MVT::i8, MVT::i16})
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
// On 64-bit subtargets, i32 shifts get custom handling.
if (Subtarget.is64Bit()) {
setOperationAction(ISD::SHL, MVT::i32, Custom);
setOperationAction(ISD::SRA, MVT::i32, Custom);
setOperationAction(ISD::SRL, MVT::i32, Custom);
}
// Without the M extension, multiply / divide / remainder are expanded.
if (!Subtarget.hasStdExtM()) {
setOperationAction(ISD::MUL, XLenVT, Expand);
setOperationAction(ISD::MULHS, XLenVT, Expand);
setOperationAction(ISD::MULHU, XLenVT, Expand);
setOperationAction(ISD::SDIV, XLenVT, Expand);
setOperationAction(ISD::UDIV, XLenVT, Expand);
setOperationAction(ISD::SREM, XLenVT, Expand);
setOperationAction(ISD::UREM, XLenVT, Expand);
}
// With M on a 64-bit subtarget, i32 SDIV / UDIV / UREM get custom handling.
if (Subtarget.is64Bit() && Subtarget.hasStdExtM()) {
setOperationAction(ISD::SDIV, MVT::i32, Custom);
setOperationAction(ISD::UDIV, MVT::i32, Custom);
setOperationAction(ISD::UREM, MVT::i32, Custom);
}
setOperationAction(ISD::SDIVREM, XLenVT, Expand);
setOperationAction(ISD::UDIVREM, XLenVT, Expand);
setOperationAction(ISD::SMUL_LOHI, XLenVT, Expand);
setOperationAction(ISD::UMUL_LOHI, XLenVT, Expand);
setOperationAction(ISD::SHL_PARTS, XLenVT, Custom);
setOperationAction(ISD::SRL_PARTS, XLenVT, Custom);
setOperationAction(ISD::SRA_PARTS, XLenVT, Custom);
setOperationAction(ISD::ROTL, XLenVT, Expand);
setOperationAction(ISD::ROTR, XLenVT, Expand);
setOperationAction(ISD::BSWAP, XLenVT, Expand);
setOperationAction(ISD::CTTZ, XLenVT, Expand);
setOperationAction(ISD::CTLZ, XLenVT, Expand);
setOperationAction(ISD::CTPOP, XLenVT, Expand);
// Condition codes and operations expanded for every enabled FP type; the
// two lists are shared by the f32 and f64 sections below.
ISD::CondCode FPCCToExtend[] = {
ISD::SETOGT, ISD::SETOGE, ISD::SETONE, ISD::SETUEQ, ISD::SETUGT,
ISD::SETUGE, ISD::SETULT, ISD::SETULE, ISD::SETUNE, ISD::SETGT,
ISD::SETGE, ISD::SETNE};
ISD::NodeType FPOpToExtend[] = {
ISD::FSIN, ISD::FCOS, ISD::FSINCOS, ISD::FPOW, ISD::FREM};
if (Subtarget.hasStdExtF()) {
setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
for (auto CC : FPCCToExtend)
setCondCodeAction(CC, MVT::f32, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
setOperationAction(ISD::SELECT, MVT::f32, Custom);
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
for (auto Op : FPOpToExtend)
setOperationAction(Op, MVT::f32, Expand);
}
if (Subtarget.hasStdExtF() && Subtarget.is64Bit())
setOperationAction(ISD::BITCAST, MVT::i32, Custom);
if (Subtarget.hasStdExtD()) {
setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
for (auto CC : FPCCToExtend)
setCondCodeAction(CC, MVT::f64, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
setOperationAction(ISD::SELECT, MVT::f64, Custom);
setOperationAction(ISD::BR_CC, MVT::f64, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
for (auto Op : FPOpToExtend)
setOperationAction(Op, MVT::f64, Expand);
}
// Address-producing nodes are all lowered by custom code.
setOperationAction(ISD::GlobalAddress, XLenVT, Custom);
setOperationAction(ISD::BlockAddress, XLenVT, Custom);
setOperationAction(ISD::ConstantPool, XLenVT, Custom);
setOperationAction(ISD::GlobalTLSAddress, XLenVT, Custom);
// TODO: On M-mode only targets, the cycle[h] CSR may not be present.
// Unfortunately this can't be determined just from the ISA naming string.
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64,
Subtarget.is64Bit() ? Legal : Custom);
// Atomics up to XLen bits are supported only with the A extension; without
// it the supported size is set to 0.
if (Subtarget.hasStdExtA()) {
setMaxAtomicSizeInBitsSupported(Subtarget.getXLen());
setMinCmpXchgSizeInBits(32);
} else {
setMaxAtomicSizeInBitsSupported(0);
}
setBooleanContents(ZeroOrOneBooleanContent);
// Function alignments (log2).
unsigned FunctionAlignment = Subtarget.hasStdExtC() ? 1 : 2;
setMinFunctionAlignment(FunctionAlignment);
setPrefFunctionAlignment(FunctionAlignment);
// Effectively disable jump table generation.
setMinimumJumpTableEntries(INT_MAX);
}
// Result type of a SETCC on operands of type VT: for vectors, VT with its
// element type changed to integer; for everything else, the pointer type of
// the data layout.
EVT RISCVTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
                                            EVT VT) const {
  if (VT.isVector())
    return VT.changeVectorElementTypeToInteger();
  return getPointerTy(DL);
}
// Describes the memory access performed by the masked atomic intrinsics.
// Returns false, leaving Info untouched, for any other intrinsic. For the
// listed ones Info is filled in as a volatile load+store through the first
// call argument, with alignment 4 and offset 0, and true is returned.
bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                             const CallInst &I,
                                             MachineFunction &MF,
                                             unsigned Intrinsic) const {
  // The switch only filters: recognised intrinsics fall out of it.
  switch (Intrinsic) {
  case Intrinsic::riscv_masked_atomicrmw_xchg_i32:
  case Intrinsic::riscv_masked_atomicrmw_add_i32:
  case Intrinsic::riscv_masked_atomicrmw_sub_i32:
  case Intrinsic::riscv_masked_atomicrmw_nand_i32:
  case Intrinsic::riscv_masked_atomicrmw_max_i32:
  case Intrinsic::riscv_masked_atomicrmw_min_i32:
  case Intrinsic::riscv_masked_atomicrmw_umax_i32:
  case Intrinsic::riscv_masked_atomicrmw_umin_i32:
  case Intrinsic::riscv_masked_cmpxchg_i32:
    break;
  default:
    return false;
  }

  // The memory type is the pointee type of the pointer operand.
  auto *PointeeOwner = cast<PointerType>(I.getArgOperand(0)->getType());
  Info.opc = ISD::INTRINSIC_W_CHAIN;
  Info.memVT = MVT::getVT(PointeeOwner->getElementType());
  Info.ptrVal = I.getArgOperand(0);
  Info.offset = 0;
  Info.align = 4;
  Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
               MachineMemOperand::MOVolatile;
  return true;
}
// Accepts only addressing modes of the form "reg + simm12" or "simm12":
// no global base, a base offset that fits a signed 12-bit field, and no
// register-plus-register combination.
bool RISCVTargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                                const AddrMode &AM, Type *Ty,
                                                unsigned AS,
                                                Instruction *I) const {
  // A global base is never allowed, and the offset must be a 12-bit signed
  // immediate.
  if (AM.BaseGV || !isInt<12>(AM.BaseOffs))
    return false;

  // Scale 0: "r+i" or just "i", depending on HasBaseReg.
  if (AM.Scale == 0)
    return true;

  // Scale 1: the scaled register acts as the only register ("r+i") when there
  // is no base register; with one it would be "r+r" or "r+r+i".
  if (AM.Scale == 1)
    return !AM.HasBaseReg;

  // Any other scale is unsupported.
  return false;
}
// A compare immediate is legal when it fits a signed 12-bit field.
bool RISCVTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
  const bool FitsSImm12 = isInt<12>(Imm);
  return FitsSImm12;
}
// An add immediate is legal when it fits a signed 12-bit field.
bool RISCVTargetLowering::isLegalAddImmediate(int64_t Imm) const {
  const bool FitsSImm12 = isInt<12>(Imm);
  return FitsSImm12;
}
// On RV32 a 64-bit integer lives in two registers (high and low halves), so
// truncating it to 32 bits costs nothing: the low register is used as is.
// Only that exact i64 -> i32 case on a 32-bit subtarget is reported as free.
bool RISCVTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
  if (Subtarget.is64Bit())
    return false;
  if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
    return false;

  const unsigned FromBits = SrcTy->getPrimitiveSizeInBits();
  const unsigned ToBits = DstTy->getPrimitiveSizeInBits();
  return FromBits == 64 && ToBits == 32;
}
// Value-type flavour of the check: a truncate is free only for a scalar
// integer 64-bit to 32-bit conversion on a subtarget that is not 64-bit.
bool RISCVTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
  if (Subtarget.is64Bit())
    return false;
  if (SrcVT.isVector() || DstVT.isVector())
    return false;
  if (!SrcVT.isInteger() || !DstVT.isInteger())
    return false;

  const unsigned FromBits = SrcVT.getSizeInBits();
  const unsigned ToBits = DstVT.getSizeInBits();
  return FromBits == 64 && ToBits == 32;
}
// A zero extension is free when it can be folded into the load producing the
// value: the load must read i8 or i16 (or i32 on a 64-bit subtarget) and be
// either non-extending or already zero-extending. Every other case is
// deferred to the generic TargetLowering answer.
bool RISCVTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  if (auto *Load = dyn_cast<LoadSDNode>(Val)) {
    EVT LoadedVT = Load->getMemoryVT();
    const bool NarrowMemory =
        LoadedVT == MVT::i8 || LoadedVT == MVT::i16 ||
        (Subtarget.is64Bit() && LoadedVT == MVT::i32);
    if (NarrowMemory && (Load->getExtensionType() == ISD::NON_EXTLOAD ||
                         Load->getExtensionType() == ISD::ZEXTLOAD))
      return true;
  }

  return TargetLowering::isZExtFree(Val, VT2);
}
// Sign extension is preferred over zero extension only for i32 -> i64 on a
// 64-bit subtarget.
bool RISCVTargetLowering::isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const {
  if (!Subtarget.is64Bit())
    return false;
  return SrcVT == MVT::i32 && DstVT == MVT::i64;
}
// True for f32 when the F extension is present and for f64 when the D
// extension is present; false for every other type.
bool RISCVTargetLowering::hasBitPreservingFPLogic(EVT VT) const {
  if (VT == MVT::f32)
    return Subtarget.hasStdExtF();
  if (VT == MVT::f64)
    return Subtarget.hasStdExtD();
  return false;
}
// Rewrites a comparison so that it uses a condition the RISC-V ISA supports
// directly. GT, LE, UGT and ULE are turned into their swapped-operand
// equivalents and LHS/RHS are exchanged; all other codes are left alone.
static void normaliseSetCC(SDValue &LHS, SDValue &RHS, ISD::CondCode &CC) {
  const bool NeedsSwap = CC == ISD::SETGT || CC == ISD::SETLE ||
                         CC == ISD::SETUGT || CC == ISD::SETULE;
  if (!NeedsSwap)
    return;

  CC = ISD::getSetCCSwappedOperands(CC);
  std::swap(LHS, RHS);
}
// Maps an integer DAG condition code onto the RISC-V conditional branch that
// implements it. Only the six codes the ISA supports directly are accepted
// (see normaliseSetCC); anything else is unreachable.
static unsigned getBranchOpcodeForIntCondCode(ISD::CondCode CC) {
  switch (CC) {
  case ISD::SETEQ:
    return RISCV::BEQ;
  case ISD::SETNE:
    return RISCV::BNE;
  case ISD::SETLT:
    return RISCV::BLT;
  case ISD::SETGE:
    return RISCV::BGE;
  case ISD::SETULT:
    return RISCV::BLTU;
  case ISD::SETUGE:
    return RISCV::BGEU;
  default:
    llvm_unreachable("Unsupported CondCode");
  }
}
// Entry point for custom lowering: dispatches on the node's opcode to the
// matching lower* helper. BITCAST is handled inline, and only the i32 -> f32
// form is lowered (other forms return an empty SDValue). Any opcode not
// listed is a fatal error.
SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
                                            SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  case ISD::GlobalAddress:
    return lowerGlobalAddress(Op, DAG);
  case ISD::BlockAddress:
    return lowerBlockAddress(Op, DAG);
  case ISD::ConstantPool:
    return lowerConstantPool(Op, DAG);
  case ISD::GlobalTLSAddress:
    return lowerGlobalTLSAddress(Op, DAG);
  case ISD::SELECT:
    return lowerSELECT(Op, DAG);
  case ISD::VASTART:
    return lowerVASTART(Op, DAG);
  case ISD::FRAMEADDR:
    return lowerFRAMEADDR(Op, DAG);
  case ISD::RETURNADDR:
    return lowerRETURNADDR(Op, DAG);
  case ISD::SHL_PARTS:
    return lowerShiftLeftParts(Op, DAG);
  case ISD::SRA_PARTS:
    return lowerShiftRightParts(Op, DAG, true);
  case ISD::SRL_PARTS:
    return lowerShiftRightParts(Op, DAG, false);
  case ISD::BITCAST: {
    assert(Subtarget.is64Bit() && Subtarget.hasStdExtF() &&
           "Unexpected custom legalisation");
    SDLoc DL(Op);
    SDValue Source = Op.getOperand(0);
    const bool IsI32ToF32 =
        Op.getValueType() == MVT::f32 && Source.getValueType() == MVT::i32;
    if (!IsI32ToF32)
      return SDValue();
    // Widen the integer to i64, then move it into an FP register.
    SDValue Widened = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Source);
    return DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, Widened);
  }
  default:
    report_fatal_error("unimplemented operand");
  }
}
// Builds the target global address node for N's global with a zero offset and
// the given operand flags.
static SDValue getTargetNode(GlobalAddressSDNode *N, SDLoc DL, EVT Ty,
                             SelectionDAG &DAG, unsigned Flags) {
  const auto *Global = N->getGlobal();
  return DAG.getTargetGlobalAddress(Global, DL, Ty, 0, Flags);
}
// Builds the target block address node for N, keeping N's offset and applying
// the given operand flags. DL is not used by this overload.
static SDValue getTargetNode(BlockAddressSDNode *N, SDLoc DL, EVT Ty,
                             SelectionDAG &DAG, unsigned Flags) {
  const auto *Block = N->getBlockAddress();
  const auto BlockOffset = N->getOffset();
  return DAG.getTargetBlockAddress(Block, Ty, BlockOffset, Flags);
}
// Builds the target constant pool node for N, keeping N's constant, alignment
// and offset and applying the given operand flags. DL is not used by this
// overload.
static SDValue getTargetNode(ConstantPoolSDNode *N, SDLoc DL, EVT Ty,
                             SelectionDAG &DAG, unsigned Flags) {
  const auto *PoolConstant = N->getConstVal();
  const auto PoolAlign = N->getAlignment();
  const auto PoolOffset = N->getOffset();
  return DAG.getTargetConstantPool(PoolConstant, Ty, PoolAlign, PoolOffset,
                                   Flags);
}
// Materialises the address of the symbol behind N.
//  - Position independent code: PseudoLLA for symbols known to be local,
//    PseudoLA (GOT load) otherwise.
//  - Small code model: lui %hi(sym) followed by addi %lo(sym).
//  - Medium code model: PseudoLLA.
//  - Any other code model is a fatal error.
template <class NodeTy>
SDValue RISCVTargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
                                     bool IsLocal) const {
  SDLoc DL(N);
  EVT Ty = getPointerTy(DAG.getDataLayout());

  if (isPositionIndependent()) {
    SDValue Sym = getTargetNode(N, DL, Ty, DAG, 0);
    // Local symbols use PC-relative addressing: (PseudoLLA sym) expands to
    // (addi (auipc %pcrel_hi(sym)) %pcrel_lo(auipc)).
    // Other symbols go through the GOT: (PseudoLA sym) expands to
    // (ld (addi (auipc %got_pcrel_hi(sym)) %pcrel_lo(auipc))).
    const unsigned PseudoOpc = IsLocal ? RISCV::PseudoLLA : RISCV::PseudoLA;
    return SDValue(DAG.getMachineNode(PseudoOpc, DL, Ty, Sym), 0);
  }

  switch (getTargetMachine().getCodeModel()) {
  case CodeModel::Small: {
    // Addresses within the first 2 GiB of the address space:
    // (addi (lui %hi(sym)) %lo(sym)).
    SDValue HiPart = getTargetNode(N, DL, Ty, DAG, RISCVII::MO_HI);
    SDValue LoPart = getTargetNode(N, DL, Ty, DAG, RISCVII::MO_LO);
    SDValue Upper = SDValue(DAG.getMachineNode(RISCV::LUI, DL, Ty, HiPart), 0);
    return SDValue(DAG.getMachineNode(RISCV::ADDI, DL, Ty, Upper, LoPart), 0);
  }
  case CodeModel::Medium: {
    // Addresses within any 2 GiB range of the address space: (PseudoLLA sym),
    // which expands to (addi (auipc %pcrel_hi(sym)) %pcrel_lo(auipc)).
    SDValue Sym = getTargetNode(N, DL, Ty, DAG, 0);
    return SDValue(DAG.getMachineNode(RISCV::PseudoLLA, DL, Ty, Sym), 0);
  }
  default:
    report_fatal_error("Unsupported code model for lowering");
  }
}
// Lowers a GlobalAddress node. The symbol address is produced by getAddr and
// a non-zero offset is applied with a separate ADD node.
SDValue RISCVTargetLowering::lowerGlobalAddress(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT Ty = Op.getValueType();
  GlobalAddressSDNode *GlobalNode = cast<GlobalAddressSDNode>(Op);
  int64_t Offset = GlobalNode->getOffset();
  MVT XLenVT = Subtarget.getXLenVT();

  const GlobalValue *GV = GlobalNode->getGlobal();
  bool IsLocal = getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
  SDValue Base = getAddr(GlobalNode, DAG, IsLocal);

  if (Offset == 0)
    return Base;

  // Keeping the offset in its own ADD node, rather than folding it into the
  // global address node, maximises the opportunity for common subexpression
  // elimination. Later peephole optimisations may fold it back in when that
  // is profitable.
  return DAG.getNode(ISD::ADD, DL, Ty, Base,
                     DAG.getConstant(Offset, DL, XLenVT));
}
// Lowers a BlockAddress node by handing it to getAddr.
SDValue RISCVTargetLowering::lowerBlockAddress(SDValue Op,
                                               SelectionDAG &DAG) const {
  auto *BlockNode = cast<BlockAddressSDNode>(Op);
  return getAddr(BlockNode, DAG);
}
// Lowers a ConstantPool node by handing it to getAddr.
SDValue RISCVTargetLowering::lowerConstantPool(SDValue Op,
                                               SelectionDAG &DAG) const {
  auto *PoolNode = cast<ConstantPoolSDNode>(Op);
  return getAddr(PoolNode, DAG);
}
// Computes the address of a TLS variable for the static TLS models.
// With UseGOT the variable's offset is loaded from the GOT and added to the
// thread pointer (X4); without it the offset is built from %tprel relocations
// relative to the thread pointer.
SDValue RISCVTargetLowering::getStaticTLSAddr(GlobalAddressSDNode *N,
                                              SelectionDAG &DAG,
                                              bool UseGOT) const {
  SDLoc DL(N);
  EVT Ty = getPointerTy(DAG.getDataLayout());
  const GlobalValue *GV = N->getGlobal();
  MVT XLenVT = Subtarget.getXLenVT();

  if (!UseGOT) {
    // Generate a sequence for accessing the address relative to the thread
    // pointer, with the appropriate adjustment for the thread pointer offset.
    // This generates the pattern
    // (add (add_tprel (lui %tprel_hi(sym)) tp %tprel_add(sym)) %tprel_lo(sym))
    SDValue HiSym =
        DAG.getTargetGlobalAddress(GV, DL, Ty, 0, RISCVII::MO_TPREL_HI);
    SDValue AddSym =
        DAG.getTargetGlobalAddress(GV, DL, Ty, 0, RISCVII::MO_TPREL_ADD);
    SDValue LoSym =
        DAG.getTargetGlobalAddress(GV, DL, Ty, 0, RISCVII::MO_TPREL_LO);

    SDValue Upper = SDValue(DAG.getMachineNode(RISCV::LUI, DL, Ty, HiSym), 0);
    SDValue ThreadPtr = DAG.getRegister(RISCV::X4, XLenVT);
    SDValue WithTP = SDValue(
        DAG.getMachineNode(RISCV::PseudoAddTPRel, DL, Ty, Upper, ThreadPtr,
                           AddSym),
        0);
    return SDValue(DAG.getMachineNode(RISCV::ADDI, DL, Ty, WithTP, LoSym), 0);
  }

  // Use PC-relative addressing to access the GOT for this TLS symbol, then
  // load the address from the GOT and add the thread pointer. This generates
  // the pattern (PseudoLA_TLS_IE sym), which expands to
  // (ld (auipc %tls_ie_pcrel_hi(sym)) %pcrel_lo(auipc)).
  SDValue Sym = DAG.getTargetGlobalAddress(GV, DL, Ty, 0, 0);
  SDValue GotLoad =
      SDValue(DAG.getMachineNode(RISCV::PseudoLA_TLS_IE, DL, Ty, Sym), 0);

  // Add the thread pointer.
  SDValue ThreadPtr = DAG.getRegister(RISCV::X4, XLenVT);
  return DAG.getNode(ISD::ADD, DL, Ty, GotLoad, ThreadPtr);
}
// Computes the address of a TLS variable for the dynamic TLS models by
// calling __tls_get_addr with the address of the variable's global dynamic
// GOT entry, and returns the call's result.
SDValue RISCVTargetLowering::getDynamicTLSAddr(GlobalAddressSDNode *N,
                                               SelectionDAG &DAG) const {
  SDLoc DL(N);
  EVT Ty = getPointerTy(DAG.getDataLayout());
  // The argument and the result are passed as pointer-sized integers.
  IntegerType *PtrSizedInt =
      Type::getIntNTy(*DAG.getContext(), Ty.getSizeInBits());
  const GlobalValue *GV = N->getGlobal();

  // Use a PC-relative addressing mode to access the global dynamic GOT
  // address. This generates the pattern (PseudoLA_TLS_GD sym), which expands
  // to (addi (auipc %tls_gd_pcrel_hi(sym)) %pcrel_lo(auipc)).
  SDValue Sym = DAG.getTargetGlobalAddress(GV, DL, Ty, 0, 0);
  SDValue GotEntry =
      SDValue(DAG.getMachineNode(RISCV::PseudoLA_TLS_GD, DL, Ty, Sym), 0);

  // Single-argument list for the call.
  ArgListTy CallArgs;
  ArgListEntry Arg;
  Arg.Node = GotEntry;
  Arg.Ty = PtrSizedInt;
  CallArgs.push_back(Arg);

  // Set up and emit the call to __tls_get_addr.
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(DL)
      .setChain(DAG.getEntryNode())
      .setLibCallee(CallingConv::C, PtrSizedInt,
                    DAG.getExternalSymbol("__tls_get_addr", Ty),
                    std::move(CallArgs));
  return LowerCallTo(CLI).first;
}
// Lowers a GlobalTLSAddress node. The TLS model selects the address sequence
// (non-PIC code always uses LocalExec), and a non-zero offset is applied with
// a separate ADD node.
SDValue RISCVTargetLowering::lowerGlobalTLSAddress(SDValue Op,
                                                   SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT Ty = Op.getValueType();
  GlobalAddressSDNode *GlobalNode = cast<GlobalAddressSDNode>(Op);
  int64_t Offset = GlobalNode->getOffset();
  MVT XLenVT = Subtarget.getXLenVT();

  // Non-PIC TLS lowering should always use the LocalExec model.
  TLSModel::Model Model = TLSModel::LocalExec;
  if (isPositionIndependent())
    Model = getTargetMachine().getTLSModel(GlobalNode->getGlobal());

  SDValue Base;
  switch (Model) {
  case TLSModel::LocalExec:
    Base = getStaticTLSAddr(GlobalNode, DAG, /*UseGOT=*/false);
    break;
  case TLSModel::InitialExec:
    Base = getStaticTLSAddr(GlobalNode, DAG, /*UseGOT=*/true);
    break;
  case TLSModel::LocalDynamic:
  case TLSModel::GeneralDynamic:
    Base = getDynamicTLSAddr(GlobalNode, DAG);
    break;
  }

  if (Offset == 0)
    return Base;

  // Keeping the offset in its own ADD node, rather than folding it into the
  // global address node, maximises the opportunity for common subexpression
  // elimination. Later peephole optimisations may fold it back in when that
  // is profitable.
  return DAG.getNode(ISD::ADD, DL, Ty, Base,
                     DAG.getConstant(Offset, DL, XLenVT));
}
// Lowers ISD::SELECT to RISCVISD::SELECT_CC, which produces the selected
// value plus a glue result.
SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  const MVT XLenVT = Subtarget.getXLenVT();
  SDValue Cond = Op.getOperand(0);
  SDValue IfTrue = Op.getOperand(1);
  SDValue IfFalse = Op.getOperand(2);
  SDVTList ResultVTs = DAG.getVTList(Op.getValueType(), MVT::Glue);

  // The comparison can be merged into the SELECT_CC when the result is XLenVT
  // and the condition is a SETCC that itself compared XLenVT values. This
  // lets instruction selection use the integer compare+branch instructions.
  const bool CanFoldSetCC =
      Op.getSimpleValueType() == XLenVT && Cond.getOpcode() == ISD::SETCC &&
      Cond.getOperand(0).getSimpleValueType() == XLenVT;

  if (!CanFoldSetCC) {
    // (select condv, truev, falsev)
    // -> (riscvisd::select_cc condv, zero, setne, truev, falsev)
    SDValue Zero = DAG.getConstant(0, DL, XLenVT);
    SDValue SetNE = DAG.getConstant(ISD::SETNE, DL, XLenVT);
    SDValue Ops[] = {Cond, Zero, SetNE, IfTrue, IfFalse};
    return DAG.getNode(RISCVISD::SELECT_CC, DL, ResultVTs, Ops);
  }

  // (select (setcc lhs, rhs, cc), truev, falsev)
  // -> (riscvisd::select_cc lhs, rhs, cc, truev, falsev)
  SDValue LHS = Cond.getOperand(0);
  SDValue RHS = Cond.getOperand(1);
  ISD::CondCode CCVal = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
  normaliseSetCC(LHS, RHS, CCVal);
  SDValue TargetCC = DAG.getConstant(CCVal, DL, XLenVT);
  SDValue Ops[] = {LHS, RHS, TargetCC, IfTrue, IfFalse};
  return DAG.getNode(RISCVISD::SELECT_CC, DL, ResultVTs, Ops);
}
// Lowers VASTART: stores the address of the VarArgsFrameIndex slot into the
// memory location given by operand 1, chained on operand 0.
SDValue RISCVTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  MachineFunction &MF = DAG.getMachineFunction();
  const int VarArgsFI =
      MF.getInfo<RISCVMachineFunctionInfo>()->getVarArgsFrameIndex();
  SDValue VarArgsAddr =
      DAG.getFrameIndex(VarArgsFI, getPointerTy(MF.getDataLayout()));

  SDValue Chain = Op.getOperand(0);
  SDValue DestPtr = Op.getOperand(1);
  // Operand 2 carries the IR value used to describe the store's memory
  // location.
  const Value *DestIRValue = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  return DAG.getStore(Chain, DL, VarArgsAddr, DestPtr,
                      MachinePointerInfo(DestIRValue));
}
// Lowers FRAMEADDR. Depth 0 yields the frame register itself; each further
// level loads the next frame address from 2*XLEN/8 bytes below the current
// one.
SDValue RISCVTargetLowering::lowerFRAMEADDR(SDValue Op,
                                            SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MF.getFrameInfo().setFrameAddressIsTaken(true);

  const RISCVRegisterInfo &RegInfo = *Subtarget.getRegisterInfo();
  const unsigned FPReg = RegInfo.getFrameRegister(MF);
  const EVT PtrVT = Op.getValueType();
  SDLoc DL(Op);

  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), DL, FPReg, PtrVT);

  // Byte offset, relative to a frame address, of the slot read at each level.
  const int BytesPerXLen = Subtarget.getXLen() / 8;
  const int PrevFrameOffset = -(BytesPerXLen * 2);

  const unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  for (unsigned Remaining = Depth; Remaining != 0; --Remaining) {
    SDValue OffsetNode = DAG.getIntPtrConstant(PrevFrameOffset, DL);
    SDValue Slot = DAG.getNode(ISD::ADD, DL, PtrVT, Frame, OffsetNode);
    Frame = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Slot,
                        MachinePointerInfo());
  }
  return Frame;
}
// Lowers RETURNADDR. Depth 0 reads the return-address register as an implicit
// live-in; a non-zero depth loads the value stored XLEN/8 bytes below the
// frame address computed by lowerFRAMEADDR.
SDValue RISCVTargetLowering::lowerRETURNADDR(SDValue Op,
                                             SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MF.getFrameInfo().setReturnAddressIsTaken(true);

  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
    return SDValue();

  SDLoc DL(Op);
  const MVT XLenVT = Subtarget.getXLenVT();
  const unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

  if (Depth == 0) {
    // Return the value of the return address register, marking it an implicit
    // live-in.
    const RISCVRegisterInfo &RegInfo = *Subtarget.getRegisterInfo();
    unsigned RAVReg =
        MF.addLiveIn(RegInfo.getRARegister(), getRegClassFor(XLenVT));
    return DAG.getCopyFromReg(DAG.getEntryNode(), DL, RAVReg, XLenVT);
  }

  const EVT VT = Op.getValueType();
  const int BytesPerXLen = Subtarget.getXLen() / 8;
  const int RAOffset = -BytesPerXLen;
  SDValue FrameAddr = lowerFRAMEADDR(Op, DAG);
  SDValue OffsetNode = DAG.getConstant(RAOffset, DL, VT);
  SDValue Slot = DAG.getNode(ISD::ADD, DL, VT, FrameAddr, OffsetNode);
  return DAG.getLoad(VT, DL, DAG.getEntryNode(), Slot, MachinePointerInfo());
}
// Expands a double-width left shift whose value is given as (Lo, Hi) halves
// in operands 0 and 1 and whose shift amount is operand 2. Returns the
// shifted (Lo, Hi) pair as merged values.
SDValue RISCVTargetLowering::lowerShiftLeftParts(SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue LoIn = Op.getOperand(0);
  SDValue HiIn = Op.getOperand(1);
  SDValue Amt = Op.getOperand(2);
  const EVT VT = LoIn.getValueType();
  const unsigned XLen = Subtarget.getXLen();

  // if Shamt-XLEN < 0: // Shamt < XLEN
  //   Lo = Lo << Shamt
  //   Hi = (Hi << Shamt) | ((Lo >>u 1) >>u (XLEN-1 - Shamt))
  // else:
  //   Lo = 0
  //   Hi = Lo << (Shamt-XLEN)

  // Constants and derived shift amounts.
  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue One = DAG.getConstant(1, DL, VT);
  SDValue NegXLen = DAG.getConstant(-static_cast<int>(XLen), DL, VT);
  SDValue XLenLess1 = DAG.getConstant(XLen - 1, DL, VT);
  SDValue AmtLessXLen = DAG.getNode(ISD::ADD, DL, VT, Amt, NegXLen);
  SDValue InvAmt = DAG.getNode(ISD::SUB, DL, VT, XLenLess1, Amt);

  // Results for Shamt < XLEN. The low bits that cross into Hi are moved by
  // two right shifts, (Lo >>u 1) >>u (XLEN-1 - Shamt).
  SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, LoIn, Amt);
  SDValue LoSrl1 = DAG.getNode(ISD::SRL, DL, VT, LoIn, One);
  SDValue LoCarry = DAG.getNode(ISD::SRL, DL, VT, LoSrl1, InvAmt);
  SDValue HiShl = DAG.getNode(ISD::SHL, DL, VT, HiIn, Amt);
  SDValue HiSmall = DAG.getNode(ISD::OR, DL, VT, HiShl, LoCarry);

  // Result for Shamt >= XLEN.
  SDValue HiLarge = DAG.getNode(ISD::SHL, DL, VT, LoIn, AmtLessXLen);

  SDValue IsSmall = DAG.getSetCC(DL, VT, AmtLessXLen, Zero, ISD::SETLT);
  SDValue LoOut = DAG.getNode(ISD::SELECT, DL, VT, IsSmall, LoSmall, Zero);
  SDValue HiOut = DAG.getNode(ISD::SELECT, DL, VT, IsSmall, HiSmall, HiLarge);

  SDValue Halves[2] = {LoOut, HiOut};
  return DAG.getMergeValues(Halves, DL);
}
// Expands a double-width right shift whose value is given as (Lo, Hi) halves
// in operands 0 and 1 and whose shift amount is operand 2. IsSRA selects an
// arithmetic shift; otherwise the shift is logical. Returns the shifted
// (Lo, Hi) pair as merged values.
SDValue RISCVTargetLowering::lowerShiftRightParts(SDValue Op, SelectionDAG &DAG,
                                                  bool IsSRA) const {
  SDLoc DL(Op);
  SDValue LoIn = Op.getOperand(0);
  SDValue HiIn = Op.getOperand(1);
  SDValue Amt = Op.getOperand(2);
  const EVT VT = LoIn.getValueType();
  const unsigned XLen = Subtarget.getXLen();

  // SRA expansion:
  //   if Shamt-XLEN < 0: // Shamt < XLEN
  //     Lo = (Lo >>u Shamt) | ((Hi << 1) << (XLEN-1 - Shamt))
  //     Hi = Hi >>s Shamt
  //   else:
  //     Lo = Hi >>s (Shamt-XLEN);
  //     Hi = Hi >>s (XLEN-1)
  //
  // SRL expansion:
  //   if Shamt-XLEN < 0: // Shamt < XLEN
  //     Lo = (Lo >>u Shamt) | ((Hi << 1) << (XLEN-1 - Shamt))
  //     Hi = Hi >>u Shamt
  //   else:
  //     Lo = Hi >>u (Shamt-XLEN);
  //     Hi = 0;

  // Opcode used wherever the two expansions differ only in signedness.
  unsigned HiShiftOpc = ISD::SRL;
  if (IsSRA)
    HiShiftOpc = ISD::SRA;

  // Constants and derived shift amounts.
  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue One = DAG.getConstant(1, DL, VT);
  SDValue NegXLen = DAG.getConstant(-static_cast<int>(XLen), DL, VT);
  SDValue XLenLess1 = DAG.getConstant(XLen - 1, DL, VT);
  SDValue AmtLessXLen = DAG.getNode(ISD::ADD, DL, VT, Amt, NegXLen);
  SDValue InvAmt = DAG.getNode(ISD::SUB, DL, VT, XLenLess1, Amt);

  // Results for Shamt < XLEN.
  SDValue LoSrl = DAG.getNode(ISD::SRL, DL, VT, LoIn, Amt);
  SDValue HiShl1 = DAG.getNode(ISD::SHL, DL, VT, HiIn, One);
  SDValue HiCarry = DAG.getNode(ISD::SHL, DL, VT, HiShl1, InvAmt);
  SDValue LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSrl, HiCarry);
  SDValue HiSmall = DAG.getNode(HiShiftOpc, DL, VT, HiIn, Amt);

  // Results for Shamt >= XLEN.
  SDValue LoLarge = DAG.getNode(HiShiftOpc, DL, VT, HiIn, AmtLessXLen);
  SDValue HiLarge = Zero;
  if (IsSRA)
    HiLarge = DAG.getNode(ISD::SRA, DL, VT, HiIn, XLenLess1);

  SDValue IsSmall = DAG.getSetCC(DL, VT, AmtLessXLen, Zero, ISD::SETLT);
  SDValue LoOut = DAG.getNode(ISD::SELECT, DL, VT, IsSmall, LoSmall, LoLarge);
  SDValue HiOut = DAG.getNode(ISD::SELECT, DL, VT, IsSmall, HiSmall, HiLarge);

  SDValue Halves[2] = {LoOut, HiOut};
  return DAG.getMergeValues(Halves, DL);
}
// Maps a generic shift/divide/remainder opcode to the RISCVISD node that
// implements its 32-bit form. Any opcode outside the table is a programming
// error.
static RISCVISD::NodeType getRISCVWOpcode(unsigned Opcode) {
  switch (Opcode) {
  case ISD::SHL:
    return RISCVISD::SLLW;
  case ISD::SRL:
    return RISCVISD::SRLW;
  case ISD::SRA:
    return RISCVISD::SRAW;
  case ISD::UDIV:
    return RISCVISD::DIVUW;
  case ISD::SDIV:
    return RISCVISD::DIVW;
  case ISD::UREM:
    return RISCVISD::REMUW;
  default:
    break;
  }
  llvm_unreachable("Unexpected opcode");
}
// Rewrites the 32-bit binary operation N as its target-specific *W node.
// Because i32 isn't a legal type for RV64, such operations would otherwise be
// promoted to i64, and the fact that they were originally i32 would be lost,
// making it difficult to select SLLW/DIVUW/.../*W later on.
static SDValue customLegalizeToWOp(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);
  const RISCVISD::NodeType WOpc = getRISCVWOpcode(N->getOpcode());

  // Widen both operands to i64; the upper bits are left unspecified.
  SDValue WideLHS =
      DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0));
  SDValue WideRHS =
      DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));
  SDValue WideResult = DAG.getNode(WOpc, DL, MVT::i64, WideLHS, WideRHS);

  // ReplaceNodeResults requires that the replacement keeps the original
  // result type, so narrow back to i32.
  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, WideResult);
}
// Custom type legalisation for results of illegal type. Replacement values
// for N are appended to Results; leaving Results untouched means no custom
// replacement is provided for this node.
void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
                                             SmallVectorImpl<SDValue> &Results,
                                             SelectionDAG &DAG) const {
  SDLoc DL(N);

  // True when operand Idx of N is an ISD::Constant node.
  auto IsConstantOperand = [N](unsigned Idx) {
    return N->getOperand(Idx).getOpcode() == ISD::Constant;
  };

  switch (N->getOpcode()) {
  case ISD::READCYCLECOUNTER: {
    assert(!Subtarget.is64Bit() &&
           "READCYCLECOUNTER only has custom type legalization on riscv32");
    // READ_CYCLE_WIDE yields two i32 values plus the output chain; forward
    // all three results in order.
    SDVTList WideVTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
    SDValue Wide =
        DAG.getNode(RISCVISD::READ_CYCLE_WIDE, DL, WideVTs, N->getOperand(0));
    for (unsigned ResNo = 0; ResNo != 3; ++ResNo)
      Results.push_back(Wide.getValue(ResNo));
    return;
  }
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:
    assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
           "Unexpected custom legalisation");
    // Shifts by a constant amount get no custom replacement.
    if (!IsConstantOperand(1))
      Results.push_back(customLegalizeToWOp(N, DAG));
    return;
  case ISD::SDIV:
  case ISD::UDIV:
  case ISD::UREM:
    assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
           Subtarget.hasStdExtM() && "Unexpected custom legalisation");
    // A constant on either side means no custom replacement.
    if (!IsConstantOperand(0) && !IsConstantOperand(1))
      Results.push_back(customLegalizeToWOp(N, DAG));
    return;
  case ISD::BITCAST: {
    assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
           Subtarget.hasStdExtF() && "Unexpected custom legalisation");
    SDValue Src = N->getOperand(0);
    // Only an f32 source is handled: move it to an i64 and truncate.
    if (Src.getValueType() == MVT::f32) {
      SDValue Moved =
          DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64, Src);
      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Moved));
    }
    return;
  }
  default:
    llvm_unreachable("Don't know how to custom type legalize this operation!");
  }
}
// Target-specific DAG combines for RISCVISD nodes. Returns the value produced
// by DCI.CombineTo when a node is replaced, and an empty SDValue otherwise.
SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  switch (N->getOpcode()) {
  case RISCVISD::SplitF64: {
    SDValue Src = N->getOperand(0);

    // SplitF64 of a BuildPairF64 is redundant: use the pair's operands
    // directly.
    if (Src->getOpcode() == RISCVISD::BuildPairF64)
      return DCI.CombineTo(N, Src.getOperand(0), Src.getOperand(1));

    SDLoc DL(N);

    // It's cheaper to materialise two 32-bit integers than to load a double
    // from the constant pool and transfer it to integer registers through
    // the stack.
    if (auto *FPConst = dyn_cast<ConstantFPSDNode>(Src)) {
      APInt Bits = FPConst->getValueAPF().bitcastToAPInt();
      SDValue LoHalf = DAG.getConstant(Bits.trunc(32), DL, MVT::i32);
      SDValue HiHalf = DAG.getConstant(Bits.lshr(32).trunc(32), DL, MVT::i32);
      return DCI.CombineTo(N, LoHalf, HiHalf);
    }

    // Target-specific version of a combine done in
    // DAGCombiner::visitBITCAST, equivalent to:
    //   fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit)
    //   fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit))
    const bool IsFNeg = Src.getOpcode() == ISD::FNEG;
    const bool IsFAbs = Src.getOpcode() == ISD::FABS;
    if ((!IsFNeg && !IsFAbs) || !Src.getNode()->hasOneUse())
      break;

    SDValue Split =
        DAG.getNode(RISCVISD::SplitF64, DL, DAG.getVTList(MVT::i32, MVT::i32),
                    Src.getOperand(0));
    SDValue LoHalf = Split.getValue(0);
    SDValue HiHalf = Split.getValue(1);

    // The sign lives in the top bit of the high half: FNEG flips it, FABS
    // clears it.
    const APInt SignBit = APInt::getSignMask(32);
    const unsigned LogicOpc = IsFNeg ? ISD::XOR : ISD::AND;
    const APInt Mask = IsFNeg ? SignBit : ~SignBit;
    SDValue NewHi = DAG.getNode(LogicOpc, DL, MVT::i32, HiHalf,
                                DAG.getConstant(Mask, DL, MVT::i32));
    return DCI.CombineTo(N, LoHalf, NewHi);
  }
  case RISCVISD::SLLW:
  case RISCVISD::SRAW:
  case RISCVISD::SRLW: {
    // Only the lower 32 bits of LHS and lower 5 bits of RHS are read.
    const APInt ValueBits =
        APInt::getLowBitsSet(N->getOperand(0).getValueSizeInBits(), 32);
    const APInt AmountBits =
        APInt::getLowBitsSet(N->getOperand(1).getValueSizeInBits(), 5);
    if (SimplifyDemandedBits(N->getOperand(0), ValueBits, DCI) ||
        SimplifyDemandedBits(N->getOperand(1), AmountBits, DCI))
      return SDValue();
    break;
  }
  case RISCVISD::FMV_X_ANYEXTW_RV64: {
    SDLoc DL(N);
    SDValue Src = N->getOperand(0);

    // If the input is just FMV_W_X_RV64 the round trip is unnecessary and
    // can be replaced by an ANY_EXTEND of that node's operand.
    if (Src->getOpcode() == RISCVISD::FMV_W_X_RV64) {
      SDValue Extended =
          DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Src.getOperand(0));
      return DCI.CombineTo(N, Extended);
    }

    // Target-specific version of a combine done in
    // DAGCombiner::visitBITCAST, equivalent to:
    //   fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit)
    //   fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit))
    const bool IsFNeg = Src.getOpcode() == ISD::FNEG;
    const bool IsFAbs = Src.getOpcode() == ISD::FABS;
    if ((!IsFNeg && !IsFAbs) || !Src.getNode()->hasOneUse())
      break;

    SDValue Moved = DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64,
                                Src.getOperand(0));
    // 32-bit sign mask, sign-extended to the 64-bit result width.
    const APInt SignBit = APInt::getSignMask(32).sext(64);
    const unsigned LogicOpc = IsFNeg ? ISD::XOR : ISD::AND;
    const APInt Mask = IsFNeg ? SignBit : ~SignBit;
    return DCI.CombineTo(N,
                         DAG.getNode(LogicOpc, DL, MVT::i64, Moved,
                                     DAG.getConstant(Mask, DL, MVT::i64)));
  }
  default:
    break;
  }

  return SDValue();
}
// Decides whether a shift N may be commuted with its ADD/OR operand.
//
// Only scalar-integer `(shift (add|or x, c1), c2)` with constant c1 and c2 is
// examined; every other shape returns true. For the examined shape the result
// is, in order of the checks below:
//   - true  if `c1 << c2` is a legal add immediate,
//   - false if `c1` is a legal add immediate,
//   - false if materialising `c1` costs less than materialising `c1 << c2`,
//   - true  otherwise.
//
// NOTE(review): this region is a diff hunk. The `-` lines are the previous
// revision and the `+` lines their replacement, which only calls
// getSExtValue() once getMinSignedBits() <= 64 has been checked.
bool RISCVTargetLowering::isDesirableToCommuteWithShift(
const SDNode *N, CombineLevel Level) const {
// The following folds are only desirable if `(OP _, c1 << c2)` can be
// materialised in fewer instructions than `(OP _, c1)`:
//
// (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
// (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
SDValue N0 = N->getOperand(0);
EVT Ty = N0.getValueType();
if (Ty.isScalarInteger() &&
(N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::OR)) {
auto *C1 = dyn_cast<ConstantSDNode>(N0->getOperand(1));
auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (C1 && C2) {
APInt C1Int = C1->getAPIntValue();
APInt ShiftedC1Int = C1Int << C2->getAPIntValue();
// We can materialise `c1 << c2` into an add immediate, so it's "free",
// and the combine should happen, to potentially allow further combines
// later.
- if (isLegalAddImmediate(ShiftedC1Int.getSExtValue()))
+ if (ShiftedC1Int.getMinSignedBits() <= 64 &&
+ isLegalAddImmediate(ShiftedC1Int.getSExtValue()))
return true;
// We can materialise `c1` in an add immediate, so it's "free", and the
// combine should be prevented.
- if (isLegalAddImmediate(C1Int.getSExtValue()))
+ if (C1Int.getMinSignedBits() <= 64 &&
+ isLegalAddImmediate(C1Int.getSExtValue()))
return false;
// Neither constant will fit into an immediate, so find materialisation
// costs.
int C1Cost = RISCVMatInt::getIntMatCost(C1Int, Ty.getSizeInBits(),
Subtarget.is64Bit());
int ShiftedC1Cost = RISCVMatInt::getIntMatCost(
ShiftedC1Int, Ty.getSizeInBits(), Subtarget.is64Bit());
// Materialising `c1` is cheaper than materialising `c1 << c2`, so the
// combine should be prevented.
if (C1Cost < ShiftedC1Cost)
return false;
}
}
return true;
}
// Reports a lower bound on the number of sign bits of a target node:
// 33 for the *W shift/divide/remainder nodes, 1 for everything else.
unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode(
    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
    unsigned Depth) const {
  switch (Op.getOpcode()) {
  case RISCVISD::SLLW:
  case RISCVISD::SRAW:
  case RISCVISD::SRLW:
  case RISCVISD::DIVW:
  case RISCVISD::DIVUW:
  case RISCVISD::REMUW:
    // TODO: As the result is sign-extended, this is conservatively correct. A
    // more precise answer could be calculated for SRAW depending on known
    // bits in the shift amount.
    return 33;
  default:
    return 1;
  }
}
// Expands the ReadCycleWide pseudo MI into a retry loop that reads the 64-bit
// cycle counter as two 32-bit halves.
//
// BB is split at MI: a new loop block performs the reads, and a new "done"
// block receives everything that followed MI together with BB's successor
// edges. MI is erased. Returns the done block.
//
// Declared static, like the other emit*Pseudo helpers in this file.
static MachineBasicBlock *emitReadCycleWidePseudo(MachineInstr &MI,
                                                  MachineBasicBlock *BB) {
  assert(MI.getOpcode() == RISCV::ReadCycleWide && "Unexpected instruction");

  // To read the 64-bit cycle CSR on a 32-bit target, we read the two halves.
  // Should the count have wrapped while it was being read, we need to try
  // again.
  // ...
  // read:
  // rdcycleh x3 # load high word of cycle
  // rdcycle  x2 # load low word of cycle
  // rdcycleh x4 # load high word of cycle
  // bne x3, x4, read # check if high word reads match, otherwise try again
  // ...

  MachineFunction &MF = *BB->getParent();
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction::iterator It = ++BB->getIterator();

  // Both new blocks are inserted right after BB, loop block first.
  MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB);
  MF.insert(It, LoopMBB);

  MachineBasicBlock *DoneMBB = MF.CreateMachineBasicBlock(LLVM_BB);
  MF.insert(It, DoneMBB);

  // Transfer the remainder of BB and its successor edges to DoneMBB.
  DoneMBB->splice(DoneMBB->begin(), BB,
                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
  DoneMBB->transferSuccessorsAndUpdatePHIs(BB);

  BB->addSuccessor(LoopMBB);

  MachineRegisterInfo &RegInfo = MF.getRegInfo();
  // Holds the second read of the high word, compared against the first.
  unsigned ReadAgainReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass);
  unsigned LoReg = MI.getOperand(0).getReg();
  unsigned HiReg = MI.getOperand(1).getReg();
  DebugLoc DL = MI.getDebugLoc();

  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
  BuildMI(LoopMBB, DL, TII->get(RISCV::CSRRS), HiReg)
      .addImm(RISCVSysReg::lookupSysRegByName("CYCLEH")->Encoding)
      .addReg(RISCV::X0);
  BuildMI(LoopMBB, DL, TII->get(RISCV::CSRRS), LoReg)
      .addImm(RISCVSysReg::lookupSysRegByName("CYCLE")->Encoding)
      .addReg(RISCV::X0);
  BuildMI(LoopMBB, DL, TII->get(RISCV::CSRRS), ReadAgainReg)
      .addImm(RISCVSysReg::lookupSysRegByName("CYCLEH")->Encoding)
      .addReg(RISCV::X0);

  // Retry if the high word changed between the two reads.
  BuildMI(LoopMBB, DL, TII->get(RISCV::BNE))
      .addReg(HiReg)
      .addReg(ReadAgainReg)
      .addMBB(LoopMBB);

  LoopMBB->addSuccessor(LoopMBB);
  LoopMBB->addSuccessor(DoneMBB);

  MI.eraseFromParent();

  return DoneMBB;
}
// Expands the SplitF64Pseudo MI: the FPR64 source (operand 2) is stored to
// the function's MoveF64 frame slot and then read back as two 32-bit words
// into the low (operand 0) and high (operand 1) result registers. MI is
// erased and BB is returned unchanged otherwise.
static MachineBasicBlock *emitSplitF64Pseudo(MachineInstr &MI,
                                             MachineBasicBlock *BB) {
  assert(MI.getOpcode() == RISCV::SplitF64Pseudo && "Unexpected instruction");

  MachineFunction &MF = *BB->getParent();
  DebugLoc DL = MI.getDebugLoc();
  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
  const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
  unsigned LoReg = MI.getOperand(0).getReg();
  unsigned HiReg = MI.getOperand(1).getReg();
  unsigned SrcReg = MI.getOperand(2).getReg();
  const TargetRegisterClass *SrcRC = &RISCV::FPR64RegClass;
  int FI = MF.getInfo<RISCVMachineFunctionInfo>()->getMoveF64FrameIndex();

  TII.storeRegToStackSlot(*BB, MI, SrcReg, MI.getOperand(2).isKill(), FI, SrcRC,
                          RI);

  // Each LW reads 4 bytes, so give each its own memory operand: size 4, at
  // offset 0 for the low word and offset 4 for the high word. A single
  // 8-byte operand at offset 0 would misdescribe both accesses. The
  // alignment argument of 8 is kept from the original descriptor.
  MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMOLo =
      MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 4, 8);
  MachineMemOperand *MMOHi = MF.getMachineMemOperand(
      MPI.getWithOffset(4), MachineMemOperand::MOLoad, 4, 8);

  BuildMI(*BB, MI, DL, TII.get(RISCV::LW), LoReg)
      .addFrameIndex(FI)
      .addImm(0)
      .addMemOperand(MMOLo);
  BuildMI(*BB, MI, DL, TII.get(RISCV::LW), HiReg)
      .addFrameIndex(FI)
      .addImm(4)
      .addMemOperand(MMOHi);
  MI.eraseFromParent(); // The pseudo instruction is gone now.
  return BB;
}
// Expands the BuildPairF64Pseudo MI: the low (operand 1) and high (operand 2)
// 32-bit registers are stored to the function's MoveF64 frame slot, which is
// then reloaded as an FPR64 into the destination (operand 0). MI is erased
// and BB is returned unchanged otherwise.
static MachineBasicBlock *emitBuildPairF64Pseudo(MachineInstr &MI,
                                                 MachineBasicBlock *BB) {
  assert(MI.getOpcode() == RISCV::BuildPairF64Pseudo &&
         "Unexpected instruction");

  MachineFunction &MF = *BB->getParent();
  DebugLoc DL = MI.getDebugLoc();
  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
  const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
  unsigned DstReg = MI.getOperand(0).getReg();
  unsigned LoReg = MI.getOperand(1).getReg();
  unsigned HiReg = MI.getOperand(2).getReg();
  const TargetRegisterClass *DstRC = &RISCV::FPR64RegClass;
  int FI = MF.getInfo<RISCVMachineFunctionInfo>()->getMoveF64FrameIndex();

  // Each SW writes 4 bytes, so give each its own memory operand: size 4, at
  // offset 0 for the low word and offset 4 for the high word. A single
  // 8-byte operand at offset 0 would misdescribe both accesses. The
  // alignment argument of 8 is kept from the original descriptor.
  MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMOLo =
      MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, 8);
  MachineMemOperand *MMOHi = MF.getMachineMemOperand(
      MPI.getWithOffset(4), MachineMemOperand::MOStore, 4, 8);

  BuildMI(*BB, MI, DL, TII.get(RISCV::SW))
      .addReg(LoReg, getKillRegState(MI.getOperand(1).isKill()))
      .addFrameIndex(FI)
      .addImm(0)
      .addMemOperand(MMOLo);
  BuildMI(*BB, MI, DL, TII.get(RISCV::SW))
      .addReg(HiReg, getKillRegState(MI.getOperand(2).isKill()))
      .addFrameIndex(FI)
      .addImm(4)
      .addMemOperand(MMOHi);
  TII.loadRegFromStackSlot(*BB, MI, DstReg, FI, DstRC, RI);
  MI.eraseFromParent(); // The pseudo instruction is gone now.
  return BB;
}
// Returns true if MI is one of the Select_*_Using_CC_GPR pseudo-instructions.
static bool isSelectPseudo(MachineInstr &MI) {
  const unsigned Opc = MI.getOpcode();
  return Opc == RISCV::Select_GPR_Using_CC_GPR ||
         Opc == RISCV::Select_FPR32_Using_CC_GPR ||
         Opc == RISCV::Select_FPR64_Using_CC_GPR;
}
// Expands the select pseudo MI, together with any following selects in BB
// that can share its control flow, into a conditional branch plus one PHI per
// select.
//
// Operand layout read from each select pseudo: 0 = destination register,
// 1 = LHS, 2 = RHS, 3 = condition code (immediate), 4 = value taken when the
// PHI is reached from HeadMBB, 5 = value taken when reached from IfFalseMBB.
//
// Returns the newly created tail block, which receives every instruction that
// followed the last select of the sequence. The select pseudos are erased.
static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI,
MachineBasicBlock *BB) {
// To "insert" Select_* instructions, we actually have to insert the triangle
// control-flow pattern. The incoming instructions know the destination vreg
// to set, the condition code register to branch on, the true/false values to
// select between, and the condcode to use to select the appropriate branch.
//
// We produce the following control flow:
// HeadMBB
// | \
// | IfFalseMBB
// | /
// TailMBB
//
// When we find a sequence of selects we attempt to optimize their emission
// by sharing the control flow. Currently we only handle cases where we have
// multiple selects with the exact same condition (same LHS, RHS and CC).
// The selects may be interleaved with other instructions if the other
// instructions meet some requirements we deem safe:
// - They are debug instructions. Otherwise,
// - They do not have side-effects, do not access memory and their inputs do
// not depend on the results of the select pseudo-instructions.
// The TrueV/FalseV operands of the selects cannot depend on the result of
// previous selects in the sequence.
// These conditions could be further relaxed. See the X86 target for a
// related approach and more information.
unsigned LHS = MI.getOperand(1).getReg();
unsigned RHS = MI.getOperand(2).getReg();
auto CC = static_cast<ISD::CondCode>(MI.getOperand(3).getImm());
SmallVector<MachineInstr *, 4> SelectDebugValues;
// Destination registers of every select accepted into the sequence so far.
SmallSet<unsigned, 4> SelectDests;
SelectDests.insert(MI.getOperand(0).getReg());
MachineInstr *LastSelectPseudo = &MI;
// Scan forward from MI to find the last select that can share this control
// flow. The scan stops at the first instruction that violates the conditions
// described above.
for (auto E = BB->end(), SequenceMBBI = MachineBasicBlock::iterator(MI);
SequenceMBBI != E; ++SequenceMBBI) {
if (SequenceMBBI->isDebugInstr())
continue;
else if (isSelectPseudo(*SequenceMBBI)) {
// A select joins the sequence only if it compares the same LHS/RHS with the
// same condition code and neither of its value operands is the result of an
// earlier select in the sequence.
if (SequenceMBBI->getOperand(1).getReg() != LHS ||
SequenceMBBI->getOperand(2).getReg() != RHS ||
SequenceMBBI->getOperand(3).getImm() != CC ||
SelectDests.count(SequenceMBBI->getOperand(4).getReg()) ||
SelectDests.count(SequenceMBBI->getOperand(5).getReg()))
break;
LastSelectPseudo = &*SequenceMBBI;
SequenceMBBI->collectDebugValues(SelectDebugValues);
SelectDests.insert(SequenceMBBI->getOperand(0).getReg());
} else {
if (SequenceMBBI->hasUnmodeledSideEffects() ||
SequenceMBBI->mayLoadOrStore())
break;
// An interleaved instruction must not read the result of any select in the
// sequence.
if (llvm::any_of(SequenceMBBI->operands(), [&](MachineOperand &MO) {
return MO.isReg() && MO.isUse() && SelectDests.count(MO.getReg());
}))
break;
}
}
const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
DebugLoc DL = MI.getDebugLoc();
MachineFunction::iterator I = ++BB->getIterator();
MachineBasicBlock *HeadMBB = BB;
MachineFunction *F = BB->getParent();
MachineBasicBlock *TailMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *IfFalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
// Both new blocks are inserted right after BB: IfFalseMBB first, then TailMBB.
F->insert(I, IfFalseMBB);
F->insert(I, TailMBB);
// Transfer debug instructions associated with the selects to TailMBB.
for (MachineInstr *DebugInstr : SelectDebugValues) {
TailMBB->push_back(DebugInstr->removeFromParent());
}
// Move all instructions after the sequence to TailMBB.
TailMBB->splice(TailMBB->end(), HeadMBB,
std::next(LastSelectPseudo->getIterator()), HeadMBB->end());
// Update machine-CFG edges by transferring all successors of the current
// block to the new block which will contain the Phi nodes for the selects.
TailMBB->transferSuccessorsAndUpdatePHIs(HeadMBB);
// Set the successors for HeadMBB.
HeadMBB->addSuccessor(IfFalseMBB);
HeadMBB->addSuccessor(TailMBB);
// Insert appropriate branch.
// The branch targets TailMBB, so the PHIs below pair operand 4 with HeadMBB.
unsigned Opcode = getBranchOpcodeForIntCondCode(CC);
BuildMI(HeadMBB, DL, TII.get(Opcode))
.addReg(LHS)
.addReg(RHS)
.addMBB(TailMBB);
// IfFalseMBB just falls through to TailMBB.
IfFalseMBB->addSuccessor(TailMBB);
// Create PHIs for all of the select pseudo-instructions.
auto SelectMBBI = MI.getIterator();
auto SelectEnd = std::next(LastSelectPseudo->getIterator());
auto InsertionPoint = TailMBB->begin();
while (SelectMBBI != SelectEnd) {
// Remember the successor first: the current select may be erased below.
auto Next = std::next(SelectMBBI);
if (isSelectPseudo(*SelectMBBI)) {
// %Result = phi [ %TrueValue, HeadMBB ], [ %FalseValue, IfFalseMBB ]
BuildMI(*TailMBB, InsertionPoint, SelectMBBI->getDebugLoc(),
TII.get(RISCV::PHI), SelectMBBI->getOperand(0).getReg())
.addReg(SelectMBBI->getOperand(4).getReg())
.addMBB(HeadMBB)
.addReg(SelectMBBI->getOperand(5).getReg())
.addMBB(IfFalseMBB);
SelectMBBI->eraseFromParent();
}
SelectMBBI = Next;
}
// PHIs were just created, so clear the function's NoPHIs property.
F->getProperties().reset(MachineFunctionProperties::Property::NoPHIs);
return TailMBB;
}
// Dispatches a pseudo-instruction that needs custom insertion to the helper
// that expands it, and returns the block that helper reports. Any opcode not
// listed here is a programming error.
MachineBasicBlock *
RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                                 MachineBasicBlock *BB) const {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected instr type to insert");
  case RISCV::ReadCycleWide:
    // The message now names the actual opcode (ReadCycleWide); it previously
    // said "ReadCycleWrite".
    assert(!Subtarget.is64Bit() &&
           "ReadCycleWide is only to be used on riscv32");
    return emitReadCycleWidePseudo(MI, BB);
  case RISCV::Select_GPR_Using_CC_GPR:
  case RISCV::Select_FPR32_Using_CC_GPR:
  case RISCV::Select_FPR64_Using_CC_GPR:
    return emitSelectPseudo(MI, BB);
  case RISCV::BuildPairF64Pseudo:
    return emitBuildPairF64Pseudo(MI, BB);
  case RISCV::SplitF64Pseudo:
    return emitSplitF64Pseudo(MI, BB);
  }
}
// Calling Convention Implementation.
// The expectations for frontend ABI lowering vary from target to target.
// Ideally, an LLVM frontend would be able to avoid worrying about many ABI
// details, but this is a longer term goal. For now, we simply try to keep the
// role of the frontend as simple and well-defined as possible. The rules can
// be summarised as:
// * Never split up large scalar arguments. We handle them here.
// * If a hardfloat calling convention is being used, and the struct may be
// passed in a pair of registers (fp+fp, int+fp), and both registers are
// available, then pass as two separate arguments. If either the GPRs or FPRs
// are exhausted, then pass according to the rule below.
// * If a struct could never be passed in registers or directly in a stack
// slot (as it is larger than 2*XLEN and the floating point rules don't
// apply), then pass it using a pointer with the byval attribute.
// * If a struct is less than 2*XLEN, then coerce to either a two-element
// word-sized array or a 2*XLEN scalar (depending on alignment).
// * The frontend can determine whether a struct is returned by reference or
// not based on its size and fields. If it will be returned by reference, the
// frontend must modify the prototype so a pointer with the sret annotation is
// passed as the first argument. This is not necessary for large scalar
// returns.
// * Struct return values and varargs should be coerced to structs containing
// register-size fields in the same situations they would be for fixed
// arguments.
// Register tables used by the calling-convention code that follows.
//
// ArgGPRs: X10..X17, the integer registers handed out via
// State.AllocateReg(ArgGPRs).
static const MCPhysReg ArgGPRs[] = {
RISCV::X10, RISCV::X11, RISCV::X12, RISCV::X13,
RISCV::X14, RISCV::X15, RISCV::X16, RISCV::X17
};
// ArgFPR32s: the 32-bit views F10_32..F17_32 of the floating-point registers.
static const MCPhysReg ArgFPR32s[] = {
RISCV::F10_32, RISCV::F11_32, RISCV::F12_32, RISCV::F13_32,
RISCV::F14_32, RISCV::F15_32, RISCV::F16_32, RISCV::F17_32
};
// ArgFPR64s: the 64-bit views F10_64..F17_64 of the floating-point registers.
static const MCPhysReg ArgFPR64s[] = {
RISCV::F10_64, RISCV::F11_64, RISCV::F12_64, RISCV::F13_64,
RISCV::F14_64, RISCV::F15_64, RISCV::F16_64, RISCV::F17_64
};
// Assigns locations to a 2*XLEN argument that has been split into two XLEN
// halves. The first half is described by VA1/ArgFlags1 and the second by
// ValNo2/ValVT2/LocVT2. Each half takes the next free argument GPR if there
// is one and a stack slot otherwise. Always returns false.
static bool CC_RISCVAssign2XLen(unsigned XLen, CCState &State, CCValAssign VA1,
                                ISD::ArgFlagsTy ArgFlags1, unsigned ValNo2,
                                MVT ValVT2, MVT LocVT2,
                                ISD::ArgFlagsTy ArgFlags2) {
  const unsigned XLenInBytes = XLen / 8;

  // Places the second half in an XLEN-sized, XLEN-aligned stack slot (no
  // additional alignment).
  auto PlaceSecondHalfOnStack = [&] {
    State.addLoc(CCValAssign::getMem(
        ValNo2, ValVT2, State.AllocateStack(XLenInBytes, XLenInBytes), LocVT2,
        CCValAssign::Full));
  };

  const unsigned FirstReg = State.AllocateReg(ArgGPRs);
  if (!FirstReg) {
    // Both halves must be passed on the stack, with proper alignment: the
    // first slot honours the argument's original alignment when that is
    // larger than XLEN bytes.
    unsigned StackAlign = std::max(XLenInBytes, ArgFlags1.getOrigAlign());
    State.addLoc(
        CCValAssign::getMem(VA1.getValNo(), VA1.getValVT(),
                            State.AllocateStack(XLenInBytes, StackAlign),
                            VA1.getLocVT(), CCValAssign::Full));
    PlaceSecondHalfOnStack();
    return false;
  }

  // At least one half can be passed via register.
  State.addLoc(CCValAssign::getReg(VA1.getValNo(), VA1.getValVT(), FirstReg,
                                   VA1.getLocVT(), CCValAssign::Full));

  const unsigned SecondReg = State.AllocateReg(ArgGPRs);
  if (SecondReg) {
    // The second half can also be passed via register.
    State.addLoc(CCValAssign::getReg(ValNo2, ValVT2, SecondReg, LocVT2,
                                     CCValAssign::Full));
  } else {
    // The second half is passed via the stack.
    PlaceSecondHalfOnStack();
  }
  return false;
}
// Implements the RISC-V calling convention. Returns true upon failure.
static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo,
MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed,
bool IsRet, Type *OrigTy) {
unsigned XLen = DL.getLargestLegalIntTypeSizeInBits();
assert(XLen == 32 || XLen == 64);
MVT XLenVT = XLen == 32 ? MVT::i32 : MVT::i64;
// Any return value split in to more than two values can't be returned
// directly.
if (IsRet && ValNo > 1)
return true;
// UseGPRForF32 if targeting one of the soft-float ABIs, if passing a
// variadic argument, or if no F32 argument registers are available.
bool UseGPRForF32 = true;
// UseGPRForF64 if targeting soft-float ABIs or an FLEN=32 ABI, if passing a
// variadic argument, or if no F64 argument registers are available.
bool UseGPRForF64 = true;
switch (ABI) {
default:
llvm_unreachable("Unexpected ABI");
case RISCVABI::ABI_ILP32:
case RISCVABI::ABI_LP64:
break;
case RISCVABI::ABI_ILP32F:
case RISCVABI::ABI_LP64F:
UseGPRForF32 = !IsFixed;
break;
case RISCVABI::ABI_ILP32D:
case RISCVABI::ABI_LP64D:
UseGPRForF32 = !IsFixed;
UseGPRForF64 = !IsFixed;
break;
}
if (State.getFirstUnallocated(ArgFPR32s) == array_lengthof(ArgFPR32s))
UseGPRForF32 = true;
if (State.getFirstUnallocated(ArgFPR64s) == array_lengthof(ArgFPR64s))
UseGPRForF64 = true;
// From this point on, rely on UseGPRForF32, UseGPRForF64 and similar local
// variables rather than directly checking against the target ABI.
if (UseGPRForF32 && ValVT == MVT::f32) {
LocVT = XLenVT;
LocInfo = CCValAssign::BCvt;
} else if (UseGPRForF64 && XLen == 64 && ValVT == MVT::f64) {
LocVT = MVT::i64;
LocInfo = CCValAssign::BCvt;
}
// If this is a variadic argument, the RISC-V calling convention requires
// that it is assigned an 'even' or 'aligned' register if it has 8-byte
// alignment (RV32) or 16-byte alignment (RV64). An aligned register should
// be used regardless of whether the original argument was split during
// legalisation or not. The argument will not be passed by registers if the
// original type is larger than 2*XLEN, so the register alignment rule does
// not apply.
unsigned TwoXLenInBytes = (2 * XLen) / 8;
if (!IsFixed && ArgFlags.getOrigAlign() == TwoXLenInBytes &&
DL.getTypeAllocSize(OrigTy) == TwoXLenInBytes) {
unsigned RegIdx = State.getFirstUnallocated(ArgGPRs);
// Skip 'odd' register if necessary.
if (RegIdx != array_lengthof(ArgGPRs) && RegIdx % 2 == 1)
State.AllocateReg(ArgGPRs);
}
SmallVectorImpl<CCValAssign> &PendingLocs = State.getPendingLocs();
SmallVectorImpl<ISD::ArgFlagsTy> &PendingArgFlags =
State.getPendingArgFlags();
assert(PendingLocs.size() == PendingArgFlags.size() &&
"PendingLocs and PendingArgFlags out of sync");
// Handle passing f64 on RV32D with a soft float ABI or when floating point
// registers are exhausted.
if (UseGPRForF64 && XLen == 32 && ValVT == MVT::f64) {
assert(!ArgFlags.isSplit() && PendingLocs.empty() &&
"Can't lower f64 if it is split");
// Depending on available argument GPRS, f64 may be passed in a pair of
// GPRs, split between a GPR and the stack, or passed completely on the
// stack. LowerCall/LowerFormalArguments/LowerReturn must recognise these
// cases.
unsigned Reg = State.AllocateReg(ArgGPRs);
LocVT = MVT::i32;
if (!Reg) {
unsigned StackOffset = State.AllocateStack(8, 8);
State.addLoc(
CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo));
return false;
}
if (!State.AllocateReg(ArgGPRs))
State.AllocateStack(4, 4);
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
return false;
}
// Split arguments might be passed indirectly, so keep track of the pending
// values.
if (ArgFlags.isSplit() || !PendingLocs.empty()) {
LocVT = XLenVT;
LocInfo = CCValAssign::Indirect;
PendingLocs.push_back(
CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
PendingArgFlags.push_back(ArgFlags);
if (!ArgFlags.isSplitEnd()) {
return false;
}
}
// If the split argument only had two elements, it should be passed directly
// in registers or on the stack.
if (ArgFlags.isSplitEnd() && PendingLocs.size() <= 2) {
assert(PendingLocs.size() == 2 && "Unexpected PendingLocs.size()");
// Apply the normal calling convention rules to the first half of the
// split argument.
CCValAssign VA = PendingLocs[0];
ISD::ArgFlagsTy AF = PendingArgFlags[0];
PendingLocs.clear();
PendingArgFlags.clear();
return CC_RISCVAssign2XLen(XLen, State, VA, AF, ValNo, ValVT, LocVT,
ArgFlags);
}
// Allocate to a register if possible, or else a stack slot.
unsigned Reg;
if (ValVT == MVT::f32 && !UseGPRForF32)
Reg = State.AllocateReg(ArgFPR32s, ArgFPR64s);
else if (ValVT == MVT::f64 && !UseGPRForF64)
Reg = State.AllocateReg(ArgFPR64s, ArgFPR32s);
else
Reg = State.AllocateReg(ArgGPRs);
unsigned StackOffset = Reg ? 0 : State.AllocateStack(XLen / 8, XLen / 8);
// If we reach this point and PendingLocs is non-empty, we must be at the
// end of a split argument that must be passed indirectly.
if (!PendingLocs.empty()) {
assert(ArgFlags.isSplitEnd() && "Expected ArgFlags.isSplitEnd()");
assert(PendingLocs.size() > 2 && "Unexpected PendingLocs.size()");
for (auto &It : PendingLocs) {
if (Reg)
It.convertToReg(Reg);
else
It.convertToMem(StackOffset);
State.addLoc(It);
}
PendingLocs.clear();
PendingArgFlags.clear();
return false;
}
assert((!UseGPRForF32 || !UseGPRForF64 || LocVT == XLenVT) &&
"Expected an XLenVT at this stage");
if (Reg) {
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
return false;
}
// When an f32 or f64 is passed on the stack, no bit-conversion is needed.
if (ValVT == MVT::f32 || ValVT == MVT::f64) {
LocVT = ValVT;
LocInfo = CCValAssign::Full;
}
State.addLoc(CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo));
return false;
}
// Assign a location to every incoming value in Ins by running CC_RISCV over
// it and recording the result in CCInfo.
//
// The original IR type handed to CC_RISCV is the return type of MF's function
// when IsRet is set, the matching parameter type when the value is an original
// argument, and null otherwise. Incoming values are always treated as fixed
// (non-variadic). A value that CC_RISCV rejects is a fatal lowering error.
void RISCVTargetLowering::analyzeInputArgs(
    MachineFunction &MF, CCState &CCInfo,
    const SmallVectorImpl<ISD::InputArg> &Ins, bool IsRet) const {
  unsigned NumArgs = Ins.size();
  FunctionType *FType = MF.getFunction().getFunctionType();
  // The target ABI is a property of the subtarget, not of the individual
  // argument, so query it once rather than on every iteration.
  RISCVABI::ABI ABI = MF.getSubtarget<RISCVSubtarget>().getTargetABI();

  for (unsigned i = 0; i != NumArgs; ++i) {
    MVT ArgVT = Ins[i].VT;
    ISD::ArgFlagsTy ArgFlags = Ins[i].Flags;

    Type *ArgTy = nullptr;
    if (IsRet)
      ArgTy = FType->getReturnType();
    else if (Ins[i].isOrigArg())
      ArgTy = FType->getParamType(Ins[i].getOrigArgIndex());

    if (CC_RISCV(MF.getDataLayout(), ABI, i, ArgVT, ArgVT, CCValAssign::Full,
                 ArgFlags, CCInfo, /*IsFixed=*/true, IsRet, ArgTy)) {
      LLVM_DEBUG(dbgs() << "InputArg #" << i << " has unhandled type "
                        << EVT(ArgVT).getEVTString() << '\n');
      llvm_unreachable(nullptr);
    }
  }
}
// Assign a location to every outgoing value in Outs by running CC_RISCV over
// it and recording the result in CCInfo. When CLI is provided, the original IR
// type of each value is taken from the call's argument list; otherwise no
// original type is supplied. A value that CC_RISCV rejects is a fatal lowering
// error.
void RISCVTargetLowering::analyzeOutputArgs(
    MachineFunction &MF, CCState &CCInfo,
    const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsRet,
    CallLoweringInfo *CLI) const {
  for (unsigned Idx = 0, End = Outs.size(); Idx != End; ++Idx) {
    const ISD::OutputArg &Out = Outs[Idx];
    MVT VT = Out.VT;

    // The original type is only known when lowering a call.
    Type *OrigTy = nullptr;
    if (CLI)
      OrigTy = CLI->getArgs()[Out.OrigArgIndex].Ty;

    RISCVABI::ABI ABI = MF.getSubtarget<RISCVSubtarget>().getTargetABI();
    const bool Unhandled =
        CC_RISCV(MF.getDataLayout(), ABI, Idx, VT, VT, CCValAssign::Full,
                 Out.Flags, CCInfo, Out.IsFixed, IsRet, OrigTy);
    if (!Unhandled)
      continue;

    LLVM_DEBUG(dbgs() << "OutputArg #" << Idx << " has unhandled type "
                      << EVT(VT).getEVTString() << "\n");
    llvm_unreachable(nullptr);
  }
}
// Turn a value that arrived in its location type (VA.getLocVT()) back into the
// value type (VA.getValVT()). Must not be used for CCValAssign::Indirect
// assignments.
static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDValue Val,
                                   const CCValAssign &VA, const SDLoc &DL) {
  switch (VA.getLocInfo()) {
  case CCValAssign::Full:
    // Location and value types already agree.
    return Val;
  case CCValAssign::BCvt: {
    // An f32 held in an i64 location cannot be a plain bitcast (the widths
    // differ), so it goes through the dedicated RV64 move node.
    const bool IsF32InI64 =
        VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32;
    if (IsF32InI64)
      return DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, Val);
    return DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
  }
  default:
    llvm_unreachable("Unexpected CCValAssign::LocInfo");
  }
}
// Read an incoming value that was assigned to a register: mark the physical
// register live-in, copy it out through a fresh virtual register and convert
// it to the value type. For CCValAssign::Indirect assignments the raw copy
// (the address) is returned and the caller must load the full value itself.
static SDValue unpackFromRegLoc(SelectionDAG &DAG, SDValue Chain,
                                const CCValAssign &VA, const SDLoc &DL) {
  const EVT LocVT = VA.getLocVT();

  // Choose the register class that matches the location type.
  const TargetRegisterClass *RegClass = nullptr;
  switch (LocVT.getSimpleVT().SimpleTy) {
  case MVT::f64:
    RegClass = &RISCV::FPR64RegClass;
    break;
  case MVT::f32:
    RegClass = &RISCV::FPR32RegClass;
    break;
  case MVT::i32:
  case MVT::i64:
    RegClass = &RISCV::GPRRegClass;
    break;
  default:
    llvm_unreachable("Unexpected register type");
  }

  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  const unsigned VReg = MRI.createVirtualRegister(RegClass);
  MRI.addLiveIn(VA.getLocReg(), VReg);
  SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, LocVT);

  if (VA.getLocInfo() != CCValAssign::Indirect)
    return convertLocVTToValVT(DAG, Copy, VA, DL);
  return Copy;
}
// Convert an outgoing value from its value type (VA.getValVT()) to the type of
// the location it was assigned to (VA.getLocVT()). Only CCValAssign::Full and
// CCValAssign::BCvt assignments are expected here.
static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDValue Val,
                                   const CCValAssign &VA, const SDLoc &DL) {
  switch (VA.getLocInfo()) {
  case CCValAssign::Full:
    // Nothing to convert.
    return Val;
  case CCValAssign::BCvt: {
    // f32 -> i64 changes width, so it uses the dedicated RV64 move node
    // instead of a bitcast.
    const bool IsF32ToI64 =
        VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32;
    if (IsF32ToI64)
      return DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64, Val);
    return DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Val);
  }
  default:
    llvm_unreachable("Unexpected CCValAssign::LocInfo");
  }
}
// Read an incoming value that was assigned to a stack slot: create a fixed
// frame object at the assigned offset and load the value from it. For
// CCValAssign::Indirect assignments the loaded value is the address only, and
// the caller must load the full value itself.
static SDValue unpackFromMemLoc(SelectionDAG &DAG, SDValue Chain,
                                const CCValAssign &VA, const SDLoc &DL) {
  MachineFunction &MF = DAG.getMachineFunction();
  const EVT LocVT = VA.getLocVT();
  const EVT ValVT = VA.getValVT();
  const EVT PtrVT =
      MVT::getIntegerVT(DAG.getDataLayout().getPointerSizeInBits(0));

  // The slot is sized after the value type and lives at the assigned offset.
  const int FrameIdx = MF.getFrameInfo().CreateFixedObject(
      ValVT.getSizeInBits() / 8, VA.getLocMemOffset(), /*Immutable=*/true);
  SDValue SlotAddr = DAG.getFrameIndex(FrameIdx, PtrVT);

  // Every supported LocInfo is loaded without extension.
  ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
  switch (VA.getLocInfo()) {
  case CCValAssign::Full:
  case CCValAssign::Indirect:
  case CCValAssign::BCvt:
    break;
  default:
    llvm_unreachable("Unexpected CCValAssign::LocInfo");
  }

  return DAG.getExtLoad(ExtType, DL, LocVT, Chain, SlotAddr,
                        MachinePointerInfo::getFixedStack(MF, FrameIdx),
                        ValVT);
}
// Rebuild an incoming f64 whose location type is i32. The value is either
// entirely on the stack, in a pair of consecutive GPRs, or split with the low
// half in X17 and the high half at stack offset 0.
static SDValue unpackF64OnRV32DSoftABI(SelectionDAG &DAG, SDValue Chain,
                                       const CCValAssign &VA, const SDLoc &DL) {
  assert(VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64 &&
         "Unexpected VA");
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  // Whole value on the stack: a single 8-byte load.
  if (VA.isMemLoc()) {
    const int SlotIdx = FrameInfo.CreateFixedObject(8, VA.getLocMemOffset(),
                                                    /*Immutable=*/true);
    SDValue SlotAddr = DAG.getFrameIndex(SlotIdx, MVT::i32);
    return DAG.getLoad(MVT::f64, DL, Chain, SlotAddr,
                       MachinePointerInfo::getFixedStack(MF, SlotIdx));
  }

  assert(VA.isRegLoc() && "Expected register VA assignment");
  MachineRegisterInfo &MRI = MF.getRegInfo();

  // The low half always comes from the assigned register.
  const unsigned LoVReg = MRI.createVirtualRegister(&RISCV::GPRRegClass);
  MRI.addLiveIn(VA.getLocReg(), LoVReg);
  SDValue LoHalf = DAG.getCopyFromReg(Chain, DL, LoVReg, MVT::i32);

  SDValue HiHalf;
  if (VA.getLocReg() != RISCV::X17) {
    // The high half is in the next register.
    const unsigned HiVReg = MRI.createVirtualRegister(&RISCV::GPRRegClass);
    MRI.addLiveIn(VA.getLocReg() + 1, HiVReg);
    HiHalf = DAG.getCopyFromReg(Chain, DL, HiVReg, MVT::i32);
  } else {
    // X17 held the low half, so the high half is read from stack offset 0.
    const int SlotIdx = FrameInfo.CreateFixedObject(4, 0, /*Immutable=*/true);
    SDValue SlotAddr = DAG.getFrameIndex(SlotIdx, MVT::i32);
    HiHalf = DAG.getLoad(MVT::i32, DL, Chain, SlotAddr,
                         MachinePointerInfo::getFixedStack(MF, SlotIdx));
  }

  return DAG.getNode(RISCVISD::BuildPairF64, DL, MVT::f64, LoHalf, HiHalf);
}
// Lower the incoming (formal) arguments of the current function.
//
// Each value in Ins is assigned a location through analyzeInputArgs, read from
// its register or fixed stack slot, and appended to InVals. For variadic
// functions the argument GPRs that were not allocated to named arguments are
// additionally stored to a varargs save area, and the frame index and size of
// that area are recorded in RISCVMachineFunctionInfo. Only the C and Fast
// calling conventions are accepted. Returns the (possibly updated) chain.
//
// Transform physical registers into virtual registers.
SDValue RISCVTargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
switch (CallConv) {
default:
report_fatal_error("Unsupported calling convention");
case CallingConv::C:
case CallingConv::Fast:
break;
}
MachineFunction &MF = DAG.getMachineFunction();
const Function &Func = MF.getFunction();
// Interrupt handlers take no arguments and must name one of the three
// supported kinds; anything else is rejected here.
if (Func.hasFnAttribute("interrupt")) {
if (!Func.arg_empty())
report_fatal_error(
"Functions with the interrupt attribute cannot have arguments!");
StringRef Kind =
MF.getFunction().getFnAttribute("interrupt").getValueAsString();
if (!(Kind == "user" || Kind == "supervisor" || Kind == "machine"))
report_fatal_error(
"Function interrupt attribute argument not supported!");
}
EVT PtrVT = getPointerTy(DAG.getDataLayout());
MVT XLenVT = Subtarget.getXLenVT();
unsigned XLenInBytes = Subtarget.getXLen() / 8;
// Used with vargs to accumulate store chains.
std::vector<SDValue> OutChains;
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
analyzeInputArgs(MF, CCInfo, Ins, /*IsRet=*/false);
// Note: the same index i is used for both ArgLocs and Ins below.
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDValue ArgValue;
// Passing f64 on RV32D with a soft float ABI must be handled as a special
// case.
if (VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64)
ArgValue = unpackF64OnRV32DSoftABI(DAG, Chain, VA, DL);
else if (VA.isRegLoc())
ArgValue = unpackFromRegLoc(DAG, Chain, VA, DL);
else
ArgValue = unpackFromMemLoc(DAG, Chain, VA, DL);
if (VA.getLocInfo() == CCValAssign::Indirect) {
// If the original argument was split and passed by reference (e.g. i128
// on RV32), we need to load all parts of it here (using the same
// address).
InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue,
MachinePointerInfo()));
unsigned ArgIndex = Ins[i].OrigArgIndex;
assert(Ins[i].PartOffset == 0);
// Consume the remaining parts of the same original argument; each one is
// loaded from the shared address plus its PartOffset, and i is advanced
// so the outer loop does not visit those parts again.
while (i + 1 != e && Ins[i + 1].OrigArgIndex == ArgIndex) {
CCValAssign &PartVA = ArgLocs[i + 1];
unsigned PartOffset = Ins[i + 1].PartOffset;
SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, ArgValue,
DAG.getIntPtrConstant(PartOffset, DL));
InVals.push_back(DAG.getLoad(PartVA.getValVT(), DL, Chain, Address,
MachinePointerInfo()));
++i;
}
continue;
}
InVals.push_back(ArgValue);
}
if (IsVarArg) {
ArrayRef<MCPhysReg> ArgRegs = makeArrayRef(ArgGPRs);
// Index of the first argument GPR not taken by a named argument.
unsigned Idx = CCInfo.getFirstUnallocated(ArgRegs);
const TargetRegisterClass *RC = &RISCV::GPRRegClass;
MachineFrameInfo &MFI = MF.getFrameInfo();
MachineRegisterInfo &RegInfo = MF.getRegInfo();
RISCVMachineFunctionInfo *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
// Offset of the first variable argument from stack pointer, and size of
// the vararg save area. For now, the varargs save area is either zero or
// large enough to hold a0-a7.
int VaArgOffset, VarArgsSaveSize;
// If all registers are allocated, then all varargs must be passed on the
// stack and we don't need to save any argregs.
if (ArgRegs.size() == Idx) {
VaArgOffset = CCInfo.getNextStackOffset();
VarArgsSaveSize = 0;
} else {
// One XLEN-sized slot per unallocated register, placed at negative offsets.
VarArgsSaveSize = XLenInBytes * (ArgRegs.size() - Idx);
VaArgOffset = -VarArgsSaveSize;
}
// Record the frame index of the first variable argument
// which is a value necessary to VASTART.
int FI = MFI.CreateFixedObject(XLenInBytes, VaArgOffset, true);
RVFI->setVarArgsFrameIndex(FI);
// If saving an odd number of registers then create an extra stack slot to
// ensure that the frame pointer is 2*XLEN-aligned, which in turn ensures
// offsets to even-numbered registers remain 2*XLEN-aligned.
if (Idx % 2) {
FI = MFI.CreateFixedObject(XLenInBytes, VaArgOffset - (int)XLenInBytes,
true);
VarArgsSaveSize += XLenInBytes;
}
// Copy the integer registers that may have been used for passing varargs
// to the vararg save area.
for (unsigned I = Idx; I < ArgRegs.size();
++I, VaArgOffset += XLenInBytes) {
const unsigned Reg = RegInfo.createVirtualRegister(RC);
RegInfo.addLiveIn(ArgRegs[I], Reg);
SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, XLenVT);
FI = MFI.CreateFixedObject(XLenInBytes, VaArgOffset, true);
SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
SDValue Store = DAG.getStore(Chain, DL, ArgValue, PtrOff,
MachinePointerInfo::getFixedStack(MF, FI));
// Clear the IR value attached to the store's memory operand.
cast<StoreSDNode>(Store.getNode())
->getMemOperand()
->setValue((Value *)nullptr);
OutChains.push_back(Store);
}
RVFI->setVarArgsSaveSize(VarArgsSaveSize);
}
// All stores are grouped in one node to allow the matching between
// the size of Ins and InVals. This only happens for vararg functions.
if (!OutChains.empty()) {
OutChains.push_back(Chain);
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
}
return Chain;
}
/// isEligibleForTailCallOptimization - Decide whether the call described by
/// CLI may be lowered as a tail call. CCInfo and ArgLocs hold the result of
/// analysing the call's outgoing arguments. Returns false as soon as any of
/// the conditions below rules the optimization out.
/// Note: This is modelled after ARM's IsEligibleForTailCallOptimization.
bool RISCVTargetLowering::isEligibleForTailCallOptimization(
    CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF,
    const SmallVector<CCValAssign, 16> &ArgLocs) const {
  const Function &CallerFn = MF.getFunction();
  const SmallVectorImpl<ISD::OutputArg> &OutArgs = CLI.Outs;
  const CallingConv::ID CalleeConv = CLI.CallConv;
  const CallingConv::ID CallerConv = CallerFn.getCallingConv();

  // The caller opted out through the "disable-tail-calls" attribute.
  StringRef DisableTailCalls =
      CallerFn.getFnAttribute("disable-tail-calls").getValueAsString();
  if (DisableTailCalls == "true")
    return false;

  // Exception-handling functions need a special set of instructions to
  // indicate a return to the hardware. Tail-calling another function would
  // probably break this.
  // TODO: The "interrupt" attribute isn't currently defined by RISC-V. This
  // should be expanded as new function attributes are introduced.
  if (CallerFn.hasFnAttribute("interrupt"))
    return false;

  // Variadic calls are never tail-called.
  if (CLI.IsVarArg)
    return false;

  // Any argument passed on the stack rules out a tail call.
  if (CCInfo.getNextStackOffset() != 0)
    return false;

  // Values larger than 2*XLEN (e.g. fp128, i128) are passed indirectly: only
  // their address travels in a register or on the stack, while the value
  // itself usually needs stack space of its own. The stack-offset check above
  // does not see that space, so indirect assignments are rejected explicitly.
  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
    if (ArgLocs[I].getLocInfo() == CCValAssign::Indirect)
      return false;
  }

  // Neither the caller nor the callee may use struct-return semantics.
  if (CallerFn.hasStructRetAttr())
    return false;
  if (!OutArgs.empty() && OutArgs[0].Flags.isSRet())
    return false;

  // Externally-defined functions with weak linkage should not be
  // tail-called. The behaviour of branch instructions in this situation (as
  // used for tail calls) is implementation-defined, so we cannot rely on the
  // linker replacing the tail call with a return.
  if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(CLI.Callee)) {
    if (GA->getGlobal()->hasExternalWeakLinkage())
      return false;
  }

  // When the conventions differ, the callee must preserve every register the
  // caller is required to preserve.
  const RISCVRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  const uint32_t *CallerMask = RegInfo->getCallPreservedMask(MF, CallerConv);
  if (CalleeConv != CallerConv) {
    const uint32_t *CalleeMask = RegInfo->getCallPreservedMask(MF, CalleeConv);
    if (!RegInfo->regmaskSubsetEqual(CallerMask, CalleeMask))
      return false;
  }

  // Byval parameters hand the function a pointer directly into the stack area
  // we want to reuse during a tail call. Working around this *is* possible
  // but less efficient and uglier in LowerCall.
  for (unsigned I = 0, E = OutArgs.size(); I != E; ++I) {
    if (OutArgs[I].Flags.isByVal())
      return false;
  }

  return true;
}
// Lower a call to a callseq_start + CALL + callseq_end chain, and add input
// and output parameter nodes.
//
// The outgoing arguments in CLI are assigned locations, copied into registers
// or stored to the stack, and the call node is emitted. The values returned by
// the call are appended to InVals and the resulting chain is returned. When
// the call is lowered as a tail call, a RISCVISD::TAIL node is returned
// instead and no callseq markers or result copies are produced.
// CLI.IsTailCall is updated in place to reflect whether a tail call is
// actually performed.
SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
SDLoc &DL = CLI.DL;
SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
// Reference, so that clearing it below is visible through CLI.
bool &IsTailCall = CLI.IsTailCall;
CallingConv::ID CallConv = CLI.CallConv;
bool IsVarArg = CLI.IsVarArg;
EVT PtrVT = getPointerTy(DAG.getDataLayout());
MVT XLenVT = Subtarget.getXLenVT();
MachineFunction &MF = DAG.getMachineFunction();
// Analyze the operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
analyzeOutputArgs(MF, ArgCCInfo, Outs, /*IsRet=*/false, &CLI);
// Check if it's really possible to do a tail call.
if (IsTailCall)
IsTailCall = isEligibleForTailCallOptimization(ArgCCInfo, CLI, MF, ArgLocs);
if (IsTailCall)
++NumTailCalls;
else if (CLI.CS && CLI.CS.isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = ArgCCInfo.getNextStackOffset();
// Create local copies for byval args
SmallVector<SDValue, 8> ByValArgs;
for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
ISD::ArgFlagsTy Flags = Outs[i].Flags;
if (!Flags.isByVal())
continue;
SDValue Arg = OutVals[i];
unsigned Size = Flags.getByValSize();
unsigned Align = Flags.getByValAlign();
// Copy the byval object into a fresh stack object; the address of the copy
// is what gets passed to the callee.
int FI = MF.getFrameInfo().CreateStackObject(Size, Align, /*isSS=*/false);
SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
SDValue SizeNode = DAG.getConstant(Size, DL, XLenVT);
Chain = DAG.getMemcpy(Chain, DL, FIPtr, Arg, SizeNode, Align,
/*IsVolatile=*/false,
/*AlwaysInline=*/false,
IsTailCall, MachinePointerInfo(),
MachinePointerInfo());
ByValArgs.push_back(FIPtr);
}
if (!IsTailCall)
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, CLI.DL);
// Copy argument values to their designated locations.
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
SDValue StackPtr;
// i indexes ArgLocs/Outs/OutVals; j indexes ByValArgs and advances once per
// byval argument.
for (unsigned i = 0, j = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDValue ArgValue = OutVals[i];
ISD::ArgFlagsTy Flags = Outs[i].Flags;
// Handle passing f64 on RV32D with a soft float ABI as a special case.
bool IsF64OnRV32DSoftABI =
VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64;
if (IsF64OnRV32DSoftABI && VA.isRegLoc()) {
SDValue SplitF64 = DAG.getNode(
RISCVISD::SplitF64, DL, DAG.getVTList(MVT::i32, MVT::i32), ArgValue);
SDValue Lo = SplitF64.getValue(0);
SDValue Hi = SplitF64.getValue(1);
unsigned RegLo = VA.getLocReg();
RegsToPass.push_back(std::make_pair(RegLo, Lo));
if (RegLo == RISCV::X17) {
// Second half of f64 is passed on the stack.
// Work out the address of the stack slot.
if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, DL, RISCV::X2, PtrVT);
// Emit the store.
// The high half is stored at the stack pointer itself (no offset added).
MemOpChains.push_back(
DAG.getStore(Chain, DL, Hi, StackPtr, MachinePointerInfo()));
} else {
// Second half of f64 is passed in another GPR.
unsigned RegHigh = RegLo + 1;
RegsToPass.push_back(std::make_pair(RegHigh, Hi));
}
continue;
}
// IsF64OnRV32DSoftABI && VA.isMemLoc() is handled below in the same way
// as any other MemLoc.
// Promote the value if needed.
// For now, only handle fully promoted and indirect arguments.
if (VA.getLocInfo() == CCValAssign::Indirect) {
// Store the argument in a stack slot and pass its address.
SDValue SpillSlot = DAG.CreateStackTemporary(Outs[i].ArgVT);
int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
MemOpChains.push_back(
DAG.getStore(Chain, DL, ArgValue, SpillSlot,
MachinePointerInfo::getFixedStack(MF, FI)));
// If the original argument was split (e.g. i128), we need
// to store all parts of it here (and pass just one address).
unsigned ArgIndex = Outs[i].OrigArgIndex;
assert(Outs[i].PartOffset == 0);
// Each remaining part of the same original argument is stored at its
// PartOffset within the spill slot; i is advanced so the outer loop skips
// the parts handled here.
while (i + 1 != e && Outs[i + 1].OrigArgIndex == ArgIndex) {
SDValue PartValue = OutVals[i + 1];
unsigned PartOffset = Outs[i + 1].PartOffset;
SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, SpillSlot,
DAG.getIntPtrConstant(PartOffset, DL));
MemOpChains.push_back(
DAG.getStore(Chain, DL, PartValue, Address,
MachinePointerInfo::getFixedStack(MF, FI)));
++i;
}
ArgValue = SpillSlot;
} else {
ArgValue = convertValVTToLocVT(DAG, ArgValue, VA, DL);
}
// Use local copy if it is a byval arg.
if (Flags.isByVal())
ArgValue = ByValArgs[j++];
if (VA.isRegLoc()) {
// Queue up the argument copies and emit them at the end.
RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue));
} else {
assert(VA.isMemLoc() && "Argument not register or memory");
assert(!IsTailCall && "Tail call not allowed if stack is used "
"for passing parameters");
// Work out the address of the stack slot.
if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, DL, RISCV::X2, PtrVT);
SDValue Address =
DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr,
DAG.getIntPtrConstant(VA.getLocMemOffset(), DL));
// Emit the store.
MemOpChains.push_back(
DAG.getStore(Chain, DL, ArgValue, Address, MachinePointerInfo()));
}
}
// Join the stores, which are independent of one another.
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
SDValue Glue;
// Build a sequence of copy-to-reg nodes, chained and glued together.
for (auto &Reg : RegsToPass) {
Chain = DAG.getCopyToReg(Chain, DL, Reg.first, Reg.second, Glue);
Glue = Chain.getValue(1);
}
// If the callee is a GlobalAddress/ExternalSymbol node, turn it into a
// TargetGlobalAddress/TargetExternalSymbol node so that legalize won't
// split it and then direct call can be matched by PseudoCALL.
// In both cases MO_PLT replaces MO_CALL when shouldAssumeDSOLocal() returns
// false for the callee.
if (GlobalAddressSDNode *S = dyn_cast<GlobalAddressSDNode>(Callee)) {
const GlobalValue *GV = S->getGlobal();
unsigned OpFlags = RISCVII::MO_CALL;
if (!getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV))
OpFlags = RISCVII::MO_PLT;
Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
unsigned OpFlags = RISCVII::MO_CALL;
if (!getTargetMachine().shouldAssumeDSOLocal(*MF.getFunction().getParent(),
nullptr))
OpFlags = RISCVII::MO_PLT;
Callee = DAG.getTargetExternalSymbol(S->getSymbol(), PtrVT, OpFlags);
}
// The first call operand is the chain and the second is the target address.
SmallVector<SDValue, 8> Ops;
Ops.push_back(Chain);
Ops.push_back(Callee);
// Add argument registers to the end of the list so that they are
// known live into the call.
for (auto &Reg : RegsToPass)
Ops.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
if (!IsTailCall) {
// Add a register mask operand representing the call-preserved registers.
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
}
// Glue the call to the argument copies, if any.
if (Glue.getNode())
Ops.push_back(Glue);
// Emit the call.
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
if (IsTailCall) {
// A tail call returns here: no CALLSEQ_END and no result copies follow.
MF.getFrameInfo().setHasTailCall();
return DAG.getNode(RISCVISD::TAIL, DL, NodeTys, Ops);
}
Chain = DAG.getNode(RISCVISD::CALL, DL, NodeTys, Ops);
Glue = Chain.getValue(1);
// Mark the end of the call, which is glued to the call itself.
Chain = DAG.getCALLSEQ_END(Chain,
DAG.getConstant(NumBytes, DL, PtrVT, true),
DAG.getConstant(0, DL, PtrVT, true),
Glue, DL);
Glue = Chain.getValue(1);
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
CCState RetCCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext());
analyzeInputArgs(MF, RetCCInfo, Ins, /*IsRet=*/true);
// Copy all of the result registers out of their specified physreg.
for (auto &VA : RVLocs) {
// Copy the value out
SDValue RetValue =
DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), Glue);
// Glue the RetValue to the end of the call sequence
Chain = RetValue.getValue(1);
Glue = RetValue.getValue(2);
// An f64 with an i32 location type comes back in the first two argument
// GPRs and is reassembled from its two halves.
if (VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64) {
assert(VA.getLocReg() == ArgGPRs[0] && "Unexpected reg assignment");
SDValue RetValue2 =
DAG.getCopyFromReg(Chain, DL, ArgGPRs[1], MVT::i32, Glue);
Chain = RetValue2.getValue(1);
Glue = RetValue2.getValue(2);
RetValue = DAG.getNode(RISCVISD::BuildPairF64, DL, MVT::f64, RetValue,
RetValue2);
}
RetValue = convertLocVTToValVT(DAG, RetValue, VA, DL);
InVals.push_back(RetValue);
}
return Chain;
}
// Report whether every return value in Outs can be assigned a location by
// CC_RISCV. Return values are analysed as fixed (non-variadic) values with no
// original IR type; the first value CC_RISCV rejects makes the answer false.
bool RISCVTargetLowering::CanLowerReturn(
    CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> RetLocs;
  CCState RetState(CallConv, IsVarArg, MF, RetLocs, Context);

  unsigned ValNo = 0;
  for (const ISD::OutputArg &Out : Outs) {
    MVT RetVT = Out.VT;
    RISCVABI::ABI ABI = MF.getSubtarget<RISCVSubtarget>().getTargetABI();
    const bool Rejected =
        CC_RISCV(MF.getDataLayout(), ABI, ValNo, RetVT, RetVT,
                 CCValAssign::Full, Out.Flags, RetState, /*IsFixed=*/true,
                 /*IsRet=*/true, nullptr);
    if (Rejected)
      return false;
    ++ValNo;
  }
  return true;
}
// Lower a return: assign each value in Outs to a return register, copy the
// corresponding OutVals into those registers (glued together), and emit the
// return node. Functions carrying the "interrupt" attribute must return void
// and use the return node that matches their interrupt kind.
SDValue
RISCVTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                 bool IsVarArg,
                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
                                 const SmallVectorImpl<SDValue> &OutVals,
                                 const SDLoc &DL, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();

  // Locations assigned to the return values, plus the register/stack state
  // used while assigning them.
  SmallVector<CCValAssign, 16> RetLocs;
  CCState RetState(CallConv, IsVarArg, MF, RetLocs, *DAG.getContext());
  analyzeOutputArgs(MF, RetState, Outs, /*IsRet=*/true, nullptr);

  SDValue Glue;
  // Operand 0 is a placeholder for the chain; it is overwritten once all the
  // copies have been emitted.
  SmallVector<SDValue, 4> RetOps(1, Chain);

  // Copy V into Reg, keep the copies glued together, and list the register as
  // an operand of the return node.
  auto CopyToRetReg = [&](unsigned Reg, SDValue V, MVT RegVT) {
    Chain = DAG.getCopyToReg(Chain, DL, Reg, V, Glue);
    Glue = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(Reg, RegVT));
  };

  for (unsigned Idx = 0, End = RetLocs.size(); Idx < End; ++Idx) {
    SDValue RetVal = OutVals[Idx];
    CCValAssign &Loc = RetLocs[Idx];
    assert(Loc.isRegLoc() && "Can only return in registers!");

    const bool IsF64InGPRPair =
        Loc.getLocVT() == MVT::i32 && Loc.getValVT() == MVT::f64;
    if (!IsF64InGPRPair) {
      // The ordinary case: convert to the location type and copy.
      RetVal = convertValVTToLocVT(DAG, RetVal, Loc, DL);
      CopyToRetReg(Loc.getLocReg(), RetVal, Loc.getLocVT());
      continue;
    }

    // Returning f64 on RV32D with a soft float ABI: split the value and
    // return the halves in two consecutive registers.
    assert(Loc.isRegLoc() && "Expected return via registers");
    SDValue Halves = DAG.getNode(RISCVISD::SplitF64, DL,
                                 DAG.getVTList(MVT::i32, MVT::i32), RetVal);
    SDValue LoHalf = Halves.getValue(0);
    SDValue HiHalf = Halves.getValue(1);
    const unsigned LoReg = Loc.getLocReg();
    const unsigned HiReg = LoReg + 1;
    CopyToRetReg(LoReg, LoHalf, MVT::i32);
    CopyToRetReg(HiReg, HiHalf, MVT::i32);
  }

  RetOps[0] = Chain; // Now that the copies exist, install the final chain.

  // Append the glue only if at least one copy produced it.
  if (Glue.getNode())
    RetOps.push_back(Glue);

  // Interrupt service routines use different return instructions.
  unsigned RetOpc = RISCVISD::RET_FLAG;
  const Function &Func = MF.getFunction();
  if (Func.hasFnAttribute("interrupt")) {
    if (!Func.getReturnType()->isVoidTy())
      report_fatal_error(
          "Functions with the interrupt attribute must have void return type!");

    StringRef Kind =
        MF.getFunction().getFnAttribute("interrupt").getValueAsString();
    // Any kind other than "user" or "supervisor" gets the machine-mode return.
    if (Kind == "user")
      RetOpc = RISCVISD::URET_FLAG;
    else if (Kind == "supervisor")
      RetOpc = RISCVISD::SRET_FLAG;
    else
      RetOpc = RISCVISD::MRET_FLAG;
  }

  return DAG.getNode(RetOpc, DL, MVT::Other, RetOps);
}
const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((RISCVISD::NodeType)Opcode) {
case RISCVISD::FIRST_NUMBER:
break;
case RISCVISD::RET_FLAG:
return "RISCVISD::RET_FLAG";
case RISCVISD::URET_FLAG:
return "RISCVISD::URET_FLAG";
case RISCVISD::SRET_FLAG:
return "RISCVISD::SRET_FLAG";
case RISCVISD::MRET_FLAG:
return "RISCVISD::MRET_FLAG";
case RISCVISD::CALL:
return "RISCVISD::CALL";
case RISCVISD::SELECT_CC:
return "RISCVISD::SELECT_CC";
case RISCVISD::BuildPairF64:
return "RISCVISD::BuildPairF64";
case RISCVISD::SplitF64:
return "RISCVISD::SplitF64";
case RISCVISD::TAIL:
return "RISCVISD::TAIL";
case RISCVISD::SLLW:
return "RISCVISD::SLLW";
case RISCVISD::SRAW:
return "RISCVISD::SRAW";
case RISCVISD::SRLW:
return "RISCVISD::SRLW";
case RISCVISD::DIVW:
return "RISCVISD::DIVW";
case RISCVISD::DIVUW:
return "RISCVISD::DIVUW";
case RISCVISD::REMUW:
return "RISCVISD::REMUW";
case RISCVISD::FMV_W_X_RV64:
return "RISCVISD::FMV_W_X_RV64";
case RISCVISD::FMV_X_ANYEXTW_RV64:
return "RISCVISD::FMV_X_ANYEXTW_RV64";
case RISCVISD::READ_CYCLE_WIDE:
return "RISCVISD::READ_CYCLE_WIDE";
}
return nullptr;
}
+/// getConstraintType - Classify a single-letter inline-asm constraint for this
+/// target; any constraint not recognised here is deferred to TargetLowering.
+RISCVTargetLowering::ConstraintType
+RISCVTargetLowering::getConstraintType(StringRef Constraint) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ default:
+ break; // Not handled here: use the generic classification below.
+ case 'f':
+ return C_RegisterClass; // Resolved in getRegForInlineAsmConstraint.
+ case 'I':
+ case 'J':
+ case 'K':
+ return C_Immediate; // Validated in LowerAsmOperandForConstraint.
+ }
+ }
+ return TargetLowering::getConstraintType(Constraint);
+}
+
// Map an inline-asm register constraint to a RISC-V register class. Only the
// single-letter constraints 'r' and 'f' are handled here; everything else,
// including an 'f' constraint whose type has no enabled FP register class, is
// passed on to the TargetLowering implementation.
std::pair<unsigned, const TargetRegisterClass *>
RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
MVT VT) const {
// First, see if this is a constraint that directly corresponds to a
// RISCV register class.
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'r':
return std::make_pair(0U, &RISCV::GPRRegClass);
+ case 'f': // Needs both the matching FP extension and the matching type.
+ if (Subtarget.hasStdExtF() && VT == MVT::f32)
+ return std::make_pair(0U, &RISCV::FPR32RegClass);
+ if (Subtarget.hasStdExtD() && VT == MVT::f64)
+ return std::make_pair(0U, &RISCV::FPR64RegClass);
+ break; // No usable FP register class: fall back to the generic lookup.
default:
break;
}
}
return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
// Lower an inline-asm operand for the immediate constraints 'I', 'J' and 'K'.
//
// When Op is a constant that satisfies the constraint, an XLenVT target
// constant is appended to Ops. When it does not, nothing is appended and the
// function returns without consulting the generic handler. Any other
// constraint is forwarded to TargetLowering.
void RISCVTargetLowering::LowerAsmOperandForConstraint(
    SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
    SelectionDAG &DAG) const {
  // Currently only support length 1 constraints.
  if (Constraint.length() == 1) {
    switch (Constraint[0]) {
    case 'I':
      // Validate & create a 12-bit signed immediate operand.
      if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
        // Keep the sign-extended value in a signed type so that the signed
        // range check below operates on the value as written, instead of
        // relying on an unsigned-to-signed round trip.
        int64_t CVal = C->getSExtValue();
        if (isInt<12>(CVal))
          Ops.push_back(
              DAG.getTargetConstant(CVal, SDLoc(Op), Subtarget.getXLenVT()));
      }
      return;
    case 'J':
      // Validate & create an integer zero operand.
      if (auto *C = dyn_cast<ConstantSDNode>(Op))
        if (C->getZExtValue() == 0)
          Ops.push_back(
              DAG.getTargetConstant(0, SDLoc(Op), Subtarget.getXLenVT()));
      return;
    case 'K':
      // Validate & create a 5-bit unsigned immediate operand.
      if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
        uint64_t CVal = C->getZExtValue();
        if (isUInt<5>(CVal))
          Ops.push_back(
              DAG.getTargetConstant(CVal, SDLoc(Op), Subtarget.getXLenVT()));
      }
      return;
    default:
      break;
    }
  }
  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
/// Emit the fence, if any, that must precede the atomic access \p Inst.
///
/// A sequentially-consistent load gets a fence with ordering \p Ord; a store
/// whose ordering is release or stronger gets a release fence. Returns the
/// created fence, or nullptr when no leading fence is emitted.
Instruction *RISCVTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
                                                   Instruction *Inst,
                                                   AtomicOrdering Ord) const {
  const bool IsSeqCstLoad =
      isa<LoadInst>(Inst) && Ord == AtomicOrdering::SequentiallyConsistent;
  if (IsSeqCstLoad)
    return Builder.CreateFence(Ord);

  const bool IsReleaseStore = isa<StoreInst>(Inst) && isReleaseOrStronger(Ord);
  if (!IsReleaseStore)
    return nullptr;
  return Builder.CreateFence(AtomicOrdering::Release);
}
/// Emit the fence, if any, that must follow the atomic access \p Inst.
///
/// Only loads whose ordering is acquire or stronger get one (an acquire
/// fence). Returns the created fence, or nullptr otherwise.
Instruction *RISCVTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
                                                    Instruction *Inst,
                                                    AtomicOrdering Ord) const {
  if (!isa<LoadInst>(Inst) || !isAcquireOrStronger(Ord))
    return nullptr;
  return Builder.CreateFence(AtomicOrdering::Acquire);
}
/// Choose how an atomicrmw instruction is expanded at the IR level.
///
/// Floating-point operations use a compare-exchange loop, 8- and 16-bit
/// operations use the masked intrinsics, and everything else is left alone.
TargetLowering::AtomicExpansionKind
RISCVTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  // atomicrmw {fadd,fsub} must be expanded to use compare-exchange, as floating
  // point operations can't be used in an lr/sc sequence without breaking the
  // forward-progress guarantee.
  if (AI->isFloatingPointOperation())
    return AtomicExpansionKind::CmpXChg;

  const unsigned Width = AI->getType()->getPrimitiveSizeInBits();
  switch (Width) {
  case 8:
  case 16:
    return AtomicExpansionKind::MaskedIntrinsic;
  default:
    return AtomicExpansionKind::None;
  }
}
/// Map an atomicrmw binary operation to the matching masked-atomicrmw
/// intrinsic for the given register width.
///
/// \p XLen must be 32 or 64, and \p BinOp must be one of Xchg, Add, Sub, Nand,
/// Max, Min, UMax or UMin; anything else is reported via llvm_unreachable.
static Intrinsic::ID
getIntrinsicForMaskedAtomicRMWBinOp(unsigned XLen, AtomicRMWInst::BinOp BinOp) {
  if (XLen != 32 && XLen != 64)
    llvm_unreachable("Unexpected XLen\n");

  // Each operation has an _i32 and an _i64 flavour; pick by register width.
  const bool IsRV64 = XLen == 64;
  switch (BinOp) {
  case AtomicRMWInst::Xchg:
    return IsRV64 ? Intrinsic::riscv_masked_atomicrmw_xchg_i64
                  : Intrinsic::riscv_masked_atomicrmw_xchg_i32;
  case AtomicRMWInst::Add:
    return IsRV64 ? Intrinsic::riscv_masked_atomicrmw_add_i64
                  : Intrinsic::riscv_masked_atomicrmw_add_i32;
  case AtomicRMWInst::Sub:
    return IsRV64 ? Intrinsic::riscv_masked_atomicrmw_sub_i64
                  : Intrinsic::riscv_masked_atomicrmw_sub_i32;
  case AtomicRMWInst::Nand:
    return IsRV64 ? Intrinsic::riscv_masked_atomicrmw_nand_i64
                  : Intrinsic::riscv_masked_atomicrmw_nand_i32;
  case AtomicRMWInst::Max:
    return IsRV64 ? Intrinsic::riscv_masked_atomicrmw_max_i64
                  : Intrinsic::riscv_masked_atomicrmw_max_i32;
  case AtomicRMWInst::Min:
    return IsRV64 ? Intrinsic::riscv_masked_atomicrmw_min_i64
                  : Intrinsic::riscv_masked_atomicrmw_min_i32;
  case AtomicRMWInst::UMax:
    return IsRV64 ? Intrinsic::riscv_masked_atomicrmw_umax_i64
                  : Intrinsic::riscv_masked_atomicrmw_umax_i32;
  case AtomicRMWInst::UMin:
    return IsRV64 ? Intrinsic::riscv_masked_atomicrmw_umin_i64
                  : Intrinsic::riscv_masked_atomicrmw_umin_i32;
  default:
    llvm_unreachable("Unexpected AtomicRMW BinOp");
  }
}
// Emit a call to the masked atomicrmw intrinsic chosen by
// getIntrinsicForMaskedAtomicRMWBinOp for the subtarget's XLen and AI's
// operation, and return the call's result.
//
// When XLen is 64, Incr, Mask and ShiftAmt are sign-extended to i64 before the
// call and the result is truncated back to i32 afterwards. Signed Min/Max
// take one extra operand (see the comment inside the function).
//
// NOTE(review): the Ord parameter is not read here; the ordering operand is
// built from AI->getOrdering() instead. Presumably the two always agree --
// confirm against the caller.
Value *RISCVTargetLowering::emitMaskedAtomicRMWIntrinsic(
IRBuilder<> &Builder, AtomicRMWInst *AI, Value *AlignedAddr, Value *Incr,
Value *Mask, Value *ShiftAmt, AtomicOrdering Ord) const {
unsigned XLen = Subtarget.getXLen();
// The memory ordering is passed to the intrinsic as an XLen-wide constant.
Value *Ordering =
Builder.getIntN(XLen, static_cast<uint64_t>(AI->getOrdering()));
// The intrinsic is overloaded on the type of the aligned address.
Type *Tys[] = {AlignedAddr->getType()};
Function *LrwOpScwLoop = Intrinsic::getDeclaration(
AI->getModule(),
getIntrinsicForMaskedAtomicRMWBinOp(XLen, AI->getOperation()), Tys);
// The _i64 intrinsics take 64-bit operands, so widen the incoming values.
if (XLen == 64) {
Incr = Builder.CreateSExt(Incr, Builder.getInt64Ty());
Mask = Builder.CreateSExt(Mask, Builder.getInt64Ty());
ShiftAmt = Builder.CreateSExt(ShiftAmt, Builder.getInt64Ty());
}
Value *Result;
// Must pass the shift amount needed to sign extend the loaded value prior
// to performing a signed comparison for min/max. ShiftAmt is the number of
// bits to shift the value into position. Pass XLen-ShiftAmt-ValWidth, which
// is the number of bits to left+right shift the value in order to
// sign-extend.
if (AI->getOperation() == AtomicRMWInst::Min ||
AI->getOperation() == AtomicRMWInst::Max) {
const DataLayout &DL = AI->getModule()->getDataLayout();
unsigned ValWidth =
DL.getTypeStoreSizeInBits(AI->getValOperand()->getType());
Value *SextShamt =
Builder.CreateSub(Builder.getIntN(XLen, XLen - ValWidth), ShiftAmt);
Result = Builder.CreateCall(LrwOpScwLoop,
{AlignedAddr, Incr, Mask, SextShamt, Ordering});
} else {
Result =
Builder.CreateCall(LrwOpScwLoop, {AlignedAddr, Incr, Mask, Ordering});
}
// Narrow the 64-bit intrinsic result back to i32 for the caller.
if (XLen == 64)
Result = Builder.CreateTrunc(Result, Builder.getInt32Ty());
return Result;
}
/// Choose how a cmpxchg instruction is expanded at the IR level: 8- and 16-bit
/// compare operands use the masked intrinsic, all other widths are left alone.
TargetLowering::AtomicExpansionKind
RISCVTargetLowering::shouldExpandAtomicCmpXchgInIR(
    AtomicCmpXchgInst *CI) const {
  const unsigned CmpWidth =
      CI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
  const bool IsSubword = CmpWidth == 8 || CmpWidth == 16;
  return IsSubword ? AtomicExpansionKind::MaskedIntrinsic
                   : AtomicExpansionKind::None;
}
// Emit a call to the masked cmpxchg intrinsic (riscv_masked_cmpxchg_i32, or
// the _i64 variant when XLen is 64) and return the call's result.
//
// When XLen is 64, CmpVal, NewVal and Mask are sign-extended to i64 before the
// call and the result is truncated back to i32 afterwards. The memory ordering
// Ord is passed to the intrinsic as an XLen-wide constant.
Value *RISCVTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
IRBuilder<> &Builder, AtomicCmpXchgInst *CI, Value *AlignedAddr,
Value *CmpVal, Value *NewVal, Value *Mask, AtomicOrdering Ord) const {
unsigned XLen = Subtarget.getXLen();
Value *Ordering = Builder.getIntN(XLen, static_cast<uint64_t>(Ord));
// Start with the 32-bit intrinsic; switched to the 64-bit one below if needed.
Intrinsic::ID CmpXchgIntrID = Intrinsic::riscv_masked_cmpxchg_i32;
if (XLen == 64) {
CmpVal = Builder.CreateSExt(CmpVal, Builder.getInt64Ty());
NewVal = Builder.CreateSExt(NewVal, Builder.getInt64Ty());
Mask = Builder.CreateSExt(Mask, Builder.getInt64Ty());
CmpXchgIntrID = Intrinsic::riscv_masked_cmpxchg_i64;
}
// The intrinsic is overloaded on the type of the aligned address.
Type *Tys[] = {AlignedAddr->getType()};
Function *MaskedCmpXchg =
Intrinsic::getDeclaration(CI->getModule(), CmpXchgIntrID, Tys);
Value *Result = Builder.CreateCall(
MaskedCmpXchg, {AlignedAddr, CmpVal, NewVal, Mask, Ordering});
// Narrow the 64-bit intrinsic result back to i32 for the caller.
if (XLen == 64)
Result = Builder.CreateTrunc(Result, Builder.getInt32Ty());
return Result;
}
/// Return the physical register that receives the exception address on entry
/// to an EH pad. The personality function does not affect the answer.
unsigned RISCVTargetLowering::getExceptionPointerRegister(
    const Constant *PersonalityFn) const {
  const unsigned ExceptionPointerReg = RISCV::X10;
  return ExceptionPointerReg;
}
/// Return the physical register that receives the exception typeid on entry to
/// a landing pad. The personality function does not affect the answer.
unsigned RISCVTargetLowering::getExceptionSelectorRegister(
    const Constant *PersonalityFn) const {
  const unsigned ExceptionSelectorReg = RISCV::X11;
  return ExceptionSelectorReg;
}
Index: vendor/llvm/dist-release_90/lib/Target/RISCV/RISCVISelLowering.h
===================================================================
--- vendor/llvm/dist-release_90/lib/Target/RISCV/RISCVISelLowering.h (revision 351302)
+++ vendor/llvm/dist-release_90/lib/Target/RISCV/RISCVISelLowering.h (revision 351303)
@@ -1,210 +1,211 @@
//===-- RISCVISelLowering.h - RISCV DAG Lowering Interface ------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that RISCV uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_RISCV_RISCVISELLOWERING_H
#define LLVM_LIB_TARGET_RISCV_RISCVISELLOWERING_H
#include "RISCV.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
namespace llvm {
class RISCVSubtarget;
// Target-specific SelectionDAG node opcodes for RISC-V. Numbering starts at
// ISD::BUILTIN_OP_END (presumably so the values sit above the
// target-independent ISD opcodes -- confirm against ISDOpcodes.h).
namespace RISCVISD {
enum NodeType : unsigned {
FIRST_NUMBER = ISD::BUILTIN_OP_END,
RET_FLAG,
URET_FLAG,
SRET_FLAG,
MRET_FLAG,
CALL,
SELECT_CC,
BuildPairF64,
SplitF64,
TAIL,
// RV64I shifts, directly matching the semantics of the named RISC-V
// instructions.
SLLW,
SRAW,
SRLW,
// 32-bit operations from RV64M that can't be simply matched with a pattern
// at instruction selection time.
DIVW,
DIVUW,
REMUW,
// FPR32<->GPR transfer operations for RV64. Needed as an i32<->f32 bitcast
// is not legal on RV64. FMV_W_X_RV64 matches the semantics of the FMV.W.X.
// FMV_X_ANYEXTW_RV64 is similar to FMV.X.W but has an any-extended result.
// This is a more convenient semantic for producing dagcombines that remove
// unnecessary GPR->FPR->GPR moves.
FMV_W_X_RV64,
FMV_X_ANYEXTW_RV64,
// READ_CYCLE_WIDE - A read of the 64-bit cycle CSR on a 32-bit target
// (returns (Lo, Hi)). It takes a chain operand.
READ_CYCLE_WIDE
};
} // namespace RISCVISD
class RISCVTargetLowering : public TargetLowering {
const RISCVSubtarget &Subtarget;
public:
explicit RISCVTargetLowering(const TargetMachine &TM,
const RISCVSubtarget &STI);
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
MachineFunction &MF,
unsigned Intrinsic) const override;
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
unsigned AS,
Instruction *I = nullptr) const override;
bool isLegalICmpImmediate(int64_t Imm) const override;
bool isLegalAddImmediate(int64_t Imm) const override;
bool isTruncateFree(Type *SrcTy, Type *DstTy) const override;
bool isTruncateFree(EVT SrcVT, EVT DstVT) const override;
bool isZExtFree(SDValue Val, EVT VT2) const override;
bool isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const override;
bool hasBitPreservingFPLogic(EVT VT) const override;
// Provide custom lowering hooks for some operations.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const override;
// This method returns the name of a target specific DAG node.
const char *getTargetNodeName(unsigned Opcode) const override;
+ ConstraintType getConstraintType(StringRef Constraint) const override;
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint, MVT VT) const override;
void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
std::vector<SDValue> &Ops,
SelectionDAG &DAG) const override;
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const override;
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
return VT.isScalarInteger();
}
bool shouldInsertFencesForAtomic(const Instruction *I) const override {
return isa<LoadInst>(I) || isa<StoreInst>(I);
}
Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst,
AtomicOrdering Ord) const override;
Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst,
AtomicOrdering Ord) const override;
ISD::NodeType getExtendForAtomicOps() const override {
return ISD::SIGN_EXTEND;
}
bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override {
if (DAG.getMachineFunction().getFunction().hasMinSize())
return false;
return true;
}
bool isDesirableToCommuteWithShift(const SDNode *N,
CombineLevel Level) const override;
/// If a physical register, this returns the register that receives the
/// exception address on entry to an EH pad.
unsigned
getExceptionPointerRegister(const Constant *PersonalityFn) const override;
/// If a physical register, this returns the register that receives the
/// exception typeid on entry to a landing pad.
unsigned
getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
private:
void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo,
const SmallVectorImpl<ISD::InputArg> &Ins,
bool IsRet) const;
void analyzeOutputArgs(MachineFunction &MF, CCState &CCInfo,
const SmallVectorImpl<ISD::OutputArg> &Outs,
bool IsRet, CallLoweringInfo *CLI) const;
// Lower incoming arguments, copy physregs into vregs
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const override;
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const override;
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const override;
SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override {
return true;
}
template <class NodeTy>
SDValue getAddr(NodeTy *N, SelectionDAG &DAG, bool IsLocal = true) const;
SDValue getStaticTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG,
bool UseGOT) const;
SDValue getDynamicTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG) const;
bool shouldConsiderGEPOffsetSplit() const override { return true; }
SDValue lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerShiftRightParts(SDValue Op, SelectionDAG &DAG, bool IsSRA) const;
bool isEligibleForTailCallOptimization(
CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF,
const SmallVector<CCValAssign, 16> &ArgLocs) const;
TargetLowering::AtomicExpansionKind
shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
virtual Value *emitMaskedAtomicRMWIntrinsic(
IRBuilder<> &Builder, AtomicRMWInst *AI, Value *AlignedAddr, Value *Incr,
Value *Mask, Value *ShiftAmt, AtomicOrdering Ord) const override;
TargetLowering::AtomicExpansionKind
shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CI) const override;
virtual Value *
emitMaskedAtomicCmpXchgIntrinsic(IRBuilder<> &Builder, AtomicCmpXchgInst *CI,
Value *AlignedAddr, Value *CmpVal,
Value *NewVal, Value *Mask,
AtomicOrdering Ord) const override;
};
}
#endif
Index: vendor/llvm/dist-release_90/lib/Target/Sparc/SparcISelLowering.cpp
===================================================================
--- vendor/llvm/dist-release_90/lib/Target/Sparc/SparcISelLowering.cpp (revision 351302)
+++ vendor/llvm/dist-release_90/lib/Target/Sparc/SparcISelLowering.cpp (revision 351303)
@@ -1,3417 +1,3417 @@
//===-- SparcISelLowering.cpp - Sparc DAG Lowering Implementation ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the interfaces that Sparc uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#include "SparcISelLowering.h"
#include "MCTargetDesc/SparcMCExpr.h"
#include "SparcMachineFunctionInfo.h"
#include "SparcRegisterInfo.h"
#include "SparcTargetMachine.h"
#include "SparcTargetObjectFile.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;
//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//
static bool CC_Sparc_Assign_SRet(unsigned &ValNo, MVT &ValVT,
MVT &LocVT, CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags, CCState &State)
{
assert (ArgFlags.isSRet());
// Assign SRet argument.
State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
0,
LocVT, LocInfo));
return true;
}
/// Custom calling-convention handler for a 64-bit value split into two 32-bit
/// halves. Each half goes in the next free register of %i0-%i5 if one is
/// available. If no register is free for the first half, the whole value gets
/// one 8-byte stack slot; if only the second half lacks a register, it gets a
/// 4-byte stack slot. Always reports the value as handled.
static bool CC_Sparc_Assign_Split_64(unsigned &ValNo, MVT &ValVT,
                                     MVT &LocVT, CCValAssign::LocInfo &LocInfo,
                                     ISD::ArgFlagsTy &ArgFlags, CCState &State)
{
  static const MCPhysReg RegList[] = {
    SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5
  };

  // First half: without a register, assign the whole thing on the stack.
  const unsigned FirstReg = State.AllocateReg(RegList);
  if (!FirstReg) {
    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
                                           State.AllocateStack(8,4),
                                           LocVT, LocInfo));
    return true;
  }
  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, FirstReg, LocVT, LocInfo));

  // Second half: register if available, otherwise a 4-byte stack slot.
  const unsigned SecondReg = State.AllocateReg(RegList);
  if (SecondReg) {
    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, SecondReg, LocVT, LocInfo));
  } else {
    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
                                           State.AllocateStack(4,4),
                                           LocVT, LocInfo));
  }
  return true;
}
/// Custom return-value handler for a 64-bit value split into two 32-bit
/// halves. Both halves must land in registers from %i0-%i5; returns false as
/// soon as a half cannot get one (a location already recorded for the first
/// half stays recorded).
static bool CC_Sparc_Assign_Ret_Split_64(unsigned &ValNo, MVT &ValVT,
                                         MVT &LocVT, CCValAssign::LocInfo &LocInfo,
                                         ISD::ArgFlagsTy &ArgFlags, CCState &State)
{
  static const MCPhysReg RegList[] = {
    SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5
  };

  // Allocate one register per half, first half then second half.
  for (unsigned Half = 0; Half != 2; ++Half) {
    const unsigned HalfReg = State.AllocateReg(RegList);
    if (!HalfReg)
      return false;
    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, HalfReg, LocVT, LocInfo));
  }
  return true;
}
// Allocate a full-sized argument for the 64-bit ABI.
//
// A stack slot is always reserved (16 bytes, 16-byte aligned for f128; 8 bytes,
// 8-byte aligned otherwise). If the slot's offset falls inside the
// register-backed area for the value's type, the argument is assigned to the
// register that corresponds to that offset instead of to memory.
static bool CC_Sparc64_Full(unsigned &ValNo, MVT &ValVT,
                            MVT &LocVT, CCValAssign::LocInfo &LocInfo,
                            ISD::ArgFlagsTy &ArgFlags, CCState &State) {
  assert((LocVT == MVT::f32 || LocVT == MVT::f128
          || LocVT.getSizeInBits() == 64) &&
         "Can't handle non-64 bits locations");

  // Stack space is allocated for all arguments starting from [%fp+BIAS+128].
  const bool IsQuad = LocVT == MVT::f128;
  const unsigned SlotSize = IsQuad ? 16 : 8;
  const unsigned SlotAlign = IsQuad ? 16 : 8;
  unsigned Offset = State.AllocateStack(SlotSize, SlotAlign);

  unsigned Reg = 0;
  if (LocVT == MVT::i64 && Offset < 6*8) {
    // Promote integers to %i0-%i5.
    Reg = SP::I0 + Offset/8;
  } else if (LocVT == MVT::f64 && Offset < 16*8) {
    // Promote doubles to %d0-%d30. (Which LLVM calls D0-D15).
    Reg = SP::D0 + Offset/8;
  } else if (LocVT == MVT::f32 && Offset < 16*8) {
    // Promote floats to %f1, %f3, ...
    Reg = SP::F1 + Offset/4;
  } else if (LocVT == MVT::f128 && Offset < 16*8) {
    // Promote long doubles to %q0-%q28. (Which LLVM calls Q0-Q7).
    Reg = SP::Q0 + Offset/16;
  }

  if (!Reg) {
    // This argument goes on the stack in an 8-byte slot.
    // When passing floats, LocVT is smaller than 8 bytes. Adjust the offset to
    // the right-aligned float. The first 4 bytes of the stack slot are
    // undefined.
    if (LocVT == MVT::f32)
      Offset += 4;
    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
    return true;
  }

  // Promote to register when possible.
  State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
  return true;
}
// Allocate a half-sized argument for the 64-bit ABI.
//
// This is used when passing { float, int } structs by value in registers.
// A 4-byte stack slot is always reserved; its offset decides whether the value
// is assigned to a register or stays in memory.
static bool CC_Sparc64_Half(unsigned &ValNo, MVT &ValVT,
                            MVT &LocVT, CCValAssign::LocInfo &LocInfo,
                            ISD::ArgFlagsTy &ArgFlags, CCState &State) {
  assert(LocVT.getSizeInBits() == 32 && "Can't handle non-32 bits locations");
  const unsigned SlotOffset = State.AllocateStack(4, 4);

  const bool FloatInReg = LocVT == MVT::f32 && SlotOffset < 16*8;
  if (FloatInReg) {
    // Promote floats to %f0-%f31.
    State.addLoc(CCValAssign::getReg(ValNo, ValVT, SP::F0 + SlotOffset/4,
                                     LocVT, LocInfo));
    return true;
  }

  const bool IntInReg = LocVT == MVT::i32 && SlotOffset < 6*8;
  if (!IntInReg) {
    // Not eligible for a register: keep the value in its stack slot.
    State.addLoc(CCValAssign::getMem(ValNo, ValVT, SlotOffset, LocVT, LocInfo));
    return true;
  }

  // Promote integers to %i0-%i5, using half the register.
  const unsigned IntReg = SP::I0 + SlotOffset/8;
  LocVT = MVT::i64;
  LocInfo = CCValAssign::AExt;

  // Set the Custom bit if this i32 goes in the high bits of a register.
  const bool InHighBits = SlotOffset % 8 == 0;
  if (InHighBits) {
    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, IntReg,
                                           LocVT, LocInfo));
  } else {
    State.addLoc(CCValAssign::getReg(ValNo, ValVT, IntReg, LocVT, LocInfo));
  }
  return true;
}
#include "SparcGenCallingConv.inc"
// The calling conventions in SparcCallingConv.td are described in terms of the
// callee's register window. This function translates registers to the
// corresponding caller window %o register: %i0-%i7 map to %o0-%o7, and every
// other register is returned unchanged.
static unsigned toCallerWindow(unsigned Reg) {
  // The translation below relies on both register ranges being contiguous.
  static_assert(SP::I0 + 7 == SP::I7 && SP::O0 + 7 == SP::O7,
                "Unexpected enum");
  const bool IsCalleeInReg = Reg >= SP::I0 && Reg <= SP::I7;
  if (!IsCalleeInReg)
    return Reg;
  return Reg - SP::I0 + SP::O0;
}
/// Lower a return by dispatching to the ABI-specific implementation:
/// LowerReturn_64 on 64-bit subtargets, LowerReturn_32 otherwise.
SDValue
SparcTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                 bool IsVarArg,
                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
                                 const SmallVectorImpl<SDValue> &OutVals,
                                 const SDLoc &DL, SelectionDAG &DAG) const {
  return Subtarget->is64Bit()
             ? LowerReturn_64(Chain, CallConv, IsVarArg, Outs, OutVals, DL, DAG)
             : LowerReturn_32(Chain, CallConv, IsVarArg, Outs, OutVals, DL, DAG);
}
// Lower return values for the 32-bit ABI.
// Each return value is copied into the register chosen by RetCC_Sparc32, with
// the copies glued together, and a SPISD::RET_FLAG node is built whose
// operands are: chain, return-address offset, the return registers and
// (if present) the glue value.
SDValue
SparcTargetLowering::LowerReturn_32(SDValue Chain, CallingConv::ID CallConv,
bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &DL, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
// CCValAssign - represent the assignment of the return value to locations.
SmallVector<CCValAssign, 16> RVLocs;
// CCState - Info about the registers and stack slot.
CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
// Analyze return values.
CCInfo.AnalyzeReturn(Outs, RetCC_Sparc32);
SDValue Flag;
SmallVector<SDValue, 4> RetOps(1, Chain);
// Make room for the return address offset.
RetOps.push_back(SDValue());
// Copy the result values into the output registers.
// i indexes RVLocs and realRVLocIdx indexes OutVals; they diverge when a
// custom (v2i32) value consumes two locations, because the extra ++i in the
// loop body is not mirrored on realRVLocIdx.
for (unsigned i = 0, realRVLocIdx = 0;
i != RVLocs.size();
++i, ++realRVLocIdx) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
SDValue Arg = OutVals[realRVLocIdx];
if (VA.needsCustom()) {
assert(VA.getLocVT() == MVT::v2i32);
// Legalize ret v2i32 -> ret 2 x i32 (Basically: do what would
// happen by default if this wasn't a legal type)
SDValue Part0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
Arg,
DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout())));
SDValue Part1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
Arg,
DAG.getConstant(1, DL, getVectorIdxTy(DAG.getDataLayout())));
Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Part0, Flag);
Flag = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
// NOTE(review): VA is a reference to RVLocs[i], so this assignment copies
// the next location over the current element instead of rebinding VA.
// Only VA is read afterwards, so the next location's register is still
// the one used; there is also no bounds check on ++i here.
VA = RVLocs[++i]; // skip ahead to next loc
Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Part1,
Flag);
} else
Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
// Guarantee that all emitted copies are stuck together with flags.
Flag = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
}
unsigned RetAddrOffset = 8; // Call Inst + Delay Slot
// If the function returns a struct, copy the SRetReturnReg to I0
if (MF.getFunction().hasStructRetAttr()) {
SparcMachineFunctionInfo *SFI = MF.getInfo<SparcMachineFunctionInfo>();
unsigned Reg = SFI->getSRetReturnReg();
if (!Reg)
llvm_unreachable("sret virtual register not created in the entry block");
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, PtrVT);
Chain = DAG.getCopyToReg(Chain, DL, SP::I0, Val, Flag);
Flag = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(SP::I0, PtrVT));
RetAddrOffset = 12; // CallInst + Delay Slot + Unimp
}
RetOps[0] = Chain; // Update chain.
// Fill in the slot reserved above now that the final offset is known.
RetOps[1] = DAG.getConstant(RetAddrOffset, DL, MVT::i32);
// Add the flag if we have it.
if (Flag.getNode())
RetOps.push_back(Flag);
return DAG.getNode(SPISD::RET_FLAG, DL, MVT::Other, RetOps);
}
// Lower return values for the 64-bit ABI.
// Return values are passed in exactly the same way as function arguments.
// Each value is extended as its location info requires, copied into its
// assigned register with the copies glued together, and a SPISD::RET_FLAG node
// is built from the chain, the return-address offset, the return registers and
// (if present) the glue value.
SDValue
SparcTargetLowering::LowerReturn_64(SDValue Chain, CallingConv::ID CallConv,
bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &DL, SelectionDAG &DAG) const {
// CCValAssign - represent the assignment of the return value to locations.
SmallVector<CCValAssign, 16> RVLocs;
// CCState - Info about the registers and stack slot.
CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
// Analyze return values.
CCInfo.AnalyzeReturn(Outs, RetCC_Sparc64);
SDValue Flag;
SmallVector<SDValue, 4> RetOps(1, Chain);
// The second operand on the return instruction is the return address offset.
// The return address is always %i7+8 with the 64-bit ABI.
RetOps.push_back(DAG.getConstant(8, DL, MVT::i32));
// Copy the result values into the output registers.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
SDValue OutVal = OutVals[i];
// Integer return values must be sign or zero extended by the callee.
switch (VA.getLocInfo()) {
case CCValAssign::Full: break;
case CCValAssign::SExt:
OutVal = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), OutVal);
break;
case CCValAssign::ZExt:
OutVal = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), OutVal);
break;
case CCValAssign::AExt:
OutVal = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), OutVal);
break;
default:
llvm_unreachable("Unknown loc info!");
}
// The custom bit on an i32 return value indicates that it should be passed
// in the high bits of the register.
if (VA.getValVT() == MVT::i32 && VA.needsCustom()) {
OutVal = DAG.getNode(ISD::SHL, DL, MVT::i64, OutVal,
DAG.getConstant(32, DL, MVT::i32));
// The next value may go in the low bits of the same register.
// Handle both at once.
if (i+1 < RVLocs.size() && RVLocs[i+1].getLocReg() == VA.getLocReg()) {
// Zero-extend the low half so the OR leaves the shifted high half intact.
SDValue NV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, OutVals[i+1]);
OutVal = DAG.getNode(ISD::OR, DL, MVT::i64, OutVal, NV);
// Skip the next value, it's already done.
++i;
}
}
Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), OutVal, Flag);
// Guarantee that all emitted copies are stuck together with flags.
Flag = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
}
RetOps[0] = Chain; // Update chain.
// Add the flag if we have it.
if (Flag.getNode())
RetOps.push_back(Flag);
return DAG.getNode(SPISD::RET_FLAG, DL, MVT::Other, RetOps);
}
/// Lower incoming formal arguments by dispatching to the ABI-specific
/// implementation: LowerFormalArguments_64 on 64-bit subtargets,
/// LowerFormalArguments_32 otherwise.
SDValue SparcTargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  return Subtarget->is64Bit()
             ? LowerFormalArguments_64(Chain, CallConv, IsVarArg, Ins,
                                       DL, DAG, InVals)
             : LowerFormalArguments_32(Chain, CallConv, IsVarArg, Ins,
                                       DL, DAG, InVals);
}
/// LowerFormalArguments_32 - V8 uses a very simple ABI, where all values are
/// passed in either one or two GPRs, including FP values. TODO: we should
/// pass FP values in FP registers for fastcc functions.
///
/// One SDValue is appended to InVals per incoming argument. For varargs
/// functions the argument registers not consumed by named arguments are
/// stored to the stack and the offset of the first one is recorded for
/// va_start. Returns the updated chain.
SDValue SparcTargetLowering::LowerFormalArguments_32(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineRegisterInfo &RegInfo = MF.getRegInfo();
SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
CCInfo.AnalyzeFormalArguments(Ins, CC_Sparc32);
// Added to every location's memory offset to form the fixed-object offset.
const unsigned StackOffset = 92;
bool IsLittleEndian = DAG.getDataLayout().isLittleEndian();
// i indexes ArgLocs and InIdx indexes Ins; they diverge when a custom value
// consumes two locations (the extra ++i below is not mirrored on InIdx).
unsigned InIdx = 0;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i, ++InIdx) {
CCValAssign &VA = ArgLocs[i];
if (Ins[InIdx].Flags.isSRet()) {
if (InIdx != 0)
report_fatal_error("sparc only supports sret on the first parameter");
// Get SRet from [%fp+64].
int FrameIdx = MF.getFrameInfo().CreateFixedObject(4, 64, true);
SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
SDValue Arg =
DAG.getLoad(MVT::i32, dl, Chain, FIPtr, MachinePointerInfo());
InVals.push_back(Arg);
continue;
}
if (VA.isRegLoc()) {
if (VA.needsCustom()) {
// A 64-bit value (f64 or v2i32) split in two: the first half is in a
// register, the second half is in a register or on the stack.
assert(VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2i32);
unsigned VRegHi = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
MF.getRegInfo().addLiveIn(VA.getLocReg(), VRegHi);
SDValue HiVal = DAG.getCopyFromReg(Chain, dl, VRegHi, MVT::i32);
assert(i+1 < e);
CCValAssign &NextVA = ArgLocs[++i];
SDValue LoVal;
if (NextVA.isMemLoc()) {
int FrameIdx = MF.getFrameInfo().
CreateFixedObject(4, StackOffset+NextVA.getLocMemOffset(),true);
SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
LoVal = DAG.getLoad(MVT::i32, dl, Chain, FIPtr, MachinePointerInfo());
} else {
unsigned loReg = MF.addLiveIn(NextVA.getLocReg(),
&SP::IntRegsRegClass);
LoVal = DAG.getCopyFromReg(Chain, dl, loReg, MVT::i32);
}
// On little-endian targets the two halves trade roles.
if (IsLittleEndian)
std::swap(LoVal, HiVal);
SDValue WholeValue =
DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, LoVal, HiVal);
WholeValue = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), WholeValue);
InVals.push_back(WholeValue);
continue;
}
// Plain register argument: arrives as an i32 in an integer register.
unsigned VReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
MF.getRegInfo().addLiveIn(VA.getLocReg(), VReg);
SDValue Arg = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
if (VA.getLocVT() == MVT::f32)
Arg = DAG.getNode(ISD::BITCAST, dl, MVT::f32, Arg);
else if (VA.getLocVT() != MVT::i32) {
// Any other type: note the sign extension, then truncate to the location type.
Arg = DAG.getNode(ISD::AssertSext, dl, MVT::i32, Arg,
DAG.getValueType(VA.getLocVT()));
Arg = DAG.getNode(ISD::TRUNCATE, dl, VA.getLocVT(), Arg);
}
InVals.push_back(Arg);
continue;
}
assert(VA.isMemLoc());
unsigned Offset = VA.getLocMemOffset()+StackOffset;
auto PtrVT = getPointerTy(DAG.getDataLayout());
if (VA.needsCustom()) {
assert(VA.getValVT() == MVT::f64 || VA.getValVT() == MVT::v2i32);
// If it is double-word aligned, just load.
if (Offset % 8 == 0) {
int FI = MF.getFrameInfo().CreateFixedObject(8,
Offset,
true);
SDValue FIPtr = DAG.getFrameIndex(FI, PtrVT);
SDValue Load =
DAG.getLoad(VA.getValVT(), dl, Chain, FIPtr, MachinePointerInfo());
InVals.push_back(Load);
continue;
}
// Otherwise load the two 32-bit halves separately and recombine them.
int FI = MF.getFrameInfo().CreateFixedObject(4,
Offset,
true);
SDValue FIPtr = DAG.getFrameIndex(FI, PtrVT);
SDValue HiVal =
DAG.getLoad(MVT::i32, dl, Chain, FIPtr, MachinePointerInfo());
int FI2 = MF.getFrameInfo().CreateFixedObject(4,
Offset+4,
true);
SDValue FIPtr2 = DAG.getFrameIndex(FI2, PtrVT);
SDValue LoVal =
DAG.getLoad(MVT::i32, dl, Chain, FIPtr2, MachinePointerInfo());
if (IsLittleEndian)
std::swap(LoVal, HiVal);
SDValue WholeValue =
DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, LoVal, HiVal);
WholeValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), WholeValue);
InVals.push_back(WholeValue);
continue;
}
// Plain stack argument: a single 4-byte slot.
int FI = MF.getFrameInfo().CreateFixedObject(4,
Offset,
true);
SDValue FIPtr = DAG.getFrameIndex(FI, PtrVT);
SDValue Load ;
if (VA.getValVT() == MVT::i32 || VA.getValVT() == MVT::f32) {
Load = DAG.getLoad(VA.getValVT(), dl, Chain, FIPtr, MachinePointerInfo());
} else if (VA.getValVT() == MVT::f128) {
report_fatal_error("SPARCv8 does not handle f128 in calls; "
"pass indirectly");
} else {
// We shouldn't see any other value types here.
llvm_unreachable("Unexpected ValVT encountered in frame lowering.");
}
InVals.push_back(Load);
}
if (MF.getFunction().hasStructRetAttr()) {
// Copy the SRet Argument to SRetReturnReg.
SparcMachineFunctionInfo *SFI = MF.getInfo<SparcMachineFunctionInfo>();
unsigned Reg = SFI->getSRetReturnReg();
if (!Reg) {
Reg = MF.getRegInfo().createVirtualRegister(&SP::IntRegsRegClass);
SFI->setSRetReturnReg(Reg);
}
// InVals[0] is the sret pointer: sret is only accepted as the first parameter.
SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
}
// Store remaining ArgRegs to the stack if this is a varargs function.
if (isVarArg) {
static const MCPhysReg ArgRegs[] = {
SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5
};
unsigned NumAllocated = CCInfo.getFirstUnallocated(ArgRegs);
const MCPhysReg *CurArgReg = ArgRegs+NumAllocated, *ArgRegEnd = ArgRegs+6;
unsigned ArgOffset = CCInfo.getNextStackOffset();
// With all six registers used, varargs start after the stack arguments;
// otherwise no stack was allocated and they start at the slot of the first
// unused register.
if (NumAllocated == 6)
ArgOffset += StackOffset;
else {
assert(!ArgOffset);
ArgOffset = 68+4*NumAllocated;
}
// Remember the vararg offset for the va_start implementation.
FuncInfo->setVarArgsFrameOffset(ArgOffset);
std::vector<SDValue> OutChains;
// Spill each remaining argument register to its 4-byte stack slot.
for (; CurArgReg != ArgRegEnd; ++CurArgReg) {
unsigned VReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
MF.getRegInfo().addLiveIn(*CurArgReg, VReg);
SDValue Arg = DAG.getCopyFromReg(DAG.getRoot(), dl, VReg, MVT::i32);
int FrameIdx = MF.getFrameInfo().CreateFixedObject(4, ArgOffset,
true);
SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
OutChains.push_back(
DAG.getStore(DAG.getRoot(), dl, Arg, FIPtr, MachinePointerInfo()));
ArgOffset += 4;
}
// Join the spill stores with the incoming chain.
if (!OutChains.empty()) {
OutChains.push_back(Chain);
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
}
}
return Chain;
}
// Lower formal arguments for the 64 bit ABI.
//
// Locations are assigned with CC_Sparc64. For every entry in the resulting
// ArgLocs array exactly one SDValue is appended to InVals: either a copy out
// of the live-in register (with the promotion undone) or a load from a fixed
// stack object in the incoming argument area.
//
// For variadic functions the frame offset of the first variable argument is
// recorded in SparcMachineFunctionInfo, and the argument registers that were
// not consumed by fixed arguments are stored to their stack slots.
//
// Returns the incoming Chain for non-variadic functions; for variadic
// functions returns a TokenFactor of the register spill stores when at least
// one register had to be spilled, otherwise the incoming Chain.
SDValue SparcTargetLowering::LowerFormalArguments_64(
SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
// Analyze arguments according to CC_Sparc64.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
CCInfo.AnalyzeFormalArguments(Ins, CC_Sparc64);
// The argument array begins at %fp+BIAS+128, after the register save area.
const unsigned ArgArea = 128;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
if (VA.isRegLoc()) {
// This argument is passed in a register.
// All integer register arguments are promoted by the caller to i64.
// Create a virtual register for the promoted live-in value.
unsigned VReg = MF.addLiveIn(VA.getLocReg(),
getRegClassFor(VA.getLocVT()));
SDValue Arg = DAG.getCopyFromReg(Chain, DL, VReg, VA.getLocVT());
// Get the high bits for i32 struct elements: a custom-flagged i32 is
// shifted down by 32 bits before any further processing.
if (VA.getValVT() == MVT::i32 && VA.needsCustom())
Arg = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Arg,
DAG.getConstant(32, DL, MVT::i32));
// The caller promoted the argument, so insert an Assert?ext SDNode so we
// won't promote the value again in this function.
switch (VA.getLocInfo()) {
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Arg,
DAG.getValueType(VA.getValVT()));
break;
case CCValAssign::ZExt:
Arg = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Arg,
DAG.getValueType(VA.getValVT()));
break;
default:
// Any other location info needs no assertion node.
break;
}
// Truncate the register down to the argument type.
if (VA.isExtInLoc())
Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
InVals.push_back(Arg);
continue;
}
// The registers are exhausted. This argument was passed on the stack.
assert(VA.isMemLoc());
// The CC_Sparc64_Full/Half functions compute stack offsets relative to the
// beginning of the arguments area at %fp+BIAS+128.
unsigned Offset = VA.getLocMemOffset() + ArgArea;
// Size of the value itself in bytes (not of the slot it was passed in).
unsigned ValSize = VA.getValVT().getSizeInBits() / 8;
// Adjust offset for extended arguments, SPARC is big-endian.
// The caller will have written the full slot with extended bytes, but we
// prefer our own extending loads.
// The adjustment below assumes the extended value occupies an 8-byte slot.
if (VA.isExtInLoc())
Offset += 8 - ValSize;
int FI = MF.getFrameInfo().CreateFixedObject(ValSize, Offset, true);
InVals.push_back(
DAG.getLoad(VA.getValVT(), DL, Chain,
DAG.getFrameIndex(FI, getPointerTy(MF.getDataLayout())),
MachinePointerInfo::getFixedStack(MF, FI)));
}
// Nothing more to do unless the function is variadic.
if (!IsVarArg)
return Chain;
// This function takes variable arguments, some of which may have been passed
// in registers %i0-%i5. Variable floating point arguments are never passed
// in floating point registers. They go on %i0-%i5 or on the stack like
// integer arguments.
//
// The va_start intrinsic needs to know the offset to the first variable
// argument.
unsigned ArgOffset = CCInfo.getNextStackOffset();
SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
// Skip the 128 bytes of register save area. Unlike the frame objects created
// in this function, the recorded offset also includes the stack pointer bias.
FuncInfo->setVarArgsFrameOffset(ArgOffset + ArgArea +
Subtarget->getStackPointerBias());
// Save the variable arguments that were passed in registers.
// The caller is required to reserve stack space for 6 arguments regardless
// of how many arguments were actually passed.
SmallVector<SDValue, 8> OutChains;
// Each 8-byte slot below 6*8 corresponds to one of the six argument
// registers. NOTE(review): SP::I0 + ArgOffset/8 assumes SP::I0..SP::I5 are
// consecutive register enum values -- confirm against the generated RegInfo.
for (; ArgOffset < 6*8; ArgOffset += 8) {
unsigned VReg = MF.addLiveIn(SP::I0 + ArgOffset/8, &SP::I64RegsRegClass);
SDValue VArg = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
int FI = MF.getFrameInfo().CreateFixedObject(8, ArgOffset + ArgArea, true);
auto PtrVT = getPointerTy(MF.getDataLayout());
// Every spill store hangs off the incoming Chain; they are merged below.
OutChains.push_back(
DAG.getStore(Chain, DL, VArg, DAG.getFrameIndex(FI, PtrVT),
MachinePointerInfo::getFixedStack(MF, FI)));
}
// Join the independent spill stores into a single chain.
if (!OutChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
return Chain;
}
// Entry point for call lowering: forwards to the 64-bit or the 32-bit
// implementation depending on what the subtarget reports.
SDValue
SparcTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                               SmallVectorImpl<SDValue> &InVals) const {
  return Subtarget->is64Bit() ? LowerCall_64(CLI, InVals)
                              : LowerCall_32(CLI, InVals);
}
// Report whether the call being lowered carries the returns_twice attribute.
//
// When a call site is available it is queried directly. Otherwise the callee
// SDValue is inspected: a GlobalAddress node is checked through its global
// (if that global is a Function), and an ExternalSymbol node is looked up by
// name in the module containing the current function. If no Function can be
// found the answer is false.
static bool hasReturnsTwiceAttr(SelectionDAG &DAG, SDValue Callee,
                                ImmutableCallSite CS) {
  if (CS)
    return CS.hasFnAttr(Attribute::ReturnsTwice);

  const Function *Target = nullptr;
  if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) {
    Target = dyn_cast<Function>(GA->getGlobal());
  } else if (ExternalSymbolSDNode *Sym =
                 dyn_cast<ExternalSymbolSDNode>(Callee)) {
    // Resolve the symbol name against the module of the calling function.
    const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
    Target = Mod->getFunction(Sym->getSymbol());
  }

  return Target && Target->hasFnAttribute(Attribute::ReturnsTwice);
}
// Lower a call for the 32-bit ABI.
//
// Reads the call description from CLI (DAG, DL, Outs, OutVals, Ins, Chain,
// Callee, CallConv, IsVarArg) and forces CLI.IsTailCall to false. The steps
// are, in order:
//   1. assign argument locations with CC_Sparc32;
//   2. copy every non-empty byval argument into a fresh stack object;
//   3. open the call sequence and emit stores / collect register copies for
//      each argument;
//   4. emit the SPISD::CALL node and close the call sequence;
//   5. copy the results (locations assigned with RetCC_Sparc32) into InVals.
// Returns the chain after the last result copy.
SDValue
SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
SDLoc &dl = CLI.DL;
SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
bool &isTailCall = CLI.IsTailCall;
CallingConv::ID CallConv = CLI.CallConv;
bool isVarArg = CLI.IsVarArg;
// Sparc target does not yet support tail call optimization.
isTailCall = false;
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
CCInfo.AnalyzeCallOperands(Outs, CC_Sparc32);
// Get the size of the outgoing arguments stack space requirement.
unsigned ArgsSize = CCInfo.getNextStackOffset();
// Keep stack frames 8-byte aligned (round up to the next multiple of 8).
ArgsSize = (ArgsSize+7) & ~7;
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
// Create local copies for byval args.
// ByValArgs receives one entry per byval argument, in argument order; a
// zero-sized byval argument is recorded as a null SDValue.
SmallVector<SDValue, 8> ByValArgs;
for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
ISD::ArgFlagsTy Flags = Outs[i].Flags;
if (!Flags.isByVal())
continue;
SDValue Arg = OutVals[i];
unsigned Size = Flags.getByValSize();
unsigned Align = Flags.getByValAlign();
if (Size > 0U) {
int FI = MFI.CreateStackObject(Size, Align, false);
SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
SDValue SizeNode = DAG.getConstant(Size, dl, MVT::i32);
Chain = DAG.getMemcpy(Chain, dl, FIPtr, Arg, SizeNode, Align,
false, // isVolatile,
(Size <= 32), // AlwaysInline if size <= 32,
false, // isTailCall
MachinePointerInfo(), MachinePointerInfo());
ByValArgs.push_back(FIPtr);
}
else {
SDValue nullVal;
ByValArgs.push_back(nullVal);
}
}
Chain = DAG.getCALLSEQ_START(Chain, ArgsSize, 0, dl);
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
// Offset from %sp added to every memory-located argument.
// NOTE(review): presumably 64 bytes of register window save area + 4 bytes
// for the struct-return slot (stored at %sp+64 below) + 6*4 bytes reserved
// for register arguments -- confirm against the SPARC V8 ABI.
const unsigned StackOffset = 92;
bool hasStructRetAttr = false;
unsigned SRetArgSize = 0;
// Walk the register/memloc assignments, inserting copies/loads.
// i indexes ArgLocs while realArgIdx indexes Outs/OutVals; they diverge when
// one argument occupies two ArgLocs entries (see the register-pair case).
for (unsigned i = 0, realArgIdx = 0, byvalArgIdx = 0, e = ArgLocs.size();
i != e;
++i, ++realArgIdx) {
CCValAssign &VA = ArgLocs[i];
SDValue Arg = OutVals[realArgIdx];
ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
// Use local copy if it is a byval arg.
if (Flags.isByVal()) {
Arg = ByValArgs[byvalArgIdx++];
// A null entry marks a zero-sized byval argument: nothing is passed.
if (!Arg) {
continue;
}
}
// Promote the value if needed.
switch (VA.getLocInfo()) {
default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
break;
case CCValAssign::ZExt:
Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
break;
case CCValAssign::AExt:
Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
break;
case CCValAssign::BCvt:
Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
break;
}
if (Flags.isSRet()) {
assert(VA.needsCustom());
// store SRet argument in %sp+64
SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
SDValue PtrOff = DAG.getIntPtrConstant(64, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
MemOpChains.push_back(
DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
hasStructRetAttr = true;
// sret only allowed on first argument
assert(Outs[realArgIdx].OrigArgIndex == 0);
// Remember the allocation size of the returned struct; it is attached to
// the call node as an extra operand further down.
PointerType *Ty = cast<PointerType>(CLI.getArgs()[0].Ty);
Type *ElementTy = Ty->getElementType();
SRetArgSize = DAG.getDataLayout().getTypeAllocSize(ElementTy);
continue;
}
if (VA.needsCustom()) {
// Custom locations are 64-bit values (f64 or v2i32) that may have to be
// split into two 32-bit halves.
assert(VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2i32);
if (VA.isMemLoc()) {
unsigned Offset = VA.getLocMemOffset() + StackOffset;
// if it is double-word aligned, just store.
if (Offset % 8 == 0) {
SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
MemOpChains.push_back(
DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
continue;
}
}
if (VA.getLocVT() == MVT::f64) {
// Move from the float value from float registers into the
// integer registers.
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Arg))
Arg = bitcastConstantFPToInt(C, dl, DAG);
else
Arg = DAG.getNode(ISD::BITCAST, dl, MVT::v2i32, Arg);
}
// Split the value into its two i32 vector elements.
SDValue Part0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
Arg,
DAG.getConstant(0, dl, getVectorIdxTy(DAG.getDataLayout())));
SDValue Part1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
Arg,
DAG.getConstant(1, dl, getVectorIdxTy(DAG.getDataLayout())));
if (VA.isRegLoc()) {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Part0));
// The second half has its own ArgLocs entry. Consume it here: i is
// advanced without advancing realArgIdx because both entries belong
// to the same outgoing value.
assert(i+1 != e);
CCValAssign &NextVA = ArgLocs[++i];
if (NextVA.isRegLoc()) {
RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Part1));
} else {
// Store the second part in stack.
unsigned Offset = NextVA.getLocMemOffset() + StackOffset;
SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
MemOpChains.push_back(
DAG.getStore(Chain, dl, Part1, PtrOff, MachinePointerInfo()));
}
} else {
// Memory location that is not double-word aligned: store the two
// halves separately.
unsigned Offset = VA.getLocMemOffset() + StackOffset;
// Store the first part.
SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
MemOpChains.push_back(
DAG.getStore(Chain, dl, Part0, PtrOff, MachinePointerInfo()));
// Store the second part.
PtrOff = DAG.getIntPtrConstant(Offset + 4, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
MemOpChains.push_back(
DAG.getStore(Chain, dl, Part1, PtrOff, MachinePointerInfo()));
}
continue;
}
// Arguments that can be passed in a register must be kept in the
// RegsToPass vector.
if (VA.isRegLoc()) {
if (VA.getLocVT() != MVT::f32) {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
continue;
}
// An f32 assigned to a register is passed as its i32 bit pattern.
Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
continue;
}
assert(VA.isMemLoc());
// Create a store off the stack pointer for this argument.
SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset() + StackOffset,
dl);
PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
MemOpChains.push_back(
DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
}
// Emit all stores, make sure they occur before any copies into physregs.
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
// Build a sequence of copy-to-reg nodes chained together with token
// chain and flag operands which copy the outgoing args into registers.
// The InFlag is necessary since all emitted instructions must be
// stuck together.
SDValue InFlag;
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
// NOTE(review): toCallerWindow presumably maps the register assigned by the
// calling convention to its caller-side equivalent -- confirm against the
// helper's definition.
unsigned Reg = toCallerWindow(RegsToPass[i].first);
Chain = DAG.getCopyToReg(Chain, dl, Reg, RegsToPass[i].second, InFlag);
InFlag = Chain.getValue(1);
}
bool hasReturnsTwice = hasReturnsTwiceAttr(DAG, Callee, CLI.CS);
// If the callee is a GlobalAddress node (quite common, every direct call is)
// turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
// Likewise ExternalSymbol -> TargetExternalSymbol.
// Position-independent code tags the callee with the WPLT30 target flag.
unsigned TF = isPositionIndependent() ? SparcMCExpr::VK_Sparc_WPLT30 : 0;
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, MVT::i32, 0, TF);
else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i32, TF);
// Returns a chain & a flag for retval copy to use
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
// Call operands, in order: chain, callee, optional sret size, argument
// registers, register mask, optional glue.
SmallVector<SDValue, 8> Ops;
Ops.push_back(Chain);
Ops.push_back(Callee);
if (hasStructRetAttr)
Ops.push_back(DAG.getTargetConstant(SRetArgSize, dl, MVT::i32));
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
Ops.push_back(DAG.getRegister(toCallerWindow(RegsToPass[i].first),
RegsToPass[i].second.getValueType()));
// Add a register mask operand representing the call-preserved registers.
// Calls to returns_twice functions use a dedicated preserved mask.
const SparcRegisterInfo *TRI = Subtarget->getRegisterInfo();
const uint32_t *Mask =
((hasReturnsTwice)
? TRI->getRTCallPreservedMask(CallConv)
: TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv));
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
if (InFlag.getNode())
Ops.push_back(InFlag);
Chain = DAG.getNode(SPISD::CALL, dl, NodeTys, Ops);
InFlag = Chain.getValue(1);
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(ArgsSize, dl, true),
DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
InFlag = Chain.getValue(1);
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
CCState RVInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
RVInfo.AnalyzeCallResult(Ins, RetCC_Sparc32);
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
if (RVLocs[i].getLocVT() == MVT::v2i32) {
// A v2i32 result occupies two consecutive RVLocs entries (note the i++
// in the first copy); the two i32 copies are reassembled into a vector.
SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2i32);
SDValue Lo = DAG.getCopyFromReg(
Chain, dl, toCallerWindow(RVLocs[i++].getLocReg()), MVT::i32, InFlag);
Chain = Lo.getValue(1);
InFlag = Lo.getValue(2);
Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2i32, Vec, Lo,
DAG.getConstant(0, dl, MVT::i32));
SDValue Hi = DAG.getCopyFromReg(
Chain, dl, toCallerWindow(RVLocs[i].getLocReg()), MVT::i32, InFlag);
Chain = Hi.getValue(1);
InFlag = Hi.getValue(2);
Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2i32, Vec, Hi,
DAG.getConstant(1, dl, MVT::i32));
InVals.push_back(Vec);
} else {
// Chain is set to result #1 of the new CopyFromReg node; result #2 of
// that same node is the glue and result #0 is the copied value.
Chain =
DAG.getCopyFromReg(Chain, dl, toCallerWindow(RVLocs[i].getLocReg()),
RVLocs[i].getValVT(), InFlag)
.getValue(1);
InFlag = Chain.getValue(2);
InVals.push_back(Chain.getValue(0));
}
}
return Chain;
}
// Map a register name ("i0".."i7", "o0".."o7", "l0".."l7", "g0".."g7") to the
// matching SP:: register. Any other name is a fatal error. VT and DAG are not
// consulted.
//
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
unsigned SparcTargetLowering::getRegisterByName(const char* RegName, EVT VT,
                                                SelectionDAG &DAG) const {
  const unsigned Found = StringSwitch<unsigned>(RegName)
    // %i registers
    .Case("i0", SP::I0).Case("i1", SP::I1)
    .Case("i2", SP::I2).Case("i3", SP::I3)
    .Case("i4", SP::I4).Case("i5", SP::I5)
    .Case("i6", SP::I6).Case("i7", SP::I7)
    // %o registers
    .Case("o0", SP::O0).Case("o1", SP::O1)
    .Case("o2", SP::O2).Case("o3", SP::O3)
    .Case("o4", SP::O4).Case("o5", SP::O5)
    .Case("o6", SP::O6).Case("o7", SP::O7)
    // %l registers
    .Case("l0", SP::L0).Case("l1", SP::L1)
    .Case("l2", SP::L2).Case("l3", SP::L3)
    .Case("l4", SP::L4).Case("l5", SP::L5)
    .Case("l6", SP::L6).Case("l7", SP::L7)
    // %g registers
    .Case("g0", SP::G0).Case("g1", SP::G1)
    .Case("g2", SP::G2).Case("g3", SP::G3)
    .Case("g4", SP::G4).Case("g5", SP::G5)
    .Case("g6", SP::G6).Case("g7", SP::G7)
    .Default(0);

  // 0 is the "no match" sentinel supplied to Default() above.
  if (!Found)
    report_fatal_error("Invalid register name global variable");
  return Found;
}
// Fixup floating point arguments in the ... part of a varargs call.
//
// The SPARC v9 ABI requires that floating point arguments are treated the same
// as integers when calling a varargs function. This does not apply to the
// fixed arguments that are part of the function's prototype.
//
// This function post-processes, in place, a CCValAssign array created by
// AnalyzeCallOperands(): every non-fixed f64/f128 argument that was assigned
// to a register is re-assigned either to an integer register (starting at
// SP::I0) or to memory.
static void fixupVariableFloatArgs(SmallVectorImpl<CCValAssign> &ArgLocs,
                                   ArrayRef<ISD::OutputArg> Outs) {
  for (CCValAssign &Loc : ArgLocs) {
    const MVT LocTy = Loc.getLocVT();
    const bool IsF64 = LocTy == MVT::f64;
    const bool IsF128 = LocTy == MVT::f128;

    // Only register-assigned f64/f128 values are candidates.
    // FIXME: What about f32 arguments? C promotes them to f64 when calling
    // varargs functions.
    if (!Loc.isRegLoc() || !(IsF64 || IsF128))
      continue;

    // The fixed arguments to a varargs function still go in FP registers.
    if (Outs[Loc.getValNo()].IsFixed)
      continue;

    // Translate the FP register number into a byte offset within the
    // argument array.
    const unsigned BaseReg = IsF64 ? SP::D0 : SP::Q0;
    const unsigned BytesPerArg = IsF64 ? 8 : 16;
    const unsigned Offset = BytesPerArg * (Loc.getLocReg() - BaseReg);
    assert(Offset < 16*8 && "Offset out of range, bad register enum?");

    if (Offset >= 6*8) {
      // This needs to go to memory, we're out of integer registers.
      CCValAssign InMemory =
          CCValAssign::getMem(Loc.getValNo(), Loc.getValVT(), Offset,
                              Loc.getLocVT(), Loc.getLocInfo());
      Loc = InMemory;
      continue;
    }

    // This argument should go in %i0-%i5.
    const unsigned IntReg = SP::I0 + Offset/8;
    CCValAssign InRegister;
    if (IsF64) {
      // Full register, just bitconvert into i64.
      InRegister = CCValAssign::getReg(Loc.getValNo(), Loc.getValVT(), IntReg,
                                       MVT::i64, CCValAssign::BCvt);
    } else {
      assert(IsF128 && "Unexpected type!");
      // Full register, just bitconvert into i128 -- We will lower this into
      // two i64s in LowerCall_64.
      InRegister =
          CCValAssign::getCustomReg(Loc.getValNo(), Loc.getValVT(), IntReg,
                                    MVT::i128, CCValAssign::BCvt);
    }
    Loc = InRegister;
  }
}
// Lower a call for the 64-bit ABI.
//
// Reads the call description from CLI and forces CLI.IsTailCall to false.
// The steps are, in order:
//   1. assign argument locations with CC_Sparc64 (re-assigning variable
//      floating point arguments for varargs calls);
//   2. open the call sequence and emit stores / collect register copies for
//      each argument;
//   3. emit the SPISD::CALL node and close the call sequence;
//   4. copy the results (locations assigned with RetCC_Sparc64) into InVals.
// Returns the chain after the last result copy.
SDValue
SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
SDLoc DL = CLI.DL;
SDValue Chain = CLI.Chain;
auto PtrVT = getPointerTy(DAG.getDataLayout());
// Sparc target does not yet support tail call optimization.
CLI.IsTailCall = false;
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
CCInfo.AnalyzeCallOperands(CLI.Outs, CC_Sparc64);
// Get the size of the outgoing arguments stack space requirement.
// The stack offset computed by CC_Sparc64 includes all arguments.
// Called functions expect 6 argument words to exist in the stack frame, used
// or not.
unsigned ArgsSize = std::max(6*8u, CCInfo.getNextStackOffset());
// Keep stack frames 16-byte aligned.
ArgsSize = alignTo(ArgsSize, 16);
// Varargs calls require special treatment.
if (CLI.IsVarArg)
fixupVariableFloatArgs(ArgLocs, CLI.Outs);
// Adjust the stack pointer to make room for the arguments.
// FIXME: Use hasReservedCallFrame to avoid %sp adjustments around all calls
// with more than 6 arguments.
Chain = DAG.getCALLSEQ_START(Chain, ArgsSize, 0, DL);
// Collect the set of registers to pass to the function and their values.
// This will be emitted as a sequence of CopyToReg nodes glued to the call
// instruction.
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
// Collect chains from all the memory operations that copy arguments to the
// stack. They must follow the stack pointer adjustment above and precede the
// call instruction itself.
SmallVector<SDValue, 8> MemOpChains;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
const CCValAssign &VA = ArgLocs[i];
SDValue Arg = CLI.OutVals[i];
// Promote the value if needed.
switch (VA.getLocInfo()) {
default:
llvm_unreachable("Unknown location info!");
case CCValAssign::Full:
break;
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::ZExt:
Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::AExt:
Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::BCvt:
// fixupVariableFloatArgs() may create bitcasts from f128 to i128. But
// SPARC does not support i128 natively. Lower it into two i64, see below.
// Every other BCvt location gets a plain bitcast here.
if (!VA.needsCustom() || VA.getValVT() != MVT::f128
|| VA.getLocVT() != MVT::i128)
Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
break;
}
if (VA.isRegLoc()) {
if (VA.needsCustom() && VA.getValVT() == MVT::f128
&& VA.getLocVT() == MVT::i128) {
// Store and reload into the integer register reg and reg+1.
// The temporary lives in the outgoing argument slot that corresponds to
// the assigned register (8 bytes per register, counted from SP::I0).
unsigned Offset = 8 * (VA.getLocReg() - SP::I0);
unsigned StackOffset = Offset + Subtarget->getStackPointerBias() + 128;
SDValue StackPtr = DAG.getRegister(SP::O6, PtrVT);
// The high half is read from the lower address, the low half 8 bytes
// above it.
SDValue HiPtrOff = DAG.getIntPtrConstant(StackOffset, DL);
HiPtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, HiPtrOff);
SDValue LoPtrOff = DAG.getIntPtrConstant(StackOffset + 8, DL);
LoPtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, LoPtrOff);
// Store to %sp+BIAS+128+Offset
SDValue Store =
DAG.getStore(Chain, DL, Arg, HiPtrOff, MachinePointerInfo());
// Load into Reg and Reg+1. Both loads are chained on the store so they
// observe the value just written.
SDValue Hi64 =
DAG.getLoad(MVT::i64, DL, Store, HiPtrOff, MachinePointerInfo());
SDValue Lo64 =
DAG.getLoad(MVT::i64, DL, Store, LoPtrOff, MachinePointerInfo());
RegsToPass.push_back(std::make_pair(toCallerWindow(VA.getLocReg()),
Hi64));
RegsToPass.push_back(std::make_pair(toCallerWindow(VA.getLocReg()+1),
Lo64));
continue;
}
// The custom bit on an i32 argument indicates that it should be
// passed in the high bits of the register.
if (VA.getValVT() == MVT::i32 && VA.needsCustom()) {
Arg = DAG.getNode(ISD::SHL, DL, MVT::i64, Arg,
DAG.getConstant(32, DL, MVT::i32));
// The next value may go in the low bits of the same register.
// Handle both at once.
if (i+1 < ArgLocs.size() && ArgLocs[i+1].isRegLoc() &&
ArgLocs[i+1].getLocReg() == VA.getLocReg()) {
SDValue NV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64,
CLI.OutVals[i+1]);
Arg = DAG.getNode(ISD::OR, DL, MVT::i64, Arg, NV);
// Skip the next value, it's already done.
++i;
}
}
RegsToPass.push_back(std::make_pair(toCallerWindow(VA.getLocReg()), Arg));
continue;
}
assert(VA.isMemLoc());
// Create a store off the stack pointer for this argument.
SDValue StackPtr = DAG.getRegister(SP::O6, PtrVT);
// The argument area starts at %fp+BIAS+128 in the callee frame,
// %sp+BIAS+128 in ours.
SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset() +
Subtarget->getStackPointerBias() +
128, DL);
PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
MemOpChains.push_back(
DAG.getStore(Chain, DL, Arg, PtrOff, MachinePointerInfo()));
}
// Emit all stores, make sure they occur before the call.
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
// Build a sequence of CopyToReg nodes glued together with token chain and
// glue operands which copy the outgoing args into registers. The InGlue is
// necessary since all emitted instructions must be stuck together in order
// to pass the live physical registers.
// Unlike the 32-bit path, RegsToPass here already holds the registers after
// toCallerWindow() was applied, so they are used as-is.
SDValue InGlue;
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
Chain = DAG.getCopyToReg(Chain, DL,
RegsToPass[i].first, RegsToPass[i].second, InGlue);
InGlue = Chain.getValue(1);
}
// If the callee is a GlobalAddress node (quite common, every direct call is)
// turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
// Likewise ExternalSymbol -> TargetExternalSymbol.
SDValue Callee = CLI.Callee;
bool hasReturnsTwice = hasReturnsTwiceAttr(DAG, Callee, CLI.CS);
// Position-independent code tags the callee with the WPLT30 target flag.
unsigned TF = isPositionIndependent() ? SparcMCExpr::VK_Sparc_WPLT30 : 0;
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, PtrVT, 0, TF);
else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT, TF);
// Build the operands for the call instruction itself, in order: chain,
// callee, argument registers, register mask, optional glue.
SmallVector<SDValue, 8> Ops;
Ops.push_back(Chain);
Ops.push_back(Callee);
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
Ops.push_back(DAG.getRegister(RegsToPass[i].first,
RegsToPass[i].second.getValueType()));
// Add a register mask operand representing the call-preserved registers.
// Calls to returns_twice functions use a dedicated preserved mask.
const SparcRegisterInfo *TRI = Subtarget->getRegisterInfo();
const uint32_t *Mask =
((hasReturnsTwice) ? TRI->getRTCallPreservedMask(CLI.CallConv)
: TRI->getCallPreservedMask(DAG.getMachineFunction(),
CLI.CallConv));
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
// Make sure the CopyToReg nodes are glued to the call instruction which
// consumes the registers.
if (InGlue.getNode())
Ops.push_back(InGlue);
// Now the call itself.
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
Chain = DAG.getNode(SPISD::CALL, DL, NodeTys, Ops);
InGlue = Chain.getValue(1);
// Revert the stack pointer immediately after the call.
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(ArgsSize, DL, true),
DAG.getIntPtrConstant(0, DL, true), InGlue, DL);
InGlue = Chain.getValue(1);
// Now extract the return values. This is more or less the same as
// LowerFormalArguments_64.
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
CCState RVInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
// Set inreg flag manually for codegen generated library calls that
// return float. Such calls are recognised by having no call site and a
// single f32 result.
if (CLI.Ins.size() == 1 && CLI.Ins[0].VT == MVT::f32 && !CLI.CS)
CLI.Ins[0].Flags.setInReg();
RVInfo.AnalyzeCallResult(CLI.Ins, RetCC_Sparc64);
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
CCValAssign &VA = RVLocs[i];
unsigned Reg = toCallerWindow(VA.getLocReg());
// When returning 'inreg {i32, i32 }', two consecutive i32 arguments can
// reside in the same register in the high and low bits. Reuse the
// CopyFromReg previous node to avoid duplicate copies.
SDValue RV;
// The reuse applies only if Chain currently is a CopyFromReg whose
// register operand (operand #1) is the register wanted now.
if (RegisterSDNode *SrcReg = dyn_cast<RegisterSDNode>(Chain.getOperand(1)))
if (SrcReg->getReg() == Reg && Chain->getOpcode() == ISD::CopyFromReg)
RV = Chain.getValue(0);
// But usually we'll create a new CopyFromReg for a different register.
if (!RV.getNode()) {
RV = DAG.getCopyFromReg(Chain, DL, Reg, RVLocs[i].getLocVT(), InGlue);
Chain = RV.getValue(1);
InGlue = Chain.getValue(2);
}
// Get the high bits for i32 struct elements.
if (VA.getValVT() == MVT::i32 && VA.needsCustom())
RV = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), RV,
DAG.getConstant(32, DL, MVT::i32));
// The callee promoted the return value, so insert an Assert?ext SDNode so
// we won't promote the value again in this function.
switch (VA.getLocInfo()) {
case CCValAssign::SExt:
RV = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), RV,
DAG.getValueType(VA.getValVT()));
break;
case CCValAssign::ZExt:
RV = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), RV,
DAG.getValueType(VA.getValVT()));
break;
default:
// Any other location info needs no assertion node.
break;
}
// Truncate the register down to the return value type.
if (VA.isExtInLoc())
RV = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), RV);
InVals.push_back(RV);
}
return Chain;
}
//===----------------------------------------------------------------------===//
// TargetLowering Implementation
//===----------------------------------------------------------------------===//
/// Decide how an atomicrmw instruction is expanded at the IR level.
///
/// A 32-bit exchange is not expanded (per the original note, it uses the xchg
/// instruction); every other atomicrmw is expanded via CmpXChg.
TargetLowering::AtomicExpansionKind
SparcTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  const bool IsWordXchg =
      AI->getOperation() == AtomicRMWInst::Xchg &&
      AI->getType()->getPrimitiveSizeInBits() == 32;
  return IsWordXchg ? AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg;
}
/// IntCondCCodeToICC - Translate a DAG integer condition code into the
/// corresponding SPARC ICC condition. Codes without a mapping are treated as
/// unreachable.
static SPCC::CondCodes IntCondCCodeToICC(ISD::CondCode CC) {
  switch (CC) {
  // Equality.
  case ISD::SETEQ:
    return SPCC::ICC_E;
  case ISD::SETNE:
    return SPCC::ICC_NE;
  // SETLT / SETGT / SETLE / SETGE.
  case ISD::SETLT:
    return SPCC::ICC_L;
  case ISD::SETGT:
    return SPCC::ICC_G;
  case ISD::SETLE:
    return SPCC::ICC_LE;
  case ISD::SETGE:
    return SPCC::ICC_GE;
  // SETU* variants.
  case ISD::SETULT:
    return SPCC::ICC_CS;
  case ISD::SETULE:
    return SPCC::ICC_LEU;
  case ISD::SETUGT:
    return SPCC::ICC_GU;
  case ISD::SETUGE:
    return SPCC::ICC_CC;
  default:
    llvm_unreachable("Unknown integer condition code!");
  }
}
/// FPCondCCodeToFCC - Translate a DAG floating point condition code into the
/// corresponding SPARC FCC condition. Codes without a mapping are treated as
/// unreachable.
static SPCC::CondCodes FPCondCCodeToFCC(ISD::CondCode CC) {
  switch (CC) {
  // The plain and the SETO* spelling of each comparison share one FCC code;
  // SETNE shares its code with SETUNE.
  case ISD::SETEQ:
  case ISD::SETOEQ:
    return SPCC::FCC_E;
  case ISD::SETNE:
  case ISD::SETUNE:
    return SPCC::FCC_NE;
  case ISD::SETLT:
  case ISD::SETOLT:
    return SPCC::FCC_L;
  case ISD::SETGT:
  case ISD::SETOGT:
    return SPCC::FCC_G;
  case ISD::SETLE:
  case ISD::SETOLE:
    return SPCC::FCC_LE;
  case ISD::SETGE:
  case ISD::SETOGE:
    return SPCC::FCC_GE;
  // SETU* comparisons.
  case ISD::SETULT:
    return SPCC::FCC_UL;
  case ISD::SETULE:
    return SPCC::FCC_ULE;
  case ISD::SETUGT:
    return SPCC::FCC_UG;
  case ISD::SETUGE:
    return SPCC::FCC_UGE;
  case ISD::SETUEQ:
    return SPCC::FCC_UE;
  // Remaining codes.
  case ISD::SETUO:
    return SPCC::FCC_U;
  case ISD::SETO:
    return SPCC::FCC_O;
  case ISD::SETONE:
    return SPCC::FCC_LG;
  default:
    llvm_unreachable("Unknown fp condition code!");
  }
}
// Construct the SPARC lowering description. Based on the subtarget's
// features (soft-float, 64-bit, V9, hard quad, Leon CASA, soft mul/div, ...)
// this registers the register classes, records how each (operation, type)
// pair is legalized, sets atomic size limits, and overrides runtime library
// call names. The calls are order-sensitive: a later setOperationAction for
// the same (operation, type) pair replaces an earlier one (e.g. the v2i32
// "expand everything" loop followed by the Legal overrides).
SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
const SparcSubtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
// Integer type with the same bit width as a pointer in address space 0.
MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize(0));
// Instructions which use registers as conditionals examine all the
// bits (as does the pseudo SELECT_CC expansion). I don't think it
// matters much whether it's ZeroOrOneBooleanContent, or
// ZeroOrNegativeOneBooleanContent, so, arbitrarily choose the
// former.
setBooleanContents(ZeroOrOneBooleanContent);
setBooleanVectorContents(ZeroOrOneBooleanContent);
// Set up the register classes.
addRegisterClass(MVT::i32, &SP::IntRegsRegClass);
// Floating point register classes are only registered when not using
// software floating point.
if (!Subtarget->useSoftFloat()) {
addRegisterClass(MVT::f32, &SP::FPRegsRegClass);
addRegisterClass(MVT::f64, &SP::DFPRegsRegClass);
addRegisterClass(MVT::f128, &SP::QFPRegsRegClass);
}
if (Subtarget->is64Bit()) {
addRegisterClass(MVT::i64, &SP::I64RegsRegClass);
} else {
// On 32bit sparc, we define a double-register 32bit register
// class, as well. This is modeled in LLVM as a 2-vector of i32.
addRegisterClass(MVT::v2i32, &SP::IntPairRegClass);
// ...but almost all operations must be expanded, so set that as
// the default.
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
setOperationAction(Op, MVT::v2i32, Expand);
}
// Truncating/extending stores/loads are also not supported.
for (MVT VT : MVT::integer_vector_valuetypes()) {
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Expand);
setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i32, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Expand);
setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, VT, Expand);
setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, VT, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, VT, Expand);
setTruncStoreAction(VT, MVT::v2i32, Expand);
setTruncStoreAction(MVT::v2i32, VT, Expand);
}
// However, load and store *are* legal.
// (These override the blanket Expand set by the loop above, as do the
// element-extract and build-vector entries.)
setOperationAction(ISD::LOAD, MVT::v2i32, Legal);
setOperationAction(ISD::STORE, MVT::v2i32, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Legal);
setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Legal);
// And we need to promote i64 loads/stores into vector load/store
setOperationAction(ISD::LOAD, MVT::i64, Custom);
setOperationAction(ISD::STORE, MVT::i64, Custom);
// Sadly, this doesn't work:
// AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
// AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
}
// Turn FP extload into load/fpextend
for (MVT VT : MVT::fp_valuetypes()) {
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
}
// Sparc doesn't have i1 sign extending load
for (MVT VT : MVT::integer_valuetypes())
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
// Turn FP truncstore into trunc + store.
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::f128, MVT::f32, Expand);
setTruncStoreAction(MVT::f128, MVT::f64, Expand);
// Custom legalize GlobalAddress nodes into LO/HI parts.
// The same custom handling is requested for TLS addresses, constant pool
// entries and block addresses of pointer type.
setOperationAction(ISD::GlobalAddress, PtrVT, Custom);
setOperationAction(ISD::GlobalTLSAddress, PtrVT, Custom);
setOperationAction(ISD::ConstantPool, PtrVT, Custom);
setOperationAction(ISD::BlockAddress, PtrVT, Custom);
// Sparc doesn't have sext_inreg, replace them with shl/sra
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
// Sparc has no REM or DIVREM operations.
setOperationAction(ISD::UREM, MVT::i32, Expand);
setOperationAction(ISD::SREM, MVT::i32, Expand);
setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
// ... nor does SparcV9.
if (Subtarget->is64Bit()) {
setOperationAction(ISD::UREM, MVT::i64, Expand);
setOperationAction(ISD::SREM, MVT::i64, Expand);
setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
}
// Custom expand fp<->sint
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
// Custom Expand fp<->uint
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
// Bitcasts between f32 and i32 are expanded.
setOperationAction(ISD::BITCAST, MVT::f32, Expand);
setOperationAction(ISD::BITCAST, MVT::i32, Expand);
// Sparc has no select or setcc: expand to SELECT_CC.
setOperationAction(ISD::SELECT, MVT::i32, Expand);
setOperationAction(ISD::SELECT, MVT::f32, Expand);
setOperationAction(ISD::SELECT, MVT::f64, Expand);
setOperationAction(ISD::SELECT, MVT::f128, Expand);
setOperationAction(ISD::SETCC, MVT::i32, Expand);
setOperationAction(ISD::SETCC, MVT::f32, Expand);
setOperationAction(ISD::SETCC, MVT::f64, Expand);
setOperationAction(ISD::SETCC, MVT::f128, Expand);
// Sparc doesn't have BRCOND either, it has BR_CC.
setOperationAction(ISD::BRCOND, MVT::Other, Expand);
setOperationAction(ISD::BRIND, MVT::Other, Expand);
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
// BR_CC and SELECT_CC are custom lowered for i32 and all FP types.
setOperationAction(ISD::BR_CC, MVT::i32, Custom);
setOperationAction(ISD::BR_CC, MVT::f32, Custom);
setOperationAction(ISD::BR_CC, MVT::f64, Custom);
setOperationAction(ISD::BR_CC, MVT::f128, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
// Carry-producing/consuming add and subtract are custom lowered.
setOperationAction(ISD::ADDC, MVT::i32, Custom);
setOperationAction(ISD::ADDE, MVT::i32, Custom);
setOperationAction(ISD::SUBC, MVT::i32, Custom);
setOperationAction(ISD::SUBE, MVT::i32, Custom);
// i64 counterparts of the settings above, plus the i64 bit-manipulation
// operations; only configured in 64-bit mode.
if (Subtarget->is64Bit()) {
setOperationAction(ISD::ADDC, MVT::i64, Custom);
setOperationAction(ISD::ADDE, MVT::i64, Custom);
setOperationAction(ISD::SUBC, MVT::i64, Custom);
setOperationAction(ISD::SUBE, MVT::i64, Custom);
setOperationAction(ISD::BITCAST, MVT::f64, Expand);
setOperationAction(ISD::BITCAST, MVT::i64, Expand);
setOperationAction(ISD::SELECT, MVT::i64, Expand);
setOperationAction(ISD::SETCC, MVT::i64, Expand);
setOperationAction(ISD::BR_CC, MVT::i64, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
// CTPOP is Legal only when the subtarget reports usePopc().
setOperationAction(ISD::CTPOP, MVT::i64,
Subtarget->usePopc() ? Legal : Expand);
setOperationAction(ISD::CTTZ , MVT::i64, Expand);
setOperationAction(ISD::CTLZ , MVT::i64, Expand);
setOperationAction(ISD::BSWAP, MVT::i64, Expand);
setOperationAction(ISD::ROTL , MVT::i64, Expand);
setOperationAction(ISD::ROTR , MVT::i64, Expand);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
}
// ATOMICs.
// Atomics are supported on SparcV9. 32-bit atomics are also
// supported by some Leon SparcV8 variants. Otherwise, atomics
// are unsupported.
if (Subtarget->isV9())
setMaxAtomicSizeInBitsSupported(64);
else if (Subtarget->hasLeonCasa())
setMaxAtomicSizeInBitsSupported(32);
else
setMaxAtomicSizeInBitsSupported(0);
setMinCmpXchgSizeInBits(32);
setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Legal);
setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Legal);
// Custom Lower Atomic LOAD/STORE
setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
if (Subtarget->is64Bit()) {
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Legal);
setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Legal);
setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Custom);
}
if (!Subtarget->is64Bit()) {
// These libcalls are not available in 32-bit.
setLibcallName(RTLIB::SHL_I128, nullptr);
setLibcallName(RTLIB::SRL_I128, nullptr);
setLibcallName(RTLIB::SRA_I128, nullptr);
}
if (!Subtarget->isV9()) {
// SparcV8 does not have FNEGD and FABSD.
setOperationAction(ISD::FNEG, MVT::f64, Custom);
setOperationAction(ISD::FABS, MVT::f64, Custom);
}
// FSIN, FCOS, FSINCOS, FREM and FMA are expanded for every FP type.
setOperationAction(ISD::FSIN , MVT::f128, Expand);
setOperationAction(ISD::FCOS , MVT::f128, Expand);
setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
setOperationAction(ISD::FREM , MVT::f128, Expand);
setOperationAction(ISD::FMA , MVT::f128, Expand);
setOperationAction(ISD::FSIN , MVT::f64, Expand);
setOperationAction(ISD::FCOS , MVT::f64, Expand);
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
setOperationAction(ISD::FREM , MVT::f64, Expand);
setOperationAction(ISD::FMA , MVT::f64, Expand);
setOperationAction(ISD::FSIN , MVT::f32, Expand);
setOperationAction(ISD::FCOS , MVT::f32, Expand);
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
setOperationAction(ISD::FREM , MVT::f32, Expand);
setOperationAction(ISD::FMA , MVT::f32, Expand);
// i32 bit-manipulation operations are expanded.
setOperationAction(ISD::CTTZ , MVT::i32, Expand);
setOperationAction(ISD::CTLZ , MVT::i32, Expand);
setOperationAction(ISD::ROTL , MVT::i32, Expand);
setOperationAction(ISD::ROTR , MVT::i32, Expand);
setOperationAction(ISD::BSWAP, MVT::i32, Expand);
// FCOPYSIGN and FPOW are expanded for every FP type.
setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
setOperationAction(ISD::FPOW , MVT::f128, Expand);
setOperationAction(ISD::FPOW , MVT::f64, Expand);
setOperationAction(ISD::FPOW , MVT::f32, Expand);
// Multi-part i32 shifts are expanded.
setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
// Expands to [SU]MUL_LOHI.
setOperationAction(ISD::MULHU, MVT::i32, Expand);
setOperationAction(ISD::MULHS, MVT::i32, Expand);
setOperationAction(ISD::MUL, MVT::i32, Expand);
// With software multiply/divide, the i32 multiply, divide and remainder
// operations are expanded and their libcall names are overridden.
if (Subtarget->useSoftMulDiv()) {
// .umul works for both signed and unsigned
setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
setLibcallName(RTLIB::MUL_I32, ".umul");
setOperationAction(ISD::SDIV, MVT::i32, Expand);
setLibcallName(RTLIB::SDIV_I32, ".div");
setOperationAction(ISD::UDIV, MVT::i32, Expand);
setLibcallName(RTLIB::UDIV_I32, ".udiv");
setLibcallName(RTLIB::SREM_I32, ".rem");
setLibcallName(RTLIB::UREM_I32, ".urem");
}
// 64-bit mode: wide multiplies and multi-part i64 shifts are expanded;
// overflow-checking multiplies are custom lowered.
if (Subtarget->is64Bit()) {
setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
setOperationAction(ISD::MULHU, MVT::i64, Expand);
setOperationAction(ISD::MULHS, MVT::i64, Expand);
setOperationAction(ISD::UMULO, MVT::i64, Custom);
setOperationAction(ISD::SMULO, MVT::i64, Custom);
setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
}
// VASTART needs to be custom lowered to use the VarArgsFrameIndex.
setOperationAction(ISD::VASTART , MVT::Other, Custom);
// VAARG needs to be lowered to not do unaligned accesses for doubles.
setOperationAction(ISD::VAARG , MVT::Other, Custom);
setOperationAction(ISD::TRAP , MVT::Other, Legal);
setOperationAction(ISD::DEBUGTRAP , MVT::Other, Legal);
// Use the default implementation.
setOperationAction(ISD::VACOPY , MVT::Other, Expand);
setOperationAction(ISD::VAEND , MVT::Other, Expand);
setOperationAction(ISD::STACKSAVE , MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE , MVT::Other, Expand);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom);
// O6 is the register saved/restored as the stack pointer.
setStackPointerRegisterToSaveRestore(SP::O6);
// CTPOP is Legal only when the subtarget reports usePopc().
setOperationAction(ISD::CTPOP, MVT::i32,
Subtarget->usePopc() ? Legal : Expand);
// f128 loads and stores are Legal only on V9 with hardware quad support;
// otherwise they are custom lowered.
if (Subtarget->isV9() && Subtarget->hasHardQuad()) {
setOperationAction(ISD::LOAD, MVT::f128, Legal);
setOperationAction(ISD::STORE, MVT::f128, Legal);
} else {
setOperationAction(ISD::LOAD, MVT::f128, Custom);
setOperationAction(ISD::STORE, MVT::f128, Custom);
}
// f128 arithmetic and conversions: Legal with hardware quad support,
// custom lowered (to library calls named below) without it.
if (Subtarget->hasHardQuad()) {
setOperationAction(ISD::FADD, MVT::f128, Legal);
setOperationAction(ISD::FSUB, MVT::f128, Legal);
setOperationAction(ISD::FMUL, MVT::f128, Legal);
setOperationAction(ISD::FDIV, MVT::f128, Legal);
setOperationAction(ISD::FSQRT, MVT::f128, Legal);
setOperationAction(ISD::FP_EXTEND, MVT::f128, Legal);
setOperationAction(ISD::FP_ROUND, MVT::f64, Legal);
if (Subtarget->isV9()) {
setOperationAction(ISD::FNEG, MVT::f128, Legal);
setOperationAction(ISD::FABS, MVT::f128, Legal);
} else {
setOperationAction(ISD::FNEG, MVT::f128, Custom);
setOperationAction(ISD::FABS, MVT::f128, Custom);
}
// In 32-bit mode the f128 <-> i64 conversions use the _Q_* routines.
if (!Subtarget->is64Bit()) {
setLibcallName(RTLIB::FPTOSINT_F128_I64, "_Q_qtoll");
setLibcallName(RTLIB::FPTOUINT_F128_I64, "_Q_qtoull");
setLibcallName(RTLIB::SINTTOFP_I64_F128, "_Q_lltoq");
setLibcallName(RTLIB::UINTTOFP_I64_F128, "_Q_ulltoq");
}
} else {
// Custom legalize f128 operations.
setOperationAction(ISD::FADD, MVT::f128, Custom);
setOperationAction(ISD::FSUB, MVT::f128, Custom);
setOperationAction(ISD::FMUL, MVT::f128, Custom);
setOperationAction(ISD::FDIV, MVT::f128, Custom);
setOperationAction(ISD::FSQRT, MVT::f128, Custom);
setOperationAction(ISD::FNEG, MVT::f128, Custom);
setOperationAction(ISD::FABS, MVT::f128, Custom);
setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
// Setup Runtime library names.
// 64-bit mode uses the _Qp_* names, 32-bit mode the _Q_* names; neither
// set is installed when using software floating point.
if (Subtarget->is64Bit() && !Subtarget->useSoftFloat()) {
setLibcallName(RTLIB::ADD_F128, "_Qp_add");
setLibcallName(RTLIB::SUB_F128, "_Qp_sub");
setLibcallName(RTLIB::MUL_F128, "_Qp_mul");
setLibcallName(RTLIB::DIV_F128, "_Qp_div");
setLibcallName(RTLIB::SQRT_F128, "_Qp_sqrt");
setLibcallName(RTLIB::FPTOSINT_F128_I32, "_Qp_qtoi");
setLibcallName(RTLIB::FPTOUINT_F128_I32, "_Qp_qtoui");
setLibcallName(RTLIB::SINTTOFP_I32_F128, "_Qp_itoq");
setLibcallName(RTLIB::UINTTOFP_I32_F128, "_Qp_uitoq");
setLibcallName(RTLIB::FPTOSINT_F128_I64, "_Qp_qtox");
setLibcallName(RTLIB::FPTOUINT_F128_I64, "_Qp_qtoux");
setLibcallName(RTLIB::SINTTOFP_I64_F128, "_Qp_xtoq");
setLibcallName(RTLIB::UINTTOFP_I64_F128, "_Qp_uxtoq");
setLibcallName(RTLIB::FPEXT_F32_F128, "_Qp_stoq");
setLibcallName(RTLIB::FPEXT_F64_F128, "_Qp_dtoq");
setLibcallName(RTLIB::FPROUND_F128_F32, "_Qp_qtos");
setLibcallName(RTLIB::FPROUND_F128_F64, "_Qp_qtod");
} else if (!Subtarget->useSoftFloat()) {
setLibcallName(RTLIB::ADD_F128, "_Q_add");
setLibcallName(RTLIB::SUB_F128, "_Q_sub");
setLibcallName(RTLIB::MUL_F128, "_Q_mul");
setLibcallName(RTLIB::DIV_F128, "_Q_div");
setLibcallName(RTLIB::SQRT_F128, "_Q_sqrt");
setLibcallName(RTLIB::FPTOSINT_F128_I32, "_Q_qtoi");
setLibcallName(RTLIB::FPTOUINT_F128_I32, "_Q_qtou");
setLibcallName(RTLIB::SINTTOFP_I32_F128, "_Q_itoq");
setLibcallName(RTLIB::UINTTOFP_I32_F128, "_Q_utoq");
setLibcallName(RTLIB::FPTOSINT_F128_I64, "_Q_qtoll");
setLibcallName(RTLIB::FPTOUINT_F128_I64, "_Q_qtoull");
setLibcallName(RTLIB::SINTTOFP_I64_F128, "_Q_lltoq");
setLibcallName(RTLIB::UINTTOFP_I64_F128, "_Q_ulltoq");
setLibcallName(RTLIB::FPEXT_F32_F128, "_Q_stoq");
setLibcallName(RTLIB::FPEXT_F64_F128, "_Q_dtoq");
setLibcallName(RTLIB::FPROUND_F128_F32, "_Q_qtos");
setLibcallName(RTLIB::FPROUND_F128_F64, "_Q_qtod");
}
}
if (Subtarget->fixAllFDIVSQRT()) {
// Promote FDIVS and FSQRTS to FDIVD and FSQRTD instructions instead as
// the former instructions generate errata on LEON processors.
setOperationAction(ISD::FDIV, MVT::f32, Promote);
setOperationAction(ISD::FSQRT, MVT::f32, Promote);
}
// When the subtarget reports that FMULS is unavailable, promote f32 FMUL.
if (Subtarget->hasNoFMULS()) {
setOperationAction(ISD::FMUL, MVT::f32, Promote);
}
// Custom combine bitcast between f64 and v2i32
if (!Subtarget->is64Bit())
setTargetDAGCombine(ISD::BITCAST);
// READCYCLECOUNTER is custom lowered only when the Leon cycle counter is
// available.
if (Subtarget->hasLeonCycleCounter())
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
// NOTE(review): the unit of this argument (bytes vs. log2 of bytes) is
// defined by TargetLowering and is not visible here -- confirm.
setMinFunctionAlignment(2);
// Compute register properties from the subtarget's register info, now that
// the register classes above have been added.
computeRegisterProperties(Subtarget->getRegisterInfo());
}
// Report whether software floating point is in use; the answer is taken
// directly from the subtarget.
bool SparcTargetLowering::useSoftFloat() const {
  const bool SoftFloat = Subtarget->useSoftFloat();
  return SoftFloat;
}
const char *SparcTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((SPISD::NodeType)Opcode) {
case SPISD::FIRST_NUMBER: break;
case SPISD::CMPICC: return "SPISD::CMPICC";
case SPISD::CMPFCC: return "SPISD::CMPFCC";
case SPISD::BRICC: return "SPISD::BRICC";
case SPISD::BRXCC: return "SPISD::BRXCC";
case SPISD::BRFCC: return "SPISD::BRFCC";
case SPISD::SELECT_ICC: return "SPISD::SELECT_ICC";
case SPISD::SELECT_XCC: return "SPISD::SELECT_XCC";
case SPISD::SELECT_FCC: return "SPISD::SELECT_FCC";
case SPISD::Hi: return "SPISD::Hi";
case SPISD::Lo: return "SPISD::Lo";
case SPISD::FTOI: return "SPISD::FTOI";
case SPISD::ITOF: return "SPISD::ITOF";
case SPISD::FTOX: return "SPISD::FTOX";
case SPISD::XTOF: return "SPISD::XTOF";
case SPISD::CALL: return "SPISD::CALL";
case SPISD::RET_FLAG: return "SPISD::RET_FLAG";
case SPISD::GLOBAL_BASE_REG: return "SPISD::GLOBAL_BASE_REG";
case SPISD::FLUSHW: return "SPISD::FLUSHW";
case SPISD::TLS_ADD: return "SPISD::TLS_ADD";
case SPISD::TLS_LD: return "SPISD::TLS_LD";
case SPISD::TLS_CALL: return "SPISD::TLS_CALL";
}
return nullptr;
}
// Result type of a SETCC on operands of type VT: for a vector VT, the same
// vector with integer elements; for everything else, i32.
EVT SparcTargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
                                            EVT VT) const {
  if (VT.isVector())
    return VT.changeVectorElementTypeToInteger();
  return MVT::i32;
}
/// isMaskedValueZeroForTargetNode - Return true if 'Op & Mask' is known to
/// be zero. Op is expected to be a target specific node. Used by DAG
/// combiner.
void SparcTargetLowering::computeKnownBitsForTargetNode
(const SDValue Op,
KnownBits &Known,
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const {
KnownBits Known2;
Known.resetAll();
switch (Op.getOpcode()) {
default: break;
case SPISD::SELECT_ICC:
case SPISD::SELECT_XCC:
case SPISD::SELECT_FCC:
Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
// Only known if known in both the LHS and RHS.
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
break;
}
}
// Look at LHS/RHS/CC and see whether they form "(select 1, 0, cc, cmp) != 0",
// i.e. a setcc that was already lowered to a SPARC select fed by a SPARC
// compare. If so, replace LHS/RHS with the operands of that compare and set
// SPCC to the select's condition code. Otherwise leave everything untouched.
static void LookThroughSetCC(SDValue &LHS, SDValue &RHS,
                             ISD::CondCode CC, unsigned &SPCC) {
  // The outer comparison must be "LHS != 0".
  if (!isNullConstant(RHS) || CC != ISD::SETNE)
    return;

  // LHS must be an integer select fed by CMPICC or an FP select fed by
  // CMPFCC. Operand 3 is only inspected once the opcode is known to be one
  // of the select nodes.
  const unsigned SelectOpc = LHS.getOpcode();
  const bool IsIntSelect =
      SelectOpc == SPISD::SELECT_ICC || SelectOpc == SPISD::SELECT_XCC;
  const bool IsFPSelect = SelectOpc == SPISD::SELECT_FCC;
  if (!IsIntSelect && !IsFPSelect)
    return;

  const unsigned ExpectedCmpOpc = IsIntSelect ? SPISD::CMPICC : SPISD::CMPFCC;
  if (LHS.getOperand(3).getOpcode() != ExpectedCmpOpc)
    return;

  // The select must produce 1 for its first value and 0 for its second.
  if (!isOneConstant(LHS.getOperand(0)) || !isNullConstant(LHS.getOperand(1)))
    return;

  SDValue Compare = LHS.getOperand(3);
  SPCC = cast<ConstantSDNode>(LHS.getOperand(2))->getZExtValue();
  LHS = Compare.getOperand(0);
  RHS = Compare.getOperand(1);
}
// Rebuild the address node Op as its "target" counterpart carrying the target
// flags TF. Global addresses, constant pool entries, block addresses and
// external symbols are supported; any other node kind is a programming error.
SDValue SparcTargetLowering::withTargetFlags(SDValue Op, unsigned TF,
                                             SelectionDAG &DAG) const {
  if (const auto *Global = dyn_cast<GlobalAddressSDNode>(Op)) {
    return DAG.getTargetGlobalAddress(Global->getGlobal(), SDLoc(Global),
                                      Global->getValueType(0),
                                      Global->getOffset(), TF);
  }

  if (const auto *PoolEntry = dyn_cast<ConstantPoolSDNode>(Op)) {
    return DAG.getTargetConstantPool(PoolEntry->getConstVal(),
                                     PoolEntry->getValueType(0),
                                     PoolEntry->getAlignment(),
                                     PoolEntry->getOffset(), TF);
  }

  if (const auto *Block = dyn_cast<BlockAddressSDNode>(Op)) {
    // The block address is rebuilt with a zero offset.
    return DAG.getTargetBlockAddress(Block->getBlockAddress(),
                                     Op.getValueType(), 0, TF);
  }

  if (const auto *Symbol = dyn_cast<ExternalSymbolSDNode>(Op)) {
    return DAG.getTargetExternalSymbol(Symbol->getSymbol(),
                                       Symbol->getValueType(0), TF);
  }

  llvm_unreachable("Unhandled address SDNode");
}
// Materialize Op as the sum of two pieces: a SPISD::Hi node tagged with HiTF
// and a SPISD::Lo node tagged with LoTF. Returns the ADD combining them.
SDValue SparcTargetLowering::makeHiLoPair(SDValue Op,
                                          unsigned HiTF, unsigned LoTF,
                                          SelectionDAG &DAG) const {
  SDLoc Loc(Op);
  EVT Ty = Op.getValueType();

  SDValue HiTarget = withTargetFlags(Op, HiTF, DAG);
  SDValue HiPart = DAG.getNode(SPISD::Hi, Loc, Ty, HiTarget);

  SDValue LoTarget = withTargetFlags(Op, LoTF, DAG);
  SDValue LoPart = DAG.getNode(SPISD::Lo, Loc, Ty, LoTarget);

  return DAG.getNode(ISD::ADD, Loc, Ty, HiPart, LoPart);
}
// Build SDNodes for producing an address from a GlobalAddress, ConstantPool,
// or ExternalSymbol SDNode.
//
// Without PIC the address is assembled from relocated pieces according to the
// code model (abs32 / abs44 / abs64). With PIC the address is loaded from the
// GOT at an offset from the global base register.
SDValue SparcTargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = getPointerTy(DAG.getDataLayout());

  if (!isPositionIndependent()) {
    // This is one of the absolute code models. Every case returns.
    switch (getTargetMachine().getCodeModel()) {
    case CodeModel::Small:
      // abs32.
      return makeHiLoPair(Op, SparcMCExpr::VK_Sparc_HI,
                          SparcMCExpr::VK_Sparc_LO, DAG);

    case CodeModel::Medium: {
      // abs44: (H44/M44 pair << 12) + L44.
      SDValue Upper = makeHiLoPair(Op, SparcMCExpr::VK_Sparc_H44,
                                   SparcMCExpr::VK_Sparc_M44, DAG);
      Upper = DAG.getNode(ISD::SHL, DL, VT, Upper,
                          DAG.getConstant(12, DL, MVT::i32));
      SDValue Lower = withTargetFlags(Op, SparcMCExpr::VK_Sparc_L44, DAG);
      Lower = DAG.getNode(SPISD::Lo, DL, VT, Lower);
      return DAG.getNode(ISD::ADD, DL, VT, Upper, Lower);
    }

    case CodeModel::Large: {
      // abs64: (HH/HM pair << 32) + HI/LO pair.
      SDValue Upper = makeHiLoPair(Op, SparcMCExpr::VK_Sparc_HH,
                                   SparcMCExpr::VK_Sparc_HM, DAG);
      Upper = DAG.getNode(ISD::SHL, DL, VT, Upper,
                          DAG.getConstant(32, DL, MVT::i32));
      SDValue Lower = makeHiLoPair(Op, SparcMCExpr::VK_Sparc_HI,
                                   SparcMCExpr::VK_Sparc_LO, DAG);
      return DAG.getNode(ISD::ADD, DL, VT, Upper, Lower);
    }

    default:
      llvm_unreachable("Unsupported absolute code model");
    }
  }

  // PIC mode. SPARC needs a got load for every variable!
  const Module *M = DAG.getMachineFunction().getFunction().getParent();
  SDValue GOTOffset;
  if (M->getPICLevel() == PICLevel::SmallPIC) {
    // This is the pic13 code model, the GOT is known to be smaller than 8KiB.
    GOTOffset =
        DAG.getNode(SPISD::Lo, DL, Op.getValueType(),
                    withTargetFlags(Op, SparcMCExpr::VK_Sparc_GOT13, DAG));
  } else {
    // This is the pic32 code model, the GOT is known to be smaller than 4GB.
    GOTOffset = makeHiLoPair(Op, SparcMCExpr::VK_Sparc_GOT22,
                             SparcMCExpr::VK_Sparc_GOT10, DAG);
  }

  SDValue GOTBase = DAG.getNode(SPISD::GLOBAL_BASE_REG, DL, VT);
  SDValue GOTSlot = DAG.getNode(ISD::ADD, DL, VT, GOTBase, GOTOffset);

  // GLOBAL_BASE_REG codegen'ed with call. Inform MFI that this
  // function has calls.
  DAG.getMachineFunction().getFrameInfo().setHasCalls(true);

  return DAG.getLoad(VT, DL, DAG.getEntryNode(), GOTSlot,
                     MachinePointerInfo::getGOT(DAG.getMachineFunction()));
}
// Custom lowering for GlobalAddress nodes: delegates to makeAddress.
SDValue SparcTargetLowering::LowerGlobalAddress(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue Address = makeAddress(Op, DAG);
  return Address;
}
// Custom lowering for ConstantPool nodes: delegates to makeAddress.
SDValue SparcTargetLowering::LowerConstantPool(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDValue Address = makeAddress(Op, DAG);
  return Address;
}
// Custom lowering for BlockAddress nodes: delegates to makeAddress.
SDValue SparcTargetLowering::LowerBlockAddress(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDValue Address = makeAddress(Op, DAG);
  return Address;
}
// Lower a thread-local GlobalAddress node. With emulated TLS the work is
// delegated to LowerToTLSEmulatedModel. Otherwise the sequence depends on the
// TLS model the target machine selects for the global:
//  - GeneralDynamic / LocalDynamic: call __tls_get_addr (LocalDynamic then
//    adds a per-symbol offset to the returned value),
//  - InitialExec: load the offset with TLS_LD and add it to register G7,
//  - LocalExec: form the offset from Hi/Lo pieces and add it to register G7.
SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
if (DAG.getTarget().useEmulatedTLS())
return LowerToTLSEmulatedModel(GA, DAG);
SDLoc DL(GA);
const GlobalValue *GV = GA->getGlobal();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
TLSModel::Model model = getTargetMachine().getTLSModel(GV);
if (model == TLSModel::GeneralDynamic || model == TLSModel::LocalDynamic) {
// Pick the GD or LDM flavour of each relocation kind used below.
unsigned HiTF = ((model == TLSModel::GeneralDynamic)
? SparcMCExpr::VK_Sparc_TLS_GD_HI22
: SparcMCExpr::VK_Sparc_TLS_LDM_HI22);
unsigned LoTF = ((model == TLSModel::GeneralDynamic)
? SparcMCExpr::VK_Sparc_TLS_GD_LO10
: SparcMCExpr::VK_Sparc_TLS_LDM_LO10);
unsigned addTF = ((model == TLSModel::GeneralDynamic)
? SparcMCExpr::VK_Sparc_TLS_GD_ADD
: SparcMCExpr::VK_Sparc_TLS_LDM_ADD);
unsigned callTF = ((model == TLSModel::GeneralDynamic)
? SparcMCExpr::VK_Sparc_TLS_GD_CALL
: SparcMCExpr::VK_Sparc_TLS_LDM_CALL);
// Argument for the call: global base register combined with the Hi/Lo
// pair through a TLS_ADD node tagged with the "add" relocation.
SDValue HiLo = makeHiLoPair(Op, HiTF, LoTF, DAG);
SDValue Base = DAG.getNode(SPISD::GLOBAL_BASE_REG, DL, PtrVT);
SDValue Argument = DAG.getNode(SPISD::TLS_ADD, DL, PtrVT, Base, HiLo,
withTargetFlags(Op, addTF, DAG));
// Build the call sequence by hand; the argument is passed in register O0.
SDValue Chain = DAG.getEntryNode();
SDValue InFlag;
Chain = DAG.getCALLSEQ_START(Chain, 1, 0, DL);
Chain = DAG.getCopyToReg(Chain, DL, SP::O0, Argument, InFlag);
InFlag = Chain.getValue(1);
SDValue Callee = DAG.getTargetExternalSymbol("__tls_get_addr", PtrVT);
SDValue Symbol = withTargetFlags(Op, callTF, DAG);
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
// Registers preserved across the call, per the C calling convention.
const uint32_t *Mask = Subtarget->getRegisterInfo()->getCallPreservedMask(
DAG.getMachineFunction(), CallingConv::C);
assert(Mask && "Missing call preserved mask for calling convention");
SDValue Ops[] = {Chain,
Callee,
Symbol,
DAG.getRegister(SP::O0, PtrVT),
DAG.getRegisterMask(Mask),
InFlag};
Chain = DAG.getNode(SPISD::TLS_CALL, DL, NodeTys, Ops);
InFlag = Chain.getValue(1);
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(1, DL, true),
DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
InFlag = Chain.getValue(1);
// The call's result is read back from register O0.
SDValue Ret = DAG.getCopyFromReg(Chain, DL, SP::O0, PtrVT, InFlag);
// For GeneralDynamic the call result is the final address.
if (model != TLSModel::LocalDynamic)
return Ret;
// LocalDynamic: add the symbol's offset (Hi XOR Lo, using the LDO
// relocations) to the value returned by the call.
SDValue Hi = DAG.getNode(SPISD::Hi, DL, PtrVT,
withTargetFlags(Op, SparcMCExpr::VK_Sparc_TLS_LDO_HIX22, DAG));
SDValue Lo = DAG.getNode(SPISD::Lo, DL, PtrVT,
withTargetFlags(Op, SparcMCExpr::VK_Sparc_TLS_LDO_LOX10, DAG));
HiLo = DAG.getNode(ISD::XOR, DL, PtrVT, Hi, Lo);
return DAG.getNode(SPISD::TLS_ADD, DL, PtrVT, Ret, HiLo,
withTargetFlags(Op, SparcMCExpr::VK_Sparc_TLS_LDO_ADD, DAG));
}
if (model == TLSModel::InitialExec) {
// The load relocation depends on the pointer width (LDX for i64).
unsigned ldTF = ((PtrVT == MVT::i64)? SparcMCExpr::VK_Sparc_TLS_IE_LDX
: SparcMCExpr::VK_Sparc_TLS_IE_LD);
SDValue Base = DAG.getNode(SPISD::GLOBAL_BASE_REG, DL, PtrVT);
// GLOBAL_BASE_REG codegen'ed with call. Inform MFI that this
// function has calls.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
MFI.setHasCalls(true);
SDValue TGA = makeHiLoPair(Op,
SparcMCExpr::VK_Sparc_TLS_IE_HI22,
SparcMCExpr::VK_Sparc_TLS_IE_LO10, DAG);
SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base, TGA);
// Load the offset through a TLS_LD node, then add it to register G7
// (presumably the thread pointer -- confirm against the SPARC TLS ABI).
SDValue Offset = DAG.getNode(SPISD::TLS_LD,
DL, PtrVT, Ptr,
withTargetFlags(Op, ldTF, DAG));
return DAG.getNode(SPISD::TLS_ADD, DL, PtrVT,
DAG.getRegister(SP::G7, PtrVT), Offset,
withTargetFlags(Op,
SparcMCExpr::VK_Sparc_TLS_IE_ADD, DAG));
}
// Only LocalExec remains: offset = Hi XOR Lo (LE relocations), added to G7.
assert(model == TLSModel::LocalExec);
SDValue Hi = DAG.getNode(SPISD::Hi, DL, PtrVT,
withTargetFlags(Op, SparcMCExpr::VK_Sparc_TLS_LE_HIX22, DAG));
SDValue Lo = DAG.getNode(SPISD::Lo, DL, PtrVT,
withTargetFlags(Op, SparcMCExpr::VK_Sparc_TLS_LE_LOX10, DAG));
SDValue Offset = DAG.getNode(ISD::XOR, DL, PtrVT, Hi, Lo);
return DAG.getNode(ISD::ADD, DL, PtrVT,
DAG.getRegister(SP::G7, PtrVT), Offset);
}
// Append Arg to the library-call argument list Args. An fp128 argument is
// passed indirectly: it is stored to a fresh 16-byte, 8-byte-aligned stack
// object and a pointer to that object is passed instead. Any other argument
// is passed as is. Returns the (possibly updated) chain.
SDValue SparcTargetLowering::LowerF128_LibCallArg(SDValue Chain,
                                                  ArgListTy &Args, SDValue Arg,
                                                  const SDLoc &DL,
                                                  SelectionDAG &DAG) const {
  EVT ArgVT = Arg.getValueType();
  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());

  ArgListEntry Entry;
  if (!ArgTy->isFP128Ty()) {
    Entry.Node = Arg;
    Entry.Ty = ArgTy;
  } else {
    // Create a stack object and pass the pointer to the library function.
    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
    int Slot = MFI.CreateStackObject(16, 8, false);
    SDValue SlotPtr =
        DAG.getFrameIndex(Slot, getPointerTy(DAG.getDataLayout()));
    Chain = DAG.getStore(Chain, DL, Arg, SlotPtr, MachinePointerInfo(),
                         /* Alignment = */ 8);
    Entry.Node = SlotPtr;
    Entry.Ty = PointerType::getUnqual(ArgTy);
  }

  Args.push_back(Entry);
  return Chain;
}
// Lower Op to a call of the library function LibFuncName, passing the first
// numArgs operands of Op as arguments.
//
// When Op produces an fp128 value, the callee is given a pointer to a stack
// object as its first argument (marked sret in 32-bit mode), the call itself
// returns void, and the result is loaded from that object afterwards.
// Otherwise the call's return value is the result.
SDValue
SparcTargetLowering::LowerF128Op(SDValue Op, SelectionDAG &DAG,
                                 const char *LibFuncName,
                                 unsigned numArgs) const {
  SDLoc DL(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Callee = DAG.getExternalSymbol(LibFuncName, PtrVT);

  Type *RetTy = Op.getValueType().getTypeForEVT(*DAG.getContext());
  Type *RetTyABI = RetTy;
  SDValue Chain = DAG.getEntryNode();
  SDValue RetPtr;
  ArgListTy Args;

  if (RetTy->isFP128Ty()) {
    // Create a Stack Object to receive the return value of type f128.
    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
    int RetFI = MFI.CreateStackObject(16, 8, false);
    RetPtr = DAG.getFrameIndex(RetFI, PtrVT);

    ArgListEntry RetSlot;
    RetSlot.Node = RetPtr;
    RetSlot.Ty = PointerType::getUnqual(RetTy);
    if (!Subtarget->is64Bit())
      RetSlot.IsSRet = true;
    RetSlot.IsReturned = false;
    Args.push_back(RetSlot);

    RetTyABI = Type::getVoidTy(*DAG.getContext());
  }

  assert(Op->getNumOperands() >= numArgs && "Not enough operands!");
  for (unsigned Idx = 0; Idx != numArgs; ++Idx)
    Chain = LowerF128_LibCallArg(Chain, Args, Op.getOperand(Idx), DL, DAG);

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(DL).setChain(Chain)
    .setCallee(CallingConv::C, RetTyABI, Callee, std::move(Args));

  // The call result is in .first, the output chain in .second.
  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);

  // Direct return: the call's value is the answer.
  if (RetTyABI == RetTy)
    return CallInfo.first;

  assert (RetTy->isFP128Ty() && "Unexpected return type!");

  // Load RetPtr to get the return value.
  return DAG.getLoad(Op.getValueType(), DL, CallInfo.second, RetPtr,
                     MachinePointerInfo(), /* Alignment = */ 8);
}
// Lower an fp128 comparison to a runtime library call followed by an integer
// compare.
//
// On entry SPCC holds the floating point condition (an SPCC::FCC_* value). On
// return SPCC is replaced with the integer condition (an SPCC::ICC_* value)
// that must be applied to the returned SPISD::CMPICC glue node.
//
// The six plain conditions (E, NE, L, G, LE, GE) each have a dedicated
// predicate routine; its result is tested for "!= 0". All other conditions
// call the three-way routine (_Q_cmp / _Qp_cmp) and decode its result. The
// decoding treats that result as 0 = equal, 1 = less, 2 = greater and
// 3 = unordered.
SDValue SparcTargetLowering::LowerF128Compare(SDValue LHS, SDValue RHS,
                                              unsigned &SPCC, const SDLoc &DL,
                                              SelectionDAG &DAG) const {
  const char *LibCall = nullptr;
  bool is64Bit = Subtarget->is64Bit();
  switch(SPCC) {
  default: llvm_unreachable("Unhandled conditional code!");
  case SPCC::FCC_E  : LibCall = is64Bit? "_Qp_feq" : "_Q_feq"; break;
  case SPCC::FCC_NE : LibCall = is64Bit? "_Qp_fne" : "_Q_fne"; break;
  case SPCC::FCC_L  : LibCall = is64Bit? "_Qp_flt" : "_Q_flt"; break;
  case SPCC::FCC_G  : LibCall = is64Bit? "_Qp_fgt" : "_Q_fgt"; break;
  case SPCC::FCC_LE : LibCall = is64Bit? "_Qp_fle" : "_Q_fle"; break;
  case SPCC::FCC_GE : LibCall = is64Bit? "_Qp_fge" : "_Q_fge"; break;
  case SPCC::FCC_UL :
  case SPCC::FCC_ULE:
  case SPCC::FCC_UG :
  case SPCC::FCC_UGE:
  case SPCC::FCC_U  :
  case SPCC::FCC_O  :
  case SPCC::FCC_LG :
  case SPCC::FCC_UE : LibCall = is64Bit? "_Qp_cmp" : "_Q_cmp"; break;
  }

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Callee = DAG.getExternalSymbol(LibCall, PtrVT);
  Type *RetTy = Type::getInt32Ty(*DAG.getContext());
  ArgListTy Args;
  SDValue Chain = DAG.getEntryNode();
  Chain = LowerF128_LibCallArg(Chain, Args, LHS, DL, DAG);
  Chain = LowerF128_LibCallArg(Chain, Args, RHS, DL, DAG);

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(DL).setChain(Chain)
    .setCallee(CallingConv::C, RetTy, Callee, std::move(Args));

  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);

  // result is in first, and chain is in second result.
  SDValue Result = CallInfo.first;
  const EVT ResultVT = Result.getValueType();

  // Emit "compare Val with Imm" and report NewCC as the integer condition the
  // caller has to test on the resulting flags.
  auto EmitCompare = [&](SDValue Val, uint64_t Imm, unsigned NewCC) {
    SDValue CmpRHS = DAG.getTargetConstant(Imm, DL, ResultVT);
    SPCC = NewCC;
    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Val, CmpRHS);
  };

  // Compute (Result + 1) & 2. With the 0/1/2/3 encoding this is non-zero
  // exactly for "less" (1) and "greater" (2), and zero for "equal" (0) and
  // "unordered" (3).
  auto LessOrGreaterBit = [&]() {
    SDValue One = DAG.getTargetConstant(1, DL, ResultVT);
    SDValue Inc = DAG.getNode(ISD::ADD, DL, ResultVT, Result, One);
    SDValue Two = DAG.getTargetConstant(2, DL, ResultVT);
    return DAG.getNode(ISD::AND, DL, ResultVT, Inc, Two);
  };

  switch(SPCC) {
  default:
    // Dedicated predicate routine: true iff the result is non-zero.
    return EmitCompare(Result, 0, SPCC::ICC_NE);

  case SPCC::FCC_UL: {
    // Unordered or less: result is 1 or 3, i.e. bit 0 is set.
    SDValue Mask = DAG.getTargetConstant(1, DL, ResultVT);
    Result = DAG.getNode(ISD::AND, DL, ResultVT, Result, Mask);
    return EmitCompare(Result, 0, SPCC::ICC_NE);
  }

  case SPCC::FCC_ULE:
    // Unordered, less or equal: anything but "greater" (2).
    return EmitCompare(Result, 2, SPCC::ICC_NE);

  case SPCC::FCC_UG:
    // Unordered or greater: result is 2 or 3, i.e. greater than 1.
    return EmitCompare(Result, 1, SPCC::ICC_G);

  case SPCC::FCC_UGE:
    // Unordered, greater or equal: anything but "less" (1).
    return EmitCompare(Result, 1, SPCC::ICC_NE);

  case SPCC::FCC_U:
    // Unordered: result is exactly 3.
    return EmitCompare(Result, 3, SPCC::ICC_E);

  case SPCC::FCC_O:
    // Ordered: result is anything but 3.
    return EmitCompare(Result, 3, SPCC::ICC_NE);

  case SPCC::FCC_LG:
    // Less or greater (ordered and not equal): result is 1 or 2. Testing
    // "(Result & 3) != 0" would also accept unordered (3).
    return EmitCompare(LessOrGreaterBit(), 0, SPCC::ICC_NE);

  case SPCC::FCC_UE:
    // Unordered or equal: result is 0 or 3. Testing "(Result & 3) == 0"
    // would reject unordered (3).
    return EmitCompare(LessOrGreaterBit(), 0, SPCC::ICC_E);
  }
}
// Lower an FP_EXTEND producing fp128 to the matching library call, chosen by
// the source type (f64 or f32). Any other source type is a programming error.
static SDValue
LowerF128_FPEXTEND(SDValue Op, SelectionDAG &DAG,
                   const SparcTargetLowering &TLI) {
  const EVT SrcVT = Op.getOperand(0).getValueType();
  const bool FromF64 = SrcVT == MVT::f64;

  if (!FromF64 && SrcVT != MVT::f32) {
    llvm_unreachable("fpextend with non-float operand!");
    return SDValue();
  }

  const char *LibName = TLI.getLibcallName(FromF64 ? RTLIB::FPEXT_F64_F128
                                                   : RTLIB::FPEXT_F32_F128);
  return TLI.LowerF128Op(Op, DAG, LibName, 1);
}
// Lower an FP_ROUND whose source is fp128 to the matching library call,
// chosen by the destination type (f64 or f32). Rounds from any other source
// type are returned unchanged.
static SDValue
LowerF128_FPROUND(SDValue Op, SelectionDAG &DAG,
                  const SparcTargetLowering &TLI) {
  // FP_ROUND on f64 and f32 are legal.
  if (Op.getOperand(0).getValueType() != MVT::f128)
    return Op;

  const EVT DstVT = Op.getValueType();
  const bool ToF64 = DstVT == MVT::f64;

  if (!ToF64 && DstVT != MVT::f32) {
    llvm_unreachable("fpround to non-float!");
    return SDValue();
  }

  const char *LibName = TLI.getLibcallName(ToF64 ? RTLIB::FPROUND_F128_F64
                                                 : RTLIB::FPROUND_F128_F32);
  return TLI.LowerF128Op(Op, DAG, LibName, 1);
}
/// Custom lowering for FP_TO_SINT producing i32 or i64.
///
/// - An f128 source is turned into an fp128 ABI libcall unless the target has
///   hardware quad support and the result type is legal.
/// - Otherwise, if the result type is illegal, an empty SDValue is returned.
/// - Otherwise the conversion is done with FTOI/FTOX, whose result is typed as
///   f32/f64, and that result is bitcast to the integer type.
static SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG,
                               const SparcTargetLowering &TLI,
                               bool hasHardQuad) {
  SDLoc DL(Op);
  EVT ResVT = Op.getValueType();
  assert(ResVT == MVT::i32 || ResVT == MVT::i64);

  SDValue Src = Op.getOperand(0);
  const bool ResIs32 = (ResVT == MVT::i32);
  const bool ResLegal = TLI.isTypeLegal(ResVT);

  // Expand f128 operations to fp128 abi calls.
  if (Src.getValueType() == MVT::f128 && !(hasHardQuad && ResLegal)) {
    const auto LC =
        ResIs32 ? RTLIB::FPTOSINT_F128_I32 : RTLIB::FPTOSINT_F128_I64;
    return TLI.LowerF128Op(Op, DAG, TLI.getLibcallName(LC), 1);
  }

  // An illegal result type is left to the default expansion.
  if (!ResLegal)
    return SDValue();

  // Convert inside an FP register, then reinterpret the bits as an integer.
  SDValue Converted =
      ResIs32 ? DAG.getNode(SPISD::FTOI, DL, MVT::f32, Src)
              : DAG.getNode(SPISD::FTOX, DL, MVT::f64, Src);
  return DAG.getNode(ISD::BITCAST, DL, ResVT, Converted);
}
/// Custom lowering for SINT_TO_FP from an i32 or i64 operand.
///
/// - An f128 result is turned into an fp128 ABI libcall unless the target has
///   hardware quad support and the operand type is legal.
/// - Otherwise, if the operand type is illegal, an empty SDValue is returned.
/// - Otherwise the integer bits are bitcast to f32/f64 and converted with
///   ITOF/XTOF.
static SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG,
                               const SparcTargetLowering &TLI,
                               bool hasHardQuad) {
  SDLoc DL(Op);
  SDValue Src = Op.getOperand(0);
  EVT SrcVT = Src.getValueType();
  assert(SrcVT == MVT::i32 || (SrcVT == MVT::i64));

  const bool SrcIs32 = (SrcVT == MVT::i32);
  const bool SrcLegal = TLI.isTypeLegal(SrcVT);

  // Expand f128 operations to fp128 ABI calls.
  if (Op.getValueType() == MVT::f128 && !(hasHardQuad && SrcLegal)) {
    const auto LC =
        SrcIs32 ? RTLIB::SINTTOFP_I32_F128 : RTLIB::SINTTOFP_I64_F128;
    return TLI.LowerF128Op(Op, DAG, TLI.getLibcallName(LC), 1);
  }

  // An illegal operand type is left to the default expansion.
  if (!SrcLegal)
    return SDValue();

  // Reinterpret the integer bits as an FP value of the same width, then
  // convert that value.
  EVT CarrierVT = SrcIs32 ? MVT::f32 : MVT::f64;
  SDValue InFPReg = DAG.getNode(ISD::BITCAST, DL, CarrierVT, Src);
  unsigned ConvOpc = SrcIs32 ? SPISD::ITOF : SPISD::XTOF;
  return DAG.getNode(ConvOpc, DL, Op.getValueType(), InFPReg);
}
/// Custom lowering for FP_TO_UINT.
///
/// Only f128 sources are handled here, and only when the conversion cannot be
/// done natively (no hardware quad support, or an illegal result type). In
/// that case the node becomes an fp128 ABI libcall selected by the result
/// type (i32 or i64). In every other case an empty SDValue is returned.
static SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG,
                               const SparcTargetLowering &TLI,
                               bool hasHardQuad) {
  EVT VT = Op.getValueType();

  // Expand if it does not involve f128 or the target has support for
  // quad floating point instructions and the resulting type is legal.
  if (Op.getOperand(0).getValueType() != MVT::f128 ||
      (hasHardQuad && TLI.isTypeLegal(VT)))
    return SDValue();

  assert(VT == MVT::i32 || VT == MVT::i64);

  return TLI.LowerF128Op(Op, DAG,
                         TLI.getLibcallName(VT == MVT::i32
                                            ? RTLIB::FPTOUINT_F128_I32
                                            : RTLIB::FPTOUINT_F128_I64),
                         1);
}
/// Custom lowering for UINT_TO_FP from an i32 or i64 operand.
///
/// Only f128 results are handled here, and only when the conversion cannot be
/// done natively (no hardware quad support, or an illegal operand type). In
/// that case the node becomes an fp128 ABI libcall selected by the operand
/// type. In every other case an empty SDValue is returned.
static SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG,
                               const SparcTargetLowering &TLI,
                               bool hasHardQuad) {
  EVT OpVT = Op.getOperand(0).getValueType();
  assert(OpVT == MVT::i32 || OpVT == MVT::i64);

  // Expand if it does not involve f128 or the target has support for
  // quad floating point instructions and the operand type is legal.
  if (Op.getValueType() != MVT::f128 || (hasHardQuad && TLI.isTypeLegal(OpVT)))
    return SDValue();

  return TLI.LowerF128Op(Op, DAG,
                         TLI.getLibcallName(OpVT == MVT::i32
                                            ? RTLIB::UINTTOFP_I32_F128
                                            : RTLIB::UINTTOFP_I64_F128),
                         1);
}
/// Lower BR_CC into a SPARC compare node glued to a conditional branch.
///
/// Integer compares use CMPICC with BRICC (i32) or BRXCC (otherwise). FP
/// compares use CMPFCC with BRFCC, except f128 without hardware quad support,
/// which goes through TLI.LowerF128Compare and then branches with BRICC.
static SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG,
                          const SparcTargetLowering &TLI,
                          bool hasHardQuad) {
  SDLoc DL(Op);
  SDValue Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Dest = Op.getOperand(4);

  // ~0U marks "no SPARC condition code chosen yet".
  unsigned SPCC = ~0U;

  // If this is a br_cc of a "setcc", and if the setcc got lowered into
  // an CMP[IF]CC/SELECT_[IF]CC pair, find the original compared values.
  LookThroughSetCC(LHS, RHS, CC, SPCC);

  const EVT CmpVT = LHS.getValueType();
  unsigned BranchOpc;
  SDValue Flag;

  if (CmpVT.isInteger()) {
    if (SPCC == ~0U)
      SPCC = IntCondCCodeToICC(CC);
    Flag = DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, LHS, RHS);
    // 32-bit compares use the icc flags, 64-bit uses the xcc flags.
    BranchOpc = (CmpVT == MVT::i32) ? SPISD::BRICC : SPISD::BRXCC;
  } else {
    if (SPCC == ~0U)
      SPCC = FPCondCCodeToFCC(CC);

    if (!hasHardQuad && CmpVT == MVT::f128) {
      // LowerF128Compare is handed SPCC so it can see (and possibly adjust)
      // the condition code; the branch then tests the integer flags.
      Flag = TLI.LowerF128Compare(LHS, RHS, SPCC, DL, DAG);
      BranchOpc = SPISD::BRICC;
    } else {
      Flag = DAG.getNode(SPISD::CMPFCC, DL, MVT::Glue, LHS, RHS);
      BranchOpc = SPISD::BRFCC;
    }
  }

  SDValue CondCode = DAG.getConstant(SPCC, DL, MVT::i32);
  return DAG.getNode(BranchOpc, DL, MVT::Other, Chain, Dest, CondCode, Flag);
}
/// Lower SELECT_CC into a SPARC compare node glued to a conditional select.
///
/// Integer compares use CMPICC with SELECT_ICC (i32) or SELECT_XCC
/// (otherwise). FP compares use CMPFCC with SELECT_FCC, except f128 without
/// hardware quad support, which goes through TLI.LowerF128Compare and then
/// selects with SELECT_ICC.
static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG,
                              const SparcTargetLowering &TLI,
                              bool hasHardQuad) {
  SDLoc DL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue TrueVal = Op.getOperand(2);
  SDValue FalseVal = Op.getOperand(3);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();

  // ~0U marks "no SPARC condition code chosen yet".
  unsigned SPCC = ~0U;

  // If this is a select_cc of a "setcc", and if the setcc got lowered into
  // an CMP[IF]CC/SELECT_[IF]CC pair, find the original compared values.
  LookThroughSetCC(LHS, RHS, CC, SPCC);

  const EVT CmpVT = LHS.getValueType();
  unsigned SelectOpc;
  SDValue Flag;

  if (CmpVT.isInteger()) {
    if (SPCC == ~0U)
      SPCC = IntCondCCodeToICC(CC);
    Flag = DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, LHS, RHS);
    SelectOpc = (CmpVT == MVT::i32) ? SPISD::SELECT_ICC : SPISD::SELECT_XCC;
  } else {
    if (SPCC == ~0U)
      SPCC = FPCondCCodeToFCC(CC);

    if (!hasHardQuad && CmpVT == MVT::f128) {
      // LowerF128Compare is handed SPCC so it can see (and possibly adjust)
      // the condition code; the select then tests the integer flags.
      Flag = TLI.LowerF128Compare(LHS, RHS, SPCC, DL, DAG);
      SelectOpc = SPISD::SELECT_ICC;
    } else {
      Flag = DAG.getNode(SPISD::CMPFCC, DL, MVT::Glue, LHS, RHS);
      SelectOpc = SPISD::SELECT_FCC;
    }
  }

  SDValue CondCode = DAG.getConstant(SPCC, DL, MVT::i32);
  return DAG.getNode(SelectOpc, DL, TrueVal.getValueType(), TrueVal, FalseVal,
                     CondCode, Flag);
}
/// Lower VASTART: store the address of the variadic-argument area into the
/// va_list memory location given by operand 1.
///
/// The address is computed as register I6 plus the function's
/// VarArgsFrameOffset, so the frame address is marked as taken.
static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
                            const SparcTargetLowering &TLI) {
  MachineFunction &MF = DAG.getMachineFunction();
  SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
  auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());

  // Need frame address to find the address of VarArgsFrameIndex.
  MF.getFrameInfo().setFrameAddressIsTaken(true);

  SDLoc DL(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue VAListPtr = Op.getOperand(1);
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();

  // vastart just stores the address of the VarArgsFrameIndex slot into the
  // memory location argument.
  SDValue VarArgsAddr =
      DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getRegister(SP::I6, PtrVT),
                  DAG.getIntPtrConstant(FuncInfo->getVarArgsFrameOffset(), DL));
  return DAG.getStore(Chain, DL, VarArgsAddr, VAListPtr,
                      MachinePointerInfo(SV));
}
/// Lower VAARG: load the current argument pointer from the va_list, advance
/// it past one value of the requested type, store it back, and finally load
/// the argument itself from the old pointer.
static SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) {
  SDNode *N = Op.getNode();
  SDLoc DL(N);

  EVT ArgVT = N->getValueType(0);
  SDValue Chain = N->getOperand(0);
  SDValue VAListPtr = N->getOperand(1);
  EVT PtrVT = VAListPtr.getValueType();
  const Value *SV = cast<SrcValueSDNode>(N->getOperand(2))->getValue();

  // Current position inside the variadic argument area.
  SDValue CurPtr =
      DAG.getLoad(PtrVT, DL, Chain, VAListPtr, MachinePointerInfo(SV));

  // Increment the pointer, VAList, to the next vaarg.
  SDValue ArgBytes = DAG.getIntPtrConstant(ArgVT.getSizeInBits()/8, DL);
  SDValue NextPtr = DAG.getNode(ISD::ADD, DL, PtrVT, CurPtr, ArgBytes);

  // Store the incremented VAList to the legalized pointer.
  Chain = DAG.getStore(CurPtr.getValue(1), DL, NextPtr, VAListPtr,
                       MachinePointerInfo(SV));

  // Load the actual argument out of the pointer VAList.
  // We can't count on greater alignment than the word size.
  return DAG.getLoad(ArgVT, DL, Chain, CurPtr, MachinePointerInfo(),
                     std::min(PtrVT.getSizeInBits(), ArgVT.getSizeInBits()) / 8);
}
/// Lower DYNAMIC_STACKALLOC by moving the stack pointer (O6) down by the
/// requested size and returning an address above the register spill area.
///
/// Operands: 0 = chain, 1 = size, 2 = requested alignment (constant).
/// Returns the merged pair {allocated address, output chain}. Requests more
/// aligned than the stack alignment are rejected with a fatal error.
static SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG,
                                       const SparcSubtarget *Subtarget) {
  SDValue Chain = Op.getOperand(0);
  SDValue AllocSize = Op.getOperand(1);
  const unsigned RequestedAlign =
      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
  const unsigned StackAlign =
      Subtarget->getFrameLowering()->getStackAlignment();
  EVT VT = AllocSize->getValueType(0);
  SDLoc DL(Op);

  // TODO: implement over-aligned alloca. (Note: this also implies supporting
  // over-aligned function frames together with dynamic allocations, which is
  // currently not supported at all.)
  if (RequestedAlign > StackAlign) {
    const MachineFunction &MF = DAG.getMachineFunction();
    report_fatal_error("Function \"" + Twine(MF.getName()) + "\": "
                       "over-aligned dynamic alloca not supported.");
  }

  // The resultant pointer needs to be above the register spill area
  // at the bottom of the stack.
  unsigned SpillAreaSize = 128;
  if (!Subtarget->is64Bit()) {
    // On Sparc32 the spill area is 92 bytes. That is only 4-byte aligned,
    // while the stack pointer is 8-byte aligned. To hand out an 8-byte
    // aligned allocation we therefore have to skip 96 bytes instead of 92,
    // which means growing the allocation by 4 bytes *before* the 8-byte
    // rounding is applied. Unfortunately the value we get here has already
    // been rounded, so we add 8 instead and waste a bit more memory.
    //
    // Strictly this is only needed when the required alignment is > 4, but
    // that information is lost by this point, so it is applied always.
    //
    // (An alternative would be to always reserve 96 bytes instead of the
    // required 92, but that wastes 4 bytes in every frame, not just in those
    // with dynamic stack allocations.)
    // TODO: modify code in SelectionDAGBuilder to make this less sad.
    AllocSize = DAG.getNode(ISD::ADD, DL, VT, AllocSize,
                            DAG.getConstant(8, DL, VT));
    SpillAreaSize = 96;
  }

  const unsigned SPReg = SP::O6;
  SDValue OldSP = DAG.getCopyFromReg(Chain, DL, SPReg, VT);
  SDValue NewSP = DAG.getNode(ISD::SUB, DL, VT, OldSP, AllocSize); // Value
  Chain = DAG.getCopyToReg(OldSP.getValue(1), DL, SPReg, NewSP);   // Output chain

  // Skip the spill area (plus the stack pointer bias) to reach usable memory.
  SpillAreaSize += Subtarget->getStackPointerBias();
  SDValue AllocAddr = DAG.getNode(ISD::ADD, DL, VT, NewSP,
                                  DAG.getConstant(SpillAreaSize, DL, VT));

  SDValue Results[2] = { AllocAddr, Chain };
  return DAG.getMergeValues(Results, DL);
}
/// Build a FLUSHW node chained to the DAG entry node and return its chain.
/// Op is only used to derive the source location.
static SDValue getFLUSHW(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  return DAG.getNode(SPISD::FLUSHW, DL, MVT::Other, DAG.getEntryNode());
}
/// Compute the frame address `depth` frames up from the current function.
///
/// Starts from register I6 and follows the saved frame pointer `depth` times.
/// A FLUSHW is emitted first when depth is non-zero or AlwaysFlush is set, so
/// that the windowed registers have been written to the stack before they are
/// read back. On 64-bit targets the stack pointer bias is added to the result.
static SDValue getFRAMEADDR(uint64_t depth, SDValue Op, SelectionDAG &DAG,
                            const SparcSubtarget *Subtarget,
                            bool AlwaysFlush = false) {
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  MFI.setFrameAddressIsTaken(true);

  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  const unsigned FrameReg = SP::I6;
  const unsigned Bias = Subtarget->getStackPointerBias();

  // flush first to make sure the windowed registers' values are in stack
  const bool NeedFlush = depth || AlwaysFlush;
  SDValue Chain = NeedFlush ? getFLUSHW(Op, DAG) : DAG.getEntryNode();
  SDValue FrameAddr = DAG.getCopyFromReg(Chain, DL, FrameReg, VT);

  // Offset of the slot that is loaded to reach the next frame up.
  const unsigned SavedFPOffset = Subtarget->is64Bit() ? (Bias + 112) : 56;

  for (uint64_t Level = 0; Level != depth; ++Level) {
    SDValue SlotPtr = DAG.getNode(ISD::ADD, DL, VT, FrameAddr,
                                  DAG.getIntPtrConstant(SavedFPOffset, DL));
    FrameAddr = DAG.getLoad(VT, DL, Chain, SlotPtr, MachinePointerInfo());
  }

  if (!Subtarget->is64Bit())
    return FrameAddr;

  return DAG.getNode(ISD::ADD, DL, VT, FrameAddr,
                     DAG.getIntPtrConstant(Bias, DL));
}
/// Lower FRAMEADDR: operand 0 is the constant frame depth, which is forwarded
/// to getFRAMEADDR.
static SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG,
                              const SparcSubtarget *Subtarget) {
  return getFRAMEADDR(Op.getConstantOperandVal(0), Op, DAG, Subtarget);
}
/// Lower RETURNADDR for the constant depth given in operand 0.
///
/// Depth 0 reads register I7, which is added as a function live-in. For
/// deeper frames the return address is loaded from a fixed offset off the
/// frame address of frame (depth - 1). An empty SDValue is returned when
/// verifyReturnAddressArgumentIsConstant reports a problem.
static SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG,
                               const SparcTargetLowering &TLI,
                               const SparcSubtarget *Subtarget) {
  MachineFunction &MF = DAG.getMachineFunction();
  MF.getFrameInfo().setReturnAddressIsTaken(true);

  if (TLI.verifyReturnAddressArgumentIsConstant(Op, DAG))
    return SDValue();

  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  const uint64_t Depth = Op.getConstantOperandVal(0);

  if (Depth == 0) {
    auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
    unsigned RetReg = MF.addLiveIn(SP::I7, TLI.getRegClassFor(PtrVT));
    return DAG.getCopyFromReg(DAG.getEntryNode(), DL, RetReg, VT);
  }

  // Need frame address to find return address of the caller.
  SDValue FrameAddr = getFRAMEADDR(Depth - 1, Op, DAG, Subtarget, true);

  // Offset of the return-address slot relative to that frame address.
  const unsigned SlotOffset = Subtarget->is64Bit() ? 120 : 60;
  SDValue SlotPtr = DAG.getNode(ISD::ADD, DL, VT, FrameAddr,
                                DAG.getIntPtrConstant(SlotOffset, DL));
  return DAG.getLoad(VT, DL, DAG.getEntryNode(), SlotPtr,
                     MachinePointerInfo());
}
/// Lower FNEG/FABS on an f64 value to the same operation on one f32 half.
///
/// The value is split into its sub_even and sub_odd f32 subregisters, the
/// operation is applied to the half holding the sign bit, and the two halves
/// are reassembled into a fresh f64 register.
static SDValue LowerF64Op(SDValue SrcReg64, const SDLoc &dl, SelectionDAG &DAG,
                          unsigned opcode) {
  assert(SrcReg64.getValueType() == MVT::f64 && "LowerF64Op called on non-double!");
  assert(opcode == ISD::FNEG || opcode == ISD::FABS);

  // fneg f64 => fneg f32:sub_even, fmov f32:sub_odd.
  // fabs f64 => fabs f32:sub_even, fmov f32:sub_odd.
  SDValue EvenHalf = DAG.getTargetExtractSubreg(SP::sub_even, dl, MVT::f32,
                                                SrcReg64);
  SDValue OddHalf = DAG.getTargetExtractSubreg(SP::sub_odd, dl, MVT::f32,
                                               SrcReg64);

  // Note: in little-endian mode the two halves of the value sit in the
  // registers in the opposite order, so the subreg holding the sign bit is
  // the highest-numbered (odd) one rather than the lowest-numbered (even).
  SDValue &SignHalf =
      DAG.getDataLayout().isLittleEndian() ? OddHalf : EvenHalf;
  SignHalf = DAG.getNode(opcode, dl, MVT::f32, SignHalf);

  // Rebuild the f64 from an undefined register plus both halves.
  SDValue Result = SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                              dl, MVT::f64), 0);
  Result = DAG.getTargetInsertSubreg(SP::sub_even, dl, MVT::f64,
                                     Result, EvenHalf);
  return DAG.getTargetInsertSubreg(SP::sub_odd, dl, MVT::f64,
                                   Result, OddHalf);
}
/// Lower a f128 load into two f64 loads.
///
/// The first f64 is loaded from the base pointer and placed in sub_even64,
/// the second from base + 8 and placed in sub_odd64 of a fresh f128 register.
/// Returns the merged pair {f128 value, token factor of both load chains}.
static SDValue LowerF128Load(SDValue Op, SelectionDAG &DAG)
{
  SDLoc dl(Op);
  // cast<> (rather than dyn_cast<> followed by an assert) so that a wrong node
  // kind can never turn into a silent null dereference.
  LoadSDNode *LdNode = cast<LoadSDNode>(Op.getNode());
  assert(LdNode->getOffset().isUndef() && "Unexpected node type");

  // Each half is only 8 bytes wide, so never claim more than 8-byte alignment.
  unsigned alignment = LdNode->getAlignment();
  if (alignment > 8)
    alignment = 8;

  SDValue Hi64 =
      DAG.getLoad(MVT::f64, dl, LdNode->getChain(), LdNode->getBasePtr(),
                  LdNode->getPointerInfo(), alignment);

  EVT addrVT = LdNode->getBasePtr().getValueType();
  SDValue LoPtr = DAG.getNode(ISD::ADD, dl, addrVT,
                              LdNode->getBasePtr(),
                              DAG.getConstant(8, dl, addrVT));
  // The second half lives 8 bytes past the base, so its memory operand must
  // carry that offset; reusing the unmodified pointer info would describe the
  // bytes of the first half instead.
  SDValue Lo64 = DAG.getLoad(MVT::f64, dl, LdNode->getChain(), LoPtr,
                             LdNode->getPointerInfo().getWithOffset(8),
                             alignment);

  SDValue SubRegEven = DAG.getTargetConstant(SP::sub_even64, dl, MVT::i32);
  SDValue SubRegOdd = DAG.getTargetConstant(SP::sub_odd64, dl, MVT::i32);

  // Assemble the f128 from an undefined register plus the two halves.
  SDNode *InFP128 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                       dl, MVT::f128);
  InFP128 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl,
                               MVT::f128,
                               SDValue(InFP128, 0),
                               Hi64,
                               SubRegEven);
  InFP128 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl,
                               MVT::f128,
                               SDValue(InFP128, 0),
                               Lo64,
                               SubRegOdd);

  // Both loads hang off the original chain; join their output chains.
  SDValue OutChains[2] = { SDValue(Hi64.getNode(), 1),
                           SDValue(Lo64.getNode(), 1) };
  SDValue OutChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
  SDValue Ops[2] = {SDValue(InFP128,0), OutChain};
  return DAG.getMergeValues(Ops, dl);
}
/// Custom lowering for LOAD: loads whose memory type is f128 are split by
/// LowerF128Load; every other load is returned unchanged.
static SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG)
{
  LoadSDNode *Load = cast<LoadSDNode>(Op.getNode());

  const bool IsF128 = (Load->getMemoryVT() == MVT::f128);
  return IsF128 ? LowerF128Load(Op, DAG) : Op;
}
/// Lower a f128 store into two f64 stores.
///
/// The sub_even64 half of the stored value goes to the base pointer and the
/// sub_odd64 half to base + 8. Returns a token factor joining both stores.
static SDValue LowerF128Store(SDValue Op, SelectionDAG &DAG) {
  SDLoc dl(Op);
  // cast<> (rather than dyn_cast<> followed by an assert) so that a wrong node
  // kind can never turn into a silent null dereference.
  StoreSDNode *StNode = cast<StoreSDNode>(Op.getNode());
  assert(StNode->getOffset().isUndef() && "Unexpected node type");

  SDValue SubRegEven = DAG.getTargetConstant(SP::sub_even64, dl, MVT::i32);
  SDValue SubRegOdd = DAG.getTargetConstant(SP::sub_odd64, dl, MVT::i32);

  SDNode *Hi64 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                    dl,
                                    MVT::f64,
                                    StNode->getValue(),
                                    SubRegEven);
  SDNode *Lo64 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                    dl,
                                    MVT::f64,
                                    StNode->getValue(),
                                    SubRegOdd);

  // Each half is only 8 bytes wide, so never claim more than 8-byte alignment.
  unsigned alignment = StNode->getAlignment();
  if (alignment > 8)
    alignment = 8;

  // Keep the original store's pointer info on both halves (with the matching
  // offset for the second one) instead of dropping it, so the split stores
  // still describe the memory they write.
  SDValue OutChains[2];
  OutChains[0] =
      DAG.getStore(StNode->getChain(), dl, SDValue(Hi64, 0),
                   StNode->getBasePtr(), StNode->getPointerInfo(), alignment);

  EVT addrVT = StNode->getBasePtr().getValueType();
  SDValue LoPtr = DAG.getNode(ISD::ADD, dl, addrVT,
                              StNode->getBasePtr(),
                              DAG.getConstant(8, dl, addrVT));
  OutChains[1] = DAG.getStore(StNode->getChain(), dl, SDValue(Lo64, 0), LoPtr,
                              StNode->getPointerInfo().getWithOffset(8),
                              alignment);

  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
}
/// Custom lowering for STORE.
///
/// - f128 stores are split by LowerF128Store.
/// - i64 stores are rewritten as a store of the value bitcast to v2i32,
///   keeping the original pointer info, alignment, flags and AA info.
/// - Anything else yields an empty SDValue.
static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG)
{
  SDLoc DL(Op);
  StoreSDNode *Store = cast<StoreSDNode>(Op.getNode());
  const EVT MemVT = Store->getMemoryVT();

  if (MemVT == MVT::f128)
    return LowerF128Store(Op, DAG);

  if (MemVT != MVT::i64)
    return SDValue();

  // Custom handling for i64 stores: turn it into a bitcast and a
  // v2i32 store.
  SDValue AsPair =
      DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Store->getValue());
  return DAG.getStore(Store->getChain(), DL, AsPair, Store->getBasePtr(),
                      Store->getPointerInfo(), Store->getAlignment(),
                      Store->getMemOperand()->getFlags(), Store->getAAInfo());
}
/// Custom lowering for FNEG and FABS.
///
/// - f64: delegated to LowerF64Op.
/// - f128: the operation is applied to the f64 half that holds the sign bit
///   (directly when isV9 is set, through LowerF64Op otherwise) and the value
///   is reassembled from both halves.
/// - Any other type: the node is returned unchanged.
static SDValue LowerFNEGorFABS(SDValue Op, SelectionDAG &DAG, bool isV9) {
  assert((Op.getOpcode() == ISD::FNEG || Op.getOpcode() == ISD::FABS)
         && "invalid opcode");

  SDLoc DL(Op);
  const unsigned Opc = Op.getOpcode();

  if (Op.getValueType() == MVT::f64)
    return LowerF64Op(Op.getOperand(0), DL, DAG, Opc);
  if (Op.getValueType() != MVT::f128)
    return Op;

  // Lower fabs/fneg on f128 to fabs/fneg on f64
  // fabs/fneg f128 => fabs/fneg f64:sub_even64, fmov f64:sub_odd64
  SDValue Src = Op.getOperand(0);
  SDValue EvenHalf = DAG.getTargetExtractSubreg(SP::sub_even64, DL, MVT::f64,
                                                Src);
  SDValue OddHalf = DAG.getTargetExtractSubreg(SP::sub_odd64, DL, MVT::f64,
                                               Src);

  // (As with LowerF64Op, on little-endian, the odd subreg is the one that
  // has to be operated on.)
  SDValue &SignHalf =
      DAG.getDataLayout().isLittleEndian() ? OddHalf : EvenHalf;
  if (isV9)
    SignHalf = DAG.getNode(Opc, DL, MVT::f64, SignHalf);
  else
    SignHalf = LowerF64Op(SignHalf, DL, DAG, Opc);

  // Rebuild the f128 from an undefined register plus both halves.
  SDValue Result = SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                              DL, MVT::f128), 0);
  Result = DAG.getTargetInsertSubreg(SP::sub_even64, DL, MVT::f128,
                                     Result, EvenHalf);
  return DAG.getTargetInsertSubreg(SP::sub_odd64, DL, MVT::f128,
                                   Result, OddHalf);
}
/// Lower an i64 ADDC/ADDE/SUBC/SUBE into two glued i32 operations.
///
/// Both operands are split into 32-bit halves. The low halves are combined
/// with the original opcode (passing operand 2 along for ADDE/SUBE), and the
/// high halves with the carry-consuming opcode, glued to the low operation.
/// Returns the merged pair {i64 result, glue of the high operation}.
/// Nodes whose type is not i64 are returned unchanged.
static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
  if (Op.getValueType() != MVT::i64)
    return Op;

  SDLoc DL(Op);

  // Split a 64-bit value into its low and high 32-bit halves.
  auto SplitHalves = [&](SDValue V, SDValue &LoHalf, SDValue &HiHalf) {
    LoHalf = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, V);
    HiHalf = DAG.getNode(ISD::SRL, DL, MVT::i64, V,
                         DAG.getConstant(32, DL, MVT::i64));
    HiHalf = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, HiHalf);
  };

  SDValue LHSLo, LHSHi, RHSLo, RHSHi;
  SplitHalves(Op.getOperand(0), LHSLo, LHSHi);
  SplitHalves(Op.getOperand(1), RHSLo, RHSHi);

  // The low operation keeps the original opcode. The high operation must
  // consume the carry, so ADDC/SUBC are promoted to ADDE/SUBE for it.
  const unsigned LoOpc = Op.getOpcode();
  unsigned HiOpc = LoOpc;
  bool HasCarryIn = false;
  switch (LoOpc) {
  default:
    llvm_unreachable("Invalid opcode");
  case ISD::ADDC:
    HiOpc = ISD::ADDE;
    break;
  case ISD::SUBC:
    HiOpc = ISD::SUBE;
    break;
  case ISD::ADDE:
  case ISD::SUBE:
    HasCarryIn = true;
    break;
  }

  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Glue);
  SDValue Lo = HasCarryIn
                   ? DAG.getNode(LoOpc, DL, VTs, LHSLo, RHSLo, Op.getOperand(2))
                   : DAG.getNode(LoOpc, DL, VTs, LHSLo, RHSLo);
  SDValue Hi = DAG.getNode(HiOpc, DL, VTs, LHSHi, RHSHi, Lo.getValue(1));
  SDValue CarryOut = Hi.getValue(1);

  // Glue the two 32-bit results back together: (zext(Hi) << 32) | zext(Lo).
  Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
  Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Hi);
  Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
                   DAG.getConstant(32, DL, MVT::i64));
  SDValue Wide = DAG.getNode(ISD::OR, DL, MVT::i64, Hi, Lo);

  SDValue Results[2] = { Wide, CarryOut };
  return DAG.getMergeValues(Results, DL);
}
// Custom lower UMULO/SMULO for SPARC. This code is similar to ExpandNode()
// in LegalizeDAG.cpp except the order of arguments to the library function.
//
// Only i64 operands are handled; other types are returned unchanged. The
// product is computed with the MUL_I128 libcall and split into two i64
// halves. Overflow is reported as an i32 setcc: for SMULO the top half must
// differ from the sign-extension of the bottom half, for UMULO the top half
// must be non-zero. Returns the merged pair {bottom half, overflow flag}.
static SDValue LowerUMULO_SMULO(SDValue Op, SelectionDAG &DAG,
                                const SparcTargetLowering &TLI)
{
  const unsigned Opc = Op.getOpcode();
  assert((Opc == ISD::UMULO || Opc == ISD::SMULO) && "Invalid Opcode.");

  const bool IsSigned = (Opc == ISD::SMULO);
  EVT NarrowVT = MVT::i64;
  EVT WideVT = MVT::i128;
  SDLoc DL(Op);

  SDValue LHS = Op.getOperand(0);
  if (LHS.getValueType() != NarrowVT)
    return Op;

  // Shifting right arithmetically by 63 yields the sign word of each operand.
  SDValue SignShift = DAG.getConstant(63, DL, NarrowVT);
  SDValue RHS = Op.getOperand(1);
  SDValue LHSSign = DAG.getNode(ISD::SRA, DL, NarrowVT, LHS, SignShift);
  SDValue RHSSign = DAG.getNode(ISD::SRA, DL, MVT::i64, RHS, SignShift);

  // Each 128-bit argument is passed as (high word, low word).
  SDValue Args[] = { LHSSign, LHS, RHSSign, RHS };
  SDValue Product = TLI.makeLibCall(DAG,
                                    RTLIB::MUL_I128, WideVT,
                                    Args, IsSigned, DL).first;

  SDValue Bottom = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, NarrowVT,
                               Product, DAG.getIntPtrConstant(0, DL));
  SDValue Top = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, NarrowVT,
                            Product, DAG.getIntPtrConstant(1, DL));

  SDValue Overflow;
  if (IsSigned) {
    SDValue BottomSign = DAG.getNode(ISD::SRA, DL, NarrowVT, Bottom, SignShift);
    Overflow = DAG.getSetCC(DL, MVT::i32, Top, BottomSign, ISD::SETNE);
  } else {
    Overflow = DAG.getSetCC(DL, MVT::i32, Top,
                            DAG.getConstant(0, DL, NarrowVT), ISD::SETNE);
  }

  // Product is a node with an illegal type. Because such things are not
  // generally permitted during this phase of legalization, ensure that
  // nothing is left using the node. The above EXTRACT_ELEMENT nodes should
  // have been folded.
  assert(Product->use_empty() && "Illegally typed node still in use!");

  SDValue Results[2] = { Bottom, Overflow };
  return DAG.getMergeValues(Results, DL);
}
/// Custom lowering for ATOMIC_LOAD / ATOMIC_STORE.
///
/// Orderings stronger than monotonic yield an empty SDValue so the operation
/// is expanded with a fence; monotonic (or weaker) accesses are legal and are
/// returned unchanged.
static SDValue LowerATOMIC_LOAD_STORE(SDValue Op, SelectionDAG &DAG) {
  const auto Ordering = cast<AtomicSDNode>(Op)->getOrdering();
  const bool NeedsFence = isStrongerThanMonotonic(Ordering);
  return NeedsFence ? SDValue() : Op;
}
/// Custom lowering for chain-less intrinsics.
///
/// Operand 0 carries the intrinsic ID. Only Intrinsic::thread_pointer is
/// handled, and it is lowered to a read of register G7 in the pointer type.
/// Every other intrinsic yields an empty SDValue.
SDValue SparcTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                     SelectionDAG &DAG) const {
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  switch (IntNo) {
  default: return SDValue();    // Don't custom lower most intrinsics.
  case Intrinsic::thread_pointer: {
    EVT PtrVT = getPointerTy(DAG.getDataLayout());
    return DAG.getRegister(SP::G7, PtrVT);
  }
  }
}
/// Entry point for custom lowering: dispatch on the node's opcode to the
/// matching Lower* helper. Opcodes not listed here are treated as
/// unreachable.
SDValue SparcTargetLowering::
LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  const bool HardQuad = Subtarget->hasHardQuad();
  const bool V9 = Subtarget->isV9();

  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("Should not custom lower this!");

  // Frame and return address queries.
  case ISD::RETURNADDR:
    return LowerRETURNADDR(Op, DAG, *this, Subtarget);
  case ISD::FRAMEADDR:
    return LowerFRAMEADDR(Op, DAG, Subtarget);

  // Address materialization.
  case ISD::GlobalTLSAddress:
    return LowerGlobalTLSAddress(Op, DAG);
  case ISD::GlobalAddress:
    return LowerGlobalAddress(Op, DAG);
  case ISD::BlockAddress:
    return LowerBlockAddress(Op, DAG);
  case ISD::ConstantPool:
    return LowerConstantPool(Op, DAG);

  // Integer <-> floating point conversions.
  case ISD::FP_TO_SINT:
    return LowerFP_TO_SINT(Op, DAG, *this, HardQuad);
  case ISD::SINT_TO_FP:
    return LowerSINT_TO_FP(Op, DAG, *this, HardQuad);
  case ISD::FP_TO_UINT:
    return LowerFP_TO_UINT(Op, DAG, *this, HardQuad);
  case ISD::UINT_TO_FP:
    return LowerUINT_TO_FP(Op, DAG, *this, HardQuad);

  // Conditional branch and select.
  case ISD::BR_CC:
    return LowerBR_CC(Op, DAG, *this, HardQuad);
  case ISD::SELECT_CC:
    return LowerSELECT_CC(Op, DAG, *this, HardQuad);

  // Varargs and dynamic stack allocation.
  case ISD::VASTART:
    return LowerVASTART(Op, DAG, *this);
  case ISD::VAARG:
    return LowerVAARG(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC:
    return LowerDYNAMIC_STACKALLOC(Op, DAG, Subtarget);

  // Memory accesses.
  case ISD::LOAD:
    return LowerLOAD(Op, DAG);
  case ISD::STORE:
    return LowerSTORE(Op, DAG);

  // Arithmetic routed through the fp128 support library.
  case ISD::FADD:
    return LowerF128Op(Op, DAG, getLibcallName(RTLIB::ADD_F128), 2);
  case ISD::FSUB:
    return LowerF128Op(Op, DAG, getLibcallName(RTLIB::SUB_F128), 2);
  case ISD::FMUL:
    return LowerF128Op(Op, DAG, getLibcallName(RTLIB::MUL_F128), 2);
  case ISD::FDIV:
    return LowerF128Op(Op, DAG, getLibcallName(RTLIB::DIV_F128), 2);
  case ISD::FSQRT:
    return LowerF128Op(Op, DAG, getLibcallName(RTLIB::SQRT_F128),1);

  // Sign manipulation and FP width changes.
  case ISD::FABS:
  case ISD::FNEG:
    return LowerFNEGorFABS(Op, DAG, V9);
  case ISD::FP_EXTEND:
    return LowerF128_FPEXTEND(Op, DAG, *this);
  case ISD::FP_ROUND:
    return LowerF128_FPROUND(Op, DAG, *this);

  // Carry-propagating and overflow-checking integer arithmetic.
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE:
    return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
  case ISD::UMULO:
  case ISD::SMULO:
    return LowerUMULO_SMULO(Op, DAG, *this);

  // Atomics and intrinsics.
  case ISD::ATOMIC_LOAD:
  case ISD::ATOMIC_STORE:
    return LowerATOMIC_LOAD_STORE(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN:
    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  }
}
/// Turn a floating point constant into a v2i32 build vector holding its bit
/// pattern as two i32 constants.
///
/// On big-endian targets element 0 is the upper 32 bits and element 1 the
/// lower 32 bits; on little-endian targets the two are exchanged.
SDValue SparcTargetLowering::bitcastConstantFPToInt(ConstantFPSDNode *C,
                                                    const SDLoc &DL,
                                                    SelectionDAG &DAG) const {
  APInt Bits = C->getValueAPF().bitcastToAPInt();
  SDValue LowWord = DAG.getConstant(Bits.zextOrTrunc(32), DL, MVT::i32);
  SDValue HighWord =
      DAG.getConstant(Bits.lshr(32).zextOrTrunc(32), DL, MVT::i32);

  const bool IsLE = DAG.getDataLayout().isLittleEndian();
  SDValue Elt0 = IsLE ? LowWord : HighWord;
  SDValue Elt1 = IsLE ? HighWord : LowWord;
  return DAG.getBuildVector(MVT::v2i32, DL, {Elt0, Elt1});
}
/// DAG combine for BITCAST: a bitcast of an f64 floating point constant to
/// v2i32 is replaced by the build vector from bitcastConstantFPToInt. Every
/// other bitcast yields an empty SDValue (no change).
SDValue SparcTargetLowering::PerformBITCASTCombine(SDNode *N,
                                                   DAGCombinerInfo &DCI) const {
  SDValue Src = N->getOperand(0);

  auto *FPConst = dyn_cast<ConstantFPSDNode>(Src);
  if (!FPConst)
    return SDValue();
  if (N->getSimpleValueType(0) != MVT::v2i32)
    return SDValue();
  if (Src.getSimpleValueType() != MVT::f64)
    return SDValue();

  SDLoc DL(N);
  return bitcastConstantFPToInt(FPConst, DL, DCI.DAG);
}
/// Target DAG combine hook. Only BITCAST nodes are handled (see
/// PerformBITCASTCombine); everything else yields an empty SDValue.
SDValue SparcTargetLowering::PerformDAGCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  if (N->getOpcode() == ISD::BITCAST)
    return PerformBITCASTCombine(N, DCI);

  return SDValue();
}
/// Expand the SELECT_CC pseudo instructions into explicit control flow.
///
/// The *_ICC pseudos are expanded with a BCOND branch and the *_FCC pseudos
/// with an FBCOND branch; any other opcode is treated as unreachable.
MachineBasicBlock *
SparcTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                                 MachineBasicBlock *BB) const {
  unsigned BranchOpc;

  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unknown SELECT_CC!");

  // Selects driven by the integer condition codes.
  case SP::SELECT_CC_Int_ICC:
  case SP::SELECT_CC_FP_ICC:
  case SP::SELECT_CC_DFP_ICC:
  case SP::SELECT_CC_QFP_ICC:
    BranchOpc = SP::BCOND;
    break;

  // Selects driven by the floating point condition codes.
  case SP::SELECT_CC_Int_FCC:
  case SP::SELECT_CC_FP_FCC:
  case SP::SELECT_CC_DFP_FCC:
  case SP::SELECT_CC_QFP_FCC:
    BranchOpc = SP::FBCOND;
    break;
  }

  return expandSelectCC(MI, BB, BranchOpc);
}
/// Expand a SELECT_CC pseudo instruction into a branch plus a PHI.
///
/// MI's operands are read as: 0 = destination register, 1 = value used when
/// the branch is taken, 2 = value used on the fall-through path, 3 = the
/// condition code immediate. BROpcode is the conditional branch opcode to
/// emit. The pseudo is erased, and the returned block (SinkMBB) holds the PHI
/// followed by everything that came after MI in the original block.
MachineBasicBlock *
SparcTargetLowering::expandSelectCC(MachineInstr &MI, MachineBasicBlock *BB,
unsigned BROpcode) const {
const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
DebugLoc dl = MI.getDebugLoc();
unsigned CC = (SPCC::CondCodes)MI.getOperand(3).getImm();
// To "insert" a SELECT_CC instruction, we actually have to insert the
// triangle control-flow pattern. The incoming instruction knows the
// destination vreg to set, the condition code register to branch on, the
// true/false values to select between, and the condition code for the branch.
//
// We produce the following control flow:
// ThisMBB
// | \
// | IfFalseMBB
// | /
// SinkMBB
const BasicBlock *LLVM_BB = BB->getBasicBlock();
// Insertion point: the block that currently follows BB in the function.
MachineFunction::iterator It = ++BB->getIterator();
MachineBasicBlock *ThisMBB = BB;
MachineFunction *F = BB->getParent();
// Both new blocks are associated with the same IR basic block as BB.
MachineBasicBlock *IfFalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
// Resulting layout: ThisMBB, IfFalseMBB, SinkMBB, <old successor of BB>.
F->insert(It, IfFalseMBB);
F->insert(It, SinkMBB);
// Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
SinkMBB->splice(SinkMBB->begin(), ThisMBB,
std::next(MachineBasicBlock::iterator(MI)), ThisMBB->end());
SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
// Set the new successors for ThisMBB.
ThisMBB->addSuccessor(IfFalseMBB);
ThisMBB->addSuccessor(SinkMBB);
// Conditional branch straight to SinkMBB; otherwise fall into IfFalseMBB.
BuildMI(ThisMBB, dl, TII.get(BROpcode))
.addMBB(SinkMBB)
.addImm(CC);
// IfFalseMBB just falls through to SinkMBB.
IfFalseMBB->addSuccessor(SinkMBB);
// %Result = phi [ %TrueValue, ThisMBB ], [ %FalseValue, IfFalseMBB ]
BuildMI(*SinkMBB, SinkMBB->begin(), dl, TII.get(SP::PHI),
MI.getOperand(0).getReg())
.addReg(MI.getOperand(1).getReg())
.addMBB(ThisMBB)
.addReg(MI.getOperand(2).getReg())
.addMBB(IfFalseMBB);
MI.eraseFromParent(); // The pseudo instruction is gone now.
return SinkMBB;
}
//===----------------------------------------------------------------------===//
// Sparc Inline Assembly Support
//===----------------------------------------------------------------------===//
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
///
/// Single-letter constraints 'r', 'f' and 'e' are register classes and 'I'
/// is an immediate; everything else is deferred to the generic
/// TargetLowering implementation.
SparcTargetLowering::ConstraintType
SparcTargetLowering::getConstraintType(StringRef Constraint) const {
// Only single-letter constraints get target-specific treatment.
if (Constraint.size() == 1) {
switch (Constraint[0]) {
default: break;
case 'r':
case 'f':
case 'e':
return C_RegisterClass;
case 'I': // SIMM13
- return C_Other;
+ return C_Immediate;
}
}
// Multi-letter or unrecognized constraints use the generic classification.
return TargetLowering::getConstraintType(Constraint);
}
/// Rate how well the operand described by `info` matches the constraint whose
/// first letter `constraint` points at.
///
/// - No operand value: CW_Default (allowed, at the lowest weight).
/// - 'I' (SIMM13): CW_Constant if the operand is a ConstantInt that fits in a
///   signed 13-bit field, CW_Invalid otherwise.
/// - Any other letter: whatever the generic TargetLowering reports.
TargetLowering::ConstraintWeight SparcTargetLowering::
getSingleConstraintMatchWeight(AsmOperandInfo &info,
                               const char *constraint) const {
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
  if (!info.CallOperandVal)
    return CW_Default;

  // Only 'I' is rated here; defer every other letter.
  if (*constraint != 'I')
    return TargetLowering::getSingleConstraintMatchWeight(info, constraint);

  if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal))
    if (isInt<13>(C->getSExtValue()))
      return CW_Constant;

  return CW_Invalid;
}
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops.
///
/// Constraints longer than one character are not supported and add nothing.
/// For 'I', a constant operand that fits in a signed 13-bit field is added as
/// a target constant, and a constant that does not fit adds nothing. All
/// remaining cases are forwarded to the generic TargetLowering handling.
void SparcTargetLowering::
LowerAsmOperandForConstraint(SDValue Op,
                             std::string &Constraint,
                             std::vector<SDValue> &Ops,
                             SelectionDAG &DAG) const {
  // Only support length 1 constraints for now.
  if (Constraint.length() > 1)
    return;

  if (Constraint[0] == 'I') {
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      const auto Imm = C->getSExtValue();

      // Out-of-range immediates are invalid: leave Ops untouched.
      if (!isInt<13>(Imm))
        return;

      Ops.push_back(DAG.getTargetConstant(Imm, SDLoc(Op),
                                          Op.getValueType()));
      return;
    }
    // A non-constant operand is left to the generic handling below.
  }

  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
std::pair<unsigned, const TargetRegisterClass *>
SparcTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
MVT VT) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'r':
if (VT == MVT::v2i32)
return std::make_pair(0U, &SP::IntPairRegClass);
else if (Subtarget->is64Bit())
return std::make_pair(0U, &SP::I64RegsRegClass);
else
return std::make_pair(0U, &SP::IntRegsRegClass);
case 'f':
if (VT == MVT::f32 || VT == MVT::i32)
return std::make_pair(0U, &SP::FPRegsRegClass);
else if (VT == MVT::f64 || VT == MVT::i64)
return std::make_pair(0U, &SP::LowDFPRegsRegClass);
else if (VT == MVT::f128)
return std::make_pair(0U, &SP::LowQFPRegsRegClass);
// This will generate an error message
return std::make_pair(0U, nullptr);
case 'e':
if (VT == MVT::f32 || VT == MVT::i32)
return std::make_pair(0U, &SP::FPRegsRegClass);
else if (VT == MVT::f64 || VT == MVT::i64 )
return std::make_pair(0U, &SP::DFPRegsRegClass);
else if (VT == MVT::f128)
return std::make_pair(0U, &SP::QFPRegsRegClass);
// This will generate an error message
return std::make_pair(0U, nullptr);
}
} else if (!Constraint.empty() && Constraint.size() <= 5
&& Constraint[0] == '{' && *(Constraint.end()-1) == '}') {
// constraint = '{r<d>}'
// Remove the braces from around the name.
StringRef name(Constraint.data()+1, Constraint.size()-2);
// Handle register aliases:
// r0-r7 -> g0-g7
// r8-r15 -> o0-o7
// r16-r23 -> l0-l7
// r24-r31 -> i0-i7
uint64_t intVal = 0;
if (name.substr(0, 1).equals("r")
&& !name.substr(1).getAsInteger(10, intVal) && intVal <= 31) {
const char regTypes[] = { 'g', 'o', 'l', 'i' };
char regType = regTypes[intVal/8];
char regIdx = '0' + (intVal % 8);
char tmp[] = { '{', regType, regIdx, '}', 0 };
std::string newConstraint = std::string(tmp);
return TargetLowering::getRegForInlineAsmConstraint(TRI, newConstraint,
VT);
}
if (name.substr(0, 1).equals("f") &&
!name.substr(1).getAsInteger(10, intVal) && intVal <= 63) {
std::string newConstraint;
if (VT == MVT::f32 || VT == MVT::Other) {
newConstraint = "{f" + utostr(intVal) + "}";
} else if (VT == MVT::f64 && (intVal % 2 == 0)) {
newConstraint = "{d" + utostr(intVal / 2) + "}";
} else if (VT == MVT::f128 && (intVal % 4 == 0)) {
newConstraint = "{q" + utostr(intVal / 4) + "}";
} else {
return std::make_pair(0U, nullptr);
}
return TargetLowering::getRegForInlineAsmConstraint(TRI, newConstraint,
VT);
}
}
return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
/// Report whether an offset may be folded into a global address node.
/// Always false: the Sparc target isn't yet aware of offsets.
bool
SparcTargetLowering::isOffsetFoldingLegal(
    const GlobalAddressSDNode * /*GA*/) const {
  return false;
}
// Custom type-legalization hook. Depending on the opcode of N, replacement
// values are appended to Results; when the operand/result types are not the
// ones handled here, the function returns without appending anything.
void SparcTargetLowering::ReplaceNodeResults(SDNode *N,
                                             SmallVectorImpl<SDValue> &Results,
                                             SelectionDAG &DAG) const {
  SDLoc dl(N);
  const unsigned Opc = N->getOpcode();

  switch (Opc) {
  default:
    llvm_unreachable("Do not know how to custom type legalize this operation!");

  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT: {
    // Handled only when an f128 source is converted to an i64 result.
    const bool IsF128ToI64 = N->getOperand(0).getValueType() == MVT::f128 &&
                             N->getValueType(0) == MVT::i64;
    if (!IsF128ToI64)
      return;

    RTLIB::Libcall LC = RTLIB::FPTOUINT_F128_I64;
    if (Opc == ISD::FP_TO_SINT)
      LC = RTLIB::FPTOSINT_F128_I64;

    SDValue Call = LowerF128Op(SDValue(N, 0), DAG, getLibcallName(LC), 1);
    Results.push_back(Call);
    return;
  }

  case ISD::READCYCLECOUNTER: {
    assert(Subtarget->hasLeonCycleCounter());
    SDValue Chain = N->getOperand(0);
    // Low half is read from ASR23, high half from G0; the second copy is
    // chained on the first.
    SDValue Lo = DAG.getCopyFromReg(Chain, dl, SP::ASR23, MVT::i32);
    SDValue Hi = DAG.getCopyFromReg(Lo, dl, SP::G0, MVT::i32);
    SDValue Ops[] = { Lo, Hi };
    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops));
    Results.push_back(Chain);
    return;
  }

  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP: {
    // Handled only when an i64 source is converted to an f128 result.
    const bool IsI64ToF128 = N->getValueType(0) == MVT::f128 &&
                             N->getOperand(0).getValueType() == MVT::i64;
    if (!IsI64ToF128)
      return;

    RTLIB::Libcall LC = RTLIB::UINTTOFP_I64_F128;
    if (Opc == ISD::SINT_TO_FP)
      LC = RTLIB::SINTTOFP_I64_F128;

    SDValue Call = LowerF128Op(SDValue(N, 0), DAG, getLibcallName(LC), 1);
    Results.push_back(Call);
    return;
  }

  case ISD::LOAD: {
    LoadSDNode *Ld = cast<LoadSDNode>(N);
    // Only an i64 value loaded from an i64 memory type is rewritten: it
    // becomes a v2i32 load followed by a bitcast back to i64.
    const bool IsPlainI64 =
        Ld->getValueType(0) == MVT::i64 && Ld->getMemoryVT() == MVT::i64;
    if (!IsPlainI64)
      return;

    SDValue V2I32Load = DAG.getExtLoad(
        Ld->getExtensionType(), dl, MVT::v2i32, Ld->getChain(),
        Ld->getBasePtr(), Ld->getPointerInfo(), MVT::v2i32, Ld->getAlignment(),
        Ld->getMemOperand()->getFlags(), Ld->getAAInfo());

    Results.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i64, V2I32Load));
    // Second result is the chain produced by the new load.
    Results.push_back(V2I32Load.getValue(1));
    return;
  }
  }
}
// LOAD_STACK_GUARD lowering is always enabled for Linux targets; every other
// target keeps the TargetLowering default.
bool SparcTargetLowering::useLoadStackGuardNode() const {
  if (Subtarget->isTargetLinux())
    return true;
  return TargetLowering::useLoadStackGuardNode();
}
// On Linux no stack-protector declarations are inserted into M (this disables
// the global variable loading); other targets defer to TargetLowering.
void SparcTargetLowering::insertSSPDeclarations(Module &M) const {
  if (Subtarget->isTargetLinux())
    return;
  TargetLowering::insertSSPDeclarations(M);
}
Index: vendor/llvm/dist-release_90/lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- vendor/llvm/dist-release_90/lib/Target/SystemZ/SystemZISelLowering.cpp (revision 351302)
+++ vendor/llvm/dist-release_90/lib/Target/SystemZ/SystemZISelLowering.cpp (revision 351303)
@@ -1,7768 +1,7768 @@
//===-- SystemZISelLowering.cpp - SystemZ DAG lowering implementation -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the SystemZTargetLowering class.
//
//===----------------------------------------------------------------------===//
#include "SystemZISelLowering.h"
#include "SystemZCallingConv.h"
#include "SystemZConstantPoolValue.h"
#include "SystemZMachineFunctionInfo.h"
#include "SystemZTargetMachine.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include <cctype>
using namespace llvm;
#define DEBUG_TYPE "systemz-lower"
namespace {
// Describes a comparison: its two operands plus the opcode and condition-code
// masks chosen for it. Everything except the operands starts out as zero.
struct Comparison {
  Comparison(SDValue Op0In, SDValue Op1In) : Op0(Op0In), Op1(Op1In) {}

  // Operands being compared.
  SDValue Op0, Op1;

  // Opcode to use when comparing Op0 with Op1.
  unsigned Opcode = 0;

  // SystemZICMP value; meaningful for integer comparisons only.
  unsigned ICmpType = 0;

  // Mask of the CC values that Opcode is able to produce.
  unsigned CCValid = 0;

  // Mask of the CC values under which the original condition holds.
  unsigned CCMask = 0;
};
} // end anonymous namespace
// Returns true for i32 and false for i64. Any other type is unsupported and
// hits llvm_unreachable.
static bool is32Bit(EVT VT) {
  const auto Ty = VT.getSimpleVT().SimpleTy;
  if (Ty == MVT::i32)
    return true;
  if (Ty == MVT::i64)
    return false;
  llvm_unreachable("Unsupported type");
}
// Produces a copy of Op that is safe to use ahead of the operand's final use:
// for register operands the kill flag is cleared; other operands are returned
// unchanged.
static MachineOperand earlyUseOperand(MachineOperand Op) {
  if (!Op.isReg())
    return Op;
  Op.setIsKill(false);
  return Op;
}
// Construct the SystemZ lowering description for the given target machine and
// subtarget. The constructor registers the register class used for each legal
// value type, then records per operation and value type whether the operation
// is Legal, Custom, Promote or Expand, registers the opcodes for which target
// DAG combines are wanted, and finally sets the memcpy/memset store limits.
// The calls are order-sensitive: a later setOperationAction for the same
// (opcode, type) pair replaces an earlier one.
SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
const SystemZSubtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
// Integer type with 8 bits for each byte reported by TM.getPointerSize(0).
MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize(0));
// Set up the register classes.
if (Subtarget.hasHighWord())
addRegisterClass(MVT::i32, &SystemZ::GRX32BitRegClass);
else
addRegisterClass(MVT::i32, &SystemZ::GR32BitRegClass);
addRegisterClass(MVT::i64, &SystemZ::GR64BitRegClass);
if (Subtarget.hasVector()) {
addRegisterClass(MVT::f32, &SystemZ::VR32BitRegClass);
addRegisterClass(MVT::f64, &SystemZ::VR64BitRegClass);
} else {
addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass);
addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass);
}
if (Subtarget.hasVectorEnhancements1())
addRegisterClass(MVT::f128, &SystemZ::VR128BitRegClass);
else
addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass);
if (Subtarget.hasVector()) {
addRegisterClass(MVT::v16i8, &SystemZ::VR128BitRegClass);
addRegisterClass(MVT::v8i16, &SystemZ::VR128BitRegClass);
addRegisterClass(MVT::v4i32, &SystemZ::VR128BitRegClass);
addRegisterClass(MVT::v2i64, &SystemZ::VR128BitRegClass);
addRegisterClass(MVT::v4f32, &SystemZ::VR128BitRegClass);
addRegisterClass(MVT::v2f64, &SystemZ::VR128BitRegClass);
}
// Compute derived properties from the register classes
computeRegisterProperties(Subtarget.getRegisterInfo());
// Set up special registers.
setStackPointerRegisterToSaveRestore(SystemZ::R15D);
// TODO: It may be better to default to latency-oriented scheduling, however
// LLVM's current latency-oriented scheduler can't handle physreg definitions
// such as SystemZ has with CC, so set this to the register-pressure
// scheduler, because it can.
setSchedulingPreference(Sched::RegPressure);
setBooleanContents(ZeroOrOneBooleanContent);
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
// Instructions are strings of 2-byte aligned 2-byte values.
// NOTE(review): the preferred-alignment call below passes 4 for a documented
// 16 bytes, which suggests these setters take log2(bytes). If so, the 2
// passed here would mean 4 bytes rather than 2 -- confirm against the
// TargetLoweringBase declaration.
setMinFunctionAlignment(2);
// For performance reasons we prefer 16-byte alignment.
setPrefFunctionAlignment(4);
// Handle operations that are handled in a similar way for all types.
// The loop walks every simple value type numbered from the first integer
// type through the last floating-point type and acts on the legal ones.
for (unsigned I = MVT::FIRST_INTEGER_VALUETYPE;
I <= MVT::LAST_FP_VALUETYPE;
++I) {
MVT VT = MVT::SimpleValueType(I);
if (isTypeLegal(VT)) {
// Lower SET_CC into an IPM-based sequence.
setOperationAction(ISD::SETCC, VT, Custom);
// Expand SELECT(C, A, B) into SELECT_CC(X, 0, A, B, NE).
setOperationAction(ISD::SELECT, VT, Expand);
// Lower SELECT_CC and BR_CC into separate comparisons and branches.
setOperationAction(ISD::SELECT_CC, VT, Custom);
setOperationAction(ISD::BR_CC, VT, Custom);
}
}
// Expand jump table branches as address arithmetic followed by an
// indirect jump.
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
// Expand BRCOND into a BR_CC (see above).
setOperationAction(ISD::BRCOND, MVT::Other, Expand);
// Handle integer types.
for (unsigned I = MVT::FIRST_INTEGER_VALUETYPE;
I <= MVT::LAST_INTEGER_VALUETYPE;
++I) {
MVT VT = MVT::SimpleValueType(I);
if (isTypeLegal(VT)) {
// Expand individual DIV and REMs into DIVREMs.
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
setOperationAction(ISD::SDIVREM, VT, Custom);
setOperationAction(ISD::UDIVREM, VT, Custom);
// Support addition/subtraction with overflow.
setOperationAction(ISD::SADDO, VT, Custom);
setOperationAction(ISD::SSUBO, VT, Custom);
// Support addition/subtraction with carry.
setOperationAction(ISD::UADDO, VT, Custom);
setOperationAction(ISD::USUBO, VT, Custom);
// Support carry in as value rather than glue.
setOperationAction(ISD::ADDCARRY, VT, Custom);
setOperationAction(ISD::SUBCARRY, VT, Custom);
// Lower ATOMIC_LOAD and ATOMIC_STORE into normal volatile loads and
// stores, putting a serialization instruction after the stores.
setOperationAction(ISD::ATOMIC_LOAD, VT, Custom);
setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
// Lower ATOMIC_LOAD_SUB into ATOMIC_LOAD_ADD if LAA and LAAG are
// available, or if the operand is constant.
setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
// Use POPCNT on z196 and above.
if (Subtarget.hasPopulationCount())
setOperationAction(ISD::CTPOP, VT, Custom);
else
setOperationAction(ISD::CTPOP, VT, Expand);
// No special instructions for these.
setOperationAction(ISD::CTTZ, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
// Use *MUL_LOHI where possible instead of MULH*.
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::SMUL_LOHI, VT, Custom);
setOperationAction(ISD::UMUL_LOHI, VT, Custom);
// Only z196 and above have native support for conversions to unsigned.
// On z10, promoting to i64 doesn't generate an inexact condition for
// values that are outside the i32 range but in the i64 range, so use
// the default expansion.
if (!Subtarget.hasFPExtension())
setOperationAction(ISD::FP_TO_UINT, VT, Expand);
}
}
// Type legalization will convert 8- and 16-bit atomic operations into
// forms that operate on i32s (but still keeping the original memory VT).
// Lower them into full i32 operations.
setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Custom);
// Even though i128 is not a legal type, we still need to custom lower
// the atomic operations in order to exploit SystemZ instructions.
setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
// We can use the CC result of compare-and-swap to implement
// the "success" result of ATOMIC_CMP_SWAP_WITH_SUCCESS.
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
// Traps are legal, as we will convert them to "j .+2".
setOperationAction(ISD::TRAP, MVT::Other, Legal);
// z10 has instructions for signed but not unsigned FP conversion.
// Handle unsigned 32-bit types as signed 64-bit types.
if (!Subtarget.hasFPExtension()) {
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
}
// We have native support for a 64-bit CTLZ, via FLOGR.
setOperationAction(ISD::CTLZ, MVT::i32, Promote);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Promote);
setOperationAction(ISD::CTLZ, MVT::i64, Legal);
// On arch13 we have native support for a 64-bit CTPOP.
// These two calls replace the CTPOP actions chosen in the integer loop above.
if (Subtarget.hasMiscellaneousExtensions3()) {
setOperationAction(ISD::CTPOP, MVT::i32, Promote);
setOperationAction(ISD::CTPOP, MVT::i64, Legal);
}
// Give LowerOperation the chance to replace 64-bit ORs with subregs.
setOperationAction(ISD::OR, MVT::i64, Custom);
// FIXME: Can we support these natively?
setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
// We have native instructions for i8, i16 and i32 extensions, but not i1.
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
for (MVT VT : MVT::integer_valuetypes()) {
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
}
// Handle the various types of symbolic address.
setOperationAction(ISD::ConstantPool, PtrVT, Custom);
setOperationAction(ISD::GlobalAddress, PtrVT, Custom);
setOperationAction(ISD::GlobalTLSAddress, PtrVT, Custom);
setOperationAction(ISD::BlockAddress, PtrVT, Custom);
setOperationAction(ISD::JumpTable, PtrVT, Custom);
// We need to handle dynamic allocations specially because of the
// 160-byte area at the bottom of the stack.
setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, PtrVT, Custom);
// Use custom expanders so that we can force the function to use
// a frame pointer.
setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Custom);
// Handle prefetches with PFD or PFDRL.
setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
for (MVT VT : MVT::vector_valuetypes()) {
// Assume by default that all vector operations need to be expanded.
// Only actions that are still Legal are changed, so any non-Legal action
// already recorded for a vector type is kept.
for (unsigned Opcode = 0; Opcode < ISD::BUILTIN_OP_END; ++Opcode)
if (getOperationAction(Opcode, VT) == Legal)
setOperationAction(Opcode, VT, Expand);
// Likewise all truncating stores and extending loads.
for (MVT InnerVT : MVT::vector_valuetypes()) {
setTruncStoreAction(VT, InnerVT, Expand);
setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
}
if (isTypeLegal(VT)) {
// These operations are legal for anything that can be stored in a
// vector register, even if there is no native support for the format
// as such. In particular, we can do these for v4f32 even though there
// are no specific instructions for that format.
setOperationAction(ISD::LOAD, VT, Legal);
setOperationAction(ISD::STORE, VT, Legal);
setOperationAction(ISD::VSELECT, VT, Legal);
setOperationAction(ISD::BITCAST, VT, Legal);
setOperationAction(ISD::UNDEF, VT, Legal);
// Likewise, except that we need to replace the nodes with something
// more specific.
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
}
}
// Handle integer vector types.
for (MVT VT : MVT::integer_vector_valuetypes()) {
if (isTypeLegal(VT)) {
// These operations have direct equivalents.
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Legal);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Legal);
setOperationAction(ISD::ADD, VT, Legal);
setOperationAction(ISD::SUB, VT, Legal);
if (VT != MVT::v2i64)
setOperationAction(ISD::MUL, VT, Legal);
setOperationAction(ISD::AND, VT, Legal);
setOperationAction(ISD::OR, VT, Legal);
setOperationAction(ISD::XOR, VT, Legal);
if (Subtarget.hasVectorEnhancements1())
setOperationAction(ISD::CTPOP, VT, Legal);
else
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTTZ, VT, Legal);
setOperationAction(ISD::CTLZ, VT, Legal);
// Convert a GPR scalar to a vector by inserting it into element 0.
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
// Use a series of unpacks for extensions.
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
// Detect shifts by a scalar amount and convert them into
// V*_BY_SCALAR.
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::SRL, VT, Custom);
// At present ROTL isn't matched by DAGCombiner. ROTR should be
// converted into ROTL.
setOperationAction(ISD::ROTL, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
// Map SETCCs onto one of VCE, VCH or VCHL, swapping the operands
// and inverting the result as necessary.
setOperationAction(ISD::SETCC, VT, Custom);
}
}
if (Subtarget.hasVector()) {
// There should be no need to check for float types other than v2f64
// since <2 x f32> isn't a legal type.
setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v2f64, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v2f64, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v2f64, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v2f64, Legal);
}
// With vector enhancements facility 2 the same conversions are also marked
// Legal for the four-element 32-bit vector types.
if (Subtarget.hasVectorEnhancements2()) {
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v4f32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v4f32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v4f32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v4f32, Legal);
}
// Handle floating-point types.
for (unsigned I = MVT::FIRST_FP_VALUETYPE;
I <= MVT::LAST_FP_VALUETYPE;
++I) {
MVT VT = MVT::SimpleValueType(I);
if (isTypeLegal(VT)) {
// We can use FI for FRINT.
setOperationAction(ISD::FRINT, VT, Legal);
// We can use the extended form of FI for other rounding operations.
if (Subtarget.hasFPExtension()) {
setOperationAction(ISD::FNEARBYINT, VT, Legal);
setOperationAction(ISD::FFLOOR, VT, Legal);
setOperationAction(ISD::FCEIL, VT, Legal);
setOperationAction(ISD::FTRUNC, VT, Legal);
setOperationAction(ISD::FROUND, VT, Legal);
}
// No special instructions for these.
setOperationAction(ISD::FSIN, VT, Expand);
setOperationAction(ISD::FCOS, VT, Expand);
setOperationAction(ISD::FSINCOS, VT, Expand);
setOperationAction(ISD::FREM, VT, Expand);
setOperationAction(ISD::FPOW, VT, Expand);
// Handle constrained floating-point operations.
setOperationAction(ISD::STRICT_FADD, VT, Legal);
setOperationAction(ISD::STRICT_FSUB, VT, Legal);
setOperationAction(ISD::STRICT_FMUL, VT, Legal);
setOperationAction(ISD::STRICT_FDIV, VT, Legal);
setOperationAction(ISD::STRICT_FMA, VT, Legal);
setOperationAction(ISD::STRICT_FSQRT, VT, Legal);
setOperationAction(ISD::STRICT_FRINT, VT, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, VT, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);
if (Subtarget.hasFPExtension()) {
setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
setOperationAction(ISD::STRICT_FROUND, VT, Legal);
setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
}
}
}
// Handle floating-point vector types.
if (Subtarget.hasVector()) {
// Scalar-to-vector conversion is just a subreg.
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
// Some insertions and extractions can be done directly but others
// need to go via integers.
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
// These operations have direct equivalents.
setOperationAction(ISD::FADD, MVT::v2f64, Legal);
setOperationAction(ISD::FNEG, MVT::v2f64, Legal);
setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
setOperationAction(ISD::FMA, MVT::v2f64, Legal);
setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
setOperationAction(ISD::FABS, MVT::v2f64, Legal);
setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
setOperationAction(ISD::FROUND, MVT::v2f64, Legal);
// Handle constrained floating-point operations.
setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FMA, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FRINT, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FFLOOR, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FCEIL, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FTRUNC, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FROUND, MVT::v2f64, Legal);
}
// The vector enhancements facility 1 has instructions for these.
if (Subtarget.hasVectorEnhancements1()) {
setOperationAction(ISD::FADD, MVT::v4f32, Legal);
setOperationAction(ISD::FNEG, MVT::v4f32, Legal);
setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
setOperationAction(ISD::FMA, MVT::v4f32, Legal);
setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
setOperationAction(ISD::FABS, MVT::v4f32, Legal);
setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
// Min/max operations are marked Legal for every scalar and vector FP type
// listed below.
setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
setOperationAction(ISD::FMAXIMUM, MVT::f64, Legal);
setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
setOperationAction(ISD::FMINIMUM, MVT::f64, Legal);
setOperationAction(ISD::FMAXNUM, MVT::v2f64, Legal);
setOperationAction(ISD::FMAXIMUM, MVT::v2f64, Legal);
setOperationAction(ISD::FMINNUM, MVT::v2f64, Legal);
setOperationAction(ISD::FMINIMUM, MVT::v2f64, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal);
setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f128, Legal);
setOperationAction(ISD::FMAXIMUM, MVT::f128, Legal);
setOperationAction(ISD::FMINNUM, MVT::f128, Legal);
setOperationAction(ISD::FMINIMUM, MVT::f128, Legal);
// Handle constrained floating-point operations.
setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FMA, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FRINT, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FFLOOR, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FCEIL, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FROUND, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FTRUNC, MVT::v4f32, Legal);
for (auto VT : { MVT::f32, MVT::f64, MVT::f128,
MVT::v4f32, MVT::v2f64 }) {
setOperationAction(ISD::STRICT_FMAXNUM, VT, Legal);
setOperationAction(ISD::STRICT_FMINNUM, VT, Legal);
}
}
// We have fused multiply-addition for f32 and f64 but not f128.
// (f128 FMA is marked Legal below when vector enhancements facility 1 is
// available, and Expand otherwise.)
setOperationAction(ISD::FMA, MVT::f32, Legal);
setOperationAction(ISD::FMA, MVT::f64, Legal);
if (Subtarget.hasVectorEnhancements1())
setOperationAction(ISD::FMA, MVT::f128, Legal);
else
setOperationAction(ISD::FMA, MVT::f128, Expand);
// We don't have a copysign instruction on vector registers.
if (Subtarget.hasVectorEnhancements1())
setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
// Needed so that we don't try to implement f128 constant loads using
// a load-and-extend of a f80 constant (in cases where the constant
// would fit in an f80).
for (MVT VT : MVT::fp_valuetypes())
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
// We don't have extending load instruction on vector registers.
if (Subtarget.hasVectorEnhancements1()) {
setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
}
// Floating-point truncation and stores need to be done separately.
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::f128, MVT::f32, Expand);
setTruncStoreAction(MVT::f128, MVT::f64, Expand);
// We have 64-bit FPR<->GPR moves, but need special handling for
// 32-bit forms.
if (!Subtarget.hasVector()) {
setOperationAction(ISD::BITCAST, MVT::i32, Custom);
setOperationAction(ISD::BITCAST, MVT::f32, Custom);
}
// VASTART and VACOPY need to deal with the SystemZ-specific varargs
// structure, but VAEND is a no-op.
setOperationAction(ISD::VASTART, MVT::Other, Custom);
setOperationAction(ISD::VACOPY, MVT::Other, Custom);
setOperationAction(ISD::VAEND, MVT::Other, Expand);
// Codes for which we want to perform some z-specific combinations.
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::FP_ROUND);
setTargetDAGCombine(ISD::FP_EXTEND);
setTargetDAGCombine(ISD::BSWAP);
setTargetDAGCombine(ISD::SDIV);
setTargetDAGCombine(ISD::UDIV);
setTargetDAGCombine(ISD::SREM);
setTargetDAGCombine(ISD::UREM);
// Handle intrinsics.
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
// We want to use MVC in preference to even a single load/store pair.
MaxStoresPerMemcpy = 0;
MaxStoresPerMemcpyOptSize = 0;
// The main memset sequence is a byte store followed by an MVC.
// Two STC or MV..I stores win over that, but the kind of fused stores
// generated by target-independent code don't when the byte value is
// variable. E.g. "STC <reg>;MHI <reg>,257;STH <reg>" is not better
// than "STC;MVC". Handle the choice in target-specific code instead.
MaxStoresPerMemset = 0;
MaxStoresPerMemsetOptSize = 0;
}
// Result type of a SETCC on operands of type VT: vector comparisons yield VT
// with its element type changed to integer, everything else yields i32.
EVT SystemZTargetLowering::getSetCCResultType(const DataLayout &DL,
                                              LLVMContext &, EVT VT) const {
  if (VT.isVector())
    return VT.changeVectorElementTypeToInteger();
  return MVT::i32;
}
// Decides, from the scalar element type of VT, whether a fused multiply-add
// is preferred: always for f32 and f64, for f128 only with vector
// enhancements facility 1, and never for non-simple or other types.
bool SystemZTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
  const EVT ScalarVT = VT.getScalarType();
  if (!ScalarVT.isSimple())
    return false;

  const auto Ty = ScalarVT.getSimpleVT().SimpleTy;
  if (Ty == MVT::f32 || Ty == MVT::f64)
    return true;
  if (Ty == MVT::f128)
    return Subtarget.hasVectorEnhancements1();
  return false;
}
// Return true if the constant can be generated with a vector instruction,
// such as VGM, VGMB or VREPI.
bool SystemZVectorConstantInfo::isVectorConstantLegal(
const SystemZSubtarget &Subtarget) {
const SystemZInstrInfo *TII =
static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
if (!Subtarget.hasVector() ||
(isFP128 && !Subtarget.hasVectorEnhancements1()))
return false;
// Try using VECTOR GENERATE BYTE MASK. This is the architecturally-
// preferred way of creating all-zero and all-one vectors so give it
// priority over other methods below.
unsigned Mask = 0;
unsigned I = 0;
for (; I < SystemZ::VectorBytes; ++I) {
uint64_t Byte = IntBits.lshr(I * 8).trunc(8).getZExtValue();
if (Byte == 0xff)
Mask |= 1ULL << I;
else if (Byte != 0)
break;
}
if (I == SystemZ::VectorBytes) {
Opcode = SystemZISD::BYTE_MASK;
OpVals.push_back(Mask);
VecVT = MVT::getVectorVT(MVT::getIntegerVT(8), 16);
return true;
}
if (SplatBitSize > 64)
return false;
auto tryValue = [&](uint64_t Value) -> bool {
// Try VECTOR REPLICATE IMMEDIATE
int64_t SignedValue = SignExtend64(Value, SplatBitSize);
if (isInt<16>(SignedValue)) {
OpVals.push_back(((unsigned) SignedValue));
Opcode = SystemZISD::REPLICATE;
VecVT = MVT::getVectorVT(MVT::getIntegerVT(SplatBitSize),
SystemZ::VectorBits / SplatBitSize);
return true;
}
// Try VECTOR GENERATE MASK
unsigned Start, End;
if (TII->isRxSBGMask(Value, SplatBitSize, Start, End)) {
// isRxSBGMask returns the bit numbers for a full 64-bit value, with 0
// denoting 1 << 63 and 63 denoting 1. Convert them to bit numbers for
// an SplatBitSize value, so that 0 denotes 1 << (SplatBitSize-1).
OpVals.push_back(Start - (64 - SplatBitSize));
OpVals.push_back(End - (64 - SplatBitSize));
Opcode = SystemZISD::ROTATE_MASK;
VecVT = MVT::getVectorVT(MVT::getIntegerVT(SplatBitSize),
SystemZ::VectorBits / SplatBitSize);
return true;
}
return false;
};
// First try assuming that any undefined bits above the highest set bit
// and below the lowest set bit are 1s. This increases the likelihood of
// being able to use a sign-extended element value in VECTOR REPLICATE
// IMMEDIATE or a wraparound mask in VECTOR GENERATE MASK.
uint64_t SplatBitsZ = SplatBits.getZExtValue();
uint64_t SplatUndefZ = SplatUndef.getZExtValue();
uint64_t Lower =
(SplatUndefZ & ((uint64_t(1) << findFirstSet(SplatBitsZ)) - 1));
uint64_t Upper =
(SplatUndefZ & ~((uint64_t(1) << findLastSet(SplatBitsZ)) - 1));
if (tryValue(SplatBitsZ | Upper | Lower))
return true;
// Now try assuming that any undefined bits between the first and
// last defined set bits are set. This increases the chances of
// using a non-wraparound mask.
uint64_t Middle = SplatUndefZ & ~Upper & ~Lower;
return tryValue(SplatBitsZ | Middle);
}
// Builds the constant description from a floating-point immediate: IntBits
// holds the bit pattern zero-extended to 128 bits, and SplatBits/SplatBitSize
// hold the narrowest repeating pattern (at least 8 bits wide) found by
// repeatedly halving the value. No bits are treated as undefined.
SystemZVectorConstantInfo::SystemZVectorConstantInfo(APFloat FPImm) {
  IntBits = FPImm.bitcastToAPInt().zextOrSelf(128);
  isFP128 = (&FPImm.getSemantics() == &APFloat::IEEEquad());

  // Shrink the pattern while its two halves are identical.
  SplatBits = FPImm.bitcastToAPInt();
  unsigned CurWidth = SplatBits.getBitWidth();
  while (CurWidth > 8) {
    const unsigned Half = CurWidth / 2;
    APInt Hi = SplatBits.lshr(Half).trunc(Half);
    APInt Lo = SplatBits.trunc(Half);
    // Stop as soon as the halves differ or would be narrower than a byte.
    const bool CanHalve = !(Hi != Lo || 8 > Half);
    if (!CanHalve)
      break;
    SplatBits = Hi;
    CurWidth = Half;
  }

  SplatUndef = 0;
  SplatBitSize = CurWidth;
}
// Builds the constant description from a constant BUILD_VECTOR node by
// querying its splat twice. The second query overwrites the SplatUndef and
// SplatBitSize values written by the first.
SystemZVectorConstantInfo::SystemZVectorConstantInfo(BuildVectorSDNode *BVN) {
  assert(BVN->isConstant() && "Expected a constant BUILD_VECTOR");

  const bool BigEndian = true;
  bool HasAnyUndefs;

  // IntBits: the splat of at least 128 bits.
  BVN->isConstantSplat(IntBits, SplatUndef, SplatBitSize, HasAnyUndefs,
                       /*MinSplatBits=*/128, BigEndian);

  // SplatBits: the splat of at least 8 bits.
  BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
                       /*MinSplatBits=*/8, BigEndian);
}
bool SystemZTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const {
// We can load zero using LZ?R and negative zero using LZ?R;LC?BR.
if (Imm.isZero() || Imm.isNegZero())
return true;
return SystemZVectorConstantInfo(Imm).isVectorConstantLegal(Subtarget);
}
// An integer-compare immediate is legal when it fits in 32 bits, either as a
// signed or as an unsigned value (CGFI or CLGFI can be used).
bool SystemZTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
  if (isInt<32>(Imm))
    return true;
  return isUInt<32>(Imm);
}
// An add immediate is legal when either Imm or its negation fits in an
// unsigned 32-bit field (ALGFI or SLGFI can be used).
bool SystemZTargetLowering::isLegalAddImmediate(int64_t Imm) const {
  // Negate in unsigned arithmetic: "-Imm" on int64_t is signed overflow
  // (undefined behaviour) when Imm is INT64_MIN. The unsigned negation wraps
  // and yields the same bit pattern for every other input.
  const uint64_t NegImm = 0 - static_cast<uint64_t>(Imm);
  return isUInt<32>(Imm) || isUInt<32>(NegImm);
}
// Misaligned accesses are always allowed and, when the caller supplies a Fast
// out-parameter, reported as fast: they should never be slower than the
// expanded version. The few cases that require alignment are checked for
// specifically elsewhere.
bool SystemZTargetLowering::allowsMisalignedMemoryAccesses(
    EVT VT, unsigned, unsigned, MachineMemOperand::Flags, bool *Fast) const {
  if (Fast != nullptr)
    *Fast = true;
  return true;
}
// Describes which addressing-mode features a memory access may use.
struct AddressingMode {
  // Whether a long displacement is supported.
  bool LongDisplacement;

  // Whether use of an index register is supported.
  bool IndexReg;

  AddressingMode(bool LongDispl, bool IdxReg)
      : LongDisplacement{LongDispl}, IndexReg{IdxReg} {}
};
// Return the desired addressing mode for a Load whose only use (in the same
// block) is a Store. Ty is the type being loaded; it is inspected only when
// HasVector is false.
static AddressingMode getLoadStoreAddrMode(bool HasVector,
                                           Type *Ty) {
  if (!HasVector) {
    // Without vector support only the MVC case (an i8 access) is special:
    // it gets neither a long displacement nor an index register.
    const bool UsesMVC = Ty->isIntegerTy(8);
    return AddressingMode(!UsesMVC/*LongDispl*/, !UsesMVC/*IdxReg*/);
  }

  // With vector support a Load->Store combination may become either an MVC
  // or vector operations, and allowing the vector addressing mode seems to
  // work best.
  return AddressingMode(false/*LongDispl*/, true/*IdxReg*/);
}
// Return the addressing mode which seems most desirable given an LLVM
// Instruction pointer. I is the memory-accessing instruction being examined
// and HasVector is the value the caller obtained from Subtarget.hasVector().
// The default, when no special case applies, is long displacement plus index
// register.
static AddressingMode
supportedAddressingMode(Instruction *I, bool HasVector) {
  // Memory intrinsics get neither a long displacement nor an index register.
  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
    switch (II->getIntrinsicID()) {
    default: break;
    case Intrinsic::memset:
    case Intrinsic::memmove:
    case Intrinsic::memcpy:
      return AddressingMode(false/*LongDispl*/, false/*IdxReg*/);
    }
  }

  if (isa<LoadInst>(I) && I->hasOneUse()) {
    // The user is dereferenced unconditionally below, so use cast<> (which
    // asserts on a type mismatch) rather than dyn_cast<> (which would return
    // null and then be dereferenced).
    auto *SingleUser = cast<Instruction>(*I->user_begin());
    if (SingleUser->getParent() == I->getParent()) {
      if (isa<ICmpInst>(SingleUser)) {
        if (auto *C = dyn_cast<ConstantInt>(SingleUser->getOperand(1)))
          if (C->getBitWidth() <= 64 &&
              (isInt<16>(C->getSExtValue()) || isUInt<16>(C->getZExtValue())))
            // Comparison of memory with 16 bit signed / unsigned immediate
            return AddressingMode(false/*LongDispl*/, false/*IdxReg*/);
      } else if (isa<StoreInst>(SingleUser))
        // Load->Store
        return getLoadStoreAddrMode(HasVector, I->getType());
    }
  } else if (auto *StoreI = dyn_cast<StoreInst>(I)) {
    if (auto *LoadI = dyn_cast<LoadInst>(StoreI->getValueOperand()))
      if (LoadI->hasOneUse() && LoadI->getParent() == I->getParent())
        // Load->Store
        return getLoadStoreAddrMode(HasVector, LoadI->getType());
  }

  if (HasVector && (isa<LoadInst>(I) || isa<StoreInst>(I))) {
    // * Use LDE instead of LE/LEY for z13 to avoid partial register
    //   dependencies (LDE only supports small offsets).
    // * Utilize the vector registers to hold floating point
    //   values (vector load / store instructions only support small
    //   offsets).
    Type *MemAccessTy = (isa<LoadInst>(I) ? I->getType() :
                         I->getOperand(0)->getType());
    bool IsFPAccess = MemAccessTy->isFloatingPointTy();
    bool IsVectorAccess = MemAccessTy->isVectorTy();

    // A store of an extracted vector element will be combined into a VSTE
    // type instruction.
    if (!IsVectorAccess && isa<StoreInst>(I)) {
      Value *DataOp = I->getOperand(0);
      if (isa<ExtractElementInst>(DataOp))
        IsVectorAccess = true;
    }

    // A load which gets inserted into a vector element will be combined into
    // a VLE type instruction.
    if (!IsVectorAccess && isa<LoadInst>(I) && I->hasOneUse()) {
      User *LoadUser = *I->user_begin();
      if (isa<InsertElementInst>(LoadUser))
        IsVectorAccess = true;
    }

    if (IsFPAccess || IsVectorAccess)
      return AddressingMode(false/*LongDispl*/, true/*IdxReg*/);
  }

  return AddressingMode(true/*LongDispl*/, true/*IdxReg*/);
}
bool SystemZTargetLowering::isLegalAddressingMode(const DataLayout &DL,
const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I) const {
// Punt on globals for now, although they can be used in limited
// RELATIVE LONG cases.
if (AM.BaseGV)
return false;
// Require a 20-bit signed offset.
if (!isInt<20>(AM.BaseOffs))
return false;
AddressingMode SupportedAM(true, true);
if (I != nullptr)
SupportedAM = supportedAddressingMode(I, Subtarget.hasVector());
if (!SupportedAM.LongDisplacement && !isUInt<12>(AM.BaseOffs))
return false;
if (!SupportedAM.IndexReg)
// No indexing allowed.
return AM.Scale == 0;
else
// Indexing is OK but no scale factor can be applied.
return AM.Scale == 0 || AM.Scale == 1;
}
// Truncation between IR types is free when both are integer types and the
// source is strictly wider than the destination.
bool SystemZTargetLowering::isTruncateFree(Type *FromType, Type *ToType) const {
  const bool BothIntegers = FromType->isIntegerTy() && ToType->isIntegerTy();
  if (!BothIntegers)
    return false;
  return FromType->getPrimitiveSizeInBits() > ToType->getPrimitiveSizeInBits();
}
// Truncation between value types is free when both are integer types and the
// source is strictly wider than the destination.
bool SystemZTargetLowering::isTruncateFree(EVT FromVT, EVT ToVT) const {
  const bool BothIntegers = FromVT.isInteger() && ToVT.isInteger();
  if (!BothIntegers)
    return false;
  return FromVT.getSizeInBits() > ToVT.getSizeInBits();
}
//===----------------------------------------------------------------------===//
// Inline asm support
//===----------------------------------------------------------------------===//
// Classify an inline asm constraint string. Single-letter constraints are
// sorted into register-class, memory and immediate groups; anything else
// (including unrecognized single letters) is delegated to the generic
// TargetLowering implementation.
// NOTE(review): this region is displayed as a unified diff. The line prefixed
// with '-' is the previous return value for the immediate constraints and the
// line prefixed with '+' is its replacement.
TargetLowering::ConstraintType
SystemZTargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'a': // Address register
case 'd': // Data register (equivalent to 'r')
case 'f': // Floating-point register
case 'h': // High-part register
case 'r': // General-purpose register
case 'v': // Vector register
return C_RegisterClass;
case 'Q': // Memory with base and unsigned 12-bit displacement
case 'R': // Likewise, plus an index
case 'S': // Memory with base and signed 20-bit displacement
case 'T': // Likewise, plus an index
case 'm': // Equivalent to 'T'.
return C_Memory;
case 'I': // Unsigned 8-bit constant
case 'J': // Unsigned 12-bit constant
case 'K': // Signed 16-bit constant
case 'L': // Signed 20-bit displacement (on all targets we support)
case 'M': // 0x7fffffff
- return C_Other;
+ return C_Immediate;
default:
break;
}
}
// Multi-letter or unrecognized constraint: use the generic classification.
return TargetLowering::getConstraintType(Constraint);
}
// Rate how well the operand described by info matches the single constraint
// letter that constraint points at. Register constraints score CW_Register
// when the operand type suits the register kind, immediate constraints score
// CW_Constant when the operand is a ConstantInt in range, and a recognized
// letter that does not fit scores CW_Invalid. Unrecognized letters are
// delegated to the generic TargetLowering implementation.
TargetLowering::ConstraintWeight SystemZTargetLowering::
getSingleConstraintMatchWeight(AsmOperandInfo &info,
                               const char *constraint) const {
  Value *Operand = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
  if (Operand == nullptr)
    return CW_Default;

  Type *OperandTy = Operand->getType();
  // Non-null only when the operand is an integer constant.
  auto *ConstOp = dyn_cast<ConstantInt>(Operand);

  switch (*constraint) {
  case 'a': // Address register
  case 'd': // Data register (equivalent to 'r')
  case 'h': // High-part register
  case 'r': // General-purpose register
    return OperandTy->isIntegerTy() ? CW_Register : CW_Invalid;

  case 'f': // Floating-point register
    return OperandTy->isFloatingPointTy() ? CW_Register : CW_Invalid;

  case 'v': // Vector register
    return ((OperandTy->isVectorTy() || OperandTy->isFloatingPointTy()) &&
            Subtarget.hasVector())
               ? CW_Register
               : CW_Invalid;

  case 'I': // Unsigned 8-bit constant
    return (ConstOp && isUInt<8>(ConstOp->getZExtValue())) ? CW_Constant
                                                           : CW_Invalid;

  case 'J': // Unsigned 12-bit constant
    return (ConstOp && isUInt<12>(ConstOp->getZExtValue())) ? CW_Constant
                                                            : CW_Invalid;

  case 'K': // Signed 16-bit constant
    return (ConstOp && isInt<16>(ConstOp->getSExtValue())) ? CW_Constant
                                                           : CW_Invalid;

  case 'L': // Signed 20-bit displacement (on all targets we support)
    return (ConstOp && isInt<20>(ConstOp->getSExtValue())) ? CW_Constant
                                                           : CW_Invalid;

  case 'M': // 0x7fffffff
    return (ConstOp && ConstOp->getZExtValue() == 0x7fffffff) ? CW_Constant
                                                              : CW_Invalid;

  default:
    return TargetLowering::getSingleConstraintMatchWeight(info, constraint);
  }
}
// Parse a "{tNNN}" register constraint for which the register type "t"
// has already been verified. RC is the register class associated with "t",
// Map maps 0-based register numbers to LLVM register numbers and Size is the
// number of entries in Map. Returns the (register, class) pair on success and
// (0, nullptr) if the number is missing, malformed, out of range, or maps to
// a zero entry.
static std::pair<unsigned, const TargetRegisterClass *>
parseRegisterNumber(StringRef Constraint, const TargetRegisterClass *RC,
                    const unsigned *Map, unsigned Size) {
  assert(*(Constraint.end()-1) == '}' && "Missing '}'");
  // Guard the index so that a too-short constraint is rejected rather than
  // read out of bounds when assertions are disabled. isdigit() requires a
  // value representable as unsigned char, so convert explicitly: passing a
  // negative plain char is undefined behavior.
  if (Constraint.size() > 2 &&
      isdigit(static_cast<unsigned char>(Constraint[2]))) {
    unsigned Index;
    // Parse the decimal digits between the type letter and the closing '}'.
    bool Failed =
      Constraint.slice(2, Constraint.size() - 1).getAsInteger(10, Index);
    if (!Failed && Index < Size && Map[Index])
      return std::make_pair(Map[Index], RC);
  }
  return std::make_pair(0U, nullptr);
}
std::pair<unsigned, const TargetRegisterClass *>
SystemZTargetLowering::getRegForInlineAsmConstraint(
const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
if (Constraint.size() == 1) {
// GCC Constraint Letters
switch (Constraint[0]) {
default: break;
case 'd': // Data register (equivalent to 'r')
case 'r': // General-purpose register
if (VT == MVT::i64)
return std::make_pair(0U, &SystemZ::GR64BitRegClass);
else if (VT == MVT::i128)
return std::make_pair(0U, &SystemZ::GR128BitRegClass);
return std::make_pair(0U, &SystemZ::GR32BitRegClass);
case 'a': // Address register
if (VT == MVT::i64)
return std::make_pair(0U, &SystemZ::ADDR64BitRegClass);
else if (VT == MVT::i128)
return std::make_pair(0U, &SystemZ::ADDR128BitRegClass);
return std::make_pair(0U, &SystemZ::ADDR32BitRegClass);
case 'h': // High-part register (an LLVM extension)
return std::make_pair(0U, &SystemZ::GRH32BitRegClass);
case 'f': // Floating-point register
if (VT == MVT::f64)
return std::make_pair(0U, &SystemZ::FP64BitRegClass);
else if (VT == MVT::f128)
return std::make_pair(0U, &SystemZ::FP128BitRegClass);
return std::make_pair(0U, &SystemZ::FP32BitRegClass);
case 'v': // Vector register
if (Subtarget.hasVector()) {
if (VT == MVT::f32)
return std::make_pair(0U, &SystemZ::VR32BitRegClass);
if (VT == MVT::f64)
return std::make_pair(0U, &SystemZ::VR64BitRegClass);
return std::make_pair(0U, &SystemZ::VR128BitRegClass);
}
break;
}
}
if (Constraint.size() > 0 && Constraint[0] == '{') {
// We need to override the default register parsing for GPRs and FPRs
// because the interpretation depends on VT. The internal names of
// the registers are also different from the external names
// (F0D and F0S instead of F0, etc.).
if (Constraint[1] == 'r') {
if (VT == MVT::i32)
return parseRegisterNumber(Constraint, &SystemZ::GR32BitRegClass,
SystemZMC::GR32Regs, 16);
if (VT == MVT::i128)
return parseRegisterNumber(Constraint, &SystemZ::GR128BitRegClass,
SystemZMC::GR128Regs, 16);
return parseRegisterNumber(Constraint, &SystemZ::GR64BitRegClass,
SystemZMC::GR64Regs, 16);
}
if (Constraint[1] == 'f') {
if (VT == MVT::f32)
return parseRegisterNumber(Constraint, &SystemZ::FP32BitRegClass,
SystemZMC::FP32Regs, 16);
if (VT == MVT::f128)
return parseRegisterNumber(Constraint, &SystemZ::FP128BitRegClass,
SystemZMC::FP128Regs, 16);
return parseRegisterNumber(Constraint, &SystemZ::FP64BitRegClass,
SystemZMC::FP64Regs, 16);
}
if (Constraint[1] == 'v') {
if (VT == MVT::f32)
return parseRegisterNumber(Constraint, &SystemZ::VR32BitRegClass,
SystemZMC::VR32Regs, 32);
if (VT == MVT::f64)
return parseRegisterNumber(Constraint, &SystemZ::VR64BitRegClass,
SystemZMC::VR64Regs, 32);
return parseRegisterNumber(Constraint, &SystemZ::VR128BitRegClass,
SystemZMC::VR128Regs, 32);
}
}
return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
// Lower the inline asm operand Op for the given constraint. For the
// single-letter immediate constraints a target constant is appended to Ops
// when Op is a constant within the constraint's range; otherwise nothing is
// appended. All other constraints are delegated to the generic TargetLowering
// implementation.
void SystemZTargetLowering::
LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
                             std::vector<SDValue> &Ops,
                             SelectionDAG &DAG) const {
  // Only support length 1 constraints for now.
  if (Constraint.length() == 1) {
    // Non-null only when the operand is a constant node.
    auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
    auto PushImm = [&](uint64_t Imm) {
      Ops.push_back(DAG.getTargetConstant(Imm, SDLoc(Op), Op.getValueType()));
    };

    switch (Constraint[0]) {
    case 'I': // Unsigned 8-bit constant
      if (ConstNode && isUInt<8>(ConstNode->getZExtValue()))
        PushImm(ConstNode->getZExtValue());
      return;

    case 'J': // Unsigned 12-bit constant
      if (ConstNode && isUInt<12>(ConstNode->getZExtValue()))
        PushImm(ConstNode->getZExtValue());
      return;

    case 'K': // Signed 16-bit constant
      if (ConstNode && isInt<16>(ConstNode->getSExtValue()))
        PushImm(ConstNode->getSExtValue());
      return;

    case 'L': // Signed 20-bit displacement (on all targets we support)
      if (ConstNode && isInt<20>(ConstNode->getSExtValue()))
        PushImm(ConstNode->getSExtValue());
      return;

    case 'M': // 0x7fffffff
      if (ConstNode && ConstNode->getZExtValue() == 0x7fffffff)
        PushImm(ConstNode->getZExtValue());
      return;
    }
  }
  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
//===----------------------------------------------------------------------===//
// Calling conventions
//===----------------------------------------------------------------------===//
#include "SystemZGenCallingConv.inc"
// Return the zero-terminated list of scratch registers (R0D, R1D and R14D).
// The same list is returned for every calling convention.
const MCPhysReg *SystemZTargetLowering::getScratchRegisters(
  CallingConv::ID) const {
  static const MCPhysReg ScratchList[] = {
    SystemZ::R0D,
    SystemZ::R1D,
    SystemZ::R14D,
    0 // terminator
  };
  return ScratchList;
}
// A truncation is allowed in front of a tail call exactly when
// isTruncateFree() considers it free.
bool SystemZTargetLowering::allowTruncateForTailCall(Type *FromType,
                                                     Type *ToType) const {
  const bool TruncIsFree = isTruncateFree(FromType, ToType);
  return TruncIsFree;
}
// A call may be emitted as a tail call when the call instruction itself is
// marked as one.
bool SystemZTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  const bool MarkedAsTail = CI->isTailCall();
  return MarkedAsTail;
}
// We do not yet support 128-bit single-element vector types.  If the user
// attempts to use such types as function argument or return type, prefer
// to error out instead of emitting code violating the ABI.
// The mismatch is detected as ArgVT being a vector type while VT is not.
static void VerifyVectorType(MVT VT, EVT ArgVT) {
  if (!ArgVT.isVector())
    return;
  if (VT.isVector())
    return;
  report_fatal_error("Unsupported vector argument or return type");
}
static void VerifyVectorTypes(const SmallVectorImpl<ISD::InputArg> &Ins) {
for (unsigned i = 0; i < Ins.size(); ++i)
VerifyVectorType(Ins[i].VT, Ins[i].ArgVT);
}
static void VerifyVectorTypes(const SmallVectorImpl<ISD::OutputArg> &Outs) {
for (unsigned i = 0; i < Outs.size(); ++i)
VerifyVectorType(Outs[i].VT, Outs[i].ArgVT);
}
// Value has been passed to us in the location described by VA and therefore
// has type VA.getLocVT(). Convert it to VA.getValVT() and return the result.
// The Chain parameter is accepted but not used by this implementation.
static SDValue convertLocVTToValVT(SelectionDAG &DAG, const SDLoc &DL,
                                   CCValAssign &VA, SDValue Chain,
                                   SDValue Value) {
  const CCValAssign::LocInfo Info = VA.getLocInfo();

  // If the argument has been promoted from a smaller type, insert an
  // assertion to capture this.
  switch (Info) {
  case CCValAssign::SExt:
    Value = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Value,
                        DAG.getValueType(VA.getValVT()));
    break;
  case CCValAssign::ZExt:
    Value = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Value,
                        DAG.getValueType(VA.getValVT()));
    break;
  default:
    break;
  }

  // Extended in the location: truncate back to the value type.
  if (VA.isExtInLoc())
    return DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Value);

  if (Info == CCValAssign::BCvt) {
    // If this is a short vector argument loaded from the stack,
    // extend from i64 to full vector size and then bitcast.
    assert(VA.getLocVT() == MVT::i64);
    assert(VA.getValVT().isVector());
    Value = DAG.getBuildVector(MVT::v2i64, DL, {Value, DAG.getUNDEF(MVT::i64)});
    return DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Value);
  }

  assert(Info == CCValAssign::Full && "Unsupported getLocInfo");
  return Value;
}
// Value is a value of type VA.getValVT() that we need to copy into
// the location described by VA.  Return a copy of Value converted to
// the location type, VA.getLocVT().  The caller is responsible for handling
// indirect values.
static SDValue convertValVTToLocVT(SelectionDAG &DAG, const SDLoc &DL,
                                   CCValAssign &VA, SDValue Value) {
  // Pick the extension opcode first; the non-extending cases return directly.
  unsigned ExtendOpcode;
  switch (VA.getLocInfo()) {
  case CCValAssign::Full:
    return Value;

  case CCValAssign::BCvt: {
    // If this is a short vector argument to be stored to the stack,
    // bitcast to v2i64 and then extract first element.
    assert(VA.getLocVT() == MVT::i64);
    assert(VA.getValVT().isVector());
    Value = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Value);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VA.getLocVT(), Value,
                       DAG.getConstant(0, DL, MVT::i32));
  }

  case CCValAssign::SExt:
    ExtendOpcode = ISD::SIGN_EXTEND;
    break;
  case CCValAssign::ZExt:
    ExtendOpcode = ISD::ZERO_EXTEND;
    break;
  case CCValAssign::AExt:
    ExtendOpcode = ISD::ANY_EXTEND;
    break;

  default:
    llvm_unreachable("Unhandled getLocInfo()");
  }
  return DAG.getNode(ExtendOpcode, DL, VA.getLocVT(), Value);
}
// Lower the incoming (formal) arguments described by Ins. Each argument is
// assigned a location using CC_SystemZ and then materialized either as a copy
// from a live-in virtual register or as a load from a fixed stack object.
// Indirect arguments are loaded through the pointer that was passed. One
// value per entry of Ins is appended to InVals. For variadic functions the
// number of fixed GPR/FPR arguments and the relevant frame indices are
// recorded in the function info, and the FPR argument registers not used by
// fixed arguments are stored to their save slots. Returns the resulting chain.
SDValue SystemZTargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
SystemZMachineFunctionInfo *FuncInfo =
MF.getInfo<SystemZMachineFunctionInfo>();
auto *TFL =
static_cast<const SystemZFrameLowering *>(Subtarget.getFrameLowering());
EVT PtrVT = getPointerTy(DAG.getDataLayout());
// Detect unsupported vector argument types.
if (Subtarget.hasVector())
VerifyVectorTypes(Ins);
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
SystemZCCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
CCInfo.AnalyzeFormalArguments(Ins, CC_SystemZ);
// Count the register arguments by kind; the counts are consumed by the
// varargs handling below.
unsigned NumFixedGPRs = 0;
unsigned NumFixedFPRs = 0;
for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
SDValue ArgValue;
CCValAssign &VA = ArgLocs[I];
EVT LocVT = VA.getLocVT();
if (VA.isRegLoc()) {
// Arguments passed in registers
const TargetRegisterClass *RC;
switch (LocVT.getSimpleVT().SimpleTy) {
default:
// Integers smaller than i64 should be promoted to i64.
llvm_unreachable("Unexpected argument type");
case MVT::i32:
NumFixedGPRs += 1;
RC = &SystemZ::GR32BitRegClass;
break;
case MVT::i64:
NumFixedGPRs += 1;
RC = &SystemZ::GR64BitRegClass;
break;
case MVT::f32:
NumFixedFPRs += 1;
RC = &SystemZ::FP32BitRegClass;
break;
case MVT::f64:
NumFixedFPRs += 1;
RC = &SystemZ::FP64BitRegClass;
break;
// Vector arguments are not counted as GPRs or FPRs.
case MVT::v16i8:
case MVT::v8i16:
case MVT::v4i32:
case MVT::v2i64:
case MVT::v4f32:
case MVT::v2f64:
RC = &SystemZ::VR128BitRegClass;
break;
}
// Mark the physical register live-in and read it through a fresh
// virtual register.
unsigned VReg = MRI.createVirtualRegister(RC);
MRI.addLiveIn(VA.getLocReg(), VReg);
ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, LocVT);
} else {
assert(VA.isMemLoc() && "Argument not register or memory");
// Create the frame index object for this incoming parameter.
int FI = MFI.CreateFixedObject(LocVT.getSizeInBits() / 8,
VA.getLocMemOffset(), true);
// Create the SelectionDAG nodes corresponding to a load
// from this parameter. Unpromoted ints and floats are
// passed as right-justified 8-byte values.
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
// Skip the first 4 bytes of the slot for 4-byte values.
if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32)
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
DAG.getIntPtrConstant(4, DL));
ArgValue = DAG.getLoad(LocVT, DL, Chain, FIN,
MachinePointerInfo::getFixedStack(MF, FI));
}
// Convert the value of the argument register into the value that's
// being passed.
if (VA.getLocInfo() == CCValAssign::Indirect) {
// ArgValue is the address of the argument; load the value from it.
InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue,
MachinePointerInfo()));
// If the original argument was split (e.g. i128), we need
// to load all parts of it here (using the same address).
unsigned ArgIndex = Ins[I].OrigArgIndex;
assert (Ins[I].PartOffset == 0);
// Consume the following entries that belong to the same original
// argument, advancing the outer loop index as we go.
while (I + 1 != E && Ins[I + 1].OrigArgIndex == ArgIndex) {
CCValAssign &PartVA = ArgLocs[I + 1];
unsigned PartOffset = Ins[I + 1].PartOffset;
SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, ArgValue,
DAG.getIntPtrConstant(PartOffset, DL));
InVals.push_back(DAG.getLoad(PartVA.getValVT(), DL, Chain, Address,
MachinePointerInfo()));
++I;
}
} else
InVals.push_back(convertLocVTToValVT(DAG, DL, VA, Chain, ArgValue));
}
if (IsVarArg) {
// Save the number of non-varargs registers for later use by va_start, etc.
FuncInfo->setVarArgsFirstGPR(NumFixedGPRs);
FuncInfo->setVarArgsFirstFPR(NumFixedFPRs);
// Likewise the address (in the form of a frame index) of where the
// first stack vararg would be. The 1-byte size here is arbitrary.
int64_t StackSize = CCInfo.getNextStackOffset();
FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
// ...and a similar frame index for the caller-allocated save area
// that will be used to store the incoming registers.
int64_t RegSaveOffset = TFL->getOffsetOfLocalArea();
unsigned RegSaveIndex = MFI.CreateFixedObject(1, RegSaveOffset, true);
FuncInfo->setRegSaveFrameIndex(RegSaveIndex);
// Store the FPR varargs in the reserved frame slots. (We store the
// GPRs as part of the prologue.)
if (NumFixedFPRs < SystemZ::NumArgFPRs) {
// Only the entries from NumFixedFPRs onwards are filled in and used.
SDValue MemOps[SystemZ::NumArgFPRs];
for (unsigned I = NumFixedFPRs; I < SystemZ::NumArgFPRs; ++I) {
unsigned Offset = TFL->getRegSpillOffset(SystemZ::ArgFPRs[I]);
int FI = MFI.CreateFixedObject(8, RegSaveOffset + Offset, true);
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
unsigned VReg = MF.addLiveIn(SystemZ::ArgFPRs[I],
&SystemZ::FP64BitRegClass);
SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f64);
// Chain each store on the output chain of its own register copy.
MemOps[I] = DAG.getStore(ArgValue.getValue(1), DL, ArgValue, FIN,
MachinePointerInfo::getFixedStack(MF, FI));
}
// Join the stores, which are independent of one another.
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
makeArrayRef(&MemOps[NumFixedFPRs],
SystemZ::NumArgFPRs-NumFixedFPRs));
}
}
return Chain;
}
static bool canUseSiblingCall(const CCState &ArgCCInfo,
SmallVectorImpl<CCValAssign> &ArgLocs,
SmallVectorImpl<ISD::OutputArg> &Outs) {
// Punt if there are any indirect or stack arguments, or if the call
// needs the callee-saved argument register R6, or if the call uses
// the callee-saved register arguments SwiftSelf and SwiftError.
for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
CCValAssign &VA = ArgLocs[I];
if (VA.getLocInfo() == CCValAssign::Indirect)
return false;
if (!VA.isRegLoc())
return false;
unsigned Reg = VA.getLocReg();
if (Reg == SystemZ::R6H || Reg == SystemZ::R6L || Reg == SystemZ::R6D)
return false;
if (Outs[I].Flags.isSwiftSelf() || Outs[I].Flags.isSwiftError())
return false;
}
return true;
}
// Lower the call described by CLI. Argument locations are assigned with
// CC_SystemZ; each argument is then spilled to a stack temporary (indirect
// arguments), queued for a register copy, or stored to the outgoing argument
// area relative to R15D. The call is emitted as SystemZISD::SIBCALL when it
// is still a tail call after the canUseSiblingCall() check, and as
// SystemZISD::CALL otherwise. For a non-tail call the returned values are
// copied out of their registers (per RetCC_SystemZ) and appended to InVals.
// Returns the resulting chain, or the SIBCALL node for a tail call.
SDValue
SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
SDLoc &DL = CLI.DL;
SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
// Bound by reference: clearing it below also updates CLI.IsTailCall.
bool &IsTailCall = CLI.IsTailCall;
CallingConv::ID CallConv = CLI.CallConv;
bool IsVarArg = CLI.IsVarArg;
MachineFunction &MF = DAG.getMachineFunction();
EVT PtrVT = getPointerTy(MF.getDataLayout());
// Detect unsupported vector argument and return types.
if (Subtarget.hasVector()) {
VerifyVectorTypes(Outs);
VerifyVectorTypes(Ins);
}
// Analyze the operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
SystemZCCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
ArgCCInfo.AnalyzeCallOperands(Outs, CC_SystemZ);
// We don't support GuaranteedTailCallOpt, only automatically-detected
// sibling calls.
if (IsTailCall && !canUseSiblingCall(ArgCCInfo, ArgLocs, Outs))
IsTailCall = false;
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = ArgCCInfo.getNextStackOffset();
// Mark the start of the call.
if (!IsTailCall)
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
// Copy argument values to their designated locations.
SmallVector<std::pair<unsigned, SDValue>, 9> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
// Created lazily by the first argument that is passed on the stack.
SDValue StackPtr;
for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
CCValAssign &VA = ArgLocs[I];
SDValue ArgValue = OutVals[I];
if (VA.getLocInfo() == CCValAssign::Indirect) {
// Store the argument in a stack slot and pass its address.
SDValue SpillSlot = DAG.CreateStackTemporary(Outs[I].ArgVT);
int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
MemOpChains.push_back(
DAG.getStore(Chain, DL, ArgValue, SpillSlot,
MachinePointerInfo::getFixedStack(MF, FI)));
// If the original argument was split (e.g. i128), we need
// to store all parts of it here (and pass just one address).
unsigned ArgIndex = Outs[I].OrigArgIndex;
assert (Outs[I].PartOffset == 0);
// Consume the following entries that belong to the same original
// argument, advancing the outer loop index as we go.
while (I + 1 != E && Outs[I + 1].OrigArgIndex == ArgIndex) {
SDValue PartValue = OutVals[I + 1];
unsigned PartOffset = Outs[I + 1].PartOffset;
SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, SpillSlot,
DAG.getIntPtrConstant(PartOffset, DL));
MemOpChains.push_back(
DAG.getStore(Chain, DL, PartValue, Address,
MachinePointerInfo::getFixedStack(MF, FI)));
++I;
}
// From here on the argument being passed is the slot's address.
ArgValue = SpillSlot;
} else
ArgValue = convertValVTToLocVT(DAG, DL, VA, ArgValue);
if (VA.isRegLoc())
// Queue up the argument copies and emit them at the end.
RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue));
else {
assert(VA.isMemLoc() && "Argument not register or memory");
// Work out the address of the stack slot. Unpromoted ints and
// floats are passed as right-justified 8-byte values.
if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, DL, SystemZ::R15D, PtrVT);
unsigned Offset = SystemZMC::CallFrameSize + VA.getLocMemOffset();
// Skip the first 4 bytes of the slot for 4-byte values.
if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32)
Offset += 4;
SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr,
DAG.getIntPtrConstant(Offset, DL));
// Emit the store.
MemOpChains.push_back(
DAG.getStore(Chain, DL, ArgValue, Address, MachinePointerInfo()));
}
}
// Join the stores, which are independent of one another.
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
// Accept direct calls by converting symbolic call addresses to the
// associated Target* opcodes. Force %r1 to be used for indirect
// tail calls.
SDValue Glue;
if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, PtrVT);
Callee = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Callee);
} else if (auto *E = dyn_cast<ExternalSymbolSDNode>(Callee)) {
Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT);
Callee = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Callee);
} else if (IsTailCall) {
Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R1D, Callee, Glue);
Glue = Chain.getValue(1);
Callee = DAG.getRegister(SystemZ::R1D, Callee.getValueType());
}
// Build a sequence of copy-to-reg nodes, chained and glued together.
for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I) {
Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[I].first,
RegsToPass[I].second, Glue);
Glue = Chain.getValue(1);
}
// The first call operand is the chain and the second is the target address.
SmallVector<SDValue, 8> Ops;
Ops.push_back(Chain);
Ops.push_back(Callee);
// Add argument registers to the end of the list so that they are
// known live into the call.
for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I)
Ops.push_back(DAG.getRegister(RegsToPass[I].first,
RegsToPass[I].second.getValueType()));
// Add a register mask operand representing the call-preserved registers.
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
// Glue the call to the argument copies, if any.
if (Glue.getNode())
Ops.push_back(Glue);
// Emit the call.
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
// A sibling call returns here: no CALLSEQ_END is emitted and no results
// are copied out.
if (IsTailCall)
return DAG.getNode(SystemZISD::SIBCALL, DL, NodeTys, Ops);
Chain = DAG.getNode(SystemZISD::CALL, DL, NodeTys, Ops);
Glue = Chain.getValue(1);
// Mark the end of the call, which is glued to the call itself.
Chain = DAG.getCALLSEQ_END(Chain,
DAG.getConstant(NumBytes, DL, PtrVT, true),
DAG.getConstant(0, DL, PtrVT, true),
Glue, DL);
Glue = Chain.getValue(1);
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RetLocs;
CCState RetCCInfo(CallConv, IsVarArg, MF, RetLocs, *DAG.getContext());
RetCCInfo.AnalyzeCallResult(Ins, RetCC_SystemZ);
// Copy all of the result registers out of their specified physreg.
for (unsigned I = 0, E = RetLocs.size(); I != E; ++I) {
CCValAssign &VA = RetLocs[I];
// Copy the value out, gluing the copy to the end of the call sequence.
SDValue RetValue = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(),
VA.getLocVT(), Glue);
Chain = RetValue.getValue(1);
Glue = RetValue.getValue(2);
// Convert the value of the return register into the value that's
// being returned.
InVals.push_back(convertLocVTToValVT(DAG, DL, VA, Chain, RetValue));
}
return Chain;
}
// Return true if the values described by Outs can be returned according to
// RetCC_SystemZ. Any i128 return value is rejected up front.
bool SystemZTargetLowering::
CanLowerReturn(CallingConv::ID CallConv,
               MachineFunction &MF, bool isVarArg,
               const SmallVectorImpl<ISD::OutputArg> &Outs,
               LLVMContext &Context) const {
  // Detect unsupported vector return types.
  if (Subtarget.hasVector())
    VerifyVectorTypes(Outs);

  // Special case that we cannot easily detect in RetCC_SystemZ since
  // i128 is not a legal type.
  for (const ISD::OutputArg &Ret : Outs) {
    if (Ret.ArgVT == MVT::i128)
      return false;
  }

  SmallVector<CCValAssign, 16> Locations;
  CCState ReturnState(CallConv, isVarArg, MF, Locations, Context);
  return ReturnState.CheckReturn(Outs, RetCC_SystemZ);
}
// Lower a function return. Each value in OutVals is converted to its
// location type and copied into the register assigned by RetCC_SystemZ; the
// copies are chained and glued together and a SystemZISD::RET_FLAG node that
// takes the registers as operands is returned.
SDValue
SystemZTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                   bool IsVarArg,
                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
                                   const SmallVectorImpl<SDValue> &OutVals,
                                   const SDLoc &DL, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();

  // Detect unsupported vector return types.
  if (Subtarget.hasVector())
    VerifyVectorTypes(Outs);

  // Assign locations to each returned value.
  SmallVector<CCValAssign, 16> Locations;
  CCState ReturnState(CallConv, IsVarArg, MF, Locations, *DAG.getContext());
  ReturnState.AnalyzeReturn(Outs, RetCC_SystemZ);

  // Quick exit for void returns
  if (Locations.empty())
    return DAG.getNode(SystemZISD::RET_FLAG, DL, MVT::Other, Chain);

  // Operand 0 is the chain; it is overwritten after the loop, once the final
  // chain is known.
  SmallVector<SDValue, 4> Operands;
  Operands.push_back(Chain);

  // Copy the result values into the output registers.
  SDValue Glue;
  const unsigned NumLocs = Locations.size();
  for (unsigned Idx = 0; Idx != NumLocs; ++Idx) {
    CCValAssign &Loc = Locations[Idx];

    // Make the return register live on exit.
    assert(Loc.isRegLoc() && "Can only return in registers!");

    // Promote the value as required.
    SDValue Promoted = convertValVTToLocVT(DAG, DL, Loc, OutVals[Idx]);

    // Chain and glue the copies together.
    const unsigned RetReg = Loc.getLocReg();
    Chain = DAG.getCopyToReg(Chain, DL, RetReg, Promoted, Glue);
    Glue = Chain.getValue(1);
    Operands.push_back(DAG.getRegister(RetReg, Loc.getLocVT()));
  }

  // Update chain and glue.
  Operands[0] = Chain;
  if (Glue.getNode())
    Operands.push_back(Glue);

  return DAG.getNode(SystemZISD::RET_FLAG, DL, MVT::Other, Operands);
}
// Return true if Op is an intrinsic node with chain that returns the CC value
// as its only (other) argument.  Provide the associated SystemZISD opcode and
// the mask of valid CC values if so.  The intrinsic ID is read from operand 1
// of Op.  Opcode and CCValid are left untouched when false is returned.
static bool isIntrinsicWithCCAndChain(SDValue Op, unsigned &Opcode,
                                      unsigned &CCValid) {
  const unsigned IntrinsicId =
      cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();

  switch (IntrinsicId) {
  case Intrinsic::s390_tend:
    Opcode = SystemZISD::TEND;
    CCValid = SystemZ::CCMASK_TEND;
    return true;

  // Both transaction-begin variants share the same CC mask, set below.
  case Intrinsic::s390_tbegin:
    Opcode = SystemZISD::TBEGIN;
    break;
  case Intrinsic::s390_tbegin_nofloat:
    Opcode = SystemZISD::TBEGIN_NOFLOAT;
    break;

  default:
    return false;
  }

  CCValid = SystemZ::CCMASK_TBEGIN;
  return true;
}
// Classify a chainless intrinsic node that produces a CC value.  Operand 0
// of Op is read as the intrinsic ID.  When the ID is one of the intrinsics
// listed below, store the associated SystemZISD opcode in Opcode and the
// mask of valid CC values in CCValid and return true.  Otherwise return
// false and leave both outputs untouched.
static bool isIntrinsicWithCC(SDValue Op, unsigned &Opcode, unsigned &CCValid) {
  // Record a successful match in the two output parameters.
  auto Accept = [&Opcode, &CCValid](unsigned NodeOpcode, unsigned ValidMask) {
    Opcode = NodeOpcode;
    CCValid = ValidMask;
    return true;
  };

  const unsigned IntrinsicID =
      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  switch (IntrinsicID) {
  case Intrinsic::s390_vpkshs:
  case Intrinsic::s390_vpksfs:
  case Intrinsic::s390_vpksgs:
    return Accept(SystemZISD::PACKS_CC, SystemZ::CCMASK_VCMP);

  case Intrinsic::s390_vpklshs:
  case Intrinsic::s390_vpklsfs:
  case Intrinsic::s390_vpklsgs:
    return Accept(SystemZISD::PACKLS_CC, SystemZ::CCMASK_VCMP);

  case Intrinsic::s390_vceqbs:
  case Intrinsic::s390_vceqhs:
  case Intrinsic::s390_vceqfs:
  case Intrinsic::s390_vceqgs:
    return Accept(SystemZISD::VICMPES, SystemZ::CCMASK_VCMP);

  case Intrinsic::s390_vchbs:
  case Intrinsic::s390_vchhs:
  case Intrinsic::s390_vchfs:
  case Intrinsic::s390_vchgs:
    return Accept(SystemZISD::VICMPHS, SystemZ::CCMASK_VCMP);

  case Intrinsic::s390_vchlbs:
  case Intrinsic::s390_vchlhs:
  case Intrinsic::s390_vchlfs:
  case Intrinsic::s390_vchlgs:
    return Accept(SystemZISD::VICMPHLS, SystemZ::CCMASK_VCMP);

  case Intrinsic::s390_vtm:
    return Accept(SystemZISD::VTM, SystemZ::CCMASK_VCMP);

  case Intrinsic::s390_vfaebs:
  case Intrinsic::s390_vfaehs:
  case Intrinsic::s390_vfaefs:
    return Accept(SystemZISD::VFAE_CC, SystemZ::CCMASK_ANY);

  case Intrinsic::s390_vfaezbs:
  case Intrinsic::s390_vfaezhs:
  case Intrinsic::s390_vfaezfs:
    return Accept(SystemZISD::VFAEZ_CC, SystemZ::CCMASK_ANY);

  case Intrinsic::s390_vfeebs:
  case Intrinsic::s390_vfeehs:
  case Intrinsic::s390_vfeefs:
    return Accept(SystemZISD::VFEE_CC, SystemZ::CCMASK_ANY);

  case Intrinsic::s390_vfeezbs:
  case Intrinsic::s390_vfeezhs:
  case Intrinsic::s390_vfeezfs:
    return Accept(SystemZISD::VFEEZ_CC, SystemZ::CCMASK_ANY);

  case Intrinsic::s390_vfenebs:
  case Intrinsic::s390_vfenehs:
  case Intrinsic::s390_vfenefs:
    return Accept(SystemZISD::VFENE_CC, SystemZ::CCMASK_ANY);

  case Intrinsic::s390_vfenezbs:
  case Intrinsic::s390_vfenezhs:
  case Intrinsic::s390_vfenezfs:
    return Accept(SystemZISD::VFENEZ_CC, SystemZ::CCMASK_ANY);

  case Intrinsic::s390_vistrbs:
  case Intrinsic::s390_vistrhs:
  case Intrinsic::s390_vistrfs:
    return Accept(SystemZISD::VISTR_CC,
                  SystemZ::CCMASK_0 | SystemZ::CCMASK_3);

  case Intrinsic::s390_vstrcbs:
  case Intrinsic::s390_vstrchs:
  case Intrinsic::s390_vstrcfs:
    return Accept(SystemZISD::VSTRC_CC, SystemZ::CCMASK_ANY);

  case Intrinsic::s390_vstrczbs:
  case Intrinsic::s390_vstrczhs:
  case Intrinsic::s390_vstrczfs:
    return Accept(SystemZISD::VSTRCZ_CC, SystemZ::CCMASK_ANY);

  case Intrinsic::s390_vstrsb:
  case Intrinsic::s390_vstrsh:
  case Intrinsic::s390_vstrsf:
    return Accept(SystemZISD::VSTRS_CC, SystemZ::CCMASK_ANY);

  case Intrinsic::s390_vstrszb:
  case Intrinsic::s390_vstrszh:
  case Intrinsic::s390_vstrszf:
    return Accept(SystemZISD::VSTRSZ_CC, SystemZ::CCMASK_ANY);

  case Intrinsic::s390_vfcedbs:
  case Intrinsic::s390_vfcesbs:
    return Accept(SystemZISD::VFCMPES, SystemZ::CCMASK_VCMP);

  case Intrinsic::s390_vfchdbs:
  case Intrinsic::s390_vfchsbs:
    return Accept(SystemZISD::VFCMPHS, SystemZ::CCMASK_VCMP);

  case Intrinsic::s390_vfchedbs:
  case Intrinsic::s390_vfchesbs:
    return Accept(SystemZISD::VFCMPHES, SystemZ::CCMASK_VCMP);

  case Intrinsic::s390_vftcidb:
  case Intrinsic::s390_vftcisb:
    return Accept(SystemZISD::VFTCI, SystemZ::CCMASK_VCMP);

  case Intrinsic::s390_tdc:
    return Accept(SystemZISD::TDC, SystemZ::CCMASK_TDC);

  default:
    return false;
  }
}
// Rebuild the chained intrinsic Op as a node with opcode Opcode whose results
// are an explicit i32 CC value followed by a chain.  The intrinsic-ID operand
// (operand 1) is dropped; operand 0 and all operands from index 2 onwards are
// carried over in order.  Every user of the old node's result 1 is redirected
// to result 1 of the new node, which is returned.
static SDNode *emitIntrinsicWithCCAndChain(SelectionDAG &DAG, SDValue Op,
                                           unsigned Opcode) {
  const unsigned OperandCount = Op.getNumOperands();
  SmallVector<SDValue, 6> NewOperands;
  NewOperands.reserve(OperandCount - 1);

  // Operand 0 first, then everything that follows the intrinsic ID.
  NewOperands.push_back(Op.getOperand(0));
  for (unsigned Idx = 2; Idx < OperandCount; ++Idx)
    NewOperands.push_back(Op.getOperand(Idx));

  assert(Op->getNumValues() == 2 && "Expected only CC result and chain");
  SDVTList ResultTypes = DAG.getVTList(MVT::i32, MVT::Other);
  SDNode *Replacement =
      DAG.getNode(Opcode, SDLoc(Op), ResultTypes, NewOperands).getNode();

  // Splice the new chain result in place of the old one.
  DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 1),
                                SDValue(Replacement, 1));
  return Replacement;
}
// Rebuild the chainless intrinsic Op as a node with opcode Opcode.  The new
// node keeps Op's result type list and receives every operand of Op except
// the intrinsic ID held in operand 0.
static SDNode *emitIntrinsicWithCC(SelectionDAG &DAG, SDValue Op,
                                   unsigned Opcode) {
  const unsigned OperandCount = Op.getNumOperands();
  SmallVector<SDValue, 6> NewOperands;
  NewOperands.reserve(OperandCount - 1);
  unsigned Idx = 1; // Skip the intrinsic ID.
  while (Idx < OperandCount) {
    NewOperands.push_back(Op.getOperand(Idx));
    ++Idx;
  }
  return DAG.getNode(Opcode, SDLoc(Op), Op->getVTList(), NewOperands)
      .getNode();
}
// Translate the ISD condition code CC into the CC mask to branch on when the
// comparison is true.  The plain (SETxx) and ordered (SETOxx) forms map to
// the same mask; the unordered (SETUxx) forms additionally include
// CCMASK_CMP_UO.  For integer comparisons CCMASK_CMP_UO therefore marks an
// unsigned comparison, while for floating-point comparisons it keeps its
// usual "unordered" meaning.
static unsigned CCMaskForCondCode(ISD::CondCode CC) {
  const unsigned Unordered = SystemZ::CCMASK_CMP_UO;
  switch (CC) {
  case ISD::SETEQ:
  case ISD::SETOEQ:
    return SystemZ::CCMASK_CMP_EQ;
  case ISD::SETUEQ:
    return Unordered | SystemZ::CCMASK_CMP_EQ;

  case ISD::SETNE:
  case ISD::SETONE:
    return SystemZ::CCMASK_CMP_NE;
  case ISD::SETUNE:
    return Unordered | SystemZ::CCMASK_CMP_NE;

  case ISD::SETGT:
  case ISD::SETOGT:
    return SystemZ::CCMASK_CMP_GT;
  case ISD::SETUGT:
    return Unordered | SystemZ::CCMASK_CMP_GT;

  case ISD::SETGE:
  case ISD::SETOGE:
    return SystemZ::CCMASK_CMP_GE;
  case ISD::SETUGE:
    return Unordered | SystemZ::CCMASK_CMP_GE;

  case ISD::SETLT:
  case ISD::SETOLT:
    return SystemZ::CCMASK_CMP_LT;
  case ISD::SETULT:
    return Unordered | SystemZ::CCMASK_CMP_LT;

  case ISD::SETLE:
  case ISD::SETOLE:
    return SystemZ::CCMASK_CMP_LE;
  case ISD::SETULE:
    return Unordered | SystemZ::CCMASK_CMP_LE;

  case ISD::SETO:
    return SystemZ::CCMASK_CMP_O;
  case ISD::SETUO:
    return Unordered;

  default:
    llvm_unreachable("Invalid integer condition!");
  }
}
// Turn a comparison against the constants -1 or 1 into the equivalent
// comparison against zero where possible: (x > -1) becomes (x >= 0),
// (x <= -1) becomes (x < 0), (x < 1) becomes (x <= 0) and (x >= 1) becomes
// (x > 0).  Nothing is done for unsigned-only comparisons or when C.Op1 is
// not a constant.
static void adjustZeroCmp(SelectionDAG &DAG, const SDLoc &DL, Comparison &C) {
  if (C.ICmpType == SystemZICMP::UnsignedOnly)
    return;

  auto *RHSConst = dyn_cast<ConstantSDNode>(C.Op1.getNode());
  if (!RHSConst)
    return;

  const int64_t RHS = RHSConst->getSExtValue();
  const bool MinusOneForm = RHS == -1 &&
                            (C.CCMask == SystemZ::CCMASK_CMP_GT ||
                             C.CCMask == SystemZ::CCMASK_CMP_LE);
  const bool PlusOneForm = RHS == 1 &&
                           (C.CCMask == SystemZ::CCMASK_CMP_LT ||
                            C.CCMask == SystemZ::CCMASK_CMP_GE);
  if (!MinusOneForm && !PlusOneForm)
    return;

  // Toggling the EQ bit moves the boundary by one, compensating for the
  // constant being replaced by zero.
  C.CCMask ^= SystemZ::CCMASK_CMP_EQ;
  C.Op1 = DAG.getConstant(0, DL, C.Op1.getValueType());
}
// If a comparison described by C is suitable for CLI(Y), CHHSI or CLHHSI,
// adjust the operands as necessary.
//
// The rewrite applies only when C.Op0 is a single-use, 8- or 16-bit sign- or
// zero-extending load and C.Op1 is a constant within the range of the
// unextended value.  In that case C.Op0 is replaced by an i32 extending load
// of the required kind (if it is not one already), C.Op1 by an i32 constant
// (if needed), and C.ICmpType / C.CCMask may be updated.  Every early return
// happens before C is modified.
static void adjustSubwordCmp(SelectionDAG &DAG, const SDLoc &DL,
Comparison &C) {
// For us to make any changes, it must be a comparison between a single-use
// load and a constant.
if (!C.Op0.hasOneUse() ||
C.Op0.getOpcode() != ISD::LOAD ||
C.Op1.getOpcode() != ISD::Constant)
return;
// We must have an 8- or 16-bit load.
auto *Load = cast<LoadSDNode>(C.Op0);
unsigned NumBits = Load->getMemoryVT().getStoreSizeInBits();
if (NumBits != 8 && NumBits != 16)
return;
// The load must be an extending one and the constant must be within the
// range of the unextended value.
auto *ConstOp1 = cast<ConstantSDNode>(C.Op1);
uint64_t Value = ConstOp1->getZExtValue();
// Mask covers the NumBits low bits, i.e. the unextended value.
uint64_t Mask = (1 << NumBits) - 1;
if (Load->getExtensionType() == ISD::SEXTLOAD) {
// Make sure that ConstOp1 is in range of C.Op0.
// Adding 2^(NumBits-1) biases the signed range [-2^(NumBits-1),
// 2^(NumBits-1)-1] onto [0, Mask], so one unsigned compare checks both ends.
int64_t SignedValue = ConstOp1->getSExtValue();
if (uint64_t(SignedValue) + (uint64_t(1) << (NumBits - 1)) > Mask)
return;
if (C.ICmpType != SystemZICMP::SignedOnly) {
// Unsigned comparison between two sign-extended values is equivalent
// to unsigned comparison between two zero-extended values.
Value &= Mask;
} else if (NumBits == 8) {
// Try to treat the comparison as unsigned, so that we can use CLI.
// Adjust CCMask and Value as necessary.
if (Value == 0 && C.CCMask == SystemZ::CCMASK_CMP_LT)
// Test whether the high bit of the byte is set.
Value = 127, C.CCMask = SystemZ::CCMASK_CMP_GT;
else if (Value == 0 && C.CCMask == SystemZ::CCMASK_CMP_GE)
// Test whether the high bit of the byte is clear.
Value = 128, C.CCMask = SystemZ::CCMASK_CMP_LT;
else
// No instruction exists for this combination.
return;
C.ICmpType = SystemZICMP::UnsignedOnly;
}
// A signed-only 16-bit comparison falls through here unchanged.
} else if (Load->getExtensionType() == ISD::ZEXTLOAD) {
if (Value > Mask)
return;
// If the constant is in range, we can use any comparison.
C.ICmpType = SystemZICMP::Any;
} else
return;
// Make sure that the first operand is an i32 of the right extension type.
// Only a signed-only comparison keeps a sign-extending load; everything else
// uses a zero-extending one.
ISD::LoadExtType ExtType = (C.ICmpType == SystemZICMP::SignedOnly ?
ISD::SEXTLOAD :
ISD::ZEXTLOAD);
if (C.Op0.getValueType() != MVT::i32 ||
Load->getExtensionType() != ExtType) {
C.Op0 = DAG.getExtLoad(ExtType, SDLoc(Load), MVT::i32, Load->getChain(),
Load->getBasePtr(), Load->getPointerInfo(),
Load->getMemoryVT(), Load->getAlignment(),
Load->getMemOperand()->getFlags());
// Update the chain uses.
DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), C.Op0.getValue(1));
}
// Make sure that the second operand is an i32 with the right value.
if (C.Op1.getValueType() != MVT::i32 ||
Value != ConstOp1->getZExtValue())
C.Op1 = DAG.getConstant(Value, DL, MVT::i32);
}
// Decide whether Op is a load that can be used directly as the memory
// operand of an integer comparison of type ICmpType.  Non-extending loads
// always qualify; sign-extending loads qualify unless the comparison must be
// unsigned; zero-extending loads qualify unless it must be signed.  Byte
// loads, other extension kinds and non-load values never qualify.
static bool isNaturalMemoryOperand(SDValue Op, unsigned ICmpType) {
  auto *Load = dyn_cast<LoadSDNode>(Op.getNode());
  if (!Load)
    return false;

  // There are no instructions to compare a register with a memory byte.
  if (Load->getMemoryVT() == MVT::i8)
    return false;

  const ISD::LoadExtType Ext = Load->getExtensionType();
  if (Ext == ISD::NON_EXTLOAD)
    return true;
  if (Ext == ISD::SEXTLOAD)
    return ICmpType != SystemZICMP::UnsignedOnly;
  if (Ext == ISD::ZEXTLOAD)
    return ICmpType != SystemZICMP::SignedOnly;
  return false;
}
// Heuristic deciding whether comparison C would be better with its two
// operands exchanged.  The checks are ordered; the first one that applies
// determines the answer.
static bool shouldSwapCmpOperands(const Comparison &C) {
  // f128 comparisons have no memory forms, so there is nothing to gain.
  if (C.Op0.getValueType() == MVT::f128)
    return false;

  // A floating-point constant stays second: zero can use LOAD TEST and other
  // constants make a natural memory operand.
  if (isa<ConstantFPSDNode>(C.Op1))
    return false;

  // Comparisons with integer zero are left alone because there are many
  // ways to optimize them later.
  auto *RHSConst = dyn_cast<ConstantSDNode>(C.Op1);
  if (RHSConst && RHSConst->getZExtValue() == 0)
    return false;

  // A single-use natural memory operand that is already second is where we
  // want it; several comparisons have memory forms.
  if (isNaturalMemoryOperand(C.Op1, C.ICmpType) && C.Op1.hasOneUse())
    return false;

  // A single-use load in first position should generally move to second
  // position, unless a memory-immediate form (such as CHHSI) can be used.
  if (isNaturalMemoryOperand(C.Op0, C.ICmpType) && C.Op0.hasOneUse()) {
    if (!RHSConst)
      return true;
    // Unsigned memory-immediate instructions take 16-bit unsigned
    // immediates, signed ones take 16-bit signed immediates.
    const bool HasImmediateForm =
        (C.ICmpType != SystemZICMP::SignedOnly &&
         isUInt<16>(RHSConst->getZExtValue())) ||
        (C.ICmpType != SystemZICMP::UnsignedOnly &&
         isInt<16>(RHSConst->getSExtValue()));
    return !HasImmediateForm;
  }

  // Try to promote the use of CGFR and CLGFR by moving an extended first
  // operand into second position.
  const unsigned LHSOpcode = C.Op0.getOpcode();
  const bool SignedAllowed = C.ICmpType != SystemZICMP::UnsignedOnly;
  const bool UnsignedAllowed = C.ICmpType != SystemZICMP::SignedOnly;
  if (SignedAllowed && LHSOpcode == ISD::SIGN_EXTEND)
    return true;
  if (UnsignedAllowed && LHSOpcode == ISD::ZERO_EXTEND)
    return true;
  // An AND with 0xffffffff is treated like a zero extension.
  return UnsignedAllowed && LHSOpcode == ISD::AND &&
         C.Op0.getOperand(1).getOpcode() == ISD::Constant &&
         cast<ConstantSDNode>(C.Op0.getOperand(1))->getZExtValue() ==
             0xffffffff;
}
// Produce the CC mask that results from exchanging the operands of a
// comparison: the LT and GT bits trade places, while the EQ and UO bits are
// kept as they are.  All other bits are dropped.
static unsigned reverseCCMask(unsigned CCMask) {
  unsigned Reversed =
      CCMask & (SystemZ::CCMASK_CMP_EQ | SystemZ::CCMASK_CMP_UO);
  if (CCMask & SystemZ::CCMASK_CMP_GT)
    Reversed |= SystemZ::CCMASK_CMP_LT;
  if (CCMask & SystemZ::CCMASK_CMP_LT)
    Reversed |= SystemZ::CCMASK_CMP_GT;
  return Reversed;
}
// For an equality or inequality test between X and Y, look for an existing
// SUB node computing X - Y or Y - X among the users of X's node.  If one is
// found, compare the result of that subtraction against zero instead, so
// that the subtraction and the comparison can share work.
static void adjustForSubtraction(SelectionDAG &DAG, const SDLoc &DL,
                                 Comparison &C) {
  if (C.CCMask != SystemZ::CCMASK_CMP_EQ &&
      C.CCMask != SystemZ::CCMASK_CMP_NE)
    return;

  for (SDNode *User : C.Op0->uses()) {
    if (User->getOpcode() != ISD::SUB)
      continue;
    const bool SameOrder =
        User->getOperand(0) == C.Op0 && User->getOperand(1) == C.Op1;
    const bool SwappedOrder =
        User->getOperand(0) == C.Op1 && User->getOperand(1) == C.Op0;
    if (!SameOrder && !SwappedOrder)
      continue;
    // The first matching subtraction wins.
    C.Op0 = SDValue(User, 0);
    C.Op1 = DAG.getConstant(0, DL, User->getValueType(0));
    return;
  }
}
// For a comparison of a floating-point value against zero, look for an FNEG
// among the users of the value's node.  If there is one, compare the negated
// value instead and reverse the CC mask to compensate.  The negation can
// then set CC itself, avoiding separate LOAD AND TEST and
// LOAD (NEGATIVE/COMPLEMENT) instructions.
static void adjustForFNeg(Comparison &C) {
  auto *FPConst = dyn_cast<ConstantFPSDNode>(C.Op1);
  if (!FPConst || !FPConst->isZero())
    return;

  for (SDNode *User : C.Op0->uses()) {
    if (User->getOpcode() != ISD::FNEG)
      continue;
    // Use the first negation found.
    C.Op0 = SDValue(User, 0);
    C.CCMask = reverseCCMask(C.CCMask);
    return;
  }
}
// Recognise a comparison of (shl X, 32) with 0 on i64 where X is also
// sign-extended from i32 (via SIGN_EXTEND_INREG).  In that case compare the
// sign-extended value instead, which can be tested using LTGFR.
//
// This case is important because InstCombine transforms a comparison
// with (sext (trunc X)) into a comparison with (shl X, 32).
static void adjustForLTGFR(Comparison &C) {
  // The comparison must be between an i64 SHL and the constant zero.
  if (C.Op0.getOpcode() != ISD::SHL ||
      C.Op0.getValueType() != MVT::i64 ||
      C.Op1.getOpcode() != ISD::Constant ||
      cast<ConstantSDNode>(C.Op1)->getZExtValue() != 0)
    return;

  // The shift amount must be the constant 32.
  auto *ShiftAmount = dyn_cast<ConstantSDNode>(C.Op0.getOperand(1));
  if (!ShiftAmount || ShiftAmount->getZExtValue() != 32)
    return;

  // See whether X has any SIGN_EXTEND_INREG-from-i32 users; take the first.
  SDValue Shifted = C.Op0.getOperand(0);
  for (SDNode *User : Shifted->uses()) {
    if (User->getOpcode() == ISD::SIGN_EXTEND_INREG &&
        cast<VTSDNode>(User->getOperand(1))->getVT() == MVT::i32) {
      C.Op0 = SDValue(User, 0);
      return;
    }
  }
}
// When C compares (truncate (load ...)) with zero and the load's memory type
// is no wider than the truncated type, compare the untruncated loaded value
// with zero instead, provided the load's extension kind is compatible with
// the comparison type.  This exposes more opportunities to reuse CC.
static void adjustICmpTruncate(SelectionDAG &DAG, const SDLoc &DL,
                               Comparison &C) {
  if (C.Op0.getOpcode() != ISD::TRUNCATE ||
      C.Op0.getOperand(0).getOpcode() != ISD::LOAD ||
      C.Op1.getOpcode() != ISD::Constant ||
      cast<ConstantSDNode>(C.Op1)->getZExtValue() != 0)
    return;

  auto *Load = cast<LoadSDNode>(C.Op0.getOperand(0));
  if (Load->getMemoryVT().getStoreSizeInBits() > C.Op0.getValueSizeInBits())
    return;

  const unsigned ExtKind = Load->getExtensionType();
  const bool ZeroExtOK =
      ExtKind == ISD::ZEXTLOAD && C.ICmpType != SystemZICMP::SignedOnly;
  const bool SignExtOK =
      ExtKind == ISD::SEXTLOAD && C.ICmpType != SystemZICMP::UnsignedOnly;
  if (!ZeroExtOK && !SignExtOK)
    return;

  // Compare the wide value; the zero constant must match its type.
  C.Op0 = C.Op0.getOperand(0);
  C.Op1 = DAG.getConstant(0, DL, C.Op0.getValueType());
}
// Check whether operand 1 of shift node N is a constant that is smaller than
// the bit width of N's value.  If so, store the amount in ShiftVal and
// return true; otherwise return false and leave ShiftVal untouched.
static bool isSimpleShift(SDValue N, unsigned &ShiftVal) {
  if (auto *AmountNode = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
    const uint64_t Amount = AmountNode->getZExtValue();
    if (Amount < N.getValueSizeInBits()) {
      ShiftVal = Amount;
      return true;
    }
  }
  return false;
}
// Check whether an AND with Mask is suitable for a TEST UNDER MASK
// instruction and whether the CC value is descriptive enough to handle
// a comparison of type Opcode between the AND result and CmpVal.
// CCMask says which comparison result is being tested and BitSize is
// the number of bits in the operands. If TEST UNDER MASK can be used,
// return the corresponding CC mask, otherwise return 0.
//
// ICmpType is the SystemZICMP kind of the comparison; the ordered
// (LT/LE/GT/GE) cases below are only considered when it is not SignedOnly.
// Note that BitSize is not referenced in the body of this function.
static unsigned getTestUnderMaskCond(unsigned BitSize, unsigned CCMask,
uint64_t Mask, uint64_t CmpVal,
unsigned ICmpType) {
assert(Mask != 0 && "ANDs with zero should have been removed by now");
// Check whether the mask is suitable for TMHH, TMHL, TMLH or TMLL.
if (!SystemZ::isImmLL(Mask) && !SystemZ::isImmLH(Mask) &&
!SystemZ::isImmHL(Mask) && !SystemZ::isImmHH(Mask))
return 0;
// Work out the masks for the lowest and highest bits.
// High and Low are single-bit values for the highest and lowest set bits of
// Mask respectively.
unsigned HighShift = 63 - countLeadingZeros(Mask);
uint64_t High = uint64_t(1) << HighShift;
uint64_t Low = uint64_t(1) << countTrailingZeros(Mask);
// Signed ordered comparisons are effectively unsigned if the sign
// bit is dropped.
bool EffectivelyUnsigned = (ICmpType != SystemZICMP::SignedOnly);
// Check for equality comparisons with 0, or the equivalent.
if (CmpVal == 0) {
if (CCMask == SystemZ::CCMASK_CMP_EQ)
return SystemZ::CCMASK_TM_ALL_0;
if (CCMask == SystemZ::CCMASK_CMP_NE)
return SystemZ::CCMASK_TM_SOME_1;
}
// A nonzero masked value is at least Low, so "< CmpVal" with
// 0 < CmpVal <= Low is the same as "== 0".
if (EffectivelyUnsigned && CmpVal > 0 && CmpVal <= Low) {
if (CCMask == SystemZ::CCMASK_CMP_LT)
return SystemZ::CCMASK_TM_ALL_0;
if (CCMask == SystemZ::CCMASK_CMP_GE)
return SystemZ::CCMASK_TM_SOME_1;
}
// Likewise "<= CmpVal" with CmpVal < Low is the same as "== 0".
if (EffectivelyUnsigned && CmpVal < Low) {
if (CCMask == SystemZ::CCMASK_CMP_LE)
return SystemZ::CCMASK_TM_ALL_0;
if (CCMask == SystemZ::CCMASK_CMP_GT)
return SystemZ::CCMASK_TM_SOME_1;
}
// Check for equality comparisons with the mask, or the equivalent.
if (CmpVal == Mask) {
if (CCMask == SystemZ::CCMASK_CMP_EQ)
return SystemZ::CCMASK_TM_ALL_1;
if (CCMask == SystemZ::CCMASK_CMP_NE)
return SystemZ::CCMASK_TM_SOME_0;
}
// The largest masked value below Mask is Mask - Low, so "> CmpVal" with
// Mask - Low <= CmpVal < Mask is the same as "== Mask".
if (EffectivelyUnsigned && CmpVal >= Mask - Low && CmpVal < Mask) {
if (CCMask == SystemZ::CCMASK_CMP_GT)
return SystemZ::CCMASK_TM_ALL_1;
if (CCMask == SystemZ::CCMASK_CMP_LE)
return SystemZ::CCMASK_TM_SOME_0;
}
// Likewise ">= CmpVal" with Mask - Low < CmpVal <= Mask.
if (EffectivelyUnsigned && CmpVal > Mask - Low && CmpVal <= Mask) {
if (CCMask == SystemZ::CCMASK_CMP_GE)
return SystemZ::CCMASK_TM_ALL_1;
if (CCMask == SystemZ::CCMASK_CMP_LT)
return SystemZ::CCMASK_TM_SOME_0;
}
// Check for ordered comparisons with the top bit.
// Masked values with the top bit clear are at most Mask - High and those
// with it set are at least High, so thresholds in between test that bit.
if (EffectivelyUnsigned && CmpVal >= Mask - High && CmpVal < High) {
if (CCMask == SystemZ::CCMASK_CMP_LE)
return SystemZ::CCMASK_TM_MSB_0;
if (CCMask == SystemZ::CCMASK_CMP_GT)
return SystemZ::CCMASK_TM_MSB_1;
}
if (EffectivelyUnsigned && CmpVal > Mask - High && CmpVal <= High) {
if (CCMask == SystemZ::CCMASK_CMP_LT)
return SystemZ::CCMASK_TM_MSB_0;
if (CCMask == SystemZ::CCMASK_CMP_GE)
return SystemZ::CCMASK_TM_MSB_1;
}
// If there are just two bits, we can do equality checks for Low and High
// as well.
if (Mask == Low + High) {
if (CCMask == SystemZ::CCMASK_CMP_EQ && CmpVal == Low)
return SystemZ::CCMASK_TM_MIXED_MSB_0;
if (CCMask == SystemZ::CCMASK_CMP_NE && CmpVal == Low)
return SystemZ::CCMASK_TM_MIXED_MSB_0 ^ SystemZ::CCMASK_ANY;
if (CCMask == SystemZ::CCMASK_CMP_EQ && CmpVal == High)
return SystemZ::CCMASK_TM_MIXED_MSB_1;
if (CCMask == SystemZ::CCMASK_CMP_NE && CmpVal == High)
return SystemZ::CCMASK_TM_MIXED_MSB_1 ^ SystemZ::CCMASK_ANY;
}
// Looks like we've exhausted our options.
return 0;
}
// See whether C can be implemented as a TEST UNDER MASK instruction.
// Update the arguments with the TM version if so.
//
// Two shapes are handled: (and X, constant-mask) compared with a constant,
// and an unsigned ordered i64 comparison with a constant, which is treated
// as a test of the bits at and above the constant's lowest set bit.  All
// candidate changes are built in the local copy NewC and only written back
// to C once getTestUnderMaskCond has accepted the combination.
static void adjustForTestUnderMask(SelectionDAG &DAG, const SDLoc &DL,
Comparison &C) {
// Check that we have a comparison with a constant.
auto *ConstOp1 = dyn_cast<ConstantSDNode>(C.Op1);
if (!ConstOp1)
return;
uint64_t CmpVal = ConstOp1->getZExtValue();
// Check whether the nonconstant input is an AND with a constant mask.
Comparison NewC(C);
uint64_t MaskVal;
// Mask stays null when the mask is synthesized rather than taken from an AND.
ConstantSDNode *Mask = nullptr;
if (C.Op0.getOpcode() == ISD::AND) {
NewC.Op0 = C.Op0.getOperand(0);
NewC.Op1 = C.Op0.getOperand(1);
Mask = dyn_cast<ConstantSDNode>(NewC.Op1);
if (!Mask)
return;
MaskVal = Mask->getZExtValue();
} else {
// There is no instruction to compare with a 64-bit immediate
// so use TMHH instead if possible. We need an unsigned ordered
// comparison with an i64 immediate.
if (NewC.Op0.getValueType() != MVT::i64 ||
NewC.CCMask == SystemZ::CCMASK_CMP_EQ ||
NewC.CCMask == SystemZ::CCMASK_CMP_NE ||
NewC.ICmpType == SystemZICMP::SignedOnly)
return;
// Convert LE and GT comparisons into LT and GE.
if (NewC.CCMask == SystemZ::CCMASK_CMP_LE ||
NewC.CCMask == SystemZ::CCMASK_CMP_GT) {
// CmpVal + 1 would wrap around for the all-ones value.
if (CmpVal == uint64_t(-1))
return;
CmpVal += 1;
NewC.CCMask ^= SystemZ::CCMASK_CMP_EQ;
}
// If the low N bits of Op1 are zero then the low N bits of Op0 can
// be masked off without changing the result.
// (CmpVal & -CmpVal) isolates the lowest set bit of CmpVal; negating it
// yields a mask of that bit and every bit above it (0 if CmpVal is 0).
MaskVal = -(CmpVal & -CmpVal);
NewC.ICmpType = SystemZICMP::UnsignedOnly;
}
if (!MaskVal)
return;
// Check whether the combination of mask, comparison value and comparison
// type are suitable.
unsigned BitSize = NewC.Op0.getValueSizeInBits();
unsigned NewCCMask, ShiftVal;
// First try to look through a constant left shift of the tested value by
// shifting the mask and comparison value right instead; this requires that
// the mask does not vanish and that CmpVal has no bits below the shift.
if (NewC.ICmpType != SystemZICMP::SignedOnly &&
NewC.Op0.getOpcode() == ISD::SHL &&
isSimpleShift(NewC.Op0, ShiftVal) &&
(MaskVal >> ShiftVal != 0) &&
((CmpVal >> ShiftVal) << ShiftVal) == CmpVal &&
(NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask,
MaskVal >> ShiftVal,
CmpVal >> ShiftVal,
SystemZICMP::Any))) {
NewC.Op0 = NewC.Op0.getOperand(0);
MaskVal >>= ShiftVal;
// Otherwise try to look through a constant logical right shift by shifting
// the mask and comparison value left; CmpVal must survive the round trip.
} else if (NewC.ICmpType != SystemZICMP::SignedOnly &&
NewC.Op0.getOpcode() == ISD::SRL &&
isSimpleShift(NewC.Op0, ShiftVal) &&
(MaskVal << ShiftVal != 0) &&
((CmpVal << ShiftVal) >> ShiftVal) == CmpVal &&
(NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask,
MaskVal << ShiftVal,
CmpVal << ShiftVal,
SystemZICMP::UnsignedOnly))) {
NewC.Op0 = NewC.Op0.getOperand(0);
MaskVal <<= ShiftVal;
} else {
// No shift to fold: test the value and mask as they are.
NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask, MaskVal, CmpVal,
NewC.ICmpType);
if (!NewCCMask)
return;
}
// Go ahead and make the change.
C.Opcode = SystemZISD::TM;
C.Op0 = NewC.Op0;
// Reuse the original mask node when its value is still the one we need.
if (Mask && Mask->getZExtValue() == MaskVal)
C.Op1 = SDValue(Mask, 0);
else
C.Op1 = DAG.getConstant(MaskVal, DL, C.Op0.getValueType());
C.CCValid = SystemZ::CCMASK_TM;
C.CCMask = NewCCMask;
}
// Strip an AND with a constant mask from the first comparison operand when
// the mask cannot change the value, i.e. when every bit the mask would clear
// is already known to be zero in the AND's first operand.  Such redundant
// ANDs sometimes appear due to the generic BRCOND expansion.
static void adjustForRedundantAnd(SelectionDAG &DAG, const SDLoc &DL,
                                  Comparison &C) {
  if (C.Op0.getOpcode() != ISD::AND)
    return;
  auto *MaskNode = dyn_cast<ConstantSDNode>(C.Op0.getOperand(1));
  if (!MaskNode)
    return;

  SDValue Masked = C.Op0.getOperand(0);
  KnownBits Known = DAG.computeKnownBits(Masked);
  const uint64_t PossiblySet = (~Known.Zero).getZExtValue();
  const uint64_t ClearedByMask = ~MaskNode->getZExtValue();
  // The AND is redundant only if it clears no bit that might be set.
  if ((PossiblySet & ClearedByMask) == 0)
    C.Op0 = Masked;
}
// Build a Comparison that tests the condition-code result of intrinsic node
// Call against the constant CC using comparison code Cond.  Opcode is the
// SystemZISD opcode for the intrinsic and CCValid the set of CC values it
// can produce.  In the CC mask, bit 3 stands for CC==0 and bit 0 for CC==3;
// a constant CC above 3 can never match, which makes each comparison either
// always true or always false.  The resulting mask is restricted to CCValid.
static Comparison getIntrinsicCmp(SelectionDAG &DAG, unsigned Opcode,
                                  SDValue Call, unsigned CCValid, uint64_t CC,
                                  ISD::CondCode Cond) {
  Comparison Result(Call, SDValue());
  Result.Opcode = Opcode;
  Result.CCValid = CCValid;

  switch (Cond) {
  case ISD::SETEQ:
    // Exactly the bit for CC; always false for CC>3.
    Result.CCMask = CC < 4 ? 1 << (3 - CC) : 0;
    break;
  case ISD::SETNE:
    // Everything except the bit for CC; always true for CC>3.
    Result.CCMask = CC < 4 ? ~(1 << (3 - CC)) : -1;
    break;
  case ISD::SETLT:
  case ISD::SETULT:
    // Bits above the one for CC (none of the valid ones for CC==0);
    // always true for CC>3.
    Result.CCMask = CC < 4 ? ~0U << (4 - CC) : -1;
    break;
  case ISD::SETGE:
  case ISD::SETUGE:
    // The inverse of the "less than" mask.
    Result.CCMask = CC < 4 ? ~(~0U << (4 - CC)) : 0;
    break;
  case ISD::SETLE:
  case ISD::SETULE:
    // The bit for CC and all bits above it; always true for CC>3.
    Result.CCMask = CC < 4 ? ~0U << (3 - CC) : -1;
    break;
  case ISD::SETGT:
  case ISD::SETUGT:
    // The inverse of the "less than or equal" mask.
    Result.CCMask = CC < 4 ? ~(~0U << (3 - CC)) : 0;
    break;
  default:
    llvm_unreachable("Unexpected integer comparison type");
  }

  Result.CCMask &= CCValid;
  return Result;
}
// Work out how to implement a comparison of kind Cond between CmpOp0 and
// CmpOp1 and describe it as a Comparison.  Comparisons of a CC-producing
// intrinsic result with a constant are turned into a direct test of CC.
// Otherwise a floating-point or integer compare is chosen, the integer
// operand adjustments are applied in a fixed order, the operands may be
// swapped, and finally a TEST UNDER MASK form is tried.
static Comparison getCmp(SelectionDAG &DAG, SDValue CmpOp0, SDValue CmpOp1,
                         ISD::CondCode Cond, const SDLoc &DL) {
  if (CmpOp1.getOpcode() == ISD::Constant) {
    const uint64_t RHSValue = cast<ConstantSDNode>(CmpOp1)->getZExtValue();
    unsigned Opcode, CCValid;

    // Chained intrinsic: CC is result 0 and must have exactly one use.
    const bool IsChainedCC =
        CmpOp0.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
        CmpOp0.getResNo() == 0 && CmpOp0->hasNUsesOfValue(1, 0) &&
        isIntrinsicWithCCAndChain(CmpOp0, Opcode, CCValid);
    if (IsChainedCC)
      return getIntrinsicCmp(DAG, Opcode, CmpOp0, CCValid, RHSValue, Cond);

    // Chainless intrinsic: CC is the last result.
    const bool IsChainlessCC =
        CmpOp0.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
        CmpOp0.getResNo() == CmpOp0->getNumValues() - 1 &&
        isIntrinsicWithCC(CmpOp0, Opcode, CCValid);
    if (IsChainlessCC)
      return getIntrinsicCmp(DAG, Opcode, CmpOp0, CCValid, RHSValue, Cond);
  }

  Comparison C(CmpOp0, CmpOp1);
  C.CCMask = CCMaskForCondCode(Cond);

  if (!C.Op0.getValueType().isFloatingPoint()) {
    C.CCValid = SystemZ::CCMASK_ICMP;
    C.Opcode = SystemZISD::ICMP;

    // Choose the type of comparison.  Equality and inequality tests work
    // with either signedness, as do comparisons whose operands both have a
    // clear sign bit; leave those open so that isel can pick the best form.
    // Otherwise the UO bit set by CCMaskForCondCode marks unsigned tests.
    const bool IsEqualityTest = C.CCMask == SystemZ::CCMASK_CMP_EQ ||
                                C.CCMask == SystemZ::CCMASK_CMP_NE;
    if (IsEqualityTest ||
        (DAG.SignBitIsZero(C.Op0) && DAG.SignBitIsZero(C.Op1)))
      C.ICmpType = SystemZICMP::Any;
    else if (C.CCMask & SystemZ::CCMASK_CMP_UO)
      C.ICmpType = SystemZICMP::UnsignedOnly;
    else
      C.ICmpType = SystemZICMP::SignedOnly;

    // The UO bit has served its purpose for integer comparisons.
    C.CCMask &= ~SystemZ::CCMASK_CMP_UO;

    adjustForRedundantAnd(DAG, DL, C);
    adjustZeroCmp(DAG, DL, C);
    adjustSubwordCmp(DAG, DL, C);
    adjustForSubtraction(DAG, DL, C);
    adjustForLTGFR(C);
    adjustICmpTruncate(DAG, DL, C);
  } else {
    C.CCValid = SystemZ::CCMASK_FCMP;
    C.Opcode = SystemZISD::FCMP;
    adjustForFNeg(C);
  }

  if (shouldSwapCmpOperands(C)) {
    std::swap(C.Op0, C.Op1);
    C.CCMask = reverseCCMask(C.CCMask);
  }

  adjustForTestUnderMask(DAG, DL, C);
  return C;
}
// Create the DAG node for the comparison described by C and return the value
// carrying its CC result.  A Comparison without a second operand denotes an
// intrinsic whose CC result is used directly; ICMP and TM nodes receive an
// extra i32 constant operand (the comparison type and a "register only" flag
// respectively); any other opcode is emitted as a plain two-operand node.
static SDValue emitCmp(SelectionDAG &DAG, const SDLoc &DL, Comparison &C) {
  if (!C.Op1.getNode()) {
    const unsigned CallOpcode = C.Op0.getOpcode();
    if (CallOpcode == ISD::INTRINSIC_W_CHAIN) {
      // CC is result 0 of the rebuilt chained node.
      SDNode *Node = emitIntrinsicWithCCAndChain(DAG, C.Op0, C.Opcode);
      return SDValue(Node, 0);
    }
    if (CallOpcode == ISD::INTRINSIC_WO_CHAIN) {
      // CC is the last result of the rebuilt chainless node.
      SDNode *Node = emitIntrinsicWithCC(DAG, C.Op0, C.Opcode);
      return SDValue(Node, Node->getNumValues() - 1);
    }
    llvm_unreachable("Invalid comparison operands");
  }

  switch (C.Opcode) {
  case SystemZISD::ICMP:
    return DAG.getNode(SystemZISD::ICMP, DL, MVT::i32, C.Op0, C.Op1,
                       DAG.getConstant(C.ICmpType, DL, MVT::i32));
  case SystemZISD::TM: {
    // The flag is set when exactly one of the two "mixed" CC values is
    // being tested for.
    const bool WantsMixedMSB0 = bool(C.CCMask & SystemZ::CCMASK_TM_MIXED_MSB_0);
    const bool WantsMixedMSB1 = bool(C.CCMask & SystemZ::CCMASK_TM_MIXED_MSB_1);
    bool RegisterOnly = (WantsMixedMSB0 != WantsMixedMSB1);
    return DAG.getNode(SystemZISD::TM, DL, MVT::i32, C.Op0, C.Op1,
                       DAG.getConstant(RegisterOnly, DL, MVT::i32));
  }
  default:
    return DAG.getNode(C.Opcode, DL, MVT::i32, C.Op0, C.Op1);
  }
}
// Implement a 32-bit *MUL_LOHI operation with a single 64-bit multiply.
// Both operands are widened to i64 using the extension opcode Extend and
// multiplied; Hi receives bits 63..32 of the product and Lo bits 31..0,
// each truncated to i32.
static void lowerMUL_LOHI32(SelectionDAG &DAG, const SDLoc &DL, unsigned Extend,
                            SDValue Op0, SDValue Op1, SDValue &Hi,
                            SDValue &Lo) {
  SDValue Wide0 = DAG.getNode(Extend, DL, MVT::i64, Op0);
  SDValue Wide1 = DAG.getNode(Extend, DL, MVT::i64, Op1);
  SDValue Product = DAG.getNode(ISD::MUL, DL, MVT::i64, Wide0, Wide1);

  SDValue ShiftAmount = DAG.getConstant(32, DL, MVT::i64);
  SDValue UpperHalf =
      DAG.getNode(ISD::SRL, DL, MVT::i64, Product, ShiftAmount);
  Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperHalf);
  Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Product);
}
// Emit the GR128 operation Opcode on the VT operands Op0 and Op1 as a single
// untyped node, then split its result into the two VT halves of the register
// pair: Even receives the even-register subreg and Odd the odd-register
// subreg.
static void lowerGR128Binary(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
                             unsigned Opcode, SDValue Op0, SDValue Op1,
                             SDValue &Even, SDValue &Odd) {
  SDValue Pair = DAG.getNode(Opcode, DL, MVT::Untyped, Op0, Op1);
  // The subreg indices depend on whether the halves are 32 bits wide.
  const bool HalvesAre32Bit = is32Bit(VT);
  const unsigned EvenIdx = SystemZ::even128(HalvesAre32Bit);
  Even = DAG.getTargetExtractSubreg(EvenIdx, DL, VT, Pair);
  const unsigned OddIdx = SystemZ::odd128(HalvesAre32Bit);
  Odd = DAG.getTargetExtractSubreg(OddIdx, DL, VT, Pair);
}
// Materialise a CC test as an i32: build a SELECT_CCMASK node that yields 1
// when the CC value produced by CCReg is in CCMask and 0 otherwise.  CCValid
// lists the CC values that can actually occur, so others can be ignored.
static SDValue emitSETCC(SelectionDAG &DAG, const SDLoc &DL, SDValue CCReg,
                         unsigned CCValid, unsigned CCMask) {
  SDValue TrueValue = DAG.getConstant(1, DL, MVT::i32);
  SDValue FalseValue = DAG.getConstant(0, DL, MVT::i32);
  SDValue ValidOperand = DAG.getConstant(CCValid, DL, MVT::i32);
  SDValue MaskOperand = DAG.getConstant(CCMask, DL, MVT::i32);
  SDValue Ops[] = { TrueValue, FalseValue, ValidOperand, MaskOperand, CCReg };
  return DAG.getNode(SystemZISD::SELECT_CCMASK, DL, MVT::i32, Ops);
}
// Map condition code CC to the SystemZISD vector comparison opcode that
// implements it directly, or return 0 if there is none.  IsFP selects
// between the floating-point and integer families: "equal" and "greater
// than" exist in both, "greater than or equal" only for floating point, and
// unsigned "greater than" only for integers.
static unsigned getVectorComparison(ISD::CondCode CC, bool IsFP) {
  switch (CC) {
  case ISD::SETOEQ:
  case ISD::SETEQ:
    if (IsFP)
      return SystemZISD::VFCMPE;
    return SystemZISD::VICMPE;

  case ISD::SETOGE:
  case ISD::SETGE:
    if (IsFP)
      return SystemZISD::VFCMPHE;
    return 0;

  case ISD::SETOGT:
  case ISD::SETGT:
    if (IsFP)
      return SystemZISD::VFCMPH;
    return SystemZISD::VICMPH;

  case ISD::SETUGT:
    if (IsFP)
      return 0;
    return SystemZISD::VICMPHL;

  default:
    return 0;
  }
}
// Find a SystemZISD vector comparison opcode for CC, or failing that for the
// inverse of CC.  On success return the opcode and set Invert to say whether
// the result must be inverted to obtain CC.  Return 0, leaving Invert
// untouched, if neither form can be done directly.  IsFP is true if CC is a
// floating-point rather than integer comparison.
static unsigned getVectorComparisonOrInvert(ISD::CondCode CC, bool IsFP,
                                            bool &Invert) {
  unsigned Opcode = getVectorComparison(CC, IsFP);
  if (Opcode) {
    Invert = false;
    return Opcode;
  }

  // Fall back to the logically opposite condition.
  Opcode = getVectorComparison(ISD::getSetCCInverse(CC, !IsFP), IsFP);
  if (Opcode)
    Invert = true;
  return Opcode;
}
// Extend elements Start and Start+1 of the v4f32 value Op to a v2f64.  The
// two elements are first shuffled into lanes 0 and 2 of a v4f32 (lanes 1 and
// 3 are undefined), which is then passed to a VEXTEND node.
static SDValue expandV4F32ToV2F64(SelectionDAG &DAG, int Start, const SDLoc &DL,
                                  SDValue Op) {
  int LaneSelect[] = { Start, -1, Start + 1, -1 };
  SDValue Undef = DAG.getUNDEF(MVT::v4f32);
  SDValue Spread =
      DAG.getVectorShuffle(MVT::v4f32, DL, Op, Undef, LaneSelect);
  return DAG.getNode(SystemZISD::VEXTEND, DL, MVT::v2f64, Spread);
}
// Compare vectors CmpOp0 and CmpOp1 with opcode Opcode, producing a result
// of type VT.  v4f32 operands have no hardware comparison unless the vector
// enhancements facility 1 is available; without it each operand is split
// into two v2f64 halves, the halves are compared as v2i64 results, and the
// two results are packed back together.
SDValue SystemZTargetLowering::getVectorCmp(SelectionDAG &DAG, unsigned Opcode,
                                            const SDLoc &DL, EVT VT,
                                            SDValue CmpOp0,
                                            SDValue CmpOp1) const {
  // The common case: the comparison is supported as is.
  if (CmpOp0.getValueType() != MVT::v4f32 ||
      Subtarget.hasVectorEnhancements1())
    return DAG.getNode(Opcode, DL, VT, CmpOp0, CmpOp1);

  // Elements 0-1 form the "high" half and elements 2-3 the "low" half.
  SDValue LHSHigh = expandV4F32ToV2F64(DAG, 0, DL, CmpOp0);
  SDValue LHSLow = expandV4F32ToV2F64(DAG, 2, DL, CmpOp0);
  SDValue RHSHigh = expandV4F32ToV2F64(DAG, 0, DL, CmpOp1);
  SDValue RHSLow = expandV4F32ToV2F64(DAG, 2, DL, CmpOp1);
  SDValue HighResult = DAG.getNode(Opcode, DL, MVT::v2i64, LHSHigh, RHSHigh);
  SDValue LowResult = DAG.getNode(Opcode, DL, MVT::v2i64, LHSLow, RHSLow);
  return DAG.getNode(SystemZISD::PACK, DL, VT, HighResult, LowResult);
}
// Lower a vector comparison of type CC between CmpOp0 and CmpOp1, producing
// an integer mask of type VT.
//
// Conditions with no direct instruction are built from one or two supported
// comparisons, optionally with swapped operands, and the result is
// complemented at the end when Invert has been set.
SDValue SystemZTargetLowering::lowerVectorSETCC(SelectionDAG &DAG,
const SDLoc &DL, EVT VT,
ISD::CondCode CC,
SDValue CmpOp0,
SDValue CmpOp1) const {
bool IsFP = CmpOp0.getValueType().isFloatingPoint();
// Set when the comparison built below computes the opposite of CC.
bool Invert = false;
SDValue Cmp;
switch (CC) {
// Handle tests for order using (or (ogt y x) (oge x y)).
// SETUO is computed as the inverse of SETO.
case ISD::SETUO:
Invert = true;
LLVM_FALLTHROUGH;
case ISD::SETO: {
assert(IsFP && "Unexpected integer comparison");
SDValue LT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp1, CmpOp0);
SDValue GE = getVectorCmp(DAG, SystemZISD::VFCMPHE, DL, VT, CmpOp0, CmpOp1);
Cmp = DAG.getNode(ISD::OR, DL, VT, LT, GE);
break;
}
// Handle <> tests using (or (ogt y x) (ogt x y)).
// SETUEQ is computed as the inverse of SETONE.
case ISD::SETUEQ:
Invert = true;
LLVM_FALLTHROUGH;
case ISD::SETONE: {
assert(IsFP && "Unexpected integer comparison");
SDValue LT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp1, CmpOp0);
SDValue GT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp0, CmpOp1);
Cmp = DAG.getNode(ISD::OR, DL, VT, LT, GT);
break;
}
// Otherwise a single comparison is enough. It doesn't really
// matter whether we try the inversion or the swap first, since
// there are no cases where both work.
default:
if (unsigned Opcode = getVectorComparisonOrInvert(CC, IsFP, Invert))
Cmp = getVectorCmp(DAG, Opcode, DL, VT, CmpOp0, CmpOp1);
else {
// Retry with the operands exchanged and the condition adjusted to match.
CC = ISD::getSetCCSwappedOperands(CC);
if (unsigned Opcode = getVectorComparisonOrInvert(CC, IsFP, Invert))
Cmp = getVectorCmp(DAG, Opcode, DL, VT, CmpOp1, CmpOp0);
else
llvm_unreachable("Unhandled comparison");
}
break;
}
// Complement the mask by XORing it with a splat of -1.
if (Invert) {
SDValue Mask =
DAG.getSplatBuildVector(VT, DL, DAG.getConstant(-1, DL, MVT::i64));
Cmp = DAG.getNode(ISD::XOR, DL, VT, Cmp, Mask);
}
return Cmp;
}
// Lower a SETCC node. Vector results are delegated to lowerVectorSETCC;
// scalar results are produced by emitting a compare and materializing the
// condition code with emitSETCC.
SDValue SystemZTargetLowering::lowerSETCC(SDValue Op,
                                          SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();

  if (!VT.isVector()) {
    Comparison C(getCmp(DAG, LHS, RHS, CC, DL));
    SDValue CCReg = emitCmp(DAG, DL, C);
    return emitSETCC(DAG, DL, CCReg, C.CCValid, C.CCMask);
  }
  return lowerVectorSETCC(DAG, DL, VT, CC, LHS, RHS);
}
// Lower a BR_CC node (operands: chain, condition code, LHS, RHS, destination)
// into a compare followed by a SystemZISD::BR_CCMASK on the resulting CC.
SDValue SystemZTargetLowering::lowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Target = Op.getOperand(4);

  Comparison C(getCmp(DAG, LHS, RHS, CC, DL));
  SDValue CCReg = emitCmp(DAG, DL, C);

  // The branch carries the valid-CC set and the CC mask as i32 constants.
  SDValue Ops[] = {Chain, DAG.getConstant(C.CCValid, DL, MVT::i32),
                   DAG.getConstant(C.CCMask, DL, MVT::i32), Target, CCReg};
  return DAG.getNode(SystemZISD::BR_CCMASK, DL, Op.getValueType(), Ops);
}
// Return true when Neg is (0 - Pos) and Pos is either CmpOp itself or a
// sign extension of CmpOp. Pos and Neg may therefore be wider than CmpOp.
static bool isAbsolute(SDValue CmpOp, SDValue Pos, SDValue Neg) {
  // Neg must be a subtraction ...
  if (Neg.getOpcode() != ISD::SUB)
    return false;

  // ... from the constant zero ...
  SDValue Minuend = Neg.getOperand(0);
  if (Minuend.getOpcode() != ISD::Constant ||
      cast<ConstantSDNode>(Minuend)->getZExtValue() != 0)
    return false;

  // ... of exactly Pos.
  if (Neg.getOperand(1) != Pos)
    return false;

  // Pos must be the compared value, possibly sign-extended.
  if (Pos == CmpOp)
    return true;
  return Pos.getOpcode() == ISD::SIGN_EXTEND && Pos.getOperand(0) == CmpOp;
}
// Build |Op| as a SystemZISD::IABS node; when IsNegative is set, return
// (0 - |Op|) instead.
static SDValue getAbsolute(SelectionDAG &DAG, const SDLoc &DL, SDValue Op,
                           bool IsNegative) {
  EVT VT = Op.getValueType();
  SDValue Abs = DAG.getNode(SystemZISD::IABS, DL, VT, Op);
  if (!IsNegative)
    return Abs;
  return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Abs);
}
// Lower a SELECT_CC node (operands: LHS, RHS, true value, false value,
// condition code). Selections that amount to an absolute or negative
// absolute value become IABS-based nodes; everything else becomes a compare
// plus SystemZISD::SELECT_CCMASK.
SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue IfTrue = Op.getOperand(2);
  SDValue IfFalse = Op.getOperand(3);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();

  Comparison C(getCmp(DAG, LHS, RHS, CC, DL));

  // Look for absolute and negative-absolute selections, including those in
  // which the compared value is sign-extended (for LPGFR and LNGFR). This
  // supplements the equivalent check in DAGCombiner. It only applies to
  // integer comparisons against zero that are neither == nor !=.
  const bool IsOrderedCmpWithZero =
      C.Opcode == SystemZISD::ICMP &&
      C.CCMask != SystemZ::CCMASK_CMP_EQ &&
      C.CCMask != SystemZ::CCMASK_CMP_NE &&
      C.Op1.getOpcode() == ISD::Constant &&
      cast<ConstantSDNode>(C.Op1)->getZExtValue() == 0;
  if (IsOrderedCmpWithZero) {
    if (isAbsolute(C.Op0, IfTrue, IfFalse))
      return getAbsolute(DAG, DL, IfTrue, C.CCMask & SystemZ::CCMASK_CMP_LT);
    if (isAbsolute(C.Op0, IfFalse, IfTrue))
      return getAbsolute(DAG, DL, IfFalse, C.CCMask & SystemZ::CCMASK_CMP_GT);
  }

  // General case: compare, then select on the resulting condition code.
  SDValue CCReg = emitCmp(DAG, DL, C);
  SDValue SelectOps[] = {IfTrue, IfFalse,
                         DAG.getConstant(C.CCValid, DL, MVT::i32),
                         DAG.getConstant(C.CCMask, DL, MVT::i32), CCReg};
  return DAG.getNode(SystemZISD::SELECT_CCMASK, DL, Op.getValueType(),
                     SelectOps);
}
// Lower a global address (symbol plus constant offset) either to a
// PC-relative address or, when the symbol is not a PC32DBL symbol, to a load
// of the address from the GOT. Any offset that cannot be folded into the
// address is added explicitly.
SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node,
                                                  SelectionDAG &DAG) const {
  SDLoc DL(Node);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  const GlobalValue *GV = Node->getGlobal();
  CodeModel::Model CM = DAG.getTarget().getCodeModel();
  int64_t Offset = Node->getOffset();

  SDValue Addr;
  if (!Subtarget.isPC32DBLSymbol(GV, CM)) {
    // Fetch the symbol's address from its GOT slot; the whole offset is
    // left to be added afterwards.
    Addr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, SystemZII::MO_GOT);
    Addr = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Addr);
    Addr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Addr,
                       MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  } else {
    // Assign anchors at 1<<12 byte boundaries.
    uint64_t Anchor = Offset & ~uint64_t(0xfff);
    Addr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, Anchor);
    Addr = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Addr);

    // What remains of the offset can be folded into the address provided it
    // is halfword-aligned (even).
    Offset -= Anchor;
    const bool CanFoldRest = Offset != 0 && (Offset & 1) == 0;
    if (CanFoldRest) {
      SDValue Full = DAG.getTargetGlobalAddress(GV, DL, PtrVT, Anchor + Offset);
      Addr = DAG.getNode(SystemZISD::PCREL_OFFSET, DL, PtrVT, Full, Addr);
      Offset = 0;
    }
  }

  // Emit an explicit addition for any non-zero offset that was not folded.
  if (Offset == 0)
    return Addr;
  return DAG.getNode(ISD::ADD, DL, PtrVT, Addr,
                     DAG.getConstant(Offset, DL, PtrVT));
}
// Emit a call-like node with opcode Opcode (the callers in this file pass
// SystemZISD::TLS_GDCALL or SystemZISD::TLS_LDCALL) for the TLS symbol
// described by Node, passing GOTOffset as the argument, and return the
// pointer-sized value read back from %r2 after the call.
//
// The argument copies, the call node and the result copy are tied together
// with glue, so the statement order below matters.
SDValue SystemZTargetLowering::lowerTLSGetOffset(GlobalAddressSDNode *Node,
SelectionDAG &DAG,
unsigned Opcode,
SDValue GOTOffset) const {
SDLoc DL(Node);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
// The sequence starts from the entry node rather than from an incoming
// chain; Glue starts out empty.
SDValue Chain = DAG.getEntryNode();
SDValue Glue;
// __tls_get_offset takes the GOT offset in %r2 and the GOT in %r12.
SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT);
Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R12D, GOT, Glue);
Glue = Chain.getValue(1);
Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R2D, GOTOffset, Glue);
Glue = Chain.getValue(1);
// The first call operand is the chain and the second is the TLS symbol.
SmallVector<SDValue, 8> Ops;
Ops.push_back(Chain);
Ops.push_back(DAG.getTargetGlobalAddress(Node->getGlobal(), DL,
Node->getValueType(0),
0, 0));
// Add argument registers to the end of the list so that they are
// known live into the call.
Ops.push_back(DAG.getRegister(SystemZ::R2D, PtrVT));
Ops.push_back(DAG.getRegister(SystemZ::R12D, PtrVT));
// Add a register mask operand representing the call-preserved registers.
// The mask is the one for the C calling convention.
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
const uint32_t *Mask =
TRI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
// Glue the call to the argument copies.
Ops.push_back(Glue);
// Emit the call. It produces a chain (MVT::Other) and glue.
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
Chain = DAG.getNode(Opcode, DL, NodeTys, Ops);
Glue = Chain.getValue(1);
// Copy the return value from %r2, glued to the call.
return DAG.getCopyFromReg(Chain, DL, SystemZ::R2D, PtrVT, Glue);
}
// Build the pointer-sized thread pointer from access registers A0 (high
// part) and A1 (low part): (A0 << 32) | zext(A1).
SDValue SystemZTargetLowering::lowerThreadPointer(const SDLoc &DL,
                                                  SelectionDAG &DAG) const {
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Entry = DAG.getEntryNode();

  // High part: access register 0. An any-extend suffices because the
  // extended bits are shifted out below.
  SDValue Hi = DAG.getCopyFromReg(Entry, DL, SystemZ::A0, MVT::i32);
  Hi = DAG.getNode(ISD::ANY_EXTEND, DL, PtrVT, Hi);

  // Low part: access register 1, zero-extended so that it does not disturb
  // the high half when ORed in.
  SDValue Lo = DAG.getCopyFromReg(Entry, DL, SystemZ::A1, MVT::i32);
  Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, Lo);

  // Combine the two parts into a single address.
  SDValue ShiftAmt = DAG.getConstant(32, DL, PtrVT);
  SDValue HiShifted = DAG.getNode(ISD::SHL, DL, PtrVT, Hi, ShiftAmt);
  return DAG.getNode(ISD::OR, DL, PtrVT, HiShifted, Lo);
}
// Lower the address of a thread-local global. Unless emulated TLS is in
// use (handled by LowerToTLSEmulatedModel), the result is the thread
// pointer plus an offset that is obtained in a way that depends on the TLS
// model the target selects for the global.
SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
SelectionDAG &DAG) const {
if (DAG.getTarget().useEmulatedTLS())
return LowerToTLSEmulatedModel(Node, DAG);
SDLoc DL(Node);
const GlobalValue *GV = Node->getGlobal();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
SDValue TP = lowerThreadPointer(DL, DAG);
// Get the offset of GA from the thread pointer, based on the TLS model.
SDValue Offset;
switch (model) {
case TLSModel::GeneralDynamic: {
// Load the GOT offset of the tls_index (module ID / per-symbol offset).
// It is placed in the constant pool (alignment argument 8) as a TLSGD
// entry and loaded from there.
SystemZConstantPoolValue *CPV =
SystemZConstantPoolValue::Create(GV, SystemZCP::TLSGD);
Offset = DAG.getConstantPool(CPV, PtrVT, 8);
Offset = DAG.getLoad(
PtrVT, DL, DAG.getEntryNode(), Offset,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
// Call __tls_get_offset to retrieve the offset.
Offset = lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_GDCALL, Offset);
break;
}
case TLSModel::LocalDynamic: {
// Load the GOT offset of the module ID (a TLSLDM constant-pool entry).
SystemZConstantPoolValue *CPV =
SystemZConstantPoolValue::Create(GV, SystemZCP::TLSLDM);
Offset = DAG.getConstantPool(CPV, PtrVT, 8);
Offset = DAG.getLoad(
PtrVT, DL, DAG.getEntryNode(), Offset,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
// Call __tls_get_offset to retrieve the module base offset.
Offset = lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_LDCALL, Offset);
// Note: The SystemZLDCleanupPass will remove redundant computations
// of the module base offset. Count total number of local-dynamic
// accesses to trigger execution of that pass.
SystemZMachineFunctionInfo* MFI =
DAG.getMachineFunction().getInfo<SystemZMachineFunctionInfo>();
MFI->incNumLocalDynamicTLSAccesses();
// Add the per-symbol offset, loaded from a DTPOFF constant-pool entry.
CPV = SystemZConstantPoolValue::Create(GV, SystemZCP::DTPOFF);
SDValue DTPOffset = DAG.getConstantPool(CPV, PtrVT, 8);
DTPOffset = DAG.getLoad(
PtrVT, DL, DAG.getEntryNode(), DTPOffset,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
Offset = DAG.getNode(ISD::ADD, DL, PtrVT, Offset, DTPOffset);
break;
}
case TLSModel::InitialExec: {
// Load the offset from the GOT, addressing the slot PC-relatively via
// the MO_INDNTPOFF-flagged symbol.
Offset = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
SystemZII::MO_INDNTPOFF);
Offset = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Offset);
Offset =
DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Offset,
MachinePointerInfo::getGOT(DAG.getMachineFunction()));
break;
}
case TLSModel::LocalExec: {
// Force the offset into the constant pool (as an NTPOFF entry) and load
// it from there.
SystemZConstantPoolValue *CPV =
SystemZConstantPoolValue::Create(GV, SystemZCP::NTPOFF);
Offset = DAG.getConstantPool(CPV, PtrVT, 8);
Offset = DAG.getLoad(
PtrVT, DL, DAG.getEntryNode(), Offset,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
break;
}
}
// Add the base and offset together.
return DAG.getNode(ISD::ADD, DL, PtrVT, TP, Offset);
}
// Lower a block address (plus its offset) to a PC-relative address.
SDValue SystemZTargetLowering::lowerBlockAddress(BlockAddressSDNode *Node,
                                                 SelectionDAG &DAG) const {
  SDLoc DL(Node);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Target = DAG.getTargetBlockAddress(Node->getBlockAddress(), PtrVT,
                                             Node->getOffset());
  return DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Target);
}
// Lower a jump table reference to a PC-relative address, so that LARL can
// be used to load the address of the table.
SDValue SystemZTargetLowering::lowerJumpTable(JumpTableSDNode *JT,
                                              SelectionDAG &DAG) const {
  SDLoc DL(JT);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Table = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
  return DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Table);
}
// Lower a constant pool reference to a PC-relative address, so that LARL
// can be used to load the address of the entry.
SDValue SystemZTargetLowering::lowerConstantPool(ConstantPoolSDNode *CP,
                                                 SelectionDAG &DAG) const {
  SDLoc DL(CP);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());

  // Machine-specific entries carry their own value object and have no
  // offset; ordinary entries are a constant plus an offset.
  SDValue Entry =
      CP->isMachineConstantPoolEntry()
          ? DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
                                      CP->getAlignment())
          : DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
                                      CP->getAlignment(), CP->getOffset());
  return DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Entry);
}
// Lower FRAMEADDR. Only depth 0 is supported; the result is the frame index
// of the back chain slot, which is allocated on first use. A non-zero depth
// is a fatal error.
SDValue SystemZTargetLowering::lowerFRAMEADDR(SDValue Op,
                                              SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  FrameInfo.setFrameAddressIsTaken(true);

  SDLoc DL(Op);
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());

  // Allocate the back chain frame index if that has not happened yet. By
  // definition, the frame address is the address of the back chain.
  SystemZMachineFunctionInfo *FuncInfo =
      MF.getInfo<SystemZMachineFunctionInfo>();
  int SlotIdx = FuncInfo->getFramePointerSaveIndex();
  if (SlotIdx == 0) {
    SlotIdx = FrameInfo.CreateFixedObject(8, -SystemZMC::CallFrameSize, false);
    FuncInfo->setFramePointerSaveIndex(SlotIdx);
  }
  SDValue BackChain = DAG.getFrameIndex(SlotIdx, PtrVT);

  // FIXME The frontend should detect this case.
  if (Depth > 0)
    report_fatal_error("Unsupported stack frame traversal count");

  return BackChain;
}
// Lower RETURNADDR. Only depth 0 is supported; the result is a copy from
// R14D, which holds the return address. A non-zero depth is a fatal error.
SDValue SystemZTargetLowering::lowerRETURNADDR(SDValue Op,
                                               SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MF.getFrameInfo().setReturnAddressIsTaken(true);

  // Give up with an empty value when the argument check reports a problem.
  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
    return SDValue();

  SDLoc DL(Op);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

  // FIXME The frontend should detect this case.
  if (Depth > 0)
    report_fatal_error("Unsupported stack frame traversal count");

  // R14D has the return address. Mark it as an implicit live-in and copy
  // from the resulting register.
  unsigned LinkReg = MF.addLiveIn(SystemZ::R14D, &SystemZ::GR64BitRegClass);
  return DAG.getCopyFromReg(DAG.getEntryNode(), DL, LinkReg, PtrVT);
}
// Lower a BITCAST. Three cases are handled: a bitcast of a normal load
// (re-issued as a load of the result type), i32 -> f32, and f32 -> i32.
// The 32-bit cases go through a 64-bit value whose high 32 bits hold the
// data. Any other combination is unexpected.
SDValue SystemZTargetLowering::lowerBITCAST(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Src = Op.getOperand(0);
  EVT SrcVT = Src.getValueType();
  EVT DstVT = Op.getValueType();

  // Convert loads directly. DAGCombiner normally does this, but bitcasts
  // created during lowering are themselves lowered and need it here.
  auto *Load = dyn_cast<LoadSDNode>(Src);
  if (Load && ISD::isNormalLoad(Load)) {
    SDValue NewLoad = DAG.getLoad(DstVT, DL, Load->getChain(),
                                  Load->getBasePtr(), Load->getMemOperand());
    // Redirect users of the old load's chain to the new load's chain.
    DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1));
    return NewLoad;
  }

  if (SrcVT == MVT::i32 && DstVT == MVT::f32) {
    // Place the i32 in the high half of an i64 ...
    SDValue Wide;
    if (!Subtarget.hasHighWord()) {
      Wide = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Src);
      Wide = DAG.getNode(ISD::SHL, DL, MVT::i64, Wide,
                         DAG.getConstant(32, DL, MVT::i64));
    } else {
      SDNode *Undef =
          DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i64);
      Wide = DAG.getTargetInsertSubreg(SystemZ::subreg_h32, DL, MVT::i64,
                                       SDValue(Undef, 0), Src);
    }
    // ... bitcast to f64 and take the high 32-bit subregister as f32.
    SDValue AsF64 = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Wide);
    return DAG.getTargetExtractSubreg(SystemZ::subreg_h32, DL, MVT::f32,
                                      AsF64);
  }

  if (SrcVT == MVT::f32 && DstVT == MVT::i32) {
    // Insert the f32 into the high half of an undefined f64 and bitcast
    // that to i64.
    SDNode *Undef =
        DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::f64);
    SDValue Wide = DAG.getTargetInsertSubreg(SystemZ::subreg_h32, DL, MVT::f64,
                                             SDValue(Undef, 0), Src);
    SDValue AsI64 = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Wide);

    // Without high-word support, shift the data down and truncate.
    if (!Subtarget.hasHighWord()) {
      SDValue Shifted = DAG.getNode(ISD::SRL, DL, MVT::i64, AsI64,
                                    DAG.getConstant(32, DL, MVT::i64));
      return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Shifted);
    }
    return DAG.getTargetExtractSubreg(SystemZ::subreg_h32, DL, MVT::i32,
                                      AsI64);
  }

  llvm_unreachable("Unexpected bitcast combination");
}
// Lower VASTART by storing the four initial va_list fields, 8 bytes apart,
// starting at the address given by operand 1. The stores all hang off the
// incoming chain and are joined with a TokenFactor.
SDValue SystemZTargetLowering::lowerVASTART(SDValue Op,
                                            SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  SystemZMachineFunctionInfo *FuncInfo =
      MF.getInfo<SystemZMachineFunctionInfo>();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());

  SDLoc DL(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Addr = Op.getOperand(1);
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();

  // Initial value of each field, in store order: first vararg GPR, first
  // vararg FPR, the varargs frame index and the register save frame index.
  const unsigned NumFields = 4;
  SDValue Fields[NumFields] = {
    DAG.getConstant(FuncInfo->getVarArgsFirstGPR(), DL, PtrVT),
    DAG.getConstant(FuncInfo->getVarArgsFirstFPR(), DL, PtrVT),
    DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT),
    DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT)
  };

  // Store field I at byte offset I * 8.
  SDValue Stores[NumFields];
  for (unsigned I = 0; I != NumFields; ++I) {
    unsigned Offset = I * 8;
    SDValue FieldAddr = Addr;
    if (Offset != 0)
      FieldAddr = DAG.getNode(ISD::ADD, DL, PtrVT, FieldAddr,
                              DAG.getIntPtrConstant(Offset, DL));
    Stores[I] = DAG.getStore(Chain, DL, Fields[I], FieldAddr,
                             MachinePointerInfo(SV, Offset));
  }
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
}
// Lower VACOPY to a 32-byte, 8-byte-aligned memcpy from the source va_list
// (operand 2) to the destination va_list (operand 1).
SDValue SystemZTargetLowering::lowerVACOPY(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Dst = Op.getOperand(1);
  SDValue Src = Op.getOperand(2);
  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();

  SDValue NumBytes = DAG.getIntPtrConstant(32, DL);
  return DAG.getMemcpy(Chain, DL, Dst, Src, NumBytes,
                       /*Align*/8, /*isVolatile*/false, /*AlwaysInline*/false,
                       /*isTailCall*/false,
                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
}
// Lower DYNAMIC_STACKALLOC (operands: chain, size, alignment). The stack
// pointer is lowered by the requested size plus any slack needed for
// over-alignment; the returned address is the new stack pointer plus an
// ADJDYNALLOC placeholder, realigned if necessary. When the function has the
// "backchain" attribute, the back chain word is copied from the old stack
// pointer to the new one. Returns the merged pair {address, chain}.
SDValue SystemZTargetLowering::
lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
  const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
  MachineFunction &MF = DAG.getMachineFunction();
  bool RealignOpt = !MF.getFunction().hasFnAttribute("no-realign-stack");
  bool StoreBackchain = MF.getFunction().hasFnAttribute("backchain");

  SDValue Chain = Op.getOperand(0);
  SDValue Size = Op.getOperand(1);
  SDValue Align = Op.getOperand(2);
  SDLoc DL(Op);

  // If user has set the no alignment function attribute, ignore
  // alloca alignments.
  // The alignment operand is dereferenced unconditionally, so use cast<>
  // (which asserts on a non-constant operand) rather than dyn_cast<>, whose
  // null result would be dereferenced silently.
  uint64_t AlignVal = (RealignOpt ?
                       cast<ConstantSDNode>(Align)->getZExtValue() : 0);

  uint64_t StackAlign = TFI->getStackAlignment();
  uint64_t RequiredAlign = std::max(AlignVal, StackAlign);
  // Extra bytes to allocate so that the result can be rounded to
  // RequiredAlign; zero when the normal stack alignment is enough.
  uint64_t ExtraAlignSpace = RequiredAlign - StackAlign;

  unsigned SPReg = getStackPointerRegisterToSaveRestore();
  SDValue NeededSpace = Size;

  // Get a reference to the stack pointer.
  SDValue OldSP = DAG.getCopyFromReg(Chain, DL, SPReg, MVT::i64);

  // If we need a backchain, save it now.
  SDValue Backchain;
  if (StoreBackchain)
    Backchain = DAG.getLoad(MVT::i64, DL, Chain, OldSP, MachinePointerInfo());

  // Add extra space for alignment if needed.
  if (ExtraAlignSpace)
    NeededSpace = DAG.getNode(ISD::ADD, DL, MVT::i64, NeededSpace,
                              DAG.getConstant(ExtraAlignSpace, DL, MVT::i64));

  // Get the new stack pointer value.
  SDValue NewSP = DAG.getNode(ISD::SUB, DL, MVT::i64, OldSP, NeededSpace);

  // Copy the new stack pointer back.
  Chain = DAG.getCopyToReg(Chain, DL, SPReg, NewSP);

  // The allocated data lives above the 160 bytes allocated for the standard
  // frame, plus any outgoing stack arguments. We don't know how much that
  // amounts to yet, so emit a special ADJDYNALLOC placeholder.
  SDValue ArgAdjust = DAG.getNode(SystemZISD::ADJDYNALLOC, DL, MVT::i64);
  SDValue Result = DAG.getNode(ISD::ADD, DL, MVT::i64, NewSP, ArgAdjust);

  // Dynamically realign if needed: add the slack, then mask the address
  // with ~(RequiredAlign - 1).
  if (RequiredAlign > StackAlign) {
    Result =
      DAG.getNode(ISD::ADD, DL, MVT::i64, Result,
                  DAG.getConstant(ExtraAlignSpace, DL, MVT::i64));
    Result =
      DAG.getNode(ISD::AND, DL, MVT::i64, Result,
                  DAG.getConstant(~(RequiredAlign - 1), DL, MVT::i64));
  }

  // Re-establish the back chain at the new stack pointer.
  if (StoreBackchain)
    Chain = DAG.getStore(Chain, DL, Backchain, NewSP, MachinePointerInfo());

  SDValue Ops[2] = { Result, Chain };
  return DAG.getMergeValues(Ops, DL);
}
// Lower GET_DYNAMIC_AREA_OFFSET to an i64 SystemZISD::ADJDYNALLOC node.
SDValue SystemZTargetLowering::lowerGET_DYNAMIC_AREA_OFFSET(
    SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(SystemZISD::ADJDYNALLOC, SDLoc(Op), MVT::i64);
}
// Lower SMUL_LOHI. The two results are collected in Ops and returned as
// merged values; per the comments below, Ops[0] is the low half and Ops[1]
// the high half. Three strategies are used: a widened 64-bit multiply for
// 32-bit types, SystemZISD::SMUL_LOHI when the subtarget has the
// miscellaneous extensions 2 facility, and otherwise an unsigned multiply
// whose high half is corrected for signedness.
SDValue SystemZTargetLowering::lowerSMUL_LOHI(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc DL(Op);
SDValue Ops[2];
if (is32Bit(VT))
// Just do a normal 64-bit multiplication and extract the results.
// We define this so that it can be used for constant division.
lowerMUL_LOHI32(DAG, DL, ISD::SIGN_EXTEND, Op.getOperand(0),
Op.getOperand(1), Ops[1], Ops[0]);
else if (Subtarget.hasMiscellaneousExtensions2())
// SystemZISD::SMUL_LOHI returns the low result in the odd register and
// the high result in the even register. ISD::SMUL_LOHI is defined to
// return the low half first, so the results are in reverse order.
lowerGR128Binary(DAG, DL, VT, SystemZISD::SMUL_LOHI,
Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
else {
// Do a full 128-bit multiplication based on SystemZISD::UMUL_LOHI:
//
// (ll * rl) + ((lh * rl) << 64) + ((ll * rh) << 64)
//
// but using the fact that the upper halves are either all zeros
// or all ones:
//
// (ll * rl) - ((lh & rl) << 64) - ((ll & rh) << 64)
//
// and grouping the right terms together since they are quicker than the
// multiplication:
//
// (ll * rl) - (((lh & rl) + (ll & rh)) << 64)
SDValue C63 = DAG.getConstant(63, DL, MVT::i64);
SDValue LL = Op.getOperand(0);
SDValue RL = Op.getOperand(1);
// LH and RH are the sign masks (arithmetic shift right by 63) of the
// operands, i.e. the "upper halves" in the formulas above.
SDValue LH = DAG.getNode(ISD::SRA, DL, VT, LL, C63);
SDValue RH = DAG.getNode(ISD::SRA, DL, VT, RL, C63);
// SystemZISD::UMUL_LOHI returns the low result in the odd register and
// the high result in the even register. ISD::SMUL_LOHI is defined to
// return the low half first, so the results are in reverse order.
lowerGR128Binary(DAG, DL, VT, SystemZISD::UMUL_LOHI,
LL, RL, Ops[1], Ops[0]);
// Subtract the correction term ((ll & rh) + (lh & rl)) from the high
// half of the unsigned product.
SDValue NegLLTimesRH = DAG.getNode(ISD::AND, DL, VT, LL, RH);
SDValue NegLHTimesRL = DAG.getNode(ISD::AND, DL, VT, LH, RL);
SDValue NegSum = DAG.getNode(ISD::ADD, DL, VT, NegLLTimesRH, NegLHTimesRL);
Ops[1] = DAG.getNode(ISD::SUB, DL, VT, Ops[1], NegSum);
}
return DAG.getMergeValues(Ops, DL);
}
// Lower UMUL_LOHI and return its two results as merged values, low half
// first.
SDValue SystemZTargetLowering::lowerUMUL_LOHI(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue Results[2];

  if (!is32Bit(VT)) {
    // SystemZISD::UMUL_LOHI returns the low result in the odd register and
    // the high result in the even register, whereas ISD::UMUL_LOHI is
    // defined to return the low half first; hence the reversed outputs.
    lowerGR128Binary(DAG, DL, VT, SystemZISD::UMUL_LOHI,
                     Op.getOperand(0), Op.getOperand(1),
                     Results[1], Results[0]);
  } else {
    // A normal 64-bit multiplication is enough; extract both halves from
    // it. This is defined so that it can be used for constant division.
    lowerMUL_LOHI32(DAG, DL, ISD::ZERO_EXTEND, Op.getOperand(0),
                    Op.getOperand(1), Results[1], Results[0]);
  }
  return DAG.getMergeValues(Results, DL);
}
// Lower SDIVREM to SystemZISD::SDIVREM and return the two results as merged
// values.
SDValue SystemZTargetLowering::lowerSDIVREM(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue Dividend = Op.getOperand(0);
  SDValue Divisor = Op.getOperand(1);

  // DSGF is used for 32-bit division, so the first operand must always be
  // 64-bit. For 64-bit division, prefer a 32-bit second operand whenever
  // its value fits, to improve performance.
  if (is32Bit(VT)) {
    Dividend = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Dividend);
  } else if (DAG.ComputeNumSignBits(Divisor) > 32) {
    Divisor = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Divisor);
  }

  // DSG(F) returns the remainder in the even register and the quotient in
  // the odd register.
  SDValue Results[2];
  lowerGR128Binary(DAG, DL, VT, SystemZISD::SDIVREM, Dividend, Divisor,
                   Results[1], Results[0]);
  return DAG.getMergeValues(Results, DL);
}
// Lower UDIVREM to SystemZISD::UDIVREM and return the two results as merged
// values.
SDValue SystemZTargetLowering::lowerUDIVREM(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue Dividend = Op.getOperand(0);
  SDValue Divisor = Op.getOperand(1);

  // DL(G) returns the remainder in the even register and the quotient in
  // the odd register.
  SDValue Results[2];
  lowerGR128Binary(DAG, DL, VT, SystemZISD::UDIVREM, Dividend, Divisor,
                   Results[1], Results[0]);
  return DAG.getMergeValues(Results, DL);
}
// Lower a 64-bit OR whose operands occupy disjoint 32-bit halves into an
// insertion of the low operand (truncated to i32) into the low 32-bit
// subregister of the high operand. Returns Op unchanged when the pattern
// does not apply or another instruction is expected to do better.
SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const {
  assert(Op.getValueType() == MVT::i64 && "Should be 64-bit operation");

  // Known-zero masks of the two operands.
  SDValue Src[] = {Op.getOperand(0), Op.getOperand(1)};
  const uint64_t Zero0 = DAG.computeKnownBits(Src[0]).Zero.getZExtValue();
  const uint64_t Zero1 = DAG.computeKnownBits(Src[1]).Zero.getZExtValue();

  const auto UpperHalfIsZero = [](uint64_t KnownZero) {
    return (KnownZero >> 32) == 0xffffffff;
  };
  const auto LowerHalfIsZero = [](uint64_t KnownZero) {
    return uint32_t(KnownZero) == 0xffffffff;
  };

  // One operand must have its upper 32 bits known zero (the low operand)
  // and the other its lower 32 bits known zero (the high operand).
  unsigned HighIdx, LowIdx;
  if (UpperHalfIsZero(Zero0) && LowerHalfIsZero(Zero1)) {
    HighIdx = 1;
    LowIdx = 0;
  } else if (UpperHalfIsZero(Zero1) && LowerHalfIsZero(Zero0)) {
    HighIdx = 0;
    LowIdx = 1;
  } else {
    return Op;
  }
  SDValue LowOp = Src[LowIdx];
  SDValue HighOp = Src[HighIdx];

  // A constant high part is better served by IILH.
  if (HighOp.getOpcode() == ISD::Constant)
    return Op;

  // A constant low part outside the range of LHI is better served by IILF.
  if (LowOp.getOpcode() == ISD::Constant) {
    int64_t Value = int32_t(cast<ConstantSDNode>(LowOp)->getZExtValue());
    if (!isInt<16>(Value))
      return Op;
  }

  // If the high part is an AND with a constant that leaves the high 32 bits
  // untouched and merely masks out low bits, the AND can be skipped.
  if (HighOp.getOpcode() == ISD::AND &&
      HighOp.getOperand(1).getOpcode() == ISD::Constant) {
    SDValue Unmasked = HighOp.getOperand(0);
    uint64_t Mask = cast<ConstantSDNode>(HighOp.getOperand(1))->getZExtValue();
    if (DAG.MaskedValueIsZero(Unmasked, APInt(64, ~(Mask | 0xffffffff))))
      HighOp = Unmasked;
  }

  // All GR32 operations change only the low 32 bits, so truncate the low
  // operand to i32 and insert it directly with a subreg. The interesting
  // cases are those in which the truncation can be folded.
  SDLoc DL(Op);
  SDValue Low32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, LowOp);
  return DAG.getTargetInsertSubreg(SystemZ::subreg_l32, DL, MVT::i64, HighOp,
                                   Low32);
}
// Lower SADDO/SSUBO/UADDO/USUBO nodes to the corresponding SystemZISD node,
// which produces the arithmetic result and a CC value, and derive the
// overflow flag from that CC.
SDValue SystemZTargetLowering::lowerXALUO(SDValue Op,
                                          SelectionDAG &DAG) const {
  SDNode *N = Op.getNode();
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // Choose the target opcode together with the CC values it can produce
  // (CCValid) and the subset that signals overflow/carry/borrow (CCMask).
  unsigned BaseOp = 0;
  unsigned CCValid = 0;
  unsigned CCMask = 0;
  const unsigned Opc = Op.getOpcode();
  switch (Opc) {
  case ISD::SADDO:
  case ISD::SSUBO:
    BaseOp = (Opc == ISD::SADDO) ? SystemZISD::SADDO : SystemZISD::SSUBO;
    CCValid = SystemZ::CCMASK_ARITH;
    CCMask = SystemZ::CCMASK_ARITH_OVERFLOW;
    break;
  case ISD::UADDO:
    BaseOp = SystemZISD::UADDO;
    CCValid = SystemZ::CCMASK_LOGICAL;
    CCMask = SystemZ::CCMASK_LOGICAL_CARRY;
    break;
  case ISD::USUBO:
    BaseOp = SystemZISD::USUBO;
    CCValid = SystemZ::CCMASK_LOGICAL;
    CCMask = SystemZ::CCMASK_LOGICAL_BORROW;
    break;
  default:
    llvm_unreachable("Unknown instruction!");
  }

  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
  SDValue Result = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);

  // Turn the CC result into the flag value, narrowing it when the original
  // node's second result is i1.
  SDValue Flag = emitSETCC(DAG, DL, Result.getValue(1), CCValid, CCMask);
  if (N->getValueType(1) == MVT::i1)
    Flag = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Flag);

  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Flag);
}
// Return true if Carry, after following the carry-in operand (operand 2)
// through any number of ADDCARRY nodes, originates from a UADDO node.
static bool isAddCarryChain(SDValue Carry) {
  for (;;) {
    if (Carry.getOpcode() != ISD::ADDCARRY)
      return Carry.getOpcode() == ISD::UADDO;
    Carry = Carry.getOperand(2);
  }
}
// Return true if Carry, after following the borrow-in operand (operand 2)
// through any number of SUBCARRY nodes, originates from a USUBO node.
static bool isSubBorrowChain(SDValue Carry) {
  for (;;) {
    if (Carry.getOpcode() != ISD::SUBCARRY)
      return Carry.getOpcode() == ISD::USUBO;
    Carry = Carry.getOperand(2);
  }
}
// Lower ADDCARRY/SUBCARRY nodes. An empty SDValue is returned, leaving the
// node to be expanded, when the result type is not legal yet or when the
// incoming carry does not come from a matching UADDO/USUBO chain.
SDValue SystemZTargetLowering::lowerADDSUBCARRY(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDNode *N = Op.getNode();
  MVT VT = N->getSimpleValueType(0);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue Carry = Op.getOperand(2);

  // Both operations use the logical CC set; they differ in the opcode and
  // in which CC values mean "carry" or "borrow".
  const unsigned CCValid = SystemZ::CCMASK_LOGICAL;
  unsigned BaseOp = 0;
  unsigned CCMask = 0;
  switch (Op.getOpcode()) {
  case ISD::ADDCARRY:
    if (!isAddCarryChain(Carry))
      return SDValue();
    BaseOp = SystemZISD::ADDCARRY;
    CCMask = SystemZ::CCMASK_LOGICAL_CARRY;
    break;
  case ISD::SUBCARRY:
    if (!isSubBorrowChain(Carry))
      return SDValue();
    BaseOp = SystemZISD::SUBCARRY;
    CCMask = SystemZ::CCMASK_LOGICAL_BORROW;
    break;
  default:
    llvm_unreachable("Unknown instruction!");
  }

  // Set the condition code from the carry flag.
  Carry = DAG.getNode(SystemZISD::GET_CCMASK, DL, MVT::i32, Carry,
                      DAG.getConstant(CCValid, DL, MVT::i32),
                      DAG.getConstant(CCMask, DL, MVT::i32));

  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  SDValue Result = DAG.getNode(BaseOp, DL, VTs, LHS, RHS, Carry);

  // Turn the CC result into the outgoing flag, narrowing it when the
  // original node's second result is i1.
  SDValue Flag = emitSETCC(DAG, DL, Result.getValue(1), CCValid, CCMask);
  if (N->getValueType(1) == MVT::i1)
    Flag = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Flag);

  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Flag);
}
// Lower CTPOP. Vector types use a byte-wise SystemZISD::POPCNT on a v16i8
// view of the operand followed by a per-element-size reduction. Scalar
// types use a 64-bit SystemZISD::POPCNT, which yields per-byte counts, and
// then sum those counts with shifts and adds.
SDValue SystemZTargetLowering::lowerCTPOP(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc DL(Op);
// From here on Op is the value being counted, not the CTPOP node.
Op = Op.getOperand(0);
// Handle vector types via VPOPCT.
if (VT.isVector()) {
// Count bits per byte on a v16i8 view of the operand.
Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Op);
Op = DAG.getNode(SystemZISD::POPCNT, DL, MVT::v16i8, Op);
switch (VT.getScalarSizeInBits()) {
case 8:
// The per-byte counts are already the result.
break;
case 16: {
// Add each element to itself shifted left by 8, then shift the sum
// right by 8.
Op = DAG.getNode(ISD::BITCAST, DL, VT, Op);
SDValue Shift = DAG.getConstant(8, DL, MVT::i32);
SDValue Tmp = DAG.getNode(SystemZISD::VSHL_BY_SCALAR, DL, VT, Op, Shift);
Op = DAG.getNode(ISD::ADD, DL, VT, Op, Tmp);
Op = DAG.getNode(SystemZISD::VSRL_BY_SCALAR, DL, VT, Op, Shift);
break;
}
case 32: {
// One VSUM of the byte counts with an all-zero v16i8 vector, producing VT.
SDValue Tmp = DAG.getSplatBuildVector(MVT::v16i8, DL,
DAG.getConstant(0, DL, MVT::i32));
Op = DAG.getNode(SystemZISD::VSUM, DL, VT, Op, Tmp);
break;
}
case 64: {
// Two VSUM steps with the same zero vector: first to v4i32, then to VT.
SDValue Tmp = DAG.getSplatBuildVector(MVT::v16i8, DL,
DAG.getConstant(0, DL, MVT::i32));
Op = DAG.getNode(SystemZISD::VSUM, DL, MVT::v4i32, Op, Tmp);
Op = DAG.getNode(SystemZISD::VSUM, DL, VT, Op, Tmp);
break;
}
default:
llvm_unreachable("Unexpected type");
}
return Op;
}
// Get the known-zero mask for the operand.
KnownBits Known = DAG.computeKnownBits(Op);
// Number of low bits that are not all known to be zero.
unsigned NumSignificantBits = (~Known.Zero).getActiveBits();
if (NumSignificantBits == 0)
return DAG.getConstant(0, DL, VT);
// Skip known-zero high parts of the operand.
// BitSize is NumSignificantBits rounded up to a power of two, capped at
// the width of VT.
int64_t OrigBitSize = VT.getSizeInBits();
int64_t BitSize = (int64_t)1 << Log2_32_Ceil(NumSignificantBits);
BitSize = std::min(BitSize, OrigBitSize);
// The POPCNT instruction counts the number of bits in each byte.
// It is applied to an i64 any-extension of the operand and the result is
// truncated back to VT.
Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op);
Op = DAG.getNode(SystemZISD::POPCNT, DL, MVT::i64, Op);
Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
// Add up per-byte counts in a binary tree. All bits of Op at
// position larger than BitSize remain zero throughout.
for (int64_t I = BitSize / 2; I >= 8; I = I / 2) {
SDValue Tmp = DAG.getNode(ISD::SHL, DL, VT, Op, DAG.getConstant(I, DL, VT));
// When only part of the value is in use, mask the shifted copy back down
// to the low BitSize bits.
if (BitSize != OrigBitSize)
Tmp = DAG.getNode(ISD::AND, DL, VT, Tmp,
DAG.getConstant(((uint64_t)1 << BitSize) - 1, DL, VT));
Op = DAG.getNode(ISD::ADD, DL, VT, Op, Tmp);
}
// Extract overall result from high byte.
if (BitSize > 8)
Op = DAG.getNode(ISD::SRL, DL, VT, Op,
DAG.getConstant(BitSize - 8, DL, VT));
return Op;
}
// Lower ATOMIC_FENCE (operands: chain, ordering, synchronization scope).
SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Chain = Op.getOperand(0);
  AtomicOrdering Ordering = static_cast<AtomicOrdering>(
      cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
  SyncScope::ID Scope = static_cast<SyncScope::ID>(
      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());

  // The only fence that needs an instruction is a sequentially-consistent
  // cross-thread fence. Everything else becomes MEMBARRIER, a compiler
  // barrier that codegens to a no-op.
  const bool NeedsSerialize =
      Ordering == AtomicOrdering::SequentiallyConsistent &&
      Scope == SyncScope::System;
  if (!NeedsSerialize)
    return DAG.getNode(SystemZISD::MEMBARRIER, DL, MVT::Other, Chain);

  SDNode *Serialize =
      DAG.getMachineNode(SystemZ::Serialize, DL, MVT::Other, Chain);
  return SDValue(Serialize, 0);
}
// Op is an atomic load. Replace it with an ordinary extending load (EXTLOAD)
// that reuses the atomic node's chain, address, memory type and memory
// operand.
SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
                                                SelectionDAG &DAG) const {
  auto *Atomic = cast<AtomicSDNode>(Op.getNode());
  SDLoc DL(Op);
  EVT ResultVT = Op.getValueType();
  return DAG.getExtLoad(ISD::EXTLOAD, DL, ResultVT, Atomic->getChain(),
                        Atomic->getBasePtr(), Atomic->getMemoryVT(),
                        Atomic->getMemOperand());
}
// Op is an atomic store. Replace it with an ordinary truncating store that
// reuses the atomic node's memory operand, followed by a serialization when
// the store is sequentially consistent.
SDValue SystemZTargetLowering::lowerATOMIC_STORE(SDValue Op,
                                                 SelectionDAG &DAG) const {
  auto *Atomic = cast<AtomicSDNode>(Op.getNode());
  SDLoc DL(Op);
  SDValue Chain = DAG.getTruncStore(Atomic->getChain(), DL, Atomic->getVal(),
                                    Atomic->getBasePtr(),
                                    Atomic->getMemoryVT(),
                                    Atomic->getMemOperand());

  // Weaker orderings need nothing beyond the store itself.
  if (Atomic->getOrdering() != AtomicOrdering::SequentiallyConsistent)
    return Chain;

  // Sequential consistency is enforced by performing a serialization
  // operation after the store.
  SDNode *Serialize =
      DAG.getMachineNode(SystemZ::Serialize, DL, MVT::Other, Chain);
  return SDValue(Serialize, 0);
}
// Op is an 8-, 16-bit or 32-bit ATOMIC_LOAD_* operation. Lower the first
// two into the fullword ATOMIC_LOADW_* operation given by Opcode.
//
// A 32-bit operation is returned unchanged. For a narrower operation the
// result is a MERGE_VALUES of (a) the ATOMIC_LOADW_* result rotated so that
// the field sits in the low bits of an i32 and (b) the new node's chain.
SDValue SystemZTargetLowering::lowerATOMIC_LOAD_OP(SDValue Op,
SelectionDAG &DAG,
unsigned Opcode) const {
auto *Node = cast<AtomicSDNode>(Op.getNode());
// 32-bit operations need no code outside the main loop.
EVT NarrowVT = Node->getMemoryVT();
EVT WideVT = MVT::i32;
if (NarrowVT == WideVT)
return Op;
// BitSize is the width in bits of the narrow memory type.
int64_t BitSize = NarrowVT.getSizeInBits();
SDValue ChainIn = Node->getChain();
SDValue Addr = Node->getBasePtr();
SDValue Src2 = Node->getVal();
MachineMemOperand *MMO = Node->getMemOperand();
SDLoc DL(Node);
EVT PtrVT = Addr.getValueType();
// Convert atomic subtracts of constants into additions.
// The opcode is switched to ATOMIC_LOADW_ADD and the constant is negated.
if (Opcode == SystemZISD::ATOMIC_LOADW_SUB)
if (auto *Const = dyn_cast<ConstantSDNode>(Src2)) {
Opcode = SystemZISD::ATOMIC_LOADW_ADD;
Src2 = DAG.getConstant(-Const->getSExtValue(), DL, Src2.getValueType());
}
// Get the address of the containing word.
// (Addr & -4 clears the two low address bits.)
SDValue AlignedAddr = DAG.getNode(ISD::AND, DL, PtrVT, Addr,
DAG.getConstant(-4, DL, PtrVT));
// Get the number of bits that the word must be rotated left in order
// to bring the field to the top bits of a GR32.
// This is Addr << 3 (byte address to bit count), truncated to WideVT.
SDValue BitShift = DAG.getNode(ISD::SHL, DL, PtrVT, Addr,
DAG.getConstant(3, DL, PtrVT));
BitShift = DAG.getNode(ISD::TRUNCATE, DL, WideVT, BitShift);
// Get the complementing shift amount, for rotating a field in the top
// bits back to its proper position.
SDValue NegBitShift = DAG.getNode(ISD::SUB, DL, WideVT,
DAG.getConstant(0, DL, WideVT), BitShift);
// Extend the source operand to 32 bits and prepare it for the inner loop.
// ATOMIC_SWAPW uses RISBG to rotate the field left, but all other
// operations require the source to be shifted in advance. (This shift
// can be folded if the source is constant.) For AND and NAND, the lower
// bits must be set, while for other opcodes they should be left clear.
if (Opcode != SystemZISD::ATOMIC_SWAPW)
Src2 = DAG.getNode(ISD::SHL, DL, WideVT, Src2,
DAG.getConstant(32 - BitSize, DL, WideVT));
// uint32_t(-1) >> BitSize is a mask of the 32 - BitSize low bits.
if (Opcode == SystemZISD::ATOMIC_LOADW_AND ||
Opcode == SystemZISD::ATOMIC_LOADW_NAND)
Src2 = DAG.getNode(ISD::OR, DL, WideVT, Src2,
DAG.getConstant(uint32_t(-1) >> BitSize, DL, WideVT));
// Construct the ATOMIC_LOADW_* node.
// Its operands are: chain, aligned address, prepared source, the two
// rotate amounts and the field width. It produces a WideVT value and a
// chain, and keeps the original narrow memory type and memory operand.
SDVTList VTList = DAG.getVTList(WideVT, MVT::Other);
SDValue Ops[] = { ChainIn, AlignedAddr, Src2, BitShift, NegBitShift,
DAG.getConstant(BitSize, DL, WideVT) };
SDValue AtomicOp = DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops,
NarrowVT, MMO);
// Rotate the result of the final CS so that the field is in the lower
// bits of a GR32, then truncate it.
// NOTE(review): no explicit TRUNCATE node is created here; the rotated
// WideVT value is returned as-is.
SDValue ResultShift = DAG.getNode(ISD::ADD, DL, WideVT, BitShift,
DAG.getConstant(BitSize, DL, WideVT));
SDValue Result = DAG.getNode(ISD::ROTL, DL, WideVT, AtomicOp, ResultShift);
SDValue RetOps[2] = { Result, AtomicOp.getValue(1) };
return DAG.getMergeValues(RetOps, DL);
}
// Op is an ATOMIC_LOAD_SUB operation. 8- and 16-bit operations are handed
// to lowerATOMIC_LOAD_OP as ATOMIC_LOADW_SUBs. 32- and 64-bit operations
// are rewritten as an ATOMIC_LOAD_ADD of the negated operand when that is
// worthwhile, and are otherwise returned unchanged.
SDValue SystemZTargetLowering::lowerATOMIC_LOAD_SUB(SDValue Op,
                                                    SelectionDAG &DAG) const {
  auto *AtomicNode = cast<AtomicSDNode>(Op.getNode());
  EVT MemVT = AtomicNode->getMemoryVT();

  // Anything other than a full-width operation goes through the
  // fullword helper.
  if (!(MemVT == MVT::i32 || MemVT == MVT::i64))
    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_SUB);

  assert(Op.getValueType() == MemVT && "Mismatched VTs");
  SDValue Subtrahend = AtomicNode->getVal();
  SDLoc DL(Subtrahend);

  SDValue Negated;
  if (auto *ConstOp = dyn_cast<ConstantSDNode>(Subtrahend)) {
    // Use an addition if the operand is constant and either LAA(G) is
    // available or the negative value is in the range of A(G)FHI.
    int64_t NegValue = (-ConstOp->getAPIntValue()).getSExtValue();
    if (isInt<32>(NegValue) || Subtarget.hasInterlockedAccess1())
      Negated = DAG.getConstant(NegValue, DL, MemVT);
  } else if (Subtarget.hasInterlockedAccess1()) {
    // Use LAA(G) if available: compute 0 - operand at run time.
    Negated = DAG.getNode(ISD::SUB, DL, MemVT, DAG.getConstant(0, DL, MemVT),
                          Subtrahend);
  }

  // No profitable negation was found: use the node as-is.
  if (!Negated.getNode())
    return Op;

  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, MemVT,
                       AtomicNode->getChain(), AtomicNode->getBasePtr(),
                       Negated, AtomicNode->getMemOperand());
}
// Lower 8/16/32/64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS node.
//
// Op has three results: value 0 (the value produced by the compare and
// swap), value 1 (the success flag) and value 2 (the chain). This function
// rewires the users of all three results itself with
// ReplaceAllUsesOfValueWith and therefore always returns a null SDValue.
SDValue SystemZTargetLowering::lowerATOMIC_CMP_SWAP(SDValue Op,
SelectionDAG &DAG) const {
auto *Node = cast<AtomicSDNode>(Op.getNode());
// Operands: 0 = chain, 1 = address, 2 = compare value, 3 = swap value.
SDValue ChainIn = Node->getOperand(0);
SDValue Addr = Node->getOperand(1);
SDValue CmpVal = Node->getOperand(2);
SDValue SwapVal = Node->getOperand(3);
MachineMemOperand *MMO = Node->getMemOperand();
SDLoc DL(Node);
// We have native support for 32-bit and 64-bit compare and swap, but we
// still need to expand extracting the "success" result from the CC.
EVT NarrowVT = Node->getMemoryVT();
// WideVT is i64 for 64-bit accesses and i32 for everything else, so the
// test below is true exactly for 32-bit and 64-bit memory types.
EVT WideVT = NarrowVT == MVT::i64 ? MVT::i64 : MVT::i32;
if (NarrowVT == WideVT) {
// The ATOMIC_CMP_SWAP node yields (value, CC as i32, chain).
SDVTList Tys = DAG.getVTList(WideVT, MVT::i32, MVT::Other);
SDValue Ops[] = { ChainIn, Addr, CmpVal, SwapVal };
SDValue AtomicOp = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_CMP_SWAP,
DL, Tys, Ops, NarrowVT, MMO);
// Turn the CC result into the success flag.
SDValue Success = emitSETCC(DAG, DL, AtomicOp.getValue(1),
SystemZ::CCMASK_CS, SystemZ::CCMASK_CS_EQ);
DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), AtomicOp.getValue(0));
DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), AtomicOp.getValue(2));
return SDValue();
}
// Convert 8-bit and 16-bit compare and swap to a loop, implemented
// via a fullword ATOMIC_CMP_SWAPW operation.
int64_t BitSize = NarrowVT.getSizeInBits();
EVT PtrVT = Addr.getValueType();
// Get the address of the containing word.
// (Addr & -4 clears the two low address bits.)
SDValue AlignedAddr = DAG.getNode(ISD::AND, DL, PtrVT, Addr,
DAG.getConstant(-4, DL, PtrVT));
// Get the number of bits that the word must be rotated left in order
// to bring the field to the top bits of a GR32.
// This is Addr << 3 (byte address to bit count), truncated to WideVT.
SDValue BitShift = DAG.getNode(ISD::SHL, DL, PtrVT, Addr,
DAG.getConstant(3, DL, PtrVT));
BitShift = DAG.getNode(ISD::TRUNCATE, DL, WideVT, BitShift);
// Get the complementing shift amount, for rotating a field in the top
// bits back to its proper position.
SDValue NegBitShift = DAG.getNode(ISD::SUB, DL, WideVT,
DAG.getConstant(0, DL, WideVT), BitShift);
// Construct the ATOMIC_CMP_SWAPW node.
// Like the full-width node it yields (value, CC as i32, chain); it also
// takes the two rotate amounts and the field width as extra operands.
SDVTList VTList = DAG.getVTList(WideVT, MVT::i32, MVT::Other);
SDValue Ops[] = { ChainIn, AlignedAddr, CmpVal, SwapVal, BitShift,
NegBitShift, DAG.getConstant(BitSize, DL, WideVT) };
SDValue AtomicOp = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_CMP_SWAPW, DL,
VTList, Ops, NarrowVT, MMO);
// The narrow form tests the CC with the integer-compare masks rather than
// the CS masks used by the full-width form above.
SDValue Success = emitSETCC(DAG, DL, AtomicOp.getValue(1),
SystemZ::CCMASK_ICMP, SystemZ::CCMASK_CMP_EQ);
DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), AtomicOp.getValue(0));
DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), AtomicOp.getValue(2));
return SDValue();
}
// Return the extra memory-operand flags to use for instruction I: MOVolatile
// for atomic loads, stores, read-modify-writes and compare-exchanges, and
// MONone for everything else.
MachineMemOperand::Flags
SystemZTargetLowering::getMMOFlags(const Instruction &I) const {
  // Because of how we convert atomic_load and atomic_store to normal loads and
  // stores in the DAG, we need to ensure that the MMOs are marked volatile
  // since DAGCombine hasn't been updated to account for atomic, but non
  // volatile loads. (See D57601)
  bool MarkVolatile = false;
  if (auto *Store = dyn_cast<StoreInst>(&I))
    MarkVolatile = Store->isAtomic();
  else if (auto *Load = dyn_cast<LoadInst>(&I))
    MarkVolatile = Load->isAtomic();
  else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I))
    MarkVolatile = RMW->isAtomic();
  else if (auto *CmpXchg = dyn_cast<AtomicCmpXchgInst>(&I))
    MarkVolatile = CmpXchg->isAtomic();

  if (MarkVolatile)
    return MachineMemOperand::MOVolatile;
  return MachineMemOperand::MONone;
}
// Lower STACKSAVE: record that the function manipulates the stack pointer
// and read the current value of R15D on the incoming chain.
SDValue SystemZTargetLowering::lowerSTACKSAVE(SDValue Op,
                                              SelectionDAG &DAG) const {
  auto *FuncInfo =
      DAG.getMachineFunction().getInfo<SystemZMachineFunctionInfo>();
  FuncInfo->setManipulatesSP(true);

  SDLoc DL(Op);
  SDValue Chain = Op.getOperand(0);
  EVT ResultVT = Op.getValueType();
  return DAG.getCopyFromReg(Chain, DL, SystemZ::R15D, ResultVT);
}
// Lower STACKRESTORE: record that the function manipulates the stack pointer
// and copy the new stack pointer (operand 1) into R15D. When the function
// has the "backchain" attribute, the doubleword found at the old stack
// pointer is loaded first and stored at the new stack pointer afterwards.
// Returns the resulting chain.
SDValue SystemZTargetLowering::lowerSTACKRESTORE(SDValue Op,
                                                 SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MF.getInfo<SystemZMachineFunctionInfo>()->setManipulatesSP(true);
  const bool KeepBackchain = MF.getFunction().hasFnAttribute("backchain");

  SDLoc DL(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue NewSP = Op.getOperand(1);

  // Fetch the backchain value through the old stack pointer before R15D is
  // overwritten.
  SDValue Backchain;
  if (KeepBackchain) {
    SDValue OldSP = DAG.getCopyFromReg(Chain, DL, SystemZ::R15D, MVT::i64);
    Backchain = DAG.getLoad(MVT::i64, DL, Chain, OldSP, MachinePointerInfo());
  }

  Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R15D, NewSP);
  if (!KeepBackchain)
    return Chain;

  return DAG.getStore(Chain, DL, Backchain, NewSP, MachinePointerInfo());
}
// Lower a PREFETCH node. Operand 0 is the chain, operand 1 the address,
// operand 2 the is-write flag and operand 4 the is-data flag. Prefetches
// whose is-data flag is zero are dropped; the rest become a
// SystemZISD::PREFETCH with a PFD_WRITE or PFD_READ code.
SDValue SystemZTargetLowering::lowerPREFETCH(SDValue Op,
                                             SelectionDAG &DAG) const {
  auto FlagOperand = [&Op](unsigned Idx) -> bool {
    return cast<ConstantSDNode>(Op.getOperand(Idx))->getZExtValue();
  };

  // Just preserve the chain when this is not a data prefetch.
  if (!FlagOperand(4))
    return Op.getOperand(0);

  SDLoc DL(Op);
  unsigned Code = SystemZ::PFD_READ;
  if (FlagOperand(2))
    Code = SystemZ::PFD_WRITE;

  auto *MemNode = cast<MemIntrinsicSDNode>(Op.getNode());
  SDValue PrefetchOps[] = {
    Op.getOperand(0),
    DAG.getConstant(Code, DL, MVT::i32),
    Op.getOperand(1)
  };
  return DAG.getMemIntrinsicNode(SystemZISD::PREFETCH, DL,
                                 MemNode->getVTList(), PrefetchOps,
                                 MemNode->getMemoryVT(),
                                 MemNode->getMemOperand());
}
// Materialize the condition code held in CCReg as an i32: emit an IPM node
// and shift its result right by SystemZ::IPM_CC.
static SDValue getCCResult(SelectionDAG &DAG, SDValue CCReg) {
  SDLoc DL(CCReg);
  SDValue Inserted = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, CCReg);
  SDValue ShiftAmount = DAG.getConstant(SystemZ::IPM_CC, DL, MVT::i32);
  return DAG.getNode(ISD::SRL, DL, MVT::i32, Inserted, ShiftAmount);
}
// Lower an INTRINSIC_W_CHAIN node. Intrinsics that produce a condition code
// and a chain are re-emitted with the target opcode and the users of their
// CC result are redirected to an i32 materialization of the CC. A null
// SDValue is returned in every case.
SDValue
SystemZTargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op,
                                              SelectionDAG &DAG) const {
  unsigned Opcode, CCValid;
  if (!isIntrinsicWithCCAndChain(Op, Opcode, CCValid))
    return SDValue();

  assert(Op->getNumValues() == 2 && "Expected only CC result and chain");
  SDNode *NewNode = emitIntrinsicWithCCAndChain(DAG, Op, Opcode);
  SDValue CCValue = getCCResult(DAG, SDValue(NewNode, 0));
  DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), CCValue);
  return SDValue();
}
// Lower an INTRINSIC_WO_CHAIN node. Intrinsics that produce a condition
// code are expanded through emitIntrinsicWithCC. A fixed set of other
// intrinsics is mapped directly onto SystemZISD nodes. A null SDValue is
// returned for any intrinsic that is not handled here.
SDValue
SystemZTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
                                               SelectionDAG &DAG) const {
  unsigned Opcode, CCValid;
  if (isIntrinsicWithCC(Op, Opcode, CCValid)) {
    SDNode *NewNode = emitIntrinsicWithCC(DAG, Op, Opcode);
    // A single result means the intrinsic returns only the CC.
    if (Op->getNumValues() == 1)
      return getCCResult(DAG, SDValue(NewNode, 0));
    assert(Op->getNumValues() == 2 && "Expected a CC and non-CC result");
    return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), Op->getVTList(),
                       SDValue(NewNode, 0),
                       getCCResult(DAG, SDValue(NewNode, 1)));
  }

  // Operand 0 holds the intrinsic ID. Pick the replacement opcode and the
  // number of intrinsic arguments (operands 1..N) that it consumes.
  const unsigned IntrinsicID =
      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  unsigned NewOpcode = 0;
  unsigned NumArgs = 0;
  switch (IntrinsicID) {
  case Intrinsic::thread_pointer:
    return lowerThreadPointer(SDLoc(Op), DAG);

  case Intrinsic::s390_vpdi:
    NewOpcode = SystemZISD::PERMUTE_DWORDS;
    NumArgs = 3;
    break;

  case Intrinsic::s390_vperm:
    NewOpcode = SystemZISD::PERMUTE;
    NumArgs = 3;
    break;

  case Intrinsic::s390_vuphb:
  case Intrinsic::s390_vuphh:
  case Intrinsic::s390_vuphf:
    NewOpcode = SystemZISD::UNPACK_HIGH;
    NumArgs = 1;
    break;

  case Intrinsic::s390_vuplhb:
  case Intrinsic::s390_vuplhh:
  case Intrinsic::s390_vuplhf:
    NewOpcode = SystemZISD::UNPACKL_HIGH;
    NumArgs = 1;
    break;

  case Intrinsic::s390_vuplb:
  case Intrinsic::s390_vuplhw:
  case Intrinsic::s390_vuplf:
    NewOpcode = SystemZISD::UNPACK_LOW;
    NumArgs = 1;
    break;

  case Intrinsic::s390_vupllb:
  case Intrinsic::s390_vupllh:
  case Intrinsic::s390_vupllf:
    NewOpcode = SystemZISD::UNPACKL_LOW;
    NumArgs = 1;
    break;

  case Intrinsic::s390_vsumb:
  case Intrinsic::s390_vsumh:
  case Intrinsic::s390_vsumgh:
  case Intrinsic::s390_vsumgf:
  case Intrinsic::s390_vsumqf:
  case Intrinsic::s390_vsumqg:
    NewOpcode = SystemZISD::VSUM;
    NumArgs = 2;
    break;

  default:
    return SDValue();
  }

  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  if (NumArgs == 1)
    return DAG.getNode(NewOpcode, DL, VT, Op.getOperand(1));
  if (NumArgs == 2)
    return DAG.getNode(NewOpcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
  return DAG.getNode(NewOpcode, DL, VT, Op.getOperand(1), Op.getOperand(2),
                     Op.getOperand(3));
}
namespace {
// Says that SystemZISD operation Opcode can be used to perform the equivalent
// of a VPERM with permute vector Bytes. If Opcode takes three operands,
// Operand is the constant third operand, otherwise it is the number of
// bytes in each element of the result.
struct Permute {
// The SystemZISD opcode that implements this permutation.
unsigned Opcode;
// Either the constant third operand or the result element size in bytes,
// depending on Opcode (see above).
unsigned Operand;
// The equivalent VPERM selector for each result byte. A selector is
// OperandNo * SystemZ::VectorBytes + ByteNo.
unsigned char Bytes[SystemZ::VectorBytes];
};
}
// The fixed permutations that the matchPermute and matchDoublePermute
// searches try, in table order; the first entry that matches wins.
// Each row is { Opcode, Operand, Bytes }, where every byte selector is
// OperandNo * SystemZ::VectorBytes + ByteNo.
static const Permute PermuteForms[] = {
// VMRHG
{ SystemZISD::MERGE_HIGH, 8,
{ 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 } },
// VMRHF
{ SystemZISD::MERGE_HIGH, 4,
{ 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 } },
// VMRHH
{ SystemZISD::MERGE_HIGH, 2,
{ 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23 } },
// VMRHB
{ SystemZISD::MERGE_HIGH, 1,
{ 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 } },
// VMRLG
{ SystemZISD::MERGE_LOW, 8,
{ 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 } },
// VMRLF
{ SystemZISD::MERGE_LOW, 4,
{ 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 } },
// VMRLH
{ SystemZISD::MERGE_LOW, 2,
{ 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 } },
// VMRLB
{ SystemZISD::MERGE_LOW, 1,
{ 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 } },
// VPKG
{ SystemZISD::PACK, 4,
{ 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 } },
// VPKF
{ SystemZISD::PACK, 2,
{ 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 } },
// VPKH
{ SystemZISD::PACK, 1,
{ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 } },
// VPDI V1, V2, 4 (low half of V1, high half of V2)
{ SystemZISD::PERMUTE_DWORDS, 4,
{ 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 } },
// VPDI V1, V2, 1 (high half of V1, low half of V2)
{ SystemZISD::PERMUTE_DWORDS, 1,
{ 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 } }
};
// Finalize the operand assignment after a shuffle has been matched against
// a two-operand pattern. OpNos[N] is the shuffle operand that must feed
// pattern operand N, or -1 if pattern operand N is unconstrained.
//
// On success, OpNo0 and OpNo1 receive the shuffle operands to use for
// pattern operands 0 and 1; an unconstrained slot simply reuses the operand
// chosen for the other slot. Returns false, leaving OpNo0 and OpNo1
// untouched, when neither slot is constrained.
static bool chooseShuffleOpNos(int *OpNos, unsigned &OpNo0, unsigned &OpNo1) {
  const bool Free0 = OpNos[0] < 0;
  const bool Free1 = OpNos[1] < 0;
  if (Free0 && Free1)
    return false;

  OpNo0 = Free0 ? OpNos[1] : OpNos[0];
  OpNo1 = Free1 ? OpNos[0] : OpNos[1];
  return true;
}
// Bytes is a VPERM-like permute vector, except that -1 is used for
// undefined bytes. Return true if the VPERM can be implemented using P.
// When returning true set OpNo0 to the VPERM operand that should be
// used for operand 0 of P and likewise OpNo1 for operand 1 of P.
//
// For example, if swapping the VPERM operands allows P to match, OpNo0
// will be 1 and OpNo1 will be 0. If instead Bytes only refers to one
// operand, but rewriting it to use two duplicated operands allows it to
// match P, then OpNo0 and OpNo1 will be the same.
static bool matchPermute(const SmallVectorImpl<int> &Bytes, const Permute &P,
unsigned &OpNo0, unsigned &OpNo1) {
int OpNos[] = { -1, -1 };
for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
int Elt = Bytes[I];
if (Elt >= 0) {
// Make sure that the two permute vectors use the same suboperand
// byte number. Only the operand numbers (the high bits) are
// allowed to differ.
if ((Elt ^ P.Bytes[I]) & (SystemZ::VectorBytes - 1))
return false;
int ModelOpNo = P.Bytes[I] / SystemZ::VectorBytes;
int RealOpNo = unsigned(Elt) / SystemZ::VectorBytes;
// Make sure that the operand mappings are consistent with previous
// elements.
if (OpNos[ModelOpNo] == 1 - RealOpNo)
return false;
OpNos[ModelOpNo] = RealOpNo;
}
}
return chooseShuffleOpNos(OpNos, OpNo0, OpNo1);
}
// Search PermuteForms, in table order, for the first form that can implement
// the VPERM-like permute vector Bytes. Returns that form with OpNo0 and
// OpNo1 set to the operands to use, or null if no form matches.
static const Permute *matchPermute(const SmallVectorImpl<int> &Bytes,
                                   unsigned &OpNo0, unsigned &OpNo1) {
  for (const Permute *Form = std::begin(PermuteForms),
                     *End = std::end(PermuteForms);
       Form != End; ++Form) {
    if (matchPermute(Bytes, *Form, OpNo0, OpNo1))
      return Form;
  }
  return nullptr;
}
// Bytes is a VPERM-like permute vector in which -1 marks an undefined byte.
// This permute is an operand of an outer permute. See whether
// redistributing the -1 bytes gives a shuffle that can be implemented
// using P. If so, set Transform to a VPERM-like permute vector that, when
// applied to the result of P, gives the original permute in Bytes.
//
// The defined bytes of Bytes must appear in P.Bytes in the same relative
// order: the scan position in P only ever moves forward. Transform may
// have been partially written when false is returned.
static bool matchDoublePermute(const SmallVectorImpl<int> &Bytes,
                               const Permute &P,
                               SmallVectorImpl<int> &Transform) {
  unsigned To = 0;
  for (unsigned From = 0; From < SystemZ::VectorBytes; ++From) {
    int Wanted = Bytes[From];
    if (Wanted < 0) {
      // Byte number From of the result is undefined.
      Transform[From] = -1;
      continue;
    }
    // Advance through P until the wanted selector is found, giving up
    // once the end of P is reached.
    while (P.Bytes[To] != Wanted)
      if (++To == SystemZ::VectorBytes)
        return false;
    Transform[From] = To;
  }
  return true;
}
// Search PermuteForms, in table order, for the first form that can implement
// Bytes once its undefined bytes are redistributed. Returns that form with
// Transform filled in, or null if no form matches.
static const Permute *matchDoublePermute(const SmallVectorImpl<int> &Bytes,
                                         SmallVectorImpl<int> &Transform) {
  for (const Permute *Form = std::begin(PermuteForms),
                     *End = std::end(PermuteForms);
       Form != End; ++Form) {
    if (matchDoublePermute(Bytes, *Form, Transform))
      return Form;
  }
  return nullptr;
}
// Convert the mask of the given shuffle op into a byte-level mask, as if
// the op had type vNi8. Handles VECTOR_SHUFFLE nodes and SystemZISD::SPLAT
// nodes with a constant element index; returns false for anything else.
// Bytes is grown to the vector size in bytes, with new entries set to -1.
static bool getVPermMask(SDValue ShuffleOp,
                         SmallVectorImpl<int> &Bytes) {
  EVT VT = ShuffleOp.getValueType();
  const unsigned BytesPerElement = VT.getVectorElementType().getStoreSize();
  const unsigned NumBytes = VT.getVectorNumElements() * BytesPerElement;

  if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(ShuffleOp)) {
    Bytes.resize(NumBytes, -1);
    for (unsigned B = 0; B < NumBytes; ++B) {
      // Bytes of an undefined element are left as they are.
      int SrcElt = Shuffle->getMaskElt(B / BytesPerElement);
      if (SrcElt >= 0)
        Bytes[B] = SrcElt * BytesPerElement + B % BytesPerElement;
    }
    return true;
  }

  const bool IsConstantSplat =
      SystemZISD::SPLAT == ShuffleOp.getOpcode() &&
      isa<ConstantSDNode>(ShuffleOp.getOperand(1));
  if (!IsConstantSplat)
    return false;

  // Every result element repeats the bytes of the splatted element.
  const unsigned SplatElt = ShuffleOp.getConstantOperandVal(1);
  Bytes.resize(NumBytes, -1);
  for (unsigned B = 0; B < NumBytes; ++B)
    Bytes[B] = SplatElt * BytesPerElement + B % BytesPerElement;
  return true;
}
// Bytes is a VPERM-like permute vector in which -1 marks an undefined byte.
// See whether bytes [Start, Start + BytesPerElement) of the result come
// from a contiguous sequence of bytes from one input. If so, return true
// and set Base to the selector for the first byte; Base stays -1 when all
// of those bytes are undefined.
static bool getShuffleInput(const SmallVectorImpl<int> &Bytes, unsigned Start,
                            unsigned BytesPerElement, int &Base) {
  Base = -1;
  for (unsigned Offset = 0; Offset != BytesPerElement; ++Offset) {
    int Selector = Bytes[Start + Offset];
    if (Selector < 0)
      continue;

    // The selector that byte 0 of the element would need for this byte to
    // be part of a contiguous run.
    unsigned Implied = unsigned(Selector) - Offset;
    if (Base >= 0) {
      if (unsigned(Base) != Implied)
        return false;
      continue;
    }

    Base = Implied;
    // Make sure the bytes would come from one input operand.
    if (unsigned(Base) % Bytes.size() + BytesPerElement > Bytes.size())
      return false;
  }
  return true;
}
// Bytes is a VPERM-like permute vector in which -1 marks an undefined byte.
// Return true if it can be performed using VSLDI. When returning true, set
// StartIndex to the shift amount and OpNo0 and OpNo1 to the VPERM operands
// that should be used as the first and second shift operand respectively.
static bool isShlDoublePermute(const SmallVectorImpl<int> &Bytes,
                               unsigned &StartIndex, unsigned &OpNo0,
                               unsigned &OpNo1) {
  int OpNos[] = { -1, -1 };
  int Shift = -1;
  for (unsigned I = 0; I < 16; ++I) {
    int Index = Bytes[I];
    if (Index < 0)
      continue;

    int ExpectedShift = (Index - I) % SystemZ::VectorBytes;
    int ModelOpNo = unsigned(ExpectedShift + I) / SystemZ::VectorBytes;
    int RealOpNo = unsigned(Index) / SystemZ::VectorBytes;

    // Every defined byte has to imply the same shift amount.
    if (Shift >= 0 && Shift != ExpectedShift)
      return false;
    Shift = ExpectedShift;

    // Reject a mapping that contradicts the one recorded for an earlier
    // byte.
    if (OpNos[ModelOpNo] == 1 - RealOpNo)
      return false;
    OpNos[ModelOpNo] = RealOpNo;
  }
  StartIndex = Shift;
  return chooseShuffleOpNos(OpNos, OpNo0, OpNo1);
}
// Create a node that performs P on operands Op0 and Op1, bitcasting both
// operands to the input type that P expects. The type of the result is
// determined by P.
static SDValue getPermuteNode(SelectionDAG &DAG, const SDLoc &DL,
                              const Permute &P, SDValue Op0, SDValue Op1) {
  // Work out the input element size. VPDI (PERMUTE_DWORDS) always operates
  // on v2i64s. The input elements of a PACK are twice as wide as the
  // outputs. Everything else uses P.Operand as the element size.
  unsigned InBytes = P.Operand;
  if (P.Opcode == SystemZISD::PERMUTE_DWORDS)
    InBytes = 8;
  else if (P.Opcode == SystemZISD::PACK)
    InBytes = P.Operand * 2;

  // Cast both operands to the appropriate type.
  MVT InVT = MVT::getVectorVT(MVT::getIntegerVT(InBytes * 8),
                              SystemZ::VectorBytes / InBytes);
  SDValue In0 = DAG.getNode(ISD::BITCAST, DL, InVT, Op0);
  SDValue In1 = DAG.getNode(ISD::BITCAST, DL, InVT, Op1);

  switch (P.Opcode) {
  case SystemZISD::PERMUTE_DWORDS: {
    // P.Operand is the constant third operand.
    SDValue Selector = DAG.getConstant(P.Operand, DL, MVT::i32);
    return DAG.getNode(SystemZISD::PERMUTE_DWORDS, DL, InVT, In0, In1,
                       Selector);
  }
  case SystemZISD::PACK: {
    // The result has elements of P.Operand bytes.
    MVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(P.Operand * 8),
                                 SystemZ::VectorBytes / P.Operand);
    return DAG.getNode(SystemZISD::PACK, DL, OutVT, In0, In1);
  }
  default:
    return DAG.getNode(P.Opcode, DL, InVT, In0, In1);
  }
}
// Bytes is a VPERM-like permute vector in which -1 marks an undefined byte.
// Implement it on operands Ops[0] and Ops[1] using VSLDI or VPERM. Both
// entries of Ops are overwritten in place with v16i8 bitcasts of themselves.
static SDValue getGeneralPermuteNode(SelectionDAG &DAG, const SDLoc &DL,
                                     SDValue *Ops,
                                     const SmallVectorImpl<int> &Bytes) {
  Ops[0] = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Ops[0]);
  Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Ops[1]);

  // First see whether VSLDI can be used.
  unsigned ShiftBytes, First, Second;
  if (isShlDoublePermute(Bytes, ShiftBytes, First, Second)) {
    SDValue Amount = DAG.getConstant(ShiftBytes, DL, MVT::i32);
    return DAG.getNode(SystemZISD::SHL_DOUBLE, DL, MVT::v16i8, Ops[First],
                       Ops[Second], Amount);
  }

  // Fall back on VPERM. Construct an SDNode for the permute vector, using
  // UNDEF for the undefined bytes.
  SDValue Selectors[SystemZ::VectorBytes];
  for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
    const int Selector = Bytes[I];
    Selectors[I] = Selector >= 0 ? DAG.getConstant(Selector, DL, MVT::i32)
                                 : DAG.getUNDEF(MVT::i32);
  }
  SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, Selectors);
  return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Ops[0], Ops[1], Mask);
}
namespace {
// Describes a general N-operand vector shuffle.
//
// The shuffle is built up one result element at a time with add() and
// addUndef(), and turned into DAG nodes with getNode().
struct GeneralShuffle {
// Start an empty shuffle whose result has type vt.
GeneralShuffle(EVT vt) : VT(vt) {}
// Append one undefined result element.
void addUndef();
// Append one result element taken from the given element of the given
// operand; returns false if the element cannot be added.
bool add(SDValue, unsigned);
// Build the DAG nodes for the shuffle; all result bytes must have been
// added by then.
SDValue getNode(SelectionDAG &, const SDLoc &);
// The operands of the shuffle.
SmallVector<SDValue, SystemZ::VectorBytes> Ops;
// Index I is -1 if byte I of the result is undefined. Otherwise the
// result comes from byte Bytes[I] % SystemZ::VectorBytes of operand
// Bytes[I] / SystemZ::VectorBytes.
SmallVector<int, SystemZ::VectorBytes> Bytes;
// The type of the shuffle result.
EVT VT;
};
}
// Add an extra undefined element to the shuffle, i.e. one -1 entry in Bytes
// for every byte of a result element.
void GeneralShuffle::addUndef() {
  const unsigned ElementBytes = VT.getVectorElementType().getStoreSize();
  Bytes.append(ElementBytes, -1);
}
// Add an extra element to the shuffle, taking it from element Elem of Op.
// A null Op indicates a vector input whose value will be calculated later;
// there is at most one such input per shuffle and it always has the same
// type as the result. Aborts and returns false if the source vector elements
// of an EXTRACT_VECTOR_ELT are smaller than the destination elements. Per
// LLVM they become implicitly extended, but this is rare and not optimized.
//
// On success, one selector per byte of the new element is appended to Bytes
// and the source operand is appended to Ops if it is not already there.
bool GeneralShuffle::add(SDValue Op, unsigned Elem) {
unsigned BytesPerElement = VT.getVectorElementType().getStoreSize();
// The source vector can have wider elements than the result,
// either through an explicit TRUNCATE or because of type legalization.
// We want the least significant part.
EVT FromVT = Op.getNode() ? Op.getValueType() : VT;
unsigned FromBytesPerElement = FromVT.getVectorElementType().getStoreSize();
// Return false if the source elements are smaller than their destination
// elements.
if (FromBytesPerElement < BytesPerElement)
return false;
// Byte is the offset, within the source vector, of the last
// BytesPerElement bytes of source element Elem.
unsigned Byte = ((Elem * FromBytesPerElement) % SystemZ::VectorBytes +
(FromBytesPerElement - BytesPerElement));
// Look through things like shuffles and bitcasts.
// The loop stops at the first node it cannot see through; a null Op skips
// the loop entirely.
while (Op.getNode()) {
if (Op.getOpcode() == ISD::BITCAST)
Op = Op.getOperand(0);
else if (Op.getOpcode() == ISD::VECTOR_SHUFFLE && Op.hasOneUse()) {
// See whether the bytes we need come from a contiguous part of one
// operand.
SmallVector<int, SystemZ::VectorBytes> OpBytes;
if (!getVPermMask(Op, OpBytes))
break;
int NewByte;
if (!getShuffleInput(OpBytes, Byte, BytesPerElement, NewByte))
break;
// A negative NewByte means every byte we need is undefined in the
// inner shuffle.
if (NewByte < 0) {
addUndef();
return true;
}
// Continue with the inner shuffle's operand and the byte offset in it.
Op = Op.getOperand(unsigned(NewByte) / SystemZ::VectorBytes);
Byte = unsigned(NewByte) % SystemZ::VectorBytes;
} else if (Op.isUndef()) {
addUndef();
return true;
} else
break;
}
// Make sure that the source of the extraction is in Ops.
// OpNo ends up as the index of Op in Ops, whether found or newly added.
unsigned OpNo = 0;
for (; OpNo < Ops.size(); ++OpNo)
if (Ops[OpNo] == Op)
break;
if (OpNo == Ops.size())
Ops.push_back(Op);
// Add the element to Bytes.
unsigned Base = OpNo * SystemZ::VectorBytes + Byte;
for (unsigned I = 0; I < BytesPerElement; ++I)
Bytes.push_back(Base + I);
return true;
}
// Return SDNodes for the completed shuffle.
//
// Bytes must describe exactly SystemZ::VectorBytes result bytes. The
// operands in Ops are combined pairwise into a tree of two-operand permutes
// until only two remain, which are then combined by the root permute. Ops
// and Bytes are modified in the process. The result is bitcast to VT.
SDValue GeneralShuffle::getNode(SelectionDAG &DAG, const SDLoc &DL) {
assert(Bytes.size() == SystemZ::VectorBytes && "Incomplete vector");
// With no operands at all, every byte is undefined.
if (Ops.size() == 0)
return DAG.getUNDEF(VT);
// Make sure that there are at least two shuffle operands.
if (Ops.size() == 1)
Ops.push_back(DAG.getUNDEF(MVT::v16i8));
// Create a tree of shuffles, deferring root node until after the loop.
// Try to redistribute the undefined elements of non-root nodes so that
// the non-root shuffles match something like a pack or merge, then adjust
// the parent node's permute vector to compensate for the new order.
// Among other things, this copes with vectors like <2 x i16> that were
// padded with undefined elements during type legalization.
//
// In the best case this redistribution will lead to the whole tree
// using packs and merges. It should rarely be a loss in other cases.
//
// At each level, Ops[I] and Ops[I + Stride] are merged into Ops[I].
unsigned Stride = 1;
for (; Stride * 2 < Ops.size(); Stride *= 2) {
for (unsigned I = 0; I < Ops.size() - Stride; I += Stride * 2) {
SDValue SubOps[] = { Ops[I], Ops[I + Stride] };
// Create a mask for just these two operands.
// Bytes that come from any other operand are undefined (-1) in NewBytes.
SmallVector<int, SystemZ::VectorBytes> NewBytes(SystemZ::VectorBytes);
for (unsigned J = 0; J < SystemZ::VectorBytes; ++J) {
unsigned OpNo = unsigned(Bytes[J]) / SystemZ::VectorBytes;
unsigned Byte = unsigned(Bytes[J]) % SystemZ::VectorBytes;
if (OpNo == I)
NewBytes[J] = Byte;
else if (OpNo == I + Stride)
NewBytes[J] = SystemZ::VectorBytes + Byte;
else
NewBytes[J] = -1;
}
// See if it would be better to reorganize NewBytes to avoid using VPERM.
SmallVector<int, SystemZ::VectorBytes> NewBytesMap(SystemZ::VectorBytes);
if (const Permute *P = matchDoublePermute(NewBytes, NewBytesMap)) {
Ops[I] = getPermuteNode(DAG, DL, *P, SubOps[0], SubOps[1]);
// Applying NewBytesMap to Ops[I] gets back to NewBytes.
// Redirect the affected result bytes to their new position in Ops[I].
for (unsigned J = 0; J < SystemZ::VectorBytes; ++J) {
if (NewBytes[J] >= 0) {
assert(unsigned(NewBytesMap[J]) < SystemZ::VectorBytes &&
"Invalid double permute");
Bytes[J] = I * SystemZ::VectorBytes + NewBytesMap[J];
} else
assert(NewBytesMap[J] < 0 && "Invalid double permute");
}
} else {
// Just use NewBytes on the operands.
// Result byte J then comes straight from byte J of the new Ops[I].
Ops[I] = getGeneralPermuteNode(DAG, DL, SubOps, NewBytes);
for (unsigned J = 0; J < SystemZ::VectorBytes; ++J)
if (NewBytes[J] >= 0)
Bytes[J] = I * SystemZ::VectorBytes + J;
}
}
}
// Now we just have 2 inputs. Put the second operand in Ops[1].
// Selectors that refer to Ops[Stride] are renumbered to refer to Ops[1].
if (Stride > 1) {
Ops[1] = Ops[Stride];
for (unsigned I = 0; I < SystemZ::VectorBytes; ++I)
if (Bytes[I] >= int(SystemZ::VectorBytes))
Bytes[I] -= (Stride - 1) * SystemZ::VectorBytes;
}
// Look for an instruction that can do the permute without resorting
// to VPERM.
unsigned OpNo0, OpNo1;
SDValue Op;
if (const Permute *P = matchPermute(Bytes, OpNo0, OpNo1))
Op = getPermuteNode(DAG, DL, *P, Ops[OpNo0], Ops[OpNo1]);
else
Op = getGeneralPermuteNode(DAG, DL, &Ops[0], Bytes);
return DAG.getNode(ISD::BITCAST, DL, VT, Op);
}
// Return true if the given BUILD_VECTOR is a scalar-to-vector conversion,
// i.e. every operand after the first is undefined.
static bool isScalarToVector(SDValue Op) {
  unsigned Idx = 1;
  const unsigned NumOps = Op.getNumOperands();
  while (Idx != NumOps) {
    if (!Op.getOperand(Idx).isUndef())
      return false;
    ++Idx;
  }
  return true;
}
// Return a vector of type VT that contains Value in the first element.
// The other elements don't matter.
static SDValue buildScalarToVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
                                   SDValue Value) {
  const unsigned ValueOpcode = Value.getOpcode();
  const bool IsConstant =
      ValueOpcode == ISD::Constant || ValueOpcode == ISD::ConstantFP;

  if (IsConstant) {
    // Replicate a constant to all elements and let the BUILD_VECTOR
    // lowering take care of it.
    SmallVector<SDValue, 16> Elements(VT.getVectorNumElements(), Value);
    return DAG.getBuildVector(VT, DL, Elements);
  }

  // An undefined scalar gives a wholly undefined vector.
  if (Value.isUndef())
    return DAG.getUNDEF(VT);

  return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
}
// Return a vector of type VT in which Op0 is in element 0 and Op1 is in
// element 1. Used for cases in which replication is cheap: when exactly one
// of the scalars is undefined, the other is simply replicated.
static SDValue buildMergeScalars(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
                                 SDValue Op0, SDValue Op1) {
  const bool Undef0 = Op0.isUndef();
  const bool Undef1 = Op1.isUndef();

  if (Undef0 && Undef1)
    return DAG.getUNDEF(VT);
  if (Undef0)
    return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Op1);
  if (Undef1)
    return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Op0);

  // Both scalars are defined: merge the two single-element vectors.
  return DAG.getNode(SystemZISD::MERGE_HIGH, DL, VT,
                     buildScalarToVector(DAG, DL, VT, Op0),
                     buildScalarToVector(DAG, DL, VT, Op1));
}
// Extend GPR scalars Op0 and Op1 to doublewords and return a v2i64
// vector for them. Returns an undefined v2i64 if both scalars are undefined.
static SDValue joinDwords(SelectionDAG &DAG, const SDLoc &DL, SDValue Op0,
                          SDValue Op1) {
  const bool Undef0 = Op0.isUndef();
  const bool Undef1 = Op1.isUndef();
  if (Undef0 && Undef1)
    return DAG.getUNDEF(MVT::v2i64);

  SDValue First, Second;
  if (Undef0 || Undef1) {
    // If one of the two inputs is undefined then replicate the other one,
    // in order to avoid using another register unnecessarily.
    SDValue Defined = Undef0 ? Op1 : Op0;
    First = Second = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Defined);
  } else {
    First = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0);
    Second = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op1);
  }
  return DAG.getNode(SystemZISD::JOIN_DWORDS, DL, MVT::v2i64, First, Second);
}
// If a BUILD_VECTOR contains some EXTRACT_VECTOR_ELTs, it's usually
// better to use VECTOR_SHUFFLEs on them, only using BUILD_VECTOR for
// the non-EXTRACT_VECTOR_ELT elements. See if the given BUILD_VECTOR
// would benefit from this representation and return it if so. Returns a
// null SDValue when there is nothing to gain or an element cannot be
// added to the shuffle.
static SDValue tryBuildVectorShuffle(SelectionDAG &DAG,
                                     BuildVectorSDNode *BVN) {
  SDLoc DL(BVN);
  EVT VT = BVN->getValueType(0);
  const unsigned NumElements = VT.getVectorNumElements();

  // Represent the BUILD_VECTOR as an N-operand VECTOR_SHUFFLE-like operation
  // on byte vectors. If there are non-EXTRACT_VECTOR_ELT elements that still
  // need a BUILD_VECTOR, add an additional placeholder operand for that
  // BUILD_VECTOR and store its operands in ResidueOps.
  GeneralShuffle GS(VT);
  SmallVector<SDValue, SystemZ::VectorBytes> ResidueOps;
  bool SawExtract = false;
  for (unsigned I = 0; I < NumElements; ++I) {
    SDValue Original = BVN->getOperand(I);
    // Classify the element after looking through one TRUNCATE.
    SDValue Src = Original;
    if (Src.getOpcode() == ISD::TRUNCATE)
      Src = Src.getOperand(0);

    const bool IsConstantExtract =
        Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
        Src.getOperand(1).getOpcode() == ISD::Constant;
    if (IsConstantExtract) {
      unsigned Elem = cast<ConstantSDNode>(Src.getOperand(1))->getZExtValue();
      if (!GS.add(Src.getOperand(0), Elem))
        return SDValue();
      SawExtract = true;
      continue;
    }

    if (Src.isUndef()) {
      GS.addUndef();
      continue;
    }

    // Anything else goes into the residual BUILD_VECTOR, which the shuffle
    // refers to through a null placeholder operand.
    if (!GS.add(SDValue(), ResidueOps.size()))
      return SDValue();
    ResidueOps.push_back(Original);
  }

  // Nothing to do if there are no EXTRACT_VECTOR_ELTs.
  if (!SawExtract)
    return SDValue();

  // Create the BUILD_VECTOR for the remaining elements, if any.
  if (!ResidueOps.empty()) {
    // Pad the residue with undefined elements up to the full vector width.
    if (ResidueOps.size() < NumElements) {
      SDValue Pad = DAG.getUNDEF(ResidueOps[0].getValueType());
      ResidueOps.append(NumElements - ResidueOps.size(), Pad);
    }
    // Substitute the real BUILD_VECTOR for the first null placeholder.
    for (SDValue &ShuffleOp : GS.Ops) {
      if (ShuffleOp.getNode())
        continue;
      ShuffleOp = DAG.getBuildVector(VT, DL, ResidueOps);
      break;
    }
  }
  return GS.getNode(DAG, DL);
}
// Return true if Op is an unindexed LOAD, or an LRV node when the subtarget
// has the vector-enhancements-2 facility.
bool SystemZTargetLowering::isVectorElementLoad(SDValue Op) const {
  switch (Op.getOpcode()) {
  case ISD::LOAD:
    return cast<LoadSDNode>(Op)->isUnindexed();
  case SystemZISD::LRV:
    return Subtarget.hasVectorEnhancements2();
  default:
    return false;
  }
}
// Combine GPR scalar values Elems into a vector of type VT.
//
// The strategies below are tried in order:
//  1. a REPLICATE of a single repeated (or loaded) value;
//  2. dedicated sequences for v2i64, v2f64 and v4f32 when not every element
//     is a load;
//  3. a starting vector (constant BUILD_VECTOR, replicated load, JOIN_DWORDS
//     pair or undef) into which the remaining defined elements are inserted
//     one at a time with INSERT_VECTOR_ELT.
SDValue
SystemZTargetLowering::buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
SmallVectorImpl<SDValue> &Elems) const {
// See whether there is a single replicated value.
// Single ends up as the one value shared by all defined elements, or null
// if two different defined elements were seen (or none were defined).
// Count is the number of defined elements visited before any mismatch.
SDValue Single;
unsigned int NumElements = Elems.size();
unsigned int Count = 0;
for (auto Elem : Elems) {
if (!Elem.isUndef()) {
if (!Single.getNode())
Single = Elem;
else if (Elem != Single) {
Single = SDValue();
break;
}
Count += 1;
}
}
// There are three cases here:
//
// - if the only defined element is a loaded one, the best sequence
// is a replicating load.
//
// - otherwise, if the only defined element is an i64 value, we will
// end up with the same VLVGP sequence regardless of whether we short-cut
// for replication or fall through to the later code.
//
// - otherwise, if the only defined element is an i32 or smaller value,
// we would need 2 instructions to replicate it: VLVGP followed by VREPx.
// This is only a win if the single defined element is used more than once.
// In other cases we're better off using a single VLVGx.
if (Single.getNode() && (Count > 1 || isVectorElementLoad(Single)))
return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Single);
// If all elements are loads, use VLREP/VLEs (below).
// Every element is tested, so an undef element also clears AllLoads.
bool AllLoads = true;
for (auto Elem : Elems)
if (!isVectorElementLoad(Elem)) {
AllLoads = false;
break;
}
// The best way of building a v2i64 from two i64s is to use VLVGP.
if (VT == MVT::v2i64 && !AllLoads)
return joinDwords(DAG, DL, Elems[0], Elems[1]);
// Use a 64-bit merge high to combine two doubles.
if (VT == MVT::v2f64 && !AllLoads)
return buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]);
// Build v4f32 values directly from the FPRs:
//
// <Axxx> <Bxxx> <Cxxx> <Dxxx>
// V V VMRHF
// <ABxx> <CDxx>
// V VMRHG
// <ABCD>
if (VT == MVT::v4f32 && !AllLoads) {
SDValue Op01 = buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]);
SDValue Op23 = buildMergeScalars(DAG, DL, VT, Elems[2], Elems[3]);
// Avoid unnecessary undefs by reusing the other operand.
if (Op01.isUndef())
Op01 = Op23;
else if (Op23.isUndef())
Op23 = Op01;
// Merging identical replications is a no-op.
if (Op01.getOpcode() == SystemZISD::REPLICATE && Op01 == Op23)
return Op01;
// The final merge is done on doublewords, so go through v2i64 and cast
// the result back to VT.
Op01 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op01);
Op23 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op23);
SDValue Op = DAG.getNode(SystemZISD::MERGE_HIGH,
DL, MVT::v2i64, Op01, Op23);
return DAG.getNode(ISD::BITCAST, DL, VT, Op);
}
// Collect the constant terms.
// Done[I] records that element I is already provided by the starting
// vector and therefore needs no separate insertion.
SmallVector<SDValue, SystemZ::VectorBytes> Constants(NumElements, SDValue());
SmallVector<bool, SystemZ::VectorBytes> Done(NumElements, false);
unsigned NumConstants = 0;
for (unsigned I = 0; I < NumElements; ++I) {
SDValue Elem = Elems[I];
if (Elem.getOpcode() == ISD::Constant ||
Elem.getOpcode() == ISD::ConstantFP) {
NumConstants += 1;
Constants[I] = Elem;
Done[I] = true;
}
}
// If there was at least one constant, fill in the other elements of
// Constants with undefs to get a full vector constant and use that
// as the starting point.
SDValue Result;
// Set only when the starting vector is a replicated load; elements equal
// to it are skipped by the insertion loop at the end.
SDValue ReplicatedVal;
if (NumConstants > 0) {
for (unsigned I = 0; I < NumElements; ++I)
if (!Constants[I].getNode())
Constants[I] = DAG.getUNDEF(Elems[I].getValueType());
Result = DAG.getBuildVector(VT, DL, Constants);
} else {
// Otherwise try to use VLREP or VLVGP to start the sequence in order to
// avoid a false dependency on any previous contents of the vector
// register.
// Use a VLREP if at least one element is a load. Make sure to replicate
// the load with the most elements having its value.
std::map<const SDNode*, unsigned> UseCounts;
SDNode *LoadMaxUses = nullptr;
for (unsigned I = 0; I < NumElements; ++I)
if (isVectorElementLoad(Elems[I])) {
SDNode *Ld = Elems[I].getNode();
UseCounts[Ld]++;
// Strict comparison: on a tie the load seen earlier is kept.
if (LoadMaxUses == nullptr || UseCounts[LoadMaxUses] < UseCounts[Ld])
LoadMaxUses = Ld;
}
if (LoadMaxUses != nullptr) {
ReplicatedVal = SDValue(LoadMaxUses, 0);
Result = DAG.getNode(SystemZISD::REPLICATE, DL, VT, ReplicatedVal);
} else {
// Try to use VLVGP.
// I1 and I2 are the indices of the last element of the first and of the
// second half of the vector respectively.
unsigned I1 = NumElements / 2 - 1;
unsigned I2 = NumElements - 1;
bool Def1 = !Elems[I1].isUndef();
bool Def2 = !Elems[I2].isUndef();
if (Def1 || Def2) {
// If only one of the two is defined, pass it for both halves.
SDValue Elem1 = Elems[Def1 ? I1 : I2];
SDValue Elem2 = Elems[Def2 ? I2 : I1];
Result = DAG.getNode(ISD::BITCAST, DL, VT,
joinDwords(DAG, DL, Elem1, Elem2));
Done[I1] = true;
Done[I2] = true;
} else
Result = DAG.getUNDEF(VT);
}
}
// Use VLVGx to insert the other elements.
for (unsigned I = 0; I < NumElements; ++I)
if (!Done[I] && !Elems[I].isUndef() && Elems[I] != ReplicatedVal)
Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Result, Elems[I],
DAG.getConstant(I, DL, MVT::i32));
return Result;
}
// Custom lowering of BUILD_VECTOR. Constant vectors are returned unchanged
// when legal and otherwise rejected with a null SDValue; non-constant ones
// are turned into shuffles, a scalar-to-vector conversion, or a sequence
// produced by buildVector, in that order of preference.
SDValue SystemZTargetLowering::lowerBUILD_VECTOR(SDValue Op,
                                                 SelectionDAG &DAG) const {
  auto *BVN = cast<BuildVectorSDNode>(Op.getNode());

  if (BVN->isConstant()) {
    // A constant that is not legal as a vector constant falls back to being
    // loaded from memory, which is requested by returning a null value.
    const bool Legal =
        SystemZVectorConstantInfo(BVN).isVectorConstantLegal(Subtarget);
    return Legal ? Op : SDValue();
  }

  // Prefer building the vector out of other vectors with shuffles.
  if (SDValue Shuffled = tryBuildVectorShuffle(DAG, BVN))
    return Shuffled;

  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  // Recognize SCALAR_TO_VECTOR conversions.
  if (isOperationLegal(ISD::SCALAR_TO_VECTOR, VT) && isScalarToVector(Op))
    return buildScalarToVector(DAG, DL, VT, Op.getOperand(0));

  // Last resort: assemble the vector from GPRs.
  const unsigned Count = Op.getNumOperands();
  SmallVector<SDValue, SystemZ::VectorBytes> Scalars;
  Scalars.reserve(Count);
  for (unsigned Idx = 0; Idx != Count; ++Idx)
    Scalars.push_back(Op.getOperand(Idx));
  return buildVector(DAG, DL, VT, Scalars);
}
// Custom lowering of VECTOR_SHUFFLE. Splats become REPLICATE (when the
// splatted scalar is directly available) or SPLAT nodes; every other mask
// is handed to GeneralShuffle.
SDValue SystemZTargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
                                                   SelectionDAG &DAG) const {
  auto *VSN = cast<ShuffleVectorSDNode>(Op.getNode());
  EVT VT = Op.getValueType();
  const unsigned NumElements = VT.getVectorNumElements();

  if (!VSN->isSplat()) {
    GeneralShuffle GS(VT);
    for (unsigned Pos = 0; Pos != NumElements; ++Pos) {
      const int MaskElt = VSN->getMaskElt(Pos);
      if (MaskElt < 0) {
        GS.addUndef();
        continue;
      }
      // The mask index selects an operand and an element within it.
      const unsigned Src = unsigned(MaskElt);
      if (!GS.add(Op.getOperand(Src / NumElements), Src % NumElements))
        return SDValue();
    }
    return GS.getNode(DAG, SDLoc(VSN));
  }

  SDLoc DL(Op);
  SDValue Op0 = Op.getOperand(0);
  unsigned Index = VSN->getSplatIndex();
  assert(Index < VT.getVectorNumElements() &&
         "Splat index should be defined and in first operand");

  // The splatted value is available as a scalar if the source is a
  // BUILD_VECTOR, or a SCALAR_TO_VECTOR of which element 0 is splatted.
  const unsigned SrcOpcode = Op0.getOpcode();
  const bool ScalarAvailable =
      SrcOpcode == ISD::BUILD_VECTOR ||
      (SrcOpcode == ISD::SCALAR_TO_VECTOR && Index == 0);
  if (ScalarAvailable)
    return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Op0.getOperand(Index));

  // Otherwise stay with a vector-to-vector operation.
  return DAG.getNode(SystemZISD::SPLAT, DL, VT, Op.getOperand(0),
                     DAG.getConstant(Index, DL, MVT::i32));
}
// Lower SCALAR_TO_VECTOR as an insertion of the scalar into element 0 of an
// undefined vector of the result type.
SDValue SystemZTargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
                                                     SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VecVT = Op.getValueType();
  SDValue UndefVec = DAG.getUNDEF(VecVT);
  SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
  return DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, UndefVec,
                     Op.getOperand(0), ZeroIdx);
}
// Custom lowering of INSERT_VECTOR_ELT, used for insertions of
// floating-point values. The node is either kept as it is or rewritten as
// an insertion into the equivalent integer vector.
SDValue SystemZTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
                                                      SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Vec = Op.getOperand(0);
  SDValue Scalar = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);
  EVT VT = Op.getValueType();

  // Insertions into constant indices of a v2f64 can be done using VPDI,
  // unless the inserted value is a bitcast or a constant, in which case
  // going through GPRs (below) is better.
  const unsigned ScalarOpcode = Scalar.getOpcode();
  const bool PreferGPR =
      ScalarOpcode == ISD::BITCAST || ScalarOpcode == ISD::ConstantFP;
  if (VT == MVT::v2f64 && !PreferGPR && Idx.getOpcode() == ISD::Constant) {
    uint64_t Index = cast<ConstantSDNode>(Idx)->getZExtValue();
    unsigned Mask = VT.getVectorNumElements() - 1;
    if (Index <= Mask)
      return Op;
  }

  // Bitcast vector and scalar to their integer equivalents, insert there,
  // and cast the result back to the original type.
  MVT IntVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
  MVT IntVecVT = MVT::getVectorVT(IntVT, VT.getVectorNumElements());
  SDValue IntVec = DAG.getNode(ISD::BITCAST, DL, IntVecVT, Vec);
  SDValue IntScalar = DAG.getNode(ISD::BITCAST, DL, IntVT, Scalar);
  SDValue Inserted = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, IntVecVT, IntVec,
                                 IntScalar, Idx);
  return DAG.getNode(ISD::BITCAST, DL, VT, Inserted);
}
// Custom lowering of EXTRACT_VECTOR_ELT, used for extractions of
// floating-point values. In-range constant indices are left alone; all
// other extractions are performed on the equivalent integer vector.
SDValue
SystemZTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Vec = Op.getOperand(0);
  SDValue Idx = Op.getOperand(1);
  EVT VT = Op.getValueType();
  EVT VecVT = Vec.getValueType();

  // An extraction at a constant in-range index can be done directly.
  if (auto *ConstIdx = dyn_cast<ConstantSDNode>(Idx)) {
    const uint64_t Index = ConstIdx->getZExtValue();
    const unsigned LastIndex = VecVT.getVectorNumElements() - 1;
    if (Index <= LastIndex)
      return Op;
  }

  // Otherwise extract from the integer form of the vector via a GPR and
  // cast the scalar back to the requested type.
  MVT IntVT = MVT::getIntegerVT(VT.getSizeInBits());
  MVT IntVecVT = MVT::getVectorVT(IntVT, VecVT.getVectorNumElements());
  SDValue IntVec = DAG.getNode(ISD::BITCAST, DL, IntVecVT, Vec);
  SDValue Extracted =
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, IntVT, IntVec, Idx);
  return DAG.getNode(ISD::BITCAST, DL, VT, Extracted);
}
// Lower an in-register vector extension by applying the node UnpackHigh
// repeatedly, doubling the element width each time, until the element width
// of the result type is reached. At least one unpack is always emitted.
SDValue
SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
                                              unsigned UnpackHigh) const {
  SDValue Current = Op.getOperand(0);
  const unsigned ToBits = Op.getValueType().getScalarSizeInBits();
  unsigned CurBits = Current.getValueType().getScalarSizeInBits();
  for (;;) {
    CurBits *= 2;
    EVT StepVT = MVT::getVectorVT(MVT::getIntegerVT(CurBits),
                                  SystemZ::VectorBits / CurBits);
    Current = DAG.getNode(UnpackHigh, SDLoc(Current), StepVT, Current);
    if (CurBits == ToBits)
      break;
  }
  return Current;
}
// Custom lowering of vector shifts. When the shift-amount vector is a splat
// whose scalar value is available, emit the ByScalar form of the shift;
// otherwise return Op unchanged, treating the current form as legal.
SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG,
                                          unsigned ByScalar) const {
  SDValue Value = Op.getOperand(0);
  SDValue Amounts = Op.getOperand(1);
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  const unsigned ElemBitSize = VT.getScalarSizeInBits();

  // Since i32 is the smallest legal type, a scalar shift amount needs either
  // a no-op or a truncation before it can be used.
  auto ShiftByTruncated = [&](SDValue Amount) {
    SDValue Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Amount);
    return DAG.getNode(ByScalar, DL, VT, Value, Shift);
  };

  // Splat represented as a BUILD_VECTOR.
  if (auto *BVN = dyn_cast<BuildVectorSDNode>(Amounts)) {
    APInt SplatBits, SplatUndef;
    unsigned SplatBitSize;
    bool HasAnyUndefs;
    // Constant splats: ElemBitSize is the minimum element width, and splats
    // that need wider elements are rejected.
    const bool IsConstantSplat =
        BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
                             ElemBitSize, true);
    if (IsConstantSplat && SplatBitSize == ElemBitSize) {
      SDValue Shift = DAG.getConstant(SplatBits.getZExtValue() & 0xfff,
                                      DL, MVT::i32);
      return DAG.getNode(ByScalar, DL, VT, Value, Shift);
    }
    // Variable splats.
    BitVector UndefElements;
    if (SDValue Splat = BVN->getSplatValue(&UndefElements))
      return ShiftByTruncated(Splat);
  }

  // Splat represented as a SHUFFLE_VECTOR, with the shift amount directly
  // available in a GPR.
  auto *VSN = dyn_cast<ShuffleVectorSDNode>(Amounts);
  if (VSN && VSN->isSplat()) {
    SDValue SplatSrc = VSN->getOperand(0);
    unsigned Index = VSN->getSplatIndex();
    assert(Index < VT.getVectorNumElements() &&
           "Splat index should be defined and in first operand");
    const unsigned SrcOpcode = SplatSrc.getOpcode();
    const bool ScalarAvailable =
        SrcOpcode == ISD::BUILD_VECTOR ||
        (SrcOpcode == ISD::SCALAR_TO_VECTOR && Index == 0);
    if (ScalarAvailable)
      return ShiftByTruncated(SplatSrc.getOperand(Index));
  }

  return Op;
}
// Entry point for custom lowering: dispatch on the opcode of Op to the
// matching lower* helper and return its result. Reaching the default case
// with any opcode not listed below triggers llvm_unreachable.
SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
case ISD::FRAMEADDR:
return lowerFRAMEADDR(Op, DAG);
case ISD::RETURNADDR:
return lowerRETURNADDR(Op, DAG);
case ISD::BR_CC:
return lowerBR_CC(Op, DAG);
case ISD::SELECT_CC:
return lowerSELECT_CC(Op, DAG);
case ISD::SETCC:
return lowerSETCC(Op, DAG);
// Address-like nodes are passed to their helpers as the specific node type.
case ISD::GlobalAddress:
return lowerGlobalAddress(cast<GlobalAddressSDNode>(Op), DAG);
case ISD::GlobalTLSAddress:
return lowerGlobalTLSAddress(cast<GlobalAddressSDNode>(Op), DAG);
case ISD::BlockAddress:
return lowerBlockAddress(cast<BlockAddressSDNode>(Op), DAG);
case ISD::JumpTable:
return lowerJumpTable(cast<JumpTableSDNode>(Op), DAG);
case ISD::ConstantPool:
return lowerConstantPool(cast<ConstantPoolSDNode>(Op), DAG);
case ISD::BITCAST:
return lowerBITCAST(Op, DAG);
case ISD::VASTART:
return lowerVASTART(Op, DAG);
case ISD::VACOPY:
return lowerVACOPY(Op, DAG);
case ISD::DYNAMIC_STACKALLOC:
return lowerDYNAMIC_STACKALLOC(Op, DAG);
case ISD::GET_DYNAMIC_AREA_OFFSET:
return lowerGET_DYNAMIC_AREA_OFFSET(Op, DAG);
case ISD::SMUL_LOHI:
return lowerSMUL_LOHI(Op, DAG);
case ISD::UMUL_LOHI:
return lowerUMUL_LOHI(Op, DAG);
case ISD::SDIVREM:
return lowerSDIVREM(Op, DAG);
case ISD::UDIVREM:
return lowerUDIVREM(Op, DAG);
// All four overflow-reporting add/sub nodes share one helper.
case ISD::SADDO:
case ISD::SSUBO:
case ISD::UADDO:
case ISD::USUBO:
return lowerXALUO(Op, DAG);
case ISD::ADDCARRY:
case ISD::SUBCARRY:
return lowerADDSUBCARRY(Op, DAG);
case ISD::OR:
return lowerOR(Op, DAG);
case ISD::CTPOP:
return lowerCTPOP(Op, DAG);
case ISD::ATOMIC_FENCE:
return lowerATOMIC_FENCE(Op, DAG);
// Most atomic read-modify-write nodes go through lowerATOMIC_LOAD_OP with
// the corresponding SystemZISD opcode; ATOMIC_LOAD_SUB has its own helper.
case ISD::ATOMIC_SWAP:
return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_SWAPW);
case ISD::ATOMIC_STORE:
return lowerATOMIC_STORE(Op, DAG);
case ISD::ATOMIC_LOAD:
return lowerATOMIC_LOAD(Op, DAG);
case ISD::ATOMIC_LOAD_ADD:
return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_ADD);
case ISD::ATOMIC_LOAD_SUB:
return lowerATOMIC_LOAD_SUB(Op, DAG);
case ISD::ATOMIC_LOAD_AND:
return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_AND);
case ISD::ATOMIC_LOAD_OR:
return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_OR);
case ISD::ATOMIC_LOAD_XOR:
return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_XOR);
case ISD::ATOMIC_LOAD_NAND:
return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_NAND);
case ISD::ATOMIC_LOAD_MIN:
return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_MIN);
case ISD::ATOMIC_LOAD_MAX:
return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_MAX);
case ISD::ATOMIC_LOAD_UMIN:
return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_UMIN);
case ISD::ATOMIC_LOAD_UMAX:
return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_UMAX);
case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
return lowerATOMIC_CMP_SWAP(Op, DAG);
case ISD::STACKSAVE:
return lowerSTACKSAVE(Op, DAG);
case ISD::STACKRESTORE:
return lowerSTACKRESTORE(Op, DAG);
case ISD::PREFETCH:
return lowerPREFETCH(Op, DAG);
case ISD::INTRINSIC_W_CHAIN:
return lowerINTRINSIC_W_CHAIN(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN:
return lowerINTRINSIC_WO_CHAIN(Op, DAG);
// Vector construction and element access.
case ISD::BUILD_VECTOR:
return lowerBUILD_VECTOR(Op, DAG);
case ISD::VECTOR_SHUFFLE:
return lowerVECTOR_SHUFFLE(Op, DAG);
case ISD::SCALAR_TO_VECTOR:
return lowerSCALAR_TO_VECTOR(Op, DAG);
case ISD::INSERT_VECTOR_ELT:
return lowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT:
return lowerEXTRACT_VECTOR_ELT(Op, DAG);
// In-register extensions differ only in the unpack node that is used.
case ISD::SIGN_EXTEND_VECTOR_INREG:
return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACK_HIGH);
case ISD::ZERO_EXTEND_VECTOR_INREG:
return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACKL_HIGH);
// Shifts differ only in the *_BY_SCALAR node that lowerShift may emit.
case ISD::SHL:
return lowerShift(Op, DAG, SystemZISD::VSHL_BY_SCALAR);
case ISD::SRL:
return lowerShift(Op, DAG, SystemZISD::VSRL_BY_SCALAR);
case ISD::SRA:
return lowerShift(Op, DAG, SystemZISD::VSRA_BY_SCALAR);
default:
llvm_unreachable("Unexpected node to lower");
}
}
// Helpers for lowering operations with invalid operand or result types
// (currently used only for 128-bit integer types).

// Convert the i128 value In into an untyped PAIR128 machine node built from
// its two i64 halves (EXTRACT_ELEMENT 1 as first operand, 0 as second).
static SDValue lowerI128ToGR128(SelectionDAG &DAG, SDValue In) {
  SDLoc DL(In);
  auto ExtractHalf = [&](uint64_t Part) {
    return DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, In,
                       DAG.getIntPtrConstant(Part, DL));
  };
  SDValue Lo = ExtractHalf(0);
  SDValue Hi = ExtractHalf(1);
  SDNode *Pair =
      DAG.getMachineNode(SystemZ::PAIR128, DL, MVT::Untyped, Hi, Lo);
  return SDValue(Pair, 0);
}
// Convert the register-pair value In into an i128 by extracting its
// subreg_h64 and subreg_l64 parts and recombining them with BUILD_PAIR.
static SDValue lowerGR128ToI128(SelectionDAG &DAG, SDValue In) {
  SDLoc DL(In);
  SDValue HighPart =
      DAG.getTargetExtractSubreg(SystemZ::subreg_h64, DL, MVT::i64, In);
  SDValue LowPart =
      DAG.getTargetExtractSubreg(SystemZ::subreg_l64, DL, MVT::i64, In);
  // BUILD_PAIR takes the low half first.
  return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, LowPart, HighPart);
}
// Lower the 128-bit atomic load, store and compare-and-swap node N into the
// corresponding SystemZISD *_128 memory node, appending the replacement
// values to Results. Any other opcode is unexpected here.
void
SystemZTargetLowering::LowerOperationWrapper(SDNode *N,
                                             SmallVectorImpl<SDValue> &Results,
                                             SelectionDAG &DAG) const {
  SDLoc DL(N);
  switch (N->getOpcode()) {
  case ISD::ATOMIC_LOAD: {
    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
    SDValue Ops[] = { N->getOperand(0), N->getOperand(1) };
    SDValue Load = DAG.getMemIntrinsicNode(
        SystemZISD::ATOMIC_LOAD_128, DL,
        DAG.getVTList(MVT::Untyped, MVT::Other), Ops, MVT::i128, MMO);
    // Results: the loaded value as i128, then the chain.
    Results.push_back(lowerGR128ToI128(DAG, Load));
    Results.push_back(Load.getValue(1));
    return;
  }
  case ISD::ATOMIC_STORE: {
    auto *Atomic = cast<AtomicSDNode>(N);
    SDValue Chain = N->getOperand(0);
    SDValue StoredPair = lowerI128ToGR128(DAG, N->getOperand(2));
    SDValue Ops[] = { Chain, StoredPair, N->getOperand(1) };
    SDValue Store = DAG.getMemIntrinsicNode(
        SystemZISD::ATOMIC_STORE_128, DL, DAG.getVTList(MVT::Other), Ops,
        MVT::i128, Atomic->getMemOperand());
    // Sequential consistency has to be enforced by performing a
    // serialization operation after the store.
    if (Atomic->getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      SDNode *Serialize =
          DAG.getMachineNode(SystemZ::Serialize, DL, MVT::Other, Store);
      Store = SDValue(Serialize, 0);
    }
    Results.push_back(Store);
    return;
  }
  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
    SDValue Chain = N->getOperand(0);
    SDValue Addr = N->getOperand(1);
    SDValue CmpPair = lowerI128ToGR128(DAG, N->getOperand(2));
    SDValue SwapPair = lowerI128ToGR128(DAG, N->getOperand(3));
    SDValue Ops[] = { Chain, Addr, CmpPair, SwapPair };
    SDValue Swap = DAG.getMemIntrinsicNode(
        SystemZISD::ATOMIC_CMP_SWAP_128, DL,
        DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops, MVT::i128,
        MMO);
    // Derive the success flag from result 1 of the swap node and adjust it
    // to the type of N's second result.
    SDValue Success = emitSETCC(DAG, DL, Swap.getValue(1),
                                SystemZ::CCMASK_CS, SystemZ::CCMASK_CS_EQ);
    Success = DAG.getZExtOrTrunc(Success, DL, N->getValueType(1));
    // Results: the old value as i128, the success flag, then the chain.
    Results.push_back(lowerGR128ToI128(DAG, Swap));
    Results.push_back(Success);
    Results.push_back(Swap.getValue(2));
    return;
  }
  default:
    llvm_unreachable("Unexpected node to lower");
  }
}
// Replace the results of N by forwarding to LowerOperationWrapper, which
// appends the replacement values to Results.
void
SystemZTargetLowering::ReplaceNodeResults(SDNode *N,
                                          SmallVectorImpl<SDValue> &Results,
                                          SelectionDAG &DAG) const {
  LowerOperationWrapper(N, Results, DAG);
}
// Return the printable name "SystemZISD::<NAME>" for the target-specific
// node opcode Opcode. Returns nullptr for SystemZISD::FIRST_NUMBER and for
// any value that matches none of the cases below.
const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
// OPCODE(NAME) expands to a case label that returns the stringized,
// namespace-qualified name of the enumerator.
#define OPCODE(NAME) case SystemZISD::NAME: return "SystemZISD::" #NAME
switch ((SystemZISD::NodeType)Opcode) {
// FIRST_NUMBER has no name of its own; fall out to the nullptr return.
case SystemZISD::FIRST_NUMBER: break;
OPCODE(RET_FLAG);
OPCODE(CALL);
OPCODE(SIBCALL);
OPCODE(TLS_GDCALL);
OPCODE(TLS_LDCALL);
OPCODE(PCREL_WRAPPER);
OPCODE(PCREL_OFFSET);
OPCODE(IABS);
OPCODE(ICMP);
OPCODE(FCMP);
OPCODE(TM);
OPCODE(BR_CCMASK);
OPCODE(SELECT_CCMASK);
OPCODE(ADJDYNALLOC);
OPCODE(POPCNT);
OPCODE(SMUL_LOHI);
OPCODE(UMUL_LOHI);
OPCODE(SDIVREM);
OPCODE(UDIVREM);
OPCODE(SADDO);
OPCODE(SSUBO);
OPCODE(UADDO);
OPCODE(USUBO);
OPCODE(ADDCARRY);
OPCODE(SUBCARRY);
OPCODE(GET_CCMASK);
OPCODE(MVC);
OPCODE(MVC_LOOP);
OPCODE(NC);
OPCODE(NC_LOOP);
OPCODE(OC);
OPCODE(OC_LOOP);
OPCODE(XC);
OPCODE(XC_LOOP);
OPCODE(CLC);
OPCODE(CLC_LOOP);
OPCODE(STPCPY);
OPCODE(STRCMP);
OPCODE(SEARCH_STRING);
OPCODE(IPM);
OPCODE(MEMBARRIER);
OPCODE(TBEGIN);
OPCODE(TBEGIN_NOFLOAT);
OPCODE(TEND);
OPCODE(BYTE_MASK);
OPCODE(ROTATE_MASK);
OPCODE(REPLICATE);
OPCODE(JOIN_DWORDS);
OPCODE(SPLAT);
OPCODE(MERGE_HIGH);
OPCODE(MERGE_LOW);
OPCODE(SHL_DOUBLE);
OPCODE(PERMUTE_DWORDS);
OPCODE(PERMUTE);
OPCODE(PACK);
OPCODE(PACKS_CC);
OPCODE(PACKLS_CC);
OPCODE(UNPACK_HIGH);
OPCODE(UNPACKL_HIGH);
OPCODE(UNPACK_LOW);
OPCODE(UNPACKL_LOW);
OPCODE(VSHL_BY_SCALAR);
OPCODE(VSRL_BY_SCALAR);
OPCODE(VSRA_BY_SCALAR);
OPCODE(VSUM);
OPCODE(VICMPE);
OPCODE(VICMPH);
OPCODE(VICMPHL);
OPCODE(VICMPES);
OPCODE(VICMPHS);
OPCODE(VICMPHLS);
OPCODE(VFCMPE);
OPCODE(VFCMPH);
OPCODE(VFCMPHE);
OPCODE(VFCMPES);
OPCODE(VFCMPHS);
OPCODE(VFCMPHES);
OPCODE(VFTCI);
OPCODE(VEXTEND);
OPCODE(VROUND);
OPCODE(VTM);
OPCODE(VFAE_CC);
OPCODE(VFAEZ_CC);
OPCODE(VFEE_CC);
OPCODE(VFEEZ_CC);
OPCODE(VFENE_CC);
OPCODE(VFENEZ_CC);
OPCODE(VISTR_CC);
OPCODE(VSTRC_CC);
OPCODE(VSTRCZ_CC);
OPCODE(VSTRS_CC);
OPCODE(VSTRSZ_CC);
OPCODE(TDC);
OPCODE(ATOMIC_SWAPW);
OPCODE(ATOMIC_LOADW_ADD);
OPCODE(ATOMIC_LOADW_SUB);
OPCODE(ATOMIC_LOADW_AND);
OPCODE(ATOMIC_LOADW_OR);
OPCODE(ATOMIC_LOADW_XOR);
OPCODE(ATOMIC_LOADW_NAND);
OPCODE(ATOMIC_LOADW_MIN);
OPCODE(ATOMIC_LOADW_MAX);
OPCODE(ATOMIC_LOADW_UMIN);
OPCODE(ATOMIC_LOADW_UMAX);
OPCODE(ATOMIC_CMP_SWAPW);
OPCODE(ATOMIC_CMP_SWAP);
OPCODE(ATOMIC_LOAD_128);
OPCODE(ATOMIC_STORE_128);
OPCODE(ATOMIC_CMP_SWAP_128);
OPCODE(LRV);
OPCODE(STRV);
OPCODE(VLER);
OPCODE(VSTER);
OPCODE(PREFETCH);
}
return nullptr;
// The macro is only meaningful inside this function.
#undef OPCODE
}
// Return true if the subtarget has vector support and VT is a simple vector
// type whose elements are a whole number of bytes wide.
bool SystemZTargetLowering::canTreatAsByteVector(EVT VT) const {
  return Subtarget.hasVector() && VT.isVector() &&
         VT.getScalarSizeInBits() % 8 == 0 && VT.isSimple();
}
// Try to simplify an EXTRACT_VECTOR_ELT from a vector of type VecVT
// producing a result of type ResVT. Op is a possibly bitcast version
// of the input vector and Index is the index (based on type VecVT) that
// should be extracted. Return the new extraction if a simplification
// was possible or if Force is true.
//
// The loop below repeatedly looks through the node that produces Op
// (bitcasts, shuffles/splats, BUILD_VECTORs and in-register extensions),
// updating Op and Index as it goes. Returns a null SDValue if nothing was
// simplified and Force is false.
SDValue SystemZTargetLowering::combineExtract(const SDLoc &DL, EVT ResVT,
EVT VecVT, SDValue Op,
unsigned Index,
DAGCombinerInfo &DCI,
bool Force) const {
SelectionDAG &DAG = DCI.DAG;
// The number of bytes being extracted.
unsigned BytesPerElement = VecVT.getVectorElementType().getStoreSize();
for (;;) {
unsigned Opcode = Op.getOpcode();
if (Opcode == ISD::BITCAST)
// Look through bitcasts.
Op = Op.getOperand(0);
else if ((Opcode == ISD::VECTOR_SHUFFLE || Opcode == SystemZISD::SPLAT) &&
canTreatAsByteVector(Op.getValueType())) {
// Get a VPERM-like permute mask and see whether the bytes covered
// by the extracted element are a contiguous sequence from one
// source operand.
SmallVector<int, SystemZ::VectorBytes> Bytes;
if (!getVPermMask(Op, Bytes))
break;
int First;
if (!getShuffleInput(Bytes, Index * BytesPerElement,
BytesPerElement, First))
break;
// A negative First means the extracted element is undefined.
if (First < 0)
return DAG.getUNDEF(ResVT);
// Make sure the contiguous sequence starts at a multiple of the
// original element size.
unsigned Byte = unsigned(First) % Bytes.size();
if (Byte % BytesPerElement != 0)
break;
// We can get the extracted value directly from an input.
// First / Bytes.size() selects the shuffle operand, Byte the offset in it.
Index = Byte / BytesPerElement;
Op = Op.getOperand(unsigned(First) / Bytes.size());
// From here on an extraction is always returned.
Force = true;
} else if (Opcode == ISD::BUILD_VECTOR &&
canTreatAsByteVector(Op.getValueType())) {
// We can only optimize this case if the BUILD_VECTOR elements are
// at least as wide as the extracted value.
EVT OpVT = Op.getValueType();
unsigned OpBytesPerElement = OpVT.getVectorElementType().getStoreSize();
if (OpBytesPerElement < BytesPerElement)
break;
// Make sure that the least-significant bit of the extracted value
// is the least significant bit of an input.
unsigned End = (Index + 1) * BytesPerElement;
if (End % OpBytesPerElement != 0)
break;
// We're extracting the low part of one operand of the BUILD_VECTOR.
Op = Op.getOperand(End / OpBytesPerElement - 1);
// The TRUNCATE below needs an integer input, so cast non-integer
// operands to an integer of the same size first.
if (!Op.getValueType().isInteger()) {
EVT VT = MVT::getIntegerVT(Op.getValueSizeInBits());
Op = DAG.getNode(ISD::BITCAST, DL, VT, Op);
DCI.AddToWorklist(Op.getNode());
}
EVT VT = MVT::getIntegerVT(ResVT.getSizeInBits());
Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
// Cast back if the requested result type is not that integer type.
if (VT != ResVT) {
DCI.AddToWorklist(Op.getNode());
Op = DAG.getNode(ISD::BITCAST, DL, ResVT, Op);
}
return Op;
} else if ((Opcode == ISD::SIGN_EXTEND_VECTOR_INREG ||
Opcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
Opcode == ISD::ANY_EXTEND_VECTOR_INREG) &&
canTreatAsByteVector(Op.getValueType()) &&
canTreatAsByteVector(Op.getOperand(0).getValueType())) {
// Make sure that only the unextended bits are significant.
EVT ExtVT = Op.getValueType();
EVT OpVT = Op.getOperand(0).getValueType();
unsigned ExtBytesPerElement = ExtVT.getVectorElementType().getStoreSize();
unsigned OpBytesPerElement = OpVT.getVectorElementType().getStoreSize();
unsigned Byte = Index * BytesPerElement;
// SubByte is the offset of the extracted bytes within their extended
// element; MinSubByte is where the unextended bytes start in it.
unsigned SubByte = Byte % ExtBytesPerElement;
unsigned MinSubByte = ExtBytesPerElement - OpBytesPerElement;
if (SubByte < MinSubByte ||
SubByte + BytesPerElement > ExtBytesPerElement)
break;
// Get the byte offset of the unextended element
Byte = Byte / ExtBytesPerElement * OpBytesPerElement;
// ...then add the byte offset relative to that element.
Byte += SubByte - MinSubByte;
if (Byte % BytesPerElement != 0)
break;
Op = Op.getOperand(0);
Index = Byte / BytesPerElement;
Force = true;
} else
break;
}
// Emit the (possibly retargeted) extraction, bitcasting Op to VecVT first
// if its type differs.
if (Force) {
if (Op.getValueType() != VecVT) {
Op = DAG.getNode(ISD::BITCAST, DL, VecVT, Op);
DCI.AddToWorklist(Op.getNode());
}
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Op,
DAG.getConstant(Index, DL, MVT::i32));
}
return SDValue();
}
// Optimize vector operations in the scalar value Op given that Op is about
// to be truncated to TruncVT.
//
// (trunc (extract_vector_elt X, Y)) is turned into
// (extract_vector_elt (bitcast X), Y'), where (bitcast X) has elements of
// type TruncVT. Returns a null SDValue if the pattern does not match.
SDValue SystemZTargetLowering::combineTruncateExtract(
    const SDLoc &DL, EVT TruncVT, SDValue Op, DAGCombinerInfo &DCI) const {
  if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      TruncVT.getSizeInBits() % 8 != 0)
    return SDValue();

  SDValue Vec = Op.getOperand(0);
  EVT VecVT = Vec.getValueType();
  if (!canTreatAsByteVector(VecVT))
    return SDValue();

  auto *IndexN = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!IndexN)
    return SDValue();

  const unsigned BytesPerElement =
      VecVT.getVectorElementType().getStoreSize();
  const unsigned TruncBytes = TruncVT.getStoreSize();
  if (BytesPerElement % TruncBytes != 0)
    return SDValue();

  // Compute Y'. Each original element is split into Scale equal-sized
  // pieces, and truncation wants the last (least-significant) piece of
  // element IndexN: take the first piece of the following element and step
  // back by one.
  const unsigned Scale = BytesPerElement / TruncBytes;
  const unsigned NewIndex = (IndexN->getZExtValue() + 1) * Scale - 1;

  // The bitcast of X is left to combineExtract, which might be able to
  // optimize the extraction further.
  EVT SplitVT = MVT::getVectorVT(MVT::getIntegerVT(TruncBytes * 8),
                                 VecVT.getStoreSize() / TruncBytes);
  EVT ResVT = TruncVT;
  if (TruncBytes < 4)
    ResVT = MVT::i32;
  return combineExtract(DL, ResVT, SplitVT, Vec, NewIndex, DCI, true);
}
// Convert (zext (select_ccmask C1, C2)) into (select_ccmask C1', C2'), where
// C1 and C2 are constants and C1'/C2' are the same values in the wider
// type. Returns a null SDValue if the pattern does not match.
SDValue SystemZTargetLowering::combineZERO_EXTEND(
    SDNode *N, DAGCombinerInfo &DCI) const {
  SDValue Select = N->getOperand(0);
  if (Select.getOpcode() != SystemZISD::SELECT_CCMASK)
    return SDValue();

  auto *TrueOp = dyn_cast<ConstantSDNode>(Select.getOperand(0));
  auto *FalseOp = dyn_cast<ConstantSDNode>(Select.getOperand(1));
  if (!TrueOp || !FalseOp)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDLoc DL(Select);
  SDValue Ops[] = { DAG.getConstant(TrueOp->getZExtValue(), DL, VT),
                    DAG.getConstant(FalseOp->getZExtValue(), DL, VT),
                    Select.getOperand(2), Select.getOperand(3),
                    Select.getOperand(4) };
  SDValue WideSelect = DAG.getNode(SystemZISD::SELECT_CCMASK, DL, VT, Ops);

  // If the narrow select has other users, redirect them to a truncation of
  // the wide select as well.
  if (!Select.hasOneUse()) {
    SDValue Narrowed =
        DAG.getNode(ISD::TRUNCATE, DL, Select.getValueType(), WideSelect);
    DCI.CombineTo(Select.getNode(), Narrowed);
  }
  return WideSelect;
}
// Convert (sext_in_reg (setcc LHS, RHS, COND), i1)
// and (sext_in_reg (any_extend (setcc LHS, RHS, COND)), i1)
// into (select_cc LHS, RHS, -1, 0, COND). Returns a null SDValue if the
// pattern does not match.
SDValue SystemZTargetLowering::combineSIGN_EXTEND_INREG(
    SDNode *N, DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  EVT FromVT = cast<VTSDNode>(N->getOperand(1))->getVT();

  // Step over a single-use any_extend sitting in front of the setcc.
  SDValue Src = N->getOperand(0);
  if (Src.hasOneUse() && Src.getOpcode() == ISD::ANY_EXTEND)
    Src = Src.getOperand(0);

  const bool IsSingleUseSetCC = FromVT == MVT::i1 && Src.hasOneUse() &&
                                Src.getOpcode() == ISD::SETCC;
  if (!IsSingleUseSetCC)
    return SDValue();

  SDLoc DL(Src);
  SDValue Ops[] = { Src.getOperand(0), Src.getOperand(1),
                    DAG.getConstant(-1, DL, VT), DAG.getConstant(0, DL, VT),
                    Src.getOperand(2) };
  return DAG.getNode(ISD::SELECT_CC, DL, VT, Ops);
}
// Convert (sext (ashr (shl X, C1), C2)) to
// (ashr (shl (anyext X), C1'), C2')), since wider shifts are as cheap as
// narrower ones. Both shifts must have a single use and constant amounts;
// otherwise a null SDValue is returned.
SDValue SystemZTargetLowering::combineSIGN_EXTEND(
    SDNode *N, DAGCombinerInfo &DCI) const {
  SDValue Sra = N->getOperand(0);
  if (!Sra.hasOneUse() || Sra.getOpcode() != ISD::SRA)
    return SDValue();

  auto *SraAmt = dyn_cast<ConstantSDNode>(Sra.getOperand(1));
  SDValue Shl = Sra.getOperand(0);
  if (!SraAmt || !Shl.hasOneUse() || Shl.getOpcode() != ISD::SHL)
    return SDValue();

  auto *ShlAmt = dyn_cast<ConstantSDNode>(Shl.getOperand(1));
  if (!ShlAmt)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);

  // Both shift amounts grow by the number of bits added by the extension.
  const unsigned Extra = (VT.getSizeInBits() - Sra.getValueSizeInBits());
  const unsigned NewShlAmt = ShlAmt->getZExtValue() + Extra;
  const unsigned NewSraAmt = SraAmt->getZExtValue() + Extra;
  EVT ShiftVT = Sra.getOperand(1).getValueType();

  SDValue Wide =
      DAG.getNode(ISD::ANY_EXTEND, SDLoc(Shl), VT, Shl.getOperand(0));
  SDValue WideShl =
      DAG.getNode(ISD::SHL, SDLoc(Shl), VT, Wide,
                  DAG.getConstant(NewShlAmt, SDLoc(Shl), ShiftVT));
  return DAG.getNode(ISD::SRA, SDLoc(Sra), VT, WideShl,
                     DAG.getConstant(NewSraAmt, SDLoc(Sra), ShiftVT));
}
// Simplify a merge node whose first operand is an all-zeros vector
// (possibly behind a bitcast). Returns a null SDValue if no simplification
// applies.
SDValue SystemZTargetLowering::combineMERGE(
    SDNode *N, DAGCombinerInfo &DCI) const {
  SDValue Op1 = N->getOperand(1);

  // Look at what is underneath a bitcast of the first operand.
  SDValue Op0 = N->getOperand(0);
  if (Op0.getOpcode() == ISD::BITCAST)
    Op0 = Op0.getOperand(0);
  if (!ISD::isBuildVectorAllZeros(Op0.getNode()))
    return SDValue();

  // (z_merge_* 0, 0) -> 0. This is mostly useful for using VLLEZF
  // for v4f32.
  if (Op1 == N->getOperand(0))
    return Op1;

  // (z_merge_? 0, X) -> (z_unpackl_? 0, X), for elements of at most 4 bytes.
  EVT VT = Op1.getValueType();
  const unsigned ElemBytes = VT.getVectorElementType().getStoreSize();
  if (ElemBytes > 4)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  const unsigned UnpackOpcode = N->getOpcode() == SystemZISD::MERGE_HIGH
                                    ? SystemZISD::UNPACKL_HIGH
                                    : SystemZISD::UNPACKL_LOW;
  // The unpack takes an integer vector and produces half as many elements
  // of twice the width.
  EVT InVT = VT.changeVectorElementTypeToInteger();
  EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(ElemBytes * 16),
                               SystemZ::VectorBytes / ElemBytes / 2);
  if (VT != InVT) {
    Op1 = DAG.getNode(ISD::BITCAST, SDLoc(N), InVT, Op1);
    DCI.AddToWorklist(Op1.getNode());
  }
  SDValue Unpacked = DAG.getNode(UnpackOpcode, SDLoc(N), OutVT, Op1);
  DCI.AddToWorklist(Unpacked.getNode());
  return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Unpacked);
}
// Rewrite a scalar load that is REPLICATEd and also has other users so that
// those other users read element 0 of the REPLICATE instead of the load.
// Otherwise instruction selection will not produce a VLREP. To avoid
// extracting to a GPR this is only done for loads whose type is neither a
// vector nor an integer, i.e. floating-point loads.
SDValue SystemZTargetLowering::combineLOAD(
    SDNode *N, DAGCombinerInfo &DCI) const {
  EVT LdVT = N->getValueType(0);
  if (LdVT.isVector() || LdVT.isInteger())
    return SDValue();

  // Split the users into the (single) REPLICATE and the other users of the
  // loaded value (result number 0).
  SDValue Replicate;
  SmallVector<SDNode*, 8> OtherUses;
  for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
       UI != UE; ++UI) {
    SDNode *User = *UI;
    if (User->getOpcode() != SystemZISD::REPLICATE) {
      if (UI.getUse().getResNo() == 0)
        OtherUses.push_back(User);
      continue;
    }
    if (Replicate)
      return SDValue(); // Should never happen
    Replicate = SDValue(User, 0);
  }
  if (!Replicate || OtherUses.empty())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  SDValue Extract0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, LdVT,
                                 Replicate, DAG.getConstant(0, DL, MVT::i32));

  // Point the other users at the extract; operands that refer to other
  // results of N (such as the chain) are left untouched.
  for (SDNode *User : OtherUses) {
    SmallVector<SDValue, 8> NewOps;
    for (SDValue Operand : User->ops()) {
      const bool IsLoadedValue =
          Operand.getNode() == N && Operand.getResNo() == 0;
      NewOps.push_back(IsLoadedValue ? Extract0 : Operand);
    }
    DAG.UpdateNodeOperands(User, NewOps);
  }
  return SDValue(N, 0);
}
// Return true if values of type VT can be loaded/stored byte-swapped:
// i16/i32/i64 always, and v8i16/v4i32/v2i64 when the subtarget reports
// vector-enhancements facility 2.
bool SystemZTargetLowering::canLoadStoreByteSwapped(EVT VT) const {
  const bool IsScalarCandidate =
      VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64;
  if (IsScalarCandidate)
    return true;

  const bool IsVectorCandidate =
      VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64;
  return IsVectorCandidate && Subtarget.hasVectorEnhancements2();
}
// Return true if shuffle mask M, applied to a simple 128-bit vector type VT
// with byte-multiple elements, reverses the order of the elements.  Negative
// (undefined) mask entries match anything.
static bool isVectorElementSwap(ArrayRef<int> M, EVT VT) {
  const bool Supported = VT.isVector() && VT.isSimple() &&
                         VT.getSizeInBits() == 128 &&
                         VT.getScalarSizeInBits() % 8 == 0;
  if (!Supported)
    return false;

  const unsigned NumElts = VT.getVectorNumElements();
  // Mirror counts down from NumElts - 1 while Idx counts up from 0.
  for (unsigned Idx = 0, Mirror = NumElts; Idx != NumElts; ++Idx) {
    --Mirror;
    const int Selected = M[Idx];
    if (Selected >= 0 && static_cast<unsigned>(Selected) != Mirror)
      return false;
  }
  return true;
}
// Combine a STORE node:
//  - rewrite a truncating integer store of an extracted vector element so
//    that the extraction is done on a narrower-element vector,
//  - fold STORE (BSWAP) into a byte-swapping store (STRV),
//  - fold STORE (element-reversing shuffle) into VSTER.
// Returns the replacement node, or an empty SDValue if nothing applied.
SDValue SystemZTargetLowering::combineSTORE(
    SDNode *N, DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  auto *Store = cast<StoreSDNode>(N);
  const SDValue &StoredVal = N->getOperand(1);
  const EVT MemVT = Store->getMemoryVT();
  const bool IsTruncating = Store->isTruncatingStore();

  // If we have (truncstoreiN (extract_vector_elt X, Y), Z) then it is better
  // for the extraction to be done on a vMiN value, so that we can use VSTE.
  // If X has wider elements then convert it to:
  // (truncstoreiN (extract_vector_elt (bitcast X), Y2), Z).
  if (MemVT.isInteger() && IsTruncating) {
    SDValue Narrowed =
        combineTruncateExtract(SDLoc(N), MemVT, Store->getValue(), DCI);
    if (Narrowed) {
      DCI.AddToWorklist(Narrowed.getNode());
      // Rewrite the store with the new form of stored value.
      return DAG.getTruncStore(Store->getChain(), SDLoc(Store), Narrowed,
                               Store->getBasePtr(), Store->getMemoryVT(),
                               Store->getMemOperand());
    }
  }

  // Both remaining folds need a non-truncating store whose stored value has
  // no other user.
  if (IsTruncating || !StoredVal.getNode()->hasOneUse())
    return SDValue();

  switch (StoredVal.getOpcode()) {
  case ISD::BSWAP: {
    // Combine STORE (BSWAP) into STRVH/STRV/STRVG/VSTBR
    if (!canLoadStoreByteSwapped(StoredVal.getValueType()))
      break;
    SDValue Unswapped = StoredVal.getOperand(0);
    // An i16 source is widened to i32 before being handed to STRV.
    if (Unswapped.getValueType() == MVT::i16)
      Unswapped = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), MVT::i32, Unswapped);
    SDValue Ops[] = {
      N->getOperand(0), Unswapped, N->getOperand(2)
    };
    return DAG.getMemIntrinsicNode(SystemZISD::STRV, SDLoc(N),
                                   DAG.getVTList(MVT::Other),
                                   Ops, MemVT, Store->getMemOperand());
  }
  case ISD::VECTOR_SHUFFLE: {
    // Combine STORE (element-swap) into VSTER
    if (!Subtarget.hasVectorEnhancements2())
      break;
    auto *Shuffle = cast<ShuffleVectorSDNode>(StoredVal.getNode());
    if (!isVectorElementSwap(Shuffle->getMask(), StoredVal.getValueType()))
      break;
    SDValue Ops[] = {
      N->getOperand(0), StoredVal.getOperand(0), N->getOperand(2)
    };
    return DAG.getMemIntrinsicNode(SystemZISD::VSTER, SDLoc(N),
                                   DAG.getVTList(MVT::Other),
                                   Ops, MemVT, Store->getMemOperand());
  }
  default:
    break;
  }
  return SDValue();
}
// Combine an element-reversing VECTOR_SHUFFLE of a single-use, non-extending
// load into an element-swapping load (VLER).  Only done when the subtarget
// reports vector-enhancements facility 2.
SDValue SystemZTargetLowering::combineVECTOR_SHUFFLE(
    SDNode *N, DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  // Take a copy of the operand: N is rewritten below.
  SDValue Load = N->getOperand(0);

  const bool IsCandidate = ISD::isNON_EXTLoad(Load.getNode()) &&
                           Load.hasOneUse() &&
                           Subtarget.hasVectorEnhancements2();
  if (!IsCandidate)
    return SDValue();

  auto *Shuffle = cast<ShuffleVectorSDNode>(N);
  ArrayRef<int> Mask = Shuffle->getMask();
  if (!isVectorElementSwap(Mask, N->getValueType(0)))
    return SDValue();

  LoadSDNode *LD = cast<LoadSDNode>(Load);

  // Create the element-swapping load.
  SDValue Ops[] = {
    LD->getChain(),    // Chain
    LD->getBasePtr()   // Ptr
  };
  SDValue ESLoad =
      DAG.getMemIntrinsicNode(SystemZISD::VLER, SDLoc(N),
                              DAG.getVTList(LD->getValueType(0), MVT::Other),
                              Ops, LD->getMemoryVT(), LD->getMemOperand());

  // First, combine the VECTOR_SHUFFLE away.  This makes the value produced
  // by the load dead.
  DCI.CombineTo(N, ESLoad);

  // Next, combine the load away, we give it a bogus result value but a real
  // chain result.  The result value is dead because the shuffle is dead.
  DCI.CombineTo(Load.getNode(), ESLoad, ESLoad.getValue(1));

  // Return N so it doesn't get rechecked!
  return SDValue(N, 0);
}
// Combine an EXTRACT_VECTOR_ELT node: either pull a single-use BSWAP out of
// the extraction so that the swap is done on the scalar, or, for a constant
// index, defer to combineExtract.  Does nothing without vector support.
SDValue SystemZTargetLowering::combineEXTRACT_VECTOR_ELT(
    SDNode *N, DAGCombinerInfo &DCI) const {
  if (!Subtarget.hasVector())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  const SDLoc DL(N);
  const EVT ResVT = N->getValueType(0);

  // Look through bitcasts that retain the number of vector elements.
  SDValue Src = N->getOperand(0);
  if (Src.getOpcode() == ISD::BITCAST) {
    const EVT OuterVT = Src.getValueType();
    const EVT InnerVT = Src.getOperand(0).getValueType();
    if (OuterVT.isVector() && InnerVT.isVector() &&
        OuterVT.getVectorNumElements() == InnerVT.getVectorNumElements())
      Src = Src.getOperand(0);
  }

  // Pull BSWAP out of a vector extraction.
  if (Src.getOpcode() == ISD::BSWAP && Src.hasOneUse()) {
    const EVT EltVT = Src.getValueType().getVectorElementType();
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
                              Src.getOperand(0), N->getOperand(1));
    DCI.AddToWorklist(Elt.getNode());
    SDValue Swapped = DAG.getNode(ISD::BSWAP, DL, EltVT, Elt);
    if (EltVT == ResVT)
      return Swapped;
    // The bitcast was looked through above, so convert back to N's type.
    DCI.AddToWorklist(Swapped.getNode());
    return DAG.getNode(ISD::BITCAST, DL, ResVT, Swapped);
  }

  // Try to simplify a vector extraction.
  auto *IndexN = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!IndexN)
    return SDValue();
  SDValue Vec = N->getOperand(0);
  return combineExtract(DL, ResVT, Vec.getValueType(), Vec,
                        IndexN->getZExtValue(), DCI, false);
}
// Combine a JOIN_DWORDS node whose two operands are the same value:
// (join_dwords X, X) == (replicate X).
SDValue SystemZTargetLowering::combineJOIN_DWORDS(
    SDNode *N, DAGCombinerInfo &DCI) const {
  const SDValue &First = N->getOperand(0);
  if (First != N->getOperand(1))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  return DAG.getNode(SystemZISD::REPLICATE, SDLoc(N), N->getValueType(0),
                     First);
}
// Combine a pair of scalar f64->f32 rounds of elements 0 and 1 of the same
// v2f64 vector into a single vector round:
//
// (fpround (extract_vector_elt X 0))
// (fpround (extract_vector_elt X 1)) ->
// (extract_vector_elt (VROUND X) 0)
// (extract_vector_elt (VROUND X) 2)
//
// This is a special case since the target doesn't really support v2f32s.
// N is the round of element 0; the round of element 1 is found among the
// other users of X and is replaced in place.
SDValue SystemZTargetLowering::combineFP_ROUND(
    SDNode *N, DAGCombinerInfo &DCI) const {
  if (!Subtarget.hasVector())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDValue Op0 = N->getOperand(0);

  // N must be an f32 round of a single-use extract of element 0 of a v2f64.
  if (N->getValueType(0) != MVT::f32 || !Op0.hasOneUse() ||
      Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return SDValue();
  SDValue Vec = Op0.getOperand(0);
  if (Vec.getValueType() != MVT::v2f64)
    return SDValue();
  const SDValue &Index0 = Op0.getOperand(1);
  if (Index0.getOpcode() != ISD::Constant ||
      cast<ConstantSDNode>(Index0)->getZExtValue() != 0)
    return SDValue();

  // Look for a sibling single-use extract of element 1 that feeds another
  // f32 round.
  for (auto *U : Vec->uses()) {
    if (U == Op0.getNode() || !U->hasOneUse() ||
        U->getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        U->getOperand(0) != Vec)
      continue;
    const SDValue &Index1 = U->getOperand(1);
    if (Index1.getOpcode() != ISD::Constant ||
        cast<ConstantSDNode>(Index1)->getZExtValue() != 1)
      continue;

    SDValue OtherRound = SDValue(*U->use_begin(), 0);
    if (OtherRound.getOpcode() != ISD::FP_ROUND ||
        OtherRound.getOperand(0) != SDValue(U, 0) ||
        OtherRound.getValueType() != MVT::f32)
      continue;

    SDValue VRound = DAG.getNode(SystemZISD::VROUND, SDLoc(N),
                                 MVT::v4f32, Vec);
    DCI.AddToWorklist(VRound.getNode());

    // The rounded form of source element 1 is read from element 2.
    SDValue Extract1 =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(U), MVT::f32,
                    VRound, DAG.getConstant(2, SDLoc(U), MVT::i32));
    DCI.AddToWorklist(Extract1.getNode());
    DAG.ReplaceAllUsesOfValueWith(OtherRound, Extract1);

    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32,
                       VRound, DAG.getConstant(0, SDLoc(Op0), MVT::i32));
  }
  return SDValue();
}
// Combine a pair of scalar f32->f64 extends of elements 0 and 2 of the same
// v4f32 vector into a single vector extend:
//
// (fpextend (extract_vector_elt X 0))
// (fpextend (extract_vector_elt X 2)) ->
// (extract_vector_elt (VEXTEND X) 0)
// (extract_vector_elt (VEXTEND X) 1)
//
// This is a special case since the target doesn't really support v2f32s.
// N is the extend of element 0; the extend of element 2 is found among the
// other users of X and is replaced in place.
SDValue SystemZTargetLowering::combineFP_EXTEND(
    SDNode *N, DAGCombinerInfo &DCI) const {
  if (!Subtarget.hasVector())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDValue Op0 = N->getOperand(0);

  // N must be an f64 extend of a single-use extract of element 0 of a v4f32.
  if (N->getValueType(0) != MVT::f64 || !Op0.hasOneUse() ||
      Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return SDValue();
  SDValue Vec = Op0.getOperand(0);
  if (Vec.getValueType() != MVT::v4f32)
    return SDValue();
  const SDValue &Index0 = Op0.getOperand(1);
  if (Index0.getOpcode() != ISD::Constant ||
      cast<ConstantSDNode>(Index0)->getZExtValue() != 0)
    return SDValue();

  // Look for a sibling single-use extract of element 2 that feeds another
  // f64 extend.
  for (auto *U : Vec->uses()) {
    if (U == Op0.getNode() || !U->hasOneUse() ||
        U->getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        U->getOperand(0) != Vec)
      continue;
    const SDValue &Index2 = U->getOperand(1);
    if (Index2.getOpcode() != ISD::Constant ||
        cast<ConstantSDNode>(Index2)->getZExtValue() != 2)
      continue;

    SDValue OtherExtend = SDValue(*U->use_begin(), 0);
    if (OtherExtend.getOpcode() != ISD::FP_EXTEND ||
        OtherExtend.getOperand(0) != SDValue(U, 0) ||
        OtherExtend.getValueType() != MVT::f64)
      continue;

    SDValue VExtend = DAG.getNode(SystemZISD::VEXTEND, SDLoc(N),
                                  MVT::v2f64, Vec);
    DCI.AddToWorklist(VExtend.getNode());

    // The extended form of source element 2 is read from element 1.
    SDValue Extract1 =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(U), MVT::f64,
                    VExtend, DAG.getConstant(1, SDLoc(U), MVT::i32));
    DCI.AddToWorklist(Extract1.getNode());
    DAG.ReplaceAllUsesOfValueWith(OtherExtend, Extract1);

    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f64,
                       VExtend, DAG.getConstant(0, SDLoc(Op0), MVT::i32));
  }
  return SDValue();
}
// Combine a BSWAP node.  Three rewrites are tried in order:
//  1. BSWAP of a single-use, non-extending load of a byte-swappable type
//     becomes a byte-swapping load (SystemZISD::LRV).
//  2. BSWAP of a single-use INSERT_VECTOR_ELT is pushed into both the vector
//     and the inserted element when at least one of them then simplifies.
//  3. BSWAP of a single-use vector shuffle is pushed into both shuffle
//     inputs when at least one of them then simplifies.
// Returns the replacement value (or N itself after CombineTo), or an empty
// SDValue if no rewrite applied.
SDValue SystemZTargetLowering::combineBSWAP(
SDNode *N, DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
// Combine BSWAP (LOAD) into LRVH/LRV/LRVG/VLBR
if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
N->getOperand(0).hasOneUse() &&
canLoadStoreByteSwapped(N->getValueType(0))) {
SDValue Load = N->getOperand(0);
LoadSDNode *LD = cast<LoadSDNode>(Load);
// Create the byte-swapping load.
SDValue Ops[] = {
LD->getChain(), // Chain
LD->getBasePtr() // Ptr
};
// An i16 load is given an i32 result type here and truncated back below.
EVT LoadVT = N->getValueType(0);
if (LoadVT == MVT::i16)
LoadVT = MVT::i32;
SDValue BSLoad =
DAG.getMemIntrinsicNode(SystemZISD::LRV, SDLoc(N),
DAG.getVTList(LoadVT, MVT::Other),
Ops, LD->getMemoryVT(), LD->getMemOperand());
// If this is an i16 load, insert the truncate.
SDValue ResVal = BSLoad;
if (N->getValueType(0) == MVT::i16)
ResVal = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i16, BSLoad);
// First, combine the bswap away. This makes the value produced by the
// load dead.
DCI.CombineTo(N, ResVal);
// Next, combine the load away, we give it a bogus result value but a real
// chain result. The result value is dead because the bswap is dead.
DCI.CombineTo(Load.getNode(), ResVal, BSLoad.getValue(1));
// Return N so it doesn't get rechecked!
return SDValue(N, 0);
}
// Look through bitcasts that retain the number of vector elements.
SDValue Op = N->getOperand(0);
if (Op.getOpcode() == ISD::BITCAST &&
Op.getValueType().isVector() &&
Op.getOperand(0).getValueType().isVector() &&
Op.getValueType().getVectorNumElements() ==
Op.getOperand(0).getValueType().getVectorNumElements())
Op = Op.getOperand(0);
// Push BSWAP into a vector insertion if at least one side then simplifies.
if (Op.getOpcode() == ISD::INSERT_VECTOR_ELT && Op.hasOneUse()) {
SDValue Vec = Op.getOperand(0);
SDValue Elt = Op.getOperand(1);
SDValue Idx = Op.getOperand(2);
// A side "simplifies" if it is a constant, is itself a BSWAP, is undef,
// or (for the element only) is a single-use non-extending load of a
// byte-swappable type.
if (DAG.isConstantIntBuildVectorOrConstantInt(Vec) ||
Vec.getOpcode() == ISD::BSWAP || Vec.isUndef() ||
DAG.isConstantIntBuildVectorOrConstantInt(Elt) ||
Elt.getOpcode() == ISD::BSWAP || Elt.isUndef() ||
(canLoadStoreByteSwapped(N->getValueType(0)) &&
ISD::isNON_EXTLoad(Elt.getNode()) && Elt.hasOneUse())) {
EVT VecVT = N->getValueType(0);
EVT EltVT = N->getValueType(0).getVectorElementType();
// Bitcast both sides to N's types (they may differ if a bitcast was
// looked through above) before swapping them.
if (VecVT != Vec.getValueType()) {
Vec = DAG.getNode(ISD::BITCAST, SDLoc(N), VecVT, Vec);
DCI.AddToWorklist(Vec.getNode());
}
if (EltVT != Elt.getValueType()) {
Elt = DAG.getNode(ISD::BITCAST, SDLoc(N), EltVT, Elt);
DCI.AddToWorklist(Elt.getNode());
}
Vec = DAG.getNode(ISD::BSWAP, SDLoc(N), VecVT, Vec);
DCI.AddToWorklist(Vec.getNode());
Elt = DAG.getNode(ISD::BSWAP, SDLoc(N), EltVT, Elt);
DCI.AddToWorklist(Elt.getNode());
return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(N), VecVT,
Vec, Elt, Idx);
}
}
// Push BSWAP into a vector shuffle if at least one side then simplifies.
ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(Op);
if (SV && Op.hasOneUse()) {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
// Here a side "simplifies" if it is a constant, a BSWAP, or undef.
if (DAG.isConstantIntBuildVectorOrConstantInt(Op0) ||
Op0.getOpcode() == ISD::BSWAP || Op0.isUndef() ||
DAG.isConstantIntBuildVectorOrConstantInt(Op1) ||
Op1.getOpcode() == ISD::BSWAP || Op1.isUndef()) {
EVT VecVT = N->getValueType(0);
if (VecVT != Op0.getValueType()) {
Op0 = DAG.getNode(ISD::BITCAST, SDLoc(N), VecVT, Op0);
DCI.AddToWorklist(Op0.getNode());
}
if (VecVT != Op1.getValueType()) {
Op1 = DAG.getNode(ISD::BITCAST, SDLoc(N), VecVT, Op1);
DCI.AddToWorklist(Op1.getNode());
}
Op0 = DAG.getNode(ISD::BSWAP, SDLoc(N), VecVT, Op0);
DCI.AddToWorklist(Op0.getNode());
Op1 = DAG.getNode(ISD::BSWAP, SDLoc(N), VecVT, Op1);
DCI.AddToWorklist(Op1.getNode());
// The shuffle itself is rebuilt with the original mask.
return DAG.getVectorShuffle(VecVT, SDLoc(N), Op0, Op1, SV->getMask());
}
}
return SDValue();
}
// Helper for the BR_CCMASK / SELECT_CCMASK combines.  If CCReg is an ICMP
// against a constant whose left-hand side itself derives from a condition
// code (via SELECT_CCMASK or an (SRA (SHL (IPM))) sequence), rewrite CCReg,
// CCValid and CCMask in place so that the original condition code is tested
// directly, and return true.  On a false return none of the three
// arguments has been modified.
static bool combineCCMask(SDValue &CCReg, int &CCValid, int &CCMask) {
// We have a SELECT_CCMASK or BR_CCMASK comparing the condition code
// set by the CCReg instruction using the CCValid / CCMask masks.
// If the CCReg instruction is itself an ICMP testing the condition
// code set by some other instruction, see whether we can directly
// use that condition code.
// Verify that we have an ICMP against some constant.
if (CCValid != SystemZ::CCMASK_ICMP)
return false;
auto *ICmp = CCReg.getNode();
if (ICmp->getOpcode() != SystemZISD::ICMP)
return false;
auto *CompareLHS = ICmp->getOperand(0).getNode();
auto *CompareRHS = dyn_cast<ConstantSDNode>(ICmp->getOperand(1));
if (!CompareRHS)
return false;
// Optimize the case where CompareLHS is a SELECT_CCMASK.
if (CompareLHS->getOpcode() == SystemZISD::SELECT_CCMASK) {
// Verify that we have an appropriate mask for an EQ or NE comparison.
// Invert tracks whether the inner select's mask must be negated.
bool Invert = false;
if (CCMask == SystemZ::CCMASK_CMP_NE)
Invert = !Invert;
else if (CCMask != SystemZ::CCMASK_CMP_EQ)
return false;
// Verify that the ICMP compares against one of select values.
auto *TrueVal = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(0));
if (!TrueVal)
return false;
auto *FalseVal = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(1));
if (!FalseVal)
return false;
// Comparing against the false value flips the sense once more.
if (CompareRHS->getZExtValue() == FalseVal->getZExtValue())
Invert = !Invert;
else if (CompareRHS->getZExtValue() != TrueVal->getZExtValue())
return false;
// Compute the effective CC mask for the new branch or select.
auto *NewCCValid = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(2));
auto *NewCCMask = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(3));
if (!NewCCValid || !NewCCMask)
return false;
CCValid = NewCCValid->getZExtValue();
CCMask = NewCCMask->getZExtValue();
// Inverting a mask means flipping it within the set of valid bits.
if (Invert)
CCMask ^= CCValid;
// Return the updated CCReg link.
CCReg = CompareLHS->getOperand(4);
return true;
}
// Optimize the case where CompareLHS is (SRA (SHL (IPM))).
if (CompareLHS->getOpcode() == ISD::SRA) {
// Both shift counts must be the exact constants checked below.
auto *SRACount = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(1));
if (!SRACount || SRACount->getZExtValue() != 30)
return false;
auto *SHL = CompareLHS->getOperand(0).getNode();
if (SHL->getOpcode() != ISD::SHL)
return false;
auto *SHLCount = dyn_cast<ConstantSDNode>(SHL->getOperand(1));
if (!SHLCount || SHLCount->getZExtValue() != 30 - SystemZ::IPM_CC)
return false;
auto *IPM = SHL->getOperand(0).getNode();
if (IPM->getOpcode() != SystemZISD::IPM)
return false;
// Avoid introducing CC spills (because SRA would clobber CC).
if (!CompareLHS->hasOneUse())
return false;
// Verify that the ICMP compares against zero.
if (CompareRHS->getZExtValue() != 0)
return false;
// Compute the effective CC mask for the new branch or select.
// EQ and NE are kept as they are; the ordered comparisons are swapped
// (LT <-> GT, LE <-> GE).  Any other mask is not handled.
switch (CCMask) {
case SystemZ::CCMASK_CMP_EQ: break;
case SystemZ::CCMASK_CMP_NE: break;
case SystemZ::CCMASK_CMP_LT: CCMask = SystemZ::CCMASK_CMP_GT; break;
case SystemZ::CCMASK_CMP_GT: CCMask = SystemZ::CCMASK_CMP_LT; break;
case SystemZ::CCMASK_CMP_LE: CCMask = SystemZ::CCMASK_CMP_GE; break;
case SystemZ::CCMASK_CMP_GE: CCMask = SystemZ::CCMASK_CMP_LE; break;
default: return false;
}
// Return the updated CCReg link.
CCReg = IPM->getOperand(0);
return true;
}
return false;
}
// Combine BR_CCMASK (ICMP (SELECT_CCMASK)) into a single BR_CCMASK.
// The CC-valid and CC-mask operands (1 and 2) must be constants; the
// rewrite itself is decided by combineCCMask.
SDValue SystemZTargetLowering::combineBR_CCMASK(
    SDNode *N, DAGCombinerInfo &DCI) const {
  auto *ValidC = dyn_cast<ConstantSDNode>(N->getOperand(1));
  auto *MaskC = dyn_cast<ConstantSDNode>(N->getOperand(2));
  if (!ValidC || !MaskC)
    return SDValue();

  int CCValidVal = ValidC->getZExtValue();
  int CCMaskVal = MaskC->getZExtValue();
  SDValue CCReg = N->getOperand(4);

  // combineCCMask updates CCReg and both mask values on success.
  if (!combineCCMask(CCReg, CCValidVal, CCMaskVal))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDValue Chain = N->getOperand(0);
  SDValue Dest = N->getOperand(3);
  return DAG.getNode(SystemZISD::BR_CCMASK, SDLoc(N), N->getValueType(0),
                     Chain,
                     DAG.getConstant(CCValidVal, SDLoc(N), MVT::i32),
                     DAG.getConstant(CCMaskVal, SDLoc(N), MVT::i32),
                     Dest, CCReg);
}
// Combine SELECT_CCMASK (ICMP (SELECT_CCMASK)) into a single SELECT_CCMASK.
// The CC-valid and CC-mask operands (2 and 3) must be constants; the
// rewrite itself is decided by combineCCMask.
SDValue SystemZTargetLowering::combineSELECT_CCMASK(
    SDNode *N, DAGCombinerInfo &DCI) const {
  auto *ValidC = dyn_cast<ConstantSDNode>(N->getOperand(2));
  auto *MaskC = dyn_cast<ConstantSDNode>(N->getOperand(3));
  if (!ValidC || !MaskC)
    return SDValue();

  int CCValidVal = ValidC->getZExtValue();
  int CCMaskVal = MaskC->getZExtValue();
  SDValue CCReg = N->getOperand(4);

  // combineCCMask updates CCReg and both mask values on success.
  if (!combineCCMask(CCReg, CCValidVal, CCMaskVal))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  return DAG.getNode(SystemZISD::SELECT_CCMASK, SDLoc(N), N->getValueType(0),
                     N->getOperand(0),
                     N->getOperand(1),
                     DAG.getConstant(CCValidVal, SDLoc(N), MVT::i32),
                     DAG.getConstant(CCMaskVal, SDLoc(N), MVT::i32),
                     CCReg);
}
// Optimize away GET_CCMASK (SELECT_CCMASK) if the CC masks are compatible.
// On success the CC operand of the inner select (operand 4) is returned.
SDValue SystemZTargetLowering::combineGET_CCMASK(
    SDNode *N, DAGCombinerInfo &DCI) const {
  auto *ValidC = dyn_cast<ConstantSDNode>(N->getOperand(1));
  auto *MaskC = dyn_cast<ConstantSDNode>(N->getOperand(2));
  if (!ValidC || !MaskC)
    return SDValue();
  const int CCValidVal = ValidC->getZExtValue();
  const int CCMaskVal = MaskC->getZExtValue();

  SDValue Select = N->getOperand(0);
  if (Select->getOpcode() != SystemZISD::SELECT_CCMASK)
    return SDValue();

  auto *SelValidC = dyn_cast<ConstantSDNode>(Select->getOperand(2));
  auto *SelMaskC = dyn_cast<ConstantSDNode>(Select->getOperand(3));
  if (!SelValidC || !SelMaskC)
    return SDValue();
  const int SelectCCValidVal = SelValidC->getZExtValue();
  int SelectCCMaskVal = SelMaskC->getZExtValue();

  auto *TrueVal = dyn_cast<ConstantSDNode>(Select->getOperand(0));
  auto *FalseVal = dyn_cast<ConstantSDNode>(Select->getOperand(1));
  if (!TrueVal || !FalseVal)
    return SDValue();

  // Exactly one of the two select values must be zero.  If it is the true
  // value, the select's mask is inverted within its valid bits.
  const bool TrueIsNonZero = TrueVal->getZExtValue() != 0;
  const bool FalseIsNonZero = FalseVal->getZExtValue() != 0;
  if (TrueIsNonZero == FalseIsNonZero)
    return SDValue();
  if (!TrueIsNonZero)
    SelectCCMaskVal ^= SelectCCValidVal;

  // The select's valid bits must be a subset of ours, and the masks must
  // agree on those bits.
  if ((SelectCCValidVal & ~CCValidVal) != 0)
    return SDValue();
  if (SelectCCMaskVal != (CCMaskVal & SelectCCValidVal))
    return SDValue();

  return Select->getOperand(4);
}
// Scalarize a legal-typed vector SDIV/UDIV/SREM/UREM by a constant divisor
// before type legalization.
SDValue SystemZTargetLowering::combineIntDIVREM(
    SDNode *N, DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  const EVT VT = N->getValueType(0);

  // In the case where the divisor is a vector of constants a cheaper
  // sequence of instructions can replace the divide. BuildSDIV is called to
  // do this during DAG combining, but it only succeeds when it can build a
  // multiplication node. The only option for SystemZ is ISD::SMUL_LOHI, and
  // since it is not Legal but Custom it can only happen before
  // legalization. Therefore we must scalarize this early before Combine
  // 1. For widened vectors, this is already the result of type legalization.
  const bool ShouldUnroll =
      DCI.Level == BeforeLegalizeTypes && VT.isVector() && isTypeLegal(VT) &&
      DAG.isConstantIntBuildVectorOrConstantInt(N->getOperand(1));
  if (!ShouldUnroll)
    return SDValue();
  return DAG.UnrollVectorOp(N);
}
// Strip a PCREL_WRAPPER node and return the value it wraps; any other node
// is returned unchanged.
SDValue SystemZTargetLowering::unwrapAddress(SDValue N) const {
  const bool IsWrapped = N->getOpcode() == SystemZISD::PCREL_WRAPPER;
  return IsWrapped ? N->getOperand(0) : N;
}
// Target hook for DAG combining: dispatch on N's opcode to the matching
// combine* routine.  Opcodes without a dedicated routine yield an empty
// SDValue.
SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  switch (N->getOpcode()) {
  case ISD::ZERO_EXTEND:
    return combineZERO_EXTEND(N, DCI);
  case ISD::SIGN_EXTEND:
    return combineSIGN_EXTEND(N, DCI);
  case ISD::SIGN_EXTEND_INREG:
    return combineSIGN_EXTEND_INREG(N, DCI);
  case SystemZISD::MERGE_HIGH:
  case SystemZISD::MERGE_LOW:
    return combineMERGE(N, DCI);
  case ISD::LOAD:
    return combineLOAD(N, DCI);
  case ISD::STORE:
    return combineSTORE(N, DCI);
  case ISD::VECTOR_SHUFFLE:
    return combineVECTOR_SHUFFLE(N, DCI);
  case ISD::EXTRACT_VECTOR_ELT:
    return combineEXTRACT_VECTOR_ELT(N, DCI);
  case SystemZISD::JOIN_DWORDS:
    return combineJOIN_DWORDS(N, DCI);
  case ISD::FP_ROUND:
    return combineFP_ROUND(N, DCI);
  case ISD::FP_EXTEND:
    return combineFP_EXTEND(N, DCI);
  case ISD::BSWAP:
    return combineBSWAP(N, DCI);
  case SystemZISD::BR_CCMASK:
    return combineBR_CCMASK(N, DCI);
  case SystemZISD::SELECT_CCMASK:
    return combineSELECT_CCMASK(N, DCI);
  case SystemZISD::GET_CCMASK:
    return combineGET_CCMASK(N, DCI);
  case ISD::SDIV:
  case ISD::UDIV:
  case ISD::SREM:
  case ISD::UREM:
    return combineIntDIVREM(N, DCI);
  default:
    break;
  }
  return SDValue();
}
// Return the demanded elements for the OpNo source operand of Op. DemandedElts
// are for Op.
//
// Op must be one of the INTRINSIC_WO_CHAIN intrinsics handled below, or a
// JOIN_DWORDS / SELECT_CCMASK node; anything else hits llvm_unreachable.
// The returned APInt has one bit per element of the source operand.
static APInt getDemandedSrcElements(SDValue Op, const APInt &DemandedElts,
                                    unsigned OpNo) {
  EVT VT = Op.getValueType();
  unsigned NumElts = (VT.isVector() ? VT.getVectorNumElements() : 1);
  APInt SrcDemE;
  unsigned Opcode = Op.getOpcode();
  if (Opcode == ISD::INTRINSIC_WO_CHAIN) {
    unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    switch (Id) {
    case Intrinsic::s390_vpksh:   // PACKS
    case Intrinsic::s390_vpksf:
    case Intrinsic::s390_vpksg:
    case Intrinsic::s390_vpkshs:  // PACKS_CC
    case Intrinsic::s390_vpksfs:
    case Intrinsic::s390_vpksgs:
    case Intrinsic::s390_vpklsh:  // PACKLS
    case Intrinsic::s390_vpklsf:
    case Intrinsic::s390_vpklsg:
    case Intrinsic::s390_vpklshs: // PACKLS_CC
    case Intrinsic::s390_vpklsfs:
    case Intrinsic::s390_vpklsgs:
      // VECTOR PACK truncates the elements of two source vectors into one.
      // Operand 1 gets the low half of the demanded bits, operand 2 the
      // high half.
      SrcDemE = DemandedElts;
      if (OpNo == 2)
        SrcDemE.lshrInPlace(NumElts / 2);
      SrcDemE = SrcDemE.trunc(NumElts / 2);
      break;
      // VECTOR UNPACK extends half the elements of the source vector.
    case Intrinsic::s390_vuphb:  // VECTOR UNPACK HIGH
    case Intrinsic::s390_vuphh:
    case Intrinsic::s390_vuphf:
    case Intrinsic::s390_vuplhb: // VECTOR UNPACK LOGICAL HIGH
    case Intrinsic::s390_vuplhh:
    case Intrinsic::s390_vuplhf:
      SrcDemE = APInt(NumElts * 2, 0);
      SrcDemE.insertBits(DemandedElts, 0);
      break;
    case Intrinsic::s390_vuplb:  // VECTOR UNPACK LOW
    case Intrinsic::s390_vuplhw:
    case Intrinsic::s390_vuplf:
    case Intrinsic::s390_vupllb: // VECTOR UNPACK LOGICAL LOW
    case Intrinsic::s390_vupllh:
    case Intrinsic::s390_vupllf:
      SrcDemE = APInt(NumElts * 2, 0);
      SrcDemE.insertBits(DemandedElts, NumElts);
      break;
    case Intrinsic::s390_vpdi: {
      // VECTOR PERMUTE DWORD IMMEDIATE selects one element from each source.
      SrcDemE = APInt(NumElts, 0);
      if (!DemandedElts[OpNo - 1])
        break;
      unsigned Mask = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      unsigned MaskBit = ((OpNo - 1) ? 1 : 4);
      // Demand input element 0 or 1, given by the mask bit value.
      SrcDemE.setBit((Mask & MaskBit)? 1 : 0);
      break;
    }
    case Intrinsic::s390_vsldb: {
      // VECTOR SHIFT LEFT DOUBLE BY BYTE
      assert(VT == MVT::v16i8 && "Unexpected type.");
      unsigned FirstIdx = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      assert (FirstIdx > 0 && FirstIdx < 16 && "Unused operand.");
      unsigned NumSrc0Els = 16 - FirstIdx;
      SrcDemE = APInt(NumElts, 0);
      if (OpNo == 1) {
        APInt DemEls = DemandedElts.trunc(NumSrc0Els);
        SrcDemE.insertBits(DemEls, FirstIdx);
      } else {
        APInt DemEls = DemandedElts.lshr(NumSrc0Els);
        SrcDemE.insertBits(DemEls, 0);
      }
      break;
    }
    case Intrinsic::s390_vperm:
      // VECTOR PERMUTE can select any byte of either source operand, so
      // every source element is demanded.  APInt(NumElts, 1) would be the
      // *value* 1, i.e. element 0 only.
      SrcDemE = APInt::getAllOnesValue(NumElts);
      break;
    default:
      llvm_unreachable("Unhandled intrinsic.");
      break;
    }
  } else {
    switch (Opcode) {
    case SystemZISD::JOIN_DWORDS:
      // Scalar operand.
      SrcDemE = APInt(1, 1);
      break;
    case SystemZISD::SELECT_CCMASK:
      // The demanded elements are passed through to the source unchanged.
      SrcDemE = DemandedElts;
      break;
    default:
      llvm_unreachable("Unhandled opcode.");
      break;
    }
  }
  return SrcDemE;
}
static void computeKnownBitsBinOp(const SDValue Op, KnownBits &Known,
const APInt &DemandedElts,
const SelectionDAG &DAG, unsigned Depth,
unsigned OpNo) {
APInt Src0DemE = getDemandedSrcElements(Op, DemandedElts, OpNo);
APInt Src1DemE = getDemandedSrcElements(Op, DemandedElts, OpNo + 1);
KnownBits LHSKnown =
DAG.computeKnownBits(Op.getOperand(OpNo), Src0DemE, Depth + 1);
KnownBits RHSKnown =
DAG.computeKnownBits(Op.getOperand(OpNo + 1), Src1DemE, Depth + 1);
Known.Zero = LHSKnown.Zero & RHSKnown.Zero;
Known.One = LHSKnown.One & RHSKnown.One;
}
// Target hook: compute the known-zero / known-one bits of the target node
// (or target intrinsic) Op into Known, for the vector elements selected by
// DemandedElts.  Known is reset first, so nodes that are not handled below
// report nothing known.
void
SystemZTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
KnownBits &Known,
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const {
Known.resetAll();
// Intrinsic CC result is returned in the two low bits.
unsigned tmp0, tmp1; // not used
if (Op.getResNo() == 1 && isIntrinsicWithCC(Op, tmp0, tmp1)) {
Known.Zero.setBitsFrom(2);
return;
}
// Only result 0 with a typed value is analysed from here on.
EVT VT = Op.getValueType();
if (Op.getResNo() != 0 || VT == MVT::Untyped)
return;
assert (Known.getBitWidth() == VT.getScalarSizeInBits() &&
"KnownBits does not match VT in bitwidth");
assert ((!VT.isVector() ||
(DemandedElts.getBitWidth() == VT.getVectorNumElements())) &&
"DemandedElts does not match VT number of elements");
unsigned BitWidth = Known.getBitWidth();
unsigned Opcode = Op.getOpcode();
if (Opcode == ISD::INTRINSIC_WO_CHAIN) {
// Set for the "logical" unpacks, whose result is zero-extended below.
bool IsLogical = false;
unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
switch (Id) {
case Intrinsic::s390_vpksh: // PACKS
case Intrinsic::s390_vpksf:
case Intrinsic::s390_vpksg:
case Intrinsic::s390_vpkshs: // PACKS_CC
case Intrinsic::s390_vpksfs:
case Intrinsic::s390_vpksgs:
case Intrinsic::s390_vpklsh: // PACKLS
case Intrinsic::s390_vpklsf:
case Intrinsic::s390_vpklsg:
case Intrinsic::s390_vpklshs: // PACKLS_CC
case Intrinsic::s390_vpklsfs:
case Intrinsic::s390_vpklsgs:
case Intrinsic::s390_vpdi:
case Intrinsic::s390_vsldb:
case Intrinsic::s390_vperm:
// Two-source intrinsics: the sources are operands 1 and 2 (operand 0 is
// the intrinsic ID).
computeKnownBitsBinOp(Op, Known, DemandedElts, DAG, Depth, 1);
break;
case Intrinsic::s390_vuplhb: // VECTOR UNPACK LOGICAL HIGH
case Intrinsic::s390_vuplhh:
case Intrinsic::s390_vuplhf:
case Intrinsic::s390_vupllb: // VECTOR UNPACK LOGICAL LOW
case Intrinsic::s390_vupllh:
case Intrinsic::s390_vupllf:
IsLogical = true;
LLVM_FALLTHROUGH;
case Intrinsic::s390_vuphb: // VECTOR UNPACK HIGH
case Intrinsic::s390_vuphh:
case Intrinsic::s390_vuphf:
case Intrinsic::s390_vuplb: // VECTOR UNPACK LOW
case Intrinsic::s390_vuplhw:
case Intrinsic::s390_vuplf: {
// Unpack: take the known bits of the narrower source elements and
// zero- or sign-extend them to the result element width.
SDValue SrcOp = Op.getOperand(1);
APInt SrcDemE = getDemandedSrcElements(Op, DemandedElts, 0);
Known = DAG.computeKnownBits(SrcOp, SrcDemE, Depth + 1);
if (IsLogical) {
Known = Known.zext(BitWidth, true);
} else
Known = Known.sext(BitWidth);
break;
}
default:
break;
}
} else {
switch (Opcode) {
case SystemZISD::JOIN_DWORDS:
case SystemZISD::SELECT_CCMASK:
// Two-source nodes: the sources are operands 0 and 1.
computeKnownBitsBinOp(Op, Known, DemandedElts, DAG, Depth, 0);
break;
case SystemZISD::REPLICATE: {
SDValue SrcOp = Op.getOperand(0);
Known = DAG.computeKnownBits(SrcOp, Depth + 1);
if (Known.getBitWidth() < BitWidth && isa<ConstantSDNode>(SrcOp))
Known = Known.sext(BitWidth); // VREPI sign extends the immediate.
break;
}
default:
break;
}
}
// Known has the width of the source operand(s). Adjust if needed to match
// the passed bitwidth.
if (Known.getBitWidth() != BitWidth)
Known = Known.zextOrTrunc(BitWidth, false);
}
// Return the number of sign bits common to two adjacent source operands of
// Op (operands OpNo and OpNo + 1), adjusted for the case where the source
// elements are wider than the result elements (PACK).
static unsigned computeNumSignBitsBinOp(SDValue Op, const APInt &DemandedElts,
                                        const SelectionDAG &DAG, unsigned Depth,
                                        unsigned OpNo) {
  const unsigned NextDepth = Depth + 1;
  const SDValue &LHSOp = Op.getOperand(OpNo);

  const APInt LHSDemE = getDemandedSrcElements(Op, DemandedElts, OpNo);
  const unsigned LHSBits = DAG.ComputeNumSignBits(LHSOp, LHSDemE, NextDepth);
  if (LHSBits == 1)
    return 1; // Early out.

  const APInt RHSDemE = getDemandedSrcElements(Op, DemandedElts, OpNo + 1);
  const unsigned RHSBits =
      DAG.ComputeNumSignBits(Op.getOperand(OpNo + 1), RHSDemE, NextDepth);
  if (RHSBits == 1)
    return 1; // Early out.

  const unsigned Common = std::min(LHSBits, RHSBits);
  const unsigned SrcBitWidth = LHSOp.getScalarValueSizeInBits();
  const unsigned VTBits = Op.getValueType().getScalarSizeInBits();

  if (SrcBitWidth <= VTBits) {
    assert (SrcBitWidth == VTBits && "Expected operands of same bitwidth.");
    return Common;
  }

  // PACK: the bits dropped by the narrowing come out of the sign bits.
  const unsigned SrcExtraBits = SrcBitWidth - VTBits;
  return Common > SrcExtraBits ? Common - SrcExtraBits : 1;
}
// Target hook: return the number of known sign bits of the target node (or
// target intrinsic) Op for the elements in DemandedElts.  Returns 1 for
// anything that is not handled.
unsigned
SystemZTargetLowering::ComputeNumSignBitsForTargetNode(
    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
    unsigned Depth) const {
  if (Op.getResNo() != 0)
    return 1;

  const unsigned Opcode = Op.getOpcode();
  // SELECT_CCMASK: the two sources are operands 0 and 1.
  if (Opcode == SystemZISD::SELECT_CCMASK)
    return computeNumSignBitsBinOp(Op, DemandedElts, DAG, Depth, 0);
  if (Opcode != ISD::INTRINSIC_WO_CHAIN)
    return 1;

  const unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  switch (Id) {
  case Intrinsic::s390_vpksh:   // PACKS
  case Intrinsic::s390_vpksf:
  case Intrinsic::s390_vpksg:
  case Intrinsic::s390_vpkshs:  // PACKS_CC
  case Intrinsic::s390_vpksfs:
  case Intrinsic::s390_vpksgs:
  case Intrinsic::s390_vpklsh:  // PACKLS
  case Intrinsic::s390_vpklsf:
  case Intrinsic::s390_vpklsg:
  case Intrinsic::s390_vpklshs: // PACKLS_CC
  case Intrinsic::s390_vpklsfs:
  case Intrinsic::s390_vpklsgs:
  case Intrinsic::s390_vpdi:
  case Intrinsic::s390_vsldb:
  case Intrinsic::s390_vperm:
    // Two-source intrinsics: the sources are operands 1 and 2.
    return computeNumSignBitsBinOp(Op, DemandedElts, DAG, Depth, 1);
  case Intrinsic::s390_vuphb:  // VECTOR UNPACK HIGH
  case Intrinsic::s390_vuphh:
  case Intrinsic::s390_vuphf:
  case Intrinsic::s390_vuplb:  // VECTOR UNPACK LOW
  case Intrinsic::s390_vuplhw:
  case Intrinsic::s390_vuplf: {
    // The result has the source's sign bits plus the bits added by the
    // widening.
    const SDValue &PackedOp = Op.getOperand(1);
    const APInt SrcDemE = getDemandedSrcElements(Op, DemandedElts, 1);
    const unsigned SrcSignBits =
        DAG.ComputeNumSignBits(PackedOp, SrcDemE, Depth + 1);
    const unsigned VTBits = Op.getValueType().getScalarSizeInBits();
    return SrcSignBits + (VTBits - PackedOp.getScalarValueSizeInBits());
  }
  default:
    break;
  }
  return 1;
}
//===----------------------------------------------------------------------===//
// Custom insertion
//===----------------------------------------------------------------------===//
// Create a new, empty machine basic block for the same IR block as MBB and
// insert it into the function immediately after MBB.  Returns the new block.
static MachineBasicBlock *emitBlockAfter(MachineBasicBlock *MBB) {
  MachineFunction &Func = *MBB->getParent();
  MachineBasicBlock *Created =
      Func.CreateMachineBasicBlock(MBB->getBasicBlock());
  MachineFunction::iterator InsertPos =
      std::next(MachineFunction::iterator(MBB));
  Func.insert(InsertPos, Created);
  return Created;
}
// Split MBB after MI and return the new block (the one that contains
// instructions after MI).  MBB's successors are transferred to the new
// block.
static MachineBasicBlock *splitBlockAfter(MachineBasicBlock::iterator MI,
                                          MachineBasicBlock *MBB) {
  MachineBasicBlock *Tail = emitBlockAfter(MBB);
  MachineBasicBlock::iterator FirstMoved =
      std::next(MachineBasicBlock::iterator(MI));
  Tail->splice(Tail->begin(), MBB, FirstMoved, MBB->end());
  Tail->transferSuccessorsAndUpdatePHIs(MBB);
  return Tail;
}
// Split MBB before MI and return the new block (the one that contains MI).
// MBB's successors are transferred to the new block.
static MachineBasicBlock *splitBlockBefore(MachineBasicBlock::iterator MI,
                                           MachineBasicBlock *MBB) {
  MachineBasicBlock *Tail = emitBlockAfter(MBB);
  // Move [MI, end) into the new block.
  Tail->splice(Tail->begin(), MBB, MI, MBB->end());
  Tail->transferSuccessorsAndUpdatePHIs(MBB);
  return Tail;
}
// Force base value Base into a register before MI.  Return the register.
// A register operand is returned as is; otherwise an LA is emitted before
// MI that loads Base into a fresh ADDR64 virtual register.
static Register forceReg(MachineInstr &MI, MachineOperand &Base,
                         const SystemZInstrInfo *TII) {
  if (!Base.isReg()) {
    MachineBasicBlock *Parent = MI.getParent();
    MachineRegisterInfo &RegInfo = Parent->getParent()->getRegInfo();
    Register AddrReg =
        RegInfo.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
    // LA AddrReg, 0(Base) with no index register.
    BuildMI(*Parent, MI, MI.getDebugLoc(), TII->get(SystemZ::LA), AddrReg)
        .add(Base)
        .addImm(0)
        .addReg(0);
    return AddrReg;
  }
  return Base.getReg();
}
// ISel may leave the CC use in MI without a kill flag when CC had several
// users and it could not tell which one was last. Decide whether MI is in
// fact the last user of the current CC value within MBB.
//
// Returns false if a later instruction in MBB reads CC, or if the end of
// MBB is reached and some successor has CC as a live-in. Returns true
// otherwise, including when CC is redefined before being read again.
static bool checkCCKill(MachineInstr &MI, MachineBasicBlock *MBB) {
  const MachineBasicBlock::iterator BlockEnd = MBB->end();
  MachineBasicBlock::iterator Cursor =
      std::next(MachineBasicBlock::iterator(MI));

  // Walk the instructions that follow MI looking for the next CC access.
  while (Cursor != BlockEnd) {
    const MachineInstr &Later = *Cursor;
    if (Later.readsRegister(SystemZ::CC))
      return false;
    if (Later.definesRegister(SystemZ::CC))
      return true; // Redefined before any further read: MI kills CC.
    ++Cursor;
  }

  // Nothing in the rest of the block touches CC, so the answer depends on
  // whether the value flows into a successor.
  for (MachineBasicBlock *Succ : MBB->successors())
    if (Succ->isLiveIn(SystemZ::CC))
      return false;
  return true;
}
// Tell whether MI is one of the Select* pseudo-opcodes that may be grouped
// with neighbouring Select pseudos and expanded into a single basic block
// guarded by one conditional jump.
static bool isSelectPseudo(MachineInstr &MI) {
  const unsigned Opc = MI.getOpcode();
  return Opc == SystemZ::Select32 ||
         Opc == SystemZ::Select64 ||
         Opc == SystemZ::SelectF32 ||
         Opc == SystemZ::SelectF64 ||
         Opc == SystemZ::SelectF128 ||
         Opc == SystemZ::SelectVR32 ||
         Opc == SystemZ::SelectVR64 ||
         Opc == SystemZ::SelectVR128;
}
// Emit one PHI at the top of SinkMBB for every Select pseudo in the range
// [MIItBegin, MIItEnd) (debug instructions in the range are skipped):
//   %Result(i) = phi [ %TrueValue(i), TrueMBB ], [ %FalseValue(i), FalseMBB ]
// The CC valid bits and mask of the first Select define the branch condition;
// a Select whose mask is the inverse of that has its two inputs swapped.
// The function's NoPHIs property is cleared afterwards.
static void createPHIsForSelects(MachineBasicBlock::iterator MIItBegin,
                                 MachineBasicBlock::iterator MIItEnd,
                                 MachineBasicBlock *TrueMBB,
                                 MachineBasicBlock *FalseMBB,
                                 MachineBasicBlock *SinkMBB) {
  MachineFunction *MF = TrueMBB->getParent();
  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();

  const unsigned BranchValid = MIItBegin->getOperand(3).getImm();
  const unsigned BranchMask = MIItBegin->getOperand(4).getImm();
  const unsigned InvertedMask = BranchValid ^ BranchMask;
  DebugLoc DL = MIItBegin->getDebugLoc();
  MachineBasicBlock::iterator PHIPos = SinkMBB->begin();

  // A later Select may consume the result of an earlier one in the group.
  // The PHI built for the later Select cannot use the earlier PHI's result;
  // it has to use the value that fed the earlier PHI on the matching edge.
  // So PHIs are built front to back, and for every PHI destination we
  // remember the (true-edge, false-edge) inputs it was built from.
  DenseMap<unsigned, std::pair<unsigned, unsigned>> EdgeInputs;

  MachineBasicBlock::iterator Sel = MIItBegin;
  while (Sel != MIItEnd) {
    unsigned ResultReg = Sel->getOperand(0).getReg();
    unsigned OnTrue = Sel->getOperand(1).getReg();
    unsigned OnFalse = Sel->getOperand(2).getReg();

    // This Select tests the opposite of the emitted branch condition, so
    // its inputs arrive on the opposite edges.
    if (Sel->getOperand(4).getImm() == InvertedMask)
      std::swap(OnTrue, OnFalse);

    // Look through results of earlier PHIs in this group.
    auto TrueIt = EdgeInputs.find(OnTrue);
    if (TrueIt != EdgeInputs.end())
      OnTrue = TrueIt->second.first;
    auto FalseIt = EdgeInputs.find(OnFalse);
    if (FalseIt != EdgeInputs.end())
      OnFalse = FalseIt->second.second;

    BuildMI(*SinkMBB, PHIPos, DL, TII->get(SystemZ::PHI), ResultReg)
        .addReg(OnTrue)
        .addMBB(TrueMBB)
        .addReg(OnFalse)
        .addMBB(FalseMBB);

    // Record the inputs so later Selects can look through this PHI.
    EdgeInputs[ResultReg] = std::make_pair(OnTrue, OnFalse);

    Sel = skipDebugInstructionsForward(++Sel, MIItEnd);
  }

  MF->getProperties().reset(MachineFunctionProperties::Property::NoPHIs);
}
// Implement EmitInstrWithCustomInserter for pseudo Select* instruction MI.
//
// MI and any directly following Select* pseudos (ignoring debug
// instructions) that test the same CC valid bits with either the same mask
// or the inverted mask are expanded as one group:
//
//   StartMBB:  BRC CCValid, CCMask, JoinMBB   ; falls through to FalseMBB
//   FalseMBB:  (empty)                        ; falls through to JoinMBB
//   JoinMBB:   one PHI per Select, then the remainder of the original block
//
// Returns JoinMBB.
MachineBasicBlock *
SystemZTargetLowering::emitSelect(MachineInstr &MI,
                                  MachineBasicBlock *MBB) const {
  const SystemZInstrInfo *TII =
      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());

  unsigned CCValid = MI.getOperand(3).getImm();
  unsigned CCMask = MI.getOperand(4).getImm();
  DebugLoc DL = MI.getDebugLoc();

  // If we have a sequence of Select* pseudo instructions using the
  // same condition code value, we want to expand all of them into
  // a single pair of basic blocks using the same condition.
  MachineInstr *LastMI = &MI;
  MachineBasicBlock::iterator NextMIIt = skipDebugInstructionsForward(
      std::next(MachineBasicBlock::iterator(MI)), MBB->end());

  if (isSelectPseudo(MI))
    while (NextMIIt != MBB->end() && isSelectPseudo(*NextMIIt) &&
           NextMIIt->getOperand(3).getImm() == CCValid &&
           (NextMIIt->getOperand(4).getImm() == CCMask ||
            NextMIIt->getOperand(4).getImm() == (CCValid ^ CCMask))) {
      LastMI = &*NextMIIt;
      NextMIIt = skipDebugInstructionsForward(++NextMIIt, MBB->end());
    }

  MachineBasicBlock *StartMBB = MBB;
  MachineBasicBlock *JoinMBB = splitBlockBefore(MI, MBB);
  MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB);

  // Unless CC was killed in the last Select instruction, mark it as
  // live-in to both FalseMBB and JoinMBB.
  if (!LastMI->killsRegister(SystemZ::CC) && !checkCCKill(*LastMI, JoinMBB)) {
    FalseMBB->addLiveIn(SystemZ::CC);
    JoinMBB->addLiveIn(SystemZ::CC);
  }

  //  StartMBB:
  //   BRC CCMask, JoinMBB
  //   # fallthrough to FalseMBB
  MBB = StartMBB;
  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
      .addImm(CCValid).addImm(CCMask).addMBB(JoinMBB);
  MBB->addSuccessor(JoinMBB);
  MBB->addSuccessor(FalseMBB);

  //  FalseMBB:
  //   # fallthrough to JoinMBB
  MBB = FalseMBB;
  MBB->addSuccessor(JoinMBB);

  //  JoinMBB:
  //   %Result = phi [ %FalseReg, FalseMBB ], [ %TrueReg, StartMBB ]
  //  ...
  MBB = JoinMBB;
  MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
  MachineBasicBlock::iterator MIItEnd = skipDebugInstructionsForward(
      std::next(MachineBasicBlock::iterator(LastMI)), MBB->end());
  createPHIsForSelects(MIItBegin, MIItEnd, StartMBB, FalseMBB, MBB);

  // splitBlockBefore() moved the Select pseudos into JoinMBB, so remove the
  // whole [MIItBegin, MIItEnd) range through the block that owns it.
  JoinMBB->erase(MIItBegin, MIItEnd);
  return JoinMBB;
}
// Implement EmitInstrWithCustomInserter for pseudo CondStore* instruction MI.
// StoreOpcode is the plain store to emit. Invert selects whether the store
// is performed when the condition is false instead of true. STOCOpcode is
// the matching STORE ON CONDITION opcode, or 0 when there is none.
//
// When STORE ON CONDITION can be used, MI is replaced in place and MBB is
// returned. Otherwise the store is placed in its own block that is branched
// around, and the block following the store is returned.
MachineBasicBlock *SystemZTargetLowering::emitCondStore(MachineInstr &MI,
                                                        MachineBasicBlock *MBB,
                                                        unsigned StoreOpcode,
                                                        unsigned STOCOpcode,
                                                        bool Invert) const {
  const SystemZInstrInfo *TII =
      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());

  // Pseudo operands: value, base, displacement, index, CC valid bits, CC mask.
  unsigned ValueReg = MI.getOperand(0).getReg();
  MachineOperand AddrBase = MI.getOperand(1);
  int64_t AddrDisp = MI.getOperand(2).getImm();
  unsigned AddrIndex = MI.getOperand(3).getReg();
  unsigned CCValid = MI.getOperand(4).getImm();
  unsigned CCMask = MI.getOperand(5).getImm();
  DebugLoc DL = MI.getDebugLoc();

  StoreOpcode = TII->getOpcodeForOffset(StoreOpcode, AddrDisp);

  // Prefer STORE ON CONDITION when it exists and no index register is
  // involved. Different store patterns could avoid matching the index
  // register, but the performance trade-offs might be more complicated then.
  const bool UseSTOC =
      STOCOpcode && !AddrIndex && Subtarget.hasLoadStoreOnCond();
  if (UseSTOC) {
    if (Invert)
      CCMask ^= CCValid;

    // ISel pattern matching also attaches a load memory operand for the
    // same address, so pick out the first memory operand that is a store.
    MachineMemOperand *StoreMMO = nullptr;
    for (MachineMemOperand *Candidate : MI.memoperands()) {
      if (!Candidate->isStore())
        continue;
      StoreMMO = Candidate;
      break;
    }

    BuildMI(*MBB, MI, DL, TII->get(STOCOpcode))
        .addReg(ValueReg)
        .add(AddrBase)
        .addImm(AddrDisp)
        .addImm(CCValid)
        .addImm(CCMask)
        .addMemOperand(StoreMMO);
    MI.eraseFromParent();
    return MBB;
  }

  // The branch skips the store, so it needs the opposite of the condition
  // under which the store happens.
  if (!Invert)
    CCMask ^= CCValid;

  MachineBasicBlock *StartMBB = MBB;
  MachineBasicBlock *JoinMBB = splitBlockBefore(MI, MBB);
  MachineBasicBlock *StoreMBB = emitBlockAfter(StartMBB);

  // Unless CC was killed in the CondStore instruction, mark it as
  // live-in to both the store block and JoinMBB.
  if (!MI.killsRegister(SystemZ::CC) && !checkCCKill(MI, JoinMBB)) {
    StoreMBB->addLiveIn(SystemZ::CC);
    JoinMBB->addLiveIn(SystemZ::CC);
  }

  //  StartMBB:
  //   BRC CCMask, JoinMBB
  //   # fallthrough to StoreMBB
  BuildMI(StartMBB, DL, TII->get(SystemZ::BRC))
      .addImm(CCValid)
      .addImm(CCMask)
      .addMBB(JoinMBB);
  StartMBB->addSuccessor(JoinMBB);
  StartMBB->addSuccessor(StoreMBB);

  //  StoreMBB:
  //   store %ValueReg, %Disp(%Index,%Base)
  //   # fallthrough to JoinMBB
  BuildMI(StoreMBB, DL, TII->get(StoreOpcode))
      .addReg(ValueReg)
      .add(AddrBase)
      .addImm(AddrDisp)
      .addReg(AddrIndex);
  StoreMBB->addSuccessor(JoinMBB);

  MI.eraseFromParent();
  return JoinMBB;
}
// Implement EmitInstrWithCustomInserter for pseudo ATOMIC_LOAD{,W}_*
// or ATOMIC_SWAP{,W} instruction MI. BinOpcode is the instruction that
// performs the binary operation elided by "*", or 0 for ATOMIC_SWAP{,W}.
// BitSize is the width of the field in bits, or 0 if this is a partword
// ATOMIC_LOADW_* or ATOMIC_SWAPW instruction, in which case the bitsize
// is one of the operands. Invert says whether the field should be
// inverted after performing BinOpcode (e.g. for NAND).
//
// The expansion is a compare-and-swap loop: the current memory value is
// loaded once in the start block, and a loop block computes the new value
// and retries the CS until it succeeds. Returns the block that follows the
// loop (the one holding the instructions that came after MI).
MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary(
MachineInstr &MI, MachineBasicBlock *MBB, unsigned BinOpcode,
unsigned BitSize, bool Invert) const {
MachineFunction &MF = *MBB->getParent();
const SystemZInstrInfo *TII =
static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
MachineRegisterInfo &MRI = MF.getRegInfo();
// A BitSize below 32 (in practice the 0 passed for partword pseudos) marks
// the subword form, whose real field width is read from operand 6 below.
bool IsSubWord = (BitSize < 32);
// Extract the operands. Base can be a register or a frame index.
// Src2 can be a register or immediate.
unsigned Dest = MI.getOperand(0).getReg();
MachineOperand Base = earlyUseOperand(MI.getOperand(1));
int64_t Disp = MI.getOperand(2).getImm();
MachineOperand Src2 = earlyUseOperand(MI.getOperand(3));
// The shift-amount operands (4 and 5) are only read for the subword form.
Register BitShift = IsSubWord ? MI.getOperand(4).getReg() : Register();
Register NegBitShift = IsSubWord ? MI.getOperand(5).getReg() : Register();
DebugLoc DL = MI.getDebugLoc();
if (IsSubWord)
BitSize = MI.getOperand(6).getImm();
// Subword operations use 32-bit registers.
const TargetRegisterClass *RC = (BitSize <= 32 ?
&SystemZ::GR32BitRegClass :
&SystemZ::GR64BitRegClass);
unsigned LOpcode = BitSize <= 32 ? SystemZ::L : SystemZ::LG;
unsigned CSOpcode = BitSize <= 32 ? SystemZ::CS : SystemZ::CSG;
// Get the right opcodes for the displacement.
LOpcode = TII->getOpcodeForOffset(LOpcode, Disp);
CSOpcode = TII->getOpcodeForOffset(CSOpcode, Disp);
assert(LOpcode && CSOpcode && "Displacement out of range");
// Create virtual registers for temporary results.
// For a full-word swap (no BinOpcode, not subword) Src2 itself is the new
// value; in the non-subword case the "rotated" names alias the unrotated
// registers because no RLL is emitted.
Register OrigVal = MRI.createVirtualRegister(RC);
Register OldVal = MRI.createVirtualRegister(RC);
Register NewVal = (BinOpcode || IsSubWord ?
MRI.createVirtualRegister(RC) : Src2.getReg());
Register RotatedOldVal = (IsSubWord ? MRI.createVirtualRegister(RC) : OldVal);
Register RotatedNewVal = (IsSubWord ? MRI.createVirtualRegister(RC) : NewVal);
// Insert a basic block for the main loop.
MachineBasicBlock *StartMBB = MBB;
MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
// StartMBB:
// ...
// %OrigVal = L Disp(%Base)
// # fall through to LoopMBB
MBB = StartMBB;
BuildMI(MBB, DL, TII->get(LOpcode), OrigVal).add(Base).addImm(Disp).addReg(0);
MBB->addSuccessor(LoopMBB);
// LoopMBB:
// %OldVal = phi [ %OrigVal, StartMBB ], [ %Dest, LoopMBB ]
// %RotatedOldVal = RLL %OldVal, 0(%BitShift)
// %RotatedNewVal = OP %RotatedOldVal, %Src2
// %NewVal = RLL %RotatedNewVal, 0(%NegBitShift)
// %Dest = CS %OldVal, %NewVal, Disp(%Base)
// JNE LoopMBB
// # fall through to DoneMBB
MBB = LoopMBB;
BuildMI(MBB, DL, TII->get(SystemZ::PHI), OldVal)
.addReg(OrigVal).addMBB(StartMBB)
.addReg(Dest).addMBB(LoopMBB);
if (IsSubWord)
BuildMI(MBB, DL, TII->get(SystemZ::RLL), RotatedOldVal)
.addReg(OldVal).addReg(BitShift).addImm(0);
if (Invert) {
// Perform the operation normally and then invert every bit of the field.
unsigned Tmp = MRI.createVirtualRegister(RC);
BuildMI(MBB, DL, TII->get(BinOpcode), Tmp).addReg(RotatedOldVal).add(Src2);
if (BitSize <= 32)
// XILF with the upper BitSize bits set.
BuildMI(MBB, DL, TII->get(SystemZ::XILF), RotatedNewVal)
.addReg(Tmp).addImm(-1U << (32 - BitSize));
else {
// Use LCGR and add -1 to the result, which is more compact than
// an XILF, XILH pair.
unsigned Tmp2 = MRI.createVirtualRegister(RC);
BuildMI(MBB, DL, TII->get(SystemZ::LCGR), Tmp2).addReg(Tmp);
BuildMI(MBB, DL, TII->get(SystemZ::AGHI), RotatedNewVal)
.addReg(Tmp2).addImm(-1);
}
} else if (BinOpcode)
// A simple binary operation.
BuildMI(MBB, DL, TII->get(BinOpcode), RotatedNewVal)
.addReg(RotatedOldVal)
.add(Src2);
else if (IsSubWord)
// Use RISBG to rotate Src2 into position and use it to replace the
// field in RotatedOldVal.
BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RotatedNewVal)
.addReg(RotatedOldVal).addReg(Src2.getReg())
.addImm(32).addImm(31 + BitSize).addImm(32 - BitSize);
if (IsSubWord)
BuildMI(MBB, DL, TII->get(SystemZ::RLL), NewVal)
.addReg(RotatedNewVal).addReg(NegBitShift).addImm(0);
BuildMI(MBB, DL, TII->get(CSOpcode), Dest)
.addReg(OldVal)
.addReg(NewVal)
.add(Base)
.addImm(Disp);
BuildMI(MBB, DL, TII->get(SystemZ::BRC))
.addImm(SystemZ::CCMASK_CS).addImm(SystemZ::CCMASK_CS_NE).addMBB(LoopMBB);
MBB->addSuccessor(LoopMBB);
MBB->addSuccessor(DoneMBB);
MI.eraseFromParent();
return DoneMBB;
}
// Implement EmitInstrWithCustomInserter for pseudo
// ATOMIC_LOAD{,W}_{,U}{MIN,MAX} instruction MI. CompareOpcode is the
// instruction that should be used to compare the current field with the
// minimum or maximum value. KeepOldMask is the BRC condition-code mask
// for when the current field should be kept. BitSize is the width of
// the field in bits, or 0 if this is a partword ATOMIC_LOADW_* instruction.
//
// The expansion is a compare-and-swap loop made of three new blocks
// (LoopMBB, UseAltMBB, UpdateMBB). Returns the block that follows the loop
// (the one holding the instructions that came after MI).
MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadMinMax(
MachineInstr &MI, MachineBasicBlock *MBB, unsigned CompareOpcode,
unsigned KeepOldMask, unsigned BitSize) const {
MachineFunction &MF = *MBB->getParent();
const SystemZInstrInfo *TII =
static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
MachineRegisterInfo &MRI = MF.getRegInfo();
// A BitSize below 32 (in practice the 0 passed for partword pseudos) marks
// the subword form, whose real field width is read from operand 6 below.
bool IsSubWord = (BitSize < 32);
// Extract the operands. Base can be a register or a frame index.
unsigned Dest = MI.getOperand(0).getReg();
MachineOperand Base = earlyUseOperand(MI.getOperand(1));
int64_t Disp = MI.getOperand(2).getImm();
Register Src2 = MI.getOperand(3).getReg();
// The shift-amount operands (4 and 5) are only read for the subword form.
Register BitShift = (IsSubWord ? MI.getOperand(4).getReg() : Register());
Register NegBitShift = (IsSubWord ? MI.getOperand(5).getReg() : Register());
DebugLoc DL = MI.getDebugLoc();
if (IsSubWord)
BitSize = MI.getOperand(6).getImm();
// Subword operations use 32-bit registers.
const TargetRegisterClass *RC = (BitSize <= 32 ?
&SystemZ::GR32BitRegClass :
&SystemZ::GR64BitRegClass);
unsigned LOpcode = BitSize <= 32 ? SystemZ::L : SystemZ::LG;
unsigned CSOpcode = BitSize <= 32 ? SystemZ::CS : SystemZ::CSG;
// Get the right opcodes for the displacement.
LOpcode = TII->getOpcodeForOffset(LOpcode, Disp);
CSOpcode = TII->getOpcodeForOffset(CSOpcode, Disp);
assert(LOpcode && CSOpcode && "Displacement out of range");
// Create virtual registers for temporary results.
// In the non-subword case no RLL/RISBG is emitted, so the "rotated" names
// alias OldVal, Src2 and NewVal respectively.
Register OrigVal = MRI.createVirtualRegister(RC);
Register OldVal = MRI.createVirtualRegister(RC);
Register NewVal = MRI.createVirtualRegister(RC);
Register RotatedOldVal = (IsSubWord ? MRI.createVirtualRegister(RC) : OldVal);
Register RotatedAltVal = (IsSubWord ? MRI.createVirtualRegister(RC) : Src2);
Register RotatedNewVal = (IsSubWord ? MRI.createVirtualRegister(RC) : NewVal);
// Insert 3 basic blocks for the loop.
MachineBasicBlock *StartMBB = MBB;
MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
MachineBasicBlock *UseAltMBB = emitBlockAfter(LoopMBB);
MachineBasicBlock *UpdateMBB = emitBlockAfter(UseAltMBB);
// StartMBB:
// ...
// %OrigVal = L Disp(%Base)
// # fall through to LoopMBB
MBB = StartMBB;
BuildMI(MBB, DL, TII->get(LOpcode), OrigVal).add(Base).addImm(Disp).addReg(0);
MBB->addSuccessor(LoopMBB);
// LoopMBB:
// %OldVal = phi [ %OrigVal, StartMBB ], [ %Dest, UpdateMBB ]
// %RotatedOldVal = RLL %OldVal, 0(%BitShift)
// CompareOpcode %RotatedOldVal, %Src2
// BRC KeepOldMask, UpdateMBB
MBB = LoopMBB;
BuildMI(MBB, DL, TII->get(SystemZ::PHI), OldVal)
.addReg(OrigVal).addMBB(StartMBB)
.addReg(Dest).addMBB(UpdateMBB);
if (IsSubWord)
BuildMI(MBB, DL, TII->get(SystemZ::RLL), RotatedOldVal)
.addReg(OldVal).addReg(BitShift).addImm(0);
BuildMI(MBB, DL, TII->get(CompareOpcode))
.addReg(RotatedOldVal).addReg(Src2);
BuildMI(MBB, DL, TII->get(SystemZ::BRC))
.addImm(SystemZ::CCMASK_ICMP).addImm(KeepOldMask).addMBB(UpdateMBB);
MBB->addSuccessor(UpdateMBB);
MBB->addSuccessor(UseAltMBB);
// UseAltMBB:
// %RotatedAltVal = RISBG %RotatedOldVal, %Src2, 32, 31 + BitSize, 0
// # fall through to UpdateMBB
//
// The RISBG is only emitted for the subword form; otherwise this block is
// empty and %RotatedAltVal is simply %Src2.
MBB = UseAltMBB;
if (IsSubWord)
BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RotatedAltVal)
.addReg(RotatedOldVal).addReg(Src2)
.addImm(32).addImm(31 + BitSize).addImm(0);
MBB->addSuccessor(UpdateMBB);
// UpdateMBB:
// %RotatedNewVal = PHI [ %RotatedOldVal, LoopMBB ],
// [ %RotatedAltVal, UseAltMBB ]
// %NewVal = RLL %RotatedNewVal, 0(%NegBitShift)
// %Dest = CS %OldVal, %NewVal, Disp(%Base)
// JNE LoopMBB
// # fall through to DoneMBB
MBB = UpdateMBB;
BuildMI(MBB, DL, TII->get(SystemZ::PHI), RotatedNewVal)
.addReg(RotatedOldVal).addMBB(LoopMBB)
.addReg(RotatedAltVal).addMBB(UseAltMBB);
if (IsSubWord)
BuildMI(MBB, DL, TII->get(SystemZ::RLL), NewVal)
.addReg(RotatedNewVal).addReg(NegBitShift).addImm(0);
BuildMI(MBB, DL, TII->get(CSOpcode), Dest)
.addReg(OldVal)
.addReg(NewVal)
.add(Base)
.addImm(Disp);
BuildMI(MBB, DL, TII->get(SystemZ::BRC))
.addImm(SystemZ::CCMASK_CS).addImm(SystemZ::CCMASK_CS_NE).addMBB(LoopMBB);
MBB->addSuccessor(LoopMBB);
MBB->addSuccessor(DoneMBB);
MI.eraseFromParent();
return DoneMBB;
}
// Implement EmitInstrWithCustomInserter for pseudo ATOMIC_CMP_SWAPW
// instruction MI.
//
// The expansion loads the containing word once in the start block and then
// runs a compare/compare-and-swap loop made of LoopMBB and SetMBB. Returns
// the block that follows the loop (the one holding the instructions that
// came after MI).
MachineBasicBlock *
SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr &MI,
MachineBasicBlock *MBB) const {
MachineFunction &MF = *MBB->getParent();
const SystemZInstrInfo *TII =
static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
MachineRegisterInfo &MRI = MF.getRegInfo();
// Extract the operands. Base can be a register or a frame index.
unsigned Dest = MI.getOperand(0).getReg();
MachineOperand Base = earlyUseOperand(MI.getOperand(1));
int64_t Disp = MI.getOperand(2).getImm();
unsigned OrigCmpVal = MI.getOperand(3).getReg();
unsigned OrigSwapVal = MI.getOperand(4).getReg();
unsigned BitShift = MI.getOperand(5).getReg();
unsigned NegBitShift = MI.getOperand(6).getReg();
int64_t BitSize = MI.getOperand(7).getImm();
DebugLoc DL = MI.getDebugLoc();
// All temporaries are 32-bit: the loop operates on the containing word.
const TargetRegisterClass *RC = &SystemZ::GR32BitRegClass;
// Get the right opcodes for the displacement.
unsigned LOpcode = TII->getOpcodeForOffset(SystemZ::L, Disp);
unsigned CSOpcode = TII->getOpcodeForOffset(SystemZ::CS, Disp);
assert(LOpcode && CSOpcode && "Displacement out of range");
// Create virtual registers for temporary results.
unsigned OrigOldVal = MRI.createVirtualRegister(RC);
unsigned OldVal = MRI.createVirtualRegister(RC);
unsigned CmpVal = MRI.createVirtualRegister(RC);
unsigned SwapVal = MRI.createVirtualRegister(RC);
unsigned StoreVal = MRI.createVirtualRegister(RC);
unsigned RetryOldVal = MRI.createVirtualRegister(RC);
unsigned RetryCmpVal = MRI.createVirtualRegister(RC);
unsigned RetrySwapVal = MRI.createVirtualRegister(RC);
// Insert 2 basic blocks for the loop.
MachineBasicBlock *StartMBB = MBB;
MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
MachineBasicBlock *SetMBB = emitBlockAfter(LoopMBB);
// StartMBB:
// ...
// %OrigOldVal = L Disp(%Base)
// # fall through to LoopMBB
MBB = StartMBB;
BuildMI(MBB, DL, TII->get(LOpcode), OrigOldVal)
.add(Base)
.addImm(Disp)
.addReg(0);
MBB->addSuccessor(LoopMBB);
// LoopMBB:
// %OldVal = phi [ %OrigOldVal, StartMBB ], [ %RetryOldVal, SetMBB ]
// %CmpVal = phi [ %OrigCmpVal, StartMBB ], [ %RetryCmpVal, SetMBB ]
// %SwapVal = phi [ %OrigSwapVal, StartMBB ], [ %RetrySwapVal, SetMBB ]
// %Dest = RLL %OldVal, BitSize(%BitShift)
// ^^ The low BitSize bits contain the field
// of interest.
// %RetryCmpVal = RISBG32 %CmpVal, %Dest, 32, 63-BitSize, 0
// ^^ Replace the upper 32-BitSize bits of the
// comparison value with those that we loaded,
// so that we can use a full word comparison.
// CR %Dest, %RetryCmpVal
// JNE DoneMBB
// # Fall through to SetMBB
MBB = LoopMBB;
BuildMI(MBB, DL, TII->get(SystemZ::PHI), OldVal)
.addReg(OrigOldVal).addMBB(StartMBB)
.addReg(RetryOldVal).addMBB(SetMBB);
BuildMI(MBB, DL, TII->get(SystemZ::PHI), CmpVal)
.addReg(OrigCmpVal).addMBB(StartMBB)
.addReg(RetryCmpVal).addMBB(SetMBB);
BuildMI(MBB, DL, TII->get(SystemZ::PHI), SwapVal)
.addReg(OrigSwapVal).addMBB(StartMBB)
.addReg(RetrySwapVal).addMBB(SetMBB);
BuildMI(MBB, DL, TII->get(SystemZ::RLL), Dest)
.addReg(OldVal).addReg(BitShift).addImm(BitSize);
BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RetryCmpVal)
.addReg(CmpVal).addReg(Dest).addImm(32).addImm(63 - BitSize).addImm(0);
BuildMI(MBB, DL, TII->get(SystemZ::CR))
.addReg(Dest).addReg(RetryCmpVal);
BuildMI(MBB, DL, TII->get(SystemZ::BRC))
.addImm(SystemZ::CCMASK_ICMP)
.addImm(SystemZ::CCMASK_CMP_NE).addMBB(DoneMBB);
MBB->addSuccessor(DoneMBB);
MBB->addSuccessor(SetMBB);
// SetMBB:
// %RetrySwapVal = RISBG32 %SwapVal, %Dest, 32, 63-BitSize, 0
// ^^ Replace the upper 32-BitSize bits of the new
// value with those that we loaded.
// %StoreVal = RLL %RetrySwapVal, -BitSize(%NegBitShift)
// ^^ Rotate the new field to its proper position.
// %RetryOldVal = CS %OldVal, %StoreVal, Disp(%Base)
// ^^ The CS compares memory against the unrotated
// word that was loaded (%OldVal).
// JNE LoopMBB
// # fall through to DoneMBB
MBB = SetMBB;
BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RetrySwapVal)
.addReg(SwapVal).addReg(Dest).addImm(32).addImm(63 - BitSize).addImm(0);
BuildMI(MBB, DL, TII->get(SystemZ::RLL), StoreVal)
.addReg(RetrySwapVal).addReg(NegBitShift).addImm(-BitSize);
BuildMI(MBB, DL, TII->get(CSOpcode), RetryOldVal)
.addReg(OldVal)
.addReg(StoreVal)
.add(Base)
.addImm(Disp);
BuildMI(MBB, DL, TII->get(SystemZ::BRC))
.addImm(SystemZ::CCMASK_CS).addImm(SystemZ::CCMASK_CS_NE).addMBB(LoopMBB);
MBB->addSuccessor(LoopMBB);
MBB->addSuccessor(DoneMBB);
// If the CC def wasn't dead in the ATOMIC_CMP_SWAPW, mark CC as live-in
// to the block after the loop. At this point, CC may have been defined
// either by the CR in LoopMBB or by the CS in SetMBB.
if (!MI.registerDefIsDead(SystemZ::CC))
DoneMBB->addLiveIn(SystemZ::CC);
MI.eraseFromParent();
return DoneMBB;
}
// Expand pseudo MI, which combines two GR64 values into one GR128 value.
// Operand 0 is the GR128 result, operand 1 the value for the high 64 bits
// and operand 2 the value for the low 64 bits. The pair is assembled in
// front of MI from an IMPLICIT_DEF and two INSERT_SUBREGs; MI is then
// erased and MBB returned.
MachineBasicBlock *
SystemZTargetLowering::emitPair128(MachineInstr &MI,
                                   MachineBasicBlock *MBB) const {
  const SystemZInstrInfo *TII =
      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
  MachineRegisterInfo &RegInfo = MBB->getParent()->getRegInfo();
  const TargetRegisterClass *PairRC = &SystemZ::GR128BitRegClass;
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dest = MI.getOperand(0).getReg();
  unsigned Hi = MI.getOperand(1).getReg();
  unsigned Lo = MI.getOperand(2).getReg();

  unsigned UndefPair = RegInfo.createVirtualRegister(PairRC);
  unsigned PairWithHi = RegInfo.createVirtualRegister(PairRC);

  // Start from an undefined pair, fill in the high half, then the low half.
  BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::IMPLICIT_DEF), UndefPair);
  BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), PairWithHi)
      .addReg(UndefPair)
      .addReg(Hi)
      .addImm(SystemZ::subreg_h64);
  BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dest)
      .addReg(PairWithHi)
      .addReg(Lo)
      .addImm(SystemZ::subreg_l64);

  MI.eraseFromParent();
  return MBB;
}
// Expand pseudo MI, which widens a GR64 value (operand 1) to a GR128 value
// (operand 0) by placing it in the low 64 bits. When ClearEven is true the
// high register of the pair is explicitly set to zero; when false its
// contents are left undefined ("don't care"). MI is erased and MBB returned.
MachineBasicBlock *SystemZTargetLowering::emitExt128(MachineInstr &MI,
                                                     MachineBasicBlock *MBB,
                                                     bool ClearEven) const {
  const SystemZInstrInfo *TII =
      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
  MachineRegisterInfo &RegInfo = MBB->getParent()->getRegInfo();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dest = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  // Begin with an undefined 128-bit pair.
  unsigned Pair = RegInfo.createVirtualRegister(&SystemZ::GR128BitRegClass);
  BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Pair);

  if (ClearEven) {
    // Materialize a 64-bit zero and insert it as the high half.
    unsigned ClearedPair =
        RegInfo.createVirtualRegister(&SystemZ::GR128BitRegClass);
    unsigned ZeroReg = RegInfo.createVirtualRegister(&SystemZ::GR64BitRegClass);
    BuildMI(*MBB, MI, DL, TII->get(SystemZ::LLILL), ZeroReg).addImm(0);
    BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), ClearedPair)
        .addReg(Pair)
        .addReg(ZeroReg)
        .addImm(SystemZ::subreg_h64);
    Pair = ClearedPair;
  }

  // The source value becomes the low half of the result.
  BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dest)
      .addReg(Pair)
      .addReg(Src)
      .addImm(SystemZ::subreg_l64);

  MI.eraseFromParent();
  return MBB;
}
// Expand memory-to-memory pseudo instruction MI into one or more Opcode
// instructions, each handling at most 256 bytes.
//
// Operands read from MI: 0 = destination base, 1 = destination displacement,
// 2 = source base, 3 = source displacement, 4 = length in bytes. If MI has
// more than five explicit operands, operand 5 is a register holding the trip
// count of a loop that processes 256 bytes per iteration; the remaining
// (Length & 255) bytes are then handled by straight-line code after the loop.
//
// For CLC with more than 256 bytes, every Opcode except the last is followed
// by a branch to a common end block taken when a difference is found.
//
// Returns the block in which emission continues: the end block if one was
// created, otherwise the block holding the last straight-line instruction.
MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const {
MachineFunction &MF = *MBB->getParent();
const SystemZInstrInfo *TII =
static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
MachineRegisterInfo &MRI = MF.getRegInfo();
DebugLoc DL = MI.getDebugLoc();
MachineOperand DestBase = earlyUseOperand(MI.getOperand(0));
uint64_t DestDisp = MI.getOperand(1).getImm();
MachineOperand SrcBase = earlyUseOperand(MI.getOperand(2));
uint64_t SrcDisp = MI.getOperand(3).getImm();
uint64_t Length = MI.getOperand(4).getImm();
// When generating more than one CLC, all but the last will need to
// branch to the end when a difference is found.
MachineBasicBlock *EndMBB = (Length > 256 && Opcode == SystemZ::CLC ?
splitBlockAfter(MI, MBB) : nullptr);
// Check for the loop form, in which operand 5 is the trip count.
if (MI.getNumExplicitOperands() > 5) {
// When source and destination share one base operand, a single set of
// address registers (and a single PHI / LA) serves both.
bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
Register StartCountReg = MI.getOperand(5).getReg();
Register StartSrcReg = forceReg(MI, SrcBase, TII);
Register StartDestReg = (HaveSingleBase ? StartSrcReg :
forceReg(MI, DestBase, TII));
const TargetRegisterClass *RC = &SystemZ::ADDR64BitRegClass;
Register ThisSrcReg = MRI.createVirtualRegister(RC);
Register ThisDestReg = (HaveSingleBase ? ThisSrcReg :
MRI.createVirtualRegister(RC));
Register NextSrcReg = MRI.createVirtualRegister(RC);
Register NextDestReg = (HaveSingleBase ? NextSrcReg :
MRI.createVirtualRegister(RC));
RC = &SystemZ::GR64BitRegClass;
Register ThisCountReg = MRI.createVirtualRegister(RC);
Register NextCountReg = MRI.createVirtualRegister(RC);
MachineBasicBlock *StartMBB = MBB;
MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
// With an end block the loop body is split in two (LoopMBB + NextMBB) so
// that LoopMBB can end with the early-exit branch; otherwise NextMBB is
// just another name for LoopMBB.
MachineBasicBlock *NextMBB = (EndMBB ? emitBlockAfter(LoopMBB) : LoopMBB);
// StartMBB:
// # fall through to LoopMBB
MBB->addSuccessor(LoopMBB);
// LoopMBB:
// %ThisDestReg = phi [ %StartDestReg, StartMBB ],
// [ %NextDestReg, NextMBB ]
// %ThisSrcReg = phi [ %StartSrcReg, StartMBB ],
// [ %NextSrcReg, NextMBB ]
// %ThisCountReg = phi [ %StartCountReg, StartMBB ],
// [ %NextCountReg, NextMBB ]
// ( PFD 2, 768+DestDisp(%ThisDestReg) )
// Opcode DestDisp(256,%ThisDestReg), SrcDisp(%ThisSrcReg)
// ( JLH EndMBB )
//
// The prefetch is used only for MVC. The JLH is used only for CLC.
MBB = LoopMBB;
BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisDestReg)
.addReg(StartDestReg).addMBB(StartMBB)
.addReg(NextDestReg).addMBB(NextMBB);
if (!HaveSingleBase)
BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisSrcReg)
.addReg(StartSrcReg).addMBB(StartMBB)
.addReg(NextSrcReg).addMBB(NextMBB);
BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisCountReg)
.addReg(StartCountReg).addMBB(StartMBB)
.addReg(NextCountReg).addMBB(NextMBB);
if (Opcode == SystemZ::MVC)
BuildMI(MBB, DL, TII->get(SystemZ::PFD))
.addImm(SystemZ::PFD_WRITE)
.addReg(ThisDestReg).addImm(DestDisp + 768).addReg(0);
BuildMI(MBB, DL, TII->get(Opcode))
.addReg(ThisDestReg).addImm(DestDisp).addImm(256)
.addReg(ThisSrcReg).addImm(SrcDisp);
if (EndMBB) {
BuildMI(MBB, DL, TII->get(SystemZ::BRC))
.addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
.addMBB(EndMBB);
MBB->addSuccessor(EndMBB);
MBB->addSuccessor(NextMBB);
}
// NextMBB:
// %NextDestReg = LA 256(%ThisDestReg)
// %NextSrcReg = LA 256(%ThisSrcReg)
// %NextCountReg = AGHI %ThisCountReg, -1
// CGHI %NextCountReg, 0
// JLH LoopMBB
// # fall through to DoneMBB
//
// The AGHI, CGHI and JLH should be converted to BRCTG by later passes.
MBB = NextMBB;
BuildMI(MBB, DL, TII->get(SystemZ::LA), NextDestReg)
.addReg(ThisDestReg).addImm(256).addReg(0);
if (!HaveSingleBase)
BuildMI(MBB, DL, TII->get(SystemZ::LA), NextSrcReg)
.addReg(ThisSrcReg).addImm(256).addReg(0);
BuildMI(MBB, DL, TII->get(SystemZ::AGHI), NextCountReg)
.addReg(ThisCountReg).addImm(-1);
BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
.addReg(NextCountReg).addImm(0);
BuildMI(MBB, DL, TII->get(SystemZ::BRC))
.addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
.addMBB(LoopMBB);
MBB->addSuccessor(LoopMBB);
MBB->addSuccessor(DoneMBB);
// The straight-line tail continues from the addresses the loop ended on
// and only has to cover the bytes the 256-byte iterations did not.
DestBase = MachineOperand::CreateReg(NextDestReg, false);
SrcBase = MachineOperand::CreateReg(NextSrcReg, false);
Length &= 255;
if (EndMBB && !Length)
// If the loop handled the whole CLC range, DoneMBB will be empty with
// CC live-through into EndMBB, so add it as live-in.
DoneMBB->addLiveIn(SystemZ::CC);
MBB = DoneMBB;
}
// Handle any remaining bytes with straight-line code.
while (Length > 0) {
uint64_t ThisLength = std::min(Length, uint64_t(256));
// The previous iteration might have created out-of-range displacements.
// Apply them using LAY if so.
if (!isUInt<12>(DestDisp)) {
unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg)
.add(DestBase)
.addImm(DestDisp)
.addReg(0);
DestBase = MachineOperand::CreateReg(Reg, false);
DestDisp = 0;
}
if (!isUInt<12>(SrcDisp)) {
unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg)
.add(SrcBase)
.addImm(SrcDisp)
.addReg(0);
SrcBase = MachineOperand::CreateReg(Reg, false);
SrcDisp = 0;
}
BuildMI(*MBB, MI, DL, TII->get(Opcode))
.add(DestBase)
.addImm(DestDisp)
.addImm(ThisLength)
.add(SrcBase)
.addImm(SrcDisp)
.setMemRefs(MI.memoperands());
DestDisp += ThisLength;
SrcDisp += ThisLength;
Length -= ThisLength;
// If there's another CLC to go, branch to the end if a difference
// was found.
if (EndMBB && Length > 0) {
MachineBasicBlock *NextMBB = splitBlockBefore(MI, MBB);
BuildMI(MBB, DL, TII->get(SystemZ::BRC))
.addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
.addMBB(EndMBB);
MBB->addSuccessor(EndMBB);
MBB->addSuccessor(NextMBB);
MBB = NextMBB;
}
}
// The last block falls through into the end block, which receives the CC
// value produced by whichever comparison ran last.
if (EndMBB) {
MBB->addSuccessor(EndMBB);
MBB = EndMBB;
MBB->addLiveIn(SystemZ::CC);
}
MI.eraseFromParent();
return MBB;
}
// Decompose string pseudo-instruction MI into a loop that continually performs
// Opcode until CC != 3.
//
// Operands read from MI: 0 = end address result for the first string,
// 1 and 2 = start addresses of the two strings, 3 = register whose value is
// copied into R0L before each Opcode. CC is marked live-in to the block
// after the loop, which is the block returned.
MachineBasicBlock *SystemZTargetLowering::emitStringWrapper(
    MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const {
  MachineFunction &MF = *MBB->getParent();
  const SystemZInstrInfo *TII =
      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
  MachineRegisterInfo &MRI = MF.getRegInfo();
  DebugLoc DL = MI.getDebugLoc();

  // Register numbers are kept as Register, like the other inserters in this
  // file, instead of being widened to uint64_t and narrowed again at every
  // BuildMI/addReg call.
  Register End1Reg = MI.getOperand(0).getReg();
  Register Start1Reg = MI.getOperand(1).getReg();
  Register Start2Reg = MI.getOperand(2).getReg();
  Register CharReg = MI.getOperand(3).getReg();

  const TargetRegisterClass *RC = &SystemZ::GR64BitRegClass;
  Register This1Reg = MRI.createVirtualRegister(RC);
  Register This2Reg = MRI.createVirtualRegister(RC);
  Register End2Reg = MRI.createVirtualRegister(RC);

  MachineBasicBlock *StartMBB = MBB;
  MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
  MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);

  //  StartMBB:
  //   # fall through to LoopMBB
  MBB->addSuccessor(LoopMBB);

  //  LoopMBB:
  //   %This1Reg = phi [ %Start1Reg, StartMBB ], [ %End1Reg, LoopMBB ]
  //   %This2Reg = phi [ %Start2Reg, StartMBB ], [ %End2Reg, LoopMBB ]
  //   R0L = %CharReg
  //   %End1Reg, %End2Reg = CLST %This1Reg, %This2Reg -- uses R0L
  //   JO LoopMBB
  //   # fall through to DoneMBB
  //
  // The load of R0L can be hoisted by post-RA LICM.
  MBB = LoopMBB;

  BuildMI(MBB, DL, TII->get(SystemZ::PHI), This1Reg)
      .addReg(Start1Reg).addMBB(StartMBB)
      .addReg(End1Reg).addMBB(LoopMBB);
  BuildMI(MBB, DL, TII->get(SystemZ::PHI), This2Reg)
      .addReg(Start2Reg).addMBB(StartMBB)
      .addReg(End2Reg).addMBB(LoopMBB);
  BuildMI(MBB, DL, TII->get(TargetOpcode::COPY), SystemZ::R0L).addReg(CharReg);
  BuildMI(MBB, DL, TII->get(Opcode))
      .addReg(End1Reg, RegState::Define).addReg(End2Reg, RegState::Define)
      .addReg(This1Reg).addReg(This2Reg);
  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
      .addImm(SystemZ::CCMASK_ANY).addImm(SystemZ::CCMASK_3).addMBB(LoopMBB);
  MBB->addSuccessor(LoopMBB);
  MBB->addSuccessor(DoneMBB);

  DoneMBB->addLiveIn(SystemZ::CC);

  MI.eraseFromParent();
  return DoneMBB;
}
// Finalize a TBEGIN pseudo: give MI its real opcode, force the control bits
// for the registers that must not be clobbered, and append a clobber operand
// for every register the control mask does not cover.  Returns MBB unchanged.
MachineBasicBlock *SystemZTargetLowering::emitTransactionBegin(
    MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode,
    bool NoFloat) const {
  // Control-mask bit associated with each of the 16 GPRs; consecutive
  // even/odd registers share one bit.
  static const unsigned GPRControlBit[16] = {
    0x8000, 0x8000, 0x4000, 0x4000, 0x2000, 0x2000, 0x1000, 0x1000,
    0x0800, 0x0800, 0x0400, 0x0400, 0x0200, 0x0200, 0x0100, 0x0100
  };

  // Switch the instruction over to its final opcode.
  const SystemZInstrInfo *InstrInfo = Subtarget.getInstrInfo();
  MI.setDesc(InstrInfo->get(Opcode));

  // A TBEGIN that clobbers the stack or frame pointer cannot be handled, so
  // the mask bits for GPR 15 (always) and GPR 11 (when the function has a
  // frame pointer) are set if they were missing.
  uint64_t Mask = MI.getOperand(2).getImm();
  Mask |= GPRControlBit[15];
  const TargetFrameLowering *FrameLowering = Subtarget.getFrameLowering();
  if (FrameLowering->hasFP(*MBB->getParent()))
    Mask |= GPRControlBit[11];
  MI.getOperand(2).setImm(Mask);

  // Appends one clobber operand for Reg to MI.
  auto AddClobber = [&MI](unsigned Reg) {
    MI.addOperand(MachineOperand::CreateReg(Reg, true, true));
  };

  // GPR clobbers: every GPR whose mask bit is clear.
  for (unsigned Idx = 0; Idx != 16; ++Idx) {
    if ((Mask & GPRControlBit[Idx]) != 0)
      continue;
    AddClobber(SystemZMC::GR64Regs[Idx]);
  }

  // FPR/VR clobbers are only added when floats are not excluded and bit 4 of
  // the control value is set (presumably the "allow FP" control -- confirm
  // against the TBEGIN definition).
  if (NoFloat || (Mask & 4) == 0)
    return MBB;

  if (Subtarget.hasVector()) {
    for (unsigned Idx = 0; Idx != 32; ++Idx)
      AddClobber(SystemZMC::VR128Regs[Idx]);
  } else {
    for (unsigned Idx = 0; Idx != 16; ++Idx)
      AddClobber(SystemZMC::FP64Regs[Idx]);
  }
  return MBB;
}
// Replace pseudo MI with the load-and-test instruction Opcode.  The new
// instruction reads MI's operand 0 and defines a fresh virtual register of
// the same register class, so the def is modelled as well.  MI is erased and
// MBB is returned unchanged.
MachineBasicBlock *SystemZTargetLowering::emitLoadAndTestCmp0(
    MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const {
  MachineRegisterInfo &RegInfo = MBB->getParent()->getRegInfo();
  const SystemZInstrInfo *InstrInfo =
      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
  DebugLoc Loc = MI.getDebugLoc();

  // The destination lives in the same register class as the source.
  unsigned Src = MI.getOperand(0).getReg();
  unsigned Dst = RegInfo.createVirtualRegister(RegInfo.getRegClass(Src));

  // Emit the real instruction in front of the pseudo, then drop the pseudo.
  BuildMI(*MBB, MI, Loc, InstrInfo->get(Opcode), Dst).addReg(Src);
  MI.eraseFromParent();

  return MBB;
}
// Expand the pseudo instruction MI, located in MBB, by dispatching on its
// opcode to the matching emit* helper, and return whatever block that helper
// returns.  Opcodes not listed here hit llvm_unreachable.
MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter(
MachineInstr &MI, MachineBasicBlock *MBB) const {
switch (MI.getOpcode()) {
// Select pseudos: all share one expansion.
case SystemZ::Select32:
case SystemZ::Select64:
case SystemZ::SelectF32:
case SystemZ::SelectF64:
case SystemZ::SelectF128:
case SystemZ::SelectVR32:
case SystemZ::SelectVR64:
case SystemZ::SelectVR128:
return emitSelect(MI, MBB);
// Conditional stores.  The helper receives a store opcode, a second opcode
// (0 where none is supplied here; presumably the store-on-condition form --
// see emitCondStore), and a flag that is true for the *Inv variants.
case SystemZ::CondStore8Mux:
return emitCondStore(MI, MBB, SystemZ::STCMux, 0, false);
case SystemZ::CondStore8MuxInv:
return emitCondStore(MI, MBB, SystemZ::STCMux, 0, true);
case SystemZ::CondStore16Mux:
return emitCondStore(MI, MBB, SystemZ::STHMux, 0, false);
case SystemZ::CondStore16MuxInv:
return emitCondStore(MI, MBB, SystemZ::STHMux, 0, true);
case SystemZ::CondStore32Mux:
return emitCondStore(MI, MBB, SystemZ::STMux, SystemZ::STOCMux, false);
case SystemZ::CondStore32MuxInv:
return emitCondStore(MI, MBB, SystemZ::STMux, SystemZ::STOCMux, true);
case SystemZ::CondStore8:
return emitCondStore(MI, MBB, SystemZ::STC, 0, false);
case SystemZ::CondStore8Inv:
return emitCondStore(MI, MBB, SystemZ::STC, 0, true);
case SystemZ::CondStore16:
return emitCondStore(MI, MBB, SystemZ::STH, 0, false);
case SystemZ::CondStore16Inv:
return emitCondStore(MI, MBB, SystemZ::STH, 0, true);
case SystemZ::CondStore32:
return emitCondStore(MI, MBB, SystemZ::ST, SystemZ::STOC, false);
case SystemZ::CondStore32Inv:
return emitCondStore(MI, MBB, SystemZ::ST, SystemZ::STOC, true);
case SystemZ::CondStore64:
return emitCondStore(MI, MBB, SystemZ::STG, SystemZ::STOCG, false);
case SystemZ::CondStore64Inv:
return emitCondStore(MI, MBB, SystemZ::STG, SystemZ::STOCG, true);
case SystemZ::CondStoreF32:
return emitCondStore(MI, MBB, SystemZ::STE, 0, false);
case SystemZ::CondStoreF32Inv:
return emitCondStore(MI, MBB, SystemZ::STE, 0, true);
case SystemZ::CondStoreF64:
return emitCondStore(MI, MBB, SystemZ::STD, 0, false);
case SystemZ::CondStoreF64Inv:
return emitCondStore(MI, MBB, SystemZ::STD, 0, true);
// 128-bit pair construction and extension; the flag passed to emitExt128 is
// false for AEXT128 and true for ZEXT128.
case SystemZ::PAIR128:
return emitPair128(MI, MBB);
case SystemZ::AEXT128:
return emitExt128(MI, MBB, false);
case SystemZ::ZEXT128:
return emitExt128(MI, MBB, true);
// Atomic swap and read-modify-write pseudos.  The helper receives the binary
// opcode (0 for the swaps) and a size argument that is 0 for the *W pseudos
// and 32 or 64 for the others.
case SystemZ::ATOMIC_SWAPW:
return emitAtomicLoadBinary(MI, MBB, 0, 0);
case SystemZ::ATOMIC_SWAP_32:
return emitAtomicLoadBinary(MI, MBB, 0, 32);
case SystemZ::ATOMIC_SWAP_64:
return emitAtomicLoadBinary(MI, MBB, 0, 64);
// Atomic add.
case SystemZ::ATOMIC_LOADW_AR:
return emitAtomicLoadBinary(MI, MBB, SystemZ::AR, 0);
case SystemZ::ATOMIC_LOADW_AFI:
return emitAtomicLoadBinary(MI, MBB, SystemZ::AFI, 0);
case SystemZ::ATOMIC_LOAD_AR:
return emitAtomicLoadBinary(MI, MBB, SystemZ::AR, 32);
case SystemZ::ATOMIC_LOAD_AHI:
return emitAtomicLoadBinary(MI, MBB, SystemZ::AHI, 32);
case SystemZ::ATOMIC_LOAD_AFI:
return emitAtomicLoadBinary(MI, MBB, SystemZ::AFI, 32);
case SystemZ::ATOMIC_LOAD_AGR:
return emitAtomicLoadBinary(MI, MBB, SystemZ::AGR, 64);
case SystemZ::ATOMIC_LOAD_AGHI:
return emitAtomicLoadBinary(MI, MBB, SystemZ::AGHI, 64);
case SystemZ::ATOMIC_LOAD_AGFI:
return emitAtomicLoadBinary(MI, MBB, SystemZ::AGFI, 64);
// Atomic subtract.
case SystemZ::ATOMIC_LOADW_SR:
return emitAtomicLoadBinary(MI, MBB, SystemZ::SR, 0);
case SystemZ::ATOMIC_LOAD_SR:
return emitAtomicLoadBinary(MI, MBB, SystemZ::SR, 32);
case SystemZ::ATOMIC_LOAD_SGR:
return emitAtomicLoadBinary(MI, MBB, SystemZ::SGR, 64);
// Atomic AND.
case SystemZ::ATOMIC_LOADW_NR:
return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 0);
case SystemZ::ATOMIC_LOADW_NILH:
return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 0);
case SystemZ::ATOMIC_LOAD_NR:
return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 32);
case SystemZ::ATOMIC_LOAD_NILL:
return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL, 32);
case SystemZ::ATOMIC_LOAD_NILH:
return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 32);
case SystemZ::ATOMIC_LOAD_NILF:
return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF, 32);
case SystemZ::ATOMIC_LOAD_NGR:
return emitAtomicLoadBinary(MI, MBB, SystemZ::NGR, 64);
case SystemZ::ATOMIC_LOAD_NILL64:
return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL64, 64);
case SystemZ::ATOMIC_LOAD_NILH64:
return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH64, 64);
case SystemZ::ATOMIC_LOAD_NIHL64:
return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHL64, 64);
case SystemZ::ATOMIC_LOAD_NIHH64:
return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHH64, 64);
case SystemZ::ATOMIC_LOAD_NILF64:
return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF64, 64);
case SystemZ::ATOMIC_LOAD_NIHF64:
return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHF64, 64);
// Atomic OR.
case SystemZ::ATOMIC_LOADW_OR:
return emitAtomicLoadBinary(MI, MBB, SystemZ::OR, 0);
case SystemZ::ATOMIC_LOADW_OILH:
return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH, 0);
case SystemZ::ATOMIC_LOAD_OR:
return emitAtomicLoadBinary(MI, MBB, SystemZ::OR, 32);
case SystemZ::ATOMIC_LOAD_OILL:
return emitAtomicLoadBinary(MI, MBB, SystemZ::OILL, 32);
case SystemZ::ATOMIC_LOAD_OILH:
return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH, 32);
case SystemZ::ATOMIC_LOAD_OILF:
return emitAtomicLoadBinary(MI, MBB, SystemZ::OILF, 32);
case SystemZ::ATOMIC_LOAD_OGR:
return emitAtomicLoadBinary(MI, MBB, SystemZ::OGR, 64);
case SystemZ::ATOMIC_LOAD_OILL64:
return emitAtomicLoadBinary(MI, MBB, SystemZ::OILL64, 64);
case SystemZ::ATOMIC_LOAD_OILH64:
return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH64, 64);
case SystemZ::ATOMIC_LOAD_OIHL64:
return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHL64, 64);
case SystemZ::ATOMIC_LOAD_OIHH64:
return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHH64, 64);
case SystemZ::ATOMIC_LOAD_OILF64:
return emitAtomicLoadBinary(MI, MBB, SystemZ::OILF64, 64);
case SystemZ::ATOMIC_LOAD_OIHF64:
return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHF64, 64);
// Atomic XOR.
case SystemZ::ATOMIC_LOADW_XR:
return emitAtomicLoadBinary(MI, MBB, SystemZ::XR, 0);
case SystemZ::ATOMIC_LOADW_XILF:
return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF, 0);
case SystemZ::ATOMIC_LOAD_XR:
return emitAtomicLoadBinary(MI, MBB, SystemZ::XR, 32);
case SystemZ::ATOMIC_LOAD_XILF:
return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF, 32);
case SystemZ::ATOMIC_LOAD_XGR:
return emitAtomicLoadBinary(MI, MBB, SystemZ::XGR, 64);
case SystemZ::ATOMIC_LOAD_XILF64:
return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF64, 64);
case SystemZ::ATOMIC_LOAD_XIHF64:
return emitAtomicLoadBinary(MI, MBB, SystemZ::XIHF64, 64);
// The "...i" pseudos reuse the AND opcodes but pass an extra 'true' argument
// (presumably requesting an inverted result, i.e. NAND -- see
// emitAtomicLoadBinary).
case SystemZ::ATOMIC_LOADW_NRi:
return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 0, true);
case SystemZ::ATOMIC_LOADW_NILHi:
return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 0, true);
case SystemZ::ATOMIC_LOAD_NRi:
return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 32, true);
case SystemZ::ATOMIC_LOAD_NILLi:
return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL, 32, true);
case SystemZ::ATOMIC_LOAD_NILHi:
return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 32, true);
case SystemZ::ATOMIC_LOAD_NILFi:
return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF, 32, true);
case SystemZ::ATOMIC_LOAD_NGRi:
return emitAtomicLoadBinary(MI, MBB, SystemZ::NGR, 64, true);
case SystemZ::ATOMIC_LOAD_NILL64i:
return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL64, 64, true);
case SystemZ::ATOMIC_LOAD_NILH64i:
return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH64, 64, true);
case SystemZ::ATOMIC_LOAD_NIHL64i:
return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHL64, 64, true);
case SystemZ::ATOMIC_LOAD_NIHH64i:
return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHH64, 64, true);
case SystemZ::ATOMIC_LOAD_NILF64i:
return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF64, 64, true);
case SystemZ::ATOMIC_LOAD_NIHF64i:
return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHF64, 64, true);
// Atomic min/max.  The helper receives a compare opcode (CR/CGR for the
// MIN/MAX pseudos, CLR/CLGR for UMIN/UMAX), a CC mask (LE for the minimum
// forms, GE for the maximum forms), and the same 0/32/64 size argument.
case SystemZ::ATOMIC_LOADW_MIN:
return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
SystemZ::CCMASK_CMP_LE, 0);
case SystemZ::ATOMIC_LOAD_MIN_32:
return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
SystemZ::CCMASK_CMP_LE, 32);
case SystemZ::ATOMIC_LOAD_MIN_64:
return emitAtomicLoadMinMax(MI, MBB, SystemZ::CGR,
SystemZ::CCMASK_CMP_LE, 64);
case SystemZ::ATOMIC_LOADW_MAX:
return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
SystemZ::CCMASK_CMP_GE, 0);
case SystemZ::ATOMIC_LOAD_MAX_32:
return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
SystemZ::CCMASK_CMP_GE, 32);
case SystemZ::ATOMIC_LOAD_MAX_64:
return emitAtomicLoadMinMax(MI, MBB, SystemZ::CGR,
SystemZ::CCMASK_CMP_GE, 64);
case SystemZ::ATOMIC_LOADW_UMIN:
return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR,
SystemZ::CCMASK_CMP_LE, 0);
case SystemZ::ATOMIC_LOAD_UMIN_32:
return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR,
SystemZ::CCMASK_CMP_LE, 32);
case SystemZ::ATOMIC_LOAD_UMIN_64:
return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLGR,
SystemZ::CCMASK_CMP_LE, 64);
case SystemZ::ATOMIC_LOADW_UMAX:
return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR,
SystemZ::CCMASK_CMP_GE, 0);
case SystemZ::ATOMIC_LOAD_UMAX_32:
return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR,
SystemZ::CCMASK_CMP_GE, 32);
case SystemZ::ATOMIC_LOAD_UMAX_64:
return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLGR,
SystemZ::CCMASK_CMP_GE, 64);
case SystemZ::ATOMIC_CMP_SWAPW:
return emitAtomicCmpSwapW(MI, MBB);
// Memory-to-memory pseudos: the *Sequence and *Loop forms of each operation
// share one wrapper call with the underlying opcode.
case SystemZ::MVCSequence:
case SystemZ::MVCLoop:
return emitMemMemWrapper(MI, MBB, SystemZ::MVC);
case SystemZ::NCSequence:
case SystemZ::NCLoop:
return emitMemMemWrapper(MI, MBB, SystemZ::NC);
case SystemZ::OCSequence:
case SystemZ::OCLoop:
return emitMemMemWrapper(MI, MBB, SystemZ::OC);
case SystemZ::XCSequence:
case SystemZ::XCLoop:
return emitMemMemWrapper(MI, MBB, SystemZ::XC);
case SystemZ::CLCSequence:
case SystemZ::CLCLoop:
return emitMemMemWrapper(MI, MBB, SystemZ::CLC);
// String pseudos, expanded into a loop around the given opcode.
case SystemZ::CLSTLoop:
return emitStringWrapper(MI, MBB, SystemZ::CLST);
case SystemZ::MVSTLoop:
return emitStringWrapper(MI, MBB, SystemZ::MVST);
case SystemZ::SRSTLoop:
return emitStringWrapper(MI, MBB, SystemZ::SRST);
// Transaction begin.  The last argument is the NoFloat flag: false only for
// plain TBEGIN.
case SystemZ::TBEGIN:
return emitTransactionBegin(MI, MBB, SystemZ::TBEGIN, false);
case SystemZ::TBEGIN_nofloat:
return emitTransactionBegin(MI, MBB, SystemZ::TBEGIN, true);
case SystemZ::TBEGINC:
return emitTransactionBegin(MI, MBB, SystemZ::TBEGINC, true);
// Load-and-test compare pseudos, replaced by the real LT*BR opcode.
case SystemZ::LTEBRCompare_VecPseudo:
return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTEBR);
case SystemZ::LTDBRCompare_VecPseudo:
return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTDBR);
case SystemZ::LTXBRCompare_VecPseudo:
return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTXBR);
// Target-independent stackmap/patchpoint pseudos.
case TargetOpcode::STACKMAP:
case TargetOpcode::PATCHPOINT:
return emitPatchPoint(MI, MBB);
default:
llvm_unreachable("Unexpected instr type to insert");
}
}
// Only the isel schedulers call this hook; the override exists solely so the
// compiler does not crash when list-ilp is used.  MVT::Untyped is mapped to
// ADDR128Bit, and every other type is deferred to the generic implementation.
const TargetRegisterClass *
SystemZTargetLowering::getRepRegClassFor(MVT VT) const {
  if (VT != MVT::Untyped)
    return TargetLowering::getRepRegClassFor(VT);

  return &SystemZ::ADDR128BitRegClass;
}
Index: vendor/llvm/dist-release_90/lib/Target/X86/X86.td
===================================================================
--- vendor/llvm/dist-release_90/lib/Target/X86/X86.td (revision 351302)
+++ vendor/llvm/dist-release_90/lib/Target/X86/X86.td (revision 351303)
@@ -1,1253 +1,1254 @@
//===-- X86.td - Target definition file for the Intel X86 --*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is a target description file for the Intel i386 architecture, referred
// to here as the "X86" architecture.
//
//===----------------------------------------------------------------------===//
// Get the target-independent interfaces which we are implementing...
//
include "llvm/Target/Target.td"
//===----------------------------------------------------------------------===//
// X86 Subtarget state
//
def Mode64Bit : SubtargetFeature<"64bit-mode", "In64BitMode", "true",
"64-bit mode (x86_64)">;
def Mode32Bit : SubtargetFeature<"32bit-mode", "In32BitMode", "true",
"32-bit mode (80386)">;
def Mode16Bit : SubtargetFeature<"16bit-mode", "In16BitMode", "true",
"16-bit mode (i8086)">;
//===----------------------------------------------------------------------===//
// X86 Subtarget features
//===----------------------------------------------------------------------===//
def FeatureX87 : SubtargetFeature<"x87","HasX87", "true",
"Enable X87 float instructions">;
def FeatureNOPL : SubtargetFeature<"nopl", "HasNOPL", "true",
"Enable NOPL instruction">;
def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true",
"Enable conditional move instructions">;
def FeatureCMPXCHG8B : SubtargetFeature<"cx8", "HasCmpxchg8b", "true",
"Support CMPXCHG8B instructions">;
def FeaturePOPCNT : SubtargetFeature<"popcnt", "HasPOPCNT", "true",
"Support POPCNT instruction">;
def FeatureFXSR : SubtargetFeature<"fxsr", "HasFXSR", "true",
"Support fxsave/fxrestore instructions">;
def FeatureXSAVE : SubtargetFeature<"xsave", "HasXSAVE", "true",
"Support xsave instructions">;
def FeatureXSAVEOPT: SubtargetFeature<"xsaveopt", "HasXSAVEOPT", "true",
"Support xsaveopt instructions">;
def FeatureXSAVEC : SubtargetFeature<"xsavec", "HasXSAVEC", "true",
"Support xsavec instructions">;
def FeatureXSAVES : SubtargetFeature<"xsaves", "HasXSAVES", "true",
"Support xsaves instructions">;
def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1",
"Enable SSE instructions">;
def FeatureSSE2 : SubtargetFeature<"sse2", "X86SSELevel", "SSE2",
"Enable SSE2 instructions",
[FeatureSSE1]>;
def FeatureSSE3 : SubtargetFeature<"sse3", "X86SSELevel", "SSE3",
"Enable SSE3 instructions",
[FeatureSSE2]>;
def FeatureSSSE3 : SubtargetFeature<"ssse3", "X86SSELevel", "SSSE3",
"Enable SSSE3 instructions",
[FeatureSSE3]>;
def FeatureSSE41 : SubtargetFeature<"sse4.1", "X86SSELevel", "SSE41",
"Enable SSE 4.1 instructions",
[FeatureSSSE3]>;
def FeatureSSE42 : SubtargetFeature<"sse4.2", "X86SSELevel", "SSE42",
"Enable SSE 4.2 instructions",
[FeatureSSE41]>;
// The MMX subtarget feature is separate from the rest of the SSE features
// because it's important (for odd compatibility reasons) to be able to
// turn it off explicitly while allowing SSE+ to be on.
def FeatureMMX : SubtargetFeature<"mmx","X863DNowLevel", "MMX",
"Enable MMX instructions">;
def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow",
"Enable 3DNow! instructions",
[FeatureMMX]>;
def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
"Enable 3DNow! Athlon instructions",
[Feature3DNow]>;
// All x86-64 hardware has SSE2, but we don't mark SSE2 as an implied
// feature, because SSE2 can be disabled (e.g. for compiling OS kernels)
// without disabling 64-bit mode. Nothing should imply this feature bit. It
// is used to enforce that only 64-bit capable CPUs are used in 64-bit mode.
def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true",
"Support 64-bit instructions">;
def FeatureCMPXCHG16B : SubtargetFeature<"cx16", "HasCmpxchg16b", "true",
- "64-bit with cmpxchg16b">;
+ "64-bit with cmpxchg16b",
+ [FeatureCMPXCHG8B]>;
def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
"SHLD instruction is slow">;
def FeatureSlowPMULLD : SubtargetFeature<"slow-pmulld", "IsPMULLDSlow", "true",
"PMULLD instruction is slow">;
def FeatureSlowPMADDWD : SubtargetFeature<"slow-pmaddwd", "IsPMADDWDSlow",
"true",
"PMADDWD is slower than PMULLD">;
// FIXME: This should not apply to CPUs that do not have SSE.
def FeatureSlowUAMem16 : SubtargetFeature<"slow-unaligned-mem-16",
"IsUAMem16Slow", "true",
"Slow unaligned 16-byte memory access">;
def FeatureSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32",
"IsUAMem32Slow", "true",
"Slow unaligned 32-byte memory access">;
def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true",
"Support SSE 4a instructions",
[FeatureSSE3]>;
def FeatureAVX : SubtargetFeature<"avx", "X86SSELevel", "AVX",
"Enable AVX instructions",
[FeatureSSE42]>;
def FeatureAVX2 : SubtargetFeature<"avx2", "X86SSELevel", "AVX2",
"Enable AVX2 instructions",
[FeatureAVX]>;
def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true",
"Enable three-operand fused multiple-add",
[FeatureAVX]>;
def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true",
"Support 16-bit floating point conversion instructions",
[FeatureAVX]>;
def FeatureAVX512 : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512F",
"Enable AVX-512 instructions",
[FeatureAVX2, FeatureFMA, FeatureF16C]>;
def FeatureERI : SubtargetFeature<"avx512er", "HasERI", "true",
"Enable AVX-512 Exponential and Reciprocal Instructions",
[FeatureAVX512]>;
def FeatureCDI : SubtargetFeature<"avx512cd", "HasCDI", "true",
"Enable AVX-512 Conflict Detection Instructions",
[FeatureAVX512]>;
def FeatureVPOPCNTDQ : SubtargetFeature<"avx512vpopcntdq", "HasVPOPCNTDQ",
"true", "Enable AVX-512 Population Count Instructions",
[FeatureAVX512]>;
def FeaturePFI : SubtargetFeature<"avx512pf", "HasPFI", "true",
"Enable AVX-512 PreFetch Instructions",
[FeatureAVX512]>;
def FeaturePREFETCHWT1 : SubtargetFeature<"prefetchwt1", "HasPREFETCHWT1",
"true",
"Prefetch with Intent to Write and T1 Hint">;
def FeatureDQI : SubtargetFeature<"avx512dq", "HasDQI", "true",
"Enable AVX-512 Doubleword and Quadword Instructions",
[FeatureAVX512]>;
def FeatureBWI : SubtargetFeature<"avx512bw", "HasBWI", "true",
"Enable AVX-512 Byte and Word Instructions",
[FeatureAVX512]>;
def FeatureVLX : SubtargetFeature<"avx512vl", "HasVLX", "true",
"Enable AVX-512 Vector Length eXtensions",
[FeatureAVX512]>;
def FeatureVBMI : SubtargetFeature<"avx512vbmi", "HasVBMI", "true",
"Enable AVX-512 Vector Byte Manipulation Instructions",
[FeatureBWI]>;
def FeatureVBMI2 : SubtargetFeature<"avx512vbmi2", "HasVBMI2", "true",
"Enable AVX-512 further Vector Byte Manipulation Instructions",
[FeatureBWI]>;
def FeatureIFMA : SubtargetFeature<"avx512ifma", "HasIFMA", "true",
"Enable AVX-512 Integer Fused Multiple-Add",
[FeatureAVX512]>;
def FeaturePKU : SubtargetFeature<"pku", "HasPKU", "true",
"Enable protection keys">;
def FeatureVNNI : SubtargetFeature<"avx512vnni", "HasVNNI", "true",
"Enable AVX-512 Vector Neural Network Instructions",
[FeatureAVX512]>;
def FeatureBF16 : SubtargetFeature<"avx512bf16", "HasBF16", "true",
"Support bfloat16 floating point",
[FeatureBWI]>;
def FeatureBITALG : SubtargetFeature<"avx512bitalg", "HasBITALG", "true",
"Enable AVX-512 Bit Algorithms",
[FeatureBWI]>;
def FeatureVP2INTERSECT : SubtargetFeature<"avx512vp2intersect",
"HasVP2INTERSECT", "true",
"Enable AVX-512 vp2intersect",
[FeatureAVX512]>;
def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
"Enable packed carry-less multiplication instructions",
[FeatureSSE2]>;
def FeatureGFNI : SubtargetFeature<"gfni", "HasGFNI", "true",
"Enable Galois Field Arithmetic Instructions",
[FeatureSSE2]>;
def FeatureVPCLMULQDQ : SubtargetFeature<"vpclmulqdq", "HasVPCLMULQDQ", "true",
"Enable vpclmulqdq instructions",
[FeatureAVX, FeaturePCLMUL]>;
def FeatureFMA4 : SubtargetFeature<"fma4", "HasFMA4", "true",
"Enable four-operand fused multiple-add",
[FeatureAVX, FeatureSSE4A]>;
def FeatureXOP : SubtargetFeature<"xop", "HasXOP", "true",
"Enable XOP instructions",
[FeatureFMA4]>;
def FeatureSSEUnalignedMem : SubtargetFeature<"sse-unaligned-mem",
"HasSSEUnalignedMem", "true",
"Allow unaligned memory operands with SSE instructions">;
def FeatureAES : SubtargetFeature<"aes", "HasAES", "true",
"Enable AES instructions",
[FeatureSSE2]>;
def FeatureVAES : SubtargetFeature<"vaes", "HasVAES", "true",
"Promote selected AES instructions to AVX512/AVX registers",
[FeatureAVX, FeatureAES]>;
def FeatureTBM : SubtargetFeature<"tbm", "HasTBM", "true",
"Enable TBM instructions">;
def FeatureLWP : SubtargetFeature<"lwp", "HasLWP", "true",
"Enable LWP instructions">;
def FeatureMOVBE : SubtargetFeature<"movbe", "HasMOVBE", "true",
"Support MOVBE instruction">;
def FeatureRDRAND : SubtargetFeature<"rdrnd", "HasRDRAND", "true",
"Support RDRAND instruction">;
def FeatureFSGSBase : SubtargetFeature<"fsgsbase", "HasFSGSBase", "true",
"Support FS/GS Base instructions">;
def FeatureLZCNT : SubtargetFeature<"lzcnt", "HasLZCNT", "true",
"Support LZCNT instruction">;
def FeatureBMI : SubtargetFeature<"bmi", "HasBMI", "true",
"Support BMI instructions">;
def FeatureBMI2 : SubtargetFeature<"bmi2", "HasBMI2", "true",
"Support BMI2 instructions">;
def FeatureRTM : SubtargetFeature<"rtm", "HasRTM", "true",
"Support RTM instructions">;
def FeatureADX : SubtargetFeature<"adx", "HasADX", "true",
"Support ADX instructions">;
def FeatureSHA : SubtargetFeature<"sha", "HasSHA", "true",
"Enable SHA instructions",
[FeatureSSE2]>;
def FeatureSHSTK : SubtargetFeature<"shstk", "HasSHSTK", "true",
"Support CET Shadow-Stack instructions">;
def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true",
"Support PRFCHW instructions">;
def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true",
"Support RDSEED instruction">;
def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF", "true",
"Support LAHF and SAHF instructions">;
def FeatureMWAITX : SubtargetFeature<"mwaitx", "HasMWAITX", "true",
"Enable MONITORX/MWAITX timer functionality">;
def FeatureCLZERO : SubtargetFeature<"clzero", "HasCLZERO", "true",
"Enable Cache Line Zero">;
def FeatureCLDEMOTE : SubtargetFeature<"cldemote", "HasCLDEMOTE", "true",
"Enable Cache Demote">;
def FeaturePTWRITE : SubtargetFeature<"ptwrite", "HasPTWRITE", "true",
"Support ptwrite instruction">;
def FeatureMPX : SubtargetFeature<"mpx", "HasMPX", "true",
"Support MPX instructions">;
def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
"Use LEA for adjusting the stack pointer">;
def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb",
"HasSlowDivide32", "true",
"Use 8-bit divide for positive values less than 256">;
def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divl",
"HasSlowDivide64", "true",
"Use 32-bit divide for positive values less than 2^32">;
def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
"PadShortFunctions", "true",
"Pad short functions">;
def FeatureINVPCID : SubtargetFeature<"invpcid", "HasINVPCID", "true",
"Invalidate Process-Context Identifier">;
def FeatureSGX : SubtargetFeature<"sgx", "HasSGX", "true",
"Enable Software Guard Extensions">;
def FeatureCLFLUSHOPT : SubtargetFeature<"clflushopt", "HasCLFLUSHOPT", "true",
"Flush A Cache Line Optimized">;
def FeatureCLWB : SubtargetFeature<"clwb", "HasCLWB", "true",
"Cache Line Write Back">;
def FeatureWBNOINVD : SubtargetFeature<"wbnoinvd", "HasWBNOINVD", "true",
"Write Back No Invalidate">;
def FeatureRDPID : SubtargetFeature<"rdpid", "HasRDPID", "true",
"Support RDPID instructions">;
def FeatureWAITPKG : SubtargetFeature<"waitpkg", "HasWAITPKG", "true",
"Wait and pause enhancements">;
def FeatureENQCMD : SubtargetFeature<"enqcmd", "HasENQCMD", "true",
"Has ENQCMD instructions">;
// On some processors, instructions that implicitly take two memory operands are
// slow. In practice, this means that CALL, PUSH, and POP with memory operands
// should be avoided in favor of a MOV + register CALL/PUSH/POP.
def FeatureSlowTwoMemOps : SubtargetFeature<"slow-two-mem-ops",
"SlowTwoMemOps", "true",
"Two memory operand instructions are slow">;
def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true",
"LEA instruction needs inputs at AG stage">;
def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
"LEA instruction with certain arguments is slow">;
def FeatureSlow3OpsLEA : SubtargetFeature<"slow-3ops-lea", "Slow3OpsLEA", "true",
"LEA instruction with 3 ops or certain registers is slow">;
def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
"INC and DEC instructions are slower than ADD and SUB">;
def FeatureSoftFloat
: SubtargetFeature<"soft-float", "UseSoftFloat", "true",
"Use software floating point features">;
def FeaturePOPCNTFalseDeps : SubtargetFeature<"false-deps-popcnt",
"HasPOPCNTFalseDeps", "true",
"POPCNT has a false dependency on dest register">;
def FeatureLZCNTFalseDeps : SubtargetFeature<"false-deps-lzcnt-tzcnt",
"HasLZCNTFalseDeps", "true",
"LZCNT/TZCNT have a false dependency on dest register">;
def FeaturePCONFIG : SubtargetFeature<"pconfig", "HasPCONFIG", "true",
"platform configuration instruction">;
// On recent X86 (port bound) processors, it's preferable to combine into a
// single shuffle using a variable mask over multiple fixed shuffles.
def FeatureFastVariableShuffle
: SubtargetFeature<"fast-variable-shuffle",
"HasFastVariableShuffle",
"true", "Shuffles with variable masks are fast">;
// On some X86 processors, there is no performance hazard to writing only the
// lower parts of a YMM or ZMM register without clearing the upper part.
def FeatureFastPartialYMMorZMMWrite
: SubtargetFeature<"fast-partial-ymm-or-zmm-write",
"HasFastPartialYMMorZMMWrite",
"true", "Partial writes to YMM/ZMM registers are fast">;
// FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency
// than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if
// vector FSQRT has higher throughput than the corresponding NR code.
// The idea is that throughput bound code is likely to be vectorized, so for
// vectorized code we should care about the throughput of SQRT operations.
// But if the code is scalar that probably means that the code has some kind of
// dependency and we should care more about reducing the latency.
def FeatureFastScalarFSQRT
: SubtargetFeature<"fast-scalar-fsqrt", "HasFastScalarFSQRT",
"true", "Scalar SQRT is fast (disable Newton-Raphson)">;
def FeatureFastVectorFSQRT
: SubtargetFeature<"fast-vector-fsqrt", "HasFastVectorFSQRT",
"true", "Vector SQRT is fast (disable Newton-Raphson)">;
// If lzcnt has equivalent latency/throughput to most simple integer ops, it can
// be used to replace test/set sequences.
def FeatureFastLZCNT
: SubtargetFeature<
"fast-lzcnt", "HasFastLZCNT", "true",
"LZCNT instructions are as fast as most simple integer ops">;
// If the target can efficiently decode NOPs up to 11 bytes in length.
def FeatureFast11ByteNOP
: SubtargetFeature<
"fast-11bytenop", "HasFast11ByteNOP", "true",
"Target can quickly decode up to 11 byte NOPs">;
// If the target can efficiently decode NOPs up to 15 bytes in length.
def FeatureFast15ByteNOP
: SubtargetFeature<
"fast-15bytenop", "HasFast15ByteNOP", "true",
"Target can quickly decode up to 15 byte NOPs">;
// Sandy Bridge and newer processors can use SHLD with the same source on both
// inputs to implement rotate to avoid the partial flag update of the normal
// rotate instructions.
def FeatureFastSHLDRotate
: SubtargetFeature<
"fast-shld-rotate", "HasFastSHLDRotate", "true",
"SHLD can be used as a faster rotate">;
// Ivy Bridge and newer processors have enhanced REP MOVSB and STOSB (aka
// "string operations"). See "REP String Enhancement" in the Intel Software
// Development Manual. This feature essentially means that REP MOVSB will copy
// using the largest available size instead of copying bytes one by one, making
// it at least as fast as REP MOVS{W,D,Q}.
def FeatureERMSB
: SubtargetFeature<
"ermsb", "HasERMSB", "true",
"REP MOVS/STOS are fast">;
// Bulldozer and newer processors can merge CMP/TEST (but not other
// instructions) with conditional branches.
def FeatureBranchFusion
: SubtargetFeature<"branchfusion", "HasBranchFusion", "true",
"CMP/TEST can be fused with conditional branches">;
// Sandy Bridge and newer processors have many instructions that can be
// fused with conditional branches and pass through the CPU as a single
// operation.
def FeatureMacroFusion
: SubtargetFeature<"macrofusion", "HasMacroFusion", "true",
"Various instructions can be fused with conditional branches">;
// Gather is available since Haswell (AVX2 set). So technically, we can
// generate Gathers on all AVX2 processors. But the overhead on HSW is high.
// Skylake Client processor has faster Gathers than HSW and performance is
// similar to Skylake Server (AVX-512).
def FeatureHasFastGather
: SubtargetFeature<"fast-gather", "HasFastGather", "true",
"Indicates if gather is reasonably fast">;
def FeaturePrefer256Bit
: SubtargetFeature<"prefer-256-bit", "Prefer256Bit", "true",
"Prefer 256-bit AVX instructions">;
// Lower indirect calls using a special construct called a `retpoline` to
// mitigate potential Spectre v2 attacks against them.
def FeatureRetpolineIndirectCalls
: SubtargetFeature<
"retpoline-indirect-calls", "UseRetpolineIndirectCalls", "true",
"Remove speculation of indirect calls from the generated code">;
// Mitigate potential Spectre v2 attacks against indirect branches and
// switches by lowering them either as conditional branch trees or with a
// special construct called a `retpoline`.
def FeatureRetpolineIndirectBranches : SubtargetFeature<
    "retpoline-indirect-branches", "UseRetpolineIndirectBranches", "true",
    "Remove speculation of indirect branches from the generated code">;
// Deprecated umbrella feature: implies both `retpoline-indirect-calls` and
// `retpoline-indirect-branches`.
def FeatureRetpoline : SubtargetFeature<
    "retpoline", "DeprecatedUseRetpoline", "true",
    "Remove speculation of indirect branches from the "
    "generated code, either by avoiding them entirely or "
    "lowering them with a speculation blocking construct",
    [FeatureRetpolineIndirectCalls, FeatureRetpolineIndirectBranches]>;
// Use externally provided thunks for the emitted retpoline calls. Users in
// highly specialized environments (for example a kernel that does boot-time
// hot patching) can then supply their own custom thunk definitions.
def FeatureRetpolineExternalThunk : SubtargetFeature<
    "retpoline-external-thunk", "UseRetpolineExternalThunk", "true",
    "When lowering an indirect call or branch using a `retpoline`, rely "
    "on the specified user provided thunk rather than emitting one "
    "ourselves. Only has effect when combined with some other retpoline "
    "feature",
    [FeatureRetpolineIndirectCalls]>;
// Direct Move instructions: movdiri.
def FeatureMOVDIRI
    : SubtargetFeature<"movdiri", "HasMOVDIRI", "true",
                       "Support movdiri instruction">;
// Direct Move instructions: movdir64b.
def FeatureMOVDIR64B
    : SubtargetFeature<"movdir64b", "HasMOVDIR64B", "true",
                       "Support movdir64b instruction">;
// BEXTR is cheap on this CPU (see the description string).
def FeatureFastBEXTR
    : SubtargetFeature<
          "fast-bextr", "HasFastBEXTR", "true",
          "Indicates that the BEXTR instruction is implemented as a single uop "
          "with good throughput">;
// When a CPU implements horizontal operations (introduced with SSE3) with
// better latency/throughput than the alternative sequence, combine vector
// math operations with shuffles into horizontal math instructions.
// Implies FeatureSSE3.
def FeatureFastHorizontalOps : SubtargetFeature<
    "fast-hops", "HasFastHorizontalOps", "true",
    "Prefer horizontal vector math instructions (haddp, phsub, etc.) over "
    "normal vector instructions with shuffles",
    [FeatureSSE3]>;
// Scalar variant of the shift-pair-over-shift+and preference.
def FeatureFastScalarShiftMasks : SubtargetFeature<
    "fast-scalar-shift-masks", "HasFastScalarShiftMasks", "true",
    "Prefer a left/right scalar logical shift pair over a shift+and pair">;
// Vector variant of the shift-pair-over-shift+and preference.
def FeatureFastVectorShiftMasks : SubtargetFeature<
    "fast-vector-shift-masks", "HasFastVectorShiftMasks", "true",
    "Prefer a left/right vector logical shift pair over a shift+and pair">;
// Merging branches into three-way conditional code is profitable.
def FeatureMergeToThreeWayBranch
    : SubtargetFeature<"merge-to-threeway-branch",
                       "ThreewayBranchProfitable", "true",
                       "Merge branches to a three-way "
                       "conditional branch">;
// Processor-family markers. Each record has an empty feature name and
// description and only sets X86ProcFamily to the given value.

// Bonnell
def ProcIntelAtom
    : SubtargetFeature<"", "X86ProcFamily", "IntelAtom", "">;

// Silvermont
def ProcIntelSLM
    : SubtargetFeature<"", "X86ProcFamily", "IntelSLM", "">;

// Goldmont
def ProcIntelGLM
    : SubtargetFeature<"", "X86ProcFamily", "IntelGLM", "">;

// Goldmont Plus
def ProcIntelGLP
    : SubtargetFeature<"", "X86ProcFamily", "IntelGLP", "">;

// Tremont
def ProcIntelTRM
    : SubtargetFeature<"", "X86ProcFamily", "IntelTRM", "">;
//===----------------------------------------------------------------------===//
// Register File Description
//===----------------------------------------------------------------------===//
include "X86RegisterInfo.td"
include "X86RegisterBanks.td"
//===----------------------------------------------------------------------===//
// Instruction Descriptions
//===----------------------------------------------------------------------===//
include "X86Schedule.td"
include "X86InstrInfo.td"
include "X86SchedPredicates.td"
// Instruction-set record for the target; it adds no fields of its own on top
// of InstrInfo.
def X86InstrInfo : InstrInfo;
//===----------------------------------------------------------------------===//
// X86 Scheduler Models
//===----------------------------------------------------------------------===//
include "X86ScheduleAtom.td"
include "X86SchedSandyBridge.td"
include "X86SchedHaswell.td"
include "X86SchedBroadwell.td"
include "X86ScheduleSLM.td"
include "X86ScheduleZnver1.td"
include "X86ScheduleBdVer2.td"
include "X86ScheduleBtVer2.td"
include "X86SchedSkylakeClient.td"
include "X86SchedSkylakeServer.td"
//===----------------------------------------------------------------------===//
// X86 Processor Feature Lists
//===----------------------------------------------------------------------===//
// Feature lists shared by the processor definitions. Most CPU generations are
// described by up to four lists:
//   <CPU>AdditionalFeatures  - features added on top of the list this
//                              generation derives from.
//   <CPU>InheritableFeatures - the parent's inheritable list concatenated
//                              with <CPU>AdditionalFeatures; later
//                              generations build on this list.
//   <CPU>SpecificFeatures    - features for this CPU only; they are not
//                              passed on to later generations.
//   <CPU>Features            - InheritableFeatures followed by
//                              SpecificFeatures: the complete list.
def ProcessorFeatures {
// Nehalem: root of the Intel big-core chain.
list<SubtargetFeature> NHMInheritableFeatures = [FeatureX87,
FeatureCMPXCHG8B,
FeatureCMOV,
FeatureMMX,
FeatureSSE42,
FeatureFXSR,
FeatureNOPL,
Feature64Bit,
FeatureCMPXCHG16B,
FeaturePOPCNT,
FeatureLAHFSAHF,
FeatureMacroFusion];
list<SubtargetFeature> NHMSpecificFeatures = [];
list<SubtargetFeature> NHMFeatures =
!listconcat(NHMInheritableFeatures, NHMSpecificFeatures);
// Westmere
list<SubtargetFeature> WSMAdditionalFeatures = [FeaturePCLMUL];
list<SubtargetFeature> WSMSpecificFeatures = [];
list<SubtargetFeature> WSMInheritableFeatures =
!listconcat(NHMInheritableFeatures, WSMAdditionalFeatures);
list<SubtargetFeature> WSMFeatures =
!listconcat(WSMInheritableFeatures, WSMSpecificFeatures);
// Sandybridge
list<SubtargetFeature> SNBAdditionalFeatures = [FeatureAVX,
FeatureSlowDivide64,
FeatureXSAVE,
FeatureXSAVEOPT,
FeatureSlow3OpsLEA,
FeatureFastScalarFSQRT,
FeatureFastSHLDRotate,
FeatureMergeToThreeWayBranch];
list<SubtargetFeature> SNBSpecificFeatures = [FeatureSlowUAMem32,
FeaturePOPCNTFalseDeps];
list<SubtargetFeature> SNBInheritableFeatures =
!listconcat(WSMInheritableFeatures, SNBAdditionalFeatures);
list<SubtargetFeature> SNBFeatures =
!listconcat(SNBInheritableFeatures, SNBSpecificFeatures);
// Ivybridge
list<SubtargetFeature> IVBAdditionalFeatures = [FeatureRDRAND,
FeatureF16C,
FeatureFSGSBase];
list<SubtargetFeature> IVBSpecificFeatures = [FeatureSlowUAMem32,
FeaturePOPCNTFalseDeps];
list<SubtargetFeature> IVBInheritableFeatures =
!listconcat(SNBInheritableFeatures, IVBAdditionalFeatures);
list<SubtargetFeature> IVBFeatures =
!listconcat(IVBInheritableFeatures, IVBSpecificFeatures);
// Haswell
list<SubtargetFeature> HSWAdditionalFeatures = [FeatureAVX2,
FeatureBMI,
FeatureBMI2,
FeatureERMSB,
FeatureFMA,
FeatureINVPCID,
FeatureLZCNT,
FeatureMOVBE,
FeatureFastVariableShuffle];
list<SubtargetFeature> HSWSpecificFeatures = [FeaturePOPCNTFalseDeps,
FeatureLZCNTFalseDeps];
list<SubtargetFeature> HSWInheritableFeatures =
!listconcat(IVBInheritableFeatures, HSWAdditionalFeatures);
list<SubtargetFeature> HSWFeatures =
!listconcat(HSWInheritableFeatures, HSWSpecificFeatures);
// Broadwell
list<SubtargetFeature> BDWAdditionalFeatures = [FeatureADX,
FeatureRDSEED,
FeaturePRFCHW];
list<SubtargetFeature> BDWSpecificFeatures = [FeaturePOPCNTFalseDeps,
FeatureLZCNTFalseDeps];
list<SubtargetFeature> BDWInheritableFeatures =
!listconcat(HSWInheritableFeatures, BDWAdditionalFeatures);
list<SubtargetFeature> BDWFeatures =
!listconcat(BDWInheritableFeatures, BDWSpecificFeatures);
// Skylake
list<SubtargetFeature> SKLAdditionalFeatures = [FeatureAES,
FeatureMPX,
FeatureXSAVEC,
FeatureXSAVES,
FeatureCLFLUSHOPT,
FeatureFastVectorFSQRT];
list<SubtargetFeature> SKLSpecificFeatures = [FeatureHasFastGather,
FeaturePOPCNTFalseDeps,
FeatureSGX];
list<SubtargetFeature> SKLInheritableFeatures =
!listconcat(BDWInheritableFeatures, SKLAdditionalFeatures);
list<SubtargetFeature> SKLFeatures =
!listconcat(SKLInheritableFeatures, SKLSpecificFeatures);
// Skylake-AVX512
list<SubtargetFeature> SKXAdditionalFeatures = [FeatureAVX512,
FeatureCDI,
FeatureDQI,
FeatureBWI,
FeatureVLX,
FeaturePKU,
FeatureCLWB];
list<SubtargetFeature> SKXSpecificFeatures = [FeatureHasFastGather,
FeaturePOPCNTFalseDeps];
list<SubtargetFeature> SKXInheritableFeatures =
!listconcat(SKLInheritableFeatures, SKXAdditionalFeatures);
list<SubtargetFeature> SKXFeatures =
!listconcat(SKXInheritableFeatures, SKXSpecificFeatures);
// Cascadelake
list<SubtargetFeature> CLXAdditionalFeatures = [FeatureVNNI];
list<SubtargetFeature> CLXSpecificFeatures = [FeatureHasFastGather,
FeaturePOPCNTFalseDeps];
list<SubtargetFeature> CLXInheritableFeatures =
!listconcat(SKXInheritableFeatures, CLXAdditionalFeatures);
list<SubtargetFeature> CLXFeatures =
!listconcat(CLXInheritableFeatures, CLXSpecificFeatures);
// Cooperlake
list<SubtargetFeature> CPXAdditionalFeatures = [FeatureBF16];
list<SubtargetFeature> CPXSpecificFeatures = [FeatureHasFastGather,
FeaturePOPCNTFalseDeps];
list<SubtargetFeature> CPXInheritableFeatures =
!listconcat(CLXInheritableFeatures, CPXAdditionalFeatures);
list<SubtargetFeature> CPXFeatures =
!listconcat(CPXInheritableFeatures, CPXSpecificFeatures);
// Cannonlake: derives from SKLInheritableFeatures rather than from the
// SKX/CLX/CPX chain, which is why the AVX-512 features are listed again here.
list<SubtargetFeature> CNLAdditionalFeatures = [FeatureAVX512,
FeatureCDI,
FeatureDQI,
FeatureBWI,
FeatureVLX,
FeaturePKU,
FeatureVBMI,
FeatureIFMA,
FeatureSHA,
FeatureSGX];
list<SubtargetFeature> CNLSpecificFeatures = [FeatureHasFastGather];
list<SubtargetFeature> CNLInheritableFeatures =
!listconcat(SKLInheritableFeatures, CNLAdditionalFeatures);
list<SubtargetFeature> CNLFeatures =
!listconcat(CNLInheritableFeatures, CNLSpecificFeatures);
// Icelake
list<SubtargetFeature> ICLAdditionalFeatures = [FeatureBITALG,
FeatureVAES,
FeatureVBMI2,
FeatureVNNI,
FeatureVPCLMULQDQ,
FeatureVPOPCNTDQ,
FeatureGFNI,
FeatureCLWB,
FeatureRDPID];
list<SubtargetFeature> ICLSpecificFeatures = [FeatureHasFastGather];
list<SubtargetFeature> ICLInheritableFeatures =
!listconcat(CNLInheritableFeatures, ICLAdditionalFeatures);
list<SubtargetFeature> ICLFeatures =
!listconcat(ICLInheritableFeatures, ICLSpecificFeatures);
// Icelake Server: ICLInheritableFeatures plus the server-specific list below;
// no separate ICX inheritable list is defined.
list<SubtargetFeature> ICXSpecificFeatures = [FeaturePCONFIG,
FeatureWBNOINVD,
FeatureHasFastGather];
list<SubtargetFeature> ICXFeatures =
!listconcat(ICLInheritableFeatures, ICXSpecificFeatures);
// Atom: root of the Atom chain (Silvermont, Goldmont, Goldmont Plus and
// Tremont build on it below).
list<SubtargetFeature> AtomInheritableFeatures = [FeatureX87,
FeatureCMPXCHG8B,
FeatureCMOV,
FeatureMMX,
FeatureSSSE3,
FeatureFXSR,
FeatureNOPL,
Feature64Bit,
FeatureCMPXCHG16B,
FeatureMOVBE,
FeatureSlowTwoMemOps,
FeatureLAHFSAHF];
list<SubtargetFeature> AtomSpecificFeatures = [ProcIntelAtom,
FeatureSlowUAMem16,
FeatureLEAForSP,
FeatureSlowDivide32,
FeatureSlowDivide64,
FeatureLEAUsesAG,
FeaturePadShortFunctions];
list<SubtargetFeature> AtomFeatures =
!listconcat(AtomInheritableFeatures, AtomSpecificFeatures);
// Silvermont
list<SubtargetFeature> SLMAdditionalFeatures = [FeatureSSE42,
FeaturePOPCNT,
FeaturePCLMUL,
FeaturePRFCHW,
FeatureSlowLEA,
FeatureSlowIncDec,
FeatureRDRAND];
list<SubtargetFeature> SLMSpecificFeatures = [ProcIntelSLM,
FeatureSlowDivide64,
FeatureSlowPMULLD,
FeaturePOPCNTFalseDeps];
list<SubtargetFeature> SLMInheritableFeatures =
!listconcat(AtomInheritableFeatures, SLMAdditionalFeatures);
list<SubtargetFeature> SLMFeatures =
!listconcat(SLMInheritableFeatures, SLMSpecificFeatures);
// Goldmont
list<SubtargetFeature> GLMAdditionalFeatures = [FeatureAES,
FeatureMPX,
FeatureSHA,
FeatureRDSEED,
FeatureXSAVE,
FeatureXSAVEOPT,
FeatureXSAVEC,
FeatureXSAVES,
FeatureCLFLUSHOPT,
FeatureFSGSBase];
list<SubtargetFeature> GLMSpecificFeatures = [ProcIntelGLM,
FeaturePOPCNTFalseDeps];
list<SubtargetFeature> GLMInheritableFeatures =
!listconcat(SLMInheritableFeatures, GLMAdditionalFeatures);
list<SubtargetFeature> GLMFeatures =
!listconcat(GLMInheritableFeatures, GLMSpecificFeatures);
// Goldmont Plus
list<SubtargetFeature> GLPAdditionalFeatures = [FeaturePTWRITE,
FeatureRDPID,
FeatureSGX];
list<SubtargetFeature> GLPSpecificFeatures = [ProcIntelGLP];
list<SubtargetFeature> GLPInheritableFeatures =
!listconcat(GLMInheritableFeatures, GLPAdditionalFeatures);
list<SubtargetFeature> GLPFeatures =
!listconcat(GLPInheritableFeatures, GLPSpecificFeatures);
// Tremont: the full list is concatenated in one step; no TRM inheritable
// list is defined.
list<SubtargetFeature> TRMAdditionalFeatures = [FeatureCLDEMOTE,
FeatureGFNI,
FeatureMOVDIRI,
FeatureMOVDIR64B,
FeatureWAITPKG];
list<SubtargetFeature> TRMSpecificFeatures = [ProcIntelTRM];
list<SubtargetFeature> TRMFeatures =
!listconcat(GLPInheritableFeatures, TRMAdditionalFeatures,
TRMSpecificFeatures);
// Knights Landing: a flat list that is not derived from any other list.
list<SubtargetFeature> KNLFeatures = [FeatureX87,
FeatureCMPXCHG8B,
FeatureCMOV,
FeatureMMX,
FeatureFXSR,
FeatureNOPL,
Feature64Bit,
FeatureCMPXCHG16B,
FeaturePOPCNT,
FeatureSlowDivide64,
FeaturePCLMUL,
FeatureXSAVE,
FeatureXSAVEOPT,
FeatureLAHFSAHF,
FeatureSlow3OpsLEA,
FeatureSlowIncDec,
FeatureAES,
FeatureRDRAND,
FeatureF16C,
FeatureFSGSBase,
FeatureAVX512,
FeatureERI,
FeatureCDI,
FeaturePFI,
FeaturePREFETCHWT1,
FeatureADX,
FeatureRDSEED,
FeatureMOVBE,
FeatureLZCNT,
FeatureBMI,
FeatureBMI2,
FeatureFMA,
FeaturePRFCHW,
FeatureSlowTwoMemOps,
FeatureFastPartialYMMorZMMWrite,
FeatureHasFastGather,
FeatureSlowPMADDWD];
// Knights Mill: KNLFeatures plus VPOPCNTDQ.
// TODO Add AVX5124FMAPS/AVX5124VNNIW features
list<SubtargetFeature> KNMFeatures =
!listconcat(KNLFeatures, [FeatureVPOPCNTDQ]);
// Bobcat: the complete list is the inheritable list itself.
list<SubtargetFeature> BtVer1InheritableFeatures = [FeatureX87,
FeatureCMPXCHG8B,
FeatureCMOV,
FeatureMMX,
FeatureSSSE3,
FeatureSSE4A,
FeatureFXSR,
FeatureNOPL,
Feature64Bit,
FeatureCMPXCHG16B,
FeaturePRFCHW,
FeatureLZCNT,
FeaturePOPCNT,
FeatureSlowSHLD,
FeatureLAHFSAHF,
FeatureFast15ByteNOP,
FeatureFastScalarShiftMasks,
FeatureFastVectorShiftMasks];
list<SubtargetFeature> BtVer1Features = BtVer1InheritableFeatures;
// Jaguar
list<SubtargetFeature> BtVer2AdditionalFeatures = [FeatureAVX,
FeatureAES,
FeaturePCLMUL,
FeatureBMI,
FeatureF16C,
FeatureMOVBE,
FeatureXSAVE,
FeatureXSAVEOPT];
list<SubtargetFeature> BtVer2SpecificFeatures = [FeatureFastLZCNT,
FeatureFastBEXTR,
FeatureFastPartialYMMorZMMWrite,
FeatureFastHorizontalOps];
list<SubtargetFeature> BtVer2InheritableFeatures =
!listconcat(BtVer1InheritableFeatures, BtVer2AdditionalFeatures);
list<SubtargetFeature> BtVer2Features =
!listconcat(BtVer2InheritableFeatures, BtVer2SpecificFeatures);
// Bulldozer: root of the bdver chain. For every bdver generation the complete
// list equals the inheritable list (no Specific lists are defined).
list<SubtargetFeature> BdVer1InheritableFeatures = [FeatureX87,
FeatureCMPXCHG8B,
FeatureCMOV,
FeatureXOP,
Feature64Bit,
FeatureCMPXCHG16B,
FeatureAES,
FeaturePRFCHW,
FeaturePCLMUL,
FeatureMMX,
FeatureFXSR,
FeatureNOPL,
FeatureLZCNT,
FeaturePOPCNT,
FeatureXSAVE,
FeatureLWP,
FeatureSlowSHLD,
FeatureLAHFSAHF,
FeatureFast11ByteNOP,
FeatureFastScalarShiftMasks,
FeatureBranchFusion];
list<SubtargetFeature> BdVer1Features = BdVer1InheritableFeatures;
// PileDriver
list<SubtargetFeature> BdVer2AdditionalFeatures = [FeatureF16C,
FeatureBMI,
FeatureTBM,
FeatureFMA,
FeatureFastBEXTR];
list<SubtargetFeature> BdVer2InheritableFeatures =
!listconcat(BdVer1InheritableFeatures, BdVer2AdditionalFeatures);
list<SubtargetFeature> BdVer2Features = BdVer2InheritableFeatures;
// Steamroller
list<SubtargetFeature> BdVer3AdditionalFeatures = [FeatureXSAVEOPT,
FeatureFSGSBase];
list<SubtargetFeature> BdVer3InheritableFeatures =
!listconcat(BdVer2InheritableFeatures, BdVer3AdditionalFeatures);
list<SubtargetFeature> BdVer3Features = BdVer3InheritableFeatures;
// Excavator
list<SubtargetFeature> BdVer4AdditionalFeatures = [FeatureAVX2,
FeatureBMI2,
FeatureMWAITX];
list<SubtargetFeature> BdVer4InheritableFeatures =
!listconcat(BdVer3InheritableFeatures, BdVer4AdditionalFeatures);
list<SubtargetFeature> BdVer4Features = BdVer4InheritableFeatures;
// AMD Zen Processors common ISAs
list<SubtargetFeature> ZNFeatures = [FeatureADX,
FeatureAES,
FeatureAVX2,
FeatureBMI,
FeatureBMI2,
FeatureCLFLUSHOPT,
FeatureCLZERO,
FeatureCMOV,
Feature64Bit,
FeatureCMPXCHG16B,
FeatureF16C,
FeatureFMA,
FeatureFSGSBase,
FeatureFXSR,
FeatureNOPL,
FeatureFastLZCNT,
FeatureLAHFSAHF,
FeatureLZCNT,
FeatureFastBEXTR,
FeatureFast15ByteNOP,
FeatureBranchFusion,
FeatureFastScalarShiftMasks,
FeatureMMX,
FeatureMOVBE,
FeatureMWAITX,
FeaturePCLMUL,
FeaturePOPCNT,
FeaturePRFCHW,
FeatureRDRAND,
FeatureRDSEED,
FeatureSHA,
FeatureSSE4A,
FeatureSlowSHLD,
FeatureX87,
FeatureXSAVE,
FeatureXSAVEC,
FeatureXSAVEOPT,
FeatureXSAVES];
// Zen 2: ZNFeatures plus the additional features below.
list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
FeatureRDPID,
FeatureWBNOINVD];
list<SubtargetFeature> ZN2Features =
!listconcat(ZNFeatures, ZN2AdditionalFeatures);
}
//===----------------------------------------------------------------------===//
// X86 processors supported.
//===----------------------------------------------------------------------===//
// Shorthand for a ProcessorModel that uses GenericModel as its scheduling
// model.
class Proc<string Name, list<SubtargetFeature> Features>
    : ProcessorModel<Name, GenericModel, Features>;
// NOTE: CMPXCHG8B is included here for legacy compatibility, so that it is
// only disabled if i386/i486 is specifically requested.
def : Proc<"generic",
           [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B]>;
// Early 32-bit processors, i386 through pentium2. i386 and i486 are the only
// ones in this group without CMPXCHG8B.
def : Proc<"i386",
           [FeatureX87, FeatureSlowUAMem16]>;
def : Proc<"i486",
           [FeatureX87, FeatureSlowUAMem16]>;
def : Proc<"i586",
           [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B]>;
def : Proc<"pentium",
           [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B]>;
def : Proc<"pentium-mmx",
           [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureMMX]>;
def : Proc<"i686",
           [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureCMOV]>;
def : Proc<"pentiumpro",
           [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureCMOV,
            FeatureNOPL]>;
def : Proc<"pentium2",
           [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureMMX,
            FeatureCMOV, FeatureFXSR, FeatureNOPL]>;
// pentium3 and pentium3m share one definition.
foreach P = ["pentium3", "pentium3m"] in {
  def : Proc<P,
             [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureMMX,
              FeatureSSE1, FeatureFXSR, FeatureNOPL, FeatureCMOV]>;
}
// SSE2 and SSE3 class CPUs get the PostRAScheduler enabled. The intent is to
// turn it on for pentium4, the current default processor of a vanilla 32-bit
// clang compilation when no specific architecture is given. This generally
// yields a nice performance increase on silvermont, with largely neutral
// behavior on other contemporary large core processors.
// pentium-m, pentium4m, prescott and nocona are included as a preventative
// measure, to avoid performance surprises should clang's default cpu change
// slightly.
def : ProcessorModel<
    "pentium-m", GenericPostRAModel,
    [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureMMX,
     FeatureSSE2, FeatureFXSR, FeatureNOPL, FeatureCMOV]>;
foreach P = ["pentium4", "pentium4m"] in {
  def : ProcessorModel<
      P, GenericPostRAModel,
      [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureMMX,
       FeatureSSE2, FeatureFXSR, FeatureNOPL, FeatureCMOV]>;
}
// Intel Quark; defined with an empty feature list.
def : Proc<"lakemont",
           []>;
// Intel Core Duo; scheduled with SandyBridgeModel.
def : ProcessorModel<
    "yonah", SandyBridgeModel,
    [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureMMX,
     FeatureSSE3, FeatureFXSR, FeatureNOPL, FeatureCMOV]>;
// NetBurst. Both entries use GenericPostRAModel; nocona additionally has
// Feature64Bit and FeatureCMPXCHG16B.
def : ProcessorModel<
    "prescott", GenericPostRAModel,
    [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureMMX,
     FeatureSSE3, FeatureFXSR, FeatureNOPL, FeatureCMOV]>;
def : ProcessorModel<
    "nocona", GenericPostRAModel,
    [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureCMOV,
     FeatureMMX, FeatureSSE3, FeatureFXSR, FeatureNOPL, Feature64Bit,
     FeatureCMPXCHG16B]>;
// Intel Core 2 Solo/Duo; scheduled with SandyBridgeModel.
def : ProcessorModel<
    "core2", SandyBridgeModel,
    [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureCMOV,
     FeatureMMX, FeatureSSSE3, FeatureFXSR, FeatureNOPL, Feature64Bit,
     FeatureCMPXCHG16B, FeatureLAHFSAHF, FeatureMacroFusion]>;
// Same feature list as core2, except FeatureSSE41 takes the place of
// FeatureSSSE3.
def : ProcessorModel<
    "penryn", SandyBridgeModel,
    [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureCMOV,
     FeatureMMX, FeatureSSE41, FeatureFXSR, FeatureNOPL, Feature64Bit,
     FeatureCMPXCHG16B, FeatureLAHFSAHF, FeatureMacroFusion]>;
// Atom CPUs. Goldmont, Goldmont Plus and Tremont are scheduled with SLMModel.
foreach P = ["bonnell", "atom"] in {
  def : ProcessorModel<P, AtomModel,
                       ProcessorFeatures.AtomFeatures>;
}
foreach P = ["silvermont", "slm"] in {
  def : ProcessorModel<P, SLMModel,
                       ProcessorFeatures.SLMFeatures>;
}
def : ProcessorModel<"goldmont", SLMModel,
                     ProcessorFeatures.GLMFeatures>;
def : ProcessorModel<"goldmont-plus", SLMModel,
                     ProcessorFeatures.GLPFeatures>;
def : ProcessorModel<"tremont", SLMModel,
                     ProcessorFeatures.TRMFeatures>;
// "Arrandale" along with corei3 and corei5
foreach P = ["nehalem", "corei7"] in {
  def : ProcessorModel<P, SandyBridgeModel,
                       ProcessorFeatures.NHMFeatures>;
}
// Westmere is the corei3/i5/i7 path from nehalem to sandybridge
def : ProcessorModel<"westmere", SandyBridgeModel,
                     ProcessorFeatures.WSMFeatures>;
foreach P = ["sandybridge", "corei7-avx"] in {
  def : ProcessorModel<P, SandyBridgeModel,
                       ProcessorFeatures.SNBFeatures>;
}
foreach P = ["ivybridge", "core-avx-i"] in {
  def : ProcessorModel<P, SandyBridgeModel,
                       ProcessorFeatures.IVBFeatures>;
}
// Haswell, Broadwell and Skylake (client) each have their own scheduling
// model.
foreach P = ["haswell", "core-avx2"] in {
  def : ProcessorModel<P, HaswellModel,
                       ProcessorFeatures.HSWFeatures>;
}
def : ProcessorModel<"broadwell", BroadwellModel,
                     ProcessorFeatures.BDWFeatures>;
def : ProcessorModel<"skylake", SkylakeClientModel,
                     ProcessorFeatures.SKLFeatures>;
// FIXME: define KNL scheduler model
// (until then knl and knm both use HaswellModel).
def : ProcessorModel<"knl", HaswellModel,
                     ProcessorFeatures.KNLFeatures>;
def : ProcessorModel<"knm", HaswellModel,
                     ProcessorFeatures.KNMFeatures>;
// AVX-512 processors; every entry here is scheduled with SkylakeServerModel.
foreach P = ["skylake-avx512", "skx"] in {
  def : ProcessorModel<P, SkylakeServerModel,
                       ProcessorFeatures.SKXFeatures>;
}
def : ProcessorModel<
    "cascadelake", SkylakeServerModel, ProcessorFeatures.CLXFeatures>;
def : ProcessorModel<
    "cooperlake", SkylakeServerModel, ProcessorFeatures.CPXFeatures>;
def : ProcessorModel<
    "cannonlake", SkylakeServerModel, ProcessorFeatures.CNLFeatures>;
def : ProcessorModel<
    "icelake-client", SkylakeServerModel, ProcessorFeatures.ICLFeatures>;
def : ProcessorModel<
    "icelake-server", SkylakeServerModel, ProcessorFeatures.ICXFeatures>;
// AMD CPUs.
def : Proc<"k6",
           [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureMMX]>;
def : Proc<"k6-2",
           [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, Feature3DNow]>;
def : Proc<"k6-3",
           [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, Feature3DNow]>;
// Athlon family. The second group additionally has SSE1 and FXSR.
foreach P = ["athlon", "athlon-tbird"] in {
  def : Proc<P,
             [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureCMOV,
              Feature3DNowA, FeatureNOPL, FeatureSlowSHLD]>;
}
foreach P = ["athlon-4", "athlon-xp", "athlon-mp"] in {
  def : Proc<P,
             [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureCMOV,
              FeatureSSE1, Feature3DNowA, FeatureFXSR, FeatureNOPL,
              FeatureSlowSHLD]>;
}
// K8 family. The -sse3 variants use SSE3 instead of SSE2 and add CMPXCHG16B.
foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in {
  def : Proc<P,
             [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureSSE2,
              Feature3DNowA, FeatureFXSR, FeatureNOPL, Feature64Bit,
              FeatureSlowSHLD, FeatureCMOV, FeatureFastScalarShiftMasks]>;
}
foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in {
  def : Proc<P,
             [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureSSE3,
              Feature3DNowA, FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B,
              FeatureSlowSHLD, FeatureCMOV, Feature64Bit,
              FeatureFastScalarShiftMasks]>;
}
// amdfam10 and barcelona share one definition.
foreach P = ["amdfam10", "barcelona"] in {
  def : Proc<P,
             [FeatureX87, FeatureCMPXCHG8B, FeatureSSE4A, Feature3DNowA,
              FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B, FeatureLZCNT,
              FeaturePOPCNT, FeatureSlowSHLD, FeatureLAHFSAHF, FeatureCMOV,
              Feature64Bit, FeatureFastScalarShiftMasks]>;
}
// Bobcat
def : Proc<"btver1",
           ProcessorFeatures.BtVer1Features>;
// Jaguar
def : ProcessorModel<"btver2", BtVer2Model,
                     ProcessorFeatures.BtVer2Features>;
// Bulldozer (scheduled with BdVer2Model, like Piledriver)
def : ProcessorModel<"bdver1", BdVer2Model,
                     ProcessorFeatures.BdVer1Features>;
// Piledriver
def : ProcessorModel<"bdver2", BdVer2Model,
                     ProcessorFeatures.BdVer2Features>;
// Steamroller
def : Proc<"bdver3",
           ProcessorFeatures.BdVer3Features>;
// Excavator
def : Proc<"bdver4",
           ProcessorFeatures.BdVer4Features>;
// Zen; znver2 is scheduled with Znver1Model as well.
def : ProcessorModel<"znver1", Znver1Model,
                     ProcessorFeatures.ZNFeatures>;
def : ProcessorModel<"znver2", Znver1Model,
                     ProcessorFeatures.ZN2Features>;
// Remaining 32-bit processors: geode, winchip and c3.
def : Proc<"geode",
           [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, Feature3DNowA]>;
def : Proc<"winchip-c6",
           [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>;
def : Proc<"winchip2",
           [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
def : Proc<"c3",
           [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
def : Proc<"c3-2",
           [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureMMX,
            FeatureSSE1, FeatureFXSR, FeatureCMOV]>;
// A generic, 64-bit specific x86 processor model is provided as well. It
// tries to be good for modern chips without enabling instruction set
// encodings past the basic SSE2 and 64-bit ones: it disables things that are
// slow on any mainstream and modern 64-bit x86 chip and enables features that
// are generally beneficial.
//
// The Sandy Bridge model currently serves as the default scheduling model,
// since it is used across Nehalem, Westmere, Sandy Bridge, and Ivy Bridge,
// which covers a huge swath of x86 processors. Should specific scheduling
// knobs need different tuning for AMD chips, forming a common base for them
// might be considered.
def : ProcessorModel<
    "x86-64", SandyBridgeModel,
    [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV, FeatureMMX, FeatureSSE2,
     FeatureFXSR, FeatureNOPL, Feature64Bit, FeatureSlow3OpsLEA,
     FeatureSlowIncDec, FeatureMacroFusion]>;
//===----------------------------------------------------------------------===//
// Calling Conventions
//===----------------------------------------------------------------------===//
include "X86CallingConv.td"
//===----------------------------------------------------------------------===//
// Assembly Parser
//===----------------------------------------------------------------------===//
// Parser variant 0: AT&T syntax ("att").
def ATTAsmParserVariant : AsmParserVariant {
  int Variant = 0;

  // Name of this variant.
  string Name = "att";

  // Comment delimiter; comments in assembly strings are discarded.
  string CommentDelimiter = "#";

  // Prefix used to recognize hard coded registers.
  string RegisterPrefix = "%";
}
// Parser variant 1: Intel syntax ("intel").
def IntelAsmParserVariant : AsmParserVariant {
  int Variant = 1;

  // Name of this variant.
  string Name = "intel";

  // Comment delimiter; comments in assembly strings are discarded.
  string CommentDelimiter = ";";

  // Hard coded registers are recognized without any prefix.
  string RegisterPrefix = "";
}
//===----------------------------------------------------------------------===//
// Assembly Printers
//===----------------------------------------------------------------------===//
// Machine code can be emitted by the X86 target in two different syntaxes,
// selected with -x86-asm-syntax={att|intel}.
// Variant 0 is the AT&T printer.
def ATTAsmWriter : AsmWriter {
  string AsmWriterClassName = "ATTInstPrinter";
  int Variant = 0;
}
// Variant 1 is the Intel printer.
def IntelAsmWriter : AsmWriter {
  string AsmWriterClassName = "IntelInstPrinter";
  int Variant = 1;
}
// Top-level target record: ties together the instruction set, the two
// assembly parser variants and the two assembly writers.
def X86 : Target {
  // Information about the instructions...
  let InstructionSet = X86InstrInfo;
  let AssemblyParserVariants = [ATTAsmParserVariant,
                                IntelAsmParserVariant];
  let AssemblyWriters = [ATTAsmWriter,
                         IntelAsmWriter];
  let AllowRegisterRenaming = 1;
}
//===----------------------------------------------------------------------===//
// Pfm Counters
//===----------------------------------------------------------------------===//
include "X86PfmCounters.td"
Index: vendor/llvm/dist-release_90/lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- vendor/llvm/dist-release_90/lib/Target/X86/X86ISelDAGToDAG.cpp (revision 351302)
+++ vendor/llvm/dist-release_90/lib/Target/X86/X86ISelDAGToDAG.cpp (revision 351303)
@@ -1,5045 +1,5080 @@
//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines a DAG pattern-matching instruction selector for X86,
// converting from a legalized DAG to an X86 DAG.
//
//===----------------------------------------------------------------------===//
#include "X86.h"
#include "X86MachineFunctionInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <stdint.h>
using namespace llvm;
#define DEBUG_TYPE "x86-isel"
STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");

/// Hidden command-line option "x86-and-imm-shrink"; enabled by default.
static cl::opt<bool>
    AndImmShrink("x86-and-imm-shrink",
                 cl::init(true),
                 cl::desc("Enable setting constant bits to reduce size of mask immediates"),
                 cl::Hidden);
//===----------------------------------------------------------------------===//
// Pattern Matcher Implementation
//===----------------------------------------------------------------------===//
namespace {
/// This corresponds to X86AddressMode, but uses SDValue's instead of register
/// numbers for the leaves of the matched tree.
struct X86ISelAddressMode {
enum {
RegBase,
FrameIndexBase
} BaseType;
// This is really a union, discriminated by BaseType!
SDValue Base_Reg;
int Base_FrameIndex;
unsigned Scale;
SDValue IndexReg;
int32_t Disp;
SDValue Segment;
const GlobalValue *GV;
const Constant *CP;
const BlockAddress *BlockAddr;
const char *ES;
MCSymbol *MCSym;
int JT;
unsigned Align; // CP alignment.
unsigned char SymbolFlags; // X86II::MO_*
bool NegateIndex = false;
X86ISelAddressMode()
: BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr),
MCSym(nullptr), JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) {}
bool hasSymbolicDisplacement() const {
return GV != nullptr || CP != nullptr || ES != nullptr ||
MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
}
bool hasBaseOrIndexReg() const {
return BaseType == FrameIndexBase ||
IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
}
/// Return true if this addressing mode is already RIP-relative.
bool isRIPRelative() const {
if (BaseType != RegBase) return false;
if (RegisterSDNode *RegNode =
dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
return RegNode->getReg() == X86::RIP;
return false;
}
void setBaseReg(SDValue Reg) {
BaseType = RegBase;
Base_Reg = Reg;
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void dump(SelectionDAG *DAG = nullptr) {
dbgs() << "X86ISelAddressMode " << this << '\n';
dbgs() << "Base_Reg ";
if (Base_Reg.getNode())
Base_Reg.getNode()->dump(DAG);
else
dbgs() << "nul\n";
if (BaseType == FrameIndexBase)
dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
dbgs() << " Scale " << Scale << '\n'
<< "IndexReg ";
if (NegateIndex)
dbgs() << "negate ";
if (IndexReg.getNode())
IndexReg.getNode()->dump(DAG);
else
dbgs() << "nul\n";
dbgs() << " Disp " << Disp << '\n'
<< "GV ";
if (GV)
GV->dump();
else
dbgs() << "nul";
dbgs() << " CP ";
if (CP)
CP->dump();
else
dbgs() << "nul";
dbgs() << '\n'
<< "ES ";
if (ES)
dbgs() << ES;
else
dbgs() << "nul";
dbgs() << " MCSym ";
if (MCSym)
dbgs() << MCSym;
else
dbgs() << "nul";
dbgs() << " JT" << JT << " Align" << Align << '\n';
}
#endif
};
}
namespace {
//===--------------------------------------------------------------------===//
/// ISel - X86-specific code to select X86 machine instructions for
/// SelectionDAG operations.
///
/// The per-function fields below (Subtarget, OptForSize, OptForMinSize and
/// IndirectTlsSegRefs) are re-initialized by every runOnMachineFunction()
/// call before selection is delegated to the base class.
///
class X86DAGToDAGISel final : public SelectionDAGISel {
/// Keep a pointer to the X86Subtarget around so that we can
/// make the right decision when generating code for different targets.
const X86Subtarget *Subtarget;
/// If true, selector should try to optimize for code size instead of
/// performance.
bool OptForSize;
/// If true, selector should try to optimize for minimum code size.
bool OptForMinSize;
/// Disable direct TLS access through segment registers.
bool IndirectTlsSegRefs;
public:
/// Construct the selector for target machine \p tm at optimization level
/// \p OptLevel. The per-function fields start out as nullptr/false.
explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
: SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), OptForSize(false),
OptForMinSize(false), IndirectTlsSegRefs(false) {}
/// Human-readable name of this pass.
StringRef getPassName() const override {
return "X86 DAG->DAG Instruction Selection";
}
/// Refresh the per-function state from \p MF and then run the generic
/// selector on it. Always returns true.
bool runOnMachineFunction(MachineFunction &MF) override {
// Reset the subtarget each time through.
Subtarget = &MF.getSubtarget<X86Subtarget>();
IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
"indirect-tls-seg-refs");
// OptFor[Min]Size are used in pattern predicates that isel is matching.
OptForSize = MF.getFunction().hasOptSize();
OptForMinSize = MF.getFunction().hasMinSize();
assert((!OptForMinSize || OptForSize) &&
"OptForMinSize implies OptForSize");
SelectionDAGISel::runOnMachineFunction(MF);
return true;
}
void EmitFunctionEntryCode() override;
bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
void PreprocessISelDAG() override;
void PostprocessISelDAG() override;
// Include the pieces autogenerated from the target description.
#include "X86GenDAGISel.inc"
private:
void Select(SDNode *N) override;
// Address-mode matching helpers; they accumulate their result in an
// X86ISelAddressMode.
bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
bool matchAddress(SDValue N, X86ISelAddressMode &AM);
bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
unsigned Depth);
bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
// Selection entry points that return an address as the five memory operands
// (Base, Scale, Index, Disp, Segment).
bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
SDValue &Segment);
bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
SDValue &Segment);
bool selectMOV64Imm32(SDValue N, SDValue &Imm);
bool selectLEAAddr(SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
SDValue &Segment);
bool selectLEA64_32Addr(SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
SDValue &Segment);
bool selectTLSADDRAddr(SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
SDValue &Segment);
bool selectScalarSSELoad(SDNode *Root, SDNode *Parent, SDValue N,
SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
SDValue &Segment,
SDValue &NodeWithChain);
bool selectRelocImm(SDValue N, SDValue &Op);
bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
SDValue &Segment);
// Convenience method where P is also root.
bool tryFoldLoad(SDNode *P, SDValue N,
SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
SDValue &Segment) {
return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
}
/// Implement addressing mode selection for inline asm expressions.
bool SelectInlineAsmMemoryOperand(const SDValue &Op,
unsigned ConstraintID,
std::vector<SDValue> &OutOps) override;
void emitSpecialCodeForMain();
/// Turn the matched addressing mode \p AM into the five memory operands
/// \p Base, \p Scale, \p Index, \p Disp and \p Segment. \p VT is the type
/// used for the "no register" placeholders of Base and Index.
///
/// Note that \p AM is mutated when AM.NegateIndex is set: AM.IndexReg is
/// replaced by a freshly created NEG machine node.
inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
MVT VT, SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
SDValue &Segment) {
// Base: frame index, matched base register, or register 0 when absent.
if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
Base = CurDAG->getTargetFrameIndex(
AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
else if (AM.Base_Reg.getNode())
Base = AM.Base_Reg;
else
Base = CurDAG->getRegister(0, VT);
Scale = getI8Imm(AM.Scale, DL);
// Negate the index if needed.
if (AM.NegateIndex) {
unsigned NegOpc = VT == MVT::i64 ? X86::NEG64r : X86::NEG32r;
// The NEG node is created with two result types (VT, i32); only result 0 is
// used here. NOTE(review): the i32 result is presumably the flags output --
// confirm.
SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
AM.IndexReg), 0);
AM.IndexReg = Neg;
}
// Index: matched index register, or register 0 when absent.
if (AM.IndexReg.getNode())
Index = AM.IndexReg;
else
Index = CurDAG->getRegister(0, VT);
// These are 32-bit even in 64-bit mode since RIP-relative offset
// is 32-bit.
// The first symbolic component present (GV, CP, ES, MCSym, JT, BlockAddr)
// determines the displacement operand; otherwise it is a plain constant.
if (AM.GV)
Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
MVT::i32, AM.Disp,
AM.SymbolFlags);
else if (AM.CP)
Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
AM.Align, AM.Disp, AM.SymbolFlags);
else if (AM.ES) {
assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
} else if (AM.MCSym) {
assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
// NOTE(review): the message "oo" is uninformative; the check itself is that
// no symbol flags are set when the displacement is an MCSymbol.
assert(AM.SymbolFlags == 0 && "oo");
Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
} else if (AM.JT != -1) {
assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
} else if (AM.BlockAddr)
Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
AM.SymbolFlags);
else
Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32);
// Segment: matched segment, or register 0 (typed i16) when absent.
if (AM.Segment.getNode())
Segment = AM.Segment;
else
Segment = CurDAG->getRegister(0, MVT::i16);
}
// Utility function to determine whether we should avoid selecting
// immediate forms of instructions for better code size or not.
// At a high level, we'd like to avoid such instructions when
// we have similar constants used within the same basic block
// that can be kept in a register.
//
// Returns true when at least two qualifying users of N are found; always
// returns false when not optimizing for size.
bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
uint32_t UseCount = 0;
// Do not want to hoist if we're not optimizing for size.
// TODO: We'd like to remove this restriction.
// See the comment in X86InstrInfo.td for more info.
if (!OptForSize)
return false;
// Walk all the users of the immediate. The walk stops early once two
// qualifying uses have been counted.
for (SDNode::use_iterator UI = N->use_begin(),
UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) {
SDNode *User = *UI;
// This user is already selected. Count it as a legitimate use and
// move on.
if (User->isMachineOpcode()) {
UseCount++;
continue;
}
// We want to count stores of immediates as real uses.
if (User->getOpcode() == ISD::STORE &&
User->getOperand(1).getNode() == N) {
UseCount++;
continue;
}
// We don't currently match users whose operand count is not exactly 2
// (except for stores, which are handled above).
// Those instructions won't match in ISEL, for now, and would
// be counted incorrectly.
// This may change in the future as we add additional instruction
// types.
if (User->getNumOperands() != 2)
continue;
// Immediates that are used for offsets as part of stack
// manipulation should be left alone. These are typically
// used to indicate SP offsets for argument passing and
// will get pulled into stores/pushes (implicitly).
if (User->getOpcode() == X86ISD::ADD ||
User->getOpcode() == ISD::ADD ||
User->getOpcode() == X86ISD::SUB ||
User->getOpcode() == ISD::SUB) {
// Find the other operand of the add/sub.
SDValue OtherOp = User->getOperand(0);
if (OtherOp.getNode() == N)
OtherOp = User->getOperand(1);
// Don't count if the other operand is SP.
RegisterSDNode *RegNode;
if (OtherOp->getOpcode() == ISD::CopyFromReg &&
(RegNode = dyn_cast_or_null<RegisterSDNode>(
OtherOp->getOperand(1).getNode())))
if ((RegNode->getReg() == X86::ESP) ||
(RegNode->getReg() == X86::RSP))
continue;
}
// ... otherwise, count this and move on.
UseCount++;
}
// If we have more than 1 use, then recommend for hoisting.
return (UseCount > 1);
}
/// Return a target constant with the specified value of type i8.
inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
}
/// Return a target constant with the specified value, of type i32.
inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
}
/// Return a target constant with the specified value, of type i64.
inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
}
/// Compute the i8 immediate for a subvector extract of \p VecWidth bits:
/// the constant element index (operand 1 of \p N) scaled by the element size
/// of the source vector and divided by \p VecWidth.
SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
const SDLoc &DL) {
assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
uint64_t Index = N->getConstantOperandVal(1);
MVT VecVT = N->getOperand(0).getSimpleValueType();
return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
}
/// Compute the i8 immediate for a subvector insert of \p VecWidth bits:
/// the constant element index (operand 2 of \p N) scaled by the element size
/// of the result vector and divided by \p VecWidth.
SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
const SDLoc &DL) {
assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
uint64_t Index = N->getConstantOperandVal(2);
MVT VecVT = N->getSimpleValueType(0);
return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
}
// Helper to detect unneeded AND instructions on shift amounts. Called
// from PatFrags in tablegen.
// The AND is unneeded when the low Width bits of its constant mask are all
// ones, either directly or once bits known to be zero in the other operand
// are treated as set.
bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
const APInt &Val = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
if (Val.countTrailingOnes() >= Width)
return true;
APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
return Mask.countTrailingOnes() >= Width;
}
/// Return an SDNode that returns the value of the global base register.
/// Output instructions required to initialize the global base register,
/// if necessary.
SDNode *getGlobalBaseReg();
/// Return a reference to the TargetMachine, casted to the target-specific
/// type.
const X86TargetMachine &getTargetMachine() const {
return static_cast<const X86TargetMachine &>(TM);
}
/// Return a pointer to the target-specific X86InstrInfo, obtained from the
/// current subtarget.
const X86InstrInfo *getInstrInfo() const {
return Subtarget->getInstrInfo();
}
/// Address-mode matching performs shift-of-and to and-of-shift
/// reassociation in order to expose more scaled addressing
/// opportunities.
bool ComplexPatternFuncMutatesDAG() const override {
return true;
}
bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
/// Returns whether this is a relocatable immediate in the range
/// [-2^(Width-1) .. 2^(Width-1)-1], i.e. one that fits in a signed
/// Width-bit field. Constants are checked with isInt<Width>; anything else
/// is deferred to isSExtAbsoluteSymbolRef().
template <unsigned Width> bool isSExtRelocImm(SDNode *N) const {
if (auto *CN = dyn_cast<ConstantSDNode>(N))
return isInt<Width>(CN->getSExtValue());
return isSExtAbsoluteSymbolRef(Width, N);
}
// Indicates we should prefer to use a non-temporal load for this load.
// This is only the case for non-temporal loads that are aligned to at least
// their store size and whose size (16/32/64 bytes) has a matching subtarget
// feature (SSE4.1/AVX2/AVX512 respectively).
bool useNonTemporalLoad(LoadSDNode *N) const {
if (!N->isNonTemporal())
return false;
unsigned StoreSize = N->getMemoryVT().getStoreSize();
if (N->getAlignment() < StoreSize)
return false;
switch (StoreSize) {
default: llvm_unreachable("Unsupported store size");
case 4:
case 8:
return false;
case 16:
return Subtarget->hasSSE41();
case 32:
return Subtarget->hasAVX2();
case 64:
return Subtarget->hasAVX512();
}
}
bool foldLoadStoreIntoMemOperand(SDNode *Node);
MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
bool matchBitExtract(SDNode *Node);
bool shrinkAndImmediate(SDNode *N);
bool isMaskZeroExtended(SDNode *N) const;
bool tryShiftAmountMod(SDNode *N);
bool tryShrinkShlLogicImm(SDNode *N);
bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
const SDLoc &dl, MVT VT, SDNode *Node);
MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
const SDLoc &dl, MVT VT, SDNode *Node,
SDValue &InFlag);
bool tryOptimizeRem8Extend(SDNode *N);
bool onlyUsesZeroFlag(SDValue Flags) const;
bool hasNoSignFlagUses(SDValue Flags) const;
bool hasNoCarryFlagUses(SDValue Flags) const;
};
}
// Returns true if the masked compare N can be implemented legally with its
// operand type on the given subtarget. Nodes that are not one of the
// recognized compare opcodes yield false.
static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
  switch (N->getOpcode()) {
  case X86ISD::CMPM:
  case ISD::SETCC:
  case X86ISD::CMPM_SAE:
  case X86ISD::VFPCLASS: {
    // 256-bit 8 element types can show up here without VLX being enabled.
    // In that case 512-bit operations are used and the mask will not be zero
    // extended, so narrow vectors are only legal with VLX.
    const EVT CmpVT = N->getOperand(0).getValueType();
    const bool IsNarrowVector =
        CmpVT.is256BitVector() || CmpVT.is128BitVector();
    return !IsNarrowVector || Subtarget->hasVLX();
  }
  // Scalar opcodes use 128 bit registers, but aren't subject to the VLX
  // check.
  case X86ISD::VFPCLASSS:
  case X86ISD::FSETCCM:
  case X86ISD::FSETCCM_SAE:
    return true;
  default:
    return false;
  }
}
// Returns true if the producer of the mask N can be assumed to have zero
// extended it for us.
bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
  // Anything but an AND is judged on its own.
  if (N->getOpcode() != ISD::AND)
    return isLegalMaskCompare(N, Subtarget);

  // For an AND it is enough that one side is a legal mask compare: the zeros
  // it guarantees survive the AND.
  for (unsigned OpIdx = 0; OpIdx != 2; ++OpIdx)
    if (isLegalMaskCompare(N->getOperand(OpIdx).getNode(), Subtarget))
      return true;
  return false;
}
/// Decide whether folding \p N into its user \p U is worthwhile while
/// selecting \p Root. Non-load values with a single use are always folded;
/// loads go through a series of checks that prefer folding something else
/// (an immediate, a TLS address, a bit-test pattern, ...) instead.
bool X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U,
                                         SDNode *Root) const {
  // Never fold at -O0, and never fold a value that has more than one use.
  if (OptLevel == CodeGenOpt::None || !N.hasOneUse())
    return false;

  // The remaining checks only concern loads.
  if (N.getOpcode() != ISD::LOAD)
    return true;

  // A non-temporal load that has its own instruction must stay separate.
  if (useNonTemporalLoad(cast<LoadSDNode>(N)))
    return false;

  if (U == Root) {
    const unsigned UserOpc = U->getOpcode();
    switch (UserOpc) {
    default:
      break;
    case X86ISD::ADD:
    case X86ISD::ADC:
    case X86ISD::SUB:
    case X86ISD::SBB:
    case X86ISD::AND:
    case X86ISD::XOR:
    case X86ISD::OR:
    case ISD::ADD:
    case ISD::ADDCARRY:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR: {
      SDValue RHS = U->getOperand(1);

      if (auto *Imm = dyn_cast<ConstantSDNode>(RHS)) {
        const APInt &ImmVal = Imm->getAPIntValue();

        // An 8-bit immediate is better folded than the load, because the
        // encoding is shorter. Compare
        //   movl 4(%esp), %eax
        //   addl $4, %eax
        // with
        //   movl $4, %eax
        //   addl 4(%esp), %eax
        // The first form is 2 bytes shorter, and 4 bytes shorter when the
        // increment is 1 (incl %eax).
        if (ImmVal.isSignedIntN(8))
          return false;

        if (UserOpc == ISD::AND) {
          // A 64-bit AND whose immediate fits in 32 bits should use the
          // smaller AND rather than fold the load. This keeps immediates
          // produced by shrinkAndImmediate folded. Ideally the load would be
          // narrowed during DAG combine to get the best of both worlds.
          if (ImmVal.getBitWidth() == 64 && ImmVal.isIntN(32))
            return false;

          // This is really a zext_inreg that a movzx can implement; prefer
          // that.
          // TODO: We could shrink the load and fold if it is non-volatile.
          if (ImmVal == UINT8_MAX || ImmVal == UINT16_MAX ||
              ImmVal == UINT32_MAX)
            return false;
        }

        // ADD/SUB can negate the immediate and use the opposite operation so
        // that 128 fits into a sign extended 8 bit immediate.
        if ((UserOpc == ISD::ADD || UserOpc == ISD::SUB) &&
            (-ImmVal).isSignedIntN(8))
          return false;
      }

      // When the other operand is a TLS address, fold that instead, giving
      //   movl %gs:0, %eax
      //   leal i@NTPOFF(%eax), %eax
      // rather than
      //   movl $i@NTPOFF, %eax
      //   addl %gs:0, %eax
      // which saves a load if the block accesses a second TLS address.
      // FIXME: This is probably also true for non-TLS addresses.
      if (RHS.getOpcode() == X86ISD::Wrapper &&
          RHS.getOperand(0).getOpcode() == ISD::TargetGlobalTLSAddress)
        return false;

      // Keep the load unfolded when the user matches a BTS/BTR/BTC pattern:
      //   BTS: (or X, (shl 1, n))
      //   BTR: (and X, (rotl -2, n))
      //   BTC: (xor X, (shl 1, n))
      auto IsShlOfOne = [](SDValue V) {
        return V.getOpcode() == ISD::SHL && isOneConstant(V.getOperand(0));
      };
      auto IsRotlOfMinusTwo = [](SDValue V) {
        if (V.getOpcode() != ISD::ROTL)
          return false;
        auto *C = dyn_cast<ConstantSDNode>(V.getOperand(0));
        return C && C->getSExtValue() == -2;
      };

      if (UserOpc == ISD::OR || UserOpc == ISD::XOR) {
        if (IsShlOfOne(U->getOperand(0)) || IsShlOfOne(U->getOperand(1)))
          return false;
      }
      if (UserOpc == ISD::AND) {
        if (IsRotlOfMinusTwo(U->getOperand(0)) ||
            IsRotlOfMinusTwo(U->getOperand(1)))
          return false;
      }
      break;
    }
    case ISD::SHL:
    case ISD::SRA:
    case ISD::SRL:
      // A shift by immediate should not take a folded load. The BMI2
      // instructions can fold a load but not an immediate, the legacy
      // instructions can fold an immediate but not a load, and folding the
      // immediate is the better choice.
      if (isa<ConstantSDNode>(U->getOperand(1)))
        return false;
      break;
    }
  }

  // Do not fold a load when the root can be implemented with an
  // insert_subreg or a move that implicitly zeroes.
  if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
      isNullConstant(Root->getOperand(2))) {
    SDValue BaseVec = Root->getOperand(0);
    if (BaseVec.isUndef() || ISD::isBuildVectorAllZeros(BaseVec.getNode()))
      return false;
  }

  return true;
}
/// Re-thread the chains so that \p Load sits directly above \p Call:
/// \p OrigChain takes over the load's incoming chain, the load is chained to
/// what used to be the call's chain operand, and the call is chained to the
/// load's chain result.
static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
                               SDValue Call, SDValue OrigChain) {
  SDNode *const LoadNode = Load.getNode();
  const SDValue LoadInChain = Load.getOperand(0);
  const SDValue OldChain = OrigChain.getOperand(0);

  // Step 1: build OrigChain's new operand list, with every reference to the
  // load replaced by the load's own incoming chain.
  SmallVector<SDValue, 8> ChainOps;
  if (OldChain.getNode() == LoadNode) {
    ChainOps.push_back(LoadInChain);
  } else {
    assert(OldChain.getOpcode() == ISD::TokenFactor &&
           "Unexpected chain operand");
    const unsigned NumTFOps = OldChain.getNumOperands();
    for (unsigned Idx = 0; Idx != NumTFOps; ++Idx) {
      SDValue TFOp = OldChain.getOperand(Idx);
      ChainOps.push_back(TFOp.getNode() == LoadNode ? LoadInChain : TFOp);
    }
    SDValue NewTF =
        CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, ChainOps);
    ChainOps.clear();
    ChainOps.push_back(NewTF);
  }
  ChainOps.append(OrigChain->op_begin() + 1, OrigChain->op_end());
  CurDAG->UpdateNodeOperands(OrigChain.getNode(), ChainOps);

  // Step 2: hang the load below whatever the call was chained to.
  CurDAG->UpdateNodeOperands(LoadNode, Call.getOperand(0), Load.getOperand(1),
                             Load.getOperand(2));

  // Step 3: chain the call to the load's chain result (value #1).
  SmallVector<SDValue, 8> CallOps;
  CallOps.push_back(SDValue(LoadNode, 1));
  CallOps.append(Call->op_begin() + 1, Call->op_end());
  CurDAG->UpdateNodeOperands(Call.getNode(), CallOps);
}
/// Return true if the call address \p Callee is a load that can be moved
/// below CALLSEQ_START and the chains leading up to the call.
/// \p Chain is both input and output: on entry it is the call's chain, and it
/// is walked up to the CALLSEQ_START node when \p HasCallSeq is set.
/// For a tail call (\p HasCallSeq false) there is no callseq node between the
/// call chain and the load, so no walk takes place.
static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
  // This transformation is somewhat dangerous if the call's chain was glued
  // to the call. moveBelowOrigChain puts the load between the call and the
  // chain, which creates a cycle if the load ends up not being folded. So it
  // is *really* important to be sure the load will be folded.
  if (Callee.getNode() == Chain.getNode())
    return false;
  if (!Callee.hasOneUse())
    return false;

  // Only plain loads qualify: non-volatile, unindexed, non-extending.
  auto *LD = dyn_cast<LoadSDNode>(Callee.getNode());
  if (!LD)
    return false;
  if (LD->isVolatile() || LD->getAddressingMode() != ISD::UNINDEXED ||
      LD->getExtensionType() != ISD::NON_EXTLOAD)
    return false;

  // Climb the chain up to the callseq_start; every node on the way must have
  // a single use.
  if (HasCallSeq) {
    while (Chain.getOpcode() != ISD::CALLSEQ_START) {
      if (!Chain.hasOneUse())
        return false;
      Chain = Chain.getOperand(0);
    }
  }

  if (Chain.getNumOperands() == 0)
    return false;

  // No alias analysis is consulted here, so conservatively give up when the
  // chain node writes memory: the callee load must not move across a store.
  if (auto *Mem = dyn_cast<MemSDNode>(Chain.getNode()))
    if (Mem->writeMem())
      return false;

  // The load must feed the chain either directly ...
  const SDValue ChainIn = Chain.getOperand(0);
  if (ChainIn.getNode() == Callee.getNode())
    return true;

  // ... or through a TokenFactor that is the only user of its chain result.
  const SDValue LoadOutChain = Callee.getValue(1);
  return ChainIn.getOpcode() == ISD::TokenFactor &&
         LoadOutChain.isOperandOf(ChainIn.getNode()) &&
         LoadOutChain.hasOneUse();
}
/// Rewrite the DAG before instruction selection starts. For every node:
///  - an X86ISD::AND whose flag result is unused becomes a plain ISD::AND;
///  - vector fp-to-int, vector shifts, vector any-extends, fp rounding nodes
///    and scalar fp logic ops are replaced by the node forms the isel
///    patterns are written against;
///  - at optimization levels above None, a load feeding a call / tail-call
///    target is moved next to the call so that it can be folded;
///  - FP_ROUND/FP_EXTEND (and their STRICT variants) that are not pure SSE
///    conversions or FP-stack no-ops are expanded into a truncating store
///    plus extending load through a stack temporary.
/// Dead nodes left behind are removed at the end.
void X86DAGToDAGISel::PreprocessISelDAG() {
for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
E = CurDAG->allnodes_end(); I != E; ) {
SDNode *N = &*I++; // Advance the iterator first to avoid invalidation issues.
// If this is a target specific AND node with no flag usages, turn it back
// into ISD::AND to enable test instruction matching.
if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
N->getOperand(0), N->getOperand(1));
// Back I up while the uses are replaced, then step forward again before N
// is deleted. The same pattern is used for every replacement below.
--I;
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
++I;
CurDAG->DeleteNode(N);
continue;
}
switch (N->getOpcode()) {
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT: {
// Replace vector fp_to_s/uint with their X86 specific equivalent so we
// don't need 2 sets of patterns.
if (!N->getSimpleValueType(0).isVector())
break;
unsigned NewOpc;
switch (N->getOpcode()) {
default: llvm_unreachable("Unexpected opcode!");
case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
}
SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
N->getOperand(0));
--I;
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
++I;
CurDAG->DeleteNode(N);
continue;
}
case ISD::SHL:
case ISD::SRA:
case ISD::SRL: {
// Replace vector shifts with their X86 specific equivalent so we don't
// need 2 sets of patterns.
if (!N->getValueType(0).isVector())
break;
unsigned NewOpc;
switch (N->getOpcode()) {
default: llvm_unreachable("Unexpected opcode!");
case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
}
SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
N->getOperand(0), N->getOperand(1));
--I;
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
++I;
CurDAG->DeleteNode(N);
continue;
}
case ISD::ANY_EXTEND:
case ISD::ANY_EXTEND_VECTOR_INREG: {
// Replace vector any extend with the zero extend equivalents so we don't
// need 2 sets of patterns. Ignore vXi1 extensions.
if (!N->getValueType(0).isVector() ||
N->getOperand(0).getScalarValueSizeInBits() == 1)
break;
unsigned NewOpc = N->getOpcode() == ISD::ANY_EXTEND
? ISD::ZERO_EXTEND
: ISD::ZERO_EXTEND_VECTOR_INREG;
SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
N->getOperand(0));
--I;
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
++I;
CurDAG->DeleteNode(N);
continue;
}
case ISD::FCEIL:
case ISD::FFLOOR:
case ISD::FTRUNC:
case ISD::FNEARBYINT:
case ISD::FRINT: {
// Replace fp rounding with their X86 specific equivalent so we don't
// need 2 sets of patterns.
// Each opcode maps to the immediate passed as the second operand of the
// X86ISD::VRNDSCALE node created below.
unsigned Imm;
switch (N->getOpcode()) {
default: llvm_unreachable("Unexpected opcode!");
case ISD::FCEIL: Imm = 0xA; break;
case ISD::FFLOOR: Imm = 0x9; break;
case ISD::FTRUNC: Imm = 0xB; break;
case ISD::FNEARBYINT: Imm = 0xC; break;
case ISD::FRINT: Imm = 0x4; break;
}
SDLoc dl(N);
SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl,
N->getValueType(0),
N->getOperand(0),
CurDAG->getConstant(Imm, dl, MVT::i8));
--I;
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
++I;
CurDAG->DeleteNode(N);
continue;
}
case X86ISD::FANDN:
case X86ISD::FAND:
case X86ISD::FOR:
case X86ISD::FXOR: {
// Widen scalar fp logic ops to vector to reduce isel patterns.
// FIXME: Can we do this during lowering/combine.
MVT VT = N->getSimpleValueType(0);
if (VT.isVector() || VT == MVT::f128)
break;
MVT VecVT = VT == MVT::f64 ? MVT::v2f64 : MVT::v4f32;
SDLoc dl(N);
SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
N->getOperand(0));
SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
N->getOperand(1));
SDValue Res;
if (Subtarget->hasSSE2()) {
// With SSE2 the operation is performed on the integer vector type of the
// same shape and the result is bitcast back to the fp vector type.
EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
unsigned Opc;
switch (N->getOpcode()) {
default: llvm_unreachable("Unexpected opcode!");
case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
case X86ISD::FAND: Opc = ISD::AND; break;
case X86ISD::FOR: Opc = ISD::OR; break;
case X86ISD::FXOR: Opc = ISD::XOR; break;
}
Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
} else {
// Without SSE2 the same opcode is kept, only on the fp vector type.
Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
}
// Extract element 0 to get back to the original scalar type.
Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
CurDAG->getIntPtrConstant(0, dl));
--I;
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
++I;
CurDAG->DeleteNode(N);
continue;
}
}
if (OptLevel != CodeGenOpt::None &&
// Only do this when the target can fold the load into the call or
// jmp.
!Subtarget->useRetpolineIndirectCalls() &&
((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
(N->getOpcode() == X86ISD::TC_RETURN &&
(Subtarget->is64Bit() ||
!getTargetMachine().isPositionIndependent())))) {
/// Also try moving call address load from outside callseq_start to just
/// before the call to allow it to be folded.
///
/// [Load chain]
/// ^
/// |
/// [Load]
/// ^ ^
/// | |
/// / \--
/// / |
///[CALLSEQ_START] |
/// ^ |
/// | |
/// [LOAD/C2Reg] |
/// | |
/// \ /
/// \ /
/// [CALL]
// Only X86ISD::CALL has a callseq to walk through; TC_RETURN does not.
bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
SDValue Chain = N->getOperand(0);
SDValue Load = N->getOperand(1);
// isCalleeLoad() updates Chain to the node whose chain operand gets rewritten.
if (!isCalleeLoad(Load, Chain, HasCallSeq))
continue;
moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
++NumLoadMoved;
continue;
}
// Lower fpround and fpextend nodes that target the FP stack to be store and
// load to the stack. This is a gross hack. We would like to simply mark
// these as being illegal, but when we do that, legalize produces these when
// it expands calls, then expands these in the same legalize pass. We would
// like dag combine to be able to hack on these between the call expansion
// and the node legalization. As such this pass basically does "really
// late" legalization of these inline with the X86 isel pass.
// FIXME: This should only happen when not compiled with -O0.
switch (N->getOpcode()) {
default: continue;
case ISD::FP_ROUND:
case ISD::FP_EXTEND:
{
MVT SrcVT = N->getOperand(0).getSimpleValueType();
MVT DstVT = N->getSimpleValueType(0);
// If any of the sources are vectors, no fp stack involved.
if (SrcVT.isVector() || DstVT.isVector())
continue;
// If the source and destination are SSE registers, then this is a legal
// conversion that should not be lowered.
const X86TargetLowering *X86Lowering =
static_cast<const X86TargetLowering *>(TLI);
bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
if (SrcIsSSE && DstIsSSE)
continue;
if (!SrcIsSSE && !DstIsSSE) {
// If this is an FPStack extension, it is a noop.
if (N->getOpcode() == ISD::FP_EXTEND)
continue;
// If this is a value-preserving FPStack truncation, it is a noop.
if (N->getConstantOperandVal(1))
continue;
}
// Here we could have an FP stack truncation or an FPStack <-> SSE convert.
// FPStack has extload and truncstore. SSE can fold direct loads into other
// operations. Based on this, decide what we want to do.
MVT MemVT;
if (N->getOpcode() == ISD::FP_ROUND)
MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
else
MemVT = SrcIsSSE ? SrcVT : DstVT;
SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
SDLoc dl(N);
// FIXME: optimize the case where the src/dest is a load or store?
// The store is chained to the entry node; the load is chained to the store.
SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0),
MemTmp, MachinePointerInfo(), MemVT);
SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
MachinePointerInfo(), MemVT);
// We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
// extload we created. This will cause general havoc on the dag because
// anything below the conversion could be folded into other existing nodes.
// To avoid invalidating 'I', back it up to the convert node.
--I;
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
break;
}
// The sequence of events for lowering STRICT_FP versions of these nodes
// requires dealing with the chain differently, as there is already a
// preexisting chain. Operand 0 is that chain, so the value operands are
// shifted by one compared to the non-strict case above.
case ISD::STRICT_FP_ROUND:
case ISD::STRICT_FP_EXTEND:
{
MVT SrcVT = N->getOperand(1).getSimpleValueType();
MVT DstVT = N->getSimpleValueType(0);
// If any of the sources are vectors, no fp stack involved.
if (SrcVT.isVector() || DstVT.isVector())
continue;
// If the source and destination are SSE registers, then this is a legal
// conversion that should not be lowered.
const X86TargetLowering *X86Lowering =
static_cast<const X86TargetLowering *>(TLI);
bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
if (SrcIsSSE && DstIsSSE)
continue;
if (!SrcIsSSE && !DstIsSSE) {
// If this is an FPStack extension, it is a noop.
if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
continue;
// If this is a value-preserving FPStack truncation, it is a noop.
if (N->getConstantOperandVal(2))
continue;
}
// Here we could have an FP stack truncation or an FPStack <-> SSE convert.
// FPStack has extload and truncstore. SSE can fold direct loads into other
// operations. Based on this, decide what we want to do.
MVT MemVT;
if (N->getOpcode() == ISD::STRICT_FP_ROUND)
MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
else
MemVT = SrcIsSSE ? SrcVT : DstVT;
SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
SDLoc dl(N);
// FIXME: optimize the case where the src/dest is a load or store?
// Since the operation is StrictFP, use the preexisting chain.
SDValue Store = CurDAG->getTruncStore(N->getOperand(0), dl, N->getOperand(1),
MemTmp, MachinePointerInfo(), MemVT);
SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
MachinePointerInfo(), MemVT);
// We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
// extload we created. This will cause general havoc on the dag because
// anything below the conversion could be folded into other existing nodes.
// To avoid invalidating 'I', back it up to the convert node.
--I;
// Unlike the non-strict case, the whole node is replaced (not just value 0).
CurDAG->ReplaceAllUsesWith(N, Result.getNode());
break;
}
}
// Now that we did that, the node is dead. Increment the iterator to the
// next node to process, then delete N.
++I;
CurDAG->DeleteNode(N);
}
// The load+call transform above can leave some dead nodes in the graph. Make
// sure we remove them. It's possible some of the other transforms do too, so
// just remove dead nodes unconditionally.
CurDAG->RemoveDeadNodes();
}
// Look for a redundant movzx/movsx that can occur after an 8-bit divrem and
// replace it. Returns true if N was replaced.
bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
  // Map the outer extend to the inner (NOREX) extend we expect to find below
  // it. Any other opcode is of no interest.
  const unsigned OuterOpc = N->getMachineOpcode();
  unsigned InnerExtOpc;
  switch (OuterOpc) {
  default:
    return false;
  case X86::MOVZX32rr8:
    InnerExtOpc = X86::MOVZX32rr8_NOREX;
    break;
  case X86::MOVSX32rr8:
  case X86::MOVSX64rr8:
    InnerExtOpc = X86::MOVSX32rr8_NOREX;
    break;
  }

  // The extended value has to be the 8-bit subregister extract ...
  SDValue Extract = N->getOperand(0);
  if (!Extract.isMachineOpcode())
    return false;
  if (Extract.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
    return false;
  if (Extract.getConstantOperandVal(1) != X86::sub_8bit)
    return false;

  // ... of an extend that matches the outer one.
  SDValue InnerExt = Extract.getOperand(0);
  if (!InnerExt.isMachineOpcode() ||
      InnerExt.getMachineOpcode() != InnerExtOpc)
    return false;

  // For 32-bit results the outer extend can simply be dropped in favour of
  // the inner one. A sign extend from 8 to 64 bits still has to go from 32 to
  // 64 bits.
  SDNode *Replacement = InnerExt.getNode();
  if (OuterOpc == X86::MOVSX64rr8)
    Replacement = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N), MVT::i64,
                                         InnerExt);
  ReplaceUses(N, Replacement);
  return true;
}
void X86DAGToDAGISel::PostprocessISelDAG() {
// Skip peepholes at -O0.
if (TM.getOptLevel() == CodeGenOpt::None)
return;
SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
bool MadeChange = false;
while (Position != CurDAG->allnodes_begin()) {
SDNode *N = &*--Position;
// Skip dead nodes and any non-machine opcodes.
if (N->use_empty() || !N->isMachineOpcode())
continue;
if (tryOptimizeRem8Extend(N)) {
MadeChange = true;
continue;
}
// Look for a TESTrr+ANDrr pattern where both operands of the test are
// the same. Rewrite to remove the AND.
unsigned Opc = N->getMachineOpcode();
if ((Opc == X86::TEST8rr || Opc == X86::TEST16rr ||
Opc == X86::TEST32rr || Opc == X86::TEST64rr) &&
N->getOperand(0) == N->getOperand(1) &&
N->isOnlyUserOf(N->getOperand(0).getNode()) &&
N->getOperand(0).isMachineOpcode()) {
SDValue And = N->getOperand(0);
unsigned N0Opc = And.getMachineOpcode();
if (N0Opc == X86::AND8rr || N0Opc == X86::AND16rr ||
N0Opc == X86::AND32rr || N0Opc == X86::AND64rr) {
MachineSDNode *Test = CurDAG->getMachineNode(Opc, SDLoc(N),
MVT::i32,
And.getOperand(0),
And.getOperand(1));
ReplaceUses(N, Test);
MadeChange = true;
continue;
}
if (N0Opc == X86::AND8rm || N0Opc == X86::AND16rm ||
N0Opc == X86::AND32rm || N0Opc == X86::AND64rm) {
unsigned NewOpc;
switch (N0Opc) {
case X86::AND8rm: NewOpc = X86::TEST8mr; break;
case X86::AND16rm: NewOpc = X86::TEST16mr; break;
case X86::AND32rm: NewOpc = X86::TEST32mr; break;
case X86::AND64rm: NewOpc = X86::TEST64mr; break;
}
// Need to swap the memory and register operand.
SDValue Ops[] = { And.getOperand(1),
And.getOperand(2),
And.getOperand(3),
And.getOperand(4),
And.getOperand(5),
And.getOperand(0),
And.getOperand(6) /* Chain */ };
MachineSDNode *Test = CurDAG->getMachineNode(NewOpc, SDLoc(N),
MVT::i32, MVT::Other, Ops);
ReplaceUses(N, Test);
MadeChange = true;
continue;
}
}
// Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
// used. We're doing this late so we can prefer to fold the AND into masked
// comparisons. Doing that can be better for the live range of the mask
// register.
if ((Opc == X86::KORTESTBrr || Opc == X86::KORTESTWrr ||
Opc == X86::KORTESTDrr || Opc == X86::KORTESTQrr) &&
N->getOperand(0) == N->getOperand(1) &&
N->isOnlyUserOf(N->getOperand(0).getNode()) &&
N->getOperand(0).isMachineOpcode() &&
onlyUsesZeroFlag(SDValue(N, 0))) {
SDValue And = N->getOperand(0);
unsigned N0Opc = And.getMachineOpcode();
// KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
// KAND instructions and KTEST use the same ISA feature.
if (N0Opc == X86::KANDBrr ||
(N0Opc == X86::KANDWrr && Subtarget->hasDQI()) ||
N0Opc == X86::KANDDrr || N0Opc == X86::KANDQrr) {
unsigned NewOpc;
switch (Opc) {
default: llvm_unreachable("Unexpected opcode!");
case X86::KORTESTBrr: NewOpc = X86::KTESTBrr; break;
case X86::KORTESTWrr: NewOpc = X86::KTESTWrr; break;
case X86::KORTESTDrr: NewOpc = X86::KTESTDrr; break;
case X86::KORTESTQrr: NewOpc = X86::KTESTQrr; break;
}
MachineSDNode *KTest = CurDAG->getMachineNode(NewOpc, SDLoc(N),
MVT::i32,
And.getOperand(0),
And.getOperand(1));
ReplaceUses(N, KTest);
MadeChange = true;
continue;
}
}
// Attempt to remove vectors moves that were inserted to zero upper bits.
if (Opc != TargetOpcode::SUBREG_TO_REG)
continue;
unsigned SubRegIdx = N->getConstantOperandVal(2);
if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
continue;
SDValue Move = N->getOperand(1);
if (!Move.isMachineOpcode())
continue;
// Make sure its one of the move opcodes we recognize.
switch (Move.getMachineOpcode()) {
default:
continue;
case X86::VMOVAPDrr: case X86::VMOVUPDrr:
case X86::VMOVAPSrr: case X86::VMOVUPSrr:
case X86::VMOVDQArr: case X86::VMOVDQUrr:
case X86::VMOVAPDYrr: case X86::VMOVUPDYrr:
case X86::VMOVAPSYrr: case X86::VMOVUPSYrr:
case X86::VMOVDQAYrr: case X86::VMOVDQUYrr:
case X86::VMOVAPDZ128rr: case X86::VMOVUPDZ128rr:
case X86::VMOVAPSZ128rr: case X86::VMOVUPSZ128rr:
case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr:
case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr:
case X86::VMOVAPDZ256rr: case X86::VMOVUPDZ256rr:
case X86::VMOVAPSZ256rr: case X86::VMOVUPSZ256rr:
case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr:
case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr:
break;
}
SDValue In = Move.getOperand(0);
if (!In.isMachineOpcode() ||
In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
continue;
// Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers
// the SHA instructions which use a legacy encoding.
uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
(TSFlags & X86II::EncodingMask) != X86II::EVEX &&
(TSFlags & X86II::EncodingMask) != X86II::XOP)
continue;
// Producing instruction is another vector instruction. We can drop the
// move.
CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
MadeChange = true;
}
if (MadeChange)
CurDAG->RemoveDeadNodes();
}
/// Emit any code that needs to be executed only in the main function.
void X86DAGToDAGISel::emitSpecialCodeForMain() {
if (Subtarget->isTargetCygMing()) {
TargetLowering::ArgListTy Args;
auto &DL = CurDAG->getDataLayout();
TargetLowering::CallLoweringInfo CLI(*CurDAG);
CLI.setChain(CurDAG->getRoot())
.setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
std::move(Args));
const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
CurDAG->setRoot(Result.second);
}
}
void X86DAGToDAGISel::EmitFunctionEntryCode() {
// If this is main, emit special code for main.
const Function &F = MF->getFunction();
if (F.hasExternalLinkage() && F.getName() == "main")
emitSpecialCodeForMain();
}
/// Return true if \p Val is a displacement that may be combined with a frame
/// index, i.e. if it fits in a signed 31-bit integer.
///
/// On 64-bit platforms a frame index carries its own displacement, and the
/// sum of that and the explicit displacement could overflow the displacement
/// field. Assuming the frame index displacement itself fits in 31 bits (only
/// slightly stricter than the existing fundamental assumption that it fits in
/// 32 bits), an explicit 31-bit displacement should always be safe.
static bool isDispSafeForFrameIndex(int64_t Val) {
  constexpr unsigned SafeDispBits = 31;
  return isInt<SafeDispBits>(Val);
}
bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
X86ISelAddressMode &AM) {
// If there's no offset to fold, we don't need to do any work.
if (Offset == 0)
return false;
// Cannot combine ExternalSymbol displacements with integer offsets.
if (AM.ES || AM.MCSym)
return true;
int64_t Val = AM.Disp + Offset;
CodeModel::Model M = TM.getCodeModel();
if (Subtarget->is64Bit()) {
if (!X86::isOffsetSuitableForCodeModel(Val, M,
AM.hasSymbolicDisplacement()))
return true;
// In addition to the checks required for a register base, check that
// we do not try to use an unsafe Disp with a frame index.
if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
!isDispSafeForFrameIndex(Val))
return true;
}
AM.Disp = Val;
return false;
}
/// Try to turn a load that appears inside an address into a segment override.
///
///   load gs:0 -> GS segment register.
///   load fs:0 -> FS segment register.
///
/// This is valid because the GNU TLS model defines that gs:0 (or fs:0 on
/// X86-64) contains its own address. For more information see
/// http://people.redhat.com/drepper/tls.pdf
///
/// Returns false if AM.Segment was set, true if the load was not matched.
bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
  SDValue Address = N->getOperand(1);

  ConstantSDNode *AddrC = dyn_cast<ConstantSDNode>(Address);
  if (!AddrC)
    return true;

  // Only a load from constant address 0 qualifies, and only while no segment
  // has been chosen yet and indirect TLS segment references are not requested.
  if (AddrC->getSExtValue() != 0 || AM.Segment.getNode() != nullptr ||
      IndirectTlsSegRefs)
    return true;

  const bool TargetQualifies = Subtarget->isTargetGlibc() ||
                               Subtarget->isTargetAndroid() ||
                               Subtarget->isTargetFuchsia();
  if (!TargetQualifies)
    return true;

  const unsigned AddrSpace = N->getPointerInfo().getAddrSpace();
  if (AddrSpace == 256) {
    AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
    return false;
  }
  if (AddrSpace == 257) {
    AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
    return false;
  }

  // Address space 258 is not handled here, because it is not used to
  // address TLS areas.
  return true;
}
/// Fold an X86ISD::Wrapper or X86ISD::WrapperRIP node, i.e. something that
/// will resolve to a symbol reference, into the addressing mode \p AM.
/// Returns false when the node was folded and true when no match is possible;
/// AM is restored to its incoming state if the symbol's offset cannot be
/// folded.
bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
  // Once the displacement is symbolic, a second symbol can never be matched.
  if (AM.hasSymbolicDisplacement())
    return true;

  const bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
  const bool IsRIPRelTLS =
      IsRIPRel && N.getOperand(0).getOpcode() == ISD::TargetGlobalTLSAddress;

  // An addressing mode cannot be used in the 64-bit large code model, with
  // global TLS addressing as the exception. In the medium code model a mode
  // can be used when RIP wrappers are present: that signifies access to
  // globals that are known to be "near", such as the GOT itself.
  CodeModel::Model Model = TM.getCodeModel();
  if (Subtarget->is64Bit()) {
    if (Model == CodeModel::Large && !IsRIPRelTLS)
      return true;
    if (Model == CodeModel::Medium && !IsRIPRel)
      return true;
  }

  // Using %rip as the base requires both base and index registers to be unset.
  if (IsRIPRel && AM.hasBaseOrIndexReg())
    return true;

  // Keep a copy so the mode can be restored if the fold has to be abandoned.
  X86ISelAddressMode Saved = AM;

  int64_t SymOffset = 0;
  SDValue Sym = N.getOperand(0);
  if (auto *GA = dyn_cast<GlobalAddressSDNode>(Sym)) {
    AM.GV = GA->getGlobal();
    AM.SymbolFlags = GA->getTargetFlags();
    SymOffset = GA->getOffset();
  } else if (auto *Pool = dyn_cast<ConstantPoolSDNode>(Sym)) {
    AM.CP = Pool->getConstVal();
    AM.Align = Pool->getAlignment();
    AM.SymbolFlags = Pool->getTargetFlags();
    SymOffset = Pool->getOffset();
  } else if (auto *Ext = dyn_cast<ExternalSymbolSDNode>(Sym)) {
    AM.ES = Ext->getSymbol();
    AM.SymbolFlags = Ext->getTargetFlags();
  } else if (auto *MCS = dyn_cast<MCSymbolSDNode>(Sym)) {
    AM.MCSym = MCS->getMCSymbol();
  } else if (auto *Table = dyn_cast<JumpTableSDNode>(Sym)) {
    AM.JT = Table->getIndex();
    AM.SymbolFlags = Table->getTargetFlags();
  } else if (auto *Block = dyn_cast<BlockAddressSDNode>(Sym)) {
    AM.BlockAddr = Block->getBlockAddress();
    AM.SymbolFlags = Block->getTargetFlags();
    SymOffset = Block->getOffset();
  } else {
    llvm_unreachable("Unhandled symbol reference node.");
  }

  if (foldOffsetIntoAddress(SymOffset, AM)) {
    AM = Saved;
    return true;
  }

  // The fold is known to be safe at this point; finish it.
  if (IsRIPRel)
    AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));

  return false;
}
/// Pattern-match node \p N into the addressing mode \p AM. Returns true if
/// that cannot be done. On success two post-processing steps are applied to
/// the resulting mode.
bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
  if (matchAddressRecursively(N, AM, 0))
    return true;

  // True while the mode uses a register base whose register is still unset.
  auto BaseRegIsUnset = [&AM]() {
    return AM.BaseType == X86ISelAddressMode::RegBase &&
           AM.Base_Reg.getNode() == nullptr;
  };

  // Post-processing: lea(,%reg,2) becomes lea(%reg,%reg), which has a smaller
  // encoding and avoids a scaled index.
  if (AM.Scale == 2 && BaseRegIsUnset()) {
    AM.Base_Reg = AM.IndexReg;
    AM.Scale = 1;
  }

  // Post-processing: foo becomes foo(%rip), even in non-PIC mode, because it
  // has a smaller encoding.
  // TODO: Which other code models can use this?
  const CodeModel::Model Model = TM.getCodeModel();
  const bool SmallOrKernel =
      Model == CodeModel::Small || Model == CodeModel::Kernel;
  if (!SmallOrKernel)
    return false;

  const bool IsBareSymbol = Subtarget->is64Bit() &&
                            AM.Scale == 1 &&
                            BaseRegIsUnset() &&
                            AM.IndexReg.getNode() == nullptr &&
                            AM.SymbolFlags == X86II::MO_NO_FLAG &&
                            AM.hasSymbolicDisplacement();
  if (IsBareSymbol)
    AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);

  return false;
}
/// Try to fold the two-operand node \p N (treated as an add) into \p AM.
///
/// Both operand orders are tried; if neither folds completely, the two
/// operands are placed in the base and index registers when both are still
/// free. Returns false on success and true on failure. \p N is refreshed from
/// a handle on the fallback paths, since the recursive matching may have
/// replaced the node.
bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
                               unsigned Depth) {
  // Hold an artificial use of the node so it can still be found if it gets
  // CSE'd with a different node.
  HandleSDNode Handle(N);

  X86ISelAddressMode Saved = AM;
  const unsigned NextDepth = Depth + 1;

  // First attempt: operand 0, then operand 1.
  if (!(matchAddressRecursively(N.getOperand(0), AM, NextDepth) ||
        matchAddressRecursively(Handle.getValue().getOperand(1), AM,
                                NextDepth)))
    return false;
  AM = Saved;

  // Second attempt with the operands commuted.
  if (!(matchAddressRecursively(Handle.getValue().getOperand(1), AM,
                                NextDepth) ||
        matchAddressRecursively(Handle.getValue().getOperand(0), AM,
                                NextDepth)))
    return false;
  AM = Saved;

  N = Handle.getValue();

  // Both operands could not be folded at once. If base and index are both
  // free, put one operand in each so that at least the add itself is folded.
  const bool BaseAndIndexFree = AM.BaseType == X86ISelAddressMode::RegBase &&
                                !AM.Base_Reg.getNode() &&
                                !AM.IndexReg.getNode();
  if (!BaseAndIndexFree)
    return true;

  AM.Base_Reg = N.getOperand(0);
  AM.IndexReg = N.getOperand(1);
  AM.Scale = 1;
  return false;
}
// Make sure node N sits at least before Pos in the DAG's node ordering.
//
// N is repositioned in front of Pos when it has no node ID yet (-1) or when
// its uninvalidated ID is greater than that of Pos; it then receives a node ID
// that is <= the ID of Pos. Note that this does *not* preserve the uniqueness
// of node IDs! The selection DAG must no longer depend on their uniqueness
// once this has been used.
static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
  const bool NeedsMove =
      N->getNodeId() == -1 ||
      SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
          SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode());
  if (!NeedsMove)
    return;

  DAG.RepositionNode(Pos->getIterator(), N.getNode());

  // After the move N may be a successor of a selected node while otherwise
  // occupying the same position as Pos, so mark it invalid for pruning.
  // Conservatively give it the same -abs(Id) so the node id invariant is
  // preserved.
  N->setNodeId(Pos->getNodeId());
  SelectionDAGISel::InvalidateNodeId(N.getNode());
}
// Rewrite "(X >> (8-C1)) & (0xff << C1)" as "((X >> 8) & 0xff) << C1" when it
// is safe to do so. The shift-and-mask can then become an h-register extract
// and the outer shift a scaled index. Returns false if the rewrite was
// performed, true otherwise.
static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
                                      uint64_t Mask,
                                      SDValue Shift, SDValue X,
                                      X86ISelAddressMode &AM) {
  // Only a single-use logical right shift by a constant amount qualifies.
  if (Shift.getOpcode() != ISD::SRL)
    return true;
  if (!isa<ConstantSDNode>(Shift.getOperand(1)))
    return true;
  if (!Shift.hasOneUse())
    return true;

  // C1 must be 1, 2 or 3 and the mask must be exactly 0xff << C1.
  int ScaleLog = 8 - Shift.getConstantOperandVal(1);
  const bool ScaleInRange = ScaleLog > 0 && ScaleLog < 4;
  if (!ScaleInRange || Mask != (0xffu << ScaleLog))
    return true;

  MVT VT = N.getSimpleValueType();
  SDLoc DL(N);
  SDValue SrlAmt = DAG.getConstant(8, DL, MVT::i8);
  SDValue ByteMask = DAG.getConstant(0xff, DL, VT);
  SDValue NewSrl = DAG.getNode(ISD::SRL, DL, VT, X, SrlAmt);
  SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, NewSrl, ByteMask);
  SDValue ShlAmt = DAG.getConstant(ScaleLog, DL, MVT::i8);
  SDValue NewShl = DAG.getNode(ISD::SHL, DL, VT, NewAnd, ShlAmt);

  // The new nodes have to enter the topological ordering in a valid order,
  // because nothing will go back and re-sort them. Inserting each one before
  // 'N', operands first, works since the sequence is already flattened and
  // sorted; there is no hierarchy left to express.
  insertDAGNode(DAG, N, SrlAmt);
  insertDAGNode(DAG, N, NewSrl);
  insertDAGNode(DAG, N, ByteMask);
  insertDAGNode(DAG, N, NewAnd);
  insertDAGNode(DAG, N, ShlAmt);
  insertDAGNode(DAG, N, NewShl);
  DAG.ReplaceAllUsesWith(N, NewShl);
  DAG.RemoveDeadNode(N.getNode());

  AM.IndexReg = NewAnd;
  AM.Scale = (1 << ScaleLog);
  return false;
}
// Rewrite "(X << C1) & C2" as "(X & (C2>>C1)) << C1" when that is safe and
// lets the shift be folded into the addressing mode as a scale. Returns false
// if the transform succeeded, true otherwise.
static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
                                        X86ISelAddressMode &AM) {
  SDValue ShiftOp = N.getOperand(0);

  // The mask is read as a signed value so that the right shift below inserts
  // sign bits. Those bits are removed again by the left shift of the result,
  // so their value does not matter, and this might allow a smaller immediate
  // encoding.
  int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();

  // If an any_extend feeds the AND, look through it for a shift behind it,
  // but only if the AND does not use the extended bits.
  // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
  bool LookedThroughExt = false;
  if (ShiftOp.getOpcode() == ISD::ANY_EXTEND && ShiftOp.hasOneUse() &&
      ShiftOp.getOperand(0).getSimpleValueType() == MVT::i32 &&
      isUInt<32>(Mask)) {
    LookedThroughExt = true;
    ShiftOp = ShiftOp.getOperand(0);
  }

  if (ShiftOp.getOpcode() != ISD::SHL)
    return true;
  if (!isa<ConstantSDNode>(ShiftOp.getOperand(1)))
    return true;

  SDValue Src = ShiftOp.getOperand(0);

  // Unlikely to be profitable if either the AND or the SHIFT node has more
  // than one use (unless all uses are for address computation). Besides, the
  // isel mechanism requires their node ids to be reused.
  if (!N.hasOneUse() || !ShiftOp.hasOneUse())
    return true;

  // The shift amount must be expressible as a scale: 1, 2 or 3.
  unsigned ShiftAmt = ShiftOp.getConstantOperandVal(1);
  if (ShiftAmt < 1 || ShiftAmt > 3)
    return true;

  MVT VT = N.getSimpleValueType();
  SDLoc DL(N);
  if (LookedThroughExt) {
    SDValue Extended = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Src);
    insertDAGNode(DAG, N, Extended);
    Src = Extended;
  }

  SDValue ShiftedMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT);
  SDValue MaskedSrc = DAG.getNode(ISD::AND, DL, VT, Src, ShiftedMask);
  SDValue Rebuilt =
      DAG.getNode(ISD::SHL, DL, VT, MaskedSrc, ShiftOp.getOperand(1));

  // The new nodes have to enter the topological ordering in a valid order,
  // because nothing will go back and re-sort them. Inserting each one before
  // 'N', operands first, works since the sequence is already flattened and
  // sorted; there is no hierarchy left to express.
  insertDAGNode(DAG, N, ShiftedMask);
  insertDAGNode(DAG, N, MaskedSrc);
  insertDAGNode(DAG, N, Rebuilt);
  DAG.ReplaceAllUsesWith(N, Rebuilt);
  DAG.RemoveDeadNode(N.getNode());

  AM.Scale = 1 << ShiftAmt;
  AM.IndexReg = MaskedSrc;
  return false;
}
// Implement some heroics to detect shifts of masked values where the mask can
// be replaced by extending the shift and undoing that in the addressing mode
// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
// the addressing mode. For source code such as:
//
//   int f(short *y, int *lookup_table) {
//     ...
//     return *y + lookup_table[*y >> 11];
//   }
//
// this fold lets us emit:
//   movzwl (%rdi), %eax
//   movl %eax, %ecx
//   shrl $11, %ecx
//   addl (%rsi,%rcx,4), %eax
//
// Instead of:
//   movzwl (%rdi), %eax
//   movl %eax, %ecx
//   shrl $9, %ecx
//   andl $124, %rcx
//   addl (%rsi,%rcx), %eax
//
// Note that this function assumes the mask is provided as a mask *after* the
// value is shifted. The input chain may or may not match that, but computing
// such a mask is trivial.
//
// Returns false if the fold was performed (N is replaced and AM.IndexReg /
// AM.Scale are set), true if nothing was changed.
static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
                                    uint64_t Mask,
                                    SDValue Shift, SDValue X,
                                    X86ISelAddressMode &AM) {
  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
      !isa<ConstantSDNode>(Shift.getOperand(1)))
    return true;

  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
  unsigned MaskLZ = countLeadingZeros(Mask);
  unsigned MaskTZ = countTrailingZeros(Mask);

  // The amount of shift we're trying to fit into the addressing mode is taken
  // from the trailing zeros of the mask.
  unsigned AMShiftAmt = MaskTZ;

  // There is nothing we can do here unless the mask is removing some bits.
  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
  if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;

  // We also need to ensure that mask is a continuous run of bits.
  if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true;

  // Scale the leading zero count down based on the actual size of the value.
  // Also scale it down based on the size of the shift.
  unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
  if (MaskLZ < ScaleDown)
    return true;
  MaskLZ -= ScaleDown;

  // The final check is to ensure that any masked out high bits of X are
  // already known to be zero. Otherwise, the mask has a semantic impact
  // other than masking out a couple of low bits. Unfortunately, because of
  // the mask, zero extensions will be removed from operands in some cases.
  // This code works extra hard to look through extensions because we can
  // replace them with zero extensions cheaply if necessary.
  bool ReplacingAnyExtend = false;
  if (X.getOpcode() == ISD::ANY_EXTEND) {
    unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
                          X.getOperand(0).getSimpleValueType().getSizeInBits();
    // Assume that we'll replace the any-extend with a zero-extend, and
    // narrow the search to the extended value.
    X = X.getOperand(0);
    MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
    ReplacingAnyExtend = true;
  }
  APInt MaskedHighBits =
    APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
  // Only the masked-out high bits have to be known zero. Requiring the whole
  // known-zero set to *equal* MaskedHighBits would reject the fold whenever
  // any other bit of X happens to be known zero (and, for MaskLZ == 0,
  // whenever anything at all is known about X), although the transform is
  // just as valid then.
  if (!DAG.MaskedValueIsZero(X, MaskedHighBits))
    return true;

  // We've identified a pattern that can be transformed into a single shift
  // and an addressing mode. Make it so.
  MVT VT = N.getSimpleValueType();
  if (ReplacingAnyExtend) {
    assert(X.getValueType() != VT);
    // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
    SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
    insertDAGNode(DAG, N, NewX);
    X = NewX;
  }
  SDLoc DL(N);
  SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
  SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
  SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
  SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt);

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, N, NewSRLAmt);
  insertDAGNode(DAG, N, NewSRL);
  insertDAGNode(DAG, N, NewSHLAmt);
  insertDAGNode(DAG, N, NewSHL);
  DAG.ReplaceAllUsesWith(N, NewSHL);
  DAG.RemoveDeadNode(N.getNode());

  AM.Scale = 1 << AMShiftAmt;
  AM.IndexReg = NewSRL;
  return false;
}
// Rewrite "(X >> SHIFT) & (MASK << C1)" as
// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything below the final SHL is
// expected to be matched to a BEXTR later, and the SHL becomes the addressing
// mode scale. Returns false if the rewrite was performed, true otherwise.
static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
                                   uint64_t Mask,
                                   SDValue Shift, SDValue X,
                                   X86ISelAddressMode &AM,
                                   const X86Subtarget &Subtarget) {
  // Needs a logical right shift by a constant, with both the shift and the
  // AND being single-use.
  if (Shift.getOpcode() != ISD::SRL)
    return true;
  if (!isa<ConstantSDNode>(Shift.getOperand(1)))
    return true;
  if (!Shift.hasOneUse() || !N.hasOneUse())
    return true;

  // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
  const bool BEXTRUsable =
      Subtarget.hasTBM() || (Subtarget.hasBMI() && Subtarget.hasFastBEXTR());
  if (!BEXTRUsable)
    return true;

  // The mask has to be one continuous run of set bits.
  if (!isShiftedMask_64(Mask))
    return true;

  unsigned ShiftAmt = Shift.getConstantOperandVal(1);

  // The shift to move into the addressing mode is the number of trailing
  // zeros of the mask.
  unsigned ScaleShift = countTrailingZeros(Mask);

  // Nothing to gain unless the mask removes some low bits, and the addressing
  // mode can only represent shifts of 1, 2, or 3 bits.
  if (ScaleShift == 0 || ScaleShift > 3)
    return true;

  MVT VT = N.getSimpleValueType();
  SDLoc DL(N);
  SDValue SrlAmt = DAG.getConstant(ShiftAmt + ScaleShift, DL, MVT::i8);
  SDValue WideSrl = DAG.getNode(ISD::SRL, DL, VT, X, SrlAmt);
  SDValue LowMask = DAG.getConstant(Mask >> ScaleShift, DL, VT);
  SDValue Extract = DAG.getNode(ISD::AND, DL, VT, WideSrl, LowMask);
  SDValue ShlAmt = DAG.getConstant(ScaleShift, DL, MVT::i8);
  SDValue Rebuilt = DAG.getNode(ISD::SHL, DL, VT, Extract, ShlAmt);

  // The new nodes have to enter the topological ordering in a valid order,
  // because nothing will go back and re-sort them. Inserting each one before
  // 'N', operands first, works since the sequence is already flattened and
  // sorted; there is no hierarchy left to express.
  insertDAGNode(DAG, N, SrlAmt);
  insertDAGNode(DAG, N, WideSrl);
  insertDAGNode(DAG, N, LowMask);
  insertDAGNode(DAG, N, Extract);
  insertDAGNode(DAG, N, ShlAmt);
  insertDAGNode(DAG, N, Rebuilt);
  DAG.ReplaceAllUsesWith(N, Rebuilt);
  DAG.RemoveDeadNode(N.getNode());

  AM.Scale = 1 << ScaleShift;
  AM.IndexReg = Extract;
  return false;
}
/// Recursively try to fold the expression rooted at N into the addressing
/// mode AM. Returns false when N was folded into AM and true when it could
/// not be. Each opcode case below attempts a specific fold; when none applies
/// (or the case breaks out), N is handed to matchAddressBase as a plain
/// register operand. Depth is the current recursion level; once it exceeds 5
/// no pattern matching is attempted and N goes straight to matchAddressBase.
bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
unsigned Depth) {
SDLoc dl(N);
LLVM_DEBUG({
dbgs() << "MatchAddress: ";
AM.dump(CurDAG);
});
// Limit recursion.
if (Depth > 5)
return matchAddressBase(N, AM);
// If this is already a %rip relative address, we can only merge immediates
// into it. Instead of handling this in every case, we handle it here.
// RIP relative addressing: %rip + 32-bit displacement!
if (AM.isRIPRelative()) {
// FIXME: JumpTable and ExternalSymbol address currently don't like
// displacements. It isn't very important, but this should be fixed for
// consistency.
if (!(AM.ES || AM.MCSym) && AM.JT != -1)
return true;
// Only a constant can still be merged, and only if the offset fold accepts
// it; anything else fails the match.
if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N))
if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
return false;
return true;
}
switch (N.getOpcode()) {
default: break;
case ISD::LOCAL_RECOVER: {
// Only usable while the mode has neither a symbolic nor a numeric
// displacement yet.
if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
// Use the symbol and don't prefix it.
AM.MCSym = ESNode->getMCSymbol();
return false;
}
break;
}
case ISD::Constant: {
// A plain constant is folded into the displacement if possible.
uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
if (!foldOffsetIntoAddress(Val, AM))
return false;
break;
}
case X86ISD::Wrapper:
case X86ISD::WrapperRIP:
// Symbol reference wrappers are delegated to matchWrapper.
if (!matchWrapper(N, AM))
return false;
break;
case ISD::LOAD:
// Loads are delegated to matchLoadInAddress.
if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
return false;
break;
case ISD::FrameIndex:
// Use the frame index as the base if no base register is set yet and, on
// 64-bit targets, the current displacement is safe to pair with it.
if (AM.BaseType == X86ISelAddressMode::RegBase &&
AM.Base_Reg.getNode() == nullptr &&
(!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
AM.BaseType = X86ISelAddressMode::FrameIndexBase;
AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
return false;
}
break;
case ISD::SHL:
// A left shift can only become the scale if index and scale are unused.
if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
break;
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
unsigned Val = CN->getZExtValue();
// Note that we handle x<<1 as (,x,2) rather than (x,x) here so
// that the base operand remains free for further matching. If
// the base doesn't end up getting used, a post-processing step
// in MatchAddress turns (,x,2) into (x,x), which is cheaper.
if (Val == 1 || Val == 2 || Val == 3) {
AM.Scale = 1 << Val;
SDValue ShVal = N.getOperand(0);
// Okay, we know that we have a scale by now. However, if the scaled
// value is an add of something and a constant, we can fold the
// constant into the disp field here.
if (CurDAG->isBaseWithConstantOffset(ShVal)) {
AM.IndexReg = ShVal.getOperand(0);
ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getOperand(1));
uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
if (!foldOffsetIntoAddress(Disp, AM))
return false;
}
// The constant could not be folded (or there was none): index the whole
// shifted value instead.
AM.IndexReg = ShVal;
return false;
}
}
break;
case ISD::SRL: {
// Scale must not be used already.
if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
// We only handle up to 64-bit values here as those are what matter for
// addressing mode optimizations.
assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
"Unexpected value size!");
SDValue And = N.getOperand(0);
if (And.getOpcode() != ISD::AND) break;
SDValue X = And.getOperand(0);
// The mask used for the transform is expected to be post-shift, but we
// found the shift first so just apply the shift to the mask before passing
// it down.
if (!isa<ConstantSDNode>(N.getOperand(1)) ||
!isa<ConstantSDNode>(And.getOperand(1)))
break;
uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
// Try to fold the mask and shift into the scale, and return false if we
// succeed.
if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
return false;
break;
}
case ISD::SMUL_LOHI:
case ISD::UMUL_LOHI:
// A mul_lohi where we need the low part can be folded as a plain multiply.
if (N.getResNo() != 0) break;
LLVM_FALLTHROUGH;
case ISD::MUL:
case X86ISD::MUL_IMM:
// X*[3,5,9] -> X+X*[2,4,8]
if (AM.BaseType == X86ISelAddressMode::RegBase &&
AM.Base_Reg.getNode() == nullptr &&
AM.IndexReg.getNode() == nullptr) {
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
CN->getZExtValue() == 9) {
AM.Scale = unsigned(CN->getZExtValue())-1;
SDValue MulVal = N.getOperand(0);
SDValue Reg;
// Okay, we know that we have a scale by now. However, if the scaled
// value is an add of something and a constant, we can fold the
// constant into the disp field here.
if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
isa<ConstantSDNode>(MulVal.getOperand(1))) {
Reg = MulVal.getOperand(0);
ConstantSDNode *AddVal =
cast<ConstantSDNode>(MulVal.getOperand(1));
uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
// If the scaled constant does not fit, fall back to the whole multiplied
// value as the register.
if (foldOffsetIntoAddress(Disp, AM))
Reg = N.getOperand(0);
} else {
Reg = N.getOperand(0);
}
// The same register serves as both base and index: X + X*Scale.
AM.IndexReg = AM.Base_Reg = Reg;
return false;
}
}
break;
case ISD::SUB: {
// Given A-B, if A can be completely folded into the address and
// the index field with the index field unused, use -B as the index.
// This is a win if a has multiple parts that can be folded into
// the address. Also, this saves a mov if the base register has
// other uses, since it avoids a two-address sub instruction, however
// it costs an additional mov if the index register has other uses.
// Add an artificial use to this node so that we can keep track of
// it if it gets CSE'd with a different node.
HandleSDNode Handle(N);
// Test if the LHS of the sub can be folded.
X86ISelAddressMode Backup = AM;
if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
N = Handle.getValue();
AM = Backup;
break;
}
N = Handle.getValue();
// Test if the index field is free for use.
if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
AM = Backup;
break;
}
// Cost heuristic: positive contributions argue against the transform,
// negative ones for it; it is applied only if the total ends up negative.
int Cost = 0;
SDValue RHS = N.getOperand(1);
// If the RHS involves a register with multiple uses, this
// transformation incurs an extra mov, due to the neg instruction
// clobbering its operand.
if (!RHS.getNode()->hasOneUse() ||
RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
(RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
RHS.getOperand(0).getValueType() == MVT::i32))
++Cost;
// If the base is a register with multiple uses, this
// transformation may save a mov.
if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
!AM.Base_Reg.getNode()->hasOneUse()) ||
AM.BaseType == X86ISelAddressMode::FrameIndexBase)
--Cost;
// If the folded LHS was interesting, this transformation saves
// address arithmetic.
// ("Interesting" here means at least two of: a new symbolic displacement,
// a new non-zero displacement, a new segment.)
if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
((AM.Disp != 0) && (Backup.Disp == 0)) +
(AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
--Cost;
// If it doesn't look like it may be an overall win, don't do it.
if (Cost >= 0) {
AM = Backup;
break;
}
// Ok, the transformation is legal and appears profitable. Go for it.
// Negation will be emitted later to avoid creating dangling nodes if this
// was an unprofitable LEA.
AM.IndexReg = RHS;
AM.NegateIndex = true;
AM.Scale = 1;
return false;
}
case ISD::ADD:
// Adds are delegated to matchAdd, which tries both operand orders.
if (!matchAdd(N, AM, Depth))
return false;
break;
case ISD::OR:
// We want to look through a transform in InstCombine and DAGCombiner that
// turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'.
// Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3))
// An 'lea' can then be used to match the shift (multiply) and add:
// and $1, %esi
// lea (%rsi, %rdi, 8), %rax
if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) &&
!matchAdd(N, AM, Depth))
return false;
break;
case ISD::AND: {
// Perform some heroic transforms on an and of a constant-count shift
// with a constant to enable use of the scaled offset field.
// Scale must not be used already.
if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
// We only handle up to 64-bit values here as those are what matter for
// addressing mode optimizations.
assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
"Unexpected value size!");
if (!isa<ConstantSDNode>(N.getOperand(1)))
break;
if (N.getOperand(0).getOpcode() == ISD::SRL) {
SDValue Shift = N.getOperand(0);
SDValue X = Shift.getOperand(0);
uint64_t Mask = N.getConstantOperandVal(1);
// Try to fold the mask and shift into an extract and scale.
if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
return false;
// Try to fold the mask and shift directly into the scale.
if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
return false;
// Try to fold the mask and shift into BEXTR and scale.
if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
return false;
}
// Try to swap the mask and shift to place shifts which can be done as
// a scale on the outside of the mask.
if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
return false;
break;
}
case ISD::ZERO_EXTEND: {
// Try to widen a zexted shift left to the same size as its use, so we can
// match the shift as a scale factor.
if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
break;
if (N.getOperand(0).getOpcode() != ISD::SHL || !N.getOperand(0).hasOneUse())
break;
// Give up if the shift is not a valid scale factor [1,2,3].
// NOTE(review): the check below only rejects amounts above 3, so a shift
// amount of 0 is also accepted (giving a scale of 1).
SDValue Shl = N.getOperand(0);
auto *ShAmtC = dyn_cast<ConstantSDNode>(Shl.getOperand(1));
if (!ShAmtC || ShAmtC->getZExtValue() > 3)
break;
// The narrow shift must only shift out zero bits (it must be 'nuw').
// That makes it safe to widen to the destination type.
APInt HighZeros = APInt::getHighBitsSet(Shl.getValueSizeInBits(),
ShAmtC->getZExtValue());
if (!CurDAG->MaskedValueIsZero(Shl.getOperand(0), HighZeros))
break;
// zext (shl nuw i8 %x, C) to i32 --> shl (zext i8 %x to i32), (zext C)
MVT VT = N.getSimpleValueType();
SDLoc DL(N);
SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Shl.getOperand(0));
SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, Shl.getOperand(1));
// Convert the shift to scale factor.
AM.Scale = 1 << ShAmtC->getZExtValue();
AM.IndexReg = Zext;
// Place the new nodes before N in the node ordering, then replace N with
// the widened shift.
insertDAGNode(*CurDAG, N, Zext);
insertDAGNode(*CurDAG, N, NewShl);
CurDAG->ReplaceAllUsesWith(N, NewShl);
CurDAG->RemoveDeadNode(N.getNode());
return false;
}
}
// No opcode-specific fold applied: use N as a plain base or index register.
return matchAddressBase(N, AM);
}
/// Fallback used by the address matchers: place N into the addressing mode as
/// a plain register operand without looking at N's own operands.
///
/// N goes into the base register slot when that slot is free; otherwise it
/// goes into the index register slot with a scale of 1. Returns false when N
/// was placed and true when both slots are already taken.
bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
  const bool BaseSlotFree =
      AM.BaseType == X86ISelAddressMode::RegBase && !AM.Base_Reg.getNode();

  // Preferred placement: the (empty) register base.
  if (BaseSlotFree) {
    AM.BaseType = X86ISelAddressMode::RegBase;
    AM.Base_Reg = N;
    return false;
  }

  // The base is occupied; with the index occupied as well there is no room.
  if (AM.IndexReg.getNode())
    return true;

  // Second choice: an unscaled index register.
  AM.IndexReg = N;
  AM.Scale = 1;
  return false;
}
/// Helper for selectVectorAddr: fold N into a gather/scatter address whose
/// index register and scale have already been filled in.
///
/// A constant is folded into the displacement and an X86ISD::Wrapper is
/// handed to matchWrapper; if either fold succeeds the function returns
/// false. Everything else (including a failed fold) falls back to
/// matchAddressBase, whose result is returned.
bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
  // TODO: Support other operations.
  const unsigned Opcode = N.getOpcode();
  if (Opcode == ISD::Constant) {
    uint64_t Offset = cast<ConstantSDNode>(N)->getSExtValue();
    if (!foldOffsetIntoAddress(Offset, AM))
      return false;
  } else if (Opcode == X86ISD::Wrapper) {
    if (!matchWrapper(N, AM))
      return false;
  }
  return matchAddressBase(N, AM);
}
/// Produce the five address operands for a masked gather/scatter node.
///
/// The index register and scale are copied from Parent; N is then matched
/// into the remaining base and displacement fields. Returns true and fills
/// Base/Scale/Index/Disp/Segment on success, false if N cannot be matched.
bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
                                       SDValue &Scale, SDValue &Index,
                                       SDValue &Disp, SDValue &Segment) {
  X86ISelAddressMode AM;
  auto *GatherScatter = cast<X86MaskedGatherScatterSDNode>(Parent);
  AM.IndexReg = GatherScatter->getIndex();
  AM.Scale = cast<ConstantSDNode>(GatherScatter->getScale())->getZExtValue();

  // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
  switch (cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace()) {
  case 256:
    AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
    break;
  case 257:
    AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
    break;
  case 258:
    AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
    break;
  default:
    break;
  }

  SDLoc DL(N);
  MVT VT = N.getSimpleValueType();

  // Try to match into the base and displacement fields.
  if (matchVectorAddress(N, AM))
    return false;

  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
  return true;
}
/// Pattern match N as an x86 addressing mode.
///
/// On success returns true and hands back, by reference, the operands of the
/// maximal addressing mode that could be matched; returns false otherwise.
///
/// Parent is the parent node of the addr operand that is being matched. It
/// is always a load, store, atomic node, or null. It is only null when
/// checking memory operands for inline asm nodes.
bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
                                 SDValue &Scale, SDValue &Index,
                                 SDValue &Disp, SDValue &Segment) {
  X86ISelAddressMode AM;

  // This list of opcodes are all the nodes that have an "addr:$ptr" operand
  // that are not a MemSDNode, and thus don't have proper addrspace info.
  bool HasAddrSpaceInfo = Parent != nullptr;
  if (Parent) {
    switch (Parent->getOpcode()) {
    case ISD::INTRINSIC_W_CHAIN:   // unaligned loads, fixme
    case ISD::INTRINSIC_VOID:      // nontemporal stores
    case X86ISD::TLSCALL:          // Fixme
    case X86ISD::ENQCMD:           // Fixme
    case X86ISD::ENQCMDS:          // Fixme
    case X86ISD::EH_SJLJ_SETJMP:   // setjmp
    case X86ISD::EH_SJLJ_LONGJMP:  // longjmp
      HasAddrSpaceInfo = false;
      break;
    default:
      break;
    }
  }

  if (HasAddrSpaceInfo) {
    // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
    switch (cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace()) {
    case 256:
      AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
      break;
    case 257:
      AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
      break;
    case 258:
      AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
      break;
    default:
      break;
    }
  }

  // Save the DL and VT before calling matchAddress, it can invalidate N.
  SDLoc DL(N);
  MVT VT = N.getSimpleValueType();

  if (matchAddress(N, AM))
    return false;

  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
  return true;
}
// A load may only be folded when every node on the path from it up to the
// root has exactly one use; an extra use anywhere on that path could end up
// duplicating the load.
//
// Starting at User, follow the first use of each node until Root is reached.
// Returns false as soon as a node with a use count other than one is found.
static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *User) {
  for (SDNode *Cur = User; Cur != Root; Cur = *Cur->use_begin())
    if (!Cur->hasOneUse())
      return false;
  return true;
}
/// Match a scalar SSE load. In particular, we want to match a load whose top
/// elements are either undef or zeros. The load flavor is derived from the
/// type of N, which is either v4f32 or v2f64.
///
/// Three shapes of N are tried in order: a non-extending load, an
/// X86ISD::VZEXT_LOAD, and a single-use SCALAR_TO_VECTOR of a non-extending
/// load. For the first shape that passes the profitability and legality
/// checks, the result of selectAddr on that load's base pointer is returned.
///
/// We also return:
/// PatternNodeWithChain: this is the matched node that has a chain input and
/// output. Note that for the VZEXT_LOAD and SCALAR_TO_VECTOR shapes it is
/// written before the fold checks run, so it may be modified even when the
/// function returns false.
bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent,
SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index,
SDValue &Disp, SDValue &Segment,
SDValue &PatternNodeWithChain) {
// Every node between Parent and Root must have a single use.
if (!hasSingleUsesFromRoot(Root, Parent))
return false;
// We can allow a full vector load here since narrowing a load is ok unless
// it's volatile.
if (ISD::isNON_EXTLoad(N.getNode())) {
LoadSDNode *LD = cast<LoadSDNode>(N);
if (!LD->isVolatile() &&
IsProfitableToFold(N, LD, Root) &&
IsLegalToFold(N, Parent, Root, OptLevel)) {
PatternNodeWithChain = N;
return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
Segment);
}
}
// We can also match the special zero extended load opcode.
if (N.getOpcode() == X86ISD::VZEXT_LOAD) {
PatternNodeWithChain = N;
if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) {
auto *MI = cast<MemIntrinsicSDNode>(PatternNodeWithChain);
return selectAddr(MI, MI->getBasePtr(), Base, Scale, Index, Disp,
Segment);
}
}
// Need to make sure that the SCALAR_TO_VECTOR and load are both only used
// once. Otherwise the load might get duplicated and the chain output of the
// duplicate load will not be observed by all dependencies.
if (N.getOpcode() == ISD::SCALAR_TO_VECTOR && N.getNode()->hasOneUse()) {
// The candidate load is the scalar operand of the SCALAR_TO_VECTOR.
PatternNodeWithChain = N.getOperand(0);
if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
Segment);
}
}
// None of the recognized shapes matched.
return false;
}
/// Decide whether N can be used as a zero-extended 32-bit immediate for a
/// 64-bit move, and if so return the immediate operand in Imm.
///
/// Constants qualify when their zero-extended value fits in 32 bits. A
/// wrapped symbol (other than a TLS address) qualifies under the small code
/// model, or, for a global with a known absolute symbol range, when the
/// range's unsigned maximum is below 2^32.
bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
  if (const auto *CN = dyn_cast<ConstantSDNode>(N)) {
    const uint64_t ImmVal = CN->getZExtValue();
    if (isUInt<32>(ImmVal)) {
      Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i64);
      return true;
    }
    return false;
  }

  // In static codegen with small code model, we can get the address of a label
  // into a register with 'movl'
  if (N->getOpcode() != X86ISD::Wrapper)
    return false;

  N = N.getOperand(0);
  const unsigned WrappedOpc = N->getOpcode();

  // At least GNU as does not accept 'movl' for TPOFF relocations.
  // FIXME: We could use 'movl' when we know we are targeting MC.
  if (WrappedOpc == ISD::TargetGlobalTLSAddress)
    return false;

  Imm = N;

  // A global with absolute-symbol range information is judged by that range.
  if (WrappedOpc == ISD::TargetGlobalAddress) {
    Optional<ConstantRange> CR =
        cast<GlobalAddressSDNode>(N)->getGlobal()->getAbsoluteSymbolRange();
    if (CR)
      return CR->getUnsignedMax().ult(1ull << 32);
  }

  // Everything else depends on the code model alone.
  return TM.getCodeModel() == CodeModel::Small;
}
/// Match N with selectLEAAddr and then rewrite the base and index operands as
/// 64-bit values.
///
/// A "no register" operand (register number 0) is re-created with type i64.
/// A 32-bit value is inserted into the low 32 bits of a 64-bit IMPLICIT_DEF;
/// for the base this is skipped when it is a frame index or is not i32.
/// Returns false if selectLEAAddr rejects N.
bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
                                         SDValue &Scale, SDValue &Index,
                                         SDValue &Disp, SDValue &Segment) {
  // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
  SDLoc DL(N);

  if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
    return false;

  // Place a 32-bit value into sub_32bit of an otherwise undefined i64.
  auto WidenTo64 = [&](SDValue V32) {
    SDValue Undef64 = SDValue(
        CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL, MVT::i64), 0);
    return CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, Undef64,
                                         V32);
  };

  auto *BaseReg = dyn_cast<RegisterSDNode>(Base);
  if (BaseReg && BaseReg->getReg() == 0) {
    Base = CurDAG->getRegister(0, MVT::i64);
  } else if (Base.getValueType() == MVT::i32 &&
             !isa<FrameIndexSDNode>(Base)) {
    // Base could already be %rip, particularly in the x32 ABI.
    Base = WidenTo64(Base);
  }

  auto *IndexReg = dyn_cast<RegisterSDNode>(Index);
  if (IndexReg && IndexReg->getReg() == 0) {
    Index = CurDAG->getRegister(0, MVT::i64);
  } else {
    assert(Index.getValueType() == MVT::i32 &&
           "Expect to be extending 32-bit registers for use in LEA");
    Index = WidenTo64(Index);
  }

  return true;
}
/// Calls matchAddress and determines if the maximal addressing
/// mode it matches can be cost effectively emitted as an LEA instruction.
///
/// The decision is a small additive score ("Complexity"): a register base
/// counts 1, a frame-index base 4, an index register 1, a scale above one 1,
/// a non-zero displacement 1, and a symbolic displacement either forces the
/// score to 4 (64-bit subtarget) or adds 2. An ADD whose two operands are
/// both flag-producing math nodes with live flag results adds 1 more. The
/// match is accepted, and the address operands are returned by reference,
/// only when the score exceeds 2; otherwise the function returns false.
bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
SDValue &Segment) {
X86ISelAddressMode AM;
// Save the DL and VT before calling matchAddress, it can invalidate N.
SDLoc DL(N);
MVT VT = N.getSimpleValueType();
// Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
// segments.
SDValue Copy = AM.Segment;
SDValue T = CurDAG->getRegister(0, MVT::i32);
AM.Segment = T;
if (matchAddress(N, AM))
return false;
// The placeholder must have survived matching untouched; restore the
// original (empty) segment before building the operands.
assert (T == AM.Segment);
AM.Segment = Copy;
unsigned Complexity = 0;
if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
Complexity = 1;
else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
Complexity = 4;
if (AM.IndexReg.getNode())
Complexity++;
// Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
// a simple shift.
if (AM.Scale > 1)
Complexity++;
// FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
// to a LEA. This is determined with some experimentation but is by no means
// optimal (especially for code size consideration). LEA is nice because of
// its three-address nature. Tweak the cost function again when we can run
// convertToThreeAddress() at register allocation time.
if (AM.hasSymbolicDisplacement()) {
// For X86-64, always use LEA to materialize RIP-relative addresses.
// Note that this assignment replaces whatever score was accumulated so far.
if (Subtarget->is64Bit())
Complexity = 4;
else
Complexity += 2;
}
+ // Heuristic: try harder to form an LEA from ADD if the operands set flags.
+ // Unlike ADD, LEA does not affect flags, so we will be less likely to require
+ // duplicating flag-producing instructions later in the pipeline.
+ if (N.getOpcode() == ISD::ADD) {
+ auto isMathWithFlags = [](SDValue V) {
+ switch (V.getOpcode()) {
+ case X86ISD::ADD:
+ case X86ISD::SUB:
+ case X86ISD::ADC:
+ case X86ISD::SBB:
+ /* TODO: These opcodes can be added safely, but we may want to justify
+ their inclusion for different reasons (better for reg-alloc).
+ case X86ISD::SMUL:
+ case X86ISD::UMUL:
+ case X86ISD::OR:
+ case X86ISD::XOR:
+ case X86ISD::AND:
+ */
+ // Value 1 is the flag output of the node - verify it's not dead.
+ return !SDValue(V.getNode(), 1).use_empty();
+ default:
+ return false;
+ }
+ };
+ // TODO: This could be an 'or' rather than 'and' to make the transform more
+ // likely to happen. We might want to factor in whether there's a
+ // load folding opportunity for the math op that disappears with LEA.
+ if (isMathWithFlags(N.getOperand(0)) && isMathWithFlags(N.getOperand(1)))
+ Complexity++;
+ }
+
if (AM.Disp)
Complexity++;
// If it isn't worth using an LEA, reject it.
if (Complexity <= 2)
return false;
getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
return true;
}
/// Build address operands for a TargetGlobalTLSAddress node (the only kind of
/// node this is run on).
///
/// The addressing mode carries the global, its offset and its target flags.
/// For an i32 address EBX is additionally used as an index register with a
/// scale of 1. Always returns true.
bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
                                        SDValue &Scale, SDValue &Index,
                                        SDValue &Disp, SDValue &Segment) {
  assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
  const auto *TLSGlobal = cast<GlobalAddressSDNode>(N);

  X86ISelAddressMode AM;
  AM.GV = TLSGlobal->getGlobal();
  AM.Disp += TLSGlobal->getOffset();
  AM.SymbolFlags = TLSGlobal->getTargetFlags();

  MVT VT = N.getSimpleValueType();
  const bool Is32BitAddress = (VT == MVT::i32);
  if (Is32BitAddress) {
    AM.Scale = 1;
    AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
  }

  getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
  return true;
}
/// Try to select N as an immediate operand that may need a relocation.
///
/// Constants become target constants. A wrapped symbol is returned directly
/// when it was not reached through a TRUNCATE. A truncated wrapped global is
/// accepted only when its absolute symbol range is known and fits in the
/// truncated type, in which case a narrow target global address is built.
/// Op may be written even when the function returns false.
bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
  if (auto *CN = dyn_cast<ConstantSDNode>(N)) {
    Op = CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(CN),
                                   N.getValueType());
    return true;
  }

  // Keep track of the original value type and whether this value was
  // truncated. If we see a truncation from pointer type to VT that truncates
  // bits that are known to be zero, we can use a narrow reference.
  EVT VT = N.getValueType();
  const bool WasTruncated = N.getOpcode() == ISD::TRUNCATE;
  if (WasTruncated)
    N = N.getOperand(0);

  if (N.getOpcode() != X86ISD::Wrapper)
    return false;

  // We can only use non-GlobalValues as immediates if they were not truncated,
  // as we do not have any range information. If we have a GlobalValue and the
  // address was not truncated, we can select it as an operand directly.
  const unsigned WrappedOpc = N.getOperand(0)->getOpcode();
  const bool IsTruncatedGlobal =
      WrappedOpc == ISD::TargetGlobalAddress && WasTruncated;
  if (!IsTruncatedGlobal) {
    Op = N.getOperand(0);
    // We can only select the operand directly if we didn't have to look past a
    // truncate.
    return !WasTruncated;
  }

  // Check that the global's range fits into VT.
  auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
  Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
  if (!CR)
    return false;
  if (CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
    return false;

  // Okay, we can use a narrow reference.
  Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
                                      GA->getOffset(), GA->getTargetFlags());
  return true;
}
/// Try to fold the load N into its user P (under Root) as a memory operand.
///
/// N must be a non-extending load that is both profitable and legal to fold.
/// When it is, the load's operand 1 is matched as the address via selectAddr
/// and that result is returned; otherwise the function returns false.
bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
                                  SDValue &Base, SDValue &Scale,
                                  SDValue &Index, SDValue &Disp,
                                  SDValue &Segment) {
  SDNode *Load = N.getNode();
  if (!ISD::isNON_EXTLoad(Load))
    return false;
  if (!IsProfitableToFold(N, P, Root))
    return false;
  if (!IsLegalToFold(N, P, Root, OptLevel))
    return false;

  SDValue Address = N.getOperand(1);
  return selectAddr(Load, Address, Base, Scale, Index, Disp, Segment);
}
/// Return an SDNode that yields the value of the global base register.
/// The register itself is obtained from the instruction info, which outputs
/// the instructions required to initialize it if necessary.
SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
  const unsigned BaseReg = getInstrInfo()->getGlobalBaseReg(MF);
  const auto PtrVT = TLI->getPointerTy(MF->getDataLayout());
  SDValue RegValue = CurDAG->getRegister(BaseReg, PtrVT);
  return RegValue.getNode();
}
bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
if (N->getOpcode() == ISD::TRUNCATE)
N = N->getOperand(0).getNode();
if (N->getOpcode() != X86ISD::Wrapper)
return false;
auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
if (!GA)
return false;
Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
return CR && CR->getSignedMin().sge(-1ull << Width) &&
CR->getSignedMax().slt(1ull << Width);
}
/// Extract the condition code operand from an already-selected machine node.
///
/// Handles JCC_1, SETCCr, SETCCm and the 16/32/64-bit CMOV rr/rm forms; each
/// stores its condition code at a fixed operand index. Any other machine
/// opcode yields X86::COND_INVALID.
static X86::CondCode getCondFromNode(SDNode *N) {
  assert(N->isMachineOpcode() && "Unexpected node");

  // Index of the operand that holds the condition code.
  unsigned CCOperandNo;
  switch (N->getMachineOpcode()) {
  case X86::SETCCr:
    CCOperandNo = 0;
    break;
  case X86::JCC_1:
    CCOperandNo = 1;
    break;
  case X86::CMOV16rr:
  case X86::CMOV32rr:
  case X86::CMOV64rr:
    CCOperandNo = 2;
    break;
  case X86::SETCCm:
    CCOperandNo = 5;
    break;
  case X86::CMOV16rm:
  case X86::CMOV32rm:
  case X86::CMOV64rm:
    CCOperandNo = 6;
    break;
  default:
    return X86::COND_INVALID;
  }
  return static_cast<X86::CondCode>(N->getConstantOperandVal(CCOperandNo));
}
/// Return true if no user of the given flags value looks at any flag other
/// than ZF.
///
/// Every use of the flags result must be a CopyToReg into EFLAGS, and every
/// reader of that copy's flag result must be a machine node whose condition
/// code is COND_E or COND_NE. Anything else is treated conservatively and
/// makes the function return false.
bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
  const unsigned FlagsResNo = Flags.getResNo();

  for (auto User = Flags->use_begin(), UserEnd = Flags->use_end();
       User != UserEnd; ++User) {
    // Uses of the node's other results are irrelevant.
    if (User.getUse().getResNo() != FlagsResNo)
      continue;

    // The flags have to travel through a CopyToReg that targets EFLAGS.
    const bool CopiesToEFLAGS =
        User->getOpcode() == ISD::CopyToReg &&
        cast<RegisterSDNode>(User->getOperand(1))->getReg() == X86::EFLAGS;
    if (!CopiesToEFLAGS)
      return false;

    for (auto Reader = User->use_begin(), ReaderEnd = User->use_end();
         Reader != ReaderEnd; ++Reader) {
      // Only the flag result (value #1) of the copy matters.
      if (Reader.getUse().getResNo() != 1)
        continue;
      // Not yet a machine node: nothing can be said about it.
      if (!Reader->isMachineOpcode())
        return false;
      // Equality tests are the only conditions that read ZF alone.
      const X86::CondCode CC = getCondFromNode(*Reader);
      if (CC != X86::COND_E && CC != X86::COND_NE)
        return false;
    }
  }
  return true;
}
/// Return true if no user of the given flags value requires the SF flag to be
/// accurate.
///
/// Every use of the flags result must be a CopyToReg into EFLAGS, and every
/// reader of that copy's flag result must be a machine node whose condition
/// code is one of A/AE/B/BE/E/NE/O/NO/P/NP. Anything else is treated
/// conservatively and makes the function return false.
bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
  const unsigned FlagsResNo = Flags.getResNo();

  for (auto User = Flags->use_begin(), UserEnd = Flags->use_end();
       User != UserEnd; ++User) {
    // Uses of the node's other results are irrelevant.
    if (User.getUse().getResNo() != FlagsResNo)
      continue;

    // The flags have to travel through a CopyToReg that targets EFLAGS.
    const bool CopiesToEFLAGS =
        User->getOpcode() == ISD::CopyToReg &&
        cast<RegisterSDNode>(User->getOperand(1))->getReg() == X86::EFLAGS;
    if (!CopiesToEFLAGS)
      return false;

    for (auto Reader = User->use_begin(), ReaderEnd = User->use_end();
         Reader != ReaderEnd; ++Reader) {
      // Only the flag result (value #1) of the copy matters.
      if (Reader.getUse().getResNo() != 1)
        continue;
      // Not yet a machine node: nothing can be said about it.
      if (!Reader->isMachineOpcode())
        return false;

      switch (getCondFromNode(*Reader)) {
      // Comparisons which don't examine the SF flag.
      case X86::COND_A:
      case X86::COND_AE:
      case X86::COND_B:
      case X86::COND_BE:
      case X86::COND_E:
      case X86::COND_NE:
      case X86::COND_O:
      case X86::COND_NO:
      case X86::COND_P:
      case X86::COND_NP:
        break;
      // Anything else: assume conservatively.
      default:
        return false;
      }
    }
  }
  return true;
}
/// Return true unless CC is one of the condition codes listed below as not
/// examining the CF flag. Unknown codes are conservatively assumed to use it.
static bool mayUseCarryFlag(X86::CondCode CC) {
  bool MayReadCF = true; // Conservative default.
  switch (CC) {
  // Comparisons which don't examine the CF flag.
  case X86::COND_O:
  case X86::COND_NO:
  case X86::COND_E:
  case X86::COND_NE:
  case X86::COND_S:
  case X86::COND_NS:
  case X86::COND_P:
  case X86::COND_NP:
  case X86::COND_L:
  case X86::COND_GE:
  case X86::COND_G:
  case X86::COND_LE:
    MayReadCF = false;
    break;
  default:
    break;
  }
  return MayReadCF;
}
/// Return true if no user of the given flags value requires the CF flag to be
/// accurate.
///
/// Two kinds of user are understood: a CopyToReg into EFLAGS whose readers
/// are already-selected machine nodes, and the pre-isel nodes SETCC,
/// SETCC_CARRY, CMOV and BRCOND. In both cases the condition code is checked
/// with mayUseCarryFlag. Any other user makes the function return false.
bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
  const unsigned FlagsResNo = Flags.getResNo();

  for (auto User = Flags->use_begin(), UserEnd = Flags->use_end();
       User != UserEnd; ++User) {
    // Uses of the node's other results are irrelevant.
    if (User.getUse().getResNo() != FlagsResNo)
      continue;

    const unsigned UserOpc = User->getOpcode();

    if (UserOpc != ISD::CopyToReg) {
      // This might be an unselected node. So look for the pre-isel opcodes
      // that use flags and find where each keeps its condition code.
      unsigned CCOpNo;
      switch (UserOpc) {
      case X86ISD::SETCC:
      case X86ISD::SETCC_CARRY:
        CCOpNo = 0;
        break;
      case X86ISD::CMOV:
      case X86ISD::BRCOND:
        CCOpNo = 2;
        break;
      default:
        // Something unusual. Be conservative.
        return false;
      }
      const auto CC =
          static_cast<X86::CondCode>(User->getConstantOperandVal(CCOpNo));
      if (mayUseCarryFlag(CC))
        return false;
      continue;
    }

    // Only CopyToReg uses that copy to EFLAGS can be analysed.
    if (cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
      return false;

    for (auto Reader = User->use_begin(), ReaderEnd = User->use_end();
         Reader != ReaderEnd; ++Reader) {
      // Only the flag result (value #1) of the copy matters.
      if (Reader.getUse().getResNo() != 1)
        continue;
      // Not yet a machine node: nothing can be said about it.
      if (!Reader->isMachineOpcode())
        return false;
      if (mayUseCarryFlag(getCondFromNode(*Reader)))
        return false;
    }
    // This CopyToReg is ok; carry on with the next user.
  }
  return true;
}
/// Check whether or not the chain ending in StoreNode is suitable for doing
/// the {load; op; store} to modify transformation.
///
/// \param StoreNode   the store that ends the pattern.
/// \param StoredVal   the value being stored; must be result 0 of the
///                    operation and have no other user of that result.
/// \param CurDAG      DAG used to build the replacement TokenFactor.
/// \param LoadOpNo    index of the operand of StoredVal expected to be the
///                    load.
/// \param LoadNode    [out] set to the load once operand LoadOpNo is known to
///                    be a normal load; may be set even if the function later
///                    returns false.
/// \param InputChain  [out] on success, a TokenFactor of the store's chain
///                    inputs with the load's chain result replaced by the
///                    load's own input chain.
/// \returns true if the pattern can be fused without creating a cycle.
static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
SDValue StoredVal, SelectionDAG *CurDAG,
unsigned LoadOpNo,
LoadSDNode *&LoadNode,
SDValue &InputChain) {
// Is the stored value result 0 of the operation?
if (StoredVal.getResNo() != 0) return false;
// Are there other uses of the operation other than the store?
if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
// Is the store non-extending and non-indexed?
if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
return false;
SDValue Load = StoredVal->getOperand(LoadOpNo);
// Is the stored value a non-extending and non-indexed load?
if (!ISD::isNormalLoad(Load.getNode())) return false;
// Return LoadNode by reference.
LoadNode = cast<LoadSDNode>(Load);
// Is store the only read of the loaded value?
if (!Load.hasOneUse())
return false;
// Is the address of the store the same as the load?
if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
LoadNode->getOffset() != StoreNode->getOffset())
return false;
bool FoundLoad = false;
SmallVector<SDValue, 4> ChainOps;
SmallVector<const SDNode *, 4> LoopWorklist;
SmallPtrSet<const SDNode *, 16> Visited;
// Limit passed to the predecessor search below.
const unsigned int Max = 1024;
// Visualization of Load-Op-Store fusion:
// -------------------------
// Legend:
// *-lines = Chain operand dependencies.
// |-lines = Normal operand dependencies.
// Dependencies flow down and right. n-suffix references multiple nodes.
//
// C Xn C
// * * *
// * * *
// Xn A-LD Yn TF Yn
// * * \ | * |
// * * \ | * |
// * * \ | => A--LD_OP_ST
// * * \| \
// TF OP \
// * | \ Zn
// * | \
// A-ST Zn
//
// This merge induced dependences from: #1: Xn -> LD, OP, Zn
// #2: Yn -> LD
// #3: ST -> Zn
// Ensure the transform is safe by checking for the dual
// dependencies to make sure we do not induce a loop.
// As LD is a predecessor to both OP and ST we can do this by checking:
// a). if LD is a predecessor to a member of Xn or Yn.
// b). if a Zn is a predecessor to ST.
// However, (b) can only occur through being a chain predecessor to
// ST, which is the same as Zn being a member or predecessor of Xn,
// which is a subset of LD being a predecessor of Xn. So it's
// subsumed by check (a).
SDValue Chain = StoreNode->getChain();
// Gather X elements in ChainOps.
if (Chain == Load.getValue(1)) {
// The store is chained directly to the load: the new chain input is simply
// the load's own input chain.
FoundLoad = true;
ChainOps.push_back(Load.getOperand(0));
} else if (Chain.getOpcode() == ISD::TokenFactor) {
for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
SDValue Op = Chain.getOperand(i);
if (Op == Load.getValue(1)) {
FoundLoad = true;
// Drop Load, but keep its chain. No cycle check necessary.
ChainOps.push_back(Load.getOperand(0));
continue;
}
LoopWorklist.push_back(Op.getNode());
ChainOps.push_back(Op);
}
}
// The store must depend on the load through its chain, either directly or
// via a TokenFactor.
if (!FoundLoad)
return false;
// Worklist is currently Xn. Add Yn to worklist.
for (SDValue Op : StoredVal->ops())
if (Op.getNode() != LoadNode)
LoopWorklist.push_back(Op.getNode());
// Check (a) if Load is a predecessor to Xn + Yn
if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
true))
return false;
InputChain =
CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
return true;
}
// Change a chain of {load; op; store} of the same value into a simple op
// through memory of that value, if the uses of the modified value and its
// address are suitable.
//
// The tablegen memory operand pattern is currently not able to match
// the case where the EFLAGS on the original operation are used.
//
// To move this to tablegen, we'll need to improve tablegen to allow flags to
// be transferred from a node in the pattern to the result node, probably with
// a new keyword. For example, we have this
// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
// [(store (add (loadi64 addr:$dst), -1), addr:$dst),
// (implicit EFLAGS)]>;
// but maybe need something like this
// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
// [(store (add (loadi64 addr:$dst), -1), addr:$dst),
// (transferrable EFLAGS)]>;
//
// Until then, we manually fold these and instruction select the operation
// here.
//
// Node must be a store. Returns true if the store (together with the load and
// the operation feeding it) was replaced by a single memory-operand machine
// node, and false if the pattern could not be folded.
bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
SDValue StoredVal = StoreNode->getOperand(1);
unsigned Opc = StoredVal->getOpcode();
// Before we try to select anything, make sure this is memory operand size
// and opcode we can handle. Note that this must match the code below that
// actually lowers the opcodes.
EVT MemVT = StoreNode->getMemoryVT();
if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
MemVT != MVT::i8)
return false;
bool IsCommutable = false;
bool IsNegate = false;
switch (Opc) {
default:
return false;
case X86ISD::SUB:
// A subtraction from the constant zero is a negation of operand 1.
IsNegate = isNullConstant(StoredVal.getOperand(0));
break;
case X86ISD::SBB:
break;
case X86ISD::ADD:
case X86ISD::ADC:
case X86ISD::AND:
case X86ISD::OR:
case X86ISD::XOR:
IsCommutable = true;
break;
}
// Operand index at which the load is expected: operand 1 for a negate,
// operand 0 otherwise.
unsigned LoadOpNo = IsNegate ? 1 : 0;
LoadSDNode *LoadNode = nullptr;
SDValue InputChain;
if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
LoadNode, InputChain)) {
if (!IsCommutable)
return false;
// This operation is commutable, try the other operand.
LoadOpNo = 1;
if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
LoadNode, InputChain))
return false;
}
SDValue Base, Scale, Index, Disp, Segment;
if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
Segment))
return false;
// Pick one of four machine opcodes according to the memory width.
auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
unsigned Opc8) {
switch (MemVT.getSimpleVT().SimpleTy) {
case MVT::i64:
return Opc64;
case MVT::i32:
return Opc32;
case MVT::i16:
return Opc16;
case MVT::i8:
return Opc8;
default:
llvm_unreachable("Invalid size!");
}
};
// Every machine node built below produces two results: value 0 (typed
// MVT::i32) replaces the flags result of the original operation and value 1
// (MVT::Other) replaces the load and store chains; see the ReplaceUses calls
// at the end.
MachineSDNode *Result;
switch (Opc) {
case X86ISD::SUB:
// Handle negate.
if (IsNegate) {
unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
X86::NEG8m);
const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
MVT::Other, Ops);
break;
}
LLVM_FALLTHROUGH;
case X86ISD::ADD:
// Try to match inc/dec.
if (!Subtarget->slowIncDec() || OptForSize) {
bool IsOne = isOneConstant(StoredVal.getOperand(1));
bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
// ADD/SUB with 1/-1 and carry flag isn't used can use inc/dec.
if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
// ADD of 1 and SUB of -1 are increments; ADD of -1 and SUB of 1 are
// decrements.
unsigned NewOpc =
((Opc == X86ISD::ADD) == IsOne)
? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
: SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
MVT::Other, Ops);
break;
}
}
LLVM_FALLTHROUGH;
case X86ISD::ADC:
case X86ISD::SBB:
case X86ISD::AND:
case X86ISD::OR:
case X86ISD::XOR: {
// Memory-destination, register-source form of each operation.
auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
switch (Opc) {
case X86ISD::ADD:
return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
X86::ADD8mr);
case X86ISD::ADC:
return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
X86::ADC8mr);
case X86ISD::SUB:
return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
X86::SUB8mr);
case X86ISD::SBB:
return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
X86::SBB8mr);
case X86ISD::AND:
return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
X86::AND8mr);
case X86ISD::OR:
return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
case X86ISD::XOR:
return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
X86::XOR8mr);
default:
llvm_unreachable("Invalid opcode!");
}
};
// Memory-destination, 8-bit-immediate form. The i8 slot is 0 because this
// selector is only used when MemVT != MVT::i8 (see the check below).
auto SelectImm8Opcode = [SelectOpcode](unsigned Opc) {
switch (Opc) {
case X86ISD::ADD:
return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0);
case X86ISD::ADC:
return SelectOpcode(X86::ADC64mi8, X86::ADC32mi8, X86::ADC16mi8, 0);
case X86ISD::SUB:
return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0);
case X86ISD::SBB:
return SelectOpcode(X86::SBB64mi8, X86::SBB32mi8, X86::SBB16mi8, 0);
case X86ISD::AND:
return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0);
case X86ISD::OR:
return SelectOpcode(X86::OR64mi8, X86::OR32mi8, X86::OR16mi8, 0);
case X86ISD::XOR:
return SelectOpcode(X86::XOR64mi8, X86::XOR32mi8, X86::XOR16mi8, 0);
default:
llvm_unreachable("Invalid opcode!");
}
};
// Memory-destination, full-width-immediate form (32-bit immediate for i64).
auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
switch (Opc) {
case X86ISD::ADD:
return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
X86::ADD8mi);
case X86ISD::ADC:
return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
X86::ADC8mi);
case X86ISD::SUB:
return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
X86::SUB8mi);
case X86ISD::SBB:
return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
X86::SBB8mi);
case X86ISD::AND:
return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
X86::AND8mi);
case X86ISD::OR:
return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
X86::OR8mi);
case X86ISD::XOR:
return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
X86::XOR8mi);
default:
llvm_unreachable("Invalid opcode!");
}
};
// Start with the register form; it is replaced below if the non-load operand
// turns out to be a suitable constant.
unsigned NewOpc = SelectRegOpcode(Opc);
// The operand of the operation that is not the load.
SDValue Operand = StoredVal->getOperand(1-LoadOpNo);
// See if the operand is a constant that we can fold into an immediate
// operand.
if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
int64_t OperandV = OperandC->getSExtValue();
// Check if we can shrink the operand enough to fit in an immediate (or
// fit into a smaller immediate) by negating it and switching the
// operation.
// NOTE(review): -OperandV is evaluated on an int64_t; for
// OperandV == INT64_MIN the negation overflows. Confirm whether that value
// can reach this point.
if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
(MemVT == MVT::i64 && !isInt<32>(OperandV) &&
isInt<32>(-OperandV))) &&
hasNoCarryFlagUses(StoredVal.getValue(1))) {
OperandV = -OperandV;
Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
}
// First try to fit this into an Imm8 operand. If it doesn't fit, then try
// the larger immediate operand.
if (MemVT != MVT::i8 && isInt<8>(OperandV)) {
Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
NewOpc = SelectImm8Opcode(Opc);
} else if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
NewOpc = SelectImmOpcode(Opc);
}
// Otherwise (an i64 constant that does not fit in 32 bits) the register form
// chosen above is kept.
}
if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
// ADC/SBB take an incoming flags value (operand 2 of the original node):
// copy it into EFLAGS and pass the copy's chain and second result to the
// new node.
SDValue CopyTo =
CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
StoredVal.getOperand(2), SDValue());
const SDValue Ops[] = {Base, Scale, Index, Disp,
Segment, Operand, CopyTo, CopyTo.getValue(1)};
Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
Ops);
} else {
const SDValue Ops[] = {Base, Scale, Index, Disp,
Segment, Operand, InputChain};
Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
Ops);
}
break;
}
default:
llvm_unreachable("Invalid opcode!");
}
// The fused node both reads and writes memory, so it carries the memory
// operands of the store and of the load.
MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
LoadNode->getMemOperand()};
CurDAG->setNodeMemRefs(Result, MemOps);
// Update Load Chain uses as well.
ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
CurDAG->RemoveDeadNode(Node);
return true;
}
// See if this is an X & Mask that we can match to BEXTR/BZHI.
// Where Mask is one of the following patterns:
// a) x & (1 << nbits) - 1
// b) x & ~(-1 << nbits)
// c) x & (-1 >> (32 - y))
// d) x << (32 - y) >> (32 - y)
//
// Node must be an ISD::AND (patterns a-c) or an ISD::SRL (pattern d); this is
// asserted. Returns false, before creating any new node, when the subtarget
// has neither BMI nor BMI2, when the result type is not i32/i64, or when none
// of the patterns match. Otherwise Node is replaced by an X86ISD::BZHI (BMI2)
// or an X86ISD::BEXTR (BMI-only) based sequence, the replacement is handed to
// SelectCode(), and true is returned.
bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
assert(
(Node->getOpcode() == ISD::AND || Node->getOpcode() == ISD::SRL) &&
"Should be either an and-mask, or right-shift after clearing high bits.");
// BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one.
if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
return false;
MVT NVT = Node->getSimpleValueType(0);
// Only supported for 32 and 64 bits.
if (NVT != MVT::i32 && NVT != MVT::i64)
return false;
// Filled in by whichever pattern matcher succeeds: the bit-count operand.
SDValue NBits;
// If we have BMI2's BZHI, we are ok with multi-use patterns.
// Else, if we only have BMI1's BEXTR, we require one-use.
const bool CanHaveExtraUses = Subtarget->hasBMI2();
// True if extra uses are tolerated, or Op's result has exactly NUses uses.
auto checkUses = [CanHaveExtraUses](SDValue Op, unsigned NUses) {
return CanHaveExtraUses ||
Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
};
auto checkOneUse = [checkUses](SDValue Op) { return checkUses(Op, 1); };
auto checkTwoUse = [checkUses](SDValue Op) { return checkUses(Op, 2); };
// If V is a TRUNCATE that passes the one-use check, return its operand;
// otherwise return V unchanged.
auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
assert(V.getSimpleValueType() == MVT::i32 &&
V.getOperand(0).getSimpleValueType() == MVT::i64 &&
"Expected i64 -> i32 truncation");
V = V.getOperand(0);
}
return V;
};
// a) x & ((1 << nbits) + (-1))
auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation,
&NBits](SDValue Mask) -> bool {
// Match `add`. Must only have one use!
if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
return false;
// We should be adding all-ones constant (i.e. subtracting one.)
if (!isAllOnesConstant(Mask->getOperand(1)))
return false;
// Match `1 << nbits`. Might be truncated. Must only have one use!
SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
return false;
if (!isOneConstant(M0->getOperand(0)))
return false;
NBits = M0->getOperand(1);
return true;
};
// True if the low NVT-width bits of V (looking through a one-use truncation)
// are known to be all ones.
auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
V = peekThroughOneUseTruncation(V);
return CurDAG->MaskedValueIsAllOnes(
V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
NVT.getSizeInBits()));
};
// b) x & ~(-1 << nbits)
auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
&NBits](SDValue Mask) -> bool {
// Match `~()`. Must only have one use!
if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
return false;
// The -1 only has to be all-ones for the final Node's NVT.
if (!isAllOnes(Mask->getOperand(1)))
return false;
// Match `-1 << nbits`. Might be truncated. Must only have one use!
SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
return false;
// The -1 only has to be all-ones for the final Node's NVT.
if (!isAllOnes(M0->getOperand(0)))
return false;
NBits = M0->getOperand(1);
return true;
};
// Match potentially-truncated (bitwidth - y)
auto matchShiftAmt = [checkOneUse, &NBits](SDValue ShiftAmt,
unsigned Bitwidth) {
// Skip over a truncate of the shift amount.
if (ShiftAmt.getOpcode() == ISD::TRUNCATE) {
ShiftAmt = ShiftAmt.getOperand(0);
// The trunc should have been the only user of the real shift amount.
if (!checkOneUse(ShiftAmt))
return false;
}
// Match the shift amount as: (bitwidth - y). It should go away, too.
if (ShiftAmt.getOpcode() != ISD::SUB)
return false;
auto V0 = dyn_cast<ConstantSDNode>(ShiftAmt.getOperand(0));
if (!V0 || V0->getZExtValue() != Bitwidth)
return false;
NBits = ShiftAmt.getOperand(1);
return true;
};
// c) x & (-1 >> (32 - y))
auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation,
matchShiftAmt](SDValue Mask) -> bool {
// The mask itself may be truncated.
Mask = peekThroughOneUseTruncation(Mask);
unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
// Match `l>>`. Must only have one use!
if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
return false;
// We should be shifting truly all-ones constant.
if (!isAllOnesConstant(Mask.getOperand(0)))
return false;
SDValue M1 = Mask.getOperand(1);
// The shift amount should not be used externally.
if (!checkOneUse(M1))
return false;
return matchShiftAmt(M1, Bitwidth);
};
// The value being masked; set from the AND operands or by pattern d).
SDValue X;
// d) x << (32 - y) >> (32 - y)
auto matchPatternD = [checkOneUse, checkTwoUse, matchShiftAmt,
&X](SDNode *Node) -> bool {
if (Node->getOpcode() != ISD::SRL)
return false;
SDValue N0 = Node->getOperand(0);
if (N0->getOpcode() != ISD::SHL || !checkOneUse(N0))
return false;
unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
SDValue N1 = Node->getOperand(1);
SDValue N01 = N0->getOperand(1);
// Both of the shifts must be by the exact same value.
// There should not be any uses of the shift amount outside of the pattern.
if (N1 != N01 || !checkTwoUse(N1))
return false;
if (!matchShiftAmt(N1, Bitwidth))
return false;
X = N0->getOperand(0);
return true;
};
// Patterns a), b) and c) are tried in that order; the first match wins.
auto matchLowBitMask = [matchPatternA, matchPatternB,
matchPatternC](SDValue Mask) -> bool {
return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
};
if (Node->getOpcode() == ISD::AND) {
X = Node->getOperand(0);
SDValue Mask = Node->getOperand(1);
if (matchLowBitMask(Mask)) {
// Great.
} else {
// The mask may be operand 0 instead; retry with the operands swapped.
std::swap(X, Mask);
if (!matchLowBitMask(Mask))
return false;
}
} else if (!matchPatternD(Node))
return false;
SDLoc DL(Node);
// Truncate the shift amount.
NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
// Insert 8-bit NBits into lowest 8 bits of 32-bit register.
// All the other bits are undefined, we do not care about them.
SDValue ImplDef = SDValue(
CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
- NBits = CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, MVT::i32, ImplDef,
- NBits);
+
+ SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
+ insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
+ NBits = SDValue(
+ CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::i32, ImplDef,
+ NBits, SRIdxVal), 0);
insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
if (Subtarget->hasBMI2()) {
// Great, just emit the BZHI.
if (NVT != MVT::i32) {
// But have to place the bit count into the wide-enough register first.
NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
}
SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
ReplaceNode(Node, Extract.getNode());
SelectCode(Extract.getNode());
return true;
}
// Else, if we do *NOT* have BMI2, let's find out if the 'X' is
// *logically* shifted (potentially with one-use trunc in between),
// and the truncation was the only use of the shift,
// and if so look past one-use truncation.
{
SDValue RealX = peekThroughOneUseTruncation(X);
// FIXME: only if the shift is one-use?
if (RealX != X && RealX.getOpcode() == ISD::SRL)
X = RealX;
}
// XVT may be wider than NVT if we looked through a truncation just above.
MVT XVT = X.getSimpleValueType();
// Else, emitting BEXTR requires one more step.
// The 'control' of BEXTR has the pattern of:
// [15...8 bit][ 7...0 bit] location
// [ bit count][ shift] name
// I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
// Shift NBits left by 8 bits, thus producing 'control'.
// This makes the low 8 bits to be zero.
SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
// If the 'X' is *logically* shifted, we can fold that shift into 'control'.
// FIXME: only if the shift is one-use?
if (X.getOpcode() == ISD::SRL) {
SDValue ShiftAmt = X.getOperand(1);
X = X.getOperand(0);
assert(ShiftAmt.getValueType() == MVT::i8 &&
"Expected shift amount to be i8");
// Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
// We could zext to i16 in some form, but we intentionally don't do that.
SDValue OrigShiftAmt = ShiftAmt;
ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
// And now 'or' these low 8 bits of shift amount into the 'control'.
Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
}
// But have to place the 'control' into the wide-enough register first.
if (XVT != MVT::i32) {
Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
}
// And finally, form the BEXTR itself.
SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);
// The 'X' was originally truncated. Do that now.
if (XVT != NVT) {
insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
}
ReplaceNode(Node, Extract.getNode());
SelectCode(Extract.getNode());
return true;
}
// Try to select an (X >> C1) & C2 node as BEXTR/BEXTRI.
// Returns the freshly created machine node on success, or nullptr when the
// node does not fit the pattern or the transform is not considered profitable.
MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
  MVT NVT = Node->getSimpleValueType(0);
  SDLoc dl(Node);

  SDValue ShiftedVal = Node->getOperand(0);
  SDValue MaskOp = Node->getOperand(1);

  // With TBM the control can be an immediate. With only BMI the control has to
  // live in a register, which is worthwhile only if BEXTR itself is fast.
  // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
  // hoisting the move immediate would make it worthwhile with a less optimal
  // BEXTR?
  const bool HasTBM = Subtarget->hasTBM();
  if (!HasTBM && !(Subtarget->hasBMI() && Subtarget->hasFastBEXTR()))
    return nullptr;

  // Operand 0 has to be a right shift ...
  const unsigned ShiftOpc = ShiftedVal->getOpcode();
  if (ShiftOpc != ISD::SRL && ShiftOpc != ISD::SRA)
    return nullptr;

  // ... that nobody else uses.
  if (!ShiftedVal->hasOneUse())
    return nullptr;

  // Only 32-bit and 64-bit results are handled.
  const bool Is64Bit = NVT == MVT::i64;
  if (NVT != MVT::i32 && !Is64Bit)
    return nullptr;

  // Both the AND mask and the shift amount have to be constants.
  auto *MaskCst = dyn_cast<ConstantSDNode>(MaskOp);
  auto *ShiftCst = dyn_cast<ConstantSDNode>(ShiftedVal->getOperand(1));
  if (!MaskCst || !ShiftCst)
    return nullptr;

  // The AND constant has to be a contiguous low-bit mask.
  const uint64_t Mask = MaskCst->getZExtValue();
  if (!isMask_64(Mask))
    return nullptr;

  const uint64_t Shift = ShiftCst->getZExtValue();
  const uint64_t MaskSize = countPopulation(Mask);

  // Leave (X >> 8) & 0xff alone; it can be handled by extracting AH.
  // TODO: If we are able to fold a load, BEXTR might still be better than AH.
  if (Shift == 8 && MaskSize == 8)
    return nullptr;

  // The extracted field must lie entirely inside the original value, so that
  // no shifted-in bits are observed.
  if (Shift + MaskSize > NVT.getSizeInBits())
    return nullptr;

  // Control word: shift amount in bits 7..0, field length in bits 15..8.
  SDValue Control =
      CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);

  unsigned ROpc;
  unsigned MOpc;
  if (HasTBM) {
    ROpc = Is64Bit ? X86::BEXTRI64ri : X86::BEXTRI32ri;
    MOpc = Is64Bit ? X86::BEXTRI64mi : X86::BEXTRI32mi;
  } else {
    // BMI requires the immediate to be placed in a register.
    ROpc = Is64Bit ? X86::BEXTR64rr : X86::BEXTR32rr;
    MOpc = Is64Bit ? X86::BEXTR64rm : X86::BEXTR32rm;
    unsigned MovOpc = Is64Bit ? X86::MOV32ri64 : X86::MOV32ri;
    Control = SDValue(CurDAG->getMachineNode(MovOpc, dl, NVT, Control), 0);
  }

  SDValue Input = ShiftedVal->getOperand(0);
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;

  // Register form when the shifted value cannot be folded as a load.
  if (!tryFoldLoad(Node, ShiftedVal.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3,
                   Tmp4))
    return CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);

  // Memory form: address operands, control, then the load's chain.
  SDValue Ops[] = {Tmp0, Tmp1, Tmp2,    Tmp3,
                   Tmp4, Control, Input.getOperand(0)};
  SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
  MachineSDNode *NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
  // Redirect users of the load's chain to the new node's chain result.
  ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
  // Carry the load's memory operand over to the new node.
  CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
  return NewNode;
}
// Emit a PCMPISTR(I/M) instruction.
// ROpc is used for the register form; MOpc is used when MayFoldLoad is set and
// operand 1 of Node can be folded as a load. Operand 2 of Node must be a
// constant and is re-emitted as a target constant.
MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
                                             bool MayFoldLoad, const SDLoc &dl,
                                             MVT VT, SDNode *Node) {
  SDValue LHS = Node->getOperand(0);
  SDValue RHS = Node->getOperand(1);

  // Turn the immediate operand into a target constant of the same type.
  SDValue Imm = Node->getOperand(2);
  const ConstantInt *ImmVal = cast<ConstantSDNode>(Imm)->getConstantIntValue();
  Imm = CurDAG->getTargetConstant(*ImmVal, SDLoc(Node), Imm.getValueType());

  // Try to fold a load. No need to check alignment.
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (!MayFoldLoad || !tryFoldLoad(Node, RHS, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
    // Register form.
    SDValue RegOps[] = {LHS, RHS, Imm};
    SDVTList RegVTs = CurDAG->getVTList(VT, MVT::i32);
    return CurDAG->getMachineNode(ROpc, dl, RegVTs, RegOps);
  }

  // Memory form: the folded address replaces RHS and the load's chain is
  // appended as the last operand.
  SDValue MemOps[] = {LHS,  Tmp0, Tmp1, Tmp2,
                      Tmp3, Tmp4, Imm,  RHS.getOperand(0)};
  SDVTList MemVTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
  MachineSDNode *MemNode = CurDAG->getMachineNode(MOpc, dl, MemVTs, MemOps);
  // Redirect users of the load's chain to the new node's chain result.
  ReplaceUses(RHS.getValue(1), SDValue(MemNode, 2));
  // Carry the load's memory operand over to the new node.
  CurDAG->setNodeMemRefs(MemNode, {cast<LoadSDNode>(RHS)->getMemOperand()});
  return MemNode;
}
// Emit a PCMPESTR(I/M) instruction. InFlag is consumed as the incoming glue
// and overwritten with the new node's glue result, in case the caller needs to
// emit a second instruction after this one. This is needed since there are two
// copyToReg nodes glued before this and the glue has to be continued through.
MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
                                             bool MayFoldLoad, const SDLoc &dl,
                                             MVT VT, SDNode *Node,
                                             SDValue &InFlag) {
  SDValue Src0 = Node->getOperand(0);
  SDValue Src1 = Node->getOperand(2);

  // Turn the immediate operand into a target constant of the same type.
  SDValue Imm = Node->getOperand(4);
  const ConstantInt *ImmVal = cast<ConstantSDNode>(Imm)->getConstantIntValue();
  Imm = CurDAG->getTargetConstant(*ImmVal, SDLoc(Node), Imm.getValueType());

  // Try to fold a load. No need to check alignment.
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (!MayFoldLoad || !tryFoldLoad(Node, Src1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
    // Register form; the glue is result #2.
    SDValue RegOps[] = {Src0, Src1, Imm, InFlag};
    SDVTList RegVTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
    MachineSDNode *RegNode = CurDAG->getMachineNode(ROpc, dl, RegVTs, RegOps);
    InFlag = SDValue(RegNode, 2);
    return RegNode;
  }

  // Memory form: folded address, immediate, the load's chain, incoming glue.
  SDValue MemOps[] = {Src0, Tmp0, Tmp1, Tmp2,
                      Tmp3, Tmp4, Imm,  Src1.getOperand(0),
                      InFlag};
  SDVTList MemVTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
  MachineSDNode *MemNode = CurDAG->getMachineNode(MOpc, dl, MemVTs, MemOps);
  // Here the glue is result #3 because the chain occupies result #2.
  InFlag = SDValue(MemNode, 3);
  // Redirect users of the load's chain to the new node's chain result.
  ReplaceUses(Src1.getValue(1), SDValue(MemNode, 2));
  // Carry the load's memory operand over to the new node.
  CurDAG->setNodeMemRefs(MemNode, {cast<LoadSDNode>(Src1)->getMemOperand()});
  return MemNode;
}
// Simplify the amount operand of a scalar shift when it is computed as
// X +/- N or N - X with N a multiple of the shift width. Returns true if the
// node was rewritten (and selected, or replaced by an existing equivalent
// node); returns false if the shift amount does not have one of those shapes.
bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
  EVT VT = N->getValueType(0);

  // Only handle scalar shifts.
  if (VT.isVector())
    return false;

  // Narrower shifts only mask to 5 bits in hardware.
  const unsigned Size = VT == MVT::i64 ? 64 : 32;

  SDValue OrigShiftAmt = N->getOperand(1);
  SDLoc DL(N);

  // Look through a truncate of the shift amount.
  SDValue ShiftAmt = OrigShiftAmt;
  if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
    ShiftAmt = ShiftAmt->getOperand(0);

  // This function is called after X86DAGToDAGISel::matchBitExtract(),
  // so we are not afraid that we might mess up BZHI/BEXTR pattern.
  const unsigned AmtOpc = ShiftAmt->getOpcode();
  if (AmtOpc != ISD::ADD && AmtOpc != ISD::SUB)
    return false;

  SDValue LHS = ShiftAmt->getOperand(0);
  SDValue RHS = ShiftAmt->getOperand(1);
  auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
  auto *RHSC = dyn_cast<ConstantSDNode>(RHS);

  SDValue NewShiftAmt;
  if (RHSC && RHSC->getZExtValue() % Size == 0) {
    // Shifting by X+/-N where N == 0 mod Size: just shift by X and avoid the
    // ADD/SUB.
    NewShiftAmt = LHS;
  } else if (AmtOpc == ISD::SUB && LHSC && LHSC->getZExtValue() != 0 &&
             LHSC->getZExtValue() % Size == 0) {
    // Shifting by N-X where N == 0 mod Size: shift by -X instead, so that a
    // NEG is generated rather than a SUB of a constant.
    // TODO: This isn't guaranteed to replace the sub if there is a logic cone
    // that uses it that's not a shift.
    EVT SubVT = ShiftAmt.getValueType();
    SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
    SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, RHS);
    NewShiftAmt = Neg;

    // Put the new nodes into a valid topological position so they can be
    // selected independently.
    insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
    insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
  } else {
    return false;
  }

  if (NewShiftAmt.getValueType() != MVT::i8) {
    // The shift amount has to be truncated to i8.
    NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
    // Keep the topological ordering valid.
    insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
  }

  // Add a mask so the shift amount stays legal. Isel patterns are expected to
  // remove it again.
  SDValue AmtMask = CurDAG->getConstant(Size - 1, DL, MVT::i8);
  NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt, AmtMask);
  // Keep the topological ordering valid.
  insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);

  SDNode *UpdatedNode =
      CurDAG->UpdateNodeOperands(N, N->getOperand(0), NewShiftAmt);
  if (UpdatedNode != N) {
    // An identical node already existed: replace ourselves with it and let it
    // be selected after its other users.
    ReplaceNode(N, UpdatedNode);
    return true;
  }

  // Delete the original shift amount if it became dead so it is not run
  // through isel.
  if (OrigShiftAmt.getNode()->use_empty())
    CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());

  // With the shift amount optimized, defer to normal isel to get load folding
  // and legacy vs BMI2 selection without repeating it here.
  SelectCode(N);
  return true;
}
// For (x << C1) op C2, check whether rewriting to (x op (C2 >> C1)) << C1
// gives C2 a smaller encoding, and perform the rewrite if so. Returns true if
// N was replaced and selected, false if it was left alone.
bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
  MVT NVT = N->getSimpleValueType(0);
  unsigned Opcode = N->getOpcode();
  SDLoc dl(N);

  SDValue Shift = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // The right-hand side must be a constant.
  auto *Cst = dyn_cast<ConstantSDNode>(N1);
  if (!Cst)
    return false;

  const int64_t Val = Cst->getSExtValue();

  // If an any_extend feeds the operation, look through it for a shift behind
  // it, but only if the constant does not use the extended bits.
  // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
  bool FoundAnyExtend = false;
  if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
      Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
      isUInt<32>(Val)) {
    FoundAnyExtend = true;
    Shift = Shift.getOperand(0);
  }

  if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
    return false;

  // i8 is unshrinkable, i16 should be promoted to i32.
  if (NVT != MVT::i32 && NVT != MVT::i64)
    return false;

  // The shift amount must be a constant as well.
  auto *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
  if (!ShlCst)
    return false;

  const uint64_t ShAmt = ShlCst->getZExtValue();
  const bool IsAnd = Opcode == ISD::AND;

  // Dropping set low bits of the constant would change the result of OR and
  // XOR; AND is unaffected.
  const uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
  if (!IsAnd && (Val & RemovedBitsMask) != 0)
    return false;

  // Decide whether the shifted-down constant encodes smaller, and report the
  // constant to use through ShiftedVal.
  // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
  auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
    const int64_t LogicalShifted = (uint64_t)Val >> ShAmt;
    const int64_t ArithShifted = Val >> ShAmt;

    // A 64-bit constant that only becomes a zero-extended 32-bit immediate
    // after shifting.
    const bool ZExt32Win = NVT == MVT::i64 && !isUInt<32>(Val) &&
                           isUInt<32>(LogicalShifted);
    // A constant that only fits a sign-extended 8/32-bit immediate after
    // shifting.
    const bool SExtWin = (!isInt<8>(Val) && isInt<8>(ArithShifted)) ||
                         (!isInt<32>(Val) && isInt<32>(ArithShifted));

    if (IsAnd) {
      // AND32ri is the same as AND64ri32 with zext imm, so try that before the
      // sign-extended forms. Also swap the order when the AND can become
      // MOVZX.
      if (ZExt32Win || LogicalShifted == UINT8_MAX ||
          LogicalShifted == UINT16_MAX) {
        ShiftedVal = LogicalShifted;
        return true;
      }
      ShiftedVal = ArithShifted;
      return SExtWin;
    }

    if (SExtWin) {
      ShiftedVal = ArithShifted;
      return true;
    }

    // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
    ShiftedVal = LogicalShifted;
    return ZExt32Win;
  };

  int64_t ShiftedVal;
  if (!CanShrinkImmediate(ShiftedVal))
    return false;

  // Reordering gives a smaller immediate, but the original immediate might
  // have allowed the AND to become MOVZX. This check is done last so that
  // MaskedValueIsZero is called as late as possible.
  if (IsAnd) {
    // Find the smallest zext this could possibly be.
    unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
    ZExtWidth = PowerOf2Ceil(std::max(ZExtWidth, 8U));

    // Figure out which bits need to be zero to achieve that mask.
    APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(), ZExtWidth);
    NeededMask &= ~Cst->getAPIntValue();

    if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
      return false;
  }

  // Build (X op ShiftedVal) << ShAmt, re-creating the any_extend if needed.
  SDValue X = Shift.getOperand(0);
  if (FoundAnyExtend) {
    SDValue ExtX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
    insertDAGNode(*CurDAG, SDValue(N, 0), ExtX);
    X = ExtX;
  }

  SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT);
  insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);

  SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
  insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);

  SDValue NewSHL =
      CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp, Shift.getOperand(1));
  ReplaceNode(N, NewSHL.getNode());
  SelectCode(NewSHL.getNode());
  return true;
}
/// When the high bits of an 'and' operand are known to be zero, try turning
/// on the matching high bits of the constant mask so that it becomes a small,
/// sign-extended negative immediate instead of a large positive one, which
/// encodes smaller. This undoes a transform in SimplifyDemandedBits that
/// shrinks mask constants by clearing bits. If the widened mask turns out to
/// be -1 the 'and' is redundant and is replaced by its variable operand.
/// Returns 'true' if the node is replaced.
bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
  // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
  // have immediate operands.
  MVT VT = And->getSimpleValueType(0);
  const bool Is64Bit = VT == MVT::i64;
  if (VT != MVT::i32 && !Is64Bit)
    return false;

  auto *MaskCst = dyn_cast<ConstantSDNode>(And->getOperand(1));
  if (!MaskCst)
    return false;

  // A mask that is already negative cannot shrink any further. If the upper
  // 32 bits of a 64-bit mask are all zeros, special isel patterns use a 32-bit
  // and (relying on the implicit zeroing of 32-bit ops), so the low 32 bits
  // being negative counts as well.
  APInt MaskVal = MaskCst->getAPIntValue();
  unsigned LeadingZeros = MaskVal.countLeadingZeros();
  if (LeadingZeros == 0 || (Is64Bit && LeadingZeros == 32))
    return false;

  // Don't extend into the upper 32 bits of a 64 bit mask.
  if (Is64Bit && LeadingZeros >= 32) {
    LeadingZeros -= 32;
    MaskVal = MaskVal.trunc(32);
  }

  SDValue Src = And->getOperand(0);
  APInt HighZeros =
      APInt::getHighBitsSet(MaskVal.getBitWidth(), LeadingZeros);
  APInt NegMaskVal = MaskVal | HighZeros;

  // Only change the constant when it is known to be a win: give up if the
  // negative constant would not allow a smaller encoding.
  const unsigned NegWidth = NegMaskVal.getMinSignedBits();
  if (NegWidth > 32 || (NegWidth > 8 && MaskVal.getMinSignedBits() <= 32))
    return false;

  // Widen the masks back to 64 bits if they were truncated above.
  if (Is64Bit && MaskVal.getBitWidth() < 64) {
    NegMaskVal = NegMaskVal.zext(64);
    HighZeros = HighZeros.zext(64);
  }

  // The new, negative mask is only equivalent if the variable operand is
  // known to be zero in all the bits that were just turned on.
  if (!CurDAG->MaskedValueIsZero(Src, HighZeros))
    return false;

  // A mask of -1 means the 'and' is an unnecessary instruction that escaped
  // earlier analysis.
  if (NegMaskVal.isAllOnesValue()) {
    ReplaceNode(And, Src.getNode());
    return true;
  }

  // A negative mask allows a smaller encoding. Create a new 'and' node.
  SDValue NegMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
  SDValue ShrunkAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, Src, NegMask);
  ReplaceNode(And, ShrunkAnd.getNode());
  SelectCode(ShrunkAnd.getNode());
  return true;
}
// Return the X86 machine opcode for a VPTESTM/VPTESTNM on vector type TestVT.
//  - IsTestN picks the VPTESTNM opcode instead of the VPTESTM one.
//  - Masked picks the 'k'-suffixed opcodes.
//  - FoldedLoad picks the 'rm' forms and is checked before FoldedBCast.
//  - FoldedBCast picks the 'rmb' forms; only vectors with 32-bit or 64-bit
//    elements are listed for these.
//  - With neither folding flag set, the 'rr' forms are returned.
// Any TestVT not listed for the selected form hits llvm_unreachable.
static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
bool FoldedBCast, bool Masked) {
if (Masked) {
// Masked, folded full-vector load: 'rmk' forms.
if (FoldedLoad) {
switch (TestVT.SimpleTy) {
default: llvm_unreachable("Unexpected VT!");
case MVT::v16i8:
return IsTestN ? X86::VPTESTNMBZ128rmk : X86::VPTESTMBZ128rmk;
case MVT::v8i16:
return IsTestN ? X86::VPTESTNMWZ128rmk : X86::VPTESTMWZ128rmk;
case MVT::v4i32:
return IsTestN ? X86::VPTESTNMDZ128rmk : X86::VPTESTMDZ128rmk;
case MVT::v2i64:
return IsTestN ? X86::VPTESTNMQZ128rmk : X86::VPTESTMQZ128rmk;
case MVT::v32i8:
return IsTestN ? X86::VPTESTNMBZ256rmk : X86::VPTESTMBZ256rmk;
case MVT::v16i16:
return IsTestN ? X86::VPTESTNMWZ256rmk : X86::VPTESTMWZ256rmk;
case MVT::v8i32:
return IsTestN ? X86::VPTESTNMDZ256rmk : X86::VPTESTMDZ256rmk;
case MVT::v4i64:
return IsTestN ? X86::VPTESTNMQZ256rmk : X86::VPTESTMQZ256rmk;
case MVT::v64i8:
return IsTestN ? X86::VPTESTNMBZrmk : X86::VPTESTMBZrmk;
case MVT::v32i16:
return IsTestN ? X86::VPTESTNMWZrmk : X86::VPTESTMWZrmk;
case MVT::v16i32:
return IsTestN ? X86::VPTESTNMDZrmk : X86::VPTESTMDZrmk;
case MVT::v8i64:
return IsTestN ? X86::VPTESTNMQZrmk : X86::VPTESTMQZrmk;
}
}
// Masked, folded broadcast load: 'rmbk' forms.
if (FoldedBCast) {
switch (TestVT.SimpleTy) {
default: llvm_unreachable("Unexpected VT!");
case MVT::v4i32:
return IsTestN ? X86::VPTESTNMDZ128rmbk : X86::VPTESTMDZ128rmbk;
case MVT::v2i64:
return IsTestN ? X86::VPTESTNMQZ128rmbk : X86::VPTESTMQZ128rmbk;
case MVT::v8i32:
return IsTestN ? X86::VPTESTNMDZ256rmbk : X86::VPTESTMDZ256rmbk;
case MVT::v4i64:
return IsTestN ? X86::VPTESTNMQZ256rmbk : X86::VPTESTMQZ256rmbk;
case MVT::v16i32:
return IsTestN ? X86::VPTESTNMDZrmbk : X86::VPTESTMDZrmbk;
case MVT::v8i64:
return IsTestN ? X86::VPTESTNMQZrmbk : X86::VPTESTMQZrmbk;
}
}
// Masked, register-register: 'rrk' forms.
switch (TestVT.SimpleTy) {
default: llvm_unreachable("Unexpected VT!");
case MVT::v16i8:
return IsTestN ? X86::VPTESTNMBZ128rrk : X86::VPTESTMBZ128rrk;
case MVT::v8i16:
return IsTestN ? X86::VPTESTNMWZ128rrk : X86::VPTESTMWZ128rrk;
case MVT::v4i32:
return IsTestN ? X86::VPTESTNMDZ128rrk : X86::VPTESTMDZ128rrk;
case MVT::v2i64:
return IsTestN ? X86::VPTESTNMQZ128rrk : X86::VPTESTMQZ128rrk;
case MVT::v32i8:
return IsTestN ? X86::VPTESTNMBZ256rrk : X86::VPTESTMBZ256rrk;
case MVT::v16i16:
return IsTestN ? X86::VPTESTNMWZ256rrk : X86::VPTESTMWZ256rrk;
case MVT::v8i32:
return IsTestN ? X86::VPTESTNMDZ256rrk : X86::VPTESTMDZ256rrk;
case MVT::v4i64:
return IsTestN ? X86::VPTESTNMQZ256rrk : X86::VPTESTMQZ256rrk;
case MVT::v64i8:
return IsTestN ? X86::VPTESTNMBZrrk : X86::VPTESTMBZrrk;
case MVT::v32i16:
return IsTestN ? X86::VPTESTNMWZrrk : X86::VPTESTMWZrrk;
case MVT::v16i32:
return IsTestN ? X86::VPTESTNMDZrrk : X86::VPTESTMDZrrk;
case MVT::v8i64:
return IsTestN ? X86::VPTESTNMQZrrk : X86::VPTESTMQZrrk;
}
}
// Unmasked, folded full-vector load: 'rm' forms.
if (FoldedLoad) {
switch (TestVT.SimpleTy) {
default: llvm_unreachable("Unexpected VT!");
case MVT::v16i8:
return IsTestN ? X86::VPTESTNMBZ128rm : X86::VPTESTMBZ128rm;
case MVT::v8i16:
return IsTestN ? X86::VPTESTNMWZ128rm : X86::VPTESTMWZ128rm;
case MVT::v4i32:
return IsTestN ? X86::VPTESTNMDZ128rm : X86::VPTESTMDZ128rm;
case MVT::v2i64:
return IsTestN ? X86::VPTESTNMQZ128rm : X86::VPTESTMQZ128rm;
case MVT::v32i8:
return IsTestN ? X86::VPTESTNMBZ256rm : X86::VPTESTMBZ256rm;
case MVT::v16i16:
return IsTestN ? X86::VPTESTNMWZ256rm : X86::VPTESTMWZ256rm;
case MVT::v8i32:
return IsTestN ? X86::VPTESTNMDZ256rm : X86::VPTESTMDZ256rm;
case MVT::v4i64:
return IsTestN ? X86::VPTESTNMQZ256rm : X86::VPTESTMQZ256rm;
case MVT::v64i8:
return IsTestN ? X86::VPTESTNMBZrm : X86::VPTESTMBZrm;
case MVT::v32i16:
return IsTestN ? X86::VPTESTNMWZrm : X86::VPTESTMWZrm;
case MVT::v16i32:
return IsTestN ? X86::VPTESTNMDZrm : X86::VPTESTMDZrm;
case MVT::v8i64:
return IsTestN ? X86::VPTESTNMQZrm : X86::VPTESTMQZrm;
}
}
// Unmasked, folded broadcast load: 'rmb' forms.
if (FoldedBCast) {
switch (TestVT.SimpleTy) {
default: llvm_unreachable("Unexpected VT!");
case MVT::v4i32:
return IsTestN ? X86::VPTESTNMDZ128rmb : X86::VPTESTMDZ128rmb;
case MVT::v2i64:
return IsTestN ? X86::VPTESTNMQZ128rmb : X86::VPTESTMQZ128rmb;
case MVT::v8i32:
return IsTestN ? X86::VPTESTNMDZ256rmb : X86::VPTESTMDZ256rmb;
case MVT::v4i64:
return IsTestN ? X86::VPTESTNMQZ256rmb : X86::VPTESTMQZ256rmb;
case MVT::v16i32:
return IsTestN ? X86::VPTESTNMDZrmb : X86::VPTESTMDZrmb;
case MVT::v8i64:
return IsTestN ? X86::VPTESTNMQZrmb : X86::VPTESTMQZrmb;
}
}
// Unmasked, register-register: 'rr' forms.
switch (TestVT.SimpleTy) {
default: llvm_unreachable("Unexpected VT!");
case MVT::v16i8:
return IsTestN ? X86::VPTESTNMBZ128rr : X86::VPTESTMBZ128rr;
case MVT::v8i16:
return IsTestN ? X86::VPTESTNMWZ128rr : X86::VPTESTMWZ128rr;
case MVT::v4i32:
return IsTestN ? X86::VPTESTNMDZ128rr : X86::VPTESTMDZ128rr;
case MVT::v2i64:
return IsTestN ? X86::VPTESTNMQZ128rr : X86::VPTESTMQZ128rr;
case MVT::v32i8:
return IsTestN ? X86::VPTESTNMBZ256rr : X86::VPTESTMBZ256rr;
case MVT::v16i16:
return IsTestN ? X86::VPTESTNMWZ256rr : X86::VPTESTMWZ256rr;
case MVT::v8i32:
return IsTestN ? X86::VPTESTNMDZ256rr : X86::VPTESTMDZ256rr;
case MVT::v4i64:
return IsTestN ? X86::VPTESTNMQZ256rr : X86::VPTESTMQZ256rr;
case MVT::v64i8:
return IsTestN ? X86::VPTESTNMBZrr : X86::VPTESTMBZrr;
case MVT::v32i16:
return IsTestN ? X86::VPTESTNMWZrr : X86::VPTESTMWZrr;
case MVT::v16i32:
return IsTestN ? X86::VPTESTNMDZrr : X86::VPTESTMDZrr;
case MVT::v8i64:
return IsTestN ? X86::VPTESTNMQZrr : X86::VPTESTMQZrr;
}
}
// Try to create VPTESTM instruction. If InMask is not null, it will be used
// to form a masked operation.
bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
SDValue InMask) {
assert(Subtarget->hasAVX512() && "Expected AVX512!");
assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
"Unexpected VT!");
// Look for equal and not equal compares.
ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
if (CC != ISD::SETEQ && CC != ISD::SETNE)
return false;
// See if we're comparing against zero. This should have been canonicalized
// to RHS during lowering.
if (!ISD::isBuildVectorAllZeros(Setcc.getOperand(1).getNode()))
return false;
SDValue N0 = Setcc.getOperand(0);
MVT CmpVT = N0.getSimpleValueType();
MVT CmpSVT = CmpVT.getVectorElementType();
// Start with both operands the same. We'll try to refine this.
SDValue Src0 = N0;
SDValue Src1 = N0;
{
// Look through single use bitcasts.
SDValue N0Temp = N0;
if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
N0Temp = N0.getOperand(0);
// Look for single use AND.
if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
Src0 = N0Temp.getOperand(0);
Src1 = N0Temp.getOperand(1);
}
}
// Without VLX we need to widen the load.
bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
// We can only fold loads if the sources are unique.
bool CanFoldLoads = Src0 != Src1;
// Try to fold loads unless we need to widen.
bool FoldedLoad = false;
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Load;
if (!Widen && CanFoldLoads) {
Load = Src1;
FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2, Tmp3,
Tmp4);
if (!FoldedLoad) {
// And is computative.
Load = Src0;
FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2,
Tmp3, Tmp4);
if (FoldedLoad)
std::swap(Src0, Src1);
}
}
auto findBroadcastedOp = [](SDValue Src, MVT CmpSVT, SDNode *&Parent) {
// Look through single use bitcasts.
if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse())
Src = Src.getOperand(0);
if (Src.getOpcode() == X86ISD::VBROADCAST && Src.hasOneUse()) {
Parent = Src.getNode();
Src = Src.getOperand(0);
if (Src.getSimpleValueType() == CmpSVT)
return Src;
}
return SDValue();
};
// If we didn't fold a load, try to match broadcast. No widening limitation
// for this. But only 32 and 64 bit types are supported.
bool FoldedBCast = false;
if (!FoldedLoad && CanFoldLoads &&
(CmpSVT == MVT::i32 || CmpSVT == MVT::i64)) {
SDNode *ParentNode = nullptr;
if ((Load = findBroadcastedOp(Src1, CmpSVT, ParentNode))) {
FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0,
Tmp1, Tmp2, Tmp3, Tmp4);
}
// Try the other operand.
if (!FoldedBCast) {
if ((Load = findBroadcastedOp(Src0, CmpSVT, ParentNode))) {
FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0,
Tmp1, Tmp2, Tmp3, Tmp4);
if (FoldedBCast)
std::swap(Src0, Src1);
}
}
}
auto getMaskRC = [](MVT MaskVT) {
switch (MaskVT.SimpleTy) {
default: llvm_unreachable("Unexpected VT!");
case MVT::v2i1: return X86::VK2RegClassID;
case MVT::v4i1: return X86::VK4RegClassID;
case MVT::v8i1: return X86::VK8RegClassID;
case MVT::v16i1: return X86::VK16RegClassID;
case MVT::v32i1: return X86::VK32RegClassID;
case MVT::v64i1: return X86::VK64RegClassID;
}
};
bool IsMasked = InMask.getNode() != nullptr;
SDLoc dl(Root);
MVT ResVT = Setcc.getSimpleValueType();
MVT MaskVT = ResVT;
if (Widen) {
// Widen the inputs using insert_subreg or copy_to_regclass.
unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
CmpVT), 0);
Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);
assert(!FoldedLoad && "Shouldn't have folded the load");
if (!FoldedBCast)
Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);
if (IsMasked) {
// Widen the mask.
unsigned RegClass = getMaskRC(MaskVT);
SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
dl, MaskVT, InMask, RC), 0);
}
}
bool IsTestN = CC == ISD::SETEQ;
unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
IsMasked);
MachineSDNode *CNode;
if (FoldedLoad || FoldedBCast) {
SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
if (IsMasked) {
SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
Load.getOperand(0) };
CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
} else {
SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
Load.getOperand(0) };
CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
}
// Update the chain.
ReplaceUses(Load.getValue(1), SDValue(CNode, 1));
// Record the mem-refs
CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(Load)->getMemOperand()});
} else {
if (IsMasked)
CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
else
CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
}
// If we widened, we need to shrink the mask VT.
if (Widen) {
unsigned RegClass = getMaskRC(ResVT);
SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
dl, ResVT, SDValue(CNode, 0), RC);
}
ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
CurDAG->RemoveDeadNode(Root);
return true;
}
/// Select machine instructions for \p Node.
///
/// Opcodes that need hand-written handling are matched in the switch below;
/// each case either fully replaces/selects the node and returns, or breaks
/// out so that the node is handed to SelectCode() at the end of the function.
void X86DAGToDAGISel::Select(SDNode *Node) {
MVT NVT = Node->getSimpleValueType(0);
unsigned Opcode = Node->getOpcode();
SDLoc dl(Node);
if (Node->isMachineOpcode()) {
LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
Node->setNodeId(-1);
return; // Already selected.
}
switch (Opcode) {
default: break;
case ISD::INTRINSIC_VOID: {
unsigned IntNo = Node->getConstantOperandVal(1);
switch (IntNo) {
default: break;
case Intrinsic::x86_sse3_monitor:
case Intrinsic::x86_monitorx:
case Intrinsic::x86_clzero: {
bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;
// Opc stays 0 when the subtarget lacks the required feature; in that case
// nothing is emitted here and the node is left for SelectCode().
unsigned Opc = 0;
switch (IntNo) {
case Intrinsic::x86_sse3_monitor:
if (!Subtarget->hasSSE3())
break;
Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
break;
case Intrinsic::x86_monitorx:
if (!Subtarget->hasMWAITX())
break;
Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
break;
case Intrinsic::x86_clzero:
if (!Subtarget->hasCLZERO())
break;
Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
break;
}
if (Opc) {
// The pointer operand is passed in RAX/EAX, chosen by its width.
unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
Node->getOperand(2), SDValue());
SDValue InFlag = Chain.getValue(1);
if (IntNo == Intrinsic::x86_sse3_monitor ||
IntNo == Intrinsic::x86_monitorx) {
// Copy the other two operands to ECX and EDX.
Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
InFlag);
InFlag = Chain.getValue(1);
Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
InFlag);
InFlag = Chain.getValue(1);
}
MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
{ Chain, InFlag});
ReplaceNode(Node, CNode);
return;
}
}
}
break;
}
case ISD::BRIND: {
if (Subtarget->isTargetNaCl())
// NaCl has its own pass where jmp %r32 are converted to jmp %r64. We
// leave the instruction alone.
break;
if (Subtarget->isTarget64BitILP32()) {
// Converts a 32-bit register to a 64-bit, zero-extended version of
// it. This is needed because x86-64 can do many things, but jmp %r32
// ain't one of them.
const SDValue &Target = Node->getOperand(1);
assert(Target.getSimpleValueType() == llvm::MVT::i32);
SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, EVT(MVT::i64));
SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other,
Node->getOperand(0), ZextTarget);
ReplaceNode(Node, Brind.getNode());
// Both freshly created nodes still have to be selected.
SelectCode(ZextTarget.getNode());
SelectCode(Brind.getNode());
return;
}
break;
}
case X86ISD::GlobalBaseReg:
ReplaceNode(Node, getGlobalBaseReg());
return;
case ISD::BITCAST:
// Just drop all 128/256/512-bit bitcasts.
if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
NVT == MVT::f128) {
ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
CurDAG->RemoveDeadNode(Node);
return;
}
break;
case ISD::VSELECT: {
// Replace VSELECT with non-mask conditions with BLENDV.
if (Node->getOperand(0).getValueType().getVectorElementType() == MVT::i1)
break;
assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
SDValue Blendv = CurDAG->getNode(
X86ISD::BLENDV, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
Node->getOperand(1), Node->getOperand(2));
ReplaceNode(Node, Blendv.getNode());
SelectCode(Blendv.getNode());
// We already called ReplaceUses.
return;
}
case ISD::SRL:
if (matchBitExtract(Node))
return;
LLVM_FALLTHROUGH;
case ISD::SRA:
case ISD::SHL:
if (tryShiftAmountMod(Node))
return;
break;
case ISD::AND:
if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
// Try to form a masked VPTESTM. Operands can be in either order.
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
tryVPTESTM(Node, N0, N1))
return;
if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
tryVPTESTM(Node, N1, N0))
return;
}
if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
CurDAG->RemoveDeadNode(Node);
return;
}
if (matchBitExtract(Node))
return;
if (AndImmShrink && shrinkAndImmediate(Node))
return;
LLVM_FALLTHROUGH;
case ISD::OR:
case ISD::XOR:
if (tryShrinkShlLogicImm(Node))
return;
LLVM_FALLTHROUGH;
case ISD::ADD:
case ISD::SUB: {
// Try to avoid folding immediates with multiple uses for optsize.
// This code tries to select to register form directly to avoid going
// through the isel table which might fold the immediate. We can't change
// the patterns on the add/sub/and/or/xor with immediate patterns in the
// tablegen files to check immediate use count without making the patterns
// unavailable to the fast-isel table.
if (!OptForSize)
break;
// Only handle i8/i16/i32/i64.
if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
break;
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
if (!Cst)
break;
int64_t Val = Cst->getSExtValue();
// Make sure it's an immediate that is considered foldable.
// FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
if (!isInt<8>(Val) && !isInt<32>(Val))
break;
// Check if we should avoid folding this immediate.
if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
break;
// We should not fold the immediate. So we need a register form instead.
// ROpc is used when both operands stay in registers, MOpc when the load
// feeding operand 0 is folded into the instruction.
unsigned ROpc, MOpc;
switch (NVT.SimpleTy) {
default: llvm_unreachable("Unexpected VT!");
case MVT::i8:
switch (Opcode) {
default: llvm_unreachable("Unexpected opcode!");
case ISD::ADD: ROpc = X86::ADD8rr; MOpc = X86::ADD8rm; break;
case ISD::SUB: ROpc = X86::SUB8rr; MOpc = X86::SUB8rm; break;
case ISD::AND: ROpc = X86::AND8rr; MOpc = X86::AND8rm; break;
case ISD::OR: ROpc = X86::OR8rr; MOpc = X86::OR8rm; break;
case ISD::XOR: ROpc = X86::XOR8rr; MOpc = X86::XOR8rm; break;
}
break;
case MVT::i16:
switch (Opcode) {
default: llvm_unreachable("Unexpected opcode!");
case ISD::ADD: ROpc = X86::ADD16rr; MOpc = X86::ADD16rm; break;
case ISD::SUB: ROpc = X86::SUB16rr; MOpc = X86::SUB16rm; break;
case ISD::AND: ROpc = X86::AND16rr; MOpc = X86::AND16rm; break;
case ISD::OR: ROpc = X86::OR16rr; MOpc = X86::OR16rm; break;
case ISD::XOR: ROpc = X86::XOR16rr; MOpc = X86::XOR16rm; break;
}
break;
case MVT::i32:
switch (Opcode) {
default: llvm_unreachable("Unexpected opcode!");
case ISD::ADD: ROpc = X86::ADD32rr; MOpc = X86::ADD32rm; break;
case ISD::SUB: ROpc = X86::SUB32rr; MOpc = X86::SUB32rm; break;
case ISD::AND: ROpc = X86::AND32rr; MOpc = X86::AND32rm; break;
case ISD::OR: ROpc = X86::OR32rr; MOpc = X86::OR32rm; break;
case ISD::XOR: ROpc = X86::XOR32rr; MOpc = X86::XOR32rm; break;
}
break;
case MVT::i64:
switch (Opcode) {
default: llvm_unreachable("Unexpected opcode!");
case ISD::ADD: ROpc = X86::ADD64rr; MOpc = X86::ADD64rm; break;
case ISD::SUB: ROpc = X86::SUB64rr; MOpc = X86::SUB64rm; break;
case ISD::AND: ROpc = X86::AND64rr; MOpc = X86::AND64rm; break;
case ISD::OR: ROpc = X86::OR64rr; MOpc = X86::OR64rm; break;
case ISD::XOR: ROpc = X86::XOR64rr; MOpc = X86::XOR64rm; break;
}
break;
}
// Ok this is a AND/OR/XOR/ADD/SUB with constant.
// If this is not a subtract, we can still try to fold a load.
if (Opcode != ISD::SUB) {
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
// The constant N1 becomes the register operand; the load N0 supplies the
// memory operand and its incoming chain.
SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
// Update the chain.
ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
// Record the mem-refs
CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
CurDAG->RemoveDeadNode(Node);
return;
}
}
CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
return;
}
case X86ISD::SMUL:
// i16/i32/i64 are handled with isel patterns.
if (NVT != MVT::i8)
break;
LLVM_FALLTHROUGH;
case X86ISD::UMUL: {
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
unsigned LoReg, ROpc, MOpc;
switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
case MVT::i8:
// Only the i8 case can be reached for SMUL (see the check above), so this
// is the only width that has to pick between IMUL and MUL.
LoReg = X86::AL;
ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
break;
case MVT::i16:
LoReg = X86::AX;
ROpc = X86::MUL16r;
MOpc = X86::MUL16m;
break;
case MVT::i32:
LoReg = X86::EAX;
ROpc = X86::MUL32r;
MOpc = X86::MUL32m;
break;
case MVT::i64:
LoReg = X86::RAX;
ROpc = X86::MUL64r;
MOpc = X86::MUL64m;
break;
}
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
// Multiply is commutative.
if (!FoldedLoad) {
FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
if (FoldedLoad)
std::swap(N0, N1);
}
// After the optional swap, N1 is the folded load (if any) and N0 is the
// operand copied into the implicit low register.
SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
N0, SDValue()).getValue(1);
MachineSDNode *CNode;
if (FoldedLoad) {
// i16/i32/i64 use an instruction that produces a low and high result even
// though only the low result is used.
SDVTList VTs;
if (NVT == MVT::i8)
VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
else
VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
InFlag };
CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
// Update the chain.
ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
// Record the mem-refs
CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
} else {
// i16/i32/i64 use an instruction that produces a low and high result even
// though only the low result is used.
SDVTList VTs;
if (NVT == MVT::i8)
VTs = CurDAG->getVTList(NVT, MVT::i32);
else
VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InFlag});
}
// Result 1 of the original node maps to the i32 result of the new node,
// whose index depends on whether a second NVT result was added above.
ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
CurDAG->RemoveDeadNode(Node);
return;
}
case ISD::SMUL_LOHI:
case ISD::UMUL_LOHI: {
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
unsigned Opc, MOpc;
bool isSigned = Opcode == ISD::SMUL_LOHI;
if (!isSigned) {
switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break;
case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break;
}
} else {
switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
}
}
// SrcReg receives one multiplicand; LoReg/HiReg are where the two halves
// of the product are read back from below.
unsigned SrcReg, LoReg, HiReg;
switch (Opc) {
default: llvm_unreachable("Unknown MUL opcode!");
case X86::IMUL32r:
case X86::MUL32r:
SrcReg = LoReg = X86::EAX; HiReg = X86::EDX;
break;
case X86::IMUL64r:
case X86::MUL64r:
SrcReg = LoReg = X86::RAX; HiReg = X86::RDX;
break;
}
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
// Multiply is commutative.
if (!foldedLoad) {
foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
if (foldedLoad)
std::swap(N0, N1);
}
SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg,
N0, SDValue()).getValue(1);
if (foldedLoad) {
SDValue Chain;
MachineSDNode *CNode = nullptr;
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
InFlag };
SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
Chain = SDValue(CNode, 0);
InFlag = SDValue(CNode, 1);
// Update the chain.
ReplaceUses(N1.getValue(1), Chain);
// Record the mem-refs
CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
} else {
SDValue Ops[] = { N1, InFlag };
SDVTList VTs = CurDAG->getVTList(MVT::Glue);
SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
InFlag = SDValue(CNode, 0);
}
// Copy the low half of the result, if it is needed.
if (!SDValue(Node, 0).use_empty()) {
assert(LoReg && "Register for low half is not defined!");
SDValue ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
NVT, InFlag);
InFlag = ResLo.getValue(2);
ReplaceUses(SDValue(Node, 0), ResLo);
LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
dbgs() << '\n');
}
// Copy the high half of the result, if it is needed.
if (!SDValue(Node, 1).use_empty()) {
assert(HiReg && "Register for high half is not defined!");
SDValue ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
NVT, InFlag);
InFlag = ResHi.getValue(2);
ReplaceUses(SDValue(Node, 1), ResHi);
LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
dbgs() << '\n');
}
CurDAG->RemoveDeadNode(Node);
return;
}
case ISD::SDIVREM:
case ISD::UDIVREM: {
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
unsigned Opc, MOpc;
bool isSigned = Opcode == ISD::SDIVREM;
if (!isSigned) {
switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break;
case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
}
} else {
switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
}
}
// LoReg holds the dividend's low part and later the quotient; HiReg is
// where the remainder is read from; ClrReg is the register zeroed when the
// dividend is zero-extended; SExtOpcode sign-extends the low part instead.
unsigned LoReg, HiReg, ClrReg;
unsigned SExtOpcode;
switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
case MVT::i8:
LoReg = X86::AL; ClrReg = HiReg = X86::AH;
SExtOpcode = X86::CBW;
break;
case MVT::i16:
LoReg = X86::AX; HiReg = X86::DX;
ClrReg = X86::DX;
SExtOpcode = X86::CWD;
break;
case MVT::i32:
LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
SExtOpcode = X86::CDQ;
break;
case MVT::i64:
LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
SExtOpcode = X86::CQO;
break;
}
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
bool signBitIsZero = CurDAG->SignBitIsZero(N0);
SDValue InFlag;
if (NVT == MVT::i8 && (!isSigned || signBitIsZero)) {
// Special case for div8, just use a move with zero extension to AX to
// clear the upper 8 bits (AH).
// Note: these temporaries shadow the outer Tmp0-Tmp4, which hold the
// folded address of N1 when foldedLoad is true and are used further below.
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
MachineSDNode *Move;
if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
Move = CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32,
MVT::Other, Ops);
Chain = SDValue(Move, 1);
ReplaceUses(N0.getValue(1), Chain);
// Record the mem-refs
CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
} else {
Move = CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0);
Chain = CurDAG->getEntryNode();
}
Chain = CurDAG->getCopyToReg(Chain, dl, X86::EAX, SDValue(Move, 0),
SDValue());
InFlag = Chain.getValue(1);
} else {
InFlag =
CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
LoReg, N0, SDValue()).getValue(1);
if (isSigned && !signBitIsZero) {
// Sign extend the low part into the high part.
InFlag =
SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
} else {
// Zero out the high part, effectively zero extending the input.
SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);
// Adjust the zero produced by MOV32r0 to the width of ClrReg.
switch (NVT.SimpleTy) {
case MVT::i16:
ClrNode =
SDValue(CurDAG->getMachineNode(
TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
CurDAG->getTargetConstant(X86::sub_16bit, dl,
MVT::i32)),
0);
break;
case MVT::i32:
break;
case MVT::i64:
ClrNode =
SDValue(CurDAG->getMachineNode(
TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
CurDAG->getTargetConstant(X86::sub_32bit, dl,
MVT::i32)),
0);
break;
default:
llvm_unreachable("Unexpected division source");
}
InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
ClrNode, InFlag).getValue(1);
}
}
if (foldedLoad) {
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
InFlag };
MachineSDNode *CNode =
CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
InFlag = SDValue(CNode, 1);
// Update the chain.
ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
// Record the mem-refs
CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
} else {
InFlag =
SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0);
}
// Prevent use of AH in a REX instruction by explicitly copying it to
// an ABCD_L register.
//
// The current assumption of the register allocator is that isel
// won't generate explicit references to the GR8_ABCD_H registers. If
// the allocator and/or the backend get enhanced to be more robust in
// that regard, this can be, and should be, removed.
if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
unsigned AHExtOpcode =
isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
MVT::Glue, AHCopy, InFlag);
SDValue Result(RNode, 0);
InFlag = SDValue(RNode, 1);
Result =
CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
// Once result 1 has been replaced here it has no uses left, so the
// generic remainder copy below is skipped for this case.
ReplaceUses(SDValue(Node, 1), Result);
LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
dbgs() << '\n');
}
// Copy the division (low) result, if it is needed.
if (!SDValue(Node, 0).use_empty()) {
SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
LoReg, NVT, InFlag);
InFlag = Result.getValue(2);
ReplaceUses(SDValue(Node, 0), Result);
LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
dbgs() << '\n');
}
// Copy the remainder (high) result, if it is needed.
if (!SDValue(Node, 1).use_empty()) {
SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
HiReg, NVT, InFlag);
InFlag = Result.getValue(2);
ReplaceUses(SDValue(Node, 1), Result);
LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
dbgs() << '\n');
}
CurDAG->RemoveDeadNode(Node);
return;
}
case X86ISD::CMP: {
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
// Optimizations for TEST compares.
if (!isNullConstant(N1))
break;
// Save the original VT of the compare.
MVT CmpVT = N0.getSimpleValueType();
// If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
// by a test instruction. The test should be removed later by
// analyzeCompare if we are using only the zero flag.
// TODO: Should we check the users and use the BEXTR flags directly?
if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
: X86::TEST32rr;
SDValue BEXTR = SDValue(NewNode, 0);
NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
CurDAG->RemoveDeadNode(Node);
return;
}
}
// We can peek through truncates, but we need to be careful below.
if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
N0 = N0.getOperand(0);
// Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
// use a smaller encoding.
// Look past the truncate if CMP is the only use of it.
if (N0.getOpcode() == ISD::AND &&
N0.getNode()->hasOneUse() &&
N0.getValueType() != MVT::i8) {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
if (!C) break;
uint64_t Mask = C->getZExtValue();
// Check if we can replace AND+IMM64 with a shift. This is possible for
// masks like 0xFF000000 or 0x00FFFFFF and if we care only about the zero
// flag.
if (CmpVT == MVT::i64 && !isInt<32>(Mask) &&
onlyUsesZeroFlag(SDValue(Node, 0))) {
// Inverted mask is a low-bit mask: shift the masked-off low bits out to
// the right and test what remains.
if (isMask_64(~Mask)) {
unsigned TrailingZeros = countTrailingZeros(Mask);
SDValue Imm = CurDAG->getTargetConstant(TrailingZeros, dl, MVT::i64);
SDValue Shift =
SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64, MVT::i32,
N0.getOperand(0), Imm), 0);
MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
MVT::i32, Shift, Shift);
ReplaceNode(Node, Test);
return;
}
// Mask itself is a low-bit mask: shift the masked-off high bits out to
// the left and test what remains.
if (isMask_64(Mask)) {
unsigned LeadingZeros = countLeadingZeros(Mask);
SDValue Imm = CurDAG->getTargetConstant(LeadingZeros, dl, MVT::i64);
SDValue Shift =
SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64, MVT::i32,
N0.getOperand(0), Imm), 0);
MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
MVT::i32, Shift, Shift);
ReplaceNode(Node, Test);
return;
}
}
MVT VT;
int SubRegOp;
unsigned ROpc, MOpc;
// For each of these checks we need to be careful if the sign flag is
// being used. It is only safe to use the sign flag in two conditions,
// either the sign bit in the shrunken mask is zero or the final test
// size is equal to the original compare size.
if (isUInt<8>(Mask) &&
(!(Mask & 0x80) || CmpVT == MVT::i8 ||
hasNoSignFlagUses(SDValue(Node, 0)))) {
// For example, convert "testl %eax, $8" to "testb %al, $8"
VT = MVT::i8;
SubRegOp = X86::sub_8bit;
ROpc = X86::TEST8ri;
MOpc = X86::TEST8mi;
} else if (OptForMinSize && isUInt<16>(Mask) &&
(!(Mask & 0x8000) || CmpVT == MVT::i16 ||
hasNoSignFlagUses(SDValue(Node, 0)))) {
// For example, "testl %eax, $32776" to "testw %ax, $32776".
// NOTE: We only want to form TESTW instructions if optimizing for
// min size. Otherwise we only save one byte and possibly get a length
// changing prefix penalty in the decoders.
VT = MVT::i16;
SubRegOp = X86::sub_16bit;
ROpc = X86::TEST16ri;
MOpc = X86::TEST16mi;
} else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
((!(Mask & 0x80000000) &&
// Without minsize 16-bit Cmps can get here so we need to
// be sure we calculate the correct sign flag if needed.
(CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
CmpVT == MVT::i32 ||
hasNoSignFlagUses(SDValue(Node, 0)))) {
// For example, "testq %rax, $268468232" to "testl %eax, $268468232".
// NOTE: We only want to run that transform if N0 is 32 or 64 bits.
// Otherwise, we find ourselves in a position where we have to do
// promotion. If previous passes did not promote the and, we assume
// they had a good reason not to and do not promote here.
VT = MVT::i32;
SubRegOp = X86::sub_32bit;
ROpc = X86::TEST32ri;
MOpc = X86::TEST32mi;
} else {
// No eligible transformation was found.
break;
}
SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
SDValue Reg = N0.getOperand(0);
// Emit a testb, testw or testl, depending on the width chosen above.
MachineSDNode *NewNode;
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
Reg.getOperand(0) };
NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
// Update the chain.
ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
// Record the mem-refs
CurDAG->setNodeMemRefs(NewNode,
{cast<LoadSDNode>(Reg)->getMemOperand()});
} else {
// Extract the subregister if necessary.
if (N0.getValueType() != VT)
Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
}
// Replace CMP with TEST.
ReplaceNode(Node, NewNode);
return;
}
break;
}
case X86ISD::PCMPISTR: {
if (!Subtarget->hasSSE42())
break;
bool NeedIndex = !SDValue(Node, 0).use_empty();
bool NeedMask = !SDValue(Node, 1).use_empty();
// We can't fold a load if we are going to make two instructions.
bool MayFoldLoad = !NeedIndex || !NeedMask;
MachineSDNode *CNode;
if (NeedMask) {
unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr;
unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm;
CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
}
// The index form is also emitted when neither result is used, so that
// CNode is always set before the flag result is rewired below.
if (NeedIndex || !NeedMask) {
unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr;
unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm;
CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
}
// Connect the flag usage to the last instruction created.
ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
CurDAG->RemoveDeadNode(Node);
return;
}
case X86ISD::PCMPESTR: {
if (!Subtarget->hasSSE42())
break;
// Copy the two implicit register inputs.
SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
Node->getOperand(1),
SDValue()).getValue(1);
InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
Node->getOperand(3), InFlag).getValue(1);
bool NeedIndex = !SDValue(Node, 0).use_empty();
bool NeedMask = !SDValue(Node, 1).use_empty();
// We can't fold a load if we are going to make two instructions.
bool MayFoldLoad = !NeedIndex || !NeedMask;
MachineSDNode *CNode;
// InFlag is passed by the caller's variable to emitPCMPESTR for both forms.
// NOTE(review): presumably emitPCMPESTR updates it so the second
// instruction is glued after the first - confirm against its definition.
if (NeedMask) {
unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr;
unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm;
CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node,
InFlag);
ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
}
// As for PCMPISTR, the index form is the fallback when no result is used.
if (NeedIndex || !NeedMask) {
unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr;
unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm;
CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag);
ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
}
// Connect the flag usage to the last instruction created.
ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
CurDAG->RemoveDeadNode(Node);
return;
}
case ISD::SETCC: {
// Unmasked form: the setcc is its own root and there is no mask operand.
if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
return;
break;
}
case ISD::STORE:
if (foldLoadStoreIntoMemOperand(Node))
return;
break;
case ISD::FCEIL:
case ISD::FFLOOR:
case ISD::FTRUNC:
case ISD::FNEARBYINT:
case ISD::FRINT: {
// Replace fp rounding with their X86 specific equivalent so we don't
// need 2 sets of patterns.
// FIXME: This can only happen when the nodes started as STRICT_* and have
// been mutated into their non-STRICT equivalents. Eventually this
// mutation will be removed and we should switch the STRICT_ nodes to a
// strict version of RNDSCALE in PreProcessISelDAG.
// Imm is the immediate operand handed to VRNDSCALE for each rounding op.
unsigned Imm;
switch (Node->getOpcode()) {
default: llvm_unreachable("Unexpected opcode!");
case ISD::FCEIL: Imm = 0xA; break;
case ISD::FFLOOR: Imm = 0x9; break;
case ISD::FTRUNC: Imm = 0xB; break;
case ISD::FNEARBYINT: Imm = 0xC; break;
case ISD::FRINT: Imm = 0x4; break;
}
// Shadows the function-level dl; both are constructed from Node.
SDLoc dl(Node);
SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl,
Node->getValueType(0),
Node->getOperand(0),
CurDAG->getConstant(Imm, dl, MVT::i8));
ReplaceNode(Node, Res.getNode());
SelectCode(Res.getNode());
return;
}
}
// None of the special cases above finished the selection; hand the node to
// SelectCode().
SelectCode(Node);
}
bool X86DAGToDAGISel::
SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
std::vector<SDValue> &OutOps) {
SDValue Op0, Op1, Op2, Op3, Op4;
switch (ConstraintID) {
default:
llvm_unreachable("Unexpected asm memory constraint");
case InlineAsm::Constraint_i:
// FIXME: It seems strange that 'i' is needed here since it's supposed to
// be an immediate and not a memory constraint.
LLVM_FALLTHROUGH;
case InlineAsm::Constraint_o: // offsetable ??
case InlineAsm::Constraint_v: // not offsetable ??
case InlineAsm::Constraint_m: // memory
case InlineAsm::Constraint_X:
if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
return true;
break;
}
OutOps.push_back(Op0);
OutOps.push_back(Op1);
OutOps.push_back(Op2);
OutOps.push_back(Op3);
OutOps.push_back(Op4);
return false;
}
/// Factory for the X86 instruction-selection pass, which converts a legalized
/// DAG into an X86-specific DAG that is ready for instruction scheduling.
/// The pass is heap-allocated and returned to the caller as a raw pointer.
FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
                                     CodeGenOpt::Level OptLevel) {
  FunctionPass *ISelPass = new X86DAGToDAGISel(TM, OptLevel);
  return ISelPass;
}
Index: vendor/llvm/dist-release_90/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- vendor/llvm/dist-release_90/lib/Target/X86/X86ISelLowering.cpp (revision 351302)
+++ vendor/llvm/dist-release_90/lib/Target/X86/X86ISelLowering.cpp (revision 351303)
@@ -1,45516 +1,45520 @@
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86IntrinsicsInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <bitset>
#include <cctype>
#include <numeric>
using namespace llvm;
// Debug-type tag for this translation unit.
#define DEBUG_TYPE "x86-isel"
// Statistic counter described as "Number of tail calls".
STATISTIC(NumTailCalls, "Number of tail calls");
// Hidden boolean flag, off by default
// (-x86-experimental-vector-widening-legalization). Per its help text it
// selects widening rather than promotion for vector type legalization.
static cl::opt<bool> ExperimentalVectorWideningLegalization(
"x86-experimental-vector-widening-legalization", cl::init(false),
cl::desc("Enable an experimental vector type legalization through widening "
"rather than promotion."),
cl::Hidden);
// Hidden integer flag, default 4 (-x86-experimental-pref-loop-alignment).
// Per its help text the value is a bit count: that many low bits of the loop
// header PC will be zero.
static cl::opt<int> ExperimentalPrefLoopAlignment(
"x86-experimental-pref-loop-alignment", cl::init(4),
cl::desc("Sets the preferable loop alignment for experiments "
"(the last x86-experimental-pref-loop-alignment bits"
" of the loop header PC will be 0)."),
cl::Hidden);
// Hidden boolean flag, on by default (-mul-constant-optimization). Per its
// help text it enables replacing a multiply by a constant with SHIFT, LEA,
// etc.
static cl::opt<bool> MulConstantOptimization(
"mul-constant-optimization", cl::init(true),
cl::desc("Replace 'mul x, Const' with more effective instructions like "
"SHIFT, LEA, etc."),
cl::Hidden);
/// Report an "unsupported" diagnostic for something the user asked for that
/// cannot be done, such as returning a double on x86_64 with SSE2 disabled.
/// Unlike report_fatal_error this is not fatal, so the calling code should
/// try to recover and continue without crashing.
///
/// \param DAG supplies the current machine function and the context that
///            receives the diagnostic.
/// \param dl  provides the debug location attached to the diagnostic.
/// \param Msg text of the diagnostic.
static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
                             const char *Msg) {
  const auto &Fn = DAG.getMachineFunction().getFunction();
  auto *Ctx = DAG.getContext();
  // Construct the diagnostic in place as the argument to diagnose().
  Ctx->diagnose(DiagnosticInfoUnsupported(Fn, Msg, dl.getDebugLoc()));
}
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
const X86Subtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
X86ScalarSSEf64 = Subtarget.hasSSE2();
X86ScalarSSEf32 = Subtarget.hasSSE1();
MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
// Set up the TargetLowering object.
// X86 is weird. It always uses i8 for shift amounts and setcc results.
setBooleanContents(ZeroOrOneBooleanContent);
// X86-SSE is even stranger. It uses -1 or 0 for vector masks.
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
// For 64-bit, since we have so many registers, use the ILP scheduler.
// For 32-bit, use the register pressure specific scheduling.
// For Atom, always use ILP scheduling.
if (Subtarget.isAtom())
setSchedulingPreference(Sched::ILP);
else if (Subtarget.is64Bit())
setSchedulingPreference(Sched::ILP);
else
setSchedulingPreference(Sched::RegPressure);
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
// Bypass expensive divides and use cheaper ones.
if (TM.getOptLevel() >= CodeGenOpt::Default) {
if (Subtarget.hasSlowDivide32())
addBypassSlowDiv(32, 8);
if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
addBypassSlowDiv(64, 32);
}
if (Subtarget.isTargetWindowsMSVC() ||
Subtarget.isTargetWindowsItanium()) {
// Setup Windows compiler runtime calls.
setLibcallName(RTLIB::SDIV_I64, "_alldiv");
setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
setLibcallName(RTLIB::SREM_I64, "_allrem");
setLibcallName(RTLIB::UREM_I64, "_aullrem");
setLibcallName(RTLIB::MUL_I64, "_allmul");
setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
}
if (Subtarget.isTargetDarwin()) {
// Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
setUseUnderscoreSetJmp(false);
setUseUnderscoreLongJmp(false);
} else if (Subtarget.isTargetWindowsGNU()) {
// MS runtime is weird: it exports _setjmp, but longjmp!
setUseUnderscoreSetJmp(true);
setUseUnderscoreLongJmp(false);
} else {
setUseUnderscoreSetJmp(true);
setUseUnderscoreLongJmp(true);
}
// If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size
// to 32 bits so the AtomicExpandPass will expand it so we don't need
// cmpxchg8b.
// FIXME: Should we be limiting the atomic size on other configs? Default is
// 1024.
if (!Subtarget.hasCmpxchg8b())
setMaxAtomicSizeInBitsSupported(32);
// Set up the register classes.
addRegisterClass(MVT::i8, &X86::GR8RegClass);
addRegisterClass(MVT::i16, &X86::GR16RegClass);
addRegisterClass(MVT::i32, &X86::GR32RegClass);
if (Subtarget.is64Bit())
addRegisterClass(MVT::i64, &X86::GR64RegClass);
for (MVT VT : MVT::integer_valuetypes())
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
// We don't accept any truncstore of integer registers.
setTruncStoreAction(MVT::i64, MVT::i32, Expand);
setTruncStoreAction(MVT::i64, MVT::i16, Expand);
setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
setTruncStoreAction(MVT::i32, MVT::i16, Expand);
setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
setTruncStoreAction(MVT::i16, MVT::i8, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
// SETOEQ and SETUNE require checking two conditions.
setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
// Integer absolute.
if (Subtarget.hasCMov()) {
setOperationAction(ISD::ABS , MVT::i16 , Custom);
setOperationAction(ISD::ABS , MVT::i32 , Custom);
}
setOperationAction(ISD::ABS , MVT::i64 , Custom);
// Funnel shifts.
for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
setOperationAction(ShiftOp , MVT::i16 , Custom);
setOperationAction(ShiftOp , MVT::i32 , Custom);
if (Subtarget.is64Bit())
setOperationAction(ShiftOp , MVT::i64 , Custom);
}
// Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
// operation.
setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
if (Subtarget.is64Bit()) {
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
// f32/f64 are legal, f80 is custom.
setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
else
setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
} else if (!Subtarget.useSoftFloat()) {
// We have an algorithm for SSE2->double, and we turn this into a
// 64-bit FILD followed by conditional FADD for other targets.
setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
// We have an algorithm for SSE2, and we turn this into a 64-bit
// FILD or VCVTUSI2SS/SD for other targets.
setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
} else {
setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Expand);
}
// Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
// this operation.
setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
if (!Subtarget.useSoftFloat()) {
// SSE has no i16 to fp conversion, only i32.
if (X86ScalarSSEf32) {
setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
// f32 and f64 cases are Legal, f80 case is not
setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
} else {
setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
}
} else {
setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Expand);
}
// Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
// this operation.
setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);
if (!Subtarget.useSoftFloat()) {
// In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
// are Legal, f80 is custom lowered.
setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
} else {
setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand);
setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand);
}
// Handle FP_TO_UINT by promoting the destination to a larger signed
// conversion.
setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
if (Subtarget.is64Bit()) {
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
// FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
} else {
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
}
} else if (!Subtarget.useSoftFloat()) {
// Since AVX is a superset of SSE3, only check for SSE here.
if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
// Expand FP_TO_UINT into a select.
// FIXME: We would like to use a Custom expander here eventually to do
// the optimal thing for SSE vs. the default expansion in the legalizer.
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
else
// With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
// With SSE3 we can use fisttpll to convert to a signed i64; without
// SSE, we're stuck with a fistpll.
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
}
// TODO: when we have SSE, these could be more efficient, by using movd/movq.
if (!X86ScalarSSEf64) {
setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
if (Subtarget.is64Bit()) {
setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
// Without SSE, i64->f64 goes through memory.
setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
}
} else if (!Subtarget.is64Bit())
setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
// Scalar integer divide and remainder are lowered to use operations that
// produce two results, to match the available instructions. This exposes
// the two-result form to trivial CSE, which is able to combine x/y and x%y
// into a single instruction.
//
// Scalar integer multiply-high is also lowered to use two-result
// operations, to match the available instructions. However, plain multiply
// (low) operations are left as Legal, as there are single-result
// instructions for this in x86. Using the two-result multiply instructions
// when both high and low results are needed must be arranged by dagcombine.
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
}
setOperationAction(ISD::BR_JT , MVT::Other, Expand);
setOperationAction(ISD::BRCOND , MVT::Other, Custom);
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
setOperationAction(ISD::BR_CC, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
}
if (Subtarget.is64Bit())
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
setOperationAction(ISD::FREM , MVT::f32 , Expand);
setOperationAction(ISD::FREM , MVT::f64 , Expand);
setOperationAction(ISD::FREM , MVT::f80 , Expand);
setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
// Promote the i8 variants and force them on up to i32 which has a shorter
// encoding.
setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
if (!Subtarget.hasBMI()) {
setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
if (Subtarget.is64Bit()) {
setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
}
}
if (Subtarget.hasLZCNT()) {
// When promoting the i8 variants, force them to i32 for a shorter
// encoding.
setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
} else {
setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
if (Subtarget.is64Bit()) {
setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
}
}
// Special handling for half-precision floating point conversions.
// If we don't have F16C support, then lower half float conversions
// into library calls.
if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
}
// There's never any support for operations beyond MVT::f32.
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f80, MVT::f16, Expand);
if (Subtarget.hasPOPCNT()) {
setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
} else {
setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
if (Subtarget.is64Bit())
setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
else
setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
}
setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
if (!Subtarget.hasMOVBE())
setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
// These should be promoted to a larger select which is supported.
setOperationAction(ISD::SELECT , MVT::i1 , Promote);
// X86 wants to expand cmov itself.
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
}
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
}
// Custom action for SELECT MMX and expand action for SELECT_CC MMX
setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
// NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
// LLVM/Clang supports zero-cost DWARF and SEH exception handling.
setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
// Darwin ABI issue.
for (auto VT : { MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
setOperationAction(ISD::ConstantPool , VT, Custom);
setOperationAction(ISD::JumpTable , VT, Custom);
setOperationAction(ISD::GlobalAddress , VT, Custom);
setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
setOperationAction(ISD::ExternalSymbol , VT, Custom);
setOperationAction(ISD::BlockAddress , VT, Custom);
}
// 64-bit shl, sra, srl (iff 32-bit x86)
for (auto VT : { MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
setOperationAction(ISD::SHL_PARTS, VT, Custom);
setOperationAction(ISD::SRA_PARTS, VT, Custom);
setOperationAction(ISD::SRL_PARTS, VT, Custom);
}
if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
// Expand certain atomics
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
}
if (!Subtarget.is64Bit())
setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
if (Subtarget.hasCmpxchg16b()) {
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
}
// FIXME - use subtarget debug flags
if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
!Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
}
setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
setOperationAction(ISD::TRAP, MVT::Other, Legal);
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
// VASTART needs to be custom lowered to use the VarArgsFrameIndex
setOperationAction(ISD::VASTART , MVT::Other, Custom);
setOperationAction(ISD::VAEND , MVT::Other, Expand);
bool Is64Bit = Subtarget.is64Bit();
setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
// GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
// f32 and f64 use SSE.
// Set up the FP register classes.
addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
: &X86::FR32RegClass);
addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
: &X86::FR64RegClass);
// Disable f32->f64 extload as we can only generate this in one instruction
// under optsize. So it's easier to pattern match (fpext (load)) for that
// case instead of needing to emit 2 instructions for extload in the
// non-optsize case.
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
for (auto VT : { MVT::f32, MVT::f64 }) {
// Use ANDPD to simulate FABS.
setOperationAction(ISD::FABS, VT, Custom);
// Use XORP to simulate FNEG.
setOperationAction(ISD::FNEG, VT, Custom);
// Use ANDPD and ORPD to simulate FCOPYSIGN.
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
// These might be better off as horizontal vector ops.
setOperationAction(ISD::FADD, VT, Custom);
setOperationAction(ISD::FSUB, VT, Custom);
// We don't support sin/cos/fmod
setOperationAction(ISD::FSIN , VT, Expand);
setOperationAction(ISD::FCOS , VT, Expand);
setOperationAction(ISD::FSINCOS, VT, Expand);
}
// Lower this to MOVMSK plus an AND.
setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
} else if (!useSoftFloat() && X86ScalarSSEf32 && (UseX87 || Is64Bit)) {
// Use SSE for f32, x87 for f64.
// Set up the FP register classes.
addRegisterClass(MVT::f32, &X86::FR32RegClass);
if (UseX87)
addRegisterClass(MVT::f64, &X86::RFP64RegClass);
// Use ANDPS to simulate FABS.
setOperationAction(ISD::FABS , MVT::f32, Custom);
// Use XORP to simulate FNEG.
setOperationAction(ISD::FNEG , MVT::f32, Custom);
if (UseX87)
setOperationAction(ISD::UNDEF, MVT::f64, Expand);
// Use ANDPS and ORPS to simulate FCOPYSIGN.
if (UseX87)
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
// We don't support sin/cos/fmod
setOperationAction(ISD::FSIN , MVT::f32, Expand);
setOperationAction(ISD::FCOS , MVT::f32, Expand);
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
if (UseX87) {
// Always expand sin/cos functions even though x87 has an instruction.
setOperationAction(ISD::FSIN, MVT::f64, Expand);
setOperationAction(ISD::FCOS, MVT::f64, Expand);
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
}
} else if (UseX87) {
// f32 and f64 in x87.
// Set up the FP register classes.
addRegisterClass(MVT::f64, &X86::RFP64RegClass);
addRegisterClass(MVT::f32, &X86::RFP32RegClass);
for (auto VT : { MVT::f32, MVT::f64 }) {
setOperationAction(ISD::UNDEF, VT, Expand);
setOperationAction(ISD::FCOPYSIGN, VT, Expand);
// Always expand sin/cos functions even though x87 has an instruction.
setOperationAction(ISD::FSIN , VT, Expand);
setOperationAction(ISD::FCOS , VT, Expand);
setOperationAction(ISD::FSINCOS, VT, Expand);
}
}
// Expand FP32 immediates into loads from the stack, save special cases.
if (isTypeLegal(MVT::f32)) {
if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
addLegalFPImmediate(APFloat(+0.0f)); // FLD0
addLegalFPImmediate(APFloat(+1.0f)); // FLD1
addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
} else // SSE immediates.
addLegalFPImmediate(APFloat(+0.0f)); // xorps
}
// Expand FP64 immediates into loads from the stack, save special cases.
if (isTypeLegal(MVT::f64)) {
if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
addLegalFPImmediate(APFloat(+0.0)); // FLD0
addLegalFPImmediate(APFloat(+1.0)); // FLD1
addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
} else // SSE immediates.
addLegalFPImmediate(APFloat(+0.0)); // xorpd
}
// We don't support FMA.
setOperationAction(ISD::FMA, MVT::f64, Expand);
setOperationAction(ISD::FMA, MVT::f32, Expand);
// Long double always uses X87, except f128 in MMX.
if (UseX87) {
if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
setOperationAction(ISD::FABS , MVT::f128, Custom);
setOperationAction(ISD::FNEG , MVT::f128, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
}
addRegisterClass(MVT::f80, &X86::RFP80RegClass);
setOperationAction(ISD::UNDEF, MVT::f80, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
{
APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
addLegalFPImmediate(TmpFlt); // FLD0
TmpFlt.changeSign();
addLegalFPImmediate(TmpFlt); // FLD0/FCHS
bool ignored;
APFloat TmpFlt2(+1.0);
TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
&ignored);
addLegalFPImmediate(TmpFlt2); // FLD1
TmpFlt2.changeSign();
addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
}
// Always expand sin/cos functions even though x87 has an instruction.
setOperationAction(ISD::FSIN , MVT::f80, Expand);
setOperationAction(ISD::FCOS , MVT::f80, Expand);
setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
setOperationAction(ISD::FCEIL, MVT::f80, Expand);
setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
setOperationAction(ISD::FRINT, MVT::f80, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
setOperationAction(ISD::FMA, MVT::f80, Expand);
setOperationAction(ISD::LROUND, MVT::f80, Expand);
setOperationAction(ISD::LLROUND, MVT::f80, Expand);
setOperationAction(ISD::LRINT, MVT::f80, Expand);
setOperationAction(ISD::LLRINT, MVT::f80, Expand);
}
// Always use a library call for pow.
setOperationAction(ISD::FPOW , MVT::f32 , Expand);
setOperationAction(ISD::FPOW , MVT::f64 , Expand);
setOperationAction(ISD::FPOW , MVT::f80 , Expand);
setOperationAction(ISD::FLOG, MVT::f80, Expand);
setOperationAction(ISD::FLOG2, MVT::f80, Expand);
setOperationAction(ISD::FLOG10, MVT::f80, Expand);
setOperationAction(ISD::FEXP, MVT::f80, Expand);
setOperationAction(ISD::FEXP2, MVT::f80, Expand);
setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
// Some FP actions are always expanded for vector types.
for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
setOperationAction(ISD::FSIN, VT, Expand);
setOperationAction(ISD::FSINCOS, VT, Expand);
setOperationAction(ISD::FCOS, VT, Expand);
setOperationAction(ISD::FREM, VT, Expand);
setOperationAction(ISD::FCOPYSIGN, VT, Expand);
setOperationAction(ISD::FPOW, VT, Expand);
setOperationAction(ISD::FLOG, VT, Expand);
setOperationAction(ISD::FLOG2, VT, Expand);
setOperationAction(ISD::FLOG10, VT, Expand);
setOperationAction(ISD::FEXP, VT, Expand);
setOperationAction(ISD::FEXP2, VT, Expand);
}
// First set operation action for all vector types to either promote
// (for widening) or expand (for scalarization). Then we will selectively
// turn on ones that can be effectively codegen'd.
for (MVT VT : MVT::vector_valuetypes()) {
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
setOperationAction(ISD::FMA, VT, Expand);
setOperationAction(ISD::FFLOOR, VT, Expand);
setOperationAction(ISD::FCEIL, VT, Expand);
setOperationAction(ISD::FTRUNC, VT, Expand);
setOperationAction(ISD::FRINT, VT, Expand);
setOperationAction(ISD::FNEARBYINT, VT, Expand);
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::UDIVREM, VT, Expand);
setOperationAction(ISD::CTPOP, VT, Expand);
setOperationAction(ISD::CTTZ, VT, Expand);
setOperationAction(ISD::CTLZ, VT, Expand);
setOperationAction(ISD::ROTL, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
setOperationAction(ISD::BSWAP, VT, Expand);
setOperationAction(ISD::SETCC, VT, Expand);
setOperationAction(ISD::FP_TO_UINT, VT, Expand);
setOperationAction(ISD::FP_TO_SINT, VT, Expand);
setOperationAction(ISD::UINT_TO_FP, VT, Expand);
setOperationAction(ISD::SINT_TO_FP, VT, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
setOperationAction(ISD::TRUNCATE, VT, Expand);
setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
setOperationAction(ISD::ANY_EXTEND, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
for (MVT InnerVT : MVT::vector_valuetypes()) {
setTruncStoreAction(InnerVT, VT, Expand);
setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
// N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
// types, we have to deal with them whether we ask for Expansion or not.
// Setting Expand causes its own optimisation problems though, so leave
// them legal.
if (VT.getVectorElementType() == MVT::i1)
setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
// EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
// split/scalarized right now.
if (VT.getVectorElementType() == MVT::f16)
setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
}
}
// FIXME: In order to prevent SSE instructions being expanded to MMX ones
// with -msoft-float, disable use of MMX as well.
if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
// No operations on x86mmx supported, everything uses intrinsics.
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
setOperationAction(ISD::FABS, MVT::v4f32, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
setOperationAction(ISD::STORE, MVT::v2f32, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
// FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
// registers cannot be used even for integer operations.
addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
setOperationAction(ISD::SDIV, VT, Custom);
setOperationAction(ISD::SREM, VT, Custom);
setOperationAction(ISD::UDIV, VT, Custom);
setOperationAction(ISD::UREM, VT, Custom);
}
setOperationAction(ISD::MUL, MVT::v2i8, Custom);
setOperationAction(ISD::MUL, MVT::v2i16, Custom);
setOperationAction(ISD::MUL, MVT::v2i32, Custom);
setOperationAction(ISD::MUL, MVT::v4i8, Custom);
setOperationAction(ISD::MUL, MVT::v4i16, Custom);
setOperationAction(ISD::MUL, MVT::v8i8, Custom);
setOperationAction(ISD::MUL, MVT::v16i8, Custom);
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
setOperationAction(ISD::MUL, MVT::v8i16, Legal);
setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
setOperationAction(ISD::FABS, MVT::v2f64, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
}
setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
if (!ExperimentalVectorWideningLegalization) {
// Use widening instead of promotion.
for (auto VT : { MVT::v8i8, MVT::v4i8, MVT::v2i8,
MVT::v4i16, MVT::v2i16 }) {
setOperationAction(ISD::UADDSAT, VT, Custom);
setOperationAction(ISD::SADDSAT, VT, Custom);
setOperationAction(ISD::USUBSAT, VT, Custom);
setOperationAction(ISD::SSUBSAT, VT, Custom);
}
}
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
// Provide custom widening for v2f32 setcc. This is really for VLX when
// setcc result type returns v2i1/v4i1 vector for v2f32/v4f32 leading to
// type legalization changing the result type to v4i1 during widening.
// It works fine for SSE2 and is probably faster so no need to qualify with
// VLX support.
setOperationAction(ISD::SETCC, MVT::v2i32, Custom);
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::ABS, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
setCondCodeAction(ISD::SETLT, VT, Custom);
setCondCodeAction(ISD::SETLE, VT, Custom);
}
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
}
// We support custom legalizing of sext and anyext loads for specific
// memory vector types which we can load as a scalar (or sequence of
// scalars) and extend in-register to a legal 128-bit vector type. For sext
// loads these must work with a single scalar load.
for (MVT VT : MVT::integer_vector_valuetypes()) {
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
}
for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
if (VT == MVT::v2i64 && !Subtarget.is64Bit())
continue;
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
}
// Custom lower selects for v2f64 and the 128-bit integer vector types.
setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i16, Custom);
// Custom legalize these to avoid over promotion or custom promotion.
setOperationAction(ISD::FP_TO_SINT, MVT::v2i8, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i8, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i8, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i16, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v2i8, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i8, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i8, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v2i16, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
// By marking FP_TO_SINT v8i16 as Custom, we trick type legalization into
// promoting v8i8 FP_TO_UINT into FP_TO_SINT. When the v8i16 FP_TO_SINT is
// split again based on the input type, this will cause an AssertSExt i16 to
// be emitted instead of an AssertZExt. This will allow packssdw followed by
// packuswb to be used to truncate to v8i8. This is necessary since packusdw
// isn't available until sse4.1.
setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
// Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
// We want to legalize this to an f64 load rather than an i64 load on
// 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
// store.
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
setOperationAction(ISD::STORE, MVT::v2i32, Custom);
setOperationAction(ISD::STORE, MVT::v4i16, Custom);
setOperationAction(ISD::STORE, MVT::v8i8, Custom);
setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
if (!Subtarget.hasAVX512())
setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
if (ExperimentalVectorWideningLegalization) {
setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
} else {
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
}
// In the customized shift lowering, the legal v4i32/v2i64 cases
// in AVX2 will be recognized.
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
}
setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
setOperationAction(ISD::ROTL, MVT::v8i16, Custom);
// With AVX512, expanding (and promoting the shifts) is better.
if (!Subtarget.hasAVX512())
setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
  // ABS is Legal for the byte, word and dword 128-bit integer vectors.
  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 })
    setOperationAction(ISD::ABS, VT, Legal);

  // Byte-vector bit reversal and every 128-bit CTLZ are custom lowered.
  setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 })
    setOperationAction(ISD::CTLZ, VT, Custom);

  // These might be better off as horizontal vector ops.
  for (auto Opc : { ISD::ADD, ISD::SUB })
    for (auto VT : { MVT::i16, MVT::i32 })
      setOperationAction(Opc, VT, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
  // Rounding operations are Legal for scalar and 128-bit floating-point types.
  for (MVT FPTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64})
    for (auto RoundOp : {ISD::FFLOOR, ISD::FCEIL, ISD::FTRUNC, ISD::FRINT,
                         ISD::FNEARBYINT})
      setOperationAction(RoundOp, FPTy, Legal);

  // Signed min/max become Legal for v16i8/v4i32, unsigned for v8i16/v4i32.
  for (auto VT : { MVT::v16i8, MVT::v4i32 }) {
    setOperationAction(ISD::SMAX, VT, Legal);
    setOperationAction(ISD::SMIN, VT, Legal);
  }
  for (auto VT : { MVT::v8i16, MVT::v4i32 }) {
    setOperationAction(ISD::UMAX, VT, Legal);
    setOperationAction(ISD::UMIN, VT, Legal);
  }

  // FIXME: Do we need to handle scalar-to-vector here?
  setOperationAction(ISD::MUL, MVT::v4i32, Legal);

  // We directly match byte blends in the backend as they match the VSELECT
  // condition form.
  setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);

  // SSE41 brings specific instructions for doing vector sign extend even in
  // cases where we don't have SRA.
  for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 })
    for (auto ExtOp : { ISD::SIGN_EXTEND_VECTOR_INREG,
                        ISD::ZERO_EXTEND_VECTOR_INREG })
      setOperationAction(ExtOp, VT, Legal);

  if (!ExperimentalVectorWideningLegalization) {
    // Avoid narrow result types when widening. The legal types are listed
    // in the next loop, which must run after this one so that its Legal
    // entries take precedence over the Custom entries set here.
    for (MVT VT : MVT::integer_vector_valuetypes())
      for (auto MemVT : { MVT::v2i8, MVT::v2i16, MVT::v2i32 })
        setLoadExtAction(ISD::SEXTLOAD, VT, MemVT, Custom);
  }

  // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
  for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
    setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
    setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
    // Only the v2i32 <- v2i8 case depends on the widening-legalization flag.
    if (!ExperimentalVectorWideningLegalization) {
      setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal);
    }
    setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
    setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
    setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
    setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
  }

  // i8 vectors are custom because the source register and source memory
  // operand types are not the same width.
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
  // XOP can efficiently perform BITREVERSE with VPPERM.
  for (auto ScalarVT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
    setOperationAction(ISD::BITREVERSE, ScalarVT, Custom);

  // Rotates and bit reversal are custom lowered for every 128-bit and
  // 256-bit integer vector type.
  for (auto VecVT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                      MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
    setOperationAction(ISD::ROTL, VecVT, Custom);
    setOperationAction(ISD::BITREVERSE, VecVT, Custom);
  }
}
// AVX: register the 256-bit vector value types and configure how operations
// on them are legalized. Skipped entirely when soft-float is in use.
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
// Queried once; used below to pick Legal vs. Custom for 256-bit integer ops
// (the comments below refer to this case as AVX2).
bool HasInt256 = Subtarget.hasInt256();
// Every 256-bit type uses VR256X when VLX is available and VR256 otherwise.
addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
// 256-bit FP vectors: rounding ops are Legal; FNEG/FABS/FCOPYSIGN are
// custom lowered.
for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
setOperationAction(ISD::FFLOOR, VT, Legal);
setOperationAction(ISD::FCEIL, VT, Legal);
setOperationAction(ISD::FTRUNC, VT, Legal);
setOperationAction(ISD::FRINT, VT, Legal);
setOperationAction(ISD::FNEARBYINT, VT, Legal);
setOperationAction(ISD::FNEG, VT, Custom);
setOperationAction(ISD::FABS, VT, Custom);
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
}
// (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
// even though v8i16 is a legal type.
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
// Without AVX512, v32i1 bitcasts are custom lowered.
if (!Subtarget.hasAVX512())
setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
// In the customized shift lowering, the legal v8i32/v4i64 cases
// in AVX2 will be recognized.
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
}
// These types need custom splitting if their input is a 128-bit vector.
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
// With BWI, expanding (and promoting the shifts) is better.
if (!Subtarget.hasBWI())
setOperationAction(ISD::ROTL, MVT::v32i8, Custom);
// SELECT on every 256-bit vector type is custom lowered.
setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
// Extensions producing v16i16/v8i32/v4i64 are custom lowered.
for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
setOperationAction(ISD::ANY_EXTEND, VT, Custom);
}
// Truncations producing 128-bit integer vectors are custom lowered.
setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTLZ, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
setCondCodeAction(ISD::SETLT, VT, Custom);
setCondCodeAction(ISD::SETLE, VT, Custom);
}
// With any FMA feature, FMA is Legal for scalar, 128-bit and 256-bit FP
// types.
if (Subtarget.hasAnyFMA()) {
for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
MVT::v2f64, MVT::v4f64 })
setOperationAction(ISD::FMA, VT, Legal);
}
// 256-bit integer ADD/SUB are Legal only with HasInt256, otherwise Custom.
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
}
// Multiplies: v4i64 and v32i8 are always Custom; v8i32/v16i16 are Legal
// with HasInt256.
setOperationAction(ISD::MUL, MVT::v4i64, Custom);
setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MUL, MVT::v32i8, Custom);
// High multiplies: only the v16i16 forms can be Legal (with HasInt256).
setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
// v4i64 ABS/min/max are always Custom here, regardless of HasInt256.
setOperationAction(ISD::ABS, MVT::v4i64, Custom);
setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
// Saturating add/sub on v32i8/v16i16 are Legal with HasInt256, else Custom.
setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
// ABS/min/max on the remaining 256-bit integer types follow HasInt256.
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
}
// In-register extends to 256-bit integer vectors are custom lowered.
for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
}
if (HasInt256) {
// The custom lowering for UINT_TO_FP for v8i32 becomes interesting
// when we have a 256bit-wide blend with immediate.
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
// AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
}
}
// Masked loads are Legal only with VLX (Custom otherwise); masked stores
// are always Legal for these 32/64-bit element types.
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::MSTORE, VT, Legal);
}
// Extract subvector is special because the value type
// (result) is 128-bit but the source is 256-bit wide.
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
MVT::v4f32, MVT::v2f64 }) {
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
}
// Custom lower several nodes for 256-bit types.
for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
MVT::v8f32, MVT::v4f64 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::STORE, VT, Custom);
}
// With HasInt256, v32i8 VSELECT is Legal. This must come after the loop
// above, which set it to Custom.
if (HasInt256)
setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
if (HasInt256) {
// Custom legalize 2x32 to get a little better code.
setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
setOperationAction(ISD::MGATHER, VT, Custom);
}
}
// This block controls legalization of the mask vector sizes that are
// available with AVX512. 512-bit vectors are in a separate block controlled
// by useAVX512Regs.
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
  // Mask register classes for the 1- to 16-element i1 vector types.
  addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
  addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
  addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
  addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
  addRegisterClass(MVT::v16i1, &X86::VK16RegClass);

  // The few operations configured for the single-element mask type.
  for (auto Opc : { ISD::SELECT, ISD::EXTRACT_VECTOR_ELT, ISD::BUILD_VECTOR })
    setOperationAction(Opc, MVT::v1i1, Custom);

  // FP-to-integer conversions producing masks: v8i1/v4i1 results are
  // promoted to 32-bit element vectors, v2i1 results are custom lowered.
  for (auto Opc : { ISD::FP_TO_SINT, ISD::FP_TO_UINT }) {
    setOperationPromotedToType(Opc, MVT::v8i1, MVT::v8i32);
    setOperationPromotedToType(Opc, MVT::v4i1, MVT::v4i32);
    setOperationAction(Opc, MVT::v2i1, Custom);
  }

  // There is no byte sized k-register load or store without AVX512DQ.
  if (!Subtarget.hasDQI()) {
    for (auto Opc : { ISD::LOAD, ISD::STORE })
      for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
        setOperationAction(Opc, VT, Custom);
  }

  // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 })
    for (auto ExtOp : { ISD::SIGN_EXTEND, ISD::ZERO_EXTEND, ISD::ANY_EXTEND })
      setOperationAction(ExtOp, VT, Custom);

  // Multi-element mask types: everything listed is Custom except VSELECT,
  // which is expanded.
  for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
    for (auto Opc : { ISD::ADD, ISD::SUB, ISD::MUL, ISD::SETCC, ISD::SELECT,
                      ISD::TRUNCATE, ISD::UADDSAT, ISD::SADDSAT,
                      ISD::USUBSAT, ISD::SSUBSAT, ISD::BUILD_VECTOR,
                      ISD::CONCAT_VECTORS, ISD::EXTRACT_VECTOR_ELT,
                      ISD::INSERT_SUBVECTOR, ISD::INSERT_VECTOR_ELT,
                      ISD::VECTOR_SHUFFLE })
      setOperationAction(Opc, VT, Custom);
    setOperationAction(ISD::VSELECT, VT, Expand);
  }

  // Subvector extraction yielding the narrower mask types.
  for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 }) {
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
  }
}
// This block controls legalization for 512-bit operations with 32/64 bit
// elements. 512-bits can be disabled based on prefer-vector-width and
// required-vector-width function attributes.
// 512-bit vectors with 32/64-bit elements: register the value types and
// configure their legalization. Skipped when soft-float is in use or when
// useAVX512Regs() is false.
if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
// All four 512-bit 32/64-bit element types share the VR512 register class.
addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
// Sign/zero extending loads into 512-bit integer vectors are Legal.
for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
}
// 512-bit FP vectors: FMA is Legal; FNEG/FABS/FCOPYSIGN are custom lowered.
for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::FNEG, VT, Custom);
setOperationAction(ISD::FABS, VT, Custom);
setOperationAction(ISD::FMA, VT, Legal);
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
}
// FP <-> 16 x i32 conversions are Legal; narrower 16-element integer results
// (i16, i8, i1) are promoted to v16i32.
setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i16, MVT::v16i32);
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i8, MVT::v16i32);
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i1, MVT::v16i32);
setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i1, MVT::v16i32);
setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i8, MVT::v16i32);
setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i16, MVT::v16i32);
setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
// Truncating stores from 512-bit integer vectors are Legal.
setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
// With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
// to 512-bit rather than use the AVX2 instructions so that we can use
// k-masks.
if (!Subtarget.hasVLX()) {
for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
setOperationAction(ISD::MLOAD, VT, Custom);
setOperationAction(ISD::MSTORE, VT, Custom);
}
}
// Truncations to 256-bit results and extensions to 512-bit results are
// custom lowered.
setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
if (ExperimentalVectorWideningLegalization) {
// Need to custom widen this if we don't have AVX512BW.
setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom);
}
// 512-bit FP vectors: rounding ops are Legal; SELECT is custom lowered.
for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::FFLOOR, VT, Legal);
setOperationAction(ISD::FCEIL, VT, Legal);
setOperationAction(ISD::FTRUNC, VT, Legal);
setOperationAction(ISD::FRINT, VT, Legal);
setOperationAction(ISD::FNEARBYINT, VT, Legal);
setOperationAction(ISD::SELECT, VT, Custom);
}
// Without BWI we need to use custom lowering to handle MVT::v64i8 input.
for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v64i8}) {
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
}
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
// v8i64 MUL is Custom here; the hasDQI() block below overrides it to Legal.
setOperationAction(ISD::MUL, MVT::v8i64, Custom);
setOperationAction(ISD::MUL, MVT::v16i32, Legal);
setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
// 512-bit integer vectors: min/max/ABS are Legal; shifts, rotates, CTPOP,
// SETCC and SELECT are custom lowered. CTPOP may be overridden to Legal in
// the hasVPOPCNTDQ() block below.
for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::SMIN, VT, Legal);
setOperationAction(ISD::UMIN, VT, Legal);
setOperationAction(ISD::ABS, VT, Legal);
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::ROTL, VT, Custom);
setOperationAction(ISD::ROTR, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
setCondCodeAction(ISD::SETLT, VT, Custom);
setCondCodeAction(ISD::SETLE, VT, Custom);
}
// With DQI, v8i64 conversions to/from FP and v8i64 MUL are Legal.
if (Subtarget.hasDQI()) {
setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
setOperationAction(ISD::MUL, MVT::v8i64, Legal);
}
if (Subtarget.hasCDI()) {
// NonVLX sub-targets extend 128/256 vectors to use the 512 version.
for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
setOperationAction(ISD::CTLZ, VT, Legal);
}
} // Subtarget.hasCDI()
// With VPOPCNTDQ, 512-bit CTPOP is Legal (overrides the Custom action above).
if (Subtarget.hasVPOPCNTDQ()) {
for (auto VT : { MVT::v16i32, MVT::v8i64 })
setOperationAction(ISD::CTPOP, VT, Legal);
}
// Extract subvector is special because the value type
// (result) is 256-bit but the source is 512-bit wide.
// 128-bit was made Legal under AVX1.
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
MVT::v8f32, MVT::v4f64 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
// Per-type settings shared by all four 512-bit 32/64-bit element types.
for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::MLOAD, VT, Legal);
setOperationAction(ISD::MSTORE, VT, Legal);
setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
}
// Need to custom split v32i16/v64i8 bitcasts.
if (!Subtarget.hasBWI()) {
setOperationAction(ISD::BITCAST, MVT::v32i16, Custom);
setOperationAction(ISD::BITCAST, MVT::v64i8, Custom);
}
// With VBMI2, funnel shifts on 512-bit integer vectors are custom lowered.
if (Subtarget.hasVBMI2()) {
for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
setOperationAction(ISD::FSHL, VT, Custom);
setOperationAction(ISD::FSHR, VT, Custom);
}
}
}// has AVX-512
// This block controls legalization for operations that don't have
// pre-AVX512 equivalents. Without VLX we use 512-bit operations for
// narrower widths.
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
  // These operations are handled on non-VLX by artificially widening in
  // isel patterns.
  // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
  for (auto VT : { MVT::v8i32, MVT::v4i32 })
    setOperationAction(ISD::FP_TO_UINT, VT, Legal);
  setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
  for (auto VT : { MVT::v8i32, MVT::v4i32 })
    setOperationAction(ISD::UINT_TO_FP, VT, Legal);

  // 64-bit element min/max and ABS are Legal for 128- and 256-bit vectors.
  for (auto VT : { MVT::v2i64, MVT::v4i64 })
    for (auto Opc : { ISD::SMAX, ISD::UMAX, ISD::SMIN, ISD::UMIN, ISD::ABS })
      setOperationAction(Opc, VT, Legal);

  // Rotates in both directions are custom lowered for 32/64-bit elements.
  for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
    for (auto Opc : { ISD::ROTL, ISD::ROTR })
      setOperationAction(Opc, VT, Custom);

  // Custom legalize 2x32 to get a little better code.
  for (auto VT : { MVT::v2f32, MVT::v2i32 })
    setOperationAction(ISD::MSCATTER, VT, Custom);
  // Scatters of the 128/256-bit 32/64-bit element types are custom as well.
  for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                   MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
    setOperationAction(ISD::MSCATTER, VT, Custom);
  }

  // With DQI, 64-bit element int<->FP conversions and MUL become Legal.
  if (Subtarget.hasDQI()) {
    for (auto VT : { MVT::v2i64, MVT::v4i64 })
      for (auto Opc : { ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT,
                        ISD::FP_TO_UINT, ISD::MUL })
        setOperationAction(Opc, VT, Legal);
  }

  // With CDI, CTLZ is Legal for 32/64-bit element vectors.
  if (Subtarget.hasCDI()) {
    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
      setOperationAction(ISD::CTLZ, VT, Legal);
  } // Subtarget.hasCDI()

  // With VPOPCNTDQ, CTPOP is Legal for 32/64-bit element vectors.
  if (Subtarget.hasVPOPCNTDQ()) {
    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
      setOperationAction(ISD::CTPOP, VT, Legal);
    }
  }
}
// This block controls legalization of v32i1/v64i1 which are available with
// AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
// useBWIRegs.
if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
  // Mask register classes for the 32- and 64-element i1 vector types.
  addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
  addRegisterClass(MVT::v64i1, &X86::VK64RegClass);

  // Per-mask-type settings: everything listed is Custom except VSELECT,
  // which is expanded.
  for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
    for (auto Opc : { ISD::ADD, ISD::SUB, ISD::MUL })
      setOperationAction(Opc, VT, Custom);
    setOperationAction(ISD::VSELECT, VT, Expand);
    for (auto Opc : { ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT, ISD::SSUBSAT,
                      ISD::TRUNCATE, ISD::SETCC, ISD::EXTRACT_VECTOR_ELT,
                      ISD::INSERT_VECTOR_ELT, ISD::SELECT, ISD::BUILD_VECTOR,
                      ISD::VECTOR_SHUFFLE })
      setOperationAction(Opc, VT, Custom);
  }

  // Concatenation and subvector insertion on the wide masks are Custom.
  for (auto Opc : { ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR })
    for (auto VT : { MVT::v32i1, MVT::v64i1 })
      setOperationAction(Opc, VT, Custom);

  // Subvector extraction yielding v16i1 or v32i1 is Custom.
  for (auto VT : { MVT::v16i1, MVT::v32i1 }) {
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
  }

  // Extends from v32i1 masks to 256-bit vectors.
  for (auto ExtOp : { ISD::SIGN_EXTEND, ISD::ZERO_EXTEND, ISD::ANY_EXTEND })
    setOperationAction(ExtOp, MVT::v32i8, Custom);
}
// This block controls legalization for v32i16 and v64i8. 512-bits can be
// disabled based on prefer-vector-width and required-vector-width function
// attributes.
if (!Subtarget.useSoftFloat() && Subtarget.useBWIRegs()) {
addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
// Extends from v64i1 masks to 512-bit vectors.
setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
setOperationAction(ISD::MUL, MVT::v32i16, Legal);
setOperationAction(ISD::MUL, MVT::v64i8, Custom);
setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::ABS, VT, Legal);
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::MLOAD, VT, Legal);
setOperationAction(ISD::MSTORE, VT, Legal);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTLZ, VT, Custom);
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::SMIN, VT, Legal);
setOperationAction(ISD::UMIN, VT, Legal);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::UADDSAT, VT, Legal);
setOperationAction(ISD::SADDSAT, VT, Legal);
setOperationAction(ISD::USUBSAT, VT, Legal);
setOperationAction(ISD::SSUBSAT, VT, Legal);
setOperationAction(ISD::SELECT, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
setCondCodeAction(ISD::SETLT, VT, Custom);
setCondCodeAction(ISD::SETLE, VT, Custom);
}
for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
}
if (Subtarget.hasBITALG()) {
for (auto VT : { MVT::v64i8, MVT::v32i16 })
setOperationAction(ISD::CTPOP, VT, Legal);
}
if (Subtarget.hasVBMI2()) {
setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
}
}
if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
}
// These operations are handled on non-VLX by artificially widening in
// isel patterns.
// TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
if (Subtarget.hasBITALG()) {
for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
setOperationAction(ISD::CTPOP, VT, Legal);
}
}
if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
if (Subtarget.hasDQI()) {
// Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
// v2f32 UINT_TO_FP is already custom under SSE2.
setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
"Unexpected operation action!");
// v2i64 FP_TO_S/UINT(v2f32) custom conversion.
setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
}
if (Subtarget.hasBWI()) {
setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
}
if (Subtarget.hasVBMI2()) {
// TODO: Make these legal even without VLX?
for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::FSHL, VT, Custom);
setOperationAction(ISD::FSHR, VT, Custom);
}
}
}
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
if (!Subtarget.is64Bit()) {
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
}
// Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
// handle type legalization for these operations here.
//
// FIXME: We really should do custom legalization for addition and
// subtraction on x86-32 once PR3203 is fixed. We really can't do much better
// than generic legalization for 64-bit multiplication-with-overflow, though.
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
// Add/Sub/Mul with overflow operations are custom lowered.
setOperationAction(ISD::SADDO, VT, Custom);
setOperationAction(ISD::UADDO, VT, Custom);
setOperationAction(ISD::SSUBO, VT, Custom);
setOperationAction(ISD::USUBO, VT, Custom);
setOperationAction(ISD::SMULO, VT, Custom);
setOperationAction(ISD::UMULO, VT, Custom);
// Support carry in as value rather than glue.
setOperationAction(ISD::ADDCARRY, VT, Custom);
setOperationAction(ISD::SUBCARRY, VT, Custom);
setOperationAction(ISD::SETCCCARRY, VT, Custom);
}
if (!Subtarget.is64Bit()) {
// These libcalls are not available in 32-bit.
setLibcallName(RTLIB::SHL_I128, nullptr);
setLibcallName(RTLIB::SRL_I128, nullptr);
setLibcallName(RTLIB::SRA_I128, nullptr);
setLibcallName(RTLIB::MUL_I128, nullptr);
}
// Combine sin / cos into _sincos_stret if it is available.
if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
}
if (Subtarget.isTargetWin64()) {
setOperationAction(ISD::SDIV, MVT::i128, Custom);
setOperationAction(ISD::UDIV, MVT::i128, Custom);
setOperationAction(ISD::SREM, MVT::i128, Custom);
setOperationAction(ISD::UREM, MVT::i128, Custom);
setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
}
// On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
// is. We should promote the value to 64-bits to solve this.
// This is what the CRT headers do - `fmodf` is an inline header
// function casting to f64 and calling `fmod`.
if (Subtarget.is32Bit() &&
(Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
for (ISD::NodeType Op :
{ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
ISD::FLOG10, ISD::FPOW, ISD::FSIN})
if (isOperationExpand(Op, MVT::f32))
setOperationAction(Op, MVT::f32, Promote);
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::CONCAT_VECTORS);
setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
setTargetDAGCombine(ISD::BITCAST);
setTargetDAGCombine(ISD::VSELECT);
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRA);
setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FNEG);
setTargetDAGCombine(ISD::FMA);
setTargetDAGCombine(ISD::FMINNUM);
setTargetDAGCombine(ISD::FMAXNUM);
setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::MLOAD);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::MSTORE);
setTargetDAGCombine(ISD::TRUNCATE);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::XOR);
setTargetDAGCombine(ISD::MSCATTER);
setTargetDAGCombine(ISD::MGATHER);
computeRegisterProperties(Subtarget.getRegisterInfo());
MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
MaxStoresPerMemsetOptSize = 8;
MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
MaxStoresPerMemcpyOptSize = 4;
MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
MaxStoresPerMemmoveOptSize = 4;
// TODO: These control memcmp expansion in CGP and could be raised higher, but
// that needs to be benchmarked and balanced with the potential use of vector
// load/store types (PR33329, PR33914).
MaxLoadsPerMemcmp = 2;
MaxLoadsPerMemcmpOptSize = 2;
// Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
// An out-of-order CPU can speculatively execute past a predictable branch,
// but a conditional move could be stalled by an expensive earlier operation.
PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
EnableExtLdPromotion = true;
setPrefFunctionAlignment(4); // 2^4 bytes.
verifyIntrinsicTables();
}
/// Returns true only for 64-bit MachO targets; so far this is the only
/// configuration for which it has been implemented.
bool X86TargetLowering::useLoadStackGuardNode() const {
  if (!Subtarget.isTargetMachO())
    return false;
  return Subtarget.is64Bit();
}
/// Returns true when the target triple reports an MSVC CRT. Currently those
/// are the only CRTs that XOR the frame pointer into the stack guard value.
bool X86TargetLowering::useStackGuardXorFP() const {
  const auto &TT = Subtarget.getTargetTriple();
  return TT.isOSMSVCRT();
}
/// Build a machine node that applies the XOR*_FP pseudo to \p Val and return
/// its first result. The 64-bit or 32-bit opcode is chosen from the subtarget
/// and the node is typed with the target's pointer type.
SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
                                               const SDLoc &DL) const {
  const EVT PointerVT = getPointerTy(DAG.getDataLayout());

  // Select the pseudo-instruction that matches the subtarget's bitness.
  unsigned Opcode = X86::XOR32_FP;
  if (Subtarget.is64Bit())
    Opcode = X86::XOR64_FP;

  return SDValue(DAG.getMachineNode(Opcode, DL, PointerVT, Val), 0);
}
/// Choose how an illegal vector type \p VT should be legalized.
///
/// v32i1 is split when AVX-512 is available without BWI. When the
/// experimental widening option is enabled, every vector with more than one
/// element and a non-i1 element type is widened. Everything else defers to
/// the TargetLoweringBase default.
TargetLoweringBase::LegalizeTypeAction
X86TargetLowering::getPreferredVectorAction(MVT VT) const {
  const bool IsV32i1WithoutBWI =
      VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI();
  if (IsV32i1WithoutBWI)
    return TypeSplitVector;

  if (ExperimentalVectorWideningLegalization) {
    const bool HasSeveralElts = VT.getVectorNumElements() != 1;
    if (HasSeveralElts && VT.getVectorElementType() != MVT::i1)
      return TypeWidenVector;
  }

  return TargetLoweringBase::getPreferredVectorAction(VT);
}
/// Register type used to pass \p VT under calling convention \p CC.
///
/// v32i1 on an AVX-512 subtarget without BWI is reported as v32i8; every
/// other case is delegated to the generic TargetLowering implementation.
MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                     CallingConv::ID CC,
                                                     EVT VT) const {
  const bool IsV32i1WithoutBWI =
      VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI();
  if (!IsV32i1WithoutBWI)
    return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
  return MVT::v32i8;
}
/// Number of registers used to pass \p VT under calling convention \p CC.
///
/// v32i1 on an AVX-512 subtarget without BWI is reported as one register;
/// every other case is delegated to the generic TargetLowering
/// implementation.
unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
                                                          CallingConv::ID CC,
                                                          EVT VT) const {
  const bool IsV32i1WithoutBWI =
      VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI();
  if (!IsV32i1WithoutBWI)
    return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
  return 1;
}
/// Pick the result type of a SETCC whose operands have type \p VT.
///
/// Non-vector compares yield i8. With AVX-512, a vector compare yields a vXi1
/// mask with the same element count as \p VT when \p VT legalizes to a
/// 512-bit vector, or (with VLX) to a narrower vector provided BWI is
/// available or the legal element type is at least 32 bits wide. All other
/// vector compares yield \p VT with its element type changed to integer.
EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
                                          LLVMContext& Context,
                                          EVT VT) const {
  if (!VT.isVector())
    return MVT::i8;

  if (!Subtarget.hasAVX512())
    return VT.changeVectorElementTypeToInteger();

  // Follow the type legalizer until it reports a legal type.
  EVT Legalized = VT;
  while (getTypeAction(Context, Legalized) != TypeLegal)
    Legalized = getTypeToTransformTo(Context, Legalized);
  const MVT LegalSimpleVT = Legalized.getSimpleVT();

  // A 512-bit legal type always gets a vXi1 compare.
  bool UseMaskResult = LegalSimpleVT.is512BitVector();
  if (!UseMaskResult && LegalSimpleVT.isVector() && Subtarget.hasVLX()) {
    // Below 512 bits, vXi32/vXi64 use a vXi1 compare; BWI adds vXi16/vXi8.
    const MVT EltVT = LegalSimpleVT.getVectorElementType();
    UseMaskResult = Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32;
  }

  if (UseMaskResult)
    return EVT::getVectorVT(Context, MVT::i1, VT.getVectorNumElements());
  return VT.changeVectorElementTypeToInteger();
}
/// Helper for getByValTypeAlignment: raise \p MaxAlign to 16 if \p Ty is, or
/// (through arrays and structs) contains, a 128-bit vector.
///
/// \p MaxAlign is an in/out value. It is never lowered, and the walk stops
/// as soon as it reaches 16.
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;

  if (auto *VecTy = dyn_cast<VectorType>(Ty)) {
    if (VecTy->getBitWidth() == 128)
      MaxAlign = 16;
    return;
  }

  if (auto *ArrTy = dyn_cast<ArrayType>(Ty)) {
    // An array is as aligned as its element type requires.
    unsigned ElemAlign = 0;
    getMaxByValAlign(ArrTy->getElementType(), ElemAlign);
    if (MaxAlign < ElemAlign)
      MaxAlign = ElemAlign;
    return;
  }

  auto *StructTy = dyn_cast<StructType>(Ty);
  if (!StructTy)
    return;

  // A struct takes the largest requirement among its members.
  for (auto *MemberTy : StructTy->elements()) {
    unsigned MemberAlign = 0;
    getMaxByValAlign(MemberTy, MemberAlign);
    if (MaxAlign < MemberAlign)
      MaxAlign = MemberAlign;
    if (MaxAlign == 16)
      return;
  }
}
/// Return the desired alignment for ByVal aggregate function arguments in
/// the caller parameter area.
///
/// On 64-bit subtargets this is the larger of 8 and the ABI alignment of
/// \p Ty. Otherwise aggregates are placed at 4-byte boundaries, except that
/// with SSE1 available those containing SSE vectors are placed at 16-byte
/// boundaries (see getMaxByValAlign).
unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
                                                  const DataLayout &DL) const {
  if (Subtarget.is64Bit()) {
    const unsigned ABIAlign = DL.getABITypeAlignment(Ty);
    return ABIAlign > 8 ? ABIAlign : 8u;
  }

  unsigned Result = 4;
  if (Subtarget.hasSSE1())
    getMaxByValAlign(Ty, Result);
  return Result;
}
/// Returns the target specific optimal type for load and store operations
/// produced when lowering memset, memcpy, and memmove.
///
/// A \p DstAlign of zero means the destination alignment can satisfy any
/// constraint. Similarly, a \p SrcAlign of zero means there is no need to
/// check it against an alignment requirement, probably because the source
/// does not need to be loaded. \p IsMemset is true when expanding a memset
/// and \p ZeroMemset when it is a memset of zero. \p MemcpyStrSrc indicates
/// that the memcpy source is constant, so it does not need to be loaded.
///
/// For vector types the overall size is checked against the preferred vector
/// width. The hook's contract allows returning EVT::Other to defer to generic
/// target-independent logic, but every path in this implementation returns a
/// concrete type (i32 or i64 at worst).
EVT X86TargetLowering::getOptimalMemOpType(
    uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
    bool ZeroMemset, bool MemcpyStrSrc,
    const AttributeList &FuncAttributes) const {
  const bool MayUseFPRegs =
      !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat);

  if (MayUseFPRegs) {
    const bool DstAligned16 = DstAlign == 0 || DstAlign >= 16;
    const bool SrcAligned16 = SrcAlign == 0 || SrcAlign >= 16;
    const bool Can16ByteAccess =
        !Subtarget.isUnalignedMem16Slow() || (DstAligned16 && SrcAligned16);
    // A memset of a non-zero value needs the byte splatted first.
    const bool IsNonZeroMemset = IsMemset && !ZeroMemset;

    if (Size >= 16 && Can16ByteAccess) {
      const unsigned PreferredWidth = Subtarget.getPreferVectorWidth();

      // FIXME: Check if unaligned 32-byte accesses are slow.
      // Although v32i8 isn't a well-supported type for AVX1, we'll let
      // legalization and shuffle lowering produce the optimal codegen. If we
      // choose an optimal type with a vector element larger than a byte,
      // getMemsetStores() may create an intermediate splat (using an integer
      // multiply) before we splat as a vector.
      if (Size >= 32 && Subtarget.hasAVX() && PreferredWidth >= 256)
        return MVT::v32i8;

      if (PreferredWidth >= 128) {
        if (Subtarget.hasSSE2())
          return MVT::v16i8;
        // TODO: Can SSE1 handle a byte vector?
        // If we have SSE1 registers we should be able to use them.
        if (Subtarget.hasSSE1() &&
            (Subtarget.is64Bit() || Subtarget.hasX87()))
          return MVT::v4f32;
      }
    } else if (!IsNonZeroMemset && !MemcpyStrSrc && Size >= 8 &&
               !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      // Also, do not use f64 to lower memset unless this is a memset of zeros.
      // The gymnastics of splatting a byte value into an XMM register and then
      // only using 8-byte stores (because this is a CPU with slow unaligned
      // 16-byte accesses) makes that a loser.
      return MVT::f64;
    }
  }

  // This is a compromise. If we reach here, unaligned accesses may be slow on
  // this target. However, creating smaller, aligned accesses could be even
  // slower and would certainly be a lot more code.
  const bool UseI64 = Subtarget.is64Bit() && Size >= 8;
  if (UseI64)
    return MVT::i64;
  return MVT::i32;
}
/// Report whether \p VT is safe to use for memory operations. f32 and f64
/// are safe only when X86ScalarSSEf32 / X86ScalarSSEf64 is set,
/// respectively; every other type is reported as safe.
bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
  switch (VT.SimpleTy) {
  case MVT::f32:
    return X86ScalarSSEf32;
  case MVT::f64:
    return X86ScalarSSEf64;
  default:
    return true;
  }
}
/// Report whether a misaligned access of type \p VT is allowed and,
/// optionally, whether it is fast.
///
/// If \p Fast is non-null it is always written: 128-bit and 256-bit accesses
/// follow the subtarget's "unaligned 16/32-byte access is slow" flags, and
/// every other width is reported as fast. The return value is false only for
/// non-temporal vector accesses: stores always, and loads when \p Align is at
/// least 16 and SSE4.1 is available.
bool X86TargetLowering::allowsMisalignedMemoryAccesses(
    EVT VT, unsigned, unsigned Align, MachineMemOperand::Flags Flags,
    bool *Fast) const {
  if (Fast) {
    const unsigned Bits = VT.getSizeInBits();
    if (Bits == 128)
      *Fast = !Subtarget.isUnalignedMem16Slow();
    else if (Bits == 256)
      *Fast = !Subtarget.isUnalignedMem32Slow();
    else
      // Every other width, 8-byte and under included, is assumed fast.
      // TODO: What about AVX-512 (512-bit) accesses?
      *Fast = true;
  }

  // Misaligned accesses of any size are always allowed, unless this is a
  // non-temporal vector memory op, which must be aligned.
  const bool IsNonTemporalVector =
      !!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector();
  if (!IsNonTemporalVector)
    return true;

  const bool IsLoad = !!(Flags & MachineMemOperand::MOLoad);
  if (!IsLoad)
    return false;

  // NT loads can only be vector aligned, so if it is less aligned than the
  // minimum vector size (which we can split the vector down to), we might as
  // well use a regular unaligned vector load.
  // We don't have any NT loads pre-SSE41.
  return Align < 16 || !Subtarget.hasSSE41();
}
/// Return the entry encoding for a jump table in the current function, as a
/// member of the MachineJumpTableInfo::JTEntryKind enum.
///
/// In GOT PIC mode each entry is emitted as a @GOTOFF symbol, which is
/// requested with EK_Custom32; otherwise the generic heuristics decide.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  if (!isPositionIndependent() || !Subtarget.isPICStyleGOT())
    return TargetLowering::getJumpTableEncoding();
  return MachineJumpTableInfo::EK_Custom32;
}
/// Forward the soft-float query to the subtarget.
bool X86TargetLowering::useSoftFloat() const {
  const bool SoftFloat = Subtarget.useSoftFloat();
  return SoftFloat;
}
/// Mark leading integer/pointer libcall arguments as passed in registers.
///
/// Only applies on 32-bit subtargets and to the C and X86_StdCall calling
/// conventions. The register budget is the module's number of register
/// parameters (0 if the function has no parent module). Arguments of up to 4
/// bytes consume one register and those of up to 8 bytes consume two.
/// Marking stops at the first argument that no longer fits.
void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
                                              ArgListTy &Args) const {
  // Only relabel X86-32 for C / Stdcall CCs.
  if (Subtarget.is64Bit())
    return;
  const bool IsCOrStdCall =
      CC == CallingConv::C || CC == CallingConv::X86_StdCall;
  if (!IsCOrStdCall)
    return;

  unsigned RegsLeft = 0;
  if (auto *ParentModule = MF->getFunction().getParent())
    RegsLeft = ParentModule->getNumberRegisterParameters();

  const auto &Layout = MF->getDataLayout();
  for (auto &Arg : Args) {
    Type *ArgTy = Arg.Ty;
    if (!ArgTy->isIntOrPtrTy())
      continue;

    const uint64_t AllocSize = Layout.getTypeAllocSize(ArgTy);
    if (AllocSize > 8)
      continue;

    const unsigned RegsNeeded = AllocSize > 4 ? 2 : 1;
    if (RegsLeft < RegsNeeded)
      return;
    RegsLeft -= RegsNeeded;
    Arg.IsInReg = true;
  }
}
/// Produce the expression for one custom jump table entry: a @GOTOFF
/// reference to the symbol of \p MBB. Must only be called when generating
/// position-independent code with the GOT PIC style.
const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid,MCContext &Ctx) const{
  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  const auto *BlockSym = MBB->getSymbol();
  return MCSymbolRefExpr::create(BlockSym, MCSymbolRefExpr::VK_GOTOFF, Ctx);
}
/// Returns the relocation base for the given PIC jump table: \p Table itself
/// on 64-bit subtargets, otherwise a GlobalBaseReg node of pointer type.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (Subtarget.is64Bit())
    return Table;

  // This doesn't have SDLoc associated with it, but is not really the
  // same as a Register.
  const auto PtrVT = getPointerTy(DAG.getDataLayout());
  return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
}
/// MCExpr counterpart of getPICJumpTableRelocBase: returns the relocation
/// base for the given PIC jump table as an expression.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                             MCContext &Ctx) const {
  // Unless RIP-relative addressing is in use, the reference is relative to
  // the PIC base.
  if (!Subtarget.isPICStyleRIPRel())
    return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);

  // X86-64 uses RIP relative addressing based on the jump table label.
  return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
}
std::pair<const TargetRegisterClass *, uint8_t>
X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
MVT VT) const {
const TargetRegisterClass *RRC = nullptr;
uint8_t Cost = 1;
switch (VT.SimpleTy) {
default:
return TargetLowering::findRepresentativeClass(TRI, VT);
case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
break;
case MVT::x86mmx:
RRC = &X86::VR64RegClass;
break;
case MVT::f32: case MVT::f64:
case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
case MVT::v4f32: case MVT::v2f64:
case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
case MVT::v8f32: case MVT::v4f64:
case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
case MVT::v16f32: case MVT::v8f64:
RRC = &X86::VR128XRegClass;
break;
}
return std::make_pair(RRC, Cost);
}
/// Address space used for the fixed TLS slots accessed in this file.
///
/// Returns 257 on 64-bit subtargets unless the Kernel code model is in use;
/// returns 256 for the 64-bit Kernel code model and for 32-bit subtargets.
unsigned X86TargetLowering::getAddressSpace() const {
  const bool Is64BitNonKernel =
      Subtarget.is64Bit() &&
      getTargetMachine().getCodeModel() != CodeModel::Kernel;
  return Is64BitNonKernel ? 257 : 256;
}
/// Returns true when \p TargetTriple names glibc, Fuchsia, or Android at
/// version 17 or later.
static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
  if (TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia())
    return true;
  if (!TargetTriple.isAndroid())
    return false;
  return !TargetTriple.isAndroidVersionLT(17);
}
/// Build a constant pointer from the i32 value \p Offset. The result has
/// type "pointer in \p AddressSpace to i8*".
static Constant* SegmentOffset(IRBuilder<> &IRB,
                               unsigned Offset, unsigned AddressSpace) {
  auto &Ctx = IRB.getContext();
  Constant *OffsetValue = ConstantInt::get(Type::getInt32Ty(Ctx), Offset);
  Type *SlotPtrTy = Type::getInt8PtrTy(Ctx)->getPointerTo(AddressSpace);
  return ConstantExpr::getIntToPtr(OffsetValue, SlotPtrTy);
}
/// Return the IR value that addresses the stack guard.
///
/// glibc, bionic, and Fuchsia have a special slot for the stack guard in
/// tcbhead_t; it is used instead of the usual global variable (see
/// sysdeps/{i386,x86_64}/nptl/tls.h). All other targets defer to the generic
/// TargetLowering implementation.
Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
  if (!hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
    return TargetLowering::getIRStackGuard(IRB);

  // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
  unsigned SlotOffset = 0x10;
  if (!Subtarget.isTargetFuchsia()) {
    // %fs:0x28, unless we're using a Kernel code model, in which case
    // it's %gs:0x28. gs:0x14 on i386.
    SlotOffset = Subtarget.is64Bit() ? 0x28 : 0x14;
  }
  return SegmentOffset(IRB, SlotOffset, getAddressSpace());
}
/// Insert the stack-protector declarations this target needs into \p M.
///
/// For the Windows MSVC and Windows Itanium environments this declares the
/// CRT's `__security_cookie` global and `__security_check_cookie` function.
/// Targets with a dedicated TLS stack guard slot get no declarations.
/// Everything else defers to the generic TargetLowering implementation.
void X86TargetLowering::insertSSPDeclarations(Module &M) const {
  const auto &TT = Subtarget.getTargetTriple();

  // MSVC CRT provides functionalities for stack protection.
  if (TT.isWindowsMSVCEnvironment() || TT.isWindowsItaniumEnvironment()) {
    auto &Ctx = M.getContext();

    // MSVC CRT has a global variable holding security cookie.
    M.getOrInsertGlobal("__security_cookie", Type::getInt8PtrTy(Ctx));

    // MSVC CRT has a function to validate security cookie.
    FunctionCallee CheckCookie = M.getOrInsertFunction(
        "__security_check_cookie", Type::getVoidTy(Ctx),
        Type::getInt8PtrTy(Ctx));
    // Attributes are only set when the callee is a plain Function.
    if (auto *CheckFn = dyn_cast<Function>(CheckCookie.getCallee())) {
      CheckFn->setCallingConv(CallingConv::X86_FastCall);
      CheckFn->addAttribute(1, Attribute::AttrKind::InReg);
    }
    return;
  }

  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
  if (!hasStackGuardSlotTLS(TT))
    TargetLowering::insertSSPDeclarations(M);
}
/// Return the stack guard value to use in SelectionDAG. For the Windows MSVC
/// and Windows Itanium environments this is the module's
/// `__security_cookie` global; otherwise the generic implementation decides.
Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
  const auto &TT = Subtarget.getTargetTriple();
  if (!TT.isWindowsMSVCEnvironment() && !TT.isWindowsItaniumEnvironment())
    return TargetLowering::getSDagStackGuard(M);

  // MSVC CRT has a global variable holding security cookie.
  return M.getGlobalVariable("__security_cookie");
}
/// Return the function that validates the stack guard. For the Windows MSVC
/// and Windows Itanium environments this is the module's
/// `__security_check_cookie`; otherwise the generic implementation decides.
Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
  const auto &TT = Subtarget.getTargetTriple();
  if (!TT.isWindowsMSVCEnvironment() && !TT.isWindowsItaniumEnvironment())
    return TargetLowering::getSSPStackGuardCheck(M);

  // MSVC CRT has a function to validate security cookie.
  return M.getFunction("__security_check_cookie");
}
/// Return the location of the SafeStack (unsafe stack) pointer.
///
/// Contiki uses the default location. Android and Fuchsia use a fixed TLS
/// slot addressed through a segment register. All other targets defer to the
/// generic TargetLowering implementation.
Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
  if (Subtarget.getTargetTriple().isOSContiki())
    return getDefaultSafeStackPointerLocation(IRB, false);

  const bool IsAndroid = Subtarget.isTargetAndroid();
  if (IsAndroid || Subtarget.isTargetFuchsia()) {
    // Fuchsia: <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
    unsigned SlotOffset = 0x18;
    if (IsAndroid) {
      // Android provides a fixed TLS slot for the SafeStack pointer. See the
      // definition of TLS_SLOT_SAFESTACK in
      // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
      // %fs:0x48, unless we're using a Kernel code model, in which case it's
      // %gs:0x48. %gs:0x24 on i386.
      SlotOffset = Subtarget.is64Bit() ? 0x48 : 0x24;
    }
    return SegmentOffset(IRB, SlotOffset, getAddressSpace());
  }

  return TargetLowering::getSafeStackPointerLocation(IRB);
}
/// A cast between two distinct address spaces is a no-op exactly when both
/// are numbered below 256.
bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
                                            unsigned DestAS) const {
  assert(SrcAS != DestAS && "Expected different address spaces!");
  const bool EitherAtOrAbove256 = SrcAS >= 256 || DestAS >= 256;
  return !EitherAtOrAbove256;
}
//===----------------------------------------------------------------------===//
// Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//
/// Returns the result of checking the outgoing return values \p Outs against
/// the RetCC_X86 return convention for the given calling convention.
bool X86TargetLowering::CanLowerReturn(
    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> ReturnLocs;
  CCState ReturnState(CallConv, isVarArg, MF, ReturnLocs, Context);
  const bool Lowerable = ReturnState.CheckReturn(Outs, RetCC_X86);
  return Lowerable;
}
/// Returns a pointer to a static list holding R11 followed by a 0 entry
/// (presumably the list terminator). The calling convention is ignored.
const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
  static const MCPhysReg ScratchList[] = {X86::R11, 0};
  return ScratchList;
}
/// Lowers a mask value (v*i1) to the register type \p ValLoc.
///
///  - v1i1: extract element 0 as \p ValLoc.
///  - v8i1 to i8/i32 and v16i1 to i16/i32: bitcast to i8/i16, then
///    any-extend when \p ValLoc is i32.
///  - v32i1 to i32 and v64i1 to i64: a single bitcast.
///  - anything else: any-extend \p ValArg to \p ValLoc.
///
/// \returns DAG node after lowering to register type
static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
                               const SDLoc &Dl, SelectionDAG &DAG) {
  const EVT MaskVT = ValArg.getValueType();

  if (MaskVT == MVT::v1i1)
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
                       DAG.getIntPtrConstant(0, Dl));

  const bool LocIsI32 = ValLoc == MVT::i32;
  const bool IsV8ToGPR =
      MaskVT == MVT::v8i1 && (ValLoc == MVT::i8 || LocIsI32);
  const bool IsV16ToGPR =
      MaskVT == MVT::v16i1 && (ValLoc == MVT::i16 || LocIsI32);

  if (IsV8ToGPR || IsV16ToGPR) {
    // Two stage lowering might be required
    // bitcast:   v8i1 -> i8 / v16i1 -> i16
    // anyextend: i8   -> i32 / i16   -> i32
    const EVT NarrowVT = IsV8ToGPR ? MVT::i8 : MVT::i16;
    SDValue Lowered = DAG.getBitcast(NarrowVT, ValArg);
    if (LocIsI32)
      Lowered = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, Lowered);
    return Lowered;
  }

  const bool IsV32ToI32 = MaskVT == MVT::v32i1 && LocIsI32;
  const bool IsV64ToI64 = MaskVT == MVT::v64i1 && ValLoc == MVT::i64;
  if (IsV32ToI32 || IsV64ToI64) {
    // One stage lowering is required
    // bitcast:   v32i1 -> i32 / v64i1 -> i64
    return DAG.getBitcast(ValLoc, ValArg);
  }

  return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
}
/// Breaks v64i1 value into two registers and adds the new node to the DAG
static void Passv64i1ArgInRegs(
const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
CCValAssign &NextVA, const X86Subtarget &Subtarget) {
assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
assert(Subtarget.is32Bit() && "Expecting 32 bit target");
assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
assert(VA.isRegLoc() && NextVA.isRegLoc() &&
"The value should reside in two registers");
// Before splitting the value we cast it to i64
Arg = DAG.getBitcast(MVT::i64, Arg);
// Splitting the value into two i32 types
SDValue Lo, Hi;
Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
DAG.getConstant(0, Dl, MVT::i32));
Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
DAG.getConstant(1, Dl, MVT::i32));
// Attach the two i32 types into corresponding registers
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
}
SDValue
X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &dl, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
// In some cases we need to disable registers from the default CSR list.
// For example, when they are used for argument passing.
bool ShouldDisableCalleeSavedRegister =
CallConv == CallingConv::X86_RegCall ||
MF.getFunction().hasFnAttribute("no_caller_saved_registers");
if (CallConv == CallingConv::X86_INTR && !Outs.empty())
report_fatal_error("X86 interrupts may not return any value");
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
CCInfo.AnalyzeReturn(Outs, RetCC_X86);
SDValue Flag;
SmallVector<SDValue, 6> RetOps;
RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
// Operand #1 = Bytes To Pop
RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
MVT::i32));
// Copy the result values into the output registers.
for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
++I, ++OutsIndex) {
CCValAssign &VA = RVLocs[I];
assert(VA.isRegLoc() && "Can only return in registers!");
// Add the register to the CalleeSaveDisableRegs list.
if (ShouldDisableCalleeSavedRegister)
MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
SDValue ValToCopy = OutVals[OutsIndex];
EVT ValVT = ValToCopy.getValueType();
// Promote values to the appropriate types.
if (VA.getLocInfo() == CCValAssign::SExt)
ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
else if (VA.getLocInfo() == CCValAssign::ZExt)
ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
else if (VA.getLocInfo() == CCValAssign::AExt) {
if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
else
ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
}
else if (VA.getLocInfo() == CCValAssign::BCvt)
ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
assert(VA.getLocInfo() != CCValAssign::FPExt &&
"Unexpected FP-extend for return value.");
// If this is x86-64, and we disabled SSE, we can't return FP values,
// or SSE or MMX vectors.
if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
(Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
} else if (ValVT == MVT::f64 &&
(Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
// Likewise we can't return F64 values with SSE1 only. gcc does so, but
// llvm-gcc has never done it right and no one has noticed, so this
// should be OK for now.
errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
}
// Returns in ST0/ST1 are handled specially: these are pushed as operands to
// the RET instruction and handled by the FP Stackifier.
if (VA.getLocReg() == X86::FP0 ||
VA.getLocReg() == X86::FP1) {
// If this is a copy from an xmm register to ST(0), use an FPExtend to
// change the value to the FP stack register class.
if (isScalarFPTypeInSSEReg(VA.getValVT()))
ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
RetOps.push_back(ValToCopy);
// Don't emit a copytoreg.
continue;
}
// 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
// which is returned in RAX / RDX.
if (Subtarget.is64Bit()) {
if (ValVT == MVT::x86mmx) {
if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
ValToCopy);
// If we don't have SSE2 available, convert to v4f32 so the generated
// register is legal.
if (!Subtarget.hasSSE2())
ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
}
}
}
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
if (VA.needsCustom()) {
assert(VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
Subtarget);
assert(2 == RegsToPass.size() &&
"Expecting two registers after Pass64BitArgInRegs");
// Add the second register to the CalleeSaveDisableRegs list.
if (ShouldDisableCalleeSavedRegister)
MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
} else {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
}
// Add nodes to the DAG and add the values into the RetOps list
for (auto &Reg : RegsToPass) {
Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
Flag = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
}
}
// Swift calling convention does not require we copy the sret argument
// into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
// All x86 ABIs require that for returning structs by value we copy
// the sret argument into %rax/%eax (depending on ABI) for the return.
// We saved the argument into a virtual register in the entry block,
// so now we copy the value out and into %rax/%eax.
//
// Checking Function.hasStructRetAttr() here is insufficient because the IR
// may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
// false, then an sret argument may be implicitly inserted in the SelDAG. In
// either case FuncInfo->setSRetReturnReg() will have been called.
if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
// When we have both sret and another return value, we should use the
// original Chain stored in RetOps[0], instead of the current Chain updated
// in the above loop. If we only have sret, RetOps[0] equals to Chain.
// For the case of sret and another return value, we have
// Chain_0 at the function entry
// Chain_1 = getCopyToReg(Chain_0) in the above loop
// If we use Chain_1 in getCopyFromReg, we will have
// Val = getCopyFromReg(Chain_1)
// Chain_2 = getCopyToReg(Chain_1, Val) from below
// getCopyToReg(Chain_0) will be glued together with
// getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
// in Unit B, and we will have cyclic dependency between Unit A and Unit B:
// Data dependency from Unit B to Unit A due to usage of Val in
// getCopyToReg(Chain_1, Val)
// Chain dependency from Unit A to Unit B
// So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
getPointerTy(MF.getDataLayout()));
unsigned RetValReg
= (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
X86::RAX : X86::EAX;
Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
Flag = Chain.getValue(1);
// RAX/EAX now acts like a return value.
RetOps.push_back(
DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
// Add the returned register to the CalleeSaveDisableRegs list.
if (ShouldDisableCalleeSavedRegister)
MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
}
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
const MCPhysReg *I =
TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
if (I) {
for (; *I; ++I) {
if (X86::GR64RegClass.contains(*I))
RetOps.push_back(DAG.getRegister(*I, MVT::i64));
else
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
}
}
RetOps[0] = Chain; // Update chain.
// Add the flag if we have it.
if (Flag.getNode())
RetOps.push_back(Flag);
X86ISD::NodeType opcode = X86ISD::RET_FLAG;
if (CallConv == CallingConv::X86_INTR)
opcode = X86ISD::IRET;
return DAG.getNode(opcode, dl, MVT::Other, RetOps);
}
/// Check whether the single result of \p N flows, through at most one
/// CopyToReg or FP_EXTEND, only into X86ISD::RET_FLAG nodes.
///
/// \param N      Node whose result is examined; it must define exactly one
///               value and that value must have exactly one use.
/// \param Chain  [in,out] On success, set to operand 0 of the intervening
///               CopyToReg; it keeps its incoming value when the user is an
///               FP_EXTEND. Left untouched on failure.
/// \returns true if the checks below all pass, false otherwise.
bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
  const bool HasSingleUse = N->getNumValues() == 1 && N->hasNUsesOfValue(1, 0);
  if (!HasSingleUse)
    return false;

  SDNode *User = *N->use_begin();
  SDValue NewChain = Chain;
  switch (User->getOpcode()) {
  case ISD::CopyToReg: {
    // A copy that carries a glue operand is conservatively treated as unsafe
    // for a tail call.
    const unsigned LastIdx = User->getNumOperands() - 1;
    if (User->getOperand(LastIdx).getValueType() == MVT::Glue)
      return false;
    NewChain = User->getOperand(0);
    break;
  }
  case ISD::FP_EXTEND:
    break;
  default:
    return false;
  }

  // Every user of the copy/extend has to be a return node, and there has to
  // be at least one of them.
  bool SawReturn = false;
  for (SDNode::use_iterator It = User->use_begin(), End = User->use_end();
       It != End; ++It) {
    if (It->getOpcode() != X86ISD::RET_FLAG)
      return false;

    // Returning more than one value definitely rules out a tail call, see
    // PR19530. With exactly four operands the last one must be glue.
    const unsigned NumOps = It->getNumOperands();
    if (NumOps > 4)
      return false;
    if (NumOps == 4 && It->getOperand(NumOps - 1).getValueType() != MVT::Glue)
      return false;

    SawReturn = true;
  }
  if (!SawReturn)
    return false;

  Chain = NewChain;
  return true;
}
/// Pick the type an integer return value of type \p VT is widened to.
///
/// The minimum is the register type of i8 for i1 (and, outside Darwin, for
/// i8/i16 as well), and the register type of i32 otherwise. \p VT is returned
/// unchanged when it is not narrower than that minimum. \p ExtendKind is not
/// consulted.
EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                           ISD::NodeType ExtendKind) const {
  const bool IsDarwin = Subtarget.getTargetTriple().isOSDarwin();

  // The ABI does not require i1, i8 or i16 to be extended.
  //
  // Darwin is the exception for i8/i16: code in the wild relies on Clang's old
  // behaviour of always extending those return values, so that is kept for
  // now (PR26665).
  const bool NeedsNoExtension =
      VT == MVT::i1 || (!IsDarwin && (VT == MVT::i8 || VT == MVT::i16));
  MVT ReturnMVT = NeedsNoExtension ? MVT::i8 : MVT::i32;

  EVT MinVT = getRegisterType(Context, ReturnMVT);
  if (VT.bitsLT(MinVT))
    return MinVT;
  return VT;
}
/// Build a v64i1 mask value out of two 32-bit register halves.
///
/// \param VA        Location of the low 32 bits.
/// \param NextVA    Location of the high 32 bits.
/// \param Root      Chain used for both register reads.
/// \param DAG       DAG in which the nodes are created.
/// \param Dl        Debug location attached to the new nodes.
/// \param Subtarget Must be a 32-bit target with AVX512BW.
/// \param [in,out] InFlag  When non-null, the physical registers are read
///                  directly, the two reads are glued to *InFlag, and *InFlag
///                  is updated to the glue result of the last read. When
///                  null, the registers are first made live-in virtual
///                  registers and those are read instead.
/// \return the two halves, each bitcast to v32i1, concatenated into a v64i1.
static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
                                SDValue &Root, SelectionDAG &DAG,
                                const SDLoc &Dl, const X86Subtarget &Subtarget,
                                SDValue *InFlag = nullptr) {
  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
  assert(VA.getValVT() == MVT::v64i1 &&
         "Expecting first location of 64 bit width type");
  assert(NextVA.getValVT() == VA.getValVT() &&
         "The locations should have the same type");
  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
         "The values should reside in two registers");

  // Fetch each half as an i32.
  SDValue LoHalf, HiHalf;
  if (InFlag) {
    // Read straight from the physical registers, gluing the reads together.
    LoHalf = DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
    *InFlag = LoHalf.getValue(2);
    HiHalf =
        DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
    *InFlag = HiHalf.getValue(2);
  } else {
    // No glue to attach to: go through intermediate live-in virtual registers.
    MachineFunction &MF = DAG.getMachineFunction();
    const TargetRegisterClass *GR32 = &X86::GR32RegClass;
    unsigned LoVReg = MF.addLiveIn(VA.getLocReg(), GR32);
    LoHalf = DAG.getCopyFromReg(Root, Dl, LoVReg, MVT::i32);
    unsigned HiVReg = MF.addLiveIn(NextVA.getLocReg(), GR32);
    HiHalf = DAG.getCopyFromReg(Root, Dl, HiVReg, MVT::i32);
  }

  // Reinterpret each i32 as v32i1 and join them.
  SDValue LoMask = DAG.getBitcast(MVT::v32i1, LoHalf);
  SDValue HiMask = DAG.getBitcast(MVT::v32i1, HiHalf);
  return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, LoMask, HiMask);
}
/// Convert a value held in an integer register into the mask type \p ValVT.
///
/// v1i1 is produced with SCALAR_TO_VECTOR. v64i1 is a plain bitcast of the
/// (i64) value. v8i1/v16i1/v32i1 first truncate the value to i8/i16/i32 and
/// then bitcast it.
///
/// \param ValArg The value read from the register.
/// \param ValVT  The mask type to produce.
/// \param ValLoc The type of the location; only checked for v64i1.
/// \returns a DAG node holding the operand lowered to the mask type.
static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
                               const EVT &ValLoc, const SDLoc &Dl,
                               SelectionDAG &DAG) {
  if (ValVT == MVT::v1i1)
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValArg);

  if (ValVT == MVT::v64i1) {
    // On a 32 bit machine this case is handled by getv64i1Argument. On a
    // 64 bit machine no truncation is needed, only the bitcast.
    assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
    return DAG.getBitcast(ValVT, ValArg);
  }

  // Pick the integer type that has as many bits as the mask has elements.
  MVT NarrowVT;
  switch (ValVT.getSimpleVT().SimpleTy) {
  case MVT::v8i1:
    NarrowVT = MVT::i8;
    break;
  case MVT::v16i1:
    NarrowVT = MVT::i16;
    break;
  case MVT::v32i1:
    NarrowVT = MVT::i32;
    break;
  default:
    llvm_unreachable("Expecting a vector of i1 types");
  }

  SDValue Narrowed = DAG.getNode(ISD::TRUNCATE, Dl, NarrowVT, ValArg);
  return DAG.getBitcast(ValVT, Narrowed);
}
/// Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
/// \param Chain    Incoming chain. Every plain register copy created here is
///                 chained after it, and the updated chain is returned.
/// \param InFlag   Glue value threaded through the register copies.
/// \param CallConv Calling convention handed to the CCState that assigns the
///                 result locations.
/// \param isVarArg Forwarded to the CCState.
/// \param Ins      Descriptions of the values returned by the call.
/// \param dl       Debug location attached to the new nodes.
/// \param DAG      DAG in which the nodes are created.
/// \param InVals   [out] One lowered value is appended per loop iteration.
/// \param RegMask  Optional register mask. When non-null, the bit of the
///                 register of each visited location, and of each of its
///                 sub-registers, is cleared.
/// \returns the chain after the last plain register copy.
SDValue X86TargetLowering::LowerCallResult(
SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
uint32_t *RegMask) const {
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
bool Is64Bit = Subtarget.is64Bit();
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
// Copy all of the result registers out of their specified physreg.
// I indexes RVLocs and InsIndex indexes Ins. They diverge when a custom
// location consumes two RVLocs entries (see RVLocs[++I] below).
for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
++I, ++InsIndex) {
CCValAssign &VA = RVLocs[I];
EVT CopyVT = VA.getLocVT();
// In some calling conventions we need to remove the used registers
// from the register mask.
if (RegMask) {
// The mask is an array of 32-bit words, one bit per register number.
for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
SubRegs.isValid(); ++SubRegs)
RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
}
// If this is x86-64, and we disabled SSE, we can't return FP values
// (the same applies to 'inreg' results on any target). Report the error and
// retarget the location to FP0 so lowering can continue.
if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
}
// If we prefer to use the value in xmm registers, copy it out as f80 and
// use a truncate to move it from fp stack reg to xmm reg.
bool RoundAfterCopy = false;
if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
isScalarFPTypeInSSEReg(VA.getValVT())) {
if (!Subtarget.hasX87())
report_fatal_error("X87 register return with X87 disabled");
CopyVT = MVT::f80;
// Rounding is only needed when the location type is not already f80.
RoundAfterCopy = (CopyVT != VA.getLocVT());
}
SDValue Val;
if (VA.needsCustom()) {
assert(VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
// Consumes this location and the next one. The helper updates InFlag
// through the pointer.
Val =
getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
} else {
// Chain is set to result 1 of the copy node. Results 0 and 2 of that same
// node are then taken as the copied value and the new glue.
Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
.getValue(1);
Val = Chain.getValue(0);
InFlag = Chain.getValue(2);
}
if (RoundAfterCopy)
Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
// This truncation won't change the value.
DAG.getIntPtrConstant(1, dl));
// Values whose scalar type is i1 and that were extended in their location
// are converted back to the value type here.
if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
if (VA.getValVT().isVector() &&
((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
(VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
// promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
} else
Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
}
InVals.push_back(Val);
}
return Chain;
}
//===----------------------------------------------------------------------===//
// C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
// The StdCall calling convention is the standard for many Windows API
// routines. It differs from the C calling convention only slightly: the
// callee, not the caller, cleans up the stack, and symbols are also
// name-decorated differently. It doesn't support any vector arguments.
// For info on the fast calling convention, see the Fast Calling Convention
// (tail call) implementation, LowerX86_32FastCCCallTo.
/// Classifies whether a call or a function uses struct return (sret)
/// semantics, and in which form. Computed by callIsStructReturn for calls and
/// by argsAreStructReturn for function definitions; both look only at the
/// flags of the first argument.
enum StructReturnType {
// There are no arguments, or the first argument is not marked sret.
NotStructReturn,
// The first argument is sret and is either marked inreg or the target is MCU.
RegStructReturn,
// The first argument is sret and neither of the conditions above holds.
StackStructReturn
};
/// Determines whether a call uses struct return semantics, based on the flags
/// of its first outgoing argument.
///
/// \param Outs  Outgoing arguments of the call.
/// \param IsMCU True when compiling for an MCU target.
static StructReturnType
callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
  // Only a leading sret argument makes this a struct return.
  if (Outs.empty() || !Outs[0].Flags.isSRet())
    return NotStructReturn;

  const ISD::ArgFlagsTy &FirstArgFlags = Outs[0].Flags;
  return (FirstArgFlags.isInReg() || IsMCU) ? RegStructReturn
                                            : StackStructReturn;
}
/// Determines whether a function uses struct return semantics, based on the
/// flags of its first incoming argument.
///
/// \param Ins   Incoming arguments of the function.
/// \param IsMCU True when compiling for an MCU target.
static StructReturnType
argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
  // Only a leading sret argument makes this a struct return.
  if (Ins.empty() || !Ins[0].Flags.isSRet())
    return NotStructReturn;

  const ISD::ArgFlagsTy &FirstArgFlags = Ins[0].Flags;
  return (FirstArgFlags.isInReg() || IsMCU) ? RegStructReturn
                                            : StackStructReturn;
}
/// Emit an always-inlined, non-volatile memcpy of a byval aggregate from
/// \p Src to \p Dst. The size and alignment of the copy are taken from the
/// byval information in \p Flags. The copy will be passed as a byval function
/// parameter.
///
/// \returns the node produced by SelectionDAG::getMemcpy.
static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
                                         SDValue Chain, ISD::ArgFlagsTy Flags,
                                         SelectionDAG &DAG, const SDLoc &dl) {
  SDValue ByteCount = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
  const auto Alignment = Flags.getByValAlign();
  return DAG.getMemcpy(Chain, dl, Dst, Src, ByteCount, Alignment,
                       /*isVolatile*/ false, /*AlwaysInline=*/true,
                       /*isTailCall*/ false, MachinePointerInfo(),
                       MachinePointerInfo());
}
/// Return true if \p CC is one of the calling conventions for which tail call
/// optimization can be guaranteed: fastcc, GHC, X86 regcall, HiPE or HHVM.
static bool canGuaranteeTCO(CallingConv::ID CC) {
  switch (CC) {
  case CallingConv::Fast:
  case CallingConv::GHC:
  case CallingConv::X86_RegCall:
  case CallingConv::HiPE:
  case CallingConv::HHVM:
    return true;
  default:
    return false;
  }
}
/// Return true if a call using \p CC might ever be tail call optimized. This
/// holds for the conventions listed below and for every convention accepted
/// by canGuaranteeTCO.
static bool mayTailCallThisCC(CallingConv::ID CC) {
  // C calling conventions.
  const bool IsCLike = CC == CallingConv::C || CC == CallingConv::Win64 ||
                       CC == CallingConv::X86_64_SysV;
  // Callee pop conventions.
  const bool IsCalleePop =
      CC == CallingConv::X86_ThisCall || CC == CallingConv::X86_StdCall ||
      CC == CallingConv::X86_VectorCall || CC == CallingConv::X86_FastCall;
  // Swift.
  const bool IsSwift = CC == CallingConv::Swift;

  if (IsCLike || IsCalleePop || IsSwift)
    return true;
  return canGuaranteeTCO(CC);
}
/// Return true if the function is being made into a tailcall target by
/// changing its ABI, i.e. guaranteed tail call optimization is requested and
/// \p CC is a convention for which it can be guaranteed.
static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
  if (!GuaranteedTailCallOpt)
    return false;
  return canGuaranteeTCO(CC);
}
bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
auto Attr =
CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
if (!CI->isTailCall() || Attr.getValueAsString() == "true")
return false;
ImmutableCallSite CS(CI);
CallingConv::ID CalleeCC = CS.getCallingConv();
if (!mayTailCallThisCC(CalleeCC))
return false;
return true;
}
/// Create the DAG nodes that fetch an incoming argument passed in memory.
///
/// \param Chain    Chain used for the loads created here.
/// \param CallConv Calling convention of the function; together with the
///                 GuaranteedTailCallOpt option it decides whether the stack
///                 object is created mutable.
/// \param Ins      Incoming argument descriptions.
/// \param dl       Debug location attached to the new nodes.
/// \param DAG      DAG in which the nodes are created.
/// \param VA       The memory location assigned to the argument.
/// \param MFI      Frame info in which fixed stack objects are created.
/// \param i        Index into \p Ins of the argument being lowered.
/// \returns a frame index for byval arguments; otherwise a load from the
///          argument's stack slot, converted to VA.getValVT() when the value
///          was extended in memory.
SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &dl, SelectionDAG &DAG,
const CCValAssign &VA,
MachineFrameInfo &MFI, unsigned i) const {
// Create the nodes corresponding to a load from this parameter slot.
ISD::ArgFlagsTy Flags = Ins[i].Flags;
bool AlwaysUseMutable = shouldGuaranteeTCO(
CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
// The slot is only immutable when TCO is not guaranteed and the argument is
// not byval.
bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
EVT ValVT;
MVT PtrVT = getPointerTy(DAG.getDataLayout());
// If value is passed by pointer we have address passed instead of the value
// itself. No need to extend if the mask value and location share the same
// absolute size.
bool ExtendedInMem =
VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
// Load with the location type when the slot holds a pointer or an extended
// i1-based value; otherwise load the value type directly.
if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
ValVT = VA.getLocVT();
else
ValVT = VA.getValVT();
// FIXME: For now, all byval parameter objects are marked mutable. This can be
// changed with more analysis.
// In case of tail call optimization mark all arguments mutable. Since they
// could be overwritten by lowering of arguments in case of a tail call.
if (Flags.isByVal()) {
unsigned Bytes = Flags.getByValSize();
if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
// FIXME: For now, all byval parameter objects are marked as aliasing. This
// can be improved with deeper analysis.
int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
/*isAliased=*/true);
// byval arguments are returned as the address of the object, not as a load.
return DAG.getFrameIndex(FI, PtrVT);
}
// This is an argument in memory. We might be able to perform copy elision.
// If the argument is passed directly in memory without any extension, then we
// can perform copy elision. Large vector types, for example, may be passed
// indirectly by pointer.
if (Flags.isCopyElisionCandidate() &&
VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem) {
EVT ArgVT = Ins[i].ArgVT;
SDValue PartAddr;
if (Ins[i].PartOffset == 0) {
// If this is a one-part value or the first part of a multi-part value,
// create a stack object for the entire argument value type and return a
// load from our portion of it. This assumes that if the first part of an
// argument is in memory, the rest will also be in memory.
int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
/*IsImmutable=*/false);
PartAddr = DAG.getFrameIndex(FI, PtrVT);
return DAG.getLoad(
ValVT, dl, Chain, PartAddr,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
} else {
// This is not the first piece of an argument in memory. See if there is
// already a fixed stack object including this offset. If so, assume it
// was created by the PartOffset == 0 branch above and create a load from
// the appropriate offset into it.
int64_t PartBegin = VA.getLocMemOffset();
int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
int FI = MFI.getObjectIndexBegin();
// Scan the fixed objects for one that fully contains [PartBegin, PartEnd).
for (; MFI.isFixedObjectIndex(FI); ++FI) {
int64_t ObjBegin = MFI.getObjectOffset(FI);
int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
break;
}
// FI is still a fixed object index only if the scan stopped on a match.
if (MFI.isFixedObjectIndex(FI)) {
SDValue Addr =
DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
return DAG.getLoad(
ValVT, dl, Chain, Addr,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
Ins[i].PartOffset));
}
// No containing object was found: fall through to the generic path below.
}
}
// Generic path: a dedicated fixed object for this slot, then a load from it.
int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
VA.getLocMemOffset(), isImmutable);
// Set SExt or ZExt flag.
if (VA.getLocInfo() == CCValAssign::ZExt) {
MFI.setObjectZExt(FI, true);
} else if (VA.getLocInfo() == CCValAssign::SExt) {
MFI.setObjectSExt(FI, true);
}
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
SDValue Val = DAG.getLoad(
ValVT, dl, Chain, FIN,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
// A value that was extended in memory is converted back to the value type:
// SCALAR_TO_VECTOR for vector types, TRUNCATE otherwise.
return ExtendedInMem
? (VA.getValVT().isVector()
? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
: DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
: Val;
}
// FIXME: Get this from tablegen.
/// Return the 64-bit general purpose registers used to pass integer
/// arguments: RCX, RDX, R8, R9 for a Win64 calling convention, and RDI, RSI,
/// RDX, RCX, R8, R9 otherwise. Only valid on 64-bit subtargets.
static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
                                                const X86Subtarget &Subtarget) {
  assert(Subtarget.is64Bit());

  static const MCPhysReg Win64IntArgRegs[] = {X86::RCX, X86::RDX, X86::R8,
                                              X86::R9};
  static const MCPhysReg DefaultIntArgRegs[] = {X86::RDI, X86::RSI, X86::RDX,
                                                X86::RCX, X86::R8,  X86::R9};

  if (Subtarget.isCallingConvWin64(CallConv))
    return makeArrayRef(Win64IntArgRegs);
  return makeArrayRef(DefaultIntArgRegs);
}
// FIXME: Get this from tablegen.
/// Return the XMM registers that may carry arguments (XMM0-XMM7), or an empty
/// list when none apply: for a Win64 calling convention, with soft-float, when
/// the function has the NoImplicitFloat attribute, or without SSE1. Only
/// valid on 64-bit subtargets.
static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
                                                CallingConv::ID CallConv,
                                                const X86Subtarget &Subtarget) {
  assert(Subtarget.is64Bit());

  // The XMM registers which might contain var arg parameters are shadowed
  // in their paired GPR, so only the GPRs need saving to their home slots.
  // TODO: __vectorcall will change this.
  if (Subtarget.isCallingConvWin64(CallConv))
    return None;

  bool NoImplicitFloatOps =
      MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
  bool isSoftFloat = Subtarget.useSoftFloat();
  assert(!(isSoftFloat && NoImplicitFloatOps) &&
         "SSE register cannot be used when SSE is disabled!");

  // Kernel mode asks for SSE to be disabled, so there are no XMM argument
  // registers.
  const bool XMMsAvailable =
      !isSoftFloat && !NoImplicitFloatOps && Subtarget.hasSSE1();
  if (!XMMsAvailable)
    return None;

  static const MCPhysReg XMMArgRegs[] = {X86::XMM0, X86::XMM1, X86::XMM2,
                                         X86::XMM3, X86::XMM4, X86::XMM5,
                                         X86::XMM6, X86::XMM7};
  return makeArrayRef(std::begin(XMMArgRegs), std::end(XMMArgRegs));
}
#ifndef NDEBUG
/// Debug-only helper: return true if \p ArgLocs is ordered by non-decreasing
/// value number.
static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
  const auto ValNoLess = [](const CCValAssign &LHS,
                            const CCValAssign &RHS) -> bool {
    return LHS.getValNo() < RHS.getValNo();
  };
  return std::is_sorted(ArgLocs.begin(), ArgLocs.end(), ValNoLess);
}
#endif
SDValue X86TargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
const Function &F = MF.getFunction();
if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
F.getName() == "main")
FuncInfo->setForceFramePointer(true);
MachineFrameInfo &MFI = MF.getFrameInfo();
bool Is64Bit = Subtarget.is64Bit();
bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
assert(
!(isVarArg && canGuaranteeTCO(CallConv)) &&
"Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64.
if (IsWin64)
CCInfo.AllocateStack(32, 8);
CCInfo.AnalyzeArguments(Ins, CC_X86);
// In vectorcall calling convention a second pass is required for the HVA
// types.
if (CallingConv::X86_VectorCall == CallConv) {
CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
}
// The next loop assumes that the locations are in the same order of the
// input arguments.
assert(isSortedByValueNo(ArgLocs) &&
"Argument Location list must be sorted before lowering");
SDValue ArgValue;
for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
++I, ++InsIndex) {
assert(InsIndex < Ins.size() && "Invalid Ins index");
CCValAssign &VA = ArgLocs[I];
if (VA.isRegLoc()) {
EVT RegVT = VA.getLocVT();
if (VA.needsCustom()) {
assert(
VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
// v64i1 values, in regcall calling convention, that are
// compiled to 32 bit arch, are split up into two registers.
ArgValue =
getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
} else {
const TargetRegisterClass *RC;
if (RegVT == MVT::i8)
RC = &X86::GR8RegClass;
else if (RegVT == MVT::i16)
RC = &X86::GR16RegClass;
else if (RegVT == MVT::i32)
RC = &X86::GR32RegClass;
else if (Is64Bit && RegVT == MVT::i64)
RC = &X86::GR64RegClass;
else if (RegVT == MVT::f32)
RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
else if (RegVT == MVT::f64)
RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
else if (RegVT == MVT::f80)
RC = &X86::RFP80RegClass;
else if (RegVT == MVT::f128)
RC = &X86::VR128RegClass;
else if (RegVT.is512BitVector())
RC = &X86::VR512RegClass;
else if (RegVT.is256BitVector())
RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
else if (RegVT.is128BitVector())
RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
else if (RegVT == MVT::x86mmx)
RC = &X86::VR64RegClass;
else if (RegVT == MVT::v1i1)
RC = &X86::VK1RegClass;
else if (RegVT == MVT::v8i1)
RC = &X86::VK8RegClass;
else if (RegVT == MVT::v16i1)
RC = &X86::VK16RegClass;
else if (RegVT == MVT::v32i1)
RC = &X86::VK32RegClass;
else if (RegVT == MVT::v64i1)
RC = &X86::VK64RegClass;
else
llvm_unreachable("Unknown argument type!");
unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
}
// If this is an 8 or 16-bit value, it is really passed promoted to 32
// bits. Insert an assert[sz]ext to capture this, then truncate to the
// right size.
if (VA.getLocInfo() == CCValAssign::SExt)
ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
DAG.getValueType(VA.getValVT()));
else if (VA.getLocInfo() == CCValAssign::ZExt)
ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
DAG.getValueType(VA.getValVT()));
else if (VA.getLocInfo() == CCValAssign::BCvt)
ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
if (VA.isExtInLoc()) {
// Handle MMX values passed in XMM regs.
if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
else if (VA.getValVT().isVector() &&
VA.getValVT().getScalarType() == MVT::i1 &&
((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
(VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
// Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
} else
ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
}
} else {
assert(VA.isMemLoc());
ArgValue =
LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
}
// If value is passed via pointer - do a load.
if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
ArgValue =
DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
InVals.push_back(ArgValue);
}
for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
// Swift calling convention does not require we copy the sret argument
// into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
if (CallConv == CallingConv::Swift)
continue;
// All x86 ABIs require that for returning structs by value we copy the
// sret argument into %rax/%eax (depending on ABI) for the return. Save
// the argument into a virtual register so that we can access it from the
// return points.
if (Ins[I].Flags.isSRet()) {
unsigned Reg = FuncInfo->getSRetReturnReg();
if (!Reg) {
MVT PtrTy = getPointerTy(DAG.getDataLayout());
Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
FuncInfo->setSRetReturnReg(Reg);
}
SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
break;
}
}
unsigned StackSize = CCInfo.getNextStackOffset();
// Align stack specially for tail calls.
if (shouldGuaranteeTCO(CallConv,
MF.getTarget().Options.GuaranteedTailCallOpt))
StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
// If the function takes variable number of arguments, make a frame index for
// the start of the first vararg value... for expansion of llvm.va_start. We
// can skip this if there are no va_start calls.
if (MFI.hasVAStart() &&
(Is64Bit || (CallConv != CallingConv::X86_FastCall &&
CallConv != CallingConv::X86_ThisCall))) {
FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
}
// Figure out if XMM registers are in use.
assert(!(Subtarget.useSoftFloat() &&
F.hasFnAttribute(Attribute::NoImplicitFloat)) &&
"SSE register cannot be used when SSE is disabled!");
// 64-bit calling conventions support varargs and register parameters, so we
// have to do extra work to spill them in the prologue.
if (Is64Bit && isVarArg && MFI.hasVAStart()) {
// Find the first unallocated argument registers.
ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
"SSE register cannot be used when SSE is disabled!");
// Gather all the live in physical registers.
SmallVector<SDValue, 6> LiveGPRs;
SmallVector<SDValue, 8> LiveXMMRegs;
SDValue ALVal;
for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
LiveGPRs.push_back(
DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
}
if (!ArgXMMs.empty()) {
unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
LiveXMMRegs.push_back(
DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
}
}
if (IsWin64) {
// Get to the caller-allocated home save location. Add 8 to account
// for the return address.
int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
FuncInfo->setRegSaveFrameIndex(
MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
// Fixup to set vararg frame on shadow area (4 x i64).
if (NumIntRegs < 4)
FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
} else {
// For X86-64, if there are vararg parameters that are passed via
// registers, then we must store them to their spots on the stack so
// they may be loaded by dereferencing the result of va_next.
FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
}
// Store the integer parameter registers.
SmallVector<SDValue, 8> MemOps;
SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
getPointerTy(DAG.getDataLayout()));
unsigned Offset = FuncInfo->getVarArgsGPOffset();
for (SDValue Val : LiveGPRs) {
SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
RSFIN, DAG.getIntPtrConstant(Offset, dl));
SDValue Store =
DAG.getStore(Val.getValue(1), dl, Val, FIN,
MachinePointerInfo::getFixedStack(
DAG.getMachineFunction(),
FuncInfo->getRegSaveFrameIndex(), Offset));
MemOps.push_back(Store);
Offset += 8;
}
if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
// Now store the XMM (fp + vector) parameter registers.
SmallVector<SDValue, 12> SaveXMMOps;
SaveXMMOps.push_back(Chain);
SaveXMMOps.push_back(ALVal);
SaveXMMOps.push_back(DAG.getIntPtrConstant(
FuncInfo->getRegSaveFrameIndex(), dl));
SaveXMMOps.push_back(DAG.getIntPtrConstant(
FuncInfo->getVarArgsFPOffset(), dl));
SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
LiveXMMRegs.end());
MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
MVT::Other, SaveXMMOps));
}
if (!MemOps.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
}
if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
// Find the largest legal vector type.
MVT VecVT = MVT::Other;
// FIXME: Only some x86_32 calling conventions support AVX512.
if (Subtarget.hasAVX512() &&
(Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
CallConv == CallingConv::Intel_OCL_BI)))
VecVT = MVT::v16f32;
else if (Subtarget.hasAVX())
VecVT = MVT::v8f32;
else if (Subtarget.hasSSE2())
VecVT = MVT::v4f32;
// We forward some GPRs and some vector types.
SmallVector<MVT, 2> RegParmTypes;
MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
RegParmTypes.push_back(IntVT);
if (VecVT != MVT::Other)
RegParmTypes.push_back(VecVT);
// Compute the set of forwarded registers. The rest are scratch.
SmallVectorImpl<ForwardedRegister> &Forwards =
FuncInfo->getForwardedMustTailRegParms();
CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
// Conservatively forward AL on x86_64, since it might be used for varargs.
if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
}
// Copy all forwards from physical to virtual registers.
for (ForwardedRegister &FR : Forwards) {
// FIXME: Can we use a less constrained schedule?
SDValue RegVal = DAG.getCopyFromReg(Chain, dl, FR.VReg, FR.VT);
FR.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(FR.VT));
Chain = DAG.getCopyToReg(Chain, dl, FR.VReg, RegVal);
}
}
// Some CCs need callee pop.
if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
MF.getTarget().Options.GuaranteedTailCallOpt)) {
FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
} else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
// X86 interrupts must pop the error code (and the alignment padding) if
// present.
FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
} else {
FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
// If this is an sret function, the return should pop the hidden pointer.
if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
!Subtarget.getTargetTriple().isOSMSVCRT() &&
argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
FuncInfo->setBytesToPopOnReturn(4);
}
if (!Is64Bit) {
// RegSaveFrameIndex is X86-64 only.
FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
if (CallConv == CallingConv::X86_FastCall ||
CallConv == CallingConv::X86_ThisCall)
// fastcc functions can't have varargs.
FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
}
FuncInfo->setArgumentStackSize(StackSize);
if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
if (Personality == EHPersonality::CoreCLR) {
assert(Is64Bit);
// TODO: Add a mechanism to frame lowering that will allow us to indicate
// that we'd prefer this slot be allocated towards the bottom of the frame
// (i.e. near the stack pointer after allocating the frame). Every
// funclet needs a copy of this slot in its (mostly empty) frame, and the
// offset from the bottom of this and each funclet's frame must be the
// same, so the size of funclets' (mostly empty) frames is dictated by
// how far this slot is from the bottom (since they allocate just enough
// space to accommodate holding this slot at the correct offset).
int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
EHInfo->PSPSymFrameIdx = PSPSymFI;
}
}
if (CallConv == CallingConv::X86_RegCall ||
F.hasFnAttribute("no_caller_saved_registers")) {
MachineRegisterInfo &MRI = MF.getRegInfo();
for (std::pair<unsigned, unsigned> Pair : MRI.liveins())
MRI.disableCalleeSavedRegister(Pair.first);
}
return Chain;
}
/// Place one outgoing call argument into its assigned stack slot.
///
/// The slot address is StackPtr plus the memory offset recorded in \p VA.
/// Arguments flagged byval are copied into the slot via
/// CreateCopyOfByValArgument; everything else is written with a single store.
/// Returns the node produced by that copy or store.
SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
                                            SDValue Arg, const SDLoc &dl,
                                            SelectionDAG &DAG,
                                            const CCValAssign &VA,
                                            ISD::ArgFlagsTy Flags) const {
  const unsigned ArgOffset = VA.getLocMemOffset();

  // Address of the destination slot: StackPtr + ArgOffset.
  SDValue OffsetNode = DAG.getIntPtrConstant(ArgOffset, dl);
  SDValue SlotAddr = DAG.getNode(ISD::ADD, dl,
                                 getPointerTy(DAG.getDataLayout()), StackPtr,
                                 OffsetNode);

  // The common case: a plain value stored directly into the slot.
  if (!Flags.isByVal())
    return DAG.getStore(
        Chain, dl, Arg, SlotAddr,
        MachinePointerInfo::getStack(DAG.getMachineFunction(), ArgOffset));

  // byval arguments are copied into the slot instead.
  return CreateCopyOfByValArgument(Arg, SlotAddr, Chain, Flags, DAG, dl);
}
/// Load the current ("old") return address from its frame slot so that a tail
/// call can later re-store it at a different location.
///
/// On return \p OutRetAddr holds the loaded value; the function result is the
/// chain output (result #1) of that load. \p IsTailCall, \p Is64Bit and
/// \p FPDiff are accepted but not consulted here.
SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
    SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
    bool Is64Bit, int FPDiff, const SDLoc &dl) const {
  EVT PtrVT = getPointerTy(DAG.getDataLayout());

  // Frame index of the slot that currently holds the return address.
  SDValue RetAddrSlot = getReturnAddressFrameIndex(DAG);

  // Read the return address out of that slot.
  OutRetAddr = DAG.getLoad(PtrVT, dl, Chain, RetAddrSlot, MachinePointerInfo());

  // Hand back the load's chain so later nodes are ordered after it.
  return OutRetAddr.getValue(1);
}
/// Re-store the return address \p RetAddrFrIdx into the slot it must occupy
/// after a tail call that shifts the argument area by \p FPDiff bytes.
///
/// When \p FPDiff is zero the return address does not move and \p Chain is
/// returned untouched. Otherwise a fixed stack object of \p SlotSize bytes is
/// created at offset FPDiff - SlotSize and the return address is stored there;
/// the store's chain is returned.
static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
                                        SDValue Chain, SDValue RetAddrFrIdx,
                                        EVT PtrVT, unsigned SlotSize,
                                        int FPDiff, const SDLoc &dl) {
  // Nothing to relocate.
  if (FPDiff == 0)
    return Chain;

  // Fixed object describing the relocated return-address slot.
  const int MovedRetAddrFI = MF.getFrameInfo().CreateFixedObject(
      SlotSize, (int64_t)FPDiff - SlotSize, false);
  SDValue MovedRetAddrSlot = DAG.getFrameIndex(MovedRetAddrFI, PtrVT);

  // Write the previously loaded return address into the new slot.
  return DAG.getStore(Chain, dl, RetAddrFrIdx, MovedRetAddrSlot,
                      MachinePointerInfo::getFixedStack(
                          DAG.getMachineFunction(), MovedRetAddrFI));
}
/// Build the vector shuffle of \p V1 and \p V2 that corresponds to a
/// movs{s|d} / movd style operation for vector type \p VT.
///
/// The shuffle mask is <N, 1, 2, ..., N-1> where N is the element count of
/// \p VT: lane 0 uses mask index N and every other lane keeps its own index.
static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
                       SDValue V2) {
  const unsigned EltCount = VT.getVectorNumElements();

  SmallVector<int, 8> ShuffleMask;
  // Lane 0 is the only lane that does not map to its own index.
  ShuffleMask.push_back(EltCount);

  // Lanes 1 .. EltCount-1 are the identity.
  unsigned Lane = 1;
  while (Lane != EltCount) {
    ShuffleMask.push_back(Lane);
    ++Lane;
  }

  return DAG.getVectorShuffle(VT, dl, V1, V2, ShuffleMask);
}
/// Lower an outgoing call described by \p CLI into SelectionDAG nodes.
///
/// In order, this routine:
///  - decides whether the call is emitted as a tail call (musttail, sibcall or
///    guaranteed TCO) and may clear CLI.IsTailCall;
///  - assigns argument locations with CC_X86 and opens the call sequence;
///  - promotes each argument and either queues it for a register copy or
///    stores it to the stack;
///  - for non-sibcall tail calls, re-stores stack arguments and the return
///    address at their final positions;
///  - builds the call operand list (chain, callee, argument registers,
///    register mask, glue) and emits X86ISD::TC_RETURN, X86ISD::NT_CALL or
///    X86ISD::CALL;
///  - closes the call sequence and defers to LowerCallResult, which fills
///    \p InVals.
///
/// Returns the TC_RETURN node for tail calls, otherwise whatever
/// LowerCallResult returns.
SDValue
X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
SDLoc &dl = CLI.DL;
SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
CallingConv::ID CallConv = CLI.CallConv;
// Reference: changes to isTailCall below are written back into CLI.
bool &isTailCall = CLI.IsTailCall;
bool isVarArg = CLI.IsVarArg;
MachineFunction &MF = DAG.getMachineFunction();
bool Is64Bit = Subtarget.is64Bit();
bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
bool IsSibcall = false;
X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
// The call counts as no_caller_saved_registers if either the call site or the
// statically known callee carries the attribute.
bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
(Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CS.getInstruction());
bool HasNoCfCheck =
(CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck());
const Module *M = MF.getMMI().getModule();
Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
MachineFunction::CallSiteInfo CSInfo;
if (CallConv == CallingConv::X86_INTR)
report_fatal_error("X86 interrupts may not be called directly");
// Honor the caller's "disable-tail-calls" function attribute.
if (Attr.getValueAsString() == "true")
isTailCall = false;
if (Subtarget.isPICStyleGOT() &&
!MF.getTarget().Options.GuaranteedTailCallOpt) {
// If we are using a GOT, disable tail calls to external symbols with
// default visibility. Tail calling such a symbol requires using a GOT
// relocation, which forces early binding of the symbol. This breaks code
// that requires lazy function symbol resolution. Using musttail or
// GuaranteedTailCallOpt will override this.
GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
if (!G || (!G->getGlobal()->hasLocalLinkage() &&
G->getGlobal()->hasDefaultVisibility()))
isTailCall = false;
}
bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
if (IsMustTail) {
// Force this to be a tail call. The verifier rules are enough to ensure
// that we can lower this successfully without moving the return address
// around.
isTailCall = true;
} else if (isTailCall) {
// Check if it's really possible to do a tail call.
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
isVarArg, SR != NotStructReturn,
MF.getFunction().hasStructRetAttr(), CLI.RetTy,
Outs, OutVals, Ins, DAG);
// Sibcalls are automatically detected tailcalls which do not require
// ABI changes.
if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
IsSibcall = true;
if (isTailCall)
++NumTailCalls;
}
assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
"Var args not supported with calling convention fastcc, ghc or hipe");
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64.
if (IsWin64)
CCInfo.AllocateStack(32, 8);
CCInfo.AnalyzeArguments(Outs, CC_X86);
// In vectorcall calling convention a second pass is required for the HVA
// types.
if (CallingConv::X86_VectorCall == CallConv) {
CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
}
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
if (IsSibcall)
// This is a sibcall. The memory operands are available in caller's
// own caller's stack.
NumBytes = 0;
else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
canGuaranteeTCO(CallConv))
NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
int FPDiff = 0;
if (isTailCall && !IsSibcall && !IsMustTail) {
// Lower arguments at fp - stackoffset + fpdiff.
unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
FPDiff = NumBytesCallerPushed - NumBytes;
// Set the delta of movement of the returnaddr stackslot.
// Only update the recorded delta when the new one is smaller than the one
// recorded so far, i.e. the minimum delta seen is kept.
if (FPDiff < X86Info->getTCReturnAddrDelta())
X86Info->setTCReturnAddrDelta(FPDiff);
}
unsigned NumBytesToPush = NumBytes;
unsigned NumBytesToPop = NumBytes;
// If we have an inalloca argument, all stack space has already been allocated
// for us and is right at the top of the stack. We don't support multiple
// arguments passed in memory when using inalloca.
if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
NumBytesToPush = 0;
if (!ArgLocs.back().isMemLoc())
report_fatal_error("cannot use inalloca attribute on a register "
"parameter");
if (ArgLocs.back().getLocMemOffset() != 0)
report_fatal_error("any parameter with the inalloca attribute must be "
"the only memory argument");
}
// Sibcalls push nothing, so they get no CALLSEQ_START.
if (!IsSibcall)
Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
NumBytes - NumBytesToPush, dl);
SDValue RetAddrFrIdx;
// Load return address for tail calls.
if (isTailCall && FPDiff)
Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
Is64Bit, FPDiff, dl);
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
SDValue StackPtr;
// The next loop assumes that the locations are in the same order of the
// input arguments.
assert(isSortedByValueNo(ArgLocs) &&
"Argument Location list must be sorted before lowering");
// Walk the register/memloc assignments, inserting copies/loads. In the case
// of tail call optimization arguments are handled later.
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
++I, ++OutIndex) {
assert(OutIndex < Outs.size() && "Invalid Out index");
// Skip inalloca arguments, they have already been written.
ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
if (Flags.isInAlloca())
continue;
CCValAssign &VA = ArgLocs[I];
EVT RegVT = VA.getLocVT();
SDValue Arg = OutVals[OutIndex];
bool isByVal = Flags.isByVal();
// Promote the value if needed.
switch (VA.getLocInfo()) {
default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
break;
case CCValAssign::ZExt:
Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
break;
case CCValAssign::AExt:
if (Arg.getValueType().isVector() &&
Arg.getValueType().getVectorElementType() == MVT::i1)
Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
else if (RegVT.is128BitVector()) {
// Special case: passing MMX values in XMM registers.
Arg = DAG.getBitcast(MVT::i64, Arg);
Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
} else
Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
break;
case CCValAssign::BCvt:
Arg = DAG.getBitcast(RegVT, Arg);
break;
case CCValAssign::Indirect: {
if (isByVal) {
// Memcpy the argument to a temporary stack slot to prevent
// the caller from seeing any modifications the callee may make
// as guaranteed by the `byval` attribute.
int FrameIdx = MF.getFrameInfo().CreateStackObject(
Flags.getByValSize(), std::max(16, (int)Flags.getByValAlign()),
false);
SDValue StackSlot =
DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
Chain =
CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
// From now on treat this as a regular pointer
Arg = StackSlot;
isByVal = false;
} else {
// Store the argument.
SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
Chain = DAG.getStore(
Chain, dl, Arg, SpillSlot,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
Arg = SpillSlot;
}
break;
}
}
if (VA.needsCustom()) {
assert(VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
// Split v64i1 value into two registers
// Note the ++I: the following ArgLocs entry is consumed here as well.
Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
Subtarget);
} else if (VA.isRegLoc()) {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
const TargetOptions &Options = DAG.getTarget().Options;
if (Options.EnableDebugEntryValues)
CSInfo.emplace_back(VA.getLocReg(), I);
if (isVarArg && IsWin64) {
// Win64 ABI requires argument XMM reg to be copied to the corresponding
// shadow reg if callee is a varargs function.
unsigned ShadowReg = 0;
switch (VA.getLocReg()) {
case X86::XMM0: ShadowReg = X86::RCX; break;
case X86::XMM1: ShadowReg = X86::RDX; break;
case X86::XMM2: ShadowReg = X86::R8; break;
case X86::XMM3: ShadowReg = X86::R9; break;
}
if (ShadowReg)
RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
}
} else if (!IsSibcall && (!isTailCall || isByVal)) {
assert(VA.isMemLoc());
// The stack pointer copy is created lazily, on the first memory argument.
if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
getPointerTy(DAG.getDataLayout()));
MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
dl, DAG, VA, Flags));
}
}
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
if (Subtarget.isPICStyleGOT()) {
// ELF / PIC requires GOT in the EBX register before function calls via PLT
// GOT pointer.
if (!isTailCall) {
RegsToPass.push_back(std::make_pair(
unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
getPointerTy(DAG.getDataLayout()))));
} else {
// If we are tail calling and generating PIC/GOT style code load the
// address of the callee into ECX. The value in ecx is used as target of
// the tail jump. This is done to circumvent the ebx/callee-saved problem
// for tail calls on PIC/GOT architectures. Normally we would just put the
// address of GOT into ebx and then call target@PLT. But for tail calls
// ebx would be restored (since ebx is callee saved) before jumping to the
// target@PLT.
// Note: The actual moving to ECX is done further down.
GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
if (G && !G->getGlobal()->hasLocalLinkage() &&
G->getGlobal()->hasDefaultVisibility())
Callee = LowerGlobalAddress(Callee, DAG);
else if (isa<ExternalSymbolSDNode>(Callee))
Callee = LowerExternalSymbol(Callee, DAG);
}
}
if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
// From AMD64 ABI document:
// For calls that may call functions that use varargs or stdargs
// (prototype-less calls or calls to functions containing ellipsis (...) in
// the declaration) %al is used as hidden argument to specify the number
// of SSE registers used. The contents of %al do not need to match exactly
// the number of registers, but must be an upper bound on the number of SSE
// registers used and is in the range 0 - 8 inclusive.
// Count the number of XMM registers allocated.
static const MCPhysReg XMMArgRegs[] = {
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
assert((Subtarget.hasSSE1() || !NumXMMRegs)
&& "SSE registers cannot be used when SSE is disabled");
RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
DAG.getConstant(NumXMMRegs, dl,
MVT::i8)));
}
if (isVarArg && IsMustTail) {
// Pass along every register recorded in the forwarded musttail register
// parameters, reading each from its virtual register.
const auto &Forwards = X86Info->getForwardedMustTailRegParms();
for (const auto &F : Forwards) {
SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
}
}
// For tail calls lower the arguments to the 'real' stack slots. Sibcalls
// don't need this because the eligibility check rejects calls that require
// shuffling arguments passed in memory.
if (!IsSibcall && isTailCall) {
// Force all the incoming stack arguments to be loaded from the stack
// before any new outgoing arguments are stored to the stack, because the
// outgoing stack slots may alias the incoming argument stack slots, and
// the alias isn't otherwise explicit. This is slightly more conservative
// than necessary, because it means that each store effectively depends
// on every argument instead of just those arguments it would clobber.
SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
SmallVector<SDValue, 8> MemOpChains2;
SDValue FIN;
int FI = 0;
for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
++I, ++OutsIndex) {
CCValAssign &VA = ArgLocs[I];
if (VA.isRegLoc()) {
if (VA.needsCustom()) {
assert((CallConv == CallingConv::X86_RegCall) &&
"Expecting custom case only in regcall calling convention");
// This means that we are in special case where one argument was
// passed through two register locations - Skip the next location
++I;
}
continue;
}
assert(VA.isMemLoc());
SDValue Arg = OutVals[OutsIndex];
ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
// Skip inalloca arguments. They don't require any work.
if (Flags.isInAlloca())
continue;
// Create frame index.
int32_t Offset = VA.getLocMemOffset()+FPDiff;
// Slot size in bytes, rounded up from the location type's bit width.
uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
if (Flags.isByVal()) {
// Copy relative to framepointer.
SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
getPointerTy(DAG.getDataLayout()));
Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
StackPtr, Source);
MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
ArgChain,
Flags, DAG, dl));
} else {
// Store relative to framepointer.
MemOpChains2.push_back(DAG.getStore(
ArgChain, dl, Arg, FIN,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
}
}
if (!MemOpChains2.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
// Store the return address to the appropriate stack slot.
Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
getPointerTy(DAG.getDataLayout()),
RegInfo->getSlotSize(), FPDiff, dl);
}
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into registers.
SDValue InFlag;
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
RegsToPass[i].second, InFlag);
InFlag = Chain.getValue(1);
}
if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
// In the 64-bit large code model, we have to make all calls
// through a register, since the call instruction's 32-bit
// pc-relative offset may not be large enough to hold the whole
// address.
// (Callee is deliberately left unchanged in this branch.)
} else if (Callee->getOpcode() == ISD::GlobalAddress ||
Callee->getOpcode() == ISD::ExternalSymbol) {
// Lower direct calls to global addresses and external symbols. Setting
// ForCall to true here has the effect of removing WrapperRIP when possible
// to allow direct calls to be selected without first materializing the
// address into a register.
Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
} else if (Subtarget.isTarget64BitILP32() &&
Callee->getValueType(0) == MVT::i32) {
// Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
}
// Returns a chain & a flag for retval copy to use.
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
SmallVector<SDValue, 8> Ops;
// Non-sibcall tail calls close their call sequence here, before the
// TC_RETURN node is built.
if (!IsSibcall && isTailCall) {
Chain = DAG.getCALLSEQ_END(Chain,
DAG.getIntPtrConstant(NumBytesToPop, dl, true),
DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
InFlag = Chain.getValue(1);
}
Ops.push_back(Chain);
Ops.push_back(Callee);
// Tail calls carry FPDiff as an extra i32 operand.
if (isTailCall)
Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
// Add argument registers to the end of the list so that they are known live
// into the call.
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
Ops.push_back(DAG.getRegister(RegsToPass[i].first,
RegsToPass[i].second.getValueType()));
// Add a register mask operand representing the call-preserved registers.
// If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
// set X86_INTR calling convention because it has the same CSR mask
// (same preserved registers).
const uint32_t *Mask = RegInfo->getCallPreservedMask(
MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
// If this is an invoke in a 32-bit function using a funclet-based
// personality, assume the function clobbers all registers. If an exception
// is thrown, the runtime will not restore CSRs.
// FIXME: Model this more precisely so that we can register allocate across
// the normal edge and spill and fill across the exceptional edge.
if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) {
const Function &CallerFn = MF.getFunction();
EHPersonality Pers =
CallerFn.hasPersonalityFn()
? classifyEHPersonality(CallerFn.getPersonalityFn())
: EHPersonality::Unknown;
if (isFuncletEHPersonality(Pers))
Mask = RegInfo->getNoPreservedMask();
}
// Define a new register mask from the existing mask.
// RegMask stays null unless a modified copy is allocated below; it is later
// handed to LowerCallResult.
uint32_t *RegMask = nullptr;
// In some calling conventions we need to remove the used physical registers
// from the reg mask.
if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
// Allocate a new Reg Mask and copy Mask.
RegMask = MF.allocateRegMask();
unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
// Make sure all sub registers of the argument registers are reset
// in the RegMask.
for (auto const &RegPair : RegsToPass)
for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
SubRegs.isValid(); ++SubRegs)
RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
// Create the RegMask Operand according to our updated mask.
Ops.push_back(DAG.getRegisterMask(RegMask));
} else {
// Create the RegMask Operand according to the static mask.
Ops.push_back(DAG.getRegisterMask(Mask));
}
if (InFlag.getNode())
Ops.push_back(InFlag);
if (isTailCall) {
// We used to do:
//// If this is the first return lowered for this function, add the regs
//// to the liveout set for the function.
// This isn't right, although it's probably harmless on x86; liveouts
// should be computed from returns not tail calls. Consider a void
// function making a tail call to a function returning int.
MF.getFrameInfo().setHasTailCall();
SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
return Ret;
}
// Use the NT_CALL node when the call is marked nocf_check and the module has
// the "cf-protection-branch" flag; otherwise emit an ordinary CALL.
if (HasNoCfCheck && IsCFProtectionSupported) {
Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
} else {
Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
}
InFlag = Chain.getValue(1);
DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
+ // Save heapallocsite metadata.
+ if (CLI.CS)
+ if (MDNode *HeapAlloc = CLI.CS->getMetadata("heapallocsite"))
+ DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
+
// Create the CALLSEQ_END node.
unsigned NumBytesForCalleeToPop;
if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
DAG.getTarget().Options.GuaranteedTailCallOpt))
NumBytesForCalleeToPop = NumBytes; // Callee pops everything
else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
!Subtarget.getTargetTriple().isOSMSVCRT() &&
SR == StackStructReturn)
// If this is a call to a struct-return function, the callee
// pops the hidden struct pointer, so we have to push it back.
// This is common for Darwin/X86, Linux & Mingw32 targets.
// For MSVC Win32 targets, the caller pops the hidden struct pointer.
NumBytesForCalleeToPop = 4;
else
NumBytesForCalleeToPop = 0; // Callee pops nothing.
if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
// No need to reset the stack after the call if the call doesn't return. To
// make the MI verify, we'll pretend the callee does it for us.
NumBytesForCalleeToPop = NumBytes;
}
// Returns a flag for retval copy to use.
if (!IsSibcall) {
Chain = DAG.getCALLSEQ_END(Chain,
DAG.getIntPtrConstant(NumBytesToPop, dl, true),
DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
true),
InFlag, dl);
InFlag = Chain.getValue(1);
}
// Handle result values, copying them out of physregs into vregs that we
// return.
return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
InVals, RegMask);
}
//===----------------------------------------------------------------------===//
// Fast Calling Convention (tail call) implementation
//===----------------------------------------------------------------------===//
// Like stdcall, the callee cleans up the arguments, except that ECX is
// reserved for storing the tail called function address. Only 2 registers are
// free for argument passing (inreg). Tail call optimization is performed
// provided:
// * tailcallopt is enabled
// * caller/callee are fastcc
// On X86_64 architecture with GOT-style position independent code only local
// (within module) calls are supported at the moment.
// To keep the stack aligned according to platform abi the function
// GetAlignedArgumentStackSize ensures that argument delta is always multiples
// of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
// If a tail called function callee has more arguments than the caller the
// caller needs to make sure that there is room to move the RETADDR to. This is
// achieved by reserving an area the size of the argument delta right after the
// original RETADDR, but before the saved framepointer or the spilled registers
// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
// stack layout:
// arg1
// arg2
// RETADDR
// [ new RETADDR
// move area ]
// (possible EBP)
// ESI
// EDI
// local1 ..
/// Round \p StackSize up to the next value of the form
/// k * StackAlignment + (StackAlignment - SlotSize), where both quantities
/// come from the subtarget's frame lowering and register info.
///
/// For example, with a 16-byte stack alignment and a 4-byte slot the result
/// has the form 16n + 12. \p DAG is accepted but not used.
unsigned
X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
                                               SelectionDAG& DAG) const {
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
  const unsigned StackAlignment = TFI.getStackAlignment();
  const unsigned SlotSize = RegInfo->getSlotSize();
  const uint64_t AlignMask = StackAlignment - 1;

  // Residue every result must have modulo the stack alignment.
  const unsigned WantedResidue = StackAlignment - SlotSize;

  int64_t Offset = StackSize;
  // Residue the incoming size currently has.
  const uint64_t CurrentResidue = Offset & AlignMask;

  if (CurrentResidue <= WantedResidue) {
    // Still at or below the wanted residue: pad up to it within this
    // alignment unit.
    Offset += (WantedResidue - CurrentResidue);
  } else {
    // Already past it: drop the low bits, advance one full alignment unit and
    // then add the wanted residue.
    Offset = ((~AlignMask) & Offset) + StackAlignment + WantedResidue;
  }
  return Offset;
}
/// Decide whether an outgoing stack argument already lives in the caller's
/// incoming argument area at the same (relative) position, so that it need not
/// be stored again.
///
/// \p Arg is traced back to a frame index through value-preserving wrappers, a
/// CopyFromReg of a virtual register, a load, or (for byval) a frame index
/// node. The result is true only when that frame object is a fixed object at
/// \p Offset, is immutable unless the argument is byval, agrees with \p Flags
/// on zero/sign extension when the location type is wider than the value, and
/// has exactly the expected size in bytes.
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
                         MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
                         const X86InstrInfo *TII, const CCValAssign &VA) {
  // The expected size is taken from the value as passed in, before any
  // wrappers are peeled off below.
  unsigned Bytes = Arg.getValueSizeInBits() / 8;

  // Look through nodes that don't alter the bits of the incoming value.
  bool Stripped = true;
  while (Stripped) {
    Stripped = false;
    switch (Arg.getOpcode()) {
    case ISD::ZERO_EXTEND:
    case ISD::ANY_EXTEND:
    case ISD::BITCAST:
      Arg = Arg.getOperand(0);
      Stripped = true;
      break;
    case ISD::TRUNCATE: {
      // A truncate is only transparent when it undoes an AssertZext of the
      // same type.
      const SDValue &Src = Arg.getOperand(0);
      if (Src.getOpcode() == ISD::AssertZext &&
          cast<VTSDNode>(Src.getOperand(1))->getVT() == Arg.getValueType()) {
        Arg = Src.getOperand(0);
        Stripped = true;
      }
      break;
    }
    default:
      break;
    }
  }

  const bool IsByVal = Flags.isByVal();
  int FI = INT_MAX;

  if (Arg.getOpcode() == ISD::CopyFromReg) {
    // Only virtual registers with a known defining instruction can be traced.
    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
    if (!TargetRegisterInfo::isVirtualRegister(VR))
      return false;
    MachineInstr *Def = MRI->getVRegDef(VR);
    if (!Def)
      return false;

    if (IsByVal) {
      // A byval pointer must come from an LEA of a frame index.
      const unsigned DefOpc = Def->getOpcode();
      const bool IsLEA = DefOpc == X86::LEA32r || DefOpc == X86::LEA64r ||
                         DefOpc == X86::LEA64_32r;
      if (!IsLEA || !Def->getOperand(1).isFI())
        return false;
      FI = Def->getOperand(1).getIndex();
      Bytes = Flags.getByValSize();
    } else if (!TII->isLoadFromStackSlot(*Def, FI)) {
      return false;
    }
  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
    // ByVal argument is passed in as a pointer but it's now being
    // dereferenced. e.g.
    // define @foo(%struct.X* %A) {
    //   tail call @bar(%struct.X* byval %A)
    // }
    if (IsByVal)
      return false;
    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ld->getBasePtr());
    if (!FINode)
      return false;
    FI = FINode->getIndex();
  } else if (Arg.getOpcode() == ISD::FrameIndex && IsByVal) {
    FI = cast<FrameIndexSDNode>(Arg)->getIndex();
    Bytes = Flags.getByValSize();
  } else {
    return false;
  }

  assert(FI != INT_MAX);

  // The object must be a fixed object located exactly at the requested offset.
  if (!MFI.isFixedObjectIndex(FI) || Offset != MFI.getObjectOffset(FI))
    return false;

  // If this is not byval, check that the argument stack object is immutable.
  // inalloca and argument copy elision can create mutable argument stack
  // objects. Byval objects can be mutated, but a byval call intends to pass
  // the mutated memory.
  if (!IsByVal && !MFI.isImmutableObjectIndex(FI))
    return false;

  // If the argument location is wider than the argument type, the extension
  // flags must match those recorded for the stack object.
  const bool LocIsWider =
      VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits();
  if (LocIsWider && (Flags.isZExt() != MFI.isObjectZExt(FI) ||
                     Flags.isSExt() != MFI.isObjectSExt(FI)))
    return false;

  return Bytes == MFI.getObjectSize(FI);
}
/// Check whether the call is eligible for tail call optimization. Targets
/// that want to do tail call optimization should implement this function.
///
/// Returns true only when every check below passes. When
/// GuaranteedTailCallOpt is set, the answer is decided purely from the
/// calling conventions; otherwise the call has to qualify as a sibcall, i.e.
/// a tail call that needs no ABI change.
bool X86TargetLowering::IsEligibleForTailCallOptimization(
SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
// Bail out right away for callee conventions that may never be tail called.
if (!mayTailCallThisCC(CalleeCC))
return false;
MachineFunction &MF = DAG.getMachineFunction();
const Function &CallerF = MF.getFunction();
// If the function return type is x86_fp80 and the callee return type is not,
// then the FP_EXTEND of the call result is not a nop. It's not safe to
// perform a tailcall optimization here.
if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
return false;
CallingConv::ID CallerCC = CallerF.getCallingConv();
bool CCMatch = CallerCC == CalleeCC;
bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
// Win64 functions have extra shadow space for argument homing. Don't do the
// sibcall if the caller and callee have mismatched expectations for this
// space.
if (IsCalleeWin64 != IsCallerWin64)
return false;
// If -tailcallopt is specified, make fastcc functions tail-callable. In this
// mode the result depends only on the conventions: the callee convention
// must support guaranteed TCO and must equal the caller's.
if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
if (canGuaranteeTCO(CalleeCC) && CCMatch)
return true;
return false;
}
// Look for obvious safe cases to perform tail call optimization that do not
// require ABI changes. This is what gcc calls sibcall.
// Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
// emit a special epilogue.
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
if (RegInfo->needsStackRealignment(MF))
return false;
// Also avoid sibcall optimization if either caller or callee uses struct
// return semantics.
if (isCalleeStructRet || isCallerStructRet)
return false;
// Do not sibcall optimize vararg calls unless all arguments are passed via
// registers.
LLVMContext &C = *DAG.getContext();
if (isVarArg && !Outs.empty()) {
// Optimizing for varargs on Win64 is unlikely to be safe without
// additional testing.
if (IsCalleeWin64 || IsCallerWin64)
return false;
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
CCInfo.AnalyzeCallOperands(Outs, CC_X86);
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
if (!ArgLocs[i].isRegLoc())
return false;
}
// If the call result is in ST0 / ST1, it needs to be popped off the x87
// stack. Therefore, if it's not used by the call it is not safe to optimize
// this into a sibcall.
bool Unused = false;
for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
if (!Ins[i].Used) {
Unused = true;
break;
}
}
// Only when at least one result is unused do we need to look at where the
// results are returned.
if (Unused) {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
CCValAssign &VA = RVLocs[i];
if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
return false;
}
}
// Check that the call results are passed in the same way.
if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
RetCC_X86, RetCC_X86))
return false;
// The callee has to preserve all registers the caller needs to preserve.
// When the conventions are identical the masks are trivially compatible, so
// the subset test is only run on a mismatch.
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
if (!CCMatch) {
const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
return false;
}
// Number of bytes of stack the outgoing arguments occupy; stays 0 when the
// callee takes no arguments.
unsigned StackArgsSize = 0;
// If the callee takes no arguments, skip the argument checks and go on to
// the callee-pop check below.
if (!Outs.empty()) {
// Work out where each outgoing argument is passed. Arguments passed on the
// stack are only acceptable if they already live in the matching fixed
// stack objects of the caller (checked below).
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
// Allocate shadow area for Win64
if (IsCalleeWin64)
CCInfo.AllocateStack(32, 8);
CCInfo.AnalyzeCallOperands(Outs, CC_X86);
StackArgsSize = CCInfo.getNextStackOffset();
if (CCInfo.getNextStackOffset()) {
// Check if the arguments are already laid out in the right way as
// the caller's fixed stack objects.
MachineFrameInfo &MFI = MF.getFrameInfo();
const MachineRegisterInfo *MRI = &MF.getRegInfo();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDValue Arg = OutVals[i];
ISD::ArgFlagsTy Flags = Outs[i].Flags;
// Indirectly passed arguments are rejected outright.
if (VA.getLocInfo() == CCValAssign::Indirect)
return false;
if (!VA.isRegLoc()) {
if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
MFI, MRI, TII, VA))
return false;
}
}
}
bool PositionIndependent = isPositionIndependent();
// If the tailcall address may be in a register, then make sure it's
// possible to register allocate for it. In 32-bit, the call address can
// only target EAX, EDX, or ECX since the tail call must be scheduled after
// callee-saved registers are restored. These happen to be the same
// registers used to pass 'inreg' arguments so watch out for those.
if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
!isa<ExternalSymbolSDNode>(Callee)) ||
PositionIndependent)) {
unsigned NumInRegs = 0;
// In PIC we need an extra register to formulate the address computation
// for the callee.
unsigned MaxInRegs = PositionIndependent ? 2 : 3;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
if (!VA.isRegLoc())
continue;
unsigned Reg = VA.getLocReg();
switch (Reg) {
default: break;
case X86::EAX: case X86::EDX: case X86::ECX:
// Reaching MaxInRegs leaves no register free for the call address.
if (++NumInRegs == MaxInRegs)
return false;
break;
}
}
}
const MachineRegisterInfo &MRI = MF.getRegInfo();
if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
return false;
}
// Finally make sure caller and callee agree on who pops the stack arguments.
bool CalleeWillPop =
X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
MF.getTarget().Options.GuaranteedTailCallOpt);
if (unsigned BytesToPop =
MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
// If we have bytes to pop, the callee must pop them, and it must pop
// exactly that many bytes.
bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
if (!CalleePopMatches)
return false;
} else if (CalleeWillPop && StackArgsSize > 0) {
// If we don't have bytes to pop, make sure the callee doesn't pop any.
return false;
}
return true;
}
/// Create the fast instruction selector for this target by delegating to
/// X86::createFastISel with both arguments forwarded unchanged.
FastISel *X86TargetLowering::createFastISel(
    FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const {
  FastISel *Selector = X86::createFastISel(funcInfo, libInfo);
  return Selector;
}
//===----------------------------------------------------------------------===//
// Other Lowering Hooks
//===----------------------------------------------------------------------===//
/// Return true if \p Op has exactly one use and its node satisfies
/// ISD::isNormalLoad.
static bool MayFoldLoad(SDValue Op) {
  if (!Op.hasOneUse())
    return false;
  return ISD::isNormalLoad(Op.getNode());
}
/// Return true if \p Op has exactly one use and that single user satisfies
/// ISD::isNormalStore.
static bool MayFoldIntoStore(SDValue Op) {
  if (!Op.hasOneUse())
    return false;
  const SDNode *OnlyUser = *Op.getNode()->use_begin();
  return ISD::isNormalStore(OnlyUser);
}
/// Return true if \p Op has exactly one use and that single user is an
/// ISD::ZERO_EXTEND node.
static bool MayFoldIntoZeroExtend(SDValue Op) {
  if (!Op.hasOneUse())
    return false;
  const unsigned UserOpcode = Op.getNode()->use_begin()->getOpcode();
  return UserOpcode == ISD::ZERO_EXTEND;
}
/// Return true if \p Opcode is one of the X86-specific shuffle opcodes listed
/// below; every other opcode yields false.
static bool isTargetShuffle(unsigned Opcode) {
  switch (Opcode) {
  case X86ISD::BLENDI:     case X86ISD::PSHUFB:    case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:    case X86ISD::PSHUFLW:   case X86ISD::SHUFP:
  case X86ISD::INSERTPS:   case X86ISD::EXTRQI:    case X86ISD::INSERTQI:
  case X86ISD::PALIGNR:    case X86ISD::VSHLDQ:    case X86ISD::VSRLDQ:
  case X86ISD::MOVLHPS:    case X86ISD::MOVHLPS:   case X86ISD::MOVSHDUP:
  case X86ISD::MOVSLDUP:   case X86ISD::MOVDDUP:   case X86ISD::MOVSS:
  case X86ISD::MOVSD:      case X86ISD::UNPCKL:    case X86ISD::UNPCKH:
  case X86ISD::VBROADCAST: case X86ISD::VPERMILPI: case X86ISD::VPERMILPV:
  case X86ISD::VPERM2X128: case X86ISD::SHUF128:   case X86ISD::VPERMIL2:
  case X86ISD::VPERMI:     case X86ISD::VPPERM:    case X86ISD::VPERMV:
  case X86ISD::VPERMV3:    case X86ISD::VZEXT_MOVL:
    return true;
  default:
    return false;
  }
}
/// Return true if \p Opcode is one of the variable-mask target shuffles, or
/// one of the 'faux' target shuffles (ISD::OR, ISD::AND, X86ISD::ANDNP) that
/// are treated alongside them.
static bool isTargetShuffleVariableMask(unsigned Opcode) {
  switch (Opcode) {
  // Target Shuffles.
  case X86ISD::PSHUFB:
  case X86ISD::VPERMILPV:
  case X86ISD::VPERMIL2:
  case X86ISD::VPPERM:
  case X86ISD::VPERMV:
  case X86ISD::VPERMV3:
  // 'Faux' Target Shuffles.
  case ISD::OR:
  case ISD::AND:
  case X86ISD::ANDNP:
    return true;
  default:
    return false;
  }
}
/// Return a frame-index node for the return-address slot. The fixed stack
/// object is created the first time this is called for a function (while the
/// recorded index is still 0) and its index is cached in the function info.
SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();

  int RAIdx = X86FI->getRAIndex();
  if (RAIdx == 0) {
    // No slot recorded yet: create a fixed object of one slot at offset
    // -SlotSize and remember it.
    const unsigned SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
    RAIdx = MF.getFrameInfo().CreateFixedObject(SlotSize, -(int64_t)SlotSize,
                                                false);
    X86FI->setRAIndex(RAIdx);
  }

  return DAG.getFrameIndex(RAIdx, getPointerTy(DAG.getDataLayout()));
}
/// Return true if \p Offset may be used as a displacement under code model
/// \p M. \p hasSymbolicDisplacement says whether the displacement is combined
/// with a symbol, which is the only case the code model restricts.
bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
                                       bool hasSymbolicDisplacement) {
  // The offset has to fit into a signed 32-bit immediate field.
  if (!isInt<32>(Offset))
    return false;

  // Without a symbolic displacement there are no further restrictions.
  if (!hasSymbolicDisplacement)
    return true;

  switch (M) {
  case CodeModel::Small:
    // For the small code model we assume that the last object ends 16MB
    // before the 31-bit boundary. Fairly large negative constants are fine
    // too, since all objects are in the positive half of the address space.
    return Offset < 16 * 1024 * 1024;
  case CodeModel::Kernel:
    // For the kernel code model all objects reside in the negative half of
    // the 32-bit address space. Negative offsets are rejected because they
    // may be just off; fairly large positive ones are accepted.
    return Offset >= 0;
  default:
    // FIXME: Some tweaks might be needed for medium code model.
    return false;
  }
}
/// Decide whether a callee using \p CallingConv removes its own arguments
/// from the stack. Callee pop is necessary to support tail calls.
bool X86::isCalleePop(CallingConv::ID CallingConv,
                      bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
  // When guaranteed TCO applies to this convention, non-vararg calls are
  // forced to be callee pop so that TCO can be guaranteed.
  if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
    return true;

  // Otherwise only these conventions are callee pop, and only outside 64-bit
  // mode.
  const bool IsPoppingConvention = CallingConv == CallingConv::X86_StdCall ||
                                   CallingConv == CallingConv::X86_FastCall ||
                                   CallingConv == CallingConv::X86_ThisCall ||
                                   CallingConv == CallingConv::X86_VectorCall;
  return IsPoppingConvention && !is64Bit;
}
/// Return true for the unsigned integer condition codes (the equality codes
/// COND_E / COND_NE are counted as unsigned) and false for the signed ones.
/// Any other condition code is a programming error.
static bool isX86CCUnsigned(unsigned X86CC) {
  switch (X86CC) {
  case X86::COND_E:  case X86::COND_NE:
  case X86::COND_B:  case X86::COND_A:
  case X86::COND_BE: case X86::COND_AE:
    return true;
  case X86::COND_G:  case X86::COND_GE:
  case X86::COND_L:  case X86::COND_LE:
    return false;
  default:
    llvm_unreachable("Invalid integer condition!");
  }
}
/// Map an integer ISD::CondCode one-to-one onto the corresponding X86
/// condition code. Condition codes not listed here are a programming error.
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
  switch (SetCCOpcode) {
  // Equality.
  case ISD::SETEQ:
    return X86::COND_E;
  case ISD::SETNE:
    return X86::COND_NE;
  // Signed orderings.
  case ISD::SETLT:
    return X86::COND_L;
  case ISD::SETLE:
    return X86::COND_LE;
  case ISD::SETGT:
    return X86::COND_G;
  case ISD::SETGE:
    return X86::COND_GE;
  // Unsigned orderings.
  case ISD::SETULT:
    return X86::COND_B;
  case ISD::SETULE:
    return X86::COND_BE;
  case ISD::SETUGT:
    return X86::COND_A;
  case ISD::SETUGE:
    return X86::COND_AE;
  default:
    llvm_unreachable("Invalid integer condition!");
  }
}
/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
/// condition code, returning the condition code and the LHS/RHS of the
/// comparison to make.
///
/// \p LHS and \p RHS are in/out: for integer compares \p RHS may be replaced
/// by a zero constant, and for floating-point compares the two operands may
/// be swapped. For the floating-point codes SETOEQ and SETUNE the function
/// returns X86::COND_INVALID.
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
bool isFP, SDValue &LHS, SDValue &RHS,
SelectionDAG &DAG) {
if (!isFP) {
// Integer compares against a few special constants can be answered from
// the sign of X or rewritten as a compare against zero.
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
// X > -1 -> compare X with 0, jump on !sign.
RHS = DAG.getConstant(0, DL, RHS.getValueType());
return X86::COND_NS;
}
if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
// X < 0 -> compare X with 0 (RHS is already 0), jump on sign.
return X86::COND_S;
}
if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
// X < 1 -> X <= 0
RHS = DAG.getConstant(0, DL, RHS.getValueType());
return X86::COND_LE;
}
}
return TranslateIntegerX86CC(SetCCOpcode);
}
// First determine if it is required or is profitable to flip the operands.
// If LHS is a foldable load, but RHS is not, flip the condition.
if (ISD::isNON_EXTLoad(LHS.getNode()) &&
!ISD::isNON_EXTLoad(RHS.getNode())) {
SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
std::swap(LHS, RHS);
}
// These four conditions are handled by swapping the operands and using the
// mirrored condition code; they are the cases marked "flipped" in the
// switch below. Note that SetCCOpcode itself is left unchanged here.
switch (SetCCOpcode) {
default: break;
case ISD::SETOLT:
case ISD::SETOLE:
case ISD::SETUGT:
case ISD::SETUGE:
std::swap(LHS, RHS);
break;
}
// On a floating point condition, the flags are set as follows:
// ZF PF CF op
// 0 | 0 | 0 | X > Y
// 0 | 0 | 1 | X < Y
// 1 | 0 | 0 | X == Y
// 1 | 1 | 1 | unordered
switch (SetCCOpcode) {
default: llvm_unreachable("Condcode should be pre-legalized away");
case ISD::SETUEQ:
case ISD::SETEQ: return X86::COND_E;
case ISD::SETOLT: // flipped
case ISD::SETOGT:
case ISD::SETGT: return X86::COND_A;
case ISD::SETOLE: // flipped
case ISD::SETOGE:
case ISD::SETGE: return X86::COND_AE;
case ISD::SETUGT: // flipped
case ISD::SETULT:
case ISD::SETLT: return X86::COND_B;
case ISD::SETUGE: // flipped
case ISD::SETULE:
case ISD::SETLE: return X86::COND_BE;
case ISD::SETONE:
case ISD::SETNE: return X86::COND_NE;
case ISD::SETUO: return X86::COND_P;
case ISD::SETO: return X86::COND_NP;
// No single condition code is returned for these two.
case ISD::SETOEQ:
case ISD::SETUNE: return X86::COND_INVALID;
}
}
/// Is there a floating point cmov for the specific X86 condition code?
/// Current x86 isa includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
static bool hasFPCMov(unsigned X86CC) {
  switch (X86CC) {
  case X86::COND_B:  case X86::COND_BE:
  case X86::COND_E:  case X86::COND_P:
  case X86::COND_A:  case X86::COND_AE:
  case X86::COND_NE: case X86::COND_NP:
    return true;
  default:
    return false;
  }
}
/// Fill in \p Info with the memory behaviour of the target intrinsic
/// \p Intrinsic called by \p I. Returns false when the intrinsic has no
/// chained intrinsic data or is not one of the truncating-store, gather or
/// scatter kinds handled here; returns true once \p Info is populated.
bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &I,
                                           MachineFunction &MF,
                                           unsigned Intrinsic) const {
  const IntrinsicData *Data = getIntrinsicWithChain(Intrinsic);
  if (!Data)
    return false;

  // These two fields are reset before the kind is known, so they are written
  // even when the switch below ends up returning false.
  Info.flags = MachineMemOperand::MONone;
  Info.offset = 0;

  switch (Data->Type) {
  default:
    return false;

  case TRUNCATE_TO_MEM_VI8:
  case TRUNCATE_TO_MEM_VI16:
  case TRUNCATE_TO_MEM_VI32: {
    // Truncating store: the stored element type follows from the intrinsic
    // kind, the element count from the value operand (argument 1).
    MVT StoredEltVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
    switch (Data->Type) {
    case TRUNCATE_TO_MEM_VI8:
      StoredEltVT = MVT::i8;
      break;
    case TRUNCATE_TO_MEM_VI16:
      StoredEltVT = MVT::i16;
      break;
    case TRUNCATE_TO_MEM_VI32:
      StoredEltVT = MVT::i32;
      break;
    default:
      break;
    }
    MVT SrcVT = MVT::getVT(I.getArgOperand(1)->getType());
    Info.opc = ISD::INTRINSIC_VOID;
    Info.ptrVal = I.getArgOperand(0);
    Info.memVT = MVT::getVectorVT(StoredEltVT, SrcVT.getVectorNumElements());
    Info.align = 1;
    Info.flags |= MachineMemOperand::MOStore;
    return true;
  }

  case GATHER:
  case GATHER_AVX2: {
    // Gather: the accessed element count is the smaller of the result and
    // index (argument 2) vector lengths.
    MVT ResultVT = MVT::getVT(I.getType());
    MVT IdxVT = MVT::getVT(I.getArgOperand(2)->getType());
    unsigned NumAccessed = std::min(ResultVT.getVectorNumElements(),
                                    IdxVT.getVectorNumElements());
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.ptrVal = nullptr;
    Info.memVT = MVT::getVectorVT(ResultVT.getVectorElementType(), NumAccessed);
    Info.align = 1;
    Info.flags |= MachineMemOperand::MOLoad;
    return true;
  }

  case SCATTER: {
    // Scatter: same as gather, but the data comes from argument 3 and the
    // access is a store.
    MVT StoredVT = MVT::getVT(I.getArgOperand(3)->getType());
    MVT IdxVT = MVT::getVT(I.getArgOperand(2)->getType());
    unsigned NumAccessed = std::min(StoredVT.getVectorNumElements(),
                                    IdxVT.getVectorNumElements());
    Info.opc = ISD::INTRINSIC_VOID;
    Info.ptrVal = nullptr;
    Info.memVT = MVT::getVectorVT(StoredVT.getVectorElementType(), NumAccessed);
    Info.align = 1;
    Info.flags |= MachineMemOperand::MOStore;
    return true;
  }
  }
}
/// Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const {
for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
return true;
}
return false;
}
/// Return true if narrowing the load \p Load is considered worthwhile.
/// Narrowing is refused for GOTTPOFF TLS loads and for wide vector loads
/// whose value is only ever extracted into stores.
bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
                                              ISD::LoadExtType ExtTy,
                                              EVT NewVT) const {
  // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
  // relocation target a movq or addq instruction: don't let the load shrink.
  SDValue Addr = cast<LoadSDNode>(Load)->getBasePtr();
  if (Addr.getOpcode() == X86ISD::WrapperRIP) {
    const auto *Global = dyn_cast<GlobalAddressSDNode>(Addr.getOperand(0));
    if (Global)
      return Global->getTargetFlags() != X86II::MO_GOTTPOFF;
  }

  // Only 256-bit and 512-bit vector loads with several uses get the extra
  // scrutiny below; everything else may be narrowed.
  EVT LoadedVT = Load->getValueType(0);
  const bool IsWideVector =
      LoadedVT.is256BitVector() || LoadedVT.is512BitVector();
  if (!IsWideVector || Load->hasOneUse())
    return true;

  // If every use of the loaded value is extracted directly into a store, the
  // extract + store can be store-folded, so splitting the load is probably
  // not worth it.
  for (auto UseIt = Load->use_begin(), UseEnd = Load->use_end();
       UseIt != UseEnd; ++UseIt) {
    // Skip uses of the chain value. Result 0 of the node is the load value.
    if (UseIt.getUse().getResNo() != 0)
      continue;

    // A use that is not an extract + store makes splitting worthwhile.
    const bool IsExtract = UseIt->getOpcode() == ISD::EXTRACT_SUBVECTOR;
    if (!IsExtract || !UseIt->hasOneUse() ||
        UseIt->use_begin()->getOpcode() != ISD::STORE)
      return true;
  }

  // All non-chain uses are extract + store.
  return false;
}
/// Returns true if it is beneficial to convert a load of a constant to just
/// the constant itself. This holds for integer types of 1 to 64 bits.
bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                          Type *Ty) const {
  assert(Ty->isIntegerTy());

  const unsigned NumBits = Ty->getPrimitiveSizeInBits();
  return NumBits != 0 && NumBits <= 64;
}
/// Return false only when the select condition is a floating-point compare,
/// the target is 64-bit LP64 and AVX is available; return true otherwise.
bool X86TargetLowering::reduceSelectOfFPConstantLoads(bool IsFPSetCC) const {
  // If we are using XMM registers in the ABI and the condition of the select
  // is a floating-point compare and we have blendv or conditional move, then
  // it is cheaper to select instead of doing a cross-register move and
  // creating a load that depends on the compare result.
  const bool SelectIsCheaper =
      IsFPSetCC && Subtarget.isTarget64BitLP64() && Subtarget.hasAVX();
  return !SelectIsCheaper;
}
/// Return true unless \p VT is a vector type on an AVX512 target.
bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
  // TODO: It might be a win to ease or lift this restriction, but the generic
  // folds in DAGCombiner conflict with vector folds for an AVX512 target.
  const bool IsAVX512Vector = VT.isVector() && Subtarget.hasAVX512();
  return !IsAVX512Vector;
}
/// Return true if a multiply of type \p VT by the constant splat \p C should
/// be decomposed into shift plus add/sub. This requires that \p C is a
/// constant splat vector, that ISD::MUL is not legal for \p VT, and that the
/// splat value is one away from a (possibly negated) power of two.
bool X86TargetLowering::decomposeMulByConstant(EVT VT, SDValue C) const {
  // TODO: We handle scalars using custom code, but generic combining could
  // make that unnecessary.
  APInt SplatVal;
  if (!ISD::isConstantSplatVector(C.getNode(), SplatVal))
    return false;

  // If vector multiply is legal, assume that's faster than shl + add/sub.
  // TODO: Multiply is a complex op with higher latency and lower throughput
  // in most implementations, so this check could be loosened based on type
  // and/or a CPU attribute.
  if (isOperationLegal(ISD::MUL, VT))
    return false;

  // shl+add, shl+sub, shl+add+neg
  if ((SplatVal + 1).isPowerOf2())
    return true;
  if ((SplatVal - 1).isPowerOf2())
    return true;
  if ((1 - SplatVal).isPowerOf2())
    return true;
  return (-(SplatVal + 1)).isPowerOf2();
}
/// Return true only for an unsigned conversion from f80 on a subtarget that
/// has CMOV.
bool X86TargetLowering::shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT,
                                                 bool IsSigned) const {
  // The f80 to unsigned integer conversion is more efficient using Strict
  // code if FCMOV is available.
  if (IsSigned)
    return false;
  if (FpVT != MVT::f80)
    return false;
  return Subtarget.hasCMov();
}
/// Return true if extracting a \p ResVT subvector starting at element
/// \p Index from a \p SrcVT vector is considered cheap. EXTRACT_SUBVECTOR
/// must be legal or custom for \p ResVT in every case.
bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                                unsigned Index) const {
  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
    return false;

  // Mask vectors support all subregister combinations and operations that
  // extract half of vector.
  if (ResVT.getVectorElementType() == MVT::i1) {
    if (Index == 0)
      return true;
    // NOTE(review): this requires the result to be twice as wide as the
    // source, which looks inverted for a "half of vector" extract - confirm
    // the intent before relying on this branch.
    const bool SizesMatch =
        ResVT.getSizeInBits() == SrcVT.getSizeInBits() * 2;
    return SizesMatch && Index == ResVT.getVectorNumElements();
  }

  // Otherwise the extract must start on a multiple of the result length.
  return (Index % ResVT.getVectorNumElements()) == 0;
}
/// Return true if the vector binary operation \p VecOp should be converted to
/// its scalar form.
bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
  const unsigned Opcode = VecOp.getOpcode();

  // Assume target opcodes can't be scalarized.
  // TODO - do we have any exceptions?
  if (Opcode >= ISD::BUILTIN_OP_END)
    return false;

  // If the vector op is not supported, try to convert to scalar.
  const EVT VectorVT = VecOp.getValueType();
  if (!isOperationLegalOrCustomOrPromote(Opcode, VectorVT))
    return true;

  // If the vector op is supported, but the scalar op is not, the transform
  // may not be worthwhile.
  return isOperationLegalOrCustomOrPromote(Opcode, VectorVT.getScalarType());
}
/// Return true if an overflow op \p Opcode should be formed for \p VT: never
/// for vectors, always for simple scalar types, and for other scalar types
/// only when the operation is not expanded.
bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT) const {
  // TODO: Allow vectors?
  if (VT.isVector())
    return false;
  if (VT.isSimple())
    return true;
  return !isOperationExpand(Opcode, VT);
}
/// Return true if the subtarget has BMI.
bool X86TargetLowering::isCheapToSpeculateCttz() const {
  // Speculate cttz only if we can directly use TZCNT.
  const bool CanUseTZCNT = Subtarget.hasBMI();
  return CanUseTZCNT;
}
/// Return true if the subtarget has LZCNT.
bool X86TargetLowering::isCheapToSpeculateCtlz() const {
  // Speculate ctlz only if we can directly use LZCNT.
  const bool CanUseLZCNT = Subtarget.hasLZCNT();
  return CanUseLZCNT;
}
/// Return true if folding a bitcast from \p LoadVT to \p BitcastVT into the
/// load is beneficial. Two scalar-to-mask-vector cases are rejected, pairs of
/// legal vector types are always accepted, and everything else is left to
/// the generic TargetLowering implementation.
bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
                                                const SelectionDAG &DAG,
                                                const MachineMemOperand &MMO) const {
  // Without AVX512, reject a scalar load bitcast to a vector of i1.
  if (!Subtarget.hasAVX512() && !LoadVT.isVector()) {
    if (BitcastVT.isVector() && BitcastVT.getVectorElementType() == MVT::i1)
      return false;
  }

  // Without DQI, reject i8 -> v8i1.
  if (!Subtarget.hasDQI()) {
    if (BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
      return false;
  }

  // If both types are legal vectors, it's always ok to convert them.
  const bool BothVectors = LoadVT.isVector() && BitcastVT.isVector();
  if (BothVectors && isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
    return true;

  return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
}
/// Return true if stores may be merged into a single store of type \p MemVT.
bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
                                         const SelectionDAG &DAG) const {
  const bool NoImplicitFloat =
      DAG.getMachineFunction().getFunction().hasFnAttribute(
          Attribute::NoImplicitFloat);

  // When the function carries the noimplicitfloat attribute, do not merge
  // beyond the native integer width (64 bits in 64-bit mode, 32 otherwise).
  if (NoImplicitFloat) {
    const unsigned MaxIntBits = Subtarget.is64Bit() ? 64 : 32;
    return MemVT.getSizeInBits() <= MaxIntBits;
  }

  // Otherwise make sure we don't merge greater than our preferred vector
  // width.
  return !(MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth());
}
/// Return true if the subtarget reports a fast LZCNT.
bool X86TargetLowering::isCtlzFast() const {
  const bool HasFastLZCNT = Subtarget.hasFastLZCNT();
  return HasFastLZCNT;
}
/// Always returns true; the \p AndI instruction is not inspected.
bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
const Instruction &AndI) const {
return true;
}
/// Return true if \p Y is a non-constant scalar i32 or i64 value and the
/// subtarget has BMI.
bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
  EVT Ty = Y.getValueType();

  if (Ty.isVector() || !Subtarget.hasBMI())
    return false;

  // There are only 32-bit and 64-bit forms for 'andn'.
  const bool HasAndnForm = Ty == MVT::i32 || Ty == MVT::i64;
  if (!HasAndnForm)
    return false;

  return !isa<ConstantSDNode>(Y);
}
/// Return true if an and-not is available for the type of \p Y. Scalars defer
/// to hasAndNotCompare; vectors need SSE1 and at least 128 bits, and any type
/// other than v4i32 additionally needs SSE2.
bool X86TargetLowering::hasAndNot(SDValue Y) const {
  EVT Ty = Y.getValueType();

  if (Ty.isVector()) {
    if (!Subtarget.hasSSE1() || Ty.getSizeInBits() < 128)
      return false;
    return Ty == MVT::v4i32 || Subtarget.hasSSE2();
  }

  // Scalar.
  return hasAndNotCompare(Y);
}
bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
const SDNode *N, CombineLevel Level) const {
assert(((N->getOpcode() == ISD::SHL &&
N->getOperand(0).getOpcode() == ISD::SRL) ||
(N->getOpcode() == ISD::SRL &&
N->getOperand(0).getOpcode() == ISD::SHL)) &&
"Expected shift-shift mask");
EVT VT = N->getValueType(0);
if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
(Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
// Only fold if the shift values are equal - so it folds to AND.
// TODO - we should fold if either is a non-uniform vector but we don't do
// the fold for non-splats yet.
return N->getOperand(1) == N->getOperand(0).getOperand(1);
}
return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
}
/// Return true if a mask of the type of \p Y should be turned into a pair of
/// variable shifts: never for vectors, and never for i64 on a target that is
/// not 64-bit.
bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
  EVT Ty = Y.getValueType();

  // For vectors, we don't have a preference, but we probably want a mask.
  if (Ty.isVector())
    return false;

  // 64-bit shifts on 32-bit targets produce really bad bloated code.
  return Ty != MVT::i64 || Subtarget.is64Bit();
}
/// Return true if \p VT is a legal type.
bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
  // Any legal vector type can be splatted more efficiently than
  // loading/spilling from memory.
  const bool IsLegal = isTypeLegal(VT);
  return IsLegal;
}
/// Return the type to use for a fast equality compare of \p NumBits bits: the
/// integer type of that width when legal, v16i8 / v32i8 for 128 / 256 bits
/// when those are legal, and MVT::INVALID_SIMPLE_VALUE_TYPE otherwise.
MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
  MVT IntVT = MVT::getIntegerVT(NumBits);
  if (isTypeLegal(IntVT))
    return IntVT;

  switch (NumBits) {
  case 128:
    // PMOVMSKB can handle this.
    if (isTypeLegal(MVT::v16i8))
      return MVT::v16i8;
    break;
  case 256:
    // VPMOVMSKB can handle this.
    if (isTypeLegal(MVT::v32i8))
      return MVT::v32i8;
    break;
  default:
    break;
  }

  // TODO: Allow 64-bit type for 32-bit target.
  // TODO: 512-bit types should be allowed, but make sure that those
  // cases are handled in combineVectorSizedSetCCEquality().
  return MVT::INVALID_SIMPLE_VALUE_TYPE;
}
/// Return true if \p Val is the undef sentinel or equals \p CmpVal.
static bool isUndefOrEqual(int Val, int CmpVal) {
  if (Val == SM_SentinelUndef)
    return true;
  return Val == CmpVal;
}
/// Return true if \p Val is either the undef sentinel or the zero sentinel.
static bool isUndefOrZero(int Val) {
  if (Val == SM_SentinelUndef)
    return true;
  return Val == SM_SentinelZero;
}
/// Return true if each of the \p Size mask elements starting at position
/// \p Pos (i.e. positions Pos .. Pos+Size-1) is the undef sentinel.
static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
  for (unsigned Idx = 0; Idx != Size; ++Idx) {
    if (Mask[Pos + Idx] != SM_SentinelUndef)
      return false;
  }
  return true;
}
/// Return true if the first half of \p Mask consists only of undef elements,
/// i.e. the mask creates a vector whose lower half is undefined.
static bool isUndefLowerHalf(ArrayRef<int> Mask) {
  const unsigned MaskSize = Mask.size();
  const unsigned HalfSize = MaskSize / 2;
  return isUndefInRange(Mask, 0, HalfSize);
}
/// Return true if the second half of \p Mask consists only of undef elements,
/// i.e. the mask creates a vector whose upper half is undefined.
static bool isUndefUpperHalf(ArrayRef<int> Mask) {
  const unsigned MaskSize = Mask.size();
  const unsigned HalfSize = MaskSize / 2;
  return isUndefInRange(Mask, HalfSize, HalfSize);
}
/// Return true if \p Val falls within the half-open range [Low, Hi), i.e.
/// Low <= Val < Hi.
static bool isInRange(int Val, int Low, int Hi) {
  if (Val < Low)
    return false;
  return Val < Hi;
}
/// Return true if at least one element of \p Mask falls within the half-open
/// range [Low, Hi).
static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
  for (auto It = Mask.begin(), End = Mask.end(); It != End; ++It) {
    if (isInRange(*It, Low, Hi))
      return true;
  }
  return false;
}
/// Return true if \p Val is the undef sentinel or falls within the half-open
/// range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  if (Val == SM_SentinelUndef)
    return true;
  return isInRange(Val, Low, Hi);
}
/// Return true if every element of \p Mask is the undef sentinel or falls
/// within the half-open range [Low, Hi).
static bool isUndefOrInRange(ArrayRef<int> Mask,
                             int Low, int Hi) {
  for (auto It = Mask.begin(), End = Mask.end(); It != End; ++It) {
    if (!isUndefOrInRange(*It, Low, Hi))
      return false;
  }
  return true;
}
/// Return true if \p Val is the undef sentinel, the zero sentinel, or falls
/// within the half-open range [Low, Hi).
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
  if (isUndefOrZero(Val))
    return true;
  return isInRange(Val, Low, Hi);
}
/// Return true if every element of \p Mask is the undef sentinel, the zero
/// sentinel, or falls within the half-open range [Low, Hi).
static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
  for (auto It = Mask.begin(), End = Mask.end(); It != End; ++It) {
    if (!isUndefOrZeroOrInRange(*It, Low, Hi))
      return false;
  }
  return true;
}
/// Return true if each of the \p Size mask elements starting at position
/// \p Pos is either undef or equal to the matching term of the sequence
/// Low, Low + Step, ..., Low + (Size - 1) * Step.
static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
                                       unsigned Size, int Low, int Step = 1) {
  int Expected = Low;
  for (unsigned Idx = 0; Idx != Size; ++Idx, Expected += Step) {
    if (!isUndefOrEqual(Mask[Pos + Idx], Expected))
      return false;
  }
  return true;
}
/// Return true if each of the \p Size mask elements starting at position
/// \p Pos is undef, zero, or equal to the matching term of the sequence
/// Low, Low + 1, ..., Low + Size - 1.
static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
                                             unsigned Size, int Low) {
  int Expected = Low;
  for (unsigned Idx = 0; Idx != Size; ++Idx, ++Expected) {
    const int Elt = Mask[Pos + Idx];
    if (!isUndefOrZero(Elt) && Elt != Expected)
      return false;
  }
  return true;
}
/// Return true if each of the \p Size mask elements starting at position
/// \p Pos is either the undef sentinel or the zero sentinel.
static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
                                 unsigned Size) {
  for (unsigned Idx = 0; Idx != Size; ++Idx) {
    if (!isUndefOrZero(Mask[Pos + Idx]))
      return false;
  }
  return true;
}
/// Helper function to test whether a shuffle mask could be simplified by
/// widening the elements being shuffled, i.e. by treating each adjacent pair
/// of mask elements as one element of twice the width.
///
/// On success \p WidenedMask holds one entry per pair of \p Mask. On failure
/// it is left in an unspecified state.
///
/// NOTE: This must handle normal vector shuffle masks and *target* vector
/// shuffle masks. The latter have the special property of a '-2' representing
/// a zero-ed lane of a vector.
static bool canWidenShuffleElements(ArrayRef<int> Mask,
                                    SmallVectorImpl<int> &WidenedMask) {
  WidenedMask.assign(Mask.size() / 2, 0);

  for (int Pair = 0, NumElts = Mask.size(); Pair < NumElts; Pair += 2) {
    const int Lo = Mask[Pair];
    const int Hi = Mask[Pair + 1];
    const bool LoUndef = Lo == SM_SentinelUndef;
    const bool HiUndef = Hi == SM_SentinelUndef;

    // A fully undef pair widens to undef.
    if (LoUndef && HiUndef) {
      WidenedMask[Pair / 2] = SM_SentinelUndef;
      continue;
    }

    // One half undef, the other a real index sitting in the matching half of
    // an aligned pair: take the wide index from the defined half.
    if (LoUndef && Hi >= 0 && (Hi % 2) == 1) {
      WidenedMask[Pair / 2] = Hi / 2;
      continue;
    }
    if (HiUndef && Lo >= 0 && (Lo % 2) == 0) {
      WidenedMask[Pair / 2] = Lo / 2;
      continue;
    }

    // When zeroing, the zeroing has to cover both halves (undef counts as
    // compatible) for the pair to widen.
    const bool LoZero = Lo == SM_SentinelZero;
    const bool HiZero = Hi == SM_SentinelZero;
    if (LoZero || HiZero) {
      if ((LoZero || LoUndef) && (HiZero || HiUndef)) {
        WidenedMask[Pair / 2] = SM_SentinelZero;
        continue;
      }
      return false;
    }

    // Finally accept two adjacent indices that form an aligned pair.
    if (!LoUndef && (Lo % 2) == 0 && (Lo + 1) == Hi) {
      WidenedMask[Pair / 2] = Lo / 2;
      continue;
    }

    // Otherwise we can't safely widen the elements used in this shuffle.
    return false;
  }

  assert(WidenedMask.size() == Mask.size() / 2 &&
         "Incorrect size of mask after widening the elements!");
  return true;
}
/// Variant that first rewrites every non-undef element flagged in
/// \p Zeroable to the zero sentinel and then tries to widen the resulting
/// target mask into \p WidenedMask.
static bool canWidenShuffleElements(ArrayRef<int> Mask,
                                    const APInt &Zeroable,
                                    SmallVectorImpl<int> &WidenedMask) {
  SmallVector<int, 32> ZeroedMask(Mask.begin(), Mask.end());
  for (int Idx = 0, NumElts = ZeroedMask.size(); Idx < NumElts; ++Idx) {
    // Undef elements stay undef even when they are also zeroable.
    if (ZeroedMask[Idx] != SM_SentinelUndef && Zeroable[Idx])
      ZeroedMask[Idx] = SM_SentinelZero;
  }
  return canWidenShuffleElements(ZeroedMask, WidenedMask);
}
/// Variant for callers that only need the yes/no answer; the widened mask is
/// built in a local scratch vector and discarded.
static bool canWidenShuffleElements(ArrayRef<int> Mask) {
  SmallVector<int, 32> Scratch;
  const bool CanWiden = canWidenShuffleElements(Mask, Scratch);
  return CanWiden;
}
/// Returns true if \p Elt is a constant zero or a floating point constant
/// +0.0.
bool X86::isZeroNode(SDValue Elt) {
  if (isNullConstant(Elt))
    return true;
  return isNullFPConstant(Elt);
}
// Build a vector of type VT whose elements are the constants in Values.
// When IsMask is set, negative entries become UNDEF elements.
// If i64 is not a legal type and VT has i64 elements, each element is emitted
// as two i32 elements (the value followed by 0, or two UNDEFs) and the result
// is bitcast back to VT.
static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
                              const SDLoc &dl, bool IsMask = false) {
  const unsigned NumElts = VT.getVectorNumElements();
  const bool HasLegalI64 = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  const bool SplitElts = !HasLegalI64 && VT.getVectorElementType() == MVT::i64;

  const MVT BuildVT =
      SplitElts ? MVT::getVectorVT(MVT::i32, NumElts * 2) : VT;
  const MVT EltVT = BuildVT.getVectorElementType();

  SmallVector<SDValue, 32> Elts;
  for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
    const bool EmitUndef = Values[Idx] < 0 && IsMask;
    if (EmitUndef) {
      Elts.push_back(DAG.getUNDEF(EltVT));
      if (SplitElts)
        Elts.push_back(DAG.getUNDEF(EltVT));
    } else {
      Elts.push_back(DAG.getConstant(Values[Idx], dl, EltVT));
      // The second i32 of a split element is always 0.
      if (SplitElts)
        Elts.push_back(DAG.getConstant(0, dl, EltVT));
    }
  }

  SDValue Result = DAG.getBuildVector(BuildVT, dl, Elts);
  if (!SplitElts)
    return Result;
  return DAG.getBitcast(VT, Result);
}
/// Build a constant vector of type \p VT from per-element bit patterns.
/// Element i is UNDEF when bit i of \p Undefs is set, otherwise it is the
/// constant with bit pattern Bits[i] (a floating-point constant when the
/// element type is f32 or f64). If i64 is not a legal type and \p VT has i64
/// elements, each element is emitted as its low and high 32-bit halves. The
/// result is always bitcast to \p VT.
static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
                              MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
  assert(Bits.size() == Undefs.getBitWidth() &&
         "Unequal constant and undef arrays");

  const unsigned NumElts = VT.getVectorNumElements();
  const bool HasLegalI64 = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  const bool SplitElts = !HasLegalI64 && VT.getVectorElementType() == MVT::i64;

  const MVT BuildVT =
      SplitElts ? MVT::getVectorVT(MVT::i32, NumElts * 2) : VT;
  const MVT EltVT = BuildVT.getVectorElementType();

  SmallVector<SDValue, 32> Elts;
  for (unsigned Idx = 0, NumBits = Bits.size(); Idx != NumBits; ++Idx) {
    if (Undefs[Idx]) {
      Elts.append(SplitElts ? 2 : 1, DAG.getUNDEF(EltVT));
      continue;
    }

    const APInt &Val = Bits[Idx];
    assert(Val.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");

    if (SplitElts) {
      // Low half first, then high half.
      Elts.push_back(DAG.getConstant(Val.trunc(32), dl, EltVT));
      Elts.push_back(DAG.getConstant(Val.lshr(32).trunc(32), dl, EltVT));
      continue;
    }
    if (EltVT == MVT::f32) {
      APFloat Single(APFloat::IEEEsingle(), Val);
      Elts.push_back(DAG.getConstantFP(Single, dl, EltVT));
      continue;
    }
    if (EltVT == MVT::f64) {
      APFloat Double(APFloat::IEEEdouble(), Val);
      Elts.push_back(DAG.getConstantFP(Double, dl, EltVT));
      continue;
    }
    Elts.push_back(DAG.getConstant(Val, dl, EltVT));
  }

  SDValue Result = DAG.getBuildVector(BuildVT, dl, Elts);
  return DAG.getBitcast(VT, Result);
}
/// Build an all-zeros vector of type \p VT.
///
/// SSE/AVX zero vectors are created as <N x i32> and bitcast to \p VT so that
/// identical zeros are CSE'd. Without SSE2 a 128-bit zero is built as a
/// floating-point +0.0 v4f32 instead, and i1 (mask) vectors are built directly
/// in their own type.
static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG, const SDLoc &dl) {
  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
          VT.getVectorElementType() == MVT::i1) &&
         "Unexpected vector type");

  // No integer vector type available: fall back to a floating-point zero.
  if (!Subtarget.hasSSE2() && VT.is128BitVector())
    return DAG.getBitcast(VT, DAG.getConstantFP(+0.0, dl, MVT::v4f32));

  // Mask vectors are materialized in their native type.
  if (VT.getVectorElementType() == MVT::i1) {
    assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
           "Unexpected vector type");
    return DAG.getBitcast(VT, DAG.getConstant(0, dl, VT));
  }

  // Common case: canonical <N x i32> zero.
  const unsigned NumI32Elts = VT.getSizeInBits() / 32;
  const MVT IntVecVT = MVT::getVectorVT(MVT::i32, NumI32Elts);
  return DAG.getBitcast(VT, DAG.getConstant(0, dl, IntVecVT));
}
/// Extract the \p vectorWidth-bit chunk of \p Vec that contains element
/// \p IdxVal. The index is rounded down to the start of its chunk. A
/// BUILD_VECTOR input is narrowed directly; anything else yields an
/// EXTRACT_SUBVECTOR node.
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
                                const SDLoc &dl, unsigned vectorWidth) {
  const EVT SrcVT = Vec.getValueType();
  const EVT ScalarVT = SrcVT.getVectorElementType();

  // Result type: same element type, 1/NumChunks of the elements.
  const unsigned NumChunks = SrcVT.getSizeInBits() / vectorWidth;
  const EVT ChunkVT = EVT::getVectorVT(*DAG.getContext(), ScalarVT,
                                       SrcVT.getVectorNumElements() / NumChunks);

  const unsigned EltsPerChunk = vectorWidth / ScalarVT.getSizeInBits();
  assert(isPowerOf2_32(EltsPerChunk) && "Elements per chunk not power of 2");

  // EltsPerChunk is a power of two, so masking off the low bits yields the
  // index of the first element of the enclosing chunk.
  const unsigned ChunkStart = IdxVal & ~(EltsPerChunk - 1);

  // A build vector can simply be re-emitted with fewer operands.
  if (Vec.getOpcode() == ISD::BUILD_VECTOR)
    return DAG.getBuildVector(ChunkVT, dl,
                              Vec->ops().slice(ChunkStart, EltsPerChunk));

  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ChunkVT, Vec,
                     DAG.getIntPtrConstant(ChunkStart, dl));
}
/// Grab the 128-bit chunk containing element \p IdxVal from a 256-bit or
/// 512-bit vector. This is shaped to match AVX VEXTRACTF128 / VEXTRACTI128,
/// AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4, or a plain subregister reference.
/// \p IdxVal does not have to be aligned to a 128-bit boundary, which makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, const SDLoc &dl) {
  const EVT SrcVT = Vec.getValueType();
  (void)SrcVT; // Only inspected by the assertion below.
  assert((SrcVT.is256BitVector() ||
          SrcVT.is512BitVector()) && "Unexpected vector size!");
  const unsigned ChunkBits = 128;
  return extractSubVector(Vec, IdxVal, DAG, dl, ChunkBits);
}
/// Grab the 256-bit chunk containing element \p IdxVal from a 512-bit vector.
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, const SDLoc &dl) {
  const EVT SrcVT = Vec.getValueType();
  (void)SrcVT; // Only inspected by the assertion below.
  assert(SrcVT.is512BitVector() && "Unexpected vector size!");
  const unsigned ChunkBits = 256;
  return extractSubVector(Vec, IdxVal, DAG, dl, ChunkBits);
}
/// Insert the \p vectorWidth-bit vector \p Vec into \p Result at the chunk
/// that contains element \p IdxVal (the index is rounded down to the chunk
/// start). Inserting UNDEF leaves \p Result untouched.
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                               SelectionDAG &DAG, const SDLoc &dl,
                               unsigned vectorWidth) {
  assert((vectorWidth == 128 || vectorWidth == 256) &&
         "Unsupported vector width");

  // Nothing to do when the inserted value is undefined.
  if (Vec.isUndef())
    return Result;

  const unsigned EltBits =
      Vec.getValueType().getVectorElementType().getSizeInBits();
  const unsigned EltsPerChunk = vectorWidth / EltBits;
  assert(isPowerOf2_32(EltsPerChunk) && "Elements per chunk not power of 2");

  // EltsPerChunk is a power of two, so clearing the low bits snaps the index
  // to the first element of the destination chunk.
  const unsigned ChunkStart = IdxVal & ~(EltsPerChunk - 1);

  SDValue InsertIdx = DAG.getIntPtrConstant(ChunkStart, dl);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, Result.getValueType(), Result,
                     Vec, InsertIdx);
}
/// Put a 128-bit vector \p Vec into the wider vector \p Result. This is shaped
/// to match AVX VINSERTF128 / VINSERTI128, AVX-512 VINSERTF32x4 /
/// VINSERTI32x4, or a plain superregister reference. \p IdxVal is an element
/// index inside the destination 128-bit chunk and does not have to be aligned
/// to a 128-bit boundary, which makes lowering INSERT_VECTOR_ELT operations
/// easier.
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                                  SelectionDAG &DAG, const SDLoc &dl) {
  const EVT SubVT = Vec.getValueType();
  (void)SubVT; // Only inspected by the assertion below.
  assert(SubVT.is128BitVector() && "Unexpected vector size!");
  const unsigned ChunkBits = 128;
  return insertSubVector(Result, Vec, IdxVal, DAG, dl, ChunkBits);
}
/// Widen \p Vec to the larger type \p VT (same scalar type) by inserting it
/// at element 0 of either a zero vector or an undef vector, as selected by
/// \p ZeroNewElements.
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
                              const X86Subtarget &Subtarget, SelectionDAG &DAG,
                              const SDLoc &dl) {
  assert(Vec.getValueSizeInBits() < VT.getSizeInBits() &&
         Vec.getValueType().getScalarType() == VT.getScalarType() &&
         "Unsupported vector widening type");

  // Pick the background the narrow vector is inserted into.
  SDValue Background;
  if (ZeroNewElements)
    Background = getZeroVector(VT, Subtarget, DAG, dl);
  else
    Background = DAG.getUNDEF(VT);

  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Background, Vec, ZeroIdx);
}
/// Widen \p Vec to a vector of \p WideSizeInBits bits with the same scalar
/// type; the new elements are zero or undef depending on \p ZeroNewElements.
/// The wide type is derived from the scalar type of \p Vec and the work is
/// forwarded to the MVT-taking overload.
static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
                              const X86Subtarget &Subtarget, SelectionDAG &DAG,
                              const SDLoc &dl, unsigned WideSizeInBits) {
  assert(Vec.getValueSizeInBits() < WideSizeInBits &&
         (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
         "Unsupported vector widening type");

  const unsigned ScalarBits = Vec.getScalarValueSizeInBits();
  const MVT ScalarVT = Vec.getSimpleValueType().getScalarType();
  const MVT WideVT = MVT::getVectorVT(ScalarVT, WideSizeInBits / ScalarBits);
  return widenSubVector(WideVT, Vec, ZeroNewElements, Subtarget, DAG, dl);
}
// Helper function to collect subvector ops that are concated together,
// either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
// The subvectors in Ops are guaranteed to be the same type.
//
// Ops must be empty on entry. Returns true and fills Ops when N matches one
// of the recognised patterns below; returns false (leaving Ops empty)
// otherwise.
static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
assert(Ops.empty() && "Expected an empty ops vector");
// A plain concat: every operand is one of the subvectors.
if (N->getOpcode() == ISD::CONCAT_VECTORS) {
Ops.append(N->op_begin(), N->op_end());
return true;
}
// insert_subvector(insert_subvector(_, Lo, 0), Hi, NumElts/2) with a constant
// index is treated as concat(Lo, Hi).
if (N->getOpcode() == ISD::INSERT_SUBVECTOR &&
isa<ConstantSDNode>(N->getOperand(2))) {
SDValue Src = N->getOperand(0);
SDValue Sub = N->getOperand(1);
const APInt &Idx = N->getConstantOperandAPInt(2);
EVT VT = Src.getValueType();
EVT SubVT = Sub.getValueType();
// TODO - Handle more general insert_subvector chains.
// Requirements: the inserted subvector is exactly half the width and lands
// in the upper half, and the base is itself an insertion of a subvector of
// the same type at index 0 (so both collected ops share one type).
if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
Idx == (VT.getVectorNumElements() / 2) &&
Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ Src.getOperand(1).getValueType() == SubVT &&
isNullConstant(Src.getOperand(2))) {
Ops.push_back(Src.getOperand(1));
Ops.push_back(Sub);
return true;
}
}
return false;
}
// Split the operands of an operation into pieces of the widest register size
// the target should use, run Builder on every piece, and concatenate the
// results.
// Intended for operations available on SSE2 in 128-bit, on AVX2 in 256-bit
// and on AVX512BW in 512-bit form. VT only decides if/how Ops are split; the
// elements of Ops do *not* have to be of type VT. With CheckBWI set the
// 512-bit width is gated on useBWIRegs(), otherwise on useAVX512Regs().
// Builder is invoked as:
// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
template <typename F>
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
                         const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
                         F Builder, bool CheckBWI = true) {
  assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");

  // Widest vector width (in bits) a single Builder call may operate on.
  const bool Use512 =
      CheckBWI ? Subtarget.useBWIRegs() : Subtarget.useAVX512Regs();
  unsigned MaxWidth = 128;
  if (Use512)
    MaxWidth = 512;
  else if (Subtarget.hasAVX2())
    MaxWidth = 256;

  unsigned NumParts = 1;
  if (VT.getSizeInBits() > MaxWidth) {
    NumParts = VT.getSizeInBits() / MaxWidth;
    assert((VT.getSizeInBits() % MaxWidth) == 0 && "Illegal vector size");
  }

  // Already fits: apply the builder to the operands as they are.
  if (NumParts == 1)
    return Builder(DAG, DL, Ops);

  SmallVector<SDValue, 4> Parts;
  for (unsigned Part = 0; Part != NumParts; ++Part) {
    // Slice the Part-th piece out of every operand.
    SmallVector<SDValue, 2> PartOps;
    for (SDValue Operand : Ops) {
      const EVT OperandVT = Operand.getValueType();
      const unsigned EltsPerPart = OperandVT.getVectorNumElements() / NumParts;
      const unsigned BitsPerPart = OperandVT.getSizeInBits() / NumParts;
      PartOps.push_back(
          extractSubVector(Operand, Part * EltsPerPart, DAG, DL, BitsPerPart));
    }
    Parts.push_back(Builder(DAG, DL, PartOps));
  }
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Parts);
}
/// Insert i1-subvector to i1-vector.
///
/// \p Op is an INSERT_SUBVECTOR-shaped node: operand 0 is the destination
/// vector, operand 1 the subvector and operand 2 the insertion index.
/// Returns an empty SDValue when the index is not a constant. Otherwise the
/// insertion is expressed with KSHIFTL/KSHIFTR, OR/XOR and legal
/// insert/extract_subvector nodes, widening to a mask type with a natively
/// supported kshift when needed.
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
SDValue Vec = Op.getOperand(0);
SDValue SubVec = Op.getOperand(1);
SDValue Idx = Op.getOperand(2);
// Only constant insertion indices are handled.
if (!isa<ConstantSDNode>(Idx))
return SDValue();
// Inserting undef is a nop. We can just return the original vector.
if (SubVec.isUndef())
return Vec;
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
return Op;
MVT OpVT = Op.getSimpleValueType();
unsigned NumElems = OpVT.getVectorNumElements();
SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
// Extend to natively supported kshift.
// (v8i1 with DQI, v16i1 otherwise; wider types are used as they are.)
MVT WideOpVT = OpVT;
if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
// Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
// if necessary.
if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
// May need to promote to a legal type.
Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
getZeroVector(WideOpVT, Subtarget, DAG, dl),
SubVec, Idx);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
}
MVT SubVecVT = SubVec.getSimpleValueType();
unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
assert(IdxVal + SubVecNumElems <= NumElems &&
IdxVal % SubVecVT.getSizeInBits() == 0 &&
"Unexpected index value in INSERT_SUBVECTOR");
SDValue Undef = DAG.getUNDEF(WideOpVT);
if (IdxVal == 0) {
// Zero lower bits of the Vec
// (shift right then left by the subvector length clears the low bits).
SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
ZeroIdx);
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
// Merge them together, SubVec should be zero extended.
SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
getZeroVector(WideOpVT, Subtarget, DAG, dl),
SubVec, ZeroIdx);
Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
}
// From here on IdxVal != 0. Widen the subvector (upper bits undefined).
SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
Undef, SubVec, ZeroIdx);
// Undef destination: just shift the subvector into position.
if (Vec.isUndef()) {
assert(IdxVal != 0 && "Unexpected index");
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
DAG.getConstant(IdxVal, dl, MVT::i8));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
}
// Zero destination: shift the subvector to the top to drop its undefined
// upper bits, then shift it back down to its final position.
if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
assert(IdxVal != 0 && "Unexpected index");
NumElems = WideOpVT.getVectorNumElements();
unsigned ShiftLeft = NumElems - SubVecNumElems;
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
DAG.getConstant(ShiftLeft, dl, MVT::i8));
if (ShiftRight != 0)
SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
DAG.getConstant(ShiftRight, dl, MVT::i8));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
}
// Simple case when we put subvector in the upper part
// (NumElems still refers to the original, un-widened element count here).
if (IdxVal + SubVecNumElems == NumElems) {
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
DAG.getConstant(IdxVal, dl, MVT::i8));
if (SubVecNumElems * 2 == NumElems) {
// Special case, use legal zero extending insert_subvector. This allows
// isel to optimize when bits are known zero.
Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
getZeroVector(WideOpVT, Subtarget, DAG, dl),
Vec, ZeroIdx);
} else {
// Otherwise use explicit shifts to zero the bits.
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
Undef, Vec, ZeroIdx);
NumElems = WideOpVT.getVectorNumElements();
SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8);
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
}
Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
}
// Inserting into the middle is more complicated.
NumElems = WideOpVT.getVectorNumElements();
// Widen the vector if needed.
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
// Move the current value of the bits to be replaced to the lsbs.
Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
DAG.getConstant(IdxVal, dl, MVT::i8));
// Xor with the new bit.
Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec);
// Shift to MSB, filling bottom bits with 0.
unsigned ShiftLeft = NumElems - SubVecNumElems;
Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op,
DAG.getConstant(ShiftLeft, dl, MVT::i8));
// Shift to the final position, filling upper bits with 0.
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op,
DAG.getConstant(ShiftRight, dl, MVT::i8));
// Xor with original vector leaving the new value.
Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op);
// Reduce to original width if needed.
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
}
/// Build a vector of type \p VT holding \p V1 in its lower half and \p V2 in
/// its upper half, using two subvector insertions of \p VectorWidth bits into
/// an initially undef vector. \p NumElems is the element count of \p VT.
static SDValue concatSubVectors(SDValue V1, SDValue V2, EVT VT,
                                unsigned NumElems, SelectionDAG &DAG,
                                const SDLoc &dl, unsigned VectorWidth) {
  // Lower half goes in at element 0 ...
  SDValue WithLower =
      insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, VectorWidth);
  // ... and the upper half at the midpoint.
  const unsigned UpperStart = NumElems / 2;
  return insertSubVector(WithLower, V2, UpperStart, DAG, dl, VectorWidth);
}
/// Build a vector of type \p VT with every bit set.
/// The value is always created as <4 x i32>, <8 x i32> or <16 x i32> and then
/// bitcast to \p VT, so that identical all-ones vectors get CSE'd.
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
         "Expected a 128/256/512-bit vector type");
  const unsigned NumI32Elts = VT.getSizeInBits() / 32;
  const MVT IntVecVT = MVT::getVectorVT(MVT::i32, NumI32Elts);
  APInt AllOnes32 = APInt::getAllOnesValue(32);
  SDValue OnesVec = DAG.getConstant(AllOnes32, dl, IntVecVT);
  return DAG.getBitcast(VT, OnesVec);
}
// Map an *_EXTEND opcode (or its *_EXTEND_VECTOR_INREG form) to the
// corresponding *_EXTEND_VECTOR_INREG opcode. Any other opcode is a
// programming error.
static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {
  if (Opcode == ISD::ANY_EXTEND || Opcode == ISD::ANY_EXTEND_VECTOR_INREG)
    return ISD::ANY_EXTEND_VECTOR_INREG;
  if (Opcode == ISD::ZERO_EXTEND || Opcode == ISD::ZERO_EXTEND_VECTOR_INREG)
    return ISD::ZERO_EXTEND_VECTOR_INREG;
  if (Opcode == ISD::SIGN_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG)
    return ISD::SIGN_EXTEND_VECTOR_INREG;
  llvm_unreachable("Unknown opcode");
}
/// Emit an any/sign/zero extension of the vector \p In to type \p VT.
/// Inputs wider than 128 bits are first narrowed to the low part that is
/// actually consumed (never below 128 bits). If the element counts of input
/// and result then differ, the *_EXTEND_VECTOR_INREG form of \p Opcode is
/// used.
static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT,
                              SDValue In, SelectionDAG &DAG) {
  EVT SrcVT = In.getValueType();
  assert(VT.isVector() && SrcVT.isVector() && "Expected vector VTs.");
  assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
          ISD::ZERO_EXTEND == Opcode) &&
         "Unknown extension opcode");

  // For 256-bit vectors, we only need the lower (128-bit) input half.
  // For 512-bit vectors, we only need the lower input half or quarter.
  if (SrcVT.getSizeInBits() > 128) {
    assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
           "Expected VTs to be the same size!");
    const unsigned WidenFactor =
        VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
    const unsigned NeededBits =
        std::max(128U, VT.getSizeInBits() / WidenFactor);
    In = extractSubVector(In, 0, DAG, DL, NeededBits);
    SrcVT = In.getValueType();
  }

  const bool SameEltCount =
      VT.getVectorNumElements() == SrcVT.getVectorNumElements();
  const unsigned ExtOpcode =
      SameEltCount ? Opcode : getOpcode_EXTEND_VECTOR_INREG(Opcode);
  return DAG.getNode(ExtOpcode, DL, VT, In);
}
/// Build the vector_shuffle node equivalent to a binary unpack-low of
/// \p V1 and \p V2 for type \p VT.
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
                          SDValue V1, SDValue V2) {
  const bool Lo = true;
  const bool Unary = false;
  SmallVector<int, 8> UnpackMask;
  createUnpackShuffleMask(VT, UnpackMask, Lo, Unary);
  return DAG.getVectorShuffle(VT, dl, V1, V2, UnpackMask);
}
/// Build the vector_shuffle node equivalent to a binary unpack-high of
/// \p V1 and \p V2 for type \p VT.
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
                          SDValue V1, SDValue V2) {
  const bool Lo = false;
  const bool Unary = false;
  SmallVector<int, 8> UnpackMask;
  createUnpackShuffleMask(VT, UnpackMask, Lo, Unary);
  return DAG.getVectorShuffle(VT, dl, V1, V2, UnpackMask);
}
/// Shuffle the low element of \p V2 into position \p Idx of a zero vector
/// (when \p IsZero) or an undef vector (otherwise) of the same type.
/// All other lanes select the matching lane of the zero/undef vector, giving
/// masks such as 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3) for four elements.
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
                                           bool IsZero,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {
  const MVT VT = V2.getSimpleValueType();

  SDValue Background;
  if (IsZero)
    Background = getZeroVector(VT, Subtarget, DAG, SDLoc(V2));
  else
    Background = DAG.getUNDEF(VT);

  // Start from the identity mask over the background vector ...
  const int EltCount = VT.getVectorNumElements();
  SmallVector<int, 16> ShufMask(EltCount);
  for (int Lane = 0; Lane != EltCount; ++Lane)
    ShufMask[Lane] = Lane;

  // ... then redirect the insertion lane to the low element of V2.
  if (Idx >= 0 && Idx < EltCount)
    ShufMask[Idx] = EltCount;

  return DAG.getVectorShuffle(VT, SDLoc(V2), Background, V2, ShufMask);
}
/// If \p Load is a normal load whose address is a constant-pool entry
/// (optionally wrapped in X86ISD::Wrapper / X86ISD::WrapperRIP) at offset 0,
/// return the IR constant stored there. Returns nullptr for a null or
/// non-normal load, for machine constant-pool entries, and for non-zero
/// offsets.
static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
  if (!Load || !ISD::isNormalLoad(Load))
    return nullptr;

  // Strip the X86 address wrapper, if any.
  SDValue Addr = Load->getBasePtr();
  switch (Addr->getOpcode()) {
  case X86ISD::Wrapper:
  case X86ISD::WrapperRIP:
    Addr = Addr->getOperand(0);
    break;
  default:
    break;
  }

  if (auto *PoolEntry = dyn_cast<ConstantPoolSDNode>(Addr))
    if (!PoolEntry->isMachineConstantPoolEntry() &&
        PoolEntry->getOffset() == 0)
      return PoolEntry->getConstVal();

  return nullptr;
}
/// SDValue convenience overload: look through bitcasts on \p Op and, if the
/// result is a load, return the constant-pool constant it reads (nullptr
/// otherwise).
static const Constant *getTargetConstantFromNode(SDValue Op) {
  SDValue Stripped = peekThroughBitcasts(Op);
  LoadSDNode *MaybeLoad = dyn_cast<LoadSDNode>(Stripped);
  return getTargetConstantFromNode(MaybeLoad);
}
/// Return the constant-pool constant read by \p LD, or nullptr if the load
/// does not read one. \p LD must not be null.
const Constant *
X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
  assert(LD && "Unexpected null LoadSDNode");
  const Constant *PoolConstant = getTargetConstantFromNode(LD);
  return PoolConstant;
}
// Extract raw constant bits from constant pools.
//
// Looks through bitcasts on Op and tries to express its value as
// (size of Op in bits / EltSizeInBits) elements of EltSizeInBits bits each.
// Besides constant pool loads this also handles UNDEF, scalar constants,
// constant build vectors, broadcasts, VZEXT_MOVL of a scalar constant,
// insert/extract_subvector and vector shuffles of such sources.
//
// On success returns true; EltBits holds one APInt per element and UndefElts
// has one bit per element, set for elements that are entirely undef.
// EltBits must be empty on entry.
// AllowWholeUndefs - if false, fail when an element is entirely undef.
// AllowPartialUndefs - if false, fail when only some bits of an element are
// undef; if true those bits are treated as zero.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
APInt &UndefElts,
SmallVectorImpl<APInt> &EltBits,
bool AllowWholeUndefs = true,
bool AllowPartialUndefs = true) {
assert(EltBits.empty() && "Expected an empty EltBits vector");
Op = peekThroughBitcasts(Op);
EVT VT = Op.getValueType();
unsigned SizeInBits = VT.getSizeInBits();
assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
unsigned NumElts = SizeInBits / EltSizeInBits;
// Bitcast a source array of element bits to the target size.
// Writes the result into the UndefElts/EltBits output parameters and returns
// false if the undef restrictions cannot be honoured.
auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
unsigned NumSrcElts = UndefSrcElts.getBitWidth();
unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
"Constant bit sizes don't match");
// Don't split if we don't allow undef bits.
bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
if (UndefSrcElts.getBoolValue() && !AllowUndefs)
return false;
// If we're already the right size, don't bother bitcasting.
if (NumSrcElts == NumElts) {
UndefElts = UndefSrcElts;
EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
return true;
}
// Extract all the undef/constant element data and pack into single bitsets.
APInt UndefBits(SizeInBits, 0);
APInt MaskBits(SizeInBits, 0);
for (unsigned i = 0; i != NumSrcElts; ++i) {
unsigned BitOffset = i * SrcEltSizeInBits;
if (UndefSrcElts[i])
UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
MaskBits.insertBits(SrcEltBits[i], BitOffset);
}
// Split the undef/constant single bitset data into the target elements.
UndefElts = APInt(NumElts, 0);
EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
for (unsigned i = 0; i != NumElts; ++i) {
unsigned BitOffset = i * EltSizeInBits;
APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
// Only treat an element as UNDEF if all bits are UNDEF.
if (UndefEltBits.isAllOnesValue()) {
if (!AllowWholeUndefs)
return false;
UndefElts.setBit(i);
continue;
}
// If only some bits are UNDEF then treat them as zero (or bail if not
// supported).
if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
return false;
EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
}
return true;
};
// Collect constant bits and insert into mask/undef bit masks.
// Handles UndefValue (sets bit UndefBitIndex of Undefs), ConstantInt and
// ConstantFP (stores the raw bits in Mask); anything else, including a null
// Cst, is rejected.
auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
unsigned UndefBitIndex) {
if (!Cst)
return false;
if (isa<UndefValue>(Cst)) {
Undefs.setBit(UndefBitIndex);
return true;
}
if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
Mask = CInt->getValue();
return true;
}
if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
Mask = CFP->getValueAPF().bitcastToAPInt();
return true;
}
return false;
};
// Handle UNDEFs.
if (Op.isUndef()) {
APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
return CastBitData(UndefSrcElts, SrcEltBits);
}
// Extract scalar constant bits.
if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
APInt UndefSrcElts = APInt::getNullValue(1);
SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
return CastBitData(UndefSrcElts, SrcEltBits);
}
if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
APInt UndefSrcElts = APInt::getNullValue(1);
APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
SmallVector<APInt, 64> SrcEltBits(1, RawBits);
return CastBitData(UndefSrcElts, SrcEltBits);
}
// Extract constant bits from build vector.
if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
const SDValue &Src = Op.getOperand(i);
if (Src.isUndef()) {
UndefSrcElts.setBit(i);
continue;
}
auto *Cst = cast<ConstantSDNode>(Src);
SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
}
return CastBitData(UndefSrcElts, SrcEltBits);
}
if (ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) {
unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
const SDValue &Src = Op.getOperand(i);
if (Src.isUndef()) {
UndefSrcElts.setBit(i);
continue;
}
auto *Cst = cast<ConstantFPSDNode>(Src);
APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
SrcEltBits[i] = RawBits.zextOrTrunc(SrcEltSizeInBits);
}
return CastBitData(UndefSrcElts, SrcEltBits);
}
// Extract constant bits from constant pool vector.
if (auto *Cst = getTargetConstantFromNode(Op)) {
Type *CstTy = Cst->getType();
unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
// Only vector constants whose size is a multiple of Op's size are handled.
if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
return false;
unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
for (unsigned i = 0; i != NumSrcElts; ++i)
if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
UndefSrcElts, i))
return false;
return CastBitData(UndefSrcElts, SrcEltBits);
}
// Extract constant bits from a broadcasted constant pool scalar.
if (Op.getOpcode() == X86ISD::VBROADCAST &&
EltSizeInBits <= VT.getScalarSizeInBits()) {
if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
// An undef scalar makes every broadcast element undef.
if (UndefSrcElts[0])
UndefSrcElts.setBits(0, NumSrcElts);
// Replicate the scalar's bits for the remaining elements.
SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
return CastBitData(UndefSrcElts, SrcEltBits);
}
}
}
// Extract constant bits from a subvector broadcast.
if (Op.getOpcode() == X86ISD::SUBV_BROADCAST) {
SmallVector<APInt, 16> SubEltBits;
if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
UndefElts, SubEltBits, AllowWholeUndefs,
AllowPartialUndefs)) {
// Repeat the subvector's undef mask and element bits across the result.
UndefElts = APInt::getSplat(NumElts, UndefElts);
while (EltBits.size() < NumElts)
EltBits.append(SubEltBits.begin(), SubEltBits.end());
return true;
}
}
// Extract a rematerialized scalar constant insertion.
if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits;
auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
// Element 0 is the scalar; all remaining elements are zero.
SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
return CastBitData(UndefSrcElts, SrcEltBits);
}
// Insert constant bits from a base and sub vector sources.
if (Op.getOpcode() == ISD::INSERT_SUBVECTOR &&
isa<ConstantSDNode>(Op.getOperand(2))) {
// TODO - support insert_subvector through bitcasts.
if (EltSizeInBits != VT.getScalarSizeInBits())
return false;
APInt UndefSubElts;
SmallVector<APInt, 32> EltSubBits;
if (getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
UndefSubElts, EltSubBits,
AllowWholeUndefs, AllowPartialUndefs) &&
getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
UndefElts, EltBits, AllowWholeUndefs,
AllowPartialUndefs)) {
// Overlay the subvector's data on the base vector's data at BaseIdx.
unsigned BaseIdx = Op.getConstantOperandVal(2);
UndefElts.insertBits(UndefSubElts, BaseIdx);
for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
EltBits[BaseIdx + i] = EltSubBits[i];
return true;
}
}
// Extract constant bits from a subvector's source.
if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
isa<ConstantSDNode>(Op.getOperand(1))) {
// TODO - support extract_subvector through bitcasts.
if (EltSizeInBits != VT.getScalarSizeInBits())
return false;
if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
UndefElts, EltBits, AllowWholeUndefs,
AllowPartialUndefs)) {
EVT SrcVT = Op.getOperand(0).getValueType();
unsigned NumSrcElts = SrcVT.getVectorNumElements();
unsigned NumSubElts = VT.getVectorNumElements();
unsigned BaseIdx = Op.getConstantOperandVal(1);
// Keep only the [BaseIdx, BaseIdx + NumSubElts) window of the source data.
UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
if ((BaseIdx + NumSubElts) != NumSrcElts)
EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
if (BaseIdx != 0)
EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
return true;
}
}
// Extract constant bits from shuffle node sources.
if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
// TODO - support shuffle through bitcasts.
if (EltSizeInBits != VT.getScalarSizeInBits())
return false;
ArrayRef<int> Mask = SVN->getMask();
// Undef mask entries produce undef elements, so bail out unless both kinds
// of undef are allowed.
if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
llvm::any_of(Mask, [](int M) { return M < 0; }))
return false;
APInt UndefElts0, UndefElts1;
SmallVector<APInt, 32> EltBits0, EltBits1;
// Only decode the operands that the mask actually references.
if (isAnyInRange(Mask, 0, NumElts) &&
!getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
UndefElts0, EltBits0, AllowWholeUndefs,
AllowPartialUndefs))
return false;
if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
!getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
UndefElts1, EltBits1, AllowWholeUndefs,
AllowPartialUndefs))
return false;
UndefElts = APInt::getNullValue(NumElts);
for (int i = 0; i != (int)NumElts; ++i) {
int M = Mask[i];
if (M < 0) {
UndefElts.setBit(i);
EltBits.push_back(APInt::getNullValue(EltSizeInBits));
} else if (M < (int)NumElts) {
if (UndefElts0[M])
UndefElts.setBit(i);
EltBits.push_back(EltBits0[M]);
} else {
if (UndefElts1[M - NumElts])
UndefElts.setBit(i);
EltBits.push_back(EltBits1[M - NumElts]);
}
}
return true;
}
return false;
}
/// Return true and set \p SplatVal if \p Op is a constant vector whose
/// non-undef elements all carry the same bits. Entirely undef elements are
/// ignored, partially undef elements are rejected, and a vector without any
/// defined element is not a splat.
static bool isConstantSplat(SDValue Op, APInt &SplatVal) {
  APInt UndefElts;
  SmallVector<APInt, 16> EltBits;
  if (!getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
                                     UndefElts, EltBits, true, false))
    return false;

  // Track a defined element seen so far; every later one must match it.
  const APInt *Candidate = nullptr;
  const int NumEltBits = EltBits.size();
  for (int EltIdx = 0; EltIdx != NumEltBits; ++EltIdx) {
    if (UndefElts[EltIdx])
      continue;
    if (Candidate && EltBits[EltIdx] != *Candidate)
      return false;
    Candidate = &EltBits[EltIdx];
  }

  if (!Candidate)
    return false;

  SplatVal = *Candidate;
  return true;
}
/// Decode a constant shuffle-mask node into raw integer indices.
///
/// The bits of \p MaskNode are split into elements of \p MaskEltSizeInBits
/// bits. The zero-extended value of every element is appended to \p RawMask,
/// and \p UndefElts receives one bit per element, set for elements that are
/// entirely undef. Partially undef elements are not accepted.
/// \returns false (appending nothing to \p RawMask) if the constant bits
/// could not be extracted.
static bool getTargetShuffleMaskIndices(SDValue MaskNode,
                                        unsigned MaskEltSizeInBits,
                                        SmallVectorImpl<uint64_t> &RawMask,
                                        APInt &UndefElts) {
  // Extract the raw target constant bits.
  SmallVector<APInt, 64> EltBits;
  if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
                                     EltBits, /* AllowWholeUndefs */ true,
                                     /* AllowPartialUndefs */ false))
    return false;

  // Insert the extracted elements into the mask. Iterate by const reference
  // so no APInt is copied, and grow the output buffer once up front.
  RawMask.reserve(RawMask.size() + EltBits.size());
  for (const APInt &Elt : EltBits)
    RawMask.push_back(Elt.getZExtValue());
  return true;
}
/// Fill \p Mask with the shuffle mask equivalent to a PACKSS/PACKUS
/// truncation of type \p VT. For every 128-bit lane the even elements of the
/// first operand are selected, followed by the even elements of the second
/// operand (or of the first operand again when \p Unary is set).
/// Note: saturation is ignored, so callers must validate the inputs first.
static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
                                  bool Unary) {
  assert(Mask.empty() && "Expected an empty shuffle mask vector");
  const unsigned TotalElts = VT.getVectorNumElements();
  const unsigned LaneCount = VT.getSizeInBits() / 128;
  const unsigned EltsPerLane = 128 / VT.getScalarSizeInBits();
  // Indices into the second operand start at TotalElts.
  const unsigned SecondOpBase = Unary ? 0 : TotalElts;

  // Append the even-numbered element indices of one lane, starting at Base.
  auto AppendEvenElts = [&](unsigned Base) {
    for (unsigned Elt = 0; Elt != EltsPerLane; Elt += 2)
      Mask.push_back(Elt + Base);
  };

  for (unsigned Lane = 0; Lane != LaneCount; ++Lane) {
    const unsigned LaneStart = Lane * EltsPerLane;
    AppendEvenElts(LaneStart);
    AppendEvenElts(LaneStart + SecondOpBase);
  }
}
// Translate the demanded elements of a PACKSS/PACKUS result of type VT into
// the demanded elements of its two operands. Within every 128-bit lane the
// first half of the result elements maps to the LHS operand and the second
// half to the RHS operand; each operand has half as many elements as the
// result.
static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
                                APInt &DemandedLHS, APInt &DemandedRHS) {
  const int LaneCount = VT.getSizeInBits() / 128;
  const int ResultElts = DemandedElts.getBitWidth();
  const int OperandElts = ResultElts / 2;
  const int ResultEltsPerLane = ResultElts / LaneCount;
  const int OperandEltsPerLane = OperandElts / LaneCount;

  DemandedLHS = APInt::getNullValue(OperandElts);
  DemandedRHS = APInt::getNullValue(OperandElts);

  // Map DemandedElts to the packed operands, one lane at a time.
  for (int Lane = 0; Lane != LaneCount; ++Lane) {
    const int ResultLaneBase = Lane * ResultEltsPerLane;
    const int OperandLaneBase = Lane * OperandEltsPerLane;
    for (int Elt = 0; Elt != OperandEltsPerLane; ++Elt) {
      const int LHSResultIdx = ResultLaneBase + Elt;
      const int RHSResultIdx = LHSResultIdx + OperandEltsPerLane;
      const int OperandIdx = OperandLaneBase + Elt;
      if (DemandedElts[LHSResultIdx])
        DemandedLHS.setBit(OperandIdx);
      if (DemandedElts[RHSResultIdx])
        DemandedRHS.setBit(OperandIdx);
    }
  }
}
// Translate the demanded elements of a HADD/HSUB result of type VT into the
// demanded elements of its two operands. Within every 128-bit lane, result
// element k of the first half needs LHS elements 2k and 2k+1 of that lane,
// and result element k of the second half needs the same pair from the RHS.
static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
                                 APInt &DemandedLHS, APInt &DemandedRHS) {
  const int LaneCount = VT.getSizeInBits() / 128;
  const int TotalElts = DemandedElts.getBitWidth();
  const int EltsPerLane = TotalElts / LaneCount;
  const int HalfLane = EltsPerLane / 2;

  DemandedLHS = APInt::getNullValue(TotalElts);
  DemandedRHS = APInt::getNullValue(TotalElts);

  // Map DemandedElts to the horizontal operands.
  for (int Idx = 0; Idx != TotalElts; ++Idx) {
    if (!DemandedElts[Idx])
      continue;

    const int LaneBase = (Idx / EltsPerLane) * EltsPerLane;
    const int InLane = Idx % EltsPerLane;
    const bool FromLHS = InLane < HalfLane;

    // Position of the source pair inside the chosen operand.
    const int PairIdx = FromLHS ? InLane : InLane - HalfLane;
    const int PairBase = LaneBase + 2 * PairIdx;

    APInt &Demanded = FromLHS ? DemandedLHS : DemandedRHS;
    Demanded.setBit(PairBase + 0);
    Demanded.setBit(PairBase + 1);
  }
}
/// Calculates the shuffle mask corresponding to the target-specific opcode.
/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
/// operands in \p Ops, and returns true.
/// Sets \p IsUnary to true if only one source is used. Note that this will set
/// IsUnary for shuffles which use a single input multiple times, and in those
/// cases it will adjust the mask to only have indices within that single input.
/// It is an error to call this with non-empty Mask/Ops vectors.
///
/// \p VT is the vector type used to derive the element count and element size
/// handed to the per-opcode decoders.
/// If \p AllowSentinelZero is false, the decode is rejected (returns false)
/// whenever the decoded mask contains an SM_SentinelZero entry.
/// Returns false if the decode left \p Mask empty, or if a variable-mask
/// opcode's mask operand could not be resolved by getTargetShuffleMaskIndices.
/// Hitting an opcode that is not listed below is llvm_unreachable.
static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
SmallVectorImpl<SDValue> &Ops,
SmallVectorImpl<int> &Mask, bool &IsUnary) {
unsigned NumElems = VT.getVectorNumElements();
unsigned MaskEltSize = VT.getScalarSizeInBits();
// Scratch storage for opcodes whose mask comes from a (constant) operand
// rather than an immediate.
SmallVector<uint64_t, 32> RawMask;
APInt RawUndefs;
SDValue ImmN;
assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
IsUnary = false;
// Set when a two-operand shuffle is given the same node for both inputs; the
// mask is folded onto the first input after the switch.
bool IsFakeUnary = false;
// Decode the mask per opcode. Cases that push onto Ops themselves override the
// default operand selection performed after the switch.
switch (N->getOpcode()) {
case X86ISD::BLENDI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodeBLENDMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::SHUFP:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodeSHUFPMask(NumElems, MaskEltSize,
cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::INSERTPS:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::EXTRQI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
// With non-constant length/index operands Mask is left empty here, which the
// empty-mask check after the switch turns into a failed decode.
if (isa<ConstantSDNode>(N->getOperand(1)) &&
isa<ConstantSDNode>(N->getOperand(2))) {
int BitLen = N->getConstantOperandVal(1);
int BitIdx = N->getConstantOperandVal(2);
DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
IsUnary = true;
}
break;
case X86ISD::INSERTQI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
// As for EXTRQI: non-constant length/index leaves Mask empty.
if (isa<ConstantSDNode>(N->getOperand(2)) &&
isa<ConstantSDNode>(N->getOperand(3))) {
int BitLen = N->getConstantOperandVal(2);
int BitIdx = N->getConstantOperandVal(3);
DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
}
break;
case X86ISD::UNPCKH:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::UNPCKL:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::MOVHLPS:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeMOVHLPSMask(NumElems, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::MOVLHPS:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeMOVLHPSMask(NumElems, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::PALIGNR:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodePALIGNRMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
// The shuffle operands are recorded in swapped order: operand 1 first, then
// operand 0.
Ops.push_back(N->getOperand(1));
Ops.push_back(N->getOperand(0));
break;
case X86ISD::VSHLDQ:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodePSLLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
Mask);
IsUnary = true;
break;
case X86ISD::VSRLDQ:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodePSRLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
Mask);
IsUnary = true;
break;
case X86ISD::PSHUFD:
case X86ISD::VPERMILPI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodePSHUFMask(NumElems, MaskEltSize,
cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
case X86ISD::PSHUFHW:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodePSHUFHWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
Mask);
IsUnary = true;
break;
case X86ISD::PSHUFLW:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodePSHUFLWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
Mask);
IsUnary = true;
break;
case X86ISD::VZEXT_MOVL:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
DecodeZeroMoveLowMask(NumElems, Mask);
IsUnary = true;
break;
case X86ISD::VBROADCAST: {
SDValue N0 = N->getOperand(0);
// See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
// add the pre-extracted value to the Ops vector.
if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
N0.getOperand(0).getValueType() == VT &&
N0.getConstantOperandVal(1) == 0)
Ops.push_back(N0.getOperand(0));
// We only decode broadcasts of same-sized vectors, unless the broadcast
// came from an extract from the original width. If we found one, we
// pushed it the Ops vector above.
if (N0.getValueType() == VT || !Ops.empty()) {
DecodeVectorBroadcast(NumElems, Mask);
IsUnary = true;
break;
}
return false;
}
case X86ISD::VPERMILPV: {
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
IsUnary = true;
// The mask is operand 1; fail if its indices cannot be extracted.
SDValue MaskNode = N->getOperand(1);
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
RawUndefs)) {
DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
break;
}
return false;
}
case X86ISD::PSHUFB: {
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = true;
// The mask is operand 1 and is read as 8-bit indices.
SDValue MaskNode = N->getOperand(1);
if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
DecodePSHUFBMask(RawMask, RawUndefs, Mask);
break;
}
return false;
}
case X86ISD::VPERMI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodeVPERMMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
case X86ISD::MOVSS:
case X86ISD::MOVSD:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
// IsUnary is left false here, so the default code after the switch adds both
// operands.
DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
break;
case X86ISD::VPERM2X128:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodeVPERM2X128Mask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::SHUF128:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize,
cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::MOVSLDUP:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
DecodeMOVSLDUPMask(NumElems, Mask);
IsUnary = true;
break;
case X86ISD::MOVSHDUP:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
DecodeMOVSHDUPMask(NumElems, Mask);
IsUnary = true;
break;
case X86ISD::MOVDDUP:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
DecodeMOVDDUPMask(NumElems, Mask);
IsUnary = true;
break;
case X86ISD::VPERMIL2: {
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
// Operand 2 is the mask and operand 3 the control value; the decode needs
// the control to be a constant and the mask indices to be extractable.
SDValue MaskNode = N->getOperand(2);
SDValue CtrlNode = N->getOperand(3);
if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
unsigned CtrlImm = CtrlOp->getZExtValue();
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
RawUndefs)) {
DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
Mask);
break;
}
}
return false;
}
case X86ISD::VPPERM: {
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
// The mask is operand 2 and is read as 8-bit indices.
SDValue MaskNode = N->getOperand(2);
if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
DecodeVPPERMMask(RawMask, RawUndefs, Mask);
break;
}
return false;
}
case X86ISD::VPERMV: {
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = true;
// Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
Ops.push_back(N->getOperand(1));
SDValue MaskNode = N->getOperand(0);
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
RawUndefs)) {
DecodeVPERMVMask(RawMask, RawUndefs, Mask);
break;
}
return false;
}
case X86ISD::VPERMV3: {
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
// Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
Ops.push_back(N->getOperand(0));
Ops.push_back(N->getOperand(2));
SDValue MaskNode = N->getOperand(1);
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
RawUndefs)) {
DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
break;
}
return false;
}
default: llvm_unreachable("unknown target shuffle node");
}
// Empty mask indicates the decode failed.
if (Mask.empty())
return false;
// Check if we're getting a shuffle mask with zero'd elements.
if (!AllowSentinelZero)
if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
return false;
// If we have a fake unary shuffle, the shuffle mask is spread across two
// inputs that are actually the same node. Re-map the mask to always point
// into the first input.
if (IsFakeUnary)
for (int &M : Mask)
if (M >= (int)Mask.size())
M -= Mask.size();
// If we didn't already add operands in the opcode-specific code, default to
// adding 1 or 2 operands starting at 0.
// Note that a fake unary shuffle still gets both (identical) operands pushed.
if (Ops.empty()) {
Ops.push_back(N->getOperand(0));
if (!IsUnary || IsFakeUnary)
Ops.push_back(N->getOperand(1));
}
return true;
}
/// Decode the target shuffle \p N into \p Mask / \p Ops and then refine the
/// mask by inspecting the shuffle inputs: elements that are known to be zero
/// (not merely zeroable) become SM_SentinelZero and elements sourced from
/// undefined data become SM_SentinelUndef.
/// Returns true if the target shuffle mask was decoded.
static bool setTargetShuffleZeroElements(SDValue N,
                                         SmallVectorImpl<int> &Mask,
                                         SmallVectorImpl<SDValue> &Ops) {
  if (!isTargetShuffle(N.getOpcode()))
    return false;

  bool IsUnary;
  MVT VT = N.getSimpleValueType();
  if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
    return false;

  // Look through bitcasts on both sources; a unary shuffle reuses the first
  // source for the second slot.
  SDValue Srcs[2];
  Srcs[0] = peekThroughBitcasts(Ops[0]);
  Srcs[1] = IsUnary ? Srcs[0] : peekThroughBitcasts(Ops[1]);

  assert((VT.getSizeInBits() % Mask.size()) == 0 &&
         "Illegal split of shuffle value type");
  unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();

  // Gather whatever constant data is available for each source.
  APInt SrcUndefs[2];
  SmallVector<APInt, 32> SrcBits[2];
  bool SrcIsConstant[2];
  for (unsigned Src = 0; Src != 2; ++Src)
    SrcIsConstant[Src] = getTargetConstantBitsFromNode(
        Srcs[Src], EltSizeInBits, SrcUndefs[Src], SrcBits[Src], true, false);

  const int Size = Mask.size();
  for (int i = 0; i < Size; ++i) {
    int &Slot = Mask[i];

    // Sentinels (SM_SentinelZero / SM_SentinelUndef) are already final.
    if (Slot < 0)
      continue;

    // Split the mask value into source selector and element within it.
    const unsigned SrcIdx = Slot / Size;
    const SDValue V = Slot < Size ? Srcs[0] : Srcs[1];
    const int SrcElt = Slot % Size;

    // Reading from an UNDEF input.
    if (V.isUndef()) {
      Slot = SM_SentinelUndef;
      continue;
    }

    // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
    // TODO: We currently only set UNDEF for integer types - floats use the same
    // registers as vectors and many of the scalar folded loads rely on the
    // SCALAR_TO_VECTOR pattern.
    if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
        (Size % V.getValueType().getVectorNumElements()) == 0) {
      int Scale = Size / V.getValueType().getVectorNumElements();
      const bool IsFirstElt = (SrcElt / Scale) == 0;
      if (IsFirstElt) {
        if (X86::isZeroNode(V.getOperand(0)))
          Slot = SM_SentinelZero;
      } else if (!VT.isFloatingPoint()) {
        Slot = SM_SentinelUndef;
      }
      continue;
    }

    // Otherwise fall back on the constant bits of the source, if any.
    if (!SrcIsConstant[SrcIdx])
      continue;
    if (SrcUndefs[SrcIdx][SrcElt])
      Slot = SM_SentinelUndef;
    else if (SrcBits[SrcIdx][SrcElt] == 0)
      Slot = SM_SentinelZero;
  }

  assert(VT.getVectorNumElements() == Mask.size() &&
         "Different mask size from vector size!");
  return true;
}
// Forward declaration (for getFauxShuffleMask recursive check).
// getFauxShuffleMask calls this on the operands of the nodes it decodes, and
// the definition further down calls getFauxShuffleMask in turn, so the
// declaration has to precede both.
static bool resolveTargetShuffleInputs(SDValue Op,
SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask,
SelectionDAG &DAG);
// Attempt to decode ops that could be represented as a shuffle mask.
// The decoded shuffle mask may contain a different number of elements to the
// destination value type.
//
// Mask and Ops are cleared on entry. On success the function returns true with
// the decoded mask in Mask and the shuffle sources in Ops; on failure it
// returns false (Mask/Ops may then hold partial data). DemandedElts must have
// one bit per element of N's value type; it is consulted by the OR and
// PACKSS/PACKUS cases. DAG is used for known-bits queries and to build the
// widened source nodes some cases need.
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
SmallVectorImpl<int> &Mask,
SmallVectorImpl<SDValue> &Ops,
SelectionDAG &DAG) {
Mask.clear();
Ops.clear();
MVT VT = N.getSimpleValueType();
unsigned NumElts = VT.getVectorNumElements();
unsigned NumSizeInBits = VT.getSizeInBits();
unsigned NumBitsPerElt = VT.getScalarSizeInBits();
// Only whole-byte element sizes and vector sizes are handled.
if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
return false;
assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
unsigned Opcode = N.getOpcode();
switch (Opcode) {
case ISD::VECTOR_SHUFFLE: {
// Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
Mask.append(ShuffleMask.begin(), ShuffleMask.end());
Ops.push_back(N.getOperand(0));
Ops.push_back(N.getOperand(1));
return true;
}
return false;
}
case ISD::AND:
case X86ISD::ANDNP: {
// Attempt to decode as a per-byte mask.
APInt UndefElts;
SmallVector<APInt, 32> EltBits;
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
bool IsAndN = (X86ISD::ANDNP == Opcode);
// For ANDNP the constant is taken from operand 0 and a 0xFF byte clears the
// result byte; for AND the constant is operand 1 and a 0x00 byte clears it.
uint64_t ZeroMask = IsAndN ? 255 : 0;
if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
return false;
for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
if (UndefElts[i]) {
Mask.push_back(SM_SentinelUndef);
continue;
}
uint64_t ByteBits = EltBits[i].getZExtValue();
// Partially-set bytes cannot be expressed as a byte shuffle.
if (ByteBits != 0 && ByteBits != 255)
return false;
Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
}
Ops.push_back(IsAndN ? N1 : N0);
return true;
}
case ISD::OR: {
// Inspect each operand at the byte level. We can merge these into a
// blend shuffle mask if for each byte at least one is masked out (zero).
KnownBits Known0 = DAG.computeKnownBits(N.getOperand(0), DemandedElts);
KnownBits Known1 = DAG.computeKnownBits(N.getOperand(1), DemandedElts);
if (Known0.One.isNullValue() && Known1.One.isNullValue()) {
bool IsByteMask = true;
unsigned NumSizeInBytes = NumSizeInBits / 8;
unsigned NumBytesPerElt = NumBitsPerElt / 8;
// One bit per byte of an element: ZeroMask marks bytes known zero in both
// operands, SelectMask marks bytes that must come from operand 1.
APInt ZeroMask = APInt::getNullValue(NumBytesPerElt);
APInt SelectMask = APInt::getNullValue(NumBytesPerElt);
for (unsigned i = 0; i != NumBytesPerElt && IsByteMask; ++i) {
unsigned LHS = Known0.Zero.extractBits(8, i * 8).getZExtValue();
unsigned RHS = Known1.Zero.extractBits(8, i * 8).getZExtValue();
if (LHS == 255 && RHS == 0)
SelectMask.setBit(i);
else if (LHS == 255 && RHS == 255)
ZeroMask.setBit(i);
else if (!(LHS == 0 && RHS == 255))
IsByteMask = false;
}
if (IsByteMask) {
// Replicate the per-element byte pattern across the whole vector.
for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt) {
for (unsigned j = 0; j != NumBytesPerElt; ++j) {
unsigned Ofs = (SelectMask[j] ? NumSizeInBytes : 0);
int Idx = (ZeroMask[j] ? (int)SM_SentinelZero : (i + j + Ofs));
Mask.push_back(Idx);
}
}
Ops.push_back(N.getOperand(0));
Ops.push_back(N.getOperand(1));
return true;
}
}
// Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
// is a valid shuffle index.
SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
return false;
SmallVector<int, 64> SrcMask0, SrcMask1;
SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
if (!resolveTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG) ||
!resolveTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG))
return false;
// Bring both masks to the same (larger) element count before merging.
int MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
SmallVector<int, 64> Mask0, Mask1;
scaleShuffleMask<int>(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
scaleShuffleMask<int>(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
for (int i = 0; i != MaskSize; ++i) {
if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef)
Mask.push_back(SM_SentinelUndef);
else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
Mask.push_back(SM_SentinelZero);
else if (Mask1[i] == SM_SentinelZero)
Mask.push_back(Mask0[i]);
else if (Mask0[i] == SM_SentinelZero)
// Indices from the second shuffle are offset past all inputs of the first.
Mask.push_back(Mask1[i] + (MaskSize * SrcInputs0.size()));
else
return false;
}
for (SDValue &Op : SrcInputs0)
Ops.push_back(Op);
for (SDValue &Op : SrcInputs1)
Ops.push_back(Op);
return true;
}
case ISD::INSERT_SUBVECTOR: {
SDValue Src = N.getOperand(0);
SDValue Sub = N.getOperand(1);
EVT SubVT = Sub.getValueType();
unsigned NumSubElts = SubVT.getVectorNumElements();
// Requires a constant insertion index and that N is the only user of Sub.
if (!isa<ConstantSDNode>(N.getOperand(2)) ||
!N->isOnlyUserOf(Sub.getNode()))
return false;
uint64_t InsertIdx = N.getConstantOperandVal(2);
// Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
Sub.getOperand(0).getValueType() == VT &&
isa<ConstantSDNode>(Sub.getOperand(1))) {
uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
// Start from an identity mask over SRC0, then redirect the inserted range
// to the extracted elements of SRC1.
for (int i = 0; i != (int)NumElts; ++i)
Mask.push_back(i);
for (int i = 0; i != (int)NumSubElts; ++i)
Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
Ops.push_back(Src);
Ops.push_back(Sub.getOperand(0));
return true;
}
// Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
SmallVector<int, 64> SubMask;
SmallVector<SDValue, 2> SubInputs;
if (!resolveTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
SubMask, DAG))
return false;
if (SubMask.size() != NumSubElts) {
assert(((SubMask.size() % NumSubElts) == 0 ||
(NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
if ((NumSubElts % SubMask.size()) == 0) {
// Submask is coarser than the subvector: widen it to NumSubElts entries.
int Scale = NumSubElts / SubMask.size();
SmallVector<int,64> ScaledSubMask;
scaleShuffleMask<int>(Scale, SubMask, ScaledSubMask);
SubMask = ScaledSubMask;
} else {
// Submask is finer than the subvector: rescale the element counts and the
// insertion index to the submask's granularity instead.
int Scale = SubMask.size() / NumSubElts;
NumSubElts = SubMask.size();
NumElts *= Scale;
InsertIdx *= Scale;
}
}
Ops.push_back(Src);
// Each shuffle input is placed at index 0 of an otherwise-undef vector that
// has the same total bit width as N.
for (SDValue &SubInput : SubInputs) {
EVT SubSVT = SubInput.getValueType().getScalarType();
EVT AltVT = EVT::getVectorVT(*DAG.getContext(), SubSVT,
NumSizeInBits / SubSVT.getSizeInBits());
Ops.push_back(DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), AltVT,
DAG.getUNDEF(AltVT), SubInput,
DAG.getIntPtrConstant(0, SDLoc(N))));
}
for (int i = 0; i != (int)NumElts; ++i)
Mask.push_back(i);
for (int i = 0; i != (int)NumSubElts; ++i) {
int M = SubMask[i];
// Non-sentinel indices are rebased onto the widened inputs, which follow
// Src in Ops (hence the "1 +").
if (0 <= M) {
int InputIdx = M / NumSubElts;
M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
}
Mask[i + InsertIdx] = M;
}
return true;
}
case ISD::SCALAR_TO_VECTOR: {
// Match against a scalar_to_vector of an extract from a vector,
// for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
SDValue N0 = N.getOperand(0);
SDValue SrcExtract;
if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
N0.getOperand(0).getValueType() == VT) ||
(N0.getOpcode() == X86ISD::PEXTRW &&
N0.getOperand(0).getValueType() == MVT::v8i16) ||
(N0.getOpcode() == X86ISD::PEXTRB &&
N0.getOperand(0).getValueType() == MVT::v16i8)) {
SrcExtract = N0;
}
if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
return false;
SDValue SrcVec = SrcExtract.getOperand(0);
EVT SrcVT = SrcVec.getValueType();
unsigned NumSrcElts = SrcVT.getVectorNumElements();
// Number of source-sized elements that the zero extension fills with zero.
unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
if (NumSrcElts <= SrcIdx)
return false;
Ops.push_back(SrcVec);
// Mask layout: the extracted element, then the zero-extension padding, then
// undef for the remainder (mask is in units of source elements).
Mask.push_back(SrcIdx);
Mask.append(NumZeros, SM_SentinelZero);
Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
return true;
}
case X86ISD::PINSRB:
case X86ISD::PINSRW: {
SDValue InVec = N.getOperand(0);
SDValue InScl = N.getOperand(1);
SDValue InIndex = N.getOperand(2);
// The insertion index must be a constant within the vector.
if (!isa<ConstantSDNode>(InIndex) ||
cast<ConstantSDNode>(InIndex)->getAPIntValue().uge(NumElts))
return false;
uint64_t InIdx = N.getConstantOperandVal(2);
// Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
if (X86::isZeroNode(InScl)) {
Ops.push_back(InVec);
for (unsigned i = 0; i != NumElts; ++i)
Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
return true;
}
// Attempt to recognise a PINSR*(PEXTR*) shuffle pattern.
// TODO: Expand this to support INSERT_VECTOR_ELT/etc.
unsigned ExOp =
(X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
if (InScl.getOpcode() != ExOp)
return false;
SDValue ExVec = InScl.getOperand(0);
SDValue ExIndex = InScl.getOperand(1);
if (!isa<ConstantSDNode>(ExIndex) ||
cast<ConstantSDNode>(ExIndex)->getAPIntValue().uge(NumElts))
return false;
uint64_t ExIdx = InScl.getConstantOperandVal(1);
Ops.push_back(InVec);
Ops.push_back(ExVec);
// Identity over InVec except for the inserted lane, which reads element
// ExIdx of the second source.
for (unsigned i = 0; i != NumElts; ++i)
Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
return true;
}
case X86ISD::PACKSS:
case X86ISD::PACKUS: {
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
"Unexpected input value type");
APInt EltsLHS, EltsRHS;
getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
// If we know input saturation won't happen we can treat this
// as a truncation shuffle.
if (Opcode == X86ISD::PACKSS) {
// Fail unless each non-undef input has more than NumBitsPerElt sign bits in
// its demanded elements.
if ((!N0.isUndef() &&
DAG.ComputeNumSignBits(N0, EltsLHS) <= NumBitsPerElt) ||
(!N1.isUndef() &&
DAG.ComputeNumSignBits(N1, EltsRHS) <= NumBitsPerElt))
return false;
} else {
// Fail unless the bits selected by ZeroMask are known zero in the demanded
// elements of each non-undef input.
APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS)) ||
(!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS)))
return false;
}
bool IsUnary = (N0 == N1);
Ops.push_back(N0);
if (!IsUnary)
Ops.push_back(N1);
createPackShuffleMask(VT, Mask, IsUnary);
return true;
}
case X86ISD::VSHLI:
case X86ISD::VSRLI: {
uint64_t ShiftVal = N.getConstantOperandVal(1);
// Out of range bit shifts are guaranteed to be zero.
if (NumBitsPerElt <= ShiftVal) {
Mask.append(NumElts, SM_SentinelZero);
return true;
}
// We can only decode 'whole byte' bit shifts as shuffles.
// (Breaking out of the switch reaches the final 'return false'.)
if ((ShiftVal % 8) != 0)
break;
uint64_t ByteShift = ShiftVal / 8;
unsigned NumBytes = NumSizeInBits / 8;
unsigned NumBytesPerElt = NumBitsPerElt / 8;
Ops.push_back(N.getOperand(0));
// Clear mask to all zeros and insert the shifted byte indices.
Mask.append(NumBytes, SM_SentinelZero);
if (X86ISD::VSHLI == Opcode) {
for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
Mask[i + j] = i + j - ByteShift;
} else {
for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
Mask[i + j - ByteShift] = i + j;
}
return true;
}
case X86ISD::VBROADCAST: {
SDValue Src = N.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
// Only broadcasts from a vector source are decoded here.
if (!SrcVT.isVector())
return false;
if (NumSizeInBits != SrcVT.getSizeInBits()) {
assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
"Illegal broadcast type");
// Widen the source to N's bit width by inserting it at index 0 of an undef
// vector with the same scalar type.
SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
NumSizeInBits / SrcVT.getScalarSizeInBits());
Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT,
DAG.getUNDEF(SrcVT), Src,
DAG.getIntPtrConstant(0, SDLoc(N)));
}
Ops.push_back(Src);
// Every result element reads source element 0.
Mask.append(NumElts, 0);
return true;
}
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND_VECTOR_INREG:
case ISD::ANY_EXTEND_VECTOR_INREG: {
SDValue Src = N.getOperand(0);
EVT SrcVT = Src.getValueType();
// Extended source must be a simple vector.
if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
(SrcVT.getScalarSizeInBits() % 8) != 0)
return false;
unsigned NumSrcBitsPerElt = SrcVT.getScalarSizeInBits();
bool IsAnyExtend =
(ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
DecodeZeroExtendMask(NumSrcBitsPerElt, NumBitsPerElt, NumElts, IsAnyExtend,
Mask);
if (NumSizeInBits != SrcVT.getSizeInBits()) {
assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
"Illegal zero-extension type");
// Widen the source to N's bit width by inserting it at index 0 of an undef
// vector with the same scalar type.
SrcVT = MVT::getVectorVT(SrcVT.getSimpleVT().getScalarType(),
NumSizeInBits / NumSrcBitsPerElt);
Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT,
DAG.getUNDEF(SrcVT), Src,
DAG.getIntPtrConstant(0, SDLoc(N)));
}
Ops.push_back(Src);
return true;
}
}
// Unhandled opcode, or a case that broke out of the switch without decoding.
return false;
}
/// Compacts the list of shuffle sources: inputs that are undef, never
/// referenced by the mask, or duplicates of an earlier input are dropped, and
/// the mask indices are renumbered to address the surviving inputs.
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
                                              SmallVectorImpl<int> &Mask) {
  const int Width = Mask.size();
  SmallVector<SDValue, 16> Kept;

  for (const SDValue &Input : Inputs) {
    // Mask index window currently assigned to this input. It is based on the
    // number of inputs kept so far, since earlier drops shift indices down.
    const int Lo = Kept.size() * Width;
    const int Hi = Lo + Width;
    auto InWindow = [Lo, Hi](int M) { return (Lo <= M) && (M < Hi); };

    // References to an UNDEF input become undef mask elements.
    if (Input.isUndef())
      for (int &M : Mask)
        if (InWindow(M))
          M = SM_SentinelUndef;

    // Nothing refers to this input: drop it and shift later indices down.
    if (none_of(Mask, InWindow)) {
      for (int &M : Mask)
        if (Lo <= M)
          M -= Width;
      continue;
    }

    // Look for an identical input that has already been kept.
    int DupIdx = -1;
    for (int J = 0, NumKept = Kept.size(); J != NumKept; ++J) {
      if (Kept[J] == Input) {
        DupIdx = J;
        break;
      }
    }

    if (DupIdx < 0) {
      Kept.push_back(Input);
      continue;
    }

    // Repeated input: redirect its window onto the earlier copy and shift
    // everything above it down by one input.
    for (int &M : Mask) {
      if (M < Lo)
        continue;
      if (M < Hi)
        M = (M - Lo) + (DupIdx * Width);
      else
        M -= Width;
    }
  }

  Inputs = Kept;
}
/// Decodes \p Op as a shuffle, first as a real target shuffle (via
/// setTargetShuffleZeroElements, which also fills in SM_SentinelUndef and
/// SM_SentinelZero entries) and, failing that, as a faux shuffle with all
/// elements demanded. The resulting inputs and mask are then compacted so that
/// unused and repeated inputs are removed.
/// Returns true if the target shuffle mask was decoded.
static bool resolveTargetShuffleInputs(SDValue Op,
                                       SmallVectorImpl<SDValue> &Inputs,
                                       SmallVectorImpl<int> &Mask,
                                       SelectionDAG &DAG) {
  const unsigned EltCount = Op.getValueType().getVectorNumElements();
  const APInt AllElts = APInt::getAllOnesValue(EltCount);

  const bool Decoded = setTargetShuffleZeroElements(Op, Mask, Inputs) ||
                       getFauxShuffleMask(Op, AllElts, Mask, Inputs, DAG);
  if (!Decoded)
    return false;

  resolveTargetShuffleInputsAndMask(Inputs, Mask);
  return true;
}
/// Finds the scalar that ends up in element \p Index of the vector produced by
/// \p N, looking through generic and target shuffles, subvector inserts and
/// extracts, and bitcasts that keep the element count. Returns an empty
/// SDValue when the element cannot be identified or when \p Depth reaches the
/// search limit.
static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
                                   unsigned Depth) {
  // Limit search depth.
  if (Depth == 6)
    return SDValue();

  SDValue Val = SDValue(N, 0);
  EVT ValVT = Val.getValueType();
  const unsigned Opc = Val.getOpcode();

  // ISD::VECTOR_SHUFFLE: follow the mask into the selected operand.
  if (const ShuffleVectorSDNode *Shuf = dyn_cast<ShuffleVectorSDNode>(N)) {
    const int MaskElt = Shuf->getMaskElt(Index);
    if (MaskElt < 0)
      return DAG.getUNDEF(ValVT.getVectorElementType());

    const unsigned Count = ValVT.getVectorNumElements();
    const bool FromFirst = MaskElt < (int)Count;
    SDValue Src = FromFirst ? Shuf->getOperand(0) : Shuf->getOperand(1);
    return getShuffleScalarElt(Src.getNode(), MaskElt % Count, DAG, Depth + 1);
  }

  // Target specific shuffles: decode the mask first, then follow it.
  if (isTargetShuffle(Opc)) {
    MVT ShufVT = Val.getSimpleValueType();
    MVT ShufEltVT = ShufVT.getVectorElementType();
    const int Count = (int)ShufVT.getVectorNumElements();

    SmallVector<int, 16> DecodedMask;
    SmallVector<SDValue, 16> DecodedOps;
    bool IsUnary;
    if (!getTargetShuffleMask(N, ShufVT, true, DecodedOps, DecodedMask,
                              IsUnary))
      return SDValue();

    const int MaskElt = DecodedMask[Index];
    if (MaskElt == SM_SentinelUndef)
      return DAG.getUNDEF(ShufEltVT);
    if (MaskElt == SM_SentinelZero) {
      if (ShufEltVT.isInteger())
        return DAG.getConstant(0, SDLoc(N), ShufEltVT);
      return DAG.getConstantFP(+0.0, SDLoc(N), ShufEltVT);
    }

    assert(0 <= MaskElt && MaskElt < (2*Count) && "Shuffle index out of range");
    SDValue Src = (MaskElt < Count) ? DecodedOps[0] : DecodedOps[1];
    return getShuffleScalarElt(Src.getNode(), MaskElt % Count, DAG, Depth + 1);
  }

  // insert_subvector with a constant index: choose between the inserted
  // subvector and the base vector depending on where Index falls.
  if (Opc == ISD::INSERT_SUBVECTOR && isa<ConstantSDNode>(N->getOperand(2))) {
    SDValue Base = N->getOperand(0);
    SDValue Sub = N->getOperand(1);
    const unsigned SubCount = Sub.getValueType().getVectorNumElements();
    const uint64_t SubStart = N->getConstantOperandVal(2);

    const bool InsideSub = SubStart <= Index && Index < (SubStart + SubCount);
    if (InsideSub)
      return getShuffleScalarElt(Sub.getNode(), Index - SubStart, DAG,
                                 Depth + 1);
    return getShuffleScalarElt(Base.getNode(), Index, DAG, Depth + 1);
  }

  // extract_subvector with a constant index: offset into the source vector.
  if (Opc == ISD::EXTRACT_SUBVECTOR && isa<ConstantSDNode>(N->getOperand(1))) {
    SDValue Src = N->getOperand(0);
    const uint64_t Offset = N->getConstantOperandVal(1);
    return getShuffleScalarElt(Src.getNode(), Index + Offset, DAG, Depth + 1);
  }

  // Step through a bitcast, but only when the source is a vector with the same
  // number of elements.
  if (Opc == ISD::BITCAST) {
    Val = Val.getOperand(0);
    EVT SrcVT = Val.getValueType();
    const unsigned Count = ValVT.getVectorNumElements();
    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != Count)
      return SDValue();
  }

  // Nodes that directly carry scalar elements.
  switch (Val.getOpcode()) {
  case ISD::SCALAR_TO_VECTOR:
    if (Index == 0)
      return Val.getOperand(0);
    return DAG.getUNDEF(ValVT.getVectorElementType());
  case ISD::BUILD_VECTOR:
    return Val.getOperand(Index);
  default:
    return SDValue();
  }
}
// Build a vector by inserting each non-zero element of the build vector one at
// a time (PINSRB/PINSRW/PINSRD). \p NonZeros is a bitmask with bit i set when
// operand i must be inserted; \p NumZero is non-zero when the build vector
// contains zero elements. Returns an empty SDValue if NonZeros selects no
// element.
static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
                                        unsigned NumNonZero, unsigned NumZero,
                                        SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  const unsigned EltCount = VT.getVectorNumElements();
  assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
          ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
         "Illegal vector insertion");

  SDLoc dl(Op);
  SDValue Result;
  bool Started = false;

  for (unsigned i = 0; i < EltCount; ++i) {
    // Skip elements that are not flagged for insertion.
    if ((NonZeros & (1 << i)) == 0)
      continue;

    if (!Started) {
      Started = true;
      // With no zeros in the build vector and the first insertion at index 0,
      // seed the result with SCALAR_TO_VECTOR. Otherwise start from a zero
      // vector to break any register dependency.
      if (!(NumZero || 0 != i)) {
        assert(0 == i && "Expected insertion into zero-index");
        SDValue Seed = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
        Seed = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Seed);
        Result = DAG.getBitcast(VT, Seed);
        continue;
      }
      Result = getZeroVector(VT, Subtarget, DAG, dl);
    }

    Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
                         Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
  }

  return Result;
}
/// Custom lower build_vector of v16i8.
///
/// Bit i of \p NonZeros selects operand i of \p Op for insertion; operands
/// whose bit is clear are skipped. Returns an empty SDValue when more than
/// eight operands are selected and SSE4.1 is unavailable. With SSE4.1 the work
/// is delegated to LowerBuildVectorAsInsert; otherwise adjacent bytes are
/// merged into 16-bit values and inserted into a v8i16 that is bitcast back to
/// v16i8 at the end.
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
unsigned NumNonZero, unsigned NumZero,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (NumNonZero > 8 && !Subtarget.hasSSE41())
return SDValue();
// SSE4.1 - use PINSRB to insert each byte directly.
if (Subtarget.hasSSE41())
return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
Subtarget);
SDLoc dl(Op);
SDValue V;
// Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
for (unsigned i = 0; i < 16; i += 2) {
bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
bool NextIsNonZero = (NonZeros & (1 << (i + 1))) != 0;
// Nothing to insert for this pair of bytes.
if (!ThisIsNonZero && !NextIsNonZero)
continue;
// FIXME: Investigate combining the first 4 bytes as a i32 instead.
SDValue Elt;
if (ThisIsNonZero) {
// Zero-extend the even byte when its upper bits must be clean: either the
// vector has zero elements or the odd byte will be OR'ed in above it.
if (NumZero || NextIsNonZero)
Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
else
Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
}
if (NextIsNonZero) {
SDValue NextElt = Op.getOperand(i + 1);
if (i == 0 && NumZero)
NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
else
NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
// Shift the odd byte left by 8 so it sits above the even byte.
NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
DAG.getConstant(8, dl, MVT::i8));
if (ThisIsNonZero)
Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
else
Elt = NextElt;
}
// If our first insertion is not the first index then insert into zero
// vector to break any register dependency else use SCALAR_TO_VECTOR.
if (!V) {
if (i != 0)
V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
else {
V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
V = DAG.getBitcast(MVT::v8i16, V);
continue;
}
}
// Truncate the merged pair to i16 and insert it as 16-bit element i / 2.
Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
DAG.getIntPtrConstant(i / 2, dl));
}
return DAG.getBitcast(MVT::v16i8, V);
}
/// Custom lower build_vector of v8i16.
///
/// Returns an empty SDValue when more than four elements need inserting and
/// SSE4.1 is unavailable; otherwise defers to LowerBuildVectorAsInsert.
static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  const bool ManyInsertions = NumNonZero > 4;
  if (ManyInsertions && !Subtarget.hasSSE41())
    return SDValue();

  // Use PINSRW to insert each word directly.
  return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
                                  Subtarget);
}
/// Custom lower build_vector of v4i32 or v4f32.
///
/// Tries, in order:
///  1. MOVDDUP of the first two operands when the vector is a splat of an
///     element pair (SSE3, not XOP).
///  2. A shuffle of a single source vector with a zero/undef vector, when every
///     non-zeroable operand extracts element i of that source into lane i.
///  3. An INSERTPS (SSE4.1 only) when exactly one lane breaks pattern 2.
/// Returns an empty SDValue if none of these applies.
static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// If this is a splat of a pair of elements, use MOVDDUP (unless the target
// has XOP; in that case defer lowering to potentially use VPERMIL2PS).
// Because we're creating a less complicated build vector here, we may enable
// further folding of the MOVDDUP via shuffle transforms.
if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
Op.getOperand(0) == Op.getOperand(2) &&
Op.getOperand(1) == Op.getOperand(3) &&
Op.getOperand(0) != Op.getOperand(1)) {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
// Create a new build vector with the first 2 elements followed by undef
// padding, bitcast to v2f64, duplicate, and bitcast back.
SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
return DAG.getBitcast(VT, Dup);
}
// Find all zeroable elements.
std::bitset<4> Zeroable, Undefs;
for (int i = 0; i < 4; ++i) {
SDValue Elt = Op.getOperand(i);
Undefs[i] = Elt.isUndef();
Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
}
assert(Zeroable.size() - Zeroable.count() > 1 &&
"We expect at least two non-zero elements!");
// We only know how to deal with build_vector nodes where elements are either
// zeroable or extract_vector_elt with constant index.
SDValue FirstNonZero;
// Only meaningful once FirstNonZero has been set in the loop below.
unsigned FirstNonZeroIdx;
for (unsigned i = 0; i < 4; ++i) {
if (Zeroable[i])
continue;
SDValue Elt = Op.getOperand(i);
if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(Elt.getOperand(1)))
return SDValue();
// Make sure that this node is extracting from a 128-bit vector.
MVT VT = Elt.getOperand(0).getSimpleValueType();
if (!VT.is128BitVector())
return SDValue();
if (!FirstNonZero.getNode()) {
FirstNonZero = Elt;
FirstNonZeroIdx = i;
}
}
assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
SDValue V1 = FirstNonZero.getOperand(0);
// From here on VT is the type of the extracted-from source vector.
MVT VT = V1.getSimpleValueType();
// See if this build_vector can be lowered as a blend with zero.
SDValue Elt;
unsigned EltMaskIdx, EltIdx;
int Mask[4];
for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
if (Zeroable[EltIdx]) {
// The zero vector will be on the right hand side.
Mask[EltIdx] = EltIdx+4;
continue;
}
Elt = Op->getOperand(EltIdx);
// By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
EltMaskIdx = Elt.getConstantOperandVal(1);
// Stop at the first lane that is not "element EltIdx of V1"; Elt, EltIdx and
// EltMaskIdx then describe that lane for the INSERTPS path below.
if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
break;
Mask[EltIdx] = EltIdx;
}
if (EltIdx == 4) {
// Let the shuffle legalizer deal with blend operations.
// Use an undef second operand when every zeroable lane is actually undef.
SDValue VZeroOrUndef = (Zeroable == Undefs)
? DAG.getUNDEF(VT)
: getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
if (V1.getSimpleValueType() != VT)
V1 = DAG.getBitcast(VT, V1);
return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
}
// See if we can lower this build_vector to a INSERTPS.
if (!Subtarget.hasSSE41())
return SDValue();
// V2 is the source vector of the lane that broke the blend pattern.
SDValue V2 = Elt.getOperand(0);
// If the mismatching lane is the first non-zero one, V1 is not known yet; it
// is taken from the first remaining non-zeroable lane in the loop below.
if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
V1 = SDValue();
bool CanFold = true;
for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
if (Zeroable[i])
continue;
SDValue Current = Op->getOperand(i);
SDValue SrcVector = Current->getOperand(0);
if (!V1.getNode())
V1 = SrcVector;
// Every remaining lane must be element i of the same V1.
CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
}
if (!CanFold)
return SDValue();
assert(V1.getNode() && "Expected at least two non-zero elements!");
if (V1.getSimpleValueType() != MVT::v4f32)
V1 = DAG.getBitcast(MVT::v4f32, V1);
if (V2.getSimpleValueType() != MVT::v4f32)
V2 = DAG.getBitcast(MVT::v4f32, V2);
// Ok, we can emit an INSERTPS instruction.
// Immediate layout built here: source element index in bits 7:6, destination
// lane in bits 5:4, zeroable-lane mask in bits 3:0.
unsigned ZMask = Zeroable.to_ulong();
unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
SDLoc DL(Op);
SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
DAG.getIntPtrConstant(InsertPSMask, DL));
return DAG.getBitcast(VT, Result);
}
/// Return a vector logical shift node.
///
/// The 128-bit \p SrcOp is shifted by whole bytes (\p NumBits must be a
/// multiple of 8) using VSHLDQ when \p isLeft is set and VSRLDQ otherwise, and
/// the result is bitcast to \p VT. \p TLI is not read by this function.
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
                         SelectionDAG &DAG, const TargetLowering &TLI,
                         const SDLoc &dl) {
  assert(VT.is128BitVector() && "Unknown type for VShift");

  const MVT ByteVecVT = MVT::v16i8;
  const unsigned ShiftOpc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
  SDValue Bytes = DAG.getBitcast(ByteVecVT, SrcOp);

  assert(NumBits % 8 == 0 && "Only support byte sized shifts");
  SDValue ByteCount = DAG.getConstant(NumBits / 8, dl, MVT::i8);

  SDValue Shifted = DAG.getNode(ShiftOpc, dl, ByteVecVT, Bytes, ByteCount);
  return DAG.getBitcast(VT, Shifted);
}
/// Try to widen a scalar stack load into a vector load and splat the loaded
/// element with a shuffle.
///
/// Only non-volatile normal loads of i32/f32 whose address is a frame index,
/// optionally plus a constant offset, are handled; a constant offset is
/// absorbed into the shuffle mask. The stack object's alignment may be raised
/// as a side effect. Returns an empty SDValue when the pattern does not apply.
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
                                      SelectionDAG &DAG) {
  LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp);
  if (!LD)
    return SDValue();

  SDValue Ptr = LD->getBasePtr();
  if (!ISD::isNormalLoad(LD) || LD->isVolatile())
    return SDValue();

  EVT PVT = LD->getValueType(0);
  if (PVT != MVT::i32 && PVT != MVT::f32)
    return SDValue();

  // Decompose the address into a frame index and a constant byte offset.
  int FI = -1;
  int64_t Offset = 0;
  if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
    FI = FINode->getIndex();
    Offset = 0;
  } else if (DAG.isBaseWithConstantOffset(Ptr) &&
             isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
    FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
    Offset = Ptr.getConstantOperandVal(1);
    Ptr = Ptr.getOperand(0);
  } else {
    return SDValue();
  }

  // FIXME: 256-bit vector instructions don't require a strict alignment,
  // improve this code to support it better.
  unsigned RequiredAlign = VT.getSizeInBits() / 8;
  SDValue Chain = LD->getChain();

  // Make sure the stack object alignment is at least 16 or 32.
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
    // Can't change the alignment of a fixed object. FIXME: It's possible to
    // compute the exact stack offset and reference FI + adjust offset instead.
    // If someone *really* cares about this. That's the way to implement it.
    if (MFI.isFixedObjectIndex(FI))
      return SDValue();
    MFI.setObjectAlignment(FI, RequiredAlign);
  }

  // (Offset % 16 or 32) must be multiple of 4. The vector load address is
  // then Ptr + (Offset & ~15).
  if (Offset < 0)
    return SDValue();
  if ((Offset % RequiredAlign) & 3)
    return SDValue();

  int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
  if (StartOffset) {
    SDLoc DL(Ptr);
    Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
                      DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
  }

  // Index of the wanted 4-byte element inside the widened load.
  int EltNo = (Offset - StartOffset) >> 2;
  unsigned NumElems = VT.getVectorNumElements();

  EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
  SDValue WideLoad =
      DAG.getLoad(NVT, dl, Chain, Ptr,
                  LD->getPointerInfo().getWithOffset(StartOffset));

  SmallVector<int, 8> Mask(NumElems, EltNo);
  return DAG.getVectorShuffle(NVT, dl, WideLoad, DAG.getUNDEF(NVT), Mask);
}
/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
/// elements can be replaced by a single large load which has the same value as
/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
///
/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
///
/// Each element (looked at through bitcasts) must be undef, zero, or a
/// non-extending load. The strategies tried are, in order: one full-width load
/// (optionally shuffled with zero), a half-width load for 256/512-bit vectors
/// whose upper half is undef, a VZEXT_LOAD, and a broadcast of a repeated load
/// pattern. Returns an empty SDValue if none applies.
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
const SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
bool isAfterLegalize) {
unsigned NumElems = Elts.size();
int LastLoadedElt = -1;
// One bit per element, classifying it as a load, a zero or an undef.
APInt LoadMask = APInt::getNullValue(NumElems);
APInt ZeroMask = APInt::getNullValue(NumElems);
APInt UndefMask = APInt::getNullValue(NumElems);
SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
// For each element in the initializer, see if we've found a load, zero or an
// undef.
for (unsigned i = 0; i < NumElems; ++i) {
SDValue Elt = peekThroughBitcasts(Elts[i]);
if (!Elt.getNode())
return SDValue();
if (Elt.isUndef()) {
UndefMask.setBit(i);
continue;
}
if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
ZeroMask.setBit(i);
continue;
}
// Each loaded element must be the correct fractional portion of the
// requested vector load.
if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
return SDValue();
if (!ISD::isNON_EXTLoad(Elt.getNode()))
return SDValue();
Loads[i] = cast<LoadSDNode>(Elt);
LoadMask.setBit(i);
LastLoadedElt = i;
}
assert((ZeroMask.countPopulation() + UndefMask.countPopulation() +
LoadMask.countPopulation()) == NumElems &&
"Incomplete element masks");
// Handle Special Cases - all undef or undef/zero.
if (UndefMask.countPopulation() == NumElems)
return DAG.getUNDEF(VT);
// FIXME: Should we return this as a BUILD_VECTOR instead?
if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)
return VT.isInteger() ? DAG.getConstant(0, DL, VT)
: DAG.getConstantFP(0.0, DL, VT);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// At least one load exists here, so the lowest set bit of LoadMask is the
// index of the first loaded element.
int FirstLoadedElt = LoadMask.countTrailingZeros();
SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
EVT EltBaseVT = EltBase.getValueType();
assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
"Register/Memory size mismatch");
LoadSDNode *LDBase = Loads[FirstLoadedElt];
assert(LDBase && "Did not find base load for merging consecutive loads");
unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
unsigned BaseSizeInBytes = BaseSizeInBits / 8;
// Number of bits spanned from the first to the last loaded element.
int LoadSizeInBits = (1 + LastLoadedElt - FirstLoadedElt) * BaseSizeInBits;
assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
// Consecutive loads can contain UNDEFS but not ZERO elements.
// Consecutive loads with UNDEFs and ZEROs elements require a
// an additional shuffle stage to clear the ZERO elements.
bool IsConsecutiveLoad = true;
bool IsConsecutiveLoadWithZeros = true;
for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
if (LoadMask[i]) {
if (!DAG.areNonVolatileConsecutiveLoads(Loads[i], LDBase, BaseSizeInBytes,
i - FirstLoadedElt)) {
IsConsecutiveLoad = false;
IsConsecutiveLoadWithZeros = false;
break;
}
} else if (ZeroMask[i]) {
IsConsecutiveLoad = false;
}
}
// Creates one load of type VT from LDBase's chain, base pointer, pointer
// info, alignment and flags, then ties the memory ordering of every original
// load to the new load.
auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
auto MMOFlags = LDBase->getMemOperand()->getFlags();
assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
"Cannot merge volatile loads.");
SDValue NewLd =
DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
for (auto *LD : Loads)
if (LD)
DAG.makeEquivalentMemoryOrdering(LD, NewLd);
return NewLd;
};
// Check if the base load is entirely dereferenceable.
bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
// LOAD - all consecutive load/undefs (must start/end with a load or be
// entirely dereferenceable). If we have found an entire vector of loads and
// undefs, then return a large load of the entire vector width starting at the
// base pointer. If the vector contains zeros, then attempt to shuffle those
// elements.
if (FirstLoadedElt == 0 &&
(LastLoadedElt == (int)(NumElems - 1) || IsDereferenceable) &&
(IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
return SDValue();
// Don't create 256-bit non-temporal aligned loads without AVX2 as these
// will lower to regular temporal loads and use the cache.
if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
VT.is256BitVector() && !Subtarget.hasInt256())
return SDValue();
// A single element is already the load we want; just retype it.
if (NumElems == 1)
return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
if (!ZeroMask)
return CreateLoad(VT, LDBase);
// IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
// vector and a zero vector to clear out the zero elements.
if (!isAfterLegalize && VT.isVector()) {
// Lanes default to undef (-1); zero lanes pick from the second (zero)
// operand, loaded lanes from the first.
SmallVector<int, 4> ClearMask(NumElems, -1);
for (unsigned i = 0; i < NumElems; ++i) {
if (ZeroMask[i])
ClearMask[i] = i + NumElems;
else if (LoadMask[i])
ClearMask[i] = i;
}
SDValue V = CreateLoad(VT, LDBase);
SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
: DAG.getConstantFP(0.0, DL, VT);
return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
}
}
// If the upper half of a ymm/zmm load is undef then just load the lower half.
if (VT.is256BitVector() || VT.is512BitVector()) {
unsigned HalfNumElems = NumElems / 2;
if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) {
EVT HalfVT =
EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
// Recurse on the lower half of the elements only.
SDValue HalfLD =
EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
DAG, Subtarget, isAfterLegalize);
if (HalfLD)
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
HalfLD, DAG.getIntPtrConstant(0, DL));
}
}
// VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
(LoadSizeInBits == 32 || LoadSizeInBits == 64) &&
((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
: MVT::getIntegerVT(LoadSizeInBits);
MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
if (TLI.isTypeLegal(VecVT)) {
SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
SDValue ResNode =
DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
LDBase->getPointerInfo(),
LDBase->getAlignment(),
MachineMemOperand::MOLoad);
for (auto *LD : Loads)
if (LD)
DAG.makeEquivalentMemoryOrdering(LD, ResNode);
return DAG.getBitcast(VT, ResNode);
}
}
// BROADCAST - match the smallest possible repetition pattern, load that
// scalar/subvector element and then broadcast to the entire vector.
if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
// Try repetition lengths of 1, 2, 4, ... elements.
for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
unsigned RepeatSize = SubElems * BaseSizeInBits;
unsigned ScalarSize = std::min(RepeatSize, 64u);
if (!Subtarget.hasAVX2() && ScalarSize < 32)
continue;
bool Match = true;
// RepeatedLoads[j] collects the load seen at positions congruent to j
// modulo SubElems; a conflicting load at such a position breaks the match.
SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
for (unsigned i = 0; i != NumElems && Match; ++i) {
if (!LoadMask[i])
continue;
SDValue Elt = peekThroughBitcasts(Elts[i]);
if (RepeatedLoads[i % SubElems].isUndef())
RepeatedLoads[i % SubElems] = Elt;
else
Match &= (RepeatedLoads[i % SubElems] == Elt);
}
// We must have loads at both ends of the repetition.
Match &= !RepeatedLoads.front().isUndef();
Match &= !RepeatedLoads.back().isUndef();
if (!Match)
continue;
EVT RepeatVT =
VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
: EVT::getFloatingPointVT(ScalarSize);
if (RepeatSize > ScalarSize)
RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
RepeatSize / ScalarSize);
EVT BroadcastVT =
EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
VT.getSizeInBits() / ScalarSize);
if (TLI.isTypeLegal(BroadcastVT)) {
// Recurse to form the load of one repetition, then broadcast it.
if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
RepeatVT, RepeatedLoads, DL, DAG, Subtarget, isAfterLegalize)) {
unsigned Opcode = RepeatSize > ScalarSize ? X86ISD::SUBV_BROADCAST
: X86ISD::VBROADCAST;
SDValue Broadcast = DAG.getNode(Opcode, DL, BroadcastVT, RepeatLoad);
return DAG.getBitcast(VT, Broadcast);
}
}
}
}
return SDValue();
}
// Combine vector ops (shuffles etc.) that are equal to build_vector load1,
// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
// are consecutive, non-overlapping, and in the right order.
//
// Every scalar element of N is resolved through getShuffleScalarElt; if any
// element cannot be resolved an empty SDValue is returned. Otherwise the
// collected elements are handed to EltsFromConsecutiveLoads.
static SDValue combineToConsecutiveLoads(EVT VT, SDNode *N, const SDLoc &DL,
                                         SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget,
                                         bool isAfterLegalize) {
  SmallVector<SDValue, 64> Elts;
  const unsigned NumElts = VT.getVectorNumElements();
  for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
    SDValue Scalar = getShuffleScalarElt(N, EltIdx, DAG, 0);
    if (!Scalar)
      return SDValue();
    Elts.push_back(Scalar);
  }

  assert(Elts.size() == VT.getVectorNumElements());
  return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
                                  isAfterLegalize);
}
/// Split the \p SplatBitSize-bit pattern \p SplatValue into constants of
/// \p VT's scalar type and return them as a constant vector.
///
/// Element i is taken from bits [i * ScalarSize, (i + 1) * ScalarSize) of
/// \p SplatValue. Floating-point scalar types must be 32 or 64 bits wide.
static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
                                   unsigned SplatBitSize, LLVMContext &C) {
  const unsigned EltBits = VT.getScalarSizeInBits();
  const unsigned NumPieces = SplatBitSize / EltBits;

  SmallVector<Constant *, 32> Pieces;
  for (unsigned Piece = 0; Piece < NumPieces; Piece++) {
    APInt Bits = SplatValue.extractBits(EltBits, EltBits * Piece);

    if (!VT.isFloatingPoint()) {
      Pieces.push_back(
          Constant::getIntegerValue(Type::getIntNTy(C, EltBits), Bits));
      continue;
    }

    if (EltBits == 32) {
      Pieces.push_back(ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Bits)));
      continue;
    }

    assert(EltBits == 64 && "Unsupported floating point scalar size");
    Pieces.push_back(ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Bits)));
  }

  return ConstantVector::get(ArrayRef<Constant *>(Pieces));
}
/// Walk the users of \p N and decide whether \p N feeds a target shuffle.
///
/// The decision is taken at the first user that matches one of the cases
/// below; bitcast users are looked through recursively. Returns false when
/// \p N has no users or no case matches.
static bool isFoldableUseOfShuffle(SDNode *N) {
  for (auto *User : N->uses()) {
    const unsigned UserOpc = User->getOpcode();

    // VPERMV/VPERMV3 shuffles can never fold their index operands.
    const bool IsVPERMVIndex =
        UserOpc == X86ISD::VPERMV && User->getOperand(0).getNode() == N;
    if (IsVPERMVIndex)
      return false;
    const bool IsVPERMV3Index =
        UserOpc == X86ISD::VPERMV3 && User->getOperand(1).getNode() == N;
    if (IsVPERMV3Index)
      return false;

    if (isTargetShuffle(UserOpc))
      return true;

    // Ignore bitcasts: the answer is whatever holds for the bitcast itself.
    if (UserOpc == ISD::BITCAST)
      return isFoldableUseOfShuffle(User);

    if (N->hasOneUse())
      return true;
  }
  return false;
}
/// Check whether a build vector is a splat of a zero-extended value.
///
/// The first operand must repeat at a fixed power-of-two stride greater than
/// one, with every operand in between being zero or undef. For example
/// (a,0,0,0,a,0,0,0,a,0,0,0,a,0,0,0) returns a.
///
/// \param NumElt  set to the number of operands divided by the stride.
/// \param EltType set to the integer type whose width is the build vector's
///                scalar width times the stride.
/// \returns the first operand on success, or an empty SDValue otherwise (in
///          which case \p NumElt and \p EltType are left untouched).
static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op,
                                   unsigned &NumElt, MVT &EltType) {
  auto IsZeroOrUndef = [](SDValue V) {
    return V.isUndef() || isNullConstant(V);
  };

  SDValue Repeated = Op->getOperand(0);
  const unsigned NumOps = Op->getNumOperands();

  // Find the distance to the next occurrence of the first operand; everything
  // before it has to be zero or undef.
  unsigned Stride = NumOps;
  for (unsigned OpIdx = 1; OpIdx < NumOps; OpIdx++) {
    SDValue Cur = Op->getOperand(OpIdx);
    if (Cur == Repeated) {
      Stride = OpIdx;
      break;
    }
    if (!IsZeroOrUndef(Cur))
      return SDValue();
  }

  if (Stride == 1 || !isPowerOf2_32(Stride))
    return SDValue();

  // Verify the pattern repeats across the rest of the operands.
  for (unsigned OpIdx = Stride; OpIdx < NumOps; OpIdx++) {
    SDValue Cur = Op->getOperand(OpIdx);
    if (OpIdx % Stride == 0) {
      if (Cur != Repeated)
        return SDValue();
      continue;
    }
    if (!IsZeroOrUndef(Cur))
      return SDValue();
  }

  const unsigned ScalarBits = Op->getSimpleValueType(0).getScalarSizeInBits();
  EltType = MVT::getIntegerVT(ScalarBits * Stride);
  NumElt = NumOps / Stride;
  return Repeated;
}
/// Attempt to use the vbroadcast instruction to generate a splat value
/// from a splat BUILD_VECTOR which uses:
/// a. A single scalar load, or a constant.
/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
///
/// The VBROADCAST node is returned when a pattern is found,
/// or SDValue() otherwise.
///
/// Depending on the pattern, the returned node may instead be a bitcast of a
/// VBROADCASTM, VBROADCAST or SUBV_BROADCAST node.
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// VBROADCAST requires AVX.
// TODO: Splats could be generated for non-AVX CPUs using SSE
// instructions, but there's less potential gain for only 128-bit vectors.
if (!Subtarget.hasAVX())
return SDValue();
MVT VT = BVOp->getSimpleValueType(0);
SDLoc dl(BVOp);
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
"Unsupported vector type for broadcast.");
BitVector UndefElements;
// Ld is the splatted value, or empty if BVOp is not a splat.
SDValue Ld = BVOp->getSplatValue(&UndefElements);
// Attempt to use VBROADCASTM
// From this pattern:
// a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
// b. t1 = (build_vector t0 t0)
//
// Create (VBROADCASTM v2i1 X)
if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
// EltType and NumElts are overwritten by isSplatZeroExtended on success.
MVT EltType = VT.getScalarType();
unsigned NumElts = VT.getVectorNumElements();
SDValue BOperand;
SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
(Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
if (ZeroExtended)
BOperand = ZeroExtended.getOperand(0);
else
BOperand = Ld.getOperand(0).getOperand(0);
MVT MaskVT = BOperand.getSimpleValueType();
if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
(EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
SDValue Brdcst =
DAG.getNode(X86ISD::VBROADCASTM, dl,
MVT::getVectorVT(EltType, NumElts), BOperand);
return DAG.getBitcast(VT, Brdcst);
}
}
}
unsigned NumElts = VT.getVectorNumElements();
unsigned NumUndefElts = UndefElements.count();
// Either there is no splat value, or at most one element is defined: look
// for a repeated constant pattern before considering a scalar broadcast.
if (!Ld || (NumElts - NumUndefElts) <= 1) {
APInt SplatValue, Undef;
unsigned SplatBitSize;
bool HasUndef;
// Check if this is a repeated constant pattern suitable for broadcasting.
if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
SplatBitSize > VT.getScalarSizeInBits() &&
SplatBitSize < VT.getSizeInBits()) {
// Avoid replacing with broadcast when it's a use of a shuffle
// instruction to preserve the present custom lowering of shuffles.
if (isFoldableUseOfShuffle(BVOp))
return SDValue();
// replace BUILD_VECTOR with broadcast of the repeated constants.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
LLVMContext *Ctx = DAG.getContext();
MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
if (Subtarget.hasAVX()) {
if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
!(SplatBitSize == 64 && Subtarget.is32Bit())) {
// Splatted value can fit in one INTEGER constant in constant pool.
// Load the constant and broadcast it.
MVT CVT = MVT::getIntegerVT(SplatBitSize);
Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
SDValue CP = DAG.getConstantPool(C, PVT);
unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
Ld = DAG.getLoad(
CVT, dl, DAG.getEntryNode(), CP,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
Alignment);
SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
MVT::getVectorVT(CVT, Repeat), Ld);
return DAG.getBitcast(VT, Brdcst);
} else if (SplatBitSize == 32 || SplatBitSize == 64) {
// Splatted value can fit in one FLOAT constant in constant pool.
// Load the constant and broadcast it.
// AVX have support for 32 and 64 bit broadcast for floats only.
// No 64bit integer in 32bit subtarget.
MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
// Lower the splat via APFloat directly, to avoid any conversion.
Constant *C =
SplatBitSize == 32
? ConstantFP::get(*Ctx,
APFloat(APFloat::IEEEsingle(), SplatValue))
: ConstantFP::get(*Ctx,
APFloat(APFloat::IEEEdouble(), SplatValue));
SDValue CP = DAG.getConstantPool(C, PVT);
unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
Ld = DAG.getLoad(
CVT, dl, DAG.getEntryNode(), CP,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
Alignment);
SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
MVT::getVectorVT(CVT, Repeat), Ld);
return DAG.getBitcast(VT, Brdcst);
} else if (SplatBitSize > 64) {
// Load the vector of constants and broadcast it.
MVT CVT = VT.getScalarType();
Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
*Ctx);
SDValue VCP = DAG.getConstantPool(VecC, PVT);
unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
Ld = DAG.getLoad(
MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
Alignment);
SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
return DAG.getBitcast(VT, Brdcst);
}
}
}
// If we are moving a scalar into a vector (Ld must be set and all elements
// but 1 are undef) and that operation is not obviously supported by
// vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
// That's better than general shuffling and may eliminate a load to GPR and
// move from scalar to vector register.
if (!Ld || NumElts - NumUndefElts != 1)
return SDValue();
unsigned ScalarSize = Ld.getValueSizeInBits();
// Bail out when element 0 is the defined one and the scalar is 32 or 64 bits
// wide.
if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
return SDValue();
}
// From here on Ld is known to be non-empty.
bool ConstSplatVal =
(Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
// Make sure that all of the users of a non-constant load are from the
// BUILD_VECTOR node.
if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
return SDValue();
unsigned ScalarSize = Ld.getValueSizeInBits();
bool IsGE256 = (VT.getSizeInBits() >= 256);
// When optimizing for size, generate up to 5 extra bytes for a broadcast
// instruction to save 8 or more bytes of constant pool data.
// TODO: If multiple splats are generated to load the same constant,
// it may be detrimental to overall size. There needs to be a way to detect
// that condition to know if this is truly a size win.
bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
// Handle broadcasting a single constant scalar from the constant pool
// into a vector.
// On Sandybridge (no AVX2), it is still better to load a constant vector
// from the constant pool and not to broadcast it from a scalar.
// But override that restriction when optimizing for size.
// TODO: Check if splatting is recommended for other AVX-capable CPUs.
if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
EVT CVT = Ld.getValueType();
assert(!CVT.isVector() && "Must not broadcast a vector type");
// Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
// For size optimization, also splat v2f64 and v2i64, and for size opt
// with AVX2, also splat i8 and i16.
// With pattern matching, the VBROADCAST node may become a VMOVDDUP.
if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
(OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
const Constant *C = nullptr;
if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
C = CI->getConstantIntValue();
else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
C = CF->getConstantFPValue();
assert(C && "Invalid constant type");
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue CP =
DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
Ld = DAG.getLoad(
CVT, dl, DAG.getEntryNode(), CP,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
Alignment);
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
}
}
bool IsLoad = ISD::isNormalLoad(Ld.getNode());
// Handle AVX2 in-register broadcasts.
if (!IsLoad && Subtarget.hasInt256() &&
(ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
// The scalar source must be a normal load.
if (!IsLoad)
return SDValue();
if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
(Subtarget.hasVLX() && ScalarSize == 64))
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
// The integer check is needed for the 64-bit into 128-bit so it doesn't match
// double since there is no vbroadcastsd xmm
if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
}
// Unsupported broadcast.
return SDValue();
}
/// For an EXTRACT_VECTOR_ELT with a constant index return the real
/// underlying vector and index.
///
/// When \p ExtractedFromVec is a vector shuffle whose mask entry for the
/// extracted lane is undef or refers to the shuffle's first operand,
/// \p ExtractedFromVec is rewritten to that operand and the mask entry is
/// returned. In every other case \p ExtractedFromVec is left alone and the
/// original constant index is returned.
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
                                         SDValue ExtIdx) {
  const int ExtractIdx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();

  auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(ExtractedFromVec);
  if (!Shuffle)
    return ExtractIdx;

  // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
  // lowered this:
  //   (extract_vector_elt (v8f32 %1), Constant<6>)
  // to:
  //   (extract_vector_elt (vector_shuffle<2,u,u,u>
  //                           (extract_subvector (v8f32 %0), Constant<4>),
  //                           undef)
  //                       Constant<0>)
  // In this case the vector is the extract_subvector expression and the index
  // is 2, as specified by the shuffle.
  SDValue ShuffleSrc = Shuffle->getOperand(0);
  MVT ShuffleSrcVT = ShuffleSrc.getSimpleValueType();
  assert(ShuffleSrcVT.getVectorElementType() ==
         ExtractedFromVec.getSimpleValueType().getVectorElementType());

  const int MaskElt = Shuffle->getMaskElt(ExtractIdx);
  if (!isUndefOrInRange(MaskElt, 0, ShuffleSrcVT.getVectorNumElements()))
    return ExtractIdx;

  ExtractedFromVec = ShuffleSrc;
  return MaskElt;
}
/// Lower a build_vector whose operands are mostly constant-index extracts from
/// at most two vectors of the same type as a shuffle of those vectors,
/// followed by INSERT_VECTOR_ELT for the operands that are not extracts.
///
/// Returns an empty SDValue when INSERT_VECTOR_ELT is neither legal nor custom
/// for the type, when an extract has a non-constant index or a source of a
/// different type, when more than two source vectors are involved, when a
/// third non-extract operand is encountered, or when no operand is an extract.
static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  // Skip if insert_vec_elt is not supported.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
    return SDValue();

  SDLoc DL(Op);
  const unsigned NumElems = Op.getNumOperands();

  SDValue VecIn1;
  SDValue VecIn2;
  SmallVector<unsigned, 4> InsertIndices;
  SmallVector<int, 8> Mask(NumElems, -1);

  for (unsigned Lane = 0; Lane != NumElems; ++Lane) {
    SDValue Elt = Op.getOperand(Lane);
    const unsigned EltOpc = Elt.getOpcode();

    // Undef lanes keep their -1 mask entry.
    if (EltOpc == ISD::UNDEF)
      continue;

    if (EltOpc != ISD::EXTRACT_VECTOR_ELT) {
      // Give up once two lanes are already queued for insertion.
      if (InsertIndices.size() > 1)
        return SDValue();
      InsertIndices.push_back(Lane);
      continue;
    }

    SDValue SrcVec = Elt.getOperand(0);
    SDValue ExtIdx = Elt.getOperand(1);

    // Quit if non-constant index.
    if (!isa<ConstantSDNode>(ExtIdx))
      return SDValue();
    int SrcIdx = getUnderlyingExtractedFromVec(SrcVec, ExtIdx);

    // Quit if extracted from vector of different type.
    if (SrcVec.getValueType() != VT)
      return SDValue();

    // Record the source as the first or second shuffle input; quit if it
    // would be a third one.
    if (!VecIn1.getNode())
      VecIn1 = SrcVec;
    else if (VecIn1 != SrcVec && !VecIn2.getNode())
      VecIn2 = SrcVec;
    else if (VecIn1 != SrcVec && VecIn2 != SrcVec)
      return SDValue();

    if (SrcVec == VecIn1)
      Mask[Lane] = SrcIdx;
    else if (SrcVec == VecIn2)
      Mask[Lane] = SrcIdx + NumElems;
  }

  if (!VecIn1.getNode())
    return SDValue();

  if (!VecIn2.getNode())
    VecIn2 = DAG.getUNDEF(VT);
  SDValue Result = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);

  for (unsigned Lane : InsertIndices)
    Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Result,
                         Op.getOperand(Lane), DAG.getIntPtrConstant(Lane, DL));

  return Result;
}
/// Pack a constant build_vector with 1-bit elements into an integer constant.
///
/// Operand number i supplies bit i of the result (only its lowest bit is
/// used); undef operands contribute a zero bit. The integer type is as wide as
/// \p Op's value size, but never narrower than 8 bits.
static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
         Op.getScalarValueSizeInBits() == 1 &&
         "Can not convert non-constant vector");

  uint64_t Packed = 0;
  const unsigned NumBits = Op.getNumOperands();
  for (unsigned BitNo = 0; BitNo != NumBits; ++BitNo) {
    SDValue Elt = Op.getOperand(BitNo);
    if (Elt.isUndef())
      continue;
    const uint64_t Bit = cast<ConstantSDNode>(Elt)->getZExtValue() & 0x1;
    Packed |= Bit << BitNo;
  }

  const int ResultBits = std::max((int)Op.getValueSizeInBits(), 8);
  SDLoc dl(Op);
  return DAG.getConstant(Packed, dl, MVT::getIntegerVT(ResultBits));
}
// Lower a BUILD_VECTOR whose element type is i1.
//
// All-zeros and all-ones vectors are returned unchanged. A fully constant
// vector becomes a bitcast of an integer immediate (v64i1 on a non-64-bit
// subtarget is split into two v32i1 halves first). A vector whose defined
// operands are all the same value becomes a select between all-ones and
// all-zeros. Everything else starts from the constant part and inserts the
// non-constant operands one at a time.
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  assert((VT.getVectorElementType() == MVT::i1) &&
         "Unexpected type in LowerBUILD_VECTORvXi1!");

  SDLoc dl(Op);

  if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
      ISD::isBuildVectorAllOnes(Op.getNode()))
    return Op;

  // Reinterpret an immediate as a value of type VT. When the sizes differ the
  // immediate is viewed as v8i1 and the low subvector is extracted.
  auto ImmToMaskVec = [&](SDValue Imm) -> SDValue {
    if (Imm.getValueSizeInBits() == VT.getSizeInBits())
      return DAG.getBitcast(VT, Imm);
    SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
                       DAG.getIntPtrConstant(0, dl));
  };

  if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
    if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
      // Split the pieces.
      SDValue Lower =
          DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(0, 32));
      SDValue Upper =
          DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(32, 32));
      // Both halves are lowered by hand so that getNode does not try to
      // reassemble the build_vector.
      Lower = LowerBUILD_VECTORvXi1(Lower, DAG, Subtarget);
      Upper = LowerBUILD_VECTORvXi1(Upper, DAG, Subtarget);
      return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lower, Upper);
    }
    return ImmToMaskVec(ConvertI1VectorToInteger(Op, DAG));
  }

  // From here on the vector has at least one non-constant operand.
  uint64_t ConstBits = 0;                   // Bit i = constant operand i.
  SmallVector<unsigned, 16> VariableElts;   // Indices of non-constant operands.
  bool AllSame = true;                      // Every defined operand identical?
  bool SawConstant = false;
  int FirstDefined = -1;                    // Index of first non-undef operand.
  const unsigned NumOps = Op.getNumOperands();
  for (unsigned EltNo = 0; EltNo != NumOps; ++EltNo) {
    SDValue Elt = Op.getOperand(EltNo);
    if (Elt.isUndef())
      continue;

    if (auto *C = dyn_cast<ConstantSDNode>(Elt)) {
      ConstBits |= (C->getZExtValue() & 0x1) << EltNo;
      SawConstant = true;
    } else {
      VariableElts.push_back(EltNo);
    }

    if (FirstDefined < 0)
      FirstDefined = EltNo;
    else if (Elt != Op.getOperand(FirstDefined))
      AllSame = false;
  }

  // A splat is lowered as "(select i1 splat_elt, all-ones, all-zeroes)".
  if (AllSame)
    return DAG.getSelect(dl, VT, Op.getOperand(FirstDefined),
                         DAG.getConstant(1, dl, VT),
                         DAG.getConstant(0, dl, VT));

  // Materialize the constant part: the packed bits when any is set, a zero
  // vector when the constants are all zero, undef when there are none.
  SDValue Imm;
  if (ConstBits) {
    MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
    Imm = DAG.getConstant(ConstBits, dl, ImmVT);
  } else if (SawConstant) {
    Imm = DAG.getConstant(0, dl, VT);
  } else {
    Imm = DAG.getUNDEF(VT);
  }
  SDValue DstVec = ImmToMaskVec(Imm);

  // Insert the non-constant operands one by one.
  for (unsigned EltNo : VariableElts)
    DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
                         Op.getOperand(EltNo),
                         DAG.getIntPtrConstant(EltNo, dl));
  return DstVec;
}
/// This is a helper function of LowerToHorizontalOp().
/// This function checks that the build_vector \p N in input implements a
/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
/// may not match the layout of an x86 256-bit horizontal instruction.
/// In other words, if this returns true, then some extraction/insertion will
/// be required to produce a valid horizontal instruction.
///
/// Parameter \p Opcode defines the kind of horizontal operation to match.
/// For example, if \p Opcode is equal to ISD::ADD, then this function
/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
/// is equal to ISD::SUB, then this function checks if this is a horizontal
/// arithmetic sub.
///
/// This function only analyzes elements of \p N whose indices are
/// in range [BaseIdx, LastIdx).
///
/// TODO: This function was originally used to match both real and fake partial
/// horizontal operations, but the index-matching logic is incorrect for that.
/// See the corrected implementation in isHopBuildVector(). Can we reduce this
/// code because it is only used for partial h-op matching now?
static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
SelectionDAG &DAG,
unsigned BaseIdx, unsigned LastIdx,
SDValue &V0, SDValue &V1) {
EVT VT = N->getValueType(0);
assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
"Invalid Vector in input!");
bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
bool CanFold = true;
unsigned ExpectedVExtractIdx = BaseIdx;
unsigned NumElts = LastIdx - BaseIdx;
V0 = DAG.getUNDEF(VT);
V1 = DAG.getUNDEF(VT);
// Check if N implements a horizontal binop.
for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
SDValue Op = N->getOperand(i + BaseIdx);
// Skip UNDEFs.
if (Op->isUndef()) {
// Update the expected vector extract index.
if (i * 2 == NumElts)
ExpectedVExtractIdx = BaseIdx;
ExpectedVExtractIdx += 2;
continue;
}
CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
if (!CanFold)
break;
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
// Try to match the following pattern:
// (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
Op0.getOperand(0) == Op1.getOperand(0) &&
isa<ConstantSDNode>(Op0.getOperand(1)) &&
isa<ConstantSDNode>(Op1.getOperand(1)));
if (!CanFold)
break;
unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
if (i * 2 < NumElts) {
if (V0.isUndef()) {
V0 = Op0.getOperand(0);
if (V0.getValueType() != VT)
return false;
}
} else {
if (V1.isUndef()) {
V1 = Op0.getOperand(0);
if (V1.getValueType() != VT)
return false;
}
if (i * 2 == NumElts)
ExpectedVExtractIdx = BaseIdx;
}
SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
if (I0 == ExpectedVExtractIdx)
CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
else if (IsCommutable && I1 == ExpectedVExtractIdx) {
// Try to match the following dag sequence:
// (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
} else
CanFold = false;
ExpectedVExtractIdx += 2;
}
return CanFold;
}
/// Emit two 128-bit horizontal add/sub nodes followed by a concat_vectors.
///
/// Helper of LowerToHorizontalOp(). \p V0 and \p V1 are 256-bit vectors of
/// the same type. Each is split into its lower and upper 128-bit halves, and
/// the halves feed two horizontal binary operations of kind \p X86Opcode.
///
/// \p Mode selects how the halves are paired.
/// When Mode is set, each node works on a single input vector:
///   HADD V0_LO, V0_HI
///   HADD V1_LO, V1_HI
/// Otherwise the nodes combine matching halves of both inputs:
///   HADD V0_LO, V1_LO
///   HADD V0_HI, V1_HI
///
/// If \p isUndefLO is set, the lower 128 bits of the result are UNDEF; if
/// \p isUndefHI is set, the upper 128 bits of the result are UNDEF.
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
                                     const SDLoc &DL, SelectionDAG &DAG,
                                     unsigned X86Opcode, bool Mode,
                                     bool isUndefLO, bool isUndefHI) {
  MVT VT = V0.getSimpleValueType();
  assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
         "Invalid nodes in input!");

  const unsigned HalfElts = VT.getVectorNumElements() / 2;
  SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
  SDValue V0_HI = extract128BitVector(V0, HalfElts, DAG, DL);
  SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
  SDValue V1_HI = extract128BitVector(V1, HalfElts, DAG, DL);
  MVT HalfVT = V0_LO.getSimpleValueType();

  SDValue LO = DAG.getUNDEF(HalfVT);
  SDValue HI = DAG.getUNDEF(HalfVT);

  // Pick the operands of both nodes and work out whether their inputs are
  // undef, in which case no node is emitted for that half.
  SDValue LoLHS = V0_LO, LoRHS, HiLHS, HiRHS = V1_HI;
  bool LoInputsUndef, HiInputsUndef;
  if (Mode) {
    LoRHS = V0_HI;
    HiLHS = V1_LO;
    LoInputsUndef = V0->isUndef();
    HiInputsUndef = V1->isUndef();
  } else {
    LoRHS = V1_LO;
    HiLHS = V0_HI;
    LoInputsUndef = V0_LO->isUndef() && V1_LO->isUndef();
    HiInputsUndef = V0_HI->isUndef() && V1_HI->isUndef();
  }

  // Don't emit a horizontal binop if the result is expected to be UNDEF.
  if (!isUndefLO && !LoInputsUndef)
    LO = DAG.getNode(X86Opcode, DL, HalfVT, LoLHS, LoRHS);
  if (!isUndefHI && !HiInputsUndef)
    HI = DAG.getNode(X86Opcode, DL, HalfVT, HiLHS, HiRHS);

  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
}
/// Returns true iff \p BV builds a vector with the result equivalent to
/// the result of ADDSUB/SUBADD operation.
/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
/// \p Opnd0 and \p Opnd1.
static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
SDValue &Opnd0, SDValue &Opnd1,
unsigned &NumExtracts,
bool &IsSubAdd) {
MVT VT = BV->getSimpleValueType(0);
if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
return false;
unsigned NumElts = VT.getVectorNumElements();
SDValue InVec0 = DAG.getUNDEF(VT);
SDValue InVec1 = DAG.getUNDEF(VT);
NumExtracts = 0;
// Odd-numbered elements in the input build vector are obtained from
// adding/subtracting two integer/float elements.
// Even-numbered elements in the input build vector are obtained from
// subtracting/adding two integer/float elements.
unsigned Opc[2] = {0, 0};
for (unsigned i = 0, e = NumElts; i != e; ++i) {
SDValue Op = BV->getOperand(i);
// Skip 'undef' values.
unsigned Opcode = Op.getOpcode();
if (Opcode == ISD::UNDEF)
continue;
// Early exit if we found an unexpected opcode.
if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
return false;
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
// Try to match the following pattern:
// (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
// Early exit if we cannot match that sequence.
if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(Op0.getOperand(1)) ||
!isa<ConstantSDNode>(Op1.getOperand(1)) ||
Op0.getOperand(1) != Op1.getOperand(1))
return false;
unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
if (I0 != i)
return false;
// We found a valid add/sub node, make sure its the same opcode as previous
// elements for this parity.
if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
return false;
Opc[i % 2] = Opcode;
// Update InVec0 and InVec1.
if (InVec0.isUndef()) {
InVec0 = Op0.getOperand(0);
if (InVec0.getSimpleValueType() != VT)
return false;
}
if (InVec1.isUndef()) {
InVec1 = Op1.getOperand(0);
if (InVec1.getSimpleValueType() != VT)
return false;
}
// Make sure that operands in input to each add/sub node always
// come from a same pair of vectors.
if (InVec0 != Op0.getOperand(0)) {
if (Opcode == ISD::FSUB)
return false;
// FADD is commutable. Try to commute the operands
// and then test again.
std::swap(Op0, Op1);
if (InVec0 != Op0.getOperand(0))
return false;
}
if (InVec1 != Op1.getOperand(0))
return false;
// Increment the number of extractions done.
++NumExtracts;
}
// Ensure we have found an opcode for both parities and that they are
// different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
// inputs are undef.
if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
InVec0.isUndef() || InVec1.isUndef())
return false;
IsSubAdd = Opc[0] == ISD::FADD;
Opnd0 = InVec0;
Opnd1 = InVec1;
return true;
}
/// Returns true if a MUL can be folded together with an idiom that has
/// already been recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
/// operands of FMADDSUB/FMSUBADD are written to \p Opnd0, \p Opnd1 and
/// \p Opnd2.
///
/// The caller is expected to know that some SDNode can potentially be
/// replaced with an X86ISD::ADDSUB of \p Opnd0 and \p Opnd1, and to call this
/// function before doing that replacement. \p Opnd0 therefore still has the
/// uses of the original idiom, and \p ExpectedUses states how many uses of its
/// result 0 there must be for the fold to be accepted.
///
/// For example, this function may be called for the following IR:
///    %AB = fmul fast <2 x double> %A, %B
///    %Sub = fsub fast <2 x double> %AB, %C
///    %Add = fadd fast <2 x double> %AB, %C
///    %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
///                            <2 x i32> <i32 0, i32 3>
/// The def of %Addsub can potentially be replaced by an X86ISD::ADDSUB:
///    %Addsub = X86ISD::ADDSUB %AB, %C
/// and that ADDSUB can in turn be replaced with an FMADDSUB:
///    %Addsub = FMADDSUB %A, %B, %C.
///
/// The check runs before the ADDSUB replacement because that replacement is
/// sometimes illegal. E.g. 512-bit ADDSUB is not available, while 512-bit
/// FMADDSUB is.
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG,
                                 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
                                 unsigned ExpectedUses) {
  // The first ADDSUB operand has to be a multiply with exactly the expected
  // number of uses, and the subtarget has to provide some form of FMA.
  if (Opnd0.getOpcode() != ISD::FMUL)
    return false;
  if (!Opnd0->hasNUsesOfValue(ExpectedUses, 0))
    return false;
  if (!Subtarget.hasAnyFMA())
    return false;

  // FIXME: These checks must match the similar ones in
  // DAGCombiner::visitFADDForFMACombine. It would be good to have one
  // function that would answer if it is Ok to fuse MUL + ADD to FMADD
  // or MUL + ADDSUB to FMADDSUB.
  const TargetOptions &Options = DAG.getTarget().Options;
  if (Options.AllowFPOpFusion != FPOpFusion::Fast && !Options.UnsafeFPMath)
    return false;

  // The old addend becomes the third operand and the multiply is replaced by
  // its two factors.
  Opnd2 = Opnd1;
  Opnd1 = Opnd0.getOperand(1);
  Opnd0 = Opnd0.getOperand(0);
  return true;
}
/// Try to fold a build_vector that performs an 'addsub', 'fmaddsub' or
/// 'fmsubadd' operation into the matching X86ISD::ADDSUB, X86ISD::FMADDSUB or
/// X86ISD::FMSUBADD node. Returns an empty SDValue when no fold applies.
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  SDValue Opnd0, Opnd1;
  unsigned NumExtracts;
  bool IsSubAdd;
  const bool IsIdiom = isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1,
                                        NumExtracts, IsSubAdd);
  if (!IsIdiom)
    return SDValue();

  MVT VT = BV->getSimpleValueType(0);
  SDLoc DL(BV);

  // Prefer the fused form when the first operand is a foldable multiply.
  SDValue Opnd2;
  if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
    unsigned FusedOpc = X86ISD::FMADDSUB;
    if (IsSubAdd)
      FusedOpc = X86ISD::FMSUBADD;
    return DAG.getNode(FusedOpc, DL, VT, Opnd0, Opnd1, Opnd2);
  }

  // Without fusion only ADDSUB is supported, and not for 512-bit types:
  // there are no known X86 targets with 512-bit ADDSUB instructions, so the
  // 512-bit idiom is recognized solely as a step towards FMADDSUB.
  if (IsSubAdd || VT.is512BitVector())
    return SDValue();

  return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}
/// Returns true when every non-undef operand of \p BV has the form
///   op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
/// with indices laid out the way an x86 horizontal add/sub produces them.
///
/// \p HOpcode receives the X86ISD horizontal opcode (it stays
/// ISD::DELETED_NODE when no defined operand fixed one), and \p V0 / \p V1
/// receive the source vectors for the low and high 64 bits of each 128-bit
/// chunk. Both vectors start out as undef of the build_vector type.
static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
                             unsigned &HOpcode, SDValue &V0, SDValue &V1) {
  // Put the outputs into a known state first.
  MVT VT = BV->getSimpleValueType(0);
  HOpcode = ISD::DELETED_NODE;
  V0 = DAG.getUNDEF(VT);
  V1 = DAG.getUNDEF(VT);

  // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
  // half of the result is calculated independently from the 128-bit halves of
  // the inputs, so the index check below works chunk by chunk.
  const unsigned NumElts = VT.getVectorNumElements();
  const unsigned NumChunks = VT.is256BitVector() ? 2 : 1;
  const unsigned ChunkElts = NumElts / NumChunks;
  const unsigned HalfChunkElts = ChunkElts / 2;
  unsigned GenericOpcode = ISD::DELETED_NODE;

  for (unsigned Chunk = 0; Chunk != NumChunks; ++Chunk) {
    const unsigned ChunkBase = Chunk * ChunkElts;
    for (unsigned Pos = 0; Pos != ChunkElts; ++Pos) {
      // Undef operands match anything.
      SDValue Op = BV->getOperand(ChunkBase + Pos);
      if (Op.isUndef())
        continue;

      if (HOpcode == ISD::DELETED_NODE) {
        // The first defined operand chooses the horizontal opcode.
        GenericOpcode = Op.getOpcode();
        if (GenericOpcode == ISD::ADD)
          HOpcode = X86ISD::HADD;
        else if (GenericOpcode == ISD::SUB)
          HOpcode = X86ISD::HSUB;
        else if (GenericOpcode == ISD::FADD)
          HOpcode = X86ISD::FHADD;
        else if (GenericOpcode == ISD::FSUB)
          HOpcode = X86ISD::FHSUB;
        else
          return false;
      } else if (Op.getOpcode() != GenericOpcode) {
        // Every later operand has to use the same scalar opcode.
        return false;
      }

      SDValue Op0 = Op.getOperand(0);
      SDValue Op1 = Op.getOperand(1);
      if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
          Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
          Op0.getOperand(0) != Op1.getOperand(0) ||
          !isa<ConstantSDNode>(Op0.getOperand(1)) ||
          !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
        return false;

      // Which source vector is expected depends on the 64-bit half of the
      // destination chunk being computed. The first use fixes the vector.
      SDValue &SourceVec = (Pos < HalfChunkElts) ? V0 : V1;
      if (SourceVec.isUndef())
        SourceVec = Op0.getOperand(0);
      if (SourceVec != Op0.getOperand(0))
        return false;

      unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
      unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
      unsigned ExpectedIndex = ChunkBase + (Pos % HalfChunkElts) * 2;

      // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
      const bool InOrder =
          ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1;
      // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
      const bool Swapped =
          ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1;
      // Only additions are commutative, so only they may be swapped.
      const bool Commutative =
          GenericOpcode == ISD::ADD || GenericOpcode == ISD::FADD;

      if (!InOrder && !(Commutative && Swapped))
        return false;
    }
  }

  // Matched. The opcode and operands are returned through the references.
  return true;
}
/// Build the horizontal-op node \p HOpcode for build_vector \p BV from the
/// source vectors \p V0 and \p V1 (as found by isHopBuildVector()).
///
/// Sources whose size differs from the result are resized first, and a
/// 256-bit result whose upper half is entirely undef is computed as a 128-bit
/// operation inserted into an undef vector.
static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
                                    SelectionDAG &DAG, unsigned HOpcode,
                                    SDValue V0, SDValue V1) {
  MVT VT = BV->getSimpleValueType(0);
  const unsigned Width = VT.getSizeInBits();
  SDLoc DL(BV);

  // If an input vector is not the same size as the build vector,
  // extract/insert the low bits to the correct size.
  // This is free (examples: zmm --> xmm, xmm --> ymm).
  auto FitToWidth = [&](SDValue V) -> SDValue {
    const unsigned VBits = V.getValueSizeInBits();
    if (VBits > Width)
      return extractSubVector(V, 0, DAG, DL, Width);
    if (VBits < Width)
      return insertSubVector(DAG.getUNDEF(VT), V, 0, DAG, DL, Width);
    return V;
  };
  V0 = FitToWidth(V0);
  V1 = FitToWidth(V1);

  // The upper half is demanded as soon as one of its operands is defined.
  const unsigned NumElts = VT.getVectorNumElements();
  const unsigned HalfNumElts = NumElts / 2;
  bool UpperHalfDemanded = false;
  for (unsigned i = HalfNumElts; i != NumElts; ++i)
    if (!BV->getOperand(i).isUndef())
      UpperHalfDemanded = true;

  // If we don't need the upper xmm, then perform as a xmm hop.
  if (VT.is256BitVector() && !UpperHalfDemanded) {
    MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), HalfNumElts);
    V0 = extractSubVector(V0, 0, DAG, DL, 128);
    V1 = extractSubVector(V1, 0, DAG, DL, 128);
    SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
    return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
  }

  return DAG.getNode(HOpcode, DL, VT, V0, V1);
}
/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
///
/// Native horizontal patterns are tried first. For 256-bit vectors on AVX a
/// second attempt expands the build_vector into two 128-bit horizontal ops
/// joined by a concat_vectors. Returns an empty SDValue when nothing matches.
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  // We need at least 2 non-undef elements to make this worthwhile by default.
  unsigned NumNonUndefs =
      count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
  if (NumNonUndefs < 2)
    return SDValue();

  // There are 4 sets of horizontal math operations distinguished by type:
  // int/FP at 128-bit/256-bit. Each type was introduced with a different
  // subtarget feature. Try to match those "native" patterns first.
  MVT VT = BV->getSimpleValueType(0);
  const bool HasNativeHop =
      ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
      ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
      ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
      ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2());
  if (HasNativeHop) {
    unsigned HOpcode;
    SDValue V0, V1;
    if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
      return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
  }

  // Try harder to match 256-bit ops by using extract/concat.
  if (!Subtarget.hasAVX() || !VT.is256BitVector())
    return SDValue();

  // Count the UNDEF operands in each half of the build_vector.
  const unsigned NumElts = VT.getVectorNumElements();
  const unsigned Half = NumElts / 2;
  unsigned NumUndefsLO = 0;
  unsigned NumUndefsHI = 0;
  for (unsigned i = 0; i != NumElts; ++i) {
    if (!BV->getOperand(i)->isUndef())
      continue;
    if (i < Half)
      ++NumUndefsLO;
    else
      ++NumUndefsHI;
  }

  // A half with a single defined element is better served by a scalar
  // add/sub than by a horizontal op.
  const bool PreferScalar = NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half;
  const bool isUndefLO = NumUndefsLO == Half;
  const bool isUndefHI = NumUndefsHI == Half;

  SDLoc DL(BV);
  SDValue InVec0, InVec1;

  if (VT == MVT::v8i32 || VT == MVT::v16i16) {
    SDValue InVec2, InVec3;

    // Match both halves of the build_vector separately and require that they
    // agree on their sources wherever both sides are defined.
    auto MatchBothHalves = [&](unsigned Opc) {
      return isHorizontalBinOpPart(BV, Opc, DAG, 0, Half, InVec0, InVec1) &&
             isHorizontalBinOpPart(BV, Opc, DAG, Half, NumElts, InVec2,
                                   InVec3) &&
             ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
             ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3);
    };

    unsigned X86Opcode;
    bool CanFold = true;
    if (MatchBothHalves(ISD::ADD))
      X86Opcode = X86ISD::HADD;
    else if (MatchBothHalves(ISD::SUB))
      X86Opcode = X86ISD::HSUB;
    else
      CanFold = false;

    if (CanFold) {
      // Do not try to expand this build_vector into a pair of horizontal
      // add/sub if we can emit a pair of scalar add/sub.
      if (PreferScalar)
        return SDValue();

      // Convert this build_vector into a pair of horizontal binops followed by
      // a concat vector. The outputs of the partial matches are adjusted to
      // account for undefined vector halves.
      SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
      SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
      assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
      return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
                                   isUndefHI);
    }
  }

  if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
      VT == MVT::v16i16) {
    // Match the whole build_vector against one scalar opcode.
    auto MatchWhole = [&](unsigned Opc) {
      return isHorizontalBinOpPart(BV, Opc, DAG, 0, NumElts, InVec0, InVec1);
    };

    unsigned X86Opcode;
    if (MatchWhole(ISD::ADD))
      X86Opcode = X86ISD::HADD;
    else if (MatchWhole(ISD::SUB))
      X86Opcode = X86ISD::HSUB;
    else if (MatchWhole(ISD::FADD))
      X86Opcode = X86ISD::FHADD;
    else if (MatchWhole(ISD::FSUB))
      X86Opcode = X86ISD::FHSUB;
    else
      return SDValue();

    // Don't try to expand this build_vector into a pair of horizontal add/sub
    // if we can simply emit a pair of scalar add/sub.
    if (PreferScalar)
      return SDValue();

    // Convert this build_vector into two horizontal add/sub followed by
    // a concat vector.
    return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
                                 isUndefLO, isUndefHI);
  }

  return SDValue();
}
/// If all source elements of a BUILD_VECTOR apply the same bit operation and
/// the second operand of each is constant, lower to that operation applied to
/// a pair of BUILD_VECTORs (one of the left operands, one of the constants).
/// NOTE: It is not in our interest to grow this into a general purpose
/// vectorizer, but enough scalar bit operations are created by the later
/// legalization + scalarization stages to need basic support.
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  MVT VT = Op->getSimpleValueType(0);
  const unsigned NumElems = VT.getVectorNumElements();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // Every element must use the opcode of the first one.
  // TODO: Should we allow UNDEFS and if so how many?
  const unsigned Opcode = Op->getOperand(0).getOpcode();
  for (unsigned EltNo = 1; EltNo < NumElems; ++EltNo)
    if (Op->getOperand(EltNo).getOpcode() != Opcode)
      return SDValue();

  // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
  const bool IsShift =
      Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA;
  const bool IsLogic =
      Opcode == ISD::AND || Opcode == ISD::XOR || Opcode == ISD::OR;
  if (!IsShift && !IsLogic)
    return SDValue();

  if (IsLogic) {
    // Don't do this if the buildvector is a splat - we'd replace one
    // constant with an entire vector.
    if (Op->getSplatValue())
      return SDValue();
    if (!TLI.isOperationLegalOrPromote(Opcode, VT))
      return SDValue();
  }

  SmallVector<SDValue, 4> LHSElts, RHSElts;
  for (SDValue Elt : Op->ops()) {
    // We expect the canonicalized RHS operand to be the constant.
    SDValue RHS = Elt.getOperand(1);
    if (!isa<ConstantSDNode>(RHS))
      return SDValue();

    // A size mismatch is only tolerated for shift amounts, which are
    // extended or truncated to the element type.
    if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
      if (!IsShift)
        return SDValue();
      RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
    }

    LHSElts.push_back(Elt.getOperand(0));
    RHSElts.push_back(RHS);
  }

  // Limit to shifts by uniform immediates.
  // TODO: Only accept vXi8/vXi64 special cases?
  // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
  if (IsShift)
    for (SDValue Amt : RHSElts)
      if (Amt != RHSElts[0])
        return SDValue();

  SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
  SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
  return DAG.getNode(Opcode, DL, VT, LHS, RHS);
}
/// Create a vector constant without a load. SSE/AVX provide only the bare
/// minimum for this, so the constant must be all zeros, all ones, or some
/// derivation that is cheap to calculate. Returns an empty SDValue for any
/// other build_vector.
static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDNode *N = Op.getNode();

  // Vectors containing all zeros can be matched by pxor and xorps.
  if (ISD::isBuildVectorAllZeros(N)) {
    // The vXi32 forms are the canonical ones and are kept as they are. This
    // 1) ensures the zero vectors are CSE'd and 2) ensures that i64 scalars
    // are eliminated on x86-32 hosts.
    const bool IsCanonicalZero =
        VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32;
    if (IsCanonicalZero)
      return Op;
    return getZeroVector(VT, Subtarget, DAG, DL);
  }

  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
  // vpcmpeqd on 256-bit vectors.
  if (!Subtarget.hasSSE2() || !ISD::isBuildVectorAllOnes(N))
    return SDValue();

  const bool IsCanonicalOnes = VT == MVT::v4i32 || VT == MVT::v16i32 ||
                               (VT == MVT::v8i32 && Subtarget.hasInt256());
  if (IsCanonicalOnes)
    return Op;
  return getOnesVector(VT, DAG, DL);
}
/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
/// from a vector of source values and a vector of extraction indices.
/// The vectors might be manipulated to match the type of the permute op.
static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT ShuffleVT = VT;
EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
unsigned NumElts = VT.getVectorNumElements();
unsigned SizeInBits = VT.getSizeInBits();
// Adjust IndicesVec to match VT size.
assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
"Illegal variable permute mask size");
if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
NumElts * VT.getScalarSizeInBits());
IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
// Handle SrcVec that don't match VT type.
if (SrcVec.getValueSizeInBits() != SizeInBits) {
if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
// Handle larger SrcVec by treating it as a larger permute.
unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
Subtarget, DAG, SDLoc(IndicesVec));
return extractSubVector(
createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget), 0,
DAG, DL, SizeInBits);
} else if (SrcVec.getValueSizeInBits() < SizeInBits) {
// Widen smaller SrcVec to match VT.
SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
} else
return SDValue();
}
auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
EVT SrcVT = Idx.getValueType();
unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
uint64_t IndexScale = 0;
uint64_t IndexOffset = 0;
// If we're scaling a smaller permute op, then we need to repeat the
// indices, scaling and offsetting them as well.
// e.g. v4i32 -> v16i8 (Scale = 4)
// IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
// IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
for (uint64_t i = 0; i != Scale; ++i) {
IndexScale |= Scale << (i * NumDstBits);
IndexOffset |= i << (i * NumDstBits);
}
Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
return Idx;
};
unsigned Opcode = 0;
switch (VT.SimpleTy) {
default:
break;
case MVT::v16i8:
if (Subtarget.hasSSSE3())
Opcode = X86ISD::PSHUFB;
break;
case MVT::v8i16:
if (Subtarget.hasVLX() && Subtarget.hasBWI())
Opcode = X86ISD::VPERMV;
else if (Subtarget.hasSSSE3()) {
Opcode = X86ISD::PSHUFB;
ShuffleVT = MVT::v16i8;
}
break;
case MVT::v4f32:
case MVT::v4i32:
if (Subtarget.hasAVX()) {
Opcode = X86ISD::VPERMILPV;
ShuffleVT = MVT::v4f32;
} else if (Subtarget.hasSSSE3()) {
Opcode = X86ISD::PSHUFB;
ShuffleVT = MVT::v16i8;
}
break;
case MVT::v2f64:
case MVT::v2i64:
if (Subtarget.hasAVX()) {
// VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
Opcode = X86ISD::VPERMILPV;
ShuffleVT = MVT::v2f64;
} else if (Subtarget.hasSSE41()) {
// SSE41 can compare v2i64 - select between indices 0 and 1.
return DAG.getSelectCC(
DL, IndicesVec,
getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
ISD::CondCode::SETEQ);
}
break;
case MVT::v32i8:
if (Subtarget.hasVLX() && Subtarget.hasVBMI())
Opcode = X86ISD::VPERMV;
else if (Subtarget.hasXOP()) {
SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
return DAG.getNode(
ISD::CONCAT_VECTORS, DL, VT,
DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
} else if (Subtarget.hasAVX()) {
SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
// Permute Lo and Hi and then select based on index range.
// This works as SHUFB uses bits[3:0] to permute elements and we don't
// care about the bit[7] as its just an index vector.
SDValue Idx = Ops[2];
EVT VT = Idx.getValueType();
return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
ISD::CondCode::SETGT);
};
SDValue Ops[] = {LoLo, HiHi, IndicesVec};
return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
PSHUFBBuilder);
}
break;
case MVT::v16i16:
if (Subtarget.hasVLX() && Subtarget.hasBWI())
Opcode = X86ISD::VPERMV;
else if (Subtarget.hasAVX()) {
// Scale to v32i8 and perform as v32i8.
IndicesVec = ScaleIndices(IndicesVec, 2);
return DAG.getBitcast(
VT, createVariablePermute(
MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
}
break;
case MVT::v8f32:
case MVT::v8i32:
if (Subtarget.hasAVX2())
Opcode = X86ISD::VPERMV;
else if (Subtarget.hasAVX()) {
SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
{0, 1, 2, 3, 0, 1, 2, 3});
SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
{4, 5, 6, 7, 4, 5, 6, 7});
if (Subtarget.hasXOP())
return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32,
LoLo, HiHi, IndicesVec,
DAG.getConstant(0, DL, MVT::i8)));
// Permute Lo and Hi and then select based on index range.
// This works as VPERMILPS only uses index bits[0:1] to permute elements.
SDValue Res = DAG.getSelectCC(
DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
ISD::CondCode::SETGT);
return DAG.getBitcast(VT, Res);
}
break;
case MVT::v4i64:
case MVT::v4f64:
if (Subtarget.hasAVX512()) {
if (!Subtarget.hasVLX()) {
MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
SDLoc(SrcVec));
IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
DAG, SDLoc(IndicesVec));
SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
DAG, Subtarget);
return extract256BitVector(Res, 0, DAG, DL);
}
Opcode = X86ISD::VPERMV;
} else if (Subtarget.hasAVX()) {
SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
SDValue LoLo =
DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
SDValue HiHi =
DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
// VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
if (Subtarget.hasXOP())
return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64,
LoLo, HiHi, IndicesVec,
DAG.getConstant(0, DL, MVT::i8)));
// Permute Lo and Hi and then select based on index range.
// This works as VPERMILPD only uses index bit[1] to permute elements.
SDValue Res = DAG.getSelectCC(
DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
ISD::CondCode::SETGT);
return DAG.getBitcast(VT, Res);
}
break;
case MVT::v64i8:
if (Subtarget.hasVBMI())
Opcode = X86ISD::VPERMV;
break;
case MVT::v32i16:
if (Subtarget.hasBWI())
Opcode = X86ISD::VPERMV;
break;
case MVT::v16f32:
case MVT::v16i32:
case MVT::v8f64:
case MVT::v8i64:
if (Subtarget.hasAVX512())
Opcode = X86ISD::VPERMV;
break;
}
if (!Opcode)
return SDValue();
assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
(VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
"Illegal variable permute shuffle type");
uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
if (Scale > 1)
IndicesVec = ScaleIndices(IndicesVec, Scale);
EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
SDValue Res = Opcode == X86ISD::VPERMV
? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
: DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
return DAG.getBitcast(VT, Res);
}
// Attempts to recognise a BUILD_VECTOR that is really a variable permute:
// every operand extracts an element from one common source vector, and the
// extraction index is itself the matching lane of one common (non-constant)
// index vector.
//   (build_vector (extract_elt V, (extract_elt I, 0)),
//                 (extract_elt V, (extract_elt I, 1)),
//                 ...
// ->
//   (vpermv I, V)
//
// Returns an empty SDValue when the operands do not have this shape; otherwise
// returns whatever createVariablePermute yields for the matched vectors.
//
// TODO: Handle undefs
// TODO: Utilize pshufb and zero mask blending to support more efficient
//       construction of vectors with constant-0 elements.
static SDValue
LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
  SDValue PermSrc, PermIndices;

  // Lane N of the build_vector must look like:
  //   (extract_elt PermSrc, (extract_elt PermIndices, N)).
  const unsigned NumLanes = V.getNumOperands();
  for (unsigned Lane = 0; Lane < NumLanes; ++Lane) {
    SDValue Elt = V.getOperand(Lane);
    if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    // The first lane fixes the source vector; all later lanes must agree.
    SDValue EltSrc = Elt.getOperand(0);
    if (!PermSrc)
      PermSrc = EltSrc;
    else if (PermSrc != EltSrc)
      return SDValue();

    // The element index may be wrapped in a single zero/sign extend.
    SDValue LaneIdx = Elt.getOperand(1);
    const unsigned IdxOpc = LaneIdx.getOpcode();
    const bool IsExtend =
        IdxOpc == ISD::ZERO_EXTEND || IdxOpc == ISD::SIGN_EXTEND;
    if (IsExtend)
      LaneIdx = LaneIdx.getOperand(0);
    if (LaneIdx.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    // Likewise, the first lane fixes the index vector.
    SDValue IdxSrc = LaneIdx.getOperand(0);
    if (!PermIndices)
      PermIndices = IdxSrc;
    else if (PermIndices != IdxSrc)
      return SDValue();

    // The index must be read from the same (constant) lane we are building.
    auto *IdxLane = dyn_cast<ConstantSDNode>(LaneIdx.getOperand(1));
    const bool ReadsOwnLane = IdxLane && IdxLane->getZExtValue() == Lane;
    if (!ReadsOwnLane)
      return SDValue();
  }

  SDLoc DL(V);
  MVT VT = V.getSimpleValueType();
  return createVariablePermute(VT, PermSrc, PermIndices, DL, DAG, Subtarget);
}
/// Custom lowering for a BUILD_VECTOR node.
///
/// A sequence of strategies is tried in order and the first one that produces
/// a value wins: predicate (vXi1) vectors, materialized vector constants,
/// add-sub / horizontal ops, broadcasts, bit ops, one variable inserted into
/// a constant vector, a single non-zero element, splats, variable permutes,
/// consecutive loads, splitting wide vectors into halves, and finally
/// element-by-element construction with inserts or unpack shuffles.
///
/// \param Op  The BUILD_VECTOR node to lower.
/// \param DAG The DAG in which replacement nodes are created.
/// \returns The replacement value, or an empty SDValue on the paths where this
/// routine declines to lower the node itself (per the comments below, e.g. to
/// let the legalizer expand it or to prefer a constant-pool load).
SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
unsigned NumElems = Op.getNumOperands();
// Generate vectors for predicate vectors.
if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
return VectorConstant;
// Try the pattern-specific lowerings first; each returns an empty SDValue
// when it does not apply.
BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
return AddSub;
if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
return HorizontalOp;
if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
return Broadcast;
if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
return BitOp;
// Classify the operands. Undef operands are skipped entirely, so they count
// neither as zero nor as non-zero and still count towards NumConstants.
// Bit i of NonZeros is set when operand i is a non-undef, non-zero element.
unsigned EVTBits = EltVT.getSizeInBits();
unsigned NumZero = 0;
unsigned NumNonZero = 0;
uint64_t NonZeros = 0;
bool IsAllConstants = true;
SmallSet<SDValue, 8> Values;
unsigned NumConstants = NumElems;
for (unsigned i = 0; i < NumElems; ++i) {
SDValue Elt = Op.getOperand(i);
if (Elt.isUndef())
continue;
Values.insert(Elt);
if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
IsAllConstants = false;
NumConstants--;
}
if (X86::isZeroNode(Elt))
NumZero++;
else {
assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
NonZeros |= ((uint64_t)1 << i);
NumNonZero++;
}
}
// All undef vector. Return an UNDEF. All zero vectors were handled above.
if (NumNonZero == 0)
return DAG.getUNDEF(VT);
// If we are inserting one variable into a vector of non-zero constants, try
// to avoid loading each constant element as a scalar. Load the constants as a
// vector and then insert the variable scalar element. If insertion is not
// supported, fall back to a shuffle to get the scalar blended with the
// constants. Insertion into a zero vector is handled as a special-case
// somewhere below here.
if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
(isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
// Create an all-constant vector. The variable element in the old
// build vector is replaced by undef in the constant vector. Save the
// variable scalar element and its index for use in the insertelement.
LLVMContext &Context = *DAG.getContext();
Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
SDValue VarElt;
SDValue InsIndex;
for (unsigned i = 0; i != NumElems; ++i) {
SDValue Elt = Op.getOperand(i);
if (auto *C = dyn_cast<ConstantSDNode>(Elt))
ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
else if (!Elt.isUndef()) {
assert(!VarElt.getNode() && !InsIndex.getNode() &&
"Expected one variable element in this vector");
VarElt = Elt;
InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout()));
}
}
Constant *CV = ConstantVector::get(ConstVecOps);
SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
// The constants we just created may not be legal (eg, floating point). We
// must lower the vector right here because we can not guarantee that we'll
// legalize it before loading it. This is also why we could not just create
// a new build vector here. If the build vector contains illegal constants,
// it could get split back up into a series of insert elements.
// TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
MachineFunction &MF = DAG.getMachineFunction();
MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
// An index that falls within the low 128 bits is inserted directly.
unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
if (InsertC < NumEltsInLow128Bits)
return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
// There's no good way to insert into the high elements of a >128-bit
// vector, so use shuffles to avoid an extract/insert sequence.
assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
// Identity mask over Ld, except that lane InsertC selects index NumElts,
// i.e. element 0 of the second shuffle operand (the SCALAR_TO_VECTOR).
SmallVector<int, 8> ShuffleMask;
unsigned NumElts = VT.getVectorNumElements();
for (unsigned i = 0; i != NumElts; ++i)
ShuffleMask.push_back(i == InsertC ? NumElts : i);
SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
}
// Special case for single non-zero, non-undef, element.
if (NumNonZero == 1) {
// Position of the only set bit in NonZeros, i.e. the sole non-zero operand.
unsigned Idx = countTrailingZeros(NonZeros);
SDValue Item = Op.getOperand(Idx);
// If we have a constant or non-constant insertion into the low element of
// a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
// the rest of the elements. This will be matched as movd/movq/movss/movsd
// depending on what the source datatype is.
if (Idx == 0) {
// No explicit zeros: every other element is undef, so no zeroing shuffle
// is needed.
if (NumZero == 0)
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
(EltVT == MVT::i64 && Subtarget.is64Bit())) {
assert((VT.is128BitVector() || VT.is256BitVector() ||
VT.is512BitVector()) &&
"Expected an SSE value type!");
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
// Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
}
// We can't directly insert an i8 or i16 into a vector, so zero extend
// it to i32 first.
if (EltVT == MVT::i16 || EltVT == MVT::i8) {
Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
if (VT.getSizeInBits() >= 256) {
MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
if (Subtarget.hasAVX()) {
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
} else {
// Without AVX, we need to extend to a 128-bit vector and then
// insert into the 256-bit vector.
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
}
} else {
assert(VT.is128BitVector() && "Expected an SSE value type!");
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
}
// The vector was built with i32 elements; cast back to the requested type.
return DAG.getBitcast(VT, Item);
}
}
// Is it a vector logical left shift?
if (NumElems == 2 && Idx == 1 &&
X86::isZeroNode(Op.getOperand(0)) &&
!X86::isZeroNode(Op.getOperand(1))) {
unsigned NumBits = VT.getSizeInBits();
return getVShift(true, VT,
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
VT, Op.getOperand(1)),
NumBits/2, DAG, *this, dl);
}
if (IsAllConstants) // Otherwise, it's better to do a constpool load.
return SDValue();
// Otherwise, if this is a vector with i32 or f32 elements, and the element
// is a non-constant being inserted into an element other than the low one,
// we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
// movd/movss) to move this into the low element, then shuffle it into
// place.
if (EVTBits == 32) {
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
}
}
// Splat is obviously ok. Let legalizer expand it to a shuffle.
if (Values.size() == 1) {
if (EVTBits == 32) {
// Instead of a shuffle like this:
// shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
// Check if it's possible to issue this instead.
// shuffle (vload ptr)), undef, <1, 1, 1, 1>
unsigned Idx = countTrailingZeros(NonZeros);
SDValue Item = Op.getOperand(Idx);
if (Op.getNode()->isOnlyUserOf(Item.getNode()))
return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
}
return SDValue();
}
// A vector full of immediates; various special cases are already
// handled, so this is best done with a single constant-pool load.
if (IsAllConstants)
return SDValue();
if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
return V;
// See if we can use a vector load to get all of the elements.
{
SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
if (SDValue LD =
EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
return LD;
}
// If this is a splat of pairs of 32-bit elements, we can use a narrower
// build_vector and broadcast it.
// TODO: We could probably generalize this more.
if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
// Make sure all the even/odd operands match.
for (unsigned i = 2; i != NumElems; ++i)
if (Ops[i % 2] != Op.getOperand(i))
return false;
return true;
};
if (CanSplat(Op, NumElems, Ops)) {
MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
// Create a new build vector and cast to v2i64/v2f64.
SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
DAG.getBuildVector(NarrowVT, dl, Ops));
// Broadcast from v2i64/v2f64 and cast to final VT.
MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems/2);
return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
NewBV));
}
}
// For AVX-length vectors, build the individual 128-bit pieces and use
// shuffles to put them in place.
if (VT.getSizeInBits() > 128) {
MVT HVT = MVT::getVectorVT(EltVT, NumElems/2);
// Build both the lower and upper subvector.
SDValue Lower =
DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
SDValue Upper = DAG.getBuildVector(
HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
// Recreate the wider vector with the lower and upper part.
return concatSubVectors(Lower, Upper, VT, NumElems, DAG, dl,
VT.getSizeInBits() / 2);
}
// Let legalizer expand 2-wide build_vectors.
if (EVTBits == 64) {
if (NumNonZero == 1) {
// One half is zero or undef.
unsigned Idx = countTrailingZeros(NonZeros);
SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
Op.getOperand(Idx));
return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
}
return SDValue();
}
// If element VT is < 32 bits, convert it to inserts into a zero vector.
if (EVTBits == 8 && NumElems == 16)
if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
DAG, Subtarget))
return V;
if (EVTBits == 16 && NumElems == 8)
if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
DAG, Subtarget))
return V;
// If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
if (EVTBits == 32 && NumElems == 4)
if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
return V;
// If element VT is == 32 bits, turn it into a number of shuffles.
if (NumElems == 4 && NumZero > 0) {
// Start with one full vector per element: a zero vector for elements whose
// NonZeros bit is clear, otherwise the scalar placed in the low slot.
SmallVector<SDValue, 8> Ops(NumElems);
for (unsigned i = 0; i < 4; ++i) {
bool isZero = !(NonZeros & (1ULL << i));
if (isZero)
Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
else
Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
}
// Combine each adjacent pair (0,1) and (2,3) into Ops[0] and Ops[1]. The
// two-bit field of NonZeros says which members of the pair are non-zero.
for (unsigned i = 0; i < 2; ++i) {
switch ((NonZeros >> (i*2)) & 0x3) {
default: llvm_unreachable("Unexpected NonZero count");
case 0:
Ops[i] = Ops[i*2]; // Must be a zero vector.
break;
case 1:
Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
break;
case 2:
Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
break;
case 3:
Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
break;
}
}
// When only the second element of a pair is non-zero (field value 2), the
// low two lanes of that half are swapped in the final shuffle mask.
bool Reverse1 = (NonZeros & 0x3) == 2;
bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
int MaskVec[] = {
Reverse1 ? 1 : 0,
Reverse1 ? 0 : 1,
static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
static_cast<int>(Reverse2 ? NumElems : NumElems+1)
};
return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
}
assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
// Check for a build vector from mostly shuffle plus few inserting.
if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
return Sh;
// For SSE 4.1, use insertps to put the high elements into the low element.
if (Subtarget.hasSSE41()) {
SDValue Result;
if (!Op.getOperand(0).isUndef())
Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
else
Result = DAG.getUNDEF(VT);
// Insert every remaining non-undef element at its own index.
for (unsigned i = 1; i < NumElems; ++i) {
if (Op.getOperand(i).isUndef()) continue;
Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
}
return Result;
}
// Otherwise, expand into a number of unpckl*, start by extending each of
// our (non-undef) elements to the full vector width with the element in the
// bottom slot of the vector (which generates no code for SSE).
SmallVector<SDValue, 8> Ops(NumElems);
for (unsigned i = 0; i < NumElems; ++i) {
if (!Op.getOperand(i).isUndef())
Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
else
Ops[i] = DAG.getUNDEF(VT);
}
// Next, we iteratively mix elements, e.g. for v4f32:
// Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
// : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
// Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
// Generate scaled UNPCKL shuffle mask.
// The mask takes the low Scale elements of the first operand, then the low
// Scale elements of the second, and leaves the remaining lanes undef.
SmallVector<int, 16> Mask;
for(unsigned i = 0; i != Scale; ++i)
Mask.push_back(i);
for (unsigned i = 0; i != Scale; ++i)
Mask.push_back(NumElems+i);
Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
// Merge neighbouring partial vectors pairwise, halving their count.
for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
}
return Ops[0];
}
// Lowers a CONCAT_VECTORS producing a 256-bit or 512-bit result. 256-bit AVX
// can use the vinsertf128 instruction to create 256-bit vectors from two
// other 128-bit ones.
//
// With more than two non-zero, non-undef operands, each half is concatenated
// separately and the two halves are concatenated again. Otherwise the result
// is built by inserting the non-zero operands into a zero vector (when any
// operand is an all-zeros build vector) or into an undef vector.
// TODO: Detect subvector broadcast here instead of DAG combine?
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  MVT ResVT = Op.getSimpleValueType();
  assert((ResVT.is256BitVector() ||
          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");

  // Tally the operands; bit N of NonZeroBits marks operand N as neither undef
  // nor all-zeros.
  const unsigned OperandCount = Op.getNumOperands();
  unsigned ZeroCount = 0;
  unsigned NonZeroCount = 0;
  unsigned NonZeroBits = 0;
  for (unsigned OpNo = 0; OpNo < OperandCount; ++OpNo) {
    SDValue Piece = Op.getOperand(OpNo);
    if (Piece.isUndef())
      continue;
    if (ISD::isBuildVectorAllZeros(Piece.getNode())) {
      ++ZeroCount;
      continue;
    }
    assert(OpNo < sizeof(NonZeroBits) * CHAR_BIT); // Ensure the shift is in range.
    NonZeroBits |= 1 << OpNo;
    ++NonZeroCount;
  }

  // Too many live pieces for simple insertion: handle each half on its own.
  if (NonZeroCount > 2) {
    MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
                                  ResVT.getVectorNumElements()/2);
    ArrayRef<SDUse> AllOps = Op->ops();
    const unsigned HalfCount = OperandCount/2;
    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
                             AllOps.slice(0, HalfCount));
    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
                             AllOps.slice(HalfCount));
    return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
  }

  // Few live pieces: drop each one into place with INSERT_SUBVECTOR.
  SDValue Result = ZeroCount ? getZeroVector(ResVT, Subtarget, DAG, dl)
                             : DAG.getUNDEF(ResVT);
  const unsigned EltsPerPiece =
      Op.getOperand(0).getSimpleValueType().getVectorNumElements();
  for (unsigned OpNo = 0; OpNo < OperandCount; ++OpNo) {
    if (NonZeroBits & (1 << OpNo))
      Result = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Result,
                           Op.getOperand(OpNo),
                           DAG.getIntPtrConstant(OpNo * EltsPerPiece, dl));
  }
  return Result;
}
// Lowers a CONCAT_VECTORS node; LowerCONCAT_VECTORS dispatches here when the
// result has i1 elements.
//  - At most one non-zero, non-undef operand: insert it (if present) into a
//    zero vector when some operand is an all-zeros build vector, otherwise
//    into an undef vector.
//  - More than two operands: concatenate each half separately, then
//    concatenate the two halves.
//  - Exactly two non-zero operands: return Op unchanged when the result has at
//    least 16 elements, otherwise build it with two INSERT_SUBVECTORs.
// TODO: Merge this with LowerAVXCONCAT_VECTORS?
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT ResVT = Op.getSimpleValueType();
  const unsigned OperandCount = Op.getNumOperands();
  assert(OperandCount > 1 && isPowerOf2_32(OperandCount) &&
         "Unexpected number of operands in CONCAT_VECTORS");

  // Tally the operands; bit N of NonZeroBits marks operand N as neither undef
  // nor all-zeros.
  unsigned ZeroCount = 0;
  unsigned NonZeroCount = 0;
  uint64_t NonZeroBits = 0;
  for (unsigned OpNo = 0; OpNo < OperandCount; ++OpNo) {
    SDValue Piece = Op.getOperand(OpNo);
    if (Piece.isUndef())
      continue;
    if (ISD::isBuildVectorAllZeros(Piece.getNode())) {
      ++ZeroCount;
      continue;
    }
    assert(OpNo < sizeof(NonZeroBits) * CHAR_BIT); // Ensure the shift is in range.
    NonZeroBits |= (uint64_t)1 << OpNo;
    ++NonZeroCount;
  }

  // Zero or one live operand is handled with at most a single insertion.
  if (NonZeroCount <= 1) {
    SDValue Base = ZeroCount ? getZeroVector(ResVT, Subtarget, DAG, dl)
                             : DAG.getUNDEF(ResVT);
    if (NonZeroCount == 0)
      return Base;
    unsigned OnlyIdx = countTrailingZeros(NonZeroBits);
    SDValue Only = Op.getOperand(OnlyIdx);
    unsigned OnlyNumElts = Only.getSimpleValueType().getVectorNumElements();
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Base, Only,
                       DAG.getIntPtrConstant(OnlyIdx * OnlyNumElts, dl));
  }

  const unsigned ResNumElts = ResVT.getVectorNumElements();

  // With more than two operands, reduce to two-operand concatenations.
  if (OperandCount > 2) {
    MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
                                  ResVT.getVectorNumElements()/2);
    ArrayRef<SDUse> AllOps = Op->ops();
    const unsigned HalfCount = OperandCount/2;
    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
                             AllOps.slice(0, HalfCount));
    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
                             AllOps.slice(HalfCount));
    return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
  }

  assert(NonZeroCount == 2 && "Simple cases not handled?");

  if (ResNumElts >= 16)
    return Op; // The operation is legal with KUNPCK

  // Place operand 0 in the low half of an undef vector, then operand 1 in the
  // high half.
  SDValue LowPlaced = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
                                  DAG.getUNDEF(ResVT), Op.getOperand(0),
                                  DAG.getIntPtrConstant(0, dl));
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, LowPlaced,
                     Op.getOperand(1),
                     DAG.getIntPtrConstant(ResNumElts/2, dl));
}
// Entry point for CONCAT_VECTORS lowering: results with i1 elements go to
// LowerCONCAT_VECTORSvXi1, everything else to LowerAVXCONCAT_VECTORS.
static SDValue LowerCONCAT_VECTORS(SDValue Op,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  const bool IsMaskVector = VT.getVectorElementType() == MVT::i1;
  if (!IsMaskVector) {
    // Only 256-bit results made of two operands, or 512-bit results made of
    // two or four operands, are expected here.
    assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
           (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
                                    Op.getNumOperands() == 4)));
    // AVX can use the vinsertf128 instruction to create 256-bit vectors
    // from two other 128-bit ones.
    // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
    return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
  }
  return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
}
//===----------------------------------------------------------------------===//
// Vector shuffle lowering
//
// This is an experimental code path for lowering vector shuffles on x86. It is
// designed to handle arbitrary vector shuffles and blends, gracefully
// degrading performance as necessary. It works hard to recognize idiomatic
// shuffles and lower them to optimal instruction patterns, while staying
// within a framework that allows reasonably efficient handling of all vector
// shuffle patterns.
//===----------------------------------------------------------------------===//
/// Reports whether a single-input shuffle mask leaves every element in place.
///
/// The mask is assumed to be of the kind used by the X86 shuffle instructions
/// (not a fully general ShuffleVectorSDNode mask). An entry counts as a no-op
/// when it is undef (negative) or equal to its own position; the mask is a
/// no-op when all of its entries are.
static bool isNoopShuffleMask(ArrayRef<int> Mask) {
  int Pos = 0;
  for (int M : Mask) {
    assert(M >= -1 && "Out of bound mask element!");
    const bool StaysPut = M < 0 || M == Pos;
    if (!StaysPut)
      return false;
    ++Pos;
  }
  return true;
}
/// Reports whether any defined entry of the shuffle mask reads its element
/// from a different 128-bit lane than the one it writes to.
///
/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
/// and we routinely test for these. Indices referring to the second input are
/// reduced modulo the mask size before the lanes are compared.
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
  const int EltsPerLane = 128 / VT.getScalarSizeInBits();
  const int NumElts = Mask.size();
  for (int Pos = 0; Pos != NumElts; ++Pos) {
    const int M = Mask[Pos];
    if (M < 0)
      continue; // Undef entries cannot cross anything.
    const int SrcLane = (M % NumElts) / EltsPerLane;
    const int DstLane = Pos / EltsPerLane;
    if (SrcLane != DstLane)
      return true;
  }
  return false;
}
/// Reports whether a shuffle mask performs the same lane-relative shuffle in
/// every sub-lane of \p LaneSizeInBits bits.
///
/// A repeated mask is necessarily not lane-crossing, but it may still blend in
/// elements from the same lane of the second input vector.
///
/// \p RepeatedMask receives the per-lane mask, since it is non-trivial to
/// compute in the face of undef entries. Entries that refer to the second
/// vector are remapped to [LaneSize, 2*LaneSize), which makes the result
/// suitable for use with existing single-lane shuffles. Its contents are only
/// meaningful when the function returns true.
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
                                  ArrayRef<int> Mask,
                                  SmallVectorImpl<int> &RepeatedMask) {
  const unsigned EltsPerLane = LaneSizeInBits / VT.getScalarSizeInBits();
  RepeatedMask.assign(EltsPerLane, -1);
  const int NumElts = Mask.size();
  for (int Pos = 0; Pos != NumElts; ++Pos) {
    const int M = Mask[Pos];
    assert(M == SM_SentinelUndef || M >= 0);
    if (M < 0)
      continue;

    // An entry that reads from another lane cannot be modelled at all.
    if ((M % NumElts) / EltsPerLane != Pos / EltsPerLane)
      return false;

    // Express the index relative to its lane; second-vector indices start at
    // EltsPerLane instead of NumElts.
    int LaneRelative = M % EltsPerLane;
    if (M >= NumElts)
      LaneRelative += EltsPerLane;

    // The first defined entry for a slot fixes it; later lanes must agree.
    int &Slot = RepeatedMask[Pos % EltsPerLane];
    if (Slot < 0)
      Slot = LaneRelative;
    else if (Slot != LaneRelative)
      return false;
  }
  return true;
}
/// 128-bit-lane form of isRepeatedShuffleMask; the per-lane mask is written to
/// \p RepeatedMask.
static bool
is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
                                SmallVectorImpl<int> &RepeatedMask) {
  const unsigned LaneBits = 128;
  return isRepeatedShuffleMask(LaneBits, VT, Mask, RepeatedMask);
}
/// Predicate-only form: reports whether the mask repeats in every 128-bit lane
/// and discards the computed per-lane mask.
static bool
is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
  const unsigned LaneBits = 128;
  SmallVector<int, 32> Discarded;
  return isRepeatedShuffleMask(LaneBits, VT, Mask, Discarded);
}
/// 256-bit-lane form of isRepeatedShuffleMask; the per-lane mask is written to
/// \p RepeatedMask.
static bool
is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
                                SmallVectorImpl<int> &RepeatedMask) {
  const unsigned LaneBits = 256;
  return isRepeatedShuffleMask(LaneBits, VT, Mask, RepeatedMask);
}
/// Reports whether a target shuffle mask performs the same lane-relative
/// shuffle in every sub-lane of \p LaneSizeInBits bits.
///
/// Unlike isRepeatedShuffleMask this must respect SM_SentinelZero: a zero
/// entry is only compatible with a slot that is still undef or already zero.
/// \p RepeatedMask receives the per-lane mask, with second-vector indices
/// remapped to start at LaneSize; its contents are only meaningful when the
/// function returns true.
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
                                        ArrayRef<int> Mask,
                                        SmallVectorImpl<int> &RepeatedMask) {
  const int EltsPerLane = LaneSizeInBits / VT.getScalarSizeInBits();
  RepeatedMask.assign(EltsPerLane, SM_SentinelUndef);
  const int NumElts = Mask.size();
  for (int Pos = 0; Pos != NumElts; ++Pos) {
    const int M = Mask[Pos];
    assert(isUndefOrZero(M) || (M >= 0));
    if (M == SM_SentinelUndef)
      continue;

    int &Slot = RepeatedMask[Pos % EltsPerLane];

    // A zero entry may only land on a slot that holds no real index yet.
    if (M == SM_SentinelZero) {
      if (!isUndefOrZero(Slot))
        return false;
      Slot = SM_SentinelZero;
      continue;
    }

    // An entry that reads from another lane cannot be modelled at all.
    if ((M % NumElts) / EltsPerLane != Pos / EltsPerLane)
      return false;

    // Express the index relative to its lane; second-vector indices start at
    // EltsPerLane instead of NumElts.
    int LaneRelative = M % EltsPerLane;
    if (M >= NumElts)
      LaneRelative += EltsPerLane;

    // The first defined entry for a slot fixes it; later lanes must agree.
    if (Slot == SM_SentinelUndef)
      Slot = LaneRelative;
    else if (Slot != LaneRelative)
      return false;
  }
  return true;
}
/// Checks whether a shuffle mask is equivalent to an explicit expected mask.
///
/// This is a fast way to test a shuffle mask against a fixed pattern:
///
///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
///
/// It returns true if the mask is exactly as wide as the expected mask, and
/// each element of the mask is either -1 (signifying undef) or the value given
/// in the expected mask. As a refinement, when the shuffle inputs are build
/// vectors, two differing indices are still accepted if they select the same
/// build-vector operand.
static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
                                ArrayRef<int> ExpectedMask) {
  if (ExpectedMask.size() != Mask.size())
    return false;
  const int NumElts = Mask.size();

  // Build-vector inputs let us compare the selected operands themselves.
  auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
  auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);

  for (int Pos = 0; Pos != NumElts; ++Pos) {
    const int Actual = Mask[Pos];
    const int Wanted = ExpectedMask[Pos];
    assert(Actual >= -1 && "Out of bound mask element!");
    if (Actual < 0 || Actual == Wanted)
      continue;

    // The indices differ; they may still name identical scalars.
    auto *ActualBV = Actual < NumElts ? BV1 : BV2;
    auto *WantedBV = Wanted < NumElts ? BV1 : BV2;
    if (!ActualBV || !WantedBV)
      return false;
    if (ActualBV->getOperand(Actual % NumElts) !=
        WantedBV->getOperand(Wanted % NumElts))
      return false;
  }
  return true;
}
/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
///
/// Both masks must have exactly the same number of elements.
///
/// An SM_SentinelUndef (-1) element in Mask accepts whatever ExpectedMask
/// holds at that position. Every other element has to be identical in both
/// masks. SM_SentinelZero is the only other negative value tolerated in Mask,
/// and it only matches SM_SentinelZero in ExpectedMask.
static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
                                      ArrayRef<int> ExpectedMask) {
  const int NumElts = Mask.size();
  if (NumElts != (int)ExpectedMask.size())
    return false;
  assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * NumElts) &&
         "Illegal target shuffle mask");

  for (int Idx = 0; Idx != NumElts; ++Idx) {
    const int M = Mask[Idx];
    if (M == SM_SentinelUndef)
      continue;

    // Any negative value other than the two known sentinels is a mismatch.
    const bool IsUnknownSentinel = M < 0 && M != SM_SentinelZero;
    if (IsUnknownSentinel || M != ExpectedMask[Idx])
      return false;
  }
  return true;
}
// Merges a general DAG shuffle mask and a zeroable bit mask into a target
// shuffle mask: undef entries stay SM_SentinelUndef, entries whose Zeroable bit
// is set become SM_SentinelZero, and all remaining entries keep their index.
static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
                                                    const APInt &Zeroable) {
  const int NumElts = Mask.size();
  assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");

  // Start from an all-undef mask and only touch the defined entries.
  SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
  for (int Idx = 0; Idx != NumElts; ++Idx) {
    const int M = Mask[Idx];
    if (M != SM_SentinelUndef) {
      assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
      if (Zeroable[Idx])
        TargetMask[Idx] = SM_SentinelZero;
      else
        TargetMask[Idx] = M;
    }
  }
  return TargetMask;
}
// Attempt to create a shuffle mask from a VSELECT condition mask.
//
// Only succeeds when Cond is a build vector of constants. On success Mask is
// resized to the condition's element count; element i is i (first select
// operand) when the condition element is a non-zero constant, and i + Size
// (second select operand) otherwise.
static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
                                         SDValue Cond) {
  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
    return false;

  unsigned Size = Cond.getValueType().getVectorNumElements();
  Mask.resize(Size, SM_SentinelUndef);

  for (int Idx = 0; Idx != (int)Size; ++Idx) {
    SDValue CondElt = Cond.getOperand(Idx);
    // Arbitrarily choose from the 2nd operand if the select condition element
    // is undef.
    // TODO: Can we do better by matching patterns such as even/odd?
    const bool PickSecond = CondElt.isUndef() || isNullConstant(CondElt);
    Mask[Idx] = PickSecond ? Idx + (int)Size : Idx;
  }
  return true;
}
// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
// instructions. Only v8i32/v8f32 are considered; the mask is compared against
// the binary v8i16 unpack-lo and unpack-hi patterns.
static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
  if (VT != MVT::v8i32 && VT != MVT::v8f32)
    return false;

  SmallVector<int, 8> LoPattern;
  createUnpackShuffleMask(MVT::v8i16, LoPattern, /* Lo = */ true,
                          /* Unary = */ false);
  if (isTargetShuffleEquivalent(Mask, LoPattern))
    return true;

  SmallVector<int, 8> HiPattern;
  createUnpackShuffleMask(MVT::v8i16, HiPattern, /* Lo = */ false,
                          /* Unary = */ false);
  return isTargetShuffleEquivalent(Mask, HiPattern);
}
/// Return true if Mask, interpreted as a shuffle of a 128-bit vector with
/// Mask.size() integer elements, matches any unpack pattern: low or high,
/// unary or binary, with the inputs in either order.
static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
  // Create 128-bit vector type based on mask size.
  MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
  MVT VT = MVT::getVectorVT(EltVT, Mask.size());

  // We can't assume a canonical shuffle mask, so try the commuted version too.
  SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
  ShuffleVectorSDNode::commuteMask(CommutedMask);

  // Match any of unary/binary or low/high.
  for (int Lo = 0; Lo != 2; ++Lo) {
    for (int Unary = 0; Unary != 2; ++Unary) {
      SmallVector<int, 16> UnpackMask;
      createUnpackShuffleMask(VT, UnpackMask, Lo != 0, Unary != 0);
      if (isTargetShuffleEquivalent(Mask, UnpackMask))
        return true;
      if (isTargetShuffleEquivalent(CommutedMask, UnpackMask))
        return true;
    }
  }
  return false;
}
/// Return true if a shuffle mask chooses elements identically in its top and
/// bottom halves. For example, any splat mask has the same top and bottom
/// halves. The comparison is exact: an element that is undefined in only one
/// half makes the halves differ.
static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
  assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
  const unsigned Half = Mask.size() / 2;

  // Advance while the two halves agree; reaching Half means no mismatch.
  unsigned Idx = 0;
  while (Idx != Half && Mask[Idx] == Mask[Idx + Half])
    ++Idx;
  return Idx == Half;
}
/// Get a 4-lane 8-bit shuffle immediate for a mask.
///
/// This helper function produces an 8-bit shuffle immediate corresponding to
/// the ubiquitous shuffle encoding scheme used in x86 instructions for
/// shuffling 4 lanes: two bits per lane, lane 0 in the lowest bits. It can be
/// used with most of the PSHUF instructions for example.
///
/// NB: We rely heavily on "undef" masks preserving the input lane, so an undef
/// entry for lane i is encoded as i.
static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
  assert(Mask.size() == 4 && "Only 4-lane shuffle masks");

  unsigned Imm = 0;
  for (int Lane = 0; Lane != 4; ++Lane) {
    const int M = Mask[Lane];
    assert(M >= -1 && M < 4 && "Out of bound mask element!");
    Imm |= (M < 0 ? Lane : M) << (2 * Lane);
  }
  return Imm;
}
/// Wrap the immediate computed by getV4X86ShuffleImm for \p Mask in an i8
/// constant node.
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
                                          SelectionDAG &DAG) {
  const unsigned Imm = getV4X86ShuffleImm(Mask);
  return DAG.getConstant(Imm, DL, MVT::i8);
}
/// Compute whether each element of a shuffle is zeroable.
///
/// A "zeroable" vector shuffle element is one which can be lowered to zero.
/// Either it is an undef element in the shuffle mask, the element of the input
/// referenced is undef, or the element of the input referenced is known to be
/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
/// as many lanes with this technique as possible to simplify the remaining
/// shuffle.
///
/// The result has one bit per mask element; a set bit means "zeroable".
static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
                                            SDValue V1, SDValue V2) {
  APInt Zeroable(Mask.size(), 0);
  V1 = peekThroughBitcasts(V1);
  V2 = peekThroughBitcasts(V2);

  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());

  int VectorSizeInBits = V1.getValueSizeInBits();
  int ScalarSizeInBits = VectorSizeInBits / Mask.size();
  assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");

  // True if the mask-element-sized chunk number SubIdx of Bits is all zero.
  auto IsChunkZero = [ScalarSizeInBits](APInt Bits, int SubIdx) {
    Bits.lshrInPlace(SubIdx * ScalarSizeInBits);
    Bits = Bits.getLoBits(ScalarSizeInBits);
    return Bits == 0;
  };

  const int Size = Mask.size();
  for (int Idx = 0; Idx < Size; ++Idx) {
    int M = Mask[Idx];
    const bool FromV1 = M < Size;

    // Handle the easy cases: undef mask entry, or the whole input is zero.
    if (M < 0 || (FromV1 ? V1IsZero : V2IsZero)) {
      Zeroable.setBit(Idx);
      continue;
    }

    // Determine shuffle input and normalize the mask.
    SDValue V = FromV1 ? V1 : V2;
    M %= Size;

    // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
    if (V.getOpcode() != ISD::BUILD_VECTOR)
      continue;

    // If the BUILD_VECTOR has fewer elements then the bitcasted portion of
    // the (larger) source element must be UNDEF/ZERO.
    if ((Size % V.getNumOperands()) == 0) {
      int Scale = Size / V->getNumOperands();
      SDValue Op = V.getOperand(M / Scale);
      bool IsZero = false;
      if (Op.isUndef() || X86::isZeroNode(Op))
        IsZero = true;
      else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op))
        IsZero = IsChunkZero(Cst->getAPIntValue(), M % Scale);
      else if (ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op))
        IsZero = IsChunkZero(CstFP->getValueAPF().bitcastToAPInt(), M % Scale);
      if (IsZero)
        Zeroable.setBit(Idx);
      continue;
    }

    // If the BUILD_VECTOR has more elements then all the (smaller) source
    // elements must be UNDEF or ZERO.
    if ((V.getNumOperands() % Size) == 0) {
      int Scale = V->getNumOperands() / Size;
      bool AllZeroable = true;
      for (int Sub = 0; Sub < Scale; ++Sub) {
        SDValue Op = V.getOperand((M * Scale) + Sub);
        AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
      }
      if (AllZeroable)
        Zeroable.setBit(Idx);
    }
  }
  return Zeroable;
}
// The shuffle result has the form:
//   0*a[0]0*a[1]...0*a[n], n >= 0, where the a[] elements are in ascending
//   order and each may be preceded by any number of zero elements.
// Each bit of Zeroable corresponds to one Mask element, as computed by
// computeZeroableShuffleElements.
//
// Returns true if Mask has no undef entries and its non-zeroable entries form
// a run of consecutive indices that starts either at 0 or at the element count
// of VectorType. IsZeroSideLeft is written only when a non-zeroable entry is
// seen, and is true when that run starts at the element count rather than 0.
static bool isNonZeroElementsInOrder(const APInt &Zeroable,
                                     ArrayRef<int> Mask, const EVT &VectorType,
                                     bool &IsZeroSideLeft) {
  // Index the next non-zeroable entry has to carry; negative until the first
  // such entry has been seen.
  int Expected = -1;
  const int NumElts = Mask.size();

  for (int Idx = 0; Idx < NumElts; ++Idx) {
    const int M = Mask[Idx];
    assert(M >= -1 && "Out of bound mask element!");

    // Undef entries are rejected: zero positions must really be zero.
    if (M < 0)
      return false;
    if (Zeroable[Idx])
      continue;

    // The first non-zeroable element fixes where the run starts.
    if (Expected < 0) {
      if (M != 0)
        Expected = VectorType.getVectorNumElements();
      else
        Expected = 0;
      IsZeroSideLeft = Expected != 0;
    }

    // Exit if the mask's non zero elements are not in increasing order.
    if (Expected != M)
      return false;
    ++Expected;
  }
  return true;
}
/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
///
/// Builds a per-byte control vector: undef mask entries become undef bytes,
/// zeroable entries become 0x80, and everything else becomes the in-lane byte
/// offset of the source. Returns an empty SDValue if the non-zeroable entries
/// reference both inputs or cross a 128-bit lane.
static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
                                      ArrayRef<int> Mask, SDValue V1,
                                      SDValue V2, const APInt &Zeroable,
                                      const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  const int NumElts = Mask.size();
  const int EltsPerLane = 128 / VT.getScalarSizeInBits();
  const int NumBytes = VT.getSizeInBits() / 8;
  const int NumEltBytes = VT.getScalarSizeInBits() / 8;

  assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
         (Subtarget.hasAVX2() && VT.is256BitVector()) ||
         (Subtarget.hasBWI() && VT.is512BitVector()));

  SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
  // Sign bit set in i8 mask means zero element.
  SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);

  SDValue V;
  for (int Byte = 0; Byte != NumBytes; ++Byte) {
    const int Elt = Byte / NumEltBytes;
    const int M = Mask[Elt];
    if (M < 0) {
      PSHUFBMask[Byte] = DAG.getUNDEF(MVT::i8);
      continue;
    }
    if (Zeroable[Elt]) {
      PSHUFBMask[Byte] = ZeroMask;
      continue;
    }

    // We can only use a single input of V1 or V2.
    SDValue SrcV = (M >= NumElts ? V2 : V1);
    if (V && V != SrcV)
      return SDValue();
    V = SrcV;

    // PSHUFB can't cross lanes, ensure this doesn't happen.
    const int SrcElt = M % NumElts;
    if ((SrcElt / EltsPerLane) != (Elt / EltsPerLane))
      return SDValue();

    // Byte offset inside the lane: element offset scaled to bytes, plus the
    // position of this byte within its element.
    const int SrcByte =
        (SrcElt % EltsPerLane) * NumEltBytes + (Byte % NumEltBytes);
    PSHUFBMask[Byte] = DAG.getConstant(SrcByte, DL, MVT::i8);
  }
  assert(V && "Failed to find a source input");

  MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
  return DAG.getBitcast(
      VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
                      DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
}
// Forward declaration of a file-local helper whose definition is not part of
// this section. lowerShuffleToEXPAND below calls it with an integer constant
// node and a vector-of-i1 type to obtain the mask operand it passes to
// X86ISD::EXPAND.
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
const SDLoc &dl);
// X86 has a dedicated shuffle that can be lowered to VEXPAND.
//
// Succeeds only when isNonZeroElementsInOrder accepts the mask. The expand
// mask has a bit set for every element that is NOT zeroable; the expanded
// input is V2 when the zero side is on the left and V1 otherwise. Returns an
// empty SDValue when the pattern does not match.
static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
                                    const APInt &Zeroable,
                                    ArrayRef<int> Mask, SDValue &V1,
                                    SDValue &V2, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  bool IsLeftZeroSide = true;
  const bool InOrder = isNonZeroElementsInOrder(
      Zeroable, Mask, V1.getValueType(), IsLeftZeroSide);
  if (!InOrder)
    return SDValue();

  const unsigned NumElts = VT.getVectorNumElements();

  // Materialize the inverted zeroable bits as an integer constant that is at
  // least 8 bits wide.
  const unsigned VEXPANDMask = (~Zeroable).getZExtValue();
  MVT IntegerType = MVT::getIntegerVT(std::max((int)NumElts, 8));
  SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);

  assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
         "Unexpected number of vector elements");
  MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
  SDValue VMask = getMaskNode(MaskNode, MaskVT, Subtarget, DAG, DL);
  SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);

  SDValue ExpandedVector = V1;
  if (IsLeftZeroSide)
    ExpandedVector = V2;
  return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
}
// Try to match TargetMask against the unpack-lo / unpack-hi patterns produced
// by createUnpackShuffleMask for VT.
//
// On success returns true, stores X86ISD::UNPCKL or X86ISD::UNPCKH in
// UnpackOpcode, and rewrites V1 / V2 in place to the operands the unpack
// should use (the original inputs, an undef vector, a zero vector, or the
// inputs swapped). On failure returns false; V1, V2 and UnpackOpcode are only
// assigned on the paths that return true.
//
// IsUnary selects the unary pattern, in which V1 feeds both unpack operands.
static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
unsigned &UnpackOpcode, bool IsUnary,
ArrayRef<int> TargetMask,
const SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
int NumElts = VT.getVectorNumElements();
// Undef1/Zero1 summarize all even-indexed mask entries, Undef2/Zero2 all
// odd-indexed ones: "all undef" and "all undef-or-zero" respectively.
bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
for (int i = 0; i != NumElts; i += 2) {
int M1 = TargetMask[i + 0];
int M2 = TargetMask[i + 1];
Undef1 &= (SM_SentinelUndef == M1);
Undef2 &= (SM_SentinelUndef == M2);
Zero1 &= isUndefOrZero(M1);
Zero2 &= isUndefOrZero(M2);
}
// A mask whose even AND odd entries are all undef/zero is not expected here.
assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
"Zeroable shuffle detected");
// Attempt to match the target mask against the unpack lo/hi mask patterns.
SmallVector<int, 64> Unpckl, Unpckh;
createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
UnpackOpcode = X86ISD::UNPCKL;
// V2 must be computed before V1 is overwritten, since the unary case reads
// the original V1.
V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
return true;
}
createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
UnpackOpcode = X86ISD::UNPCKH;
// Same ordering constraint as for the UNPCKL case above.
V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
return true;
}
// If an unary shuffle, attempt to match as an unpack lo/hi with zero.
if (IsUnary && (Zero1 || Zero2)) {
// Don't bother if we can blend instead.
if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
return false;
bool MatchLo = true, MatchHi = true;
for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
int M = TargetMask[i];
// Ignore if the input is known to be zero or the index is undef.
if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
(M == SM_SentinelUndef))
continue;
MatchLo &= (M == Unpckl[i]);
MatchHi &= (M == Unpckh[i]);
}
if (MatchLo || MatchHi) {
UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
// The all-zero side is replaced by a real zero vector; the other side is
// the single input V1. V2 is assigned first because it reads V1.
V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
return true;
}
}
// If a binary shuffle, commute and try again.
if (!IsUnary) {
ShuffleVectorSDNode::commuteMask(Unpckl);
if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
UnpackOpcode = X86ISD::UNPCKL;
std::swap(V1, V2);
return true;
}
ShuffleVectorSDNode::commuteMask(Unpckh);
if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
UnpackOpcode = X86ISD::UNPCKH;
std::swap(V1, V2);
return true;
}
}
return false;
}
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
//
// Tries the binary unpack-lo and unpack-hi patterns for VT with the inputs in
// their given order, then the commuted patterns with the inputs swapped.
// Returns an empty SDValue if none of them matches.
static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
                                     ArrayRef<int> Mask, SDValue V1, SDValue V2,
                                     SelectionDAG &DAG) {
  // Index 0 holds the low pattern/opcode, index 1 the high one.
  SmallVector<int, 8> Patterns[2];
  const unsigned Opcodes[2] = {X86ISD::UNPCKL, X86ISD::UNPCKH};

  for (int Hi = 0; Hi != 2; ++Hi) {
    createUnpackShuffleMask(VT, Patterns[Hi], /* Lo = */ Hi == 0,
                            /* Unary = */ false);
    if (isShuffleEquivalent(V1, V2, Mask, Patterns[Hi]))
      return DAG.getNode(Opcodes[Hi], DL, VT, V1, V2);
  }

  // Commute and try again.
  for (int Hi = 0; Hi != 2; ++Hi) {
    ShuffleVectorSDNode::commuteMask(Patterns[Hi]);
    if (isShuffleEquivalent(V1, V2, Mask, Patterns[Hi]))
      return DAG.getNode(Opcodes[Hi], DL, VT, V2, V1);
  }

  return SDValue();
}
// Check that the first Size / Delta mask entries pass
// isSequentialOrUndefInRange starting at the truncated vector's first index
// with stride Delta, and that no later entry refers to the truncated vector.
// The truncated vector occupies indices [Size, 2 * Size) when SwappedOps is
// set and [0, Size) otherwise.
static bool matchVectorShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
                                      int Delta) {
  const int NumElts = (int)Mask.size();
  const int Split = NumElts / Delta;
  const int TruncatedVectorStart = SwappedOps ? NumElts : 0;

  // Match for mask starting with e.g.: <8, 10, 12, 14,... or <0, 2, 4, 6,...
  const bool PrefixMatches = isSequentialOrUndefInRange(
      Mask, 0, Split, TruncatedVectorStart, Delta);

  // The rest of the mask should not refer to the truncated vector's elements.
  return PrefixMatches &&
         !isAnyInRange(Mask.slice(Split, NumElts - Split), TruncatedVectorStart,
                       TruncatedVectorStart + NumElts);
}
// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
//
// An example is the following:
//
//   t0: ch = EntryToken
//   t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
//   t25: v4i32 = truncate t2
//   t41: v8i16 = bitcast t25
//   t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
//                Constant:i16<0>, Constant:i16<0>, Constant:i16<0>,
//                Constant:i16<0>
//   t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
//   t18: v2i64 = bitcast t51
//
// Without avx512vl, this is lowered to:
//
//   vpmovqd %zmm0, %ymm0
//   vpshufb {{.*#+}} xmm0 =
//   xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
//
// But when avx512vl is available, one can just use a single vpmovdw
// instruction.
//
// Returns an X86ISD::VTRUNC of the pre-truncation source on success and an
// empty SDValue otherwise.
static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
                                     MVT VT, SDValue V1, SDValue V2,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  if (VT != MVT::v16i8 && VT != MVT::v8i16)
    return SDValue();

  if (Mask.size() != VT.getVectorNumElements())
    return SDValue();

  // One input has to be an all-zeros build vector. Canonicalize it into V2
  // and remember whether the operands had to be swapped for that.
  const bool SwappedOps = !ISD::isBuildVectorAllZeros(V2.getNode());
  if (SwappedOps) {
    if (!ISD::isBuildVectorAllZeros(V1.getNode()))
      return SDValue();
    std::swap(V1, V2);
  }

  // Look for:
  //
  //   bitcast (truncate <8 x i32> %vec to <8 x i16>) to <16 x i8>
  //   bitcast (truncate <4 x i64> %vec to <4 x i32>) to <8 x i16>
  //
  // and similar ones.
  if (V1.getOpcode() != ISD::BITCAST)
    return SDValue();
  SDValue Trunc = V1.getOperand(0);
  if (Trunc.getOpcode() != ISD::TRUNCATE)
    return SDValue();

  SDValue Src = Trunc.getOperand(0);
  MVT SrcVT = Src.getSimpleValueType();

  // The vptrunc** instructions truncating 128 bit and 256 bit vectors
  // are only available with avx512vl.
  if (!SrcVT.is512BitVector() && !Subtarget.hasVLX())
    return SDValue();

  // Down Convert Word to Byte is only available with avx512bw. The case with
  // 256-bit output doesn't contain a shuffle and is therefore not handled here.
  if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
      !Subtarget.hasBWI())
    return SDValue();

  // The first half/quarter of the mask should refer to every second/fourth
  // element of the vector truncated and bitcasted.
  const bool MatchesStride = matchVectorShuffleAsVPMOV(Mask, SwappedOps, 2) ||
                             matchVectorShuffleAsVPMOV(Mask, SwappedOps, 4);
  if (!MatchesStride)
    return SDValue();

  return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
}
// X86 has dedicated pack instructions that can handle specific truncation
// operations: PACKSS and PACKUS.
//
// Tries to match TargetMask against the binary and then the unary pack
// pattern produced by createPackShuffleMask for VT. On success returns true
// and sets:
//   - SrcVT to the double-width, half-element-count integer vector type,
//   - V1 / V2 to the inputs bitcast to that type,
//   - PackOpcode to X86ISD::PACKUS or X86ISD::PACKSS.
// On failure returns false and leaves all of these outputs unassigned.
static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
SDValue &V2, unsigned &PackOpcode,
ArrayRef<int> TargetMask,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
unsigned NumElts = VT.getVectorNumElements();
unsigned BitSize = VT.getScalarSizeInBits();
// The pack source has elements twice as wide and half as many of them.
MVT PackSVT = MVT::getIntegerVT(BitSize * 2);
MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2);
// Decide whether packing N1/N2 reproduces their low halves, and if so record
// the chosen opcode and operands in the enclosing function's out-parameters.
auto MatchPACK = [&](SDValue N1, SDValue N2) {
SDValue VV1 = DAG.getBitcast(PackVT, N1);
SDValue VV2 = DAG.getBitcast(PackVT, N2);
// PACKUS is tried when the upper BitSize bits of every wide element are
// known zero (or the input is undef). It is only considered with SSE4.1 or
// when the wide element type is i16.
if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
(N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
V1 = VV1;
V2 = VV2;
SrcVT = PackVT;
PackOpcode = X86ISD::PACKUS;
return true;
}
}
// PACKSS is used when each wide element has more than BitSize known sign
// bits (or the input is undef).
if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
(N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
V1 = VV1;
V2 = VV2;
SrcVT = PackVT;
PackOpcode = X86ISD::PACKSS;
return true;
}
return false;
};
// Try binary shuffle.
SmallVector<int, 32> BinaryMask;
createPackShuffleMask(VT, BinaryMask, false);
if (isTargetShuffleEquivalent(TargetMask, BinaryMask))
if (MatchPACK(V1, V2))
return true;
// Try unary shuffle.
SmallVector<int, 32> UnaryMask;
createPackShuffleMask(VT, UnaryMask, true);
if (isTargetShuffleEquivalent(TargetMask, UnaryMask))
if (MatchPACK(V1, V1))
return true;
return false;
}
/// Try to lower a shuffle as a single PACKSS/PACKUS node.
///
/// Delegates the matching to matchVectorShuffleWithPACK, which chooses the
/// opcode and the source type and rewrites V1/V2. Returns an empty SDValue
/// when no pack pattern matches.
static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
                                    SDValue V1, SDValue V2, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  MVT PackVT;
  unsigned PackOpcode;
  const bool Matched = matchVectorShuffleWithPACK(VT, PackVT, V1, V2,
                                                  PackOpcode, Mask, DAG,
                                                  Subtarget);
  if (!Matched)
    return SDValue();

  return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1),
                     DAG.getBitcast(PackVT, V2));
}
/// Try to emit a bitmask instruction for a shuffle.
///
/// This handles cases where we can model a blend exactly as a bitmask due to
/// one of the inputs being zeroable: every non-zeroable element must stay in
/// its own position and all of them must come from the same input. The result
/// is that input ANDed with a constant vector holding all-ones in the kept
/// positions and zero elsewhere. Returns an empty SDValue otherwise.
static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
                                     SDValue V2, ArrayRef<int> Mask,
                                     const APInt &Zeroable,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  MVT MaskVT = VT;
  MVT EltVT = VT.getVectorElementType();

  // Use f64 if i64 isn't legal.
  if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
    EltVT = MVT::f64;
    MaskVT = MVT::getVectorVT(EltVT, Mask.size());
  }

  // Floating point masks are built from FP constants, but the AND itself is
  // done on the integer vector type of the same element width.
  SDValue Zero, AllOnes;
  MVT LogicVT = VT;
  const bool IsFPElt = EltVT == MVT::f32 || EltVT == MVT::f64;
  if (IsFPElt) {
    Zero = DAG.getConstantFP(0.0, DL, EltVT);
    AllOnes = DAG.getConstantFP(
        APFloat::getAllOnesValue(EltVT.getSizeInBits(), true), DL, EltVT);
    LogicVT =
        MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
  } else {
    Zero = DAG.getConstant(0, DL, EltVT);
    AllOnes = DAG.getAllOnesConstant(DL, EltVT);
  }

  SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
  SDValue V;
  const int NumElts = Mask.size();
  for (int Idx = 0; Idx < NumElts; ++Idx) {
    if (Zeroable[Idx])
      continue;

    const int M = Mask[Idx];
    if (M % NumElts != Idx)
      return SDValue(); // Not a blend.

    // Can only let one input through the mask.
    SDValue Src = M < NumElts ? V1 : V2;
    if (V && V != Src)
      return SDValue();
    V = Src;

    VMaskOps[Idx] = AllOnes;
  }

  if (!V)
    return SDValue(); // No non-zeroable elements!

  SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
  VMask = DAG.getBitcast(LogicVT, VMask);
  V = DAG.getBitcast(LogicVT, V);
  SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
  return DAG.getBitcast(VT, And);
}
/// Try to emit a blend instruction for a shuffle using bit math.
///
/// This is used as a fallback approach when first class blend instructions are
/// unavailable. Currently it is only suitable for integer vectors, but could
/// be generalized for floating point vectors if desirable.
///
/// The result is (V1 & M) | (~M & V2), where M holds all-ones for every
/// element taken from V1 (or undef) and zero for every element taken from V2.
/// Returns an empty SDValue if any element moves to a different position.
static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
                                      SDValue V2, ArrayRef<int> Mask,
                                      SelectionDAG &DAG) {
  assert(VT.isInteger() && "Only supports integer vector types!");
  MVT EltVT = VT.getVectorElementType();

  SDValue Zero = DAG.getConstant(0, DL, EltVT);
  SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);

  SmallVector<SDValue, 16> MaskOps;
  const int NumElts = Mask.size();
  for (int Idx = 0; Idx < NumElts; ++Idx) {
    const int M = Mask[Idx];
    const bool StaysInPlace = M < 0 || M == Idx || M == Idx + NumElts;
    if (!StaysInPlace)
      return SDValue(); // Shuffled input!
    MaskOps.push_back(M < NumElts ? AllOnes : Zero);
  }

  SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
  SDValue MaskedV1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
  SDValue MaskedV2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
  return DAG.getNode(ISD::OR, DL, VT, MaskedV1, MaskedV2);
}
// Forward declaration of a file-local helper whose definition is not part of
// this section. lowerShuffleAsBlend below calls it as
// getVectorMaskingNode(V2, MaskNode, V1, ...) to build a blend from an integer
// mask constant.
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc,
const X86Subtarget &Subtarget,
SelectionDAG &DAG);
/// Check whether TargetMask describes a per-element blend of V1 and V2.
///
/// On success returns true and BlendMask has bit i set for every element i
/// taken from V2. SM_SentinelZero entries are accepted only if one of the
/// inputs is undef or an all-zeros build vector; the entry is then rewritten in
/// TargetMask to select that input and the matching ForceV?Zero flag is set.
/// On failure returns false; BlendMask, the flags and TargetMask may already
/// have been partially updated.
static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
                                      MutableArrayRef<int> TargetMask,
                                      bool &ForceV1Zero, bool &ForceV2Zero,
                                      uint64_t &BlendMask) {
  const bool V1IsZeroOrUndef =
      V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
  const bool V2IsZeroOrUndef =
      V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());

  BlendMask = 0;
  ForceV1Zero = false;
  ForceV2Zero = false;
  assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");

  // Attempt to generate the binary blend mask. If an input is zero then
  // we can use any lane.
  // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
  const int NumElts = TargetMask.size();
  for (int Idx = 0; Idx < NumElts; ++Idx) {
    const int M = TargetMask[Idx];

    // Undef, or element Idx of V1: nothing to record.
    if (M == SM_SentinelUndef || M == Idx)
      continue;

    // Element Idx of V2.
    if (M == Idx + NumElts) {
      BlendMask |= 1ull << Idx;
      continue;
    }

    // Anything else that is not a zero request cannot be blended.
    if (M != SM_SentinelZero)
      return false;

    if (V1IsZeroOrUndef) {
      ForceV1Zero = true;
      TargetMask[Idx] = Idx;
      continue;
    }
    if (V2IsZeroOrUndef) {
      ForceV2Zero = true;
      BlendMask |= 1ull << Idx;
      TargetMask[Idx] = Idx + NumElts;
      continue;
    }
    return false;
  }
  return true;
}
/// Widen a blend mask so that each of its low \p Size bits is replicated
/// \p Scale times.
///
/// Bit i of \p BlendMask controls bits [i * Scale, (i + 1) * Scale) of the
/// result. For example, BlendMask = 0b101 with Scale = 2 yields 0b110011.
///
/// Shifting a 64-bit value by 64 or more is undefined behavior, so the
/// boundary cases are handled explicitly: a full-width run (Scale == 64) is
/// all-ones, and positions that would fall at or beyond bit 64 are ignored
/// because the result cannot represent them. For all inputs with
/// Size * Scale <= 64 the result is unchanged from a plain shift-and-or.
static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
                                            int Scale) {
  // A run of Scale consecutive set bits.
  const uint64_t LaneBits = Scale >= 64 ? ~0ull : (1ull << Scale) - 1;

  uint64_t ScaledMask = 0;
  for (int i = 0; i != Size; ++i) {
    // Stop before any shift amount reaches the width of the type.
    if (i >= 64 || i * Scale >= 64)
      break;
    if (BlendMask & (1ull << i))
      ScaledMask |= LaneBits << (i * Scale);
  }
  return ScaledMask;
}
/// Try to emit a blend instruction for a shuffle.
///
/// This doesn't do any checks for the availability of instructions for blending
/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
/// be matched in the backend with the type given. What it does check for is
/// that the shuffle mask is a blend, or convertible into a blend with zero.
///
/// \p Original is the plain shuffle mask; it is first merged with \p Zeroable
/// into a target shuffle mask. Returns an empty SDValue when that mask is not
/// a blend. Otherwise the lowering depends on \p VT: an immediate BLENDI, a
/// bitmask AND, a byte-wise select, or a masked move built by
/// getVectorMaskingNode.
static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Original,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
// Bit i of BlendMask is set when element i comes from V2.
uint64_t BlendMask = 0;
bool ForceV1Zero = false, ForceV2Zero = false;
if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
BlendMask))
return SDValue();
// Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
if (ForceV1Zero)
V1 = getZeroVector(VT, Subtarget, DAG, DL);
if (ForceV2Zero)
V2 = getZeroVector(VT, Subtarget, DAG, DL);
// The cases below deliberately fall through from wider to narrower types so
// that each subtarget-feature assertion is checked on the way down.
switch (VT.SimpleTy) {
case MVT::v4i64:
case MVT::v8i32:
assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
LLVM_FALLTHROUGH;
case MVT::v4f64:
case MVT::v8f32:
assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
LLVM_FALLTHROUGH;
case MVT::v2f64:
case MVT::v2i64:
case MVT::v4f32:
case MVT::v4i32:
case MVT::v8i16:
assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
DAG.getConstant(BlendMask, DL, MVT::i8));
case MVT::v16i16: {
assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
SmallVector<int, 8> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
// We can lower these with PBLENDW which is mirrored across 128-bit lanes.
assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
// Rebuild the 8-bit immediate from the per-lane repeated mask; entries of 8
// or more refer to the second input.
BlendMask = 0;
for (int i = 0; i < 8; ++i)
if (RepeatedMask[i] >= 8)
BlendMask |= 1ull << i;
return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
DAG.getConstant(BlendMask, DL, MVT::i8));
}
// Use PBLENDW for lower/upper lanes and then blend lanes.
// TODO - we should allow 2 PBLENDW here and leave shuffle combine to
// merge to VSELECT where useful.
uint64_t LoMask = BlendMask & 0xFF;
uint64_t HiMask = (BlendMask >> 8) & 0xFF;
// Only taken when at least one half selects a single input for all of its
// eight elements.
if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
DAG.getConstant(LoMask, DL, MVT::i8));
SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
DAG.getConstant(HiMask, DL, MVT::i8));
// Take elements 0-7 from Lo and elements 8-15 from Hi.
return DAG.getVectorShuffle(
MVT::v16i16, DL, Lo, Hi,
{0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
}
LLVM_FALLTHROUGH;
}
case MVT::v32i8:
assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
LLVM_FALLTHROUGH;
case MVT::v16i8: {
assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
// Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return Masked;
// With both BWI and VLX, build the blend from an integer mask constant
// instead of a byte-vector select.
if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
MVT IntegerType =
MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
}
// Scale the blend by the number of bytes per element.
int Scale = VT.getScalarSizeInBits() / 8;
// This form of blend is always done on bytes. Compute the byte vector
// type.
MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
// x86 allows load folding with blendvb from the 2nd source operand. But
// we are still using LLVM select here (see comment below), so that's V1.
// If V2 can be load-folded and V1 cannot be load-folded, then commute to
// allow that load-folding possibility.
if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
ShuffleVectorSDNode::commuteMask(Mask);
std::swap(V1, V2);
}
// Compute the VSELECT mask. Note that VSELECT is really confusing in the
// mix of LLVM's code generator and the x86 backend. We tell the code
// generator that boolean values in the elements of an x86 vector register
// are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
// mapping a select to operand #1, and 'false' mapping to operand #2. The
// reality in x86 is that vector masks (pre-AVX-512) use only the high bit
// of the element (the remaining are ignored) and 0 in that high bit would
// mean operand #1 while 1 in the high bit would mean operand #2. So while
// the LLVM model for boolean values in vector elements gets the relevant
// bit set, it is set backwards and over constrained relative to x86's
// actual model.
SmallVector<SDValue, 32> VSELECTMask;
// Each mask element contributes Scale identical condition bytes.
for (int i = 0, Size = Mask.size(); i < Size; ++i)
for (int j = 0; j < Scale; ++j)
VSELECTMask.push_back(
Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
: DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
MVT::i8));
V1 = DAG.getBitcast(BlendVT, V1);
V2 = DAG.getBitcast(BlendVT, V2);
return DAG.getBitcast(
VT,
DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
V1, V2));
}
case MVT::v16f32:
case MVT::v8f64:
case MVT::v8i64:
case MVT::v16i32:
case MVT::v32i16:
case MVT::v64i8: {
// Attempt to lower to a bitmask if we can. Only if not optimizing for size.
bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
if (!OptForSize) {
if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return Masked;
}
// Otherwise load an immediate into a GPR, cast to k-register, and use a
// masked move.
MVT IntegerType =
MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
}
default:
llvm_unreachable("Not a supported integer vector type!");
}
}
/// Try to lower as a blend of elements from two inputs followed by
/// a single-input permutation.
///
/// This matches the pattern where we can blend elements from two inputs and
/// then reduce the shuffle to a single-input permutation. That is possible
/// only if no two mask entries need the same element position from different
/// inputs. With \p ImmBlends set, byte-element blends are additionally
/// rejected unless canWidenShuffleElements accepts the blend mask.
static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
                                             SDValue V1, SDValue V2,
                                             ArrayRef<int> Mask,
                                             SelectionDAG &DAG,
                                             bool ImmBlends = false) {
  // We build up the blend mask while checking whether a blend is a viable way
  // to reduce the shuffle.
  SmallVector<int, 32> BlendMask(Mask.size(), -1);
  SmallVector<int, 32> PermuteMask(Mask.size(), -1);

  const int NumElts = Mask.size();
  for (int Idx = 0; Idx < NumElts; ++Idx) {
    const int M = Mask[Idx];
    if (M < 0)
      continue;
    assert(M < NumElts * 2 && "Shuffle input is out of bounds.");

    // The blend keeps every element in its own position, so position
    // M % NumElts can be fed by one input only.
    const int Pos = M % NumElts;
    int &Slot = BlendMask[Pos];
    if (Slot < 0)
      Slot = M;
    else if (Slot != M)
      return SDValue(); // Can't blend in the needed input!

    PermuteMask[Idx] = Pos;
  }

  // If only immediate blends, then bail if the blend mask can't be widened to
  // i16.
  unsigned EltSize = VT.getScalarSizeInBits();
  if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
    return SDValue();

  SDValue Blend = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
  return DAG.getVectorShuffle(VT, DL, Blend, DAG.getUNDEF(VT), PermuteMask);
}
/// Try to lower as an unpack of elements from two inputs followed by
/// a single-input permutation.
///
/// This matches the pattern where we can unpack elements from two inputs and
/// then reduce the shuffle to a single-input (wider) permutation.
///
/// Returns an empty SDValue when the even and odd result positions cannot each
/// be assigned a single input, when the mask mixes low-half and high-half
/// sources, or when a pair of adjacent elements does not come from the same
/// unpack pair.
static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
SDValue V1, SDValue V2,
ArrayRef<int> Mask,
SelectionDAG &DAG) {
int NumElts = Mask.size();
int NumLanes = VT.getSizeInBits() / 128;
int NumLaneElts = NumElts / NumLanes;
int NumHalfLaneElts = NumLaneElts / 2;
bool MatchLo = true, MatchHi = true;
// Ops[0] feeds the even result positions, Ops[1] the odd ones; both start out
// undef and are bound to V1 or V2 by the first element that uses them.
SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
// Determine UNPCKL/UNPCKH type and operand order.
for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
int M = Mask[Lane + Elt];
if (M < 0)
continue;
SDValue &Op = Ops[Elt & 1];
if (M < NumElts && (Op.isUndef() || Op == V1))
Op = V1;
else if (NumElts <= M && (Op.isUndef() || Op == V2))
Op = V2;
else
return SDValue();
// The source must lie in the low half of this lane for UNPCKL, or in the
// high half for UNPCKH, in either input.
int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
MatchLo &= isUndefOrInRange(M, Lo, Mid) ||
isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);
MatchHi &= isUndefOrInRange(M, Mid, Hi) ||
isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);
if (!MatchLo && !MatchHi)
return SDValue();
}
}
assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
// Now check that each pair of elts come from the same unpack pair
// and set the permute mask based on each pair.
// TODO - Investigate cases where we permute individual elements.
SmallVector<int, 32> PermuteMask(NumElts, -1);
for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {
int M0 = Mask[Lane + Elt + 0];
int M1 = Mask[Lane + Elt + 1];
if (0 <= M0 && 0 <= M1 &&
(M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))
return SDValue();
// After the unpack, source element k of a half lane sits at position 2 * k
// (from Ops[0]) or 2 * k + 1 (from Ops[1]) within the lane.
if (0 <= M0)
PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));
if (0 <= M1)
PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;
}
}
unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
}
/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
/// permuting the elements of the result in place.
///
/// The in-lane element indices referenced from V1 and from V2 (collected over
/// all lanes) must form two non-overlapping ranges, so that one byte rotate of
/// the two inputs exposes both ranges; a single-input shuffle of the rotate
/// then puts each element in place. Returns an empty SDValue when the pattern
/// does not apply.
static SDValue lowerShuffleAsByteRotateAndPermute(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  // The rotate needs SSSE3 / AVX2 / BWI for 128 / 256 / 512-bit vectors.
  const bool CanRotate = (!VT.is128BitVector() || Subtarget.hasSSSE3()) &&
                         (!VT.is256BitVector() || Subtarget.hasAVX2()) &&
                         (!VT.is512BitVector() || Subtarget.hasBWI());
  if (!CanRotate)
    return SDValue();

  // We don't currently support lane crossing permutes.
  if (is128BitLaneCrossingShuffleMask(VT, Mask))
    return SDValue();

  const int Scale = VT.getScalarSizeInBits() / 8;
  const int NumLanes = VT.getSizeInBits() / 128;
  const int NumElts = VT.getVectorNumElements();
  const int NumEltsPerLane = NumElts / NumLanes;

  // Determine range of mask elts: for each input, the span of in-lane indices
  // that are referenced, and whether every reference already sits at its
  // final position (i.e. that input is only blended, not moved).
  bool Blend1 = true;
  bool Blend2 = true;
  std::pair<int, int> Range1(INT_MAX, INT_MIN);
  std::pair<int, int> Range2(INT_MAX, INT_MIN);
  auto Record = [&](int M, int Lane, int Pos, bool &Blend,
                    std::pair<int, int> &Range) {
    Blend &= (M == Pos);
    assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
    const int InLane = M % NumEltsPerLane;
    Range.first = std::min(Range.first, InLane);
    Range.second = std::max(Range.second, InLane);
  };

  for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
    for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
      const int Pos = Lane + Elt;
      const int M = Mask[Pos];
      if (M < 0)
        continue;
      if (M < NumElts)
        Record(M, Lane, Pos, Blend1, Range1);
      else
        Record(M - NumElts, Lane, Pos, Blend2, Range2);
    }
  }

  // Bail if we don't need both elements.
  // TODO - it might be worth doing this for unary shuffles if the permute
  // can be widened.
  const bool UsesV1 = 0 <= Range1.first && Range1.second < NumEltsPerLane;
  const bool UsesV2 = 0 <= Range2.first && Range2.second < NumEltsPerLane;
  if (!UsesV1 || !UsesV2)
    return SDValue();

  // For vectors wider than 128 bits, give up when either input is only
  // blended in place.
  if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
    return SDValue();

  // Rotate the 2 ops so we can access both ranges, then permute the result.
  auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
    MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
    SDValue Rotate = DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
                        DAG.getBitcast(ByteVT, Lo),
                        DAG.getConstant(Scale * RotAmt, DL, MVT::i8)));

    SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
    for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
      for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
        const int M = Mask[Lane + Elt];
        if (M < 0)
          continue;
        // V1 references are biased by +Ofs and V2 references by -Ofs before
        // the rotation amount is removed and the index wrapped into the lane.
        const int Bias = M < NumElts ? Ofs : -Ofs;
        PermMask[Lane + Elt] = Lane + ((M + Bias - RotAmt) % NumEltsPerLane);
      }
    }
    return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
  };

  // Check if the ranges are small enough to rotate from either direction.
  if (Range2.second < Range1.first)
    return RotateAndPermute(V1, V2, Range1.first, 0);
  if (Range1.second < Range2.first)
    return RotateAndPermute(V2, V1, Range2.first, NumElts);
  return SDValue();
}
/// Generic routine to decompose a shuffle and blend into independent
/// blends and permutes.
///
/// This matches the extremely common pattern for handling combined
/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
/// operations. It will try to pick the best arrangement of shuffles and
/// blends.
static SDValue lowerShuffleAsDecomposedShuffleBlend(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  const int Size = Mask.size();

  // Split the mask into a single-input shuffle for each of V1 and V2 that
  // moves the needed elements into their final positions, plus a blend that
  // picks, per position, which shuffled input to take.
  SmallVector<int, 32> V1Mask(Size, -1);
  SmallVector<int, 32> V2Mask(Size, -1);
  SmallVector<int, 32> BlendMask(Size, -1);
  for (int i = 0; i != Size; ++i) {
    const int M = Mask[i];
    if (M < 0)
      continue;
    if (M < Size) {
      V1Mask[i] = M;
      BlendMask[i] = i;
    } else {
      V2Mask[i] = M - Size;
      BlendMask[i] = i + Size;
    }
  }

  // Try to lower with the simpler initial blend/unpack/rotate strategies unless
  // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
  // the shuffle may be able to fold with a load or other benefit. However, when
  // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
  // pre-shuffle first is a better strategy.
  const bool BothInputsShuffled =
      !isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask);
  if (BothInputsShuffled) {
    // Only prefer immediate blends to unpack/rotate.
    SDValue Res = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG,
                                                /*ImmBlends=*/true);
    if (Res)
      return Res;

    Res = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG);
    if (Res)
      return Res;

    Res = lowerShuffleAsByteRotateAndPermute(DL, VT, V1, V2, Mask, Subtarget,
                                             DAG);
    if (Res)
      return Res;

    // Unpack/rotate failed - try again with variable blends.
    Res = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG);
    if (Res)
      return Res;
  }

  // Fall back to shuffling each input on its own and blending the results.
  V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
  V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
  return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
}
/// Try to match a vector shuffle as a rotation.
///
/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
///
/// On a match, returns the rotation amount in elements and overwrites V1 and
/// V2 with the inputs that supply the low and high parts respectively (both
/// are set to the same value when only one input is referenced). Returns -1
/// and leaves V1/V2 untouched when the mask is an identity or is not a
/// consistent rotation.
static int matchShuffleAsRotate(SDValue &V1, SDValue &V2, ArrayRef<int> Mask) {
  const int NumElts = Mask.size();

  // We need to detect various ways of spelling a rotation:
  //   [11, 12, 13, 14, 15,  0,  1,  2]
  //   [-1, 12, 13, 14, -1, -1,  1, -1]
  //   [-1, -1, -1, -1, -1, -1,  1,  2]
  //   [ 3,  4,  5,  6,  7,  8,  9, 10]
  //   [-1,  4,  5,  6, -1, -1,  9, -1]
  //   [-1,  4,  5,  6, -1, -1, -1, -1]
  int Rotation = 0;
  SDValue Lo, Hi;
  for (int i = 0; i < NumElts; ++i) {
    const int M = Mask[i];
    assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
           "Unexpected mask index.");
    if (M < 0)
      continue;

    // Determine where a rotated vector would have started.
    const int StartIdx = i - (M % NumElts);
    if (StartIdx == 0)
      return -1; // The identity rotation isn't interesting, stop.

    // If we found the tail of a vector the rotation must be the missing
    // front. If we found the head of a vector, it must be how much of the
    // head.
    const bool FoundTail = StartIdx < 0;
    const int CandidateRotation = FoundTail ? -StartIdx : NumElts - StartIdx;

    // Every defined element must agree on a single rotation amount.
    if (Rotation != 0 && Rotation != CandidateRotation)
      return -1;
    Rotation = CandidateRotation;

    // Compute which value this mask is pointing at, and which of the two
    // target values (high elements remaining vs. low elements remaining) it
    // should be assigned to.
    SDValue MaskV = M < NumElts ? V1 : V2;
    SDValue &TargetV = FoundTail ? Hi : Lo;

    // Either set up this value if we've not encountered it before, or check
    // that it remains consistent. A mismatch may still be a rotation, but one
    // that pulls from the inputs in some unsupported interleaving.
    if (TargetV && TargetV != MaskV)
      return -1;
    TargetV = MaskV;
  }

  // Check that we successfully analyzed the mask, and normalize the results.
  assert(Rotation != 0 && "Failed to locate a viable rotation!");
  assert((Lo || Hi) && "Failed to find a rotated input vector!");
  if (!Lo)
    Lo = Hi;
  else if (!Hi)
    Hi = Lo;

  V1 = Lo;
  V2 = Hi;
  return Rotation;
}
/// Try to match a vector shuffle as a byte rotation.
///
/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
/// try to generically match a vector shuffle against such a pattern. It
/// does not check for the profitability of lowering either as PALIGNR or
/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
/// This matches shuffle vectors that look like:
///
///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
///
/// Returns the rotation amount in bytes, or -1 if the mask does not match.
static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
                                    ArrayRef<int> Mask) {
  // Don't accept any shuffles with zero elements.
  for (int M : Mask)
    if (M == SM_SentinelZero)
      return -1;

  // PALIGNR works on 128-bit lanes, so every lane must use the same pattern.
  SmallVector<int, 16> RepeatedMask;
  if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
    return -1;

  const int Rotation = matchShuffleAsRotate(V1, V2, RepeatedMask);
  if (Rotation <= 0)
    return -1;

  // PALIGNR rotates bytes, so we need to scale the
  // rotation based on how many bytes are in the vector lane.
  const int NumElts = RepeatedMask.size();
  const int BytesPerElt = 16 / NumElts;
  return Rotation * BytesPerElt;
}
/// Lower a vector shuffle as a byte rotation of its inputs.
///
/// Uses PALIGNR when SSSE3 is available; otherwise (128-bit vectors only)
/// emits a byte shift left, a byte shift right and an OR. Returns an empty
/// SDValue when the mask is not a byte rotation.
static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
                                        SDValue V2, ArrayRef<int> Mask,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");

  SDValue Lo = V1, Hi = V2;
  const int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
  if (ByteRotation <= 0)
    return SDValue();

  // Cast the inputs to i8 vector of correct length to match PALIGNR or
  // PSLLDQ/PSRLDQ.
  MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
  Lo = DAG.getBitcast(ByteVT, Lo);
  Hi = DAG.getBitcast(ByteVT, Hi);

  if (!Subtarget.hasSSSE3()) {
    assert(VT.is128BitVector() &&
           "Rotate-based lowering only supports 128-bit lowering!");
    assert(Mask.size() <= 16 &&
           "Can shuffle at most 16 bytes in a 128-bit vector!");
    assert(ByteVT == MVT::v16i8 &&
           "SSE2 rotate lowering only needed for v16i8!");

    // Default SSE2 implementation: shift the two inputs in opposite
    // directions and OR the pieces together.
    const int LoByteShift = 16 - ByteRotation;
    const int HiByteShift = ByteRotation;
    SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
                                  DAG.getConstant(LoByteShift, DL, MVT::i8));
    SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
                                  DAG.getConstant(HiByteShift, DL, MVT::i8));
    SDValue Merged = DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift);
    return DAG.getBitcast(VT, Merged);
  }

  // SSSE3 targets can use the palignr instruction.
  assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
         "512-bit PALIGNR requires BWI instructions");
  SDValue Amt = DAG.getConstant(ByteRotation, DL, MVT::i8);
  SDValue Rotated = DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi, Amt);
  return DAG.getBitcast(VT, Rotated);
}
/// Try to lower a vector shuffle as a dword/qword rotation.
///
/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
/// rotation of the concatenation of two vectors; this routine will
/// try to generically lower a vector shuffle through such a pattern.
///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
///
/// Returns an empty SDValue when the mask is not a rotation.
static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1,
                                    SDValue V2, ArrayRef<int> Mask,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {
  assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
         "Only 32-bit and 64-bit elements are supported!");

  // 128/256-bit vectors are only supported with VLX.
  assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
         && "VLX required for 128/256-bit vectors");

  // The matcher rewrites Lo/Hi to the operands the rotate should use.
  SDValue Lo = V1;
  SDValue Hi = V2;
  const int Rotation = matchShuffleAsRotate(Lo, Hi, Mask);
  if (Rotation > 0) {
    SDValue Amt = DAG.getConstant(Rotation, DL, MVT::i8);
    return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi, Amt);
  }
  return SDValue();
}
/// Try to lower a vector shuffle as a byte shift sequence.
///
/// Matches 128-bit shuffles that have zeroable elements at one or both ends
/// and, in between, a sequential (or undef) run of elements taken from a
/// single input. Such shuffles are emitted as two or three VSHLDQ/VSRLDQ byte
/// shifts. Returns an empty SDValue when the pattern does not apply.
static SDValue lowerVectorShuffleAsByteShiftMask(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
  assert(VT.is128BitVector() && "Only 128-bit vectors supported");

  // We need a shuffle that has zeros at one/both ends and a sequential
  // shuffle from one source within.
  const unsigned ZeroLo = Zeroable.countTrailingOnes();
  const unsigned ZeroHi = Zeroable.countLeadingOnes();
  if (ZeroLo == 0 && ZeroHi == 0)
    return SDValue();

  const unsigned NumElts = Mask.size();
  const unsigned Len = NumElts - (ZeroLo + ZeroHi);
  if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
    return SDValue();

  const unsigned Scale = VT.getScalarSizeInBits() / 8;

  // The non-zero stub must come entirely from V1 or entirely from V2.
  ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
  if (!(isUndefOrInRange(StubMask, 0, NumElts) ||
        isUndefOrInRange(StubMask, NumElts, 2 * NumElts)))
    return SDValue();

  SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
  Res = DAG.getBitcast(MVT::v16i8, Res);

  // Emit one whole-vector byte shift of In by EltCount elements.
  auto ByteShift = [&](unsigned Opc, SDValue In, unsigned EltCount) {
    return DAG.getNode(Opc, DL, MVT::v16i8, In,
                       DAG.getConstant(Scale * EltCount, DL, MVT::i8));
  };

  // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
  // inner sequential set of elements, possibly offset:
  // 01234567 --> zzzzzz01 --> 1zzzzzzz
  // 01234567 --> 4567zzzz --> zzzzz456
  // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
  if (ZeroLo == 0) {
    const unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
    Res = ByteShift(X86ISD::VSHLDQ, Res, Shift);
    Res = ByteShift(X86ISD::VSRLDQ, Res, ZeroHi);
    return DAG.getBitcast(VT, Res);
  }

  if (ZeroHi == 0) {
    const unsigned Shift = Mask[ZeroLo] % NumElts;
    Res = ByteShift(X86ISD::VSRLDQ, Res, Shift);
    Res = ByteShift(X86ISD::VSHLDQ, Res, ZeroLo);
    return DAG.getBitcast(VT, Res);
  }

  // Zeros at both ends. With SSSE3 we leave this to other lowerings.
  if (Subtarget.hasSSSE3())
    return SDValue();

  // If we don't have PSHUFB then its worth avoiding an AND constant mask
  // by performing 3 byte shifts. Shuffle combining can kick in above that.
  // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
  unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
  Res = ByteShift(X86ISD::VSHLDQ, Res, Shift);
  Shift += Mask[ZeroLo] % NumElts;
  Res = ByteShift(X86ISD::VSRLDQ, Res, Shift);
  Res = ByteShift(X86ISD::VSHLDQ, Res, ZeroLo);
  return DAG.getBitcast(VT, Res);
}
/// Try to match a vector shuffle as a bit shift (shifts in zeros).
///
/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
/// matches elements from one of the input vectors shuffled to the left or
/// right with zeroable elements 'shifted in'. It handles both the strictly
/// bit-wise element shifts and the byte shift across an entire 128-bit double
/// quad word lane.
///
/// PSLL : (little-endian) left bit shift.
/// [ zz, 0, zz, 2 ]
/// [ -1, 4, zz, -1 ]
/// PSRL : (little-endian) right bit shift.
/// [ 1, zz, 3, zz]
/// [ -1, -1, 7, zz]
/// PSLLDQ : (little-endian) left byte shift
/// [ zz, 0, 1, 2, 3, 4, 5, 6]
/// [ zz, zz, -1, -1, 2, 3, 4, -1]
/// [ zz, zz, zz, zz, zz, zz, -1, 1]
/// PSRLDQ : (little-endian) right byte shift
/// [ 5, 6, 7, zz, zz, zz, zz, zz]
/// [ -1, 5, 6, 7, zz, zz, zz, zz]
/// [ 1, 2, -1, -1, -1, -1, zz, zz]
///
/// \p MaskOffset is added to the expected source indices, which lets a caller
/// match against the second shuffle input by passing the mask size.
///
/// On success, returns the shift amount (in bits for VSHLI/VSRLI, in bytes for
/// VSHLDQ/VSRLDQ) and sets \p ShiftVT and \p Opcode to the vector type and
/// node opcode to shift with. Returns -1 when nothing matches.
static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
unsigned ScalarSizeInBits, ArrayRef<int> Mask,
int MaskOffset, const APInt &Zeroable,
const X86Subtarget &Subtarget) {
int Size = Mask.size();
unsigned SizeInBits = Size * ScalarSizeInBits;
// Check that, in every group of Scale elements, the Shift elements that the
// shift would fill in (at the low end for a left shift, at the high end for
// a right shift) are all zeroable.
auto CheckZeros = [&](int Shift, int Scale, bool Left) {
for (int i = 0; i < Size; i += Scale)
for (int j = 0; j < Shift; ++j)
if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
return false;
return true;
};
// Check that the surviving elements of every group are sequential (or undef)
// from the expected source position. On a match, fill in Opcode and ShiftVT
// and return the shift amount; otherwise return -1.
auto MatchShift = [&](int Shift, int Scale, bool Left) {
for (int i = 0; i != Size; i += Scale) {
unsigned Pos = Left ? i + Shift : i;
unsigned Low = Left ? i : i + Shift;
unsigned Len = Scale - Shift;
if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
return -1;
}
// Groups wider than 64 bits cannot be shifted as integer elements and use
// the whole-lane byte shifts instead.
int ShiftEltBits = ScalarSizeInBits * Scale;
bool ByteShift = ShiftEltBits > 64;
Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
: (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
// Byte shifts take their amount in bytes, bit shifts in bits.
int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
// Normalize the scale for byte shifts to still produce an i64 element
// type.
Scale = ByteShift ? Scale / 2 : Scale;
// We need to round trip through the appropriate type for the shift.
MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
: MVT::getVectorVT(ShiftSVT, Size / Scale);
return (int)ShiftAmt;
};
// SSE/AVX supports logical shifts up to 64-bit integers - so we can just
// keep doubling the size of the integer elements up to that. We can
// then shift the elements of the integer vector by whole multiples of
// their width within the elements of the larger integer vector. Test each
// multiple to see if we can find a match with the moved element indices
// and that the shifted in elements are all zeroable.
// 512-bit vectors without BWI are limited to 64-bit groups, i.e. the 128-bit
// byte-shift forms are not tried for them.
unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
// Search order: smallest group first, then smallest shift, trying a left
// shift before a right shift; the first match wins.
for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
for (int Shift = 1; Shift != Scale; ++Shift)
for (bool Left : {true, false})
if (CheckZeros(Shift, Scale, Left)) {
int ShiftAmt = MatchShift(Shift, Scale, Left);
if (0 < ShiftAmt)
return ShiftAmt;
}
// no match
return -1;
}
/// Lower a vector shuffle as a logical bit/byte shift of one of its inputs.
///
/// The mask is first matched as a shift of V1 and, if that fails, as a shift
/// of V2. Returns an empty SDValue when neither input matches.
static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
                                   SDValue V2, ArrayRef<int> Mask,
                                   const APInt &Zeroable,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  const int Size = Mask.size();
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

  // Both are filled in by the matcher on success.
  MVT ShiftVT;
  unsigned Opcode;

  // Try to match shuffle against V1 shift.
  SDValue Src = V1;
  int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
                                     Mask, /*MaskOffset=*/0, Zeroable,
                                     Subtarget);

  // If V1 failed, try to match shuffle against V2 shift.
  if (ShiftAmt < 0) {
    Src = V2;
    ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
                                   Mask, /*MaskOffset=*/Size, Zeroable,
                                   Subtarget);
    if (ShiftAmt < 0)
      return SDValue();
  }

  assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
         "Illegal integer vector type");

  // Round trip through the shift type: bitcast, shift, bitcast back.
  SDValue Casted = DAG.getBitcast(ShiftVT, Src);
  SDValue Shifted = DAG.getNode(Opcode, DL, ShiftVT, Casted,
                                DAG.getConstant(ShiftAmt, DL, MVT::i8));
  return DAG.getBitcast(VT, Shifted);
}
// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
// Remainder of lower half result is zero and upper half is all undef.
//
// On a match, returns true, replaces V1 with the source vector and stores the
// extraction length and start position, in bits and masked to 6 bits, in
// BitLen and BitIdx. Returns false otherwise without modifying V1.
static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
                                ArrayRef<int> Mask, uint64_t &BitLen,
                                uint64_t &BitIdx, const APInt &Zeroable) {
  const int Size = Mask.size();
  const int HalfSize = Size / 2;
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
  assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");

  // Upper half must be undefined.
  if (!isUndefUpperHalf(Mask))
    return false;

  // Determine the extraction length from the part of the
  // lower half that isn't zeroable: trim zeroable elements off the top.
  int Len = HalfSize;
  while (Len > 0 && Zeroable[Len - 1])
    --Len;
  assert(Len > 0 && "Zeroable shuffle mask");

  // Attempt to match first Len sequential elements from the lower half.
  SDValue Src;
  int Idx = -1;
  for (int i = 0; i != Len; ++i) {
    int M = Mask[i];
    if (M == SM_SentinelUndef)
      continue;

    SDValue Candidate = M < Size ? V1 : V2;
    M = M % Size;

    // The extracted elements must start at a valid index and all mask
    // elements must be in the lower half.
    if (i > M || M >= HalfSize)
      return false;

    // Every defined element must agree on the source and the start index.
    const int StartIdx = M - i;
    if (Idx >= 0 && !(Src == Candidate && Idx == StartIdx))
      return false;
    Src = Candidate;
    Idx = StartIdx;
  }

  if (!Src || Idx < 0)
    return false;

  assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
  BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
  BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
  V1 = Src;
  return true;
}
// INSERTQ: Extract lowest Len elements from lower half of second source and
// insert over first source, starting at Idx.
// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
//
// On a match, returns true, sets V1 to the base vector (A above; left as a
// null SDValue if no base element is referenced), sets V2 to the inserted
// vector (B above), and stores the insertion length and position, in bits and
// masked to 6 bits, in BitLen and BitIdx. Returns false otherwise, leaving
// V1/V2 untouched.
static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
ArrayRef<int> Mask, uint64_t &BitLen,
uint64_t &BitIdx) {
int Size = Mask.size();
int HalfSize = Size / 2;
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
// Upper half must be undefined.
if (!isUndefUpperHalf(Mask))
return false;
// Try every insertion point in the lower half, lowest first.
for (int Idx = 0; Idx != HalfSize; ++Idx) {
SDValue Base;
// Attempt to match first source from mask before insertion point.
if (isUndefInRange(Mask, 0, Idx)) {
/* EMPTY */
} else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
Base = V1;
} else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
Base = V2;
} else {
continue;
}
// Extend the extraction length looking to match both the insertion of
// the second source and the remaining elements of the first.
for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
SDValue Insert;
int Len = Hi - Idx;
// Match insertion.
// The inserted elements must be the lowest Len elements of V1 or of V2.
if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
Insert = V1;
} else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
Insert = V2;
} else {
continue;
}
// Match the remaining elements of the lower half.
// These must be in-place elements of the same vector as any base found
// before the insertion point (or of either vector if none was found).
if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
/* EMPTY */
} else if ((!Base || (Base == V1)) &&
isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
Base = V1;
} else if ((!Base || (Base == V2)) &&
isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
Size + Hi)) {
Base = V2;
} else {
continue;
}
BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
V1 = Base;
V2 = Insert;
return true;
}
}
return false;
}
/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
///
/// The EXTRQ form is tried first, then the INSERTQ form. Returns an empty
/// SDValue when the mask matches neither.
static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
                                     SDValue V2, ArrayRef<int> Mask,
                                     const APInt &Zeroable, SelectionDAG &DAG) {
  // Filled in by whichever matcher succeeds.
  uint64_t BitLen, BitIdx;

  if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) {
    SDValue Ops[] = {V1, DAG.getConstant(BitLen, DL, MVT::i8),
                     DAG.getConstant(BitIdx, DL, MVT::i8)};
    return DAG.getNode(X86ISD::EXTRQI, DL, VT, Ops);
  }

  if (!matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
    return SDValue();

  // The matcher may leave either operand null; substitute undef for those.
  SDValue Ops[] = {V1 ? V1 : DAG.getUNDEF(VT), V2 ? V2 : DAG.getUNDEF(VT),
                   DAG.getConstant(BitLen, DL, MVT::i8),
                   DAG.getConstant(BitIdx, DL, MVT::i8)};
  return DAG.getNode(X86ISD::INSERTQI, DL, VT, Ops);
}
/// Lower a vector shuffle as a zero or any extension.
///
/// Given a specific number of elements, element bit width, and extension
/// stride, produce either a zero or any extension based on the available
/// features of the subtarget. The extended elements are consecutive and
/// can start from an offset element index in the input; to avoid excess
/// shuffling the offset must either be in the bottom lane or at the start
/// of a higher lane. All extended elements must be from the same lane.
///
/// \p Scale is the extension factor (each source element becomes \p Scale
/// result elements wide), \p Offset is the index of the first source element
/// in \p InputV, and \p AnyExt says whether the extended bits may be left
/// undefined rather than zeroed. May return an empty SDValue when the SSE4.1
/// path decides the extension is not worthwhile.
static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(Scale > 1 && "Need a scale to extend.");
int EltBits = VT.getScalarSizeInBits();
int NumElements = VT.getVectorNumElements();
int NumEltsPerLane = 128 / EltBits;
int OffsetLane = Offset / NumEltsPerLane;
assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
"Only 8, 16, and 32 bit elements can be extended.");
assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
assert(0 <= Offset && "Extension offset must be positive.");
assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
"Extension offset must be in the first lane or start an upper lane.");
// Check that an index is in same lane as the base offset.
auto SafeOffset = [&](int Idx) {
return OffsetLane == (Idx / NumEltsPerLane);
};
// Shift along an input so that the offset base moves to the first element.
// Source elements that fall outside the offset's lane are left undef.
auto ShuffleOffset = [&](SDValue V) {
if (!Offset)
return V;
SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
for (int i = 0; i * Scale < NumElements; ++i) {
int SrcIdx = i + Offset;
ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
}
return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
};
// Found a valid zext mask! Try various lowering strategies based on the
// input type and available ISA extensions.
// TODO: Add AnyExt support.
// With SSE4.1 a zero-extend-in-vector node is emitted directly, after moving
// the offset element to the front; this path is used even when AnyExt is set.
if (Subtarget.hasSSE41()) {
// Not worth offsetting 128-bit vectors if scale == 2, a pattern using
// PUNPCK will catch this in a later shuffle match.
if (Offset && Scale == 2 && VT.is128BitVector())
return SDValue();
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
NumElements / Scale);
InputV = ShuffleOffset(InputV);
InputV = getExtendInVec(ISD::ZERO_EXTEND, DL, ExtVT, InputV, DAG);
return DAG.getBitcast(VT, InputV);
}
assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
// For any extends we can cheat for larger element sizes and use shuffle
// instructions that can fold with a load and/or copy.
// 32-bit elements: one PSHUFD places the (up to) two source dwords in the
// even dword positions and leaves the odd positions undef.
if (AnyExt && EltBits == 32) {
int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
-1};
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
DAG.getBitcast(MVT::v4i32, InputV),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
}
// 16-bit elements extended by more than 2x: a PSHUFD moves the dwords that
// contain the source words into place, followed by a PSHUFLW/PSHUFHW chosen
// by the parity of Offset.
if (AnyExt && EltBits == 16 && Scale > 2) {
int PSHUFDMask[4] = {Offset / 2, -1,
SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
DAG.getBitcast(MVT::v4i32, InputV),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
int PSHUFWMask[4] = {1, -1, -1, -1};
unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
return DAG.getBitcast(
VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
DAG.getBitcast(MVT::v8i16, InputV),
getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
}
// The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
// to 64-bits.
if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
assert(VT.is128BitVector() && "Unexpected vector width!");
// Extract the element at Offset as the low 64-bit result element.
int LoIdx = Offset * EltBits;
SDValue Lo = DAG.getBitcast(
MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
DAG.getConstant(EltBits, DL, MVT::i8),
DAG.getConstant(LoIdx, DL, MVT::i8)));
// The second element is only needed if the upper half of the result is
// used and the next source element is still in the offset's lane.
if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
return DAG.getBitcast(VT, Lo);
int HiIdx = (Offset + 1) * EltBits;
SDValue Hi = DAG.getBitcast(
MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
DAG.getConstant(EltBits, DL, MVT::i8),
DAG.getConstant(HiIdx, DL, MVT::i8)));
return DAG.getBitcast(VT,
DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
}
// If this would require more than 2 unpack instructions to expand, use
// pshufb when available. We can only use more than 2 unpack instructions
// when zero extending i8 elements which also makes it easier to use pshufb.
if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
assert(NumElements == 16 && "Unexpected byte vector width!");
SDValue PSHUFBMask[16];
// Every Scale-th result byte selects a source byte; every other byte (and
// any source index outside the offset's lane) gets the 0x80 control value.
for (int i = 0; i < 16; ++i) {
int Idx = Offset + (i / Scale);
PSHUFBMask[i] = DAG.getConstant(
(i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
}
InputV = DAG.getBitcast(MVT::v16i8, InputV);
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
}
// If we are extending from an offset, ensure we start on a boundary that
// we can unpack from.
int AlignToUnpack = Offset % (NumElements / Scale);
if (AlignToUnpack) {
SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
for (int i = AlignToUnpack; i < NumElements; ++i)
ShMask[i - AlignToUnpack] = i;
InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
Offset -= AlignToUnpack;
}
// Otherwise emit a sequence of unpacks.
// Each iteration unpacks the input with zero (or undef for any-extend),
// which halves Scale and NumElements and doubles EltBits. UNPCKH is used
// while the remaining offset lies in the upper half of the vector.
do {
unsigned UnpackLoHi = X86ISD::UNPCKL;
if (Offset >= (NumElements / 2)) {
UnpackLoHi = X86ISD::UNPCKH;
Offset -= (NumElements / 2);
}
MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
: getZeroVector(InputVT, Subtarget, DAG, DL);
InputV = DAG.getBitcast(InputVT, InputV);
InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
Scale /= 2;
EltBits *= 2;
NumElements /= 2;
} while (Scale > 1);
return DAG.getBitcast(VT, InputV);
}
/// Try to lower a vector shuffle as a zero extension on any microarch.
///
/// This routine will try to do everything in its power to cleverly lower
/// a shuffle which happens to match the pattern of a zero extend. It doesn't
/// check for the profitability of this lowering, it tries to aggressively
/// match this pattern. It will use all of the micro-architectural details it
/// can to emit an efficient lowering. It handles both blends with all-zero
/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
/// masking out later).
///
/// The reason we have dedicated lowering for zext-style shuffles is that they
/// are both incredibly common and often quite performance sensitive.
///
/// Returns an empty SDValue when no extension pattern (nor the 128-bit
/// "zero-extend the low half" fallback) matches.
static SDValue lowerShuffleAsZeroOrAnyExtend(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
int Bits = VT.getSizeInBits();
int NumLanes = Bits / 128;
int NumElements = VT.getVectorNumElements();
int NumEltsPerLane = NumElements / NumLanes;
assert(VT.getScalarSizeInBits() <= 32 &&
"Exceeds 32-bit integer zero extension limit");
assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
// Define a helper function to check a particular ext-scale and lower to it if
// valid.
auto Lower = [&](int Scale) -> SDValue {
// InputV/Offset: the single source vector and the index of its first
// extended element. AnyExt stays true only while no defined mask element
// has been seen in an extension (non-base) position. Matches counts the
// defined base elements.
SDValue InputV;
bool AnyExt = true;
int Offset = 0;
int Matches = 0;
for (int i = 0; i < NumElements; ++i) {
int M = Mask[i];
if (M < 0)
continue; // Valid anywhere but doesn't tell us anything.
if (i % Scale != 0) {
// Each of the extended elements need to be zeroable.
if (!Zeroable[i])
return SDValue();
// We no longer are in the anyext case.
AnyExt = false;
continue;
}
// Each of the base elements needs to be consecutive indices into the
// same input vector.
SDValue V = M < NumElements ? V1 : V2;
M = M % NumElements;
if (!InputV) {
// The first defined base element fixes the input and the offset.
InputV = V;
Offset = M - (i / Scale);
} else if (InputV != V)
return SDValue(); // Flip-flopping inputs.
// Offset must start in the lowest 128-bit lane or at the start of an
// upper lane.
// FIXME: Is it ever worth allowing a negative base offset?
if (!((0 <= Offset && Offset < NumEltsPerLane) ||
(Offset % NumEltsPerLane) == 0))
return SDValue();
// If we are offsetting, all referenced entries must come from the same
// lane.
if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
return SDValue();
if ((M % NumElements) != (Offset + (i / Scale)))
return SDValue(); // Non-consecutive strided elements.
Matches++;
}
// If we fail to find an input, we have a zero-shuffle which should always
// have already been handled.
// FIXME: Maybe handle this here in case during blending we end up with one?
if (!InputV)
return SDValue();
// If we are offsetting, don't extend if we only match a single input, we
// can always do better by using a basic PSHUF or PUNPCK.
if (Offset != 0 && Matches < 2)
return SDValue();
return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
InputV, Mask, Subtarget, DAG);
};
// The widest scale possible for extending is to a 64-bit integer.
assert(Bits % 64 == 0 &&
"The number of bits in a vector must be divisible by 64 on x86!");
int NumExtElements = Bits / 64;
// Each iteration, try extending the elements half as much, but into twice as
// many elements.
// The first scale whose lowering yields a value is returned.
for (; NumExtElements < NumElements; NumExtElements *= 2) {
assert(NumElements % NumExtElements == 0 &&
"The input vector size must be divisible by the extended size.");
if (SDValue V = Lower(NumElements / NumExtElements))
return V;
}
// General extends failed, but 128-bit vectors may be able to use MOVQ.
if (Bits != 128)
return SDValue();
// Returns one of the source operands if the shuffle can be reduced to a
// MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
// This requires the whole upper half to be zeroable and the lower half to be
// an in-place (or undef) copy of the low half of V1 or of V2.
auto CanZExtLowHalf = [&]() {
for (int i = NumElements / 2; i != NumElements; ++i)
if (!Zeroable[i])
return SDValue();
if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
return V1;
if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
return V2;
return SDValue();
};
if (SDValue V = CanZExtLowHalf()) {
V = DAG.getBitcast(MVT::v2i64, V);
V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
return DAG.getBitcast(VT, V);
}
// No viable ext lowering found.
return SDValue();
}
/// Attempt to recover the scalar that provides element \p Idx of \p V.
///
/// Bitcasts are peeled off first; the scalar is then taken from a
/// BUILD_VECTOR operand, or from a SCALAR_TO_VECTOR operand when \p Idx is 0.
/// An empty SDValue is returned if no such scalar of matching size exists.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
                                              SelectionDAG &DAG) {
  const MVT VT = V.getSimpleValueType();
  const MVT EltVT = VT.getVectorElementType();

  SDValue Src = peekThroughBitcasts(V);

  // Once the bitcasts changed the element width there is no element in the
  // source that corresponds to the requested one.
  const MVT SrcVT = Src.getSimpleValueType();
  if (!SrcVT.isVector())
    return SDValue();
  if (SrcVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
    return SDValue();

  const unsigned Opc = Src.getOpcode();
  const bool IsBuildVector = Opc == ISD::BUILD_VECTOR;
  const bool IsLowScalarToVector = Idx == 0 && Opc == ISD::SCALAR_TO_VECTOR;
  if (!IsBuildVector && !IsLowScalarToVector)
    return SDValue();

  // Only accept a scalar operand that is exactly as wide as the destination
  // element.
  // FIXME: Add support for scalar truncation where possible.
  SDValue Scalar = Src.getOperand(Idx);
  if (EltVT.getSizeInBits() != Scalar.getSimpleValueType().getSizeInBits())
    return SDValue();

  return DAG.getBitcast(EltVT, Scalar);
}
/// Return true if \p V, looked at through any bitcasts, is a non-extending
/// load.
///
/// Shuffle lowering cares about this because the set of usable instructions
/// differs significantly depending on whether an operand is a load.
static bool isShuffleFoldableLoad(SDValue V) {
  return ISD::isNON_EXTLoad(peekThroughBitcasts(V).getNode());
}
/// Try to lower insertion of a single element into a zero vector.
///
/// This is a common pattern that we have especially efficient patterns to lower
/// across all subtarget feature sets.
///
/// The first mask entry that selects from \p V2 is taken as the inserted
/// element; every other lane is tested against \p Zeroable to decide whether
/// \p V1 can be treated as zero. An empty SDValue is returned whenever none of
/// the patterns handled below applies.
static SDValue lowerShuffleAsElementInsertion(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// ExtVT starts out equal to VT and is only changed (to a vector of i32) when
// an i8/i16 scalar gets zero-extended further down.
MVT ExtVT = VT;
MVT EltVT = VT.getVectorElementType();
// Position of the first mask entry that reads from V2 (value >= Mask.size()).
// NOTE(review): if no entry reads from V2 this equals Mask.size() and the
// Mask[V2Index] accesses below would be out of range - presumably callers
// guarantee a V2 element is present; confirm.
int V2Index =
find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
Mask.begin();
// V1 counts as zeroable when every lane other than V2Index is zeroable.
bool IsV1Zeroable = true;
for (int i = 0, Size = Mask.size(); i < Size; ++i)
if (i != V2Index && !Zeroable[i]) {
IsV1Zeroable = false;
break;
}
// Check for a single input from a SCALAR_TO_VECTOR node.
// FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
// all the smarts here sunk into that routine. However, the current
// lowering of BUILD_VECTOR makes that nearly impossible until the old
// vector shuffle lowering is dead.
SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
DAG);
if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
// We need to zext the scalar if it is smaller than an i32.
V2S = DAG.getBitcast(EltVT, V2S);
if (EltVT == MVT::i8 || EltVT == MVT::i16) {
// Using zext to expand a narrow element won't work for non-zero
// insertions.
if (!IsV1Zeroable)
return SDValue();
// Zero-extend directly to i32.
ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
}
// Rebuild V2 as a SCALAR_TO_VECTOR of the (possibly widened) scalar.
V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
} else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
EltVT == MVT::i16) {
// Either not inserting from the low element of the input or the input
// element size is too small to use VZEXT_MOVL to clear the high bits.
return SDValue();
}
if (!IsV1Zeroable) {
// If V1 can't be treated as a zero vector we have fewer options to lower
// this. We can't support integer vectors or non-zero targets cheaply, and
// the V1 elements can't be permuted in any way.
assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
if (!VT.isFloatingPoint() || V2Index != 0)
return SDValue();
// With the inserted lane marked undef, the remaining V1 lanes must already
// be in place.
SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
V1Mask[V2Index] = -1;
if (!isNoopShuffleMask(V1Mask))
return SDValue();
if (!VT.is128BitVector())
return SDValue();
// Otherwise, use MOVSD or MOVSS.
assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
"Only two types of floating point element types to handle!");
return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
ExtVT, V1, V2);
}
// This lowering only works for the low element with floating point vectors.
if (VT.isFloatingPoint() && V2Index != 0)
return SDValue();
// Emit VZEXT_MOVL on V2 in the (possibly widened) type, then cast back to VT
// if the type was widened.
V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
if (ExtVT != VT)
V2 = DAG.getBitcast(VT, V2);
if (V2Index != 0) {
// If we have 4 or fewer lanes we can cheaply shuffle the element into
// the desired position. Otherwise it is more efficient to do a vector
// shift left. We know that we can do a vector shift left because all
// the inputs are zero.
if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
// Every lane reads element 1 of V2 except lane V2Index, which reads
// element 0.
SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
V2Shuffle[V2Index] = 0;
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
} else {
// Shift amount is expressed in bytes: lane index times element bytes.
V2 = DAG.getBitcast(MVT::v16i8, V2);
V2 = DAG.getNode(
X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
V2 = DAG.getBitcast(VT, V2);
}
}
return V2;
}
/// Attempt to lower the broadcast of one integer element that is a truncated
/// piece of a wider element of \p V0, where \p V0 is a scalar_to_vector or
/// build_vector node.
///
/// AVX2 is required. An empty SDValue is returned when \p V0 does not have the
/// expected shape.
static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
                                            int BroadcastIdx,
                                            const X86Subtarget &Subtarget,
                                            SelectionDAG &DAG) {
  assert(Subtarget.hasAVX2() &&
         "We can only lower integer broadcasts with AVX2!");

  EVT EltVT = VT.getVectorElementType();
  EVT V0VT = V0.getValueType();

  assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
  assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");

  EVT V0EltVT = V0VT.getVectorElementType();
  if (!V0EltVT.isInteger())
    return SDValue();

  const unsigned EltSize = EltVT.getSizeInBits();
  const unsigned V0EltSize = V0EltVT.getSizeInBits();

  // Without a wider source element there is nothing to truncate.
  if (V0EltSize <= EltSize)
    return SDValue();

  assert(((V0EltSize % EltSize) == 0) &&
         "Scalar type sizes must all be powers of 2 on x86!");

  // Scale is the number of narrow elements per wide source element; the wide
  // element that holds the broadcast source follows from it.
  const unsigned Scale = V0EltSize / EltSize;
  const unsigned V0BroadcastIdx = BroadcastIdx / Scale;

  const unsigned V0Opc = V0.getOpcode();
  const bool IsLowScalarToVector =
      V0Opc == ISD::SCALAR_TO_VECTOR && V0BroadcastIdx == 0;
  const bool IsBuildVector = V0Opc == ISD::BUILD_VECTOR;
  if (!IsLowScalarToVector && !IsBuildVector)
    return SDValue();

  SDValue Scalar = V0.getOperand(V0BroadcastIdx);

  // When the wanted bits are not the least significant ones, shift them down
  // so a plain truncate picks them up. Ideally the trunc/srl/load folds into
  // the broadcast; even when it cannot (and !isShuffleFoldableLoad(Scalar)),
  // vpbroadcast+vmovd+shr is preferred over vpshufb(m)+vmovd.
  const int OffsetIdx = BroadcastIdx % Scale;
  if (OffsetIdx != 0) {
    SDValue ShiftAmt = DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8);
    Scalar =
        DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar, ShiftAmt);
  }

  SDValue Narrow = DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar);
  return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Narrow);
}
/// Return true if the 4-element \p Mask can be lowered with one SHUFPS.
///
/// Callers use this to turn off more specialized lowerings whenever the
/// shufps lowering is going to be efficient anyway.
static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
  // Only the 128-bit form of shufps is handled here.
  assert(Mask.size() == 4 && "Unsupported mask size!");
  assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
  assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
  assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
  assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");

  // A half is unusable when both of its entries are defined but come from
  // different inputs (indices below 4 versus 4 and above).
  auto HalfNeedsBothInputs = [](int A, int B) {
    return A >= 0 && B >= 0 && (A < 4) != (B < 4);
  };

  // A single SHUFPS needs the low half and the high half to each draw from
  // one input only.
  return !HalfNeedsBothInputs(Mask[0], Mask[1]) &&
         !HalfNeedsBothInputs(Mask[2], Mask[3]);
}
/// When \p N0 and \p N1 are the two 128-bit halves extracted from one 256-bit
/// vector, turn the shuffle of those halves into a single shuffle of the wide
/// vector (a 256-bit AVX2 vperm*) followed by an extract of the low half, so
/// that a multi-shuffle lowering is avoided.
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
                                             SDValue N1, ArrayRef<int> Mask,
                                             SelectionDAG &DAG) {
  EVT VT = N0.getValueType();
  assert((VT.is128BitVector() &&
          (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
         "VPERM* family of shuffles requires 32-bit or 64-bit elements");

  // Both operands have to be single-use extracts taken from the same vector.
  if (!N0.hasOneUse() || !N1.hasOneUse())
    return SDValue();
  if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
      N1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
    return SDValue();
  SDValue WideVec = N0.getOperand(0);
  if (WideVec != N1.getOperand(0))
    return SDValue();

  // The common source must be 256 bits wide and both extract indices must be
  // constants.
  EVT WideVT = WideVec.getValueType();
  if (!WideVT.is256BitVector())
    return SDValue();
  if (!isa<ConstantSDNode>(N0.getOperand(1)) ||
      !isa<ConstantSDNode>(N1.getOperand(1)))
    return SDValue();

  // The extracts must cover the low and the high half. If N1 is the one that
  // extracts the low half, commute the mask to compensate.
  unsigned NumElts = VT.getVectorNumElements();
  SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
  const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
  const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
  const bool HalvesSwapped = ExtIndex1 == 0 && ExtIndex0 == NumElts;
  const bool HalvesInOrder = ExtIndex0 == 0 && ExtIndex1 == NumElts;
  if (HalvesSwapped)
    ShuffleVectorSDNode::commuteMask(NewMask);
  else if (!HalvesInOrder)
    return SDValue();

  // Last bailout: a simple mask is better served by an extract plus a simple
  // narrow shuffle. extract+unpack(h/l)ps is preferred over vpermps because
  // it does not need a constant load from memory.
  if (NumElts == 4 &&
      (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask)))
    return SDValue();

  // Pad the mask with undef entries up to the wide element count.
  NewMask.append(NumElts, -1);

  // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
  SDValue WideUndef = DAG.getUNDEF(WideVT);
  SDValue WideShuffle =
      DAG.getVectorShuffle(WideVT, DL, WideVec, WideUndef, NewMask);

  // This is free: ymm -> xmm.
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WideShuffle,
                     DAG.getIntPtrConstant(0, DL));
}
/// Try to lower broadcast of a single element.
///
/// For convenience, this code also bundles all of the subtarget feature set
/// filtering. While a little annoying to re-dispatch on type here, there isn't
/// a convenient way to factor it out.
///
/// Returns an empty SDValue when the subtarget/type combination is not
/// handled, when \p Mask is not a broadcast of one element, or when the
/// broadcast source cannot be used with the selected opcode.
static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// Accepted combinations: v2f64 with SSE3, any floating point type with AVX,
// any integer type with AVX2.
if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
(Subtarget.hasAVX() && VT.isFloatingPoint()) ||
(Subtarget.hasAVX2() && VT.isInteger())))
return SDValue();
// With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
// we can only broadcast from a register with AVX2.
unsigned NumElts = Mask.size();
unsigned NumEltBits = VT.getScalarSizeInBits();
unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
? X86ISD::MOVDDUP
: X86ISD::VBROADCAST;
bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
// Check that the mask is a broadcast.
// Each index i is tried in turn as the splat source; the first index for
// which the mask is equivalent to an all-i mask wins.
int BroadcastIdx = -1;
for (int i = 0; i != (int)NumElts; ++i) {
SmallVector<int, 8> BroadcastMask(NumElts, i);
if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
BroadcastIdx = i;
break;
}
}
if (BroadcastIdx < 0)
return SDValue();
assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
"a sorted mask where the broadcast "
"comes from V1.");
// Go up the chain of (vector) values to find a scalar load that we can
// combine with the broadcast.
// BitOffset tracks the position of the broadcast element, in bits, relative
// to the start of the value V currently being looked at.
int BitOffset = BroadcastIdx * NumEltBits;
SDValue V = V1;
for (;;) {
switch (V.getOpcode()) {
case ISD::BITCAST: {
// Bitcasts do not move bits; just look at the operand.
V = V.getOperand(0);
continue;
}
case ISD::CONCAT_VECTORS: {
// Step into the operand that contains BitOffset and make the offset
// relative to that operand.
int OpBitWidth = V.getOperand(0).getValueSizeInBits();
int OpIdx = BitOffset / OpBitWidth;
V = V.getOperand(OpIdx);
BitOffset %= OpBitWidth;
continue;
}
case ISD::INSERT_SUBVECTOR: {
SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
// A non-constant insertion index ends the walk: this break leaves the
// switch and the break after the switch then leaves the loop.
if (!ConstantIdx)
break;
int EltBitWidth = VOuter.getScalarValueSizeInBits();
int Idx = (int)ConstantIdx->getZExtValue();
int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
int BeginOffset = Idx * EltBitWidth;
int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
// Follow the inserted subvector if the element lies inside it, otherwise
// keep following the vector that was inserted into.
if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
BitOffset -= BeginOffset;
V = VInner;
} else {
V = VOuter;
}
continue;
}
}
// Reached for any other opcode (and for the non-constant index case above).
break;
}
assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
// Re-express the broadcast index relative to the value the walk stopped at.
BroadcastIdx = BitOffset / NumEltBits;
// Do we need to bitcast the source to retrieve the original broadcast index?
bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
// Check if this is a broadcast of a scalar. We special case lowering
// for scalars so that we can more effectively fold with loads.
// If the original value has a larger element type than the shuffle, the
// broadcast element is in essence truncated. Make that explicit to ease
// folding.
if (BitCastSrc && VT.isInteger())
if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
DL, VT, V, BroadcastIdx, Subtarget, DAG))
return TruncBroadcast;
// BroadcastVT is the type the broadcast node is created with; it is switched
// to an f64 vector type in the 32-bit i64 cases below and the final result is
// bitcast back to VT.
MVT BroadcastVT = VT;
// Also check the simpler case, where we can directly reuse the scalar.
if (!BitCastSrc &&
((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
(V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
V = V.getOperand(BroadcastIdx);
// If we can't broadcast from a register, check that the input is a load.
if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
return SDValue();
} else if (MayFoldLoad(V) && !cast<LoadSDNode>(V)->isVolatile()) {
// 32-bit targets need to load i64 as a f64 and then bitcast the result.
if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2())
? X86ISD::MOVDDUP
: Opcode;
}
// If we are broadcasting a load that is only used by the shuffle
// then we can reduce the vector load to the broadcasted scalar load.
LoadSDNode *Ld = cast<LoadSDNode>(V);
SDValue BaseAddr = Ld->getOperand(1);
EVT SVT = BroadcastVT.getScalarType();
// Byte offset of the broadcast element from the base address of the load.
unsigned Offset = BroadcastIdx * SVT.getStoreSize();
assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
// Create the scalar load on the chain of the original load, with a memory
// operand derived from the original one at the new offset and size.
V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
DAG.getMachineFunction().getMachineMemOperand(
Ld->getMemOperand(), Offset, SVT.getStoreSize()));
DAG.makeEquivalentMemoryOrdering(Ld, V);
} else if (!BroadcastFromReg) {
// We can't broadcast from a vector register.
return SDValue();
} else if (BitOffset != 0) {
// We can only broadcast from the zero-element of a vector register,
// but it can be advantageous to broadcast from the zero-element of a
// subvector.
if (!VT.is256BitVector() && !VT.is512BitVector())
return SDValue();
// VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
if (VT == MVT::v4f64 || VT == MVT::v4i64)
return SDValue();
// Only broadcast the zero-element of a 128-bit subvector.
if ((BitOffset % 128) != 0)
return SDValue();
assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
"Unexpected bit-offset");
assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
"Unexpected vector size");
// ExtractIdx is counted in elements of V, not in bits.
unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
V = extract128BitVector(V, ExtractIdx, DAG, DL);
}
// MOVDDUP is created on a v2f64 operand here, so a scalar source is wrapped
// in a SCALAR_TO_VECTOR first.
if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
DAG.getBitcast(MVT::f64, V));
// Bitcast back to the same scalar type as BroadcastVT.
if (V.getValueType().getScalarType() != BroadcastVT.getScalarType()) {
assert(NumEltBits == BroadcastVT.getScalarSizeInBits() &&
"Unexpected vector element size");
MVT ExtVT;
if (V.getValueType().isVector()) {
// Keep the total width of V and only change the element type.
unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
ExtVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
} else {
ExtVT = BroadcastVT.getScalarType();
}
V = DAG.getBitcast(ExtVT, V);
}
// 32-bit targets need to load i64 as a f64 and then bitcast the result.
if (!Subtarget.is64Bit() && V.getValueType() == MVT::i64) {
V = DAG.getBitcast(MVT::f64, V);
unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
}
// We only support broadcasting from 128-bit vectors to minimize the
// number of patterns we need to deal with in isel. So extract down to
// 128-bits, removing as many bitcasts as possible.
if (V.getValueSizeInBits() > 128) {
MVT ExtVT = V.getSimpleValueType().getScalarType();
ExtVT = MVT::getVectorVT(ExtVT, 128 / ExtVT.getScalarSizeInBits());
V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
V = DAG.getBitcast(ExtVT, V);
}
// Build the broadcast in BroadcastVT and cast the result to the requested VT.
return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
}
// Decide whether the shuffle can be done with INSERTPS. INSERTPS is only used
// when the V1 elements are already where they belong, since otherwise two
// SHUFPS instructions always work and are much smaller to encode than a SHUFPS
// plus an INSERTPS. INSERTPS is also usable when exactly one V1 element is out
// of place and every V2 element is zeroable.
//
// On success V1, V2 and InsertPSMask are overwritten with the operands and
// the immediate to use; on failure they are left untouched.
static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
                                   unsigned &InsertPSMask,
                                   const APInt &Zeroable,
                                   ArrayRef<int> Mask, SelectionDAG &DAG) {
  assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
  assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  // Try to express the shuffle as one element of VA or VB inserted into VA
  // (or into undef). V1, V2 and InsertPSMask are only written on success.
  auto TryMatch = [&](SDValue VA, SDValue VB, ArrayRef<int> CandidateMask) {
    unsigned ZeroBits = 0;
    int OutOfPlaceVALane = -1;
    int InsertedVBLane = -1;
    bool KeepsVALane = false;

    for (int Lane = 0; Lane != 4; ++Lane) {
      // Zeroable lanes (undef lanes included) go into the zero mask.
      if (Zeroable[Lane]) {
        ZeroBits |= 1 << Lane;
        continue;
      }

      // Remember whether any VA element stays in its own lane.
      if (CandidateMask[Lane] == Lane) {
        KeepsVALane = true;
        continue;
      }

      // Only one non-zeroable element may be inserted.
      if (OutOfPlaceVALane >= 0 || InsertedVBLane >= 0)
        return false;

      // Indices below 4 select a (misplaced) VA element, the rest select VB.
      if (CandidateMask[Lane] < 4)
        OutOfPlaceVALane = Lane;
      else
        InsertedVBLane = Lane;
    }

    // Nothing (non-zeroable) to insert: not worth an INSERTPS.
    if (OutOfPlaceVALane < 0 && InsertedVBLane < 0)
      return false;

    // Work out source and destination lanes of the insertion. The source
    // lane counts from the start of the inserted vector, not from the start
    // of the concatenated pair.
    unsigned VBSrcIndex = 0;
    int VBDstIndex = InsertedVBLane;
    if (OutOfPlaceVALane >= 0) {
      // A misplaced VA element is inserted from VA itself; the original
      // second operand is then not used at all.
      VBSrcIndex = CandidateMask[OutOfPlaceVALane];
      VBDstIndex = OutOfPlaceVALane;
      VB = VA;
    } else {
      VBSrcIndex = CandidateMask[InsertedVBLane] - 4;
    }

    // If no VA element is kept in place the result only consists of zeroed
    // lanes and the inserted element, so the dependency on VA is dropped.
    if (!KeepsVALane)
      VA = DAG.getUNDEF(MVT::v4f32);

    // Publish the operands and the immediate.
    V1 = VA;
    V2 = VB;
    InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZeroBits;
    assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
    return true;
  };

  if (TryMatch(V1, V2, Mask))
    return true;

  // Second attempt with the operands (and the mask) commuted.
  SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
  ShuffleVectorSDNode::commuteMask(CommutedMask);
  return TryMatch(V2, V1, CommutedMask);
}
/// Lower a v4f32 shuffle to an INSERTPS node if matchShuffleAsInsertPS finds
/// a match; otherwise return an empty SDValue.
static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
                                      ArrayRef<int> Mask, const APInt &Zeroable,
                                      SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");

  // The matcher rewrites V1/V2 and fills in the immediate on success.
  unsigned InsertPSMask;
  const bool Matched =
      matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG);
  if (!Matched)
    return SDValue();

  // Emit the insertion of the selected V2 element.
  SDValue Imm = DAG.getConstant(InsertPSMask, DL, MVT::i8);
  return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, Imm);
}
/// Try to lower a shuffle as a permute of the inputs followed by an
/// UNPCK instruction.
///
/// This specifically targets cases where we end up with alternating between
/// the two inputs, and so can permute them into something that feeds a single
/// UNPCK instruction. Note that this routine only targets integer vectors
/// because for floating point vectors we have a generalized SHUFPS lowering
/// strategy that handles everything that doesn't *exactly* match an unpack,
/// making this clever lowering unnecessary.
///
/// Two strategies are tried in order: permuting each input and then unpacking
/// (for every unpack element width from 64 bits down to the element width of
/// \p VT), and - only when all inputs come from one half - unpacking first and
/// permuting the result. An empty SDValue is returned if neither applies.
static SDValue lowerShuffleAsPermuteAndUnpack(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(!VT.isFloatingPoint() &&
"This routine only supports integer vectors.");
assert(VT.is128BitVector() &&
"This routine only works on 128-bit vectors.");
assert(!V2.isUndef() &&
"This routine should only be used when blending two inputs.");
assert(Mask.size() >= 2 && "Single element masks are invalid.");
int Size = Mask.size();
// Count the defined mask entries that read from the low half and from the
// high half of their input (the input itself is ignored via M % Size).
int NumLoInputs =
count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
int NumHiInputs =
count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
// Use the low unpack unless the high half supplies more inputs.
bool UnpackLo = NumLoInputs >= NumHiInputs;
// Tries to lower as per-input permutes feeding an unpack of ScalarSize-bit
// elements, where each unpack element covers Scale elements of VT.
// Note that V1 and V2 are captured by reference and are overwritten once all
// early-exit checks have passed.
auto TryUnpack = [&](int ScalarSize, int Scale) {
SmallVector<int, 16> V1Mask((unsigned)Size, -1);
SmallVector<int, 16> V2Mask((unsigned)Size, -1);
for (int i = 0; i < Size; ++i) {
if (Mask[i] < 0)
continue;
// Each element of the unpack contains Scale elements from this mask.
int UnpackIdx = i / Scale;
// We only handle the case where V1 feeds the first slots of the unpack.
// We rely on canonicalization to ensure this is the case.
if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
return SDValue();
// Setup the mask for this input. The indexing is tricky as we have to
// handle the unpack stride.
// (UnpackIdx / 2) is the unpack element within the chosen input, i % Scale
// the position inside that element, and the Size / 2 term selects the high
// half when a high unpack is used.
SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
Mask[i] % Size;
}
// If we will have to shuffle both inputs to use the unpack, check whether
// we can just unpack first and shuffle the result. If so, skip this unpack.
if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
!isNoopShuffleMask(V2Mask))
return SDValue();
// Shuffle the inputs into place.
V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
// Cast the inputs to the type we will use to unpack them.
MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
V1 = DAG.getBitcast(UnpackVT, V1);
V2 = DAG.getBitcast(UnpackVT, V2);
// Unpack the inputs and cast the result back to the desired type.
return DAG.getBitcast(
VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
UnpackVT, V1, V2));
};
// We try each unpack from the largest to the smallest to try and find one
// that fits this mask.
int OrigScalarSize = VT.getScalarSizeInBits();
for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
return Unpack;
// If we're shuffling with a zero vector then we're better off not doing
// VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
ISD::isBuildVectorAllZeros(V2.getNode()))
return SDValue();
// If none of the unpack-rooted lowerings worked (or were profitable) try an
// initial unpack.
if (NumLoInputs == 0 || NumHiInputs == 0) {
assert((NumLoInputs > 0 || NumHiInputs > 0) &&
"We have to have *some* inputs!");
// Offset of the half that all inputs come from: Size / 2 when only the high
// half is used, 0 when only the low half is used.
int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
// FIXME: We could consider the total complexity of the permute of each
// possible unpacking. Or at the least we should consider how many
// half-crossings are created.
// FIXME: We could consider commuting the unpacks.
SmallVector<int, 32> PermMask((unsigned)Size, -1);
for (int i = 0; i < Size; ++i) {
if (Mask[i] < 0)
continue;
assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
// In the unpacked vector the elements of the two inputs alternate: element
// k of the used half sits at 2 * k for V1 and at 2 * k + 1 for V2.
PermMask[i] =
2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
}
return DAG.getVectorShuffle(
VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
DL, VT, V1, V2),
DAG.getUNDEF(VT), PermMask);
}
return SDValue();
}
/// Lower a shuffle of two-element vectors of 64-bit floats (v2f64).
///
/// This is the foundation for all 2-lane 64-bit shuffles because floating
/// point shuffles are fully supported while integer ones are not. Since these
/// instructions cost a domain crossing penalty on some chips, integer vectors
/// should avoid going through here where possible.
static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");

  if (V2.isUndef()) {
    // A splat of a single element is tried first.
    if (SDValue Splat = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2, Mask,
                                                Subtarget, DAG))
      return Splat;

    // Plain single-input shuffle: the one input is used for both "inputs" of
    // the instruction. Bit k of the immediate is set when lane k reads
    // element 1.
    unsigned SHUFPDMask = 0;
    if (Mask[0] == 1)
      SHUFPDMask |= 1u;
    if (Mask[1] == 1)
      SHUFPDMask |= 2u;

    // With AVX the immediate permute (VPERMILPI) is used, which allows a load
    // to be folded into the shuffle.
    if (Subtarget.hasAVX())
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
                         DAG.getConstant(SHUFPDMask, DL, MVT::i8));

    // Otherwise SHUFP with V1 on both sides; an undef lane gets an undef
    // operand instead.
    SDValue LoSrc =
        Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1;
    SDValue HiSrc =
        Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1;
    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, LoSrc, HiSrc,
                       DAG.getConstant(SHUFPDMask, DL, MVT::i8));
  }

  assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");

  if (Subtarget.hasAVX2()) {
    if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
      return Extract;
  }

  // A scalar that was loaded and is now shuffled into a vector can often be
  // inserted cheaply.
  if (SDValue Insertion = lowerShuffleAsElementInsertion(
          DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return Insertion;

  // The mask cannot be sorted reliably one way or the other, and inverting a
  // v2 mask is trivial, so the insertion is also tried with swapped inputs.
  auto FlipInput = [](int M) { return M < 0 ? -1 : (M ^ 2); };
  int InverseMask[2] = {FlipInput(Mask[0]), FlipInput(Mask[1])};
  if (SDValue Insertion = lowerShuffleAsElementInsertion(
          DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
    return Insertion;

  // Two common blend patterns have dedicated instruction patterns, used when
  // the zero-blend above did not apply.
  if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
      isShuffleEquivalent(V1, V2, Mask, {1, 3})) {
    if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG)) {
      // Either a special instruction loads over the low double or only the
      // low double is moved.
      return DAG.getNode(
          X86ISD::MOVSD, DL, MVT::v2f64, V2,
          DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
    }
  }

  if (Subtarget.hasSSE41()) {
    if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
                                            Zeroable, Subtarget, DAG))
      return Blend;
  }

  // Masks that match an unpack pattern get the dedicated unpack instruction.
  if (SDValue Unpack =
          lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
    return Unpack;

  // General two-input case: bit 0 picks element 1 of V1, bit 1 picks element
  // 1 of V2 (mask value 3).
  unsigned SHUFPDMask = 0;
  if (Mask[0] == 1)
    SHUFPDMask |= 1u;
  if (Mask[1] == 3)
    SHUFPDMask |= 2u;
  return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
                     DAG.getConstant(SHUFPDMask, DL, MVT::i8));
}
/// Handle lowering of 2-lane 64-bit integer shuffles.
///
/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
/// the integer unit to minimize domain crossing penalties. However, for blends
/// it falls back to the floating point shuffle operation with appropriate bit
/// casting.
static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
if (V2.isUndef()) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
// Straight shuffle of a single input vector. For everything from SSE2
// onward this has a single fast instruction with no scary immediates.
// We have to map the mask as it is actually a v4i32 shuffle instruction.
V1 = DAG.getBitcast(MVT::v4i32, V1);
int WidenedMask[4] = {
std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
return DAG.getBitcast(
MVT::v2i64,
DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
}
assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
assert(Mask[0] < 2 && "We sort V1 to be the first input.");
assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
if (Subtarget.hasAVX2())
if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
return Extract;
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// When loading a scalar and then shuffling it into a vector we can often do
// the insertion cheaply.
if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Insertion;
// Try inverting the insertion since for v2 masks it is easy to do and we
// can't reliably sort the mask one way or the other.
int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
if (SDValue Insertion = lowerShuffleAsElementInsertion(
DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
return Insertion;
// We have different paths for blend lowering, but they all must use the
// *exact* same predicate.
bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
return V;
// Try to use byte rotation instructions.
// It's more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget.hasSSSE3()) {
if (Subtarget.hasVLX())
if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v2i64, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
}
// If we have direct support for blends, we should lower by decomposing into
// a permute. That will be faster than the domain cross.
if (IsBlendSupported)
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, Mask,
Subtarget, DAG);
// We implement this with SHUFPD which is pretty lame because it will likely
// incur 2 cycles of stall for integer vectors on Nehalem and older chips.
// However, all the alternatives are still more cycles and newer chips don't
// have this problem. It would be really nice if x86 had better shuffles here.
V1 = DAG.getBitcast(MVT::v2f64, V1);
V2 = DAG.getBitcast(MVT::v2f64, V2);
return DAG.getBitcast(MVT::v2i64,
DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
}
/// Emit a 4-element shuffle of \p V1 and \p V2 built only from SHUFPS nodes.
///
/// This helper does not judge whether SHUFPS is the *best* lowering for
/// \p Mask; it simply produces one. Mask values >= 4 select from \p V2.
/// Depending on how the V2 elements are distributed, either a single
/// X86ISD::SHUFP node or a preparatory SHUFP followed by the final one is
/// created.
static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
                                      ArrayRef<int> Mask, SDValue V1,
                                      SDValue V2, SelectionDAG &DAG) {
  auto IsV2Elt = [](int M) { return M >= 4; };

  // Operands and immediate mask of the final SHUFP. They start out as the
  // plain (V1, V2, Mask) triple and are patched up below.
  SDValue LoSrc = V1;
  SDValue HiSrc = V2;
  int ShufMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};

  const int V2EltCount = count_if(Mask, IsV2Elt);
  if (V2EltCount == 1) {
    const int V2Pos = find_if(Mask, IsV2Elt) - Mask.begin();
    // Toggling the low bit yields the other position in the same half.
    const int NeighborPos = V2Pos ^ 1;

    if (Mask[NeighborPos] >= 0) {
      // The lone V2 element sits next to a V1 element. First build a vector
      // holding the V2 element in lane 0 and the V1 element in lane 2.
      const int BlendMask[4] = {Mask[V2Pos] - 4, 0, Mask[NeighborPos], 0};
      SDValue BlendImm = getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG);
      V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1, BlendImm);

      // The blended vector now supplies whichever half contained the V2
      // element; V1 supplies the other half.
      if (V2Pos < 2) {
        LoSrc = V2;
        HiSrc = V1;
      } else {
        HiSrc = V2;
      }
      ShufMask[NeighborPos] = 2; // The V1 element lives in V2[2].
      ShufMask[V2Pos] = 0;       // The V2 element lives in V2[0].
    } else {
      // A single V2 element whose neighbor is undef. This is only expected
      // in the high lanes since the shuffle is commuted otherwise, but the
      // low-lane case is handled by exchanging the operands.
      if (V2Pos < 2) {
        LoSrc = V2;
        HiSrc = V1;
      }
      ShufMask[V2Pos] -= 4;
    }
  } else if (V2EltCount == 2) {
    const bool LoHalfAvoidsV2 = Mask[0] < 4 && Mask[1] < 4;
    const bool HiHalfAvoidsV2 = Mask[2] < 4 && Mask[3] < 4;

    if (LoHalfAvoidsV2) {
      // Easy case: V1 feeds the low lanes and V2 feeds the high lanes.
      ShufMask[2] -= 4;
      ShufMask[3] -= 4;
    } else if (HiHalfAvoidsV2) {
      // Mirror image of the easy case. It is handled here because this
      // utility may be reached for a SHUFPS pattern that could not easily be
      // commuted into the canonical direction.
      ShufMask[0] -= 4;
      ShufMask[1] -= 4;
      LoSrc = V2;
      HiSrc = V1;
    } else {
      // Both halves mix V1 and V2. Gather the needed elements with one SHUFP
      // (lanes 0-1 hold the V1 elements, lanes 2-3 the V2 elements) and then
      // permute that single vector into place.
      const bool Lane0AvoidsV2 = Mask[0] < 4;
      const bool Lane2AvoidsV2 = Mask[2] < 4;
      const int BlendMask[4] = {Lane0AvoidsV2 ? Mask[0] : Mask[1],
                                Lane2AvoidsV2 ? Mask[2] : Mask[3],
                                (Lane0AvoidsV2 ? Mask[1] : Mask[0]) - 4,
                                (Lane2AvoidsV2 ? Mask[3] : Mask[2]) - 4};
      SDValue BlendImm = getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG);
      V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, BlendImm);

      // The final shuffle uses the gathered vector for both operands.
      LoSrc = V1;
      HiSrc = V1;
      ShufMask[0] = Lane0AvoidsV2 ? 0 : 2;
      ShufMask[1] = Lane0AvoidsV2 ? 2 : 0;
      ShufMask[2] = Lane2AvoidsV2 ? 1 : 3;
      ShufMask[3] = Lane2AvoidsV2 ? 3 : 1;
    }
  }

  SDValue FinalImm = getV4X86ShuffleImm8ForMask(ShufMask, DL, DAG);
  return DAG.getNode(X86ISD::SHUFP, DL, VT, LoSrc, HiSrc, FinalImm);
}
/// Lower a shuffle of two v4f32 vectors.
///
/// Every node created here belongs to the floating point unit so that no
/// domain crossing penalty is introduced; those instructions are sufficient
/// to implement any v4f32 shuffle. Strategies are tried in a fixed order and
/// the first one that succeeds is returned, with SHUFPS as the final fallback.
static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  const MVT VT = MVT::v4f32;
  const int V2EltCount = count_if(Mask, [](int M) { return M >= 4; });

  // ---- Unary shuffles: nothing is taken from V2. ----
  if (V2EltCount == 0) {
    // A splat of one element can be lowered as a broadcast.
    SDValue Splat =
        lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG);
    if (Splat)
      return Splat;

    // SSE3 offers even/odd element duplication.
    if (Subtarget.hasSSE3()) {
      if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
        return DAG.getNode(X86ISD::MOVSLDUP, DL, VT, V1);
      if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
        return DAG.getNode(X86ISD::MOVSHDUP, DL, VT, V1);
    }

    // With AVX, VPERMILPS is preferred because a load can be folded into it.
    if (Subtarget.hasAVX()) {
      SDValue PermImm = getV4X86ShuffleImm8ForMask(Mask, DL, DAG);
      return DAG.getNode(X86ISD::VPERMILPI, DL, VT, V1, PermImm);
    }

    // MOVLHPS/MOVHLPS can emulate these unary shuffles. This only applies to
    // SSE1, because otherwise such masks are widened to v2f64 and never reach
    // this function.
    if (!Subtarget.hasSSE2()) {
      if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}))
        return DAG.getNode(X86ISD::MOVLHPS, DL, VT, V1, V1);
      if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3}))
        return DAG.getNode(X86ISD::MOVHLPS, DL, VT, V1, V1);
    }

    // Fallback: SHUFPS with the same vector supplied as both operands.
    SDValue ShufImm = getV4X86ShuffleImm8ForMask(Mask, DL, DAG);
    return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V1, ShufImm);
  }

  // ---- Binary shuffles. ----
  if (Subtarget.hasAVX2()) {
    SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG);
    if (Extract)
      return Extract;
  }

  // Some single-element blends have special lowerings. More complex
  // single-element blends are handled by custom code further down when both
  // this and BLENDPS fail, so only take this path when the V2 element lands
  // in element 0 of the mask -- that is the fast case.
  if (V2EltCount == 1 && Mask[0] >= 4) {
    SDValue Inserted = lowerShuffleAsElementInsertion(
        DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG);
    if (Inserted)
      return Inserted;
  }

  if (Subtarget.hasSSE41()) {
    SDValue Blend =
        lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG);
    if (Blend)
      return Blend;

    // INSERTPS, when it can finish the shuffle efficiently.
    SDValue InsertPS = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG);
    if (InsertPS)
      return InsertPS;

    if (!isSingleSHUFPSMask(Mask)) {
      SDValue BlendPerm =
          lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG);
      if (BlendPerm)
        return BlendPerm;
    }
  }

  // Low/high half moves. As above these are restricted to SSE1 because the
  // masks are otherwise widened to v2f64 and never get here.
  if (!Subtarget.hasSSE2()) {
    if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
      return DAG.getNode(X86ISD::MOVLHPS, DL, VT, V1, V2);
    if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
      return DAG.getNode(X86ISD::MOVHLPS, DL, VT, V2, V1);
  }

  // Masks matching an unpack pattern get the dedicated instruction.
  SDValue Unpack = lowerShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG);
  if (Unpack)
    return Unpack;

  // Nothing else matched: use the generic SHUFPS strategy.
  return lowerShuffleWithSHUFPS(DL, VT, Mask, V1, V2, DAG);
}
/// Lower a shuffle of two v4i32 vectors.
///
/// Integer-domain shuffles are used where possible; blends fall back to the
/// floating point domain blend instructions. Strategies are tried in a fixed
/// order and the first one that succeeds is returned.
static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  const MVT VT = MVT::v4i32;

  // A zero/any extension is strictly faster than every alternative and often
  // lets a memory operand be folded into the shuffle, so try it first.
  SDValue Ext = lowerShuffleAsZeroOrAnyExtend(DL, VT, V1, V2, Mask, Zeroable,
                                              Subtarget, DAG);
  if (Ext)
    return Ext;

  const int V2EltCount = count_if(Mask, [](int M) { return M >= 4; });

  // ---- Unary shuffles: nothing is taken from V2. ----
  if (V2EltCount == 0) {
    // A splat of one element can be lowered as a broadcast.
    SDValue Splat =
        lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG);
    if (Splat)
      return Splat;

    // Plain single-input shuffle. From SSE2 onward this is one fast
    // instruction with no scary immediates. The pattern is canonicalized to
    // the UNPCK-compatible form, but PSHUFD is still emitted: a real UNPCK
    // would prevent folding a load into the instruction or making a copy.
    const int LoPairMask[] = {0, 0, 1, 1};
    const int HiPairMask[] = {2, 2, 3, 3};
    ArrayRef<int> UnaryMask = Mask;
    if (isShuffleEquivalent(V1, V2, Mask, LoPairMask))
      UnaryMask = LoPairMask;
    else if (isShuffleEquivalent(V1, V2, Mask, HiPairMask))
      UnaryMask = HiPairMask;

    SDValue ShufImm = getV4X86ShuffleImm8ForMask(UnaryMask, DL, DAG);
    return DAG.getNode(X86ISD::PSHUFD, DL, VT, V1, ShufImm);
  }

  // ---- Binary shuffles. ----
  if (Subtarget.hasAVX2()) {
    SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG);
    if (Extract)
      return Extract;
  }

  // Shift instructions.
  SDValue Shift =
      lowerShuffleAsShift(DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG);
  if (Shift)
    return Shift;

  // Some single-element blends have special lowerings.
  if (V2EltCount == 1) {
    SDValue Inserted = lowerShuffleAsElementInsertion(
        DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG);
    if (Inserted)
      return Inserted;
  }

  // Several blend paths exist below; all of them must be guarded by the
  // *exact* same predicate.
  const bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported) {
    SDValue Blend =
        lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG);
    if (Blend)
      return Blend;
  }

  SDValue BitMasked =
      lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG);
  if (BitMasked)
    return BitMasked;

  // Masks matching an unpack pattern get the dedicated instruction.
  SDValue Unpack = lowerShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG);
  if (Unpack)
    return Unpack;

  // Byte rotation instructions. It's more profitable for pre-SSSE3 to use
  // shuffles/unpacks, hence the feature check.
  if (Subtarget.hasSSSE3()) {
    if (Subtarget.hasVLX()) {
      SDValue ElemRotate =
          lowerShuffleAsRotate(DL, VT, V1, V2, Mask, Subtarget, DAG);
      if (ElemRotate)
        return ElemRotate;
    }

    SDValue ByteRotate =
        lowerShuffleAsByteRotate(DL, VT, V1, V2, Mask, Subtarget, DAG);
    if (ByteRotate)
      return ByteRotate;
  }

  // A single SHUFPS is assumed to beat any multi-instruction sequence, even
  // on CPUs with a domain penalty. If some CPU is harmed by the domain switch
  // it can be fixed in a later pass.
  if (!isSingleSHUFPSMask(Mask)) {
    // With direct blend support, decompose into a permute; that is faster
    // than the domain cross.
    if (IsBlendSupported)
      return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
                                                  Subtarget, DAG);

    // Otherwise try permuting the inputs into an unpack instruction.
    SDValue PermUnpack =
        lowerShuffleAsPermuteAndUnpack(DL, VT, V1, V2, Mask, Subtarget, DAG);
    if (PermUnpack)
      return PermUnpack;
  }

  // Fall back to SHUFPS since it can blend from two vectors. Because SHUFPS
  // is the eventual instruction, it is also used to build up the inputs; this
  // bypasses the domain shift penalties that using PSHUFD directly would
  // incur on Nehalem and older. For newer chips this isn't relevant.
  SDValue FloatV1 = DAG.getBitcast(MVT::v4f32, V1);
  SDValue FloatV2 = DAG.getBitcast(MVT::v4f32, V2);
  SDValue FloatShuffle =
      DAG.getVectorShuffle(MVT::v4f32, DL, FloatV1, FloatV2, Mask);
  return DAG.getBitcast(VT, FloatShuffle);
}
/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
/// shuffle lowering, and the most complex part.
///
/// The lowering strategy is to try to form pairs of input lanes which are
/// targeted at the same half of the final vector, and then use a dword shuffle
/// to place them onto the right half, and finally unpack the paired lanes into
/// their final position.
///
/// The exact breakdown of how to form these dword pairs and align them on the
/// correct sides is really tricky. See the comments within the function for
/// more of the details.
///
/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
/// vector, form the analogous 128-bit 8-element Mask.
///
/// Parameters:
///  - \p VT must have i16 elements (asserted); the dword shuffles operate on
///    an i32 vector type with half as many elements.
///  - \p V is the single input vector. It is rewritten step by step and the
///    final value is returned.
///  - \p Mask must have exactly 8 entries (asserted). Negative entries are
///    treated as undef. The mask is *modified in place* while the lowering
///    proceeds, so callers must not rely on its contents afterwards.
///  - \p Subtarget is not inspected here; it is only forwarded when this
///    routine recurses after rebalancing a 3:1 input split.
static SDValue lowerV8I16GeneralSingleInputShuffle(
const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
// Vector type used for PSHUFD: same total width, i32 elements.
MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
// Views of the low and high four mask entries. Writes through these views
// are writes to Mask itself.
MutableArrayRef<int> LoMask = Mask.slice(0, 4);
MutableArrayRef<int> HiMask = Mask.slice(4, 4);
// Attempt to directly match PSHUFLW or PSHUFHW.
if (isUndefOrInRange(LoMask, 0, 4) &&
isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
}
if (isUndefOrInRange(HiMask, 4, 8) &&
isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
// Rebase the defined high-half indices from [4, 8) to [0, 4) before
// encoding the immediate; undef entries are left untouched.
for (int i = 0; i != 4; ++i)
HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
}
// Build the sorted, de-duplicated list of source words referenced by the low
// half of the mask (undef entries are dropped) ...
SmallVector<int, 4> LoInputs;
copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
array_pod_sort(LoInputs.begin(), LoInputs.end());
LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
// ... and likewise for the high half of the mask.
SmallVector<int, 4> HiInputs;
copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
array_pod_sort(HiInputs.begin(), HiInputs.end());
HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
// Since the lists are sorted, the position of the first value >= 4 splits
// each list into inputs coming from the low source half (words 0-3) and
// inputs coming from the high source half (words 4-7). "XToY" counts the
// inputs that originate in half X and are consumed by half Y.
int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
int NumHToL = LoInputs.size() - NumLToL;
int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
int NumHToH = HiInputs.size() - NumLToH;
// Mutable views over the four partitions; they alias LoInputs/HiInputs.
MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
// If we are shuffling values from one half - check how many different DWORD
// pairs we need to create. If only 1 or 2 then we can perform this as a
// PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
// Helper: apply the given half-word shuffle (ShufWOp is PSHUFLW or PSHUFHW)
// and then a PSHUFD on the bitcast vector, returning the result as VT.
auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
V = DAG.getNode(ShufWOp, DL, VT, V,
getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
V = DAG.getBitcast(PSHUFDVT, V);
V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
return DAG.getBitcast(VT, V);
};
// All inputs come from a single source half (either no high-half inputs or
// no low-half inputs).
if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
int PSHUFDMask[4] = { -1, -1, -1, -1 };
SmallVector<std::pair<int, int>, 4> DWordPairs;
// Dword index of the first dword of the source half in use: 0 when only the
// low half is referenced, 2 when only the high half is referenced.
int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
// Collect the different DWORD pairs.
for (int DWord = 0; DWord != 4; ++DWord) {
int M0 = Mask[2 * DWord + 0];
int M1 = Mask[2 * DWord + 1];
// Reduce defined indices to their position within the source half.
M0 = (M0 >= 0 ? M0 % 4 : M0);
M1 = (M1 >= 0 ? M1 % 4 : M1);
// A fully undef result dword needs no source pair.
if (M0 < 0 && M1 < 0)
continue;
// Try to reuse an already collected pair that is compatible with (M0, M1),
// filling in any of its undef slots.
bool Match = false;
for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
auto &DWordPair = DWordPairs[j];
if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
(M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
PSHUFDMask[DWord] = DOffset + j;
Match = true;
break;
}
}
// No compatible pair yet: record a new one.
if (!Match) {
PSHUFDMask[DWord] = DOffset + DWordPairs.size();
DWordPairs.push_back(std::make_pair(M0, M1));
}
}
// At most two distinct pairs fit into one half-word shuffle; pad with undef
// pairs and emit the two-instruction sequence.
if (DWordPairs.size() <= 2) {
DWordPairs.resize(2, std::make_pair(-1, -1));
int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
DWordPairs[1].first, DWordPairs[1].second};
if ((NumHToL + NumHToH) == 0)
return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
if ((NumLToL + NumLToH) == 0)
return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
}
}
// Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
// such inputs we can swap two of the dwords across the half mark and end up
// with <=2 inputs to each half in each half. Once there, we can fall through
// to the generic code below. For example:
//
// Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
// Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
//
// However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
// and an existing 2-into-2 on the other half. In this case we may have to
// pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
// 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
// Fortunately, we don't have to handle anything but a 2-into-2 pattern
// because any other situation (including a 3-into-1 or 1-into-3 in the other
// half than the one we target for fixing) will be fixed when we re-enter this
// path. We will also combine away any sequence of PSHUFD instructions that
// result into a single instruction. Here is an example of the tricky case:
//
// Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
// Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
//
// This now has a 1-into-3 in the high half! Instead, we do two shuffles:
//
// Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
// Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
//
// Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
// Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
//
// The result is fine to be handled by the generic logic.
//
// balanceSides is written in terms of an "A" half (the half whose inputs are
// split 3:1) and a "B" half (the other one); AOffset/BOffset are the word
// offsets (0 or 4) of those halves. It emits the fixing shuffles, rewrites
// Mask to match, and recurses into this routine.
auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
int AOffset, int BOffset) {
assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
"Must call this with A having 3 or 1 inputs from the A half.");
assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
"Must call this with B having 1 or 3 inputs from the B half.");
assert(AToAInputs.size() + BToAInputs.size() == 4 &&
"Must call this with either 3:1 or 1:3 inputs (summing to 4).");
bool ThreeAInputs = AToAInputs.size() == 3;
// Compute the index of dword with only one word among the three inputs in
// a half by taking the sum of the half with three inputs and subtracting
// the sum of the actual three inputs. The difference is the remaining
// slot.
int ADWord = 0, BDWord = 0;
// References so that the "triple" and "single" dwords can be assigned
// without caring which of A or B holds the three inputs.
int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
// Sum of the four word indices of the triple half: (0+1+2+3) + 4 * offset.
int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
int TripleNonInputIdx =
TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
TripleDWord = TripleNonInputIdx / 2;
// We use xor with one to compute the adjacent DWord to whichever one the
// OneInput is in.
OneInputDWord = (OneInput / 2) ^ 1;
// Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
// and BToA inputs. If there is also such a problem with the BToB and AToB
// inputs, we don't try to fix it necessarily -- we'll recurse and see it in
// the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
// is essential that we don't *create* a 3<-1 as then we might oscillate.
if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
// Compute how many inputs will be flipped by swapping these DWords. We
// need to balance this to ensure we don't form a 3-1 shuffle in the other
// half.
int NumFlippedAToBInputs =
std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
int NumFlippedBToBInputs =
std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
// Only an uneven flip (exactly one input on one side, zero or two on the
// other) changes the 2:2 balance of the B half and needs a pre-shuffle.
if ((NumFlippedAToBInputs == 1 &&
(NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
(NumFlippedBToBInputs == 1 &&
(NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
// We choose whether to fix the A half or B half based on whether that
// half has zero flipped inputs. At zero, we may not be able to fix it
// with that half. We also bias towards fixing the B half because that
// will more commonly be the high half, and we have to bias one way.
// Helper: swap two words within one half (via PSHUFLW/PSHUFHW) so that the
// number of flipped inputs changes, and update Mask accordingly.
auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
ArrayRef<int> Inputs) {
int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
// Determine whether the free index is in the flipped dword or the
// unflipped dword based on where the pinned index is. We use this bit
// in an xor to conditionally select the adjacent dword.
int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
// If the candidate has the same input-ness as FixIdx, swapping them would
// change nothing; use the other word of that dword instead.
if (IsFixIdxInput == IsFixFreeIdxInput)
FixFreeIdx += 1;
IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
assert(IsFixIdxInput != IsFixFreeIdxInput &&
"We need to be changing the number of flipped inputs!");
// Identity half-word mask with the two chosen words exchanged.
int PSHUFHalfMask[] = {0, 1, 2, 3};
std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
V = DAG.getNode(
FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
// Mirror the word swap in the shuffle mask.
for (int &M : Mask)
if (M >= 0 && M == FixIdx)
M = FixFreeIdx;
else if (M >= 0 && M == FixFreeIdx)
M = FixIdx;
};
if (NumFlippedBToBInputs != 0) {
int BPinnedIdx =
BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
} else {
assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
}
}
}
// Exchange dwords ADWord and BDWord with a single PSHUFD.
int PSHUFDMask[] = {0, 1, 2, 3};
PSHUFDMask[ADWord] = BDWord;
PSHUFDMask[BDWord] = ADWord;
V = DAG.getBitcast(
VT,
DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
// Adjust the mask to match the new locations of A and B.
for (int &M : Mask)
if (M >= 0 && M/2 == ADWord)
M = 2 * BDWord + M % 2;
else if (M >= 0 && M/2 == BDWord)
M = 2 * ADWord + M % 2;
// Recurse back into this routine to re-compute state now that this isn't
// a 3 and 1 problem.
return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
};
// Low half has a 3:1 split -> A is the low half; otherwise check the high
// half with the roles of the halves exchanged.
if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
// At this point there are at most two inputs to the low and high halves from
// each half. That means the inputs can always be grouped into dwords and
// those dwords can then be moved to the correct half with a dword shuffle.
// We use at most one low and one high word shuffle to collect these paired
// inputs into dwords, and finally a dword shuffle to place them.
int PSHUFLMask[4] = {-1, -1, -1, -1};
int PSHUFHMask[4] = {-1, -1, -1, -1};
int PSHUFDMask[4] = {-1, -1, -1, -1};
// First fix the masks for all the inputs that are staying in their
// original halves. This will then dictate the targets of the cross-half
// shuffles.
// SourceHalfMask is the PSHUFLW/PSHUFHW mask of the half being processed,
// HalfMask is the matching slice of Mask, and HalfOffset is 0 or 4.
auto fixInPlaceInputs =
[&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
MutableArrayRef<int> SourceHalfMask,
MutableArrayRef<int> HalfMask, int HalfOffset) {
if (InPlaceInputs.empty())
return;
// A single in-place input is simply pinned where it already is.
if (InPlaceInputs.size() == 1) {
SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
InPlaceInputs[0] - HalfOffset;
PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
return;
}
if (IncomingInputs.empty()) {
// Just fix all of the in place inputs.
for (int Input : InPlaceInputs) {
SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
PSHUFDMask[Input / 2] = Input / 2;
}
return;
}
assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
InPlaceInputs[0] - HalfOffset;
// Put the second input next to the first so that they are packed into
// a dword. We find the adjacent index by toggling the low bit.
int AdjIndex = InPlaceInputs[0] ^ 1;
SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
};
fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
// Now gather the cross-half inputs and place them into a free dword of
// their target half.
// FIXME: This operation could almost certainly be simplified dramatically to
// look more like the 3-1 fixing operation.
// IncomingInputs are the inputs crossing into the destination half,
// ExistingInputs the ones already there. SourceHalfMask is the half-word
// shuffle mask of the half the inputs come from, HalfMask the destination
// slice of Mask, and FinalSourceHalfMask the source half's slice of Mask.
auto moveInputsToRightHalf = [&PSHUFDMask](
MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
int DestOffset) {
// A word is "clobbered" when the source half shuffle already places a
// different word into that slot.
auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
};
// A dword is clobbered when either of its two words is.
auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
int Word) {
int LowWord = Word & ~1;
int HighWord = Word | 1;
return isWordClobbered(SourceHalfMask, LowWord) ||
isWordClobbered(SourceHalfMask, HighWord);
};
if (IncomingInputs.empty())
return;
if (ExistingInputs.empty()) {
// Map any dwords with inputs from them into the right half.
for (int Input : IncomingInputs) {
// If the source half mask maps over the inputs, turn those into
// swaps and use the swapped lane.
if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
Input - SourceOffset;
// We have to swap the uses in our half mask in one sweep.
for (int &M : HalfMask)
if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
M = Input;
else if (M == Input)
M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
} else {
assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
Input - SourceOffset &&
"Previous placement doesn't match!");
}
// Note that this correctly re-maps both when we do a swap and when
// we observe the other side of the swap above. We rely on that to
// avoid swapping the members of the input list directly.
Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
}
// Map the input's dword into the correct half.
if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
else
assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
Input / 2 &&
"Previous placement doesn't match!");
}
// And just directly shift any other-half mask elements to be same-half
// as we will have mirrored the dword containing the element into the
// same position within that half.
for (int &M : HalfMask)
if (M >= SourceOffset && M < SourceOffset + 4) {
M = M - SourceOffset + DestOffset;
assert(M >= 0 && "This should never wrap below zero!");
}
return;
}
// Ensure we have the input in a viable dword of its current half. This
// is particularly tricky because the original position may be clobbered
// by inputs being moved and *staying* in that half.
if (IncomingInputs.size() == 1) {
if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
// Relocate the input to the first still-unassigned slot of the source
// half shuffle.
int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
SourceOffset;
SourceHalfMask[InputFixed - SourceOffset] =
IncomingInputs[0] - SourceOffset;
std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
InputFixed);
IncomingInputs[0] = InputFixed;
}
} else if (IncomingInputs.size() == 2) {
if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
// We have two non-adjacent or clobbered inputs we need to extract from
// the source half. To do this, we need to map them into some adjacent
// dword slot in the source mask.
int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
IncomingInputs[1] - SourceOffset};
// If there is a free slot in the source half mask adjacent to one of
// the inputs, place the other input in it. We use (Index XOR 1) to
// compute an adjacent index.
if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
InputsFixed[1] = InputsFixed[0] ^ 1;
} else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
InputsFixed[0] = InputsFixed[1] ^ 1;
} else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
// The two inputs are in the same DWord but it is clobbered and the
// adjacent DWord isn't used at all. Move both inputs to the free
// slot.
SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
// NOTE(review): the next line reads InputsFixed[0] *after* it was
// overwritten above, so it evaluates to (2 * original dword + 1) rather
// than the odd word of the free dword. Below, InputsFixed[1] appears to be
// used only for self-consistent equality matching and for its parity
// (Input % 2), so the emitted shuffle looks unaffected -- confirm before
// "fixing" this.
InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
} else {
// The only way we hit this point is if there is no clobbering
// (because there are no off-half inputs to this half) and there is no
// free slot adjacent to one of the inputs. In this case, we have to
// swap an input with a non-input.
for (int i = 0; i < 4; ++i)
assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
"We can't handle any clobbers here!");
assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
"Cannot have adjacent inputs here!");
SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
// We also have to update the final source mask in this case because
// it may need to undo the above swap.
for (int &M : FinalSourceHalfMask)
if (M == (InputsFixed[0] ^ 1) + SourceOffset)
M = InputsFixed[1] + SourceOffset;
else if (M == InputsFixed[1] + SourceOffset)
M = (InputsFixed[0] ^ 1) + SourceOffset;
InputsFixed[1] = InputsFixed[0] ^ 1;
}
// Point everything at the fixed inputs.
for (int &M : HalfMask)
if (M == IncomingInputs[0])
M = InputsFixed[0] + SourceOffset;
else if (M == IncomingInputs[1])
M = InputsFixed[1] + SourceOffset;
IncomingInputs[0] = InputsFixed[0] + SourceOffset;
IncomingInputs[1] = InputsFixed[1] + SourceOffset;
}
} else {
llvm_unreachable("Unhandled input size!");
}
// Now hoist the DWord down to the right half.
// Use the first dword of the destination half if it is still unassigned,
// otherwise the second one.
int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
// Retarget the destination mask at the hoisted dword, keeping each input's
// position (even/odd word) within the dword.
for (int &M : HalfMask)
for (int Input : IncomingInputs)
if (M == Input)
M = FreeDWord * 2 + Input % 2;
};
moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
/*SourceOffset*/ 4, /*DestOffset*/ 0);
moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
/*SourceOffset*/ 0, /*DestOffset*/ 4);
// Now enact all the shuffles we've computed to move the inputs into their
// target half.
if (!isNoopShuffleMask(PSHUFLMask))
V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
if (!isNoopShuffleMask(PSHUFHMask))
V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
if (!isNoopShuffleMask(PSHUFDMask))
V = DAG.getBitcast(
VT,
DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
// At this point, each half should contain all its inputs, and we can then
// just shuffle them into their final position.
assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
"Failed to lift all the high half inputs to the low mask!");
assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
"Failed to lift all the low half inputs to the high mask!");
// Do a half shuffle for the low mask.
if (!isNoopShuffleMask(LoMask))
V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
// Do a half shuffle with the high mask after shifting its values down.
for (int &M : HiMask)
if (M >= 0)
M -= 4;
if (!isNoopShuffleMask(HiMask))
V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
return V;
}
/// Lower a shuffle as one PSHUFB per contributing input, ORing the two
/// byte-shuffled inputs together only when both of them supply data.
///
/// The mask must not cross 128-bit lanes. On return, \p V1InUse and
/// \p V2InUse report whether at least one generated control byte selects from
/// V1 / V2 respectively, i.e. is something other than the zeroing selector.
/// The result is bitcast back to \p VT.
static SDValue lowerShuffleAsBlendOfPSHUFBs(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
  assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
         "Lane crossing shuffle masks not supported");

  // Control byte emitted for every position that must not read from an input.
  const int ZeroMask = 0x80;

  int NumBytes = VT.getSizeInBits() / 8;
  int Size = Mask.size();
  // Number of bytes covered by each element of the incoming mask.
  int Scale = NumBytes / Size;

  // Per-byte control vectors; positions belonging to undef mask elements are
  // never overwritten and therefore stay undef.
  SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
  SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));

  V1InUse = false;
  V2InUse = false;

  for (int Byte = 0; Byte != NumBytes; ++Byte) {
    int Elt = Byte / Scale;
    int M = Mask[Elt];
    if (M < 0)
      continue;

    // Start with both inputs zeroed at this byte, then let exactly one of
    // them select a source byte unless the whole element is known zeroable.
    int V1Idx = ZeroMask;
    int V2Idx = ZeroMask;
    if (!Zeroable[Elt]) {
      int ByteInElt = Byte % Scale;
      if (M < Size)
        V1Idx = M * Scale + ByteInElt;
      else
        V2Idx = (M - Size) * Scale + ByteInElt;
    }

    V1Mask[Byte] = DAG.getConstant(V1Idx, DL, MVT::i8);
    V2Mask[Byte] = DAG.getConstant(V2Idx, DL, MVT::i8);
    if (V1Idx != ZeroMask)
      V1InUse = true;
    if (V2Idx != ZeroMask)
      V2InUse = true;
  }

  // Only emit a PSHUFB for an input that actually contributes bytes.
  MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
  if (V1InUse)
    V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
                     DAG.getBuildVector(ShufVT, DL, V1Mask));
  if (V2InUse)
    V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
                     DAG.getBuildVector(ShufVT, DL, V2Mask));

  // With both inputs live the unused bytes of each were zeroed above, so an
  // OR merges them; otherwise forward whichever single value applies.
  SDValue Result = V1InUse ? V1 : V2;
  if (V1InUse && V2InUse)
    Result = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);

  // Cast the result back to the requested type.
  return DAG.getBitcast(VT, Result);
}
/// Lower a shuffle of two v8i16 vectors.
///
/// A zero/any-extend is tried first. Shuffles that read nothing from V2 then
/// go through the single-input strategies (broadcast, shift, UNPCK, PACK,
/// byte rotate) and finally the dedicated general single-input routine.
///
/// Two-input shuffles are tried, in order, as: shift, SSE4A
/// extract/insert, single-element insertion, blend (SSE4.1 only), bit mask,
/// UNPCK, PACK, byte rotate, bit blend, byte-shift mask, and
/// permute-then-unpack. If none applies, a PSHUFB pair is used when SSSE3 is
/// available without SSE4.1; otherwise the shuffle is decomposed into
/// single-input permutes plus a blend.
static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  const MVT VT = MVT::v8i16;
  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
  assert(V2.getSimpleValueType() == VT && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  // A zext, whenever it matches, is strictly faster than any alternative.
  if (SDValue V = lowerShuffleAsZeroOrAnyExtend(DL, VT, V1, V2, Mask, Zeroable,
                                                Subtarget, DAG))
    return V;

  int NumV2Inputs = count_if(Mask, [](int Elt) { return Elt >= 8; });

  if (NumV2Inputs == 0) {
    // Splat of a single element.
    if (SDValue V =
            lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
      return V;

    // Whole-vector shift (V1 is passed for both operands on purpose).
    if (SDValue V = lowerShuffleAsShift(DL, VT, V1, V1, Mask, Zeroable,
                                        Subtarget, DAG))
      return V;

    // Masks that are exactly an unpack pattern.
    if (SDValue V = lowerShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
      return V;

    // Masks that are exactly a pack pattern.
    if (SDValue V =
            lowerShuffleWithPACK(DL, VT, Mask, V1, V2, DAG, Subtarget))
      return V;

    // Byte rotation of V1 with itself.
    if (SDValue V =
            lowerShuffleAsByteRotate(DL, VT, V1, V1, Mask, Subtarget, DAG))
      return V;

    // The general routine edits its mask in place, so hand it a private copy.
    SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
    return lowerV8I16GeneralSingleInputShuffle(DL, VT, V1, MutableMask,
                                               Subtarget, DAG);
  }

  assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
         "All single-input shuffles should be canonicalized to be V1-input "
         "shuffles.");

  // Shift instructions.
  if (SDValue V =
          lowerShuffleAsShift(DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return V;

  // SSE4A extraction / insertion.
  if (Subtarget.hasSSE4A()) {
    if (SDValue V = lowerShuffleWithSSE4A(DL, VT, V1, V2, Mask, Zeroable, DAG))
      return V;
  }

  // Some single-element blends have special lowerings.
  if (NumV2Inputs == 1) {
    if (SDValue V = lowerShuffleAsElementInsertion(DL, VT, V1, V2, Mask,
                                                   Zeroable, Subtarget, DAG))
      return V;
  }

  // Several paths below depend on blend availability; they must all consult
  // this *exact* same predicate.
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported) {
    if (SDValue V = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
                                        Subtarget, DAG))
      return V;
  }

  if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
                                        Subtarget, DAG))
    return V;

  // Masks that are exactly an unpack pattern.
  if (SDValue V = lowerShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
    return V;

  // Masks that are exactly a pack pattern.
  if (SDValue V = lowerShuffleWithPACK(DL, VT, Mask, V1, V2, DAG, Subtarget))
    return V;

  // Byte rotation across the two inputs.
  if (SDValue V =
          lowerShuffleAsByteRotate(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return V;

  if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
    return V;

  // Byte shift instructions used as a mask.
  if (SDValue V = lowerVectorShuffleAsByteShiftMask(DL, VT, V1, V2, Mask,
                                                    Zeroable, Subtarget, DAG))
    return V;

  // Permute the inputs so that a single unpack finishes the job.
  if (SDValue V =
          lowerShuffleAsPermuteAndUnpack(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return V;

  // Without a direct blend, PSHUFB is preferable because it can both shuffle
  // and set up the (otherwise inefficient) blend.
  if (!IsBlendSupported && Subtarget.hasSSSE3()) {
    bool V1InUse, V2InUse;
    return lowerShuffleAsBlendOfPSHUFBs(DL, VT, V1, V2, Mask, Zeroable, DAG,
                                        V1InUse, V2InUse);
  }

  // A bit-blend is always possible, so the final fallback decomposes the
  // shuffle into single-input permutes followed by a blend.
  return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, Subtarget,
                                              DAG);
}
/// Decide whether a shuffle is a "compaction" that keeps every 2^N-th element,
/// and if so report N (the number of times even elements must be dropped).
///
/// Lane i of a matching mask holds (i * 2^N) modulo the number of addressable
/// input elements. Example masks:
///
///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
///
/// Any lane may be undef (negative); undef lanes match every candidate.
///
/// Only N <= 3 is supported.
/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
/// for larger N.
///
/// \param Mask          the shuffle mask; its size must make the modulus a
///                      power of two.
/// \param IsSingleInput when true, indices wrap at Mask.size(); otherwise at
///                      twice that.
/// \returns the smallest viable N in [1, 3], or zero when none fits.
static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
                                          bool IsSingleInput) {
  // Indices are compared modulo the total number of addressable elements.
  int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
  assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
         "We should only be called with masks with a power-of-2 size!");
  const uint64_t ModMask = (uint64_t)ShuffleModulus - 1;

  // Candidates N = 1, 2, 3 are tracked at once because a partially undef mask
  // can remain consistent with more than one of them.
  bool ViableForN[3] = {true, true, true};
  const unsigned NumCandidates = array_lengthof(ViableForN);

  for (int Idx = 0, End = Mask.size(); Idx < End; ++Idx) {
    const int M = Mask[Idx];
    // Undef lanes are optimistically assumed to fit the pattern.
    if (M < 0)
      continue;

    bool AnyViable = false;
    for (unsigned Cand = 0; Cand != NumCandidates; ++Cand) {
      if (!ViableForN[Cand])
        continue;

      // This lane must equal (Idx * 2^N) % modulus for the candidate.
      const uint64_t N = Cand + 1;
      const uint64_t Expected = ((uint64_t)Idx << N) & ModMask;
      if ((uint64_t)M == Expected)
        AnyViable = true;
      else
        ViableForN[Cand] = false;
    }

    // Every candidate has now been ruled out, so no later lane can help.
    if (!AnyViable)
      return 0;
  }

  // Report the first (smallest) surviving candidate.
  for (unsigned Cand = 0; Cand != NumCandidates; ++Cand)
    if (ViableForN[Cand])
      return Cand + 1;

  // No viable power of two.
  return 0;
}
/// Lower a shuffle to a variable permute driven by a constant index vector.
///
/// The index vector is built from \p Mask using integer lanes with the same
/// width and count as \p VT. When \p V2 is undef a single-source VPERMV node
/// is produced; otherwise a two-source VPERMV3 node is produced.
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
                                     ArrayRef<int> Mask, SDValue V1,
                                     SDValue V2, SelectionDAG &DAG) {
  MVT MaskVecVT = MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits()),
                                   VT.getVectorNumElements());
  // NOTE(review): the trailing 'true' is presumably getConstVector's
  // "is a shuffle mask" flag -- confirm against its declaration.
  SDValue Indices = getConstVector(Mask, MaskVecVT, DAG, DL, true);

  // Note the operand order differs between the two node kinds.
  if (!V2.isUndef())
    return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, Indices, V2);
  return DAG.getNode(X86ISD::VPERMV, DL, VT, Indices, V1);
}
/// Generic lowering of v16i8 shuffles.
///
/// This is a hybrid strategy to lower v16i8 vectors. It first attempts a
/// series of pattern-specific lowerings (shift, byte rotate, PACK, zext,
/// SSE4A, and for single-input shuffles broadcast / UNPCK / widening via byte
/// duplication). With SSSE3 it then uses PSHUFB-based lowering. Without it, it
/// tries element insertion, bit blend and a PACKUS-based compaction, and
/// finally either decomposes a two-input shuffle or, for a single input, uses
/// UNPCK to spread the i8 elements across two i16-element vectors, shuffles
/// those as v8i16, and PACKs them back together.
///
/// \p Mask must have 16 entries; \p V1 and \p V2 must both be v16i8.
static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
// Use dedicated pack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
Subtarget))
return V;
// Try to use a zext lowering.
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return ZExt;
// See if we can use SSE4A Extraction / Insertion.
if (Subtarget.hasSSE4A())
if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, DAG))
return V;
// Count the mask entries that read from V2 (indices 16 and above).
int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
// For single-input shuffles, there are some nicer lowering tricks we can use.
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
return V;
// Check whether we can widen this to an i16 shuffle by duplicating bytes.
// Notably, this handles splat and partial-splat shuffles more efficiently.
// However, it only makes sense if the pre-duplication shuffle simplifies
// things significantly. Currently, this means we need to be able to
// express the pre-duplication shuffle as an i16 shuffle.
//
// FIXME: We should check for other patterns which can be widened into an
// i16 shuffle as well.
//
// The predicate below accepts a mask when, in every (even, odd) byte pair,
// the two entries are equal or at least one of them is undef.
auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
for (int i = 0; i < 16; i += 2)
if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
return false;
return true;
};
// Returns the lowered value, or an empty SDValue when the inputs cannot be
// gathered into a single half with an i16 shuffle. Captures and may
// overwrite V1.
auto tryToWidenViaDuplication = [&]() -> SDValue {
if (!canWidenViaDuplication(Mask))
return SDValue();
// Collect the sorted, de-duplicated set of source bytes referenced from
// the low half (0-7) and from the high half (8 and up) of the input.
SmallVector<int, 4> LoInputs;
copy_if(Mask, std::back_inserter(LoInputs),
[](int M) { return M >= 0 && M < 8; });
array_pod_sort(LoInputs.begin(), LoInputs.end());
LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
LoInputs.end());
SmallVector<int, 4> HiInputs;
copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
array_pod_sort(HiInputs.begin(), HiInputs.end());
HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
HiInputs.end());
// Gather everything into whichever half already holds more inputs (ties
// go to the low half); inputs from the other half must be moved.
bool TargetLo = LoInputs.size() >= HiInputs.size();
ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
// PreDupI16Shuffle is the i16-granularity shuffle applied before the bytes
// are duplicated; LaneMap records, for each original source byte, the byte
// position it occupies after that shuffle.
int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
SmallDenseMap<int, int, 8> LaneMap;
// Inputs already in the target half keep their i16 lane and byte position.
for (int I : InPlaceInputs) {
PreDupI16Shuffle[I/2] = I/2;
LaneMap[I] = I;
}
// j scans the four i16 lanes of the target half looking for free slots.
int j = TargetLo ? 0 : 4, je = j + 4;
for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
// Check if j is already a shuffle of this input. This happens when
// there are two adjacent bytes after we move the low one.
if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
// If we haven't yet mapped the input, search for a slot into which
// we can map it.
while (j < je && PreDupI16Shuffle[j] >= 0)
++j;
if (j == je)
// We can't place the inputs into a single half with a simple i16
// shuffle, so bail.
return SDValue();
// Map this input with the i16 shuffle.
PreDupI16Shuffle[j] = MovingInputs[i] / 2;
}
// Update the lane map based on the mapping we ended up with.
LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
}
// Apply the pre-duplication shuffle to V1 viewed as v8i16.
V1 = DAG.getBitcast(
MVT::v16i8,
DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
// Unpack the bytes to form the i16s that will be shuffled into place.
V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
MVT::v16i8, V1, V1);
// Build the final i16 shuffle: each output byte pair picks the i16 lane
// that corresponds to its (remapped) source byte, rebased to the chosen
// half so that the index lies in [0, 8).
int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
for (int i = 0; i < 16; ++i)
if (Mask[i] >= 0) {
int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
if (PostDupI16Shuffle[i / 2] < 0)
PostDupI16Shuffle[i / 2] = MappedMask;
else
assert(PostDupI16Shuffle[i / 2] == MappedMask &&
"Conflicting entries in the original shuffle!");
}
return DAG.getBitcast(
MVT::v16i8,
DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
};
if (SDValue V = tryToWidenViaDuplication())
return V;
}
if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
return V;
// Try to use byte shift instructions to mask.
if (SDValue V = lowerVectorShuffleAsByteShiftMask(
DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
// Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
// with PSHUFB. It is important to do this before we attempt to generate any
// blends but after all of the single-input lowerings. If the single input
// lowerings can find an instruction sequence that is faster than a PSHUFB, we
// want to preserve that and we can DAG combine any longer sequences into
// a PSHUFB in the end. But once we start blending from multiple inputs,
// the complexity of DAG combining bad patterns back into PSHUFB is too high,
// and there are *very* few patterns that would actually be faster than the
// PSHUFB approach because of its ability to zero lanes.
//
// FIXME: The only exceptions to the above are blends which are exact
// interleavings with direct instructions supporting them. We currently don't
// handle those well here.
if (Subtarget.hasSSSE3()) {
bool V1InUse = false;
bool V2InUse = false;
// Build the PSHUFB lowering up front; it is the result unless one of the
// alternatives below applies, and it tells us which inputs are really used.
SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
// If both V1 and V2 are in use and we can use a direct blend or an unpack,
// do so. This avoids using them to handle blends-with-zero which is
// important as a single pshufb is significantly faster for that.
if (V1InUse && V2InUse) {
if (Subtarget.hasSSE41())
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// We can use an unpack to do the blending rather than an or in some
// cases. Even though the or may be (very minorly) more efficient, we
// prefer this lowering because there are common cases where part of
// the complexity of the shuffles goes away when we do the final blend as
// an unpack.
// FIXME: It might be worth trying to detect if the unpack-feeding
// shuffles will both be pshufb, in which case we shouldn't bother with
// this.
if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return Unpack;
// If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
if (Subtarget.hasVBMI() && Subtarget.hasVLX())
return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
// Use PALIGNR+Permute if possible - permute might become PSHUFB but the
// PALIGNR will be cheaper than the second PSHUFB+OR.
if (SDValue V = lowerShuffleAsByteRotateAndPermute(
DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return V;
}
return PSHUFB;
}
// Everything below is only reached without SSSE3.
// There are special ways we can lower some single-element blends.
if (NumV2Elements == 1)
if (SDValue V = lowerShuffleAsElementInsertion(
DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
return Blend;
// Check whether a compaction lowering can be done. This handles shuffles
// which take every 2^N-th element (a power-of-two stride). See the helper
// function for details.
//
// We special case these as they can be particularly efficiently handled with
// the PACKUSWB instruction on x86 and they show up in common patterns of
// rearranging bytes to truncate wide elements.
bool IsSingleInput = V2.isUndef();
if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
// NumEvenDrops is the power of two stride of the elements. Another way of
// thinking about it is that we need to drop the even elements this many
// times to get the original input.
// First we need to zero all the dropped bytes.
assert(NumEvenDrops <= 3 &&
"No support for dropping even elements more than 3 times.");
// We use the mask type to pick which bytes are preserved based on how many
// elements are dropped: splatting 0xFF into i16/i32/i64 lanes keeps only
// the lowest byte of every 2/4/8-byte group.
MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
SDValue ByteClearMask = DAG.getBitcast(
MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
if (!IsSingleInput)
V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
// Now pack things back together.
V1 = DAG.getBitcast(MVT::v8i16, V1);
V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
// Each further drop packs the previous result with itself.
for (int i = 1; i < NumEvenDrops; ++i) {
Result = DAG.getBitcast(MVT::v8i16, Result);
Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
}
return Result;
}
// Handle multi-input cases by blending single-input shuffles.
if (NumV2Elements > 0)
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, Mask,
Subtarget, DAG);
// The fallback path for single-input shuffles widens this into two v8i16
// vectors with unpacks, shuffles those, and then pulls them back together
// with a pack.
SDValue V = V1;
// Split the 16-entry byte mask into the entries feeding the low eight and the
// high eight result bytes; undef entries stay -1.
std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
for (int i = 0; i < 16; ++i)
if (Mask[i] >= 0)
(i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
SDValue VLoHalf, VHiHalf;
// Check if any of the odd lanes in the v16i8 are used. If not, we can mask
// them out and avoid using UNPCK{L,H} to extract the elements of V as
// i16s.
if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
// Use a mask to drop the high bytes.
VLoHalf = DAG.getBitcast(MVT::v8i16, V);
VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
DAG.getConstant(0x00FF, DL, MVT::v8i16));
// This will be a single vector shuffle instead of a blend so nuke VHiHalf.
VHiHalf = DAG.getUNDEF(MVT::v8i16);
// Squash the masks to point directly into VLoHalf: even byte 2k of V is
// i16 lane k of VLoHalf.
for (int &M : LoBlendMask)
if (M >= 0)
M /= 2;
for (int &M : HiBlendMask)
if (M >= 0)
M /= 2;
} else {
// Otherwise just unpack the low half of V into VLoHalf and the high half into
// VHiHalf so that we can blend them as i16s.
SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
VLoHalf = DAG.getBitcast(
MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
VHiHalf = DAG.getBitcast(
MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
}
// Shuffle as v8i16 for each result half, then pack the two halves back into
// sixteen bytes.
SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
}
/// Entry point for lowering 128-bit x86 vector shuffles.
///
/// Selects the type-specific lowering routine from the simple value type of
/// \p VT and forwards all operands to it unchanged. Only v2i64, v2f64, v4i32,
/// v4f32, v8i16 and v16i8 are handled; any other type is unreachable.
static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                  MVT VT, SDValue V1, SDValue V2,
                                  const APInt &Zeroable,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  const auto Ty = VT.SimpleTy;

  // 64-bit element types.
  if (Ty == MVT::v2i64)
    return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  if (Ty == MVT::v2f64)
    return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

  // 32-bit element types.
  if (Ty == MVT::v4i32)
    return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  if (Ty == MVT::v4f32)
    return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

  // 16-bit and 8-bit integer element types.
  if (Ty == MVT::v8i16)
    return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  if (Ty == MVT::v16i8)
    return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

  llvm_unreachable("Unimplemented!");
}
/// Generic routine to split vector shuffle into half-sized shuffles.
///
/// This routine just extracts two subvectors, shuffles them independently, and
/// then concatenates them back together. This should work effectively with all
/// AVX vector shuffle types.
///
/// \p VT must be at least 256 bits wide and both \p V1 and \p V2 must have
/// type \p VT. Each half of the result is computed as a blend of up to four
/// half-width inputs (low/high halves of V1 and V2) and the two halves are
/// joined with CONCAT_VECTORS.
static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(VT.getSizeInBits() >= 256 &&
"Only for 256-bit or wider vector shuffles!");
assert(V1.getSimpleValueType() == VT && "Bad operand type!");
assert(V2.getSimpleValueType() == VT && "Bad operand type!");
// Masks describing the low and the high half of the result.
ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
int NumElements = VT.getVectorNumElements();
int SplitNumElements = NumElements / 2;
MVT ScalarVT = VT.getVectorElementType();
MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
// Rather than splitting build-vectors, just build two narrower build
// vectors. This helps shuffling with splats and zeros.
//
// The split is performed on the value found beneath any bitcasts, in that
// value's own element type; both halves are then bitcast to SplitVT.
auto SplitVector = [&](SDValue V) {
V = peekThroughBitcasts(V);
MVT OrigVT = V.getSimpleValueType();
int OrigNumElements = OrigVT.getVectorNumElements();
int OrigSplitNumElements = OrigNumElements / 2;
MVT OrigScalarVT = OrigVT.getVectorElementType();
MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
SDValue LoV, HiV;
auto *BV = dyn_cast<BuildVectorSDNode>(V);
if (!BV) {
// Not a build vector: extract the two halves as subvectors.
LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
DAG.getIntPtrConstant(0, DL));
HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
DAG.getIntPtrConstant(OrigSplitNumElements, DL));
} else {
// Build vector: distribute its operands over two narrower build vectors.
SmallVector<SDValue, 16> LoOps, HiOps;
for (int i = 0; i < OrigSplitNumElements; ++i) {
LoOps.push_back(BV->getOperand(i));
HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
}
LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
}
return std::make_pair(DAG.getBitcast(SplitVT, LoV),
DAG.getBitcast(SplitVT, HiV));
};
SDValue LoV1, HiV1, LoV2, HiV2;
std::tie(LoV1, HiV1) = SplitVector(V1);
std::tie(LoV2, HiV2) = SplitVector(V2);
// Now create two 4-way blends of these half-width vectors.
//
// For one half of the result this computes three masks:
//  - V1BlendMask: per lane, the index into (LoV1, HiV1) when the lane reads V1;
//  - V2BlendMask: per lane, the index into (LoV2, HiV2) when the lane reads V2;
//  - BlendMask:   per lane, lane i of the V1 blend (value i) or lane i of the
//                 V2 blend (value SplitNumElements + i).
// Lanes that are undef in HalfMask stay -1 in all three.
auto HalfBlend = [&](ArrayRef<int> HalfMask) {
bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
for (int i = 0; i < SplitNumElements; ++i) {
int M = HalfMask[i];
if (M >= NumElements) {
// Lane reads from V2; record which half of V2 it touches.
if (M >= NumElements + SplitNumElements)
UseHiV2 = true;
else
UseLoV2 = true;
V2BlendMask[i] = M - NumElements;
BlendMask[i] = SplitNumElements + i;
} else if (M >= 0) {
// Lane reads from V1; record which half of V1 it touches.
if (M >= SplitNumElements)
UseHiV1 = true;
else
UseLoV1 = true;
V1BlendMask[i] = M;
BlendMask[i] = i;
}
}
// Because the lowering happens after all combining takes place, we need to
// manually combine these blend masks as much as possible so that we create
// a minimal number of high-level vector shuffle nodes.
// First try just blending the halves of V1 or V2.
if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
return DAG.getUNDEF(SplitVT);
if (!UseLoV2 && !UseHiV2)
return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
if (!UseLoV1 && !UseHiV1)
return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
// Both inputs contribute: form (or bypass) a per-input blend, then combine
// the two with BlendMask.
SDValue V1Blend, V2Blend;
if (UseLoV1 && UseHiV1) {
V1Blend =
DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
} else {
// We only use half of V1 so map the usage down into the final blend mask.
// Subtracting SplitNumElements rebases indices that point into HiV1.
V1Blend = UseLoV1 ? LoV1 : HiV1;
for (int i = 0; i < SplitNumElements; ++i)
if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
}
if (UseLoV2 && UseHiV2) {
V2Blend =
DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
} else {
// We only use half of V2 so map the usage down into the final blend mask.
// Indices into LoV2 need SplitNumElements added to address the second
// shuffle operand; indices into HiV2 are already in that range.
V2Blend = UseLoV2 ? LoV2 : HiV2;
for (int i = 0; i < SplitNumElements; ++i)
if (BlendMask[i] >= SplitNumElements)
BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
}
return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
};
SDValue Lo = HalfBlend(LoMask);
SDValue Hi = HalfBlend(HiMask);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}
/// Choose between splitting a shuffle into halves and decomposing it into
/// single-input shuffles plus a blend.
///
/// This is a fallback for two-input shuffles spanning more than one 128-bit
/// lane. The decision is:
///  - if each input contributes at most one distinct element (i.e. the
///    shuffle is a blend of two broadcasts), decompose and blend;
///  - otherwise, if each input's elements come from at most one 128-bit lane,
///    split;
///  - otherwise decompose and blend.
///
/// Must not be called with an undef \p V2 (see the assertion).
static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
                                          SDValue V2, ArrayRef<int> Mask,
                                          const X86Subtarget &Subtarget,
                                          SelectionDAG &DAG) {
  assert(!V2.isUndef() && "This routine must not be used to lower single-input "
         "shuffles as it could then recurse on itself.");
  int Size = Mask.size();

  // A broadcast of one element from each input followed by a blend is the
  // preferred form, especially because broadcasts can often fold with memory
  // operands. Detect it by checking that every defined mask entry of a given
  // input names the same element.
  auto IsBlendOfTwoBroadcasts = [&]() -> bool {
    int SeenIdx[2] = {-1, -1};
    for (int M : Mask) {
      if (M < 0)
        continue;
      bool FromV2 = M >= Size;
      int Idx = FromV2 ? M - Size : M;
      int &Seen = SeenIdx[FromV2 ? 1 : 0];
      if (Seen < 0)
        Seen = Idx;
      else if (Seen != Idx)
        return false;
    }
    return true;
  };

  if (!IsBlendOfTwoBroadcasts()) {
    // Record, per input, which 128-bit lanes are referenced by the mask.
    int LaneCount = VT.getSizeInBits() / 128;
    int LaneSize = Size / LaneCount;
    SmallBitVector LaneInputs[2];
    LaneInputs[0].resize(LaneCount, false);
    LaneInputs[1].resize(LaneCount, false);
    for (int M : Mask) {
      if (M < 0)
        continue;
      LaneInputs[M / Size][(M % Size) / LaneSize] = true;
    }

    // When each input draws from at most one lane, the split decomposes to
    // unusually few instructions, so prefer it over blending.
    if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
      return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
  }

  // Decomposed single-input shuffles plus a blend. This requires that the
  // decomposed single-input shuffles don't end up back here.
  return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, Subtarget,
                                              DAG);
}
/// Lower a shuffle that crosses 128-bit lanes as a whole-lane permutation
/// followed by an in-lane permutation.
///
/// This applies only when every destination lane draws all of its defined
/// elements from a single source lane; otherwise an empty SDValue is
/// returned. It is mainly useful when the per-lane permutes do not repeat
/// across lanes.
///
/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
/// we should investigate merging them.
static SDValue lowerShuffleAsLanePermuteAndPermute(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    SelectionDAG &DAG, const X86Subtarget &Subtarget) {
  int NumElts = VT.getVectorNumElements();
  int NumLanes = VT.getSizeInBits() / 128;
  int NumEltsPerLane = NumElts / NumLanes;

  // SrcLaneMask[D] is the source lane feeding destination lane D (undef until
  // seen). PermMask is the in-lane permute applied after lanes are in place.
  SmallVector<int, 4> SrcLaneMask(NumLanes, SM_SentinelUndef);
  SmallVector<int, 16> PermMask(NumElts, SM_SentinelUndef);

  for (int Elt = 0; Elt != NumElts; ++Elt) {
    int M = Mask[Elt];
    if (M < 0)
      continue;

    int DstLane = Elt / NumEltsPerLane;
    int SrcLane = M / NumEltsPerLane;

    // Each destination lane may only ever read from one source lane.
    int &LaneSrc = SrcLaneMask[DstLane];
    if (!isUndefOrEqual(LaneSrc, SrcLane))
      return SDValue();
    LaneSrc = SrcLane;

    PermMask[Elt] = DstLane * NumEltsPerLane + M % NumEltsPerLane;
  }

  // Expand the per-lane choice into an element mask. Every element of a used
  // lane is set, to avoid undef propagation.
  SmallVector<int, 16> LaneMask(NumElts, SM_SentinelUndef);
  for (int DstLane = 0; DstLane != NumLanes; ++DstLane) {
    int SrcLane = SrcLaneMask[DstLane];
    if (SrcLane < 0)
      continue;
    int DstBase = DstLane * NumEltsPerLane;
    int SrcBase = SrcLane * NumEltsPerLane;
    for (int Off = 0; Off != NumEltsPerLane; ++Off)
      LaneMask[DstBase + Off] = SrcBase + Off;
  }

  // If we're only shuffling a single lowest lane and the rest are identity
  // then don't bother.
  // TODO - isShuffleMaskInputInPlace could be extended to something like this.
  int NumIdentityLanes = 0;
  bool OnlyShuffleLowestLane = true;
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    int LaneBase = Lane * NumEltsPerLane;
    if (isSequentialOrUndefInRange(PermMask, LaneBase, NumEltsPerLane,
                                   LaneBase)) {
      ++NumIdentityLanes;
      continue;
    }
    int Src = SrcLaneMask[Lane];
    if (Src != 0 && Src != NumLanes)
      OnlyShuffleLowestLane = false;
  }
  if (OnlyShuffleLowestLane && NumIdentityLanes == NumLanes - 1)
    return SDValue();

  // First move whole lanes into place, then permute within each lane.
  SDValue LanePermute = DAG.getVectorShuffle(VT, DL, V1, V2, LaneMask);
  SDValue Undef = DAG.getUNDEF(VT);
  return DAG.getVectorShuffle(VT, DL, LanePermute, Undef, PermMask);
}
/// Lower a vector shuffle crossing multiple 128-bit lanes as
/// a permutation and blend of those lanes.
///
/// This essentially blends the out-of-lane inputs to each lane into the lane
/// from a permuted copy of the vector. This lowering strategy results in four
/// instructions in the worst case for a single-input cross lane shuffle which
/// is lower than any other fully general cross-lane shuffle strategy I'm aware
/// of. Special cases for each particular shuffle pattern should be handled
/// prior to trying this lowering.
///
/// Only 256-bit vector types are accepted. If the lane-usage checks below
/// decide that splitting is cheaper, the shuffle is forwarded to
/// splitAndLowerShuffle; otherwise \p V2 must be undef.
static SDValue lowerShuffleAsLanePermuteAndBlend(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    SelectionDAG &DAG, const X86Subtarget &Subtarget) {
  // FIXME: This should probably be generalized for 512-bit vectors as well.
  assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
  int Size = Mask.size();
  int LaneSize = Size / 2;

  // If there are only inputs from one 128-bit lane, splitting will in fact be
  // less expensive. The flags track whether the given lane contains an element
  // that crosses to another lane.
  if (!Subtarget.hasAVX2()) {
    bool LaneCrossing[2] = {false, false};
    for (int i = 0; i < Size; ++i)
      if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
        LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
    if (!LaneCrossing[0] || !LaneCrossing[1])
      return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
  } else {
    // With AVX2, split only when one of the two source lanes is never read.
    // The index is reduced modulo Size first (as in the branch above) so that
    // an element taken from V2 (index >= Size) maps to lane 0/1 instead of
    // writing past the end of LaneUsed.
    bool LaneUsed[2] = {false, false};
    for (int i = 0; i < Size; ++i)
      if (Mask[i] >= 0)
        LaneUsed[(Mask[i] % Size) / LaneSize] = true;
    if (!LaneUsed[0] || !LaneUsed[1])
      return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
  }

  assert(V2.isUndef() &&
         "This last part of this routine only works on single input shuffles");

  // Build the blend mask: an element already in its destination lane keeps its
  // index into V1; a lane-crossing element instead selects the same in-lane
  // offset, within the destination lane, from the flipped copy (the second
  // shuffle operand, hence the "+ Size").
  SmallVector<int, 32> FlippedBlendMask(Size);
  for (int i = 0; i < Size; ++i)
    FlippedBlendMask[i] =
        Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
                                ? Mask[i]
                                : Mask[i] % LaneSize +
                                      (i / LaneSize) * LaneSize + Size);

  // Flip the vector, and blend the results which should now be in-lane.
  // The flip swaps the two 128-bit halves by shuffling V1 as four 64-bit
  // elements with mask {2, 3, 0, 1}.
  MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
  SDValue Flipped = DAG.getBitcast(PVT, V1);
  Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT),
                                 { 2, 3, 0, 1 });
  Flipped = DAG.getBitcast(VT, Flipped);
  return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
}
/// Handle lowering 2-lane 128-bit shuffles.
///
/// The mask is first widened so that WidenedMask[0] / WidenedMask[1] describe
/// the low / high 128-bit half of the result. The strategies below are then
/// tried in order: insert into a zero vector, blend, 128-bit subvector insert,
/// SHUF128 (VLX only) and finally VPERM2X128. An empty SDValue is returned
/// when this routine declines to lower the shuffle.
static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
if (Subtarget.hasAVX2() && V2.isUndef())
return SDValue();
// Bail out if the mask cannot be widened; everything below indexes
// WidenedMask per 128-bit half.
SmallVector<int, 4> WidenedMask;
if (!canWidenShuffleElements(Mask, Zeroable, WidenedMask))
return SDValue();
// A half counts as zero only when both of its Zeroable bits are set: bits 0-1
// for the low half, bits 2-3 for the high half (presumably one bit per result
// element - confirm against the caller that computes Zeroable).
bool IsLowZero = (Zeroable & 0x3) == 0x3;
bool IsHighZero = (Zeroable & 0xc) == 0xc;
// Try to use an insert into a zero vector.
if (WidenedMask[0] == 0 && IsHighZero) {
// Extract the low two elements of V1 and insert them at index 0 of a zero
// vector of the full type.
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
DAG.getIntPtrConstant(0, DL));
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
getZeroVector(VT, Subtarget, DAG, DL), LoV,
DAG.getIntPtrConstant(0, DL));
}
// TODO: If minimizing size and one of the inputs is a zero vector and the
// zero vector has only one use, we could use a VPERM2X128 to save the
// instruction bytes needed to explicitly generate the zero vector.
// Blends are faster and handle all the non-lane-crossing cases.
if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return Blend;
// If either input operand is a zero vector, use VPERM2X128 because its mask
// allows us to replace the zero input with an implicit zero.
if (!IsLowZero && !IsHighZero) {
// Check for patterns which can be matched with a single insert of a 128-bit
// subvector.
bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
// With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
// this will likely become vinsertf128 which can't fold a 256-bit memop.
if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
// Take the low two elements of V1 (mask {0, 1, 0, 1}) or of V2 (mask
// {0, 1, 4, 5}) and insert them at element index 2 of V1, i.e. as its
// upper half.
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
OnlyUsesV1 ? V1 : V2,
DAG.getIntPtrConstant(0, DL));
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
DAG.getIntPtrConstant(2, DL));
}
}
// Try to use SHUF128 if possible.
if (Subtarget.hasVLX()) {
// Only taken when the low half of the result comes from V1 (index < 2) and
// the high half from V2 (index >= 2). Each immediate bit selects which
// 128-bit half of that operand is used.
if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
((WidenedMask[1] % 2) << 1);
return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
DAG.getConstant(PermMask, DL, MVT::i8));
}
}
}
// Otherwise form a 128-bit permutation. After accounting for undefs,
// convert the 64-bit shuffle mask selection values into 128-bit
// selection bits by dividing the indexes by 2 and shifting into positions
// defined by a vperm2*128 instruction's immediate control byte.
// The immediate permute control byte looks like this:
// [1:0] - select 128 bits from sources for low half of destination
// [2] - ignore
// [3] - zero low half of destination
// [5:4] - select 128 bits from sources for high half of destination
// [6] - ignore
// [7] - zero high half of destination
assert((WidenedMask[0] >= 0 || IsLowZero) &&
(WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
unsigned PermMask = 0;
PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
// Check the immediate mask and replace unused sources with undef.
// For each half, masking with 0x0a / 0xa0 keeps the upper select bit and the
// zero bit: the value 0x00 means "reads V1", 0x02 / 0x20 means "reads V2",
// and anything with the zero bit set reads neither source.
if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
V1 = DAG.getUNDEF(VT);
if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
V2 = DAG.getUNDEF(VT);
return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
DAG.getConstant(PermMask, DL, MVT::i8));
}
/// Lower a vector shuffle by first fixing the 128-bit lanes and then
/// shuffling each lane.
///
/// This attempts to create a repeated lane shuffle where each lane uses one
/// or two of the lanes of the inputs. The lanes of the input vectors are
/// shuffled in one or two independent shuffles to get the lanes into the
/// position needed by the final shuffle.
///
/// The result has the form shuffle(NewV1, NewV2, <RepeatMask in every lane>),
/// where NewV1 / NewV2 are whole-lane shuffles of V1 and V2. Returns an empty
/// SDValue when no such decomposition is found. Requires a non-undef V2.
static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(!V2.isUndef() && "This is only useful with multiple inputs.");
// Nothing to gain if the mask already repeats per 128-bit lane.
if (is128BitLaneRepeatedShuffleMask(VT, Mask))
return SDValue();
int Size = Mask.size();
int NumLanes = VT.getSizeInBits() / 128;
int LaneSize = 128 / VT.getScalarSizeInBits();
// RepeatMask is the per-lane mask shared by all lanes of the final shuffle:
// entries below Size refer to NewV1, entries of Size or more refer to NewV2.
// LaneSrcs[Lane][k] is the source lane (numbered across both inputs) that
// lane `Lane` of the k-th intermediate vector must hold; -1 means unset.
SmallVector<int, 16> RepeatMask(LaneSize, -1);
SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
// First pass will try to fill in the RepeatMask from lanes that need two
// sources.
for (int Lane = 0; Lane != NumLanes; ++Lane) {
int Srcs[2] = { -1, -1 };
SmallVector<int, 16> InLaneMask(LaneSize, -1);
for (int i = 0; i != LaneSize; ++i) {
int M = Mask[(Lane * LaneSize) + i];
if (M < 0)
continue;
// Determine which of the possible input lanes (NumLanes from each source)
// this element comes from. Assign that as one of the sources for this
// lane. We can assign up to 2 sources for this lane. If we run out
// sources we can't do anything.
int LaneSrc = M / LaneSize;
int Src;
if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
Src = 0;
else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
Src = 1;
else
return SDValue();
Srcs[Src] = LaneSrc;
// In-lane element offset, biased by Size when it comes from the second
// source slot.
InLaneMask[i] = (M % LaneSize) + Src * Size;
}
// If this lane has two sources, see if it fits with the repeat mask so far.
if (Srcs[1] < 0)
continue;
LaneSrcs[Lane][0] = Srcs[0];
LaneSrcs[Lane][1] = Srcs[1];
// Two masks are compatible when every position that is defined in both
// holds the same value.
auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
assert(M1.size() == M2.size() && "Unexpected mask size");
for (int i = 0, e = M1.size(); i != e; ++i)
if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
return false;
return true;
};
// Copy the defined entries of Mask into MergedMask; the caller must have
// checked compatibility with MatchMasks first.
auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
for (int i = 0, e = MergedMask.size(); i != e; ++i) {
int M = Mask[i];
if (M < 0)
continue;
assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
"Unexpected mask element");
MergedMask[i] = M;
}
};
if (MatchMasks(InLaneMask, RepeatMask)) {
// Merge this lane mask into the final repeat mask.
MergeMasks(InLaneMask, RepeatMask);
continue;
}
// Didn't find a match. Swap the operands and try again.
std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
ShuffleVectorSDNode::commuteMask(InLaneMask);
if (MatchMasks(InLaneMask, RepeatMask)) {
// Merge this lane mask into the final repeat mask.
MergeMasks(InLaneMask, RepeatMask);
continue;
}
// Couldn't find a match with the operands in either order.
return SDValue();
}
// Now handle any lanes with only one source.
for (int Lane = 0; Lane != NumLanes; ++Lane) {
// If this lane has already been processed, skip it.
if (LaneSrcs[Lane][0] >= 0)
continue;
for (int i = 0; i != LaneSize; ++i) {
int M = Mask[(Lane * LaneSize) + i];
if (M < 0)
continue;
// If RepeatMask isn't defined yet we can define it ourself.
if (RepeatMask[i] < 0)
RepeatMask[i] = M % LaneSize;
// The existing RepeatMask entry decides which intermediate vector this
// position reads, so the lane's single source must be placed in that
// vector and the in-lane offset must agree.
if (RepeatMask[i] < Size) {
if (RepeatMask[i] != M % LaneSize)
return SDValue();
LaneSrcs[Lane][0] = M / LaneSize;
} else {
if (RepeatMask[i] != ((M % LaneSize) + Size))
return SDValue();
LaneSrcs[Lane][1] = M / LaneSize;
}
}
// Bail if no source was assigned to this lane at all.
if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
return SDValue();
}
// Build the whole-lane shuffle producing NewV1 from LaneSrcs[...][0]; lanes
// without a source are left undef.
SmallVector<int, 16> NewMask(Size, -1);
for (int Lane = 0; Lane != NumLanes; ++Lane) {
int Src = LaneSrcs[Lane][0];
for (int i = 0; i != LaneSize; ++i) {
int M = -1;
if (Src >= 0)
M = Src * LaneSize + i;
NewMask[Lane * LaneSize + i] = M;
}
}
SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
// Ensure we didn't get back the shuffle we started with.
// FIXME: This is a hack to make up for some splat handling code in
// getVectorShuffle.
if (isa<ShuffleVectorSDNode>(NewV1) &&
cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
return SDValue();
// Same again for NewV2, using LaneSrcs[...][1]. NewMask is fully overwritten.
for (int Lane = 0; Lane != NumLanes; ++Lane) {
int Src = LaneSrcs[Lane][1];
for (int i = 0; i != LaneSize; ++i) {
int M = -1;
if (Src >= 0)
M = Src * LaneSize + i;
NewMask[Lane * LaneSize + i] = M;
}
}
SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
// Ensure we didn't get back the shuffle we started with.
// FIXME: This is a hack to make up for some splat handling code in
// getVectorShuffle.
if (isa<ShuffleVectorSDNode>(NewV2) &&
cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
return SDValue();
// Expand RepeatMask to the full width by offsetting each defined entry to
// the lane it is applied in.
for (int i = 0; i != Size; ++i) {
NewMask[i] = RepeatMask[i % LaneSize];
if (NewMask[i] < 0)
continue;
NewMask[i] += (i / LaneSize) * LaneSize;
}
return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
}
/// Try to narrow a shuffle whose result is undefined in exactly one half.
///
/// Succeeds (returns true) when either the lower or the upper half of \p Mask
/// is entirely undef - but not both - and the defined half reads from at most
/// two of the four operand halves (0 = lower V1, 1 = upper V1, 2 = lower V2,
/// 3 = upper V2). On success \p HalfMask holds the half-width mask expressed
/// relative to those two extracted halves, and \p HalfIdx1 / \p HalfIdx2 name
/// the halves used (-1 when unused).
///
/// On failure the outputs may have been partially written.
static bool
getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
                   int &HalfIdx1, int &HalfIdx2) {
  assert((Mask.size() == HalfMask.size() * 2) &&
         "Expected input mask to be twice as long as output");

  // Narrowing needs exactly one undef half.
  bool LowerIsUndef = isUndefLowerHalf(Mask);
  bool UpperIsUndef = isUndefUpperHalf(Mask);
  if (LowerIsUndef == UpperIsUndef)
    return false;

  unsigned HalfWidth = HalfMask.size();
  // Index of the first element of the defined half within Mask.
  unsigned FirstDefined = LowerIsUndef ? HalfWidth : 0;
  HalfIdx1 = HalfIdx2 = -1;

  for (unsigned Pos = 0; Pos != HalfWidth; ++Pos) {
    int Elt = Mask[Pos + FirstDefined];
    if (Elt < 0) {
      // Pass undef/sentinel entries straight through.
      HalfMask[Pos] = Elt;
    } else {
      // Which of the four operand halves the element lives in, and where.
      int SrcHalf = Elt / HalfWidth;
      int EltInHalf = Elt % HalfWidth;
      if (HalfIdx1 < 0 || HalfIdx1 == SrcHalf) {
        // First operand of the narrow shuffle.
        HalfMask[Pos] = EltInHalf;
        HalfIdx1 = SrcHalf;
      } else if (HalfIdx2 < 0 || HalfIdx2 == SrcHalf) {
        // Second operand of the narrow shuffle.
        HalfMask[Pos] = EltInHalf + HalfWidth;
        HalfIdx2 = SrcHalf;
      } else {
        // A third distinct half would be needed.
        return false;
      }
    }
  }
  return true;
}
/// Build the narrowed shuffle described by the outputs of
/// getHalfShuffleMask(): extract the selected halves of \p V1 / \p V2, shuffle
/// them at half width with \p HalfMask, and insert the result into an undef
/// vector of the original type - in the upper half when \p UndefLower is set,
/// otherwise in the lower half.
static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
                                     ArrayRef<int> HalfMask, int HalfIdx1,
                                     int HalfIdx2, bool UndefLower,
                                     SelectionDAG &DAG) {
  assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
  assert(V1.getValueType().isSimple() && "Expecting only simple types");

  MVT VT = V1.getSimpleValueType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned HalfNumElts = NumElts / 2;
  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);

  // Map a half index (0/1 = lower/upper V1, 2/3 = lower/upper V2, negative =
  // unused) to the corresponding half-width value.
  auto ExtractHalf = [&](int HalfIdx) -> SDValue {
    if (HalfIdx < 0)
      return DAG.getUNDEF(HalfVT);
    SDValue Source = HalfIdx < 2 ? V1 : V2;
    int StartElt = (HalfIdx % 2) * HalfNumElts;
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Source,
                       DAG.getIntPtrConstant(StartElt, DL));
  };

  // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
  SDValue FirstHalf = ExtractHalf(HalfIdx1);
  SDValue SecondHalf = ExtractHalf(HalfIdx2);
  SDValue Narrow =
      DAG.getVectorShuffle(HalfVT, DL, FirstHalf, SecondHalf, HalfMask);
  unsigned InsertAt = UndefLower ? HalfNumElts : 0;
  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Narrow,
                     DAG.getIntPtrConstant(InsertAt, DL));
}
/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
/// This allows for fast cases such as subvector extraction/insertion
/// or shuffling smaller vector types which can lower more efficiently.
///
/// Returns an empty SDValue when neither half of the mask is undef, when the
/// defined half needs more than two operand halves, or when the subtarget
/// checks below decide a full-width shuffle is preferable.
static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert((VT.is256BitVector() || VT.is512BitVector()) &&
"Expected 256-bit or 512-bit vector");
bool UndefLower = isUndefLowerHalf(Mask);
if (!UndefLower && !isUndefUpperHalf(Mask))
return SDValue();
assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
"Completely undef shuffle mask should have been simplified already");
// Upper half is undef and lower half is whole upper subvector.
// e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
unsigned NumElts = VT.getVectorNumElements();
unsigned HalfNumElts = NumElts / 2;
MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
if (!UndefLower &&
isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
DAG.getIntPtrConstant(HalfNumElts, DL));
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
DAG.getIntPtrConstant(0, DL));
}
// Lower half is undef and upper half is whole lower subvector.
// e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
if (UndefLower &&
isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
// Note: despite its name, Hi holds the lower half of V1 here (extracted at
// index 0); it is inserted into the upper half of the result.
SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
DAG.getIntPtrConstant(0, DL));
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
DAG.getIntPtrConstant(HalfNumElts, DL));
}
// General case: compute the half-width mask and which operand halves
// (0 = lower V1, 1 = upper V1, 2 = lower V2, 3 = upper V2) it reads.
int HalfIdx1, HalfIdx2;
SmallVector<int, 8> HalfMask(HalfNumElts);
if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
return SDValue();
assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
// Only shuffle the halves of the inputs when useful.
// Even half indices are lower halves, odd ones are upper halves; an unused
// slot (-1) counts as neither.
unsigned NumLowerHalves =
(HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
unsigned NumUpperHalves =
(HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
// Determine the larger pattern of undef/halves, then decide if it's worth
// splitting the shuffle based on subtarget capabilities and types.
unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
if (!UndefLower) {
// XXXXuuuu: no insert is needed.
// Always extract lowers when setting lower - these are all free subreg ops.
if (NumUpperHalves == 0)
return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
UndefLower, DAG);
if (NumUpperHalves == 1) {
// AVX2 has efficient 32/64-bit element cross-lane shuffles.
if (Subtarget.hasAVX2()) {
// extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
!is128BitUnpackShuffleMask(HalfMask) &&
(!isSingleSHUFPSMask(HalfMask) ||
Subtarget.hasFastVariableShuffle()))
return SDValue();
// If this is a unary shuffle (assume that the 2nd operand is
// canonicalized to undef), then we can use vpermpd. Otherwise, we
// are better off extracting the upper half of 1 operand and using a
// narrow shuffle.
if (EltWidth == 64 && V2.isUndef())
return SDValue();
}
// AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
if (Subtarget.hasAVX512() && VT.is512BitVector())
return SDValue();
// Extract + narrow shuffle is better than the wide alternative.
return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
UndefLower, DAG);
}
// Don't extract both uppers, instead shuffle and then extract.
assert(NumUpperHalves == 2 && "Half vector count went wrong");
return SDValue();
}
// UndefLower - uuuuXXXX: an insert to high half is required if we split this.
if (NumUpperHalves == 0) {
// AVX2 has efficient 64-bit element cross-lane shuffles.
// TODO: Refine to account for unary shuffle, splat, and other masks?
if (Subtarget.hasAVX2() && EltWidth == 64)
return SDValue();
// AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
if (Subtarget.hasAVX512() && VT.is512BitVector())
return SDValue();
// Narrow shuffle + insert is better than the wide alternative.
return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
UndefLower, DAG);
}
// NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
return SDValue();
}
/// Check whether one shuffle input (\p Input is 0 or 1) is only ever blended
/// in place by \p Mask.
///
/// Returns true when every mask element that reads from the chosen input
/// reads the element sitting at that same result position, so the input
/// needs no permutation. Negative (undef/sentinel) entries are ignored.
static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
  int NumElts = Mask.size();
  for (int Slot = 0; Slot != NumElts; ++Slot) {
    int Elt = Mask[Slot];
    if (Elt < 0)
      continue;
    // Elt / NumElts selects the input, Elt % NumElts the element within it.
    bool ReadsInput = (Elt / NumElts) == Input;
    if (ReadsInput && (Elt % NumElts) != Slot)
      return false;
  }
  return true;
}
/// Handle case where shuffle sources are coming from the same 128-bit lane and
/// every lane can be represented as the same repeating mask - allowing us to
/// shuffle the sources with the repeating shuffle and then permute the result
/// to the destination lanes.
///
/// On AVX2 a shuffle-then-broadcast form is tried first. Returns an empty
/// SDValue when the mask does not fit either pattern.
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
int NumElts = VT.getVectorNumElements();
int NumLanes = VT.getSizeInBits() / 128;
int NumLaneElts = NumElts / NumLanes;
// On AVX2 we may be able to just shuffle the lowest elements and then
// broadcast the result.
if (Subtarget.hasAVX2()) {
for (unsigned BroadcastSize : {16, 32, 64}) {
// Only consider broadcast groups wider than a single element.
if (BroadcastSize <= VT.getScalarSizeInBits())
continue;
int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
// Attempt to match a repeating pattern every NumBroadcastElts,
// accounting for UNDEFs but only references the lowest 128-bit
// lane of the inputs.
// RepeatMask[j] receives the source element for offset j of each group; the
// match fails if two groups disagree or if an element lies outside the
// lowest lane of its input.
auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
for (int i = 0; i != NumElts; i += NumBroadcastElts)
for (int j = 0; j != NumBroadcastElts; ++j) {
int M = Mask[i + j];
if (M < 0)
continue;
int &R = RepeatMask[j];
if (0 != ((M % NumElts) / NumLaneElts))
return false;
if (0 <= R && R != M)
return false;
R = M;
}
return true;
};
SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
if (!FindRepeatingBroadcastMask(RepeatMask))
continue;
// Shuffle the (lowest) repeated elements in place for broadcast.
SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
// Shuffle the actual broadcast.
// Every group of NumBroadcastElts result elements reads elements
// 0..NumBroadcastElts-1 of RepeatShuf.
SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
for (int i = 0; i != NumElts; i += NumBroadcastElts)
for (int j = 0; j != NumBroadcastElts; ++j)
BroadcastMask[i + j] = j;
return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
BroadcastMask);
}
}
// Bail if the shuffle mask doesn't cross 128-bit lanes.
if (!is128BitLaneCrossingShuffleMask(VT, Mask))
return SDValue();
// Bail if we already have a repeated lane shuffle mask.
SmallVector<int, 8> RepeatedShuffleMask;
if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
return SDValue();
// On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
// (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
int NumSubLanes = NumLanes * SubLaneScale;
int NumSubLaneElts = NumLaneElts / SubLaneScale;
// Check that all the sources are coming from the same lane and see if we can
// form a repeating shuffle mask (local to each sub-lane). At the same time,
// determine the source sub-lane for each destination sub-lane.
int TopSrcSubLane = -1;
SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
// One candidate repeated mask per sub-lane position within a lane; only the
// first SubLaneScale entries are consulted below.
SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
// Extract the sub-lane mask, check that it all comes from the same lane
// and normalize the mask entries to come from the first lane.
int SrcLane = -1;
SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
if (M < 0)
continue;
int Lane = (M % NumElts) / NumLaneElts;
if ((0 <= SrcLane) && (SrcLane != Lane))
return SDValue();
SrcLane = Lane;
// Keep the in-lane offset and add NumElts when the element comes from the
// second input (M >= NumElts).
int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
SubLaneMask[Elt] = LocalM;
}
// Whole sub-lane is UNDEF.
if (SrcLane < 0)
continue;
// Attempt to match against the candidate repeated sub-lane masks.
for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
// Masks match when all positions defined in both are equal.
auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
for (int i = 0; i != NumSubLaneElts; ++i) {
if (M1[i] < 0 || M2[i] < 0)
continue;
if (M1[i] != M2[i])
return false;
}
return true;
};
auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
continue;
// Merge the sub-lane mask into the matching repeated sub-lane mask.
for (int i = 0; i != NumSubLaneElts; ++i) {
int M = SubLaneMask[i];
if (M < 0)
continue;
assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
"Unexpected mask element");
RepeatedSubLaneMask[i] = M;
}
// Track the top most source sub-lane - by setting the remaining to UNDEF
// we can greatly simplify shuffle matching.
int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
break;
}
// Bail if we failed to find a matching repeated sub-lane mask.
if (Dst2SrcSubLanes[DstSubLane] < 0)
return SDValue();
}
assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
"Unexpected source lane");
// Create a repeating shuffle mask for the entire vector.
// Sub-lanes above TopSrcSubLane are left undef.
SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
int Lane = SubLane / SubLaneScale;
auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
int M = RepeatedSubLaneMask[Elt];
if (M < 0)
continue;
int Idx = (SubLane * NumSubLaneElts) + Elt;
// Re-base the lane-local index onto the lane this sub-lane belongs to.
RepeatedMask[Idx] = M + (Lane * NumLaneElts);
}
}
SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
// Shuffle each source sub-lane to its destination.
SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
for (int i = 0; i != NumElts; i += NumSubLaneElts) {
int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
if (SrcSubLane < 0)
continue;
for (int j = 0; j != NumSubLaneElts; ++j)
SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
}
return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
SubLaneMask);
}
/// Check whether \p Mask can be implemented by a single VSHUFPD and compute
/// its immediate.
///
/// Result element i must come from the 64-bit pair starting at (i & 6) of the
/// first operand when i is even and of the second operand when i is odd. The
/// commuted form (operands exchanged) is also accepted, in which case \p V1
/// and \p V2 are swapped before returning true. Undef entries match anything;
/// any other negative entry rejects the mask.
///
/// \p ShuffleImm is always overwritten, including on failure.
static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
                                   unsigned &ShuffleImm, ArrayRef<int> Mask) {
  int NumElts = VT.getVectorNumElements();
  assert(VT.getScalarSizeInBits() == 64 &&
         (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
         "Unexpected data type for VSHUFPD");

  // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
  // Mask for V4F64; 0/1, 4/5, 2/3, 6/7..
  ShuffleImm = 0;
  bool MatchesDirect = true;
  bool MatchesCommuted = true;
  for (int Idx = 0; Idx < NumElts; ++Idx) {
    int Elt = Mask[Idx];
    if (Elt == SM_SentinelUndef)
      continue;
    if (Elt < 0)
      return false;

    // First index of the only pair this position may read, for the direct and
    // for the operand-swapped form.
    int DirectBase = (Idx & 6) + NumElts * (Idx & 1);
    int CommutedBase = (Idx & 0xe) + NumElts * ((Idx & 1) ^ 1);
    if (!(DirectBase <= Elt && Elt <= DirectBase + 1))
      MatchesDirect = false;
    if (!(CommutedBase <= Elt && Elt <= CommutedBase + 1))
      MatchesCommuted = false;

    // Bit Idx of the immediate picks the low or high element of the pair.
    ShuffleImm |= (Elt % 2) << Idx;
  }

  if (MatchesDirect)
    return true;
  if (!MatchesCommuted)
    return false;
  std::swap(V1, V2);
  return true;
}
/// Emit an X86ISD::SHUFP node for \p Mask if matchShuffleWithSHUFPD accepts
/// it (possibly after commuting the operands); otherwise return an empty
/// SDValue. Only v2f64, v4f64 and v8f64 are accepted.
static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
                                      ArrayRef<int> Mask, SDValue V1,
                                      SDValue V2, SelectionDAG &DAG) {
  assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64)&&
         "Unexpected data type for VSHUFPD");

  unsigned Imm = 0;
  // The matcher may exchange V1 and V2; both are local copies here.
  if (matchShuffleWithSHUFPD(VT, V1, V2, Imm, Mask))
    return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
                       DAG.getConstant(Imm, DL, MVT::i8));
  return SDValue();
}
/// Handle lowering of 4-lane 64-bit floating point shuffles.
///
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
/// isn't available.
///
/// The lowering strategies are tried strictly in the order written below; the
/// first one that yields a value wins. Both operands must be v4f64 and the
/// mask must have exactly four entries.
static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
// First try to treat the shuffle as a shuffle of two 128-bit halves.
if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return V;
// Single-input shuffles: this branch always returns.
if (V2.isUndef()) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
// Use low duplicate instructions for masks that match their pattern.
if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
// Non-half-crossing single input shuffles can be lowered with an
// interleaved permutation.
// Bit i of the immediate is set when result element i takes the odd
// (second) element of its own 128-bit lane.
unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
DAG.getConstant(VPERMILPMask, DL, MVT::i8));
}
// With AVX2 we have direct support for this permutation.
if (Subtarget.hasAVX2())
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return V;
// Try to permute the lanes and then use a per-lane permute.
if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
Mask, DAG, Subtarget))
return V;
// Otherwise, fall back.
return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, DAG,
Subtarget);
}
// Two-input shuffles from here on.
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
return V;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Check if the blend happens to exactly fit that of SHUFPD.
if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
return Op;
// If we have one input in place, then we can permute the other input and
// blend the result.
if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask,
Subtarget, DAG);
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return V;
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle. However, if we have AVX2 and either inputs are already in place,
// we will be able to shuffle even across lanes the other input in a single
// instruction so skip this pattern.
if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
isShuffleMaskInputInPlace(1, Mask))))
if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return V;
// If we have VLX support, we can use VEXPAND.
if (Subtarget.hasVLX())
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
DAG, Subtarget))
return V;
// If we have AVX2 then we always want to lower with a blend because on v4 we
// can fully permute the elements.
if (Subtarget.hasAVX2())
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask,
Subtarget, DAG);
// Otherwise fall back on generic lowering.
return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
Subtarget, DAG);
}
/// Handle lowering of 4-lane 64-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling.
static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return V;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG))
return Broadcast;
if (V2.isUndef()) {
// When the shuffle is mirrored between the 128-bit lanes of the unit, we
// can use lower latency instructions that will operate on both lanes.
SmallVector<int, 2> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
SmallVector<int, 4> PSHUFDMask;
scaleShuffleMask<int>(2, RepeatedMask, PSHUFDMask);
return DAG.getBitcast(
MVT::v4i64,
DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
DAG.getBitcast(MVT::v8i32, V1),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
}
// AVX2 provides a direct instruction for permuting a single input across
// lanes.
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// If we have VLX support, we can use VALIGN or VEXPAND.
if (Subtarget.hasVLX()) {
if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
DAG, Subtarget))
return V;
}
// Try to use PALIGNR.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
return V;
// If we have one input in place, then we can permute the other input and
// blend the result.
if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG);
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
return V;
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle. However, if we have AVX2 and either inputs are already in place,
// we will be able to shuffle even across lanes the other input in a single
// instruction so skip this pattern.
if (!isShuffleMaskInputInPlace(0, Mask) &&
!isShuffleMaskInputInPlace(1, Mask))
if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
return Result;
// Otherwise fall back on generic blend lowering.
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG);
}
/// Handle lowering of 8-lane 32-bit floating point shuffles.
///
/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
/// isn't available.
///
/// The strategies below are tried in order and the first one that yields a
/// node wins. Every path returns a value.
///
/// \param DL        Location attached to the nodes created here.
/// \param Mask      Shuffle mask; must have exactly 8 entries.
/// \param Zeroable  Forwarded unchanged to the helpers that accept it.
/// \param V1        First shuffle input; must be of type v8f32.
/// \param V2        Second shuffle input; must be of type v8f32. An undef V2
///                  selects the single-input lowering paths.
/// \param Subtarget Feature set; AVX2, AVX512 and VLX are queried.
/// \param DAG       The DAG in which new nodes are created.
static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Blend;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
                                                  Subtarget, DAG))
    return Broadcast;

  // If the shuffle mask is repeated in each 128-bit lane, we have many more
  // options to efficiently lower the shuffle.
  SmallVector<int, 4> RepeatedMask;
  if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
    assert(RepeatedMask.size() == 4 &&
           "Repeated masks must be half the mask width!");

    // Use even/odd duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
      return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
    if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
      return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);

    if (V2.isUndef())
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
      return V;

    // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
    // have already handled any direct blends.
    return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
  }

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
    return V;

  // If we have a single input shuffle with different shuffle patterns in the
  // two 128-bit lanes use the variable mask to VPERMILPS.
  if (V2.isUndef()) {
    // The constant mask vector is only materialized inside the branches that
    // consume it, so the fallback below does not leave an unused node behind.
    if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
      SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
      return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
    }

    if (Subtarget.hasAVX2()) {
      SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
      return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
    }

    // Otherwise, fall back.
    return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
                                             DAG, Subtarget);
  }

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // If we have VLX support, we can use VEXPAND.
  if (Subtarget.hasVLX())
    if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
                                         DAG, Subtarget))
      return V;

  // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
  // since after split we get a more efficient code using vpunpcklwd and
  // vpunpckhwd instrs than vblend.
  if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
    if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
                                               Subtarget, DAG))
      return V;

  // If we have AVX2 then we always want to lower with a blend because at v8 we
  // can fully permute the elements.
  if (Subtarget.hasAVX2())
    return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, Mask,
                                                Subtarget, DAG);

  // Otherwise fall back on generic lowering.
  return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
                                    Subtarget, DAG);
}
/// Lower a shuffle of eight 32-bit integer lanes.
///
/// Callers only reach this routine when AVX2 is available, which gives us a
/// reasonable instruction set for v8i32 shuffles.
static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  const MVT VT = MVT::v8i32;
  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
  assert(V2.getSimpleValueType() == VT && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");

  // A zext is strictly faster than any alternative and frequently lets a
  // memory operand fold into the shuffle, so it gets the first shot.
  if (SDValue Extended = lowerShuffleAsZeroOrAnyExtend(
          DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return Extended;

  // Without AVX512, a two-input mask of 16-bit elements in lane is better
  // split: vpunpcklwd/vpunpckhwd give more efficient code than vblend.
  if (isUnpackWdShuffleMask(Mask, VT) && !V2.isUndef() &&
      !Subtarget.hasAVX512()) {
    if (SDValue Split =
            lowerShuffleAsSplitOrBlend(DL, VT, V1, V2, Mask, Subtarget, DAG))
      return Split;
  }

  if (SDValue Blended = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
                                            Subtarget, DAG))
    return Blended;

  // See whether a single element can simply be broadcast.
  if (SDValue Splat =
          lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return Splat;

  // A mask that repeats in each 128-bit lane unlocks cheaper instructions
  // that mirror the shuffle across both lanes.
  SmallVector<int, 4> LaneMask;
  const bool RepeatsPerLane =
      is128BitLaneRepeatedShuffleMask(VT, Mask, LaneMask);
  if (RepeatsPerLane) {
    assert(LaneMask.size() == 4 && "Unexpected repeated mask size!");
    if (V2.isUndef())
      return DAG.getNode(X86ISD::PSHUFD, DL, VT, V1,
                         getV4X86ShuffleImm8ForMask(LaneMask, DL, DAG));

    // Masks matching an unpack pattern get the dedicated instruction.
    if (SDValue Unpacked = lowerShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
      return Unpacked;
  }

  // Shift instructions.
  if (SDValue Shifted = lowerShuffleAsShift(DL, VT, V1, V2, Mask, Zeroable,
                                            Subtarget, DAG))
    return Shifted;

  // VLX makes VALIGN and EXPAND available.
  if (Subtarget.hasVLX()) {
    if (SDValue Aligned =
            lowerShuffleAsRotate(DL, VT, V1, V2, Mask, Subtarget, DAG))
      return Aligned;

    if (SDValue Expanded = lowerShuffleToEXPAND(DL, VT, Zeroable, Mask, V1, V2,
                                                DAG, Subtarget))
      return Expanded;
  }

  // Byte rotation instructions.
  if (SDValue ByteRot =
          lowerShuffleAsByteRotate(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return ByteRot;

  // Build an in-lane repeating mask, then move the results into the target
  // lanes.
  if (SDValue Permuted = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, VT, V1, V2, Mask, Subtarget, DAG))
    return Permuted;

  // A single-input shuffle whose pattern is not repeated maps directly onto
  // a cross-lane VPERMD.
  if (V2.isUndef()) {
    SDValue IndexVec = getConstVector(Mask, VT, DAG, DL, true);
    return DAG.getNode(X86ISD::VPERMV, DL, VT, IndexVec, V1);
  }

  // Assume one SHUFPS beats a multi-instruction alternative even if the CPU
  // charges a domain penalty; a later pass can fix CPUs that are harmed by
  // the domain switch.
  if (RepeatsPerLane && isSingleSHUFPSMask(LaneMask)) {
    SDValue FpV1 = DAG.getBitcast(MVT::v8f32, V1);
    SDValue FpV2 = DAG.getBitcast(MVT::v8f32, V2);
    SDValue FpShuffle =
        lowerShuffleWithSHUFPS(DL, MVT::v8f32, LaneMask, FpV1, FpV2, DAG);
    return DAG.getBitcast(VT, FpShuffle);
  }

  // Merge 128-bit lanes to enable a lane-based shuffle.
  if (SDValue Merged = lowerShuffleAsLanePermuteAndRepeatedMask(
          DL, VT, V1, V2, Mask, Subtarget, DAG))
    return Merged;

  // Last resort: generic blend lowering.
  return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, Subtarget,
                                              DAG);
}
/// Lower a shuffle of sixteen 16-bit integer lanes.
///
/// Callers only reach this routine when AVX2 is available, which gives us a
/// reasonable instruction set for v16i16 shuffles.
static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                  const APInt &Zeroable, SDValue V1, SDValue V2,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  const MVT VT = MVT::v16i16;
  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
  assert(V2.getSimpleValueType() == VT && "Bad operand type!");
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");

  // A zext is strictly faster than any alternative and frequently lets a
  // memory operand fold into the shuffle, so it gets the first shot.
  if (SDValue Extended = lowerShuffleAsZeroOrAnyExtend(
          DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return Extended;

  // See whether a single element can simply be broadcast.
  if (SDValue Splat =
          lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return Splat;

  if (SDValue Blended = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
                                            Subtarget, DAG))
    return Blended;

  // Masks matching an unpack pattern get the dedicated instruction.
  if (SDValue Unpacked = lowerShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
    return Unpacked;

  // Masks matching a pack pattern get the dedicated instruction.
  if (SDValue Packed =
          lowerShuffleWithPACK(DL, VT, Mask, V1, V2, DAG, Subtarget))
    return Packed;

  // Shift instructions.
  if (SDValue Shifted = lowerShuffleAsShift(DL, VT, V1, V2, Mask, Zeroable,
                                            Subtarget, DAG))
    return Shifted;

  // Byte rotation instructions.
  if (SDValue ByteRot =
          lowerShuffleAsByteRotate(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return ByteRot;

  // Build an in-lane repeating mask, then move the results into the target
  // lanes.
  if (SDValue Permuted = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, VT, V1, V2, Mask, Subtarget, DAG))
    return Permuted;

  if (V2.isUndef()) {
    // i16 elements have no generalized cross-lane shuffle operation, so a
    // lane-crossing mask is handled by permuting lanes first.
    if (is128BitLaneCrossingShuffleMask(VT, Mask)) {
      if (SDValue Permuted = lowerShuffleAsLanePermuteAndPermute(
              DL, VT, V1, V2, Mask, DAG, Subtarget))
        return Permuted;
      return lowerShuffleAsLanePermuteAndBlend(DL, VT, V1, V2, Mask, DAG,
                                               Subtarget);
    }

    // For a single-input shuffle the repeated mask should be a strictly valid
    // v8i16 mask, so the v8i16 lowering can handle the v16 case as well.
    SmallVector<int, 8> LaneMask;
    if (is128BitLaneRepeatedShuffleMask(VT, Mask, LaneMask))
      return lowerV8I16GeneralSingleInputShuffle(DL, VT, V1, LaneMask,
                                                 Subtarget, DAG);
  }

  if (SDValue ByteShuffle = lowerShuffleWithPSHUFB(
          DL, VT, Mask, V1, V2, Zeroable, Subtarget, DAG))
    return ByteShuffle;

  // AVX512BWVL can lower to VPERMW.
  if (Subtarget.hasBWI() && Subtarget.hasVLX())
    return lowerShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);

  // Merge 128-bit lanes to enable a lane-based shuffle.
  if (SDValue Merged = lowerShuffleAsLanePermuteAndRepeatedMask(
          DL, VT, V1, V2, Mask, Subtarget, DAG))
    return Merged;

  // Permute the lanes, then apply a per-lane permute.
  if (SDValue Permuted = lowerShuffleAsLanePermuteAndPermute(
          DL, VT, V1, V2, Mask, DAG, Subtarget))
    return Permuted;

  // Last resort: generic lowering.
  return lowerShuffleAsSplitOrBlend(DL, VT, V1, V2, Mask, Subtarget, DAG);
}
/// Lower a shuffle of thirty-two 8-bit integer lanes.
///
/// Callers only reach this routine when AVX2 is available, which gives us a
/// reasonable instruction set for v32i8 shuffles.
static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  const MVT VT = MVT::v32i8;
  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
  assert(V2.getSimpleValueType() == VT && "Bad operand type!");
  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");

  // A zext is strictly faster than any alternative and frequently lets a
  // memory operand fold into the shuffle, so it gets the first shot.
  if (SDValue Extended = lowerShuffleAsZeroOrAnyExtend(
          DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return Extended;

  // See whether a single element can simply be broadcast.
  if (SDValue Splat =
          lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return Splat;

  if (SDValue Blended = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
                                            Subtarget, DAG))
    return Blended;

  // Masks matching an unpack pattern get the dedicated instruction.
  if (SDValue Unpacked = lowerShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
    return Unpacked;

  // Masks matching a pack pattern get the dedicated instruction.
  if (SDValue Packed =
          lowerShuffleWithPACK(DL, VT, Mask, V1, V2, DAG, Subtarget))
    return Packed;

  // Shift instructions.
  if (SDValue Shifted = lowerShuffleAsShift(DL, VT, V1, V2, Mask, Zeroable,
                                            Subtarget, DAG))
    return Shifted;

  // Byte rotation instructions.
  if (SDValue ByteRot =
          lowerShuffleAsByteRotate(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return ByteRot;

  // Build an in-lane repeating mask, then move the results into the target
  // lanes.
  if (SDValue Permuted = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, VT, V1, V2, Mask, Subtarget, DAG))
    return Permuted;

  // i8 elements have no generalized cross-lane shuffle operation, so a
  // single-input lane-crossing mask is handled by permuting lanes first.
  if (V2.isUndef() && is128BitLaneCrossingShuffleMask(VT, Mask)) {
    if (SDValue Permuted = lowerShuffleAsLanePermuteAndPermute(
            DL, VT, V1, V2, Mask, DAG, Subtarget))
      return Permuted;
    return lowerShuffleAsLanePermuteAndBlend(DL, VT, V1, V2, Mask, DAG,
                                             Subtarget);
  }

  if (SDValue ByteShuffle = lowerShuffleWithPSHUFB(
          DL, VT, Mask, V1, V2, Zeroable, Subtarget, DAG))
    return ByteShuffle;

  // AVX512VBMIVL can lower to VPERMB.
  if (Subtarget.hasVBMI() && Subtarget.hasVLX())
    return lowerShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);

  // Merge 128-bit lanes to enable a lane-based shuffle.
  if (SDValue Merged = lowerShuffleAsLanePermuteAndRepeatedMask(
          DL, VT, V1, V2, Mask, Subtarget, DAG))
    return Merged;

  // Permute the lanes, then apply a per-lane permute.
  if (SDValue Permuted = lowerShuffleAsLanePermuteAndPermute(
          DL, VT, V1, V2, Mask, DAG, Subtarget))
    return Permuted;

  // Last resort: generic lowering.
  return lowerShuffleAsSplitOrBlend(DL, VT, V1, V2, Mask, Subtarget, DAG);
}
/// Top-level dispatcher for lowering 256-bit x86 vector shuffles.
///
/// Depending on the available instructions, the shuffle is either handed to
/// the routine for its specific 256-bit vector type, or split into two
/// 128-bit shuffles whose results are fused back together.
static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
                                  SDValue V1, SDValue V2, const APInt &Zeroable,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  // When the only V2 element used lands in element zero, try to insert it
  // into V1 if that can be done cheaply.
  const int EltCount = VT.getVectorNumElements();
  const int FromV2 =
      count_if(Mask, [EltCount](int Idx) { return Idx >= EltCount; });
  if (FromV2 == 1 && Mask[0] >= EltCount) {
    if (SDValue Inserted = lowerShuffleAsElementInsertion(
            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return Inserted;
  }

  // Special cases where the lower or upper half is UNDEF.
  if (SDValue HalfUndef =
          lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return HalfUndef;

  // AVX1 and AVX2 differ by a hard cut-over, so checking the subtarget here
  // saves most of the subtarget queries in the per-type routines. AVX1 has
  // essentially *zero* ability to manipulate a 256-bit vector with integer
  // types; since floating point types would be used eventually anyway, move
  // to that domain right away.
  if (VT.isInteger() && !Subtarget.hasAVX2()) {
    const int EltBits = VT.getScalarSizeInBits();
    if (EltBits >= 32) {
      MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltBits),
                                  VT.getVectorNumElements());
      SDValue FpV1 = DAG.getBitcast(FpVT, V1);
      SDValue FpV2 = DAG.getBitcast(FpVT, V2);
      return DAG.getBitcast(VT,
                            DAG.getVectorShuffle(FpVT, DL, FpV1, FpV2, Mask));
    }

    // Elements narrower than 32 bits have no floating point counterpart. If
    // the bit operations for masking/blending do not apply, decompose into
    // 128-bit vectors.
    if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
                                               Subtarget, DAG))
      return Masked;
    if (SDValue BitBlended = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
      return BitBlended;
    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
  }

  // Hand off to the routine for the concrete 256-bit type.
  switch (VT.SimpleTy) {
  case MVT::v4f64:
    return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v4i64:
    return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8f32:
    return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8i32:
    return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v16i16:
    return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v32i8:
    return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

  default:
    llvm_unreachable("Not a valid 256-bit x86 vector type!");
  }
}
/// Try to lower a 512-bit vector shuffle as a shuffle of 128-bit subvectors.
///
/// The mask is first widened so that each entry selects a whole 128-bit
/// subvector; when that is not possible an empty SDValue is returned so the
/// caller can try something else. An empty SDValue is also returned when the
/// two halves of the result cannot each be taken from a single input.
///
/// \param DL        Location attached to the nodes created here.
/// \param VT        Vector type; must be 512 bits wide with 64-bit elements.
/// \param Mask      Shuffle mask in units of VT elements.
/// \param Zeroable  Bit set describing zeroable elements; only consulted for
///                  the insert-into-zero-vector pattern.
/// \param V1        First shuffle input.
/// \param V2        Second shuffle input.
/// \param Subtarget Forwarded to getZeroVector.
/// \param DAG       The DAG in which new nodes are created.
static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
                                  const APInt &Zeroable, SDValue V1, SDValue V2,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  assert(VT.getScalarSizeInBits() == 64 &&
         "Unexpected element type size for 128bit shuffle.");

  // Handling a 256-bit vector would require VLX, and lowerV2X128Shuffle() is
  // most probably the better solution for that case.
  assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");

  // TODO - use Zeroable like we do for lowerV2X128Shuffle?
  SmallVector<int, 4> WidenedMask;
  if (!canWidenShuffleElements(Mask, WidenedMask))
    return SDValue();

  // Everything below indexes WidenedMask[0..3], so check the size up front
  // rather than after the first accesses.
  assert(WidenedMask.size() == 4 && "Expected four 128-bit mask entries!");

  // Try to use an insert into a zero vector.
  if (WidenedMask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
      (WidenedMask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
    // Keep only the low two elements when elements 2-3 are zeroable too,
    // otherwise keep the low four.
    unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
    SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
                              DAG.getIntPtrConstant(0, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
                       getZeroVector(VT, Subtarget, DAG, DL), LoV,
                       DAG.getIntPtrConstant(0, DL));
  }

  // Check for patterns which can be matched with a single insert of a 256-bit
  // subvector.
  bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
                                        {0, 1, 2, 3, 0, 1, 2, 3});
  if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
                                        {0, 1, 2, 3, 8, 9, 10, 11})) {
    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
    SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
                                 OnlyUsesV1 ? V1 : V2,
                                 DAG.getIntPtrConstant(0, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
                       DAG.getIntPtrConstant(4, DL));
  }

  // See if this is an insertion of the lower 128-bits of V2 into V1.
  bool IsInsert = true;
  int V2Index = -1;
  for (int i = 0; i < 4; ++i) {
    assert(WidenedMask[i] >= -1 && "Illegal widened mask entry!");
    if (WidenedMask[i] < 0)
      continue;

    // Make sure all V1 subvectors are in place.
    if (WidenedMask[i] < 4) {
      if (WidenedMask[i] != i) {
        IsInsert = false;
        break;
      }
    } else {
      // Make sure we only have a single V2 index and it's the lowest 128 bits.
      if (V2Index >= 0 || WidenedMask[i] != 4) {
        IsInsert = false;
        break;
      }
      V2Index = i;
    }
  }
  if (IsInsert && V2Index >= 0) {
    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
    SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
                                 DAG.getIntPtrConstant(0, DL));
    // V2Index counts 128-bit subvectors; scale it to 64-bit elements.
    return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
  }

  // Try to lower to vshuf64x2/vshuf32x4.
  SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
  unsigned PermMask = 0;
  // Ensure that, within each half of the result, all elements come from the
  // same operand.
  for (int i = 0; i < 4; ++i) {
    assert(WidenedMask[i] >= -1 && "Illegal widened mask entry!");
    if (WidenedMask[i] < 0)
      continue;

    SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
    unsigned OpIndex = i / 2;
    if (Ops[OpIndex].isUndef())
      Ops[OpIndex] = Op;
    else if (Ops[OpIndex] != Op)
      return SDValue();

    // Convert the 128-bit shuffle mask selection values into 128-bit selection
    // bits defined by a vshuf64x2 instruction's immediate control byte.
    PermMask |= (WidenedMask[i] % 4) << (i * 2);
  }

  return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
                     DAG.getConstant(PermMask, DL, MVT::i8));
}
/// Lower a shuffle of eight 64-bit floating point lanes.
static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  const MVT VT = MVT::v8f64;
  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
  assert(V2.getSimpleValueType() == VT && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  if (V2.isUndef()) {
    // Masks matching the low-duplicate pattern get the dedicated instruction.
    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
      return DAG.getNode(X86ISD::MOVDDUP, DL, VT, V1);

    if (!is128BitLaneCrossingShuffleMask(VT, Mask)) {
      // Non-half-crossing single input shuffles can be lowered with an
      // interleaved permutation. Bit Idx of the immediate is set when result
      // element Idx reads the odd element of its pair, i.e. index (Idx | 1).
      unsigned PermImm = 0;
      for (int Idx = 0; Idx != 8; ++Idx)
        if (Mask[Idx] == (Idx | 1))
          PermImm |= 1u << Idx;
      return DAG.getNode(X86ISD::VPERMILPI, DL, VT, V1,
                         DAG.getConstant(PermImm, DL, MVT::i8));
    }

    SmallVector<int, 4> HalfMask;
    if (is256BitLaneRepeatedShuffleMask(VT, Mask, HalfMask))
      return DAG.getNode(X86ISD::VPERMI, DL, VT, V1,
                         getV4X86ShuffleImm8ForMask(HalfMask, DL, DAG));
  }

  if (SDValue SubvecShuffle = lowerV4X128Shuffle(DL, VT, Mask, Zeroable, V1,
                                                 V2, Subtarget, DAG))
    return SubvecShuffle;

  if (SDValue Unpacked = lowerShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
    return Unpacked;

  // See whether the blend happens to fit SHUFPD exactly.
  if (SDValue ShufPD = lowerShuffleWithSHUFPD(DL, VT, Mask, V1, V2, DAG))
    return ShufPD;

  if (SDValue Expanded = lowerShuffleToEXPAND(DL, VT, Zeroable, Mask, V1, V2,
                                              DAG, Subtarget))
    return Expanded;

  if (SDValue Blended = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
                                            Subtarget, DAG))
    return Blended;

  return lowerShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
}
/// Lower a shuffle of sixteen 32-bit floating point lanes.
static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                  const APInt &Zeroable, SDValue V1, SDValue V2,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  const MVT VT = MVT::v16f32;
  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
  assert(V2.getSimpleValueType() == VT && "Bad operand type!");
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  // A mask that repeats in each 128-bit lane gives us many more ways to
  // lower the shuffle efficiently.
  SmallVector<int, 4> LaneMask;
  if (is128BitLaneRepeatedShuffleMask(VT, Mask, LaneMask)) {
    assert(LaneMask.size() == 4 && "Unexpected repeated mask size!");

    // Masks matching the even/odd duplicate patterns get the dedicated
    // instructions.
    if (isShuffleEquivalent(V1, V2, LaneMask, {0, 0, 2, 2}))
      return DAG.getNode(X86ISD::MOVSLDUP, DL, VT, V1);
    if (isShuffleEquivalent(V1, V2, LaneMask, {1, 1, 3, 3}))
      return DAG.getNode(X86ISD::MOVSHDUP, DL, VT, V1);

    if (V2.isUndef())
      return DAG.getNode(X86ISD::VPERMILPI, DL, VT, V1,
                         getV4X86ShuffleImm8ForMask(LaneMask, DL, DAG));

    // Masks matching an unpack pattern get the dedicated instruction.
    if (SDValue Unpacked = lowerShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
      return Unpacked;

    if (SDValue Blended = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
                                              Subtarget, DAG))
      return Blended;

    // Nothing better matched: emit a SHUFPS sequence.
    return lowerShuffleWithSHUFPS(DL, VT, LaneMask, V1, V2, DAG);
  }

  // A single-input shuffle whose per-lane patterns differ but which never
  // crosses a 128-bit lane can use the variable-mask VPERMILPS.
  if (V2.isUndef() && !is128BitLaneCrossingShuffleMask(VT, Mask)) {
    SDValue IndexVec = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
    return DAG.getNode(X86ISD::VPERMILPV, DL, VT, V1, IndexVec);
  }

  // With AVX512F support, VEXPAND is available.
  if (SDValue Expanded = lowerShuffleToEXPAND(DL, VT, Zeroable, Mask, V1, V2,
                                              DAG, Subtarget))
    return Expanded;

  return lowerShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
}
/// Lower a shuffle of eight 64-bit integer lanes.
static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  const MVT VT = MVT::v8i64;
  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
  assert(V2.getSimpleValueType() == VT && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  if (V2.isUndef()) {
    // A shuffle mirrored between the 128-bit lanes of the unit can use lower
    // latency instructions that operate on all four 128-bit lanes.
    SmallVector<int, 2> Lane128Mask;
    if (is128BitLaneRepeatedShuffleMask(VT, Mask, Lane128Mask)) {
      // Scale the mask by two and shuffle a v16i32 view of the input with
      // PSHUFD.
      SmallVector<int, 4> DwordMask;
      scaleShuffleMask<int>(2, Lane128Mask, DwordMask);
      return DAG.getBitcast(
          VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
                          DAG.getBitcast(MVT::v16i32, V1),
                          getV4X86ShuffleImm8ForMask(DwordMask, DL, DAG)));
    }

    SmallVector<int, 4> Lane256Mask;
    if (is256BitLaneRepeatedShuffleMask(VT, Mask, Lane256Mask))
      return DAG.getNode(X86ISD::VPERMI, DL, VT, V1,
                         getV4X86ShuffleImm8ForMask(Lane256Mask, DL, DAG));
  }

  if (SDValue SubvecShuffle = lowerV4X128Shuffle(DL, VT, Mask, Zeroable, V1,
                                                 V2, Subtarget, DAG))
    return SubvecShuffle;

  // Shift instructions.
  if (SDValue Shifted = lowerShuffleAsShift(DL, VT, V1, V2, Mask, Zeroable,
                                            Subtarget, DAG))
    return Shifted;

  // VALIGN.
  if (SDValue Aligned =
          lowerShuffleAsRotate(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return Aligned;

  // PALIGNR.
  if (SDValue ByteRot =
          lowerShuffleAsByteRotate(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return ByteRot;

  if (SDValue Unpacked = lowerShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
    return Unpacked;

  // With AVX512F support, VEXPAND is available.
  if (SDValue Expanded = lowerShuffleToEXPAND(DL, VT, Zeroable, Mask, V1, V2,
                                              DAG, Subtarget))
    return Expanded;

  if (SDValue Blended = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
                                            Subtarget, DAG))
    return Blended;

  return lowerShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
}
/// Lower a shuffle of sixteen 32-bit integer lanes.
static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                  const APInt &Zeroable, SDValue V1, SDValue V2,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  const MVT VT = MVT::v16i32;
  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
  assert(V2.getSimpleValueType() == VT && "Bad operand type!");
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  // A zext is strictly faster than any alternative and frequently lets a
  // memory operand fold into the shuffle, so it gets the first shot.
  if (SDValue Extended = lowerShuffleAsZeroOrAnyExtend(
          DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return Extended;

  // A mask that repeats in each 128-bit lane unlocks cheaper instructions
  // that mirror the shuffle across all four lanes.
  SmallVector<int, 4> LaneMask;
  const bool RepeatsPerLane =
      is128BitLaneRepeatedShuffleMask(VT, Mask, LaneMask);
  if (RepeatsPerLane) {
    assert(LaneMask.size() == 4 && "Unexpected repeated mask size!");
    if (V2.isUndef())
      return DAG.getNode(X86ISD::PSHUFD, DL, VT, V1,
                         getV4X86ShuffleImm8ForMask(LaneMask, DL, DAG));

    // Masks matching an unpack pattern get the dedicated instruction.
    if (SDValue Unpacked = lowerShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
      return Unpacked;
  }

  // Shift instructions.
  if (SDValue Shifted = lowerShuffleAsShift(DL, VT, V1, V2, Mask, Zeroable,
                                            Subtarget, DAG))
    return Shifted;

  // VALIGN.
  if (SDValue Aligned =
          lowerShuffleAsRotate(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return Aligned;

  // Byte rotation instructions are attempted only with BWI.
  if (Subtarget.hasBWI()) {
    if (SDValue ByteRot =
            lowerShuffleAsByteRotate(DL, VT, V1, V2, Mask, Subtarget, DAG))
      return ByteRot;
  }

  // Assume one SHUFPS beats a permv shuffle; a later pass can fix CPUs that
  // are harmed by the domain switch.
  if (RepeatsPerLane && isSingleSHUFPSMask(LaneMask)) {
    SDValue FpV1 = DAG.getBitcast(MVT::v16f32, V1);
    SDValue FpV2 = DAG.getBitcast(MVT::v16f32, V2);
    SDValue FpShuffle =
        lowerShuffleWithSHUFPS(DL, MVT::v16f32, LaneMask, FpV1, FpV2, DAG);
    return DAG.getBitcast(VT, FpShuffle);
  }

  // With AVX512F support, VEXPAND is available.
  if (SDValue Expanded = lowerShuffleToEXPAND(DL, VT, Zeroable, Mask, V1, V2,
                                              DAG, Subtarget))
    return Expanded;

  if (SDValue Blended = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
                                            Subtarget, DAG))
    return Blended;

  return lowerShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
}
/// Lower a shuffle of two v32i16 operands (32 lanes of 16-bit integers).
///
/// The candidate lowerings below are attempted in a fixed order and the first
/// one that yields a node is returned; VPERMV is the unconditional fallback.
/// Callers must guarantee AVX-512BW is available (asserted below).
static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                  const APInt &Zeroable, SDValue V1, SDValue V2,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  const MVT VT = MVT::v32i16;
  assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
  assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");

  // A zero/any-extend is strictly faster than every alternative and often
  // lets a memory operand fold into the shuffle, so it goes first.
  if (SDValue Ext = lowerShuffleAsZeroOrAnyExtend(DL, VT, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
    return Ext;

  // Masks that match an unpack pattern get the dedicated instruction.
  if (SDValue Unpack = lowerShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
    return Unpack;

  // Shift instructions.
  if (SDValue Shifted = lowerShuffleAsShift(DL, VT, V1, V2, Mask, Zeroable,
                                            Subtarget, DAG))
    return Shifted;

  // Byte rotation instructions.
  if (SDValue Rotated =
          lowerShuffleAsByteRotate(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return Rotated;

  if (V2.isUndef()) {
    // With a single input, a mask that repeats in every 128-bit lane is a
    // strictly valid v8i16 mask, so the v8i16 single-input lowering can be
    // reused for the full v32 vector.
    SmallVector<int, 8> LaneMask;
    if (is128BitLaneRepeatedShuffleMask(VT, Mask, LaneMask))
      return lowerV8I16GeneralSingleInputShuffle(DL, VT, V1, LaneMask,
                                                 Subtarget, DAG);
  }

  if (SDValue Blended = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
                                            Subtarget, DAG))
    return Blended;

  if (SDValue ByteShuffle = lowerShuffleWithPSHUFB(DL, VT, Mask, V1, V2,
                                                   Zeroable, Subtarget, DAG))
    return ByteShuffle;

  return lowerShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
}
/// Lower a shuffle of two v64i8 operands (64 lanes of 8-bit integers).
///
/// The candidate lowerings below are attempted in a fixed order and the first
/// one that yields a node is returned. If nothing matches, the shuffle is
/// split and each part is lowered separately. Callers must guarantee
/// AVX-512BW is available (asserted below).
static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  const MVT VT = MVT::v64i8;
  assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
  assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
  assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");

  // A zero/any-extend is strictly faster than every alternative and often
  // lets a memory operand fold into the shuffle, so it goes first.
  if (SDValue Ext = lowerShuffleAsZeroOrAnyExtend(DL, VT, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
    return Ext;

  // Masks that match an unpack pattern get the dedicated instruction.
  if (SDValue Unpack = lowerShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
    return Unpack;

  // Masks that match a pack pattern get the dedicated instruction.
  if (SDValue Pack =
          lowerShuffleWithPACK(DL, VT, Mask, V1, V2, DAG, Subtarget))
    return Pack;

  // Shift instructions.
  if (SDValue Shifted = lowerShuffleAsShift(DL, VT, V1, V2, Mask, Zeroable,
                                            Subtarget, DAG))
    return Shifted;

  // Byte rotation instructions.
  if (SDValue Rotated =
          lowerShuffleAsByteRotate(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return Rotated;

  if (SDValue ByteShuffle = lowerShuffleWithPSHUFB(DL, VT, Mask, V1, V2,
                                                   Zeroable, Subtarget, DAG))
    return ByteShuffle;

  // With VBMI the VPERMV/VPERMV3 byte shuffles are available.
  if (Subtarget.hasVBMI())
    return lowerShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);

  // Build an in-lane repeating shuffle mask, then permute the results into
  // their destination lanes.
  if (SDValue Permuted = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, VT, V1, V2, Mask, Subtarget, DAG))
    return Permuted;

  if (SDValue Blended = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
                                            Subtarget, DAG))
    return Blended;

  // For two-input shuffles, try merging 128-bit lanes so that a lane-based
  // shuffle becomes possible.
  if (!V2.isUndef()) {
    if (SDValue Merged = lowerShuffleAsLanePermuteAndRepeatedMask(
            DL, VT, V1, V2, Mask, Subtarget, DAG))
      return Merged;
  }

  // FIXME: Implement direct support for this type!
  return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
}
/// Entry point for lowering 512-bit x86 vector shuffles.
///
/// A few type-independent strategies (single-element insertion, undef-half
/// handling, broadcast) are tried first. If none applies, the shuffle is
/// handed to the routine specialised for the element type of \p VT.
static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                  MVT VT, SDValue V1, SDValue V2,
                                  const APInt &Zeroable,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  assert(Subtarget.hasAVX512() &&
         "Cannot lower 512-bit vectors w/ basic ISA!");

  // Count how many mask entries select from V2 (indices >= the lane count).
  const int LaneCount = Mask.size();
  int V2Uses = 0;
  for (int M : Mask)
    if (M >= LaneCount)
      ++V2Uses;

  // When the only V2 element lands in lane zero, try inserting it into V1,
  // provided that can be done cheaply.
  if (V2Uses == 1 && Mask[0] >= LaneCount) {
    if (SDValue Inserted = lowerShuffleAsElementInsertion(
            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return Inserted;
  }

  // Special cases where the lower or upper half of the result is UNDEF.
  if (SDValue Half =
          lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return Half;

  // A single element replicated across the vector can be a broadcast.
  if (SDValue Splat =
          lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return Splat;

  // Hand off to the per-element-type routine. Each of them may assume that
  // the ISA extensions required for its element type are present.
  switch (VT.SimpleTy) {
  case MVT::v8f64:
    return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v16f32:
    return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8i64:
    return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v16i32:
    return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v32i16:
    return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v64i8:
    return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

  default:
    llvm_unreachable("Not a valid 512-bit x86 vector type!");
  }
}
// Check whether a shuffle mask can be realised by a single KSHIFT of one
// input. On success the shift amount is returned and Opcode is set to
// X86ISD::KSHIFTL or X86ISD::KSHIFTR; on failure -1 is returned and Opcode is
// left untouched. MaskOffset is added to the expected source indices, which
// lets the caller test the second shuffle input. This is a simplified version
// of matchShuffleAsShift.
static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
                                    int MaskOffset, const APInt &Zeroable) {
  const int NumLanes = Mask.size();

  // The Amt lanes vacated by the shift (the lowest ones for a left shift, the
  // highest ones for a right shift) must all be zeroable.
  auto VacatedLanesZeroable = [&](int Amt, bool IsLeft) {
    const int First = IsLeft ? 0 : (NumLanes - Amt);
    for (int Lane = First, End = First + Amt; Lane != End; ++Lane)
      if (!Zeroable[Lane])
        return false;
    return true;
  };

  // The remaining lanes must read consecutive source elements (or be undef).
  auto RemainingLanesSequential = [&](int Amt, bool IsLeft) {
    const unsigned DstBegin = IsLeft ? Amt : 0;
    const unsigned SrcBegin = IsLeft ? 0 : Amt;
    const unsigned Count = NumLanes - Amt;
    return isSequentialOrUndefInRange(Mask, DstBegin, Count,
                                      SrcBegin + MaskOffset);
  };

  // Smallest shift amount wins; for a given amount, left is tried before
  // right.
  for (int Amt = 1; Amt != NumLanes; ++Amt) {
    if (VacatedLanesZeroable(Amt, true) &&
        RemainingLanesSequential(Amt, true)) {
      Opcode = X86ISD::KSHIFTL;
      return Amt;
    }
    if (VacatedLanesZeroable(Amt, false) &&
        RemainingLanesSequential(Amt, false)) {
      Opcode = X86ISD::KSHIFTR;
      return Amt;
    }
  }

  return -1;
}
// Lower shuffles of vXi1 mask vectors.
// AVX-512 has no dedicated instruction for shuffling mask registers. Two
// special cases are recognised first (a low subvector padded with zeros, and a
// plain KSHIFT); otherwise the mask vector is sign-extended to an ordinary SIMD
// vector, shuffled there, and converted back to vXi1.
static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                MVT VT, SDValue V1, SDValue V2,
                                const APInt &Zeroable,
                                const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  assert(Subtarget.hasAVX512() &&
         "Cannot lower 512-bit vectors w/o basic ISA!");

  const unsigned NumElts = Mask.size();

  // Measure the leading run of lanes that are either undef or taken from the
  // same position of V1; such a run means a subvector is merely being padded.
  unsigned PrefixLen = 0;
  while (PrefixLen != NumElts &&
         (Mask[PrefixLen] < 0 ||
          Mask[PrefixLen] == static_cast<int>(PrefixLen)))
    ++PrefixLen;
  assert(PrefixLen != NumElts && "Identity shuffle?");

  // Round the run length down to a power of two.
  PrefixLen = PowerOf2Floor(PrefixLen);

  // Every lane above the subvector must be covered by the zeroable bits at
  // the top of the vector.
  if (Zeroable.countLeadingOnes() >= (NumElts - PrefixLen)) {
    MVT SubVT = MVT::getVectorVT(MVT::i1, PrefixLen);
    SDValue LowPart = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
                                  DAG.getIntPtrConstant(0, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
                       getZeroVector(VT, Subtarget, DAG, DL), LowPart,
                       DAG.getIntPtrConstant(0, DL));
  }

  // Try to match a KSHIFT of V1, then of V2.
  // TODO: Support narrower than legal shifts by widening and extracting.
  if (NumElts >= 16 || (Subtarget.hasDQI() && NumElts == 8)) {
    const SDValue Inputs[] = {V1, V2};
    for (unsigned InputNo = 0; InputNo != 2; ++InputNo) {
      // Mask indices that refer to the second input are biased by NumElts.
      unsigned ShiftOpc;
      int Amt = match1BitShuffleAsKSHIFT(ShiftOpc, Mask, InputNo * NumElts,
                                         Zeroable);
      if (Amt >= 0)
        return DAG.getNode(ShiftOpc, DL, VT, Inputs[InputNo],
                           DAG.getConstant(Amt, DL, MVT::i8));
    }
  }

  // Pick the SIMD type the mask is widened to for the generic path.
  MVT WideVT;
  switch (VT.SimpleTy) {
  default:
    llvm_unreachable("Expected a vector of i1 elements");
  case MVT::v2i1:
    WideVT = MVT::v2i64;
    break;
  case MVT::v4i1:
    WideVT = MVT::v4i32;
    break;
  case MVT::v8i1:
    // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
    // shuffle.
    WideVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
    break;
  case MVT::v16i1:
    // Take 512-bit type, unless we are avoiding 512-bit types and have the
    // 256-bit operation available.
    WideVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
    break;
  case MVT::v32i1:
    // Take 512-bit type, unless we are avoiding 512-bit types and have the
    // 256-bit operation available.
    assert(Subtarget.hasBWI() && "Expected AVX512BW support");
    WideVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
    break;
  case MVT::v64i1:
    WideVT = MVT::v64i8;
    break;
  }

  SDValue Wide1 = DAG.getNode(ISD::SIGN_EXTEND, DL, WideVT, V1);
  SDValue Wide2 = DAG.getNode(ISD::SIGN_EXTEND, DL, WideVT, V2);
  SDValue Shuffled = DAG.getVectorShuffle(WideVT, DL, Wide1, Wide2, Mask);

  // The lanes were sign-extended from i1, so a "0 > x" compare recovers the
  // mask and can use X86ISD::CVT2MASK. That needs BWI for 32 or more lanes
  // and DQI for fewer.
  const int LaneCount = VT.getVectorNumElements();
  const bool UseCompare =
      LaneCount >= 32 ? Subtarget.hasBWI() : Subtarget.hasDQI();
  if (UseCompare)
    return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, WideVT), Shuffled,
                        ISD::SETGT);

  return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffled);
}
/// Decide whether commuting the two shuffle operands yields the canonical
/// form of \p Mask.
///
/// Mask entries below zero are undef, entries in [0, Mask.size()) select from
/// V1 and larger entries select from V2. Returns true when the operands
/// should be swapped.
static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
  const int Size = Mask.size();

  int FromV1 = 0, FromV2 = 0;
  for (int M : Mask) {
    if (M < 0)
      continue;
    if (M < Size)
      ++FromV1;
    else
      ++FromV2;
  }

  // The canonical form takes at least as many elements from V1 as from V2, so
  // pattern matching only has to consider one of the symmetric cases.
  if (FromV2 > FromV1)
    return true;

  assert(FromV1 > 0 && "No V1 indices");

  // Only an exact tie with both inputs in use needs the tie-breakers below.
  if (FromV2 == 0 || FromV1 != FromV2)
    return false;

  // Tie-breaker 1: prefer fewer V2 elements in the low half of the result.
  int LowFromV1 = 0, LowFromV2 = 0;
  for (int i = 0, Half = Size / 2; i != Half; ++i) {
    const int M = Mask[i];
    if (M >= Size)
      ++LowFromV2;
    else if (M >= 0)
      ++LowFromV1;
  }
  if (LowFromV2 != LowFromV1)
    return LowFromV2 > LowFromV1;

  // Tie-breakers 2 and 3 look at the result positions each input occupies:
  // the sum of those positions, then the number of odd positions.
  int PosSumV1 = 0, PosSumV2 = 0;
  int OddPosV1 = 0, OddPosV2 = 0;
  for (int i = 0; i != Size; ++i) {
    const int M = Mask[i];
    if (M < 0)
      continue;
    if (M >= Size) {
      PosSumV2 += i;
      OddPosV2 += i % 2;
    } else {
      PosSumV1 += i;
      OddPosV1 += i % 2;
    }
  }

  // Tie-breaker 2: V1's position sum must not exceed V2's.
  if (PosSumV2 != PosSumV1)
    return PosSumV2 < PosSumV1;

  // Tie-breaker 3: V1 must not occupy more odd positions than V2.
  return OddPosV2 < OddPosV1;
}
/// Top-level lowering for x86 vector shuffles.
///
/// This handles decomposition, canonicalization, and lowering of all x86
/// vector shuffles. Most of the specific lowering strategies are encapsulated
/// above in helper routines. The canonicalization attempts to widen shuffles
/// to involve fewer lanes of wider elements, consolidate symmetric patterns
/// s.t. only one of the two inputs needs to be tested, etc.
///
/// \param Op        The shuffle to lower; it is cast to ShuffleVectorSDNode.
/// \param Subtarget Forwarded to the helper lowering routines.
/// \param DAG       Used to build every replacement node.
/// \returns The replacement value. Several canonicalization paths return a
///          newly built shuffle (commuted, re-masked or widened) instead of a
///          final lowering.
static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
ArrayRef<int> Mask = SVOp->getMask();
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
MVT VT = Op.getSimpleValueType();
int NumElements = VT.getVectorNumElements();
SDLoc DL(Op);
bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
"Can't lower MMX shuffles");
bool V1IsUndef = V1.isUndef();
bool V2IsUndef = V2.isUndef();
// A shuffle of two undef inputs is itself undef.
if (V1IsUndef && V2IsUndef)
return DAG.getUNDEF(VT);
// When we create a shuffle node we put the UNDEF node to second operand,
// but in some cases the first operand may be transformed to UNDEF.
// In this case we should just commute the node.
if (V1IsUndef)
return DAG.getCommutedVectorShuffle(*SVOp);
// Check for non-undef masks pointing at an undef vector and make the masks
// undef as well. This makes it easier to match the shuffle based solely on
// the mask.
if (V2IsUndef &&
any_of(Mask, [NumElements](int M) { return M >= NumElements; })) {
SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
for (int &M : NewMask)
if (M >= NumElements)
M = -1;
return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
}
// Check for illegal shuffle mask element index values.
// The (void) cast keeps MaskUpperLimit "used" when the assert is compiled
// out.
int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
assert(llvm::all_of(Mask,
[&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
"Out of bounds shuffle index");
// We actually see shuffles that are entirely re-arrangements of a set of
// zero inputs. This mostly happens while decomposing complex shuffles into
// simple ones. Directly lower these as a buildvector of zeros.
APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
if (Zeroable.isAllOnesValue())
return getZeroVector(VT, Subtarget, DAG, DL);
bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
// Create an alternative mask with info about zeroable elements.
// Here we do not set undef elements as zeroable.
SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
if (V2IsZero) {
assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
for (int i = 0; i != NumElements; ++i)
if (Mask[i] != SM_SentinelUndef && Zeroable[i])
ZeroableMask[i] = SM_SentinelZero;
}
// Try to collapse shuffles into using a vector type with fewer elements but
// wider element types. We cap this to not form integers or floating point
// elements wider than 64 bits, but it might be interesting to form i128
// integers to handle flipping the low and high halves of AVX 256-bit vectors.
SmallVector<int, 16> WidenedMask;
if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
canWidenShuffleElements(ZeroableMask, WidenedMask)) {
// Shuffle mask widening should not interfere with a broadcast opportunity
// by obfuscating the operands with bitcasts.
// TODO: Avoid lowering directly from this top-level function: make this
// a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
Subtarget, DAG))
return Broadcast;
// The widened type has elements twice as wide and half as many of them.
MVT NewEltVT = VT.isFloatingPoint()
? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
: MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
int NewNumElts = NumElements / 2;
MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
// Make sure that the new vector type is legal. For example, v2f64 isn't
// legal on SSE1.
if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
if (V2IsZero) {
// Modify the new Mask to take all zeros from the all-zero vector.
// Choose indices that are blend-friendly.
bool UsedZeroVector = false;
assert(find(WidenedMask, SM_SentinelZero) != WidenedMask.end() &&
"V2's non-undef elements are used?!");
for (int i = 0; i != NewNumElts; ++i)
if (WidenedMask[i] == SM_SentinelZero) {
WidenedMask[i] = i + NewNumElts;
UsedZeroVector = true;
}
// Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
// some elements to be undef.
if (UsedZeroVector)
V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
}
// Build the shuffle in the widened type and cast the result back to VT.
V1 = DAG.getBitcast(NewVT, V1);
V2 = DAG.getBitcast(NewVT, V2);
return DAG.getBitcast(
VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
}
}
// Commute the shuffle if it will improve canonicalization.
if (canonicalizeShuffleMaskWithCommute(Mask))
return DAG.getCommutedVectorShuffle(*SVOp);
if (SDValue V = lowerShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget))
return V;
// For each vector width, delegate to a specialized lowering routine.
if (VT.is128BitVector())
return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
if (VT.is256BitVector())
return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
if (VT.is512BitVector())
return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
// vXi1 mask vectors are checked only after the width-based dispatch above.
if (Is1BitVector)
return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
llvm_unreachable("Unimplemented!");
}
/// Attempt to turn a VSELECT into an equivalent vector shuffle.
///
/// Returns a shuffle of the two data operands when
/// createShuffleMaskFromVSELECT can derive a mask from the condition operand,
/// and a null SDValue otherwise. \p Subtarget is not consulted here.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {
  SDValue Condition = Op.getOperand(0);
  SDValue TrueVal = Op.getOperand(1);
  SDValue FalseVal = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();

  // Only non-legal VSELECTs reach this lowering. Rewrite them as generic
  // shuffles so the shuffle lowering path can handle them as blends.
  SmallVector<int, 32> BlendMask;
  if (!createShuffleMaskFromVSELECT(BlendMask, Condition))
    return SDValue();

  return DAG.getVectorShuffle(VT, SDLoc(Op), TrueVal, FalseVal, BlendMask);
}
/// Custom lowering for ISD::VSELECT.
///
/// Returns \p Op unchanged when the node can be matched as is, a replacement
/// node when it has to be rewritten, and a null SDValue when the decision is
/// left to the generic expansion.
SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
  SDValue Cond = Op.getOperand(0);
  SDValue TrueVal = Op.getOperand(1);
  SDValue FalseVal = Op.getOperand(2);

  // When the condition and both data operands are constants, the whole
  // vselect can become a single vector load via
  // SelectionDAGLegalize::ExpandBUILD_VECTOR().
  const bool AllOperandsConstant =
      ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
      ISD::isBuildVectorOfConstantSDNodes(TrueVal.getNode()) &&
      ISD::isBuildVectorOfConstantSDNodes(FalseVal.getNode());
  if (AllOperandsConstant)
    return SDValue();

  // A blend-style vector shuffle covers every constant-condition case.
  if (SDValue Blend = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
    return Blend;

  // A VSELECT whose mask is a vector of i1 is matched directly by the
  // mask-register patterns on AVX-512.
  MVT CondVT = Cond.getSimpleValueType();
  unsigned CondBits = Cond.getScalarValueSizeInBits();
  if (CondBits == 1)
    return Op;

  // Variable blends are only legal from SSE4.1 onward.
  if (!Subtarget.hasSSE41())
    return SDValue();

  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  unsigned EltBits = VT.getScalarSizeInBits();
  unsigned NumElts = VT.getVectorNumElements();

  // 512-bit blends are mask based, so a non-i1 condition is first turned
  // into an i1 mask by comparing it against zero.
  if (VT.getSizeInBits() == 512) {
    MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
    SDValue Zero = DAG.getConstant(0, dl, CondVT);
    SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond, Zero, ISD::SETNE);
    // Re-emit the select using the new mask.
    return DAG.getSelect(dl, VT, Mask, TrueVal, FalseVal);
  }

  // SEXT/TRUNC cases where the mask doesn't match the destination size.
  if (CondBits != EltBits) {
    // Without a sign splat in every lane, rely on the expansion.
    if (CondBits != DAG.ComputeNumSignBits(Cond))
      return SDValue();

    MVT NewCondVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElts);
    SDValue NewCond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
    return DAG.getNode(ISD::VSELECT, dl, VT, NewCond, TrueVal, FalseVal);
  }

  // Only some types are legal on some subtargets. If a legal
  // VSELECT-matching blend can be emitted, return Op; if the node has to be
  // expanded, return a null value.

  // The byte blends for AVX vectors were introduced only in AVX2.
  if (VT == MVT::v32i8)
    return Subtarget.hasAVX2() ? Op : SDValue();

  if (VT == MVT::v8i16 || VT == MVT::v16i16) {
    // Bitcast everything to the vXi8 type and use a vXi8 vselect.
    MVT ByteVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
    SDValue ByteCond = DAG.getBitcast(ByteVT, Cond);
    SDValue ByteTrue = DAG.getBitcast(ByteVT, TrueVal);
    SDValue ByteFalse = DAG.getBitcast(ByteVT, FalseVal);
    SDValue ByteSelect =
        DAG.getNode(ISD::VSELECT, dl, ByteVT, ByteCond, ByteTrue, ByteFalse);
    return DAG.getBitcast(VT, ByteSelect);
  }

  // Most of the vector types have blends past SSE4.1.
  return Op;
}
/// Lowering of EXTRACT_VECTOR_ELT from a 128-bit vector for the cases the
/// caller routes here when SSE4.1 is available.
///
/// Returns a replacement node, \p Op itself when the node can be left as it
/// is, or a null SDValue when none of the cases handled here applies.
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  SDValue Vec = Op.getOperand(0);
  SDValue Idx = Op.getOperand(1);

  if (!Vec.getSimpleValueType().is128BitVector())
    return SDValue();

  // Byte extraction: PEXTRB produces an i32 that is truncated afterwards.
  if (VT.getSizeInBits() == 8) {
    SDValue Byte = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec, Idx);
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Byte);
  }

  if (VT == MVT::f32) {
    // EXTRACTPS writes a GPR32, so a movd is needed to get the value back
    // into an FR32 register. It only pays off when the single use is a store
    // or a bitcast to i32. For a store with constant index 0 it is still not
    // worth it, because MOVSSmr is smaller and faster.
    if (!Op.hasOneUse())
      return SDValue();

    SDNode *User = *Op.getNode()->use_begin();
    const bool IsProfitableStore =
        User->getOpcode() == ISD::STORE && !isNullConstant(Idx);
    const bool IsBitcastToI32 = User->getOpcode() == ISD::BITCAST &&
                                User->getValueType(0) == MVT::i32;
    if (!IsProfitableStore && !IsBitcastToI32)
      return SDValue();

    // Extract as i32 from the v4i32 view, then reinterpret as f32.
    SDValue IntVec = DAG.getBitcast(MVT::v4i32, Vec);
    SDValue IntElt =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, IntVec, Idx);
    return DAG.getBitcast(MVT::f32, IntElt);
  }

  // ExtractPS/pextrq works with constant index.
  if ((VT == MVT::i32 || VT == MVT::i64) && isa<ConstantSDNode>(Idx))
    return Op;

  return SDValue();
}
/// Extract a single bit from an AVX-512 mask vector such as v16i1 or v8i1.
///
/// A variable index is handled by widening the mask to an ordinary vector.
/// A constant index is handled by shifting the wanted bit down to position 0
/// with KSHIFTR; index 0 is already legal and is returned untouched.
static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  SDValue MaskVec = Op.getOperand(0);
  SDLoc dl(MaskVec);
  MVT MaskVT = MaskVec.getSimpleValueType();
  SDValue Idx = Op.getOperand(1);
  MVT ResultVT = Op.getSimpleValueType();
  const unsigned NumElts = MaskVT.getVectorNumElements();

  assert((MaskVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
         "Unexpected vector type in ExtractBitFromMaskVector");

  if (!isa<ConstantSDNode>(Idx)) {
    // Mask registers cannot be indexed by a variable, so sign-extend the
    // mask into a regular vector, extract there, and truncate the element.
    // Extending v8i1/v16i1 to 512-bit get better performance on KNL
    // than extending to 128/256bit.
    MVT WideEltVT =
        (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
    MVT WideVecVT = MVT::getVectorVT(WideEltVT, NumElts);
    SDValue Widened = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVecVT, MaskVec);
    SDValue WideElt =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, WideEltVT, Widened, Idx);
    return DAG.getNode(ISD::TRUNCATE, dl, ResultVT, WideElt);
  }

  const unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

  // Extracting element zero is already legal.
  if (IdxVal == 0)
    return Op;

  // Widen to a mask type with a natively supported kshift: anything below
  // eight lanes, and eight lanes without DQI, has to be widened.
  MVT ShiftVT = MaskVT;
  const bool NeedsWidening =
      NumElts < 8 || (NumElts == 8 && !Subtarget.hasDQI());
  if (NeedsWidening) {
    ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
    MaskVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
                          DAG.getUNDEF(ShiftVT), MaskVec,
                          DAG.getIntPtrConstant(0, dl));
  }

  // Bring the requested bit down to lane 0 with kshiftr, then read lane 0.
  SDValue Shifted = DAG.getNode(X86ISD::KSHIFTR, dl, ShiftVT, MaskVec,
                                DAG.getConstant(IdxVal, dl, MVT::i8));
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Shifted,
                     DAG.getIntPtrConstant(0, dl));
}
/// Custom lowering for ISD::EXTRACT_VECTOR_ELT.
///
/// i1 element vectors are forwarded to ExtractBitFromMaskVector. For other
/// vectors a variable index yields a null SDValue. A constant index into a
/// 256-bit or 512-bit vector is reduced to an extract from the containing
/// 128-bit chunk, and extracts from 128-bit vectors are handled per element
/// size below.
///
/// \returns A replacement node, \p Op itself when no change is made, or a
///          null SDValue when no lowering is chosen here.
SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
SDValue Vec = Op.getOperand(0);
MVT VecVT = Vec.getSimpleValueType();
SDValue Idx = Op.getOperand(1);
if (VecVT.getVectorElementType() == MVT::i1)
return ExtractBitFromMaskVector(Op, DAG, Subtarget);
if (!isa<ConstantSDNode>(Idx)) {
// It's more profitable to go through memory (1 cycles throughput)
// than using VMOVD + VPERMV/PSHUFB sequence ( 2/3 cycles throughput)
// IACA tool was used to get performance estimation
// (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
//
// example : extractelement <16 x i8> %a, i32 %i
//
// Block Throughput: 3.00 Cycles
// Throughput Bottleneck: Port5
//
// | Num Of | Ports pressure in cycles | |
// | Uops | 0 - DV | 5 | 6 | 7 | |
// ---------------------------------------------
// | 1 | | 1.0 | | | CP | vmovd xmm1, edi
// | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
// | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
// Total Num Of Uops: 4
//
//
// Block Throughput: 1.00 Cycles
// Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
//
// | | Ports pressure in cycles | |
// |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
// ---------------------------------------------------------
// |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
// |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
// |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
// Total Num Of Uops: 4
return SDValue();
}
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
// If this is a 256-bit or 512-bit vector, first extract the 128-bit chunk
// that holds the element and then extract the element from that chunk.
if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
// Get the 128-bit vector.
Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
MVT EltVT = VecVT.getVectorElementType();
unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
// Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
// this can be done with a mask.
IdxVal &= ElemsPerChunk - 1;
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
DAG.getIntPtrConstant(IdxVal, dl));
}
assert(VecVT.is128BitVector() && "Unexpected vector length");
MVT VT = Op.getSimpleValueType();
if (VT.getSizeInBits() == 16) {
// If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
// we're going to zero extend the register or fold the store (SSE41 only).
if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
!(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
DAG.getBitcast(MVT::v4i32, Vec), Idx));
// Transform it so it matches pextrw which produces a 32-bit result.
SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
Op.getOperand(0), Op.getOperand(1));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
}
// With SSE4.1, give the dedicated helper a chance first; a null result
// falls through to the generic handling below.
if (Subtarget.hasSSE41())
if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
return Res;
// TODO: We only extract a single element from v16i8, we can probably afford
// to be more aggressive here before using the default approach of spilling to
// stack.
if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
// Extract either the lowest i32 or any i16, and extract the sub-byte.
int DWordIdx = IdxVal / 4;
if (DWordIdx == 0) {
SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
DAG.getBitcast(MVT::v4i32, Vec),
DAG.getIntPtrConstant(DWordIdx, dl));
// Shift the wanted byte down to the low 8 bits of the dword.
int ShiftVal = (IdxVal % 4) * 8;
if (ShiftVal != 0)
Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
DAG.getConstant(ShiftVal, dl, MVT::i8));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
}
int WordIdx = IdxVal / 2;
SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
DAG.getBitcast(MVT::v8i16, Vec),
DAG.getIntPtrConstant(WordIdx, dl));
// Shift the wanted byte down to the low 8 bits of the word.
int ShiftVal = (IdxVal % 2) * 8;
if (ShiftVal != 0)
Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
DAG.getConstant(ShiftVal, dl, MVT::i8));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
}
if (VT.getSizeInBits() == 32) {
if (IdxVal == 0)
return Op;
// SHUFPS the element to the lowest double word, then movss.
int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
DAG.getIntPtrConstant(0, dl));
}
if (VT.getSizeInBits() == 64) {
// FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
// FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
// to match extract_elt for f64.
if (IdxVal == 0)
return Op;
// UNPCKHPD the element to the lowest double word, then movsd.
// Note if the lower 64 bits of the result of the UNPCKHPD is then stored
// to a f64mem, the whole operation is folded into a single MOVHPDmr.
int Mask[2] = { 1, -1 };
Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
DAG.getIntPtrConstant(0, dl));
}
return SDValue();
}
/// Insert a single bit into an AVX-512 mask vector such as v16i1 or v8i1.
///
/// A constant index becomes an INSERT_SUBVECTOR of a v1i1 value. A variable
/// index is handled by widening the mask to an ordinary vector, inserting
/// there, and truncating the result back to the mask type.
static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  SDValue MaskVec = Op.getOperand(0);
  SDValue Bit = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);
  MVT MaskVT = MaskVec.getSimpleValueType();

  if (isa<ConstantSDNode>(Idx)) {
    // Copy into a k-register, extract to v1i1 and insert_subvector.
    SDValue BitVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Bit);
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MaskVT, MaskVec, BitVec,
                       Idx);
  }

  // Non-constant index: sign-extend both the vector and the new element,
  // insert in the widened type, then truncate back to the mask type.
  const unsigned NumElts = MaskVT.getVectorNumElements();
  MVT WideEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
  MVT WideVecVT = MVT::getVectorVT(WideEltVT, NumElts);
  SDValue Inserted =
      DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, WideVecVT,
                  DAG.getNode(ISD::SIGN_EXTEND, dl, WideVecVT, MaskVec),
                  DAG.getNode(ISD::SIGN_EXTEND, dl, WideEltVT, Bit), Idx);
  return DAG.getNode(ISD::TRUNCATE, dl, MaskVT, Inserted);
}
// Custom lowering for ISD::INSERT_VECTOR_ELT.
//
// Operand 0 is the vector being modified, operand 1 the scalar being inserted
// and operand 2 the element index. vXi1 vectors are forwarded to
// InsertBitToMaskVector. For all other types the index must be a constant
// smaller than the number of elements; otherwise an empty SDValue is
// returned. The strategies below are tried in order and the first one that
// applies wins:
//   1. zero / all-ones element on SSE4.1 with >= 16-bit elements: blend
//      shuffle against a constant vector;
//   2. 256/512-bit vectors: BLENDI into element 0 (256-bit only), otherwise
//      operate on the affected 128-bit chunk and re-insert it;
//   3. 128-bit vectors: insertion into an all-zeros vector at index 0,
//      PINSRW/PINSRB, and the SSE4.1 f32 / i32 / i64 forms.
// If nothing applies an empty SDValue is returned.
SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
unsigned NumElts = VT.getVectorNumElements();
// Mask (vXi1) vectors have their own lowering.
if (EltVT == MVT::i1)
return InsertBitToMaskVector(Op, DAG, Subtarget);
SDLoc dl(Op);
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
SDValue N2 = Op.getOperand(2);
// Only constant, in-range indices are handled below.
auto *N2C = dyn_cast<ConstantSDNode>(N2);
if (!N2C || N2C->getAPIntValue().uge(NumElts))
return SDValue();
uint64_t IdxVal = N2C->getZExtValue();
bool IsZeroElt = X86::isZeroNode(N1);
bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
// If we are inserting a zero or all-ones element, see if we can do this more
// efficiently with a blend shuffle with a rematerializable vector than a
// costly integer insertion.
if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
16 <= EltVT.getSizeInBits()) {
// Lane IdxVal is taken from the second shuffle operand (the constant
// vector); every other lane keeps the corresponding lane of N0.
SmallVector<int, 8> BlendMask;
for (unsigned i = 0; i != NumElts; ++i)
BlendMask.push_back(i == IdxVal ? i + NumElts : i);
SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
: getOnesVector(VT, DAG, dl);
return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
}
// If the vector is wider than 128 bits, extract the 128-bit subvector, insert
// into that, and then insert the subvector back into the result.
if (VT.is256BitVector() || VT.is512BitVector()) {
// With a 256-bit vector, we can insert into the zero element efficiently
// using a blend if we have AVX or AVX2 and the right data type.
if (VT.is256BitVector() && IdxVal == 0) {
// TODO: It is worthwhile to cast integer to floating point and back
// and incur a domain crossing penalty if that's what we'll end up
// doing anyway after extracting to a 128-bit vector.
if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
(Subtarget.hasAVX2() && EltVT == MVT::i32)) {
SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
// The index operand is reused as the BLENDI immediate.
// NOTE(review): an immediate of 1 presumably selects only element 0 from
// N1Vec - confirm against the X86ISD::BLENDI definition.
N2 = DAG.getIntPtrConstant(1, dl);
return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
}
}
// Get the desired 128-bit vector chunk.
SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
// Insert the element into the desired chunk.
unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
assert(isPowerOf2_32(NumEltsIn128));
// Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
DAG.getIntPtrConstant(IdxIn128, dl));
// Insert the changed part back into the bigger vector.
return insert128BitVector(N0, V, IdxVal, DAG, dl);
}
assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
// Inserting into element 0 of an all-zeros vector.
// This will be just movd/movq/movss/movsd.
if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode()) &&
(EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
EltVT == MVT::i64)) {
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
}
// Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
// argument. SSE41 is required for pinsrb.
if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
unsigned Opc;
if (VT == MVT::v8i16) {
assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
Opc = X86ISD::PINSRW;
} else {
assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
Opc = X86ISD::PINSRB;
}
// Widen the scalar to i32 and rebuild the index as a pointer-sized
// constant if it is not already i32.
if (N1.getValueType() != MVT::i32)
N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
if (N2.getValueType() != MVT::i32)
N2 = DAG.getIntPtrConstant(IdxVal, dl);
return DAG.getNode(Opc, dl, VT, N0, N1, N2);
}
if (Subtarget.hasSSE41()) {
if (EltVT == MVT::f32) {
// Bits [7:6] of the constant are the source select. This will always be
// zero here. The DAG Combiner may combine an extract_elt index into
// these bits. For example (insert (extract, 3), 2) could be matched by
// putting the '3' into bits [7:6] of X86ISD::INSERTPS.
// Bits [5:4] of the constant are the destination select. This is the
// value of the incoming immediate.
// Bits [3:0] of the constant are the zero mask. The DAG Combiner may
// combine either bitwise AND or insert of float 0.0 to set these bits.
bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
// If this is an insertion of 32-bits into the low 32-bits of
// a vector, we prefer to generate a blend with immediate rather
// than an insertps. Blends are simpler operations in hardware and so
// will always have equal or better performance than insertps.
// But if optimizing for size and there's a load folding opportunity,
// generate insertps because blendps does not have a 32-bit memory
// operand form.
N2 = DAG.getIntPtrConstant(1, dl);
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
}
// Place the destination index into bits [5:4] of the INSERTPS immediate.
N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
// Create this as a scalar to vector.
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
}
// PINSR* works with constant index, so the node is returned unchanged.
if (EltVT == MVT::i32 || EltVT == MVT::i64)
return Op;
}
// No custom lowering applies.
return SDValue();
}
/// Custom lowering for ISD::SCALAR_TO_VECTOR.
///
/// A zero scalar becomes a zero vector. Results that are not 128 bits wide
/// are built as a 128-bit SCALAR_TO_VECTOR inserted at position 0 of an undef
/// vector of the full type. The remaining 128-bit integer types (other than
/// v2i64) are produced through a v4i32 SCALAR_TO_VECTOR of the any-extended
/// scalar followed by a bitcast; v4i32 itself is returned untouched.
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT OpVT = Op.getSimpleValueType();
  SDValue Scalar = Op.getOperand(0);

  // Replacing a xor+movd with xorps is always cheaper and simplifies further
  // combines.
  if (X86::isZeroNode(Scalar))
    return getZeroVector(OpVT, Subtarget, DAG, dl);

  // Wider results: build the low 128 bits first, then widen.
  if (!OpVT.is128BitVector()) {
    unsigned NumChunks = OpVT.getSizeInBits() / 128;
    unsigned EltsPerChunk = OpVT.getVectorNumElements() / NumChunks;
    MVT ChunkVT = MVT::getVectorVT(OpVT.getVectorElementType(), EltsPerChunk);
    SDValue LowChunk = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ChunkVT, Scalar);
    return insert128BitVector(DAG.getUNDEF(OpVT), LowChunk, 0, DAG, dl);
  }

  assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
         "Expected an SSE type!");

  // A v4i32 SCALAR_TO_VECTOR is passed through as that's what tblgen uses.
  if (OpVT == MVT::v4i32)
    return Op;

  // Narrower integer elements go through v4i32 and are bitcast back.
  SDValue Widened = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
  SDValue AsV4I32 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Widened);
  return DAG.getBitcast(OpVT, AsV4I32);
}
// Custom lowering for a node with an INSERT_SUBVECTOR opcode. Only vectors
// of i1 elements are expected here (this is asserted); the actual work is
// done by insert1BitVector.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  MVT ResultEltVT = Op.getSimpleValueType().getVectorElementType();
  assert(ResultEltVT == MVT::i1);
  (void)ResultEltVT; // Only read by the assert above.
  return insert1BitVector(Op, DAG, Subtarget);
}
/// Custom lowering for EXTRACT_SUBVECTOR on vXi1 vectors.
///
/// A non-constant start index yields an empty SDValue and a start index of
/// zero returns \p Op unchanged. Otherwise the source is shifted right with
/// KSHIFTR by the start index (after widening it to v8i1/v16i1 when needed)
/// and the result is extracted from element 0.
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
         "Only vXi1 extract_subvectors need custom lowering");

  auto *StartC = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!StartC)
    return SDValue();

  unsigned StartElt = StartC->getZExtValue();
  if (StartElt == 0) // the operation is legal
    return Op;

  SDLoc dl(Op);
  SDValue Src = Op.getOperand(0);
  MVT SrcVT = Src.getSimpleValueType();
  unsigned SrcNumElts = SrcVT.getVectorNumElements();
  const bool HasDQI = Subtarget.hasDQI();

  // Extend to a type with a natively supported kshift.
  MVT ShiftVT = SrcVT;
  if (SrcNumElts < 8 || (SrcNumElts == 8 && !HasDQI)) {
    ShiftVT = HasDQI ? MVT::v8i1 : MVT::v16i1;
    Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
                      DAG.getUNDEF(ShiftVT), Src,
                      DAG.getIntPtrConstant(0, dl));
  }

  // Bring the requested elements down to the LSB, then take them from there.
  SDValue Shifted = DAG.getNode(X86ISD::KSHIFTR, dl, ShiftVT, Src,
                                DAG.getConstant(StartElt, dl, MVT::i8));
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Shifted,
                     DAG.getIntPtrConstant(0, dl));
}
// Returns the appropriate wrapper opcode for a global reference.
unsigned X86TargetLowering::getGlobalWrapperKind(
const GlobalValue *GV, const unsigned char OpFlags) const {
// References to absolute symbols are never PC-relative.
if (GV && GV->isAbsoluteSymbolRef())
return X86ISD::Wrapper;
CodeModel::Model M = getTargetMachine().getCodeModel();
if (Subtarget.isPICStyleRIPRel() &&
(M == CodeModel::Small || M == CodeModel::Kernel))
return X86ISD::WrapperRIP;
// GOTPCREL references must always use RIP.
if (OpFlags == X86II::MO_GOTPCREL)
return X86ISD::WrapperRIP;
return X86ISD::Wrapper;
}
// ConstantPool, JumpTable, GlobalAddress and ExternalSymbol nodes are each
// lowered to their Target* counterpart wrapped in an X86ISD::Wrapper node.
// With N being one of those nodes, the wrapper is needed because Select(N)
// otherwise just returns N, so the raw TargetGlobalAddress (etc.) nodes can
// only be used to form an addressing mode. The wrapped nodes are selected
// into MOV32ri.
SDValue
X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
  ConstantPoolSDNode *PoolNode = cast<ConstantPoolSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) the entry is addressed as
  // an offset from the global base register; the flag is non-zero then.
  const unsigned char RefFlag = Subtarget.classifyLocalReference(nullptr);
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  SDValue TargetCP = DAG.getTargetConstantPool(
      PoolNode->getConstVal(), PtrVT, PoolNode->getAlignment(),
      PoolNode->getOffset(), RefFlag);
  SDLoc DL(PoolNode);
  SDValue Wrapped = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, TargetCP);
  if (!RefFlag)
    return Wrapped;

  // With PIC, the address is actually $g + Offset.
  SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
  return DAG.getNode(ISD::ADD, DL, PtrVT, PICBase, Wrapped);
}
/// Lower a JumpTable node to a wrapped TargetJumpTable, adding the global
/// base register when the reference is classified as PIC-base relative.
SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
  JumpTableSDNode *TableNode = cast<JumpTableSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) the table is addressed as
  // an offset from the global base register; the flag is non-zero then.
  const unsigned char RefFlag = Subtarget.classifyLocalReference(nullptr);
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  SDValue TargetJT =
      DAG.getTargetJumpTable(TableNode->getIndex(), PtrVT, RefFlag);
  SDLoc DL(TableNode);
  SDValue Wrapped = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, TargetJT);
  if (!RefFlag)
    return Wrapped;

  // With PIC, the address is actually $g + Offset.
  SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
  return DAG.getNode(ISD::ADD, DL, PtrVT, PICBase, Wrapped);
}
/// Lower an ExternalSymbol node by delegating to LowerGlobalOrExternal with
/// ForCall set to false.
SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
                                               SelectionDAG &DAG) const {
  const bool IsCallTarget = false;
  return LowerGlobalOrExternal(Op, DAG, IsCallTarget);
}
/// Lower a BlockAddress node to a wrapped TargetBlockAddress, adding the
/// global base register when the reference is PIC-base relative.
SDValue
X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
  const unsigned char RefFlags = Subtarget.classifyBlockAddressReference();
  const auto *BANode = cast<BlockAddressSDNode>(Op);
  const BlockAddress *BA = BANode->getBlockAddress();
  int64_t BAOffset = BANode->getOffset();
  SDLoc dl(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  // Create the TargetBlockAddress node and wrap it.
  SDValue TargetBA = DAG.getTargetBlockAddress(BA, PtrVT, BAOffset, RefFlags);
  SDValue Wrapped = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, TargetBA);
  if (!isGlobalRelativeToPICBase(RefFlags))
    return Wrapped;

  // With PIC, the address is actually $g + Offset.
  SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT);
  return DAG.getNode(ISD::ADD, dl, PtrVT, PICBase, Wrapped);
}
/// Creates target global address or external symbol nodes for calls or
/// other uses.
///
/// \p Op must be a GlobalAddress or an ExternalSymbol node. \p ForCall
/// selects the function-reference classification and allows the bare target
/// node to be returned when no wrapper, load or add is needed.
SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
                                                 bool ForCall) const {
  SDLoc dl(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  // Unpack the global address or external symbol.
  const GlobalValue *GV = nullptr;
  const char *SymbolName = nullptr;
  int64_t PendingOffset = 0; // Part of the offset not yet accounted for.
  if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
    GV = GA->getGlobal();
    PendingOffset = GA->getOffset();
  } else {
    SymbolName = cast<ExternalSymbolSDNode>(Op)->getSymbol();
  }

  // Calculate some flags for address lowering.
  const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
  const unsigned char OpFlags =
      ForCall ? Subtarget.classifyGlobalFunctionReference(GV, Mod)
              : Subtarget.classifyGlobalReference(GV, Mod);
  const bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
  const bool NeedsLoad = isGlobalStubReference(OpFlags);

  SDValue Result;
  if (GV) {
    // Create a target global address. If possible, fold the offset into the
    // global address reference; otherwise it is ADDed on later.
    const CodeModel::Model CM = DAG.getTarget().getCodeModel();
    int64_t FoldedOffset = 0;
    if (OpFlags == X86II::MO_NO_FLAG &&
        X86::isOffsetSuitableForCodeModel(PendingOffset, CM)) {
      FoldedOffset = PendingOffset;
      PendingOffset = 0;
    }
    Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, FoldedOffset, OpFlags);
  } else {
    // Not a global address, so this must be an external symbol.
    Result = DAG.getTargetExternalSymbol(SymbolName, PtrVT, OpFlags);
  }

  // If this is a direct call, avoid the wrapper if we don't need to do any
  // loads or adds. This allows SDAG ISel to match direct calls.
  const bool NeedsExtraWork = NeedsLoad || HasPICReg || PendingOffset != 0;
  if (ForCall && !NeedsExtraWork)
    return Result;

  Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (HasPICReg) {
    SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT);
    Result = DAG.getNode(ISD::ADD, dl, PtrVT, PICBase, Result);
  }

  // For globals that require a load from a stub to get the address, emit the
  // load.
  if (NeedsLoad)
    Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));

  // If there was a non-zero offset that we didn't fold, create an explicit
  // addition for it.
  if (PendingOffset != 0)
    Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
                         DAG.getConstant(PendingOffset, dl, PtrVT));

  return Result;
}
/// Lower a GlobalAddress node by delegating to LowerGlobalOrExternal with
/// ForCall set to false.
SDValue
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
  const bool IsCallTarget = false;
  return LowerGlobalOrExternal(Op, DAG, IsCallTarget);
}
/// Emit an X86ISD::TLSADDR (or X86ISD::TLSBASEADDR when \p LocalDynamic is
/// set) node for \p GA and return a CopyFromReg of \p ReturnReg glued to it.
///
/// \p Chain is the incoming chain and \p InFlag an optional incoming glue
/// value (may be null). \p OperandFlags is attached to the target global
/// address. The frame info is marked as adjusting the stack and having calls.
static SDValue
GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
           unsigned char OperandFlags, bool LocalDynamic = false) {
  MachineFrameInfo &FrameInfo = DAG.getMachineFunction().getFrameInfo();
  SDVTList CallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDLoc dl(GA);
  SDValue TGA =
      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
                                 GA->getOffset(), OperandFlags);

  // Operands are the chain, the address and, when provided, the glue.
  SmallVector<SDValue, 3> CallOps = {Chain, TGA};
  if (InFlag)
    CallOps.push_back(*InFlag);

  const X86ISD::NodeType CallOpc =
      LocalDynamic ? X86ISD::TLSBASEADDR : X86ISD::TLSADDR;
  SDValue Call = DAG.getNode(CallOpc, dl, CallVTs, CallOps);

  // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
  FrameInfo.setAdjustsStack(true);
  FrameInfo.setHasCalls(true);

  SDValue OutGlue = Call.getValue(1);
  return DAG.getCopyFromReg(Call, dl, ReturnReg, PtrVT, OutGlue);
}
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit.
// The global base register is copied into EBX and glued to the TLSADDR node;
// the result is read back from EAX.
static SDValue
LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  SDLoc dl(GA); // ? function entry point might be better
  SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
  SDValue CopyToEBX = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
                                       PICBase, SDValue());
  SDValue Glue = CopyToEBX.getValue(1);
  return GetTLSADDR(DAG, CopyToEBX, GA, &Glue, PtrVT, X86::EAX,
                    X86II::MO_TLSGD);
}
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit.
// No incoming glue is used; the result is read back from RAX.
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  SDValue EntryChain = DAG.getEntryNode();
  SDValue *NoGlue = nullptr;
  return GetTLSADDR(DAG, EntryChain, GA, NoGlue, PtrVT, X86::RAX,
                    X86II::MO_TLSGD);
}
// Lower ISD::GlobalTLSAddress using the "local dynamic" model: obtain the
// start address of the module's TLS block through a TLSBASEADDR node and add
// the variable's x@dtpoff offset to it.
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
                                           SelectionDAG &DAG,
                                           const EVT PtrVT,
                                           bool is64Bit) {
  SDLoc dl(GA);

  // Record the access on the function info.
  X86MachineFunctionInfo *FuncInfo =
      DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
  FuncInfo->incNumLocalDynamicTLSAccesses();

  // Get the start address of the TLS block for this module.
  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
  // of the block base.
  SDValue BlockBase;
  if (!is64Bit) {
    // 32-bit: the global base register goes into EBX, glued to the call.
    SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
    SDValue CopyToEBX = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
                                         PICBase, SDValue());
    SDValue Glue = CopyToEBX.getValue(1);
    BlockBase = GetTLSADDR(DAG, CopyToEBX, GA, &Glue, PtrVT, X86::EAX,
                           X86II::MO_TLSLDM, /*LocalDynamic=*/true);
  } else {
    BlockBase = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
                           X86::RAX, X86II::MO_TLSLD, /*LocalDynamic=*/true);
  }

  // Build x@dtpoff.
  const unsigned char DtpoffFlag = X86II::MO_DTPOFF;
  SDValue TGA =
      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
                                 GA->getOffset(), DtpoffFlag);
  SDValue Dtpoff = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);

  // Add x@dtpoff with the base.
  return DAG.getNode(ISD::ADD, dl, PtrVT, Dtpoff, BlockBase);
}
// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
// The result is the thread pointer plus the variable's offset; for initial
// exec the offset is itself loaded from the GOT. Any other model is
// unreachable here.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                   const EVT PtrVT, TLSModel::Model model,
                                   bool is64Bit, bool isPIC) {
  SDLoc dl(GA);

  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). The
  // load goes through a null pointer in address space 256 or 257.
  const unsigned SegmentAS = is64Bit ? 257 : 256;
  Value *SegmentBase = Constant::getNullValue(
      Type::getInt8PtrTy(*DAG.getContext(), SegmentAS));
  SDValue ThreadPointer =
      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
                  MachinePointerInfo(SegmentBase));

  // Most TLS accesses are not RIP relative, even on x86-64. One exception is
  // initialexec.
  unsigned char OperandFlags = 0;
  unsigned WrapperKind = X86ISD::Wrapper;
  switch (model) {
  case TLSModel::LocalExec:
    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
    break;
  case TLSModel::InitialExec:
    if (is64Bit) {
      OperandFlags = X86II::MO_GOTTPOFF;
      WrapperKind = X86ISD::WrapperRIP;
    } else {
      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
    }
    break;
  default:
    llvm_unreachable("Unexpected model");
  }

  // emit "addl x@ntpoff,%eax" (local exec)
  // or "addl x@indntpoff,%eax" (initial exec)
  // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
  SDValue TGA =
      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
                                 GA->getOffset(), OperandFlags);
  SDValue VarOffset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

  if (model == TLSModel::InitialExec) {
    // 32-bit PIC addresses the slot relative to the global base register.
    if (isPIC && !is64Bit) {
      SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
      VarOffset = DAG.getNode(ISD::ADD, dl, PtrVT, PICBase, VarOffset);
    }
    VarOffset =
        DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), VarOffset,
                    MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  }

  // The address of the thread local variable is the add of the thread
  // pointer with the offset of the variable.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, VarOffset);
}
// Custom lowering for ISD::GlobalTLSAddress.
//
// Emulated TLS is handled first via LowerToTLSEmulatedModel. Otherwise the
// lowering depends on the target:
//   - ELF: dispatch on the TLS model chosen for the global (general dynamic,
//     local dynamic, initial exec, local exec);
//   - Darwin: a single model built around an X86ISD::TLSCALL;
//   - Windows: the implicit TLS scheme, indexing the thread's TLS array with
//     _tls_index and adding the variable's section-relative offset.
// Any other target is unreachable.
SDValue
X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
if (DAG.getTarget().useEmulatedTLS())
return LowerToTLSEmulatedModel(GA, DAG);
const GlobalValue *GV = GA->getGlobal();
auto PtrVT = getPointerTy(DAG.getDataLayout());
bool PositionIndependent = isPositionIndependent();
if (Subtarget.isTargetELF()) {
// Each ELF TLS model has a dedicated helper.
TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
switch (model) {
case TLSModel::GeneralDynamic:
if (Subtarget.is64Bit())
return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
case TLSModel::LocalDynamic:
return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
Subtarget.is64Bit());
case TLSModel::InitialExec:
case TLSModel::LocalExec:
return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
PositionIndependent);
}
llvm_unreachable("Unknown TLS model.");
}
if (Subtarget.isTargetDarwin()) {
// Darwin only has one model of TLS. Lower to that.
unsigned char OpFlag = 0;
unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
X86ISD::WrapperRIP : X86ISD::Wrapper;
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
// global base reg.
bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
if (PIC32)
OpFlag = X86II::MO_TLVP_PIC_BASE;
else
OpFlag = X86II::MO_TLVP;
SDLoc DL(Op);
SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
GA->getValueType(0),
GA->getOffset(), OpFlag);
SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
// With PIC32, the address is actually $g + Offset.
if (PIC32)
Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
Offset);
// Lowering the machine isd will make sure everything is in the right
// location.
// The TLSCALL is bracketed by a CALLSEQ_START/CALLSEQ_END pair with zero
// sizes, and its glue result is threaded into CALLSEQ_END.
SDValue Chain = DAG.getEntryNode();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
SDValue Args[] = { Chain, Offset };
Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
DAG.getIntPtrConstant(0, DL, true),
Chain.getValue(1), DL);
// TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
MFI.setAdjustsStack(true);
// And our return value (tls address) is in the standard call return value
// location.
unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
}
if (Subtarget.isOSWindows()) {
// Just use the implicit TLS architecture
// Need to generate something similar to:
// mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
// ; from TEB
// mov ecx, dword [rel _tls_index]: Load index (from C runtime)
// mov rcx, qword [rdx+rcx*8]
// mov eax, .tls$:tlsvar
// [rax+rcx] contains the address
// Windows 64bit: gs:0x58
// Windows 32bit: fs:__tls_array
SDLoc dl(GA);
SDValue Chain = DAG.getEntryNode();
// Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
// %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
// use its literal value of 0x2C.
// The pointer info uses address space 256 for the 64-bit case and 257 for
// the 32-bit case.
Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
? Type::getInt8PtrTy(*DAG.getContext(),
256)
: Type::getInt32PtrTy(*DAG.getContext(),
257));
SDValue TlsArray = Subtarget.is64Bit()
? DAG.getIntPtrConstant(0x58, dl)
: (Subtarget.isTargetWindowsGNU()
? DAG.getIntPtrConstant(0x2C, dl)
: DAG.getExternalSymbol("_tls_array", PtrVT));
SDValue ThreadPointer =
DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
SDValue res;
if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
// Local-exec globals use the TLS array pointer directly, without indexing
// by _tls_index.
res = ThreadPointer;
} else {
// Load the _tls_index variable
SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
// On 64-bit the index is loaded as an i32 and zero-extended to pointer
// width.
if (Subtarget.is64Bit())
IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
MachinePointerInfo(), MVT::i32);
else
IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
// Scale the index by the pointer size (shift by log2 of the size) to get
// the byte offset of this module's slot in the TLS array.
auto &DL = DAG.getDataLayout();
SDValue Scale =
DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
}
// Load the pointer stored at the computed address.
res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
// Get the offset of start of .tls section
SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
GA->getValueType(0),
GA->getOffset(), X86II::MO_SECREL);
SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
// The address of the thread local variable is the add of the thread
// pointer with the offset of the variable.
return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
}
llvm_unreachable("TLS not implemented for this target.");
}
/// Lower SHL_PARTS, SRA_PARTS and SRL_PARTS. These take the low and high
/// parts of a double-width value plus a shift amount and return the low and
/// high parts of the shifted value.
/// TODO: Can this be moved to general expansion code?
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  MVT PartVT = Op.getSimpleValueType();
  unsigned PartBits = PartVT.getSizeInBits();
  SDLoc dl(Op);
  const bool IsSHL = Op.getOpcode() == ISD::SHL_PARTS;
  const bool IsSRA = Op.getOpcode() == ISD::SRA_PARTS;
  SDValue InLo = Op.getOperand(0);
  SDValue InHi = Op.getOperand(1);
  SDValue Amt = Op.getOperand(2);

  // ISD::FSHL and ISD::FSHR have defined overflow behavior but ISD::SHL and
  // ISD::SRA/L nodes haven't. Insert an AND to be safe, it's optimized away
  // during isel.
  SDValue MaskedAmt = DAG.getNode(ISD::AND, dl, MVT::i8, Amt,
                                  DAG.getConstant(PartBits - 1, dl, MVT::i8));

  // Value used for the vacated part when the amount reaches the part width:
  // the sign of the high part for SRA, zero otherwise.
  SDValue Fill;
  if (IsSRA)
    Fill = DAG.getNode(ISD::SRA, dl, PartVT, InHi,
                       DAG.getConstant(PartBits - 1, dl, MVT::i8));
  else
    Fill = DAG.getConstant(0, dl, PartVT);

  // Funnel: the part that combines bits of both inputs.
  // Single: the part computed by shifting one input alone.
  SDValue Funnel, Single;
  if (IsSHL) {
    Funnel = DAG.getNode(ISD::FSHL, dl, PartVT, InHi, InLo, Amt);
    Single = DAG.getNode(ISD::SHL, dl, PartVT, InLo, MaskedAmt);
  } else {
    Funnel = DAG.getNode(ISD::FSHR, dl, PartVT, InHi, InLo, Amt);
    Single = DAG.getNode(IsSRA ? ISD::SRA : ISD::SRL, dl, PartVT, InHi,
                         MaskedAmt);
  }

  // If the shift amount is larger or equal than the width of a part we can't
  // rely on the results of shld/shrd. Insert a test and select the appropriate
  // values for large shift amounts.
  SDValue WidthBit = DAG.getNode(ISD::AND, dl, MVT::i8, Amt,
                                 DAG.getConstant(PartBits, dl, MVT::i8));
  SDValue IsLargeAmt = DAG.getSetCC(dl, MVT::i8, WidthBit,
                                    DAG.getConstant(0, dl, MVT::i8),
                                    ISD::SETNE);

  // For large amounts the single-shift result moves into the funnel part's
  // place and the other part becomes the fill value.
  SDValue FunnelPart =
      DAG.getNode(ISD::SELECT, dl, PartVT, IsLargeAmt, Single, Funnel);
  SDValue SinglePart =
      DAG.getNode(ISD::SELECT, dl, PartVT, IsLargeAmt, Fill, Single);

  // A left shift funnels into the high part; right shifts funnel into the low.
  SDValue Lo = IsSHL ? SinglePart : FunnelPart;
  SDValue Hi = IsSHL ? FunnelPart : SinglePart;
  return DAG.getMergeValues({ Lo, Hi }, dl);
}
/// Custom lowering for ISD::FSHL / ISD::FSHR.
///
/// Vector types (VBMI2 is asserted) map to VSHLD/VSHRD for a constant splat
/// amount and to VSHLDV/VSHRDV otherwise. Scalar i16/i32/i64 map to
/// SHLD/SHRD, unless SHLD is slow on the subtarget and the function is not
/// optimized for size, in which case an empty SDValue is returned.
static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
         "Unexpected funnel shift opcode!");

  SDLoc DL(Op);
  const bool IsFSHR = Op.getOpcode() == ISD::FSHR;
  SDValue Amt = Op.getOperand(2);

  // The X86 nodes take the two data operands in swapped order for FSHR.
  SDValue First = Op.getOperand(IsFSHR ? 1 : 0);
  SDValue Second = Op.getOperand(IsFSHR ? 0 : 1);

  if (VT.isVector()) {
    assert(Subtarget.hasVBMI2() && "Expected VBMI2");

    APInt SplatAmt;
    if (!isConstantSplat(Amt, SplatAmt))
      return DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
                         First, Second, Amt);

    // Uniform constant amount: reduce it modulo the element width and use the
    // immediate form.
    uint64_t ImmAmt = SplatAmt.urem(VT.getScalarSizeInBits());
    return DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
                       First, Second, DAG.getConstant(ImmAmt, DL, MVT::i8));
  }

  assert((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
         "Unexpected funnel shift type!");

  // Expand slow SHLD/SHRD cases if we are not optimizing for size.
  const bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
  if (Subtarget.isSHLDSlow() && !OptForSize)
    return SDValue();

  // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
  if (VT == MVT::i16)
    Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
                      DAG.getConstant(15, DL, Amt.getValueType()));

  const unsigned ShiftOpc = IsFSHR ? X86ISD::SHRD : X86ISD::SHLD;
  return DAG.getNode(ShiftOpc, DL, VT, First, Second, Amt);
}
// Try to use a packed vector operation to handle i64 -> f32/f64 conversion
// on 32-bit targets when AVX512DQ is enabled. Returns an empty SDValue when
// the conditions are not met.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  assert((Op.getOpcode() == ISD::SINT_TO_FP ||
          Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!");
  SDValue Src = Op.getOperand(0);
  MVT SrcVT = Src.getSimpleValueType();
  MVT VT = Op.getSimpleValueType();

  const bool DestIsF32OrF64 = VT == MVT::f32 || VT == MVT::f64;
  const bool Applicable = Subtarget.hasDQI() && SrcVT == MVT::i64 &&
                          !Subtarget.is64Bit() && DestIsF32OrF64;
  if (!Applicable)
    return SDValue();

  // Pack the i64 into a vector, do the operation and extract.
  // Using 256-bit to ensure result is 128-bits for f32 case.
  const unsigned Lanes = Subtarget.hasVLX() ? 4 : 8;
  MVT IntVecVT = MVT::getVectorVT(MVT::i64, Lanes);
  MVT FPVecVT = MVT::getVectorVT(VT, Lanes);

  SDLoc dl(Op);
  SDValue Packed = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, IntVecVT, Src);
  SDValue Converted = DAG.getNode(Op.getOpcode(), dl, FPVecVT, Packed);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Converted,
                     DAG.getIntPtrConstant(0, dl));
}
/// Return true if the cast \p Opcode from vector type \p FromVT to vector
/// type \p ToVT should be done as a vector operation on this subtarget. Only
/// SINT_TO_FP and UINT_TO_FP from v4i32 to v4f32/v4f64 are accepted.
static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
                          const X86Subtarget &Subtarget) {
  const bool FromV4I32 = FromVT == MVT::v4i32;
  const bool ToV4F32 = ToVT == MVT::v4f32;
  const bool ToV4F64 = ToVT == MVT::v4f64;

  // TODO: Handle wider types with AVX/AVX512.
  // CVTDQ2PS or (V)CVTDQ2PD
  if (Opcode == ISD::SINT_TO_FP)
    return Subtarget.hasSSE2() && FromV4I32 &&
           (ToV4F32 || (Subtarget.hasAVX() && ToV4F64));

  // TODO: Handle wider types and i64 elements.
  // VCVTUDQ2PS or VCVTUDQ2PD
  if (Opcode == ISD::UINT_TO_FP)
    return Subtarget.hasAVX512() && FromV4I32 && (ToV4F32 || ToV4F64);

  return false;
}
/// Given a scalar cast whose operand is extracted from a vector at a constant
/// index, try to perform the cast on the vector and extract the result
/// instead. This will avoid an expensive round-trip between XMM and GPR.
/// Returns an empty SDValue when the pattern or the types do not match.
static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  // TODO: This could be enhanced to handle smaller integer types by peeking
  // through an extend.
  SDValue Extract = Cast.getOperand(0);
  MVT ScalarDestVT = Cast.getSimpleValueType();
  if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return SDValue();
  if (!isa<ConstantSDNode>(Extract.getOperand(1)))
    return SDValue();

  // See if we have a 128-bit vector cast op for this type of cast.
  SDValue SrcVec = Extract.getOperand(0);
  MVT SrcVecVT = SrcVec.getSimpleValueType();
  unsigned EltsPerXMM = 128 / SrcVecVT.getScalarSizeInBits();
  MVT SrcXMMVT = MVT::getVectorVT(SrcVecVT.getScalarType(), EltsPerXMM);
  MVT DestXMMVT = MVT::getVectorVT(ScalarDestVT, EltsPerXMM);
  if (!useVectorCast(Cast.getOpcode(), SrcXMMVT, DestXMMVT, Subtarget))
    return SDValue();

  // If we are extracting from a non-zero element, first shuffle the source
  // vector to allow extracting from element zero.
  SDLoc DL(Cast);
  if (!isNullConstant(Extract.getOperand(1))) {
    SmallVector<int, 16> ToLaneZero(SrcVecVT.getVectorNumElements(), -1);
    ToLaneZero[0] = Extract.getConstantOperandVal(1);
    SrcVec = DAG.getVectorShuffle(SrcVecVT, DL, SrcVec,
                                  DAG.getUNDEF(SrcVecVT), ToLaneZero);
  }

  // If the source vector is wider than 128-bits, extract the low part. Do not
  // create an unnecessarily wide vector cast op.
  if (SrcVecVT != SrcXMMVT)
    SrcVec = extract128BitVector(SrcVec, 0, DAG, DL);

  // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
  // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
  SDValue VecCast = DAG.getNode(Cast.getOpcode(), DL, DestXMMVT, SrcVec);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarDestVT, VecCast,
                     DAG.getIntPtrConstant(0, DL));
}
/// Custom lowering for ISD::SINT_TO_FP.
///
/// Tries, in order: a vectorized cast of an extracted element, the
/// v2i32 -> v2f64 vector case, returning \p Op unchanged for combinations
/// that are really legal, the AVX512DQ packed i64 path, and finally a store
/// of the integer to a stack slot followed by BuildFILD.
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(0);
  MVT SrcVT = Src.getSimpleValueType();
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
    return Extract;

  if (SrcVT.isVector()) {
    // Only v2i32 -> v2f64 is handled: widen the source to v4i32 with undef
    // upper elements and convert with CVTSI2P.
    if (SrcVT != MVT::v2i32 || VT != MVT::v2f64)
      return SDValue();
    return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
                       DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
                                   DAG.getUNDEF(SrcVT)));
  }

  assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
         "Unknown SINT_TO_FP to lower!");

  // These are really Legal; return the operand so the caller accepts it as
  // Legal.
  const bool ResultInSSE = isScalarFPTypeInSSEReg(VT);
  const bool Is64BitTarget = Subtarget.is64Bit();
  if (ResultInSSE &&
      (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Is64BitTarget)))
    return Op;

  if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
    return V;

  // Fall back to going through memory: spill the integer and FILD it.
  SDValue ValueToStore = Op.getOperand(0);
  if (SrcVT == MVT::i64 && ResultInSSE && !Is64BitTarget) {
    // Bitcasting to f64 here allows us to do a single 64-bit store from
    // an SSE register, avoiding the store forwarding penalty that would come
    // with two 32-bit stores.
    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
  }

  const unsigned SlotBytes = SrcVT.getSizeInBits() / 8;
  MachineFunction &MF = DAG.getMachineFunction();
  auto PtrVT = getPointerTy(MF.getDataLayout());
  int SSFI = MF.getFrameInfo().CreateStackObject(SlotBytes, SlotBytes, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
  SDValue StoreChain =
      DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
                   MachinePointerInfo::getFixedStack(MF, SSFI));
  return BuildFILD(Op, SrcVT, StoreChain, StackSlot, DAG);
}
// Build an X87 integer load (FILD) of the SrcVT-typed integer at StackSlot
// and return the converted value with the type of Op.
//
// Chain is the chain the FILD is attached to. StackSlot is either a frame
// index or a load node; in the latter case the load's memory operand and its
// operand 1 are reused. When Op's type lives in an SSE register, FILD_FLAG
// is used and the result is stored to a fresh stack slot with FST and loaded
// back from it.
SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
SDValue StackSlot,
SelectionDAG &DAG) const {
// Build the FILD
SDLoc DL(Op);
SDVTList Tys;
bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
// The SSE variant produces an f64 plus a glue result so that the FST below
// can be glued to it; otherwise the FILD produces Op's type directly.
if (useSSE)
Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
else
Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
unsigned ByteSize = SrcVT.getSizeInBits() / 8;
FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
MachineMemOperand *LoadMMO;
if (FI) {
// Frame index: describe a ByteSize load from that fixed stack slot.
int SSFI = FI->getIndex();
LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
MachineMemOperand::MOLoad, ByteSize, ByteSize);
} else {
// Otherwise StackSlot must be a load node (cast<> asserts this); reuse its
// memory operand and take its operand 1 as the address.
// NOTE(review): operand 1 is presumably the load's base pointer - confirm.
LoadMMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
StackSlot = StackSlot.getOperand(1);
}
SDValue FILDOps[] = {Chain, StackSlot};
SDValue Result =
DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL,
Tys, FILDOps, SrcVT, LoadMMO);
if (useSSE) {
Chain = Result.getValue(1);
SDValue InFlag = Result.getValue(2);
// FIXME: Currently the FST is glued to the FILD_FLAG. This
// shouldn't be necessary except that RFP cannot be live across
// multiple blocks. When stackifier is fixed, they can be uncoupled.
MachineFunction &MF = DAG.getMachineFunction();
// Allocate a temporary slot sized for the result type.
unsigned SSFISize = Op.getValueSizeInBits() / 8;
int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
auto PtrVT = getPointerTy(MF.getDataLayout());
// Note: this StackSlot shadows the function parameter of the same name.
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
Tys = DAG.getVTList(MVT::Other);
SDValue FSTOps[] = {Chain, Result, StackSlot, InFlag};
MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
MachineMemOperand::MOStore, SSFISize, SSFISize);
// Store the FILD result to the temporary with Op's value type...
Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps,
Op.getValueType(), StoreMMO);
// ...and load it back as the final result.
Result = DAG.getLoad(
Op.getValueType(), DL, Chain, StackSlot,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
}
return Result;
}
/// Expand an unsigned 64-bit integer to double conversion with SSE vector ops.
static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
  // The sequence is not obvious, so here is the code we are aiming for:
  /*
     movq       %rax,  %xmm0
     punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
     subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
     #ifdef __SSE3__
       haddpd   %xmm0, %xmm0
     #else
       pshufd   $0x4e, %xmm0, %xmm1
       addpd    %xmm1, %xmm0
     #endif
  */
  SDLoc dl(Op);
  LLVMContext &Ctx = *DAG.getContext();

  // c0: the two exponent words that get interleaved with the input halves.
  static const uint32_t ExpWords[] = { 0x43300000, 0x45300000, 0, 0 };
  Constant *ExpWordsC = ConstantDataVector::get(Ctx, ExpWords);
  auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
  SDValue ExpWordsAddr = DAG.getConstantPool(ExpWordsC, PtrVT, 16);

  // c1: the matching pair of doubles that is subtracted afterwards.
  Constant *BiasElts[] = {
      ConstantFP::get(Ctx, APFloat(APFloat::IEEEdouble(),
                                   APInt(64, 0x4330000000000000ULL))),
      ConstantFP::get(Ctx, APFloat(APFloat::IEEEdouble(),
                                   APInt(64, 0x4530000000000000ULL)))};
  Constant *BiasC = ConstantVector::get(BiasElts);
  SDValue BiasAddr = DAG.getConstantPool(BiasC, PtrVT, 16);

  // Place the 64-bit input in a vector register.
  SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                              Op.getOperand(0));
  SDValue ExpWordsLoad =
      DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), ExpWordsAddr,
                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
                  /* Alignment = */ 16);
  SDValue Interleaved = getUnpackl(DAG, dl, MVT::v4i32,
                                   DAG.getBitcast(MVT::v4i32, InVec),
                                   ExpWordsLoad);

  // The second constant load is chained after the first one.
  SDValue BiasLoad =
      DAG.getLoad(MVT::v2f64, dl, ExpWordsLoad.getValue(1), BiasAddr,
                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
                  /* Alignment = */ 16);
  SDValue AsDoubles = DAG.getBitcast(MVT::v2f64, Interleaved);
  // TODO: Are there any fast-math-flags to propagate here?
  SDValue Diff = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, AsDoubles, BiasLoad);

  // Sum the two lanes; element 0 of the sum is the converted value.
  SDValue Sum;
  if (!Subtarget.hasSSE3()) {
    SDValue Swapped =
        DAG.getVectorShuffle(MVT::v2f64, dl, Diff, Diff, {1,-1});
    Sum = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Swapped, Diff);
  } else {
    // FIXME: The 'haddpd' instruction may be slower than 'shuffle + addsd'.
    Sum = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Diff, Diff);
  }

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Sum,
                     DAG.getIntPtrConstant(0, dl));
}
/// Expand an unsigned 32-bit integer to floating point conversion by OR-ing
/// the zero-extended input into the bit pattern of an f64 bias constant and
/// then subtracting that bias again.
static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
  SDLoc dl(Op);

  // The f64 constant whose bits are OR-ed in and which is subtracted at the
  // end to correct the result.
  SDValue BiasFP = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
                                     MVT::f64);

  // Move the 32-bit input into a vector register.
  SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
                              Op.getOperand(0));

  // Clear every lane above the input element.
  SDValue ZeroExtVec =
      getShuffleVectorZeroOrUndef(InVec, 0, true, Subtarget, DAG);
  SDValue LowF64 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                               DAG.getBitcast(MVT::v2f64, ZeroExtVec),
                               DAG.getIntPtrConstant(0, dl));

  // OR the input bits with the bias bits (done on v2i64).
  SDValue Biased = DAG.getNode(
      ISD::OR, dl, MVT::v2i64,
      DAG.getBitcast(MVT::v2i64,
                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, LowF64)),
      DAG.getBitcast(MVT::v2i64,
                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, BiasFP)));
  Biased =
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                  DAG.getBitcast(MVT::v2f64, Biased),
                  DAG.getIntPtrConstant(0, dl));

  // Remove the bias again.
  // TODO: Are there any fast-math-flags to propagate here?
  SDValue Unbiased = DAG.getNode(ISD::FSUB, dl, MVT::f64, Biased, BiasFP);

  // Round or extend the f64 value to the requested result type.
  return DAG.getFPExtendOrRound(Unbiased, dl, Op.getSimpleValueType());
}
// Lower a v2i32 -> v2f64 UINT_TO_FP. Any other result type is rejected with
// an empty SDValue.
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget,
                                     const SDLoc &DL) {
  if (Op.getSimpleValueType() != MVT::v2f64)
    return SDValue();

  SDValue Src = Op.getOperand(0);
  assert(Src.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");

  // Widen the input to v4i32 with an undefined upper half.
  Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, Src,
                    DAG.getUNDEF(MVT::v2i32));

  // With AVX512 a single CVTUI2P node is emitted.
  if (Subtarget.hasAVX512())
    return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, Src);

  // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
  // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
  SDValue ShiftAmt = DAG.getConstant(16, DL, MVT::v4i32);
  SDValue LowMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);

  // 2^16, the weight of the upper half-word.
  SDValue TwoPow16 = DAG.getConstantFP(65536.0, DL, MVT::v2f64);

  // Split every element into its upper and lower 16 bits.
  SDValue UpperHalf = DAG.getNode(ISD::SRL, DL, MVT::v4i32, Src, ShiftAmt);
  SDValue LowerHalf = DAG.getNode(ISD::AND, DL, MVT::v4i32, Src, LowMask);

  // Convert both halves with the signed conversion node and rescale the
  // upper one.
  SDValue UpperFP = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, UpperHalf);
  UpperFP = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, UpperFP, TwoPow16);
  SDValue LowerFP = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LowerHalf);

  // Recombine the two halves.
  return DAG.getNode(ISD::FADD, DL, MVT::v2f64, UpperFP, LowerFP);
}
// Lower v4i32 -> v4f32 and v8i32 -> v8f32 UINT_TO_FP. Returns an empty SDValue
// when unsafe-fp-math is enabled or the result type is not the matching
// float vector.
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  // The algorithm is the following:
  // #ifdef __SSE4_1__
  //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
  //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
  //                                 (uint4) 0x53000000, 0xaa);
  // #else
  //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
  //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
  // #endif
  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
  //     return (float4) lo + fhi;

  // We shouldn't use it when unsafe-fp-math is enabled though: we might later
  // reassociate the two FADDs, and if we do that, the algorithm fails
  // spectacularly (PR24512).
  // FIXME: If we ever have some kind of Machine FMF, this should be marked
  // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
  // there's also the MachineCombiner reassociations happening on Machine IR.
  if (DAG.getTarget().Options.UnsafeFPMath)
    return SDValue();

  SDLoc DL(Op);
  SDValue Src = Op->getOperand(0);
  MVT IntVT = Src.getSimpleValueType();
  bool Is128 = IntVT == MVT::v4i32;
  MVT FloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;

  // Give up early when the requested result is anything else (e.g. v4f64).
  if (FloatVT != Op->getSimpleValueType(0))
    return SDValue();

  assert((IntVT == MVT::v4i32 || IntVT == MVT::v8i32) &&
         "Unsupported custom type");

  // Both the #ifdef and the #else variant share:
  // - the splat constants 0x4b000000 and 0x53000000
  // - the shift v >> 16
  SDValue LowMagic = DAG.getConstant(0x4b000000, DL, IntVT);
  SDValue HighMagic = DAG.getConstant(0x53000000, DL, IntVT);
  SDValue ShiftAmt = DAG.getConstant(16, DL, IntVT);
  SDValue UpperBits = DAG.getNode(ISD::SRL, DL, IntVT, Src, ShiftAmt);

  SDValue Low, High;
  if (!Subtarget.hasSSE41()) {
    SDValue LowMask = DAG.getConstant(0xffff, DL, IntVT);
    //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
    SDValue LowBits = DAG.getNode(ISD::AND, DL, IntVT, Src, LowMask);
    Low = DAG.getNode(ISD::OR, DL, IntVT, LowBits, LowMagic);
    //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
    High = DAG.getNode(ISD::OR, DL, IntVT, UpperBits, HighMagic);
  } else {
    MVT I16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
    //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
    SDValue LowMagicI16 = DAG.getBitcast(I16VT, LowMagic);
    SDValue SrcI16 = DAG.getBitcast(I16VT, Src);
    // Low is bitcast to float immediately below, so it is left in the
    // 16-bit element type here.
    Low = DAG.getNode(X86ISD::BLENDI, DL, I16VT, SrcI16, LowMagicI16,
                      DAG.getConstant(0xaa, DL, MVT::i32));
    //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
    //                                 (uint4) 0x53000000, 0xaa);
    SDValue HighMagicI16 = DAG.getBitcast(I16VT, HighMagic);
    SDValue UpperBitsI16 = DAG.getBitcast(I16VT, UpperBits);
    // Likewise High is bitcast to float below; no need to cast it back.
    High = DAG.getNode(X86ISD::BLENDI, DL, I16VT, UpperBitsI16, HighMagicI16,
                       DAG.getConstant(0xaa, DL, MVT::i32));
  }

  // Splat constant for -(0x1.0p39f + 0x1.0p23f).
  SDValue NegBias = DAG.getConstantFP(
      APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, FloatVT);

  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
  SDValue HighFP = DAG.getBitcast(FloatVT, High);
  // TODO: Are there any fast-math-flags to propagate here?
  SDValue HighAdjusted = DAG.getNode(ISD::FADD, DL, FloatVT, HighFP, NegBias);

  //     return (float4) lo + fhi;
  SDValue LowFP = DAG.getBitcast(FloatVT, Low);
  return DAG.getNode(ISD::FADD, DL, FloatVT, LowFP, HighAdjusted);
}
// Dispatch a vector UINT_TO_FP to the helper that handles its source type.
// Only v2i32, v4i32 and v8i32 sources are expected.
static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
  MVT SrcVT = Op.getOperand(0).getSimpleValueType();
  SDLoc dl(Op);

  if (SrcVT == MVT::v2i32)
    return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);

  if (SrcVT == MVT::v4i32 || SrcVT == MVT::v8i32) {
    // These source types are not expected to reach here with AVX512.
    assert(!Subtarget.hasAVX512());
    return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
  }

  llvm_unreachable("Custom UINT_TO_FP is not supported!");
}
SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  // Custom lowering entry point for UINT_TO_FP. Vector results go to
  // lowerUINT_TO_FP_vec. Scalars try a series of cheaper strategies first and
  // otherwise store the value to a 64-bit stack slot and convert it with FILD.
  SDValue Src = Op.getOperand(0);
  SDLoc dl(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  if (Op.getSimpleValueType().isVector())
    return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);

  if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
    return Extract;

  MVT SrcVT = Src.getSimpleValueType();
  MVT DstVT = Op.getSimpleValueType();
  const bool SrcIsI32 = SrcVT == MVT::i32;
  const bool SrcIsI64 = SrcVT == MVT::i64;

  if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
      (SrcIsI32 || (SrcIsI64 && Subtarget.is64Bit()))) {
    // Conversions from unsigned i32 to f32/f64 are legal,
    // using VCVTUSI2SS/SD.  Same for i64 in 64-bit mode.
    return Op;
  }

  if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
    return V;

  if (SrcIsI64 && DstVT == MVT::f64 && X86ScalarSSEf64)
    return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
  if (SrcIsI32 && X86ScalarSSEf64)
    return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
  if (Subtarget.is64Bit() && SrcIsI64 && DstVT == MVT::f32)
    return SDValue();

  // Everything below goes through a 64-bit stack buffer and an FILD.
  SDValue Slot = DAG.CreateStackTemporary(MVT::i64);

  if (SrcIsI32) {
    // Store the value in the low word and zero in the word at offset 4, then
    // read the buffer back as an i64.
    SDValue HighWordSlot = DAG.getMemBasePlusOffset(Slot, 4, dl);
    SDValue StoreLow = DAG.getStore(DAG.getEntryNode(), dl, Src, Slot,
                                    MachinePointerInfo());
    SDValue StoreHigh =
        DAG.getStore(StoreLow, dl, DAG.getConstant(0, dl, MVT::i32),
                     HighWordSlot, MachinePointerInfo());
    return BuildFILD(Op, MVT::i64, StoreHigh, Slot, DAG);
  }

  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");

  SDValue ToStore = Src;
  if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
    // Bitcasting to f64 here allows us to do a single 64-bit store from
    // an SSE register, avoiding the store forwarding penalty that would come
    // with two 32-bit stores.
    ToStore = DAG.getBitcast(MVT::f64, ToStore);
  SDValue StoreChain = DAG.getStore(DAG.getEntryNode(), dl, ToStore, Slot,
                                    MachinePointerInfo());

  // For i64 source, we need to add the appropriate power of 2 if the input
  // was negative.  This is the same as the optimization in
  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
  // we must be careful to do the computation in x87 extended precision, not
  // in SSE. (The generic code can't know it's OK to do this, or how to.)
  int SlotFI = cast<FrameIndexSDNode>(Slot)->getIndex();
  MachineMemOperand *FildMMO = DAG.getMachineFunction().getMachineMemOperand(
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SlotFI),
      MachineMemOperand::MOLoad, 8, 8);

  SDVTList FildVTs = DAG.getVTList(MVT::f80, MVT::Other);
  SDValue FildOperands[] = { StoreChain, Slot };
  SDValue SignedResult = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, FildVTs,
                                                 FildOperands, MVT::i64,
                                                 FildMMO);

  // Bit pattern of the f32 correction constant.
  APInt FudgeBits(32, 0x5F800000ULL);

  // Signed "less than zero" on the input tests its sign bit.
  SDValue IsNegative = DAG.getSetCC(
      dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
      Src, DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);

  // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
  SDValue FudgeAddr = DAG.getConstantPool(
      ConstantInt::get(*DAG.getContext(), FudgeBits.zext(64)), PtrVT);

  // Offset 0 selects the correction constant (sign bit set); offset 4
  // selects the zero word.
  SDValue OffZero = DAG.getIntPtrConstant(0, dl);
  SDValue OffFour = DAG.getIntPtrConstant(4, dl);
  SDValue Offset = DAG.getSelect(dl, OffZero.getValueType(), IsNegative,
                                 OffZero, OffFour);
  FudgeAddr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgeAddr, Offset);

  // Load the value out, extending it from f32 to f80.
  // FIXME: Avoid the extend by constructing the right constant pool?
  SDValue Fudge = DAG.getExtLoad(
      ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgeAddr,
      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
      /* Alignment = */ 4);

  // Extend everything to 80 bits to force it to be done on x87.
  // TODO: Are there any fast-math-flags to propagate here?
  SDValue Corrected = DAG.getNode(ISD::FADD, dl, MVT::f80, SignedResult, Fudge);
  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Corrected,
                     DAG.getIntPtrConstant(0, dl));
}
// Lower an FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) node through a
// store-to-memory FP_TO_INT_IN_MEM node followed by a load of the result.
//
// If the source type is not one of f32, f64 or f80 (e.g. f16, which must be
// promoted first, or fp128, which does not use this lowering), an empty
// SDValue() is returned. Otherwise the conversion is assumed to target i16,
// i32 or i64, and the loaded integer result is returned.
SDValue
X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
                                   bool IsSigned) const {
  SDLoc DL(Op);

  EVT DstTy = Op.getValueType();
  EVT TheVT = Op.getOperand(0).getValueType();
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
    // f16 must be promoted before using the lowering in this routine.
    // fp128 does not use this lowering.
    return SDValue();
  }

  // If using FIST to compute an unsigned i64, we'll need some fixup
  // to handle values above the maximum signed i64.  A FIST is always
  // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
  bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;

  if (!IsSigned && DstTy != MVT::i64) {
    // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
    // The low 32 bits of the fist result will have the correct uint32 result.
    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
    DstTy = MVT::i64;
  }

  assert(DstTy.getSimpleVT() <= MVT::i64 &&
         DstTy.getSimpleVT() >= MVT::i16 &&
         "Unknown FP_TO_INT to lower!");

  // We lower FP->int64 into FISTP64 followed by a load from a temporary
  // stack slot.
  MachineFunction &MF = DAG.getMachineFunction();
  unsigned MemSize = DstTy.getStoreSize();
  int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);

  SDValue Chain = DAG.getEntryNode();
  SDValue Value = Op.getOperand(0);
  // i64 value that is either 0 or the 64-bit sign mask; XOR'ed into the
  // result when UnsignedFixup is set.
  SDValue Adjust;

  if (UnsignedFixup) {
    //
    // Conversion to unsigned i64 is implemented with a select,
    // depending on whether the source value fits in the range
    // of a signed i64.  Let Thresh be the FP equivalent of
    // 0x8000000000000000ULL.
    //
    //  Adjust i64 = (Value < Thresh) ? 0 : 0x8000000000000000ULL;
    //  FistSrc    = (Value < Thresh) ? Value : (Value - Thresh);
    //  Fist-to-mem64 FistSrc
    //  XOR the 64-bit result with Adjust, which is equivalent to adding
    //  0 or 0x800...0ULL to it.
    //
    // Being a power of 2, Thresh is exactly representable in all FP formats.
    // For X87 we'd like to use the smallest FP type for this constant, but
    // for DAG type consistency we have to match the FP operand type.

    APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
    LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
    bool LosesInfo = false;
    if (TheVT == MVT::f64)
      // The rounding mode is irrelevant as the conversion should be exact.
      Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
                              &LosesInfo);
    else if (TheVT == MVT::f80)
      Status = Thresh.convert(APFloat::x87DoubleExtended(),
                              APFloat::rmNearestTiesToEven, &LosesInfo);

    assert(Status == APFloat::opOK && !LosesInfo &&
           "FP conversion should have been exact");

    SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);

    // A single "Value < Thresh" comparison drives both selects below.
    SDValue Cmp = DAG.getSetCC(DL,
                               getSetCCResultType(DAG.getDataLayout(),
                                                  *DAG.getContext(), TheVT),
                               Value, ThreshVal, ISD::SETLT);
    Adjust = DAG.getSelect(DL, MVT::i64, Cmp,
                           DAG.getConstant(0, DL, MVT::i64),
                           DAG.getConstant(APInt::getSignMask(64),
                                           DL, MVT::i64));
    SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
    Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
  }

  MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);

  // FIXME This causes a redundant load/store if the SSE-class value is already
  // in memory, such as if it is on the callstack.
  if (isScalarFPTypeInSSEReg(TheVT)) {
    // Store the value to the stack slot and reload it with an FLD node so
    // that the FP_TO_INT_IN_MEM node below takes its input from that node.
    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
    Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
    SDVTList Tys = DAG.getVTList(TheVT, MVT::Other);
    SDValue Ops[] = { Chain, StackSlot };

    unsigned FLDSize = TheVT.getStoreSize();
    assert(FLDSize <= MemSize && "Stack slot not big enough");
    MachineMemOperand *MMO = MF.getMachineMemOperand(
        MPI, MachineMemOperand::MOLoad, FLDSize, FLDSize);
    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
    Chain = Value.getValue(1);
  }

  // Build the FP_TO_INT*_IN_MEM
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MPI, MachineMemOperand::MOStore, MemSize, MemSize);
  SDValue Ops[] = { Chain, Value, StackSlot };
  SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
                                         DAG.getVTList(MVT::Other),
                                         Ops, DstTy, MMO);

  // Load the result in the type the original node asked for; this may be
  // narrower than the DstTy that was stored.
  SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);

  // If we need an unsigned fixup, XOR the result with adjust.
  if (UnsignedFixup)
    Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);

  return Res;
}
// Custom-lower a vector ANY_EXTEND / ZERO_EXTEND.
//
// - A v8i8 input is only handled for a v8i64 result when
//   ExperimentalVectorWideningLegalization is set; otherwise an empty SDValue
//   is returned.
// - If Subtarget.hasInt256(), the node is returned unchanged.
// - Otherwise the lower half is extended with an *_EXTEND_VECTOR_INREG node,
//   the upper half is built with an unpack-high against zero (ZERO_EXTEND) or
//   undef (ANY_EXTEND), and the two halves are concatenated.
static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
                              const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT InVT = In.getSimpleValueType();
  SDLoc dl(Op);
  unsigned Opc = Op.getOpcode();

  assert(VT.isVector() && InVT.isVector() && "Expected vector type");
  assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
         "Unexpected extension opcode");
  // Compare the result against the *input* element count; comparing VT with
  // itself would make this check vacuous.
  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
         "Expected same number of elements");
  assert((VT.getVectorElementType() == MVT::i16 ||
          VT.getVectorElementType() == MVT::i32 ||
          VT.getVectorElementType() == MVT::i64) &&
         "Unexpected element type");
  assert((InVT.getVectorElementType() == MVT::i8 ||
          InVT.getVectorElementType() == MVT::i16 ||
          InVT.getVectorElementType() == MVT::i32) &&
         "Unexpected element type");

  unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);

  // Custom legalize v8i8->v8i64 on CPUs without avx512bw.
  if (InVT == MVT::v8i8) {
    if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64)
      return SDValue();

    // Pad the input to v16i8 with undef and extend the low elements in-reg.
    In = DAG.getNode(ISD::CONCAT_VECTORS, dl,
                     MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
    return DAG.getNode(ExtendInVecOpc, dl, VT, In);
  }

  if (Subtarget.hasInt256())
    return Op;

  // Optimize vectors in AVX mode:
  //
  //   v8i16 -> v8i32
  //   Use vpmovzwd for 4 lower elements  v8i16 -> v4i32.
  //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
  //   Concat upper and lower parts.
  //
  //   v4i32 -> v4i64
  //   Use vpmovzdq for 4 lower elements  v4i32 -> v2i64.
  //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
  //   Concat upper and lower parts.
  //

  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
                                VT.getVectorNumElements() / 2);

  SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);

  // Short-circuit if we can determine that each 128-bit half is the same value.
  // Otherwise, this is difficult to match and optimize.
  if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
    if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
      return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);

  // Interleave the upper input elements with zero (zero-extend) or undef
  // (any-extend), then reinterpret the result as the wide half vector.
  SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
  SDValue Undef = DAG.getUNDEF(InVT);
  bool NeedZero = Opc == ISD::ZERO_EXTEND;
  SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
  OpHi = DAG.getBitcast(HalfVT, OpHi);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
// Extend a v16i1 mask to v16i8 or v16i16 by handling it as two v8i1 halves:
// each half is extended to v8i16 with ExtOpc, the halves are concatenated to
// v16i16, and the result is truncated to VT.
static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
                                   const SDLoc &dl, SelectionDAG &DAG) {
  assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");

  // Pull out the v8i1 subvector starting at element FirstElt.
  auto ExtractHalf = [&](unsigned FirstElt) {
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
                       DAG.getIntPtrConstant(FirstElt, dl));
  };

  SDValue LowHalf = ExtractHalf(0);
  SDValue HighHalf = ExtractHalf(8);
  LowHalf = DAG.getNode(ExtOpc, dl, MVT::v8i16, LowHalf);
  HighHalf = DAG.getNode(ExtOpc, dl, MVT::v8i16, HighHalf);

  SDValue Wide =
      DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, LowHalf, HighHalf);
  return DAG.getNode(ISD::TRUNCATE, dl, VT, Wide);
}
// Lower a ZERO_EXTEND whose input is a vector of i1.
//
// Results with non-i8 elements become a sign extend followed by a logical
// right shift. i8 results are built as a select between splat 1 and splat 0,
// possibly on a wider element type (without BWI) and/or a 512-bit vector
// (without VLX), and then truncated / extracted back to the requested type.
static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  MVT ResVT = Op->getSimpleValueType(0);
  SDValue Mask = Op->getOperand(0);
  MVT MaskVT = Mask.getSimpleValueType();
  assert(MaskVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
  SDLoc DL(Op);
  unsigned EltCount = ResVT.getVectorNumElements();

  // For every element type except i8, sign extend and shift the sign bit
  // down to bit 0. This avoids a constant pool load.
  if (ResVT.getVectorElementType() != MVT::i8) {
    SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, ResVT, Mask);
    return DAG.getNode(ISD::SRL, DL, ResVT, SExt,
                       DAG.getConstant(ResVT.getScalarSizeInBits() - 1, DL,
                                       ResVT));
  }

  // Without BWI the select is done on i32 elements.
  MVT SelectEltVT = ResVT;
  if (!Subtarget.hasBWI()) {
    // If v16i32 is to be avoided, we'll need to split and concatenate.
    if (EltCount == 16 && !Subtarget.canExtendTo512DQ())
      return SplitAndExtendv16i1(ISD::ZERO_EXTEND, ResVT, Mask, DL, DAG);

    SelectEltVT = MVT::getVectorVT(MVT::i32, EltCount);
  }

  // Without VLX the operation is widened to 512 bits.
  MVT WorkVT = SelectEltVT;
  if (!SelectEltVT.is512BitVector() && !Subtarget.hasVLX()) {
    EltCount *= 512 / SelectEltVT.getSizeInBits();
    MaskVT = MVT::getVectorVT(MVT::i1, EltCount);
    // Place the mask in the low elements of an otherwise undefined wide mask.
    Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MaskVT,
                       DAG.getUNDEF(MaskVT), Mask,
                       DAG.getIntPtrConstant(0, DL));
    WorkVT = MVT::getVectorVT(SelectEltVT.getVectorElementType(), EltCount);
  }

  SDValue Ones = DAG.getConstant(1, DL, WorkVT);
  SDValue Zeros = DAG.getConstant(0, DL, WorkVT);
  SDValue Result = DAG.getSelect(DL, WorkVT, Mask, Ones, Zeros);

  // Narrow the elements back to i8 if a wider element type was used.
  if (ResVT != SelectEltVT) {
    WorkVT = MVT::getVectorVT(MVT::i8, EltCount);
    Result = DAG.getNode(ISD::TRUNCATE, DL, WorkVT, Result);
  }

  // Extract the low 128/256 bits if the vector was widened.
  if (WorkVT != ResVT)
    Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Result,
                         DAG.getIntPtrConstant(0, DL));

  return Result;
}
// Custom lowering entry point for ZERO_EXTEND: i1-element inputs go to
// LowerZERO_EXTEND_Mask, every other input goes to LowerAVXExtend.
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  MVT SrcVT = Op.getOperand(0).getSimpleValueType();
  const bool IsMaskSource = SrcVT.getVectorElementType() == MVT::i1;

  if (!IsMaskSource) {
    assert(Subtarget.hasAVX() && "Expected AVX support");
    return LowerAVXExtend(Op, DAG, Subtarget);
  }

  return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
}
/// Recursively truncate vector elements by halving their width with
/// PACKSS/PACKUS nodes.
/// This relies on the input having enough leading sign/zero bits that the
/// PACKSS/PACKUS does not saturate the results.
/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
/// within each 128-bit lane.
/// Returns an empty SDValue when the truncation is not supported here.
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
                                      const SDLoc &DL, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
         "Unexpected PACK opcode");
  assert(DstVT.isVector() && "VT not a vector?");

  // Requires SSE2 but AVX512 has fast vector truncate.
  if (!Subtarget.hasSSE2())
    return SDValue();

  EVT SrcVT = In.getValueType();

  // Recursive calls can arrive here with nothing left to truncate.
  if (SrcVT == DstVT)
    return In;

  // Only truncations from a multiple of 128 bits to a multiple of 64 bits
  // are supported.
  const unsigned DstBits = DstVT.getSizeInBits();
  const unsigned SrcBits = SrcVT.getSizeInBits();
  if ((DstBits % 64) != 0 || (SrcBits % 128) != 0)
    return SDValue();

  const unsigned EltCount = SrcVT.getVectorNumElements();
  if (!isPowerOf2_32(EltCount))
    return SDValue();

  LLVMContext &Ctx = *DAG.getContext();
  assert(DstVT.getVectorNumElements() == EltCount && "Illegal truncation");
  assert(SrcBits > DstBits && "Illegal truncation");

  // Scalar type of the elements after one halving step.
  EVT HalvedEltVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);

  // Pack to the largest type possible:
  // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
  EVT PackInVT = MVT::i16, PackOutVT = MVT::i8;
  if (SrcVT.getScalarSizeInBits() > 16 &&
      (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
    PackInVT = MVT::i32;
    PackOutVT = MVT::i16;
  }

  // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
  if (SrcVT.is128BitVector()) {
    PackInVT = EVT::getVectorVT(Ctx, PackInVT, 128 / PackInVT.getSizeInBits());
    PackOutVT =
        EVT::getVectorVT(Ctx, PackOutVT, 128 / PackOutVT.getSizeInBits());
    In = DAG.getBitcast(PackInVT, In);
    // Pack the input with itself and keep the low 64 bits.
    SDValue Packed = DAG.getNode(Opcode, DL, PackOutVT, In, In);
    Packed = extractSubVector(Packed, 0, DAG, DL, 64);
    return DAG.getBitcast(DstVT, Packed);
  }

  // Split the source into its lower and upper halves.
  const unsigned HalfEltCount = EltCount / 2;
  SDValue LowHalf =
      extractSubVector(In, 0 * HalfEltCount, DAG, DL, SrcBits / 2);
  SDValue HighHalf =
      extractSubVector(In, 1 * HalfEltCount, DAG, DL, SrcBits / 2);

  const unsigned HalfBits = SrcBits / 2;
  PackInVT =
      EVT::getVectorVT(Ctx, PackInVT, HalfBits / PackInVT.getSizeInBits());
  PackOutVT =
      EVT::getVectorVT(Ctx, PackOutVT, HalfBits / PackOutVT.getSizeInBits());

  // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
  if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
    LowHalf = DAG.getBitcast(PackInVT, LowHalf);
    HighHalf = DAG.getBitcast(PackInVT, HighHalf);
    SDValue Packed = DAG.getNode(Opcode, DL, PackOutVT, LowHalf, HighHalf);
    return DAG.getBitcast(DstVT, Packed);
  }

  // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
  // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
  if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
    LowHalf = DAG.getBitcast(PackInVT, LowHalf);
    HighHalf = DAG.getBitcast(PackInVT, HighHalf);
    SDValue Packed = DAG.getNode(Opcode, DL, PackOutVT, LowHalf, HighHalf);

    // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
    // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
    // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
    SmallVector<int, 64> LaneFixMask;
    int Scale = 64 / PackOutVT.getScalarSizeInBits();
    scaleShuffleMask<int>(Scale, ArrayRef<int>({ 0, 2, 1, 3 }), LaneFixMask);
    Packed = DAG.getVectorShuffle(PackOutVT, DL, Packed, Packed, LaneFixMask);

    if (DstVT.is256BitVector())
      return DAG.getBitcast(DstVT, Packed);

    // 512bit -> 128bit needs one more truncation stage.
    EVT HalvedVT = EVT::getVectorVT(Ctx, HalvedEltVT, EltCount);
    Packed = DAG.getBitcast(HalvedVT, Packed);
    return truncateVectorWithPACK(Opcode, DstVT, Packed, DL, DAG, Subtarget);
  }

  // Recursively pack lower/upper subvectors, concat result and pack again.
  assert(SrcBits >= 256 && "Expected 256-bit vector or greater");

  EVT HalvedVT = EVT::getVectorVT(Ctx, HalvedEltVT, HalfEltCount);
  LowHalf =
      truncateVectorWithPACK(Opcode, HalvedVT, LowHalf, DL, DAG, Subtarget);
  HighHalf =
      truncateVectorWithPACK(Opcode, HalvedVT, HighHalf, DL, DAG, Subtarget);

  HalvedVT = EVT::getVectorVT(Ctx, HalvedEltVT, EltCount);
  SDValue Joined =
      DAG.getNode(ISD::CONCAT_VECTORS, DL, HalvedVT, LowHalf, HighHalf);
  return truncateVectorWithPACK(Opcode, DstVT, Joined, DL, DAG, Subtarget);
}
// Lower a TRUNCATE whose result is a vector of i1. The least significant bit
// of every input element is shifted into the sign position (unless the
// element is already known to be all sign bits) and the mask is then produced
// with a vector compare against zero.
static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  SDLoc DL(Op);
  MVT MaskVT = Op.getSimpleValueType();
  SDValue Src = Op.getOperand(0);
  MVT SrcVT = Src.getSimpleValueType();

  assert(MaskVT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");

  // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
  unsigned SignBitIdx = SrcVT.getScalarSizeInBits() - 1;

  if (SrcVT.getScalarSizeInBits() <= 16) {
    if (Subtarget.hasBWI()) {
      // legal, will go to VPMOVB2M, VPMOVW2M
      if (DAG.ComputeNumSignBits(Src) < SrcVT.getScalarSizeInBits()) {
        // We need to shift to get the lsb into sign position.
        // Shift packed bytes not supported natively, bitcast to word
        MVT WordVT = MVT::getVectorVT(MVT::i16, SrcVT.getSizeInBits()/16);
        Src = DAG.getNode(ISD::SHL, DL, WordVT,
                          DAG.getBitcast(WordVT, Src),
                          DAG.getConstant(SignBitIdx, DL, WordVT));
        Src = DAG.getBitcast(SrcVT, Src);
      }
      // "0 > x" is true exactly for elements whose sign bit is set.
      return DAG.getSetCC(DL, MaskVT, DAG.getConstant(0, DL, SrcVT),
                          Src, ISD::SETGT);
    }

    // Use TESTD/Q, extended vector to packed dword/qword.
    assert((SrcVT.is256BitVector() || SrcVT.is128BitVector()) &&
           "Unexpected vector type.");
    unsigned EltCount = SrcVT.getVectorNumElements();
    assert((EltCount == 8 || EltCount == 16) &&
           "Unexpected number of elements");

    // We need to change to a wider element type that we have support for.
    // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
    // For 16 element vectors we extend to v16i32 unless we are explicitly
    // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
    // we need to split into two 8 element vectors which we can extend to v8i32,
    // truncate and concat the results. There's an additional complication if
    // the original type is v16i8. In that case we can't split the v16i8 so
    // first we pre-extend it to v16i16 which we can split to v8i16, then extend
    // to v8i32, truncate that to v8i1 and concat the two halves.
    if (EltCount == 16 && !Subtarget.canExtendTo512DQ()) {
      if (SrcVT == MVT::v16i8) {
        // First we need to sign extend up to 256-bits so we can split that.
        SrcVT = MVT::v16i16;
        Src = DAG.getNode(ISD::SIGN_EXTEND, DL, SrcVT, Src);
      }
      SDValue LowHalf = extract128BitVector(Src, 0, DAG, DL);
      SDValue HighHalf = extract128BitVector(Src, 8, DAG, DL);
      // We're split now, just emit two truncates and a concat. The two
      // truncates will trigger legalization to come back to this function.
      LowHalf = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, LowHalf);
      HighHalf = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, HighHalf);
      return DAG.getNode(ISD::CONCAT_VECTORS, DL, MaskVT, LowHalf, HighHalf);
    }

    // We either have 8 elements or we're allowed to use 512-bit vectors.
    // If we have VLX, we want to use the narrowest vector that can get the
    // job done so we use vXi32.
    MVT WideEltVT =
        Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/EltCount);
    MVT WideVT = MVT::getVectorVT(WideEltVT, EltCount);
    Src = DAG.getNode(ISD::SIGN_EXTEND, DL, WideVT, Src);
    SrcVT = WideVT;
    SignBitIdx = SrcVT.getScalarSizeInBits() - 1;
  }

  if (DAG.ComputeNumSignBits(Src) < SrcVT.getScalarSizeInBits()) {
    // We need to shift to get the lsb into sign position.
    Src = DAG.getNode(ISD::SHL, DL, SrcVT, Src,
                      DAG.getConstant(SignBitIdx, DL, SrcVT));
  }

  // Without DQI, compare the shifted value against zero for inequality.
  if (!Subtarget.hasDQI())
    return DAG.getSetCC(DL, MaskVT, Src, DAG.getConstant(0, DL, SrcVT),
                        ISD::SETNE);

  // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
  return DAG.getSetCC(DL, MaskVT, DAG.getConstant(0, DL, SrcVT), Src,
                      ISD::SETGT);
}
/// Custom lowering for a vector TRUNCATE whose source and result have the same
/// number of elements.
///
/// Returns an empty SDValue when the source type is not legal, forwards i1
/// results to LowerTruncateVecI1, returns \p Op unchanged when the AVX-512
/// check below accepts the node as is, and otherwise builds a PACKUS/PACKSS
/// or shuffle based sequence for the remaining 256-bit to 128-bit cases.
SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
MVT InVT = In.getSimpleValueType();
unsigned InNumEltBits = InVT.getScalarSizeInBits();
assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Invalid TRUNCATE operation");
// If called by the legalizer just return.
if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT))
return SDValue();
// Truncation to a mask (i1 element) vector has its own lowering routine.
if (VT.getVectorElementType() == MVT::i1)
return LowerTruncateVecI1(Op, DAG, Subtarget);
// vpmovqb/w/d, vpmovdb/w, vpmovwb
if (Subtarget.hasAVX512()) {
// word to byte only under BWI. Otherwise we have to promote to v16i32
// and then truncate that. But we should only do that if we haven't been
// asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
// handled by isel patterns.
if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
Subtarget.canExtendTo512DQ())
return Op;
}
// Number of result bits per element that a PACK-based truncate has to
// preserve: capped at 16 for the signed form, and at 8 for the unsigned form
// when SSE4.1 is not available.
unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
// Truncate with PACKUS if we are truncating a vector with leading zero bits
// that extend all the way to the packed/truncated value.
// Pre-SSE41 we can only use PACKUSWB.
KnownBits Known = DAG.computeKnownBits(In);
if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
if (SDValue V =
truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
return V;
// Truncate with PACKSS if we are truncating a vector with sign-bits that
// extend all the way to the packed/truncated value.
if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
if (SDValue V =
truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
return V;
if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
// On AVX2, v4i64 -> v4i32 becomes VPERMD.
if (Subtarget.hasInt256()) {
// Gather the even i32 elements into the low four lanes; the upper four
// lanes are discarded by the subvector extract below.
static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
In = DAG.getBitcast(MVT::v8i32, In);
In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
DAG.getIntPtrConstant(0, DL));
}
// Without 256-bit integer support: split into two v2i64 halves, view each
// as v4i32 and pick the even elements of the pair with a single shuffle.
SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
DAG.getIntPtrConstant(0, DL));
SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
DAG.getIntPtrConstant(2, DL));
OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
static const int ShufMask[] = {0, 2, 4, 6};
return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
}
if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
// On AVX2, v8i32 -> v8i16 becomes PSHUFB.
if (Subtarget.hasInt256()) {
In = DAG.getBitcast(MVT::v32i8, In);
// The PSHUFB mask:
// Within each 16-byte half, keep bytes 0-1 of every 4-byte element.
static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
-1, -1, -1, -1, -1, -1, -1, -1,
16, 17, 20, 21, 24, 25, 28, 29,
-1, -1, -1, -1, -1, -1, -1, -1 };
In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
In = DAG.getBitcast(MVT::v4i64, In);
// Bring the two populated 64-bit chunks (elements 0 and 2) together in the
// low 128 bits, which is all the extract below keeps.
static const int ShufMask2[] = {0, 2, -1, -1};
In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
DAG.getIntPtrConstant(0, DL));
return DAG.getBitcast(VT, In);
}
// Without 256-bit integer support: shuffle each 128-bit half separately,
// then combine the low 64 bits of both halves.
SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
DAG.getIntPtrConstant(0, DL));
SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
DAG.getIntPtrConstant(4, DL));
OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
// The PSHUFB mask:
static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
-1, -1, -1, -1, -1, -1, -1, -1};
OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
// The MOVLHPS Mask:
static const int ShufMask2[] = {0, 1, 4, 5};
SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
return DAG.getBitcast(MVT::v8i16, res);
}
if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
// Use an AND to zero upper bits for PACKUS.
In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
DAG.getIntPtrConstant(0, DL));
SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
DAG.getIntPtrConstant(8, DL));
return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
}
// Handle truncation of V256 to V128 using shuffles.
assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
assert(Subtarget.hasAVX() && "256-bit vector without AVX!");
unsigned NumElems = VT.getVectorNumElements();
// View the source as twice as many elements of the result element type.
MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
SmallVector<int, 16> MaskVec(NumElems * 2, -1);
// Prepare truncation shuffle mask
// (even elements go to the low half; the rest of the mask stays -1).
for (unsigned i = 0; i != NumElems; ++i)
MaskVec[i] = i * 2;
In = DAG.getBitcast(NVT, In);
SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
DAG.getIntPtrConstant(0, DL));
}
/// Custom lowering for FP_TO_SINT / FP_TO_UINT.
///
/// Vector results: v2f64 -> v2i1 is converted through a wider integer vector
/// and truncated to a mask; v2f32 -> v2i64 is converted from a v4f32 whose
/// upper half is undef; any other vector type yields an empty SDValue.
/// Scalar results: returns \p Op unchanged when the node can be selected as
/// is, an empty SDValue to request the default expansion, or the sequence
/// produced by FP_TO_INTHelper.
SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
  SDLoc Loc(Op);
  const bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
  MVT DstVT = Op.getSimpleValueType();
  SDValue Src = Op.getOperand(0);
  MVT SrcVT = Src.getSimpleValueType();

  if (DstVT.isVector()) {
    if (DstVT == MVT::v2i1 && SrcVT == MVT::v2f64) {
      MVT CvtVT;
      MVT MaskVT;
      unsigned CvtOpc;
      if (IsSigned || Subtarget.hasVLX()) {
        CvtVT = MVT::v4i32;
        MaskVT = MVT::v4i1;
        CvtOpc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
      } else {
        // Unsigned without VLX: place the source in the low part of an
        // otherwise undef v8f64 and convert the whole 512-bit vector.
        CvtVT = MVT::v8i32;
        MaskVT = MVT::v8i1;
        CvtOpc = ISD::FP_TO_UINT;
        Src = DAG.getNode(ISD::INSERT_SUBVECTOR, Loc, MVT::v8f64,
                          DAG.getUNDEF(MVT::v8f64), Src,
                          DAG.getIntPtrConstant(0, Loc));
      }

      SDValue Converted = DAG.getNode(CvtOpc, Loc, CvtVT, Src);
      SDValue Mask = DAG.getNode(ISD::TRUNCATE, Loc, MaskVT, Converted);
      // Only the two low mask elements belong to the original result.
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, Loc, MVT::v2i1, Mask,
                         DAG.getIntPtrConstant(0, Loc));
    }

    assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
    if (DstVT != MVT::v2i64 || SrcVT != MVT::v2f32)
      return SDValue();

    // Pad the v2f32 source with an undef upper half before converting.
    SDValue Padded = DAG.getNode(ISD::CONCAT_VECTORS, Loc, MVT::v4f32, Src,
                                 DAG.getUNDEF(MVT::v2f32));
    unsigned CvtOpc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
    return DAG.getNode(CvtOpc, Loc, DstVT, Padded);
  }

  assert(!DstVT.isVector());

  const bool SrcInSSEReg = isScalarFPTypeInSSEReg(SrcVT);

  if (!IsSigned && Subtarget.hasAVX512()) {
    // Unsigned conversions from an SSE-register f32/f64 should be legal.
    if (SrcInSSEReg)
      return Op;
    // Otherwise let the i64 case use the default expansion.
    if (DstVT == MVT::i64)
      return SDValue();
  }

  // An i16 result from an SSE register is computed as i32 and truncated.
  if (DstVT == MVT::i16 && SrcInSSEReg) {
    assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
    SDValue Wide = DAG.getNode(ISD::FP_TO_SINT, Loc, MVT::i32, Src);
    return DAG.getNode(ISD::TRUNCATE, Loc, DstVT, Wide);
  }

  // A signed conversion (FP_TO_SINT) from an SSE register needs no more work.
  if (IsSigned && SrcInSSEReg)
    return Op;

  // Everything else goes through the x87 helper.
  if (SDValue X87Result = FP_TO_INTHelper(Op, DAG, IsSigned))
    return X87Result;

  llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
}
/// Custom lowering for FP_EXTEND from v2f32: the source is widened to v4f32
/// with an undef upper half and extended with X86ISD::VFPEXT.
static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
  SDLoc Loc(Op);
  MVT ResultVT = Op.getSimpleValueType();
  SDValue Narrow = Op.getOperand(0);
  MVT NarrowVT = Narrow.getSimpleValueType();
  assert(NarrowVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");

  SDValue Padded = DAG.getNode(ISD::CONCAT_VECTORS, Loc, MVT::v4f32, Narrow,
                               DAG.getUNDEF(NarrowVT));
  return DAG.getNode(X86ISD::VFPEXT, Loc, ResultVT, Padded);
}
/// Decide whether a horizontal vector math op should be generated.
///
/// Horizontal ops may be slower than normal math with shuffles, so they are
/// only used when the op has two distinct sources, when the function is being
/// optimized for size, or when the subtarget reports fast horizontal ops.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  if (!IsSingleSource)
    return true;
  if (DAG.getMachineFunction().getFunction().hasOptSize())
    return true;
  return Subtarget.hasFastHorizontalOps();
}
/// Depending on uarch and/or optimizing for size, we might prefer to use a
/// vector operation in place of the typical scalar operation.
///
/// Matches a scalar ADD/SUB/FADD/FSUB whose operands are constant-index
/// extracts of adjacent elements (2k, 2k+1) of one vector and rewrites it as
/// an extract from a 128-bit horizontal op. Returns \p Op unchanged when the
/// pattern does not match, when the required SSE3/SSSE3 feature is missing, or
/// when shouldUseHorizontalOp declines.
static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  // If both operands have other uses, this is probably not profitable.
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  if (!LHS.hasOneUse() && !RHS.hasOneUse())
    return Op;

  // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
  bool IsFP = Op.getSimpleValueType().isFloatingPoint();
  if (IsFP && !Subtarget.hasSSE3())
    return Op;
  if (!IsFP && !Subtarget.hasSSSE3())
    return Op;

  // Extract from a common vector.
  if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      LHS.getOperand(0) != RHS.getOperand(0) ||
      !isa<ConstantSDNode>(LHS.getOperand(1)) ||
      !isa<ConstantSDNode>(RHS.getOperand(1)) ||
      !shouldUseHorizontalOp(true, DAG, Subtarget))
    return Op;

  // Allow commuted 'hadd' ops.
  // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
  unsigned HOpcode;
  switch (Op.getOpcode()) {
    case ISD::ADD: HOpcode = X86ISD::HADD; break;
    case ISD::SUB: HOpcode = X86ISD::HSUB; break;
    case ISD::FADD: HOpcode = X86ISD::FHADD; break;
    case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
    default:
      llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
  }
  unsigned LExtIndex = LHS.getConstantOperandVal(1);
  unsigned RExtIndex = RHS.getConstantOperandVal(1);
  // Addition is commutative, so (odd, even) can be treated as (even, odd).
  if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
      (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
    std::swap(LExtIndex, RExtIndex);

  // The pair must be (2k, 2k + 1).
  if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
    return Op;

  SDValue X = LHS.getOperand(0);
  EVT VecVT = X.getValueType();
  unsigned BitWidth = VecVT.getSizeInBits();
  // Validate the width before it is used as a divisor: a width below 128 bits
  // would make NumLanes zero and the division below undefined.
  assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
         "Not expecting illegal vector widths here");
  unsigned NumLanes = BitWidth / 128;
  unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;

  // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
  // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
  SDLoc DL(Op);
  if (BitWidth == 256 || BitWidth == 512) {
    unsigned LaneIdx = LExtIndex / NumEltsPerLane;
    X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
    LExtIndex %= NumEltsPerLane;
  }

  // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
  // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
  // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
  // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
  SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
                     DAG.getIntPtrConstant(LExtIndex / 2, DL));
}
/// Scalar f32/f64 FADD/FSUB entry point: checks the type and forwards to
/// lowerAddSubToHorizontalOp, which may replace the scalar operation with a
/// vector one depending on uarch and/or optimizing for size.
static SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG,
                             const X86Subtarget &Subtarget) {
  const EVT OpVT = Op.getValueType();
  (void)OpVT; // Only read by the assertion below.
  assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
         "Only expecting float/double");
  return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
}
/// Shared lowering for FABS and FNEG; the two differ only in the constant mask
/// and the logic op applied. FNEG additionally folds FNEG(FABS(x)) into a
/// single OR with the sign mask.
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
  assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
         "Wrong opcode for lowering FABS or FNEG.");

  const bool IsFABS = Op.getOpcode() == ISD::FABS;

  // An FABS that feeds an FNEG is left alone so the pair can be folded into an
  // FNABS when the FNEG is lowered. It is lowered afterwards if still in use.
  if (IsFABS) {
    for (SDNode *Consumer : Op->uses()) {
      if (Consumer->getOpcode() == ISD::FNEG)
        return Op;
    }
  }

  SDLoc Loc(Op);
  MVT VT = Op.getSimpleValueType();
  assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
          VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
          VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
         "Unexpected type in LowerFABSorFNEG");

  // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
  // decide if we should generate a 16-byte constant mask when we only need 4 or
  // 8 bytes for the scalar case.

  // Vectors and f128 are operated on directly. Scalar f32/f64 have no bitwise
  // logical SSE/AVX instruction, so they go through a 16-byte vector type;
  // the 16-byte mask load can then be folded into the logic op.
  const bool OperatesInPlace = VT.isVector() || VT == MVT::f128;
  MVT LogicVT = VT;
  if (!OperatesInPlace)
    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;

  // FABS clears the sign bit (mask 0x7f...); FNEG touches only the sign bit
  // (mask 0x80...).
  const unsigned EltBits = VT.getScalarSizeInBits();
  APInt MaskBits = IsFABS ? APInt::getSignedMaxValue(EltBits)
                          : APInt::getSignMask(EltBits);
  const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
  SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskBits), Loc, LogicVT);

  // Choose the logic op, looking through an FABS operand in the FNEG case.
  SDValue Operand = Op.getOperand(0);
  unsigned LogicOp;
  if (IsFABS) {
    LogicOp = X86ISD::FAND;
  } else if (Operand.getOpcode() == ISD::FABS) {
    LogicOp = X86ISD::FOR;
    Operand = Operand.getOperand(0);
  } else {
    LogicOp = X86ISD::FXOR;
  }

  if (OperatesInPlace)
    return DAG.getNode(LogicOp, Loc, LogicVT, Operand, Mask);

  // Scalar case: move into element 0 of the vector type, apply the logic op,
  // and extract element 0 again.
  SDValue AsVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, Loc, LogicVT, Operand);
  SDValue Masked = DAG.getNode(LogicOp, Loc, LogicVT, AsVector, Mask);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Loc, VT, Masked,
                     DAG.getIntPtrConstant(0, Loc));
}
/// Lower FCOPYSIGN as (Mag & ~SignMask) | (Sign & SignMask), using the X86 FP
/// logic nodes. Scalar f32/f64 are processed inside a 16-byte vector and
/// extracted afterwards.
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
  SDLoc Loc(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue Mag = Op.getOperand(0);
  SDValue Sign = Op.getOperand(1);

  // Bring the sign operand to the result type: extend it when it is narrower,
  // round it when it is wider.
  MVT SignVT = Sign.getSimpleValueType();
  if (SignVT.bitsLT(VT))
    Sign = DAG.getNode(ISD::FP_EXTEND, Loc, VT, Sign);
  else if (SignVT.bitsGT(VT))
    Sign = DAG.getNode(ISD::FP_ROUND, Loc, VT, Sign,
                       DAG.getIntPtrConstant(1, Loc));

  // At this point the operands and the result should have the same
  // type, and that won't be f80 since that is not custom lowered.
  assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
          VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
          VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
         "Unexpected type in LowerFCOPYSIGN");

  const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);

  // SSE has no scalar FP logic instructions, so scalar f32/f64 are handled as
  // 16-byte vectors.
  // TODO: This isn't necessary. If we used scalar types, we might avoid some
  // unnecessary splats, but we might miss load folding opportunities. Should
  // this decision be based on OptimizeForSize?
  const bool ViaVector = !VT.isVector() && VT != MVT::f128;
  MVT LogicVT = VT;
  if (ViaVector)
    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;

  // The mask constants are automatically splatted for vector types.
  const unsigned EltBits = VT.getScalarSizeInBits();
  SDValue SignMask = DAG.getConstantFP(
      APFloat(Sem, APInt::getSignMask(EltBits)), Loc, LogicVT);
  SDValue MagMask = DAG.getConstantFP(
      APFloat(Sem, APInt::getSignedMaxValue(EltBits)), Loc, LogicVT);

  // Isolate the sign bit of the sign operand.
  if (ViaVector)
    Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, Loc, LogicVT, Sign);
  SDValue SignBit = DAG.getNode(X86ISD::FAND, Loc, LogicVT, Sign, SignMask);

  // Strip the sign bit from the magnitude operand. A constant (or constant
  // splat) magnitude is folded here directly.
  // TODO: If we had general constant folding for FP logic ops, this check
  // wouldn't be necessary.
  SDValue MagBits;
  ConstantFPSDNode *MagConst = isConstOrConstSplatFP(Mag);
  if (MagConst) {
    APFloat Abs = MagConst->getValueAPF();
    Abs.clearSign();
    MagBits = DAG.getConstantFP(Abs, Loc, LogicVT);
  } else {
    if (ViaVector)
      Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, Loc, LogicVT, Mag);
    MagBits = DAG.getNode(X86ISD::FAND, Loc, LogicVT, Mag, MagMask);
  }

  // Combine magnitude and sign.
  SDValue Combined = DAG.getNode(X86ISD::FOR, Loc, LogicVT, MagBits, SignBit);
  if (!ViaVector)
    return Combined;
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Loc, VT, Combined,
                     DAG.getIntPtrConstant(0, Loc));
}
/// Lower ISD::FGETSIGN of an f32/f64 to (AND (X86ISD::MOVMSK ...) 1): the
/// scalar is placed in a vector, MOVMSK produces an i32, which is resized to
/// the result type and masked down to bit 0.
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
  SDLoc Loc(Op);
  SDValue Src = Op.getOperand(0);
  MVT ResultVT = Op.getSimpleValueType();
  MVT SrcVT = Src.getSimpleValueType();
  assert((SrcVT == MVT::f32 || SrcVT == MVT::f64) &&
         "Unexpected type for FGETSIGN");

  MVT VecVT = MVT::v2f64;
  if (SrcVT == MVT::f32)
    VecVT = MVT::v4f32;

  SDValue AsVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, Loc, VecVT, Src);
  SDValue SignMask = DAG.getNode(X86ISD::MOVMSK, Loc, MVT::i32, AsVector);
  SDValue Resized = DAG.getZExtOrTrunc(SignMask, Loc, ResultVT);
  return DAG.getNode(ISD::AND, Loc, ResultVT, Resized,
                     DAG.getConstant(1, Loc, ResultVT));
}
/// Build an i8 X86ISD::SETCC node that tests condition \p Cond against the
/// flags value \p EFLAGS.
static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
                        SelectionDAG &DAG) {
  SDValue CondOperand = DAG.getConstant(Cond, dl, MVT::i8);
  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CondOperand, EFLAGS);
}
/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
/// style scalarized (associative) reduction patterns.
///
/// Walks the tree of \p BinOp nodes rooted at \p Op. Every leaf must be an
/// EXTRACT_VECTOR_ELT with a constant index, all source vectors must share one
/// type, and every element of every source vector must be extracted exactly
/// once. Each distinct source vector is appended to \p SrcOps in the order it
/// is first seen.
///
/// \returns true when the whole tree matches. On a false return \p SrcOps may
/// already contain the sources discovered before the mismatch.
static bool matchBitOpReduction(SDValue Op, ISD::NodeType BinOp,
                                SmallVectorImpl<SDValue> &SrcOps) {
  // Recognize a special case where a vector is casted into wide integer to
  // test all 0s.
  assert(Op.getOpcode() == unsigned(BinOp) &&
         "Unexpected bit reduction opcode");

  SmallVector<SDValue, 8> Opnds;
  // For every source vector: which of its elements have been seen so far.
  DenseMap<SDValue, APInt> SrcOpMap;

  Opnds.push_back(Op.getOperand(0));
  Opnds.push_back(Op.getOperand(1));

  // BFS over the operand tree. Opnds grows while it is being walked, so the
  // bound is re-read on every iteration.
  for (unsigned Slot = 0; Slot < Opnds.size(); ++Slot) {
    // Take the entry by value: the push_back calls below may reallocate
    // Opnds, which would invalidate any iterator or reference into it.
    SDValue Cur = Opnds[Slot];

    // Inner BinOp node: queue both operands.
    if (Cur.getOpcode() == unsigned(BinOp)) {
      Opnds.push_back(Cur.getOperand(0));
      Opnds.push_back(Cur.getOperand(1));
      continue;
    }

    // Quit if a non-EXTRACT_VECTOR_ELT
    if (Cur.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return false;

    // Quit if without a constant index.
    SDValue Idx = Cur.getOperand(1);
    if (!isa<ConstantSDNode>(Idx))
      return false;

    SDValue Src = Cur.getOperand(0);
    DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
    if (M == SrcOpMap.end()) {
      EVT VT = Src.getValueType();
      // Quit if not the same type.
      if (SrcOpMap.begin() != SrcOpMap.end() &&
          VT != SrcOpMap.begin()->first.getValueType())
        return false;
      unsigned NumElts = VT.getVectorNumElements();
      APInt EltCount = APInt::getNullValue(NumElts);
      M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
      SrcOps.push_back(Src);
    }

    // Quit if the index does not name an element of the source vector; it
    // cannot be tracked in the per-source bit set.
    uint64_t CIdx = cast<ConstantSDNode>(Idx)->getZExtValue();
    if (CIdx >= M->second.getBitWidth())
      return false;

    // Quit if element already used.
    if (M->second[CIdx])
      return false;
    M->second.setBit(CIdx);
  }

  // Quit if not all elements are used.
  for (const auto &Entry : SrcOpMap) {
    if (!Entry.second.isAllOnesValue())
      return false;
  }
  return true;
}
// Check whether an OR'd tree is PTEST-able.
//
// When Op is a single-use OR reduction over every element of one or more
// 128/256-bit vectors (as recognized by matchBitOpReduction) and SSE4.1 is
// available, the source vectors are OR'd together and tested with a single
// PTEST. X86CC receives COND_E for SETEQ and COND_NE otherwise. Returns an
// empty SDValue when the pattern does not apply.
static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
                                      const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG, SDValue &X86CC) {
  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");

  if (!Subtarget.hasSSE41())
    return SDValue();
  if (!Op->hasOneUse())
    return SDValue();

  SmallVector<SDValue, 8> Vecs;
  if (!matchBitOpReduction(Op, ISD::OR, Vecs))
    return SDValue();

  // Quit if not 128/256-bit vector.
  EVT SrcVT = Vecs[0].getValueType();
  const bool Is128 = SrcVT.is128BitVector();
  if (!Is128 && !SrcVT.is256BitVector())
    return SDValue();

  SDLoc DL(Op);
  MVT TestVT = Is128 ? MVT::v2i64 : MVT::v4i64;

  // Cast all vectors into TestVT for PTEST.
  for (SDValue &Vec : Vecs)
    Vec = DAG.getBitcast(TestVT, Vec);

  // OR the vectors pairwise, appending each partial result, until a single
  // unconsumed entry (the OR of everything) remains at the back.
  unsigned Next = 0;
  while (Vecs.size() - Next > 1) {
    SDValue A = Vecs[Next];
    SDValue B = Vecs[Next + 1];
    Vecs.push_back(DAG.getNode(ISD::OR, DL, TestVT, A, B));
    Next += 2;
  }

  X86::CondCode Cond = (CC == ISD::SETEQ) ? X86::COND_E : X86::COND_NE;
  X86CC = DAG.getConstant(Cond, DL, MVT::i8);

  SDValue Reduced = Vecs.back();
  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, Reduced, Reduced);
}
/// Return true if \c Op has a use that doesn't just read flags.
///
/// A use counts as flags-only when the user is a BRCOND, a SETCC, or a SELECT
/// that takes the value as operand 0. A TRUNCATE with a single use is looked
/// through, and its user is classified instead.
static bool hasNonFlagsUse(SDValue Op) {
  for (SDNode::use_iterator It = Op->use_begin(), End = Op->use_end();
       It != End; ++It) {
    SDNode *Consumer = *It;
    unsigned OperandNo = It.getOperandNo();

    // Look past a single-use truncate.
    if (Consumer->getOpcode() == ISD::TRUNCATE && Consumer->hasOneUse()) {
      SDNode::use_iterator Inner = Consumer->use_begin();
      OperandNo = Inner.getOperandNo();
      Consumer = *Inner;
    }

    const unsigned Opc = Consumer->getOpcode();
    const bool FlagsOnly = Opc == ISD::BRCOND || Opc == ISD::SETCC ||
                           (Opc == ISD::SELECT && OperandNo == 0);
    if (!FlagsOnly)
      return true;
  }
  return false;
}
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
///
/// \p X86CC is the condition the resulting flags will be tested with; it
/// decides whether CF/OF must be valid. The result is either a new
/// X86ISD::CMP of \p Op against zero, or the flags result (value #1) of an
/// X86 ALU node that computes \p Op. In the latter case, when \p Op is a
/// generic ADD/SUB/AND/OR/XOR, all uses of its value are redirected to the new
/// flag-producing node. \p Subtarget is not referenced in this body.
static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
SelectionDAG &DAG, const X86Subtarget &Subtarget) {
// CF and OF aren't always set the way we want. Determine which
// of these we need.
bool NeedCF = false;
bool NeedOF = false;
switch (X86CC) {
default: break;
case X86::COND_A: case X86::COND_AE:
case X86::COND_B: case X86::COND_BE:
NeedCF = true;
break;
case X86::COND_G: case X86::COND_GE:
case X86::COND_L: case X86::COND_LE:
case X86::COND_O: case X86::COND_NO: {
// Check if we really need to set the
// Overflow flag. If NoSignedWrap is present
// that is not actually needed.
switch (Op->getOpcode()) {
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
case ISD::SHL:
if (Op.getNode()->getFlags().hasNoSignedWrap())
break;
LLVM_FALLTHROUGH;
default:
NeedOF = true;
break;
}
break;
}
}
// See if we can use the EFLAGS value from the operand instead of
// doing a separate TEST. TEST always sets OF and CF to 0, so unless
// we prove that the arithmetic won't overflow, we can't use OF or CF.
if (Op.getResNo() != 0 || NeedOF || NeedCF) {
// Emit a CMP with 0, which is the TEST pattern.
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, Op.getValueType()));
}
// Opcode stays 0 unless an X86 flag-setting ALU opcode is chosen below;
// NumOperands is the number of operands copied from Op to the new node.
unsigned Opcode = 0;
unsigned NumOperands = 0;
SDValue ArithOp = Op;
// NOTICE: In the code below we use ArithOp to hold the arithmetic operation
// which may be the result of a CAST. We use the variable 'Op', which is the
// non-casted variable when we check for possible users.
switch (ArithOp.getOpcode()) {
case ISD::AND:
// If the primary 'and' result isn't used, don't bother using X86ISD::AND,
// because a TEST instruction will be better.
if (!hasNonFlagsUse(Op))
break;
LLVM_FALLTHROUGH;
case ISD::ADD:
case ISD::SUB:
case ISD::OR:
case ISD::XOR:
// Transform to an x86-specific ALU node with flags if there is a chance of
// using an RMW op or only the flags are used. Otherwise, leave
// the node alone and emit a 'test' instruction.
// Any user other than CopyToReg, SETCC or STORE jumps to default_case,
// which leaves Opcode at 0.
for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
UE = Op.getNode()->use_end(); UI != UE; ++UI)
if (UI->getOpcode() != ISD::CopyToReg &&
UI->getOpcode() != ISD::SETCC &&
UI->getOpcode() != ISD::STORE)
goto default_case;
// Otherwise use a regular EFLAGS-setting instruction.
switch (ArithOp.getOpcode()) {
default: llvm_unreachable("unexpected operator!");
case ISD::ADD: Opcode = X86ISD::ADD; break;
case ISD::SUB: Opcode = X86ISD::SUB; break;
case ISD::XOR: Opcode = X86ISD::XOR; break;
case ISD::AND: Opcode = X86ISD::AND; break;
case ISD::OR: Opcode = X86ISD::OR; break;
}
NumOperands = 2;
break;
case X86ISD::ADD:
case X86ISD::SUB:
case X86ISD::OR:
case X86ISD::XOR:
case X86ISD::AND:
// For these X86 nodes, hand back result #1 of the existing node.
return SDValue(Op.getNode(), 1);
default:
default_case:
break;
}
if (Opcode == 0) {
// Emit a CMP with 0, which is the TEST pattern.
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, Op.getValueType()));
}
// Rebuild the operation as an X86 node with two results (value, i32 flags),
// redirect users of the old value to it, and return the flags result.
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
return SDValue(New.getNode(), 1);
}
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent.
///
/// A comparison against a null constant is delegated to EmitTest, and
/// floating-point operands produce an X86ISD::CMP. Integer operands produce
/// the flags result of an X86ISD::SUB, after an optional promotion of i16
/// compares against a constant to i32.
SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
                                   const SDLoc &dl, SelectionDAG &DAG) const {
  if (isNullConstant(Op1))
    return EmitTest(Op0, X86CC, dl, DAG, Subtarget);

  EVT CmpVT = Op0.getValueType();

  if (CmpVT.isFloatingPoint())
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);

  assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
          CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");

  // Only promote the compare up to I32 if it is a 16 bit operation
  // with an immediate. 16 bit immediates are to be avoided.
  const bool MayPromoteI16 =
      CmpVT == MVT::i16 && !Subtarget.isAtom() &&
      !DAG.getMachineFunction().getFunction().hasMinSize();
  if (MayPromoteI16) {
    // True for a constant that does not fit in a signed 8-bit immediate.
    auto NeedsWideImm = [](SDValue V) {
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
      return C && !C->getAPIntValue().isSignedIntN(8);
    };

    if (NeedsWideImm(Op0) || NeedsWideImm(Op1)) {
      unsigned ExtendOp =
          isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;

      if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
        // For equality comparisons try to use SIGN_EXTEND if the input was
        // truncated from something with enough sign bits. Only the first
        // operand that is a TRUNCATE (Op0 before Op1) is inspected.
        SDValue Trunc;
        if (Op0.getOpcode() == ISD::TRUNCATE)
          Trunc = Op0;
        else if (Op1.getOpcode() == ISD::TRUNCATE)
          Trunc = Op1;

        if (Trunc.getNode()) {
          SDValue Wide = Trunc.getOperand(0);
          unsigned EffBits = Wide.getScalarValueSizeInBits() -
                             DAG.ComputeNumSignBits(Wide) + 1;
          if (EffBits <= 16)
            ExtendOp = ISD::SIGN_EXTEND;
        }
      }

      CmpVT = MVT::i32;
      Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
      Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
    }
  }

  // Use SUB instead of CMP to enable CSE between SUB and CMP.
  SDVTList ResultVTs = DAG.getVTList(CmpVT, MVT::i32);
  SDValue Difference = DAG.getNode(X86ISD::SUB, dl, ResultVTs, Op0, Op1);
  return Difference.getValue(1);
}
/// Convert a comparison if required by the subtarget.
///
/// Only a floating-point X86ISD::CMP on a subtarget without CMOV is changed;
/// any other \p Cmp is returned as is. The converted form moves the result
/// of the comparison from FPSW into EFLAGS.
SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
                                                 SelectionDAG &DAG) const {
  // If the subtarget does not support the FUCOMI instruction, floating-point
  // comparisons have to be converted.
  if (Subtarget.hasCMov())
    return Cmp;
  if (Cmp.getOpcode() != X86ISD::CMP)
    return Cmp;
  if (!Cmp.getOperand(0).getValueType().isFloatingPoint() ||
      !Cmp.getOperand(1).getValueType().isFloatingPoint())
    return Cmp;

  // The instruction selector will select an FUCOM instruction instead of
  // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
  // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
  // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
  SDLoc Loc(Cmp);
  SDValue CmpAsI16 = DAG.getNode(ISD::TRUNCATE, Loc, MVT::i16, Cmp);
  SDValue StatusWord =
      DAG.getNode(X86ISD::FNSTSW16r, Loc, MVT::i16, CmpAsI16);
  SDValue HighByte = DAG.getNode(ISD::SRL, Loc, MVT::i16, StatusWord,
                                 DAG.getConstant(8, Loc, MVT::i8));
  SDValue HighByteI8 = DAG.getNode(ISD::TRUNCATE, Loc, MVT::i8, HighByte);

  // Some 64-bit targets lack SAHF support, but they do support FCOMI.
  assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
  return DAG.getNode(X86ISD::SAHF, Loc, MVT::i32, HighByteI8);
}
/// Check if replacement of SQRT with RSQRT should be disabled.
///
/// Returns false when an X86ISD::FRSQRT of \p Op already exists in the DAG;
/// otherwise reports the subtarget's fast-FSQRT setting for vector or scalar
/// types, whichever matches the type of \p Op.
bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
  EVT OpVT = Op.getValueType();

  // We never want to use both SQRT and RSQRT instructions for the same input.
  if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(OpVT), Op))
    return false;

  return OpVT.isVector() ? Subtarget.hasFastVectorFSQRT()
                         : Subtarget.hasFastScalarFSQRT();
}
/// Produce a reciprocal-square-root estimate node for \p Op, or an empty
/// SDValue when no estimate is available for its type on this subtarget.
///
/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision), so
/// an unspecified \p RefinementSteps is set to 1. \p UseOneConstNR is set to
/// false whenever an estimate is returned. \p Enabled is not consulted here.
SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
                                           SelectionDAG &DAG, int Enabled,
                                           int &RefinementSteps,
                                           bool &UseOneConstNR,
                                           bool Reciprocal) const {
  EVT VT = Op.getValueType();

  // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
  // It is likely not profitable to do this for f64 because a double-precision
  // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
  // instructions: convert to single, rsqrtss, convert back to double, refine
  // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
  // along with FMA, this could be a throughput win.
  // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
  // after legalize types.
  bool HasEstimate = false;
  if (VT == MVT::f32)
    HasEstimate = Subtarget.hasSSE1();
  else if (VT == MVT::v4f32)
    HasEstimate = Reciprocal ? Subtarget.hasSSE1() : Subtarget.hasSSE2();
  else if (VT == MVT::v8f32)
    HasEstimate = Subtarget.hasAVX();
  else if (VT == MVT::v16f32)
    HasEstimate = Subtarget.useAVX512Regs();

  if (!HasEstimate)
    return SDValue();

  if (RefinementSteps == ReciprocalEstimate::Unspecified)
    RefinementSteps = 1;
  UseOneConstNR = false;

  // The 512-bit type uses RSQRT14; every other supported type uses FRSQRT.
  unsigned EstimateOpc = X86ISD::FRSQRT;
  if (VT == MVT::v16f32)
    EstimateOpc = X86ISD::RSQRT14;
  return DAG.getNode(EstimateOpc, SDLoc(Op), VT, Op);
}
/// Produce a reciprocal estimate node for \p Op, or an empty SDValue when no
/// estimate should be used for its type on this subtarget.
///
/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision), so
/// an unspecified \p RefinementSteps is set to 1.
SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
                                            int Enabled,
                                            int &RefinementSteps) const {
  EVT VT = Op.getValueType();

  // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
  // It is likely not profitable to do this for f64 because a double-precision
  // reciprocal estimate with refinement on x86 prior to FMA requires
  // 15 instructions: convert to single, rcpss, convert back to double, refine
  // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
  // along with FMA, this could be a throughput win.
  bool HasEstimate = false;
  if (VT == MVT::f32 || VT == MVT::v4f32)
    HasEstimate = Subtarget.hasSSE1();
  else if (VT == MVT::v8f32)
    HasEstimate = Subtarget.hasAVX();
  else if (VT == MVT::v16f32)
    HasEstimate = Subtarget.useAVX512Regs();

  if (!HasEstimate)
    return SDValue();

  // Enable estimate codegen with 1 refinement step for vector division.
  // Scalar division estimates are disabled because they break too much
  // real-world code. These defaults are intended to match GCC behavior.
  if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
    return SDValue();

  if (RefinementSteps == ReciprocalEstimate::Unspecified)
    RefinementSteps = 1;

  // The 512-bit type uses RCP14; every other supported type uses FRCP.
  unsigned EstimateOpc = X86ISD::FRCP;
  if (VT == MVT::v16f32)
    EstimateOpc = X86ISD::RCP14;
  return DAG.getNode(EstimateOpc, SDLoc(Op), VT, Op);
}
/// Minimum number of divisions by the same divisor at which they are
/// converted to multiplications by a reciprocal.
///
/// This may need to be adjusted for a given CPU if a division's cost is not
/// at least twice the cost of a multiplication: one division is still needed
/// to calculate the reciprocal, and then two multiplies by that reciprocal
/// replace the original divisions.
unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
  const unsigned MinDivisionsSharingDivisor = 2;
  return MinDivisionsSharingDivisor;
}
/// Result of 'and' is compared against zero. Change to a BT node if possible.
/// Returns the BT node and the condition code needed to use it.
///
/// Recognized forms of \p And (after looking through a TRUNCATE on either
/// operand): an operand that is (shl 1, N); the constant 1 applied to
/// (srl X, N); and a power-of-two constant that passes the size checks
/// below. On success \p X86CC is set to COND_AE for SETEQ and COND_B
/// otherwise. Returns an empty SDValue when no form matches.
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
const SDLoc &dl, SelectionDAG &DAG,
SDValue &X86CC) {
assert(And.getOpcode() == ISD::AND && "Expected AND node!");
SDValue Op0 = And.getOperand(0);
SDValue Op1 = And.getOperand(1);
if (Op0.getOpcode() == ISD::TRUNCATE)
Op0 = Op0.getOperand(0);
if (Op1.getOpcode() == ISD::TRUNCATE)
Op1 = Op1.getOperand(0);
// Src is the value to test, BitNo the bit index; both stay null until a
// pattern matches.
SDValue Src, BitNo;
// Canonicalize so that a SHL operand, if any, is Op0.
if (Op1.getOpcode() == ISD::SHL)
std::swap(Op0, Op1);
if (Op0.getOpcode() == ISD::SHL) {
if (isOneConstant(Op0.getOperand(0))) {
// If we looked past a truncate, check that it's only truncating away
// known zeros.
unsigned BitWidth = Op0.getValueSizeInBits();
unsigned AndBitWidth = And.getValueSizeInBits();
if (BitWidth > AndBitWidth) {
KnownBits Known = DAG.computeKnownBits(Op0);
if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
return SDValue();
}
Src = Op1;
BitNo = Op0.getOperand(1);
}
} else if (Op1.getOpcode() == ISD::Constant) {
ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
uint64_t AndRHSVal = AndRHS->getZExtValue();
SDValue AndLHS = Op0;
// (and (srl X, N), 1): test bit N of X directly.
if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
Src = AndLHS.getOperand(0);
BitNo = AndLHS.getOperand(1);
} else {
// Use BT if the immediate can't be encoded in a TEST instruction or we
// are optimizing for size and the immediate won't fit in a byte.
bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
isPowerOf2_64(AndRHSVal)) {
Src = AndLHS;
// AndRHSVal is a power of two here, so its log2 is the index of the
// single set bit.
BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
Src.getValueType());
}
}
}
// No patterns found, give up.
if (!Src.getNode())
return SDValue();
// If Src is i8, promote it to i32 with any_extend. There is no i8 BT
// instruction. Since the shift amount is in-range-or-undefined, we know
// that doing a bittest on the i32 value is ok. We extend to i32 because
// the encoding for the i16 version is larger than the i32 version.
// Also promote i16 to i32 for performance / code size reason.
if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
// See if we can use the 32-bit instruction instead of the 64-bit one for a
// shorter encoding. Since the former takes the modulo 32 of BitNo and the
// latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
// known to be zero.
// (The mask value 32 below selects exactly that bit.)
if (Src.getValueType() == MVT::i64 &&
DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
// If the operand types disagree, extend the shift amount to match. Since
// BT ignores high bits (like shifts) we can use anyextend.
if (Src.getValueType() != BitNo.getValueType())
BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B,
dl, MVT::i8);
return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
}
/// Map an ISD::CondCode onto the immediate used by SSE floating-point mask
/// compares.
///
/// The SSE predicates are:
///   0 - EQ     1 - LT     2 - LE     3 - UNORD
///   4 - NEQ    5 - NLT    6 - NLE    7 - ORD
/// Conditions with no direct predicate but whose mirror image has one are
/// handled by exchanging \p Op0 and \p Op1 in place. SETUEQ and SETONE yield
/// 8 and 12 respectively, which lie outside the table above.
///
/// \returns the predicate immediate.
static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
                                   SDValue &Op1) {
  unsigned Predicate;
  bool SwapOperands = false;

  switch (SetCCOpcode) {
  default:
    llvm_unreachable("Unexpected SETCC condition");

  // EQ
  case ISD::SETOEQ:
  case ISD::SETEQ:
    Predicate = 0;
    break;

  // LT, with GT expressed as LT on swapped operands.
  case ISD::SETOGT:
  case ISD::SETGT:
    SwapOperands = true;
    Predicate = 1;
    break;
  case ISD::SETLT:
  case ISD::SETOLT:
    Predicate = 1;
    break;

  // LE, with GE expressed as LE on swapped operands.
  case ISD::SETOGE:
  case ISD::SETGE:
    SwapOperands = true;
    Predicate = 2;
    break;
  case ISD::SETLE:
  case ISD::SETOLE:
    Predicate = 2;
    break;

  // UNORD
  case ISD::SETUO:
    Predicate = 3;
    break;

  // NEQ
  case ISD::SETUNE:
  case ISD::SETNE:
    Predicate = 4;
    break;

  // NLT, with ULE expressed as NLT on swapped operands.
  case ISD::SETULE:
    SwapOperands = true;
    Predicate = 5;
    break;
  case ISD::SETUGE:
    Predicate = 5;
    break;

  // NLE, with ULT expressed as NLE on swapped operands.
  case ISD::SETULT:
    SwapOperands = true;
    Predicate = 6;
    break;
  case ISD::SETUGT:
    Predicate = 6;
    break;

  // ORD
  case ISD::SETO:
    Predicate = 7;
    break;

  // Values beyond the eight predicates listed above.
  case ISD::SETUEQ:
    Predicate = 8;
    break;
  case ISD::SETONE:
    Predicate = 12;
    break;
  }

  if (SwapOperands)
    std::swap(Op0, Op1);
  return Predicate;
}
/// Split a 256-bit integer vector SETCC into two 128-bit SETCCs, one per half
/// of the operands, and concatenate the two results back together.
static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
         "Unsupported value type for operation");

  SDLoc dl(Op);
  unsigned HalfNumElems = VT.getVectorNumElements() / 2;
  SDValue CC = Op.getOperand(2);

  // Low and high 128-bit halves of the left-hand operand.
  SDValue LHS = Op.getOperand(0);
  SDValue LHSLo = extract128BitVector(LHS, 0, DAG, dl);
  SDValue LHSHi = extract128BitVector(LHS, HalfNumElems, DAG, dl);

  // Low and high 128-bit halves of the right-hand operand.
  SDValue RHS = Op.getOperand(1);
  SDValue RHSLo = extract128BitVector(RHS, 0, DAG, dl);
  SDValue RHSHi = extract128BitVector(RHS, HalfNumElems, DAG, dl);

  // Compare each half using the same element type at half the element count,
  // then glue the halves back into a value of the original type.
  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElems);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, HalfVT, LHSLo, RHSLo, CC),
                     DAG.getNode(Op.getOpcode(), dl, HalfVT, LHSHi, RHSHi, CC));
}
/// Canonicalize an integer vector SETCC that produces a vector of i1 and
/// re-emit it as a SETCC node with the adjusted operands and condition.
static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue CCNode = Op.getOperand(2);

  assert(VT.getVectorElementType() == MVT::i1 &&
         "Cannot set masked compare for this operation");

  ISD::CondCode Pred = cast<CondCodeSDNode>(CCNode)->get();

  // For equality compares, keep an all-zeros build vector on the right-hand
  // side. This helps with vptestm matching.
  // TODO: Should we just canonicalize the setcc during DAG combine?
  bool IsEquality = Pred == ISD::SETEQ || Pred == ISD::SETNE;
  if (IsEquality && ISD::isBuildVectorAllZeros(LHS.getNode()))
    std::swap(LHS, RHS);

  // SETGT is preferred over SETLT, so mirror the compare.
  if (Pred == ISD::SETLT) {
    Pred = ISD::getSetCCSwappedOperands(Pred);
    std::swap(LHS, RHS);
  }

  return DAG.getSetCC(dl, VT, LHS, RHS, Pred);
}
/// Given a buildvector constant, return a new vector constant with each element
/// incremented or decremented by one.
///
/// \param V     the candidate vector; must be a BUILD_VECTOR whose operands are
///              all non-opaque ConstantSDNodes of the vector's element type.
/// \param IsInc true to add one to every element, false to subtract one.
/// \returns the adjusted build vector, or an empty SDValue if \p V is not such
///          a simple vector constant, or if any element would wrap (an
///          all-ones element when incrementing, a zero element when
///          decrementing).
static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
  auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
  if (!BV)
    return SDValue();

  MVT VT = V.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  SmallVector<SDValue, 8> NewVecC;
  SDLoc DL(V);
  for (unsigned i = 0; i < NumElts; ++i) {
    auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
    if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
      return SDValue();

    // Avoid unsigned overflow/underflow.
    const APInt &EltC = Elt->getAPIntValue();
    if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isNullValue()))
      return SDValue();

    // Use an explicit subtraction for the decrement. Writing it as
    // `EltC + (IsInc ? 1 : -1)` converts -1 to a uint64_t before the APInt
    // addition, which only behaves as "minus one" for elements of at most 64
    // bits.
    NewVecC.push_back(DAG.getConstant(IsInc ? EltC + 1 : EltC - 1, DL, EltVT));
  }

  return DAG.getBuildVector(VT, DL, NewVecC);
}
/// Special case: lower an unsigned compare of i8/i16 vectors with PSUBUS[BW]
/// when that is profitable. E.g. for Op0 u<= Op1:
///   t = psubus Op0, Op1
///   pcmpeq t, <0..0>
///
/// \returns the PCMPEQ node, or an empty SDValue if SSE2 is unavailable, the
///          element type is not i8/i16, or the condition cannot be handled.
static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
                                    ISD::CondCode Cond, const SDLoc &dl,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {
  if (!Subtarget.hasSSE2())
    return SDValue();

  MVT EltVT = VT.getVectorElementType();
  bool IsByteOrWord = EltVT == MVT::i8 || EltVT == MVT::i16;
  if (!IsByteOrWord)
    return SDValue();

  // Arrange the operands so that the compare reads "Op0 u<= Op1".
  if (Cond == ISD::SETULE) {
    // Already in the desired form.
  } else if (Cond == ISD::SETUGE) {
    // Psubus is better than flip-sign because it requires no inversion.
    std::swap(Op0, Op1);
  } else if (Cond == ISD::SETULT) {
    // Against a constant this can become a setule by decrementing the
    // constant. With psubus, setule does not require a swap, which is
    // beneficial because the constant in the register is no longer destructed
    // as the destination, so it can be hoisted out of a loop.
    // Only do this pre-AVX since vpcmp* is no longer destructive.
    if (Subtarget.hasAVX())
      return SDValue();
    SDValue Decremented = incDecVectorConstant(Op1, DAG, false);
    if (!Decremented)
      return SDValue();
    Op1 = Decremented;
  } else if (Cond == ISD::SETUGT) {
    // Against a constant this can become a setuge by incrementing the
    // constant. This is beneficial because materializing a constant 0 for the
    // PCMPEQ is probably cheaper than XOR+PCMPGT using 2 different vector
    // constants:
    // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
    SDValue Incremented = incDecVectorConstant(Op1, DAG, true);
    if (!Incremented)
      return SDValue();
    Op1 = Op0;
    Op0 = Incremented;
  } else {
    return SDValue();
  }

  SDValue Diff = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
  return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Diff,
                     DAG.getConstant(0, dl, VT));
}
/// Lower a vector ISD::SETCC node.
///
/// Floating-point compares become X86ISD::CMPM (AVX-512 with an i1-element
/// result) or X86ISD::CMPP (whose result is bitcast back to the SETCC result
/// type). SETUEQ/SETONE without AVX are built from two compares joined by
/// FOR/FAND.
///
/// Integer compares are tried, in this order, as: an AVX-512 mask compare, an
/// XOP VPCOM/VPCOMU, a SHL+SRA single-bit test, a split into two 128-bit
/// halves, UMIN/UMAX + PCMPEQ, USUBSAT + PCMPEQ, and finally PCMPEQ/PCMPGT with
/// an optional operand swap, sign-bit flip and inversion of the result.
///
/// \returns the lowered value, or an empty SDValue when the operands are v2i32.
static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue CC = Op.getOperand(2);
MVT VT = Op.getSimpleValueType();
ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
SDLoc dl(Op);
// ---- Floating-point compares ----
if (isFP) {
#ifndef NDEBUG
MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
assert(EltVT == MVT::f32 || EltVT == MVT::f64);
#endif
unsigned Opc;
if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
assert(VT.getVectorNumElements() <= 16);
Opc = X86ISD::CMPM;
} else {
Opc = X86ISD::CMPP;
// The SSE/AVX packed FP comparison nodes are defined with a
// floating-point vector result that matches the operand type. This allows
// them to work with an SSE1 target (integer vector types are not legal).
VT = Op0.getSimpleValueType();
}
// In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
// emit two comparisons and a logic op to tie them together.
SDValue Cmp;
// Note: translateX86FSETCC may exchange Op0 and Op1 in place.
unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
if (SSECC >= 8 && !Subtarget.hasAVX()) {
// LLVM predicate is SETUEQ or SETONE.
unsigned CC0, CC1;
unsigned CombineOpc;
if (Cond == ISD::SETUEQ) {
CC0 = 3; // UNORD
CC1 = 0; // EQ
CombineOpc = X86ISD::FOR;
} else {
assert(Cond == ISD::SETONE);
CC0 = 7; // ORD
CC1 = 4; // NEQ
CombineOpc = X86ISD::FAND;
}
SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
DAG.getConstant(CC0, dl, MVT::i8));
SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
DAG.getConstant(CC1, dl, MVT::i8));
Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
} else {
// Handle all other FP comparisons here.
Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
DAG.getConstant(SSECC, dl, MVT::i8));
}
// If this is SSE/AVX CMPP, bitcast the result back to integer to match the
// result type of SETCC. The bitcast is expected to be optimized away
// during combining/isel.
if (Opc == X86ISD::CMPP)
Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
return Cmp;
}
// ---- Integer compares ----
MVT VTOp0 = Op0.getSimpleValueType();
assert(VTOp0 == Op1.getSimpleValueType() &&
"Expected operands with same type!");
assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
"Invalid number of packed elements for source and destination!");
// This is being called by type legalization because v2i32 is marked custom
// for result type legalization for v2f32.
if (VTOp0 == MVT::v2i32)
return SDValue();
// The non-AVX512 code below works under the assumption that source and
// destination types are the same.
assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
"Value types for source and destination must be the same!");
// The result is boolean, but operands are int/float
if (VT.getVectorElementType() == MVT::i1) {
// In AVX-512 architecture setcc returns mask with i1 elements,
// But there is no compare instruction for i8 and i16 elements in KNL.
assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
"Unexpected operand type");
return LowerIntVSETCC_AVX512(Op, DAG);
}
// Lower using XOP integer comparisons.
if (VT.is128BitVector() && Subtarget.hasXOP()) {
// Translate compare code to XOP PCOM compare mode.
unsigned CmpMode = 0;
switch (Cond) {
default: llvm_unreachable("Unexpected SETCC condition");
case ISD::SETULT:
case ISD::SETLT: CmpMode = 0x00; break;
case ISD::SETULE:
case ISD::SETLE: CmpMode = 0x01; break;
case ISD::SETUGT:
case ISD::SETGT: CmpMode = 0x02; break;
case ISD::SETUGE:
case ISD::SETGE: CmpMode = 0x03; break;
case ISD::SETEQ: CmpMode = 0x04; break;
case ISD::SETNE: CmpMode = 0x05; break;
}
// Are we comparing unsigned or signed integers?
unsigned Opc =
ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
return DAG.getNode(Opc, dl, VT, Op0, Op1,
DAG.getConstant(CmpMode, dl, MVT::i8));
}
// (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
// Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
SDValue BC0 = peekThroughBitcasts(Op0);
if (BC0.getOpcode() == ISD::AND) {
APInt UndefElts;
SmallVector<APInt, 64> EltBits;
if (getTargetConstantBitsFromNode(BC0.getOperand(1),
VT.getScalarSizeInBits(), UndefElts,
EltBits, false, false)) {
if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
Cond = ISD::SETEQ;
Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
}
}
}
}
// ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
ConstantSDNode *C1 = isConstOrConstSplat(Op1);
if (C1 && C1->getAPIntValue().isPowerOf2()) {
unsigned BitWidth = VT.getScalarSizeInBits();
// Shift the tested bit up into the sign-bit position, then smear it across
// the whole element with an arithmetic shift right.
unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
SDValue Result = Op0.getOperand(0);
Result = DAG.getNode(ISD::SHL, dl, VT, Result,
DAG.getConstant(ShiftAmt, dl, VT));
Result = DAG.getNode(ISD::SRA, dl, VT, Result,
DAG.getConstant(BitWidth - 1, dl, VT));
return Result;
}
}
// Break 256-bit integer vector compare into smaller ones.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return Lower256IntVSETCC(Op, DAG);
// If this is a SETNE against the signed minimum value, change it to SETGT.
// If this is a SETNE against the signed maximum value, change it to SETLT.
// which will be swapped to SETGT.
// Otherwise we use PCMPEQ+invert.
APInt ConstValue;
if (Cond == ISD::SETNE &&
ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
if (ConstValue.isMinSignedValue())
Cond = ISD::SETGT;
else if (ConstValue.isMaxSignedValue())
Cond = ISD::SETLT;
}
// If both operands are known non-negative, then an unsigned compare is the
// same as a signed compare and there's no need to flip signbits.
// TODO: We could check for more general simplifications here since we're
// computing known bits.
bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
!(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
// Special case: Use min/max operations for unsigned compares.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (ISD::isUnsignedIntSetCC(Cond) &&
(FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
TLI.isOperationLegal(ISD::UMIN, VT)) {
// If we have a constant operand, increment/decrement it and change the
// condition to avoid an invert.
if (Cond == ISD::SETUGT &&
ISD::matchUnaryPredicate(Op1, [](ConstantSDNode *C) {
return !C->getAPIntValue().isMaxValue();
})) {
// X > C --> X >= (C+1) --> X == umax(X, C+1)
Op1 = DAG.getNode(ISD::ADD, dl, VT, Op1, DAG.getConstant(1, dl, VT));
Cond = ISD::SETUGE;
}
if (Cond == ISD::SETULT &&
ISD::matchUnaryPredicate(Op1, [](ConstantSDNode *C) {
return !C->getAPIntValue().isNullValue();
})) {
// X < C --> X <= (C-1) --> X == umin(X, C-1)
Op1 = DAG.getNode(ISD::SUB, dl, VT, Op1, DAG.getConstant(1, dl, VT));
Cond = ISD::SETULE;
}
bool Invert = false;
unsigned Opc;
// The strict forms (UGT/ULT) are computed as the inverse of ULE/UGE.
switch (Cond) {
default: llvm_unreachable("Unexpected condition code");
case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
case ISD::SETULE: Opc = ISD::UMIN; break;
case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
case ISD::SETUGE: Opc = ISD::UMAX; break;
}
SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
// If the logical-not of the result is required, perform that now.
if (Invert)
Result = DAG.getNOT(dl, Result, VT);
return Result;
}
// Try to use SUBUS and PCMPEQ.
if (SDValue V = LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
return V;
// We are handling one of the integer comparisons here. Since SSE only has
// GT and EQ comparisons for integer, swapping operands and multiple
// operations may be required for some comparisons.
unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
: X86ISD::PCMPGT;
bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
Cond == ISD::SETGE || Cond == ISD::SETUGE;
bool Invert = Cond == ISD::SETNE ||
(Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
if (Swap)
std::swap(Op0, Op1);
// Check that the operation in question is available (most are plain SSE2,
// but PCMPGTQ and PCMPEQQ have different requirements).
if (VT == MVT::v2i64) {
if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
assert(Subtarget.hasSSE2() && "Don't know how to lower!");
// Since SSE has no unsigned integer comparisons, we need to flip the sign
// bits of the inputs before performing those operations. The lower
// compare is always unsigned.
SDValue SB;
if (FlipSigns) {
SB = DAG.getConstant(0x8000000080000000ULL, dl, MVT::v2i64);
} else {
SB = DAG.getConstant(0x0000000080000000ULL, dl, MVT::v2i64);
}
Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
// Cast everything to the right type.
Op0 = DAG.getBitcast(MVT::v4i32, Op0);
Op1 = DAG.getBitcast(MVT::v4i32, Op1);
// Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
// Create masks for only the low parts/high parts of the 64 bit integers.
static const int MaskHi[] = { 1, 1, 3, 3 };
static const int MaskLo[] = { 0, 0, 2, 2 };
SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
if (Invert)
Result = DAG.getNOT(dl, Result, MVT::v4i32);
return DAG.getBitcast(VT, Result);
}
if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
// If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
// pcmpeqd + pshufd + pand.
assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
// First cast everything to the right type.
Op0 = DAG.getBitcast(MVT::v4i32, Op0);
Op1 = DAG.getBitcast(MVT::v4i32, Op1);
// Do the compare.
SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
// Make sure the lower and upper halves are both all-ones.
static const int Mask[] = { 1, 0, 3, 2 };
SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
if (Invert)
Result = DAG.getNOT(dl, Result, MVT::v4i32);
return DAG.getBitcast(VT, Result);
}
}
// Since SSE has no unsigned integer comparisons, we need to flip the sign
// bits of the inputs before performing those operations.
if (FlipSigns) {
MVT EltVT = VT.getVectorElementType();
SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
VT);
Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
}
SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
// If the logical-not of the result is required, perform that now.
if (Invert)
Result = DAG.getNOT(dl, Result, VT);
return Result;
}
// Try to select an equality compare of a bitcast mask vector against zero or
// all-ones as a KORTEST+SETCC.
//
// On success returns the KORTEST node and stores the X86 condition code to
// test in X86CC. On failure returns an empty SDValue and leaves X86CC alone.
static SDValue EmitKORTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC,
                           const SDLoc &dl, SelectionDAG &DAG,
                           const X86Subtarget &Subtarget,
                           SDValue &X86CC) {
  // Only equality comparisons are supported.
  const bool IsEq = CC == ISD::SETEQ;
  if (!IsEq && CC != ISD::SETNE)
    return SDValue();

  // The compared value must be a bitcast from vXi1.
  if (Op0.getOpcode() != ISD::BITCAST)
    return SDValue();
  SDValue Mask = Op0.getOperand(0);
  MVT MaskVT = Mask.getSimpleValueType();

  // Accept only the mask widths the subtarget features allow.
  const bool IsSupportedMask =
      (Subtarget.hasAVX512() && MaskVT == MVT::v16i1) ||
      (Subtarget.hasDQI() && MaskVT == MVT::v8i1) ||
      (Subtarget.hasBWI() && (MaskVT == MVT::v32i1 || MaskVT == MVT::v64i1));
  if (!IsSupportedMask)
    return SDValue();

  X86::CondCode X86Cond;
  if (isNullConstant(Op1))
    X86Cond = IsEq ? X86::COND_E : X86::COND_NE;
  else if (isAllOnesConstant(Op1))
    // C flag is set for all ones.
    X86Cond = IsEq ? X86::COND_B : X86::COND_AE;
  else
    return SDValue();

  // If the mask is a single-use OR, feed its two operands to the KORTEST
  // directly; otherwise test the mask against itself.
  const bool FoldOr = Mask.getOpcode() == ISD::OR && Mask.hasOneUse();
  SDValue LHS = FoldOr ? Mask.getOperand(0) : Mask;
  SDValue RHS = FoldOr ? Mask.getOperand(1) : Mask;

  X86CC = DAG.getConstant(X86Cond, dl, MVT::i8);
  return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
}
/// Produce the flags value for the given setcc condition and operands, and
/// report the matching X86 condition code constant through \p X86CC.
///
/// \returns the flags-producing value, or an empty SDValue when the condition
///          translates to X86::COND_INVALID.
SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
                                             ISD::CondCode CC, const SDLoc &dl,
                                             SelectionDAG &DAG,
                                             SDValue &X86CC) const {
  const bool IsEqualityCC = CC == ISD::SETEQ || CC == ISD::SETNE;
  const bool IsEqualityWithZero = IsEqualityCC && isNullConstant(Op1);

  // Optimize to BT if possible.
  // Lower (X & (1 << N)) == 0 to BT(X, N).
  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
  if (IsEqualityWithZero && Op0.getOpcode() == ISD::AND && Op0.hasOneUse())
    if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CC))
      return BT;

  // Try to use PTEST for a tree of ORs equality-compared with 0.
  // TODO: We could do AND tree with all 1s as well by using the C flag.
  if (IsEqualityWithZero && Op0.getOpcode() == ISD::OR)
    if (SDValue PTEST = LowerVectorAllZeroTest(Op0, CC, Subtarget, DAG, X86CC))
      return PTEST;

  // Try to lower using KORTEST.
  if (SDValue KORTEST = EmitKORTEST(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
    return KORTEST;

  // Look for X == 0, X == 1, X != 0, or X != 1 where X is itself an X86 setcc:
  // reuse that setcc's flags, with either its own condition or the opposite
  // one.
  const bool ComparesWithZeroOrOne =
      IsEqualityCC && (isOneConstant(Op1) || isNullConstant(Op1));
  if (ComparesWithZeroOrOne && Op0.getOpcode() == X86ISD::SETCC) {
    const bool Invert = (CC == ISD::SETNE) != isNullConstant(Op1);
    if (Invert) {
      X86::CondCode Opposite = X86::GetOppositeBranchCondition(
          (X86::CondCode)Op0.getConstantOperandVal(0));
      X86CC = DAG.getConstant(Opposite, dl, MVT::i8);
    } else {
      X86CC = Op0.getOperand(0);
    }
    return Op0.getOperand(1);
  }

  // General case: translate the condition and emit an explicit compare.
  bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
  X86::CondCode CondCode = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
  if (CondCode == X86::COND_INVALID)
    return SDValue();

  SDValue EFLAGS =
      ConvertCmpIfNecessary(EmitCmp(Op0, Op1, CondCode, dl, DAG), DAG);
  X86CC = DAG.getConstant(CondCode, dl, MVT::i8);
  return EFLAGS;
}
/// Lower an ISD::SETCC node. Vector results are handed to LowerVSETCC; scalar
/// results (which must be i8) become an X86ISD::SETCC on the flags produced by
/// emitFlagsForSetcc. Returns an empty SDValue if no flags could be produced.
SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
  MVT VT = Op.getSimpleValueType();
  if (VT.isVector())
    return LowerVSETCC(Op, Subtarget, DAG);

  assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDLoc dl(Op);
  ISD::CondCode Pred = cast<CondCodeSDNode>(Op.getOperand(2))->get();

  SDValue X86CC;
  SDValue Flags = emitFlagsForSetcc(LHS, RHS, Pred, dl, DAG, X86CC);
  if (Flags)
    return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, Flags);
  return SDValue();
}
/// Lower an integer SETCCCARRY node (operands: LHS, RHS, carry, condition) to
/// an X86ISD::SBB whose flags result is tested with the translated condition.
SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue CarryIn = Op.getOperand(2);
  SDValue CondNode = Op.getOperand(3);

  assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
  X86::CondCode CC =
      TranslateIntegerX86CC(cast<CondCodeSDNode>(CondNode)->get());

  // Recreate the carry if needed: add all-ones to the incoming carry value and
  // take the flags result (value #1) of that addition.
  EVT CarryVT = CarryIn.getValueType();
  APInt AllOnes = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
  SDValue CarryAdd =
      DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), CarryIn,
                  DAG.getConstant(AllOnes, DL, CarryVT));
  SDValue CarryFlag = CarryAdd.getValue(1);

  // Subtract with borrow and test the resulting flags.
  SDVTList SBBVTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
  SDValue SBB = DAG.getNode(X86ISD::SBB, DL, SBBVTs, LHS, RHS, CarryFlag);
  return getSETCC(CC, SBB.getValue(1), DL, DAG);
}
// Build the X86 arithmetic node for an "operation with overflow" node.
//
// Three things come back from this function: the arithmetic computation
// itself (first member of the returned pair), its EFLAGS result (second
// member), and, through Cond, the condition code that together with those
// flags identifies the case in which the computation overflows.
static std::pair<SDValue, SDValue>
getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
  assert(Op.getResNo() == 0 && "Unexpected result number!");
  SDLoc DL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  const unsigned Opcode = Op.getOpcode();

  // Pick the X86 arithmetic opcode.
  unsigned X86Opcode = 0;
  switch (Opcode) {
  default:
    llvm_unreachable("Unknown ovf instruction!");
  case ISD::SADDO:
  case ISD::UADDO:
    X86Opcode = X86ISD::ADD;
    break;
  case ISD::SSUBO:
  case ISD::USUBO:
    X86Opcode = X86ISD::SUB;
    break;
  case ISD::SMULO:
    X86Opcode = X86ISD::SMUL;
    break;
  case ISD::UMULO:
    X86Opcode = X86ISD::UMUL;
    break;
  }

  // Pick the condition that signals overflow. Unsigned add/sub use the carry
  // condition (unsigned add of the constant 1 uses the equal condition
  // instead); everything else uses the overflow condition.
  if (Opcode == ISD::UADDO)
    Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
  else if (Opcode == ISD::USUBO)
    Cond = X86::COND_B;
  else
    Cond = X86::COND_O;

  SDValue Value, Overflow;
  if (X86Opcode) {
    // The node produces the arithmetic result and, as value #1, EFLAGS.
    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
    Value = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
    Overflow = Value.getValue(1);
  }
  return std::make_pair(Value, Overflow);
}
/// Lower an "add/sub/mul with overflow" node into a regular X86 arithmetic
/// node plus a "setcc" that checks the overflow condition, merged into one
/// two-result value.
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
  // The "brcond" lowering looks for this arithmetic + "setcc" combo and may
  // remove the "setcc" instruction if the "setcc" has only one use.
  SDLoc DL(Op);
  X86::CondCode OverflowCond;
  std::pair<SDValue, SDValue> ValueAndFlags =
      getX86XALUOOp(OverflowCond, Op, DAG);

  SDValue OverflowBit = getSETCC(OverflowCond, ValueAndFlags.second, DL, DAG);
  assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");

  return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
                     ValueAndFlags.first, OverflowBit);
}
/// Return true if \p Op is an X86 logical comparison: either a dedicated
/// compare node, or result #1 of one of the listed arithmetic/logic nodes.
static bool isX86LogicalCmp(SDValue Op) {
  switch (Op.getOpcode()) {
  // Dedicated compare nodes qualify regardless of the result number.
  case X86ISD::CMP:
  case X86ISD::COMI:
  case X86ISD::UCOMI:
  case X86ISD::SAHF:
    return true;

  // Arithmetic and logic nodes qualify only through their second result.
  case X86ISD::ADD:
  case X86ISD::SUB:
  case X86ISD::ADC:
  case X86ISD::SBB:
  case X86ISD::SMUL:
  case X86ISD::UMUL:
  case X86ISD::OR:
  case X86ISD::XOR:
  case X86ISD::AND:
    return Op.getResNo() == 1;

  default:
    return false;
  }
}
/// Return true if \p V is a TRUNCATE whose input is known to be zero in all of
/// the high bits that the truncate discards.
static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
  if (V.getOpcode() == ISD::TRUNCATE) {
    SDValue Wide = V.getOperand(0);
    unsigned WideBits = Wide.getValueSizeInBits();
    unsigned NarrowBits = V.getValueSizeInBits();
    // Mask covering exactly the bits dropped by the truncate.
    APInt DroppedBits = APInt::getHighBitsSet(WideBits, WideBits - NarrowBits);
    return DAG.MaskedValueIsZero(Wide, DroppedBits);
  }
  return false;
}
SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
bool AddTest = true;
SDValue Cond = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue Op2 = Op.getOperand(2);
SDLoc DL(Op);
MVT VT = Op1.getSimpleValueType();
SDValue CC;
// Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
// are available or VBLENDV if AVX is available.
// Otherwise FP cmovs get lowered into a less efficient branch sequence later.
if (Cond.getOpcode() == ISD::SETCC &&
((Subtarget.hasSSE2() && VT == MVT::f64) ||
(Subtarget.hasSSE1() && VT == MVT::f32)) &&
VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
unsigned SSECC = translateX86FSETCC(
cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
if (Subtarget.hasAVX512()) {
SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
assert(!VT.isVector() && "Not a scalar type?");
return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
}
if (SSECC < 8 || Subtarget.hasAVX()) {
SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
DAG.getConstant(SSECC, DL, MVT::i8));
// If we have AVX, we can use a variable vector select (VBLENDV) instead
// of 3 logic instructions for size savings and potentially speed.
// Unfortunately, there is no scalar form of VBLENDV.
// If either operand is a +0.0 constant, don't try this. We can expect to
// optimize away at least one of the logic instructions later in that
// case, so that sequence would be faster than a variable blend.
// BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
// uses XMM0 as the selection register. That may need just as many
// instructions as the AND/ANDN/OR sequence due to register moves, so
// don't bother.
if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
!isNullFPConstant(Op2)) {
// Convert to vectors, do a VSELECT, and convert back to scalar.
// All of the conversions should be optimized away.
MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
VCmp = DAG.getBitcast(VCmpVT, VCmp);
SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
VSel, DAG.getIntPtrConstant(0, DL));
}
SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
}
}
// AVX512 fallback is to lower selects of scalar floats to masked moves.
if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
}
// For v64i1 without 64-bit support we need to split and rejoin.
if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
assert(Subtarget.hasBWI() && "Expected BWI to be legal");
SDValue Op1Lo = extractSubVector(Op1, 0, DAG, DL, 32);
SDValue Op2Lo = extractSubVector(Op2, 0, DAG, DL, 32);
SDValue Op1Hi = extractSubVector(Op1, 32, DAG, DL, 32);
SDValue Op2Hi = extractSubVector(Op2, 32, DAG, DL, 32);
SDValue Lo = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Lo, Op2Lo);
SDValue Hi = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Hi, Op2Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}
if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
SDValue Op1Scalar;
if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
Op1Scalar = Op1.getOperand(0);
SDValue Op2Scalar;
if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
Op2Scalar = Op2.getOperand(0);
if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
Op1Scalar, Op2Scalar);
if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
return DAG.getBitcast(VT, newSelect);
SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
DAG.getIntPtrConstant(0, DL));
}
}
if (Cond.getOpcode() == ISD::SETCC) {
if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
Cond = NewCond;
// If the condition was updated, it's possible that the operands of the
// select were also updated (for example, EmitTest has a RAUW). Refresh
// the local references to the select operands in case they got stale.
Op1 = Op.getOperand(1);
Op2 = Op.getOperand(2);
}
}
// (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
// (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
// (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
// (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
// (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
// (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
if (Cond.getOpcode() == X86ISD::SETCC &&
Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
isNullConstant(Cond.getOperand(1).getOperand(1))) {
SDValue Cmp = Cond.getOperand(1);
unsigned CondCode =
cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
(CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
SDValue CmpOp0 = Cmp.getOperand(0);
// Apply further optimizations for special cases
// (select (x != 0), -1, 0) -> neg & sbb
// (select (x == 0), 0, -1) -> neg & sbb
if (isNullConstant(Y) &&
(isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
SDValue CmpZero = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Zero, CmpOp0);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
Zero = DAG.getConstant(0, DL, Op.getValueType());
return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, CmpZero);
}
Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
Cmp = ConvertCmpIfNecessary(Cmp, DAG);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
SDValue Zero = DAG.getConstant(0, DL, Op.getValueType());
SDValue Res = // Res = 0 or -1.
DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp);
if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
Res = DAG.getNOT(DL, Res, Res.getValueType());
if (!isNullConstant(Op2))
Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
return Res;
} else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
Cmp.getOperand(0).getOpcode() == ISD::AND &&
isOneConstant(Cmp.getOperand(0).getOperand(1))) {
SDValue CmpOp0 = Cmp.getOperand(0);
SDValue Src1, Src2;
// true if Op2 is XOR or OR operator and one of its operands
// is equal to Op1
// ( a , a op b) || ( b , a op b)
auto isOrXorPattern = [&]() {
if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
(Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
Src1 =
Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
Src2 = Op1;
return true;
}
return false;
};
if (isOrXorPattern()) {
SDValue Neg;
unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
// we need mask of all zeros or ones with same size of the other
// operands.
if (CmpSz > VT.getSizeInBits())
Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
else if (CmpSz < VT.getSizeInBits())
Neg = DAG.getNode(ISD::AND, DL, VT,
DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
DAG.getConstant(1, DL, VT));
else
Neg = CmpOp0;
SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
Neg); // -(and (x, 0x1))
SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
}
}
}
// Look past (and (setcc_carry (cmp ...)), 1).
if (Cond.getOpcode() == ISD::AND &&
Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
isOneConstant(Cond.getOperand(1)))
Cond = Cond.getOperand(0);
// If condition flag is set by a X86ISD::CMP, then use it as the condition
// setting operand in place of the X86ISD::SETCC.
unsigned CondOpcode = Cond.getOpcode();
if (CondOpcode == X86ISD::SETCC ||
CondOpcode == X86ISD::SETCC_CARRY) {
CC = Cond.getOperand(0);
SDValue Cmp = Cond.getOperand(1);
MVT VT = Op.getSimpleValueType();
bool IllegalFPCMov = false;
if (VT.isFloatingPoint() && !VT.isVector() &&
!isScalarFPTypeInSSEReg(VT)) // FPStack?
IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
Cmp.getOpcode() == X86ISD::BT) { // FIXME
Cond = Cmp;
AddTest = false;
}
} else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
SDValue Value;
X86::CondCode X86Cond;
std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
CC = DAG.getConstant(X86Cond, DL, MVT::i8);
AddTest = false;
}
if (AddTest) {
// Look past the truncate if the high bits are known zero.
if (isTruncWithZeroHighBitsInput(Cond, DAG))
Cond = Cond.getOperand(0);
// We know the result of AND is compared against zero. Try to match
// it to BT.
if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
SDValue BTCC;
if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, BTCC)) {
CC = BTCC;
Cond = BT;
AddTest = false;
}
}
}
if (AddTest) {
CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
Cond = EmitCmp(Cond, DAG.getConstant(0, DL, Cond.getValueType()),
X86::COND_NE, DL, DAG);
}
// a < b ? -1 : 0 -> RES = ~setcc_carry
// a < b ? 0 : -1 -> RES = setcc_carry
// a >= b ? -1 : 0 -> RES = setcc_carry
// a >= b ? 0 : -1 -> RES = ~setcc_carry
if (Cond.getOpcode() == X86ISD::SUB) {
Cond = ConvertCmpIfNecessary(Cond, DAG);
unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
(isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
(isNullConstant(Op1) || isNullConstant(Op2))) {
SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
DAG.getConstant(X86::COND_B, DL, MVT::i8),
Cond);
if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
return DAG.getNOT(DL, Res, Res.getValueType());
return Res;
}
}
// X86 doesn't have an i8 cmov. If both operands are the result of a truncate
// widen the cmov and push the truncate through. This avoids introducing a new
// branch during isel and doesn't add any extensions.
if (Op.getValueType() == MVT::i8 &&
Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
if (T1.getValueType() == T2.getValueType() &&
// Blacklist CopyFromReg to avoid partial register stalls.
T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
CC, Cond);
return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
}
}
// Or finally, promote i8 cmovs if we have CMOV,
// or i16 cmovs if it won't prevent folding a load.
// FIXME: we should not limit promotion of i8 case to only when the CMOV is
// legal, but EmitLoweredSelect() can not deal with these extensions
// being inserted between two CMOV's. (in i16 case too TBN)
// https://bugs.llvm.org/show_bug.cgi?id=40974
if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
(Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) &&
!MayFoldLoad(Op2))) {
Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
SDValue Ops[] = { Op2, Op1, CC, Cond };
SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
}
// X86ISD::CMOV means set the result (which is operand 1) to the RHS if
// condition is true.
SDValue Ops[] = { Op2, Op1, CC, Cond };
return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
}
/// Lower an extension node whose operand is a vector of i1 (a mask).
///
/// The lowering proceeds in stages:
///  1. Without BWI, i8/i16 result elements are promoted to i32 (or the whole
///     operation is handed to SplitAndExtendv16i1 for 16 elements when the
///     subtarget cannot extend to 512 bits with DQ).
///  2. Without VLX, the operation is widened to 512 bits by inserting the mask
///     into an undef mask of the wider element count.
///  3. The extension itself is emitted either by re-issuing Op's opcode on the
///     working type or as a select between all-ones and zero.
///  4. The result is truncated / extracted back to the requested type.
static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  const MVT ResVT = Op->getSimpleValueType(0);
  SDValue Mask = Op->getOperand(0);
  MVT MaskVT = Mask.getSimpleValueType();
  assert(MaskVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");

  const MVT ResEltVT = ResVT.getVectorElementType();
  SDLoc DL(Op);
  unsigned EltCount = ResVT.getVectorNumElements();

  // Stage 1: promote narrow result elements to i32 when BWI is unavailable.
  MVT PromotedVT = ResVT;
  if (!Subtarget.hasBWI() && ResEltVT.getSizeInBits() <= 16) {
    // If v16i32 is to be avoided, split and concatenate instead.
    if (EltCount == 16 && !Subtarget.canExtendTo512DQ())
      return SplitAndExtendv16i1(Op.getOpcode(), ResVT, Mask, DL, DAG);
    PromotedVT = MVT::getVectorVT(MVT::i32, EltCount);
  }

  // Stage 2: operate on a 512-bit vector when VLX is unavailable.
  MVT WorkVT = PromotedVT;
  const bool MustWiden = !PromotedVT.is512BitVector() && !Subtarget.hasVLX();
  if (MustWiden) {
    EltCount *= 512 / PromotedVT.getSizeInBits();
    MaskVT = MVT::getVectorVT(MVT::i1, EltCount);
    SDValue UndefMask = DAG.getUNDEF(MaskVT);
    SDValue LowIdx = DAG.getIntPtrConstant(0, DL);
    Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MaskVT, UndefMask, Mask,
                       LowIdx);
    WorkVT = MVT::getVectorVT(PromotedVT.getVectorElementType(), EltCount);
  }

  // Stage 3: emit the extension on the working type.
  const MVT WorkEltVT = WorkVT.getVectorElementType();
  const bool ReuseOpcode =
      (Subtarget.hasDQI() && WorkEltVT.getSizeInBits() >= 32) ||
      (Subtarget.hasBWI() && WorkEltVT.getSizeInBits() <= 16);
  SDValue Result;
  if (ReuseOpcode) {
    Result = DAG.getNode(Op.getOpcode(), DL, WorkVT, Mask);
  } else {
    SDValue AllOnes = DAG.getConstant(-1, DL, WorkVT);
    SDValue AllZeros = DAG.getConstant(0, DL, WorkVT);
    Result = DAG.getSelect(DL, WorkVT, Mask, AllOnes, AllZeros);
  }

  // Stage 4a: undo the i8/i16 -> i32 promotion from stage 1.
  if (ResVT != PromotedVT) {
    WorkVT = MVT::getVectorVT(ResEltVT, EltCount);
    Result = DAG.getNode(ISD::TRUNCATE, DL, WorkVT, Result);
  }

  // Stage 4b: undo the 512-bit widening from stage 2.
  if (WorkVT != ResVT)
    Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Result,
                         DAG.getIntPtrConstant(0, DL));
  return Result;
}
/// Custom lowering for ANY_EXTEND: vXi1 sources are routed to the mask
/// lowering, every other source goes through LowerAVXExtend (AVX is asserted).
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
  const MVT SrcVT = Op->getOperand(0).getSimpleValueType();
  const bool IsMaskSource = SrcVT.getVectorElementType() == MVT::i1;
  if (!IsMaskSource) {
    assert(Subtarget.hasAVX() && "Expected AVX support");
    return LowerAVXExtend(Op, DAG, Subtarget);
  }
  return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
}
// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
// For sign extend this needs to handle all vector sizes and SSE4.1 and
// non-SSE4.1 targets. For zero extend this should only handle inputs of
// MVT::v64i8 when BWI is not supported, but AVX512 is.
//
// Returns an empty SDValue when the element types or the combination of
// result width and subtarget features is not one of the cases handled below.
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = Op->getOperand(0);
MVT VT = Op->getSimpleValueType(0);
MVT InVT = In.getSimpleValueType();
MVT SVT = VT.getVectorElementType();
MVT InSVT = InVT.getVectorElementType();
assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
// Only i16/i32/i64 result elements are handled.
if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
return SDValue();
// Only i8/i16/i32 source elements are handled.
if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
return SDValue();
// The result must be a 128/256/512-bit vector with the matching feature
// (SSE2 / AVX / AVX512 respectively).
if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
!(VT.is256BitVector() && Subtarget.hasAVX()) &&
!(VT.is512BitVector() && Subtarget.hasAVX512()))
return SDValue();
SDLoc dl(Op);
unsigned Opc = Op.getOpcode();
unsigned NumElts = VT.getVectorNumElements();
// For 256-bit vectors, we only need the lower (128-bit) half of the input.
// For 512-bit vectors, we need 128-bits or 256-bits.
if (InVT.getSizeInBits() > 128) {
// Input needs to be at least the same number of elements as output, and
// at least 128-bits.
int InSize = InSVT.getSizeInBits() * NumElts;
// Keep only the low max(InSize, 128) bits of the input; InVT is refreshed to
// describe the narrowed value.
In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
InVT = In.getSimpleValueType();
}
// SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
// so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
// need to be handled here for 256/512-bit results.
if (Subtarget.hasInt256()) {
assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
// The (possibly narrowed) input still has more elements than the result:
// re-emit the same *_EXTEND_VECTOR_INREG opcode on it.
if (InVT.getVectorNumElements() != NumElts)
return DAG.getNode(Op.getOpcode(), dl, VT, In);
// FIXME: Apparently we create inreg operations that could be regular
// extends.
// Element counts match, so a plain SIGN_EXTEND/ZERO_EXTEND is emitted.
unsigned ExtOpc =
Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
: ISD::ZERO_EXTEND;
return DAG.getNode(ExtOpc, dl, VT, In);
}
// pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
if (Subtarget.hasAVX()) {
assert(VT.is256BitVector() && "256-bit vector expected");
int HalfNumElts = NumElts / 2;
MVT HalfVT = MVT::getVectorVT(SVT, HalfNumElts);
unsigned NumSrcElts = InVT.getVectorNumElements();
// HiMask moves source elements [HalfNumElts, 2 * HalfNumElts) down to the
// first HalfNumElts positions; all remaining lanes stay undef.
SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
for (int i = 0; i != HalfNumElts; ++i)
HiMask[i] = HalfNumElts + i;
// Extend the low half directly, then the shuffled-down high half, and
// concatenate the two half-width results.
SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
}
// We should only get here for sign extend.
assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
// pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
SDValue Curr = In;
SDValue SignExt = Curr;
// As SRAI is only available on i16/i32 types, we expand only up to i32
// and handle i64 separately.
if (InVT != MVT::v4i32) {
// For a v2i64 result, first extend to v4i32 here; the i32 -> i64 step is
// done in the VT == v2i64 block below.
MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
unsigned DestWidth = DestVT.getScalarSizeInBits();
// Scale = number of source elements that fit in one destination element.
unsigned Scale = DestWidth / InSVT.getSizeInBits();
unsigned InNumElts = InVT.getVectorNumElements();
unsigned DestElts = DestVT.getVectorNumElements();
// Build a shuffle mask that takes each input element and places it in the
// MSBs of the new element size.
SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
for (unsigned i = 0; i != DestElts; ++i)
Mask[i * Scale + (Scale - 1)] = i;
Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
Curr = DAG.getBitcast(DestVT, Curr);
// Arithmetic shift right by the width difference moves the value back to
// the low bits while replicating its sign bit.
unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
DAG.getConstant(SignExtShift, dl, MVT::i8));
}
if (VT == MVT::v2i64) {
assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
// Sign = (0 > Curr) per i32 lane, used as the upper 32 bits of each i64.
// The {0, 4, 1, 5} shuffle interleaves the two low lanes of SignExt with
// the two low lanes of Sign before the result is bitcast to v2i64.
SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
SignExt = DAG.getBitcast(VT, SignExt);
}
return SignExt;
}
/// Custom lowering for vector ISD::SIGN_EXTEND.
///
/// - vXi1 sources are delegated to LowerSIGN_EXTEND_Mask.
/// - v8i8 sources are only handled for a v8i64 result under
///   ExperimentalVectorWideningLegalization (the input is padded to v16i8 and
///   extended in-register); otherwise an empty SDValue is returned.
/// - With AVX2 (hasInt256) the node is returned unchanged.
/// - Otherwise the input is split into two halves, each half is extended with
///   SIGN_EXTEND_VECTOR_INREG, and the halves are concatenated.
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  SDLoc dl(Op);

  if (InVT.getVectorElementType() == MVT::i1)
    return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);

  assert(VT.isVector() && InVT.isVector() && "Expected vector type");
  // Compare the result against the *input* element count; the previous check
  // compared VT with itself and could never fire.
  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
         "Expected same number of elements");
  assert((VT.getVectorElementType() == MVT::i16 ||
          VT.getVectorElementType() == MVT::i32 ||
          VT.getVectorElementType() == MVT::i64) &&
         "Unexpected element type");
  assert((InVT.getVectorElementType() == MVT::i8 ||
          InVT.getVectorElementType() == MVT::i16 ||
          InVT.getVectorElementType() == MVT::i32) &&
         "Unexpected element type");

  // Custom legalize v8i8->v8i64 on CPUs without avx512bw.
  if (InVT == MVT::v8i8) {
    if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64)
      return SDValue();

    // Pad the 64-bit input to a full 128-bit vector so it can feed an
    // in-register extend.
    In = DAG.getNode(ISD::CONCAT_VECTORS, dl,
                     MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
    return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, VT, In);
  }

  if (Subtarget.hasInt256())
    return Op;

  // Optimize vectors in AVX mode
  // Sign extend  v8i16 to v8i32 and
  //              v4i32 to v4i64
  //
  // Divide input vector into two parts
  // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
  // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
  // concat the vectors to original VT
  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
                                VT.getVectorNumElements() / 2);

  SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);

  // Move the upper half of the input down to the low lanes; the remaining
  // lanes are left undef (-1).
  unsigned NumElems = InVT.getVectorNumElements();
  SmallVector<int,8> ShufMask(NumElems, -1);
  for (unsigned i = 0; i != NumElems/2; ++i)
    ShufMask[i] = i + NumElems/2;

  SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
  OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
/// Break a 256-bit or 512-bit vector store into two stores of half the width.
///
/// Returns a TokenFactor joining the chains of both half stores, or an empty
/// SDValue when the store is volatile and is therefore left alone.
static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
  SDValue Val = Store->getValue();
  assert((Val.getValueType().is256BitVector() ||
          Val.getValueType().is512BitVector()) &&
         "Expecting 256/512-bit op");

  // Splitting volatile memory ops is not allowed unless the operation was not
  // legal to begin with. We are assuming the input op is legal (this transform
  // is only used for targets with AVX).
  if (Store->isVolatile())
    return SDValue();

  const unsigned HiStartElt =
      Val.getSimpleValueType().getVectorNumElements() / 2;
  const unsigned HalfBits = Val.getValueSizeInBits() / 2;
  // Size of one half in bytes. It is used both as the byte offset of the upper
  // half and as the alignment bound for the second store.
  const unsigned HalfBytes = (HalfBits == 128) ? 16 : 32;

  SDLoc DL(Store);
  SDValue LoVal = extractSubVector(Val, 0, DAG, DL, HalfBits);
  SDValue HiVal = extractSubVector(Val, HiStartElt, DAG, DL, HalfBits);

  SDValue LoPtr = Store->getBasePtr();
  SDValue HiPtr = DAG.getMemBasePlusOffset(LoPtr, HalfBytes, DL);

  const unsigned BaseAlign = Store->getAlignment();
  SDValue LoChain =
      DAG.getStore(Store->getChain(), DL, LoVal, LoPtr,
                   Store->getPointerInfo(), BaseAlign,
                   Store->getMemOperand()->getFlags());
  SDValue HiChain =
      DAG.getStore(Store->getChain(), DL, HiVal, HiPtr,
                   Store->getPointerInfo().getWithOffset(HalfBytes),
                   MinAlign(BaseAlign, HalfBytes),
                   Store->getMemOperand()->getFlags());
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoChain, HiChain);
}
/// Replace a 128-bit vector store with one scalar store per element. The
/// stored value is bitcast to StoreVT first, and StoreVT's scalar type is the
/// type of every emitted store.
///
/// Returns a TokenFactor over all element-store chains, or an empty SDValue
/// when the store is volatile.
static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
                                    SelectionDAG &DAG) {
  SDValue Vec = Store->getValue();
  assert(StoreVT.is128BitVector() &&
         Vec.getValueType().is128BitVector() && "Expecting 128-bit op");
  Vec = DAG.getBitcast(StoreVT, Vec);

  // Splitting volatile memory ops is not allowed unless the operation was not
  // legal to begin with. We are assuming the input op is legal (this transform
  // is only used for targets with AVX).
  if (Store->isVolatile())
    return SDValue();

  const MVT EltVT = StoreVT.getScalarType();
  const unsigned EltCount = StoreVT.getVectorNumElements();
  const unsigned EltBytes = EltVT.getStoreSize();
  const unsigned BaseAlign = Store->getAlignment();
  SDLoc DL(Store);

  SmallVector<SDValue, 4> Chains;
  unsigned ByteOff = 0;
  for (unsigned Idx = 0; Idx != EltCount; ++Idx, ByteOff += EltBytes) {
    // Address of element Idx, then the element itself, then its store.
    SDValue EltPtr = DAG.getMemBasePlusOffset(Store->getBasePtr(), ByteOff, DL);
    SDValue EltIdx = DAG.getIntPtrConstant(Idx, DL);
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vec, EltIdx);
    Chains.push_back(
        DAG.getStore(Store->getChain(), DL, Elt, EltPtr,
                     Store->getPointerInfo().getWithOffset(ByteOff),
                     MinAlign(BaseAlign, ByteOff),
                     Store->getMemOperand()->getFlags()));
  }
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}
/// Custom lowering for ISD::STORE.
///
/// Three shapes are handled:
///  - vXi1 values (at most 8 elements, AVX512F without DQI): stored as an i8.
///  - 256-bit values that are a single-use concatenation: split in two halves.
///  - 64-bit vectors that legalize by widening: stored through a single 64-bit
///    scalar (SSE2) or via X86ISD::VEXTRACT_STORE (SSE1 only).
/// An empty SDValue is returned for everything else.
static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {
  StoreSDNode *Store = cast<StoreSDNode>(Op.getNode());
  SDLoc DL(Store);
  SDValue Val = Store->getValue();

  // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
  const EVT ValVT = Val.getValueType();
  if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1) {
    assert(ValVT.getVectorNumElements() <= 8 && "Unexpected VT");
    assert(!Store->isTruncatingStore() && "Expected non-truncating store");
    assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
           "Expected AVX512F without AVX512DQI");

    // Widen to v16i1, reinterpret as i16 and keep only the low byte.
    SDValue Wide = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
                               DAG.getUNDEF(MVT::v16i1), Val,
                               DAG.getIntPtrConstant(0, DL));
    SDValue AsI16 = DAG.getBitcast(MVT::i16, Wide);
    SDValue AsI8 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, AsI16);
    return DAG.getStore(Store->getChain(), DL, AsI8, Store->getBasePtr(),
                        Store->getPointerInfo(), Store->getAlignment(),
                        Store->getMemOperand()->getFlags());
  }

  if (Store->isTruncatingStore())
    return SDValue();

  // If this is a 256-bit store of concatenated ops, we are better off splitting
  // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
  // and each half can execute independently. Some cores would split the op into
  // halves anyway, so the concat (vinsertf128) is purely an extra op.
  const MVT StoreVT = Val.getSimpleValueType();
  if (StoreVT.is256BitVector()) {
    SmallVector<SDValue, 4> ConcatParts;
    if (!Val.hasOneUse() || !collectConcatOps(Val.getNode(), ConcatParts))
      return SDValue();
    return splitVectorStore(Store, DAG);
  }

  assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
         "Unexpected VT");
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (TLI.getTypeAction(*DAG.getContext(), StoreVT) !=
      TargetLowering::TypeWidenVector)
    return SDValue();

  // Double the element count by concatenating with undef.
  const MVT WideVT = MVT::getVectorVT(StoreVT.getVectorElementType(),
                                      StoreVT.getVectorNumElements() * 2);
  SDValue Widened = DAG.getNode(ISD::CONCAT_VECTORS, DL, WideVT, Val,
                                DAG.getUNDEF(StoreVT));

  if (!Subtarget.hasSSE2()) {
    assert(Subtarget.hasSSE1() && "Expected SSE");
    SDVTList Tys = DAG.getVTList(MVT::Other);
    SDValue Ops[] = {Store->getChain(), Widened, Store->getBasePtr()};
    return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, DL, Tys, Ops,
                                   MVT::i64, Store->getMemOperand());
  }

  // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
  // and store it.
  const bool UseI64 = Subtarget.is64Bit() && StoreVT.isInteger();
  const MVT EltVT = UseI64 ? MVT::i64 : MVT::f64;
  const MVT CastVT = MVT::getVectorVT(EltVT, 2);
  SDValue Cast = DAG.getBitcast(CastVT, Widened);
  SDValue Low64 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Cast,
                              DAG.getIntPtrConstant(0, DL));
  return DAG.getStore(Store->getChain(), DL, Low64, Store->getBasePtr(),
                      Store->getPointerInfo(), Store->getAlignment(),
                      Store->getMemOperand()->getFlags());
}
// Lower vector extended loads using a shuffle. If SSSE3 is not available we
// may emit an illegal shuffle but the expansion is still better than scalar
// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
// we'll emit a shuffle and an arithmetic shift.
// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
// TODO: It is possible to support ZExt by zeroing the undef values during
// the shuffle phase or after the shuffle.
//
// Every path returns a two-result merge value: the loaded vector and the
// chain of the replacement load(s).
static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT RegVT = Op.getSimpleValueType();
assert(RegVT.isVector() && "We only custom lower vector loads.");
assert(RegVT.isInteger() &&
"We only custom lower integer vector loads.");
LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
SDLoc dl(Ld);
EVT MemVT = Ld->getMemoryVT();
// Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
if (RegVT.getVectorElementType() == MVT::i1) {
assert(EVT(RegVT) == MemVT && "Expected non-extending load");
assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
"Expected AVX512F without AVX512DQI");
// Load the mask bits as a single i8.
SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(), Ld->getAlignment(),
Ld->getMemOperand()->getFlags());
// The new load's chain (result 1) is returned together with the value by
// getMergeValues below.
assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
// Any-extend to i16, reinterpret as v16i1 and take the low RegVT lanes.
SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
DAG.getBitcast(MVT::v16i1, Val),
DAG.getIntPtrConstant(0, dl));
return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
}
// Nothing useful we can do without SSE2 shuffles.
assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned RegSz = RegVT.getSizeInBits();
ISD::LoadExtType Ext = Ld->getExtensionType();
assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
&& "Only anyext and sext are currently implemented.");
assert(MemVT != RegVT && "Cannot extend to the same type");
assert(MemVT.isVector() && "Must load a vector from memory");
unsigned NumElems = RegVT.getVectorNumElements();
unsigned MemSz = MemVT.getSizeInBits();
assert(RegSz > MemSz && "Register size must be greater than the mem size");
if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
// The only way in which we have a legal 256-bit vector result but not the
// integer 256-bit operations needed to directly lower a sextload is if we
// have AVX1 but not AVX2. In that case, we can always emit a sextload to
// a 128-bit vector and a normal sign_extend to 256-bits that should get
// correctly legalized. We do this late to allow the canonical form of
// sextload to persist throughout the rest of the DAG combiner -- it wants
// to fold together any extensions it can, and so will fuse a sign_extend
// of an sextload into a sextload targeting a wider value.
SDValue Load;
if (MemSz == 128) {
// Just switch this to a normal load.
assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
"it must be a legal 128-bit vector "
"type!");
Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(), Ld->getAlignment(),
Ld->getMemOperand()->getFlags());
} else {
assert(MemSz < 128 &&
"Can't extend a type wider than 128 bits to a 256 bit vector!");
// Do an sext load to a 128-bit vector type. We want to use the same
// number of elements, but elements half as wide. This will end up being
// recursively lowered by this routine, but will succeed as we definitely
// have all the necessary features if we're using AVX1.
EVT HalfEltVT =
EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
Load =
DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
Ld->getMemOperand()->getFlags());
}
// The new load's chain (result 1) is returned together with the extended
// value by getMergeValues below.
assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
// Finally, do a normal sign-extend to the desired register.
SDValue SExt = DAG.getSExtOrTrunc(Load, dl, RegVT);
return DAG.getMergeValues({SExt, Load.getValue(1)}, dl);
}
// All sizes must be a power of two.
assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
"Non-power-of-two elements are not custom lowered!");
// Attempt to load the original value using scalar loads.
// Find the largest scalar type that divides the total loaded size.
// The loop keeps overwriting SclrLoadTy, so the last legal integer type
// whose size divides MemSz wins.
MVT SclrLoadTy = MVT::i8;
for (MVT Tp : MVT::integer_valuetypes()) {
if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
SclrLoadTy = Tp;
}
}
// On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
(64 <= MemSz))
SclrLoadTy = MVT::f64;
// Calculate the number of scalar loads that we need to perform
// in order to load our vector from memory.
unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
"Can only lower sext loads with a single scalar load!");
// Size in bits of the intermediate vector the scalars are assembled into.
unsigned loadRegSize = RegSz;
if (Ext == ISD::SEXTLOAD && RegSz >= 256)
loadRegSize = 128;
// If we don't have BWI we won't be able to create the shuffle needed for
// v8i8->v8i64.
if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
MemVT == MVT::v8i8)
loadRegSize = 128;
// Represent our vector as a sequence of elements which are the
// largest scalar that we can load.
EVT LoadUnitVecVT = EVT::getVectorVT(
*DAG.getContext(), SclrLoadTy, loadRegSize / SclrLoadTy.getSizeInBits());
// Represent the data using the same element type that is stored in
// memory. In practice, we ''widen'' MemVT.
EVT WideVecVT =
EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
loadRegSize / MemVT.getScalarSizeInBits());
assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
"Invalid vector type");
// We can't shuffle using an illegal type.
assert(TLI.isTypeLegal(WideVecVT) &&
"We only lower types that form legal widened vector types");
SmallVector<SDValue, 8> Chains;
SDValue Ptr = Ld->getBasePtr();
// Byte stride between consecutive scalar loads.
unsigned OffsetInc = SclrLoadTy.getSizeInBits() / 8;
SDValue Increment = DAG.getConstant(OffsetInc, dl,
TLI.getPointerTy(DAG.getDataLayout()));
SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
unsigned Offset = 0;
for (unsigned i = 0; i < NumLoads; ++i) {
unsigned NewAlign = MinAlign(Ld->getAlignment(), Offset);
// Perform a single load.
SDValue ScalarLoad =
DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr,
Ld->getPointerInfo().getWithOffset(Offset),
NewAlign, Ld->getMemOperand()->getFlags());
Chains.push_back(ScalarLoad.getValue(1));
// Create the first element type using SCALAR_TO_VECTOR in order to avoid
// another round of DAGCombining.
if (i == 0)
Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
else
Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
ScalarLoad, DAG.getIntPtrConstant(i, dl));
// Advance the address and the running byte offset to the next scalar.
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
Offset += OffsetInc;
}
// Join the chains of all scalar loads; this is the chain result returned
// to the caller.
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
// Bitcast the loaded value to a vector of the original element type, in
// the size of the target vector type.
SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
// Number of memory-sized elements that fit in one register-sized element.
unsigned SizeRatio = RegSz / MemSz;
if (Ext == ISD::SEXTLOAD) {
SDValue Sext = getExtendInVec(ISD::SIGN_EXTEND, dl, RegVT, SlicedVec, DAG);
return DAG.getMergeValues({Sext, TF}, dl);
}
if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
MemVT == MVT::v8i8) {
// NOTE: despite its name, Sext holds a ZERO_EXTEND here; the any-extending
// v8i8->v8i64 load without BWI is lowered as a zero extension.
SDValue Sext = getExtendInVec(ISD::ZERO_EXTEND, dl, RegVT, SlicedVec, DAG);
return DAG.getMergeValues({Sext, TF}, dl);
}
// Redistribute the loaded elements into the different locations.
// Element i goes to lane i * SizeRatio; every other lane stays undef (-1).
SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i * SizeRatio] = i;
SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
DAG.getUNDEF(WideVecVT), ShuffleVec);
// Bitcast to the requested type.
Shuff = DAG.getBitcast(RegVT, Shuff);
return DAG.getMergeValues({Shuff, TF}, dl);
}
/// Check whether \p Op is an ISD::AND or ISD::OR whose two operands are both
/// X86ISD::SETCC nodes with a single use each.
///
/// \p Opc always receives Op's opcode, including when the result is false.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
  Opc = Op.getOpcode();
  const bool IsAndOrOr = (Opc == ISD::AND) || (Opc == ISD::OR);
  if (!IsAndOrOr)
    return false;

  auto IsSingleUseSetCC = [](SDValue V) {
    return V.getOpcode() == X86ISD::SETCC && V.hasOneUse();
  };
  return IsSingleUseSetCC(Op.getOperand(0)) &&
         IsSingleUseSetCC(Op.getOperand(1));
}
/// Check whether \p Op is an ISD::XOR whose second operand is the constant 1
/// and whose first operand is an X86ISD::SETCC with a single use.
static bool isXor1OfSetCC(SDValue Op) {
  if (Op.getOpcode() != ISD::XOR || !isOneConstant(Op.getOperand(1)))
    return false;

  SDValue Inner = Op.getOperand(0);
  return Inner.getOpcode() == X86ISD::SETCC && Inner.hasOneUse();
}
SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
bool addTest = true;
SDValue Chain = Op.getOperand(0);
SDValue Cond = Op.getOperand(1);
SDValue Dest = Op.getOperand(2);
SDLoc dl(Op);
SDValue CC;
bool Inverted = false;
if (Cond.getOpcode() == ISD::SETCC) {
// Check for setcc([su]{add,sub,mul}o == 0).
if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
isNullConstant(Cond.getOperand(1)) &&
Cond.getOperand(0).getResNo() == 1 &&
(Cond.getOperand(0).getOpcode() == ISD::SADDO ||
Cond.getOperand(0).getOpcode() == ISD::UADDO ||
Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
Cond.getOperand(0).getOpcode() == ISD::USUBO ||
Cond.getOperand(0).getOpcode() == ISD::SMULO ||
Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
Inverted = true;
Cond = Cond.getOperand(0);
} else {
if (SDValue NewCond = LowerSETCC(Cond, DAG))
Cond = NewCond;
}
}
#if 0
// FIXME: LowerXALUO doesn't handle these!!
else if (Cond.getOpcode() == X86ISD::ADD ||
Cond.getOpcode() == X86ISD::SUB ||
Cond.getOpcode() == X86ISD::SMUL ||
Cond.getOpcode() == X86ISD::UMUL)
Cond = LowerXALUO(Cond, DAG);
#endif
// Look past (and (setcc_carry (cmp ...)), 1).
if (Cond.getOpcode() == ISD::AND &&
Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
isOneConstant(Cond.getOperand(1)))
Cond = Cond.getOperand(0);
// If condition flag is set by a X86ISD::CMP, then use it as the condition
// setting operand in place of the X86ISD::SETCC.
unsigned CondOpcode = Cond.getOpcode();
if (CondOpcode == X86ISD::SETCC ||
CondOpcode == X86ISD::SETCC_CARRY) {
CC = Cond.getOperand(0);
SDValue Cmp = Cond.getOperand(1);
unsigned Opc = Cmp.getOpcode();
// FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
Cond = Cmp;
addTest = false;
} else {
switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
default: break;
case X86::COND_O:
case X86::COND_B:
// These can only come from an arithmetic instruction with overflow,
// e.g. SADDO, UADDO.
Cond = Cond.getOperand(1);
addTest = false;
break;
}
}
}
CondOpcode = Cond.getOpcode();
if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
SDValue Value;
X86::CondCode X86Cond;
std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
if (Inverted)
X86Cond = X86::GetOppositeBranchCondition(X86Cond);
CC = DAG.getConstant(X86Cond, dl, MVT::i8);
addTest = false;
} else {
unsigned CondOpc;
if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
SDValue Cmp = Cond.getOperand(0).getOperand(1);
if (CondOpc == ISD::OR) {
// Also, recognize the pattern generated by an FCMP_UNE. We can emit
// two branches instead of an explicit OR instruction with a
// separate test.
if (Cmp == Cond.getOperand(1).getOperand(1) &&
isX86LogicalCmp(Cmp)) {
CC = Cond.getOperand(0).getOperand(0);
Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cmp);
CC = Cond.getOperand(1).getOperand(0);
Cond = Cmp;
addTest = false;
}
} else { // ISD::AND
// Also, recognize the pattern generated by an FCMP_OEQ. We can emit
// two branches instead of an explicit AND instruction with a
// separate test. However, we only do this if this block doesn't
// have a fall-through edge, because this requires an explicit
// jmp when the condition is false.
if (Cmp == Cond.getOperand(1).getOperand(1) &&
isX86LogicalCmp(Cmp) &&
Op.getNode()->hasOneUse()) {
X86::CondCode CCode =
(X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
CCode = X86::GetOppositeBranchCondition(CCode);
CC = DAG.getConstant(CCode, dl, MVT::i8);
SDNode *User = *Op.getNode()->use_begin();
// Look for an unconditional branch following this conditional branch.
// We need this because we need to reverse the successors in order
// to implement FCMP_OEQ.
if (User->getOpcode() == ISD::BR) {
SDValue FalseBB = User->getOperand(1);
SDNode *NewBR =
DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
assert(NewBR == User);
(void)NewBR;
Dest = FalseBB;
Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cmp);
X86::CondCode CCode =
(X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
CCode = X86::GetOppositeBranchCondition(CCode);
CC = DAG.getConstant(CCode, dl, MVT::i8);
Cond = Cmp;
addTest = false;
}
}
}
} else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
// Recognize for xorb (setcc), 1 patterns. The xor inverts the condition.
// It should be transformed during dag combiner except when the condition
// is set by a arithmetics with overflow node.
X86::CondCode CCode =
(X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
CCode = X86::GetOppositeBranchCondition(CCode);
CC = DAG.getConstant(CCode, dl, MVT::i8);
Cond = Cond.getOperand(0).getOperand(1);
addTest = false;
} else if (Cond.getOpcode() == ISD::SETCC &&
cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
// For FCMP_OEQ, we can emit
// two branches instead of an explicit AND instruction with a
// separate test. However, we only do this if this block doesn't
// have a fall-through edge, because this requires an explicit
// jmp when the condition is false.
if (Op.getNode()->hasOneUse()) {
SDNode *User = *Op.getNode()->use_begin();
// Look for an unconditional branch following this conditional branch.
// We need this because we need to reverse the successors in order
// to implement FCMP_OEQ.
if (User->getOpcode() == ISD::BR) {
SDValue FalseBB = User->getOperand(1);
SDNode *NewBR =
DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
assert(NewBR == User);
(void)NewBR;
Dest = FalseBB;
SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
Cond.getOperand(0), Cond.getOperand(1));
Cmp = ConvertCmpIfNecessary(Cmp, DAG);
CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cmp);
CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
Cond = Cmp;
addTest = false;
}
}
} else if (Cond.getOpcode() == ISD::SETCC &&
cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
// For FCMP_UNE, we can emit
// two branches instead of an explicit OR instruction with a
// separate test.
SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
Cond.getOperand(0), Cond.getOperand(1));
Cmp = ConvertCmpIfNecessary(Cmp, DAG);
CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cmp);
CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
Cond = Cmp;
addTest = false;
}
}
if (addTest) {
// Look pass the truncate if the high bits are known zero.
if (isTruncWithZeroHighBitsInput(Cond, DAG))
Cond = Cond.getOperand(0);
// We know the result of AND is compared against zero. Try to match
// it to BT.
if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
SDValue BTCC;
if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, dl, DAG, BTCC)) {
CC = BTCC;
Cond = BT;
addTest = false;
}
}
}
if (addTest) {
X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
CC = DAG.getConstant(X86Cond, dl, MVT::i8);
Cond = EmitCmp(Cond, DAG.getConstant(0, dl, Cond.getValueType()),
X86Cond, dl, DAG);
}
Cond = ConvertCmpIfNecessary(Cond, DAG);
return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cond);
}
// Lower ISD::DYNAMIC_STACKALLOC using one of three strategies:
//   1. Plain stack-pointer arithmetic, when no special handling is required.
//   2. X86ISD::SEG_ALLOCA, when the function uses split stacks.
//   3. X86ISD::WIN_ALLOCA, on non-MachO Windows targets or when a stack probe
//      symbol is configured for the function.
//
// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
// Calls to _alloca are needed to probe the stack when allocating more than 4k
// bytes in one go. Touching the stack at 4K increments is necessary to ensure
// that the guard pages used by the OS virtual memory manager are allocated in
// correct sequence.
//
// Op's operands are (chain, size, constant alignment). The returned value is
// the pair (resulting pointer, output chain) built with getMergeValues.
SDValue
X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool SplitStack = MF.shouldSplitStack();
bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
// "Lower" means one of the two special node kinds below is required; when it
// is false the allocation is expanded into direct stack-pointer arithmetic.
bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
SplitStack || EmitStackProbe;
SDLoc dl(Op);
// Get the inputs.
SDNode *Node = Op.getNode();
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
EVT VT = Node->getValueType(0);
// Chain the dynamic stack allocation so that it doesn't modify the stack
// pointer when other instructions are using the stack.
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
bool Is64Bit = Subtarget.is64Bit();
MVT SPTy = getPointerTy(DAG.getDataLayout());
SDValue Result;
if (!Lower) {
// Strategy 1: read SP, subtract the size, optionally align, write SP back.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
" not tell us which reg is the stack pointer!");
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
Chain = SP.getValue(1);
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
unsigned StackAlign = TFI.getStackAlignment();
Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
// Only mask the result when the requested alignment exceeds the stack
// alignment reported by the frame lowering.
if (Align > StackAlign)
Result = DAG.getNode(ISD::AND, dl, VT, Result,
DAG.getConstant(-(uint64_t)Align, dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
} else if (SplitStack) {
// Strategy 2: hand the size to a SEG_ALLOCA node through a fresh virtual
// register of the pointer register class.
MachineRegisterInfo &MRI = MF.getRegInfo();
if (Is64Bit) {
// The 64 bit implementation of segmented stacks needs to clobber both r10
// r11. This makes it impossible to use it along with nested parameters.
const Function &F = MF.getFunction();
for (const auto &A : F.args()) {
if (A.hasNestAttr())
report_fatal_error("Cannot use segmented stacks with functions that "
"have nested arguments.");
}
}
const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
DAG.getRegister(Vreg, SPTy));
} else {
// Strategy 3: emit WIN_ALLOCA (producing a chain and glue), record that the
// function contains one, then read the stack pointer back as the result.
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
unsigned SPReg = RegInfo->getStackRegister();
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
Chain = SP.getValue(1);
// A non-zero alignment request masks the stack pointer and writes the
// aligned value back to the stack register.
if (Align) {
SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
DAG.getConstant(-(uint64_t)Align, dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
}
Result = SP;
}
// Close the call sequence opened above (both adjustment amounts are zero).
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
SDValue Ops[2] = {Result, Chain};
return DAG.getMergeValues(Ops, dl);
}
// Lower ISD::VASTART.
//
// Op's operands are (chain, pointer to the va_list object, SrcValue).
// For 32-bit targets and Win64 calling conventions a single pointer is stored.
// Otherwise the four fields of the __va_list_tag structure are stored and the
// resulting stores are joined with a TokenFactor, which is returned.
SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
auto PtrVT = getPointerTy(MF.getDataLayout());
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
SDLoc DL(Op);
if (!Subtarget.is64Bit() ||
Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
// vastart just stores the address of the VarArgsFrameIndex slot into the
// memory location argument.
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
MachinePointerInfo(SV));
}
// __va_list_tag:
// gp_offset (0 - 6 * 8)
// fp_offset (48 - 48 + 8 * 16)
// overflow_arg_area (point to parameters coming in memory).
// reg_save_area
//
// Every store below uses the incoming chain (operand 0) rather than the
// previous store's chain; they are collected in MemOps and merged at the end.
// FIN is the running destination address inside the va_list object.
SmallVector<SDValue, 8> MemOps;
SDValue FIN = Op.getOperand(1);
// Store gp_offset
// (an i32 at byte offset 0 of the va_list).
SDValue Store = DAG.getStore(
Op.getOperand(0), DL,
DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
MachinePointerInfo(SV));
MemOps.push_back(Store);
// Store fp_offset
// (an i32 at byte offset 4).
FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
Store = DAG.getStore(
Op.getOperand(0), DL,
DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
MachinePointerInfo(SV, 4));
MemOps.push_back(Store);
// Store ptr to overflow_arg_area
// (the VarArgsFrameIndex address, at byte offset 8).
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
Store =
DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
MemOps.push_back(Store);
// Store ptr to reg_save_area.
// The previous field is a pointer, so this one sits 8 bytes further on LP64
// (offset 16) and 4 bytes further otherwise (offset 12).
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
Store = DAG.getStore(
Op.getOperand(0), DL, RSFIN, FIN,
MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
MemOps.push_back(Store);
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
// Lower ISD::VAARG for 64-bit targets.
//
// Op's operands are (chain, pointer to the va_list, SrcValue, alignment).
// Win64 calling conventions use the generic expansion. Otherwise an
// X86ISD::VAARG_64 node is emitted that yields the address of the next
// argument, and the argument itself is then loaded from that address.
SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
assert(Subtarget.is64Bit() &&
"LowerVAARG only handles 64-bit va_arg!");
assert(Op.getNumOperands() == 4);
MachineFunction &MF = DAG.getMachineFunction();
if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
// The Win64 ABI uses char* instead of a structure.
return DAG.expandVAArg(Op.getNode());
SDValue Chain = Op.getOperand(0);
SDValue SrcPtr = Op.getOperand(1);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
unsigned Align = Op.getConstantOperandVal(3);
SDLoc dl(Op);
EVT ArgVT = Op.getNode()->getValueType(0);
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
// ArgMode selects the va_list offset field to use: 1 = gp_offset,
// 2 = fp_offset. It is passed to VAARG_64 as an i8 constant.
uint8_t ArgMode;
// Decide which area this value should be read from.
// TODO: Implement the AMD64 ABI in its entirety. This simple
// selection mechanism works only for the basic types.
if (ArgVT == MVT::f80) {
llvm_unreachable("va_arg for f80 not yet implemented");
} else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
} else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
} else {
llvm_unreachable("Unhandled argument type in LowerVAARG");
}
if (ArgMode == 2) {
// Sanity Check: Make sure using fp_offset makes sense.
assert(!Subtarget.useSoftFloat() &&
!(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
Subtarget.hasSSE1());
}
// Insert VAARG_64 node into the DAG
// VAARG_64 returns two values: Variable Argument Address, Chain
// Its operands are (chain, va_list pointer, argument size, argument mode,
// alignment).
SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
DAG.getConstant(ArgMode, dl, MVT::i8),
DAG.getConstant(Align, dl, MVT::i32)};
SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
// The node is flagged as both loading from and storing to the va_list
// memory described by SV.
SDValue VAARG = DAG.getMemIntrinsicNode(
X86ISD::VAARG_64, dl,
VTs, InstOps, MVT::i64,
MachinePointerInfo(SV),
/*Align=*/0,
MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
Chain = VAARG.getValue(1);
// Load the next argument and return it
return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
}
/// Lower ISD::VACOPY on x86-64.
///
/// Op's operands are (chain, destination va_list pointer, source va_list
/// pointer, destination SrcValue, source SrcValue).
///
/// Functions using a Win64 calling convention have a plain pointer as their
/// va_list and are handled by the generic expansion. Otherwise the va_list is
/// the struct { i32, i32, i8*, i8* } and is copied with a single memcpy whose
/// size and alignment follow the target's pointer width.
///
/// \returns the chain produced by the expansion or by the memcpy.
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
  // where a va_list is still an i8*.
  assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
  if (Subtarget.isCallingConvWin64(
          DAG.getMachineFunction().getFunction().getCallingConv()))
    // Probably a Win64 va_copy.
    return DAG.expandVACopy(Op.getNode());

  SDValue Chain = Op.getOperand(0);
  SDValue DstPtr = Op.getOperand(1);
  SDValue SrcPtr = Op.getOperand(2);
  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
  SDLoc DL(Op);

  // The struct's size depends on the pointer width. With 8-byte pointers
  // (LP64) it is 4 + 4 + 8 + 8 = 24 bytes, 8-byte aligned. With 4-byte
  // pointers (ILP32 on a 64-bit subtarget) it is 4 + 4 + 4 + 4 = 16 bytes,
  // 4-byte aligned. This matches LowerVASTART, which places reg_save_area at
  // offset 16 or 12 using the same predicate. Copying 24 bytes unconditionally
  // would read and write 8 bytes beyond a 16-byte va_list.
  const bool IsLP64 = Subtarget.isTarget64BitLP64();
  const uint64_t VAListSize = IsLP64 ? 24 : 16;
  const unsigned VAListAlign = IsLP64 ? 8 : 4;

  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
                       DAG.getIntPtrConstant(VAListSize, DL), VAListAlign,
                       /*isVolatile*/false,
                       false, false,
                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
}
/// Map any shift opcode (generic ISD shift, or the X86 uniform-shift node in
/// either form) onto the X86 uniform-shift opcode of the same direction.
/// \p IsVariable picks the variable-amount form (VSHL/VSRL/VSRA); otherwise
/// the immediate form (VSHLI/VSRLI/VSRAI) is returned.
static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
  unsigned VariableOpc;
  unsigned ImmediateOpc;

  // First classify the direction of the shift, then pick the requested form.
  switch (Opc) {
  default:
    llvm_unreachable("Unknown target vector shift node");
  case ISD::SHL:
  case X86ISD::VSHL:
  case X86ISD::VSHLI:
    VariableOpc = X86ISD::VSHL;
    ImmediateOpc = X86ISD::VSHLI;
    break;
  case ISD::SRL:
  case X86ISD::VSRL:
  case X86ISD::VSRLI:
    VariableOpc = X86ISD::VSRL;
    ImmediateOpc = X86ISD::VSRLI;
    break;
  case ISD::SRA:
  case X86ISD::VSRA:
  case X86ISD::VSRAI:
    VariableOpc = X86ISD::VSRA;
    ImmediateOpc = X86ISD::VSRAI;
    break;
  }

  if (IsVariable)
    return VariableOpc;
  return ImmediateOpc;
}
/// Build a vector shift of \p SrcOp by the compile-time constant \p ShiftAmt.
///
/// \p Opc must be one of the immediate shift opcodes (VSHLI, VSRLI, VSRAI).
/// The source is first bitcast to \p VT when its type differs. The shift is
/// simplified where possible:
///  - a zero amount returns the (possibly bitcast) source unchanged;
///  - an amount of at least the element width yields zero for logical shifts
///    and is clamped to width-1 for the arithmetic right shift;
///  - a build vector of constants/undefs is folded element by element.
/// Otherwise a node with an i8 constant shift amount is emitted.
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
                                          SDValue SrcOp, uint64_t ShiftAmt,
                                          SelectionDAG &DAG) {
  const MVT EltVT = VT.getVectorElementType();

  // Bring the source to the result type; this matters mostly for vXi8/vXi64
  // shifts.
  if (SrcOp.getSimpleValueType() != VT)
    SrcOp = DAG.getBitcast(VT, SrcOp);

  // Shifting by nothing leaves the operand as is.
  if (ShiftAmt == 0)
    return SrcOp;

  // Out-of-range amounts: logical shifts produce zero, the arithmetic right
  // shift behaves like a shift by (element width - 1).
  const unsigned EltBits = EltVT.getSizeInBits();
  if (ShiftAmt >= EltBits) {
    if (Opc != X86ISD::VSRAI)
      return DAG.getConstant(0, dl, VT);
    ShiftAmt = EltBits - 1;
  }

  assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
         && "Unknown target vector shift-by-constant node");

  // Without an all-constant source there is nothing to fold; emit the node.
  if (!ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode()))
    return DAG.getNode(Opc, dl, VT, SrcOp,
                       DAG.getConstant(ShiftAmt, dl, MVT::i8));

  // Applies the shift selected by Opc to a single constant element.
  auto ShiftElement = [Opc, ShiftAmt](const APInt &C) -> APInt {
    switch (Opc) {
    case X86ISD::VSHLI:
      return C.shl(ShiftAmt);
    case X86ISD::VSRLI:
      return C.lshr(ShiftAmt);
    case X86ISD::VSRAI:
      return C.ashr(ShiftAmt);
    default:
      llvm_unreachable("Unknown opcode!");
    }
  };

  // Fold the shift into every constant element; undef elements stay undef.
  SmallVector<SDValue, 8> FoldedElts;
  for (SDValue Elt : SrcOp->op_values()) {
    if (Elt->isUndef()) {
      FoldedElts.push_back(Elt);
      continue;
    }
    const APInt &EltVal = cast<ConstantSDNode>(Elt)->getAPIntValue();
    FoldedElts.push_back(DAG.getConstant(ShiftElement(EltVal), dl, EltVT));
  }
  return DAG.getBuildVector(VT, dl, FoldedElts);
}
/// Handle vector element shifts where the shift amount may or may not be a
/// constant. Takes immediate version of shift as input.
///
/// \p Opc is converted to its variable-amount form when \p ShAmt is not a
/// constant. \p ShAmt must be an i32 or i64 scalar; it is placed in the low
/// element of a 128-bit vector, which is then bitcast to a 128-bit vector
/// with the same element type as \p VT and used as the shift-amount operand.
/// A constant \p ShAmt is forwarded to getTargetVShiftByConstNode instead.
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
SDValue SrcOp, SDValue ShAmt,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT SVT = ShAmt.getSimpleValueType();
assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
// Catch shift-by-constant.
if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
CShAmt->getZExtValue(), DAG);
// Change opcode to non-immediate version.
Opc = getTargetVShiftUniformOpcode(Opc, true);
// Need to build a vector containing shift amount.
// SSE/AVX packed shifts only use the lower 64-bit of the shift count.
// +====================+============+=======================================+
// | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
// +====================+============+=======================================+
// | i64 | Yes, No | Use ShAmt as lowest elt |
// | i32 | Yes | zero-extend in-reg |
// | (i32 zext(i16/i8)) | Yes | zero-extend in-reg |
// | (i32 zext(i16/i8)) | No | byte-shift-in-reg |
// | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) |
// +====================+============+=======================================+
// Case: i64 amount -- it becomes the low element of a v2i64.
if (SVT == MVT::i64)
ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
// Case: the amount is a zero-extension of an i8/i16 element extracted from a
// vector. The zero-extend is stripped and redone on the vector form.
else if (ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
ShAmt.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
(ShAmt.getOperand(0).getSimpleValueType() == MVT::i16 ||
ShAmt.getOperand(0).getSimpleValueType() == MVT::i8)) {
ShAmt = ShAmt.getOperand(0);
MVT AmtTy = ShAmt.getSimpleValueType() == MVT::i8 ? MVT::v16i8 : MVT::v8i16;
ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), AmtTy, ShAmt);
if (Subtarget.hasSSE41())
ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
MVT::v2i64, ShAmt);
else {
// Without SSE4.1: shift the whole register left and then right by the
// same number of bytes (register width minus one element).
// NOTE(review): presumably this zeroes everything above the low element;
// confirm against the VSHLDQ/VSRLDQ node semantics.
SDValue ByteShift = DAG.getConstant(
(128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
ByteShift);
ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
ByteShift);
}
// Case: SSE4.1 and the amount is an element extracted from a vector -- put
// it in a v4i32 and zero-extend the low element in-register to 64 bits.
} else if (Subtarget.hasSSE41() &&
ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
MVT::v2i64, ShAmt);
// Fallback: build (ShAmt, 0, undef, undef) so the low 64 bits hold the
// amount followed by an explicit zero.
} else {
SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT),
DAG.getUNDEF(SVT)};
ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
}
// The return type has to be a 128-bit type with the same element
// type as the input type.
MVT EltVT = VT.getVectorElementType();
MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
ShAmt = DAG.getBitcast(ShVT, ShAmt);
return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
}
/// Convert the integer mask operand \p Mask of a masking intrinsic into a
/// value of the vXi1 type \p MaskVT.
///
/// All-ones and zero masks fold straight to constants of \p MaskVT. An i64
/// mask on a 32-bit subtarget is split into two i32 halves that are bitcast to
/// v32i1 and concatenated. Any other mask is bitcast to a vXi1 vector as wide
/// as the integer, from which the low \p MaskVT elements are extracted.
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
                           const SDLoc &dl) {
  // Constant masks need no conversion nodes at all.
  if (isAllOnesConstant(Mask))
    return DAG.getConstant(1, dl, MaskVT);
  if (X86::isZeroNode(Mask))
    return DAG.getConstant(0, dl, MaskVT);

  const MVT MaskIntVT = Mask.getSimpleValueType();
  assert(MaskVT.bitsLE(MaskIntVT) && "Unexpected mask size!");

  const bool MustSplitI64 = MaskIntVT == MVT::i64 && Subtarget.is32Bit();
  if (!MustSplitI64) {
    // Reinterpret the integer as one i1 per bit, then keep the low elements.
    // When MaskVT is v2i1 or v4i1 this extracts the low 2 or 4 elements.
    MVT WideMaskVT = MVT::getVectorVT(MVT::i1, MaskIntVT.getSizeInBits());
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
                       DAG.getBitcast(WideMaskVT, Mask),
                       DAG.getIntPtrConstant(0, dl));
  }

  // In 32-bit mode a bitcast of i64 is illegal: split the mask into its two
  // i32 halves, convert each half, and glue the halves back together.
  assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
  assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
  SDValue LoHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
                               DAG.getConstant(0, dl, MVT::i32));
  SDValue HiHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
                               DAG.getConstant(1, dl, MVT::i32));
  LoHalf = DAG.getBitcast(MVT::v32i1, LoHalf);
  HiHalf = DAG.getBitcast(MVT::v32i1, HiHalf);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, LoHalf, HiHalf);
}
/// Apply the write-mask \p Mask to the vector result \p Op when lowering a
/// masking intrinsic.
///
/// \returns \p Op itself when \p Mask is the all-ones constant; otherwise
/// (vselect VMask, \p Op, \p PreservedSrc), where VMask is \p Mask converted
/// by getMaskNode to a vXi1 vector with one element per element of \p Op. An
/// undef \p PreservedSrc is replaced by a zero vector.
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
                                    SDValue PreservedSrc,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {
  const MVT ResultVT = Op.getSimpleValueType();
  const MVT MaskVT =
      MVT::getVectorVT(MVT::i1, ResultVT.getVectorNumElements());
  SDLoc dl(Op);

  // Every lane is enabled, so the unmasked result is already the answer.
  if (isAllOnesConstant(Mask))
    return Op;

  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

  // Disabled lanes take the pass-through value, or zero when none was given.
  SDValue Fallback = PreservedSrc.isUndef()
                         ? getZeroVector(ResultVT, Subtarget, DAG, dl)
                         : PreservedSrc;
  return DAG.getNode(ISD::VSELECT, dl, ResultVT, VMask, Op, Fallback);
}
/// Apply the write-mask \p Mask to the result \p Op of a scalar operation
/// when lowering a masking intrinsic.
///
/// The mask arrives as an i8 and is converted to v1i1 (element 0 of the i8
/// viewed as v8i1). A constant mask whose low bit is set returns \p Op
/// unchanged. For FSETCCM, FSETCCM_SAE and VFPCLASSS the result is
/// (and \p Op, IMask). In every other case the result is
/// (X86ISD::SELECTS IMask, \p Op, \p PreservedSrc), with an undef
/// \p PreservedSrc replaced by a zero vector.
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
                                    SDValue PreservedSrc,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {
  // Only bit 0 of the mask matters; if it is known to be set, skip masking.
  const auto *ConstMask = dyn_cast<ConstantSDNode>(Mask);
  if (ConstMask && (ConstMask->getZExtValue() & 0x1) != 0)
    return Op;

  const MVT ResultVT = Op.getSimpleValueType();
  SDLoc dl(Op);

  assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
  SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
                              DAG.getBitcast(MVT::v8i1, Mask),
                              DAG.getIntPtrConstant(0, dl));

  // These opcodes are combined with the mask using AND rather than a select.
  switch (Op.getOpcode()) {
  case X86ISD::FSETCCM:
  case X86ISD::FSETCCM_SAE:
  case X86ISD::VFPCLASSS:
    return DAG.getNode(ISD::AND, dl, ResultVT, Op, IMask);
  default:
    break;
  }

  SDValue Fallback = PreservedSrc.isUndef()
                         ? getZeroVector(ResultVT, Subtarget, DAG, dl)
                         : PreservedSrc;
  return DAG.getNode(X86ISD::SELECTS, dl, ResultVT, IMask, Op, Fallback);
}
static int getSEHRegistrationNodeSize(const Function *Fn) {
if (!Fn->hasPersonalityFn())
report_fatal_error(
"querying registration node size for function without personality");
// The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
// WinEHStatePass for the full struct definition.
switch (classifyEHPersonality(Fn->getPersonalityFn())) {
case EHPersonality::MSVC_X86SEH: return 24;
case EHPersonality::MSVC_CXX: return 16;
default: break;
}
report_fatal_error(
"can only recover FP for 32-bit MSVC EH personality functions");
}
/// Recover the parent function's frame pointer from the EBP value that the
/// MSVC runtime hands us, either on entry to an outlined function or when
/// returning to a parent frame after catching an exception.
///
/// On 32-bit targets:
///   RegNodeBase = EntryEBP - RegNodeSize
///   ParentFP    = RegNodeBase - ParentFrameOffset
/// On 64-bit targets:
///   ParentFP    = EntryEBP + ParentFrameOffset
/// ParentFrameOffset is a LOCAL_RECOVER of the per-function parent frame
/// offset symbol. If \p Fn has no personality function, \p EntryEBP is
/// returned unchanged.
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
                                   SDValue EntryEBP) {
  // The parent may have lost its personality function if the exceptional code
  // was optimized away; in that case the incoming EBP is returned as is.
  if (!Fn->hasPersonalityFn())
    return EntryEBP;

  MachineFunction &MF = DAG.getMachineFunction();
  SDLoc dl;
  const MVT PtrVT =
      DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());

  // Look up (or create) the symbol that will resolve to the parent frame
  // offset for this function, and wrap it in a LOCAL_RECOVER node.
  MCSymbol *OffsetSym =
      MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
          GlobalValue::dropLLVMManglingEscape(Fn->getName()));
  SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
  SDValue ParentFrameOffset =
      DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);

  const auto &Subtarget =
      static_cast<const X86Subtarget &>(DAG.getSubtarget());

  // x64: adjust from RSP after the prologue to RBP in the parent function.
  if (Subtarget.is64Bit())
    return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);

  // x86: step back over the registration node, then remove the frame offset.
  SDValue RegNodeSizeVal =
      DAG.getConstant(getSEHRegistrationNodeSize(Fn), dl, PtrVT);
  SDValue RegNodeBase =
      DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP, RegNodeSizeVal);
  return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
}
SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
// Helper to detect if the operand is CUR_DIRECTION rounding mode.
auto isRoundModeCurDirection = [](SDValue Rnd) {
if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
return C->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
return false;
};
auto isRoundModeSAE = [](SDValue Rnd) {
if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
return C->getZExtValue() == X86::STATIC_ROUNDING::NO_EXC;
return false;
};
auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
RC = C->getZExtValue();
if (RC & X86::STATIC_ROUNDING::NO_EXC) {
// Clear the NO_EXC bit and check remaining bits.
RC ^= X86::STATIC_ROUNDING::NO_EXC;
return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
RC == X86::STATIC_ROUNDING::TO_POS_INF ||
RC == X86::STATIC_ROUNDING::TO_ZERO;
}
}
return false;
};
SDLoc dl(Op);
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
MVT VT = Op.getSimpleValueType();
const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
if (IntrData) {
switch(IntrData->Type) {
case INTR_TYPE_1OP: {
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(2);
unsigned RC = 0;
if (isRoundModeSAEToX(Rnd, RC))
return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
Op.getOperand(1),
DAG.getTargetConstant(RC, dl, MVT::i32));
if (!isRoundModeCurDirection(Rnd))
return SDValue();
}
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
}
case INTR_TYPE_1OP_SAE: {
SDValue Sae = Op.getOperand(2);
unsigned Opc;
if (isRoundModeCurDirection(Sae))
Opc = IntrData->Opc0;
else if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
else
return SDValue();
return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
}
case INTR_TYPE_2OP: {
SDValue Src2 = Op.getOperand(2);
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(3);
unsigned RC = 0;
if (isRoundModeSAEToX(Rnd, RC))
return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
Op.getOperand(1), Src2,
DAG.getTargetConstant(RC, dl, MVT::i32));
if (!isRoundModeCurDirection(Rnd))
return SDValue();
}
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(1), Src2);
}
case INTR_TYPE_2OP_SAE: {
SDValue Sae = Op.getOperand(3);
unsigned Opc;
if (isRoundModeCurDirection(Sae))
Opc = IntrData->Opc0;
else if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
else
return SDValue();
return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2));
}
case INTR_TYPE_3OP:
case INTR_TYPE_3OP_IMM8: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
if (IntrData->Type == INTR_TYPE_3OP_IMM8)
Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(4);
unsigned RC = 0;
if (isRoundModeSAEToX(Rnd, RC))
return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
Src1, Src2, Src3,
DAG.getTargetConstant(RC, dl, MVT::i32));
if (!isRoundModeCurDirection(Rnd))
return SDValue();
}
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Src1, Src2, Src3);
}
case INTR_TYPE_4OP:
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
case INTR_TYPE_1OP_MASK: {
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
// We add rounding mode to the Node when
// - RC Opcode is specified and
// - RC is not "current direction".
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(4);
unsigned RC = 0;
if (isRoundModeSAEToX(Rnd, RC))
return getVectorMaskingNode(
DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
Mask, PassThru, Subtarget, DAG);
if (!isRoundModeCurDirection(Rnd))
return SDValue();
}
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
Mask, PassThru, Subtarget, DAG);
}
case INTR_TYPE_1OP_MASK_SAE: {
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
SDValue Rnd = Op.getOperand(4);
unsigned Opc;
if (isRoundModeCurDirection(Rnd))
Opc = IntrData->Opc0;
else if (isRoundModeSAE(Rnd))
Opc = IntrData->Opc1;
else
return SDValue();
return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src),
Mask, PassThru, Subtarget, DAG);
}
case INTR_TYPE_SCALAR_MASK: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue passThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
// There are 2 kinds of intrinsics in this group:
// (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
// (2) With rounding mode and sae - 7 operands.
bool HasRounding = IntrWithRoundingModeOpcode != 0;
if (Op.getNumOperands() == (5U + HasRounding)) {
if (HasRounding) {
SDValue Rnd = Op.getOperand(5);
unsigned RC = 0;
if (isRoundModeSAEToX(Rnd, RC))
return getScalarMaskingNode(
DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
DAG.getTargetConstant(RC, dl, MVT::i32)),
Mask, passThru, Subtarget, DAG);
if (!isRoundModeCurDirection(Rnd))
return SDValue();
}
return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
Src2),
Mask, passThru, Subtarget, DAG);
}
assert(Op.getNumOperands() == (6U + HasRounding) &&
"Unexpected intrinsic form");
SDValue RoundingMode = Op.getOperand(5);
unsigned Opc = IntrData->Opc0;
if (HasRounding) {
SDValue Sae = Op.getOperand(6);
if (isRoundModeSAE(Sae))
Opc = IntrWithRoundingModeOpcode;
else if (!isRoundModeCurDirection(Sae))
return SDValue();
}
return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
Src2, RoundingMode),
Mask, passThru, Subtarget, DAG);
}
case INTR_TYPE_SCALAR_MASK_RND: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue passThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
SDValue Rnd = Op.getOperand(5);
SDValue NewOp;
unsigned RC = 0;
if (isRoundModeCurDirection(Rnd))
NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
else if (isRoundModeSAEToX(Rnd, RC))
NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
DAG.getTargetConstant(RC, dl, MVT::i32));
else
return SDValue();
return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
}
case INTR_TYPE_SCALAR_MASK_SAE: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue passThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
SDValue Sae = Op.getOperand(5);
unsigned Opc;
if (isRoundModeCurDirection(Sae))
Opc = IntrData->Opc0;
else if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
else
return SDValue();
return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
Mask, passThru, Subtarget, DAG);
}
case INTR_TYPE_2OP_MASK: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue PassThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
SDValue NewOp;
if (IntrData->Opc1 != 0) {
SDValue Rnd = Op.getOperand(5);
unsigned RC = 0;
if (isRoundModeSAEToX(Rnd, RC))
NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
DAG.getTargetConstant(RC, dl, MVT::i32));
else if (!isRoundModeCurDirection(Rnd))
return SDValue();
}
if (!NewOp)
NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
}
case INTR_TYPE_2OP_MASK_SAE: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue PassThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
unsigned Opc = IntrData->Opc0;
if (IntrData->Opc1 != 0) {
SDValue Sae = Op.getOperand(5);
if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
else if (!isRoundModeCurDirection(Sae))
return SDValue();
}
return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
Mask, PassThru, Subtarget, DAG);
}
case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue PassThru = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
SDValue Sae = Op.getOperand(6);
unsigned Opc;
if (isRoundModeCurDirection(Sae))
Opc = IntrData->Opc0;
else if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
else
return SDValue();
return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
case INTR_TYPE_3OP_MASK_SAE: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue PassThru = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
unsigned Opc = IntrData->Opc0;
if (IntrData->Opc1 != 0) {
SDValue Sae = Op.getOperand(6);
if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
else if (!isRoundModeCurDirection(Sae))
return SDValue();
}
return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
case BLENDV: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
Src3 = DAG.getBitcast(MaskVT, Src3);
// Reverse the operands to match VSELECT order.
return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
}
case VPERM_2OP : {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
// Swap Src1 and Src2 in the node creation
return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
}
case IFMA_OP:
// NOTE: We need to swizzle the operands to pass the multiply operands
// first.
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
case FPCLASSS: {
SDValue Src1 = Op.getOperand(1);
SDValue Imm = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
Subtarget, DAG);
// Need to fill with zeros to ensure the bitcast will produce zeroes
// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
DAG.getConstant(0, dl, MVT::v8i1),
FPclassMask, DAG.getIntPtrConstant(0, dl));
return DAG.getBitcast(MVT::i8, Ins);
}
case CMP_MASK_CC: {
MVT MaskVT = Op.getSimpleValueType();
SDValue CC = Op.getOperand(3);
CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
if (IntrData->Opc1 != 0) {
SDValue Sae = Op.getOperand(4);
if (isRoundModeSAE(Sae))
return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
Op.getOperand(2), CC, Sae);
if (!isRoundModeCurDirection(Sae))
return SDValue();
}
//default rounding mode
return DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
Op.getOperand(2), CC);
}
case CMP_MASK_SCALAR_CC: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
SDValue Mask = Op.getOperand(4);
SDValue Cmp;
if (IntrData->Opc1 != 0) {
SDValue Sae = Op.getOperand(5);
if (isRoundModeSAE(Sae))
Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
else if (!isRoundModeCurDirection(Sae))
return SDValue();
}
//default rounding mode
if (!Cmp.getNode())
Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
Subtarget, DAG);
// Need to fill with zeros to ensure the bitcast will produce zeroes
// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
DAG.getConstant(0, dl, MVT::v8i1),
CmpMask, DAG.getIntPtrConstant(0, dl));
return DAG.getBitcast(MVT::i8, Ins);
}
case COMI: { // Comparison intrinsics
ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
SDValue SetCC;
switch (CC) {
case ISD::SETEQ: { // (ZF = 0 and PF = 0)
SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
break;
}
case ISD::SETNE: { // (ZF = 1 or PF = 1)
SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
break;
}
case ISD::SETGT: // (CF = 0 and ZF = 0)
SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
break;
case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
break;
}
case ISD::SETGE: // CF = 0
SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
break;
case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
break;
default:
llvm_unreachable("Unexpected illegal condition!");
}
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
case COMI_RM: { // Comparison intrinsics with Sae
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
SDValue Sae = Op.getOperand(4);
SDValue FCmp;
if (isRoundModeCurDirection(Sae))
FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
DAG.getConstant(CondVal, dl, MVT::i8));
else if (isRoundModeSAE(Sae))
FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
DAG.getConstant(CondVal, dl, MVT::i8), Sae);
else
return SDValue();
// Need to fill with zeros to ensure the bitcast will produce zeroes
// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
DAG.getConstant(0, dl, MVT::v16i1),
FCmp, DAG.getIntPtrConstant(0, dl));
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
DAG.getBitcast(MVT::i16, Ins));
}
case VSHIFT:
return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
Op.getOperand(1), Op.getOperand(2), Subtarget,
DAG);
case COMPRESS_EXPAND_IN_REG: {
SDValue Mask = Op.getOperand(3);
SDValue DataToCompress = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
return Op.getOperand(1);
// Avoid false dependency.
if (PassThru.isUndef())
PassThru = DAG.getConstant(0, dl, VT);
return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
Mask);
}
case FIXUPIMM:
case FIXUPIMM_MASKZ: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue Imm = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
SDValue Passthru = (IntrData->Type == FIXUPIMM)
? Src1
: getZeroVector(VT, Subtarget, DAG, dl);
unsigned Opc = IntrData->Opc0;
if (IntrData->Opc1 != 0) {
SDValue Sae = Op.getOperand(6);
if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
else if (!isRoundModeCurDirection(Sae))
return SDValue();
}
SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
}
case ROUNDP: {
assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
// Clear the upper bits of the rounding immediate so that the legacy
// intrinsic can't trigger the scaling behavior of VRNDSCALE.
SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
Op.getOperand(2),
DAG.getConstant(0xf, dl, MVT::i32));
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(1), RoundingMode);
}
case ROUNDS: {
assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
// Clear the upper bits of the rounding immediate so that the legacy
// intrinsic can't trigger the scaling behavior of VRNDSCALE.
SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
Op.getOperand(3),
DAG.getConstant(0xf, dl, MVT::i32));
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2), RoundingMode);
}
// ADC/ADCX/SBB
case ADX: {
SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
SDValue Res;
// If the carry in is zero, then we should just use ADD/SUB instead of
// ADC/SBB.
if (isNullConstant(Op.getOperand(1))) {
Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
Op.getOperand(3));
} else {
SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
DAG.getConstant(-1, dl, MVT::i8));
Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
Op.getOperand(3), GenCF.getValue(1));
}
SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
SDValue Results[] = { SetCC, Res };
return DAG.getMergeValues(Results, dl);
}
case CVTPD2PS_MASK:
case CVTPD2DQ_MASK:
case CVTQQ2PS_MASK:
case TRUNCATE_TO_REG: {
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
if (isAllOnesConstant(Mask))
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
MVT SrcVT = Src.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
Mask);
}
case CVTPS2PH_MASK: {
SDValue Src = Op.getOperand(1);
SDValue Rnd = Op.getOperand(2);
SDValue PassThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
if (isAllOnesConstant(Mask))
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd);
MVT SrcVT = Src.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd,
PassThru, Mask);
}
case CVTNEPS2BF16_MASK: {
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
if (ISD::isBuildVectorAllOnes(Mask.getNode()))
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
// Break false dependency.
if (PassThru.isUndef())
PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
Mask);
}
default:
break;
}
}
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
// ptest and testp intrinsics. The intrinsic these come from are designed to
// return an integer value, not just an instruction so lower it to the ptest
// or testp pattern and a setcc for the result.
case Intrinsic::x86_avx512_ktestc_b:
case Intrinsic::x86_avx512_ktestc_w:
case Intrinsic::x86_avx512_ktestc_d:
case Intrinsic::x86_avx512_ktestc_q:
case Intrinsic::x86_avx512_ktestz_b:
case Intrinsic::x86_avx512_ktestz_w:
case Intrinsic::x86_avx512_ktestz_d:
case Intrinsic::x86_avx512_ktestz_q:
case Intrinsic::x86_sse41_ptestz:
case Intrinsic::x86_sse41_ptestc:
case Intrinsic::x86_sse41_ptestnzc:
case Intrinsic::x86_avx_ptestz_256:
case Intrinsic::x86_avx_ptestc_256:
case Intrinsic::x86_avx_ptestnzc_256:
case Intrinsic::x86_avx_vtestz_ps:
case Intrinsic::x86_avx_vtestc_ps:
case Intrinsic::x86_avx_vtestnzc_ps:
case Intrinsic::x86_avx_vtestz_pd:
case Intrinsic::x86_avx_vtestc_pd:
case Intrinsic::x86_avx_vtestnzc_pd:
case Intrinsic::x86_avx_vtestz_ps_256:
case Intrinsic::x86_avx_vtestc_ps_256:
case Intrinsic::x86_avx_vtestnzc_ps_256:
case Intrinsic::x86_avx_vtestz_pd_256:
case Intrinsic::x86_avx_vtestc_pd_256:
case Intrinsic::x86_avx_vtestnzc_pd_256: {
unsigned TestOpc = X86ISD::PTEST;
X86::CondCode X86CC;
switch (IntNo) {
default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
case Intrinsic::x86_avx512_ktestc_b:
case Intrinsic::x86_avx512_ktestc_w:
case Intrinsic::x86_avx512_ktestc_d:
case Intrinsic::x86_avx512_ktestc_q:
// CF = 1
TestOpc = X86ISD::KTEST;
X86CC = X86::COND_B;
break;
case Intrinsic::x86_avx512_ktestz_b:
case Intrinsic::x86_avx512_ktestz_w:
case Intrinsic::x86_avx512_ktestz_d:
case Intrinsic::x86_avx512_ktestz_q:
TestOpc = X86ISD::KTEST;
X86CC = X86::COND_E;
break;
case Intrinsic::x86_avx_vtestz_ps:
case Intrinsic::x86_avx_vtestz_pd:
case Intrinsic::x86_avx_vtestz_ps_256:
case Intrinsic::x86_avx_vtestz_pd_256:
TestOpc = X86ISD::TESTP;
LLVM_FALLTHROUGH;
case Intrinsic::x86_sse41_ptestz:
case Intrinsic::x86_avx_ptestz_256:
// ZF = 1
X86CC = X86::COND_E;
break;
case Intrinsic::x86_avx_vtestc_ps:
case Intrinsic::x86_avx_vtestc_pd:
case Intrinsic::x86_avx_vtestc_ps_256:
case Intrinsic::x86_avx_vtestc_pd_256:
TestOpc = X86ISD::TESTP;
LLVM_FALLTHROUGH;
case Intrinsic::x86_sse41_ptestc:
case Intrinsic::x86_avx_ptestc_256:
// CF = 1
X86CC = X86::COND_B;
break;
case Intrinsic::x86_avx_vtestnzc_ps:
case Intrinsic::x86_avx_vtestnzc_pd:
case Intrinsic::x86_avx_vtestnzc_ps_256:
case Intrinsic::x86_avx_vtestnzc_pd_256:
TestOpc = X86ISD::TESTP;
LLVM_FALLTHROUGH;
case Intrinsic::x86_sse41_ptestnzc:
case Intrinsic::x86_avx_ptestnzc_256:
// ZF and CF = 0
X86CC = X86::COND_A;
break;
}
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
case Intrinsic::x86_sse42_pcmpistria128:
case Intrinsic::x86_sse42_pcmpestria128:
case Intrinsic::x86_sse42_pcmpistric128:
case Intrinsic::x86_sse42_pcmpestric128:
case Intrinsic::x86_sse42_pcmpistrio128:
case Intrinsic::x86_sse42_pcmpestrio128:
case Intrinsic::x86_sse42_pcmpistris128:
case Intrinsic::x86_sse42_pcmpestris128:
case Intrinsic::x86_sse42_pcmpistriz128:
case Intrinsic::x86_sse42_pcmpestriz128: {
unsigned Opcode;
X86::CondCode X86CC;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::x86_sse42_pcmpistria128:
Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_A;
break;
case Intrinsic::x86_sse42_pcmpestria128:
Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_A;
break;
case Intrinsic::x86_sse42_pcmpistric128:
Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_B;
break;
case Intrinsic::x86_sse42_pcmpestric128:
Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_B;
break;
case Intrinsic::x86_sse42_pcmpistrio128:
Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_O;
break;
case Intrinsic::x86_sse42_pcmpestrio128:
Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_O;
break;
case Intrinsic::x86_sse42_pcmpistris128:
Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_S;
break;
case Intrinsic::x86_sse42_pcmpestris128:
Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_S;
break;
case Intrinsic::x86_sse42_pcmpistriz128:
Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_E;
break;
case Intrinsic::x86_sse42_pcmpestriz128:
Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_E;
break;
}
SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
case Intrinsic::x86_sse42_pcmpistri128:
case Intrinsic::x86_sse42_pcmpestri128: {
unsigned Opcode;
if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
Opcode = X86ISD::PCMPISTR;
else
Opcode = X86ISD::PCMPESTR;
SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
return DAG.getNode(Opcode, dl, VTs, NewOps);
}
case Intrinsic::x86_sse42_pcmpistrm128:
case Intrinsic::x86_sse42_pcmpestrm128: {
unsigned Opcode;
if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
Opcode = X86ISD::PCMPISTR;
else
Opcode = X86ISD::PCMPESTR;
SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
}
case Intrinsic::eh_sjlj_lsda: {
MachineFunction &MF = DAG.getMachineFunction();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
auto &Context = MF.getMMI().getContext();
MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
Twine(MF.getFunctionNumber()));
return DAG.getNode(getGlobalWrapperKind(), dl, VT,
DAG.getMCSymbol(S, PtrVT));
}
case Intrinsic::x86_seh_lsda: {
// Compute the symbol for the LSDA. We know it'll get emitted later.
MachineFunction &MF = DAG.getMachineFunction();
SDValue Op1 = Op.getOperand(1);
auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
GlobalValue::dropLLVMManglingEscape(Fn->getName()));
// Generate a simple absolute symbol reference. This intrinsic is only
// supported on 32-bit Windows, which isn't PIC.
SDValue Result = DAG.getMCSymbol(LSDASym, VT);
return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
}
case Intrinsic::eh_recoverfp: {
SDValue FnOp = Op.getOperand(1);
SDValue IncomingFPOp = Op.getOperand(2);
GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
if (!Fn)
report_fatal_error(
"llvm.eh.recoverfp must take a function as the first argument");
return recoverFramePointer(DAG, Fn, IncomingFPOp);
}
case Intrinsic::localaddress: {
// Returns one of the stack, base, or frame pointer registers, depending on
// which is used to reference local variables.
MachineFunction &MF = DAG.getMachineFunction();
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
unsigned Reg;
if (RegInfo->hasBasePointer(MF))
Reg = RegInfo->getBaseRegister();
else { // Handles the SP or FP case.
bool CantUseFP = RegInfo->needsStackRealignment(MF);
if (CantUseFP)
Reg = RegInfo->getPtrSizedStackRegister(MF);
else
Reg = RegInfo->getPtrSizedFrameRegister(MF);
}
return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
}
case Intrinsic::x86_avx512_vp2intersect_q_512:
case Intrinsic::x86_avx512_vp2intersect_q_256:
case Intrinsic::x86_avx512_vp2intersect_q_128:
case Intrinsic::x86_avx512_vp2intersect_d_512:
case Intrinsic::x86_avx512_vp2intersect_d_256:
case Intrinsic::x86_avx512_vp2intersect_d_128: {
MVT MaskVT = Op.getSimpleValueType();
SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
SDLoc DL(Op);
SDValue Operation =
DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
Op->getOperand(1), Op->getOperand(2));
SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
MaskVT, Operation);
SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
MaskVT, Operation);
return DAG.getMergeValues({Result0, Result1}, DL);
}
}
}
/// Build an X86MaskedGatherSDNode from the operands of a gather intrinsic.
///
/// The node's mask result type is Mask's value type with its elements changed
/// to integers. Returns an empty SDValue when ScaleOp is not a
/// ConstantSDNode; otherwise returns the merge of the node's result 0 and
/// result 2 (the MVT::Other entry of its VT list). Opc is not referenced by
/// this body.
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                                 SDValue Src, SDValue Mask, SDValue Base,
                                 SDValue Index, SDValue ScaleOp, SDValue Chain,
                                 const X86Subtarget &Subtarget) {
  SDLoc Loc(Op);

  // Only a constant scale can be turned into the i8 target constant below.
  auto *ScaleC = dyn_cast<ConstantSDNode>(ScaleOp);
  if (ScaleC == nullptr)
    return SDValue();
  SDValue ScaleImm =
      DAG.getTargetConstant(ScaleC->getZExtValue(), Loc, MVT::i8);

  EVT IntMaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
  SDVTList ResultVTs = DAG.getVTList(Op.getValueType(), IntMaskVT, MVT::Other);

  // An undef source, or a source hidden behind an all-ones mask, is replaced
  // by a zero vector so that no register dependency is carried in.
  // TODO: use undef instead and let BreakFalseDeps deal with it?
  const bool SrcIsDead =
      Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode());
  if (SrcIsDead)
    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, Loc);

  MemIntrinsicSDNode *MemNode = cast<MemIntrinsicSDNode>(Op);
  SDValue GatherOps[] = {Chain, Src, Mask, Base, Index, ScaleImm};
  SDValue Gather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
      ResultVTs, GatherOps, Loc, MemNode->getMemoryVT(),
      MemNode->getMemOperand());

  SDValue Merged[] = {Gather, Gather.getValue(2)};
  return DAG.getMergeValues(Merged, Loc);
}
/// Build an X86MaskedGatherSDNode whose mask is a vXi1 vector.
///
/// X is the smaller of the element counts of Index's type and of Op's result
/// type. A Mask that does not already have that type is converted with
/// getMaskNode. Returns an empty SDValue when ScaleOp is not a
/// ConstantSDNode; otherwise returns the merge of the node's result 0 and
/// result 2 (the MVT::Other entry of its VT list).
static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
                             SDValue Src, SDValue Mask, SDValue Base,
                             SDValue Index, SDValue ScaleOp, SDValue Chain,
                             const X86Subtarget &Subtarget) {
  const MVT ResVT = Op.getSimpleValueType();
  SDLoc Loc(Op);

  // Only a constant scale can be turned into the i8 target constant below.
  auto *ScaleC = dyn_cast<ConstantSDNode>(ScaleOp);
  if (ScaleC == nullptr)
    return SDValue();
  SDValue ScaleImm =
      DAG.getTargetConstant(ScaleC->getZExtValue(), Loc, MVT::i8);

  const unsigned IndexElts = Index.getSimpleValueType().getVectorNumElements();
  const unsigned DataElts = ResVT.getVectorNumElements();
  const MVT MaskVT = MVT::getVectorVT(MVT::i1, std::min(IndexElts, DataElts));

  // The gather intrinsics come in two flavours, one with a scalar mask and
  // one with a vXi1 mask; bring the scalar flavour to vXi1.
  if (Mask.getValueType() != MaskVT)
    Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, Loc);

  SDVTList ResultVTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);

  // An undef source, or a source hidden behind an all-ones mask, is replaced
  // by a zero vector so that no register dependency is carried in.
  // TODO: use undef instead and let BreakFalseDeps deal with it?
  const bool SrcIsDead =
      Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode());
  if (SrcIsDead)
    Src = getZeroVector(ResVT, Subtarget, DAG, Loc);

  MemIntrinsicSDNode *MemNode = cast<MemIntrinsicSDNode>(Op);
  SDValue GatherOps[] = {Chain, Src, Mask, Base, Index, ScaleImm};
  SDValue Gather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
      ResultVTs, GatherOps, Loc, MemNode->getMemoryVT(),
      MemNode->getMemOperand());

  SDValue Merged[] = {Gather, Gather.getValue(2)};
  return DAG.getMergeValues(Merged, Loc);
}
/// Build an X86MaskedScatterSDNode whose mask is a vXi1 vector.
///
/// X is the smaller of the element counts of Index's type and of Src's type.
/// A Mask that does not already have that type is converted with getMaskNode.
/// Returns an empty SDValue when ScaleOp is not a ConstantSDNode; otherwise
/// returns result 1 of the new node (the MVT::Other entry of its VT list).
/// Opc is not referenced by this body.
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                              SDValue Src, SDValue Mask, SDValue Base,
                              SDValue Index, SDValue ScaleOp, SDValue Chain,
                              const X86Subtarget &Subtarget) {
  SDLoc Loc(Op);

  // Only a constant scale can be turned into the i8 target constant below.
  auto *ScaleC = dyn_cast<ConstantSDNode>(ScaleOp);
  if (ScaleC == nullptr)
    return SDValue();
  SDValue ScaleImm =
      DAG.getTargetConstant(ScaleC->getZExtValue(), Loc, MVT::i8);

  const unsigned IndexElts = Index.getSimpleValueType().getVectorNumElements();
  const unsigned DataElts = Src.getSimpleValueType().getVectorNumElements();
  const MVT MaskVT = MVT::getVectorVT(MVT::i1, std::min(IndexElts, DataElts));

  // The scatter intrinsics come in two flavours, one with a scalar mask and
  // one with a vXi1 mask; bring the scalar flavour to vXi1.
  if (Mask.getValueType() != MaskVT)
    Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, Loc);

  MemIntrinsicSDNode *MemNode = cast<MemIntrinsicSDNode>(Op);
  SDVTList ResultVTs = DAG.getVTList(MaskVT, MVT::Other);
  SDValue ScatterOps[] = {Chain, Src, Mask, Base, Index, ScaleImm};
  SDValue Scatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
      ResultVTs, ScatterOps, Loc, MemNode->getMemoryVT(),
      MemNode->getMemOperand());
  return Scatter.getValue(1);
}
/// Emit the machine node Opc for a masked prefetch intrinsic.
///
/// The machine node's operands are, in order: the vXi1 mask (X being the
/// element count of Index's type), Base, the i8 scale, Index, a zero i32
/// displacement, register 0 as the segment, and Chain. Returns an empty
/// SDValue when ScaleOp is not a ConstantSDNode; otherwise returns result 0
/// of the machine node, whose only result type is MVT::Other.
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                               SDValue Mask, SDValue Base, SDValue Index,
                               SDValue ScaleOp, SDValue Chain,
                               const X86Subtarget &Subtarget) {
  SDLoc Loc(Op);

  // Only a constant scale can be turned into the i8 target constant below.
  auto *ScaleC = dyn_cast<ConstantSDNode>(ScaleOp);
  if (ScaleC == nullptr)
    return SDValue();
  SDValue ScaleImm =
      DAG.getTargetConstant(ScaleC->getZExtValue(), Loc, MVT::i8);

  SDValue ZeroDisp = DAG.getTargetConstant(0, Loc, MVT::i32);
  SDValue NoSegment = DAG.getRegister(0, MVT::i32);

  const unsigned IndexElts = Index.getSimpleValueType().getVectorNumElements();
  const MVT MaskVT = MVT::getVectorVT(MVT::i1, IndexElts);
  SDValue VecMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, Loc);

  SDValue PrefetchOps[] = {VecMask, Base,      ScaleImm, Index,
                           ZeroDisp, NoSegment, Chain};
  SDNode *Prefetch = DAG.getMachineNode(Opc, Loc, MVT::Other, PrefetchOps);
  return SDValue(Prefetch, 0);
}
/// Lower a chained builtin intrinsic whose machine instruction hands its
/// result back in registers EDX:EAX.
///
/// When SrcReg is a valid register identifier (non-zero), operand 2 of N is
/// first copied into SrcReg, on the assumption that SrcReg is an implicit
/// input of TargetOpcode.
///
/// Two values are appended to Results: the combined 64-bit result, then the
/// output chain. The returned Glue value lets a caller attach further
/// copy-from-reg nodes when the expanded instruction implicitly defines
/// registers beyond EDX:EAX.
static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
                                           SelectionDAG &DAG,
                                           unsigned TargetOpcode,
                                           unsigned SrcReg,
                                           const X86Subtarget &Subtarget,
                                           SmallVectorImpl<SDValue> &Results) {
  const bool Is64Bit = Subtarget.is64Bit();

  SDValue Chain = N->getOperand(0);
  SDValue Glue;

  // Feed the implicit input register, if the instruction has one.
  if (SrcReg) {
    assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
    Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
    Glue = Chain.getValue(1);
  }

  // The machine node takes the chain, plus the glue only when a copy-to-reg
  // was emitted above.
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue NodeOps[] = {Chain, Glue};
  const unsigned NumNodeOps = Glue.getNode() ? 2 : 1;
  SDNode *Instr = DAG.getMachineNode(TargetOpcode, DL, NodeTys,
                                     ArrayRef<SDValue>(NodeOps, NumNodeOps));
  Chain = SDValue(Instr, 0);

  // Copy both halves of the result out of their registers (the 64-bit
  // aliases RAX/RDX on 64-bit subtargets), threading chain and glue from the
  // instruction through the low copy into the high copy.
  const unsigned LoReg = Is64Bit ? X86::RAX : X86::EAX;
  const unsigned HiReg = Is64Bit ? X86::RDX : X86::EDX;
  const MVT HalfVT = Is64Bit ? MVT::i64 : MVT::i32;
  SDValue LO = DAG.getCopyFromReg(Chain, DL, LoReg, HalfVT, SDValue(Instr, 1));
  SDValue HI =
      DAG.getCopyFromReg(LO.getValue(1), DL, HiReg, HalfVT, LO.getValue(2));
  Chain = HI.getValue(1);
  Glue = HI.getValue(2);

  SDValue Combined;
  if (Is64Bit) {
    // Merge the two 32-bit values into a 64-bit one: (HI << 32) | LO.
    SDValue ShiftedHi = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
                                    DAG.getConstant(32, DL, MVT::i8));
    Combined = DAG.getNode(ISD::OR, DL, MVT::i64, LO, ShiftedHi);
  } else {
    // Use a buildpair to merge the two 32-bit values into a 64-bit one.
    SDValue Halves[] = {LO, HI};
    Combined = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Halves);
  }

  Results.push_back(Combined);
  Results.push_back(Chain);
  return Glue;
}
/// Lower the builtin intrinsics that read the time stamp counter (x86_rdtsc
/// and x86_rdtscp). Also used to custom lower READCYCLECOUNTER nodes.
///
/// On return Results holds the 64-bit counter value followed by the chain.
/// For X86::RDTSCP the i32 value copied from ECX is placed between the two,
/// giving { counter, ecx, chain }.
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
                                    SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget,
                                    SmallVectorImpl<SDValue> &Results) {
  // The processor's time-stamp counter (a 64-bit MSR) is delivered in
  // EDX:EAX: EDX receives the high-order 32 bits of the MSR and EAX the
  // low-order 32 bits. The helper assembles them and pushes value + chain.
  SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
                                             /* NoRegister */0, Subtarget,
                                             Results);

  if (Opcode == X86::RDTSCP) {
    // RDTSCP additionally loads IA32_TSC_AUX (MSR address C000_0103H) into
    // ECX. Read it after the previous chain, glued to the EDX:EAX copies,
    // then replace the old chain slot with the ECX value and append the new
    // chain.
    SDValue PrevChain = Results[1];
    SDValue AuxValue =
        DAG.getCopyFromReg(PrevChain, DL, X86::ECX, MVT::i32, Glue);
    Results[1] = AuxValue;
    Results.push_back(AuxValue.getValue(1));
  }
}
/// Custom lower a READCYCLECOUNTER node by expanding it as X86::RDTSC and
/// merging the values that expansion produces.
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  SDLoc Loc(Op);
  SmallVector<SDValue, 3> Pieces;
  getReadTimeStampCounter(Op.getNode(), Loc, X86::RDTSC, DAG, Subtarget,
                          Pieces);
  return DAG.getMergeValues(Pieces, Loc);
}
/// Record the frame index named by operand 2 of Op as the function's EH
/// registration node in WinEHFuncInfo.
///
/// Reports a fatal error when the function has no WinEHFuncInfo or when the
/// operand is not a FrameIndexSDNode. No DAG nodes are created; the incoming
/// chain (operand 0) is returned as-is.
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
  SDValue InChain = Op.getOperand(0);
  SDValue RegNodeOp = Op.getOperand(2);

  WinEHFuncInfo *WinEH = DAG.getMachineFunction().getWinEHFuncInfo();
  if (WinEH == nullptr)
    report_fatal_error("EH registrations only live in functions using WinEH");

  // The operand has to be a frame index so its index can be remembered.
  auto *FrameIdx = dyn_cast<FrameIndexSDNode>(RegNodeOp);
  if (FrameIdx == nullptr)
    report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");

  WinEH->EHRegNodeFrameIndex = FrameIdx->getIndex();

  // Nothing to emit: hand back the chain operand.
  return InChain;
}
/// Record the frame index named by operand 2 of Op as the function's EH
/// guard slot in WinEHFuncInfo.
///
/// Reports a fatal error when the function has no WinEHFuncInfo or when the
/// operand is not a FrameIndexSDNode. No DAG nodes are created; the incoming
/// chain (operand 0) is returned as-is.
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
  SDValue InChain = Op.getOperand(0);
  SDValue GuardOp = Op.getOperand(2);

  WinEHFuncInfo *WinEH = DAG.getMachineFunction().getWinEHFuncInfo();
  if (WinEH == nullptr)
    report_fatal_error("EHGuard only live in functions using WinEH");

  // The operand has to be a frame index so its index can be remembered.
  auto *FrameIdx = dyn_cast<FrameIndexSDNode>(GuardOp);
  if (FrameIdx == nullptr)
    report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");

  WinEH->EHGuardFrameIndex = FrameIdx->getIndex();

  // Nothing to emit: hand back the chain operand.
  return InChain;
}
/// Emit a truncating store with saturation: a TruncSStoreSDNode when
/// SignedSat is true, a TruncUSStoreSDNode otherwise.
///
/// The node's operands are { Chain, Val, Ptr, undef }, where the undef has
/// Ptr's value type, and its only result type is MVT::Other.
static SDValue
EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
                SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
                SelectionDAG &DAG) {
  SDVTList ChainOnly = DAG.getVTList(MVT::Other);
  SDValue UndefOp = DAG.getUNDEF(Ptr.getValueType());
  SDValue StoreOps[] = {Chain, Val, Ptr, UndefOp};

  if (SignedSat)
    return DAG.getTargetMemSDNode<TruncSStoreSDNode>(ChainOnly, StoreOps, Dl,
                                                     MemVT, MMO);
  return DAG.getTargetMemSDNode<TruncUSStoreSDNode>(ChainOnly, StoreOps, Dl,
                                                    MemVT, MMO);
}
/// Emit a masked truncating store with saturation: a MaskedTruncSStoreSDNode
/// when SignedSat is true, a MaskedTruncUSStoreSDNode otherwise.
///
/// The node's operands are { Chain, Val, Ptr, Mask } and its only result type
/// is MVT::Other.
static SDValue
EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
                      SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
                      MachineMemOperand *MMO, SelectionDAG &DAG) {
  SDVTList ChainOnly = DAG.getVTList(MVT::Other);
  SDValue StoreOps[] = {Chain, Val, Ptr, Mask};

  if (SignedSat)
    return DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(
        ChainOnly, StoreOps, Dl, MemVT, MMO);
  return DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(
      ChainOnly, StoreOps, Dl, MemVT, MMO);
}
/// Custom-lower an intrinsic node that carries a chain.
///
/// Operand 0 of \p Op is the incoming chain and operand 1 is the constant
/// intrinsic ID; the remaining operands are the intrinsic's own arguments.
/// The ID is first looked up with getIntrinsicWithChain():
///  - when there is no table entry, a small set of intrinsics is handled by
///    the first switch below, and every other ID yields an empty SDValue;
///  - otherwise the lowering is selected by the entry's Type field, using
///    Opc0/Opc1 from the entry as the target opcodes.
///
/// \returns the replacement value (usually a MERGE_VALUES or a chain), or an
/// empty SDValue when nothing is done here.
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
if (!IntrData) {
// No table entry: handle the intrinsics that need bespoke lowering.
switch (IntNo) {
case llvm::Intrinsic::x86_seh_ehregnode:
return MarkEHRegistrationNode(Op, DAG);
case llvm::Intrinsic::x86_seh_ehguard:
return MarkEHGuard(Op, DAG);
case llvm::Intrinsic::x86_rdpkru: {
SDLoc dl(Op);
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
// Create a RDPKRU node and pass 0 to the ECX parameter.
return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
DAG.getConstant(0, dl, MVT::i32));
}
case llvm::Intrinsic::x86_wrpkru: {
SDLoc dl(Op);
// Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
// to the EDX and ECX parameters.
return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
Op.getOperand(0), Op.getOperand(2),
DAG.getConstant(0, dl, MVT::i32),
DAG.getConstant(0, dl, MVT::i32));
}
case llvm::Intrinsic::x86_flags_read_u32:
case llvm::Intrinsic::x86_flags_read_u64:
case llvm::Intrinsic::x86_flags_write_u32:
case llvm::Intrinsic::x86_flags_write_u64: {
// We need a frame pointer because this will get lowered to a PUSH/POP
// sequence.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
MFI.setHasCopyImplyingStackAdjustment(true);
// Don't do anything here, we will expand these intrinsics out later
// during FinalizeISel in EmitInstrWithCustomInserter.
return SDValue();
}
case Intrinsic::x86_lwpins32:
case Intrinsic::x86_lwpins64:
case Intrinsic::x86_umwait:
case Intrinsic::x86_tpause: {
// These four share one shape: a three-argument target node producing
// {i32, chain}, whose first result is turned into a COND_B setcc.
SDLoc dl(Op);
SDValue Chain = Op->getOperand(0);
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
unsigned Opcode;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic");
case Intrinsic::x86_umwait:
Opcode = X86ISD::UMWAIT;
break;
case Intrinsic::x86_tpause:
Opcode = X86ISD::TPAUSE;
break;
case Intrinsic::x86_lwpins32:
case Intrinsic::x86_lwpins64:
Opcode = X86ISD::LWPINS;
break;
}
SDValue Operation =
DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
Op->getOperand(3), Op->getOperand(4));
// Result 0 of the node is fed to getSETCC (presumably as the flags
// value); result 1 is the outgoing chain.
SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
Operation.getValue(1));
}
case Intrinsic::x86_enqcmd:
case Intrinsic::x86_enqcmds: {
// Same pattern as above, but with two arguments and a COND_E setcc.
SDLoc dl(Op);
SDValue Chain = Op.getOperand(0);
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
unsigned Opcode;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic!");
case Intrinsic::x86_enqcmd:
Opcode = X86ISD::ENQCMD;
break;
case Intrinsic::x86_enqcmds:
Opcode = X86ISD::ENQCMDS;
break;
}
SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
Op.getOperand(3));
SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
Operation.getValue(1));
}
}
// Any other intrinsic without a table entry is not lowered here.
return SDValue();
}
// Table-driven lowering: dispatch on the kind recorded for this intrinsic.
SDLoc dl(Op);
switch(IntrData->Type) {
default: llvm_unreachable("Unknown Intrinsic Type");
case RDSEED:
case RDRAND: {
// Emit the node with the right value type.
// The node produces { value, i32 (used as the CMOV flags input), chain }.
SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
// If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
// Otherwise return the value from Rand, which is always 0, casted to i32.
SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
DAG.getConstant(1, dl, Op->getValueType(1)),
DAG.getConstant(X86::COND_B, dl, MVT::i8),
SDValue(Result.getNode(), 1) };
SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
// Return { result, isValid, chain }.
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
SDValue(Result.getNode(), 2));
}
case GATHER_AVX2: {
// Operand order: chain, id, src, base, index, mask, scale.
SDValue Chain = Op.getOperand(0);
SDValue Src = Op.getOperand(2);
SDValue Base = Op.getOperand(3);
SDValue Index = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
Scale, Chain, Subtarget);
}
case GATHER: {
//gather(v1, mask, index, base, scale);
// Operand order actually read below: chain, id, src, base, index, mask,
// scale.
SDValue Chain = Op.getOperand(0);
SDValue Src = Op.getOperand(2);
SDValue Base = Op.getOperand(3);
SDValue Index = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
Chain, Subtarget);
}
case SCATTER: {
//scatter(base, mask, index, v1, scale);
SDValue Chain = Op.getOperand(0);
SDValue Base = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
SDValue Index = Op.getOperand(4);
SDValue Src = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
Scale, Chain, Subtarget);
}
case PREFETCH: {
// Operand order: chain, id, mask, index, base, scale, hint.
// The constant hint picks the opcode: 2 selects Opc1, 3 selects Opc0.
SDValue Hint = Op.getOperand(6);
unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
assert((HintVal == 2 || HintVal == 3) &&
"Wrong prefetch hint in intrinsic: should be 2 or 3");
unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
SDValue Chain = Op.getOperand(0);
SDValue Mask = Op.getOperand(2);
SDValue Index = Op.getOperand(3);
SDValue Base = Op.getOperand(4);
SDValue Scale = Op.getOperand(5);
return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
Subtarget);
}
// Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
case RDTSC: {
SmallVector<SDValue, 2> Results;
getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
Results);
return DAG.getMergeValues(Results, dl);
}
// Read Performance Monitoring Counters.
case RDPMC:
// GetExtended Control Register.
case XGETBV: {
SmallVector<SDValue, 2> Results;
// RDPMC uses ECX to select the index of the performance counter to read.
// XGETBV uses ECX to select the index of the XCR register to return.
// The result is stored into registers EDX:EAX.
expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
Subtarget, Results);
return DAG.getMergeValues(Results, dl);
}
// XTEST intrinsics.
case XTEST: {
// Build the target node, derive a COND_NE setcc from its first result and
// zero-extend that to the intrinsic's result type.
SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
Ret, SDValue(InTrans.getNode(), 1));
}
case TRUNCATE_TO_MEM_VI8:
case TRUNCATE_TO_MEM_VI16:
case TRUNCATE_TO_MEM_VI32: {
// Operand order: chain, id, address, data, mask.
SDValue Mask = Op.getOperand(4);
SDValue DataToTruncate = Op.getOperand(3);
SDValue Addr = Op.getOperand(2);
SDValue Chain = Op.getOperand(0);
// NOTE(review): dyn_cast followed by an assert; cast<> would state the
// same invariant directly.
MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
assert(MemIntr && "Expected MemIntrinsicSDNode!");
EVT MemVT = MemIntr->getMemoryVT();
uint16_t TruncationOp = IntrData->Opc0;
switch (TruncationOp) {
case X86ISD::VTRUNC: {
// Plain truncation: generic (masked) truncating stores are sufficient.
if (isAllOnesConstant(Mask)) // return just a truncate store
return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
MemIntr->getMemOperand());
MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
MemIntr->getMemOperand(), true /* truncating */);
}
case X86ISD::VTRUNCUS:
case X86ISD::VTRUNCS: {
// Saturating truncation: use the dedicated (masked) saturating-store
// helpers, signed for VTRUNCS and unsigned for VTRUNCUS.
bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
if (isAllOnesConstant(Mask))
return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
MemIntr->getMemOperand(), DAG);
MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
VMask, MemVT, MemIntr->getMemOperand(), DAG);
}
default:
llvm_unreachable("Unsupported truncstore intrinsic");
}
}
}
}
/// Lower ISD::RETURNADDR.
///
/// Marks the return address as taken, then loads either this function's own
/// return address (depth 0) or the one stored one slot above the frame address
/// computed by LowerFRAMEADDR (depth > 0). Returns an empty SDValue when
/// verifyReturnAddressArgumentIsConstant() reports a problem.
SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
                                           SelectionDAG &DAG) const {
  DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);

  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
    return SDValue();

  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  SDLoc Loc(Op);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());

  if (Depth == 0) {
    // Depth 0: read straight out of the return-address frame index.
    SDValue RetAddrSlot = getReturnAddressFrameIndex(DAG);
    return DAG.getLoad(PtrVT, Loc, DAG.getEntryNode(), RetAddrSlot,
                       MachinePointerInfo());
  }

  // Depth > 0: the return address sits one slot past the requested frame
  // address.
  SDValue OuterFrame = LowerFRAMEADDR(Op, DAG);
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  SDValue SlotBytes = DAG.getConstant(RegInfo->getSlotSize(), Loc, PtrVT);
  SDValue RetAddrPtr =
      DAG.getNode(ISD::ADD, Loc, PtrVT, OuterFrame, SlotBytes);
  return DAG.getLoad(PtrVT, Loc, DAG.getEntryNode(), RetAddrPtr,
                     MachinePointerInfo());
}
/// Lower ISD::ADDROFRETURNADDR: flag the return address as taken and hand back
/// the frame index that holds it.
SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
                                                 SelectionDAG &DAG) const {
  MachineFrameInfo &FrameInfo = DAG.getMachineFunction().getFrameInfo();
  FrameInfo.setReturnAddressIsTaken(true);

  SDValue RetAddrSlot = getReturnAddressFrameIndex(DAG);
  return RetAddrSlot;
}
/// Lower ISD::FRAMEADDR.
///
/// On targets that use Windows CFI the result is a (lazily created) fixed
/// frame object and the depth operand is not consulted. Elsewhere the frame
/// register is copied out and dereferenced once per requested depth level.
SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  EVT VT = Op.getValueType();

  FrameInfo.setFrameAddressIsTaken(true);

  if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
    // Walking up the stack requires interpreting the unwind codes at the same
    // time, so a non-zero depth is meaningless with Windows unwind info.
    int FAIndex = X86FI->getFAIndex();
    if (FAIndex == 0) {
      // First request in this function: create the frame-address slot and
      // remember it for later queries.
      FAIndex = FrameInfo.CreateFixedObject(RegInfo->getSlotSize(),
                                            /*SPOffset=*/0,
                                            /*IsImmutable=*/false);
      X86FI->setFAIndex(FAIndex);
    }
    return DAG.getFrameIndex(FAIndex, VT);
  }

  unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
  SDLoc Loc(Op); // FIXME probably not meaningful
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
          (FrameReg == X86::EBP && VT == MVT::i32)) &&
         "Invalid Frame Register!");

  // Start from the current frame pointer and follow the saved-frame-pointer
  // chain Depth times.
  SDValue Addr = DAG.getCopyFromReg(DAG.getEntryNode(), Loc, FrameReg, VT);
  for (unsigned Level = 0; Level != Depth; ++Level)
    Addr = DAG.getLoad(VT, Loc, DAG.getEntryNode(), Addr,
                       MachinePointerInfo());
  return Addr;
}
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
SelectionDAG &DAG) const {
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
const MachineFunction &MF = DAG.getMachineFunction();
unsigned Reg = StringSwitch<unsigned>(RegName)
.Case("esp", X86::ESP)
.Case("rsp", X86::RSP)
.Case("ebp", X86::EBP)
.Case("rbp", X86::RBP)
.Default(0);
if (Reg == X86::EBP || Reg == X86::RBP) {
if (!TFI.hasFP(MF))
report_fatal_error("register " + StringRef(RegName) +
" is allocatable: function has no frame pointer");
#ifndef NDEBUG
else {
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
unsigned FrameReg =
RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
"Invalid Frame Register!");
}
#endif
}
if (Reg)
return Reg;
report_fatal_error("Invalid register name global variable");
}
/// Lower ISD::FRAME_TO_ARGS_OFFSET to a pointer-sized constant equal to two
/// stack slots.
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
                                                     SelectionDAG &DAG) const {
  SDLoc Loc(Op);
  const unsigned SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
  return DAG.getIntPtrConstant(2 * SlotSize, Loc);
}
/// Register that carries the exception pointer for the given personality:
/// RDX/EDX for CoreCLR, RAX/EAX otherwise. The 64-bit name is used when the
/// subtarget is 64-bit LP64.
unsigned X86TargetLowering::getExceptionPointerRegister(
    const Constant *PersonalityFn) const {
  const bool IsLP64 = Subtarget.isTarget64BitLP64();
  const bool IsCoreCLR =
      classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR;

  if (IsCoreCLR)
    return IsLP64 ? X86::RDX : X86::EDX;
  return IsLP64 ? X86::RAX : X86::EAX;
}
/// Register that carries the exception selector: RDX on 64-bit LP64
/// subtargets, EDX otherwise.
unsigned X86TargetLowering::getExceptionSelectorRegister(
    const Constant *PersonalityFn) const {
  // Selectors are not used by funclet personalities; the runtime performs the
  // selection for them.
  assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));

  if (Subtarget.isTarget64BitLP64())
    return X86::RDX;
  return X86::EDX;
}
/// Fixed catch objects are needed exactly when targeting Win64.
bool X86TargetLowering::needsFixedCatchObjects() const {
  const bool TargetsWin64 = Subtarget.isTargetWin64();
  return TargetsWin64;
}
/// Lower ISD::EH_RETURN.
///
/// Stores the handler at (frame register + slot size + offset), copies that
/// address into RCX/ECX and emits an X86ISD::EH_RETURN node that takes the
/// register as its operand.
SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
  SDLoc Loc(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Offset = Op.getOperand(1);
  SDValue Handler = Op.getOperand(2);

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
  assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
          (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
         "Invalid Frame Register!");

  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), Loc, FrameReg, PtrVT);

  // The handler address travels in RCX for 64-bit pointers, ECX otherwise.
  unsigned StoreAddrReg = X86::ECX;
  if (PtrVT == MVT::i64)
    StoreAddrReg = X86::RCX;

  // Destination = frame + one slot + caller-supplied offset.
  SDValue SlotBytes = DAG.getIntPtrConstant(RegInfo->getSlotSize(), Loc);
  SDValue PastFrame = DAG.getNode(ISD::ADD, Loc, PtrVT, Frame, SlotBytes);
  SDValue StoreAddr = DAG.getNode(ISD::ADD, Loc, PtrVT, PastFrame, Offset);

  Chain = DAG.getStore(Chain, Loc, Handler, StoreAddr, MachinePointerInfo());
  Chain = DAG.getCopyToReg(Chain, Loc, StoreAddrReg, StoreAddr);

  SDValue AddrRegOperand = DAG.getRegister(StoreAddrReg, PtrVT);
  return DAG.getNode(X86ISD::EH_RETURN, Loc, MVT::Other, Chain,
                     AddrRegOperand);
}
/// Lower ISD::EH_SJLJ_SETJMP to X86ISD::EH_SJLJ_SETJMP producing
/// { i32, chain } from the node's first two operands.
SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc Loc(Op);

  // On non-64-bit subtargets the global base register may be needed once the
  // isel pseudo is expanded, which happens after the CGBR pass has run.
  // Requesting it now lets that pass insert the defining code; otherwise a
  // virtual register with no definition could end up being referenced.
  if (!Subtarget.is64Bit())
    (void)Subtarget.getInstrInfo()->getGlobalBaseReg(&DAG.getMachineFunction());

  SDVTList ResultVTs = DAG.getVTList(MVT::i32, MVT::Other);
  SDValue Operand0 = Op.getOperand(0);
  SDValue Operand1 = Op.getOperand(1);
  return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, Loc, ResultVTs, Operand0,
                     Operand1);
}
/// Lower ISD::EH_SJLJ_LONGJMP to the X86 node of the same name, forwarding the
/// first two operands unchanged.
SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc Loc(Op);
  SDValue Operand0 = Op.getOperand(0);
  SDValue Operand1 = Op.getOperand(1);
  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, Loc, MVT::Other, Operand0,
                     Operand1);
}
/// Lower ISD::EH_SJLJ_SETUP_DISPATCH to the X86 node of the same name,
/// forwarding operand 0 unchanged.
SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
                                                       SelectionDAG &DAG) const {
  SDLoc Loc(Op);
  SDValue Operand0 = Op.getOperand(0);
  return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, Loc, MVT::Other,
                     Operand0);
}
/// ADJUST_TRAMPOLINE needs no adjustment here: the result is simply the node's
/// first operand.
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
  SDValue Trampoline = Op.getOperand(0);
  return Trampoline;
}
/// Lower ISD::INIT_TRAMPOLINE by emitting the stores that write the
/// trampoline's machine code into memory.
///
/// Operands: 0 = chain, 1 = trampoline address, 2 = nested function pointer,
/// 3 = 'nest' parameter value, 4 = SrcValue for the trampoline memory and
/// (32-bit path only) 5 = SrcValue wrapping the nested Function.
///
/// Bytes written on 64-bit subtargets (23 bytes):
///    0..1   REX.WB prefix + MOV opcode byte targeting R11
///    2..9   nested function pointer
///   10..11  REX.WB prefix + MOV opcode byte targeting R10
///   12..19  'nest' value
///   20..21  REX.WB prefix + 0xFF (jmp through register)
///   22      ModRM byte selecting R11
///
/// Bytes written on 32-bit subtargets (10 bytes):
///    0      MOV r32, imm32 opcode byte for the nest register
///    1..4   'nest' value
///    5      0xE9 (jmp rel32)
///    6..9   displacement = FPtr - (Trmp + 10)
///
/// All stores hang off the incoming chain and are joined by a TokenFactor,
/// which is the returned value.
SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
SelectionDAG &DAG) const {
SDValue Root = Op.getOperand(0);
SDValue Trmp = Op.getOperand(1); // trampoline
SDValue FPtr = Op.getOperand(2); // nested function
SDValue Nest = Op.getOperand(3); // 'nest' parameter value
SDLoc dl (Op);
const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
if (Subtarget.is64Bit()) {
SDValue OutChains[6];
// Large code-model.
const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
// Only the low three encoding bits go into the opcode/ModRM byte; the
// REX prefix below supplies the extension bit for R10/R11.
const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
// Load the pointer to the nested function into R11.
// Each two-byte prefix+opcode pair is emitted as one i16 store with the
// prefix in the low byte, so on little-endian x86 the prefix is written
// first.
unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
SDValue Addr = Trmp;
OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
Addr, MachinePointerInfo(TrmpAddr));
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(2, dl, MVT::i64));
OutChains[1] =
DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
/* Alignment = */ 2);
// Load the 'nest' parameter value into R10.
// R10 is specified in X86CallingConv.td
OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(10, dl, MVT::i64));
OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
Addr, MachinePointerInfo(TrmpAddr, 10));
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(12, dl, MVT::i64));
OutChains[3] =
DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
/* Alignment = */ 2);
// Jump to the nested function.
OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(20, dl, MVT::i64));
OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
Addr, MachinePointerInfo(TrmpAddr, 20));
// ModRM: mod = 3 (register operand), reg field = 4, r/m = low bits of R11.
unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(22, dl, MVT::i64));
OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
Addr, MachinePointerInfo(TrmpAddr, 22));
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
} else {
// 32-bit: the nest register depends on the nested function's calling
// convention.
const Function *Func =
cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
CallingConv::ID CC = Func->getCallingConv();
unsigned NestReg;
switch (CC) {
default:
llvm_unreachable("Unsupported calling convention");
case CallingConv::C:
case CallingConv::X86_StdCall: {
// Pass 'nest' parameter in ECX.
// Must be kept in sync with X86CallingConv.td
NestReg = X86::ECX;
// Check that ECX wasn't needed by an 'inreg' parameter.
FunctionType *FTy = Func->getFunctionType();
const AttributeList &Attrs = Func->getAttributes();
if (!Attrs.isEmpty() && !Func->isVarArg()) {
// Count how many 32-bit registers the 'inreg' parameters occupy;
// attribute indices for parameters start at 1.
unsigned InRegCount = 0;
unsigned Idx = 1;
for (FunctionType::param_iterator I = FTy->param_begin(),
E = FTy->param_end(); I != E; ++I, ++Idx)
if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
auto &DL = DAG.getDataLayout();
// FIXME: should only count parameters that are lowered to integers.
InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
}
// More than two register-sized 'inreg' units is treated as a clash
// with the nest register.
if (InRegCount > 2) {
report_fatal_error("Nest register in use - reduce number of inreg"
" parameters!");
}
}
break;
}
case CallingConv::X86_FastCall:
case CallingConv::X86_ThisCall:
case CallingConv::Fast:
// Pass 'nest' parameter in EAX.
// Must be kept in sync with X86CallingConv.td
NestReg = X86::EAX;
break;
}
SDValue OutChains[4];
SDValue Addr, Disp;
// The jmp displacement is relative to the end of the 10-byte trampoline.
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(10, dl, MVT::i32));
Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
// This is storing the opcode for MOV32ri.
const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
OutChains[0] =
DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
Trmp, MachinePointerInfo(TrmpAddr));
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(1, dl, MVT::i32));
OutChains[1] =
DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
/* Alignment = */ 1);
const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(5, dl, MVT::i32));
OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
Addr, MachinePointerInfo(TrmpAddr, 5),
/* Alignment = */ 1);
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(6, dl, MVT::i32));
OutChains[3] =
DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
/* Alignment = */ 1);
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
}
}
/// Lower ISD::FLT_ROUNDS_ by reading the FP control word through a stack slot
/// and remapping its rounding-control field.
SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                                            SelectionDAG &DAG) const {
  // The rounding mode occupies bits 11:10 of the FP control word (CW):
  //   00 -> round to nearest
  //   01 -> round to -inf
  //   10 -> round to +inf
  //   11 -> round to 0
  //
  // FLT_ROUNDS instead wants:
  //   -1 -> undefined
  //    0 -> round to 0
  //    1 -> round to nearest
  //    2 -> round to +inf
  //    3 -> round to -inf
  //
  // The mapping is computed as:
  //   (((((CW & 0x800) >> 11) | ((CW & 0x400) >> 9)) + 1) & 3)
  MachineFunction &MF = DAG.getMachineFunction();
  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
  unsigned StackAlignment = TFI.getStackAlignment();
  MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);

  // Spill the control word into a fresh two-byte stack object.
  int SlotFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
  SDValue Slot = DAG.getFrameIndex(SlotFI, getPointerTy(DAG.getDataLayout()));
  MachineMemOperand *StoreMMO =
      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SlotFI),
                              MachineMemOperand::MOStore, 2, 2);
  SDValue StoreOps[] = { DAG.getEntryNode(), Slot };
  SDValue Chain =
      DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other),
                              StoreOps, MVT::i16, StoreMMO);

  // Read it back as an i16.
  SDValue CtrlWord =
      DAG.getLoad(MVT::i16, DL, Chain, Slot, MachinePointerInfo());

  // Bit 11 of the control word becomes bit 0 of the intermediate value.
  SDValue Bit11 = DAG.getNode(ISD::AND, DL, MVT::i16, CtrlWord,
                              DAG.getConstant(0x800, DL, MVT::i16));
  SDValue LowBit = DAG.getNode(ISD::SRL, DL, MVT::i16, Bit11,
                               DAG.getConstant(11, DL, MVT::i8));

  // Bit 10 of the control word becomes bit 1 of the intermediate value.
  SDValue Bit10 = DAG.getNode(ISD::AND, DL, MVT::i16, CtrlWord,
                              DAG.getConstant(0x400, DL, MVT::i16));
  SDValue HighBit = DAG.getNode(ISD::SRL, DL, MVT::i16, Bit10,
                                DAG.getConstant(9, DL, MVT::i8));

  // Combine, add one and wrap to two bits.
  SDValue Swapped = DAG.getNode(ISD::OR, DL, MVT::i16, LowBit, HighBit);
  SDValue PlusOne = DAG.getNode(ISD::ADD, DL, MVT::i16, Swapped,
                                DAG.getConstant(1, DL, MVT::i16));
  SDValue Rounds = DAG.getNode(ISD::AND, DL, MVT::i16, PlusOne,
                               DAG.getConstant(3, DL, MVT::i16));

  // Resize the i16 result to the requested type.
  unsigned ResizeOpc = ISD::ZERO_EXTEND;
  if (VT.getSizeInBits() < 16)
    ResizeOpc = ISD::TRUNCATE;
  return DAG.getNode(ResizeOpc, DL, VT, Rounds);
}
// Lower a unary integer vector op by applying the same opcode to the low and
// high halves of the source and concatenating the two half-width results.
static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  SDValue Src = Op.getOperand(0);
  assert(EltVT == Src.getSimpleValueType().getVectorElementType() &&
         "Src and Op should have the same element type!");

  const unsigned HalfNumElems = VT.getVectorNumElements() / 2;
  const unsigned HalfSizeInBits = VT.getSizeInBits() / 2;
  const unsigned Opcode = Op.getOpcode();
  SDLoc dl(Op);

  // Pull out the two halves of the source.
  SDValue SrcLo = extractSubVector(Src, 0, DAG, dl, HalfSizeInBits);
  SDValue SrcHi = extractSubVector(Src, HalfNumElems, DAG, dl, HalfSizeInBits);

  // Apply the operation to each half, then glue the results back together.
  MVT HalfVT = MVT::getVectorVT(EltVT, HalfNumElems);
  SDValue ResLo = DAG.getNode(Opcode, dl, HalfVT, SrcLo);
  SDValue ResHi = DAG.getNode(Opcode, dl, HalfVT, SrcHi);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, ResLo, ResHi);
}
// Split a 256-bit unary integer vector op into two 128-bit ops.
static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  (void)VT; // Only referenced by the assertion below.
  assert(VT.is256BitVector() && VT.isInteger() &&
         "Only handle AVX 256-bit vector integer operation");
  return LowerVectorIntUnary(Op, DAG);
}
// Split a 512-bit unary integer vector op into two 256-bit ops.
static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  (void)VT; // Only referenced by the assertion below.
  assert(VT.is512BitVector() && VT.isInteger() &&
         "Only handle AVX 512-bit vector integer operation");
  return LowerVectorIntUnary(Op, DAG);
}
/// Lower a vector CTLZ through the natively supported dword vector CTLZ.
//
// i8/i16 element vectors are handled as
//   sub(trunc(ctlz(zext32(x))), 32 - EltBits).
// When the zero-extended vector would not be usable (more than 16 elements, or
// exactly 16 without 512-bit extension support) the op is split in half and
// each half is lowered separately.
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  assert(Op.getOpcode() == ISD::CTLZ);
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  const unsigned NumElems = VT.getVectorNumElements();

  assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
         "Unsupported element type");

  // Too wide to extend in one go: split now, the halves are lowered later.
  if (NumElems > 16)
    return LowerVectorIntUnary(Op, DAG);
  if (NumElems == 16 && !Subtarget.canExtendTo512DQ())
    return LowerVectorIntUnary(Op, DAG);

  MVT WideVT = MVT::getVectorVT(MVT::i32, NumElems);
  assert((WideVT.is256BitVector() || WideVT.is512BitVector()) &&
         "Unsupported value type for operation");

  // Count leading zeros on the widened elements (vplzcntd), narrow the result
  // and remove the zeros contributed by the extension.
  SDValue Widened = DAG.getNode(ISD::ZERO_EXTEND, dl, WideVT, Op.getOperand(0));
  SDValue WideCount = DAG.getNode(ISD::CTLZ, dl, WideVT, Widened);
  SDValue NarrowCount = DAG.getNode(ISD::TRUNCATE, dl, VT, WideCount);
  SDValue ExtraZeros = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
  return DAG.getNode(ISD::SUB, dl, VT, NarrowCount, ExtraZeros);
}
// Lower CTLZ using a PSHUFB lookup table implementation.
//
// The input is viewed as a vector of bytes. A 16-entry table gives the
// leading-zero count of each nibble; the per-byte count is built from the two
// nibble counts, and the result is then widened step by step (i8 -> i16 ->
// ...) until it has the element type of Op, combining the counts of the two
// halves at every step.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
int NumElts = VT.getVectorNumElements();
int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
// Byte vector with the same total width as VT.
MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
// Per-nibble leading zero PSHUFB lookup table.
const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
/* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
/* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
/* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
// Repeat the 16-entry table so that it fills the whole byte vector.
SmallVector<SDValue, 64> LUTVec;
for (int i = 0; i < NumBytes; ++i)
LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
// Begin by bitcasting the input to byte vector, then split those bytes
// into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
// If the hi input nibble is zero then we add both results together, otherwise
// we just take the hi result (by masking the lo result to zero before the
// add).
SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
SDValue Zero = DAG.getConstant(0, DL, CurrVT);
SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
SDValue Lo = Op0;
SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
// HiZ is a per-byte mask: set where the high nibble is zero.
SDValue HiZ;
if (CurrVT.is512BitVector()) {
// For 512-bit vectors the compare produces a vXi1 mask, which is then
// sign-extended back to a byte mask.
MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
} else {
HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
}
Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
// Keep the low-nibble count only where the high nibble was zero.
Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
// Merge result back from vXi8 back to VT, working on the lo/hi halves
// of the current vector width in the same way we did for the nibbles.
// If the upper half of the input element is zero then add the halves'
// leading zero counts together, otherwise just use the upper half's.
// Double the width of the result until we are at target width.
while (CurrVT != VT) {
int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
int CurrNumElts = CurrVT.getVectorNumElements();
MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
// Check if the upper half of the input element is zero.
// The compare is done at the current (narrower) element width, so every
// CurrVT-sized piece of the input gets its own all-ones/zero mask.
if (CurrVT.is512BitVector()) {
MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
} else {
HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
}
HiZ = DAG.getBitcast(NextVT, HiZ);
// Move the upper/lower halves to the lower bits as we'll be extending to
// NextVT. Mask the lower result to zero if HiZ is true and add the results
// together.
// R0 is the upper half's count moved down to the low bits; R1 is the
// running result ANDed with the upper half's is-zero mask moved down.
SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
CurrVT = NextVT;
}
return Res;
}
/// Pick a lowering strategy for a vector CTLZ: the CDI-based lowering when
/// usable, splitting when the vector is too wide for the available integer
/// support, and the PSHUFB lookup table otherwise.
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
                               const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  if (Subtarget.hasCDI()) {
    // vXi8 vectors have to be promoted to 512 bits to reach vXi32, so they
    // additionally require 512-bit extension support.
    const bool CanUseCDI = Subtarget.canExtendTo512DQ() ||
                           VT.getVectorElementType() != MVT::i8;
    if (CanUseCDI)
      return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
  }

  // 256-bit op without 256-bit integer support: use two 128-bit ops.
  const bool Split256 = VT.is256BitVector() && !Subtarget.hasInt256();
  if (Split256)
    return Lower256IntUnary(Op, DAG);

  // 512-bit op without BWI: use two 256-bit ops.
  const bool Split512 = VT.is512BitVector() && !Subtarget.hasBWI();
  if (Split512)
    return Lower512IntUnary(Op, DAG);

  assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
  return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
}
/// Lower CTLZ-style nodes. Vectors are forwarded to LowerVectorCTLZ; scalars
/// are lowered to a BSR followed by an XOR with NumBits - 1. When the opcode
/// is exactly ISD::CTLZ, a CMOV in between supplies the value for a zero
/// input.
static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
                         SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  const unsigned NumBits = VT.getSizeInBits();
  const unsigned Opc = Op.getOpcode();
  SDLoc dl(Op);

  if (VT.isVector())
    return LowerVectorCTLZ(Op, dl, Subtarget, DAG);

  // There is no i8 bsr, so i8 inputs are zero-extended and scanned as i32.
  const bool IsByte = VT == MVT::i8;
  MVT ScanVT = VT;
  SDValue Src = Op.getOperand(0);
  if (IsByte) {
    ScanVT = MVT::i32;
    Src = DAG.getNode(ISD::ZERO_EXTEND, dl, ScanVT, Src);
  }

  // bsr (reverse bit scan) yields the index and also sets EFLAGS.
  SDValue Result =
      DAG.getNode(X86ISD::BSR, dl, DAG.getVTList(ScanVT, MVT::i32), Src);

  if (Opc == ISD::CTLZ) {
    // A zero source makes bsr set ZF; substitute 2*NumBits-1 in that case so
    // that the XOR below produces NumBits.
    SDValue CMovOps[] = {
      Result,
      DAG.getConstant(NumBits + NumBits - 1, dl, ScanVT),
      DAG.getConstant(X86::COND_E, dl, MVT::i8),
      Result.getValue(1)
    };
    Result = DAG.getNode(X86ISD::CMOV, dl, ScanVT, CMovOps);
  }

  // Turn the bit index into a leading-zero count.
  SDValue TopBit = DAG.getConstant(NumBits - 1, dl, ScanVT);
  Result = DAG.getNode(ISD::XOR, dl, ScanVT, Result, TopBit);

  if (IsByte)
    Result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Result);
  return Result;
}
/// Lower a scalar ISD::CTTZ to a BSF plus a CMOV that supplies NumBits when
/// the source is zero.
static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
                         SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
         "Only scalar CTTZ requires custom lowering");

  const unsigned NumBits = VT.getScalarSizeInBits();
  SDValue Src = Op.getOperand(0);
  SDLoc dl(Op);

  // bsf (forward bit scan) yields the index and also sets EFLAGS.
  SDValue Scan =
      DAG.getNode(X86ISD::BSF, dl, DAG.getVTList(VT, MVT::i32), Src);

  // A zero source makes bsf set ZF; the result is NumBits in that case.
  SDValue CMovOps[] = {
    Scan,
    DAG.getConstant(NumBits, dl, VT),
    DAG.getConstant(X86::COND_E, dl, MVT::i8),
    Scan.getValue(1)
  };
  return DAG.getNode(X86ISD::CMOV, dl, VT, CMovOps);
}
/// Perform a 256-bit binary integer vector operation as two 128-bit
/// operations on the low and high halves, then concatenate the results.
static SDValue split256IntArith(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  assert(VT.is256BitVector() && VT.isInteger() &&
         "Unsupported value type for operation");

  const unsigned HalfNumElems = VT.getVectorNumElements() / 2;
  const unsigned Opcode = Op.getOpcode();
  SDLoc dl(Op);

  // Halves of the left-hand operand.
  SDValue LHS = Op.getOperand(0);
  SDValue LHSLo = extract128BitVector(LHS, 0, DAG, dl);
  SDValue LHSHi = extract128BitVector(LHS, HalfNumElems, DAG, dl);

  // Halves of the right-hand operand.
  SDValue RHS = Op.getOperand(1);
  SDValue RHSLo = extract128BitVector(RHS, 0, DAG, dl);
  SDValue RHSHi = extract128BitVector(RHS, HalfNumElems, DAG, dl);

  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElems);
  SDValue ResLo = DAG.getNode(Opcode, dl, HalfVT, LHSLo, RHSLo);
  SDValue ResHi = DAG.getNode(Opcode, dl, HalfVT, LHSHi, RHSHi);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, ResLo, ResHi);
}
/// Perform a 512-bit binary integer vector operation as two 256-bit
/// operations on the low and high halves, then concatenate the results.
static SDValue split512IntArith(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  assert(VT.is512BitVector() && VT.isInteger() &&
         "Unsupported value type for operation");

  const unsigned HalfNumElems = VT.getVectorNumElements() / 2;
  const unsigned Opcode = Op.getOpcode();
  SDLoc dl(Op);

  // Halves of the left-hand operand.
  SDValue LHS = Op.getOperand(0);
  SDValue LHSLo = extract256BitVector(LHS, 0, DAG, dl);
  SDValue LHSHi = extract256BitVector(LHS, HalfNumElems, DAG, dl);

  // Halves of the right-hand operand.
  SDValue RHS = Op.getOperand(1);
  SDValue RHSLo = extract256BitVector(RHS, 0, DAG, dl);
  SDValue RHSHi = extract256BitVector(RHS, HalfNumElems, DAG, dl);

  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElems);
  SDValue ResLo = DAG.getNode(Opcode, dl, HalfVT, LHSLo, RHSLo);
  SDValue ResHi = DAG.getNode(Opcode, dl, HalfVT, LHSHi, RHSHi);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, ResLo, ResHi);
}
/// Custom lowering for integer add/sub:
///  - i16/i32 scalars go to lowerAddSubToHorizontalOp;
///  - anything with i1 elements becomes an XOR of the two operands;
///  - the remaining case must be a 256-bit integer vector, which is split.
static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
                           const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();

  const bool IsNarrowScalar = VT == MVT::i16 || VT == MVT::i32;
  if (IsNarrowScalar)
    return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);

  if (VT.getScalarType() == MVT::i1) {
    SDValue LHS = Op.getOperand(0);
    SDValue RHS = Op.getOperand(1);
    return DAG.getNode(ISD::XOR, SDLoc(Op), VT, LHS, RHS);
  }

  assert(VT.is256BitVector() && VT.isInteger() &&
         "Only handle AVX 256-bit vector integer operation");
  return split256IntArith(Op, DAG);
}
/// Custom lowering for saturating add/sub.
///
/// i1 element types reduce to plain logic ops. 128-bit vectors get a
/// compare+select expansion for the unsigned forms when UMIN/UMAX are not
/// legal, and otherwise return an empty SDValue so the default expansion is
/// used. Everything else must be a 256-bit integer vector and is split.
static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDValue X = Op.getOperand(0);
  SDValue Y = Op.getOperand(1);
  const unsigned Opcode = Op.getOpcode();

  if (VT.getScalarType() == MVT::i1) {
    SDLoc dl(Op);
    // *addsat i1 X, Y --> X | Y
    if (Opcode == ISD::UADDSAT || Opcode == ISD::SADDSAT)
      return DAG.getNode(ISD::OR, dl, VT, X, Y);
    // *subsat i1 X, Y --> X & ~Y
    if (Opcode == ISD::USUBSAT || Opcode == ISD::SSUBSAT)
      return DAG.getNode(ISD::AND, dl, VT, X, DAG.getNOT(dl, Y, VT));
    llvm_unreachable("Expected saturated arithmetic opcode");
  }

  if (!VT.is128BitVector()) {
    assert(VT.is256BitVector() && VT.isInteger() &&
           "Only handle AVX 256-bit vector integer operation");
    return split256IntArith(Op, DAG);
  }

  // 128-bit vectors: without pminu*/pmaxu* the generic min/max-based
  // expansion is undesirable, so expand with compare+select instead.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT CCVT =
      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  SDLoc DL(Op);

  if (Opcode == ISD::UADDSAT && !TLI.isOperationLegal(ISD::UMIN, VT)) {
    // uaddsat X, Y --> (X >u (X + Y)) ? -1 : X + Y
    SDValue Sum = DAG.getNode(ISD::ADD, DL, VT, X, Y);
    SDValue Overflowed = DAG.getSetCC(DL, CCVT, X, Sum, ISD::SETUGT);
    SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
    return DAG.getSelect(DL, VT, Overflowed, AllOnes, Sum);
  }

  if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) {
    // usubsat X, Y --> (X >u Y) ? X - Y : 0
    SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, X, Y);
    SDValue NoUnderflow = DAG.getSetCC(DL, CCVT, X, Y, ISD::SETUGT);
    SDValue ZeroVec = DAG.getConstant(0, DL, VT);
    return DAG.getSelect(DL, VT, NoUnderflow, Diff, ZeroVec);
  }

  // Fall back to the default expansion.
  return SDValue();
}
/// Custom lowering for an ABS node.
///
///  - scalar i16/i32/i64: X86ISD::SUB(0, X) followed by an X86ISD::CMOV with
///    condition COND_GE that consumes the SUB's second (i32) result.
///  - v2i64/v4i64 with SSE4.1: X86ISD::BLENDV between X and 0-X, using X
///    itself as the selector operand.
///  - 256-bit vectors without hasInt256(): handed to Lower256IntUnary.
///  - everything else: returns an empty SDValue (default expansion).
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
                        SelectionDAG &DAG) {
  const MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);

  const bool IsCMovScalar =
      VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64;
  if (IsCMovScalar) {
    // Since X86 does not have CMOV for 8-bit integer, we don't convert
    // 8-bit integer abs to NEG and CMOV.
    SDValue Input = Op.getOperand(0);
    SDValue Negated =
        DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
                    DAG.getConstant(0, DL, VT), Input);
    SDValue CMovOps[] = {Input, Negated,
                         DAG.getConstant(X86::COND_GE, DL, MVT::i8),
                         SDValue(Negated.getNode(), 1)};
    return DAG.getNode(X86ISD::CMOV, DL, VT, CMovOps);
  }

  // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
  const bool IsI64Vec = VT == MVT::v2i64 || VT == MVT::v4i64;
  if (IsI64Vec && Subtarget.hasSSE41()) {
    SDValue Input = Op.getOperand(0);
    SDValue ZeroVec = DAG.getConstant(0, DL, VT);
    SDValue Negated = DAG.getNode(ISD::SUB, DL, VT, ZeroVec, Input);
    return DAG.getNode(X86ISD::BLENDV, DL, VT, Input, Negated, Input);
  }

  if (VT.is256BitVector() && !Subtarget.hasInt256()) {
    assert(VT.isInteger() &&
           "Only handle AVX 256-bit vector integer operation");
    return Lower256IntUnary(Op, DAG);
  }

  // Default to expand.
  return SDValue();
}
/// Custom lowering for SMIN/SMAX/UMIN/UMAX vector nodes.
///
///  - 256-bit vectors whose element type is not i64 are split with
///    split256IntArith.
///  - v8i16 (asserted to be UMIN/UMAX) is rewritten as the signed
///    counterpart on sign-bit-flipped inputs, with the sign bit flipped back
///    on the result.
///  - everything else becomes a setcc + select on the two operands.
static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
  const MVT VT = Op.getSimpleValueType();

  // For AVX1 cases, split to use legal ops (everything but v4i64).
  if (VT.is256BitVector() && VT.getScalarType() != MVT::i64)
    return split256IntArith(Op, DAG);

  SDLoc DL(Op);
  const unsigned Opcode = Op.getOpcode();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  // For pre-SSE41, we can perform UMIN/UMAX v8i16 by flipping the signbit,
  // using the SMIN/SMAX instructions and flipping the signbit back.
  if (VT == MVT::v8i16) {
    assert((Opcode == ISD::UMIN || Opcode == ISD::UMAX) &&
           "Unexpected MIN/MAX opcode");
    SDValue SignBit = DAG.getConstant(APInt::getSignedMinValue(16), DL, VT);
    SDValue FlippedLHS = DAG.getNode(ISD::XOR, DL, VT, LHS, SignBit);
    SDValue FlippedRHS = DAG.getNode(ISD::XOR, DL, VT, RHS, SignBit);
    const unsigned SignedOpcode =
        (Opcode == ISD::UMIN) ? ISD::SMIN : ISD::SMAX;
    SDValue SignedRes =
        DAG.getNode(SignedOpcode, DL, VT, FlippedLHS, FlippedRHS);
    return DAG.getNode(ISD::XOR, DL, VT, SignedRes, SignBit);
  }

  // Else, expand to a compare/select.
  ISD::CondCode Pred;
  if (Opcode == ISD::SMIN)
    Pred = ISD::CondCode::SETLT;
  else if (Opcode == ISD::SMAX)
    Pred = ISD::CondCode::SETGT;
  else if (Opcode == ISD::UMIN)
    Pred = ISD::CondCode::SETULT;
  else if (Opcode == ISD::UMAX)
    Pred = ISD::CondCode::SETUGT;
  else
    llvm_unreachable("Unknown MINMAX opcode");

  SDValue PickLHS = DAG.getSetCC(DL, VT, LHS, RHS, Pred);
  return DAG.getSelect(DL, VT, PickLHS, LHS, RHS);
}
/// Custom lowering for a vector integer MUL node.
///
/// Cases, in the order they are tried:
///  - i1 element types: emitted as an AND of the two operands.
///  - 256-bit vectors when !Subtarget.hasInt256(): split with
///    split256IntArith.
///  - v16i8/v32i8/v64i8: widened to i16 elements, multiplied, and narrowed
///    back (TRUNCATE when the whole vector can be extended at once, otherwise
///    unpack lo/hi + mask + X86ISD::PACKUS).
///  - v4i32: two X86ISD::PMULUDQ (even and odd elements) merged by a shuffle.
///  - v2i64/v4i64/v8i64 (asserted): composed from up to three
///    X86ISD::PMULUDQ partial products, skipping those whose inputs are known
///    to be zero.
static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
// A multiply of i1 values is the same as a logical AND.
if (VT.getScalarType() == MVT::i1)
return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
// Decompose 256-bit ops into 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return split256IntArith(Op, DAG);
SDValue A = Op.getOperand(0);
SDValue B = Op.getOperand(1);
// Lower v16i8/v32i8/v64i8 mul as any-extension to v8i16/v16i16/v32i16
// vector pairs, multiply and truncate.
if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
unsigned NumElts = VT.getVectorNumElements();
// When the whole vector can be widened in one go, extend both operands to
// vXi16, multiply there and truncate the product back to vXi8.
if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
(VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
return DAG.getNode(
ISD::TRUNCATE, dl, VT,
DAG.getNode(ISD::MUL, dl, ExVT,
DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
}
// Otherwise work on two half-width vXi16 vectors (lo and hi parts).
MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
// Extract the lo/hi parts to any extend to i16.
// We're going to keep only the low byte of each result element of the
// pmullw (see the AND with 255 below), so it doesn't matter what's in the
// high byte of each 16-bit element.
SDValue Undef = DAG.getUNDEF(VT);
SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
SDValue BLo, BHi;
if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
// If B (the second operand) is a build vector of constants, manually
// unpackl/unpackh it into constant vectors.
SmallVector<SDValue, 16> LoOps, HiOps;
// Walk B in groups of 16 elements: the first 8 of each group go to the lo
// vector and the last 8 to the hi vector.
for (unsigned i = 0; i != NumElts; i += 16) {
for (unsigned j = 0; j != 8; ++j) {
LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
MVT::i16));
HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
MVT::i16));
}
}
BLo = DAG.getBuildVector(ExVT, dl, LoOps);
BHi = DAG.getBuildVector(ExVT, dl, HiOps);
} else {
BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
}
// Multiply, mask the lower 8bits of the lo/hi results and pack.
SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
}
// Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
if (VT == MVT::v4i32) {
assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
"Should not custom lower when pmulld is available!");
// Extract the odd parts.
static const int UnpackMask[] = { 1, -1, 3, -1 };
SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
// Multiply the even parts.
SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
DAG.getBitcast(MVT::v2i64, A),
DAG.getBitcast(MVT::v2i64, B));
// Now multiply odd parts.
SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
DAG.getBitcast(MVT::v2i64, Aodds),
DAG.getBitcast(MVT::v2i64, Bodds));
Evens = DAG.getBitcast(VT, Evens);
Odds = DAG.getBitcast(VT, Odds);
// Merge the two vectors back together with a shuffle. This expands into 2
// shuffles.
static const int ShufMask[] = { 0, 4, 2, 6 };
return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
}
assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
"Only know how to lower V2I64/V4I64/V8I64 multiply");
assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
// Ahi = psrlqi(a, 32);
// Bhi = psrlqi(b, 32);
//
// AloBlo = pmuludq(a, b);
// AloBhi = pmuludq(a, Bhi);
// AhiBlo = pmuludq(Ahi, b);
//
// Hi = psllqi(AloBhi + AhiBlo, 32);
// return AloBlo + Hi;
// Use known-bits analysis to find 32-bit halves that are provably zero so
// the corresponding partial products can be dropped.
KnownBits AKnown = DAG.computeKnownBits(A);
KnownBits BKnown = DAG.computeKnownBits(B);
APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
SDValue Zero = DAG.getConstant(0, dl, VT);
// Only multiply lo/hi halves that aren't known to be zero.
SDValue AloBlo = Zero;
if (!ALoIsZero && !BLoIsZero)
AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
SDValue AloBhi = Zero;
if (!ALoIsZero && !BHiIsZero) {
SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
}
SDValue AhiBlo = Zero;
if (!AHiIsZero && !BLoIsZero) {
SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
}
// Combine: the two cross products are summed and shifted into the upper
// 32 bits, then added to the low product.
SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
}
/// Custom lowering for a vector multiply-high node. The operation is treated
/// as signed when the opcode is ISD::MULHS and as unsigned otherwise.
///
/// Cases, in the order they are tried:
///  - 256-bit vectors when !Subtarget.hasInt256(): split with
///    split256IntArith.
///  - v4i32/v8i32/v16i32: two PMULDQ/PMULUDQ multiplies (even and odd
///    elements) whose odd i32 results are shuffled together, plus a fix-up
///    when a signed multiply has to be emulated with PMULUDQ.
///  - vXi8 (asserted): operands are widened to i16, multiplied, shifted right
///    by 8 and narrowed back to i8.
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
bool IsSigned = Op->getOpcode() == ISD::MULHS;
unsigned NumElts = VT.getVectorNumElements();
SDValue A = Op.getOperand(0);
SDValue B = Op.getOperand(1);
// Decompose 256-bit ops into 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return split256IntArith(Op, DAG);
if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
(VT == MVT::v8i32 && Subtarget.hasInt256()) ||
(VT == MVT::v16i32 && Subtarget.hasAVX512()));
// PMULxD operations multiply each even value (starting at 0) of LHS with
// the related value of RHS and produce a widened result.
// E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
// => <2 x i64> <ae|cg>
//
// In other words, to have all the results, we need to perform two PMULxD:
// 1. one with the even values.
// 2. one with the odd values.
// To achieve #2, we need to place the odd values at an even position.
//
// Place the odd value at an even position (basically, shift all values 1
// step to the left):
// The mask is sized for the widest case (16 elements); only the first
// NumElts entries are used below.
const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
9, -1, 11, -1, 13, -1, 15, -1};
// <a|b|c|d> => <b|undef|d|undef>
SDValue Odd0 = DAG.getVectorShuffle(VT, dl, A, A,
makeArrayRef(&Mask[0], NumElts));
// <e|f|g|h> => <f|undef|h|undef>
SDValue Odd1 = DAG.getVectorShuffle(VT, dl, B, B,
makeArrayRef(&Mask[0], NumElts));
// Emit two multiplies, one for the lower 2 ints and one for the higher 2
// ints.
MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
// The signed multiply is only used when SSE4.1 is available; otherwise the
// unsigned multiply is used and corrected afterwards.
unsigned Opcode =
(IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
// PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
// => <2 x i64> <ae|cg>
SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
DAG.getBitcast(MulVT, A),
DAG.getBitcast(MulVT, B)));
// PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
// => <2 x i64> <bf|dh>
SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
DAG.getBitcast(MulVT, Odd0),
DAG.getBitcast(MulVT, Odd1)));
// Shuffle it back into the right order.
// Result element i takes the odd i32 (index 2*(i/2)+1) of the matching
// 64-bit product: from Mul1 when i is even, from Mul2 (indices offset by
// NumElts) when i is odd.
SmallVector<int, 16> ShufMask(NumElts);
for (int i = 0; i != (int)NumElts; ++i)
ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
// If we have a signed multiply but no PMULDQ fix up the result of an
// unsigned multiply.
if (IsSigned && !Subtarget.hasSSE41()) {
// Fixup = (A < 0 ? B : 0) + (B < 0 ? A : 0), subtracted from the unsigned
// high result.
SDValue Zero = DAG.getConstant(0, dl, VT);
SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
}
return Res;
}
// Only i8 vectors should need custom lowering after this.
assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
(VT == MVT::v64i8 && Subtarget.hasBWI())) &&
"Unsupported vector type");
// Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
// logical shift down the upper half and pack back to i8.
// With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
// and then ashr/lshr the upper bits down to the lower bits before multiply.
unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
// When the whole vector can be widened at once, extend, multiply, shift the
// high byte of each product down and truncate.
if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
(VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
}
// For signed 512-bit vectors, split into 256-bit vectors to allow the
// sign-extension to occur.
if (VT == MVT::v64i8 && IsSigned)
return split512IntArith(Op, DAG);
// Signed AVX2 implementation - extend xmm subvectors to ymm.
if (VT == MVT::v32i8 && IsSigned) {
MVT ExVT = MVT::v16i16;
SDValue ALo = extract128BitVector(A, 0, DAG, dl);
SDValue BLo = extract128BitVector(B, 0, DAG, dl);
SDValue AHi = extract128BitVector(A, NumElts / 2, DAG, dl);
SDValue BHi = extract128BitVector(B, NumElts / 2, DAG, dl);
ALo = DAG.getNode(ExAVX, dl, ExVT, ALo);
BLo = DAG.getNode(ExAVX, dl, ExVT, BLo);
AHi = DAG.getNode(ExAVX, dl, ExVT, AHi);
BHi = DAG.getNode(ExAVX, dl, ExVT, BHi);
SDValue Lo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
SDValue Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Lo, 8, DAG);
Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Hi, 8, DAG);
// Bitcast back to VT and then pack all the even elements from Lo and Hi.
// Shuffle lowering should turn this into PACKUS+PERMQ
Lo = DAG.getBitcast(VT, Lo);
Hi = DAG.getBitcast(VT, Hi);
return DAG.getVectorShuffle(VT, dl, Lo, Hi,
{ 0, 2, 4, 6, 8, 10, 12, 14,
16, 18, 20, 22, 24, 26, 28, 30,
32, 34, 36, 38, 40, 42, 44, 46,
48, 50, 52, 54, 56, 58, 60, 62});
}
// For signed v16i8 and all unsigned vXi8 we will unpack the low and high
// half of each 128 bit lane to widen to a vXi16 type. Do the multiplies,
// shift the results and pack the half lane results back together.
MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
// Shuffle mask that moves elements 8..15 into positions 0..7 (remaining
// positions undefined); only used on the v16i8 SSE4.1 signed path below.
static const int PSHUFDMask[] = { 8, 9, 10, 11, 12, 13, 14, 15,
-1, -1, -1, -1, -1, -1, -1, -1};
// Extract the lo parts and zero/sign extend to i16.
// Only use SSE4.1 instructions for signed v16i8 where using unpack requires
// shifts to sign extend. Using unpack for unsigned only requires an xor to
// create zeros and a copy due to tied registers constraints pre-avx. But using
// zero_extend_vector_inreg would require an additional pshufd for the high
// part.
SDValue ALo, AHi;
if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {
ALo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, A);
AHi = DAG.getVectorShuffle(VT, dl, A, A, PSHUFDMask);
AHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, AHi);
} else if (IsSigned) {
// Unpack with undef as the first operand so each byte of A lands in the
// upper byte of an i16 element, then arithmetic-shift right by 8 to sign
// extend it.
ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), A));
AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), A));
ALo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, ALo, 8, DAG);
AHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, AHi, 8, DAG);
} else {
// Unsigned: interleave with zeros to zero extend each byte to i16.
ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A,
DAG.getConstant(0, dl, VT)));
AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A,
DAG.getConstant(0, dl, VT)));
}
SDValue BLo, BHi;
if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
// If B (the second operand) is a build vector of constants, manually
// unpackl/unpackh and extend.
SmallVector<SDValue, 16> LoOps, HiOps;
// Walk B in groups of 16 elements: the first 8 of each group go to the lo
// vector and the last 8 to the hi vector.
for (unsigned i = 0; i != NumElts; i += 16) {
for (unsigned j = 0; j != 8; ++j) {
SDValue LoOp = B.getOperand(i + j);
SDValue HiOp = B.getOperand(i + j + 8);
if (IsSigned) {
LoOp = DAG.getSExtOrTrunc(LoOp, dl, MVT::i16);
HiOp = DAG.getSExtOrTrunc(HiOp, dl, MVT::i16);
} else {
LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
}
LoOps.push_back(LoOp);
HiOps.push_back(HiOp);
}
}
BLo = DAG.getBuildVector(ExVT, dl, LoOps);
BHi = DAG.getBuildVector(ExVT, dl, HiOps);
} else if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {
BLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, B);
BHi = DAG.getVectorShuffle(VT, dl, B, B, PSHUFDMask);
BHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, BHi);
} else if (IsSigned) {
// Same unpack-into-upper-byte + arithmetic shift trick as used for A.
BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), B));
BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), B));
BLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BLo, 8, DAG);
BHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BHi, 8, DAG);
} else {
// Unsigned: interleave with zeros to zero extend each byte to i16.
BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B,
DAG.getConstant(0, dl, VT)));
BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B,
DAG.getConstant(0, dl, VT)));
}
// Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
// pack back to vXi8.
SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG);
RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RHi, 8, DAG);
// Pack the two vXi16 partial results back into a single vXi8 vector.
return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
}
/// Lower a 128-bit integer division/remainder node to a runtime library call
/// on Win64 targets (asserted).
///
/// Every operand is spilled to a 16-byte aligned stack temporary and passed
/// to the libcall as a pointer. The call is declared as returning v2i64,
/// which is bitcast back to the node's 128-bit integer type.
SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget.isTargetWin64() && "Unexpected target");
  EVT ResVT = Op.getValueType();
  assert(ResVT.isInteger() && ResVT.getSizeInBits() == 128 &&
         "Unexpected return type for lowering");

  // Pick the libcall and signedness that match the opcode.
  RTLIB::Libcall LC;
  bool IsSignedOp;
  switch (Op->getOpcode()) {
  case ISD::SDIV:
    LC = RTLIB::SDIV_I128;
    IsSignedOp = true;
    break;
  case ISD::UDIV:
    LC = RTLIB::UDIV_I128;
    IsSignedOp = false;
    break;
  case ISD::SREM:
    LC = RTLIB::SREM_I128;
    IsSignedOp = true;
    break;
  case ISD::UREM:
    LC = RTLIB::UREM_I128;
    IsSignedOp = false;
    break;
  case ISD::SDIVREM:
    LC = RTLIB::SDIVREM_I128;
    IsSignedOp = true;
    break;
  case ISD::UDIVREM:
    LC = RTLIB::UDIVREM_I128;
    IsSignedOp = false;
    break;
  default:
    llvm_unreachable("Unexpected request for libcall!");
  }

  SDLoc dl(Op);
  SDValue Chain = DAG.getEntryNode();
  TargetLowering::ArgListTy Args;

  // Store each operand to its own stack slot and pass the slot's address.
  const unsigned NumOperands = Op->getNumOperands();
  for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
    SDValue Operand = Op->getOperand(Idx);
    EVT ArgVT = Operand.getValueType();
    assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
           "Unexpected argument type for lowering");

    SDValue Slot = DAG.CreateStackTemporary(ArgVT, 16);
    Chain = DAG.getStore(Chain, dl, Operand, Slot, MachinePointerInfo(),
                         /* Alignment = */ 16);

    TargetLowering::ArgListEntry Entry;
    Entry.Node = Slot;
    Entry.Ty = PointerType::get(ArgVT.getTypeForEVT(*DAG.getContext()), 0);
    Entry.IsSExt = false;
    Entry.IsZExt = false;
    Args.push_back(Entry);
  }

  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
                                         getPointerTy(DAG.getDataLayout()));
  Type *RetTy =
      static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext());

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl);
  CLI.setChain(Chain);
  CLI.setLibCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args));
  CLI.setInRegister();
  CLI.setSExtResult(IsSignedOp);
  CLI.setZExtResult(!IsSignedOp);

  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  return DAG.getBitcast(ResVT, CallResult.first);
}
// Return true if the shift-by-immediate form selected by Opcode is natively
// supported by the Subtarget for vector type VT.
//
// Elements narrower than 16 bits are never supported. 512-bit vectors need
// AVX-512 (plus BWI for 16-bit elements). 128/256-bit vectors need SSE2/Int256
// respectively; in addition ISD::SRA on v2i64/v4i64 needs AVX-512.
static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
                                        unsigned Opcode) {
  const unsigned EltBits = VT.getScalarSizeInBits();
  if (EltBits < 16)
    return false;

  if (VT.is512BitVector() && Subtarget.hasAVX512() &&
      (EltBits > 16 || Subtarget.hasBWI()))
    return true;

  const bool HasLogicalShift =
      (VT.is128BitVector() && Subtarget.hasSSE2()) ||
      (VT.is256BitVector() && Subtarget.hasInt256());
  if (Opcode != ISD::SRA)
    return HasLogicalShift;

  // Arithmetic shifts of 64-bit elements are only available with AVX-512.
  const bool Is64BitEltVec = VT == MVT::v2i64 || VT == MVT::v4i64;
  return HasLogicalShift && (Subtarget.hasAVX512() || !Is64BitEltVec);
}
// Return true if a shift by a single variable amount shared by all vector
// lanes is natively supported. These instructions are defined together with
// the shift-by-immediate forms, so the answer is the same as for
// SupportedVectorShiftWithImm.
static bool SupportedVectorShiftWithBaseAmnt(MVT VT,
                                             const X86Subtarget &Subtarget,
                                             unsigned Opcode) {
  const bool SameAsImmForm = SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
  return SameAsImmForm;
}
// Return true if the per-element variable-shift form selected by Opcode is
// natively supported by the Subtarget for vector type VT.
//
// Requires Int256 and elements of at least 16 bits; 16-bit elements also need
// BWI. With AVX-512 everything remaining is supported; otherwise only 128/256
// bit vectors are, and ISD::SRA additionally excludes v2i64/v4i64.
static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
                                    unsigned Opcode) {
  const unsigned EltBits = VT.getScalarSizeInBits();
  if (!Subtarget.hasInt256() || EltBits < 16)
    return false;

  // vXi16 supported only on AVX-512, BWI
  if (EltBits == 16 && !Subtarget.hasBWI())
    return false;

  if (Subtarget.hasAVX512())
    return true;

  const bool Is128Or256 = VT.is128BitVector() || VT.is256BitVector();
  if (Opcode != ISD::SRA)
    return Is128Or256;

  return Is128Or256 && VT != MVT::v2i64 && VT != MVT::v4i64;
}
/// Lower a vector shift whose amount operand is a constant splat (as reported
/// by isConstantSplat).
///
/// Returns an empty SDValue when the amount is not a constant splat or when
/// none of the cases below applies, and UNDEF when the amount is >= the
/// element width.
///
/// Cases, in order:
///  - natively supported shift-by-immediate: emitted directly.
///  - SRA on v2i64 (without XOP) / v4i64 (with Int256): built from 32-bit
///    shifts and a shuffle, see ArithmeticShiftRight64.
///  - vXi8: done as a vXi16 shift plus a mask, with special cases for
///    shl-by-1, sra-by-7 and XOP.
static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
// Helper: emulate a 64-bit-element arithmetic shift right by ShiftAmt on
// R using operations on the vector reinterpreted as i32 elements.
auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
SDValue Ex = DAG.getBitcast(ExVT, R);
// ashr(R, 63) === cmp_slt(R, 0)
if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
"Unsupported PCMPGT op");
return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
}
if (ShiftAmt >= 32) {
// Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
SDValue Upper =
getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
ShiftAmt - 32, DAG);
// Shuffle indices >= the ExVT element count select from Lower; each
// result i64 takes its low i32 from Lower's odd element and its high i32
// from Upper's odd element.
if (VT == MVT::v2i64)
Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
if (VT == MVT::v4i64)
Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
{9, 1, 11, 3, 13, 5, 15, 7});
} else {
// SRA upper i32, SRL whole i64 and select lower i32.
SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
ShiftAmt, DAG);
SDValue Lower =
getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
Lower = DAG.getBitcast(ExVT, Lower);
// Each result i64 takes its low i32 from Lower's even element and its
// high i32 from Upper's odd element.
if (VT == MVT::v2i64)
Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
if (VT == MVT::v4i64)
Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
{8, 1, 10, 3, 12, 5, 14, 7});
}
return DAG.getBitcast(VT, Ex);
};
// Optimize shl/srl/sra with constant shift amount.
APInt APIntShiftAmt;
if (!isConstantSplat(Amt, APIntShiftAmt))
return SDValue();
// If the shift amount is out of range, return undef.
if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
return DAG.getUNDEF(VT);
uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
// i64 SRA needs to be performed as partial shifts.
if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
(Subtarget.hasInt256() && VT == MVT::v4i64)) &&
Op.getOpcode() == ISD::SRA)
return ArithmeticShiftRight64(ShiftAmt);
// vXi8 shifts: performed on the vector viewed as half as many i16 elements,
// then fixed up per byte.
if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
VT == MVT::v64i8) {
unsigned NumElts = VT.getVectorNumElements();
MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
// Simple i8 add case
if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
return DAG.getNode(ISD::ADD, dl, VT, R, R);
// ashr(R, 7) === cmp_slt(R, 0)
if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
SDValue Zeros = DAG.getConstant(0, dl, VT);
if (VT.is512BitVector()) {
assert(VT == MVT::v64i8 && "Unexpected element type!");
// For the 512-bit type the compare yields a v64i1 mask which is then
// sign extended back to v64i8.
SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
}
return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
}
// XOP can shift v16i8 directly instead of as shift v8i16 + mask.
if (VT == MVT::v16i8 && Subtarget.hasXOP())
return SDValue();
if (Op.getOpcode() == ISD::SHL) {
// Make a large shift.
SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
ShiftAmt, DAG);
SHL = DAG.getBitcast(VT, SHL);
// Zero out the rightmost bits.
APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
}
if (Op.getOpcode() == ISD::SRL) {
// Make a large shift.
SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
ShiftAmt, DAG);
SRL = DAG.getBitcast(VT, SRL);
// Zero out the leftmost bits.
return DAG.getNode(ISD::AND, dl, VT, SRL,
DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
}
if (Op.getOpcode() == ISD::SRA) {
// ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
// Mask holds the position the sign bit (bit 7) moves to after the logical
// shift. The generic ISD::SRL created here is presumably lowered again by
// the SRL path above -- NOTE(review): confirm.
SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
return Res;
}
llvm_unreachable("Unknown shift opcode.");
}
return SDValue();
}
/// Lower a vector shift where every lane is shifted by the same amount, i.e.
/// the amount operand is a splat (DAG.getSplatValue) or, for v2i64, a bitcast
/// of a BUILD_VECTOR whose 64-bit groups are all identical.
///
/// Returns an empty SDValue when no uniform-amount lowering applies.
static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
unsigned Opcode = Op.getOpcode();
// X86OpcI / X86OpcV: the target opcodes returned by
// getTargetVShiftUniformOpcode for its boolean argument false / true.
unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true);
if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) {
if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
MVT EltVT = VT.getVectorElementType();
assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
// Widen the scalar shift amount before handing it to getTargetVShiftNode.
// NOTE(review): given the assert above, the first condition (wider than
// i32 but not i64) looks unsatisfiable for integer element types --
// confirm whether it is dead.
if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
else if (EltVT.bitsLT(MVT::i32))
BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
}
// vXi8 shifts - shift as v8i16 + mask result.
if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
(VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
VT == MVT::v64i8) &&
!Subtarget.hasXOP()) {
unsigned NumElts = VT.getVectorNumElements();
MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
if (SupportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
// Arithmetic shifts are performed as logical right shifts and fixed up
// at the end.
unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
// Create the mask using vXi16 shifts. For shift-rights we need to move
// the upper byte down before splatting the vXi8 mask.
SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
BaseShAmt, Subtarget, DAG);
if (Opcode != ISD::SHL)
BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
8, DAG);
BitMask = DAG.getBitcast(VT, BitMask);
// Splat byte 0 of the mask to every vXi8 element.
BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
SmallVector<int, 64>(NumElts, 0));
// Shift the data as vXi16 and clear the bits that crossed over from the
// neighbouring byte.
SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
DAG.getBitcast(ExtVT, R), BaseShAmt,
Subtarget, DAG);
Res = DAG.getBitcast(VT, Res);
Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
if (Opcode == ISD::SRA) {
// ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
// SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
SignMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask,
BaseShAmt, Subtarget, DAG);
SignMask = DAG.getBitcast(VT, SignMask);
Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
}
return Res;
}
}
}
// Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
Amt = Amt.getOperand(0);
// Ratio = number of BUILD_VECTOR operands that make up one 64-bit lane.
unsigned Ratio = 64 / Amt.getScalarValueSizeInBits();
std::vector<SDValue> Vals(Ratio);
for (unsigned i = 0; i != Ratio; ++i)
Vals[i] = Amt.getOperand(i);
// Every later group of Ratio operands must match the first group,
// otherwise the lanes do not share one shift amount and we give up.
for (unsigned i = Ratio, e = Amt.getNumOperands(); i != e; i += Ratio) {
for (unsigned j = 0; j != Ratio; ++j)
if (Vals[j] != Amt.getOperand(i + j))
return SDValue();
}
// Note: the node is built with the original amount operand
// (Op.getOperand(1)), not the peeled BUILD_VECTOR now held in Amt.
if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
}
return SDValue();
}
// Convert a shift/rotate left amount vector into the equivalent vector of
// multiplication scale factors (1 << Amt per element).
//
// Only v8i16, v4i32, v16i16 (with Int256) and v16i8 (without AVX-512) are
// considered; any other type, or a case with no lowering below, yields an
// empty SDValue.
static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  MVT VT = Amt.getSimpleValueType();

  const bool IsCandidateVT =
      VT == MVT::v8i16 || VT == MVT::v4i32 ||
      (Subtarget.hasInt256() && VT == MVT::v16i16) ||
      (!Subtarget.hasAVX512() && VT == MVT::v16i8);
  if (!IsCandidateVT)
    return SDValue();

  // Constant amounts: fold each element to the constant 1 << amount. Undef
  // and out-of-range amounts become undef.
  if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
    MVT EltVT = VT.getVectorElementType();
    unsigned EltBits = EltVT.getSizeInBits();
    APInt One(EltBits, 1);
    SmallVector<SDValue, 8> Scales;

    for (unsigned Idx = 0, NumElts = VT.getVectorNumElements(); Idx != NumElts;
         ++Idx) {
      SDValue Elt = Amt->getOperand(Idx);
      if (Elt->isUndef()) {
        Scales.push_back(Elt);
        continue;
      }

      ConstantSDNode *AmtNode = cast<ConstantSDNode>(Elt);
      APInt AmtVal(EltBits, AmtNode->getAPIntValue().getZExtValue());
      uint64_t ShAmt = AmtVal.getZExtValue();
      if (ShAmt < EltBits)
        Scales.push_back(DAG.getConstant(One.shl(ShAmt), dl, EltVT));
      else
        Scales.push_back(DAG.getUNDEF(EltVT));
    }
    return DAG.getBuildVector(VT, dl, Scales);
  }

  // If the target doesn't support variable shifts, use either FP conversion
  // or integer multiplication to avoid shifting each element individually.
  if (VT == MVT::v4i32) {
    // Shift the amount left by 23 bits and add 0x3f800000, reinterpret the
    // bits as v4f32 and convert that back to integer.
    SDValue ExpShift = DAG.getConstant(23, dl, VT);
    SDValue Bits = DAG.getNode(ISD::SHL, dl, VT, Amt, ExpShift);
    SDValue Bias = DAG.getConstant(0x3f800000U, dl, VT);
    Bits = DAG.getNode(ISD::ADD, dl, VT, Bits, Bias);
    SDValue AsFloat = DAG.getBitcast(MVT::v4f32, Bits);
    return DAG.getNode(ISD::FP_TO_SINT, dl, VT, AsFloat);
  }

  // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
  if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
    // Zero extend the two halves to v4i32, convert each half recursively and
    // narrow the results back to v8i16.
    SDValue ZeroVec = DAG.getConstant(0, dl, VT);
    SDValue LoAmt =
        DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, ZeroVec));
    SDValue HiAmt =
        DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, ZeroVec));
    SDValue LoScale = convertShiftLeftToScale(LoAmt, dl, Subtarget, DAG);
    SDValue HiScale = convertShiftLeftToScale(HiAmt, dl, Subtarget, DAG);

    if (Subtarget.hasSSE41())
      return DAG.getNode(X86ISD::PACKUS, dl, VT, LoScale, HiScale);

    return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LoScale),
                                DAG.getBitcast(VT, HiScale),
                                {0, 2, 4, 6, 8, 10, 12, 14});
  }

  return SDValue();
}
static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
unsigned EltSizeInBits = VT.getScalarSizeInBits();
bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
unsigned Opc = Op.getOpcode();
unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
assert(VT.isVector() && "Custom lowering only for vector shifts!");
assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
return V;
if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
return V;
if (SupportedVectorVarShift(VT, Subtarget, Opc))
return Op;
// XOP has 128-bit variable logical/arithmetic shifts.
// +ve/-ve Amt = shift left/right.
if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
VT == MVT::v8i16 || VT == MVT::v16i8)) {
if (Opc == ISD::SRL || Opc == ISD::SRA) {
SDValue Zero = DAG.getConstant(0, dl, VT);
Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
}
if (Opc == ISD::SHL || Opc == ISD::SRL)
return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
if (Opc == ISD::SRA)
return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
}
// 2i64 vector logical shifts can efficiently avoid scalarization - do the
// shifts per-lane and then shuffle the partial results back together.
if (VT == MVT::v2i64 && Opc != ISD::SRA) {
// Splat the shift amounts so the scalar shifts above will catch it.
SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
}
// i64 vector arithmetic shift can be emulated with the transform:
// M = lshr(SIGN_MASK, Amt)
// ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
Opc == ISD::SRA) {
SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
R = DAG.getNode(ISD::XOR, dl, VT, R, M);
R = DAG.getNode(ISD::SUB, dl, VT, R, M);
return R;
}
// If possible, lower this shift as a sequence of two shifts by
// constant plus a BLENDing shuffle instead of scalarizing it.
// Example:
// (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
//
// Could be rewritten as:
// (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
//
// The advantage is that the two shifts from the example would be
// lowered as X86ISD::VSRLI nodes in parallel before blending.
if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
(VT == MVT::v16i16 && Subtarget.hasInt256()))) {
SDValue Amt1, Amt2;
unsigned NumElts = VT.getVectorNumElements();
SmallVector<int, 8> ShuffleMask;
for (unsigned i = 0; i != NumElts; ++i) {
SDValue A = Amt->getOperand(i);
if (A.isUndef()) {
ShuffleMask.push_back(SM_SentinelUndef);
continue;
}
if (!Amt1 || Amt1 == A) {
ShuffleMask.push_back(i);
Amt1 = A;
continue;
}
if (!Amt2 || Amt2 == A) {
ShuffleMask.push_back(i + NumElts);
Amt2 = A;
continue;
}
break;
}
// Only perform this blend if we can perform it without loading a mask.
if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
(VT != MVT::v16i16 ||
is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
(VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
canWidenShuffleElements(ShuffleMask))) {
auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
Cst2->getAPIntValue().ult(EltSizeInBits)) {
SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
Cst1->getZExtValue(), DAG);
SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
Cst2->getZExtValue(), DAG);
return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
}
}
}
// If possible, lower this packed shift into a vector multiply instead of
// expanding it into a sequence of scalar shifts.
if (Opc == ISD::SHL)
if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
// Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
// can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
if (Opc == ISD::SRL && ConstantAmt &&
(VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
SDValue Zero = DAG.getConstant(0, dl, VT);
SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
return DAG.getSelect(dl, VT, ZAmt, R, Res);
}
}
// Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
// can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
// TODO: Special case handling for shift by 0/1, really we can afford either
// of these cases in pre-SSE41/XOP/AVX512 but not both.
if (Opc == ISD::SRA && ConstantAmt &&
(VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
!Subtarget.hasAVX512()) ||
DAG.isKnownNeverZero(Amt))) {
SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
SDValue Amt0 =
DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
SDValue Amt1 =
DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
SDValue Sra1 =
getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
Res = DAG.getSelect(dl, VT, Amt0, R, Res);
return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
}
}
// v4i32 Non Uniform Shifts.
// If the shift amount is constant we can shift each lane using the SSE2
// immediate shifts, else we need to zero-extend each lane to the lower i64
// and shift using the SSE2 variable shifts.
// The separate results can then be blended together.
if (VT == MVT::v4i32) {
SDValue Amt0, Amt1, Amt2, Amt3;
if (ConstantAmt) {
Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
} else {
// The SSE2 shifts use the lower i64 as the same shift amount for
// all lanes and the upper i64 is ignored. On AVX we're better off
// just zero-extending, but for SSE just duplicating the top 16-bits is
// cheaper and has the same effect for out of range values.
if (Subtarget.hasAVX()) {
SDValue Z = DAG.getConstant(0, dl, VT);
Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
} else {
SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
{4, 5, 6, 7, -1, -1, -1, -1});
Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
{0, 1, 1, 1, -1, -1, -1, -1});
Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
{2, 3, 3, 3, -1, -1, -1, -1});
Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
{0, 1, 1, 1, -1, -1, -1, -1});
Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
{2, 3, 3, 3, -1, -1, -1, -1});
}
}
unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
// Merge the shifted lane results optimally with/without PBLENDW.
// TODO - ideally shuffle combining would handle this.
if (Subtarget.hasSSE41()) {
SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
}
SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
}
// It's worth extending once and using the vXi16/vXi32 shifts for smaller
// types, but without AVX512 the extra overheads to get from vXi8 to vXi32
// make the existing SSE solution better.
// NOTE: We honor prefered vector width before promoting to 512-bits.
if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
(Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
(Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
(Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
(Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
"Unexpected vector type");
MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
R = DAG.getNode(ExtOpc, dl, ExtVT, R);
Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
return DAG.getNode(ISD::TRUNCATE, dl, VT,
DAG.getNode(Opc, dl, ExtVT, R, Amt));
}
// Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
// extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
(VT == MVT::v16i8 || VT == MVT::v64i8 ||
(VT == MVT::v32i8 && Subtarget.hasInt256())) &&
!Subtarget.hasXOP()) {
int NumElts = VT.getVectorNumElements();
SDValue Cst8 = DAG.getConstant(8, dl, MVT::i8);
// Extend constant shift amount to vXi16 (it doesn't matter if the type
// isn't legal).
MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
"Constant build vector expected");
if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
: DAG.getZExtOrTrunc(R, dl, ExVT);
R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
return DAG.getZExtOrTrunc(R, dl, VT);
}
SmallVector<SDValue, 16> LoAmt, HiAmt;
for (int i = 0; i != NumElts; i += 16) {
for (int j = 0; j != 8; ++j) {
LoAmt.push_back(Amt.getOperand(i + j));
HiAmt.push_back(Amt.getOperand(i + j + 8));
}
}
MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
}
if (VT == MVT::v16i8 ||
(VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
(VT == MVT::v64i8 && Subtarget.hasBWI())) {
MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
if (VT.is512BitVector()) {
// On AVX512BW targets we make use of the fact that VSELECT lowers
// to a masked blend which selects bytes based just on the sign bit
// extracted to a mask.
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
ISD::SETGT);
return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
} else if (Subtarget.hasSSE41()) {
// On SSE41 targets we make use of the fact that VSELECT lowers
// to PBLENDVB which selects bytes based just on the sign bit.
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
}
// On pre-SSE41 targets we test for the sign bit by comparing to
// zero - a negative value will set all bits of the lanes to true
// and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
SDValue Z = DAG.getConstant(0, dl, SelVT);
SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
return DAG.getSelect(dl, SelVT, C, V0, V1);
};
// Turn 'a' into a mask suitable for VSELECT: a = a << 5;
// We can safely do this using i16 shifts as we're only interested in
// the 3 lower bits of each byte.
Amt = DAG.getBitcast(ExtVT, Amt);
Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
Amt = DAG.getBitcast(VT, Amt);
if (Opc == ISD::SHL || Opc == ISD::SRL) {
// r = VSELECT(r, shift(r, 4), a);
SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
R = SignBitSelect(VT, Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
// r = VSELECT(r, shift(r, 2), a);
M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
R = SignBitSelect(VT, Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
// return VSELECT(r, shift(r, 1), a);
M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
R = SignBitSelect(VT, Amt, M, R);
return R;
}
if (Opc == ISD::SRA) {
// For SRA we need to unpack each byte to the higher byte of a i16 vector
// so we can correctly sign extend. We don't care what happens to the
// lower byte.
SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
ALo = DAG.getBitcast(ExtVT, ALo);
AHi = DAG.getBitcast(ExtVT, AHi);
RLo = DAG.getBitcast(ExtVT, RLo);
RHi = DAG.getBitcast(ExtVT, RHi);
// r = VSELECT(r, shift(r, 4), a);
SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
// a += a
ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
// r = VSELECT(r, shift(r, 2), a);
MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
// a += a
ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
// r = VSELECT(r, shift(r, 1), a);
MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
// Logical shift the result back to the lower byte, leaving a zero upper
// byte meaning that we can safely pack with PACKUSWB.
RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
}
}
if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
MVT ExtVT = MVT::v8i32;
SDValue Z = DAG.getConstant(0, dl, VT);
SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
ALo = DAG.getBitcast(ExtVT, ALo);
AHi = DAG.getBitcast(ExtVT, AHi);
RLo = DAG.getBitcast(ExtVT, RLo);
RHi = DAG.getBitcast(ExtVT, RHi);
SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
}
if (VT == MVT::v8i16) {
// If we have a constant shift amount, the non-SSE41 path is best as
// avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.
bool UseSSE41 = Subtarget.hasSSE41() &&
!ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
// On SSE41 targets we make use of the fact that VSELECT lowers
// to PBLENDVB which selects bytes based just on the sign bit.
if (UseSSE41) {
MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
V0 = DAG.getBitcast(ExtVT, V0);
V1 = DAG.getBitcast(ExtVT, V1);
Sel = DAG.getBitcast(ExtVT, Sel);
return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
}
// On pre-SSE41 targets we splat the sign bit - a negative value will
// set all bits of the lanes to true and VSELECT uses that in
// its OR(AND(V0,C),AND(V1,~C)) lowering.
SDValue C =
getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
return DAG.getSelect(dl, VT, C, V0, V1);
};
// Turn 'a' into a mask suitable for VSELECT: a = a << 12;
if (UseSSE41) {
// On SSE41 targets we need to replicate the shift mask in both
// bytes for PBLENDVB.
Amt = DAG.getNode(
ISD::OR, dl, VT,
getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
} else {
Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
}
// r = VSELECT(r, shift(r, 8), a);
SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
R = SignBitSelect(Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
// r = VSELECT(r, shift(r, 4), a);
M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
R = SignBitSelect(Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
// r = VSELECT(r, shift(r, 2), a);
M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
R = SignBitSelect(Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
// return VSELECT(r, shift(r, 1), a);
M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
R = SignBitSelect(Amt, M, R);
return R;
}
// Decompose 256-bit shifts into 128-bit shifts.
if (VT.is256BitVector())
return split256IntArith(Op, DAG);
return SDValue();
}
/// Custom lowering for a vector rotate node.
///
/// \p Op is the rotate: operand 0 is the value being rotated and operand 1 is
/// the per-element rotate amount. Depending on the subtarget and element width
/// this returns one of:
///  - \p Op itself, when the node is handed back unchanged (the AVX512 and XOP
///    variable-rotate paths),
///  - a replacement node (immediate rotate, shift/OR sequence, multiply-based
///    sequence or staged byte rotate),
///  - the result of split256IntArith for 256-bit vectors that are split, or
///  - an empty SDValue() for uniform-constant amounts and for constant vXi8
///    amounts (per the comment below, the former is expanded back to shifts).
///
/// Outside of the AVX512 path only ISD::ROTL is accepted (asserted below).
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
assert(VT.isVector() && "Custom lowering only for vector rotates!");
SDLoc DL(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
unsigned Opcode = Op.getOpcode();
unsigned EltSizeInBits = VT.getScalarSizeInBits();
int NumElts = VT.getVectorNumElements();
// Check for constant splat rotation amount.
// CstSplatIndex ends up as the index of a defined element when every defined
// element of the constant amount holds the same value, and stays/returns to -1
// when the amount is not constant, is entirely undef, or has differing
// elements.
APInt UndefElts;
SmallVector<APInt, 32> EltBits;
int CstSplatIndex = -1;
if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits))
for (int i = 0; i != NumElts; ++i)
if (!UndefElts[i]) {
if (CstSplatIndex < 0 || EltBits[i] == EltBits[CstSplatIndex]) {
CstSplatIndex = i;
continue;
}
CstSplatIndex = -1;
break;
}
// AVX512 implicitly uses modulo rotation amounts.
if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
// Attempt to rotate by immediate.
if (0 <= CstSplatIndex) {
// NOTE: this local 'Op' (an opcode) shadows the SDValue parameter 'Op' for
// the remainder of this inner block only.
unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
// Reduce the splat amount modulo the element width before encoding it as an
// i8 immediate.
uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits);
return DAG.getNode(Op, DL, VT, R,
DAG.getConstant(RotateAmt, DL, MVT::i8));
}
// Else, fall-back on VPROLV/VPRORV.
// Here 'Op' is the SDValue parameter again: the node is returned unchanged.
return Op;
}
// Every path below this point only handles left rotates.
assert((Opcode == ISD::ROTL) && "Only ROTL supported");
// XOP has 128-bit vector variable + immediate rotates.
// +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
// XOP implicitly uses modulo rotation amounts.
if (Subtarget.hasXOP()) {
if (VT.is256BitVector())
return split256IntArith(Op, DAG);
assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
// Attempt to rotate by immediate.
if (0 <= CstSplatIndex) {
uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits);
return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
DAG.getConstant(RotateAmt, DL, MVT::i8));
}
// Use general rotate by variable (per-element).
return Op;
}
// Split 256-bit integers on pre-AVX2 targets.
if (VT.is256BitVector() && !Subtarget.hasAVX2())
return split256IntArith(Op, DAG);
assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
Subtarget.hasAVX2())) &&
"Only vXi32/vXi16/vXi8 vector rotates supported");
// Rotate by an uniform constant - expand back to shifts.
if (0 <= CstSplatIndex)
return SDValue();
bool IsSplatAmt = DAG.isSplatValue(Amt);
// v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by
// the amount bit.
if (EltSizeInBits == 8 && !IsSplatAmt) {
// Non-splat constant byte amounts are not handled here; an empty SDValue is
// returned instead.
if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()))
return SDValue();
// We don't need ModuloAmt here as we just peek at individual bits.
MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
// Helper: pick V0 where the sign bit of the matching element of Sel is set,
// otherwise V1. The result has type SelVT.
auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
if (Subtarget.hasSSE41()) {
// On SSE41 targets we make use of the fact that VSELECT lowers
// to PBLENDVB which selects bytes based just on the sign bit.
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
return DAG.getBitcast(SelVT, DAG.getSelect(DL, VT, Sel, V0, V1));
}
// On pre-SSE41 targets we test for the sign bit by comparing to
// zero - a negative value will set all bits of the lanes to true
// and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
SDValue Z = DAG.getConstant(0, DL, SelVT);
SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
return DAG.getSelect(DL, SelVT, C, V0, V1);
};
// Turn 'a' into a mask suitable for VSELECT: a = a << 5;
// We can safely do this using i16 shifts as we're only interested in
// the 3 lower bits of each byte.
Amt = DAG.getBitcast(ExtVT, Amt);
Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
Amt = DAG.getBitcast(VT, Amt);
// Each stage below builds rot(r, N) as (r << N) | (r >> (8 - N)) and then
// doubles 'a' so the next lower amount bit moves into the sign position.
// r = VSELECT(r, rot(r, 4), a);
SDValue M;
M = DAG.getNode(
ISD::OR, DL, VT,
DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
R = SignBitSelect(VT, Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
// r = VSELECT(r, rot(r, 2), a);
M = DAG.getNode(
ISD::OR, DL, VT,
DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
R = SignBitSelect(VT, Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
// return VSELECT(r, rot(r, 1), a);
M = DAG.getNode(
ISD::OR, DL, VT,
DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
return SignBitSelect(VT, Amt, M, R);
}
// ISD::ROT* uses modulo rotate amounts.
Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
DAG.getConstant(EltSizeInBits - 1, DL, VT));
bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
SupportedVectorVarShift(VT, Subtarget, ISD::SRL);
// Fallback for splats + all supported variable shifts.
// Fallback for non-constants AVX2 vXi16 as well.
if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
// rotl(R, Amt) == (R << Amt) | (R >> (EltSizeInBits - Amt)).
SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
}
// As with shifts, convert the rotation amount to a multiplication factor.
SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
assert(Scale && "Failed to convert ROTL amount to scale");
// v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
if (EltSizeInBits == 16) {
SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
}
// v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
// to v2i64 results at a time. The upper 32-bits contain the wrapped bits
// that can then be OR'd with the lower 32-bits.
assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
// OddMask moves elements 1 and 3 into positions 0 and 2 so that a second
// PMULUDQ can be applied to the odd elements.
static const int OddMask[] = {1, -1, 3, -1};
SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
DAG.getBitcast(MVT::v2i64, R),
DAG.getBitcast(MVT::v2i64, Scale));
SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
DAG.getBitcast(MVT::v2i64, R13),
DAG.getBitcast(MVT::v2i64, Scale13));
Res02 = DAG.getBitcast(VT, Res02);
Res13 = DAG.getBitcast(VT, Res13);
// The first shuffle gathers the even i32 elements of both products and the
// second gathers the odd i32 elements, interleaved back into source element
// order; OR-ing the two combines the halves of each 64-bit product.
return DAG.getNode(ISD::OR, DL, VT,
DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
}
/// Tells whether \p MemType is exactly twice the native width and the matching
/// cmpxchg8b / cmpxchg16b instruction is available on this subtarget.
///
/// This drives the choice of cmpxchg8b/16b when expanding atomic operations;
/// when it returns false the operation is left alone (to become
/// __sync_fetch_and_... calls).
///
/// \returns true for a 64-bit type when cmpxchg8b is available and the target
/// is not 64-bit, true for a 128-bit type when cmpxchg16b is available, and
/// false for every other width.
bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
  switch (MemType->getPrimitiveSizeInBits()) {
  case 64:
    // Twice the native width only in 32-bit mode.
    return Subtarget.hasCmpxchg8b() && !Subtarget.is64Bit();
  case 128:
    return Subtarget.hasCmpxchg16b();
  default:
    return false;
  }
}
// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
// TODO: In 32-bit mode, use FISTP when X87 is available?
/// Decides whether the atomic store \p SI must be expanded at the IR level.
///
/// A 64-bit store on a 32-bit target is never expanded when SSE2 is available,
/// soft-float is off and the enclosing function does not carry the
/// NoImplicitFloat attribute. In every other case the answer is whatever
/// needsCmpXchgNb reports for the stored type.
bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  Type *StoredTy = SI->getValueOperand()->getType();
  const bool ImplicitFloatForbidden =
      SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);

  const bool Is64BitOn32BitTarget =
      StoredTy->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit();
  if (Is64BitOn32BitTarget) {
    const bool FloatOpsUsable =
        !Subtarget.useSoftFloat() && !ImplicitFloatForbidden;
    if (FloatOpsUsable && Subtarget.hasSSE2())
      return false;
  }

  return needsCmpXchgNb(StoredTy);
}
// Note: this turns large loads into lock cmpxchg8b/16b.
// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
/// Chooses how the atomic load \p LI is expanded at the IR level.
///
/// \returns AtomicExpansionKind::CmpXChg when needsCmpXchgNb holds for the
/// loaded type and the 64-bit-on-32-bit shortcut below does not apply;
/// AtomicExpansionKind::None otherwise.
TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
  Type *LoadedTy = LI->getType();
  const bool ImplicitFloatForbidden =
      LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);

  // A 64-bit atomic load on a 32-bit target can be done with movq when SSE2 is
  // enabled. With X87 the value can instead be loaded into an 80-bit X87
  // register and stored to a stack temporary.
  const bool Is64BitOn32BitTarget =
      LoadedTy->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit();
  if (Is64BitOn32BitTarget) {
    const bool FloatOpsUsable =
        !Subtarget.useSoftFloat() && !ImplicitFloatForbidden;
    if (FloatOpsUsable && (Subtarget.hasSSE2() || Subtarget.hasX87()))
      return AtomicExpansionKind::None;
  }

  if (needsCmpXchgNb(LoadedTy))
    return AtomicExpansionKind::CmpXChg;
  return AtomicExpansionKind::None;
}
/// Chooses how the atomic read-modify-write \p AI is expanded at the IR level.
///
/// \returns AtomicExpansionKind::CmpXChg when a cmpxchg loop is required and
/// AtomicExpansionKind::None when the instruction is left as is.
TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  const unsigned NativeBits = Subtarget.is64Bit() ? 64 : 32;
  Type *OperandTy = AI->getType();

  // Operands wider than the native width need cmpxchg8b/16b; when that is not
  // available we default to library calls.
  if (OperandTy->getPrimitiveSizeInBits() > NativeBits) {
    if (needsCmpXchgNb(OperandTy))
      return AtomicExpansionKind::CmpXChg;
    return AtomicExpansionKind::None;
  }

  switch (AI->getOperation()) {
  case AtomicRMWInst::Xchg:
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
    // It's better to use xadd, xsub or xchg for these in all cases.
    return AtomicExpansionKind::None;

  case AtomicRMWInst::Or:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Xor:
    // With an unused result a "lock" prefix on the normal instruction is
    // enough; a used result needs the cmpxchg loop.
    if (AI->use_empty())
      return AtomicExpansionKind::None;
    return AtomicExpansionKind::CmpXChg;

  case AtomicRMWInst::Nand:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
    // These always require a non-trivial set of data operations on x86, so a
    // cmpxchg loop is mandatory.
    return AtomicExpansionKind::CmpXChg;

  default:
    llvm_unreachable("Unknown atomic operation");
  }
}
/// Try to replace the idempotent atomic read-modify-write \p AI with an mfence
/// followed by an atomic load from the same address.
///
/// \returns the newly created load after all uses of \p AI have been
/// redirected to it and \p AI has been erased, or nullptr (leaving \p AI
/// untouched) when the rewrite is not performed: the access is wider than the
/// native width, it is the canonical unused `or 0` form, the sync scope is
/// single-thread, or the subtarget has no mfence.
LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
  Type *MemType = AI->getType();
  // Accesses larger than the native width are turned into cmpxchg/libcalls, so
  // there is no benefit in turning such RMWs into loads, and it is actually
  // harmful as it introduces a mfence.
  if (MemType->getPrimitiveSizeInBits() > NativeWidth)
    return nullptr;

  // If this is a canonical idempotent atomicrmw w/no uses, we have a better
  // lowering available in lowerAtomicArith.
  // TODO: push more cases through this path.
  if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
    if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
        AI->use_empty())
      return nullptr;

  auto Builder = IRBuilder<>(AI);
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  auto SSID = AI->getSyncScopeID();
  // We must restrict the ordering to avoid generating loads with Release or
  // AcquireRelease orderings.
  auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());

  // Before the load we need a fence. Here is an example lifted from
  // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
  // is required:
  // Thread 0:
  //   x.store(1, relaxed);
  //   r1 = y.fetch_add(0, release);
  // Thread 1:
  //   y.fetch_add(42, acquire);
  //   r2 = x.load(relaxed);
  // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
  // lowered to just a load without a fence. A mfence flushes the store buffer,
  // making the optimization clearly correct.
  // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
  // otherwise, we might be able to be more aggressive on relaxed idempotent
  // rmw. In practice, they do not look useful, so we don't try to be
  // especially clever.
  if (SSID == SyncScope::SingleThread)
    // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
    // the IR level, so we must wrap it in an intrinsic.
    return nullptr;

  if (!Subtarget.hasMFence())
    // FIXME: it might make sense to use a locked operation here but on a
    // different cache-line to prevent cache-line bouncing. In practice it
    // is probably a small win, and x86 processors without mfence are rare
    // enough that we do not bother.
    return nullptr;

  Function *MFence =
      llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
  Builder.CreateCall(MFence, {});

  // Finally we can emit the atomic load. The alignment operand of a load is
  // expressed in bytes, so convert the type's size from bits; passing the bit
  // count would claim an alignment eight times stronger than the access
  // actually guarantees (e.g. 32 bytes for an i32).
  // NOTE(review): this assumes the RMW is at least size-aligned and that the
  // type is a whole number of bytes wide - confirm against the IR verifier
  // rules for atomicrmw.
  const unsigned AlignInBytes = AI->getType()->getPrimitiveSizeInBits() / 8;
  LoadInst *Loaded = Builder.CreateAlignedLoad(
      AI->getType(), AI->getPointerOperand(), AlignInBytes);
  Loaded->setAtomic(Order, SSID);
  AI->replaceAllUsesWith(Loaded);
  AI->eraseFromParent();
  return Loaded;
}
/// Emit a LOCK-prefixed OR of zero into a stack slot. The operation leaves
/// memory unchanged but carries a lock prefix. The slot is picked so that it
/// is a) very likely touched by a single thread only, keeping cache traffic
/// low, and b) definitely dereferenceable.
///
/// \param Chain incoming chain; becomes the last operand of the machine node.
/// \returns the chain result (value #1) of the emitted machine node.
static SDValue emitLockedStackOp(SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget,
                                 SDValue Chain, SDLoc DL) {
  // Implementation notes:
  // 1) A LOCK prefix acts as a full read/write reordering barrier for memory
  //    operations issued by the current processor, so the address that is
  //    actually referenced has no bearing on the ordering guarantees.
  //    See: Intel 64 and IA-32 Architectures Software Developer's Manual,
  //    8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
  // 2) An immediate operand looks like the best encoding since no extra
  //    register is needed.
  // 3) OR appears to be very slightly faster than ADD (the gap is small
  //    enough that it may just be measurement noise).
  // 4) Several factors feed into the choice of offset:
  //    a) Without a redzone we default to TOS. (A cache-line aligned stack
  //       object could be allocated to improve this case.)
  //    b) To reduce the chance of introducing a false dependence, a slight
  //       offset from TOS is preferred.
  //    c) To limit cross-thread stack usage concerns - notably the idiomatic
  //       MyThreadPool.run([&StackVars]() {...}) pattern, which captures
  //       state in the TOS frame and reads it from many threads - the offset
  //       should land in a different cache line than the TOS frame.
  //
  // For a general discussion of the tradeoffs and benchmark results, see:
  // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/

  MachineFunction &MF = DAG.getMachineFunction();
  const auto &FrameLowering = *Subtarget.getFrameLowering();
  const unsigned StackDisp = FrameLowering.has128ByteRedZone(MF) ? -64 : 0;

  // The 64-bit and 32-bit forms differ only in the stack pointer register and
  // in the value type used for the base and index registers.
  const bool Is64Bit = Subtarget.is64Bit();
  const unsigned StackPtrReg = Is64Bit ? X86::RSP : X86::ESP;
  const MVT AddrVT = Is64Bit ? MVT::i64 : MVT::i32;

  SDValue ImmZero = DAG.getTargetConstant(0, DL, MVT::i32);
  SDValue MemOps[] = {
      DAG.getRegister(StackPtrReg, AddrVT),           // Base
      DAG.getTargetConstant(1, DL, MVT::i8),          // Scale
      DAG.getRegister(0, AddrVT),                     // Index
      DAG.getTargetConstant(StackDisp, DL, MVT::i32), // Disp
      DAG.getRegister(0, MVT::i16),                   // Segment.
      ImmZero,
      Chain};
  SDNode *LockedOr = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
                                        MVT::Other, MemOps);
  return SDValue(LockedOr, 1);
}
/// Lower an atomic fence node. Operand 0 of \p Op is the chain, operand 1 the
/// ordering constant and operand 2 the sync-scope constant.
///
/// A sequentially-consistent, system-scope fence becomes X86ISD::MFENCE when
/// the subtarget has mfence and a locked stack operation otherwise. Every
/// other fence becomes X86ISD::MEMBARRIER.
static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  SDLoc dl(Op);
  const auto Ordering = static_cast<AtomicOrdering>(
      cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
  const auto Scope = static_cast<SyncScope::ID>(
      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());

  // Only a sequentially-consistent cross-thread fence needs an instruction.
  const bool NeedsInstruction =
      Ordering == AtomicOrdering::SequentiallyConsistent &&
      Scope == SyncScope::System;

  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
  if (!NeedsInstruction)
    return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));

  if (Subtarget.hasMFence())
    return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));

  SDValue InChain = Op.getOperand(0);
  return emitLockedStackOp(DAG, Subtarget, InChain, dl);
}
/// Lower an atomic compare-and-swap node to X86ISD::LCMPXCHG_DAG.
///
/// Operand 2 of \p Op is copied into the accumulator register matching the
/// operation width; the locked compare-exchange node is then built from the
/// chain, operand 1, operand 3 and the width in bytes. The accumulator and
/// EFLAGS are read back afterwards, and the merged result is
/// {accumulator value, COND_E setcc of EFLAGS, chain}.
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
  MVT ValVT = Op.getSimpleValueType();
  SDLoc DL(Op);

  // Select the accumulator sub-register and the operand size in bytes.
  unsigned AccReg = 0;
  unsigned NumBytes = 0;
  switch (ValVT.SimpleTy) {
  case MVT::i8:
    AccReg = X86::AL;
    NumBytes = 1;
    break;
  case MVT::i16:
    AccReg = X86::AX;
    NumBytes = 2;
    break;
  case MVT::i32:
    AccReg = X86::EAX;
    NumBytes = 4;
    break;
  case MVT::i64:
    assert(Subtarget.is64Bit() && "Node not type legal!");
    AccReg = X86::RAX;
    NumBytes = 8;
    break;
  default:
    llvm_unreachable("Invalid value type!");
  }

  // Place operand 2 in the accumulator; the copy produces a chain and glue.
  SDValue CopyIn = DAG.getCopyToReg(Op.getOperand(0), DL, AccReg,
                                    Op.getOperand(2), SDValue());
  SDValue WidthConst = DAG.getTargetConstant(NumBytes, DL, MVT::i8);
  SDValue CmpXchgOps[] = {CopyIn.getValue(0), Op.getOperand(1),
                          Op.getOperand(3), WidthConst, CopyIn.getValue(1)};
  SDVTList CmpXchgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  MachineMemOperand *MemOp = cast<AtomicSDNode>(Op)->getMemOperand();
  SDValue CmpXchg = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL,
                                            CmpXchgVTs, CmpXchgOps, ValVT,
                                            MemOp);

  // Read the accumulator back, then EFLAGS, keeping everything glued.
  SDValue CopyOut = DAG.getCopyFromReg(CmpXchg.getValue(0), DL, AccReg, ValVT,
                                       CmpXchg.getValue(1));
  SDValue Flags = DAG.getCopyFromReg(CopyOut.getValue(1), DL, X86::EFLAGS,
                                     MVT::i32, CopyOut.getValue(2));
  SDValue Succeeded = getSETCC(X86::COND_E, Flags, DL, DAG);

  return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), CopyOut,
                     Succeeded, Flags.getValue(1));
}
// Build an X86ISD::MOVMSK of V producing an i32. A v32i8 input on a
// subtarget without Int256 is split in two: each half gets its own MOVMSK
// and the high half's bits are shifted left by 16 and OR'd into the result.
static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
                           const X86Subtarget &Subtarget) {
  const bool MustSplit =
      V.getSimpleValueType() == MVT::v32i8 && !Subtarget.hasInt256();

  // Common case: a single MOVMSK covers the whole vector.
  if (!MustSplit)
    return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);

  SDValue LoHalf, HiHalf;
  std::tie(LoHalf, HiHalf) = DAG.SplitVector(V, DL);
  SDValue LoBits = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, LoHalf);
  SDValue HiBits = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, HiHalf);

  // Each half contributes 16 mask bits; move the high half into place.
  SDValue ShiftAmt = DAG.getConstant(16, DL, MVT::i8);
  HiBits = DAG.getNode(ISD::SHL, DL, MVT::i32, HiBits, ShiftAmt);
  return DAG.getNode(ISD::OR, DL, MVT::i32, LoBits, HiBits);
}
/// Custom-lower an ISD::BITCAST node.
///
/// The cases below are tried strictly in order:
///  1. i64 -> v64i1: split the i64 into two i32 halves, bitcast each to
///     v32i1 and concatenate.
///  2. v32i16 / v64i8 source to a legal vector destination: split the source,
///     bitcast each half to a half-width destination type and concatenate.
///  3. v16i1 / v32i1 -> scalar integer: sign-extend to a byte vector and use
///     MOVMSK (via getPMOVMSKB).
///  4. v2i32 / v4i16 / v8i8 / i64 source: route the value through a 128-bit
///     vector and extract element 0 (or use MOVDQ2Q for an x86mmx result).
///  5. Anything else is asserted to be a 64-bit-wide MMX conversion.
///
/// Returns \p Op unchanged when the conversion is already legal, and an empty
/// SDValue when the conversion needs to be expanded.
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
MVT DstVT = Op.getSimpleValueType();
// Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
// half to v32i1 and concatenating the result.
if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
assert(Subtarget.hasBWI() && "Expected BWI target");
SDLoc dl(Op);
// EXTRACT_ELEMENT index 0 / 1 selects the two i32 halves of the i64.
SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
DAG.getIntPtrConstant(0, dl));
Lo = DAG.getBitcast(MVT::v32i1, Lo);
SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
DAG.getIntPtrConstant(1, dl));
Hi = DAG.getBitcast(MVT::v32i1, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
}
// Custom splitting for BWI types when AVX512F is available but BWI isn't.
if ((SrcVT == MVT::v32i16 || SrcVT == MVT::v64i8) && DstVT.isVector() &&
DAG.getTargetLoweringInfo().isTypeLegal(DstVT)) {
SDLoc dl(Op);
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
// Each half is cast to a vector with the destination element type but half
// the destination element count.
EVT CastVT = MVT::getVectorVT(DstVT.getVectorElementType(),
DstVT.getVectorNumElements() / 2);
Lo = DAG.getBitcast(CastVT, Lo);
Hi = DAG.getBitcast(CastVT, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
}
// Use MOVMSK for vector to scalar conversion to prevent scalarization.
if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
// One byte per mask bit: v16i1 -> v16i8, v32i1 -> v32i8.
MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
SDLoc DL(Op);
SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
V = getPMOVMSKB(DL, V, DAG, Subtarget);
// getPMOVMSKB yields an i32; resize it to the requested scalar type.
return DAG.getZExtOrTrunc(V, DL, DstVT);
}
if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
SrcVT == MVT::i64) {
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
// Only f64, i64 and (for vector sources) x86mmx destinations are handled
// here.
if (DstVT != MVT::f64 && DstVT != MVT::i64 &&
!(DstVT == MVT::x86mmx && SrcVT.isVector()))
// This conversion needs to be expanded.
return SDValue();
SDLoc dl(Op);
if (SrcVT.isVector()) {
// Widen the vector in input in the case of MVT::v2i32.
// Example: from MVT::v2i32 to MVT::v4i32.
MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
SrcVT.getVectorNumElements() * 2);
// The upper half of the widened vector is left undefined.
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
DAG.getUNDEF(SrcVT));
} else {
assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
"Unexpected source type in LowerBITCAST");
Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
}
// Reinterpret the 128-bit value as two 64-bit lanes of the kind we need.
MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
if (DstVT == MVT::x86mmx)
return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
// Otherwise the result is lane 0 of the 128-bit vector.
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
DAG.getIntPtrConstant(0, dl));
}
// Everything that reaches this point is expected to be a 64-bit-wide
// conversion on a 64-bit target that has MMX but not SSE2.
assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
Subtarget.hasMMX() && "Unexpected custom BITCAST");
assert((DstVT == MVT::i64 ||
(DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
"Unexpected custom BITCAST");
// i64 <=> MMX conversions are Legal.
if (SrcVT==MVT::i64 && DstVT.isVector())
return Op;
if (DstVT==MVT::i64 && SrcVT.isVector())
return Op;
// MMX <=> MMX conversions are Legal.
if (SrcVT.isVector() && DstVT.isVector())
return Op;
// All other conversions need to be expanded.
return SDValue();
}
/// Horizontally add groups of adjacent bytes of V, producing a vector of
/// type VT.
///
/// V must be a byte vector with the same total width as VT, and VT must be an
/// integer vector type whose elements are wider than a byte. Each element of
/// the result is the sum of the bytes of V that occupy that element's
/// position, so the element width of VT selects how many bytes are combined.
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
                                      const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  SDLoc DL(V);
  MVT BytesVT = V.getSimpleValueType();
  MVT WideEltVT = VT.getVectorElementType();
  assert(BytesVT.getVectorElementType() == MVT::i8 &&
         "Expected value to have byte element type.");
  assert(WideEltVT != MVT::i8 &&
         "Horizontal byte sum only makes sense for wider elements!");
  unsigned TotalBits = VT.getSizeInBits();
  assert(BytesVT.getSizeInBits() == TotalBits && "Cannot change vector size!");

  switch (WideEltVT.SimpleTy) {
  case MVT::i64: {
    // PSADBW against zero adds up the bytes of every 64-bit chunk, which is
    // already the answer for i64 elements.
    SDValue ZeroBytes = DAG.getConstant(0, DL, BytesVT);
    MVT SadVT = MVT::getVectorVT(MVT::i64, TotalBits / 64);
    SDValue Sums = DAG.getNode(X86ISD::PSADBW, DL, SadVT, V, ZeroBytes);
    return DAG.getBitcast(VT, Sums);
  }
  case MVT::i32: {
    // Interleave the i32 elements with zeros (low half and high half
    // separately) so that every 64-bit chunk holds exactly one i32 worth of
    // byte counts. Two PSADBWs then produce the per-i32 sums, and PACKUS
    // narrows and concatenates them back into the original element layout.
    SDValue ZeroWide = DAG.getConstant(0, DL, VT);
    SDValue AsWide = DAG.getBitcast(VT, V);
    SDValue LoPart = getUnpackl(DAG, DL, VT, AsWide, ZeroWide);
    SDValue HiPart = getUnpackh(DAG, DL, VT, AsWide, ZeroWide);

    // Sum the bytes of each 64-bit chunk.
    SDValue ZeroBytes = DAG.getConstant(0, DL, BytesVT);
    MVT SadVT = MVT::getVectorVT(MVT::i64, TotalBits / 64);
    LoPart = DAG.getNode(X86ISD::PSADBW, DL, SadVT,
                         DAG.getBitcast(BytesVT, LoPart), ZeroBytes);
    HiPart = DAG.getNode(X86ISD::PSADBW, DL, SadVT,
                         DAG.getBitcast(BytesVT, HiPart), ZeroBytes);

    // Pack the two partial results together.
    MVT WordsVT = MVT::getVectorVT(MVT::i16, TotalBits / 16);
    SDValue Packed = DAG.getNode(X86ISD::PACKUS, DL, BytesVT,
                                 DAG.getBitcast(WordsVT, LoPart),
                                 DAG.getBitcast(WordsVT, HiPart));
    return DAG.getBitcast(VT, Packed);
  }
  default:
    break;
  }

  // The only element type left is i16.
  assert(WideEltVT == MVT::i16 && "Unknown how to handle type");

  // For i16 elements: shift each i16 left by 8 so its low byte lands in the
  // high byte, add that to the original as bytes (the high byte now holds
  // low + high), then shift right by 8 as i16 to bring the sum down. The
  // shifts are done on i16 lanes because i8 vector shifts are not directly
  // supported.
  SDValue Eight = DAG.getConstant(8, DL, VT);
  SDValue ShiftedUp =
      DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), Eight);
  SDValue ByteSums =
      DAG.getNode(ISD::ADD, DL, BytesVT, DAG.getBitcast(BytesVT, ShiftedUp),
                  DAG.getBitcast(BytesVT, V));
  return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, ByteSums), Eight);
}
/// Compute the per-byte population count of a vXi8 vector using PSHUFB as an
/// in-register 16-entry lookup table.
///
/// The approach follows http://wm.ite.pl/articles/sse-popcount.html: every
/// byte is separated into its low nibble (masked with 0x0F) and its high
/// nibble (shifted right by 4). Each nibble indexes a table holding the
/// popcount of 0..15, and the two lookups are added to give the popcount of
/// the whole byte.
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  MVT ByteEltVT = VT.getVectorElementType();
  int NumBytes = VT.getVectorNumElements();
  (void)ByteEltVT;
  assert(ByteEltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");

  // Popcount of every 4-bit value, indexed by that value.
  const int NibblePopCnt[16] = {0, 1, 1, 2,  // 0x0 - 0x3
                                1, 2, 2, 3,  // 0x4 - 0x7
                                1, 2, 2, 3,  // 0x8 - 0xb
                                2, 3, 3, 4}; // 0xc - 0xf

  // Replicate the 16-entry table across the full vector width.
  SmallVector<SDValue, 64> TableElts;
  for (int Idx = 0; Idx < NumBytes; ++Idx) {
    SDValue Entry = DAG.getConstant(NibblePopCnt[Idx % 16], DL, MVT::i8);
    TableElts.push_back(Entry);
  }
  SDValue Table = DAG.getBuildVector(VT, DL, TableElts);
  SDValue LowNibbleMask = DAG.getConstant(0x0F, DL, VT);

  // High nibbles
  SDValue ShiftBy4 = DAG.getConstant(4, DL, VT);
  SDValue HighIdx = DAG.getNode(ISD::SRL, DL, VT, Op, ShiftBy4);

  // Low nibbles
  SDValue LowIdx = DAG.getNode(ISD::AND, DL, VT, Op, LowNibbleMask);

  // The nibble vectors act as the PSHUFB control that selects table entries;
  // summing both lookups gives the popcount of each input byte.
  SDValue HighCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, Table, HighIdx);
  SDValue LowCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, Table, LowIdx);
  return DAG.getNode(ISD::ADD, DL, VT, HighCnt, LowCnt);
}
// Custom lowering of a vector ISD::CTPOP for 128/256/512-bit vectors.
//
// NOTE: any change to the code generated here must be mirrored in the cost
// model in X86TTIImpl::getIntrinsicInstrCost.
static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
         "Unknown CTPOP type to handle");
  SDLoc DL(Op.getNode());
  SDValue Input = Op.getOperand(0);

  // With VPOPCNTDQ, widen narrow elements to i32, count there and truncate
  // back: TRUNC(CTPOP(ZEXT(X))).
  if (Subtarget.hasVPOPCNTDQ()) {
    unsigned EltCount = VT.getVectorNumElements();
    assert((VT.getVectorElementType() == MVT::i8 ||
            VT.getVectorElementType() == MVT::i16) && "Unexpected type");
    const bool FitsAfterWidening =
        EltCount < 16 || (EltCount == 16 && Subtarget.canExtendTo512DQ());
    if (FitsAfterWidening) {
      MVT WideVT = MVT::getVectorVT(MVT::i32, EltCount);
      SDValue Widened = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Input);
      SDValue WideCount = DAG.getNode(ISD::CTPOP, DL, WideVT, Widened);
      return DAG.getNode(ISD::TRUNCATE, DL, VT, WideCount);
    }
  }

  // 256-bit ops without Int256 are decomposed into 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntUnary(Op, DAG);

  // 512-bit ops without BWI are decomposed into 256-bit ops.
  if (VT.is512BitVector() && !Subtarget.hasBWI())
    return Lower512IntUnary(Op, DAG);

  // Elements wider than a byte: count per byte, then sum the bytes of each
  // element horizontally.
  if (VT.getScalarType() != MVT::i8) {
    MVT BytesVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
    SDValue AsBytes = DAG.getBitcast(BytesVT, Input);
    SDValue ByteCounts = DAG.getNode(ISD::CTPOP, DL, BytesVT, AsBytes);
    return LowerHorizontalByteSum(ByteCounts, VT, Subtarget, DAG);
  }

  // The PSHUFB lookup-table approach needs SSSE3; without it, let
  // LegalizeDAG handle the node.
  if (!Subtarget.hasSSSE3())
    return SDValue();

  return LowerVectorCTPOPInRegLUT(Input, DL, Subtarget, DAG);
}
/// Entry point for custom ISD::CTPOP lowering. Only vector population counts
/// are custom-lowered; the work is delegated to LowerVectorCTPOP.
static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {
  const bool IsVectorOp = Op.getSimpleValueType().isVector();
  (void)IsVectorOp;
  assert(IsVectorOp &&
         "We only do custom lowering for vector population count.");
  return LowerVectorCTPOP(Op, Subtarget, DAG);
}
/// Lower ISD::BITREVERSE using the XOP VPPERM instruction.
///
/// Scalars are moved into a 128-bit vector, reversed there and extracted
/// again. 256-bit vectors are split into 128-bit halves. For 128-bit vectors
/// a single VPPERM both reverses the bits within every byte and reverses the
/// byte order within every element.
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDValue Input = Op.getOperand(0);
  SDLoc DL(Op);

  // For scalars, it is still beneficial to transfer to/from the SIMD unit to
  // perform the BITREVERSE.
  if (!VT.isVector()) {
    MVT WrapVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
    SDValue Wrapped = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, WrapVT, Input);
    SDValue Reversed = DAG.getNode(ISD::BITREVERSE, DL, WrapVT, Wrapped);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Reversed,
                       DAG.getIntPtrConstant(0, DL));
  }

  int EltCount = VT.getVectorNumElements();
  int BytesPerElt = VT.getScalarSizeInBits() / 8;

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector())
    return Lower256IntUnary(Op, DAG);

  assert(VT.is128BitVector() &&
         "Only 128-bit vector bitreverse lowering supported.");

  // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and the
  // byte swap within each element is folded into the byte selection. The
  // data is supplied as the second source (byte indices 16..31), which
  // implicitly allows memory folding for multiple vectors.
  const int BitReverseOp = 2 << 5;
  SmallVector<SDValue, 16> Selectors;
  for (int Elt = 0; Elt != EltCount; ++Elt) {
    const int EltBase = 16 + Elt * BytesPerElt;
    // Walk the element's bytes from most to least significant position.
    for (int Step = 0; Step < BytesPerElt; ++Step) {
      const int FromByte = EltBase + (BytesPerElt - 1 - Step);
      Selectors.push_back(
          DAG.getConstant(FromByte | BitReverseOp, DL, MVT::i8));
    }
  }

  SDValue Control = DAG.getBuildVector(MVT::v16i8, DL, Selectors);
  SDValue Bytes = DAG.getBitcast(MVT::v16i8, Input);
  SDValue Permuted = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8,
                                 DAG.getUNDEF(MVT::v16i8), Bytes, Control);
  return DAG.getBitcast(VT, Permuted);
}
/// Lower ISD::BITREVERSE.
///
/// XOP subtargets (for anything but 512-bit vectors) use the VPPERM based
/// lowering. Otherwise only byte vectors are handled, using two PSHUFB table
/// lookups: one maps each low nibble to its bit-reversed value placed in the
/// high nibble, the other maps each high nibble to its bit-reversed value
/// placed in the low nibble. OR-ing both gives the reversed byte.
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  if (Subtarget.hasXOP() && !VT.is512BitVector())
    return LowerBITREVERSE_XOP(Op, DAG);

  assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");

  SDValue Input = Op.getOperand(0);
  SDLoc DL(Op);

  unsigned ByteCount = VT.getVectorNumElements();
  assert(VT.getScalarType() == MVT::i8 &&
         "Only byte vector BITREVERSE supported");

  // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntUnary(Op, DAG);

  // Split every byte into its two nibbles; these become the PSHUFB indices.
  SDValue LowNibbleMask = DAG.getConstant(0xF, DL, VT);
  SDValue LowIdx = DAG.getNode(ISD::AND, DL, VT, Input, LowNibbleMask);
  SDValue HighIdx =
      DAG.getNode(ISD::SRL, DL, VT, Input, DAG.getConstant(4, DL, VT));

  // Bit-reversal of every 4-bit value, indexed by that value. The table used
  // for low nibbles holds these values moved to the high nibble (<< 4); the
  // table used for high nibbles holds them as-is.
  const int ReversedNibble[16] = {0x0, 0x8, 0x4, 0xC,  // 0x0 - 0x3
                                  0x2, 0xA, 0x6, 0xE,  // 0x4 - 0x7
                                  0x1, 0x9, 0x5, 0xD,  // 0x8 - 0xb
                                  0x3, 0xB, 0x7, 0xF}; // 0xc - 0xf

  SmallVector<SDValue, 16> LowTableElts, HighTableElts;
  for (unsigned Idx = 0; Idx < ByteCount; ++Idx) {
    const int Rev = ReversedNibble[Idx % 16];
    LowTableElts.push_back(DAG.getConstant(Rev << 4, DL, MVT::i8));
    HighTableElts.push_back(DAG.getConstant(Rev, DL, MVT::i8));
  }

  SDValue LowTable = DAG.getBuildVector(VT, DL, LowTableElts);
  SDValue HighTable = DAG.getBuildVector(VT, DL, HighTableElts);
  SDValue FromLow = DAG.getNode(X86ISD::PSHUFB, DL, VT, LowTable, LowIdx);
  SDValue FromHigh = DAG.getNode(X86ISD::PSHUFB, DL, VT, HighTable, HighIdx);
  return DAG.getNode(ISD::OR, DL, VT, FromLow, FromHigh);
}
/// Replace an ATOMIC_LOAD_{ADD,SUB,OR,XOR,AND} node by the corresponding
/// X86ISD::L{ADD,SUB,OR,XOR,AND} memory-intrinsic node.
///
/// The new node takes the same three operands as \p N, reuses its memory
/// operand, and produces {i32, chain}.
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  // Map the generic atomic opcode onto its LOCK-prefixed X86 counterpart.
  const unsigned LockedOpc = [&]() -> unsigned {
    switch (N->getOpcode()) {
    case ISD::ATOMIC_LOAD_ADD:
      return X86ISD::LADD;
    case ISD::ATOMIC_LOAD_SUB:
      return X86ISD::LSUB;
    case ISD::ATOMIC_LOAD_OR:
      return X86ISD::LOR;
    case ISD::ATOMIC_LOAD_XOR:
      return X86ISD::LXOR;
    case ISD::ATOMIC_LOAD_AND:
      return X86ISD::LAND;
    default:
      llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
    }
  }();

  MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();

  return DAG.getMemIntrinsicNode(
      LockedOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
      {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
      /*MemVT=*/N->getSimpleValueType(0), MemOp);
}
/// Lower atomic_load_* operations into LOCK-prefixed operations.
///
/// When the loaded value is used, only add (and sub, rewritten as add of the
/// negated operand) can be handled; anything else is expected to have been
/// expanded earlier. When the value is unused, the node is replaced by a
/// LOCK-prefixed arithmetic node, or - for the idempotent `or 0` form - by
/// whatever is needed to preserve just the ordering semantics.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
  AtomicSDNode *Atomic = cast<AtomicSDNode>(N.getNode());
  SDValue InChain = N->getOperand(0);
  SDValue Ptr = N->getOperand(1);
  SDValue Val = N->getOperand(2);
  unsigned Opcode = N->getOpcode();
  MVT VT = N->getSimpleValueType(0);
  SDLoc DL(N);

  // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
  // can only be lowered when the result is unused. They should have already
  // been transformed into a cmpxchg loop in AtomicExpand.
  if (N->hasAnyUseOfValue(0)) {
    if (Opcode != ISD::ATOMIC_LOAD_SUB) {
      assert(Opcode == ISD::ATOMIC_LOAD_ADD &&
             "Used AtomicRMW ops other than Add should have been expanded!");
      return N;
    }
    // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
    // select LXADD if LOCK_SUB can't be selected.
    SDValue Negated =
        DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Val);
    return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, InChain, Ptr, Negated,
                         Atomic->getMemOperand());
  }

  // From here on result 0 is unused: build the replacement from an undef
  // placeholder for the value plus the given output chain.
  auto ReplaceWithChain = [&](SDValue OutChain) {
    assert(!N->hasAnyUseOfValue(0));
    // NOTE: The getUNDEF is needed to give something for the unused result 0.
    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
                       DAG.getUNDEF(VT), OutChain);
  };

  // Specialized lowering for the canonical form of an idempotent atomicrmw.
  // Since the memory location is not actually changing, all that is needed
  // is a lowering for the *ordering* effects of the atomicrmw, so a different
  // operation and memory location can be chosen to minimize the impact on
  // other code.
  if (Opcode == ISD::ATOMIC_LOAD_OR && isNullConstant(Val)) {
    // On X86, the only ordering which actually requires an instruction is
    // seq_cst which isn't SingleThread; everything else just needs to be
    // preserved during codegen and then dropped. Note that we expect (but
    // don't assume) that orderings other than seq_cst and acq_rel have been
    // canonicalized to a store or load.
    const bool NeedsFence =
        Atomic->getOrdering() == AtomicOrdering::SequentiallyConsistent &&
        Atomic->getSyncScopeID() == SyncScope::System;

    if (NeedsFence) {
      // Prefer a locked operation against a stack location to minimize cache
      // traffic. This assumes that stack locations are very likely to be
      // accessed only by the owning thread.
      SDValue FenceChain = emitLockedStackOp(DAG, Subtarget, InChain, DL);
      return ReplaceWithChain(FenceChain);
    }

    // MEMBARRIER is a compiler barrier; it codegens to a no-op.
    SDValue BarrierChain =
        DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, InChain);
    return ReplaceWithChain(BarrierChain);
  }

  // General case: emit the LOCK-prefixed operation and forward only its
  // chain, since the value result is unused.
  SDValue Locked = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
  return ReplaceWithChain(Locked.getValue(1));
}
/// Lower an ISD::ATOMIC_STORE node.
///
///  * A store that is not seq_cst and has a legal type is kept as is.
///  * An i64 store whose type is not legal is emitted as an
///    X86ISD::VEXTRACT_STORE of a v2i64 when SSE2 may be used, followed by a
///    locked stack operation if the store is seq_cst.
///  * Everything else is turned into an ATOMIC_SWAP whose chain result is
///    returned.
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
  auto *Store = cast<AtomicSDNode>(Op.getNode());
  SDLoc Loc(Store);
  EVT MemVT = Store->getMemoryVT();
  const bool SeqCst =
      Store->getOrdering() == AtomicOrdering::SequentiallyConsistent;
  const bool LegalType = DAG.getTargetLoweringInfo().isTypeLegal(MemVT);

  // Nothing to do for a weaker-than-seq_cst store of a legal type.
  if (LegalType && !SeqCst)
    return Op;

  if (MemVT == MVT::i64 && !LegalType) {
    // For illegal i64 atomic_stores, we can try to use MOVQ if SSE2 is
    // enabled.
    // FIXME: Use movlps with SSE1.
    // FIXME: Use fist with X87.
    const bool NoImplicitFloat =
        DAG.getMachineFunction().getFunction().hasFnAttribute(
            Attribute::NoImplicitFloat);
    const bool MayUseSSE2 =
        !Subtarget.useSoftFloat() && !NoImplicitFloat && Subtarget.hasSSE2();
    if (MayUseSSE2) {
      SDValue AsVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, Loc, MVT::v2i64,
                                     Store->getOperand(2));
      SDVTList ChainOnly = DAG.getVTList(MVT::Other);
      SDValue StoreOps[] = {Store->getChain(), AsVector, Store->getBasePtr()};
      SDValue OutChain =
          DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, Loc, ChainOnly,
                                  StoreOps, MVT::i64, Store->getMemOperand());

      // A sequentially consistent store additionally needs a barrier.
      if (SeqCst)
        OutChain = emitLockedStackOp(DAG, Subtarget, OutChain, Loc);

      return OutChain;
    }
  }

  // Convert seq_cst store -> xchg
  // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
  // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
  SDValue Xchg = DAG.getAtomic(ISD::ATOMIC_SWAP, Loc, Store->getMemoryVT(),
                               Store->getOperand(0), Store->getOperand(1),
                               Store->getOperand(2), Store->getMemOperand());
  return Xchg.getValue(1);
}
/// Lower ISD::ADDCARRY / ISD::SUBCARRY to X86ISD::ADC / X86ISD::SBB.
///
/// The incoming carry value (operand 2) is turned into the EFLAGS carry bit
/// by adding all-ones to it. The ADC/SBB consumes that flag result, and the
/// outgoing carry is read back with a COND_B setcc (truncated to i1 when the
/// node's second result is i1). Returns an empty SDValue if the value type is
/// not legal yet, so that legalization expands the node instead.
static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
  SDNode *Node = Op.getNode();
  MVT VT = Node->getSimpleValueType(0);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  SDVTList ResultVTs = DAG.getVTList(VT, MVT::i32);
  SDLoc DL(Node);

  // Set the carry flag: CarryIn + (-1) carries out exactly when CarryIn is
  // non-zero.
  SDValue CarryIn = Op.getOperand(2);
  EVT CarryVT = CarryIn.getValueType();
  APInt AllOnes = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
  SDValue FlagSetter =
      DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), CarryIn,
                  DAG.getConstant(AllOnes, DL, CarryVT));

  const bool IsAdd = Op.getOpcode() == ISD::ADDCARRY;
  unsigned ArithOpc = IsAdd ? X86ISD::ADC : X86ISD::SBB;
  SDValue Result = DAG.getNode(ArithOpc, DL, ResultVTs, Op.getOperand(0),
                               Op.getOperand(1), FlagSetter.getValue(1));

  // Materialize the outgoing carry from the flags result.
  SDValue CarryOut = getSETCC(X86::COND_B, Result.getValue(1), DL, DAG);
  if (Node->getValueType(1) == MVT::i1)
    CarryOut = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CarryOut);

  return DAG.getNode(ISD::MERGE_VALUES, DL, Node->getVTList(), Result,
                     CarryOut);
}
/// Lower a combined sin/cos node to a call to the `__sincos_stret` family of
/// library routines (64-bit Darwin only).
///
/// For f64 the call is typed as returning a {double, double} struct and the
/// call result is returned directly. For f32 the call is typed as returning
/// a 4-element vector; elements 0 and 1 are extracted as sin and cos and
/// returned merged.
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
                            SelectionDAG &DAG) {
  assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());

  // For MacOSX, we want to call an alternative entry point: __sincos_stret,
  // which returns the values as { float, float } (in XMM0) or
  // { double, double } (which is returned in XMM0, XMM1).
  SDLoc Loc(Op);
  SDValue Operand = Op.getOperand(0);
  EVT OperandVT = Operand.getValueType();
  Type *OperandTy = OperandVT.getTypeForEVT(*DAG.getContext());

  // The single call argument, passed without sign or zero extension.
  TargetLowering::ArgListEntry ArgEntry;
  ArgEntry.Node = Operand;
  ArgEntry.Ty = OperandTy;
  ArgEntry.IsSExt = false;
  ArgEntry.IsZExt = false;
  TargetLowering::ArgListTy CallArgs;
  CallArgs.push_back(ArgEntry);

  const bool IsDouble = OperandVT == MVT::f64;

  // Only optimize x86_64 for now. i386 is a bit messy. For f32,
  // the small struct {f32, f32} is returned in (eax, edx). For f64,
  // the results are returned via SRet in memory.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  RTLIB::Libcall Call =
      IsDouble ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
  const char *Symbol = TLI.getLibcallName(Call);
  SDValue Target =
      DAG.getExternalSymbol(Symbol, TLI.getPointerTy(DAG.getDataLayout()));

  Type *ResultTy;
  if (IsDouble)
    ResultTy = StructType::get(OperandTy, OperandTy);
  else
    ResultTy = VectorType::get(OperandTy, 4);

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(Loc)
      .setChain(DAG.getEntryNode())
      .setLibCallee(CallingConv::C, ResultTy, Target, std::move(CallArgs));

  std::pair<SDValue, SDValue> Lowered = TLI.LowerCallTo(CLI);

  // f64: the pair comes back in xmm0 and xmm1; hand the call result on.
  if (IsDouble)
    return Lowered.first;

  // f32: both values come back packed in the low two lanes of xmm0.
  SDValue Sin = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Loc, OperandVT,
                            Lowered.first, DAG.getIntPtrConstant(0, Loc));
  SDValue Cos = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Loc, OperandVT,
                            Lowered.first, DAG.getIntPtrConstant(1, Loc));
  SDVTList PairVTs = DAG.getVTList(OperandVT, OperandVT);
  return DAG.getNode(ISD::MERGE_VALUES, Loc, PairVTs, Sin, Cos);
}
/// Widen the vector \p InOp to the vector type \p NVT, which must have the
/// same element type and an element count that is a larger multiple of the
/// input's. The added elements are zero when \p FillWithZeroes is set and
/// undef otherwise. An input that already has type \p NVT is returned
/// unchanged, and an undef input yields an undef of type \p NVT.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
                            bool FillWithZeroes = false) {
  MVT SrcVT = InOp.getSimpleValueType();

  // Nothing to do if the input is already as wide as requested.
  if (SrcVT == NVT)
    return InOp;

  if (InOp.isUndef())
    return DAG.getUNDEF(NVT);

  assert(SrcVT.getVectorElementType() == NVT.getVectorElementType() &&
         "input and widen element type must match");

  unsigned SrcElts = SrcVT.getVectorNumElements();
  unsigned DstElts = NVT.getVectorNumElements();
  assert(DstElts > SrcElts && DstElts % SrcElts == 0 &&
         "Unexpected request for vector widening");

  SDLoc DL(InOp);

  // A two-operand concat whose upper half is already the fill value (undef,
  // or all-zeros when zero filling) can be widened from its lower half alone.
  if (InOp.getOpcode() == ISD::CONCAT_VECTORS && InOp.getNumOperands() == 2) {
    SDValue Upper = InOp.getOperand(1);
    const bool UpperIsFill =
        (ISD::isBuildVectorAllZeros(Upper.getNode()) && FillWithZeroes) ||
        Upper.isUndef();
    if (UpperIsFill) {
      InOp = InOp.getOperand(0);
      SrcVT = InOp.getSimpleValueType();
      SrcElts = SrcVT.getVectorNumElements();
    }
  }

  // Constant build vectors are rebuilt directly at the wider type.
  const bool IsConstantBV =
      ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
      ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode());
  if (IsConstantBV) {
    SmallVector<SDValue, 16> Elts;
    for (unsigned Idx = 0; Idx < SrcElts; ++Idx)
      Elts.push_back(InOp.getOperand(Idx));

    EVT ScalarVT = InOp.getOperand(0).getValueType();
    SDValue Pad;
    if (FillWithZeroes)
      Pad = DAG.getConstant(0, DL, ScalarVT);
    else
      Pad = DAG.getUNDEF(ScalarVT);

    for (unsigned Remaining = DstElts - SrcElts; Remaining != 0; --Remaining)
      Elts.push_back(Pad);
    return DAG.getBuildVector(NVT, DL, Elts);
  }

  // General case: insert the input at index 0 of a wide fill vector.
  SDValue Background;
  if (FillWithZeroes)
    Background = DAG.getConstant(0, DL, NVT);
  else
    Background = DAG.getUNDEF(NVT);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NVT, Background, InOp,
                     DAG.getIntPtrConstant(0, DL));
}
/// Custom-lower a masked scatter node (AVX-512 only, elements of at least
/// 32 bits).
///
/// Narrow v2f32 / v2i32 data is widened to four elements first. With a v2i64
/// index and VLX, an X86MaskedScatterSDNode is emitted directly; v2i32 data
/// is otherwise re-emitted as a generic masked scatter with all operands
/// widened to four elements. For the remaining types, when VLX is not
/// available and neither the data nor the index is 512 bits wide, data, index
/// and mask are widened by a common factor until one of them is.
///
/// Returns result 1 of the new X86MaskedScatterSDNode (its VT list is
/// {mask type, MVT::Other}), the widened generic scatter, or an empty SDValue
/// when default handling should take over.
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.hasAVX512() &&
"MGATHER/MSCATTER are supported on AVX-512 arch only");
MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
SDValue Src = N->getValue();
MVT VT = Src.getSimpleValueType();
assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
SDLoc dl(Op);
SDValue Scale = N->getScale();
SDValue Index = N->getIndex();
SDValue Mask = N->getMask();
SDValue Chain = N->getChain();
SDValue BasePtr = N->getBasePtr();
if (VT == MVT::v2f32) {
assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
// If the index is v2i64 and we have VLX we can use xmm for data and index.
if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
// Widen the data to v4f32; the upper two elements are undefined.
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
DAG.getUNDEF(MVT::v2f32));
SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
// Result 1 is the MVT::Other entry of the VT list above.
return SDValue(NewScatter.getNode(), 1);
}
// Any other v2f32 scatter is not custom-lowered here.
return SDValue();
}
if (VT == MVT::v2i32) {
assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
// Widen the data to v4i32; the upper two elements are undefined.
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
DAG.getUNDEF(MVT::v2i32));
// If the index is v2i64 and we have VLX we can use xmm for data and index.
if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
return SDValue(NewScatter.getNode(), 1);
}
// Custom widen all the operands to avoid promotion.
EVT NewIndexVT = EVT::getVectorVT(
*DAG.getContext(), Index.getValueType().getVectorElementType(), 4);
Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
DAG.getUNDEF(Index.getValueType()));
// The added mask elements are zero, unlike the undef padding used for the
// data and the index.
Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
DAG.getConstant(0, dl, MVT::v2i1));
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), N->getMemoryVT(), dl,
Ops, N->getMemOperand());
}
MVT IndexVT = Index.getSimpleValueType();
MVT MaskVT = Mask.getSimpleValueType();
// If the index is v2i32, we're being called by type legalization and we
// should just let the default handling take care of it.
if (IndexVT == MVT::v2i32)
return SDValue();
// If we don't have VLX and neither the passthru or index is 512-bits, we
// need to widen until one is.
if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
!Index.getSimpleValueType().is512BitVector()) {
// Determine how much we need to widen by to get a 512-bit type.
// Taking the smaller of the two factors means the wider of data/index
// reaches 512 bits.
unsigned Factor = std::min(512/VT.getSizeInBits(),
512/IndexVT.getSizeInBits());
unsigned NumElts = VT.getVectorNumElements() * Factor;
VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
// Data and index are padded with undef; only the mask is zero-filled
// (FillWithZeroes = true).
Src = ExtendToType(Src, VT, DAG);
Index = ExtendToType(Index, IndexVT, DAG);
Mask = ExtendToType(Mask, MaskVT, DAG, true);
}
SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
// Result 1 is the MVT::Other entry of the VT list above.
return SDValue(NewScatter.getNode(), 1);
}
/// Custom-lower a masked load.
///
/// Two situations are handled:
///  * The mask element type is not i1 (the AVX masked-load form). Such loads
///    only support a zero or undef pass-through, so any other pass-through is
///    implemented as a masked load with a zero pass-through followed by a
///    VSELECT that blends in the requested pass-through value.
///  * AVX-512 without VLX on a vector narrower than 512 bits. The
///    pass-through and the mask are widened to 512 bits (the mask is
///    zero-filled), a wide masked load is emitted, and the original-width
///    result is extracted from its low part.
///
/// Returns the loaded value merged with the load's chain, or \p Op itself
/// when the node can be selected as is.
static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {
  MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
  MVT VT = Op.getSimpleValueType();
  MVT ScalarVT = VT.getScalarType();
  SDValue Mask = N->getMask();
  MVT MaskVT = Mask.getSimpleValueType();
  SDValue PassThru = N->getPassThru();
  SDLoc dl(Op);

  // Handle AVX masked loads which don't support passthru other than 0.
  if (MaskVT.getVectorElementType() != MVT::i1) {
    // We also allow undef in the isel pattern.
    if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
      return Op;

    SDValue NewLoad = DAG.getMaskedLoad(VT, dl, N->getChain(),
                                        N->getBasePtr(), Mask,
                                        getZeroVector(VT, Subtarget, DAG, dl),
                                        N->getMemoryVT(), N->getMemOperand(),
                                        N->getExtensionType(),
                                        N->isExpandingLoad());
    // Emit a blend. The select yields loaded data, so its result type must be
    // the data type VT. MaskVT is only the type of the condition operand and
    // need not equal VT (presumably an integer mask guarding a floating-point
    // load), so it must not be used as the result type.
    SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad,
                                 PassThru);
    return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
  }

  assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
         "Expanding masked load is supported on AVX-512 target only!");

  assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
         "Expanding masked load is supported for 32 and 64-bit types only!");

  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
         "Cannot lower masked load op.");

  assert((ScalarVT.getSizeInBits() >= 32 ||
          (Subtarget.hasBWI() &&
              (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
         "Unsupported masked load op.");

  // This operation is legal for targets with VLX, but without
  // VLX the vector should be widened to 512 bit
  unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
  PassThru = ExtendToType(PassThru, WideDataVT, DAG);

  // Mask element has to be i1.
  assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
         "Unexpected mask type");

  MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);

  // The added mask elements are zero so the extra lanes are never loaded.
  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
  SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
                                      N->getBasePtr(), Mask, PassThru,
                                      N->getMemoryVT(), N->getMemOperand(),
                                      N->getExtensionType(),
                                      N->isExpandingLoad());

  // Take the original-width value back out of the low part of the wide load.
  SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
                                NewLoad.getValue(0),
                                DAG.getIntPtrConstant(0, dl));
  SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
  return DAG.getMergeValues(RetOps, dl);
}
/// Custom-lower a masked store by widening it to a 512-bit operation.
///
/// The asserts below restrict this path to AVX-512 subtargets without VLX and
/// to stored values that are not already 512 bits wide. The data and the i1
/// mask are both extended to 512 / scalar-size elements and a new masked store
/// is emitted that reuses the original chain, base pointer, memory VT, memory
/// operand and truncating/compressing flags.
///
/// \param Op        The ISD::MSTORE node to lower.
/// \param Subtarget The X86 subtarget whose feature set is queried.
/// \param DAG       The DAG in which the replacement nodes are created.
/// \returns The widened masked store node.
static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());

  SDValue DataToStore = N->getValue();
  MVT VT = DataToStore.getSimpleValueType();
  MVT ScalarVT = VT.getScalarType();
  SDValue Mask = N->getMask();
  SDLoc dl(Op);

  // These two checks guard *compressing stores*; the diagnostics previously
  // talked about expanding loads (copied from the masked-load lowering).
  assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
         "Compressing masked store is supported on AVX-512 target only!");

  assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
         "Compressing masked store is supported for 32 and 64-bit types only!");

  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
         "Cannot lower masked store op.");

  assert((ScalarVT.getSizeInBits() >= 32 ||
          (Subtarget.hasBWI() &&
           (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
         "Unsupported masked store op.");

  // This operation is legal for targets with VLX, but without
  // VLX the vector should be widened to 512 bits.
  unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);

  // Mask element has to be i1.
  assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
         "Unexpected mask type");

  MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);

  DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
  // NOTE(review): the trailing `true` presumably asks ExtendToType to fill the
  // added mask lanes with zeroes so they stay inactive -- confirm against the
  // helper's definition.
  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
  return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
                            Mask, N->getMemoryVT(), N->getMemOperand(),
                            N->isTruncatingStore(), N->isCompressingStore());
}
/// Custom-lower a masked gather to an X86MaskedGatherSDNode.
///
/// Returns an empty SDValue when the index type is v2i32. On AVX-512 targets
/// without VLX, when neither the result nor the index is a 512-bit vector, the
/// pass-through, index and mask are widened first and the original-width
/// result is extracted from the wide gather afterwards.
///
/// \param Op        The ISD::MGATHER node to lower.
/// \param Subtarget The X86 subtarget whose feature set is queried.
/// \param DAG       The DAG in which the replacement nodes are created.
/// \returns Merged values {gathered data, result #2 of the new gather node},
///          or an empty SDValue for a v2i32 index.
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
                            SelectionDAG &DAG) {
  assert(Subtarget.hasAVX2() &&
         "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");

  MaskedGatherSDNode *Gather = cast<MaskedGatherSDNode>(Op.getNode());
  SDLoc DL(Op);

  MVT DataVT = Op.getSimpleValueType();
  SDValue Idx = Gather->getIndex();
  SDValue GatherMask = Gather->getMask();
  SDValue Src0 = Gather->getPassThru();
  MVT IdxVT = Idx.getSimpleValueType();
  MVT GatherMaskVT = GatherMask.getSimpleValueType();

  assert(DataVT.getScalarSizeInBits() >= 32 && "Unsupported gather op");

  // A v2i32 index means we are being called by type legalization.
  if (IdxVT == MVT::v2i32)
    return SDValue();

  // Remember the requested result type; DataVT may be widened below.
  MVT ResultVT = DataVT;

  // Without VLX, if neither the pass-through nor the index is 512 bits wide,
  // we need to widen until one of them is.
  bool MustWiden = Subtarget.hasAVX512() && !Subtarget.hasVLX() &&
                   !DataVT.is512BitVector() && !IdxVT.is512BitVector();
  if (MustWiden) {
    // The smaller of the two ratios is the factor that brings the first of
    // the two types up to 512 bits.
    unsigned Factor = std::min(512 / DataVT.getSizeInBits(),
                               512 / IdxVT.getSizeInBits());
    unsigned WideNumElts = DataVT.getVectorNumElements() * Factor;

    DataVT = MVT::getVectorVT(DataVT.getVectorElementType(), WideNumElts);
    IdxVT = MVT::getVectorVT(IdxVT.getVectorElementType(), WideNumElts);
    GatherMaskVT = MVT::getVectorVT(MVT::i1, WideNumElts);

    Src0 = ExtendToType(Src0, DataVT, DAG);
    Idx = ExtendToType(Idx, IdxVT, DAG);
    GatherMask = ExtendToType(GatherMask, GatherMaskVT, DAG, true);
  }

  SDValue GatherOps[] = {Gather->getChain(), Src0,
                         GatherMask,         Gather->getBasePtr(),
                         Idx,                Gather->getScale()};
  SDVTList GatherVTs = DAG.getVTList(DataVT, GatherMaskVT, MVT::Other);
  SDValue WideGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
      GatherVTs, GatherOps, DL, Gather->getMemoryVT(),
      Gather->getMemOperand());

  // Take the low ResultVT-sized part of the (possibly widened) data result.
  SDValue LowPart = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResultVT,
                                WideGather, DAG.getIntPtrConstant(0, DL));
  return DAG.getMergeValues({LowPart, WideGather.getValue(2)}, DL);
}
/// Lower a GC_TRANSITION_START node to an X86::NOOP machine node.
///
/// \param Op  The GC_TRANSITION_START node.
/// \param DAG The DAG in which the NOOP machine node is created.
/// \returns Result 0 of the new NOOP node, which is created with result types
///          (MVT::Other, MVT::Glue).
SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
                                                    SelectionDAG &DAG) const {
  // TODO: Eventually, the lowering of these nodes should be informed by or
  // deferred to the GC strategy for the function in which they appear. For
  // now, however, they must be lowered to something. Since they are logically
  // no-ops in the case of a null GC strategy (or a GC strategy which does not
  // require special handling for these nodes), lower them as literal NOOPs for
  // the time being.
  SmallVector<SDValue, 2> Ops;

  // Always forward operand 0; if the node has a glued node, also forward its
  // last operand.
  Ops.push_back(Op.getOperand(0));
  if (Op->getGluedNode())
    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));

  // Build the location once and actually use it (it was previously declared
  // but ignored in favour of a second temporary SDLoc).
  SDLoc OpDL(Op);
  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue NOOP(DAG.getMachineNode(X86::NOOP, OpDL, VTs, Ops), 0);

  return NOOP;
}
/// Lower a GC_TRANSITION_END node to an X86::NOOP machine node.
///
/// \param Op  The GC_TRANSITION_END node.
/// \param DAG The DAG in which the NOOP machine node is created.
/// \returns Result 0 of the new NOOP node, which is created with result types
///          (MVT::Other, MVT::Glue).
SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
                                                  SelectionDAG &DAG) const {
  // TODO: Eventually, the lowering of these nodes should be informed by or
  // deferred to the GC strategy for the function in which they appear. For
  // now, however, they must be lowered to something. Since they are logically
  // no-ops in the case of a null GC strategy (or a GC strategy which does not
  // require special handling for these nodes), lower them as literal NOOPs for
  // the time being.
  SmallVector<SDValue, 2> Ops;

  // Always forward operand 0; if the node has a glued node, also forward its
  // last operand.
  Ops.push_back(Op.getOperand(0));
  if (Op->getGluedNode())
    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));

  // Build the location once and actually use it (it was previously declared
  // but ignored in favour of a second temporary SDLoc).
  SDLoc OpDL(Op);
  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue NOOP(DAG.getMachineNode(X86::NOOP, OpDL, VTs, Ops), 0);

  return NOOP;
}
/// Entry point for custom lowering: dispatch on the opcode of \p Op to the
/// matching lowering routine and return whatever that routine produces.
/// Reaching an opcode that has no case here is a hard error
/// (llvm_unreachable).
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  case ISD::ATOMIC_FENCE:
    return LowerATOMIC_FENCE(Op, Subtarget, DAG);
  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
    return LowerCMP_SWAP(Op, Subtarget, DAG);
  case ISD::CTPOP:
    return LowerCTPOP(Op, Subtarget, DAG);
  case ISD::ATOMIC_LOAD_ADD:
  case ISD::ATOMIC_LOAD_SUB:
  case ISD::ATOMIC_LOAD_OR:
  case ISD::ATOMIC_LOAD_XOR:
  case ISD::ATOMIC_LOAD_AND:
    return lowerAtomicArith(Op, DAG, Subtarget);
  case ISD::ATOMIC_STORE:
    return LowerATOMIC_STORE(Op, DAG, Subtarget);
  case ISD::BITREVERSE:
    return LowerBITREVERSE(Op, Subtarget, DAG);

  // Vector construction, shuffling and element/subvector access.
  case ISD::BUILD_VECTOR:
    return LowerBUILD_VECTOR(Op, DAG);
  case ISD::CONCAT_VECTORS:
    return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
  case ISD::VECTOR_SHUFFLE:
    return lowerVectorShuffle(Op, Subtarget, DAG);
  case ISD::VSELECT:
    return LowerVSELECT(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT:
    return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:
    return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_SUBVECTOR:
    return LowerINSERT_SUBVECTOR(Op, Subtarget, DAG);
  case ISD::EXTRACT_SUBVECTOR:
    return LowerEXTRACT_SUBVECTOR(Op, Subtarget, DAG);
  case ISD::SCALAR_TO_VECTOR:
    return LowerSCALAR_TO_VECTOR(Op, Subtarget, DAG);

  // Address-like nodes.
  case ISD::ConstantPool:
    return LowerConstantPool(Op, DAG);
  case ISD::GlobalAddress:
    return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress:
    return LowerGlobalTLSAddress(Op, DAG);
  case ISD::ExternalSymbol:
    return LowerExternalSymbol(Op, DAG);
  case ISD::BlockAddress:
    return LowerBlockAddress(Op, DAG);

  case ISD::SHL_PARTS:
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS:
    return LowerShiftParts(Op, DAG);
  case ISD::FSHL:
  case ISD::FSHR:
    return LowerFunnelShift(Op, Subtarget, DAG);

  // Conversions, extensions and truncation.
  case ISD::SINT_TO_FP:
    return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP:
    return LowerUINT_TO_FP(Op, DAG);
  case ISD::TRUNCATE:
    return LowerTRUNCATE(Op, DAG);
  case ISD::ZERO_EXTEND:
    return LowerZERO_EXTEND(Op, Subtarget, DAG);
  case ISD::SIGN_EXTEND:
    return LowerSIGN_EXTEND(Op, Subtarget, DAG);
  case ISD::ANY_EXTEND:
    return LowerANY_EXTEND(Op, Subtarget, DAG);
  case ISD::ZERO_EXTEND_VECTOR_INREG:
  case ISD::SIGN_EXTEND_VECTOR_INREG:
    return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return LowerFP_TO_INT(Op, DAG);
  case ISD::FP_EXTEND:
    return LowerFP_EXTEND(Op, DAG);

  case ISD::LOAD:
    return LowerLoad(Op, Subtarget, DAG);
  case ISD::STORE:
    return LowerStore(Op, Subtarget, DAG);

  // Floating-point arithmetic and sign manipulation.
  case ISD::FADD:
  case ISD::FSUB:
    return lowerFaddFsub(Op, DAG, Subtarget);
  case ISD::FABS:
  case ISD::FNEG:
    return LowerFABSorFNEG(Op, DAG);
  case ISD::FCOPYSIGN:
    return LowerFCOPYSIGN(Op, DAG);
  case ISD::FGETSIGN:
    return LowerFGETSIGN(Op, DAG);

  case ISD::SETCC:
    return LowerSETCC(Op, DAG);
  case ISD::SETCCCARRY:
    return LowerSETCCCARRY(Op, DAG);
  case ISD::SELECT:
    return LowerSELECT(Op, DAG);
  case ISD::BRCOND:
    return LowerBRCOND(Op, DAG);
  case ISD::JumpTable:
    return LowerJumpTable(Op, DAG);

  case ISD::VASTART:
    return LowerVASTART(Op, DAG);
  case ISD::VAARG:
    return LowerVAARG(Op, DAG);
  case ISD::VACOPY:
    return LowerVACOPY(Op, Subtarget, DAG);

  case ISD::INTRINSIC_WO_CHAIN:
    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::INTRINSIC_VOID:
  case ISD::INTRINSIC_W_CHAIN:
    return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);

  case ISD::RETURNADDR:
    return LowerRETURNADDR(Op, DAG);
  case ISD::ADDROFRETURNADDR:
    return LowerADDROFRETURNADDR(Op, DAG);
  case ISD::FRAMEADDR:
    return LowerFRAMEADDR(Op, DAG);
  case ISD::FRAME_TO_ARGS_OFFSET:
    return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC:
    return LowerDYNAMIC_STACKALLOC(Op, DAG);

  case ISD::EH_RETURN:
    return LowerEH_RETURN(Op, DAG);
  case ISD::EH_SJLJ_SETJMP:
    return lowerEH_SJLJ_SETJMP(Op, DAG);
  case ISD::EH_SJLJ_LONGJMP:
    return lowerEH_SJLJ_LONGJMP(Op, DAG);
  case ISD::EH_SJLJ_SETUP_DISPATCH:
    return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);

  case ISD::INIT_TRAMPOLINE:
    return LowerINIT_TRAMPOLINE(Op, DAG);
  case ISD::ADJUST_TRAMPOLINE:
    return LowerADJUST_TRAMPOLINE(Op, DAG);
  case ISD::FLT_ROUNDS_:
    return LowerFLT_ROUNDS_(Op, DAG);

  // Integer bit counting, multiplication, rotates and shifts.
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    return LowerCTLZ(Op, Subtarget, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF:
    return LowerCTTZ(Op, Subtarget, DAG);
  case ISD::MUL:
    return LowerMUL(Op, Subtarget, DAG);
  case ISD::MULHS:
  case ISD::MULHU:
    return LowerMULH(Op, Subtarget, DAG);
  case ISD::ROTL:
  case ISD::ROTR:
    return LowerRotate(Op, Subtarget, DAG);
  case ISD::SRA:
  case ISD::SRL:
  case ISD::SHL:
    return LowerShift(Op, Subtarget, DAG);

  case ISD::SADDO:
  case ISD::UADDO:
  case ISD::SSUBO:
  case ISD::USUBO:
  case ISD::SMULO:
  case ISD::UMULO:
    return LowerXALUO(Op, DAG);
  case ISD::READCYCLECOUNTER:
    return LowerREADCYCLECOUNTER(Op, Subtarget, DAG);
  case ISD::BITCAST:
    return LowerBITCAST(Op, Subtarget, DAG);
  case ISD::ADDCARRY:
  case ISD::SUBCARRY:
    return LowerADDSUBCARRY(Op, DAG);
  case ISD::ADD:
  case ISD::SUB:
    return lowerAddSub(Op, DAG, Subtarget);
  case ISD::UADDSAT:
  case ISD::SADDSAT:
  case ISD::USUBSAT:
  case ISD::SSUBSAT:
    return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
  case ISD::SMAX:
  case ISD::SMIN:
  case ISD::UMAX:
  case ISD::UMIN:
    return LowerMINMAX(Op, DAG);
  case ISD::ABS:
    return LowerABS(Op, Subtarget, DAG);
  case ISD::FSINCOS:
    return LowerFSINCOS(Op, Subtarget, DAG);

  // Masked memory operations.
  case ISD::MLOAD:
    return LowerMLOAD(Op, Subtarget, DAG);
  case ISD::MSTORE:
    return LowerMSTORE(Op, Subtarget, DAG);
  case ISD::MGATHER:
    return LowerMGATHER(Op, Subtarget, DAG);
  case ISD::MSCATTER:
    return LowerMSCATTER(Op, Subtarget, DAG);

  case ISD::GC_TRANSITION_START:
    return LowerGC_TRANSITION_START(Op, DAG);
  case ISD::GC_TRANSITION_END:
    return LowerGC_TRANSITION_END(Op, DAG);

  default:
    llvm_unreachable("Should not custom lower this!");
  }
}
/// Run LowerOperation on result 0 of \p N and append the replacement values
/// to \p Results.
///
/// The number and types of the appended values must exactly match the
/// original results of the node. If LowerOperation yields a null value,
/// \p Results is left untouched, which indicates that the node is not to be
/// custom lowered after all.
void X86TargetLowering::LowerOperationWrapper(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  SDValue Lowered = LowerOperation(SDValue(N, 0), DAG);

  // Nothing was produced: leave Results empty.
  if (Lowered.getNode() == nullptr)
    return;

  unsigned NumResults = N->getNumValues();
  if (NumResults != 1) {
    // A multi-result node must be replaced by a node with the same number of
    // results.
    assert((NumResults == Lowered->getNumValues()) &&
           "Lowering returned the wrong number of results!");

    // Hand back the new results in the order of N's result numbers.
    for (unsigned ResNo = 0; ResNo < NumResults; ++ResNo)
      Results.push_back(Lowered.getValue(ResNo));
    return;
  }

  // Single-result node: take the value returned by LowerOperation as is. It
  // might not be result number 0 of its node.
  Results.push_back(Lowered);
}
/// Replace a node with an illegal result type with a new node built out of
/// custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue>&Results,
SelectionDAG &DAG) const {
SDLoc dl(N);
switch (N->getOpcode()) {
default:
#ifndef NDEBUG
dbgs() << "ReplaceNodeResults: ";
N->dump(&DAG);
#endif
llvm_unreachable("Do not know how to custom type legalize this operation!");
case ISD::CTPOP: {
assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
// Use a v2i64 if possible.
bool NoImplicitFloatOps =
DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat);
if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
SDValue Wide =
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
// Bit count should fit in 32-bits, extract it as that and then zero
// extend to i64. Otherwise we end up extracting bits 63:32 separately.
Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
DAG.getIntPtrConstant(0, dl));
Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
Results.push_back(Wide);
}
return;
}
case ISD::MUL: {
EVT VT = N->getValueType(0);
assert(VT.isVector() && "Unexpected VT");
if (getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger &&
VT.getVectorNumElements() == 2) {
// Promote to a pattern that will be turned into PMULUDQ.
SDValue N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64,
N->getOperand(0));
SDValue N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64,
N->getOperand(1));
SDValue Mul = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, N0, N1);
Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, VT, Mul));
} else if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
VT.getVectorElementType() == MVT::i8) {
// Pre-promote these to vXi16 to avoid op legalization thinking all 16
// elements are needed.
MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
unsigned NumConcats = 16 / VT.getVectorNumElements();
SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
ConcatOps[0] = Res;
Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
Results.push_back(Res);
}
return;
}
case ISD::UADDSAT:
case ISD::SADDSAT:
case ISD::USUBSAT:
case ISD::SSUBSAT:
case X86ISD::VPMADDWD:
case X86ISD::AVG: {
// Legalize types for ISD::UADDSAT/SADDSAT/USUBSAT/SSUBSAT and
// X86ISD::AVG/VPMADDWD by widening.
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
EVT VT = N->getValueType(0);
EVT InVT = N->getOperand(0).getValueType();
assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
"Expected a VT that divides into 128 bits.");
unsigned NumConcat = 128 / InVT.getSizeInBits();
EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
InVT.getVectorElementType(),
NumConcat * InVT.getVectorNumElements());
EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
VT.getVectorElementType(),
NumConcat * VT.getVectorNumElements());
SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
Ops[0] = N->getOperand(0);
SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
Ops[0] = N->getOperand(1);
SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
DAG.getIntPtrConstant(0, dl));
Results.push_back(Res);
return;
}
case ISD::ABS: {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
assert(N->getValueType(0) == MVT::i64 &&
"Unexpected type (!= i64) on ABS.");
MVT HalfT = MVT::i32;
SDValue Lo, Hi, Tmp;
SDVTList VTList = DAG.getVTList(HalfT, MVT::i1);
Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
DAG.getConstant(0, dl, HalfT));
Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
DAG.getConstant(1, dl, HalfT));
Tmp = DAG.getNode(
ISD::SRA, dl, HalfT, Hi,
DAG.getConstant(HalfT.getSizeInBits() - 1, dl,
TLI.getShiftAmountTy(HalfT, DAG.getDataLayout())));
Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo);
Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi,
SDValue(Lo.getNode(), 1));
Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi);
Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo);
Results.push_back(Lo);
Results.push_back(Hi);
return;
}
case ISD::SETCC: {
// Widen v2i32 (setcc v2f32). This is really needed for AVX512VL when
// setCC result type is v2i1 because type legalization will end up with
// a v4i1 setcc plus an extend.
assert(N->getValueType(0) == MVT::v2i32 && "Unexpected type");
if (N->getOperand(0).getValueType() != MVT::v2f32 ||
getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector)
return;
SDValue UNDEF = DAG.getUNDEF(MVT::v2f32);
SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
N->getOperand(0), UNDEF);
SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
N->getOperand(1), UNDEF);
SDValue Res = DAG.getNode(ISD::SETCC, dl, MVT::v4i32, LHS, RHS,
N->getOperand(2));
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
DAG.getIntPtrConstant(0, dl));
Results.push_back(Res);
return;
}
// We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
case X86ISD::FMINC:
case X86ISD::FMIN:
case X86ISD::FMAXC:
case X86ISD::FMAX: {
EVT VT = N->getValueType(0);
assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
SDValue UNDEF = DAG.getUNDEF(VT);
SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
N->getOperand(0), UNDEF);
SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
N->getOperand(1), UNDEF);
Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
return;
}
case ISD::SDIV:
case ISD::UDIV:
case ISD::SREM:
case ISD::UREM: {
EVT VT = N->getValueType(0);
if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector) {
// If this RHS is a constant splat vector we can widen this and let
// division/remainder by constant optimize it.
// TODO: Can we do something for non-splat?
APInt SplatVal;
if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
unsigned NumConcats = 128 / VT.getSizeInBits();
SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
Ops0[0] = N->getOperand(0);
EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
Results.push_back(Res);
}
return;
}
if (VT == MVT::v2i32) {
// Legalize v2i32 div/rem by unrolling. Otherwise we promote to the
// v2i64 and unroll later. But then we create i64 scalar ops which
// might be slow in 64-bit mode or require a libcall in 32-bit mode.
Results.push_back(DAG.UnrollVectorOp(N));
return;
}
if (VT.isVector())
return;
LLVM_FALLTHROUGH;
}
case ISD::SDIVREM:
case ISD::UDIVREM: {
SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
Results.push_back(V);
return;
}
case ISD::TRUNCATE: {
MVT VT = N->getSimpleValueType(0);
if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
return;
// The generic legalizer will try to widen the input type to the same
// number of elements as the widened result type. But this isn't always
// the best thing so do some custom legalization to avoid some cases.
MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
SDValue In = N->getOperand(0);
EVT InVT = In.getValueType();
unsigned InBits = InVT.getSizeInBits();
if (128 % InBits == 0) {
// 128 bit and smaller inputs should avoid truncate altogether and
// just use a build_vector that will become a shuffle.
// TODO: Widen and use a shuffle directly?
MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
EVT EltVT = VT.getVectorElementType();
unsigned WidenNumElts = WidenVT.getVectorNumElements();
SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
// Use the original element count so we don't do more scalar opts than
// necessary.
unsigned MinElts = VT.getVectorNumElements();
for (unsigned i=0; i < MinElts; ++i) {
SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
DAG.getIntPtrConstant(i, dl));
Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
}
Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
return;
}
// With AVX512 there are some cases that can use a target specific
// truncate node to go from 256/512 to less than 128 with zeros in the
// upper elements of the 128 bit result.
if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
// We can use VTRUNC directly if for 256 bits with VLX or for any 512.
if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
return;
}
// There's one case we can widen to 512 bits and use VTRUNC.
if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
DAG.getUNDEF(MVT::v4i64));
Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
return;
}
}
return;
}
case ISD::SIGN_EXTEND_VECTOR_INREG: {
if (ExperimentalVectorWideningLegalization)
return;
EVT VT = N->getValueType(0);
SDValue In = N->getOperand(0);
EVT InVT = In.getValueType();
if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
(InVT == MVT::v16i16 || InVT == MVT::v32i8)) {
// Custom split this so we can extend i8/i16->i32 invec. This is better
// since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
// sra. Then extending from i32 to i64 using pcmpgt. By custom splitting
// we allow the sra from the extend to i32 to be shared by the split.
EVT ExtractVT = EVT::getVectorVT(*DAG.getContext(),
InVT.getVectorElementType(),
InVT.getVectorNumElements() / 2);
MVT ExtendVT = MVT::getVectorVT(MVT::i32,
VT.getVectorNumElements());
In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ExtractVT,
In, DAG.getIntPtrConstant(0, dl));
In = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, MVT::v4i32, In);
// Fill a vector with sign bits for each element.
SDValue Zero = DAG.getConstant(0, dl, ExtendVT);
SDValue SignBits = DAG.getSetCC(dl, ExtendVT, Zero, In, ISD::SETGT);
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
// Create an unpackl and unpackh to interleave the sign bits then bitcast
// to vXi64.
SDValue Lo = getUnpackl(DAG, dl, ExtendVT, In, SignBits);
Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo);
SDValue Hi = getUnpackh(DAG, dl, ExtendVT, In, SignBits);
Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
Results.push_back(Res);
return;
}
return;
}
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND: {
EVT VT = N->getValueType(0);
SDValue In = N->getOperand(0);
EVT InVT = In.getValueType();
if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
(InVT == MVT::v4i16 || InVT == MVT::v4i8) &&
getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector) {
assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
// Custom split this so we can extend i8/i16->i32 invec. This is better
// since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
// sra. Then extending from i32 to i64 using pcmpgt. By custom splitting
// we allow the sra from the extend to i32 to be shared by the split.
In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
// Fill a vector with sign bits for each element.
SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
// Create an unpackl and unpackh to interleave the sign bits then bitcast
// to v2i64.
SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
{0, 4, 1, 5});
Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
{2, 6, 3, 7});
Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
Results.push_back(Res);
return;
}
if (VT == MVT::v16i32 || VT == MVT::v8i64) {
if (!InVT.is128BitVector()) {
// Not a 128 bit vector, but maybe type legalization will promote
// it to 128 bits.
if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
return;
InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
if (!InVT.is128BitVector())
return;
// Promote the input to 128 bits. Type legalization will turn this into
// zext_inreg/sext_inreg.
In = DAG.getNode(N->getOpcode(), dl, InVT, In);
}
// Perform custom splitting instead of the two stage extend we would get
// by default.
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
assert(isTypeLegal(LoVT) && "Split VT not legal?");
SDValue Lo = getExtendInVec(N->getOpcode(), dl, LoVT, In, DAG);
// We need to shift the input over by half the number of elements.
unsigned NumElts = InVT.getVectorNumElements();
unsigned HalfNumElts = NumElts / 2;
SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
for (unsigned i = 0; i != HalfNumElts; ++i)
ShufMask[i] = i + HalfNumElts;
SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
Hi = getExtendInVec(N->getOpcode(), dl, HiVT, Hi, DAG);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
Results.push_back(Res);
}
return;
}
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT: {
bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
EVT VT = N->getValueType(0);
SDValue Src = N->getOperand(0);
EVT SrcVT = Src.getValueType();
// Promote these manually to avoid over promotion to v2i64. Type
// legalization will revisit the v2i32 operation for more cleanup.
if ((VT == MVT::v2i8 || VT == MVT::v2i16) &&
getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger) {
// AVX512DQ provides instructions that produce a v2i64 result.
if (Subtarget.hasDQI())
return;
SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v2i32, Src);
Res = DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext
: ISD::AssertSext,
dl, MVT::v2i32, Res,
DAG.getValueType(VT.getVectorElementType()));
Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
Results.push_back(Res);
return;
}
if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
return;
// Try to create a 128 bit vector, but don't exceed a 32 bit element.
unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
VT.getVectorNumElements());
SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
// Preserve what we know about the size of the original result. Except
// when the result is v2i32 since we can't widen the assert.
if (PromoteVT != MVT::v2i32)
Res = DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext
: ISD::AssertSext,
dl, PromoteVT, Res,
DAG.getValueType(VT.getVectorElementType()));
// Truncate back to the original width.
Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
// Now widen to 128 bits.
unsigned NumConcats = 128 / VT.getSizeInBits();
MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
VT.getVectorNumElements() * NumConcats);
SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
ConcatOps[0] = Res;
Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
Results.push_back(Res);
return;
}
if (VT == MVT::v2i32) {
assert((IsSigned || Subtarget.hasAVX512()) &&
"Can only handle signed conversion without AVX512");
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
bool Widenv2i32 =
getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector;
if (Src.getValueType() == MVT::v2f64) {
unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
if (!IsSigned && !Subtarget.hasVLX()) {
// If v2i32 is widened, we can defer to the generic legalizer.
if (Widenv2i32)
return;
// Custom widen by doubling to a legal vector width. Isel will
// further widen to v8f64.
Opc = ISD::FP_TO_UINT;
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64,
Src, DAG.getUNDEF(MVT::v2f64));
}
SDValue Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
if (!Widenv2i32)
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
DAG.getIntPtrConstant(0, dl));
Results.push_back(Res);
return;
}
if (SrcVT == MVT::v2f32 &&
getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) {
SDValue Idx = DAG.getIntPtrConstant(0, dl);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
DAG.getUNDEF(MVT::v2f32));
Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
: ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
Results.push_back(Res);
return;
}
// The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
// so early out here.
return;
}
if (Subtarget.hasDQI() && VT == MVT::i64 &&
(SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
assert(!Subtarget.is64Bit() && "i64 should be legal");
unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
// Using a 256-bit input here to guarantee 128-bit input for f32 case.
// TODO: Use 128-bit vectors for f64 case?
// TODO: Use 128-bit vectors for f32 by using CVTTP2SI/CVTTP2UI.
MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), NumElts);
SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
DAG.getConstantFP(0.0, dl, VecInVT), Src,
ZeroIdx);
Res = DAG.getNode(N->getOpcode(), SDLoc(N), VecVT, Res);
Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
Results.push_back(Res);
return;
}
if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned))
Results.push_back(V);
return;
}
case ISD::SINT_TO_FP: {
assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
SDValue Src = N->getOperand(0);
if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
return;
Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
return;
}
case ISD::UINT_TO_FP: {
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
EVT VT = N->getValueType(0);
if (VT != MVT::v2f32)
return;
SDValue Src = N->getOperand(0);
EVT SrcVT = Src.getValueType();
if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
return;
}
if (SrcVT != MVT::v2i32)
return;
SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
SDValue VBias =
DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
DAG.getBitcast(MVT::v2i64, VBias));
Or = DAG.getBitcast(MVT::v2f64, Or);
// TODO: Are there any fast-math-flags to propagate here?
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
return;
}
case ISD::FP_ROUND: {
if (!isTypeLegal(N->getOperand(0).getValueType()))
return;
SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
Results.push_back(V);
return;
}
case ISD::FP_EXTEND: {
// Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
// No other ValueType for FP_EXTEND should reach this point.
assert(N->getValueType(0) == MVT::v2f32 &&
"Do not know how to legalize this Node");
return;
}
case ISD::INTRINSIC_W_CHAIN: {
unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
switch (IntNo) {
default : llvm_unreachable("Do not know how to custom type "
"legalize this intrinsic operation!");
case Intrinsic::x86_rdtsc:
return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
Results);
case Intrinsic::x86_rdtscp:
return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
Results);
case Intrinsic::x86_rdpmc:
expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
Results);
return;
case Intrinsic::x86_xgetbv:
expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
Results);
return;
}
}
case ISD::READCYCLECOUNTER: {
return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
}
case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
EVT T = N->getValueType(0);
assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
bool Regs64bit = T == MVT::i128;
assert((!Regs64bit || Subtarget.hasCmpxchg16b()) &&
"64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
SDValue cpInL, cpInH;
cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
DAG.getConstant(0, dl, HalfT));
cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
DAG.getConstant(1, dl, HalfT));
cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
Regs64bit ? X86::RAX : X86::EAX,
cpInL, SDValue());
cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
Regs64bit ? X86::RDX : X86::EDX,
cpInH, cpInL.getValue(1));
SDValue swapInL, swapInH;
swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
DAG.getConstant(0, dl, HalfT));
swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
DAG.getConstant(1, dl, HalfT));
swapInH =
DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
swapInH, cpInH.getValue(1));
// If the current function needs the base pointer, RBX,
// we shouldn't use cmpxchg directly.
// Indeed the lowering of that instruction will clobber
// that register and since RBX will be a reserved register
// the register allocator will not make sure its value will
// be properly saved and restored around this live-range.
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
SDValue Result;
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
unsigned BasePtr = TRI->getBaseRegister();
MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
(BasePtr == X86::RBX || BasePtr == X86::EBX)) {
// ISel prefers the LCMPXCHG64 variant.
// If that assert breaks, that means it is not the case anymore,
// and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
// not just EBX. This is a matter of accepting i64 input for that
// pseudo, and restoring into the register of the right wide
// in expand pseudo. Everything else should just work.
assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
"Saving only half of the RBX");
unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
: X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
Regs64bit ? X86::RBX : X86::EBX,
HalfT, swapInH.getValue(1));
SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
RBXSave,
/*Glue*/ RBXSave.getValue(2)};
Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
} else {
unsigned Opcode =
Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
Regs64bit ? X86::RBX : X86::EBX, swapInL,
swapInH.getValue(1));
SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
swapInL.getValue(1)};
Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
}
SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
Regs64bit ? X86::RAX : X86::EAX,
HalfT, Result.getValue(1));
SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
Regs64bit ? X86::RDX : X86::EDX,
HalfT, cpOutL.getValue(2));
SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
MVT::i32, cpOutH.getValue(2));
SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
Results.push_back(Success);
Results.push_back(EFLAGS.getValue(1));
return;
}
case ISD::ATOMIC_LOAD: {
assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
bool NoImplicitFloatOps =
DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat);
if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
auto *Node = cast<AtomicSDNode>(N);
if (Subtarget.hasSSE2()) {
// Use a VZEXT_LOAD which will be selected as MOVQ. Then extract the
// lower 64-bits.
SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
MVT::i64, Node->getMemOperand());
SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
DAG.getIntPtrConstant(0, dl));
Results.push_back(Res);
Results.push_back(Ld.getValue(1));
return;
}
if (Subtarget.hasX87()) {
// First load this into an 80-bit X87 register. This will put the whole
// integer into the significand.
// FIXME: Do we need to glue? See FIXME comment in BuildFILD.
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other, MVT::Glue);
SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD_FLAG,
dl, Tys, Ops, MVT::i64,
Node->getMemOperand());
SDValue Chain = Result.getValue(1);
SDValue InFlag = Result.getValue(2);
// Now store the X87 register to a stack temporary and convert to i64.
// This store is not atomic and doesn't need to be.
// FIXME: We don't need a stack temporary if the result of the load
// is already being stored. We could just directly store there.
SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
MachinePointerInfo MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
SDValue StoreOps[] = { Chain, Result, StackPtr, InFlag };
Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, dl,
DAG.getVTList(MVT::Other), StoreOps,
MVT::i64, MPI, 0 /*Align*/,
MachineMemOperand::MOStore);
// Finally load the value back from the stack temporary and return it.
// This load is not atomic and doesn't need to be.
// This load will be further type legalized.
Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
Results.push_back(Result);
Results.push_back(Result.getValue(1));
return;
}
}
// TODO: Use MOVLPS when SSE1 is available?
// Delegate to generic TypeLegalization. Situations we can really handle
// should have already been dealt with by AtomicExpandPass.cpp.
break;
}
case ISD::ATOMIC_SWAP:
case ISD::ATOMIC_LOAD_ADD:
case ISD::ATOMIC_LOAD_SUB:
case ISD::ATOMIC_LOAD_AND:
case ISD::ATOMIC_LOAD_OR:
case ISD::ATOMIC_LOAD_XOR:
case ISD::ATOMIC_LOAD_NAND:
case ISD::ATOMIC_LOAD_MIN:
case ISD::ATOMIC_LOAD_MAX:
case ISD::ATOMIC_LOAD_UMIN:
case ISD::ATOMIC_LOAD_UMAX:
// Delegate to generic TypeLegalization. Situations we can really handle
// should have already been dealt with by AtomicExpandPass.cpp.
break;
case ISD::BITCAST: {
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
EVT DstVT = N->getValueType(0);
EVT SrcVT = N->getOperand(0).getValueType();
// If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
// we can split using the k-register rather than memory.
if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
Lo = DAG.getBitcast(MVT::i32, Lo);
Hi = DAG.getBitcast(MVT::i32, Hi);
SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
Results.push_back(Res);
return;
}
// Custom splitting for BWI types when AVX512F is available but BWI isn't.
if ((DstVT == MVT::v32i16 || DstVT == MVT::v64i8) &&
SrcVT.isVector() && isTypeLegal(SrcVT)) {
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
MVT CastVT = (DstVT == MVT::v32i16) ? MVT::v16i16 : MVT::v32i8;
Lo = DAG.getBitcast(CastVT, Lo);
Hi = DAG.getBitcast(CastVT, Hi);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
Results.push_back(Res);
return;
}
if (SrcVT != MVT::f64 ||
(DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8) ||
getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector)
return;
unsigned NumElts = DstVT.getVectorNumElements();
EVT SVT = DstVT.getVectorElementType();
EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
SDValue Res;
Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, N->getOperand(0));
Res = DAG.getBitcast(WiderVT, Res);
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, Res,
DAG.getIntPtrConstant(0, dl));
Results.push_back(Res);
return;
}
case ISD::MGATHER: {
EVT VT = N->getValueType(0);
if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
auto *Gather = cast<MaskedGatherSDNode>(N);
SDValue Index = Gather->getIndex();
if (Index.getValueType() != MVT::v2i64)
return;
SDValue Mask = Gather->getMask();
assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
Gather->getPassThru(),
DAG.getUNDEF(MVT::v2f32));
if (!Subtarget.hasVLX()) {
// We need to widen the mask, but the instruction will only use 2
// of its elements. So we can use undef.
Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
DAG.getUNDEF(MVT::v2i1));
Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
}
SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
Gather->getBasePtr(), Index, Gather->getScale() };
SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl,
Gather->getMemoryVT(), Gather->getMemOperand());
Results.push_back(Res);
Results.push_back(Res.getValue(2));
return;
}
if (VT == MVT::v2i32) {
auto *Gather = cast<MaskedGatherSDNode>(N);
SDValue Index = Gather->getIndex();
SDValue Mask = Gather->getMask();
assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32,
Gather->getPassThru(),
DAG.getUNDEF(MVT::v2i32));
// If the index is v2i64 we can use it directly.
if (Index.getValueType() == MVT::v2i64 &&
(Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
if (!Subtarget.hasVLX()) {
// We need to widen the mask, but the instruction will only use 2
// of its elements. So we can use undef.
Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
DAG.getUNDEF(MVT::v2i1));
Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
}
SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
Gather->getBasePtr(), Index, Gather->getScale() };
SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl,
Gather->getMemoryVT(), Gather->getMemOperand());
SDValue Chain = Res.getValue(2);
if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
DAG.getIntPtrConstant(0, dl));
Results.push_back(Res);
Results.push_back(Chain);
return;
}
if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) {
EVT IndexVT = Index.getValueType();
EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(),
IndexVT.getScalarType(), 4);
// Otherwise we need to custom widen everything to avoid promotion.
Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
DAG.getUNDEF(IndexVT));
Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
DAG.getConstant(0, dl, MVT::v2i1));
SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
Gather->getBasePtr(), Index, Gather->getScale() };
SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other),
Gather->getMemoryVT(), dl, Ops,
Gather->getMemOperand());
SDValue Chain = Res.getValue(1);
if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
DAG.getIntPtrConstant(0, dl));
Results.push_back(Res);
Results.push_back(Chain);
return;
}
}
return;
}
case ISD::LOAD: {
// Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
// avoids scalarizing in 32-bit mode. In 64-bit mode this avoids a int->fp
// cast since type legalization will try to use an i64 load.
MVT VT = N->getSimpleValueType(0);
assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
return;
if (!ISD::isNON_EXTLoad(N))
return;
auto *Ld = cast<LoadSDNode>(N);
if (Subtarget.hasSSE2()) {
MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(), Ld->getAlignment(),
Ld->getMemOperand()->getFlags());
SDValue Chain = Res.getValue(1);
MVT WideVT = MVT::getVectorVT(LdVT, 2);
Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res);
MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(),
VT.getVectorNumElements() * 2);
Res = DAG.getBitcast(CastVT, Res);
Results.push_back(Res);
Results.push_back(Chain);
return;
}
assert(Subtarget.hasSSE1() && "Expected SSE");
SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
MVT::i64, Ld->getMemOperand());
Results.push_back(Res);
Results.push_back(Res.getValue(1));
return;
}
}
}
/// Return the printable name of the X86-specific DAG node \p Opcode.
///
/// Every X86ISD opcode listed below maps to the string "X86ISD::<name>".
/// X86ISD::FIRST_NUMBER, and any value that matches no case label, yields
/// nullptr.
const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  // Switch over the enum type (with no default label) so that every
  // enumerator is spelled out as an explicit case.
  switch ((X86ISD::NodeType)Opcode) {
  case X86ISD::FIRST_NUMBER:       break;
  case X86ISD::BSF:                return "X86ISD::BSF";
  case X86ISD::BSR:                return "X86ISD::BSR";
  case X86ISD::SHLD:               return "X86ISD::SHLD";
  case X86ISD::SHRD:               return "X86ISD::SHRD";
  case X86ISD::FAND:               return "X86ISD::FAND";
  case X86ISD::FANDN:              return "X86ISD::FANDN";
  case X86ISD::FOR:                return "X86ISD::FOR";
  case X86ISD::FXOR:               return "X86ISD::FXOR";
  case X86ISD::FILD:               return "X86ISD::FILD";
  case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
  case X86ISD::FIST:               return "X86ISD::FIST";
  case X86ISD::FP_TO_INT_IN_MEM:   return "X86ISD::FP_TO_INT_IN_MEM";
  case X86ISD::FLD:                return "X86ISD::FLD";
  case X86ISD::FST:                return "X86ISD::FST";
  case X86ISD::CALL:               return "X86ISD::CALL";
  case X86ISD::BT:                 return "X86ISD::BT";
  case X86ISD::CMP:                return "X86ISD::CMP";
  case X86ISD::COMI:               return "X86ISD::COMI";
  case X86ISD::UCOMI:              return "X86ISD::UCOMI";
  case X86ISD::CMPM:               return "X86ISD::CMPM";
  case X86ISD::CMPM_SAE:           return "X86ISD::CMPM_SAE";
  case X86ISD::SETCC:              return "X86ISD::SETCC";
  case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
  case X86ISD::FSETCC:             return "X86ISD::FSETCC";
  case X86ISD::FSETCCM:            return "X86ISD::FSETCCM";
  case X86ISD::FSETCCM_SAE:        return "X86ISD::FSETCCM_SAE";
  case X86ISD::CMOV:               return "X86ISD::CMOV";
  case X86ISD::BRCOND:             return "X86ISD::BRCOND";
  case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
  case X86ISD::IRET:               return "X86ISD::IRET";
  case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
  case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
  case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
  case X86ISD::Wrapper:            return "X86ISD::Wrapper";
  case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
  case X86ISD::MOVDQ2Q:            return "X86ISD::MOVDQ2Q";
  case X86ISD::MMX_MOVD2W:         return "X86ISD::MMX_MOVD2W";
  case X86ISD::MMX_MOVW2D:         return "X86ISD::MMX_MOVW2D";
  case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
  case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
  case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
  case X86ISD::PINSRB:             return "X86ISD::PINSRB";
  case X86ISD::PINSRW:             return "X86ISD::PINSRW";
  case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
  case X86ISD::ANDNP:              return "X86ISD::ANDNP";
  case X86ISD::BLENDI:             return "X86ISD::BLENDI";
  case X86ISD::BLENDV:             return "X86ISD::BLENDV";
  case X86ISD::HADD:               return "X86ISD::HADD";
  case X86ISD::HSUB:               return "X86ISD::HSUB";
  case X86ISD::FHADD:              return "X86ISD::FHADD";
  case X86ISD::FHSUB:              return "X86ISD::FHSUB";
  case X86ISD::CONFLICT:           return "X86ISD::CONFLICT";
  case X86ISD::FMAX:               return "X86ISD::FMAX";
  case X86ISD::FMAXS:              return "X86ISD::FMAXS";
  case X86ISD::FMAX_SAE:           return "X86ISD::FMAX_SAE";
  case X86ISD::FMAXS_SAE:          return "X86ISD::FMAXS_SAE";
  case X86ISD::FMIN:               return "X86ISD::FMIN";
  case X86ISD::FMINS:              return "X86ISD::FMINS";
  case X86ISD::FMIN_SAE:           return "X86ISD::FMIN_SAE";
  case X86ISD::FMINS_SAE:          return "X86ISD::FMINS_SAE";
  case X86ISD::FMAXC:              return "X86ISD::FMAXC";
  case X86ISD::FMINC:              return "X86ISD::FMINC";
  case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
  case X86ISD::FRCP:               return "X86ISD::FRCP";
  case X86ISD::EXTRQI:             return "X86ISD::EXTRQI";
  case X86ISD::INSERTQI:           return "X86ISD::INSERTQI";
  case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
  case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
  case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
  case X86ISD::EH_SJLJ_SETJMP:     return "X86ISD::EH_SJLJ_SETJMP";
  case X86ISD::EH_SJLJ_LONGJMP:    return "X86ISD::EH_SJLJ_LONGJMP";
  case X86ISD::EH_SJLJ_SETUP_DISPATCH:
    return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
  case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
  case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
  case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
  case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
  case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
  case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
  case X86ISD::LCMPXCHG16_DAG:     return "X86ISD::LCMPXCHG16_DAG";
  case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
    return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
  case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
    return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
  case X86ISD::LADD:               return "X86ISD::LADD";
  case X86ISD::LSUB:               return "X86ISD::LSUB";
  case X86ISD::LOR:                return "X86ISD::LOR";
  case X86ISD::LXOR:               return "X86ISD::LXOR";
  case X86ISD::LAND:               return "X86ISD::LAND";
  case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
  case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
  case X86ISD::VEXTRACT_STORE:     return "X86ISD::VEXTRACT_STORE";
  case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
  case X86ISD::VTRUNCS:            return "X86ISD::VTRUNCS";
  case X86ISD::VTRUNCUS:           return "X86ISD::VTRUNCUS";
  case X86ISD::VMTRUNC:            return "X86ISD::VMTRUNC";
  case X86ISD::VMTRUNCS:           return "X86ISD::VMTRUNCS";
  case X86ISD::VMTRUNCUS:          return "X86ISD::VMTRUNCUS";
  case X86ISD::VTRUNCSTORES:       return "X86ISD::VTRUNCSTORES";
  case X86ISD::VTRUNCSTOREUS:      return "X86ISD::VTRUNCSTOREUS";
  case X86ISD::VMTRUNCSTORES:      return "X86ISD::VMTRUNCSTORES";
  case X86ISD::VMTRUNCSTOREUS:     return "X86ISD::VMTRUNCSTOREUS";
  case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
  case X86ISD::VFPEXT_SAE:         return "X86ISD::VFPEXT_SAE";
  case X86ISD::VFPEXTS:            return "X86ISD::VFPEXTS";
  case X86ISD::VFPEXTS_SAE:        return "X86ISD::VFPEXTS_SAE";
  case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
  case X86ISD::VMFPROUND:          return "X86ISD::VMFPROUND";
  case X86ISD::VFPROUND_RND:       return "X86ISD::VFPROUND_RND";
  case X86ISD::VFPROUNDS:          return "X86ISD::VFPROUNDS";
  case X86ISD::VFPROUNDS_RND:      return "X86ISD::VFPROUNDS_RND";
  case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
  case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
  case X86ISD::VSHL:               return "X86ISD::VSHL";
  case X86ISD::VSRL:               return "X86ISD::VSRL";
  case X86ISD::VSRA:               return "X86ISD::VSRA";
  case X86ISD::VSHLI:              return "X86ISD::VSHLI";
  case X86ISD::VSRLI:              return "X86ISD::VSRLI";
  case X86ISD::VSRAI:              return "X86ISD::VSRAI";
  case X86ISD::VSHLV:              return "X86ISD::VSHLV";
  case X86ISD::VSRLV:              return "X86ISD::VSRLV";
  case X86ISD::VSRAV:              return "X86ISD::VSRAV";
  case X86ISD::VROTLI:             return "X86ISD::VROTLI";
  case X86ISD::VROTRI:             return "X86ISD::VROTRI";
  case X86ISD::VPPERM:             return "X86ISD::VPPERM";
  case X86ISD::CMPP:               return "X86ISD::CMPP";
  case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
  case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
  case X86ISD::PHMINPOS:           return "X86ISD::PHMINPOS";
  case X86ISD::ADD:                return "X86ISD::ADD";
  case X86ISD::SUB:                return "X86ISD::SUB";
  case X86ISD::ADC:                return "X86ISD::ADC";
  case X86ISD::SBB:                return "X86ISD::SBB";
  case X86ISD::SMUL:               return "X86ISD::SMUL";
  case X86ISD::UMUL:               return "X86ISD::UMUL";
  case X86ISD::OR:                 return "X86ISD::OR";
  case X86ISD::XOR:                return "X86ISD::XOR";
  case X86ISD::AND:                return "X86ISD::AND";
  case X86ISD::BEXTR:              return "X86ISD::BEXTR";
  case X86ISD::BZHI:               return "X86ISD::BZHI";
  case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
  case X86ISD::MOVMSK:             return "X86ISD::MOVMSK";
  case X86ISD::PTEST:              return "X86ISD::PTEST";
  case X86ISD::TESTP:              return "X86ISD::TESTP";
  case X86ISD::KORTEST:            return "X86ISD::KORTEST";
  case X86ISD::KTEST:              return "X86ISD::KTEST";
  case X86ISD::KADD:               return "X86ISD::KADD";
  case X86ISD::KSHIFTL:            return "X86ISD::KSHIFTL";
  case X86ISD::KSHIFTR:            return "X86ISD::KSHIFTR";
  case X86ISD::PACKSS:             return "X86ISD::PACKSS";
  case X86ISD::PACKUS:             return "X86ISD::PACKUS";
  case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
  case X86ISD::VALIGN:             return "X86ISD::VALIGN";
  case X86ISD::VSHLD:              return "X86ISD::VSHLD";
  case X86ISD::VSHRD:              return "X86ISD::VSHRD";
  case X86ISD::VSHLDV:             return "X86ISD::VSHLDV";
  case X86ISD::VSHRDV:             return "X86ISD::VSHRDV";
  case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
  case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
  case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
  case X86ISD::SHUFP:              return "X86ISD::SHUFP";
  case X86ISD::SHUF128:            return "X86ISD::SHUF128";
  case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
  case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
  case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
  case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
  case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
  case X86ISD::MOVSD:              return "X86ISD::MOVSD";
  case X86ISD::MOVSS:              return "X86ISD::MOVSS";
  case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
  case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
  case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
  case X86ISD::VBROADCASTM:        return "X86ISD::VBROADCASTM";
  case X86ISD::SUBV_BROADCAST:     return "X86ISD::SUBV_BROADCAST";
  case X86ISD::VPERMILPV:          return "X86ISD::VPERMILPV";
  case X86ISD::VPERMILPI:          return "X86ISD::VPERMILPI";
  case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
  case X86ISD::VPERMV:             return "X86ISD::VPERMV";
  case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
  case X86ISD::VPERMI:             return "X86ISD::VPERMI";
  case X86ISD::VPTERNLOG:          return "X86ISD::VPTERNLOG";
  case X86ISD::VFIXUPIMM:          return "X86ISD::VFIXUPIMM";
  case X86ISD::VFIXUPIMM_SAE:      return "X86ISD::VFIXUPIMM_SAE";
  case X86ISD::VFIXUPIMMS:         return "X86ISD::VFIXUPIMMS";
  case X86ISD::VFIXUPIMMS_SAE:     return "X86ISD::VFIXUPIMMS_SAE";
  case X86ISD::VRANGE:             return "X86ISD::VRANGE";
  case X86ISD::VRANGE_SAE:         return "X86ISD::VRANGE_SAE";
  case X86ISD::VRANGES:            return "X86ISD::VRANGES";
  case X86ISD::VRANGES_SAE:        return "X86ISD::VRANGES_SAE";
  case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
  case X86ISD::PMULDQ:             return "X86ISD::PMULDQ";
  case X86ISD::PSADBW:             return "X86ISD::PSADBW";
  case X86ISD::DBPSADBW:           return "X86ISD::DBPSADBW";
  case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
  case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
  case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
  case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
  case X86ISD::MFENCE:             return "X86ISD::MFENCE";
  case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
  case X86ISD::SAHF:               return "X86ISD::SAHF";
  case X86ISD::RDRAND:             return "X86ISD::RDRAND";
  case X86ISD::RDSEED:             return "X86ISD::RDSEED";
  case X86ISD::RDPKRU:             return "X86ISD::RDPKRU";
  case X86ISD::WRPKRU:             return "X86ISD::WRPKRU";
  case X86ISD::VPMADDUBSW:         return "X86ISD::VPMADDUBSW";
  case X86ISD::VPMADDWD:           return "X86ISD::VPMADDWD";
  case X86ISD::VPSHA:              return "X86ISD::VPSHA";
  case X86ISD::VPSHL:              return "X86ISD::VPSHL";
  case X86ISD::VPCOM:              return "X86ISD::VPCOM";
  case X86ISD::VPCOMU:             return "X86ISD::VPCOMU";
  case X86ISD::VPERMIL2:           return "X86ISD::VPERMIL2";
  case X86ISD::FMSUB:              return "X86ISD::FMSUB";
  case X86ISD::FNMADD:             return "X86ISD::FNMADD";
  case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
  case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
  case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
  case X86ISD::FMADD_RND:          return "X86ISD::FMADD_RND";
  case X86ISD::FNMADD_RND:         return "X86ISD::FNMADD_RND";
  case X86ISD::FMSUB_RND:          return "X86ISD::FMSUB_RND";
  case X86ISD::FNMSUB_RND:         return "X86ISD::FNMSUB_RND";
  case X86ISD::FMADDSUB_RND:       return "X86ISD::FMADDSUB_RND";
  case X86ISD::FMSUBADD_RND:       return "X86ISD::FMSUBADD_RND";
  case X86ISD::VPMADD52H:          return "X86ISD::VPMADD52H";
  case X86ISD::VPMADD52L:          return "X86ISD::VPMADD52L";
  case X86ISD::VRNDSCALE:          return "X86ISD::VRNDSCALE";
  case X86ISD::VRNDSCALE_SAE:      return "X86ISD::VRNDSCALE_SAE";
  case X86ISD::VRNDSCALES:         return "X86ISD::VRNDSCALES";
  case X86ISD::VRNDSCALES_SAE:     return "X86ISD::VRNDSCALES_SAE";
  case X86ISD::VREDUCE:            return "X86ISD::VREDUCE";
  case X86ISD::VREDUCE_SAE:        return "X86ISD::VREDUCE_SAE";
  case X86ISD::VREDUCES:           return "X86ISD::VREDUCES";
  case X86ISD::VREDUCES_SAE:       return "X86ISD::VREDUCES_SAE";
  case X86ISD::VGETMANT:           return "X86ISD::VGETMANT";
  case X86ISD::VGETMANT_SAE:       return "X86ISD::VGETMANT_SAE";
  case X86ISD::VGETMANTS:          return "X86ISD::VGETMANTS";
  case X86ISD::VGETMANTS_SAE:      return "X86ISD::VGETMANTS_SAE";
  case X86ISD::PCMPESTR:           return "X86ISD::PCMPESTR";
  case X86ISD::PCMPISTR:           return "X86ISD::PCMPISTR";
  case X86ISD::XTEST:              return "X86ISD::XTEST";
  case X86ISD::COMPRESS:           return "X86ISD::COMPRESS";
  case X86ISD::EXPAND:             return "X86ISD::EXPAND";
  case X86ISD::SELECTS:            return "X86ISD::SELECTS";
  case X86ISD::ADDSUB:             return "X86ISD::ADDSUB";
  case X86ISD::RCP14:              return "X86ISD::RCP14";
  case X86ISD::RCP14S:             return "X86ISD::RCP14S";
  case X86ISD::RCP28:              return "X86ISD::RCP28";
  case X86ISD::RCP28_SAE:          return "X86ISD::RCP28_SAE";
  case X86ISD::RCP28S:             return "X86ISD::RCP28S";
  case X86ISD::RCP28S_SAE:         return "X86ISD::RCP28S_SAE";
  case X86ISD::EXP2:               return "X86ISD::EXP2";
  case X86ISD::EXP2_SAE:           return "X86ISD::EXP2_SAE";
  case X86ISD::RSQRT14:            return "X86ISD::RSQRT14";
  case X86ISD::RSQRT14S:           return "X86ISD::RSQRT14S";
  case X86ISD::RSQRT28:            return "X86ISD::RSQRT28";
  case X86ISD::RSQRT28_SAE:        return "X86ISD::RSQRT28_SAE";
  case X86ISD::RSQRT28S:           return "X86ISD::RSQRT28S";
  case X86ISD::RSQRT28S_SAE:       return "X86ISD::RSQRT28S_SAE";
  case X86ISD::FADD_RND:           return "X86ISD::FADD_RND";
  case X86ISD::FADDS:              return "X86ISD::FADDS";
  case X86ISD::FADDS_RND:          return "X86ISD::FADDS_RND";
  case X86ISD::FSUB_RND:           return "X86ISD::FSUB_RND";
  case X86ISD::FSUBS:              return "X86ISD::FSUBS";
  case X86ISD::FSUBS_RND:          return "X86ISD::FSUBS_RND";
  case X86ISD::FMUL_RND:           return "X86ISD::FMUL_RND";
  case X86ISD::FMULS:              return "X86ISD::FMULS";
  case X86ISD::FMULS_RND:          return "X86ISD::FMULS_RND";
  case X86ISD::FDIV_RND:           return "X86ISD::FDIV_RND";
  case X86ISD::FDIVS:              return "X86ISD::FDIVS";
  case X86ISD::FDIVS_RND:          return "X86ISD::FDIVS_RND";
  case X86ISD::FSQRT_RND:          return "X86ISD::FSQRT_RND";
  case X86ISD::FSQRTS:             return "X86ISD::FSQRTS";
  case X86ISD::FSQRTS_RND:         return "X86ISD::FSQRTS_RND";
  case X86ISD::FGETEXP:            return "X86ISD::FGETEXP";
  case X86ISD::FGETEXP_SAE:        return "X86ISD::FGETEXP_SAE";
  case X86ISD::FGETEXPS:           return "X86ISD::FGETEXPS";
  case X86ISD::FGETEXPS_SAE:       return "X86ISD::FGETEXPS_SAE";
  case X86ISD::SCALEF:             return "X86ISD::SCALEF";
  case X86ISD::SCALEF_RND:         return "X86ISD::SCALEF_RND";
  case X86ISD::SCALEFS:            return "X86ISD::SCALEFS";
  case X86ISD::SCALEFS_RND:        return "X86ISD::SCALEFS_RND";
  case X86ISD::AVG:                return "X86ISD::AVG";
  case X86ISD::MULHRS:             return "X86ISD::MULHRS";
  case X86ISD::SINT_TO_FP_RND:     return "X86ISD::SINT_TO_FP_RND";
  case X86ISD::UINT_TO_FP_RND:     return "X86ISD::UINT_TO_FP_RND";
  case X86ISD::CVTTP2SI:           return "X86ISD::CVTTP2SI";
  case X86ISD::CVTTP2UI:           return "X86ISD::CVTTP2UI";
  case X86ISD::MCVTTP2SI:          return "X86ISD::MCVTTP2SI";
  case X86ISD::MCVTTP2UI:          return "X86ISD::MCVTTP2UI";
  case X86ISD::CVTTP2SI_SAE:       return "X86ISD::CVTTP2SI_SAE";
  case X86ISD::CVTTP2UI_SAE:       return "X86ISD::CVTTP2UI_SAE";
  case X86ISD::CVTTS2SI:           return "X86ISD::CVTTS2SI";
  case X86ISD::CVTTS2UI:           return "X86ISD::CVTTS2UI";
  case X86ISD::CVTTS2SI_SAE:       return "X86ISD::CVTTS2SI_SAE";
  case X86ISD::CVTTS2UI_SAE:       return "X86ISD::CVTTS2UI_SAE";
  case X86ISD::CVTSI2P:            return "X86ISD::CVTSI2P";
  case X86ISD::CVTUI2P:            return "X86ISD::CVTUI2P";
  case X86ISD::MCVTSI2P:           return "X86ISD::MCVTSI2P";
  case X86ISD::MCVTUI2P:           return "X86ISD::MCVTUI2P";
  case X86ISD::VFPCLASS:           return "X86ISD::VFPCLASS";
  case X86ISD::VFPCLASSS:          return "X86ISD::VFPCLASSS";
  case X86ISD::MULTISHIFT:         return "X86ISD::MULTISHIFT";
  case X86ISD::SCALAR_SINT_TO_FP:  return "X86ISD::SCALAR_SINT_TO_FP";
  case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
  case X86ISD::SCALAR_UINT_TO_FP:  return "X86ISD::SCALAR_UINT_TO_FP";
  case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
  case X86ISD::CVTPS2PH:           return "X86ISD::CVTPS2PH";
  case X86ISD::MCVTPS2PH:          return "X86ISD::MCVTPS2PH";
  case X86ISD::CVTPH2PS:           return "X86ISD::CVTPH2PS";
  case X86ISD::CVTPH2PS_SAE:       return "X86ISD::CVTPH2PS_SAE";
  case X86ISD::CVTP2SI:            return "X86ISD::CVTP2SI";
  case X86ISD::CVTP2UI:            return "X86ISD::CVTP2UI";
  case X86ISD::MCVTP2SI:           return "X86ISD::MCVTP2SI";
  case X86ISD::MCVTP2UI:           return "X86ISD::MCVTP2UI";
  case X86ISD::CVTP2SI_RND:        return "X86ISD::CVTP2SI_RND";
  case X86ISD::CVTP2UI_RND:        return "X86ISD::CVTP2UI_RND";
  case X86ISD::CVTS2SI:            return "X86ISD::CVTS2SI";
  case X86ISD::CVTS2UI:            return "X86ISD::CVTS2UI";
  case X86ISD::CVTS2SI_RND:        return "X86ISD::CVTS2SI_RND";
  case X86ISD::CVTS2UI_RND:        return "X86ISD::CVTS2UI_RND";
  case X86ISD::CVTNE2PS2BF16:      return "X86ISD::CVTNE2PS2BF16";
  case X86ISD::CVTNEPS2BF16:       return "X86ISD::CVTNEPS2BF16";
  case X86ISD::MCVTNEPS2BF16:      return "X86ISD::MCVTNEPS2BF16";
  case X86ISD::DPBF16PS:           return "X86ISD::DPBF16PS";
  case X86ISD::LWPINS:             return "X86ISD::LWPINS";
  case X86ISD::MGATHER:            return "X86ISD::MGATHER";
  case X86ISD::MSCATTER:           return "X86ISD::MSCATTER";
  case X86ISD::VPDPBUSD:           return "X86ISD::VPDPBUSD";
  case X86ISD::VPDPBUSDS:          return "X86ISD::VPDPBUSDS";
  case X86ISD::VPDPWSSD:           return "X86ISD::VPDPWSSD";
  case X86ISD::VPDPWSSDS:          return "X86ISD::VPDPWSSDS";
  case X86ISD::VPSHUFBITQMB:       return "X86ISD::VPSHUFBITQMB";
  case X86ISD::GF2P8MULB:          return "X86ISD::GF2P8MULB";
  case X86ISD::GF2P8AFFINEQB:      return "X86ISD::GF2P8AFFINEQB";
  case X86ISD::GF2P8AFFINEINVQB:   return "X86ISD::GF2P8AFFINEINVQB";
  case X86ISD::NT_CALL:            return "X86ISD::NT_CALL";
  case X86ISD::NT_BRIND:           return "X86ISD::NT_BRIND";
  case X86ISD::UMWAIT:             return "X86ISD::UMWAIT";
  case X86ISD::TPAUSE:             return "X86ISD::TPAUSE";
  // These two names previously used a single ':' ("X86ISD:ENQCMD"), unlike
  // every other entry in this table.
  case X86ISD::ENQCMD:             return "X86ISD::ENQCMD";
  case X86ISD::ENQCMDS:            return "X86ISD::ENQCMDS";
  case X86ISD::VP2INTERSECT:       return "X86ISD::VP2INTERSECT";
  }
  // FIRST_NUMBER and values outside the listed enumerators have no name.
  return nullptr;
}
/// Return true if the addressing mode represented by AM is legal for this
/// target, for a load/store of the specified type.
bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS,
Instruction *I) const {
// X86 supports extremely general addressing modes.
CodeModel::Model M = getTargetMachine().getCodeModel();
// X86 allows a sign-extended 32-bit immediate field as a displacement.
if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
return false;
if (AM.BaseGV) {
unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
// If a reference to this global requires an extra load, we can't fold it.
if (isGlobalStubReference(GVFlags))
return false;
// If BaseGV requires a register for the PIC base, we cannot also have a
// BaseReg specified.
if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
return false;
// If lower 4G is not available, then we must use rip-relative addressing.
if ((M != CodeModel::Small || isPositionIndependent()) &&
Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
return false;
}
switch (AM.Scale) {
case 0:
case 1:
case 2:
case 4:
case 8:
// These scales always work.
break;
case 3:
case 5:
case 9:
// These scales are formed with basereg+scalereg. Only accept if there is
// no basereg yet.
if (AM.HasBaseReg)
return false;
break;
default: // Other stuff never works.
return false;
}
return true;
}
/// Report whether shifting a vector of type \p Ty by a scalar amount is
/// significantly cheaper than shifting it by a fully general vector amount.
bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
  const unsigned EltBits = Ty->getScalarSizeInBits();

  // XOP has variable vector shifts for 128-bit vectors
  // (v16i8/v8i16/v4i32/v2i64).
  auto HasXOPVariableShift = [&] {
    return Subtarget.hasXOP() && Ty->getPrimitiveSizeInBits() == 128;
  };

  switch (EltBits) {
  case 8:
    // 8-bit shifts are always expensive, and the scalar-amount versions are
    // not particularly cheaper than the others.
    return false;
  case 16:
    // AVX512BW has shifts such as vpsllvw.
    return !(HasXOPVariableShift() || Subtarget.hasBWI());
  case 32:
  case 64:
    // AVX2 has vpsllv[dq] (and other shifts), making variable shifts just as
    // cheap as scalar ones.
    return !(HasXOPVariableShift() || Subtarget.hasAVX2());
  default:
    // Everything else: a scalar amount is significantly cheaper than a fully
    // general vector amount.
    return true;
  }
}
/// Report whether \p Opcode is a binary operation. The X86-specific
/// non-commutative binops listed here are recognised directly; every other
/// opcode is forwarded to the generic implementation.
bool X86TargetLowering::isBinOp(unsigned Opcode) const {
  switch (Opcode) {
  // TODO: Add more X86ISD opcodes once we have test coverage.
  case X86ISD::ANDNP:
  case X86ISD::PCMPGT:
  case X86ISD::FMAX:
  case X86ISD::FMIN:
  case X86ISD::FANDN:
    return true;
  default:
    return TargetLoweringBase::isBinOp(Opcode);
  }
}
/// Report whether \p Opcode is a commutative binary operation. The X86ISD
/// opcodes listed here are recognised directly; every other opcode is
/// forwarded to the generic implementation.
bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
  switch (Opcode) {
  // TODO: Add more X86ISD opcodes once we have test coverage.
  case X86ISD::PCMPEQ:
  case X86ISD::PMULDQ:
  case X86ISD::PMULUDQ:
  case X86ISD::FMAXC:
  case X86ISD::FMINC:
  case X86ISD::FAND:
  case X86ISD::FOR:
  case X86ISD::FXOR:
    return true;
  default:
    return TargetLoweringBase::isCommutativeBinOp(Opcode);
  }
}
/// A truncation from \p Ty1 to \p Ty2 is free when both are integer types and
/// \p Ty1 is strictly wider than \p Ty2.
bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
  const bool BothInteger = Ty1->isIntegerTy() && Ty2->isIntegerTy();
  if (!BothInteger)
    return false;

  const unsigned SrcBits = Ty1->getPrimitiveSizeInBits();
  const unsigned DstBits = Ty2->getPrimitiveSizeInBits();
  return DstBits < SrcBits;
}
/// Return true when truncating an integer of type \p Ty1 to integer type
/// \p Ty2 does not prevent a tail call. Both types must be integers and
/// \p Ty1 must be a legal type.
bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
  const bool Eligible = Ty1->isIntegerTy() && Ty2->isIntegerTy() &&
                        isTypeLegal(EVT::getEVT(Ty1));
  if (!Eligible)
    return false;

  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");

  // Provided the caller has no zeroext/signext return attribute, truncating
  // all the way down to i1 is valid.
  return true;
}
/// An integer-compare immediate is legal when it fits in a signed 32-bit
/// value.
bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
  const bool FitsInSigned32 = isInt<32>(Imm);
  return FitsInSigned32;
}
/// An add immediate is legal when it fits in a signed 32-bit value. Negated
/// immediates are covered too, since a sub can be used for them.
bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
  const bool FitsInSigned32 = isInt<32>(Imm);
  return FitsInSigned32;
}
/// A store immediate is legal when it fits in a signed 32-bit value.
bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
  const bool FitsInSigned32 = isInt<32>(Imm);
  return FitsInSigned32;
}
/// A truncation from \p VT1 to \p VT2 is free when both are integer value
/// types and \p VT1 is strictly wider than \p VT2.
bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  const bool BothInteger = VT1.isInteger() && VT2.isInteger();
  if (!BothInteger)
    return false;

  const unsigned SrcBits = VT1.getSizeInBits();
  const unsigned DstBits = VT2.getSizeInBits();
  return DstBits < SrcBits;
}
/// Zero-extending i32 to i64 is free on x86-64, because 32-bit results are
/// implicitly zero-extended in 64-bit registers.
bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
  const bool IsI32ToI64 = Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64);
  if (!IsI32ToI64)
    return false;
  return Subtarget.is64Bit();
}
/// Zero-extending i32 to i64 is free on x86-64, because 32-bit results are
/// implicitly zero-extended in 64-bit registers.
bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
  const bool IsI32ToI64 = VT1 == MVT::i32 && VT2 == MVT::i64;
  if (!IsI32ToI64)
    return false;
  return Subtarget.is64Bit();
}
/// Return true when zero-extending \p Val to \p VT2 is free: either the
/// type-only check already says so, or \p Val is a load of a simple i8, i16
/// or i32 value being extended to a simple integer type.
bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  const EVT SrcVT = Val.getValueType();
  if (isZExtFree(SrcVT, VT2))
    return true;

  // Beyond the type-only case, only loads can be extended for free.
  if (Val.getOpcode() != ISD::LOAD)
    return false;

  const bool SrcIsSimpleInt = SrcVT.isSimple() && SrcVT.isInteger();
  if (!SrcIsSimpleInt)
    return false;
  const bool DstIsSimpleInt = VT2.isSimple() && VT2.isInteger();
  if (!DstIsSimpleInt)
    return false;

  // X86 has 8, 16, and 32-bit zero-extending loads.
  const MVT::SimpleValueType SrcTy = SrcVT.getSimpleVT().SimpleTy;
  return SrcTy == MVT::i8 || SrcTy == MVT::i16 || SrcTy == MVT::i32;
}
/// Return true when folding the extension \p ExtVal into a vector load is
/// desirable. The only rejected case is a source with i1 elements, for which
/// there is no extending load.
bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
  const EVT SrcEltVT = ExtVal.getOperand(0).getValueType().getScalarType();
  return SrcEltVT != MVT::i1;
}
/// Return true when a fused multiply-add on \p VT is preferable to a separate
/// multiply and add: the subtarget must have some form of FMA and the scalar
/// element type must be f32 or f64.
bool
X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
  if (!Subtarget.hasAnyFMA())
    return false;

  const EVT EltVT = VT.getScalarType();
  if (!EltVT.isSimple())
    return false;

  const MVT::SimpleValueType EltTy = EltVT.getSimpleVT().SimpleTy;
  return EltTy == MVT::f32 || EltTy == MVT::f64;
}
/// Narrowing is considered profitable for every type pair except i32 -> i16:
/// i16 instructions are longer (0x66 prefix) and potentially slower.
bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
  const bool IsI32ToI16 = VT1 == MVT::i32 && VT2 == MVT::i16;
  return !IsI32ToI16;
}
/// Hook that lets a target declare that it supports only *some*
/// VECTOR_SHUFFLE operations, i.e. those with specific masks. When a target
/// supports the VECTOR_SHUFFLE node, every mask is assumed legal by default.
///
/// On X86 the mask itself is not inspected: the answer depends only on \p VT.
bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
  if (!VT.isSimple())
    return false;

  const MVT SimpleVT = VT.getSimpleVT();

  // i1 vectors are never accepted.
  const bool HasI1Elts = SimpleVT.getScalarType() == MVT::i1;
  if (HasI1Elts)
    return false;

  // Very little shuffling can be done for 64-bit vectors right now.
  const bool Is64BitVector = SimpleVT.getSizeInBits() == 64;
  if (Is64BitVector)
    return false;

  // Only the legality of the shuffled type matters; the lowering can handle
  // any shuffle mask that results.
  return isTypeLegal(SimpleVT);
}
/// Return true when an 'and' may be turned into a shuffle with clear mask
/// \p Mask on type \p VT.
bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
                                               EVT VT) const {
  // Don't convert an 'and' into a shuffle that we don't directly support:
  // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
  const bool NeedsAVX2 = VT == MVT::v32i8 || VT == MVT::v16i16;
  if (NeedsAVX2 && !Subtarget.hasAVX2())
    return false;

  // Clear masks aren't special otherwise; use the generic legality check.
  return isShuffleMaskLegal(Mask, VT);
}
/// Return true when jump tables may be generated for \p Fn. Jump tables are
/// never generated when the subtarget uses retpolines for indirect branches;
/// otherwise the generic logic decides.
bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
  const bool UsesRetpolines = Subtarget.useRetpolineIndirectBranches();
  return !UsesRetpolines && TargetLowering::areJTsAllowed(Fn);
}
//===----------------------------------------------------------------------===//
// X86 Scheduler Hooks
//===----------------------------------------------------------------------===//
/// Utility function to emit xbegin specifying the start of an RTM region.
///
/// Replaces the pseudo-instruction \p MI in \p MBB with an XBEGIN_4 whose
/// branch target is a newly created fallback block, and moves everything
/// after \p MI (together with MBB's successor edges) into a new sink block.
/// The register defined by operand 0 of \p MI is re-defined in the sink block
/// by a PHI that merges -1 (from the fall-through block) with the value copied
/// out of EAX (from the fallback block). \p MI is erased.
///
/// \returns the sink block, which now holds the remainder of \p MBB.
static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
const TargetInstrInfo *TII) {
DebugLoc DL = MI.getDebugLoc();
const BasicBlock *BB = MBB->getBasicBlock();
// New blocks are inserted immediately after MBB in function layout.
MachineFunction::iterator I = ++MBB->getIterator();
// For the v = xbegin(), we generate
//
// thisMBB:
// xbegin fallMBB
//
// mainMBB:
// s0 = -1
//
// fallMBB:
// eax = # XABORT_DEF
// s1 = eax
//
// sinkMBB:
// v = phi(s0/mainMBB, s1/fallMBB)
MachineBasicBlock *thisMBB = MBB;
MachineFunction *MF = MBB->getParent();
MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
MF->insert(I, mainMBB);
MF->insert(I, fallMBB);
MF->insert(I, sinkMBB);
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), MBB,
std::next(MachineBasicBlock::iterator(MI)), MBB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
MachineRegisterInfo &MRI = MF->getRegInfo();
unsigned DstReg = MI.getOperand(0).getReg();
// The two PHI inputs use the same register class as the result.
const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
unsigned mainDstReg = MRI.createVirtualRegister(RC);
unsigned fallDstReg = MRI.createVirtualRegister(RC);
// thisMBB:
// xbegin fallMBB
// # fallthrough to mainMBB
// # abortion to fallMBB
BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
thisMBB->addSuccessor(mainMBB);
thisMBB->addSuccessor(fallMBB);
// mainMBB:
// mainDstReg := -1
BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
// Explicitly jump over fallMBB, which sits between mainMBB and sinkMBB.
BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
mainMBB->addSuccessor(sinkMBB);
// fallMBB:
// ; pseudo instruction to model hardware's definition from XABORT
// EAX := XABORT_DEF
// fallDstReg := EAX
BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
.addReg(X86::EAX);
fallMBB->addSuccessor(sinkMBB);
// sinkMBB:
// DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
.addReg(mainDstReg).addMBB(mainMBB)
.addReg(fallDstReg).addMBB(fallMBB);
MI.eraseFromParent();
return sinkMBB;
}
/// Expand the VAARG_64 pseudo-instruction \p MI located in \p MBB.
///
/// When ArgMode is 0 the argument is always fetched from the overflow area and
/// no control flow is created. Otherwise the current gp_offset/fp_offset is
/// loaded and compared against a bound; one new block computes the address
/// inside reg_save_area, another computes it from overflow_area, and a PHI in
/// a new end block merges the two into the destination register. Both paths
/// write the updated offset/pointer back into the va_list. \p MI is erased.
///
/// \returns the block that holds the instructions following \p MI (either
/// \p MBB itself or the newly created end block).
MachineBasicBlock *
X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const {
// Emit va_arg instruction on X86-64.
// Operands to this pseudo-instruction:
// 0 ) Output : destination address (reg)
// 1-5) Input : va_list address (addr, i64mem)
// 6 ) ArgSize : Size (in bytes) of vararg type
// 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
// 8 ) Align : Alignment of type
// 9 ) EFLAGS (implicit-def)
assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
static_assert(X86::AddrNumOperands == 5,
"VAARG_64 assumes 5 address operands");
unsigned DestReg = MI.getOperand(0).getReg();
MachineOperand &Base = MI.getOperand(1);
MachineOperand &Scale = MI.getOperand(2);
MachineOperand &Index = MI.getOperand(3);
MachineOperand &Disp = MI.getOperand(4);
MachineOperand &Segment = MI.getOperand(5);
unsigned ArgSize = MI.getOperand(6).getImm();
unsigned ArgMode = MI.getOperand(7).getImm();
unsigned Align = MI.getOperand(8).getImm();
MachineFunction *MF = MBB->getParent();
// Memory Reference
assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
MachineMemOperand *OldMMO = MI.memoperands().front();
// Clone the MMO into two separate MMOs for loading and storing
MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
// Machine Information
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
DebugLoc DL = MI.getDebugLoc();
// struct va_list {
// i32 gp_offset
// i32 fp_offset
// i64 overflow_area (address)
// i64 reg_save_area (address)
// }
// sizeof(va_list) = 24
// alignment(va_list) = 8
// The displacements used below (0, 4, 8 and 16) are the byte offsets of
// those four fields, in declaration order.
unsigned TotalNumIntRegs = 6;
unsigned TotalNumXMMRegs = 8;
bool UseGPOffset = (ArgMode == 1);
bool UseFPOffset = (ArgMode == 2);
// Bound for the offset check: 8 bytes per integer register, plus 16 bytes
// per XMM register when fp_offset is the offset being used.
unsigned MaxOffset = TotalNumIntRegs * 8 +
(UseFPOffset ? TotalNumXMMRegs * 16 : 0);
/* Align ArgSize to a multiple of 8 */
unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
// The overflow pointer only needs realigning for alignments above 8.
bool NeedsAlign = (Align > 8);
MachineBasicBlock *thisMBB = MBB;
MachineBasicBlock *overflowMBB;
MachineBasicBlock *offsetMBB;
MachineBasicBlock *endMBB;
unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
unsigned OffsetReg = 0;
if (!UseGPOffset && !UseFPOffset) {
// If we only pull from the overflow region, we don't create a branch.
// We don't need to alter control flow.
OffsetDestReg = 0; // unused
// No PHI is needed, so the overflow path writes DestReg directly.
OverflowDestReg = DestReg;
offsetMBB = nullptr;
overflowMBB = thisMBB;
endMBB = thisMBB;
} else {
// First emit code to check if gp_offset (or fp_offset) is below the bound.
// If so, pull the argument from reg_save_area. (branch to offsetMBB)
// If not, pull from overflow_area. (branch to overflowMBB)
//
// thisMBB
// | .
// | .
// offsetMBB overflowMBB
// | .
// | .
// endMBB
// Registers for the PHI in endMBB
OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineFunction::iterator MBBIter = ++MBB->getIterator();
// Insert the new basic blocks
MF->insert(MBBIter, offsetMBB);
MF->insert(MBBIter, overflowMBB);
MF->insert(MBBIter, endMBB);
// Transfer the remainder of MBB and its successor edges to endMBB.
endMBB->splice(endMBB->begin(), thisMBB,
std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
// Make offsetMBB and overflowMBB successors of thisMBB
thisMBB->addSuccessor(offsetMBB);
thisMBB->addSuccessor(overflowMBB);
// endMBB is a successor of both offsetMBB and overflowMBB
offsetMBB->addSuccessor(endMBB);
overflowMBB->addSuccessor(endMBB);
// Load the offset value into a register
OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
.add(Base)
.add(Scale)
.add(Index)
.addDisp(Disp, UseFPOffset ? 4 : 0)
.add(Segment)
.setMemRefs(LoadOnlyMMO);
// Check if there is enough room left to pull this argument.
// The register path is taken only when
// OffsetReg < MaxOffset + 8 - ArgSizeA8.
BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
.addReg(OffsetReg)
.addImm(MaxOffset + 8 - ArgSizeA8);
// Branch to "overflowMBB" if offset >= max
// Fall through to "offsetMBB" otherwise
BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
.addMBB(overflowMBB).addImm(X86::COND_AE);
}
// In offsetMBB, emit code to use the reg_save_area.
if (offsetMBB) {
assert(OffsetReg != 0);
// Read the reg_save_area address.
unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
.add(Base)
.add(Scale)
.add(Index)
.addDisp(Disp, 16)
.add(Segment)
.setMemRefs(LoadOnlyMMO);
// Zero-extend the offset
unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
.addImm(0)
.addReg(OffsetReg)
.addImm(X86::sub_32bit);
// Add the offset to the reg_save_area to get the final address.
BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
.addReg(OffsetReg64)
.addReg(RegSaveReg);
// Compute the offset for the next argument
// (advance by 16 bytes when using fp_offset, by 8 bytes otherwise).
unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
.addReg(OffsetReg)
.addImm(UseFPOffset ? 16 : 8);
// Store it back into the va_list.
BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
.add(Base)
.add(Scale)
.add(Index)
.addDisp(Disp, UseFPOffset ? 4 : 0)
.add(Segment)
.addReg(NextOffsetReg)
.setMemRefs(StoreOnlyMMO);
// Jump to endMBB
BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
.addMBB(endMBB);
}
//
// Emit code to use overflow area
//
// Load the overflow_area address into a register.
unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
.add(Base)
.add(Scale)
.add(Index)
.addDisp(Disp, 8)
.add(Segment)
.setMemRefs(LoadOnlyMMO);
// If we need to align it, do so. Otherwise, just copy the address
// to OverflowDestReg.
if (NeedsAlign) {
// Align the overflow address
assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
// aligned_addr = (addr + (align-1)) & ~(align-1)
BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
.addReg(OverflowAddrReg)
.addImm(Align-1);
BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
.addReg(TmpReg)
.addImm(~(uint64_t)(Align-1));
} else {
BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
.addReg(OverflowAddrReg);
}
// Compute the next overflow address after this argument.
// (the overflow address should be kept 8-byte aligned)
unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
.addReg(OverflowDestReg)
.addImm(ArgSizeA8);
// Store the new overflow address.
BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
.add(Base)
.add(Scale)
.add(Index)
.addDisp(Disp, 8)
.add(Segment)
.addReg(NextAddrReg)
.setMemRefs(StoreOnlyMMO);
// If we branched, emit the PHI to the front of endMBB.
if (offsetMBB) {
BuildMI(*endMBB, endMBB->begin(), DL,
TII->get(X86::PHI), DestReg)
.addReg(OffsetDestReg).addMBB(offsetMBB)
.addReg(OverflowDestReg).addMBB(overflowMBB);
}
// Erase the pseudo instruction
MI.eraseFromParent();
return endMBB;
}
/// Expand the va_start XMM-register-save pseudo-instruction \p MI in \p MBB.
///
/// Operands of \p MI as read by this function:
///   0     : register holding the number of XMM registers used (%al)
///   1     : frame index of the register save area (immediate)
///   2     : offset of the FP save slots within that area (immediate)
///   3..N-2: the XMM argument registers to store
///   N-1   : EFLAGS (clobbered by the emitted test/branch; never stored)
///
/// A new block containing one aligned 16-byte store per XMM operand is
/// inserted after \p MBB, followed by a new end block that receives the
/// remainder of \p MBB. Unless the function uses the Win64 calling
/// convention, \p MBB additionally tests the count register and branches
/// around the save block when it is zero. \p MI is erased.
///
/// \returns the end block.
MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
    MachineInstr &MI, MachineBasicBlock *MBB) const {
  // Emit code to save XMM registers to the stack. The ABI says that the
  // number of registers to save is given in %al, so it's theoretically
  // possible to do an indirect jump trick to avoid saving all of them,
  // however this code takes a simpler approach and just executes all
  // of the stores if %al is non-zero. It's less code, and it's probably
  // easier on the hardware branch predictor, and stores aren't all that
  // expensive anyway.

  // Create the new basic blocks. One block contains all the XMM stores,
  // and one block is the final destination regardless of whether any
  // stores were performed.
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction *F = MBB->getParent();
  MachineFunction::iterator MBBIter = ++MBB->getIterator();
  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, XMMSaveMBB);
  F->insert(MBBIter, EndMBB);

  // Transfer the remainder of MBB and its successor edges to EndMBB.
  EndMBB->splice(EndMBB->begin(), MBB,
                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // The original block will now fall through to the XMM save block.
  MBB->addSuccessor(XMMSaveMBB);
  // The XMMSaveMBB will fall through to the end block.
  XMMSaveMBB->addSuccessor(EndMBB);

  // Now add the instructions.
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  unsigned CountReg = MI.getOperand(0).getReg();
  int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
  int64_t VarArgsFPOffset = MI.getOperand(2).getImm();

  if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
    // If %al is 0, branch around the XMM save block.
    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
    BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(EndMBB).addImm(X86::COND_E);
    MBB->addSuccessor(EndMBB);
  }

  // Make sure the last operand is EFLAGS, which gets clobbered by the branch
  // that was just emitted, but clearly shouldn't be "saved".
  assert((MI.getNumOperands() <= 3 ||
          !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
          MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
         "Expected last argument to be EFLAGS");
  unsigned MOVOpc = Subtarget.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;

  // In the XMM save block, save all the XMM argument registers.
  // Use '<' rather than '!=' for the bound: the assert above tolerates
  // instructions with three or fewer operands, for which 'e' is below the
  // starting index and a '!=' test would walk past the operand list.
  for (int i = 3, e = MI.getNumOperands() - 1; i < e; ++i) {
    // Each register gets its own 16-byte slot after VarArgsFPOffset.
    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
    MachineMemOperand *MMO = F->getMachineMemOperand(
        MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
        MachineMemOperand::MOStore,
        /*Size=*/16, /*Align=*/16);
    BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
        .addFrameIndex(RegSaveFrameIndex)
        .addImm(/*Scale=*/1)
        .addReg(/*IndexReg=*/0)
        .addImm(/*Disp=*/Offset)
        .addReg(/*Segment=*/0)
        .addReg(MI.getOperand(i).getReg())
        .addMemOperand(MMO);
  }

  MI.eraseFromParent(); // The pseudo instruction is gone now.

  return EndMBB;
}
// ISel may leave the EFLAGS operand of SelectItr without a kill marker when
// EFLAGS had several uses and it could not tell which one was last. Decide
// whether SelectItr really is the last reader of EFLAGS; if so, add the kill
// marker. Returns the correct kill-marker value: true when SelectItr kills
// EFLAGS (marker added), false when EFLAGS is still read afterwards or is
// live into a successor of BB.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
                                     MachineBasicBlock* BB,
                                     const TargetRegisterInfo* TRI) {
  // Walk the instructions after SelectItr until EFLAGS is read or redefined.
  const MachineBasicBlock::iterator BlockEnd = BB->end();
  MachineBasicBlock::iterator Scan = std::next(SelectItr);
  bool RedefinedInBlock = false;
  while (Scan != BlockEnd) {
    if (Scan->readsRegister(X86::EFLAGS))
      return false;
    if (Scan->definesRegister(X86::EFLAGS)) {
      // A later def means SelectItr should carry the kill flag.
      RedefinedInBlock = true;
      break;
    }
    ++Scan;
  }

  // Reached the end of the block without a redefinition: EFLAGS must not be
  // live into any successor either.
  if (!RedefinedInBlock) {
    for (MachineBasicBlock *Succ : BB->successors())
      if (Succ->isLiveIn(X86::EFLAGS))
        return false;
  }

  // Either a def was found, or the block ended and EFLAGS is not live out.
  // SelectItr should have a kill flag on EFLAGS.
  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
  return true;
}
// Tell whether MI is one of the CMOV pseudo-opcodes that may be cascaded with
// other CMOV pseudo-opcodes into a single basic block guarded by one
// conditional jump around it.
static bool isCMOVPseudo(MachineInstr &MI) {
  const unsigned Opc = MI.getOpcode();
  switch (Opc) {
  default:
    break;
  case X86::CMOV_FR32:
  case X86::CMOV_FR64:
  case X86::CMOV_GR8:
  case X86::CMOV_GR16:
  case X86::CMOV_GR32:
  case X86::CMOV_RFP32:
  case X86::CMOV_RFP64:
  case X86::CMOV_RFP80:
  case X86::CMOV_VR128:
  case X86::CMOV_VR128X:
  case X86::CMOV_VR256:
  case X86::CMOV_VR256X:
  case X86::CMOV_VR512:
  case X86::CMOV_VK2:
  case X86::CMOV_VK4:
  case X86::CMOV_VK8:
  case X86::CMOV_VK16:
  case X86::CMOV_VK32:
  case X86::CMOV_VK64:
    return true;
  }
  return false;
}
// Helper that inserts one PHI per CMOV into SinkMBB:
//   %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ]
// where %FalseValue(i) and %TrueValue(i) come from the consecutive CMOVs in
// the range [MIItBegin, MIItEnd). The condition code of the first CMOV is the
// reference; a CMOV carrying the opposite condition has its two value
// operands swapped. Returns the MachineInstrBuilder of the last PHI inserted.
static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
    MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
    MachineBasicBlock *SinkMBB) {
  MachineFunction *Fn = TrueMBB->getParent();
  const TargetInstrInfo *InstrInfo = Fn->getSubtarget().getInstrInfo();
  DebugLoc Loc = MIItBegin->getDebugLoc();

  const X86::CondCode BranchCC =
      X86::CondCode(MIItBegin->getOperand(3).getImm());
  const X86::CondCode InverseCC = X86::GetOppositeBranchCondition(BranchCC);

  MachineBasicBlock::iterator InsertPt = SinkMBB->begin();

  // A later CMOV may use the result of an earlier one, but a PHI must not
  // refer to another PHI's result from the same block. So PHIs are built in
  // program order, and for every PHI already emitted we remember which
  // registers fed it, keyed by its destination register:
  //   result -> (FalseMBB input, TrueMBB input)
  DenseMap<unsigned, std::pair<unsigned, unsigned>> PHIInputs;
  MachineInstrBuilder LastPHI;

  for (MachineBasicBlock::iterator CMOVIt = MIItBegin; CMOVIt != MIItEnd;
       ++CMOVIt) {
    const unsigned ResultReg = CMOVIt->getOperand(0).getReg();
    unsigned FalseSideReg = CMOVIt->getOperand(1).getReg();
    unsigned TrueSideReg = CMOVIt->getOperand(2).getReg();

    // A CMOV on the opposite condition from the emitted jump selects its
    // operands the other way round.
    if (CMOVIt->getOperand(3).getImm() == InverseCC)
      std::swap(FalseSideReg, TrueSideReg);

    // Replace references to earlier PHI results with the value that PHI
    // received along the corresponding edge.
    auto Earlier = PHIInputs.find(FalseSideReg);
    if (Earlier != PHIInputs.end())
      FalseSideReg = Earlier->second.first;

    Earlier = PHIInputs.find(TrueSideReg);
    if (Earlier != PHIInputs.end())
      TrueSideReg = Earlier->second.second;

    LastPHI = BuildMI(*SinkMBB, InsertPt, Loc, InstrInfo->get(X86::PHI),
                      ResultReg)
                  .addReg(FalseSideReg)
                  .addMBB(FalseMBB)
                  .addReg(TrueSideReg)
                  .addMBB(TrueMBB);

    // Record this PHI so later CMOVs can be rewritten against it.
    PHIInputs[ResultReg] = std::make_pair(FalseSideReg, TrueSideReg);
  }

  return LastPHI;
}
// Lower cascaded selects in form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2).
//
// Both CMOV pseudos are replaced by two successive conditional branches to a
// common sink block, a three-input PHI that defines FirstCMOV's result and a
// COPY of that result into SecondCascadedCMOV's result. Returns the sink
// block, which receives everything that followed FirstCMOV in ThisMBB.
MachineBasicBlock *
X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
MachineInstr &SecondCascadedCMOV,
MachineBasicBlock *ThisMBB) const {
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = FirstCMOV.getDebugLoc();
// We lower cascaded CMOVs such as
//
// (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
//
// to two successive branches.
//
// Without this, we would add a PHI between the two jumps, which ends up
// creating a few copies all around. For instance, for
//
// (sitofp (zext (fcmp une)))
//
// we would generate:
//
// ucomiss %xmm1, %xmm0
// movss <1.0f>, %xmm0
// movaps %xmm0, %xmm1
// jne .LBB5_2
// xorps %xmm1, %xmm1
// .LBB5_2:
// jp .LBB5_4
// movaps %xmm1, %xmm0
// .LBB5_4:
// retq
//
// because this custom-inserter would have generated:
//
// A
// | \
// | B
// | /
// C
// | \
// | D
// | /
// E
//
// A: X = ...; Y = ...
// B: empty
// C: Z = PHI [X, A], [Y, B]
// D: empty
// E: PHI [X, C], [Z, D]
//
// If we lower both CMOVs in a single step, we can instead generate:
//
// A
// | \
// | C
// | /|
// |/ |
// | |
// | D
// | /
// E
//
// A: X = ...; Y = ...
// D: empty
// E: PHI [X, A], [X, C], [Y, D]
//
// Which, in our sitofp/fcmp example, gives us something like:
//
// ucomiss %xmm1, %xmm0
// movss <1.0f>, %xmm0
// jne .LBB5_4
// jp .LBB5_4
// xorps %xmm0, %xmm0
// .LBB5_4:
// retq
//
// In the code below, A is ThisMBB, C is FirstInsertedMBB, D is
// SecondInsertedMBB and E is SinkMBB.
//
// We lower cascaded CMOV into two successive branches to the same block.
// EFLAGS is used by both, so mark it as live in the second.
const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
MachineFunction *F = ThisMBB->getParent();
MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineFunction::iterator It = ++ThisMBB->getIterator();
F->insert(It, FirstInsertedMBB);
F->insert(It, SecondInsertedMBB);
F->insert(It, SinkMBB);
// For a cascaded CMOV, we lower it to two successive branches to
// the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
// the FirstInsertedMBB.
FirstInsertedMBB->addLiveIn(X86::EFLAGS);
// If the EFLAGS register isn't dead in the terminator, then claim that it's
// live into the sink and copy blocks.
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
!checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
SecondInsertedMBB->addLiveIn(X86::EFLAGS);
SinkMBB->addLiveIn(X86::EFLAGS);
}
// Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
SinkMBB->splice(SinkMBB->begin(), ThisMBB,
std::next(MachineBasicBlock::iterator(FirstCMOV)),
ThisMBB->end());
SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
// Fallthrough block for ThisMBB.
ThisMBB->addSuccessor(FirstInsertedMBB);
// The true block target of the first branch is always SinkMBB.
ThisMBB->addSuccessor(SinkMBB);
// Fallthrough block for FirstInsertedMBB.
FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
// The true block for the branch of FirstInsertedMBB.
FirstInsertedMBB->addSuccessor(SinkMBB);
// This is fallthrough.
SecondInsertedMBB->addSuccessor(SinkMBB);
// Create the conditional branch instructions.
X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
X86::CondCode SecondCC =
X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
// SinkMBB:
// %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ],
// [ %TrueValue, FirstInsertedMBB ]
unsigned DestReg = FirstCMOV.getOperand(0).getReg();
unsigned Op1Reg = FirstCMOV.getOperand(1).getReg();
unsigned Op2Reg = FirstCMOV.getOperand(2).getReg();
MachineInstrBuilder MIB =
BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
.addReg(Op1Reg)
.addMBB(SecondInsertedMBB)
.addReg(Op2Reg)
.addMBB(ThisMBB);
// FirstInsertedMBB provides the same incoming value as ThisMBB (the True
// operand of the SELECT_CC/CMOV nodes).
MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
// Copy the PHI result to the register defined by the second CMOV.
BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
TII->get(TargetOpcode::COPY),
SecondCascadedCMOV.getOperand(0).getReg())
.addReg(FirstCMOV.getOperand(0).getReg());
// Now remove the CMOVs.
FirstCMOV.eraseFromParent();
SecondCascadedCMOV.eraseFromParent();
return SinkMBB;
}
/// Expand the CMOV pseudo-instruction \p MI in \p ThisMBB into explicit
/// control flow: a conditional branch over a new fall-through block, with
/// PHIs at the join point. A run of consecutive CMOV pseudos on the same (or
/// the exactly opposite) condition is lowered together with a single branch;
/// the cascaded form CMOV((CMOV F, T, cc1), T, cc2) is delegated to
/// EmitLoweredCascadedSelect.
///
/// \returns the join block, which receives everything that followed the last
/// lowered CMOV in \p ThisMBB.
MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
MachineBasicBlock *ThisMBB) const {
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
// To "insert" a SELECT_CC instruction, we actually have to insert the
// diamond control-flow pattern. The incoming instruction knows the
// destination vreg to set, the condition code register to branch on, the
// true/false values to select between and a branch opcode to use.
// ThisMBB:
// ...
// TrueVal = ...
// cmpTY ccX, r1, r2
// bCC SinkMBB
// fallthrough --> FalseMBB
// This code lowers all pseudo-CMOV instructions. Generally it lowers these
// as described above, by inserting a BB, and then making a PHI at the join
// point to select the true and false operands of the CMOV in the PHI.
//
// The code also handles two different cases of multiple CMOV opcodes
// in a row.
//
// Case 1:
// In this case, there are multiple CMOVs in a row, all which are based on
// the same condition setting (or the exact opposite condition setting).
// In this case we can lower all the CMOVs using a single inserted BB, and
// then make a number of PHIs at the join point to model the CMOVs. The only
// trickiness here, is that in a case like:
//
// t2 = CMOV cond1 t1, f1
// t3 = CMOV cond1 t2, f2
//
// when rewriting this into PHIs, we have to perform some renaming on the
// temps since you cannot have a PHI operand refer to a PHI result earlier
// in the same block. The "simple" but wrong lowering would be:
//
// t2 = PHI t1(BB1), f1(BB2)
// t3 = PHI t2(BB1), f2(BB2)
//
// but clearly t2 is not defined in BB1, so that is incorrect. The proper
// renaming is to note that on the path through BB1, t2 is really just a
// copy of t1, and do that renaming, properly generating:
//
// t2 = PHI t1(BB1), f1(BB2)
// t3 = PHI t1(BB1), f2(BB2)
//
// Case 2:
// CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
// function - EmitLoweredCascadedSelect.
X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
MachineInstr *LastCMOV = &MI;
// The scan below starts at MI itself, so its first iteration accepts MI.
MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
// Check for case 1, where there are multiple CMOVs with the same condition
// first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
// number of jumps the most.
if (isCMOVPseudo(MI)) {
// See if we have a string of CMOVS with the same condition. Skip over
// intervening debug insts.
while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
(NextMIIt->getOperand(3).getImm() == CC ||
NextMIIt->getOperand(3).getImm() == OppCC)) {
LastCMOV = &*NextMIIt;
++NextMIIt;
NextMIIt = skipDebugInstructionsForward(NextMIIt, ThisMBB->end());
}
}
// This checks for case 2, but only do this if we didn't already find
// case 1, as indicated by LastCMOV == &MI.
// The candidate must have MI's opcode, share MI's operand 2 (the common T
// value), and consume (and kill) MI's result as its operand 1.
if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
NextMIIt->getOpcode() == MI.getOpcode() &&
NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
NextMIIt->getOperand(1).isKill()) {
return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
}
const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
MachineFunction *F = ThisMBB->getParent();
MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineFunction::iterator It = ++ThisMBB->getIterator();
F->insert(It, FalseMBB);
F->insert(It, SinkMBB);
// If the EFLAGS register isn't dead in the terminator, then claim that it's
// live into the sink and copy blocks.
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
if (!LastCMOV->killsRegister(X86::EFLAGS) &&
!checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
FalseMBB->addLiveIn(X86::EFLAGS);
SinkMBB->addLiveIn(X86::EFLAGS);
}
// Transfer any debug instructions inside the CMOV sequence to the sunk block.
auto DbgEnd = MachineBasicBlock::iterator(LastCMOV);
auto DbgIt = MachineBasicBlock::iterator(MI);
while (DbgIt != DbgEnd) {
// Fetch the successor first: removeFromParent() unlinks *DbgIt.
auto Next = std::next(DbgIt);
if (DbgIt->isDebugInstr())
SinkMBB->push_back(DbgIt->removeFromParent());
DbgIt = Next;
}
// Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
SinkMBB->splice(SinkMBB->end(), ThisMBB,
std::next(MachineBasicBlock::iterator(LastCMOV)),
ThisMBB->end());
SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
// Fallthrough block for ThisMBB.
ThisMBB->addSuccessor(FalseMBB);
// The true block target of the first (or only) branch is always a SinkMBB.
ThisMBB->addSuccessor(SinkMBB);
// Fallthrough block for FalseMBB.
FalseMBB->addSuccessor(SinkMBB);
// Create the conditional branch instruction.
BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
// SinkMBB:
// %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
// ...
MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
MachineBasicBlock::iterator MIItEnd =
std::next(MachineBasicBlock::iterator(LastCMOV));
// ThisMBB is passed as the "true" predecessor: the branch emitted above
// jumps from it straight to SinkMBB when CC holds.
createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
// Now remove the CMOV(s).
ThisMBB->erase(MIItBegin, MIItEnd);
return SinkMBB;
}
/// Expand the segmented-stack alloca pseudo \p MI into explicit control flow.
///
/// The original block \p BB is split after \p MI. BB ends with a comparison
/// against a limit read through a segment-relative memory operand and a
/// conditional branch; two new blocks produce the allocated pointer (one by
/// adjusting the stack pointer, one by calling
/// __morestack_allocate_stack_space) and both jump to a continuation block
/// that merges the two pointers with a PHI.
///
/// Operand 1 of \p MI is read as the requested size; operand 0 is the register
/// defined by the merging PHI. \p MI is erased.
///
/// \returns the continuation block holding the rest of the original \p BB.
MachineBasicBlock *
X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
assert(MF->shouldSplitStack());
const bool Is64Bit = Subtarget.is64Bit();
const bool IsLP64 = Subtarget.isTarget64BitLP64();
// Segment register and displacement used as the memory operand of the
// limit comparison emitted into BB below.
const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
// BB:
// ... [Till the alloca]
// If stacklet is not large enough, jump to mallocMBB
//
// bumpMBB:
// Allocate by subtracting from RSP
// Jump to continueMBB
//
// mallocMBB:
// Allocate by call to runtime
//
// continueMBB:
// ...
// [rest of original BB]
//
MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineRegisterInfo &MRI = MF->getRegInfo();
const TargetRegisterClass *AddrRegClass =
getRegClassFor(getPointerTy(MF->getDataLayout()));
// mallocPtrVReg / bumpSPPtrVReg: the pointer produced on each path (PHI inputs).
// tmpSPVReg: copy of the current stack pointer.
// SPLimitVReg: stack pointer minus the requested size.
unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
sizeVReg = MI.getOperand(1).getReg(),
physSPReg =
IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
// The new blocks are laid out right after BB in the order
// bumpMBB, mallocMBB, continueMBB.
MachineFunction::iterator MBBIter = ++BB->getIterator();
MF->insert(MBBIter, bumpMBB);
MF->insert(MBBIter, mallocMBB);
MF->insert(MBBIter, continueMBB);
// Everything after MI, together with BB's successor edges, moves to
// continueMBB.
continueMBB->splice(continueMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
continueMBB->transferSuccessorsAndUpdatePHIs(BB);
// Add code to the main basic block to check if the stack limit has been hit,
// and if so, jump to mallocMBB otherwise to bumpMBB.
BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
.addReg(tmpSPVReg).addReg(sizeVReg);
// Memory operand: no base, scale 1, no index, displacement TlsOffset,
// segment TlsReg; compared against SPLimitVReg.
BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
.addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
.addReg(SPLimitVReg);
BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
// bumpMBB simply decreases the stack pointer, since we know the current
// stacklet has enough space.
BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
.addReg(SPLimitVReg);
BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
.addReg(SPLimitVReg);
BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
// Calls into a routine in libgcc to allocate more space from the heap.
const uint32_t *RegMask =
Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
if (IsLP64) {
// LP64: size is passed in RDI, result is expected in RAX.
BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
.addReg(sizeVReg);
BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
.addExternalSymbol("__morestack_allocate_stack_space")
.addRegMask(RegMask)
.addReg(X86::RDI, RegState::Implicit)
.addReg(X86::RAX, RegState::ImplicitDefine);
} else if (Is64Bit) {
// 64-bit target that is not LP64: size is passed in EDI, result is
// expected in EAX.
BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
.addReg(sizeVReg);
BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
.addExternalSymbol("__morestack_allocate_stack_space")
.addRegMask(RegMask)
.addReg(X86::EDI, RegState::Implicit)
.addReg(X86::EAX, RegState::ImplicitDefine);
} else {
// 32-bit: the size is pushed on the stack. The stack pointer is first
// lowered by 12 so that, with the 4-byte push, it moves by 16 in total;
// the ADD32ri of 16 after the call undoes both adjustments.
BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
.addImm(12);
BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
.addExternalSymbol("__morestack_allocate_stack_space")
.addRegMask(RegMask)
.addReg(X86::EAX, RegState::ImplicitDefine);
}
// Only the 32-bit path adjusted the stack pointer around the call.
if (!Is64Bit)
BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
.addImm(16);
BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
.addReg(IsLP64 ? X86::RAX : X86::EAX);
BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
// Set up the CFG correctly.
BB->addSuccessor(bumpMBB);
BB->addSuccessor(mallocMBB);
mallocMBB->addSuccessor(continueMBB);
bumpMBB->addSuccessor(continueMBB);
// Take care of the PHI nodes.
// The pseudo's result register (operand 0) is defined by merging the pointer
// from each path.
BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
MI.getOperand(0).getReg())
.addReg(mallocPtrVReg)
.addMBB(mallocMBB)
.addReg(bumpSPPtrVReg)
.addMBB(bumpMBB);
// Delete the original pseudo instruction.
MI.eraseFromParent();
// And we're done.
return continueMBB;
}
/// Lower a catchret pseudo.
///
/// On non-32-bit subtargets nothing is changed. On 32-bit subtargets a new
/// block is placed right after \p BB; it takes over BB's successors, becomes
/// BB's only successor and the new destination of \p MI, and contains an
/// EH_RESTORE followed by a JMP_4 to the original destination.
///
/// \returns \p BB in all cases.
MachineBasicBlock *
X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
                                       MachineBasicBlock *BB) const {
  MachineFunction *MF = BB->getParent();

  assert(!isAsynchronousEHPersonality(
             classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
         "SEH does not use catchret!");

  // Only 32-bit EH has to restore stack pointers by hand.
  if (!Subtarget.is32Bit())
    return BB;

  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  // Remember where the catchret originally went before it is retargeted.
  MachineBasicBlock *OrigDestMBB = MI.getOperand(0).getMBB();

  // C++ EH: route the return through a fresh block that holds the restore
  // code and then jumps to the original destination.
  MachineBasicBlock *RestoreMBB =
      MF->CreateMachineBasicBlock(BB->getBasicBlock());
  assert(BB->succ_size() == 1);
  MF->insert(std::next(BB->getIterator()), RestoreMBB);

  // Rewire the CFG: BB -> RestoreMBB -> (BB's former successors).
  RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
  BB->addSuccessor(RestoreMBB);
  MI.getOperand(0).setMBB(RestoreMBB);

  // RestoreMBB is still empty, so appending yields EH_RESTORE; JMP_4.
  BuildMI(RestoreMBB, DL, TII.get(X86::EH_RESTORE));
  BuildMI(RestoreMBB, DL, TII.get(X86::JMP_4)).addMBB(OrigDestMBB);
  return BB;
}
/// Lower a catchpad pseudo.
///
/// When the function uses an asynchronous (SEH) personality and the subtarget
/// is 32-bit, an EH_RESTORE is inserted immediately before \p MI. The pseudo
/// itself is always erased.
///
/// \returns \p BB, which is never split.
MachineBasicBlock *
X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
                                       MachineBasicBlock *BB) const {
  MachineFunction *MF = BB->getParent();
  const Constant *Personality = MF->getFunction().getPersonalityFn();
  const bool UsesSEH =
      isAsynchronousEHPersonality(classifyEHPersonality(Personality));

  // Special handling is needed for 32-bit SEH only.
  const bool NeedsRestore = UsesSEH && Subtarget.is32Bit();
  if (NeedsRestore) {
    DebugLoc DL = MI.getDebugLoc();
    BuildMI(*BB, MI, DL, Subtarget.getInstrInfo()->get(X86::EH_RESTORE));
  }

  MI.eraseFromParent();
  return BB;
}
/// Bracket a TLSADDR pseudo with call-frame setup/destroy markers.
///
/// The resulting sequence is
///   adjust_stackdown -> TLSADDR -> adjust_stackup.
/// TLSADDR is only turned into a call inside MC, so without the two markers
/// shrink-wrapping may move the prologue/epilogue past it.
///
/// \p MI is deliberately left in place (not erased).
/// \returns \p BB.
MachineBasicBlock *
X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
                                      MachineBasicBlock *BB) const {
  MachineFunction &MF = *BB->getParent();
  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  // CALLSEQ_START goes directly in front of the instruction; all three of its
  // immediates are zero.
  MachineInstr *FrameSetup =
      BuildMI(MF, DL, TII.get(TII.getCallFrameSetupOpcode()))
          .addImm(0)
          .addImm(0)
          .addImm(0);
  BB->insert(MachineBasicBlock::iterator(MI), FrameSetup);

  // CALLSEQ_END goes directly behind it; both of its immediates are zero.
  // The original instruction is kept, so there is no eraseFromParent here.
  MachineInstr *FrameDestroy =
      BuildMI(MF, DL, TII.get(TII.getCallFrameDestroyOpcode()))
          .addImm(0)
          .addImm(0);
  BB->insertAfter(MachineBasicBlock::iterator(MI), FrameDestroy);

  return BB;
}
/// Lower the Darwin TLS call pseudo.
///
/// The value loaded from the relocation (operand 3 of \p MI, a global) is
/// placed in RDI on x86-64 or EAX on 32-bit, and an indirect call through
/// that register is emitted. The return value is then in the normal return
/// register (RAX / EAX). Both instructions are inserted before \p MI, which
/// is erased afterwards.
///
/// \returns \p BB.
MachineBasicBlock *
X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
                                      MachineBasicBlock *BB) const {
  assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
  assert(MI.getOperand(3).isGlobal() && "This should be a global");

  MachineFunction *F = BB->getParent();
  const X86InstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();
  const bool Is64Bit = Subtarget.is64Bit();

  // Register mask for the lowered call.
  // FIXME: The 32-bit calls have non-standard calling conventions. Use a
  // proper register mask.
  const uint32_t *RegMask;
  if (Is64Bit)
    RegMask = Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask();
  else
    RegMask =
        Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);

  // Emits "PtrReg = load [BaseReg + global]" followed by an indirect call
  // through [PtrReg] that implicitly defines RetReg.
  auto EmitLoadAndCall = [&](unsigned LoadOpc, unsigned CallOpc,
                             unsigned PtrReg, unsigned BaseReg,
                             unsigned RetReg) {
    BuildMI(*BB, MI, DL, TII->get(LoadOpc), PtrReg)
        .addReg(BaseReg)
        .addImm(0)
        .addReg(0)
        .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
                          MI.getOperand(3).getTargetFlags())
        .addReg(0);
    MachineInstrBuilder Call = BuildMI(*BB, MI, DL, TII->get(CallOpc));
    addDirectMem(Call, PtrReg);
    Call.addReg(RetReg, RegState::ImplicitDefine).addRegMask(RegMask);
  };

  if (Is64Bit) {
    // RIP-relative load into RDI.
    EmitLoadAndCall(X86::MOV64rm, X86::CALL64m, X86::RDI, X86::RIP, X86::RAX);
  } else {
    // 32-bit: address relative to the global base register under PIC,
    // no base register otherwise.
    const unsigned BaseReg =
        isPositionIndependent() ? TII->getGlobalBaseReg(F) : 0;
    EmitLoadAndCall(X86::MOV32rm, X86::CALL32m, X86::EAX, BaseReg, X86::EAX);
  }

  MI.eraseFromParent(); // The pseudo instruction is gone now.
  return BB;
}
/// Map a RETPOLINE_* pseudo opcode to the direct call / tail-call opcode that
/// replaces it. Any other opcode is a programming error.
static unsigned getOpcodeForRetpoline(unsigned RPOpc) {
  if (RPOpc == X86::RETPOLINE_CALL32)
    return X86::CALLpcrel32;
  if (RPOpc == X86::RETPOLINE_CALL64)
    return X86::CALL64pcrel32;
  if (RPOpc == X86::RETPOLINE_TCRETURN32)
    return X86::TCRETURNdi;
  if (RPOpc == X86::RETPOLINE_TCRETURN64)
    return X86::TCRETURNdi64;
  llvm_unreachable("not retpoline opcode");
}
/// Return the name of the retpoline thunk to call for scratch register
/// \p Reg.
///
/// Two naming schemes exist:
///  - External thunks (Subtarget.useRetpolineExternalThunk()): the
///    "__x86_indirect_thunk_<reg>" names, which match the names GCC happens
///    to use. This simplifies thunk implementations for kernels that cannot
///    easily create aliases and that configure the thunk body in non-trivial
///    ways (for example, the Linux kernel hot-patches thunk bodies at boot
///    and cannot easily export aliases of them to loaded modules).
///  - Internal COMDAT thunks: the LLVM-specific "__llvm_retpoline_<reg>"
///    names.
///
/// Note that the retpoline implementation, and with it the name of the called
/// thunk, may have to change at any point in the future. There is no hard
/// guarantee that LLVM calls specific thunks; matching the external names is
/// only a best-effort attempt to help kernels and other systems where
/// duplicating the thunks is costly.
static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
                                      unsigned Reg) {
  const bool UseExternalThunk = Subtarget.useRetpolineExternalThunk();

  switch (Reg) {
  case X86::EAX:
    assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
    return UseExternalThunk ? "__x86_indirect_thunk_eax"
                            : "__llvm_retpoline_eax";
  case X86::ECX:
    assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
    return UseExternalThunk ? "__x86_indirect_thunk_ecx"
                            : "__llvm_retpoline_ecx";
  case X86::EDX:
    assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
    return UseExternalThunk ? "__x86_indirect_thunk_edx"
                            : "__llvm_retpoline_edx";
  case X86::EDI:
    assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
    return UseExternalThunk ? "__x86_indirect_thunk_edi"
                            : "__llvm_retpoline_edi";
  case X86::R11:
    assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
    return UseExternalThunk ? "__x86_indirect_thunk_r11"
                            : "__llvm_retpoline_r11";
  }
  llvm_unreachable("unexpected reg for retpoline");
}
/// Lower a RETPOLINE_* pseudo into a direct call (or tail call) to a
/// retpoline thunk.
///
/// The callee, held in the virtual register that is operand 0 of \p MI, is
/// copied into a scratch physical register, operand 0 is replaced by the
/// external symbol of the thunk for that register, the opcode is switched to
/// the matching direct call/tail-call opcode, and the scratch register is
/// added as an implicit killed use. \p MI is rewritten in place, not erased.
///
/// Aborts with report_fatal_error if every candidate scratch register is
/// already a use operand of the call.
///
/// \returns \p BB.
MachineBasicBlock *
X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
                                        MachineBasicBlock *BB) const {
  DebugLoc DL = MI.getDebugLoc();
  const X86InstrInfo *TII = Subtarget.getInstrInfo();
  unsigned CalleeVReg = MI.getOperand(0).getReg();
  unsigned Opc = getOpcodeForRetpoline(MI.getOpcode());

  // Find an available scratch register to hold the callee. On 64-bit, we can
  // just use R11, but we scan for uses anyway to ensure we don't generate
  // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
  // already a register use operand to the call to hold the callee. If none
  // are available, use EDI instead. EDI is chosen because EBX is the PIC base
  // register and ESI is the base pointer to realigned stack frames with VLAs.
  //
  // Inline capacity is 4 so the 32-bit candidate list fits without spilling
  // to the heap.
  SmallVector<unsigned, 4> AvailableRegs;
  if (Subtarget.is64Bit())
    AvailableRegs.push_back(X86::R11);
  else
    AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});

  // Zero out any registers that are already used.
  for (const auto &MO : MI.operands()) {
    if (MO.isReg() && MO.isUse())
      for (unsigned &Reg : AvailableRegs)
        if (Reg == MO.getReg())
          Reg = 0;
  }

  // Choose the first remaining non-zero available register; the list order
  // above is the preference order.
  unsigned AvailableReg = 0;
  for (unsigned MaybeReg : AvailableRegs) {
    if (MaybeReg) {
      AvailableReg = MaybeReg;
      break;
    }
  }
  if (!AvailableReg)
    report_fatal_error("calling convention incompatible with retpoline, no "
                       "available registers");

  const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg);

  // Copy the callee into the scratch register right before the call.
  BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
      .addReg(CalleeVReg);

  // Turn the pseudo into a direct call to the thunk and record that the call
  // reads (and kills) the scratch register.
  MI.getOperand(0).ChangeToES(Symbol);
  MI.setDesc(TII->get(Opc));
  MachineInstrBuilder(*BB->getParent(), &MI)
      .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
  return BB;
}
/// Save the shadow stack pointer as part of a SetJmp expansion.
///
/// SetJmp implies a future change of control flow when the matching LongJmp
/// is executed. Rather than returning, the long jump repairs the stack and
/// performs an indirect branch using the registers that SetJmp stored in the
/// jump buffer. When the shadow stack is enabled it has to be repaired as
/// well, because some return addresses will be skipped. This function stores
/// the current SSP into the jump buffer so that emitLongJmpShadowStackFix can
/// perform that repair later.
///
/// All instructions are inserted before \p MI; \p MI itself is not modified.
///
/// \sa emitLongJmpShadowStackFix
/// \param [in] MI The temporary Machine Instruction for the builtin.
/// \param [in] MBB The Machine Basic Block that will be modified.
void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
                                                 MachineBasicBlock *MBB) const {
  DebugLoc DL = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  // The store emitted below reuses the builtin's memory references.
  SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
                                           MI.memoperands_end());

  MVT PVT = getPointerTy(MF->getDataLayout());
  const bool Is64BitPtr = (PVT == MVT::i64);
  const TargetRegisterClass *PtrRC = getRegClassFor(PVT);

  // Materialize zero with an XOR of an undefined register with itself.
  unsigned ZeroReg = MRI.createVirtualRegister(PtrRC);
  BuildMI(*MBB, MI, DL, TII->get(Is64BitPtr ? X86::XOR64rr : X86::XOR32rr))
      .addDef(ZeroReg)
      .addReg(ZeroReg, RegState::Undef)
      .addReg(ZeroReg, RegState::Undef);

  // Read the current SSP, using the zeroed register as the input operand.
  unsigned CurSSPReg = MRI.createVirtualRegister(PtrRC);
  BuildMI(*MBB, MI, DL, TII->get(Is64BitPtr ? X86::RDSSPQ : X86::RDSSPD),
          CurSSPReg)
      .addReg(ZeroReg);

  // Store the SSP into slot 3 of the jump buffer. The buffer address is the
  // group of X86::AddrNumOperands operands of MI starting at operand 1; only
  // the displacement is adjusted.
  const unsigned MemOpndSlot = 1;
  const int64_t SSPOffset = 3 * PVT.getStoreSize();
  MachineInstrBuilder Store =
      BuildMI(*MBB, MI, DL, TII->get(Is64BitPtr ? X86::MOV64mr : X86::MOV32mr));
  for (unsigned AddrIdx = 0; AddrIdx < X86::AddrNumOperands; ++AddrIdx) {
    const auto &AddrOp = MI.getOperand(MemOpndSlot + AddrIdx);
    if (AddrIdx == X86::AddrDisp)
      Store.addDisp(AddrOp, SSPOffset);
    else
      Store.add(AddrOp);
  }
  Store.addReg(CurSSPReg);
  Store.setMemRefs(MMOs);
}
/// Expand the EH SjLj setjmp pseudo \p MI.
///
/// Operand 0 of \p MI is the result register; the X86::AddrNumOperands
/// operands that follow it are used as the address of the jump buffer. The
/// expansion stores the address of a newly created restore block into the
/// buffer (at one pointer-size past its start), emits EH_SjLj_Setup, and
/// splits \p MBB so that the result is 0 on the path through mainMBB and 1 on
/// the path through restoreMBB, merged by a PHI in sinkMBB. If the module has
/// the "cf-protection-return" flag, the shadow stack pointer is saved as well.
/// \p MI is erased.
///
/// \returns sinkMBB, which holds the remainder of the original \p MBB.
MachineBasicBlock *
X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
const BasicBlock *BB = MBB->getBasicBlock();
MachineFunction::iterator I = ++MBB->getIterator();
// Memory Reference
SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
MI.memoperands_end());
unsigned DstReg;
unsigned MemOpndSlot = 0;
unsigned CurOp = 0;
DstReg = MI.getOperand(CurOp++).getReg();
const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
// TRI is only referenced by the assert above.
(void)TRI;
// One result register per incoming path of the PHI in sinkMBB.
unsigned mainDstReg = MRI.createVirtualRegister(RC);
unsigned restoreDstReg = MRI.createVirtualRegister(RC);
// The buffer address operands start right after the result operand.
MemOpndSlot = CurOp;
MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) &&
"Invalid Pointer Size!");
// For v = setjmp(buf), we generate
//
// thisMBB:
// buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
// SjLjSetup restoreMBB
//
// mainMBB:
// v_main = 0
//
// sinkMBB:
// v = phi(main, restore)
//
// restoreMBB:
// if base pointer being used, load it from frame
// v_restore = 1
MachineBasicBlock *thisMBB = MBB;
MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
// mainMBB and sinkMBB directly follow MBB; restoreMBB is appended at the end
// of the function and marked address-taken because its address is stored
// into the buffer below.
MF->insert(I, mainMBB);
MF->insert(I, sinkMBB);
MF->push_back(restoreMBB);
restoreMBB->setHasAddressTaken();
MachineInstrBuilder MIB;
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), MBB,
std::next(MachineBasicBlock::iterator(MI)), MBB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
// thisMBB:
unsigned PtrStoreOpc = 0;
unsigned LabelReg = 0;
const int64_t LabelOffset = 1 * PVT.getStoreSize();
// With the small code model and non-PIC code the block address is stored as
// an immediate operand; otherwise it is first computed into LabelReg.
bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
!isPositionIndependent();
// Prepare IP either in reg or imm.
if (!UseImmLabel) {
PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
LabelReg = MRI.createVirtualRegister(PtrRC);
if (Subtarget.is64Bit()) {
// RIP-relative LEA of restoreMBB.
MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
.addReg(X86::RIP)
.addImm(0)
.addReg(0)
.addMBB(restoreMBB)
.addReg(0);
} else {
// LEA of restoreMBB relative to the global base register.
const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
.addReg(XII->getGlobalBaseReg(MF))
.addImm(0)
.addReg(0)
.addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
.addReg(0);
}
} else
PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
// Store IP
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
// Reuse the buffer address operands of MI, bumping only the displacement by
// LabelOffset.
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
if (i == X86::AddrDisp)
MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
else
MIB.add(MI.getOperand(MemOpndSlot + i));
}
if (!UseImmLabel)
MIB.addReg(LabelReg);
else
MIB.addMBB(restoreMBB);
MIB.setMemRefs(MMOs);
// When the module carries the "cf-protection-return" flag, also save the
// shadow stack pointer into the buffer.
if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
emitSetJmpShadowStackFix(MI, thisMBB);
}
// Setup
MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
.addMBB(restoreMBB);
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
MIB.addRegMask(RegInfo->getNoPreservedMask());
thisMBB->addSuccessor(mainMBB);
thisMBB->addSuccessor(restoreMBB);
// mainMBB:
// v_main = 0 (MOV32r0 into the virtual register mainDstReg)
BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
mainMBB->addSuccessor(sinkMBB);
// sinkMBB:
BuildMI(*sinkMBB, sinkMBB->begin(), DL,
TII->get(X86::PHI), DstReg)
.addReg(mainDstReg).addMBB(mainMBB)
.addReg(restoreDstReg).addMBB(restoreMBB);
// restoreMBB:
if (RegInfo->hasBasePointer(*MF)) {
// Reload the base pointer from the frame: a load into BasePtr addressed
// off FramePtr at the restore-base-pointer offset, flagged FrameSetup.
const bool Uses64BitFramePtr =
Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
X86FI->setRestoreBasePointer(MF);
unsigned FramePtr = RegInfo->getFrameRegister(*MF);
unsigned BasePtr = RegInfo->getBaseRegister();
unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
FramePtr, true, X86FI->getRestoreBasePointerOffset())
.setMIFlag(MachineInstr::FrameSetup);
}
// v_restore = 1, then jump to sinkMBB.
BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
restoreMBB->addSuccessor(sinkMBB);
MI.eraseFromParent();
return sinkMBB;
}
/// Fix the shadow stack using the previously saved SSP pointer.
///
/// \p MBB is split at \p MI: \p MI and everything after it move to a new sink
/// block, and five new blocks are placed between \p MBB and the sink to read
/// the current SSP, compare it with the value saved in the jump buffer, and
/// advance the SSP with incssp (in a loop when needed). \p MI is neither
/// erased nor modified here; it is only read for its address operands and
/// memory references.
///
/// \sa emitSetJmpShadowStackFix
/// \param [in] MI The temporary Machine Instruction for the builtin.
/// \param [in] MBB The Machine Basic Block that will be modified.
/// \return The sink MBB that will perform the future indirect branch.
MachineBasicBlock *
X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
// Memory Reference
SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
MI.memoperands_end());
MVT PVT = getPointerTy(MF->getDataLayout());
const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
// checkSspMBB:
// xor vreg1, vreg1
// rdssp vreg1
// test vreg1, vreg1
// je sinkMBB # Jump if Shadow Stack is not supported
// fallMBB:
// mov buf+24/12(%rip), vreg2
// sub vreg1, vreg2
// jbe sinkMBB # No need to fix the Shadow Stack
// fixShadowMBB:
// shr 3/2, vreg2
// incssp vreg2 # fix the SSP according to the lower 8 bits
// shr 8, vreg2
// je sinkMBB
// fixShadowLoopPrepareMBB:
// shl vreg2
// mov 128, vreg3
// fixShadowLoopMBB:
// incssp vreg3
// dec vreg2
// jne fixShadowLoopMBB # Iterate until you finish fixing
// # the Shadow Stack
// sinkMBB:
MachineFunction::iterator I = ++MBB->getIterator();
const BasicBlock *BB = MBB->getBasicBlock();
MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
// All six blocks are placed directly after MBB, in this order.
MF->insert(I, checkSspMBB);
MF->insert(I, fallMBB);
MF->insert(I, fixShadowMBB);
MF->insert(I, fixShadowLoopPrepareMBB);
MF->insert(I, fixShadowLoopMBB);
MF->insert(I, sinkMBB);
// Transfer the remainder of BB and its successor edges to sinkMBB.
// The spliced range starts at MI itself, so MI ends up in sinkMBB too.
sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
MBB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
// MBB's only successor is now checkSspMBB; no branch is emitted for it.
MBB->addSuccessor(checkSspMBB);
// Initialize a register with zero.
unsigned ZReg = MRI.createVirtualRegister(PtrRC);
unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
BuildMI(checkSspMBB, DL, TII->get(XorRROpc))
.addDef(ZReg)
.addReg(ZReg, RegState::Undef)
.addReg(ZReg, RegState::Undef);
// Read the current SSP Register value to the zeroed register.
unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC);
unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
// Check whether the result of the SSP register is zero and jump directly
// to the sink.
unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
.addReg(SSPCopyReg)
.addReg(SSPCopyReg);
BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
checkSspMBB->addSuccessor(sinkMBB);
checkSspMBB->addSuccessor(fallMBB);
// Reload the previously saved SSP register value.
unsigned PrevSSPReg = MRI.createVirtualRegister(PtrRC);
unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
// Slot 3 of the buffer: the same 3 * pointer-store-size displacement that
// emitSetJmpShadowStackFix stores the SSP to.
const int64_t SPPOffset = 3 * PVT.getStoreSize();
MachineInstrBuilder MIB =
BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
// The buffer address is taken from the first X86::AddrNumOperands operands
// of MI; only the displacement is adjusted.
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
const MachineOperand &MO = MI.getOperand(i);
if (i == X86::AddrDisp)
MIB.addDisp(MO, SPPOffset);
else if (MO.isReg()) // Don't add the whole operand, we don't want to
// preserve kill flags.
MIB.addReg(MO.getReg());
else
MIB.add(MO);
}
MIB.setMemRefs(MMOs);
// Subtract the current SSP from the previous SSP.
unsigned SspSubReg = MRI.createVirtualRegister(PtrRC);
unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
.addReg(PrevSSPReg)
.addReg(SSPCopyReg);
// Jump to sink in case PrevSSPReg <= SSPCopyReg.
// The branch immediately follows the SUB, so it tests the SUB's flags.
BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
fallMBB->addSuccessor(sinkMBB);
fallMBB->addSuccessor(fixShadowMBB);
// Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
unsigned SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
.addReg(SspSubReg)
.addImm(Offset);
// Increase SSP when looking only on the lower 8 bits of the delta.
unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
// Reset the lower 8 bits.
unsigned SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
.addReg(SspFirstShrReg)
.addImm(8);
// Jump if the result of the shift is zero.
BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
fixShadowMBB->addSuccessor(sinkMBB);
fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
// Do a single shift left.
// The remaining count is doubled here because each loop iteration below
// advances by 128 only.
unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
unsigned SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
.addReg(SspSecondShrReg);
// Save the value 128 to a register (will be used next with incssp).
unsigned Value128InReg = MRI.createVirtualRegister(PtrRC);
unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
.addImm(128);
// No terminator is emitted for the prepare block; the loop block is its
// single successor.
fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
// Since incssp only looks at the lower 8 bits, we might need to do several
// iterations of incssp until we finish fixing the shadow stack.
unsigned DecReg = MRI.createVirtualRegister(PtrRC);
unsigned CounterReg = MRI.createVirtualRegister(PtrRC);
// Loop counter: the shifted count on entry from the prepare block, the
// decremented value on the back edge.
BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
.addReg(SspAfterShlReg)
.addMBB(fixShadowLoopPrepareMBB)
.addReg(DecReg)
.addMBB(fixShadowLoopMBB);
// Every iteration we increase the SSP by 128.
BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
// Every iteration we decrement the counter by 1.
unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
// Jump if the counter is not zero yet.
BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
fixShadowLoopMBB->addSuccessor(sinkMBB);
fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
return sinkMBB;
}
/// Expand the EH SjLj longjmp pseudo \p MI.
///
/// The first X86::AddrNumOperands operands of \p MI address the jump buffer.
/// From it the frame pointer (displacement unchanged), the target address
/// (one pointer size in) and the stack pointer (two pointer sizes in) are
/// reloaded, followed by an indirect jump to the target address. When the
/// module carries the "cf-protection-return" flag the shadow stack is fixed
/// up first, which splits the block. \p MI is erased.
///
/// \returns the block that the expansion was emitted into.
MachineBasicBlock *
X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
                                     MachineBasicBlock *MBB) const {
  DebugLoc DL = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  // Every reload below carries the builtin's memory references.
  SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
                                           MI.memoperands_end());

  MVT PVT = getPointerTy(MF->getDataLayout());
  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
         "Invalid Pointer Size!");
  const bool Is64BitPtr = (PVT == MVT::i64);

  const TargetRegisterClass *RC =
      Is64BitPtr ? &X86::GR64RegClass : &X86::GR32RegClass;
  unsigned JumpTargetReg = MRI.createVirtualRegister(RC);

  // Since FP is only updated here but NOT referenced, it's treated as GPR.
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  unsigned FP = Is64BitPtr ? X86::RBP : X86::EBP;
  unsigned SP = RegInfo->getStackRegister();

  const int64_t LabelOffset = 1 * PVT.getStoreSize();
  const int64_t SPOffset = 2 * PVT.getStoreSize();

  unsigned PtrLoadOpc = Is64BitPtr ? X86::MOV64rm : X86::MOV32rm;
  unsigned IJmpOpc = Is64BitPtr ? X86::JMP64r : X86::JMP32r;

  // When CET and shadow stack is enabled, we need to fix the Shadow Stack.
  // The fix returns the block in which the expansion has to continue.
  MachineBasicBlock *EmitMBB = MBB;
  if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return"))
    EmitMBB = emitLongJmpShadowStackFix(MI, EmitMBB);

  // Appends an address operand while dropping any kill flag: register
  // operands are re-added by number only, everything else is copied as is.
  auto AddWithoutKill = [](const MachineInstrBuilder &Builder,
                           const MachineOperand &MO) {
    if (MO.isReg())
      Builder.addReg(MO.getReg());
    else
      Builder.add(MO);
  };

  // Reload FP
  MachineInstrBuilder LoadFP =
      BuildMI(*EmitMBB, MI, DL, TII->get(PtrLoadOpc), FP);
  for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
    AddWithoutKill(LoadFP, MI.getOperand(Idx));
  LoadFP.setMemRefs(MMOs);

  // Reload IP
  MachineInstrBuilder LoadIP =
      BuildMI(*EmitMBB, MI, DL, TII->get(PtrLoadOpc), JumpTargetReg);
  for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx) {
    const MachineOperand &MO = MI.getOperand(Idx);
    if (Idx == X86::AddrDisp)
      LoadIP.addDisp(MO, LabelOffset);
    else
      AddWithoutKill(LoadIP, MO);
  }
  LoadIP.setMemRefs(MMOs);

  // Reload SP
  MachineInstrBuilder LoadSP =
      BuildMI(*EmitMBB, MI, DL, TII->get(PtrLoadOpc), SP);
  for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx) {
    const MachineOperand &MO = MI.getOperand(Idx);
    if (Idx == X86::AddrDisp)
      LoadSP.addDisp(MO, SPOffset);
    else
      LoadSP.add(MO); // We can preserve the kill flags here, it's
                      // the last instruction of the expansion.
  }
  LoadSP.setMemRefs(MMOs);

  // Jump
  BuildMI(*EmitMBB, MI, DL, TII->get(IJmpOpc)).addReg(JumpTargetReg);

  MI.eraseFromParent();
  return EmitMBB;
}
/// Store the address of \p DispatchBB into the frame object \p FI.
///
/// The store is inserted before \p MI in \p MBB and targets frame index
/// \p FI at offset 56 (64-bit subtarget) or 36 (otherwise). With the small
/// code model and non-PIC code the block address is stored as an immediate;
/// in every other configuration it is first computed with an LEA into a new
/// virtual register.
void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
                                               MachineBasicBlock *MBB,
                                               MachineBasicBlock *DispatchBB,
                                               int FI) const {
  DebugLoc DL = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
  const X86InstrInfo *TII = Subtarget.getInstrInfo();

  MVT PVT = getPointerTy(MF->getDataLayout());
  assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
  const bool Is64BitPtr = (PVT == MVT::i64);

  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
                     !isPositionIndependent();

  unsigned StoreOpc = 0;
  unsigned AddrReg = 0;
  if (!UseImmLabel) {
    // Materialize the dispatch block's address in a register first.
    const TargetRegisterClass *TRC =
        Is64BitPtr ? &X86::GR64RegClass : &X86::GR32RegClass;
    AddrReg = MRI->createVirtualRegister(TRC);
    StoreOpc = Is64BitPtr ? X86::MOV64mr : X86::MOV32mr;

    if (Subtarget.is64Bit()) {
      // RIP-relative LEA.
      BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), AddrReg)
          .addReg(X86::RIP)
          .addImm(1)
          .addReg(0)
          .addMBB(DispatchBB)
          .addReg(0);
    } else {
      // No base register is used here.
      BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), AddrReg)
          .addReg(0) /* TII->getGlobalBaseReg(MF) */
          .addImm(1)
          .addReg(0)
          .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
          .addReg(0);
    }
  } else {
    // The block address can be stored directly as an immediate.
    StoreOpc = Is64BitPtr ? X86::MOV64mi32 : X86::MOV32mi;
  }

  // Emit the store into the frame object.
  MachineInstrBuilder Store = BuildMI(*MBB, MI, DL, TII->get(StoreOpc));
  addFrameReference(Store, FI, Subtarget.is64Bit() ? 56 : 36);
  if (UseImmLabel)
    Store.addMBB(DispatchBB);
  else
    Store.addReg(AddrReg);
}
// Expand the Int_eh_sjlj_setup_dispatch pseudo MI (see the dispatch in
// EmitInstrWithCustomInserter). This:
//  - collects the landing pads of the function, ordered by call site number,
//    into a jump table;
//  - appends three new blocks to the function: a dispatch block (marked as an
//    EH pad), a continuation block that jumps through the table, and a trap
//    block for out-of-range indices;
//  - stores the dispatch block address into the function context via
//    SetupEntryBlockForSjLj;
//  - replaces the EH-pad successors of every invoke block with the dispatch
//    block and clears the EH-pad flag on the former landing pads.
// MI is erased and BB is returned.
MachineBasicBlock *
X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
MachineBasicBlock *BB) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = BB->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
int FI = MF->getFrameInfo().getFunctionContextIndex();
// Get a mapping of the call site numbers to all of the landing pads they're
// associated with.
DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
unsigned MaxCSNum = 0;
for (auto &MBB : *MF) {
if (!MBB.isEHPad())
continue;
// The landing pad symbol is taken from the first non-debug instruction of
// the block, which is asserted to be an EH_LABEL.
MCSymbol *Sym = nullptr;
for (const auto &MI : MBB) {
if (MI.isDebugInstr())
continue;
assert(MI.isEHLabel() && "expected EH_LABEL");
Sym = MI.getOperand(0).getMCSymbol();
break;
}
if (!MF->hasCallSiteLandingPad(Sym))
continue;
for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
CallSiteNumToLPad[CSI].push_back(&MBB);
MaxCSNum = std::max(MaxCSNum, CSI);
}
}
// Get an ordered list of the machine basic blocks for the jump table.
std::vector<MachineBasicBlock *> LPadList;
SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
LPadList.reserve(CallSiteNumToLPad.size());
// Call site numbers are walked from 1 up to the largest one seen, so the
// table order follows the call site numbering. Every predecessor of a listed
// landing pad is recorded as an invoke block.
for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
for (auto &LP : CallSiteNumToLPad[CSI]) {
LPadList.push_back(LP);
InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
}
}
assert(!LPadList.empty() &&
"No landing pad destinations for the dispatch jump table!");
// Create the MBBs for the dispatch code.
// Shove the dispatch's address into the return slot in the function context.
MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
DispatchBB->setIsEHPad(true);
MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
BuildMI(TrapBB, DL, TII->get(X86::TRAP));
DispatchBB->addSuccessor(TrapBB);
MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
DispatchBB->addSuccessor(DispContBB);
// Insert MBBs.
MF->push_back(DispatchBB);
MF->push_back(DispContBB);
MF->push_back(TrapBB);
// Insert code into the entry block that creates and registers the function
// context.
SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
// Create the jump table and associated information
unsigned JTE = getJumpTableEncoding();
MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
unsigned MJTI = JTI->createJumpTableIndex(LPadList);
const X86RegisterInfo &RI = TII->getRegisterInfo();
// Add a register mask with no preserved registers. This results in all
// registers being marked as clobbered.
// When the function has a base pointer, the mask is attached to a load that
// reloads the base register from the frame register at the restore offset;
// otherwise it is attached to a NOOP.
if (RI.hasBasePointer(*MF)) {
const bool FPIs64Bit =
Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
MFI->setRestoreBasePointer(MF);
unsigned FP = RI.getFrameRegister(*MF);
unsigned BP = RI.getBaseRegister();
unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
MFI->getRestoreBasePointerOffset())
.addRegMask(RI.getNoPreservedMask());
} else {
BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
.addRegMask(RI.getNoPreservedMask());
}
// IReg is used as an index in a memory operand and therefore can't be SP
unsigned IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
// Load the 32-bit index from the function context (offset 8 on 64-bit
// subtargets, 4 otherwise), compare it against the number of jump table
// entries, and branch to the trap block on above-or-equal.
addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
Subtarget.is64Bit() ? 8 : 4);
BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
.addReg(IReg)
.addImm(LPadList.size());
BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
// The indirect jump itself is emitted into the continuation block; its form
// depends on the subtarget width and, for 64-bit, on the jump table encoding.
if (Subtarget.is64Bit()) {
unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
unsigned IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
// leaq .LJTI0_0(%rip), BReg
BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
.addReg(X86::RIP)
.addImm(1)
.addReg(0)
.addJumpTableIndex(MJTI)
.addReg(0);
// movzx IReg64, IReg
BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
.addImm(0)
.addReg(IReg)
.addImm(X86::sub_32bit);
switch (JTE) {
case MachineJumpTableInfo::EK_BlockAddress:
// jmpq *(BReg,IReg64,8)
BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
.addReg(BReg)
.addImm(8)
.addReg(IReg64)
.addImm(0)
.addReg(0);
break;
case MachineJumpTableInfo::EK_LabelDifference32: {
// The 32-bit table entry is loaded, sign-extended to 64 bits and added to
// the table base to form the jump target.
unsigned OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
unsigned OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
unsigned TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
// movl (BReg,IReg64,4), OReg
BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
.addReg(BReg)
.addImm(4)
.addReg(IReg64)
.addImm(0)
.addReg(0);
// movsx OReg64, OReg
BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
// addq BReg, OReg64, TReg
BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
.addReg(OReg64)
.addReg(BReg);
// jmpq *TReg
BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
break;
}
default:
llvm_unreachable("Unexpected jump table encoding");
}
} else {
// jmpl *.LJTI0_0(,IReg,4)
BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
.addReg(0)
.addImm(4)
.addReg(IReg)
.addJumpTableIndex(MJTI)
.addReg(0);
}
// Add the jump table entries as successors to the MBB.
// SeenMBBs ensures each landing pad is added as a successor only once even
// if it appears several times in the list.
SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
for (auto &LP : LPadList)
if (SeenMBBs.insert(LP).second)
DispContBB->addSuccessor(LP);
// N.B. the order the invoke BBs are processed in doesn't matter here.
SmallVector<MachineBasicBlock *, 64> MBBLPads;
const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
for (MachineBasicBlock *MBB : InvokeBBs) {
// Remove the landing pad successor from the invoke block and replace it
// with the new dispatch block.
// Keep a copy of Successors since it's modified inside the loop.
SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
MBB->succ_rend());
// FIXME: Avoid quadratic complexity.
for (auto MBBS : Successors) {
if (MBBS->isEHPad()) {
MBB->removeSuccessor(MBBS);
MBBLPads.push_back(MBBS);
}
}
MBB->addSuccessor(DispatchBB);
// Find the invoke call and mark all of the callee-saved registers as
// 'implicit defined' so that they're spilled. This prevents code from
// moving instructions to before the EH block, where they will never be
// executed.
// Only the last call in the block is touched (reverse walk, break after the
// first hit); registers already appearing as operands of the call are
// skipped. SavedRegs is walked until its zero terminator.
for (auto &II : reverse(*MBB)) {
if (!II.isCall())
continue;
DenseMap<unsigned, bool> DefRegs;
for (auto &MOp : II.operands())
if (MOp.isReg())
DefRegs[MOp.getReg()] = true;
MachineInstrBuilder MIB(*MF, &II);
for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
unsigned Reg = SavedRegs[RI];
if (!DefRegs[Reg])
MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
}
break;
}
}
// Mark all former landing pads as non-landing pads. The dispatch is the only
// landing pad now.
for (auto &LP : MBBLPads)
LP->setIsEHPad(false);
// The instruction is gone now.
MI.eraseFromParent();
return BB;
}
// Expand the pseudo instruction MI, located in BB, according to its opcode.
// Most opcodes are forwarded to a dedicated helper and that helper's result is
// returned. The flag read/write pseudos, the FP-to-integer-in-memory pseudos
// and the CMPXCHG8B/16B pseudos are handled inline and return BB. An opcode
// not listed here hits llvm_unreachable.
MachineBasicBlock *
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
switch (MI.getOpcode()) {
default: llvm_unreachable("Unexpected instr type to insert");
case X86::TLS_addr32:
case X86::TLS_addr64:
case X86::TLS_base_addr32:
case X86::TLS_base_addr64:
return EmitLoweredTLSAddr(MI, BB);
case X86::RETPOLINE_CALL32:
case X86::RETPOLINE_CALL64:
case X86::RETPOLINE_TCRETURN32:
case X86::RETPOLINE_TCRETURN64:
return EmitLoweredRetpoline(MI, BB);
case X86::CATCHRET:
return EmitLoweredCatchRet(MI, BB);
case X86::CATCHPAD:
return EmitLoweredCatchPad(MI, BB);
case X86::SEG_ALLOCA_32:
case X86::SEG_ALLOCA_64:
return EmitLoweredSegAlloca(MI, BB);
case X86::TLSCall_32:
case X86::TLSCall_64:
return EmitLoweredTLSCall(MI, BB);
case X86::CMOV_FR32:
case X86::CMOV_FR32X:
case X86::CMOV_FR64:
case X86::CMOV_FR64X:
case X86::CMOV_GR8:
case X86::CMOV_GR16:
case X86::CMOV_GR32:
case X86::CMOV_RFP32:
case X86::CMOV_RFP64:
case X86::CMOV_RFP80:
case X86::CMOV_VR128:
case X86::CMOV_VR128X:
case X86::CMOV_VR256:
case X86::CMOV_VR256X:
case X86::CMOV_VR512:
case X86::CMOV_VK2:
case X86::CMOV_VK4:
case X86::CMOV_VK8:
case X86::CMOV_VK16:
case X86::CMOV_VK32:
case X86::CMOV_VK64:
return EmitLoweredSelect(MI, BB);
case X86::RDFLAGS32:
case X86::RDFLAGS64: {
// Reading the flags is expanded to a PUSHF followed by a POP into the
// destination register of the pseudo.
unsigned PushF =
MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
// Permit reads of the EFLAGS and DF registers without them being defined.
// This intrinsic exists to read external processor state in flags, such as
// the trap flag, interrupt flag, and direction flag, none of which are
// modeled by the backend.
assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
"Unexpected register in operand!");
Push->getOperand(2).setIsUndef();
assert(Push->getOperand(3).getReg() == X86::DF &&
"Unexpected register in operand!");
Push->getOperand(3).setIsUndef();
BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
case X86::WRFLAGS32:
case X86::WRFLAGS64: {
// Writing the flags is expanded to a PUSH of the source register followed
// by a POPF.
unsigned Push =
MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
unsigned PopF =
MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
BuildMI(*BB, MI, DL, TII->get(PopF));
MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
case X86::FP32_TO_INT16_IN_MEM:
case X86::FP32_TO_INT32_IN_MEM:
case X86::FP32_TO_INT64_IN_MEM:
case X86::FP64_TO_INT16_IN_MEM:
case X86::FP64_TO_INT32_IN_MEM:
case X86::FP64_TO_INT64_IN_MEM:
case X86::FP80_TO_INT16_IN_MEM:
case X86::FP80_TO_INT32_IN_MEM:
case X86::FP80_TO_INT64_IN_MEM: {
// Change the floating point control register to use "round towards zero"
// mode when truncating to an integer value.
// The original control word is saved to a 2-byte stack object.
int OrigCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
addFrameReference(BuildMI(*BB, MI, DL,
TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
// Load the old value of the control word...
unsigned OldCW =
MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
OrigCWFrameIdx);
// OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero.
unsigned NewCW =
MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
.addReg(OldCW, RegState::Kill).addImm(0xC00);
// Extract to 16 bits.
unsigned NewCW16 =
MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
.addReg(NewCW, RegState::Kill, X86::sub_16bit);
// Prepare memory for FLDCW.
int NewCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
NewCWFrameIdx)
.addReg(NewCW16, RegState::Kill);
// Reload the modified control word now...
addFrameReference(BuildMI(*BB, MI, DL,
TII->get(X86::FLDCW16m)), NewCWFrameIdx);
// Get the X86 opcode to use.
unsigned Opc;
switch (MI.getOpcode()) {
default: llvm_unreachable("illegal opcode!");
case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
}
// The store uses the address operands of the pseudo (starting at operand 0);
// the value to store is the register operand that follows them.
X86AddressMode AM = getAddressFromInstr(&MI, 0);
addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
.addReg(MI.getOperand(X86::AddrNumOperands).getReg());
// Reload the original control word now.
addFrameReference(BuildMI(*BB, MI, DL,
TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
// xbegin
case X86::XBEGIN:
return emitXBegin(MI, BB, Subtarget.getInstrInfo());
case X86::VASTART_SAVE_XMM_REGS:
return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
case X86::VAARG_64:
return EmitVAARG64WithCustomInserter(MI, BB);
case X86::EH_SjLj_SetJmp32:
case X86::EH_SjLj_SetJmp64:
return emitEHSjLjSetJmp(MI, BB);
case X86::EH_SjLj_LongJmp32:
case X86::EH_SjLj_LongJmp64:
return emitEHSjLjLongJmp(MI, BB);
case X86::Int_eh_sjlj_setup_dispatch:
return EmitSjLjDispatchBlock(MI, BB);
case TargetOpcode::STATEPOINT:
// As an implementation detail, STATEPOINT shares the STACKMAP format at
// this point in the process. We diverge later.
return emitPatchPoint(MI, BB);
case TargetOpcode::STACKMAP:
case TargetOpcode::PATCHPOINT:
return emitPatchPoint(MI, BB);
case TargetOpcode::PATCHABLE_EVENT_CALL:
return emitXRayCustomEvent(MI, BB);
case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
return emitXRayTypedEvent(MI, BB);
case X86::LCMPXCHG8B: {
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
// In addition to the 4 E[ABCD] registers implied by the encoding, CMPXCHG8B
// requires a memory operand. If the current architecture is i686 and the
// current function needs a base pointer
// - which is ESI for i686 - the register allocator would not be able to
// allocate registers for an address of the form X(%reg, %reg, Y)
// - there would never be enough unreserved registers during regalloc
// (without the need for a base ptr the only option would be X(%edi, %esi, Y)).
// We give the register allocator a hand by precomputing the address in
// a new vreg using LEA.
// If it is not i686 or there is no base pointer - nothing to do here.
if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
return BB;
// Even though this code does not necessarily need the base pointer to
// be ESI, we check for that. The reason: if this assert fails, something
// has changed in the compiler's base pointer handling, which most
// probably has to be addressed somehow here.
assert(TRI->getBaseRegister() == X86::ESI &&
"LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
"base pointer in mind");
MachineRegisterInfo &MRI = MF->getRegInfo();
MVT SPTy = getPointerTy(MF->getDataLayout());
const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
// NOTE(review): this vreg is created before the early return below, so it
// is left unused when the address has no index register.
unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
X86AddressMode AM = getAddressFromInstr(&MI, 0);
// Regalloc does not need any help when the memory operand of CMPXCHG8B
// does not use index register.
if (AM.IndexReg == X86::NoRegister)
return BB;
// After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
// four operand definitions that are E[ABCD] registers. We skip them and
// then insert the LEA.
MachineBasicBlock::iterator MBBI(MI);
while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
--MBBI;
addFullAddress(
BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
// Rewrite the memory operand of MI to address through the computed vreg.
setDirectAddressInInstr(&MI, 0, computedAddrVReg);
return BB;
}
case X86::LCMPXCHG16B:
return BB;
case X86::LCMPXCHG8B_SAVE_EBX:
case X86::LCMPXCHG16B_SAVE_RBX: {
// Make sure EBX/RBX (depending on the opcode) is recorded as live-in to BB.
unsigned BasePtr =
MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
if (!BB->isLiveIn(BasePtr))
BB->addLiveIn(BasePtr);
return BB;
}
}
}
//===----------------------------------------------------------------------===//
// X86 Optimization Hooks
//===----------------------------------------------------------------------===//
// Target hook for shrinking the constant mask of a scalar AND given the bits
// that are actually demanded. Returns false when nothing is done, true when
// the current mask is already a zero-extend mask and must be kept, and
// otherwise the result of TLO.CombineTo after replacing the mask with a
// zero-extend mask.
bool
X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
                                                const APInt &Demanded,
                                                TargetLoweringOpt &TLO) const {
  // Only ANDs are optimized, to avoid shrinking a constant that could be
  // matched by movzx.
  if (Op.getOpcode() != ISD::AND)
    return false;

  // Vectors are ignored.
  EVT VT = Op.getValueType();
  if (VT.isVector())
    return false;

  // The RHS must really be a constant.
  auto *MaskNode = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!MaskNode)
    return false;

  const unsigned TypeBits = VT.getSizeInBits();
  const APInt &OrigMask = MaskNode->getAPIntValue();

  // Drop every non-demanded bit and measure the width of what remains; an
  // all-zero result leaves nothing to do.
  const APInt DemandedMask = OrigMask & Demanded;
  const unsigned ActiveBits = DemandedMask.getActiveBits();
  if (ActiveBits == 0)
    return false;

  // Round the width up to a power of 2 of at least one byte, then clamp it to
  // the type size so illegal types are handled.
  const unsigned RoundedBits =
      static_cast<unsigned>(PowerOf2Ceil(std::max(ActiveBits, 8U)));
  const unsigned ExtBits = std::min(RoundedBits, TypeBits);

  // Candidate zero-extend mask for this constant.
  const APInt ZExtMask = APInt::getLowBitsSet(TypeBits, ExtBits);

  // An unchanged mask is reported as handled so the caller keeps it and does
  // not optimize it further.
  if (ZExtMask == OrigMask)
    return true;

  // The new mask may only use bits that are in the old mask or not demanded.
  if (!ZExtMask.isSubsetOf(OrigMask | ~Demanded))
    return false;

  // Swap the constant for the zero-extend mask.
  SDLoc DL(Op);
  SDValue NewMask = TLO.DAG.getConstant(ZExtMask, DL, VT);
  SDValue NewAnd =
      TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewMask);
  return TLO.CombineTo(Op, NewAnd);
}
void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
KnownBits &Known,
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const {
unsigned BitWidth = Known.getBitWidth();
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
assert((Opc >= ISD::BUILTIN_OP_END ||
Opc == ISD::INTRINSIC_WO_CHAIN ||
Opc == ISD::INTRINSIC_W_CHAIN ||
Opc == ISD::INTRINSIC_VOID) &&
"Should use MaskedValueIsZero if you don't know whether Op"
" is a target node!");
Known.resetAll();
switch (Opc) {
default: break;
case X86ISD::SETCC:
Known.Zero.setBitsFrom(1);
break;
case X86ISD::MOVMSK: {
unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
Known.Zero.setBitsFrom(NumLoBits);
break;
}
case X86ISD::PEXTRB:
case X86ISD::PEXTRW: {
SDValue Src = Op.getOperand(0);
EVT SrcVT = Src.getValueType();
APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
Op.getConstantOperandVal(1));
Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
Known = Known.zextOrTrunc(BitWidth, false);
Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
break;
}
case X86ISD::VSRAI:
case X86ISD::VSHLI:
case X86ISD::VSRLI: {
if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
Known.setAllZero();
break;
}
Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
unsigned ShAmt = ShiftImm->getZExtValue();
if (Opc == X86ISD::VSHLI) {
Known.Zero <<= ShAmt;
Known.One <<= ShAmt;
// Low bits are known zero.
Known.Zero.setLowBits(ShAmt);
} else if (Opc == X86ISD::VSRLI) {
Known.Zero.lshrInPlace(ShAmt);
Known.One.lshrInPlace(ShAmt);
// High bits are known zero.
Known.Zero.setHighBits(ShAmt);
} else {
Known.Zero.ashrInPlace(ShAmt);
Known.One.ashrInPlace(ShAmt);
}
}
break;
}
case X86ISD::PACKUS: {
// PACKUS is just a truncation if the upper half is zero.
APInt DemandedLHS, DemandedRHS;
getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
Known.One = APInt::getAllOnesValue(BitWidth * 2);
Known.Zero = APInt::getAllOnesValue(BitWidth * 2);
KnownBits Known2;
if (!!DemandedLHS) {
Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
}
if (!!DemandedRHS) {
Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
}
if (Known.countMinLeadingZeros() < BitWidth)
Known.resetAll();
Known = Known.trunc(BitWidth);
break;
}
case X86ISD::ANDNP: {
KnownBits Known2;
Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
// ANDNP = (~X & Y);
Known.One &= Known2.Zero;
Known.Zero |= Known2.One;
break;
}
case X86ISD::FOR: {
KnownBits Known2;
Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
// Output known-0 bits are only known if clear in both the LHS & RHS.
Known.Zero &= Known2.Zero;
// Output known-1 are known to be set if set in either the LHS | RHS.
Known.One |= Known2.One;
break;
}
case X86ISD::CMOV: {
Known = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
// If we don't know any bits, early out.
if (Known.isUnknown())
break;
KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
// Only known if known in both the LHS and RHS.
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
break;
}
}
// Handle target shuffles.
// TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
if (isTargetShuffle(Opc)) {
bool IsUnary;
SmallVector<int, 64> Mask;
SmallVector<SDValue, 2> Ops;
if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
IsUnary)) {
unsigned NumOps = Ops.size();
unsigned NumElts = VT.getVectorNumElements();
if (Mask.size() == NumElts) {
SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
Known.Zero.setAllBits(); Known.One.setAllBits();
for (unsigned i = 0; i != NumElts; ++i) {
if (!DemandedElts[i])
continue;
int M = Mask[i];
if (M == SM_SentinelUndef) {
// For UNDEF elements, we don't know anything about the common state
// of the shuffle result.
Known.resetAll();
break;
} else if (M == SM_SentinelZero) {
Known.One.clearAllBits();
continue;
}
assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
"Shuffle index out of range");
unsigned OpIdx = (unsigned)M / NumElts;
unsigned EltIdx = (unsigned)M % NumElts;
if (Ops[OpIdx].getValueType() != VT) {
// TODO - handle target shuffle ops with different value types.
Known.resetAll();
break;
}
DemandedOps[OpIdx].setBit(EltIdx);
}
// Known bits are the values that are shared by every demanded element.
for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
if (!DemandedOps[i])
continue;
KnownBits Known2 =
DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
}
}
}
}
}
// Target hook returning the number of sign bits known for the target node Op,
// restricted (where supported) to the elements selected by DemandedElts.
// Depth is the current recursion depth and is passed on, incremented, to
// nested queries. Falls back to 1 when nothing better is known.
unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
    unsigned Depth) const {
  EVT VT = Op.getValueType();
  const unsigned EltBits = VT.getScalarSizeInBits();
  const unsigned Opc = Op.getOpcode();

  switch (Opc) {
  // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
  case X86ISD::SETCC_CARRY:
  // Vector compares return zero/all-bits result values.
  case X86ISD::PCMPGT:
  case X86ISD::PCMPEQ:
  case X86ISD::CMPP:
  case X86ISD::VPCOM:
  case X86ISD::VPCOMU:
    return EltBits;

  case X86ISD::VTRUNC: {
    // TODO: Add DemandedElts support.
    SDValue Src = Op.getOperand(0);
    const unsigned SrcEltBits = Src.getScalarValueSizeInBits();
    assert(EltBits < SrcEltBits && "Illegal truncation input type");
    const unsigned SrcSignBits = DAG.ComputeNumSignBits(Src, Depth + 1);
    // Sign bits survive only beyond the bits dropped by the truncation.
    const unsigned DroppedBits = SrcEltBits - EltBits;
    return SrcSignBits > DroppedBits ? SrcSignBits - DroppedBits : 1;
  }

  case X86ISD::PACKSS: {
    // PACKSS is just a truncation if the sign bits extend to the packed size.
    APInt LHSElts, RHSElts;
    getPackDemandedElts(Op.getValueType(), DemandedElts, LHSElts, RHSElts);

    // An operand with no demanded elements counts as fully sign-extended.
    const unsigned SrcEltBits = Op.getOperand(0).getScalarValueSizeInBits();
    unsigned LHSSignBits = SrcEltBits;
    unsigned RHSSignBits = SrcEltBits;
    if (!!LHSElts)
      LHSSignBits =
          DAG.ComputeNumSignBits(Op.getOperand(0), LHSElts, Depth + 1);
    if (!!RHSElts)
      RHSSignBits =
          DAG.ComputeNumSignBits(Op.getOperand(1), RHSElts, Depth + 1);

    const unsigned CommonSignBits = std::min(LHSSignBits, RHSSignBits);
    const unsigned DroppedBits = SrcEltBits - EltBits;
    return CommonSignBits > DroppedBits ? CommonSignBits - DroppedBits : 1;
  }

  case X86ISD::VSHLI: {
    const APInt &ShAmt = Op.getConstantOperandAPInt(1);
    if (ShAmt.uge(EltBits))
      return EltBits; // Shifted all bits out --> zero.
    const unsigned SrcSignBits =
        DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
    if (ShAmt.uge(SrcSignBits))
      return 1; // Shifted all sign bits out --> unknown.
    return SrcSignBits - ShAmt.getZExtValue();
  }

  case X86ISD::VSRAI: {
    // The shift amount plus the source sign bits, saturated at the element
    // width.
    APInt Total = Op.getConstantOperandAPInt(1);
    if (Total.uge(EltBits - 1))
      return EltBits; // Sign splat.
    Total += DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
    return Total.uge(EltBits) ? EltBits : Total.getZExtValue();
  }

  case X86ISD::ANDNP: {
    const unsigned Op0SignBits =
        DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
    if (Op0SignBits == 1)
      return 1; // Early out.
    const unsigned Op1SignBits =
        DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
    return std::min(Op0SignBits, Op1SignBits);
  }

  case X86ISD::CMOV: {
    const unsigned Op0SignBits =
        DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
    if (Op0SignBits == 1)
      return 1; // Early out.
    const unsigned Op1SignBits =
        DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
    return std::min(Op0SignBits, Op1SignBits);
  }
  }

  // Handle target shuffles.
  // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
  if (!isTargetShuffle(Opc))
    return 1; // Fallback case.

  bool IsUnary;
  SmallVector<int, 64> Mask;
  SmallVector<SDValue, 2> Ops;
  if (!getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
                            IsUnary))
    return 1; // Fallback case.

  const unsigned NumOps = Ops.size();
  const unsigned NumElts = VT.getVectorNumElements();
  if (Mask.size() != NumElts)
    return 1; // Fallback case.

  // Map the demanded result elements onto demanded elements of each input.
  SmallVector<APInt, 2> OpDemanded(NumOps, APInt(NumElts, 0));
  for (unsigned Elt = 0; Elt != NumElts; ++Elt) {
    if (!DemandedElts[Elt])
      continue;

    const int M = Mask[Elt];
    // For UNDEF elements, we don't know anything about the common state of
    // the shuffle result.
    if (M == SM_SentinelUndef)
      return 1;
    // Zero = all sign bits.
    if (M == SM_SentinelZero)
      continue;
    assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
           "Shuffle index out of range");

    const unsigned SrcOp = (unsigned)M / NumElts;
    const unsigned SrcElt = (unsigned)M % NumElts;
    // TODO - handle target shuffle ops with different value types.
    if (Ops[SrcOp].getValueType() != VT)
      return 1;
    OpDemanded[SrcOp].setBit(SrcElt);
  }

  // The answer is the minimum over all inputs that contribute an element.
  unsigned MinSignBits = EltBits;
  for (unsigned i = 0; i != NumOps && MinSignBits > 1; ++i) {
    if (!OpDemanded[i])
      continue;
    const unsigned OpSignBits =
        DAG.ComputeNumSignBits(Ops[i], OpDemanded[i], Depth + 1);
    MinSignBits = std::min(MinSignBits, OpSignBits);
  }
  return MinSignBits;
}
// Return operand 0 of N when N is an X86ISD::Wrapper or X86ISD::WrapperRIP
// node; any other node is returned unchanged.
SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
  switch (N->getOpcode()) {
  case X86ISD::Wrapper:
  case X86ISD::WrapperRIP:
    return N->getOperand(0);
  default:
    return N;
  }
}
// Try to match a combined shuffle mask against the unary shuffle instructions
// supported here. On a match, returns true and reports the opcode through
// Shuffle and the source/destination types through SrcVT/DstVT; for the
// extension patterns V1 may additionally be replaced by a subvector of itself.
// Returns false, leaving the outputs untouched, when nothing matches.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
                              bool AllowFloatDomain, bool AllowIntDomain,
                              SDValue &V1, const SDLoc &DL, SelectionDAG &DAG,
                              const X86Subtarget &Subtarget, unsigned &Shuffle,
                              MVT &SrcVT, MVT &DstVT) {
  const unsigned NumElts = Mask.size();
  const unsigned EltBits = MaskVT.getScalarSizeInBits();

  // Report a match whose source and destination types are the same.
  auto Matched = [&](unsigned Opc, MVT Ty) {
    Shuffle = Opc;
    SrcVT = DstVT = Ty;
    return true;
  };
  // Type used for VZEXT_MOVL: v4f32 when SSE2 is unavailable, else MaskVT.
  auto MovlVT = [&]() -> MVT {
    return Subtarget.hasSSE2() ? MaskVT : MVT(MVT::v4f32);
  };

  // Match against a VZEXT_MOVL vXi32 zero-extending instruction.
  if (EltBits == 32 && isUndefOrEqual(Mask[0], 0) && isUndefOrZero(Mask[1]) &&
      isUndefInRange(Mask, 2, NumElts - 2))
    return Matched(X86ISD::VZEXT_MOVL, MovlVT());

  // Match against a ANY/ZERO_EXTEND_VECTOR_INREG instruction.
  // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
  const bool CanExtend =
      AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
                         (MaskVT.is256BitVector() && Subtarget.hasInt256()));
  if (CanExtend) {
    for (unsigned Scale = 2, MaxScale = 64 / EltBits; Scale <= MaxScale;
         Scale *= 2) {
      const unsigned NumDstElts = NumElts / Scale;

      // Each destination element must start with source element i, followed
      // by Scale-1 elements that are all undef (any-extend) or undef/zero
      // (zero-extend).
      bool AnyExt = true;
      bool ZeroExt = true;
      for (unsigned i = 0; i != NumDstElts && (AnyExt || ZeroExt); ++i) {
        const unsigned Base = i * Scale;
        if (!isUndefOrEqual(Mask[Base], (int)i)) {
          AnyExt = ZeroExt = false;
          break;
        }
        AnyExt &= isUndefInRange(Mask, Base + 1, Scale - 1);
        ZeroExt &= isUndefOrZeroInRange(Mask, Base + 1, Scale - 1);
      }
      if (!AnyExt && !ZeroExt)
        continue;

      assert(ZeroExt && "Failed to match zext but matched aext?");

      // The source is at least 128 bits wide; narrow V1 if that is smaller
      // than the mask type.
      const unsigned SrcSize = std::max(128u, NumDstElts * EltBits);
      MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
                                          MVT::getIntegerVT(EltBits);
      SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / EltBits);
      if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits())
        V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);

      // Use the *_VECTOR_INREG form when the source has more elements than
      // the destination.
      Shuffle = unsigned(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
      if (SrcVT.getVectorNumElements() != NumDstElts)
        Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle);

      DstVT = MVT::getIntegerVT(Scale * EltBits);
      DstVT = MVT::getVectorVT(DstVT, NumDstElts);
      return true;
    }
  }

  // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
  const bool MovlEltOK =
      (EltBits == 32) || (EltBits == 64 && Subtarget.hasSSE2());
  if (MovlEltOK && isUndefOrEqual(Mask[0], 0) &&
      isUndefOrZeroInRange(Mask, 1, NumElts - 1))
    return Matched(X86ISD::VZEXT_MOVL, MovlVT());

  // Check if we have SSE3 which will let us use MOVDDUP etc. The
  // instructions are no slower than UNPCKLPD but has the option to
  // fold the input operand into even an unaligned memory load.
  if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
    if (isTargetShuffleEquivalent(Mask, {0, 0}))
      return Matched(X86ISD::MOVDDUP, MVT::v2f64);
    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}))
      return Matched(X86ISD::MOVSLDUP, MVT::v4f32);
    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3}))
      return Matched(X86ISD::MOVSHDUP, MVT::v4f32);
  }

  // 256-bit variants of the duplicate patterns.
  if (MaskVT.is256BitVector() && AllowFloatDomain) {
    assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}))
      return Matched(X86ISD::MOVDDUP, MVT::v4f64);
    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
      return Matched(X86ISD::MOVSLDUP, MVT::v8f32);
    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7}))
      return Matched(X86ISD::MOVSHDUP, MVT::v8f32);
  }

  // 512-bit variants of the duplicate patterns.
  if (MaskVT.is512BitVector() && AllowFloatDomain) {
    assert(Subtarget.hasAVX512() &&
           "AVX512 required for 512-bit vector shuffles");
    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
      return Matched(X86ISD::MOVDDUP, MVT::v8f64);
    if (isTargetShuffleEquivalent(
            Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}))
      return Matched(X86ISD::MOVSLDUP, MVT::v16f32);
    if (isTargetShuffleEquivalent(
            Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}))
      return Matched(X86ISD::MOVSHDUP, MVT::v16f32);
  }

  return false;
}
// Attempt to match a combined shuffle mask against supported unary immediate
// permute instructions.
//
// Candidates are tried in this order:
//   1. VPERMI / VPERMILPI on 64-bit elements,
//   2. PSHUFD / VPERMILPI on 128-bit-lane-repeated 32/64-bit elements,
//   3. PSHUFLW / PSHUFHW on 128-bit-lane-repeated 16-bit elements,
//   4. the shift matcher (matchShuffleAsShift).
//
// MaskVT is the effective type of the shuffle and Mask its element indices
// (which may contain SM_SentinelUndef / SM_SentinelZero). Zeroable is only
// consumed by the shift matcher. AllowFloatDomain / AllowIntDomain select
// which instruction domains may be used.
//
// Returns true on a match, in which case Shuffle holds the node opcode,
// ShuffleVT the type to build the node with and PermuteImm its immediate.
// On failure the outputs must not be relied upon.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
                                     const APInt &Zeroable,
                                     bool AllowFloatDomain, bool AllowIntDomain,
                                     const X86Subtarget &Subtarget,
                                     unsigned &Shuffle, MVT &ShuffleVT,
                                     unsigned &PermuteImm) {
  unsigned NumMaskElts = Mask.size();
  unsigned InputSizeInBits = MaskVT.getSizeInBits();
  unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
  MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);

  // Every permute matcher below rejects masks with zeroed elements; only the
  // shift matcher at the end accepts them (through Zeroable).
  bool ContainsZeros =
      llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });

  // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
  if (!ContainsZeros && MaskScalarSizeInBits == 64) {
    // Check for lane crossing permutes.
    if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
      // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
      if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
        Shuffle = X86ISD::VPERMI;
        ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
        PermuteImm = getV4X86ShuffleImm(Mask);
        return true;
      }
      if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
        SmallVector<int, 4> RepeatedMask;
        if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
          Shuffle = X86ISD::VPERMI;
          ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
          PermuteImm = getV4X86ShuffleImm(RepeatedMask);
          return true;
        }
      }
    } else if (AllowFloatDomain && Subtarget.hasAVX()) {
      // VPERMILPD can permute with a non-repeating shuffle.
      Shuffle = X86ISD::VPERMILPI;
      ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
      PermuteImm = 0;
      // One immediate bit per element: it selects the low or high 64-bit
      // element of the 128-bit pair that element i belongs to.
      for (int i = 0, e = Mask.size(); i != e; ++i) {
        int M = Mask[i];
        if (M == SM_SentinelUndef)
          continue;
        assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
        PermuteImm |= (M & 1) << i;
      }
      return true;
    }
  }

  // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
  // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
  // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
  if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
      !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
    SmallVector<int, 4> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
      // Narrow the repeated mask to create 32-bit element permutes.
      SmallVector<int, 4> WordMask = RepeatedMask;
      if (MaskScalarSizeInBits == 64)
        scaleShuffleMask<int>(2, RepeatedMask, WordMask);

      Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
      ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
      ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
      PermuteImm = getV4X86ShuffleImm(WordMask);
      return true;
    }
  }

  // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
  if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
    SmallVector<int, 4> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
      assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
      // The single immediate is applied to every 128-bit lane, so it has to
      // be derived from the lane-repeated mask. The first lane of Mask on its
      // own is not sufficient for 256/512-bit vectors: an element may be
      // undef there while the matching element of another lane is defined.
      ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
      ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);

      // PSHUFLW: permute lower 4 elements only.
      if (isUndefOrInRange(LoMask, 0, 4) &&
          isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
        Shuffle = X86ISD::PSHUFLW;
        ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
        PermuteImm = getV4X86ShuffleImm(LoMask);
        return true;
      }

      // PSHUFHW: permute upper 4 elements only.
      if (isUndefOrInRange(HiMask, 4, 8) &&
          isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
        // Offset the HiMask so that we can create the shuffle immediate.
        // Negative (sentinel) entries are passed through untouched.
        int OffsetHiMask[4];
        for (int i = 0; i != 4; ++i)
          OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);

        Shuffle = X86ISD::PSHUFHW;
        ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
        PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
        return true;
      }
    }
  }

  // Attempt to match against byte/bit shifts.
  // FIXME: Add 512-bit support.
  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
                         (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
    // The shift matcher receives ShuffleVT and Shuffle directly; only a
    // strictly positive shift amount counts as a match.
    int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
                                       Mask, 0, Zeroable, Subtarget);
    if (0 < ShiftAmt) {
      PermuteImm = (unsigned)ShiftAmt;
      return true;
    }
  }

  return false;
}
// Try to express a combined shuffle mask as a single two-operand shuffle node
// that takes no immediate.
//
// The candidates are tried in this order: the 128-bit UNPCKL/MOVLHPS,
// UNPCKH/MOVHLPS, MOVSD and MOVSS forms, then PACKSS/PACKUS, then
// UNPCKL/UNPCKH. On a match the function returns true with Shuffle set to the
// opcode and SrcVT/DstVT to the operand/result types; V1 and V2 may have been
// rewritten to the operands the node has to be built from.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
                               bool AllowFloatDomain, bool AllowIntDomain,
                               SDValue &V1, SDValue &V2, const SDLoc &DL,
                               SelectionDAG &DAG, const X86Subtarget &Subtarget,
                               unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
                               bool IsUnary) {
  const unsigned ScalarBits = MaskVT.getScalarSizeInBits();
  const bool Is128Bit = MaskVT.is128BitVector();
  const bool Is256Bit = MaskVT.is256BitVector();

  if (Is128Bit) {
    const bool HasSSE2 = Subtarget.hasSSE2();
    // Without SSE2 only the v4f32 flavours of the half-duplicating moves are
    // selected.
    const MVT HalfDupVT = HasSSE2 ? MVT::v2f64 : MVT::v4f32;
    const bool FloatOrNoSSE41 = AllowFloatDomain || !Subtarget.hasSSE41();

    // Low half duplicated into both halves.
    if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
      SDValue LowOp = V1;
      if (Mask[0] == SM_SentinelUndef)
        LowOp = DAG.getUNDEF(MVT::v4f32);
      V2 = V1;
      V1 = LowOp;
      Shuffle = HasSSE2 ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
      SrcVT = HalfDupVT;
      DstVT = HalfDupVT;
      return true;
    }

    // High half duplicated into both halves.
    if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
      V2 = V1;
      Shuffle = HasSSE2 ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
      SrcVT = HalfDupVT;
      DstVT = HalfDupVT;
      return true;
    }

    // MOVSD: the operands are exchanged before the node is formed.
    if (isTargetShuffleEquivalent(Mask, {0, 3}) && HasSSE2 && FloatOrNoSSE41) {
      std::swap(V1, V2);
      Shuffle = X86ISD::MOVSD;
      SrcVT = MVT::v2f64;
      DstVT = MVT::v2f64;
      return true;
    }

    // MOVSS.
    if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) && FloatOrNoSSE41) {
      Shuffle = X86ISD::MOVSS;
      SrcVT = MVT::v4f32;
      DstVT = MVT::v4f32;
      return true;
    }
  }

  // Attempt to match against either an unary or binary PACKSS/PACKUS shuffle.
  const bool Pack128 = MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8;
  const bool Pack256 = MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8;
  const bool Pack512 = MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8;
  const bool PackIsLegal = (Pack128 && Subtarget.hasSSE2()) ||
                           (Pack256 && Subtarget.hasInt256()) ||
                           (Pack512 && Subtarget.hasBWI());
  if (PackIsLegal && matchVectorShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle,
                                                Mask, DAG, Subtarget)) {
    DstVT = MaskVT;
    return true;
  }

  // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
  const bool UnpackIsLegal =
      (MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
      (Is128Bit && Subtarget.hasSSE2()) ||
      (Is256Bit && 32 <= ScalarBits && Subtarget.hasAVX()) ||
      (Is256Bit && Subtarget.hasAVX2()) ||
      (MaskVT.is512BitVector() && Subtarget.hasAVX512());
  if (!UnpackIsLegal)
    return false;

  if (!matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
                                   DAG, Subtarget))
    return false;

  // 256-bit unpacks without AVX2 are emitted with a floating point type.
  MVT UnpackVT = MaskVT;
  if (Is256Bit && !Subtarget.hasAVX2())
    UnpackVT = (ScalarBits == 32 ? MVT::v8f32 : MVT::v4f64);
  SrcVT = UnpackVT;
  DstVT = UnpackVT;
  return true;
}
// Try to express a combined shuffle mask as a single two-operand shuffle node
// that takes an 8-bit immediate.
//
// The candidates are tried in this order: PALIGNR, BLENDI, INSERTPS, SHUFPD
// and SHUFPS. On a match the function returns true with Shuffle set to the
// opcode, ShuffleVT to the type the node is built with and PermuteImm to its
// immediate; V1 and V2 may have been rewritten to the operands to use.
static bool matchBinaryPermuteShuffle(
    MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
    bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
    const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
    unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
  const unsigned EltCount = Mask.size();
  const unsigned ScalarBits = MaskVT.getScalarSizeInBits();
  const bool Is128Bit = MaskVT.is128BitVector();
  const bool Is256Bit = MaskVT.is256BitVector();
  const bool Is512Bit = MaskVT.is512BitVector();

  // Attempt to match against PALIGNR byte rotate.
  const bool CanByteRotate =
      AllowIntDomain && ((Is128Bit && Subtarget.hasSSSE3()) ||
                         (Is256Bit && Subtarget.hasAVX2()));
  if (CanByteRotate) {
    const int Rotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
    if (Rotation > 0) {
      Shuffle = X86ISD::PALIGNR;
      ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
      PermuteImm = Rotation;
      return true;
    }
  }

  // Attempt to combine to X86ISD::BLENDI.
  const bool CanBlend =
      (EltCount <= 8 && ((Subtarget.hasSSE41() && Is128Bit) ||
                         (Subtarget.hasAVX() && Is256Bit))) ||
      (MaskVT == MVT::v16i16 && Subtarget.hasAVX2());
  if (CanBlend) {
    uint64_t BlendMask = 0;
    bool ForceV1Zero = false;
    bool ForceV2Zero = false;
    SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
    if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
                                  BlendMask)) {
      unsigned BlendImm = (unsigned)BlendMask;
      bool Usable = true;
      if (MaskVT == MVT::v16i16) {
        // We can only use v16i16 PBLENDW if the lanes are repeated; the
        // immediate is then rebuilt from the 8-element per-lane mask.
        SmallVector<int, 8> LaneMask;
        Usable = isRepeatedTargetShuffleMask(128, MaskVT, TargetMask, LaneMask);
        if (Usable) {
          assert(LaneMask.size() == 8 &&
                 "Repeated mask size doesn't match!");
          BlendImm = 0;
          for (int Elt = 0; Elt != 8; ++Elt) {
            if (LaneMask[Elt] >= 8)
              BlendImm |= 1 << Elt;
          }
        }
      }
      if (Usable) {
        if (ForceV1Zero)
          V1 = getZeroVector(MaskVT, Subtarget, DAG, DL);
        if (ForceV2Zero)
          V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
        PermuteImm = BlendImm;
        Shuffle = X86ISD::BLENDI;
        ShuffleVT = MaskVT;
        return true;
      }
    }
  }

  // Attempt to combine to INSERTPS.
  if (AllowFloatDomain && ScalarBits == 32 && Subtarget.hasSSE41() &&
      Is128Bit && Zeroable.getBoolValue() &&
      matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
    Shuffle = X86ISD::INSERTPS;
    ShuffleVT = MVT::v4f32;
    return true;
  }

  // Attempt to combine to SHUFPD.
  if (AllowFloatDomain && ScalarBits == 64 &&
      ((Is128Bit && Subtarget.hasSSE2()) || (Is256Bit && Subtarget.hasAVX()) ||
       (Is512Bit && Subtarget.hasAVX512())) &&
      matchShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
    Shuffle = X86ISD::SHUFP;
    ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
    return true;
  }

  // Attempt to combine to SHUFPS. This is the last candidate, so every
  // failure from here on is a failure of the whole match.
  const bool CanShufps =
      AllowFloatDomain && ScalarBits == 32 &&
      ((Is128Bit && Subtarget.hasSSE1()) || (Is256Bit && Subtarget.hasAVX()) ||
       (Is512Bit && Subtarget.hasAVX512()));
  if (!CanShufps)
    return false;

  SmallVector<int, 4> LaneMask;
  if (!isRepeatedTargetShuffleMask(128, MaskVT, Mask, LaneMask))
    return false;

  // Classify one half (two elements) of the repeated mask: it is entirely
  // undef, undef-or-zero, or draws only from V1 or only from V2. Returns the
  // operand to use for that half (an empty SDValue if none fits) and writes
  // the two SHUFPS selectors through Idx0/Idx1.
  auto PickHalfSource = [&](unsigned First, int &Idx0, int &Idx1) -> SDValue {
    const int A = LaneMask[First];
    const int B = LaneMask[First + 1];

    if (isUndefInRange(LaneMask, First, 2))
      return DAG.getUNDEF(MaskVT);

    if (isUndefOrZeroInRange(LaneMask, First, 2)) {
      Idx0 = (A == SM_SentinelUndef ? -1 : 0);
      Idx1 = (B == SM_SentinelUndef ? -1 : 1);
      return getZeroVector(MaskVT, Subtarget, DAG, DL);
    }

    const bool FromV1 = isUndefOrInRange(A, 0, 4) && isUndefOrInRange(B, 0, 4);
    const bool FromV2 =
        !FromV1 && isUndefOrInRange(A, 4, 8) && isUndefOrInRange(B, 4, 8);
    if (!FromV1 && !FromV2)
      return SDValue();

    Idx0 = (A == SM_SentinelUndef ? -1 : A & 3);
    Idx1 = (B == SM_SentinelUndef ? -1 : B & 3);
    return FromV1 ? V1 : V2;
  };

  int Selectors[4] = {-1, -1, -1, -1};
  SDValue LoSrc = PickHalfSource(0, Selectors[0], Selectors[1]);
  SDValue HiSrc = PickHalfSource(2, Selectors[2], Selectors[3]);
  if (!LoSrc || !HiSrc)
    return false;

  V1 = LoSrc;
  V2 = HiSrc;
  Shuffle = X86ISD::SHUFP;
  ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
  PermuteImm = getV4X86ShuffleImm(Selectors);
  return true;
}
// Forward declaration. combineX86ShuffleChain (defined next) falls back to
// this helper, and the helper itself calls combineX86ShuffleChain, so it has
// to be declared before either definition.
static SDValue combineX86ShuffleChainWithExtract(
ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
const X86Subtarget &Subtarget);
/// Combine an arbitrary chain of shuffles into a single instruction if
/// possible.
///
/// This is the leaf of the recursive combine below. When we have found some
/// chain of single-use x86 shuffle instructions and accumulated the combined
/// shuffle mask represented by them, this will try to pattern match that mask
/// into either a single instruction if there is a special purpose instruction
/// for this operation, or into a PSHUFB instruction which is a fully general
/// instruction but should only be used to replace chains over a certain depth.
///
/// \param Inputs One or two source values of the chain (asserted below).
/// \param Root The node being replaced. Its type is the type of the returned
/// value, and its opcode is compared against each match at Depth == 1 so that
/// an identical single node is not re-formed.
/// \param BaseMask The accumulated shuffle mask. Elements may be
/// SM_SentinelUndef or SM_SentinelZero.
/// \param Depth Combine depth; it gates domain crossing (Depth > 3) and
/// variable-mask shuffles. NOTE(review): presumably the number of shuffle
/// nodes folded into BaseMask so far - confirm against the caller.
/// \param HasVariableMask When set, variable-mask shuffles are permitted
/// regardless of the depth threshold.
/// \param AllowVariableMask Master switch for emitting variable-mask shuffles.
/// \returns The replacement value bitcast to the root type, or an empty
/// SDValue if nothing was combined.
static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
ArrayRef<int> BaseMask, int Depth,
bool HasVariableMask,
bool AllowVariableMask, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
assert((Inputs.size() == 1 || Inputs.size() == 2) &&
"Unexpected number of shuffle inputs!");
// Find the inputs that enter the chain. Note that multiple uses are OK
// here, we're not going to remove the operands we find.
bool UnaryShuffle = (Inputs.size() == 1);
SDValue V1 = peekThroughBitcasts(Inputs[0]);
// A unary shuffle gets an undef second operand of V1's type.
SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
: peekThroughBitcasts(Inputs[1]));
MVT VT1 = V1.getSimpleValueType();
MVT VT2 = V2.getSimpleValueType();
MVT RootVT = Root.getSimpleValueType();
assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
VT2.getSizeInBits() == RootVT.getSizeInBits() &&
"Vector size mismatch");
SDLoc DL(Root);
SDValue Res;
unsigned NumBaseMaskElts = BaseMask.size();
// A single-element mask can only be the identity: just retype the input.
if (NumBaseMaskElts == 1) {
assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
return DAG.getBitcast(RootVT, V1);
}
unsigned RootSizeInBits = RootVT.getSizeInBits();
unsigned NumRootElts = RootVT.getVectorNumElements();
unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
// Use the float domain if either input is floating point, if the root is
// floating point at depth 2 or more, or for 256-bit vectors without AVX2.
bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
(RootVT.isFloatingPoint() && Depth >= 2) ||
(RootVT.is256BitVector() && !Subtarget.hasAVX2());
// Don't combine if we are a AVX512/EVEX target and the mask element size
// is different from the root element size - this would prevent writemasks
// from being reused.
// TODO - this currently prevents all lane shuffles from occurring.
// TODO - check for writemasks usage instead of always preventing combining.
// TODO - attempt to narrow Mask back to writemask size.
bool IsEVEXShuffle =
RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
// Attempt to match a subvector broadcast.
// shuffle(insert_subvector(undef, sub, 0), undef, 0, 0, 0, 0)
if (UnaryShuffle &&
(BaseMaskEltSizeInBits == 128 || BaseMaskEltSizeInBits == 256)) {
SmallVector<int, 64> BroadcastMask(NumBaseMaskElts, 0);
if (isTargetShuffleEquivalent(BaseMask, BroadcastMask)) {
// Note: this inspects the original input, not the bitcast-stripped V1.
SDValue Src = Inputs[0];
if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
Src.getOperand(0).isUndef() &&
Src.getOperand(1).getValueSizeInBits() == BaseMaskEltSizeInBits &&
MayFoldLoad(Src.getOperand(1)) && isNullConstant(Src.getOperand(2))) {
return DAG.getBitcast(RootVT, DAG.getNode(X86ISD::SUBV_BROADCAST, DL,
Src.getValueType(),
Src.getOperand(1)));
}
}
}
// TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
// Handle 128-bit lane shuffles of 256-bit vectors.
// If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
// we need to use the zeroing feature.
// TODO - this should support binary shuffles.
if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
!(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
!isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
return SDValue(); // Nothing to do!
MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
// One immediate nibble per 128-bit half: any negative (sentinel) mask
// element is encoded as 0x8, otherwise bit 0 of the element is used.
unsigned PermMask = 0;
PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
Res = DAG.getBitcast(ShuffleVT, V1);
Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
DAG.getUNDEF(ShuffleVT),
DAG.getConstant(PermMask, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
// For masks that have been widened to 128-bit elements or more,
// narrow back down to 64-bit elements.
SmallVector<int, 64> Mask;
if (BaseMaskEltSizeInBits > 64) {
assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
int MaskScale = BaseMaskEltSizeInBits / 64;
scaleShuffleMask<int>(MaskScale, BaseMask, Mask);
} else {
Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
}
unsigned NumMaskElts = Mask.size();
unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
// Determine the effective mask value type.
// Elements narrower than 32 bits are always treated as integers.
FloatDomain &= (32 <= MaskEltSizeInBits);
MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
: MVT::getIntegerVT(MaskEltSizeInBits);
MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
// Only allow legal mask types.
if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
return SDValue();
// Attempt to match the mask against known shuffle patterns.
MVT ShuffleSrcVT, ShuffleVT;
unsigned Shuffle, PermuteImm;
// Which shuffle domains are permitted?
// Permit domain crossing at higher combine depths.
// TODO: Should we indicate which domain is preferred if both are allowed?
bool AllowFloatDomain = FloatDomain || (Depth > 3);
bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() &&
(!MaskVT.is256BitVector() || Subtarget.hasAVX2());
// Determine zeroable mask elements.
APInt Zeroable(NumMaskElts, 0);
for (unsigned i = 0; i != NumMaskElts; ++i)
if (isUndefOrZero(Mask[i]))
Zeroable.setBit(i);
if (UnaryShuffle) {
// If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
// directly if we don't shuffle the lower element and we shuffle the upper
// (zero) elements within themselves.
if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
(cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() %
MaskEltSizeInBits) == 0) {
// Number of mask elements covered by the loaded scalar.
unsigned Scale =
cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() /
MaskEltSizeInBits;
ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
return DAG.getBitcast(RootVT, V1);
}
}
// Attempt to match against broadcast-from-vector.
// Limit AVX1 to cases where we're loading+broadcasting a scalar element.
if ((Subtarget.hasAVX2() || (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits))
&& (!IsEVEXShuffle || NumRootElts == NumMaskElts)) {
SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
// Broadcast directly from the scalar when it is a foldable load.
if (V1.getValueType() == MaskVT &&
V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
MayFoldLoad(V1.getOperand(0))) {
if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST)
return SDValue(); // Nothing to do!
Res = V1.getOperand(0);
Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
return DAG.getBitcast(RootVT, Res);
}
// Otherwise broadcast from the vector itself, which is only done on AVX2.
if (Subtarget.hasAVX2()) {
if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST)
return SDValue(); // Nothing to do!
Res = DAG.getBitcast(MaskVT, V1);
Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
return DAG.getBitcast(RootVT, Res);
}
}
}
// Unary shuffle without an immediate. On EVEX targets a match is only
// accepted when it keeps the root's element count (see IsEVEXShuffle).
SDValue NewV1 = V1; // Save operand in case early exit happens.
if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
ShuffleVT) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
return DAG.getBitcast(RootVT, Res);
}
// Unary shuffle with an 8-bit immediate.
if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
PermuteImm) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
Res = DAG.getBitcast(ShuffleVT, V1);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
DAG.getConstant(PermuteImm, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
}
// Two-operand shuffle without an immediate. The matchers receive copies of
// the operands, so V1/V2 stay intact for the later attempts.
SDValue NewV1 = V1; // Save operands in case early exit happens.
SDValue NewV2 = V2;
if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
ShuffleVT, UnaryShuffle) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
return DAG.getBitcast(RootVT, Res);
}
// Two-operand shuffle with an 8-bit immediate.
NewV1 = V1; // Save operands in case early exit happens.
NewV2 = V2;
if (matchBinaryPermuteShuffle(
MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1,
NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
NewV2 = DAG.getBitcast(ShuffleVT, NewV2);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
DAG.getConstant(PermuteImm, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
// Typically from here on, we need an integer version of MaskVT.
MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
// Annoyingly, SSE4A instructions don't map into the above match helpers.
if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
uint64_t BitLen, BitIdx;
if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
Zeroable)) {
if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
return SDValue(); // Nothing to do!
V1 = DAG.getBitcast(IntMaskVT, V1);
Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
DAG.getConstant(BitLen, DL, MVT::i8),
DAG.getConstant(BitIdx, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
return SDValue(); // Nothing to do!
V1 = DAG.getBitcast(IntMaskVT, V1);
V2 = DAG.getBitcast(IntMaskVT, V2);
Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
DAG.getConstant(BitLen, DL, MVT::i8),
DAG.getConstant(BitIdx, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
}
// Don't try to re-form single instruction chains under any circumstances now
// that we've done encoding canonicalization for them.
if (Depth < 2)
return SDValue();
// Depth threshold above which we can efficiently use variable mask shuffles.
int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3;
// Below the threshold a variable mask is only allowed if the caller reported
// one via HasVariableMask.
AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask;
bool MaskContainsZeros =
any_of(Mask, [](int M) { return M == SM_SentinelZero; });
// Lane-crossing masks: every path inside this block returns, so the
// in-lane lowerings further down are never reached for them.
if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
// If we have a single input lane-crossing shuffle then lower to VPERMV.
if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
((Subtarget.hasAVX2() &&
(MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
(Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
(Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
Res = DAG.getBitcast(MaskVT, V1);
// Note the operand order: the index vector comes first for VPERMV.
Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
return DAG.getBitcast(RootVT, Res);
}
// Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
// vector as the second source.
if (UnaryShuffle && AllowVariableMask &&
((Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
(Subtarget.hasVLX() &&
(MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
(Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
// Adjust shuffle mask - replace SM_SentinelZero with second source index.
for (unsigned i = 0; i != NumMaskElts; ++i)
if (Mask[i] == SM_SentinelZero)
Mask[i] = NumMaskElts + i;
SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
Res = DAG.getBitcast(MaskVT, V1);
SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
return DAG.getBitcast(RootVT, Res);
}
// If that failed and either input is extracted then try to combine as a
// shuffle with the larger type.
if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
DAG, Subtarget))
return WideShuffle;
// If we have a dual input lane-crossing shuffle then lower to VPERMV3.
if (AllowVariableMask && !MaskContainsZeros &&
((Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
(Subtarget.hasVLX() &&
(MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
(Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
V1 = DAG.getBitcast(MaskVT, V1);
V2 = DAG.getBitcast(MaskVT, V2);
Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
return DAG.getBitcast(RootVT, Res);
}
return SDValue();
}
// See if we can combine a single input shuffle with zeros to a bit-mask,
// which is much simpler than any shuffle.
if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
APInt UndefElts(NumMaskElts, 0);
// Every element starts as zero; kept elements become all-ones and undef
// elements are recorded in UndefElts.
SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
for (unsigned i = 0; i != NumMaskElts; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef) {
UndefElts.setBit(i);
continue;
}
if (M == SM_SentinelZero)
continue;
EltBits[i] = AllOnes;
}
SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
Res = DAG.getBitcast(MaskVT, V1);
unsigned AndOpcode =
FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
return DAG.getBitcast(RootVT, Res);
}
// If we have a single input shuffle with different shuffle patterns in the
// 128-bit lanes, use a variable mask with VPERMILPS.
// TODO Combine other mask types at higher depths.
if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
(MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
SmallVector<SDValue, 16> VPermIdx;
// Indices are reduced modulo 4, i.e. to a position within a 128-bit lane.
for (int M : Mask) {
SDValue Idx =
M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
VPermIdx.push_back(Idx);
}
SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
Res = DAG.getBitcast(MaskVT, V1);
Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
return DAG.getBitcast(RootVT, Res);
}
// With XOP, binary shuffles of 128/256-bit floating point vectors can combine
// to VPERMIL2PD/VPERMIL2PS.
if (AllowVariableMask && Subtarget.hasXOP() &&
(MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
MaskVT == MVT::v8f32)) {
// VPERMIL2 Operation.
// Bits[3] - Match Bit.
// Bits[2:1] - (Per Lane) PD Shuffle Mask.
// Bits[2:0] - (Per Lane) PS Shuffle Mask.
unsigned NumLanes = MaskVT.getSizeInBits() / 128;
unsigned NumEltsPerLane = NumMaskElts / NumLanes;
SmallVector<int, 8> VPerm2Idx;
unsigned M2ZImm = 0;
for (int M : Mask) {
if (M == SM_SentinelUndef) {
VPerm2Idx.push_back(-1);
continue;
}
// A zeroed element sets the match bit (8) in its selector and switches
// the node's immediate to 2.
if (M == SM_SentinelZero) {
M2ZImm = 2;
VPerm2Idx.push_back(8);
continue;
}
// In-lane position, plus NumEltsPerLane when the element comes from the
// second source (M >= NumMaskElts).
int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
// 64-bit selectors live in bits [2:1], hence the shift by one.
Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
VPerm2Idx.push_back(Index);
}
V1 = DAG.getBitcast(MaskVT, V1);
V2 = DAG.getBitcast(MaskVT, V2);
SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
DAG.getConstant(M2ZImm, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
// If we have 3 or more shuffle instructions or a chain involving a variable
// mask, we can replace them with a single PSHUFB instruction profitably.
// Intel's manuals suggest only using PSHUFB if doing so replaces 5
// instructions, but in practice PSHUFB tends to be *very* fast so we're
// more aggressive.
if (UnaryShuffle && AllowVariableMask &&
((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
(RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
(RootVT.is512BitVector() && Subtarget.hasBWI()))) {
SmallVector<SDValue, 16> PSHUFBMask;
int NumBytes = RootVT.getSizeInBits() / 8;
// Number of bytes covered by one mask element.
int Ratio = NumBytes / NumMaskElts;
for (int i = 0; i < NumBytes; ++i) {
int M = Mask[i / Ratio];
if (M == SM_SentinelUndef) {
PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
continue;
}
// Zeroed bytes are requested with a control byte of 255.
if (M == SM_SentinelZero) {
PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
continue;
}
// Expand the element index to the index of the corresponding byte.
M = Ratio * M + i % Ratio;
assert((M / 16) == (i / 16) && "Lane crossing detected");
PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
}
MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
Res = DAG.getBitcast(ByteVT, V1);
SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
return DAG.getBitcast(RootVT, Res);
}
// With XOP, if we have a 128-bit binary input shuffle we can always combine
// to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
// slower than PSHUFB on targets that support both.
if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
// VPPERM Mask Operation
// Bits[4:0] - Byte Index (0 - 31)
// Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
SmallVector<SDValue, 16> VPPERMMask;
int NumBytes = 16;
int Ratio = NumBytes / NumMaskElts;
for (int i = 0; i < NumBytes; ++i) {
int M = Mask[i / Ratio];
if (M == SM_SentinelUndef) {
VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
continue;
}
// 128 == (4 << 5), i.e. the ZERO permute operation described above.
if (M == SM_SentinelZero) {
VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
continue;
}
M = Ratio * M + i % Ratio;
VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
}
MVT ByteVT = MVT::v16i8;
V1 = DAG.getBitcast(ByteVT, V1);
V2 = DAG.getBitcast(ByteVT, V2);
SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
return DAG.getBitcast(RootVT, Res);
}
// If that failed and either input is extracted then try to combine as a
// shuffle with the larger type.
if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
DAG, Subtarget))
return WideShuffle;
// If we have a dual input shuffle then lower to VPERMV3.
if (!UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
((Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
(Subtarget.hasVLX() &&
(MaskVT == MVT::v2f64 || MaskVT == MVT::v2i64 || MaskVT == MVT::v4f64 ||
MaskVT == MVT::v4i64 || MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 ||
MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
(Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
(Subtarget.hasBWI() && Subtarget.hasVLX() &&
(MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16)) ||
(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
(Subtarget.hasVBMI() && Subtarget.hasVLX() &&
(MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8)))) {
SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
V1 = DAG.getBitcast(MaskVT, V1);
V2 = DAG.getBitcast(MaskVT, V2);
Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
return DAG.getBitcast(RootVT, Res);
}
// Failed to find any combines.
return SDValue();
}
// Combine a chain of shuffles whose inputs are EXTRACT_SUBVECTOR nodes into a
// single instruction operating on the wider source vectors, if possible.
//
// This wraps combineX86ShuffleChain: the inputs are replaced by the vectors
// they were extracted from, the mask is rebased and padded to the wider type,
// and the combine is retried:
//   shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
//     -->
//   extract_subvector(shuffle(x,y,m2),0)
// Returns an empty SDValue when no input comes from an upper subvector, when
// the widened inputs are unsuitable, or when the wide combine fails.
static SDValue combineX86ShuffleChainWithExtract(
    ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
    bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
    const X86Subtarget &Subtarget) {
  const unsigned NumMaskElts = BaseMask.size();
  const unsigned NumInputs = Inputs.size();
  if (NumInputs == 0)
    return SDValue();

  SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
  SmallVector<unsigned, 4> Offsets(NumInputs, 0);

  // Peek through subvectors, accumulating each input's extraction offset and
  // the size of the widest source seen.
  // TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?
  unsigned WideSizeInBits = WideInputs[0].getValueSizeInBits();
  for (unsigned Idx = 0; Idx != NumInputs; ++Idx) {
    SDValue Wide = peekThroughBitcasts(WideInputs[Idx]);
    const EVT NarrowVT = Wide.getValueType();

    unsigned EltOffset = 0;
    while (Wide.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
           isa<ConstantSDNode>(Wide.getOperand(1))) {
      EltOffset += Wide.getConstantOperandVal(1);
      Wide = Wide.getOperand(0);
    }
    WideSizeInBits = std::max(WideSizeInBits, Wide.getValueSizeInBits());

    assert((EltOffset % NarrowVT.getVectorNumElements()) == 0 &&
           "Unexpected subvector extraction");
    // Convert the element offset to a subvector index, then to mask units.
    EltOffset /= NarrowVT.getVectorNumElements();
    EltOffset *= NumMaskElts;

    WideInputs[Idx] = Wide;
    Offsets[Idx] = EltOffset;
  }

  // Bail if we're always extracting from the lowest subvectors,
  // combineX86ShuffleChain should match this for the current width.
  bool AnyUpperSubvector = false;
  for (unsigned Off : Offsets)
    AnyUpperSubvector |= (Off != 0);
  if (!AnyUpperSubvector)
    return SDValue();

  const EVT RootVT = Root.getValueType();
  const unsigned RootSizeInBits = RootVT.getSizeInBits();
  const unsigned Scale = WideSizeInBits / RootSizeInBits;
  assert((WideSizeInBits % RootSizeInBits) == 0 &&
         "Unexpected subvector extraction");

  // If the src vector types aren't the same, see if we can extend
  // them to match each other. Every source must be a legal type sharing the
  // scalar type of the first one.
  // TODO: Support different scalar types?
  const EVT WideSVT = WideInputs[0].getValueType().getScalarType();
  for (const SDValue &Op : WideInputs) {
    if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) ||
        Op.getValueType().getScalarType() != WideSVT)
      return SDValue();
  }

  for (unsigned Idx = 0, End = WideInputs.size(); Idx != End; ++Idx) {
    SDValue &Input = WideInputs[Idx];
    assert((WideSizeInBits % Input.getValueSizeInBits()) == 0 &&
           "Shuffle vector size mismatch");
    if (WideSizeInBits > Input.getValueSizeInBits())
      Input = widenSubVector(Input, false, Subtarget, DAG, SDLoc(Input),
                             WideSizeInBits);
    assert(WideSizeInBits == Input.getValueSizeInBits() &&
           "Unexpected subvector extraction");
  }

  // Create new mask for larger type: input i now starts Scale * NumMaskElts
  // elements after input i - 1.
  for (unsigned Idx = 1; Idx != NumInputs; ++Idx)
    Offsets[Idx] += Idx * Scale * NumMaskElts;

  SmallVector<int, 64> WideMask;
  for (int M : BaseMask) {
    // Sentinel (negative) entries are carried over unchanged.
    if (M >= 0)
      M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
    WideMask.push_back(M);
  }
  // The elements beyond the original width are don't-care.
  WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);

  // Remove unused/repeated shuffle source ops.
  resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
  assert(!WideInputs.empty() && "Shuffle with no inputs detected");

  if (WideInputs.size() > 2)
    return SDValue();

  // Increase depth for every upper subvector we've peeked through.
  for (unsigned Off : Offsets) {
    if (Off > 0)
      ++Depth;
  }

  // Attempt to combine wider chain.
  // TODO: Can we use a better Root?
  SDValue WideRoot = WideInputs[0];
  SDValue WideShuffle =
      combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
                             HasVariableMask, AllowVariableMask, DAG, Subtarget);
  if (!WideShuffle)
    return SDValue();

  // Only the lowest RootSizeInBits of the wide result are wanted.
  WideShuffle =
      extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
  return DAG.getBitcast(RootVT, WideShuffle);
}
// Attempt to constant fold a shuffle whose source ops are all constants.
// Returns the folded constant vector bitcast to the root type, or an empty
// SDValue if any source op is not a constant or the fold is rejected to avoid
// constant pool bloat.
// TODO: Extend this to merge multiple constant Ops and update the mask.
static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
                                           ArrayRef<int> Mask, SDValue Root,
                                           bool HasVariableMask,
                                           SelectionDAG &DAG,
                                           const X86Subtarget &Subtarget) {
  const MVT VT = Root.getSimpleValueType();
  const unsigned NumMaskElts = Mask.size();
  const unsigned NumOps = Ops.size();
  const unsigned MaskSizeInBits = VT.getSizeInBits() / NumMaskElts;

  // Pull the per-element constant bits (and undef flags) out of each source
  // op, giving up as soon as one of them is not a constant.
  SmallVector<APInt, 16> UndefEltsOps(NumOps);
  SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
  bool OneUseConstantOp = false;
  for (unsigned OpIdx = 0; OpIdx != NumOps; ++OpIdx) {
    SDValue SrcOp = Ops[OpIdx];
    if (SrcOp.hasOneUse())
      OneUseConstantOp = true;
    if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits,
                                       UndefEltsOps[OpIdx], RawBitsOps[OpIdx]))
      return SDValue();
  }

  // Only fold if at least one of the constants is only used once or the
  // combined shuffle has included a variable mask shuffle; otherwise we would
  // just be adding to the constant pool.
  if (!(OneUseConstantOp || HasVariableMask))
    return SDValue();

  // Route the constant bits through the mask, classifying every result
  // element as undef, zero or a real constant.
  APInt UndefElts(NumMaskElts, 0);
  APInt ZeroElts(NumMaskElts, 0);
  APInt ConstantElts(NumMaskElts, 0);
  SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
                                        APInt::getNullValue(MaskSizeInBits));
  for (unsigned Elt = 0; Elt != NumMaskElts; ++Elt) {
    const int M = Mask[Elt];
    if (M == SM_SentinelUndef) {
      UndefElts.setBit(Elt);
      continue;
    }
    if (M == SM_SentinelZero) {
      ZeroElts.setBit(Elt);
      continue;
    }
    assert(0 <= M && M < (int)(NumMaskElts * NumOps));

    const unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
    const unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;

    if (UndefEltsOps[SrcOpIdx][SrcMaskIdx]) {
      UndefElts.setBit(Elt);
      continue;
    }

    APInt &Bits = RawBitsOps[SrcOpIdx][SrcMaskIdx];
    if (!Bits) {
      ZeroElts.setBit(Elt);
    } else {
      ConstantElts.setBit(Elt);
      ConstantBitData[Elt] = Bits;
    }
  }
  assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());

  // Build the constant using a floating point element type only when the root
  // is floating point and the element width is 32 or 64 bits.
  const bool UseFPElt =
      VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64);
  MVT MaskSVT = UseFPElt ? MVT::getFloatingPointVT(MaskSizeInBits)
                         : MVT::getIntegerVT(MaskSizeInBits);
  MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);

  SDLoc DL(Root);
  SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
  return DAG.getBitcast(VT, CstOp);
}
/// Fully generic combining of x86 shuffle instructions.
///
/// This should be the last combine run over the x86 shuffle instructions. Once
/// they have been fully optimized, this will recursively consider all chains
/// of single-use shuffle instructions, build a generic model of the cumulative
/// shuffle operation, and check for simpler instructions which implement this
/// operation. We use this primarily for two purposes:
///
/// 1) Collapse generic shuffles to specialized single instructions when
/// equivalent. In most cases, this is just an encoding size win, but
/// sometimes we will collapse multiple generic shuffles into a single
/// special-purpose shuffle.
/// 2) Look for sequences of shuffle instructions with 3 or more total
/// instructions, and replace them with the slightly more expensive SSSE3
/// PSHUFB instruction if available. We do this as the last combining step
/// to ensure we avoid using PSHUFB if we can implement the shuffle with
/// a suitable short sequence of other instructions. The PSHUFB will either
/// use a register or have to read from memory and so is slightly (but only
/// slightly) more expensive than the other shuffle instructions.
///
/// Because this is inherently a quadratic operation (for each shuffle in
/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
/// This should never be an issue in practice as the shuffle lowering doesn't
/// produce sequences of more than 8 instructions.
///
/// FIXME: We will currently miss some cases where the redundant shuffling
/// would simplify under the threshold for PSHUFB formation because of
/// combine-ordering. To fix this, we should do the redundant instruction
/// combining in this recursive walk.
///
/// \param SrcOps the shuffle source operands accumulated so far.
/// \param SrcOpIndex index into \p SrcOps of the operand to look through at
/// this step.
/// \param Root the value being combined; its type is used for the undef/zero
/// results and it is forwarded to the chain-combining helpers.
/// \param RootMask the shuffle mask accumulated so far, indexing \p SrcOps.
/// \param SrcNodes the nodes that have already been merged into the chain.
/// \param Depth current recursion depth; the walk bails out once it exceeds
/// MaxRecursionDepth.
/// \param HasVariableMask set once a variable-mask target shuffle has been
/// merged into the chain.
/// \param AllowVariableMask forwarded to the chain combiners; when recursing
/// it is cleared for operands that have users outside the combined nodes.
/// \returns the combined value, or an empty SDValue if nothing was combined.
static SDValue combineX86ShufflesRecursively(
ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Bound the depth of our recursive combine because this is ultimately
// quadratic in nature.
const unsigned MaxRecursionDepth = 8;
if (Depth > MaxRecursionDepth)
return SDValue();
// Directly rip through bitcasts to find the underlying operand.
SDValue Op = SrcOps[SrcOpIndex];
Op = peekThroughOneUseBitcasts(Op);
MVT VT = Op.getSimpleValueType();
if (!VT.isVector())
return SDValue(); // Bail if we hit a non-vector.
assert(Root.getSimpleValueType().isVector() &&
"Shuffles operate on vector types!");
assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
"Can only combine shuffles of the same vector register size.");
// Extract target shuffle mask and resolve sentinels and inputs.
SmallVector<int, 64> OpMask;
SmallVector<SDValue, 2> OpInputs;
if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
return SDValue();
// Add the inputs to the Ops list, avoiding duplicates.
SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
// AddOp returns the index in Ops at which Input lives. If Input (ignoring
// bitcasts) is already present that index is reused; otherwise Input either
// overwrites the slot at InsertionPoint (when non-negative) or is appended.
auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
// Attempt to find an existing match.
SDValue InputBC = peekThroughBitcasts(Input);
for (int i = 0, e = Ops.size(); i < e; ++i)
if (InputBC == peekThroughBitcasts(Ops[i]))
return i;
// Match failed - should we replace an existing Op?
if (InsertionPoint >= 0) {
Ops[InsertionPoint] = Input;
return InsertionPoint;
}
// Add to the end of the Ops list.
Ops.push_back(Input);
return Ops.size() - 1;
};
// Only the first input of Op is allowed to take over Op's own slot
// (SrcOpIndex); any further inputs are matched or appended.
SmallVector<int, 2> OpInputIdx;
for (SDValue OpInput : OpInputs)
OpInputIdx.push_back(AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
assert(((RootMask.size() > OpMask.size() &&
RootMask.size() % OpMask.size() == 0) ||
(OpMask.size() > RootMask.size() &&
OpMask.size() % RootMask.size() == 0) ||
OpMask.size() == RootMask.size()) &&
"The smaller number of elements must divide the larger.");
// This function can be performance-critical, so we rely on the power-of-2
// knowledge that we have about the mask sizes to replace div/rem ops with
// bit-masks and shifts.
assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
// The merged mask uses the finer of the two granularities. RootRatio is the
// number of merged-mask elements covered by one root mask element, OpRatio
// the number covered by one op mask element; at most one of them exceeds 1.
unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
assert((RootRatio == 1 || OpRatio == 1) &&
"Must not have a ratio for both incoming and op masks!");
assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);
// Merge this shuffle operation's mask into our accumulated mask. Note that
// this shuffle's mask will be the first applied to the input, followed by the
// root mask to get us all the way to the root value arrangement. The reason
// for this order is that we are recursing up the operation chain.
for (unsigned i = 0; i < MaskWidth; ++i) {
unsigned RootIdx = i >> RootRatioLog2;
if (RootMask[RootIdx] < 0) {
// This is a zero or undef lane, we're done.
Mask[i] = RootMask[RootIdx];
continue;
}
// Scale the root mask entry up to merged-mask granularity.
unsigned RootMaskedIdx =
RootRatio == 1
? RootMask[RootIdx]
: (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
// Just insert the scaled root mask value if it references an input other
// than the SrcOp we're currently inserting.
if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
(((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
Mask[i] = RootMaskedIdx;
continue;
}
// Drop the per-input offset, leaving the element index within Op.
RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
if (OpMask[OpIdx] < 0) {
// The incoming lanes are zero or undef, it doesn't matter which ones we
// are using.
Mask[i] = OpMask[OpIdx];
continue;
}
// Ok, we have non-zero lanes, map them through to one of the Op's inputs.
unsigned OpMaskedIdx =
OpRatio == 1
? OpMask[OpIdx]
: (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));
OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
// Work out which of Op's inputs the element came from and rebase the index
// onto that input's position in Ops.
int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
Mask[i] = OpMaskedIdx;
}
// Handle the all undef/zero cases early.
if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
return DAG.getUNDEF(Root.getValueType());
// TODO - should we handle the mixed zero/undef case as well? Just returning
// a zero mask will lose information on undef elements possibly reducing
// future combine possibilities.
if (all_of(Mask, [](int Idx) { return Idx < 0; }))
return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
SDLoc(Root));
// Remove unused/repeated shuffle source ops.
resolveTargetShuffleInputsAndMask(Ops, Mask);
assert(!Ops.empty() && "Shuffle with no inputs detected");
HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
// Update the list of shuffle nodes that have been combined so far.
SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
SrcNodes.end());
CombinedNodes.push_back(Op.getNode());
// See if we can recurse into each shuffle source op (if it's a target
// shuffle). The source op should only be generally combined if it either has
// a single use (i.e. current Op) or all its users have already been combined,
// if not then we can still combine but should prevent generation of variable
// shuffles to avoid constant pool bloat.
// Don't recurse if we already have more source ops than we can combine in
// the remaining recursion depth.
if (Ops.size() < (MaxRecursionDepth - Depth)) {
for (int i = 0, e = Ops.size(); i < e; ++i) {
bool AllowVar = false;
if (Ops[i].getNode()->hasOneUse() ||
SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
AllowVar = AllowVariableMask;
// The first successful deeper combine wins.
if (SDValue Res = combineX86ShufflesRecursively(
Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
AllowVar, DAG, Subtarget))
return Res;
}
}
// Attempt to constant fold all of the constant source ops.
if (SDValue Cst = combineX86ShufflesConstants(
Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
return Cst;
// We can only combine unary and binary shuffle mask cases.
if (Ops.size() <= 2) {
// Minor canonicalization of the accumulated shuffle mask to make it easier
// to match below. All this does is detect masks with sequential pairs of
// elements, and shrink them to the half-width mask. It does this in a loop
// so it will reduce the size of the mask to the minimal width mask which
// performs an equivalent shuffle.
SmallVector<int, 64> WidenedMask;
while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
Mask = std::move(WidenedMask);
}
// Canonicalization of binary shuffle masks to improve pattern matching by
// commuting the inputs.
if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
ShuffleVectorSDNode::commuteMask(Mask);
std::swap(Ops[0], Ops[1]);
}
// Finally, try to combine into a single shuffle instruction.
return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
AllowVariableMask, DAG, Subtarget);
}
// If that failed and any input is extracted then try to combine as a
// shuffle with the larger type.
return combineX86ShuffleChainWithExtract(Ops, Root, Mask, Depth,
HasVariableMask, AllowVariableMask,
DAG, Subtarget);
}
/// Entry point for the recursive shuffle combine: starts the walk with \p Op
/// as both the root and the only source operand, a single-element root mask
/// referring to it, no previously combined nodes, depth 1 and variable masks
/// permitted.
static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
                                             const X86Subtarget &Subtarget) {
  SDValue InitialOps[] = {Op};
  int InitialMask[] = {0};
  return combineX86ShufflesRecursively(InitialOps, /*SrcOpIndex*/ 0, Op,
                                       InitialMask, /*SrcNodes*/ {},
                                       /*Depth*/ 1,
                                       /*HasVarMask*/ false,
                                       /*AllowVarMask*/ true, DAG, Subtarget);
}
/// Get the PSHUF-style mask from a PSHUF node.
///
/// This is a very minor wrapper around getTargetShuffleMask to ease forming
/// v4 PSHUF-style masks that can be reused with such instructions. The result
/// always has four entries in the range used by the node's own immediate:
/// PSHUFD is returned as decoded, PSHUFLW keeps the first four word indices
/// and PSHUFHW keeps the last four rebased to start at zero.
static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
  MVT VT = N.getSimpleValueType();
  SmallVector<int, 4> Mask;
  SmallVector<SDValue, 2> Ops;
  bool IsUnary;
  bool HaveMask =
      getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
  (void)HaveMask;
  assert(HaveMask);

  // If we have more than 128-bits, only the low 128-bits of shuffle mask
  // matter. Check that the upper masks are repeats and remove them.
  const bool IsWiderThan128 = VT.getSizeInBits() > 128;
  if (IsWiderThan128) {
    int LaneElts = 128 / VT.getScalarSizeInBits();
#ifndef NDEBUG
    for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
      for (int j = 0; j < LaneElts; ++j)
        assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
               "Mask doesn't repeat in high 128-bit lanes!");
#endif
    Mask.resize(LaneElts);
  }

  const unsigned Opc = N.getOpcode();
  if (Opc == X86ISD::PSHUFD)
    return Mask;

  if (Opc == X86ISD::PSHUFLW) {
    // Only the low four words are shuffled.
    Mask.resize(4);
    return Mask;
  }

  if (Opc == X86ISD::PSHUFHW) {
    // Only the high four words are shuffled; drop the low half and rebase.
    Mask.erase(Mask.begin(), Mask.begin() + 4);
    for (unsigned Idx = 0, E = Mask.size(); Idx != E; ++Idx)
      Mask[Idx] -= 4;
    return Mask;
  }

  llvm_unreachable("No valid shuffle instruction found!");
}
/// Search for a combinable shuffle across a chain ending in pshufd.
///
/// We walk up the chain and look for a combinable shuffle, skipping over
/// shuffles that we could hoist this shuffle's transformation past without
/// altering anything.
///
/// \param N the PSHUFD node at the bottom of the chain.
/// \param Mask the 4-element mask of \p N; once a combinable shuffle has been
/// found it is overwritten in place with the merged mask.
/// \returns the rebuilt chain that replaces \p N, or an empty SDValue if no
/// combinable shuffle was found.
static SDValue
combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(N.getOpcode() == X86ISD::PSHUFD &&
"Called with something other than an x86 128-bit half shuffle!");
SDLoc DL(N);
// Walk up a single-use chain looking for a combinable shuffle. Keep a stack
// of the shuffles in the chain so that we can form a fresh chain to replace
// this one.
SmallVector<SDValue, 8> Chain;
SDValue V = N.getOperand(0);
// Inside the switch, `continue` moves on to V's first operand while `break`
// falls through to the loop's trailing `break`, leaving V as the candidate.
for (; V.hasOneUse(); V = V.getOperand(0)) {
switch (V.getOpcode()) {
default:
return SDValue(); // Nothing combined!
case ISD::BITCAST:
// Skip bitcasts as we always know the type for the target specific
// instructions.
continue;
case X86ISD::PSHUFD:
// Found another dword shuffle.
break;
case X86ISD::PSHUFLW:
// Check that the low words (being shuffled) are the identity in the
// dword shuffle, and the high words are self-contained.
if (Mask[0] != 0 || Mask[1] != 1 ||
!(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
return SDValue();
Chain.push_back(V);
continue;
case X86ISD::PSHUFHW:
// Check that the high words (being shuffled) are the identity in the
// dword shuffle, and the low words are self-contained.
if (Mask[2] != 2 || Mask[3] != 3 ||
!(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
return SDValue();
Chain.push_back(V);
continue;
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
// For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
// shuffle into a preceding word shuffle.
if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
V.getSimpleValueType().getVectorElementType() != MVT::i16)
return SDValue();
// Search for a half-shuffle which we can combine with.
unsigned CombineOp =
V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
// The unpack must take the same value for both operands and be the only
// user of that value's node.
if (V.getOperand(0) != V.getOperand(1) ||
!V->isOnlyUserOf(V.getOperand(0).getNode()))
return SDValue();
Chain.push_back(V);
V = V.getOperand(0);
// Walk further up through bitcasts and the non-matching half shuffle
// (which is kept on the chain) until the matching half shuffle is found
// or a value with more than one use is reached.
do {
switch (V.getOpcode()) {
default:
return SDValue(); // Nothing to combine.
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
if (V.getOpcode() == CombineOp)
break;
Chain.push_back(V);
LLVM_FALLTHROUGH;
case ISD::BITCAST:
V = V.getOperand(0);
continue;
}
break;
} while (V.hasOneUse());
break;
}
// Break out of the loop if we break out of the switch.
break;
}
if (!V.hasOneUse())
// We fell out of the loop without finding a viable combining instruction.
return SDValue();
// Merge this node's mask and our incoming mask.
// Each entry of Mask is replaced by the entry of V's mask that it selects.
SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
for (int &M : Mask)
M = VMask[M];
V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
// Rebuild the chain around this new shuffle.
// Nodes are popped in the reverse of the order they were recorded, with a
// bitcast inserted wherever the operand type differs.
while (!Chain.empty()) {
SDValue W = Chain.pop_back_val();
if (V.getValueType() != W.getOperand(0).getValueType())
V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
switch (W.getOpcode()) {
default:
llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
break;
case X86ISD::PSHUFD:
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
break;
}
}
if (V.getValueType() != N.getValueType())
V = DAG.getBitcast(N.getValueType(), V);
// Return the new chain to replace N.
return V;
}
/// Try to combine x86 target specific shuffles.
///
/// Applies a set of per-opcode peepholes to the target shuffle \p N:
///  - a 64-bit element MOVSD/UNPCKL/UNPCKH of two identical horizontal or
///    pack operations is folded into a single such operation;
///  - VBROADCAST, BLENDI, VPERMI, MOVSD/MOVSS and INSERTPS each get their own
///    simplifications and return directly from the first switch;
///  - PSHUFD/PSHUFLW/PSHUFHW decode their 4-element mask and fall through to
///    the no-op / PSHUFD / unpack / redundant-dword-shuffle folds below.
///
/// \param N the target shuffle node to simplify.
/// \param DAG the DAG used to create replacement nodes.
/// \param DCI combiner state; not referenced by the code in this function.
/// \param Subtarget forwarded to the recursive shuffle combiner.
/// \returns the replacement value, or an empty SDValue if nothing changed.
static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  MVT VT = N.getSimpleValueType();
  SmallVector<int, 4> Mask;
  unsigned Opcode = N.getOpcode();

  // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
  // single instruction.
  if (VT.getScalarSizeInBits() == 64 &&
      (Opcode == X86ISD::MOVSD || Opcode == X86ISD::UNPCKH ||
       Opcode == X86ISD::UNPCKL)) {
    auto BC0 = peekThroughBitcasts(N.getOperand(0));
    auto BC1 = peekThroughBitcasts(N.getOperand(1));
    EVT VT0 = BC0.getValueType();
    EVT VT1 = BC1.getValueType();
    unsigned Opcode0 = BC0.getOpcode();
    unsigned Opcode1 = BC1.getOpcode();
    if (Opcode0 == Opcode1 && VT0 == VT1 &&
        (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
         Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB ||
         Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
      // Pick the inner operands that feed the 64-bit halves kept by the
      // shuffle.
      SDValue Lo, Hi;
      if (Opcode == X86ISD::MOVSD) {
        Lo = BC1.getOperand(0);
        Hi = BC0.getOperand(1);
      } else {
        Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
        Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
      }
      SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
      return DAG.getBitcast(VT, Horiz);
    }
  }

  switch (Opcode) {
  case X86ISD::VBROADCAST: {
    SDValue Src = N.getOperand(0);
    SDValue BC = peekThroughBitcasts(Src);
    EVT SrcVT = Src.getValueType();
    EVT BCVT = BC.getValueType();

    // If broadcasting from another shuffle, attempt to simplify it.
    // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
    if (isTargetShuffle(BC.getOpcode()) &&
        VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
      // Only the first Scale elements of BC (one broadcast element's worth)
      // are demanded; everything else is left undef.
      unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
      SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
                                        SM_SentinelUndef);
      for (unsigned i = 0; i != Scale; ++i)
        DemandedMask[i] = i;
      if (SDValue Res = combineX86ShufflesRecursively(
              {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 1,
              /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
        return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
                           DAG.getBitcast(SrcVT, Res));
    }

    // broadcast(bitcast(src)) -> bitcast(broadcast(src))
    // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
    if (Src.getOpcode() == ISD::BITCAST &&
        SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits()) {
      EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
                                   VT.getVectorNumElements());
      return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
    }

    // Reduce broadcast source vector to lowest 128-bits.
    if (SrcVT.getSizeInBits() > 128)
      return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
                         extract128BitVector(Src, 0, DAG, DL));

    // broadcast(scalar_to_vector(x)) -> broadcast(x).
    if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
      return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));

    // Share broadcast with the longest vector and extract low subvector (free).
    // Src->uses() enumerates the users of every result of Src's node, so make
    // sure the other broadcast really takes this exact SDValue (same node and
    // result number) as its source before reusing it.
    for (SDNode *User : Src->uses())
      if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
          User->getOperand(0) == Src &&
          User->getValueSizeInBits(0) > VT.getSizeInBits()) {
        return extractSubVector(SDValue(User, 0), 0, DAG, DL,
                                VT.getSizeInBits());
      }

    return SDValue();
  }
  case X86ISD::BLENDI: {
    SDValue N0 = N.getOperand(0);
    SDValue N1 = N.getOperand(1);

    // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
    // TODO: Handle MVT::v16i16 repeated blend mask.
    if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
        N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
      MVT SrcVT = N0.getOperand(0).getSimpleValueType();
      if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
          SrcVT.getScalarSizeInBits() >= 32) {
        // Rescale the blend immediate to the narrower element type.
        unsigned Mask = N.getConstantOperandVal(2);
        unsigned Size = VT.getVectorNumElements();
        unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
        unsigned ScaleMask = scaleVectorShuffleBlendMask(Mask, Size, Scale);
        return DAG.getBitcast(
            VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
                            N1.getOperand(0),
                            DAG.getConstant(ScaleMask, DL, MVT::i8)));
      }
    }
    return SDValue();
  }
  case X86ISD::VPERMI: {
    // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
    // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
    SDValue N0 = N.getOperand(0);
    SDValue N1 = N.getOperand(1);
    unsigned EltSizeInBits = VT.getScalarSizeInBits();
    if (N0.getOpcode() == ISD::BITCAST &&
        N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
      SDValue Src = N0.getOperand(0);
      EVT SrcVT = Src.getValueType();
      SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
      return DAG.getBitcast(VT, Res);
    }
    return SDValue();
  }
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFLW:
  case X86ISD::PSHUFHW:
    // Decode the mask and continue with the PSHUF-specific folds after the
    // switch.
    Mask = getPSHUFShuffleMask(N);
    assert(Mask.size() == 4);
    break;
  case X86ISD::MOVSD:
  case X86ISD::MOVSS: {
    SDValue N0 = N.getOperand(0);
    SDValue N1 = N.getOperand(1);

    // Canonicalize scalar FPOps:
    // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
    // If commutable, allow OP(N1[0], N0[0]).
    unsigned Opcode1 = N1.getOpcode();
    if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
        Opcode1 == ISD::FDIV) {
      SDValue N10 = N1.getOperand(0);
      SDValue N11 = N1.getOperand(1);
      if (N10 == N0 ||
          (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
        // Put N0 first; this only swaps in the commutable FADD/FMUL case.
        if (N10 != N0)
          std::swap(N10, N11);
        MVT SVT = VT.getVectorElementType();
        SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
        N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
        N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
        SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
        SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
        return DAG.getNode(Opcode, DL, VT, N0, SclVec);
      }
    }

    return SDValue();
  }
  case X86ISD::INSERTPS: {
    assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
    SDValue Op0 = N.getOperand(0);
    SDValue Op1 = N.getOperand(1);
    SDValue Op2 = N.getOperand(2);
    // Immediate layout as decoded here: bits [7:6] source element, bits [5:4]
    // destination element, bits [3:0] zero mask.
    unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
    unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
    unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
    unsigned ZeroMask = InsertPSMask & 0xF;

    // If we zero out all elements from Op0 then we don't need to reference it.
    if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
      return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
                         DAG.getConstant(InsertPSMask, DL, MVT::i8));

    // If we zero out the element from Op1 then we don't need to reference it.
    if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
      return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
                         DAG.getConstant(InsertPSMask, DL, MVT::i8));

    // Attempt to merge insertps Op1 with an inner target shuffle node.
    SmallVector<int, 8> TargetMask1;
    SmallVector<SDValue, 2> Ops1;
    if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
      int M = TargetMask1[SrcIdx];
      if (isUndefOrZero(M)) {
        // Zero/UNDEF insertion - zero out element and remove dependency.
        InsertPSMask |= (1u << DstIdx);
        return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
                           DAG.getConstant(InsertPSMask, DL, MVT::i8));
      }
      // Update insertps mask srcidx and reference the source input directly.
      assert(0 <= M && M < 8 && "Shuffle index out of range");
      InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
      Op1 = Ops1[M < 4 ? 0 : 1];
      return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
                         DAG.getConstant(InsertPSMask, DL, MVT::i8));
    }

    // Attempt to merge insertps Op0 with an inner target shuffle node.
    SmallVector<int, 8> TargetMask0;
    SmallVector<SDValue, 2> Ops0;
    if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
      return SDValue();

    bool Updated = false;
    bool UseInput00 = false;
    bool UseInput01 = false;
    for (int i = 0; i != 4; ++i) {
      int M = TargetMask0[i];
      if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
        // No change if element is already zero or the inserted element.
        continue;
      } else if (isUndefOrZero(M)) {
        // If the target mask is undef/zero then we must zero the element.
        InsertPSMask |= (1u << i);
        Updated = true;
        continue;
      }

      // The input vector element must be inline.
      if (M != i && M != (i + 4))
        return SDValue();

      // Determine which inputs of the target shuffle we're using.
      UseInput00 |= (0 <= M && M < 4);
      UseInput01 |= (4 <= M);
    }

    // If we're not using both inputs of the target shuffle then use the
    // referenced input directly.
    if (UseInput00 && !UseInput01) {
      Updated = true;
      Op0 = Ops0[0];
    } else if (!UseInput00 && UseInput01) {
      Updated = true;
      Op0 = Ops0[1];
    }

    if (Updated)
      return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
                         DAG.getConstant(InsertPSMask, DL, MVT::i8));

    return SDValue();
  }
  default:
    return SDValue();
  }

  // Only the PSHUFD/PSHUFLW/PSHUFHW cases reach this point; every other case
  // above returns from inside the switch.

  // Nuke no-op shuffles that show up after combining.
  if (isNoopShuffleMask(Mask))
    return N.getOperand(0);

  // Look for simplifications involving one or two shuffle instructions.
  SDValue V = N.getOperand(0);
  switch (N.getOpcode()) {
  default:
    break;
  case X86ISD::PSHUFLW:
  case X86ISD::PSHUFHW:
    assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");

    // See if this reduces to a PSHUFD which is no more expensive and can
    // combine with more operations. Note that it has to at least flip the
    // dwords as otherwise it would have been removed as a no-op.
    if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
      // Swap the two dwords of the half being shuffled; the other half keeps
      // its identity entries.
      int DMask[] = {0, 1, 2, 3};
      int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
      DMask[DOffset + 0] = DOffset + 1;
      DMask[DOffset + 1] = DOffset + 0;
      MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
      V = DAG.getBitcast(DVT, V);
      V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
                      getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
      return DAG.getBitcast(VT, V);
    }

    // Look for shuffle patterns which can be implemented as a single unpack.
    // FIXME: This doesn't handle the location of the PSHUFD generically, and
    // only works when we have a PSHUFD followed by two half-shuffles.
    if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
        (V.getOpcode() == X86ISD::PSHUFLW ||
         V.getOpcode() == X86ISD::PSHUFHW) &&
        V.getOpcode() != N.getOpcode() &&
        V.hasOneUse()) {
      SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
      if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
        SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
        SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
        // N and V shuffle opposite halves, so together they fill all eight
        // entries of the word mask.
        int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
        int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
        int WordMask[8];
        for (int i = 0; i < 4; ++i) {
          WordMask[i + NOffset] = Mask[i] + NOffset;
          WordMask[i + VOffset] = VMask[i] + VOffset;
        }
        // Map the word mask through the DWord mask.
        int MappedMask[8];
        for (int i = 0; i < 8; ++i)
          MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
        if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
            makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
          // We can replace all three shuffles with an unpack.
          V = DAG.getBitcast(VT, D.getOperand(0));
          return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
                                                : X86ISD::UNPCKH,
                             DL, VT, V, V);
        }
      }
    }

    break;

  case X86ISD::PSHUFD:
    if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
      return NewN;

    break;
  }

  return SDValue();
}
/// Checks whether \p Mask picks its elements alternately from two vectors,
/// always taking the element at the matching position: every defined
/// even-indexed lane must come from one source, every defined odd-indexed
/// lane from the other.
/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
/// On success \p Op0Even is set to true iff the even lanes come from the
/// first source; on failure it is left untouched.
static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
  const unsigned Size = Mask.size();
  // Source vector seen so far for the even / odd lanes, -1 while unknown.
  int EvenSrc = -1;
  int OddSrc = -1;

  for (unsigned Lane = 0; Lane != Size; ++Lane) {
    const int M = Mask[Lane];
    if (M < 0)
      continue;

    // The element must sit at the same position in its source vector.
    const unsigned Elt = (unsigned)M;
    if (Elt % Size != Lane)
      return false;

    // All lanes of one parity have to agree on the source vector.
    const int Src = (int)(Elt / Size);
    int &Slot = (Lane % 2 == 0) ? EvenSrc : OddSrc;
    if (Slot >= 0 && Slot != Src)
      return false;
    Slot = Src;
  }

  // Both parities must be populated, and from different sources.
  const bool BothUsed = EvenSrc >= 0 && OddSrc >= 0;
  if (!BothUsed || EvenSrc == OddSrc)
    return false;

  Op0Even = (EvenSrc == 0);
  return true;
}
/// Returns true iff the shuffle node \p N can be replaced with an
/// ADDSUB(SUBADD) operation. If true is returned then the operands of the
/// ADDSUB(SUBADD) operation are written to \p Opnd0 and \p Opnd1, and
/// \p IsSubAdd says which of the two forms was matched.
///
/// The match is done directly on the abstract vector shuffle node: \p N must
/// be a legal floating point VECTOR_SHUFFLE (on an SSE3 subtarget) of a
/// single-use FADD and a single-use FSUB that share the same two operands,
/// with a mask accepted by isAddSubOrSubAddMask.
static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
                             bool &IsSubAdd) {
  EVT VT = N->getValueType(0);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!(Subtarget.hasSSE3() && TLI.isTypeLegal(VT) &&
        VT.getSimpleVT().isFloatingPoint()))
    return false;

  // We only handle target-independent shuffles.
  // FIXME: It would be easy and harmless to use the target shuffle mask
  // extraction tool to support more.
  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
    return false;

  SDValue V1 = N->getOperand(0);
  SDValue V2 = N->getOperand(1);
  const unsigned Opc1 = V1.getOpcode();
  const unsigned Opc2 = V2.getOpcode();

  // One input has to be an FADD and the other an FSUB.
  auto IsFAddOrFSub = [](unsigned Opc) {
    return Opc == ISD::FADD || Opc == ISD::FSUB;
  };
  if (!IsFAddOrFSub(Opc1) || !IsFAddOrFSub(Opc2) || Opc1 == Opc2)
    return false;

  // The arithmetic nodes can only be folded away if nothing else uses them.
  if (!(V1->hasOneUse() && V2->hasOneUse()))
    return false;

  // The operand order is taken from the FSUB; the FADD may have its operands
  // commuted relative to it.
  const bool V1IsSub = Opc1 == ISD::FSUB;
  assert((V1IsSub || V2.getOpcode() == ISD::FSUB) && "Unexpected opcode");
  SDValue SubV = V1IsSub ? V1 : V2;
  SDValue AddV = V1IsSub ? V2 : V1;
  SDValue LHS = SubV->getOperand(0);
  SDValue RHS = SubV->getOperand(1);
  const bool SameOrder =
      AddV->getOperand(0) == LHS && AddV->getOperand(1) == RHS;
  const bool Commuted =
      AddV->getOperand(0) == RHS && AddV->getOperand(1) == LHS;
  if (!SameOrder && !Commuted)
    return false;

  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
  bool Op0Even;
  if (!isAddSubOrSubAddMask(Mask, Op0Even))
    return false;

  // It's a subadd if the vector in the even parity is an FADD.
  if (Op0Even)
    IsSubAdd = V1->getOpcode() == ISD::FADD;
  else
    IsSubAdd = V2->getOpcode() == ISD::FADD;

  Opnd0 = LHS;
  Opnd1 = RHS;
  return true;
}
/// Fold a shuffle that interleaves an ISD::FMA node and an X86ISD::FMSUB node
/// over identical operands into a single FMADDSUB or FMSUBADD node.
///
/// \returns the combined node, or an empty SDValue when the pattern does not
/// match.
static SDValue combineShuffleToFMAddSub(SDNode *N,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  // Only target-independent shuffles are handled.
  // FIXME: It would be easy and harmless to use the target shuffle mask
  // extraction tool to support more.
  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
    return SDValue();

  const MVT VT = N->getSimpleValueType(0);
  if (!Subtarget.hasAnyFMA())
    return SDValue();
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  // The pattern is (shuffle fma(a, b, c), X86Fmsub(a, b, c)), with the two
  // nodes allowed in either operand position.
  const SDValue Op0 = N->getOperand(0);
  const SDValue Op1 = N->getOperand(1);
  const bool FMSubIsOp1 = (Op1.getOpcode() == X86ISD::FMSUB);
  const SDValue &FMAdd = FMSubIsOp1 ? Op0 : Op1;
  const SDValue &FMSub = FMSubIsOp1 ? Op1 : Op0;
  if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB)
    return SDValue();
  if (!FMAdd.hasOneUse() || !FMSub.hasOneUse())
    return SDValue();
  // All three operands have to agree between the two nodes.
  for (unsigned OpIdx = 0; OpIdx != 3; ++OpIdx)
    if (FMAdd.getOperand(OpIdx) != FMSub.getOperand(OpIdx))
      return SDValue();

  // The mask has to alternate between the two inputs.
  bool Op0Even;
  if (!isAddSubOrSubAddMask(cast<ShuffleVectorSDNode>(N)->getMask(), Op0Even))
    return SDValue();

  // It's a subadd if the even lanes are taken from the FMA node.
  const bool IsSubAdd = ((Op0Even ? Op0 : Op1) == FMAdd);
  const unsigned NewOpc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
  return DAG.getNode(NewOpc, SDLoc(N), VT, FMAdd.getOperand(0),
                     FMAdd.getOperand(1), FMAdd.getOperand(2));
}
/// Attempt to turn a shuffle into a target-specific add-sub node or a
/// mul-add-sub node.
///
/// \returns the replacement node, or an empty SDValue when nothing matched.
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
                                                const X86Subtarget &Subtarget,
                                                SelectionDAG &DAG) {
  // The blend of an FMA and an FMSUB node is tried first.
  SDValue FMAddSub = combineShuffleToFMAddSub(N, Subtarget, DAG);
  if (FMAddSub)
    return FMAddSub;

  SDValue Opnd0, Opnd1;
  bool IsSubAdd;
  if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
    return SDValue();

  const MVT VT = N->getSimpleValueType(0);
  const SDLoc DL(N);

  // Try to generate an X86ISD::FMADDSUB / X86ISD::FMSUBADD node here.
  SDValue Opnd2;
  if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
    const unsigned FusedOpc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
    return DAG.getNode(FusedOpc, DL, VT, Opnd0, Opnd1, Opnd2);
  }

  // Only the ADDSUB form is emitted below, so a SUBADD match is dropped.
  // ADDSUB is also not generated for 512-bit types even though the idiom has
  // been recognized: there are no known X86 targets with 512-bit ADDSUB
  // instructions!
  if (IsSubAdd || VT.is512BitVector())
    return SDValue();

  return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}
// Matches a shuffle whose two sources are each a two-operand concatenation
// with undef in the upper half, i.e. sources that carry only half of the
// output's width in real data. AVX2 has VPERMD/Q, so expressing this as a
// single-source shuffle of (concat t1, t2) is preferable.
// Returns the rewritten shuffle, or an empty SDValue if nothing matched.
static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
                                           const X86Subtarget &Subtarget) {
  if (!Subtarget.hasAVX2())
    return SDValue();
  if (!isa<ShuffleVectorSDNode>(N))
    return SDValue();

  // Only 128/256-bit vectors of 32/64-bit elements are of interest.
  const EVT VT = N->getValueType(0);
  if (!VT.is128BitVector() && !VT.is256BitVector())
    return SDValue();
  const EVT EltVT = VT.getVectorElementType();
  const bool Is32Or64BitElt = EltVT == MVT::i32 || EltVT == MVT::i64 ||
                              EltVT == MVT::f32 || EltVT == MVT::f64;
  if (!Is32Or64BitElt)
    return SDValue();

  // Both sources have to be of the form (concat_vectors X, undef).
  const auto IsConcatWithUndef = [](SDValue V) {
    return V.getOpcode() == ISD::CONCAT_VECTORS && V.getNumOperands() == 2 &&
           V.getOperand(1).isUndef();
  };
  const SDValue Src0 = N->getOperand(0);
  const SDValue Src1 = N->getOperand(1);
  if (!IsConcatWithUndef(Src0) || !IsConcatWithUndef(Src1))
    return SDValue();

  // Build the new mask. Elements from the first source keep their index,
  // while elements from the second source no longer have to skip an undef
  // half.
  const int NumElts = VT.getVectorNumElements();
  const ArrayRef<int> OldMask = cast<ShuffleVectorSDNode>(N)->getMask();
  SmallVector<int, 8> NewMask(OldMask.begin(), OldMask.end());
  for (int &Elt : NewMask)
    if (Elt >= NumElts)
      Elt -= NumElts / 2;

  const SDLoc DL(N);
  SDValue Joined = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src0.getOperand(0),
                               Src1.getOperand(0));
  return DAG.getVectorShuffle(VT, DL, Joined, DAG.getUNDEF(VT), NewMask);
}
/// Remove a shuffle that only replicates halves of a horizontal math op whose
/// halves are already identical.
///
/// Handles X86ISD::MOVDDUP, X86ISD::VBROADCAST and unary ISD::VECTOR_SHUFFLE
/// nodes. \returns the horizontal op that can replace \p N, or an empty
/// SDValue when the shuffle cannot be dropped.
static SDValue foldShuffleOfHorizOp(SDNode *N) {
  const unsigned Opcode = N->getOpcode();
  const bool IsDupOrBroadcast =
      Opcode == X86ISD::MOVDDUP || Opcode == X86ISD::VBROADCAST;
  if (!IsDupOrBroadcast) {
    // Otherwise only shuffles with an undef second operand are handled.
    if (Opcode != ISD::VECTOR_SHUFFLE)
      return SDValue();
    if (!N->getOperand(1).isUndef())
      return SDValue();
  }

  // For a broadcast, peek through an extract element of index 0 to find the
  // horizontal op: broadcast (ext_vec_elt HOp, 0)
  const EVT VT = N->getValueType(0);
  if (Opcode == X86ISD::VBROADCAST) {
    const SDValue Scalar = N->getOperand(0);
    const bool IsF64ExtractOfElt0 =
        Scalar.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
        Scalar.getValueType() == MVT::f64 &&
        Scalar.getOperand(0).getValueType() == VT &&
        isNullConstant(Scalar.getOperand(1));
    if (IsF64ExtractOfElt0)
      N = Scalar.getNode();
  }

  const SDValue HOp = N->getOperand(0);
  switch (HOp.getOpcode()) {
  case X86ISD::HADD:
  case X86ISD::FHADD:
  case X86ISD::HSUB:
  case X86ISD::FHSUB:
    break;
  default:
    return SDValue();
  }

  // 128-bit horizontal math instructions are defined to operate on adjacent
  // lanes of each operand as:
  // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
  // ...similarly for v2f64 and v8i16.
  // Give up unless one operand is undef or both operands are the same value.
  const SDValue HOp0 = HOp.getOperand(0);
  const SDValue HOp1 = HOp.getOperand(1);
  if (!HOp0.isUndef() && !HOp1.isUndef() && HOp0 != HOp1)
    return SDValue();

  // When the operands of a horizontal math op are identical, the low half of
  // the result is the same as the high half. If a target shuffle is also
  // replicating low and high halves, we don't need the shuffle.
  if (IsDupOrBroadcast) {
    if (HOp.getScalarValueSizeInBits() != 64)
      return SDValue();
    // movddup (hadd X, X) --> hadd X, X
    // broadcast (extract_vec_elt (hadd X, X), 0) --> hadd X, X
    assert((HOp.getValueType() == MVT::v2f64 ||
            HOp.getValueType() == MVT::v4f64) && HOp.getValueType() == VT &&
           "Unexpected type for h-op");
    return HOp;
  }

  // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
  // TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
  // but this should be tied to whatever horizontal op matching and shuffle
  // canonicalization are producing.
  const ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
  const auto HOpBits = HOp.getValueSizeInBits();
  bool ShuffleIsRedundant = false;
  if (HOpBits == 128) {
    ShuffleIsRedundant =
        isTargetShuffleEquivalent(Mask, {0, 0}) ||
        isTargetShuffleEquivalent(Mask, {0, 1, 0, 1}) ||
        isTargetShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3});
  } else if (HOpBits == 256) {
    ShuffleIsRedundant =
        isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}) ||
        isTargetShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) ||
        isTargetShuffleEquivalent(
            Mask, {0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11});
  }
  return ShuffleIsRedundant ? HOp : SDValue();
}
/// Narrow a 256/512-bit shuffle to half its width when its mask leaves the
/// upper half of the result undefined and only reads from the low half of
/// each source vector.
///
/// \returns the value produced by getShuffleHalfVectors, or an empty SDValue
/// when the shuffle does not qualify.
static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
  if (!Shuf->getValueType(0).isSimple())
    return SDValue();

  const MVT VT = Shuf->getSimpleValueType(0);
  const bool IsWideVector = VT.is256BitVector() || VT.is512BitVector();
  if (!IsWideVector)
    return SDValue();

  // All of the high elements of the result have to be ignorable.
  const ArrayRef<int> Mask = Shuf->getMask();
  if (!isUndefUpperHalf(Mask))
    return SDValue();

  // The mask may only access the low half of each input vector
  // (half-index output is 0 or 2).
  int HalfIdx1, HalfIdx2;
  SmallVector<int, 8> HalfMask(Mask.size() / 2);
  if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
    return SDValue();
  if (HalfIdx1 % 2 == 1 || HalfIdx2 % 2 == 1)
    return SDValue();

  // Create a half-width shuffle to replace the unnecessarily wide shuffle.
  // The trick is knowing that all of the insert/extract are actually free
  // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
  // of narrow inputs into a narrow output, and that is always cheaper than
  // the wide shuffle that we started with.
  const SDValue Src0 = Shuf->getOperand(0);
  const SDValue Src1 = Shuf->getOperand(1);
  return getShuffleHalfVectors(SDLoc(Shuf), Src0, Src1, HalfMask, HalfIdx1,
                               HalfIdx2, false, DAG);
}
/// DAG combine entry point for shuffle-like nodes.
///
/// Tries a fixed sequence of independent folds on \p N and returns the first
/// replacement value that one of them produces:
///   1. narrowing of wide shuffles (only if \p N is a ShuffleVectorSDNode),
///   2. ADDSUB / FMADDSUB / FMSUBADD formation and removal of redundant
///      shuffles of horizontal ops (only for legal result types),
///   3. moving a bitcast above a one-use binary op feeding the shuffle,
///   4. forming consecutive loads / broadcasts,
///   5. merging two (concat X, undef) sources into one,
///   6. generic target shuffle combining and demanded-elements simplification,
///   7. several X86ISD::VZEXT_MOVL specific folds,
///   8. turning a truncating shuffle of a PMULUDQ into a v4i32 multiply.
///
/// \returns the replacement value, SDValue(N, 0) when \p N was simplified in
/// place, or an empty SDValue if no fold applied.
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
// Try to shrink the shuffle to half its width first.
if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
if (SDValue V = narrowShuffle(Shuf, DAG))
return V;
// If we have legalized the vector types, look for blends of FADD and FSUB
// nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
SDLoc dl(N);
EVT VT = N->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.isTypeLegal(VT)) {
if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
return AddSub;
if (SDValue HAddSub = foldShuffleOfHorizOp(N))
return HAddSub;
}
// During Type Legalization, when promoting illegal vector types,
// the backend might introduce new shuffle dag nodes and bitcasts.
//
// This code performs the following transformation:
// fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
// (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
//
// We do this only if both the bitcast and the BINOP dag nodes have
// one use. Also, perform this transformation only if the new binary
// operation is legal. This is to avoid introducing dag nodes that
// potentially need to be further expanded (or custom lowered) into a
// less optimal sequence of dag nodes.
if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
N->getOpcode() == ISD::VECTOR_SHUFFLE &&
N->getOperand(0).getOpcode() == ISD::BITCAST &&
N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue BC0 = N0.getOperand(0);
EVT SVT = BC0.getValueType();
unsigned Opcode = BC0.getOpcode();
unsigned NumElts = VT.getVectorNumElements();
// The bitcast source must have exactly half as many elements as the result.
if (BC0.hasOneUse() && SVT.isVector() &&
SVT.getVectorNumElements() * 2 == NumElts &&
TLI.isOperationLegal(Opcode, VT)) {
bool CanFold = false;
switch (Opcode) {
default : break;
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
// isOperationLegal lies for integer ops on floating point types.
CanFold = VT.isInteger();
break;
case ISD::FADD:
case ISD::FSUB:
case ISD::FMUL:
// isOperationLegal lies for floating point ops on integer types.
CanFold = VT.isFloatingPoint();
break;
}
unsigned SVTNumElts = SVT.getVectorNumElements();
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
// The low half of the mask must select the even elements <0, 2, 4, ...>.
for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
// The high half of the mask must be entirely undefined.
for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
CanFold = SVOp->getMaskElt(i) < 0;
if (CanFold) {
SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
}
}
}
// Attempt to combine into a vector load/broadcast.
if (SDValue LD = combineToConsecutiveLoads(VT, N, dl, DAG, Subtarget, true))
return LD;
// For AVX2, we sometimes want to combine
// (vector_shuffle <mask> (concat_vectors t1, undef)
// (concat_vectors t2, undef))
// Into:
// (vector_shuffle <mask> (concat_vectors t1, t2), undef)
// Since the latter can be efficiently lowered with VPERMD/VPERMQ
if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
return ShufConcat;
if (isTargetShuffle(N->getOpcode())) {
SDValue Op(N, 0);
if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
return Shuffle;
// Try recursively combining arbitrary sequences of x86 shuffle
// instructions into higher-order shuffles. We do this after combining
// specific PSHUF instruction sequences into their minimal form so that we
// can evaluate how many specialized shuffle instructions are involved in
// a particular chain.
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
// Simplify source operands based on shuffle mask.
// TODO - merge this into combineX86ShufflesRecursively.
APInt KnownUndef, KnownZero;
// Every element of the result is demanded here.
APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, DCI))
return SDValue(N, 0);
}
// Look for a v2i64/v2f64 VZEXT_MOVL of a node that already produces zeros
// in the upper 64 bits.
// TODO: Can we generalize this using computeKnownBits.
if (N->getOpcode() == X86ISD::VZEXT_MOVL &&
(VT == MVT::v2f64 || VT == MVT::v2i64) &&
N->getOperand(0).getOpcode() == ISD::BITCAST &&
(N->getOperand(0).getOperand(0).getValueType() == MVT::v4f32 ||
N->getOperand(0).getOperand(0).getValueType() == MVT::v4i32)) {
SDValue In = N->getOperand(0).getOperand(0);
switch (In.getOpcode()) {
default:
break;
case X86ISD::CVTP2SI: case X86ISD::CVTP2UI:
case X86ISD::MCVTP2SI: case X86ISD::MCVTP2UI:
case X86ISD::CVTTP2SI: case X86ISD::CVTTP2UI:
case X86ISD::MCVTTP2SI: case X86ISD::MCVTTP2UI:
case X86ISD::CVTSI2P: case X86ISD::CVTUI2P:
case X86ISD::MCVTSI2P: case X86ISD::MCVTUI2P:
case X86ISD::VFPROUND: case X86ISD::VMFPROUND:
// Only applies when the conversion source is a two-element 64-bit vector.
if (In.getOperand(0).getValueType() == MVT::v2f64 ||
In.getOperand(0).getValueType() == MVT::v2i64)
return N->getOperand(0); // return the bitcast
break;
}
}
// Pull subvector inserts into undef through VZEXT_MOVL by making it an
// insert into a zero vector. This helps get VZEXT_MOVL closer to
// scalar_to_vectors where 256/512 are canonicalized to an insert and a
// 128-bit scalar_to_vector. This reduces the number of isel patterns.
if (N->getOpcode() == X86ISD::VZEXT_MOVL && !DCI.isBeforeLegalizeOps() &&
N->getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR &&
N->getOperand(0).hasOneUse() &&
N->getOperand(0).getOperand(0).isUndef() &&
isNullConstant(N->getOperand(0).getOperand(2))) {
SDValue In = N->getOperand(0).getOperand(1);
SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, In.getValueType(), In);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT,
getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl),
Movl, N->getOperand(0).getOperand(2));
}
// If this a vzmovl of a full vector load, replace it with a vzload, unless
// the load is volatile.
if (N->getOpcode() == X86ISD::VZEXT_MOVL && N->getOperand(0).hasOneUse() &&
ISD::isNormalLoad(N->getOperand(0).getNode())) {
LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
if (!LN->isVolatile()) {
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
SDValue VZLoad =
DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
VT.getVectorElementType(),
LN->getPointerInfo(),
LN->getAlignment(),
MachineMemOperand::MOLoad);
// Redirect users of the old load's chain result to the new load's chain.
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
return VZLoad;
}
}
// Look for a truncating shuffle to v2i32 of a PMULUDQ where one of the
// operands is an extend from v2i32 to v2i64. Turn it into a pmulld.
// FIXME: This can probably go away once we default to widening legalization.
if (Subtarget.hasSSE41() && VT == MVT::v4i32 &&
N->getOpcode() == ISD::VECTOR_SHUFFLE &&
N->getOperand(0).getOpcode() == ISD::BITCAST &&
N->getOperand(0).getOperand(0).getOpcode() == X86ISD::PMULUDQ) {
SDValue BC = N->getOperand(0);
SDValue MULUDQ = BC.getOperand(0);
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
ArrayRef<int> Mask = SVOp->getMask();
// The outer mask must be exactly <0, 2, undef, undef>.
if (BC.hasOneUse() && MULUDQ.hasOneUse() &&
Mask[0] == 0 && Mask[1] == 2 && Mask[2] == -1 && Mask[3] == -1) {
SDValue Op0 = MULUDQ.getOperand(0);
SDValue Op1 = MULUDQ.getOperand(1);
// Case 1: the first multiplicand is a bitcast of a v4i32 shuffle with
// mask <0, undef, 1, undef>.
if (Op0.getOpcode() == ISD::BITCAST &&
Op0.getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
Op0.getOperand(0).getValueType() == MVT::v4i32) {
ShuffleVectorSDNode *SVOp0 =
cast<ShuffleVectorSDNode>(Op0.getOperand(0));
ArrayRef<int> Mask2 = SVOp0->getMask();
if (Mask2[0] == 0 && Mask2[1] == -1 &&
Mask2[2] == 1 && Mask2[3] == -1) {
Op0 = SVOp0->getOperand(0);
Op1 = DAG.getBitcast(MVT::v4i32, Op1);
Op1 = DAG.getVectorShuffle(MVT::v4i32, dl, Op1, Op1, Mask);
return DAG.getNode(ISD::MUL, dl, MVT::v4i32, Op0, Op1);
}
}
// Case 2: the same pattern on the second multiplicand.
if (Op1.getOpcode() == ISD::BITCAST &&
Op1.getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
Op1.getOperand(0).getValueType() == MVT::v4i32) {
ShuffleVectorSDNode *SVOp1 =
cast<ShuffleVectorSDNode>(Op1.getOperand(0));
ArrayRef<int> Mask2 = SVOp1->getMask();
if (Mask2[0] == 0 && Mask2[1] == -1 &&
Mask2[2] == 1 && Mask2[3] == -1) {
Op0 = DAG.getBitcast(MVT::v4i32, Op0);
Op0 = DAG.getVectorShuffle(MVT::v4i32, dl, Op0, Op0, Mask);
Op1 = SVOp1->getOperand(0);
return DAG.getNode(ISD::MUL, dl, MVT::v4i32, Op0, Op1);
}
}
}
}
return SDValue();
}
/// Simplify the X86-specific node \p Op given that only the vector elements
/// set in \p DemandedElts are used.
///
/// The function works in three stages:
///   1. a per-opcode switch that recurses into the operands with the element
///      sets they need and, for some opcodes, fills \p KnownZero / \p KnownUndef;
///   2. for 256/512-bit results whose upper half is not demanded, replacing
///      selected ops by a narrower op inserted into an undef vector;
///   3. for target shuffles, decoding the mask and folding the node to undef,
///      zero or one of its inputs, or simplifying the inputs.
///
/// \param Op           the node to simplify.
/// \param DemandedElts one bit per result element; its bit width is used as
///                     the element count.
/// \param KnownUndef   receives elements found to be undef (some opcodes only).
/// \param KnownZero    receives elements found to be zero (some opcodes only).
/// \param TLO          used to create nodes and to commit replacements.
/// \param Depth        recursion depth; passed on as Depth + 1.
/// \returns true when a recursive simplification reported a change, the result
/// of TLO.CombineTo when a replacement is committed here, and false otherwise.
///
/// NOTE(review): lines starting with '-' / '+' below are diff markers carried
/// in this text. They show the SUBV_BROADCAST case being removed from the
/// first switch and re-added in the narrowing switch.
bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
TargetLoweringOpt &TLO, unsigned Depth) const {
int NumElts = DemandedElts.getBitWidth();
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
// Handle special case opcodes.
switch (Opc) {
case X86ISD::PMULDQ:
case X86ISD::PMULUDQ: {
APInt LHSUndef, LHSZero;
APInt RHSUndef, RHSZero;
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
Depth + 1))
return true;
if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
Depth + 1))
return true;
// Multiply by zero.
KnownZero = LHSZero | RHSZero;
break;
}
case X86ISD::VSHL:
case X86ISD::VSRL:
case X86ISD::VSRA: {
// We only need the bottom 64-bits of the (128-bit) shift amount.
SDValue Amt = Op.getOperand(1);
MVT AmtVT = Amt.getSimpleValueType();
assert(AmtVT.is128BitVector() && "Unexpected value type");
// If we reuse the shift amount just for sse shift amounts then we know that
// only the bottom 64-bits are only ever used.
bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
unsigned UseOpc = Use->getOpcode();
return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
UseOpc == X86ISD::VSRA) &&
Use->getOperand(0) != Amt;
});
APInt AmtUndef, AmtZero;
unsigned NumAmtElts = AmtVT.getVectorNumElements();
// Demand only the low half of the shift amount's elements.
APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
Depth + 1, AssumeSingleUse))
return true;
// The shifted source is handled by the shift-by-immediate case below.
LLVM_FALLTHROUGH;
}
case X86ISD::VSHLI:
case X86ISD::VSRLI:
case X86ISD::VSRAI: {
SDValue Src = Op.getOperand(0);
APInt SrcUndef;
if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
Depth + 1))
return true;
// TODO convert SrcUndef to KnownUndef.
break;
}
case X86ISD::CVTSI2P:
case X86ISD::CVTUI2P: {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
APInt SrcUndef, SrcZero;
// Resize the demanded set to the source's element count.
APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
Depth + 1))
return true;
break;
}
case X86ISD::PACKSS:
case X86ISD::PACKUS: {
APInt DemandedLHS, DemandedRHS;
getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
APInt SrcUndef, SrcZero;
if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, SrcUndef,
SrcZero, TLO, Depth + 1))
return true;
if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, SrcUndef,
SrcZero, TLO, Depth + 1))
return true;
break;
}
case X86ISD::HADD:
case X86ISD::HSUB:
case X86ISD::FHADD:
case X86ISD::FHSUB: {
APInt DemandedLHS, DemandedRHS;
getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
APInt LHSUndef, LHSZero;
if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, LHSUndef,
LHSZero, TLO, Depth + 1))
return true;
APInt RHSUndef, RHSZero;
if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, RHSUndef,
RHSZero, TLO, Depth + 1))
return true;
break;
}
case X86ISD::VTRUNC:
case X86ISD::VTRUNCS:
case X86ISD::VTRUNCUS: {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
APInt SrcUndef, SrcZero;
if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
Depth + 1))
return true;
// Resize the source's known elements to the result's element count.
KnownZero = SrcZero.zextOrTrunc(NumElts);
KnownUndef = SrcUndef.zextOrTrunc(NumElts);
break;
}
case X86ISD::BLENDV: {
APInt SelUndef, SelZero;
if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
SelZero, TLO, Depth + 1))
return true;
// TODO: Use SelZero to adjust LHS/RHS DemandedElts.
APInt LHSUndef, LHSZero;
if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
LHSZero, TLO, Depth + 1))
return true;
APInt RHSUndef, RHSZero;
if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
RHSZero, TLO, Depth + 1))
return true;
// An element is known only if it is known in both blended inputs.
KnownZero = LHSZero & RHSZero;
KnownUndef = LHSUndef & RHSUndef;
break;
}
case X86ISD::VBROADCAST: {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
if (!SrcVT.isVector())
return false;
// Don't bother broadcasting if we just need the 0'th element.
if (DemandedElts == 1) {
if (Src.getValueType() != VT)
Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
SDLoc(Op));
return TLO.CombineTo(Op, Src);
}
APInt SrcUndef, SrcZero;
// Only element 0 of the source is demanded.
APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
Depth + 1))
return true;
break;
}
- case X86ISD::SUBV_BROADCAST: {
- // Reduce size of broadcast if we don't need the upper half.
- unsigned HalfElts = NumElts / 2;
- if (DemandedElts.extractBits(HalfElts, HalfElts).isNullValue()) {
- SDValue Src = Op.getOperand(0);
- MVT SrcVT = Src.getSimpleValueType();
-
- SDValue Half = Src;
- if (SrcVT.getVectorNumElements() != HalfElts) {
- MVT HalfVT = MVT::getVectorVT(SrcVT.getScalarType(), HalfElts);
- Half = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, SDLoc(Op), HalfVT, Src);
- }
-
- return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Half, 0,
- TLO.DAG, SDLoc(Op),
- Half.getValueSizeInBits()));
- }
- break;
- }
case X86ISD::VPERMV: {
SDValue Mask = Op.getOperand(0);
APInt MaskUndef, MaskZero;
if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
Depth + 1))
return true;
break;
}
case X86ISD::PSHUFB:
case X86ISD::VPERMV3:
case X86ISD::VPERMILPV: {
SDValue Mask = Op.getOperand(1);
APInt MaskUndef, MaskZero;
if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
Depth + 1))
return true;
break;
}
case X86ISD::VPPERM:
case X86ISD::VPERMIL2: {
SDValue Mask = Op.getOperand(2);
APInt MaskUndef, MaskZero;
if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
Depth + 1))
return true;
break;
}
}
// For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
// demand any of the high elements, then narrow the op to 128/256-bits: e.g.
// (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
if ((VT.is256BitVector() || VT.is512BitVector()) &&
DemandedElts.lshr(NumElts / 2) == 0) {
unsigned SizeInBits = VT.getSizeInBits();
unsigned ExtSizeInBits = SizeInBits / 2;
// See if 512-bit ops only use the bottom 128-bits.
if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
ExtSizeInBits = SizeInBits / 4;
switch (Opc) {
// Zero upper elements.
case X86ISD::VZEXT_MOVL: {
SDLoc DL(Op);
SDValue Ext0 =
extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
SDValue ExtOp =
TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0);
SDValue UndefVec = TLO.DAG.getUNDEF(VT);
SDValue Insert =
insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
return TLO.CombineTo(Op, Insert);
}
+ // Subvector broadcast.
+ case X86ISD::SUBV_BROADCAST: {
+ SDLoc DL(Op);
+ SDValue Src = Op.getOperand(0);
+ if (Src.getValueSizeInBits() > ExtSizeInBits)
+ Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
+ else if (Src.getValueSizeInBits() < ExtSizeInBits) {
+ MVT SrcSVT = Src.getSimpleValueType().getScalarType();
+ MVT SrcVT =
+ MVT::getVectorVT(SrcSVT, ExtSizeInBits / SrcSVT.getSizeInBits());
+ Src = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, DL, SrcVT, Src);
+ }
+ return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Src, 0,
+ TLO.DAG, DL, ExtSizeInBits));
+ }
// Byte shifts by immediate.
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
// Shift by uniform.
case X86ISD::VSHL:
case X86ISD::VSRL:
case X86ISD::VSRA:
// Shift by immediate.
case X86ISD::VSHLI:
case X86ISD::VSRLI:
case X86ISD::VSRAI: {
SDLoc DL(Op);
SDValue Ext0 =
extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
SDValue ExtOp =
TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
SDValue UndefVec = TLO.DAG.getUNDEF(VT);
SDValue Insert =
insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
return TLO.CombineTo(Op, Insert);
}
case X86ISD::VPERMI: {
// Simplify PERMPD/PERMQ to extract_subvector.
// TODO: This should be done in shuffle combining.
if (VT == MVT::v4f64 || VT == MVT::v4i64) {
SmallVector<int, 4> Mask;
DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
SDLoc DL(Op);
SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
SDValue UndefVec = TLO.DAG.getUNDEF(VT);
SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
return TLO.CombineTo(Op, Insert);
}
}
break;
}
// Target Shuffles.
case X86ISD::PSHUFB:
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
// Saturated Packs.
case X86ISD::PACKSS:
case X86ISD::PACKUS:
// Horizontal Ops.
case X86ISD::HADD:
case X86ISD::HSUB:
case X86ISD::FHADD:
case X86ISD::FHSUB: {
SDLoc DL(Op);
MVT ExtVT = VT.getSimpleVT();
// Same scalar type as the result, but only ExtSizeInBits wide.
ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
ExtSizeInBits / ExtVT.getScalarSizeInBits());
SDValue Ext0 =
extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
SDValue Ext1 =
extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, ExtSizeInBits);
SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ext0, Ext1);
SDValue UndefVec = TLO.DAG.getUNDEF(VT);
SDValue Insert =
insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
return TLO.CombineTo(Op, Insert);
}
}
}
// Simplify target shuffles.
if (!isTargetShuffle(Opc) || !VT.isSimple())
return false;
// Get target shuffle mask.
bool IsUnary;
SmallVector<int, 64> OpMask;
SmallVector<SDValue, 2> OpInputs;
if (!getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, OpInputs,
OpMask, IsUnary))
return false;
// Shuffle inputs must be the same type as the result.
if (llvm::any_of(OpInputs,
[VT](SDValue V) { return VT != V.getValueType(); }))
return false;
// Clear known elts that might have been set above.
KnownZero.clearAllBits();
KnownUndef.clearAllBits();
// Check if shuffle mask can be simplified to undef/zero/identity.
int NumSrcs = OpInputs.size();
// Mark mask elements undef when their lane is not demanded or when they read
// from an undef input.
for (int i = 0; i != NumElts; ++i) {
int &M = OpMask[i];
if (!DemandedElts[i])
M = SM_SentinelUndef;
else if (0 <= M && OpInputs[M / NumElts].isUndef())
M = SM_SentinelUndef;
}
if (isUndefInRange(OpMask, 0, NumElts)) {
KnownUndef.setAllBits();
return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
}
if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
KnownZero.setAllBits();
return TLO.CombineTo(
Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
}
// An identity (or undef) mask over one input lets that input replace the op.
for (int Src = 0; Src != NumSrcs; ++Src)
if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
return TLO.CombineTo(Op, OpInputs[Src]);
// Attempt to simplify inputs.
for (int Src = 0; Src != NumSrcs; ++Src) {
// Mask indices [Lo, Lo + NumElts) refer to this input.
int Lo = Src * NumElts;
APInt SrcElts = APInt::getNullValue(NumElts);
for (int i = 0; i != NumElts; ++i)
if (DemandedElts[i]) {
int M = OpMask[i] - Lo;
if (0 <= M && M < NumElts)
SrcElts.setBit(M);
}
APInt SrcUndef, SrcZero;
if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
TLO, Depth + 1))
return true;
}
// Extract known zero/undef elements.
// TODO - Propagate input undef/zero elts.
for (int i = 0; i != NumElts; ++i) {
if (OpMask[i] == SM_SentinelUndef)
KnownUndef.setBit(i);
if (OpMask[i] == SM_SentinelZero)
KnownZero.setBit(i);
}
return false;
}
bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
SDValue Op, const APInt &OriginalDemandedBits,
const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
unsigned Depth) const {
EVT VT = Op.getValueType();
unsigned BitWidth = OriginalDemandedBits.getBitWidth();
unsigned Opc = Op.getOpcode();
switch(Opc) {
case X86ISD::PMULDQ:
case X86ISD::PMULUDQ: {
// PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
KnownBits KnownOp;
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
// FIXME: Can we bound this better?
APInt DemandedMask = APInt::getLowBitsSet(64, 32);
if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp,
TLO, Depth + 1))
return true;
if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp,
TLO, Depth + 1))
return true;
break;
}
case X86ISD::VSHLI: {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op1)) {
if (ShiftImm->getAPIntValue().uge(BitWidth))
break;
unsigned ShAmt = ShiftImm->getZExtValue();
APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
// If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
// single shift. We can do this if the bottom bits (which are shifted
// out) are never demanded.
if (Op0.getOpcode() == X86ISD::VSRLI &&
OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
if (auto *Shift2Imm = dyn_cast<ConstantSDNode>(Op0.getOperand(1))) {
if (Shift2Imm->getAPIntValue().ult(BitWidth)) {
int Diff = ShAmt - Shift2Imm->getZExtValue();
if (Diff == 0)
return TLO.CombineTo(Op, Op0.getOperand(0));
unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
SDValue NewShift = TLO.DAG.getNode(
NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
TLO.DAG.getConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
return TLO.CombineTo(Op, NewShift);
}
}
}
if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
TLO, Depth + 1))
return true;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
Known.Zero <<= ShAmt;
Known.One <<= ShAmt;
// Low bits known zero.
Known.Zero.setLowBits(ShAmt);
}
break;
}
case X86ISD::VSRLI: {
if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
if (ShiftImm->getAPIntValue().uge(BitWidth))
break;
unsigned ShAmt = ShiftImm->getZExtValue();
APInt DemandedMask = OriginalDemandedBits << ShAmt;
if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
OriginalDemandedElts, Known, TLO, Depth + 1))
return true;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
Known.Zero.lshrInPlace(ShAmt);
Known.One.lshrInPlace(ShAmt);
// High bits known zero.
Known.Zero.setHighBits(ShAmt);
}
break;
}
case X86ISD::VSRAI: {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op1)) {
if (ShiftImm->getAPIntValue().uge(BitWidth))
break;
unsigned ShAmt = ShiftImm->getZExtValue();
APInt DemandedMask = OriginalDemandedBits << ShAmt;
// If we just want the sign bit then we don't need to shift it.
if (OriginalDemandedBits.isSignMask())
return TLO.CombineTo(Op, Op0);
// fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
if (Op0.getOpcode() == X86ISD::VSHLI && Op1 == Op0.getOperand(1)) {
SDValue Op00 = Op0.getOperand(0);
unsigned NumSignBits =
TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
if (ShAmt < NumSignBits)
return TLO.CombineTo(Op, Op00);
}
// If any of the demanded bits are produced by the sign extension, we also
// demand the input sign bit.
if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
DemandedMask.setSignBit();
if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
TLO, Depth + 1))
return true;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
Known.Zero.lshrInPlace(ShAmt);
Known.One.lshrInPlace(ShAmt);
// If the input sign bit is known to be zero, or if none of the top bits
// are demanded, turn this into an unsigned shift right.
if (Known.Zero[BitWidth - ShAmt - 1] ||
OriginalDemandedBits.countLeadingZeros() >= ShAmt)
return TLO.CombineTo(
Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
// High bits are known one.
if (Known.One[BitWidth - ShAmt - 1])
Known.One.setHighBits(ShAmt);
}
break;
}
case X86ISD::PEXTRB:
case X86ISD::PEXTRW: {
SDValue Vec = Op.getOperand(0);
auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
MVT VecVT = Vec.getSimpleValueType();
unsigned NumVecElts = VecVT.getVectorNumElements();
if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
unsigned Idx = CIdx->getZExtValue();
unsigned VecBitWidth = VecVT.getScalarSizeInBits();
// If we demand no bits from the vector then we must have demanded
// bits from the implicit zext - simplify to zero.
APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
if (DemandedVecBits == 0)
return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
APInt KnownUndef, KnownZero;
APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
KnownZero, TLO, Depth + 1))
return true;
KnownBits KnownVec;
if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
KnownVec, TLO, Depth + 1))
return true;
Known = KnownVec.zext(BitWidth, true);
return false;
}
break;
}
case X86ISD::PINSRB:
case X86ISD::PINSRW: {
SDValue Vec = Op.getOperand(0);
SDValue Scl = Op.getOperand(1);
auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
MVT VecVT = Vec.getSimpleValueType();
if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
unsigned Idx = CIdx->getZExtValue();
if (!OriginalDemandedElts[Idx])
return TLO.CombineTo(Op, Vec);
KnownBits KnownVec;
APInt DemandedVecElts(OriginalDemandedElts);
DemandedVecElts.clearBit(Idx);
if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
KnownVec, TLO, Depth + 1))
return true;
KnownBits KnownScl;
unsigned NumSclBits = Scl.getScalarValueSizeInBits();
APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
return true;
KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
Known.One = KnownVec.One & KnownScl.One;
Known.Zero = KnownVec.Zero & KnownScl.Zero;
return false;
}
break;
}
case X86ISD::PACKSS:
// PACKSS saturates to MIN/MAX integer values. So if we just want the
// sign bit then we can just ask for the source operands sign bit.
// TODO - add known bits handling.
if (OriginalDemandedBits.isSignMask()) {
APInt DemandedLHS, DemandedRHS;
getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
KnownBits KnownLHS, KnownRHS;
APInt SignMask = APInt::getSignMask(BitWidth * 2);
if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
KnownLHS, TLO, Depth + 1))
return true;
if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
KnownRHS, TLO, Depth + 1))
return true;
}
// TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
break;
case X86ISD::PCMPGT:
// icmp sgt(0, R) == ashr(R, BitWidth-1).
// iff we only need the sign bit then we can use R directly.
if (OriginalDemandedBits.isSignMask() &&
ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
return TLO.CombineTo(Op, Op.getOperand(1));
break;
case X86ISD::MOVMSK: {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
unsigned SrcBits = SrcVT.getScalarSizeInBits();
unsigned NumElts = SrcVT.getVectorNumElements();
// If we don't need the sign bits at all just return zero.
if (OriginalDemandedBits.countTrailingZeros() >= NumElts)
return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
// Only demand the vector elements of the sign bits we need.
APInt KnownUndef, KnownZero;
APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
TLO, Depth + 1))
return true;
Known.Zero = KnownZero.zextOrSelf(BitWidth);
Known.Zero.setHighBits(BitWidth - NumElts);
// MOVMSK only uses the MSB from each vector element.
KnownBits KnownSrc;
if (SimplifyDemandedBits(Src, APInt::getSignMask(SrcBits), DemandedElts,
KnownSrc, TLO, Depth + 1))
return true;
if (KnownSrc.One[SrcBits - 1])
Known.One.setLowBits(NumElts);
else if (KnownSrc.Zero[SrcBits - 1])
Known.Zero.setLowBits(NumElts);
return false;
}
}
return TargetLowering::SimplifyDemandedBitsForTargetNode(
Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
}
/// Check if a vector extract from a target-specific shuffle of a load can be
/// folded into a single element load.
/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
/// shuffles have been custom lowered so we need to handle those here.
///
/// \p N is the extract node: operand 0 is the source vector and operand 1 is
/// the element index. Returns an empty SDValue when the fold does not apply.
/// Otherwise returns a zero constant or UNDEF (when the decoded shuffle mask
/// says the lane is zero/undef), or an EXTRACT_VECTOR_ELT of a rebuilt generic
/// vector shuffle that the DAG combiner can then finish folding.
static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
                                         TargetLowering::DAGCombinerInfo &DCI) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDValue InVec = N->getOperand(0);
  SDValue EltNo = N->getOperand(1);
  EVT EltVT = N->getValueType(0);

  // Only constant extraction indices are handled.
  auto *EltIdx = dyn_cast<ConstantSDNode>(EltNo);
  if (!EltIdx)
    return SDValue();

  EVT OriginalVT = InVec.getValueType();

  // Peek through bitcasts, don't duplicate a load with other uses.
  InVec = peekThroughOneUseBitcasts(InVec);

  // The bitcast must not have changed the element count, otherwise the
  // extraction index would no longer line up with the shuffle mask.
  EVT CurrentVT = InVec.getValueType();
  if (!CurrentVT.isVector() ||
      CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
    return SDValue();

  if (!isTargetShuffle(InVec.getOpcode()))
    return SDValue();

  // Don't duplicate a load with other uses.
  if (!InVec.hasOneUse())
    return SDValue();

  SmallVector<int, 16> ShuffleMask;
  SmallVector<SDValue, 2> ShuffleOps;
  bool UnaryShuffle;
  if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
                            ShuffleOps, ShuffleMask, UnaryShuffle))
    return SDValue();

  // Select the input vector, guarding against out of range extract vector.
  // The comparison is done on the APInt so that an index equal to NumElems
  // (one past the end) and indices too large for an int are both rejected
  // before ShuffleMask is indexed.
  unsigned NumElems = CurrentVT.getVectorNumElements();
  int Idx = EltIdx->getAPIntValue().uge(NumElems)
                ? SM_SentinelUndef
                : ShuffleMask[EltIdx->getZExtValue()];

  if (Idx == SM_SentinelZero)
    return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
                             : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
  if (Idx == SM_SentinelUndef)
    return DAG.getUNDEF(EltVT);

  // Bail if any mask element is SM_SentinelZero - getVectorShuffle below
  // won't handle it.
  if (llvm::any_of(ShuffleMask, [](int M) { return M == SM_SentinelZero; }))
    return SDValue();

  assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
  SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0] : ShuffleOps[1];

  // If inputs to shuffle are the same for both ops, then allow 2 uses
  unsigned AllowedUses =
      (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;

  if (LdNode.getOpcode() == ISD::BITCAST) {
    // Don't duplicate a load with other uses.
    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
      return SDValue();

    AllowedUses = 1; // only allow 1 load use if we have a bitcast
    LdNode = LdNode.getOperand(0);
  }

  if (!ISD::isNormalLoad(LdNode.getNode()))
    return SDValue();

  // isNormalLoad succeeded above, so this cast<> cannot yield null; only the
  // use count and volatility still need checking.
  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
  if (!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
    return SDValue();

  // If there's a bitcast before the shuffle, check if the load type and
  // alignment is valid.
  unsigned Align = LN0->getAlignment();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
      EltVT.getTypeForEVT(*DAG.getContext()));

  if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
    return SDValue();

  // All checks match so transform back to vector_shuffle so that DAG combiner
  // can finish the job
  SDLoc dl(N);

  // Create shuffle node taking into account the case that its a unary shuffle
  SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
  Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
                                 ShuffleMask);
  Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
                     EltNo);
}
// Returns true if every SETCC leaf reachable from Src through AND/OR/XOR nodes
// compares operands that are exactly Size bits wide. Any other opcode on the
// way makes the answer false. Used by combineBitcastvxi1 to work out what size
// of vector produced a <X x i1>.
static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
  const unsigned Opc = Src.getOpcode();

  // Leaf: the compare itself tells us the source vector width.
  if (Opc == ISD::SETCC)
    return Src.getOperand(0).getValueSizeInBits() == Size;

  // Anything that is not a bitwise logic op cannot be looked through.
  if (Opc != ISD::AND && Opc != ISD::XOR && Opc != ISD::OR)
    return false;

  // Both inputs of the logic op must agree on the size.
  if (!checkBitcastSrcVectorSize(Src.getOperand(0), Size))
    return false;
  return checkBitcastSrcVectorSize(Src.getOperand(1), Size);
}
// Try to match patterns such as
// (i16 bitcast (v16i1 x))
// ->
// (i16 movmsk (v16i8 sext (v16i1 x)))
// before the illegal vector is scalarized on subtargets that don't have legal
// vxi1 types.
//
// VT is the type the final value is bitcast to and Src is the vXi1 value being
// bitcast. Returns an empty SDValue when Src is not a simple vXi1 type, when
// the subtarget lacks SSE2, when AVX512 k-registers are preferred, or when no
// sign-extended type is chosen for the element count.
static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
const SDLoc &DL,
const X86Subtarget &Subtarget) {
EVT SrcVT = Src.getValueType();
if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
return SDValue();
// If the input is a truncate from v16i8, v32i8 or v64i8 go ahead and use a
// movmskb even with avx512. This will be better than truncating to vXi1 and
// using a kmov. This can especially help KNL if the input is a v16i8/v32i8
// vpcmpeqb/vpcmpgtb.
bool IsTruncated = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
(Src.getOperand(0).getValueType() == MVT::v16i8 ||
Src.getOperand(0).getValueType() == MVT::v32i8 ||
Src.getOperand(0).getValueType() == MVT::v64i8);
// With AVX512 vxi1 types are legal and we prefer using k-regs.
// MOVMSK is supported in SSE2 or later.
if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !IsTruncated))
return SDValue();
// There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
// v8f64. So all legal 128-bit and 256-bit vectors are covered except for
// v8i16 and v16i16.
// For these two cases, we can shuffle the upper element bytes to a
// consecutive sequence at the start of the vector and treat the results as
// v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
// for v16i16 this is not the case, because the shuffle is expensive, so we
// avoid sign-extending to this type entirely.
// For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
// (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
// SExtVT is the vector type Src is sign-extended to before the MOVMSK.
MVT SExtVT;
switch (SrcVT.getSimpleVT().SimpleTy) {
default:
return SDValue();
case MVT::v2i1:
SExtVT = MVT::v2i64;
break;
case MVT::v4i1:
SExtVT = MVT::v4i32;
// For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
// sign-extend to a 256-bit operation to avoid truncation.
if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256))
SExtVT = MVT::v4i64;
break;
case MVT::v8i1:
SExtVT = MVT::v8i16;
// For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
// sign-extend to a 256-bit operation to match the compare.
// If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
// 256-bit because the shuffle is cheaper than sign extending the result of
// the compare.
// TODO : use checkBitcastSrcVectorSize
if (Src.getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
(Src.getOperand(0).getValueType().is256BitVector() ||
Src.getOperand(0).getValueType().is512BitVector())) {
SExtVT = MVT::v8i32;
}
break;
case MVT::v16i1:
SExtVT = MVT::v16i8;
// For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
// it is not profitable to sign-extend to 256-bit because this will
// require an extra cross-lane shuffle which is more expensive than
// truncating the result of the compare to 128-bits.
break;
case MVT::v32i1:
SExtVT = MVT::v32i8;
break;
case MVT::v64i1:
// If we have AVX512F, but not AVX512BW and the input is truncated from
// v64i8 checked earlier. Then split the input and make two pmovmskbs.
if (Subtarget.hasAVX512() && !Subtarget.hasBWI()) {
SExtVT = MVT::v64i8;
break;
}
return SDValue();
};
SDValue V = DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
if (SExtVT == MVT::v64i8) {
// Split into two halves, take a 32-bit MOVMSK of each, and recombine them
// into one i64 as Lo | (Hi << 32). Hi may be any-extended because its upper
// 32 bits are shifted out.
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
DAG.getConstant(32, DL, MVT::i8));
V = DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
} else if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8) {
V = getPMOVMSKB(DL, V, DAG, Subtarget);
} else {
// v8i16 has no MOVMSK of its own: pack it down to bytes first (the second
// PACKSS input is undef), then take the byte mask.
if (SExtVT == MVT::v8i16)
V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
DAG.getUNDEF(MVT::v8i16));
V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
}
// Resize the mask to one bit per source element, then bitcast to the
// requested result type.
EVT IntVT =
EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
V = DAG.getZExtOrTrunc(V, DL, IntVT);
return DAG.getBitcast(VT, V);
}
// Fold a constant vXi1 build vector into a scalar integer constant with one bit
// per vector element: bit I is set when operand I is a constant whose low bit
// is 1. Undef operands contribute a zero bit.
static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
  EVT VecVT = Op.getValueType();
  assert(VecVT.getVectorElementType() == MVT::i1 &&
         "Expected a vXi1 vector");
  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
         "Expected a constant build vector");

  const unsigned NumBits = VecVT.getVectorNumElements();
  APInt Bits(NumBits, 0);

  const unsigned NumOps = Op.getNumOperands();
  for (unsigned I = 0; I != NumOps; ++I) {
    SDValue Elt = Op.getOperand(I);
    if (Elt.isUndef())
      continue;
    if (cast<ConstantSDNode>(Elt)->getZExtValue() & 0x1)
      Bits.setBit(I);
  }

  // The result is as wide as the vector has elements.
  EVT ResultVT = EVT::getIntegerVT(*DAG.getContext(), NumBits);
  return DAG.getConstant(Bits, SDLoc(Op), ResultVT);
}
// Given a bitcast N of a single-use AND/OR/XOR between a vXi1 mask type and a
// scalar integer type (in either direction), try to perform the logic op in
// the destination type instead, so that bitcasts on the operands cancel out.
// Returns an empty SDValue when the pattern does not match.
static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
                                           TargetLowering::DAGCombinerInfo &DCI,
                                           const X86Subtarget &Subtarget) {
  assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");

  // Only before op legalization, and only do this if we have k-registers.
  if (!DCI.isBeforeLegalizeOps() || !Subtarget.hasAVX512())
    return SDValue();

  EVT DstVT = N->getValueType(0);
  SDValue Logic = N->getOperand(0);
  EVT SrcVT = Logic.getValueType();

  if (!Logic.hasOneUse())
    return SDValue();

  // Look for logic ops.
  const unsigned LogicOpc = Logic.getOpcode();
  switch (LogicOpc) {
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    break;
  default:
    return SDValue();
  }

  // Make sure we have a bitcast between mask registers and a scalar type.
  auto IsMaskVT = [](EVT Ty) {
    return Ty.isVector() && Ty.getVectorElementType() == MVT::i1;
  };
  const bool MaskToScalar = IsMaskVT(SrcVT) && DstVT.isScalarInteger();
  const bool ScalarToMask = IsMaskVT(DstVT) && SrcVT.isScalarInteger();
  if (!MaskToScalar && !ScalarToMask)
    return SDValue();

  SDValue LHS = Logic.getOperand(0);
  SDValue RHS = Logic.getOperand(1);

  // True when V is a single-use bitcast whose input already has DstVT.
  auto IsOneUseCastFromDst = [&DstVT](SDValue V) {
    return V.hasOneUse() && V.getOpcode() == ISD::BITCAST &&
           V.getOperand(0).getValueType() == DstVT;
  };

  if (IsOneUseCastFromDst(LHS)) {
    SDValue NewRHS = DAG.getBitcast(DstVT, RHS);
    return DAG.getNode(LogicOpc, SDLoc(N), DstVT, LHS.getOperand(0), NewRHS);
  }

  if (IsOneUseCastFromDst(RHS)) {
    SDValue NewLHS = DAG.getBitcast(DstVT, LHS);
    return DAG.getNode(LogicOpc, SDLoc(N), DstVT, NewLHS, RHS.getOperand(0));
  }

  // If the RHS is a vXi1 build vector, this is a good reason to flip too.
  // Most of these have to move a constant from the scalar domain anyway.
  if (!ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
    return SDValue();

  RHS = combinevXi1ConstantToInteger(RHS, DAG);
  SDValue NewLHS = DAG.getBitcast(DstVT, LHS);
  return DAG.getNode(LogicOpc, SDLoc(N), DstVT, NewLHS, RHS);
}
// Build an x86mmx value from the operands of the build vector BV.
// A splat is broadcast with (PUNPCKLBW+)PSHUFW when SSE1 is available;
// otherwise every operand is first moved into its own x86mmx value and the
// values are then merged pairwise with PUNPCKL intrinsics until one remains.
static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(BV);
unsigned NumElts = BV->getNumOperands();
SDValue Splat = BV->getSplatValue();
// Build MMX element from integer GPR or SSE float values.
auto CreateMMXElement = [&](SDValue V) {
if (V.isUndef())
return DAG.getUNDEF(MVT::x86mmx);
if (V.getValueType().isFloatingPoint()) {
// Non-constant floats with SSE1 go through a vector and MOVDQ2Q;
// other floats are bitcast to i32 and take the integer path below.
if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
V = DAG.getBitcast(MVT::v2i64, V);
return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
}
V = DAG.getBitcast(MVT::i32, V);
} else {
V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
}
return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
};
// Convert build vector ops to MMX data in the bottom elements.
SmallVector<SDValue, 8> Ops;
// Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
if (Splat) {
if (Splat.isUndef())
return DAG.getUNDEF(MVT::x86mmx);
Splat = CreateMMXElement(Splat);
if (Subtarget.hasSSE1()) {
// Unpack v8i8 to splat i8 elements to lowest 16-bits.
if (NumElts == 8)
Splat = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
DAG.getConstant(Intrinsic::x86_mmx_punpcklbw, DL, MVT::i32), Splat,
Splat);
// Use PSHUFW to repeat 16-bit elements.
// NOTE(review): immediate 0 presumably repeats 16-bit element 0 in every
// lane and 0x44 repeats the low two elements (for the 2-element case) -
// confirm against the PSHUFW immediate encoding.
unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
return DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
DAG.getConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32), Splat,
DAG.getConstant(ShufMask, DL, MVT::i8));
}
// Without SSE1 there is no PSHUFW: fall through to the generic unpack tree
// with NumElts copies of the splat element.
Ops.append(NumElts, Splat);
} else {
for (unsigned i = 0; i != NumElts; ++i)
Ops.push_back(CreateMMXElement(BV->getOperand(i)));
}
// Use tree of PUNPCKLs to build up general MMX vector.
// Each pass halves the number of values; the unpack width is chosen from the
// number of values left (2 -> dq, 4 -> wd, otherwise bw).
// NOTE(review): the pairwise loop reads Ops[i + 1], so it assumes NumOps is
// even on every pass - confirm callers only pass 2/4/8-element vectors.
while (Ops.size() > 1) {
unsigned NumOps = Ops.size();
unsigned IntrinOp =
(NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
: (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
: Intrinsic::x86_mmx_punpcklbw));
SDValue Intrin = DAG.getConstant(IntrinOp, DL, MVT::i32);
for (unsigned i = 0; i != NumOps; i += 2)
Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
Ops[i], Ops[i + 1]);
Ops.resize(NumOps / 2);
}
return Ops[0];
}
// DAG combine for a bitcast node N. Tries, in order:
//  - before legalization: vXi1 -> integer via MOVMSK, and widening of
//    v2i1/v4i1 <-> integer bitcasts on AVX512;
//  - bitcasts to x86mmx from constants, build vectors, extracts and
//    FP_TO_SINT;
//  - constant vXi1 <-> integer bitcasts on AVX512;
//  - moving bitcasts across mask logic ops;
//  - turning a bitcasted integer AND/OR/XOR with a bitcasted FP operand into
//    the corresponding X86 FP logic op.
// Returns an empty SDValue when none of the patterns match.
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT SrcVT = N0.getValueType();
// Try to match patterns such as
// (i16 bitcast (v16i1 x))
// ->
// (i16 movmsk (v16i8 sext (v16i1 x)))
// before the setcc result is scalarized on subtargets that don't have legal
// vxi1 types.
if (DCI.isBeforeLegalize()) {
SDLoc dl(N);
if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
return V;
// If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
// type, widen both sides to avoid a trip through memory.
// (Integer -> mask: any-extend to i8, bitcast to v8i1, extract the low
// subvector.)
if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
Subtarget.hasAVX512()) {
N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
N0 = DAG.getBitcast(MVT::v8i1, N0);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
DAG.getIntPtrConstant(0, dl));
}
// Likewise for the opposite direction (mask -> integer): pad the v4i1/v2i1
// source with undef up to v8i1, bitcast to i8 and truncate to the result
// type, again avoiding a trip through memory.
if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
Subtarget.hasAVX512()) {
unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
Ops[0] = N0;
N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
N0 = DAG.getBitcast(MVT::i8, N0);
return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
}
}
// Since MMX types are special and don't usually play with other vector types,
// it's better to handle them early to be sure we emit efficient code by
// avoiding store-load conversions.
if (VT == MVT::x86mmx) {
// Detect MMX constant vectors.
APInt UndefElts;
SmallVector<APInt, 1> EltBits;
if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
SDLoc DL(N0);
// Handle zero-extension of i32 with MOVD.
if (EltBits[0].countLeadingZeros() >= 32)
return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
// Else, bitcast to a double.
// TODO - investigate supporting sext 32-bit immediates on x86_64.
APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
}
// Detect bitcasts to x86mmx low word.
if (N0.getOpcode() == ISD::BUILD_VECTOR &&
(SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
// Scan every operand except element 0. LowUndef stays true only while all
// operands in the lower half (i < e/2) are undef; AllUndefOrZero stays true
// only while every operand is undef or a zero constant.
bool LowUndef = true, AllUndefOrZero = true;
for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
SDValue Op = N0.getOperand(i);
LowUndef &= Op.isUndef() || (i >= e/2);
AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
}
if (AllUndefOrZero) {
// Only element 0 matters: move it in as an i32, any-extending when the
// rest of the low half is undef and zero-extending otherwise.
SDValue N00 = N0.getOperand(0);
SDLoc dl(N00);
N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
: DAG.getZExtOrTrunc(N00, dl, MVT::i32);
return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
}
}
// Detect bitcasts of 64-bit build vectors and convert to a
// MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
// lowest element.
if (N0.getOpcode() == ISD::BUILD_VECTOR &&
(SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
SrcVT == MVT::v8i8))
return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
// Detect bitcasts between element or subvector extraction to x86mmx.
if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
isNullConstant(N0.getOperand(1))) {
SDValue N00 = N0.getOperand(0);
if (N00.getValueType().is128BitVector())
return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
DAG.getBitcast(MVT::v2i64, N00));
}
// Detect bitcasts from FP_TO_SINT to x86mmx.
if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
SDLoc DL(N0);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
DAG.getUNDEF(MVT::v2i32));
return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
DAG.getBitcast(MVT::v2i64, Res));
}
}
// Try to remove a bitcast of constant vXi1 vector. We have to legalize
// most of these to scalar anyway.
if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
return combinevXi1ConstantToInteger(N0, DAG);
}
// Conversely, a bitcast of an all-ones or all-zeros integer constant to a
// vXi1 type becomes a constant of that vector type directly.
if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
isa<ConstantSDNode>(N0)) {
auto *C = cast<ConstantSDNode>(N0);
if (C->isAllOnesValue())
return DAG.getConstant(1, SDLoc(N0), VT);
if (C->isNullValue())
return DAG.getConstant(0, SDLoc(N0), VT);
}
// Try to remove bitcasts from input and output of mask arithmetic to
// remove GPR<->K-register crossings.
if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
return V;
// Convert a bitcasted integer logic operation that has one bitcasted
// floating-point operand into a floating-point logic operation. This may
// create a load of a constant, but that is cheaper than materializing the
// constant in an integer register and transferring it to an SSE register or
// transferring the SSE operand to integer register and back.
unsigned FPOpcode;
switch (N0.getOpcode()) {
case ISD::AND: FPOpcode = X86ISD::FAND; break;
case ISD::OR: FPOpcode = X86ISD::FOR; break;
case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
default: return SDValue();
}
// Only f32 (needs SSE1) and f64 (needs SSE2) results are handled.
if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
(Subtarget.hasSSE2() && VT == MVT::f64)))
return SDValue();
SDValue LogicOp0 = N0.getOperand(0);
SDValue LogicOp1 = N0.getOperand(1);
SDLoc DL0(N0);
// bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
!isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
}
// bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
!isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
}
return SDValue();
}
// Given a ABS node, detect the following pattern:
// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
// This is useful as it is the input into a SAD pattern.
static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
SDValue AbsOp1 = Abs->getOperand(0);
if (AbsOp1.getOpcode() != ISD::SUB)
return false;
Op0 = AbsOp1.getOperand(0);
Op1 = AbsOp1.getOperand(1);
// Check if the operands of the sub are zero-extended from vectors of i8.
if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
Op1.getOpcode() != ISD::ZERO_EXTEND ||
Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
return false;
return true;
}
// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
// to these zexts. The i8 inputs are padded with zero vectors up to at least
// 128 bits before the PSADBW is built.
static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
                            const SDValue &Zext1, const SDLoc &DL,
                            const X86Subtarget &Subtarget) {
  // Find the appropriate width for the PSADBW: never narrower than 128 bits.
  EVT NarrowVT = Zext0.getOperand(0).getValueType();
  unsigned RegSize = std::max(128u, NarrowVT.getSizeInBits());
  MVT WideVT = MVT::getVectorVT(MVT::i8, RegSize / 8);

  // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
  // fill in the missing vector elements with 0. The same padding vector is
  // reused for both inputs; only slot 0 changes.
  unsigned NumParts = RegSize / NarrowVT.getSizeInBits();
  SmallVector<SDValue, 16> Parts(NumParts, DAG.getConstant(0, DL, NarrowVT));

  Parts[0] = Zext0.getOperand(0);
  SDValue Wide0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, WideVT, Parts);

  Parts[0] = Zext1.getOperand(0);
  SDValue Wide1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, WideVT, Parts);

  // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
  // The result type of each piece has one i64 per 64 bits of input.
  auto BuildSad = [](SelectionDAG &DAG, const SDLoc &DL,
                     ArrayRef<SDValue> Ops) {
    unsigned NumI64 = Ops[0].getValueSizeInBits() / 64;
    MVT VT = MVT::getVectorVT(MVT::i64, NumI64);
    return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
  };

  MVT ResultVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
  return SplitOpsAndApply(DAG, Subtarget, DL, ResultVT, { Wide0, Wide1 },
                          BuildSad);
}
// Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
// PHMINPOSUW.
//
// Extract is the scalar extract at the end of the reduction. Returns an empty
// SDValue unless SSE41 is available, the extracted type is i16 or i8, and a
// SMAX/SMIN/UMAX/UMIN reduction over a vector whose size is a multiple of
// 128 bits is matched.
static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Bail without SSE41.
if (!Subtarget.hasSSE41())
return SDValue();
EVT ExtractVT = Extract->getValueType(0);
if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
return SDValue();
// Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
ISD::NodeType BinOp;
SDValue Src = DAG.matchBinOpReduction(
Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN});
if (!Src)
return SDValue();
// The reduction source must have the extracted element type and consist of
// whole 128-bit chunks.
EVT SrcVT = Src.getValueType();
EVT SrcSVT = SrcVT.getScalarType();
if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
return SDValue();
SDLoc DL(Extract);
SDValue MinPos = Src;
// First, reduce the source down to 128-bit, applying BinOp to lo/hi.
while (SrcVT.getSizeInBits() > 128) {
unsigned NumElts = SrcVT.getVectorNumElements();
unsigned NumSubElts = NumElts / 2;
SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts);
unsigned SubSizeInBits = SrcVT.getSizeInBits();
SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits);
SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);
MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
}
assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
(SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
"Unexpected value type");
// PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
// to flip the value accordingly.
// The same mask is XORed in before and after the PHMINPOS. For UMIN no mask
// is needed and Mask stays empty.
SDValue Mask;
unsigned MaskEltsBits = ExtractVT.getSizeInBits();
if (BinOp == ISD::SMAX)
Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
else if (BinOp == ISD::SMIN)
Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
else if (BinOp == ISD::UMAX)
Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
if (Mask)
MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
// For v16i8 cases we need to perform UMIN on pairs of byte elements,
// shuffling each upper element down and insert zeros. This means that the
// v16i8 UMIN will leave the upper element as zero, performing zero-extension
// ready for the PHMINPOS.
if (ExtractVT == MVT::i8) {
// Shuffle indices 16 and up select from the second (all-zero) operand.
SDValue Upper = DAG.getVectorShuffle(
SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
{1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
}
// Perform the PHMINPOS on a v8i16 vector.
MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
MinPos = DAG.getBitcast(SrcVT, MinPos);
// Undo the pre-PHMINPOS flip so the result is in the original domain.
if (Mask)
MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
// The reduced value is read from element 0.
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
DAG.getIntPtrConstant(0, DL));
}
// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
//
// Extract is the scalar extract at the end of the reduction. OR reductions
// become "MOVMSK != 0", AND reductions become "MOVMSK == all-ones mask", and
// XOR reductions (vXi1 only) become the low bit of CTPOP(MOVMSK). Returns an
// empty SDValue when the pattern or subtarget does not qualify.
static SDValue combineHorizontalPredicateResult(SDNode *Extract,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Bail without SSE2.
if (!Subtarget.hasSSE2())
return SDValue();
EVT ExtractVT = Extract->getValueType(0);
unsigned BitWidth = ExtractVT.getSizeInBits();
if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
return SDValue();
// Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
// XOR is only tried for i1 extracts.
ISD::NodeType BinOp;
SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
if (!Match && ExtractVT == MVT::i1)
Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
if (!Match)
return SDValue();
// EXTRACT_VECTOR_ELT can require implicit extension of the vector element
// which we can't support here for now.
if (Match.getScalarValueSizeInBits() != BitWidth)
return SDValue();
SDValue Movmsk;
SDLoc DL(Extract);
EVT MatchVT = Match.getValueType();
unsigned NumElts = MatchVT.getVectorNumElements();
if (ExtractVT == MVT::i1) {
// Special case for (pre-legalization) vXi1 reductions.
if (NumElts > 32)
return SDValue();
if (DAG.getTargetLoweringInfo().isTypeLegal(MatchVT)) {
// If this is a legal AVX512 predicate type then we can just bitcast.
EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
Movmsk = DAG.getBitcast(MovmskVT, Match);
} else {
// Use combineBitcastvxi1 to create the MOVMSK.
// Without 256-bit integer support, first fold the two v16i1 halves of a
// 32-element reduction together with BinOp so that 16 elements remain.
if (NumElts == 32 && !Subtarget.hasInt256()) {
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
NumElts = 16;
}
EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
}
// combineBitcastvxi1 may decline; in that case give up.
if (!Movmsk)
return SDValue();
Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, MVT::i32);
} else {
// Bail with AVX512VL (which uses predicate registers).
if (Subtarget.hasVLX())
return SDValue();
// Only 128-bit vectors, or 256-bit vectors with AVX, are handled.
unsigned MatchSizeInBits = Match.getValueSizeInBits();
if (!(MatchSizeInBits == 128 ||
(MatchSizeInBits == 256 && Subtarget.hasAVX())))
return SDValue();
// Make sure this isn't a vector of 1 element. The perf win from using
// MOVMSK diminishes with less elements in the reduction, but it is
// generally better to get the comparison over to the GPRs as soon as
// possible to reduce the number of vector ops.
if (Match.getValueType().getVectorNumElements() < 2)
return SDValue();
// Check that we are extracting a reduction of all sign bits.
if (DAG.ComputeNumSignBits(Match) != BitWidth)
return SDValue();
// 256-bit vectors of i8/i16 elements without 256-bit integer support:
// combine the two halves with BinOp and continue on the half-width result.
if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
MatchSizeInBits = Match.getValueSizeInBits();
}
// For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
MVT MaskSrcVT;
if (64 == BitWidth || 32 == BitWidth)
MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
MatchSizeInBits / BitWidth);
else
MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
// From here on NumElts counts mask bits, i.e. elements of MaskSrcVT.
NumElts = MaskSrcVT.getVectorNumElements();
}
assert(NumElts <= 32 && "Not expecting more than 32 elements");
if (BinOp == ISD::XOR) {
// parity -> (AND (CTPOP(MOVMSK X)), 1)
SDValue Mask = DAG.getConstant(1, DL, MVT::i32);
SDValue Result = DAG.getNode(ISD::CTPOP, DL, MVT::i32, Movmsk);
Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result, Mask);
return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
}
SDValue CmpC;
ISD::CondCode CondCode;
if (BinOp == ISD::OR) {
// any_of -> MOVMSK != 0
CmpC = DAG.getConstant(0, DL, MVT::i32);
CondCode = ISD::CondCode::SETNE;
} else {
// all_of -> MOVMSK == ((1 << NumElts) - 1)
// The shift is done in 64 bits so that NumElts == 32 does not overflow.
CmpC = DAG.getConstant((1ULL << NumElts) - 1, DL, MVT::i32);
CondCode = ISD::CondCode::SETEQ;
}
// The setcc produces an i8 of 0/1, so extend that to the result width and
// negate to get the final 0/-1 mask value.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT SetccVT =
TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
}
/// Try to replace an ADD reduction rooted at \p Extract, whose input is an
/// absolute difference of zero-extended values, with a PSADBW-based sequence.
/// Returns an empty SDValue when the pattern is not recognised.
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  // PSADBW requires at least SSE2.
  if (!Subtarget.hasSSE2())
    return SDValue();

  // The vector we extract from must have a simple type with elements wider
  // than i16.
  EVT VT = Extract->getOperand(0).getValueType();
  if (!VT.isSimple() || VT.getVectorElementType().getSizeInBits() <= 16)
    return SDValue();

  // Pick the widest register size we are allowed to use.
  unsigned RegSize =
      Subtarget.useBWIRegs() ? 512 : (Subtarget.hasAVX() ? 256 : 128);

  // We handle upto v16i* for SSE2 / v32i* for AVX / v64i* for AVX512.
  // TODO: We should be able to handle larger vectors by splitting them before
  // feeding them into several SADs, and then reducing over those.
  if (RegSize / VT.getVectorNumElements() < 8)
    return SDValue();

  // Look for the shuffle + add pyramid.
  ISD::NodeType BinOp;
  SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});

  // The operands are expected to be zero extended from i8 (this is verified
  // by detectZextAbsDiff). Reaching i64 and above takes one more
  // any/zero/sign extend on top of that. Zero extending from 32 bits does not
  // change the value, and a sign extend behaves like a zero extend here
  // because the sign bit is zero, so the extra extend can simply be skipped.
  if (Root) {
    switch (Root.getOpcode()) {
    case ISD::SIGN_EXTEND:
    case ISD::ZERO_EXTEND:
    case ISD::ANY_EXTEND:
      Root = Root.getOperand(0);
      break;
    default:
      break;
    }
  }

  // A successful match must leave us at the ABS node that roots the abs-diff
  // pattern.
  if (!Root || Root.getOpcode() != ISD::ABS)
    return SDValue();

  // Make sure an abs-diff of zero-extended values feeds that ABS.
  SDValue Zext0, Zext1;
  if (!detectZextAbsDiff(Root, Zext0, Zext1))
    return SDValue();

  // Build the SAD node.
  SDLoc DL(Extract);
  SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);

  // When the original vector had more than 8 elements the SAD result holds
  // several partial sums; fold them together with a shuffle + add ladder.
  unsigned Stages = Log2_32(VT.getVectorNumElements());
  MVT SadVT = SAD.getSimpleValueType();
  if (Stages > 3) {
    unsigned SadElems = SadVT.getVectorNumElements();
    for (unsigned Half = 1u << (Stages - 4); Half != 0; Half >>= 1) {
      // Bring elements [Half, 2*Half) down onto [0, Half); the rest is undef.
      SmallVector<int, 16> Mask(SadElems, -1);
      for (unsigned Lane = 0; Lane != Half; ++Lane)
        Mask[Lane] = Half + Lane;
      SDValue Shuffle =
          DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
      SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
    }
  }

  // Reinterpret the SAD vector with the extract's scalar type as element type
  // and return the lowest TypeSizeInBits bits through the original index.
  MVT Type = Extract->getSimpleValueType(0);
  unsigned TypeSizeInBits = Type.getSizeInBits();
  MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
  SDValue Cast = DAG.getBitcast(ResVT, SAD);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, Cast,
                     Extract->getOperand(1));
}
/// Attempt to peek through a target shuffle and extract the scalar from the
/// shuffle's source instead.
///
/// \param N         Extraction node: operand 0 is the source vector, operand 1
///                  the element index.
/// \param DAG       DAG in which replacement nodes are created.
/// \param DCI       Combiner state; nothing is attempted before operation
///                  legalization.
/// \param Subtarget Used to decide which extraction forms are available.
/// \returns the replacement value, or an empty SDValue when no fold applies
///          (boolean vectors, non-constant or out-of-range indices, shuffles
///          that cannot be resolved or rescaled, unsupported source types).
static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
                                         TargetLowering::DAGCombinerInfo &DCI,
                                         const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDValue Src = N->getOperand(0);
  SDValue Idx = N->getOperand(1);

  EVT VT = N->getValueType(0);
  EVT SrcVT = Src.getValueType();
  EVT SrcSVT = SrcVT.getVectorElementType();
  unsigned NumSrcElts = SrcVT.getVectorNumElements();

  // Don't attempt this for boolean mask vectors or unknown extraction indices.
  if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
    return SDValue();

  // The extraction index is used below to index the (rescaled) shuffle mask,
  // which has exactly NumSrcElts entries. A constant index that is out of
  // range would read past the end of that mask, so leave such nodes alone.
  if (cast<ConstantSDNode>(Idx)->getAPIntValue().uge(NumSrcElts))
    return SDValue();

  SDValue SrcBC = peekThroughBitcasts(Src);

  // Handle extract(broadcast(scalar_value)), it doesn't matter what index is.
  if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
    SDValue SrcOp = SrcBC.getOperand(0);
    if (SrcOp.getValueSizeInBits() == VT.getSizeInBits())
      return DAG.getBitcast(VT, SrcOp);
  }

  // Resolve the target shuffle inputs and mask.
  SmallVector<int, 16> Mask;
  SmallVector<SDValue, 2> Ops;
  if (!resolveTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
    return SDValue();

  // Attempt to narrow/widen the shuffle mask to the correct size.
  if (Mask.size() != NumSrcElts) {
    if ((NumSrcElts % Mask.size()) == 0) {
      // Fewer mask entries than source elements: split each entry.
      SmallVector<int, 16> ScaledMask;
      int Scale = NumSrcElts / Mask.size();
      scaleShuffleMask<int>(Scale, Mask, ScaledMask);
      Mask = std::move(ScaledMask);
    } else if ((Mask.size() % NumSrcElts) == 0) {
      // Simplify Mask based on demanded element: everything outside the
      // sub-range covering the extracted element becomes undef, which gives
      // the widening below the best chance to succeed.
      int ExtractIdx = (int)N->getConstantOperandVal(1);
      int Scale = Mask.size() / NumSrcElts;
      int Lo = Scale * ExtractIdx;
      int Hi = Scale * (ExtractIdx + 1);
      for (int i = 0, e = (int)Mask.size(); i != e; ++i)
        if (i < Lo || Hi <= i)
          Mask[i] = SM_SentinelUndef;

      SmallVector<int, 16> WidenedMask;
      while (Mask.size() > NumSrcElts &&
             canWidenShuffleElements(Mask, WidenedMask))
        Mask = std::move(WidenedMask);
      // TODO - investigate support for wider shuffle masks with known upper
      // undef/zero elements for implicit zero-extension.
    }
  }

  // Check if narrowing/widening failed.
  if (Mask.size() != NumSrcElts)
    return SDValue();

  // In range thanks to the index check at the top of the function.
  int SrcIdx = Mask[N->getConstantOperandVal(1)];
  SDLoc dl(N);

  // If the shuffle source element is undef/zero then we can just accept it.
  if (SrcIdx == SM_SentinelUndef)
    return DAG.getUNDEF(VT);

  if (SrcIdx == SM_SentinelZero)
    return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
                                : DAG.getConstant(0, dl, VT);

  // Split the mask value into (shuffle input, element within that input).
  SDValue SrcOp = Ops[SrcIdx / Mask.size()];
  SrcIdx = SrcIdx % Mask.size();

  // We can only extract other elements from 128-bit vectors and in certain
  // circumstances, depending on SSE-level.
  // TODO: Investigate using extract_subvector for larger vectors.
  // TODO: Investigate float/double extraction if it will be just stored.
  if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
      ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
    assert(SrcSVT == VT && "Unexpected extraction type");
    SrcOp = DAG.getBitcast(SrcVT, SrcOp);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
                       DAG.getIntPtrConstant(SrcIdx, dl));
  }

  if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
      (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
    assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
           "Unexpected extraction type");
    unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
    SrcOp = DAG.getBitcast(SrcVT, SrcOp);
    // The PEXTR node is created with an i32 result; adjust to the requested
    // type.
    SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
                                DAG.getIntPtrConstant(SrcIdx, dl));
    return DAG.getZExtOrTrunc(ExtOp, dl, VT);
  }

  return SDValue();
}
/// Extracting a scalar FP value from element 0 of a vector is free. When the
/// vector is produced by a single-use FP operation, pull element 0 out of each
/// of its operands and redo the operation on scalars.
static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
  assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
  SDValue Vec = ExtElt->getOperand(0);
  SDValue Index = ExtElt->getOperand(1);
  EVT VT = ExtElt->getValueType(0);
  EVT VecVT = Vec.getValueType();

  // TODO: If this is a unary/expensive/expand op, allow extraction from a
  // non-zero element because the shuffle+scalar op will be cheaper?
  bool IsCandidate = Vec.hasOneUse() && isNullConstant(Index) &&
                     VecVT.getScalarType() == VT;
  if (!IsCandidate)
    return SDValue();

  SDLoc DL(ExtElt);
  // Extract the element selected by Index from V as a scalar of type EltVT.
  auto ExtractLane = [&](SDValue V, EVT EltVT) {
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, V, Index);
  };

  // Vector FP compares are special: the condition code operand is passed
  // through unchanged rather than extracted.
  if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
    EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
    if (OpVT != MVT::f32 && OpVT != MVT::f64)
      return SDValue();

    // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
    SDValue ScalarX = ExtractLane(Vec.getOperand(0), OpVT);
    SDValue ScalarY = ExtractLane(Vec.getOperand(1), OpVT);
    return DAG.getNode(Vec.getOpcode(), DL, VT, ScalarX, ScalarY,
                       Vec.getOperand(2));
  }

  // Everything below only deals with f32/f64 results.
  if (VT != MVT::f32 && VT != MVT::f64)
    return SDValue();

  // Vector FP selects are special as well: the condition has a different type
  // and the opcode changes from VSELECT to SELECT.
  // FIXME: This is restricted to pre type legalization by ensuring the setcc
  // has i1 elements. If we loosen this we need to convert vector bool to a
  // scalar bool.
  if (Vec.getOpcode() == ISD::VSELECT) {
    SDValue Cond = Vec.getOperand(0);
    if (Cond.getOpcode() == ISD::SETCC &&
        Cond.getValueType().getScalarType() == MVT::i1 &&
        Cond.getOperand(0).getValueType() == VecVT) {
      // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
      SDValue ScalarCond =
          ExtractLane(Cond, Cond.getValueType().getScalarType());
      SDValue ScalarT = ExtractLane(Vec.getOperand(1), VT);
      SDValue ScalarF = ExtractLane(Vec.getOperand(2), VT);
      return DAG.getNode(ISD::SELECT, DL, VT, ScalarCond, ScalarT, ScalarF);
    }
  }

  // TODO: This switch could include FNEG and the x86-specific FP logic ops
  // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
  // missed load folding and fma+fneg combining.
  switch (Vec.getOpcode()) {
  default:
    // Not an opcode we know how to scalarize.
    return SDValue();
  // Three operands.
  case ISD::FMA:
  case ISD::FMAD:
  // Two operands.
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FCOPYSIGN:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMAXIMUM:
  case ISD::FMINIMUM:
  case X86ISD::FMAX:
  case X86ISD::FMIN:
  // One operand.
  case ISD::FABS:
  case ISD::FSQRT:
  case ISD::FRINT:
  case ISD::FCEIL:
  case ISD::FTRUNC:
  case ISD::FNEARBYINT:
  case ISD::FROUND:
  case ISD::FFLOOR:
  case X86ISD::FRCP:
  case X86ISD::FRSQRT:
    break;
  }

  // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
  SmallVector<SDValue, 4> ScalarOps;
  for (SDValue Op : Vec->ops())
    ScalarOps.push_back(ExtractLane(Op, VT));
  return DAG.getNode(Vec.getOpcode(), DL, VT, ScalarOps);
}
/// Try to rewrite a reduction built from binops and shuffles, whose result is
/// read from element 0, as a sequence of horizontal ops.
static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
                                            const X86Subtarget &Subtarget) {
  assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");

  // Only do this when horizontal ops are fast or when optimizing for size.
  bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
  if (!(Subtarget.hasFastHorizontalOps() || OptForSize))
    return SDValue();

  // The reduction result has to be read from element 0.
  SDValue Index = ExtElt->getOperand(1);
  if (!isNullConstant(Index))
    return SDValue();

  // TODO: Allow FADD with reduction and/or reassociation and no-signed-zeros.
  ISD::NodeType RdxOpc;
  SDValue Rdx = DAG.matchBinOpReduction(ExtElt, RdxOpc, {ISD::ADD});
  if (!Rdx)
    return SDValue();

  // The extract must not change the element type.
  EVT VT = ExtElt->getValueType(0);
  EVT VecVT = ExtElt->getOperand(0).getValueType();
  if (VecVT.getScalarType() != VT)
    return SDValue();

  unsigned HorizOpcode = X86ISD::FHADD;
  if (RdxOpc == ISD::ADD)
    HorizOpcode = X86ISD::HADD;

  // Integer horizontal ops are gated on SSSE3, floating-point ones on SSE3.
  auto HasHorizOp = [&Subtarget](bool IsIntVecTy, bool IsFPVecTy) {
    return (IsIntVecTy && Subtarget.hasSSSE3()) ||
           (IsFPVecTy && Subtarget.hasSSE3());
  };

  SDLoc DL(ExtElt);

  // 256-bit horizontal instructions operate on 128-bit chunks rather than
  // across the whole vector, so we need an extract + hop preliminary stage.
  // This is the only step where the operands of the hop are not the same value.
  // TODO: We could extend this to handle 512-bit or even longer vectors.
  if (HasHorizOp(VecVT == MVT::v16i16 || VecVT == MVT::v8i32,
                 VecVT == MVT::v8f32 || VecVT == MVT::v4f64)) {
    unsigned NumElts = VecVT.getVectorNumElements();
    SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
    SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
    VecVT = EVT::getVectorVT(*DAG.getContext(), VT, NumElts / 2);
    Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Hi, Lo);
  }

  // From here on only 128-bit vectors with a matching horizontal op qualify.
  if (!HasHorizOp(VecVT == MVT::v8i16 || VecVT == MVT::v4i32,
                  VecVT == MVT::v4f32 || VecVT == MVT::v2f64))
    return SDValue();

  // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
  assert(Rdx.getValueType() == VecVT && "Unexpected reduction match");
  for (unsigned StepsLeft = Log2_32(VecVT.getVectorNumElements());
       StepsLeft != 0; --StepsLeft)
    Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
}
/// Detect vector gather/scatter index generation and convert it from being a
/// bunch of shuffles and extracts into a somewhat faster sequence.
/// For i686, the best sequence is apparently storing the value and loading
/// scalars back, while for x64 we should use 64-bit extracts and shifts.
///
/// NOTE(review): the summary above does not obviously correspond to any of the
/// combines in the body below - confirm whether it is stale.
///
/// As written, this function is a dispatcher for element-extraction nodes: it
/// tries a fixed sequence of folds and returns the result of the first one
/// that succeeds. \p N may be an ISD::EXTRACT_VECTOR_ELT or some other
/// extraction opcode (presumably X86ISD::PEXTRW/PEXTRB, going by the TODO
/// below); the latter only get the early folds.
///
/// \returns a replacement value, \p N itself when the node (or its users) were
/// updated in place, or an empty SDValue when nothing was done.
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
// First try to look through a target shuffle feeding the extract.
if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
return NewOp;
SDValue InputVector = N->getOperand(0);
SDValue EltIdx = N->getOperand(1);
// Null when the element index is not a constant.
auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
EVT SrcVT = InputVector.getValueType();
EVT VT = N->getValueType(0);
SDLoc dl(InputVector);
// Anything that is not the generic extract opcode is treated as a PEXTR-style
// node.
bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
// A constant index past the end of the vector folds to 0 for PEXTR-style
// nodes and to undef for the generic extract.
if (CIdx && CIdx->getAPIntValue().uge(SrcVT.getVectorNumElements()))
return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
// Integer Constant Folding.
if (CIdx && VT.isInteger()) {
APInt UndefVecElts;
SmallVector<APInt, 16> EltBits;
unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
EltBits, true, false)) {
// Idx is in range here because of the bounds check above.
uint64_t Idx = CIdx->getZExtValue();
if (UndefVecElts[Idx])
return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
// Widen the element's bits to the result width if the result is wider.
return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()),
dl, VT);
}
}
// TODO - Remove this once we can handle the implicit zero-extension of
// X86ISD::PEXTRW/X86ISD::PEXTRB in:
// XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
// combineBasicSADPattern.
if (IsPextr) {
// PEXTR-style nodes stop here: only a demanded-bits simplification with all
// result bits demanded is attempted, none of the folds further down.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.SimplifyDemandedBits(
SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI))
return SDValue(N, 0);
return SDValue();
}
if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
return NewOp;
// Detect mmx extraction of all bits as a i64. It works better as a bitcast.
if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
SDValue MMXSrc = InputVector.getOperand(0);
// The bitcast source is a direct mmx result.
if (MMXSrc.getValueType() == MVT::x86mmx)
return DAG.getBitcast(VT, InputVector);
}
// Detect mmx to i32 conversion through a v2i32 elt extract.
if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
SDValue MMXSrc = InputVector.getOperand(0);
// The bitcast source is a direct mmx result.
if (MMXSrc.getValueType() == MVT::x86mmx)
return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
}
// Check whether this extract is the root of a sum of absolute differences
// pattern. This has to be done here because we really want it to happen
// pre-legalization,
if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
return SAD;
// Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
return Cmp;
// Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
return MinMax;
// Attempt to turn an add reduction into horizontal add ops.
if (SDValue V = combineReductionToHorizontal(N, DAG, Subtarget))
return V;
// Attempt to perform a single-use FP op on element 0 as a scalar op.
if (SDValue V = scalarizeExtEltFP(N, DAG))
return V;
// Attempt to extract a i1 element by using MOVMSK to extract the signbits
// and then testing the relevant element.
if (CIdx && SrcVT.getScalarType() == MVT::i1) {
SmallVector<SDNode *, 16> BoolExtracts;
// Accepts only constant-index extracts producing i1 and records each one, so
// that all of them can be rewritten together below.
auto IsBoolExtract = [&BoolExtracts](SDNode *Use) {
if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
isa<ConstantSDNode>(Use->getOperand(1)) &&
Use->getValueType(0) == MVT::i1) {
BoolExtracts.push_back(Use);
return true;
}
return false;
};
// Only worthwhile when every user of the vector is such an extract and there
// is more than one of them.
if (all_of(InputVector->uses(), IsBoolExtract) &&
BoolExtracts.size() > 1) {
unsigned NumSrcElts = SrcVT.getVectorNumElements();
// Integer type with one bit per source element.
EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
if (SDValue BC =
combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
for (SDNode *Use : BoolExtracts) {
// extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
unsigned MaskIdx = Use->getConstantOperandVal(1);
APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx);
SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT);
SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
DCI.CombineTo(Use, Res);
}
// Every recorded extract (N included) has been replaced via CombineTo above.
return SDValue(N, 0);
}
}
}
return SDValue();
}
/// When one arm of a vector select is all-ones or all-zeros, try to express
/// the select as a bitwise logic operation on the condition instead.
/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
static SDValue
combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget &Subtarget) {
  SDValue Cond = N->getOperand(0);
  SDValue TrueV = N->getOperand(1);
  SDValue FalseV = N->getOperand(2);
  EVT VT = TrueV.getValueType();
  EVT CondVT = Cond.getValueType();
  SDLoc DL(N);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  if (N->getOpcode() != ISD::VSELECT)
    return SDValue();

  assert(CondVT.isVector() && "Vector select expects a vector selector!");

  // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
  // TODO: Can we assert that both operands are not zeros (because that should
  // get simplified at node creation time)?
  bool TrueIsZeros = ISD::isBuildVectorAllZeros(TrueV.getNode());
  bool FalseIsZeros = ISD::isBuildVectorAllZeros(FalseV.getNode());

  // AVX512 only: with a vXi1 condition and an all-zeros true arm, flip the
  // condition and exchange the arms so the zeros end up on the false side.
  bool FlipMaskSelect = TrueIsZeros && !FalseIsZeros &&
                        Subtarget.hasAVX512() && Cond.hasOneUse() &&
                        CondVT.getVectorElementType() == MVT::i1;
  if (FlipMaskSelect) {
    // not(cond) is built as xor(cond, allones).
    SDValue NotCond = DAG.getNOT(DL, Cond, CondVT);
    // vselect cond, op1, op2 == vselect not(cond), op2, op1
    return DAG.getSelect(DL, VT, NotCond, FalseV, TrueV);
  }

  // To use the condition operand as a bitwise mask, it must have elements that
  // are the same size as the select elements. Ie, the condition operand must
  // have already been promoted from the IR select condition type <N x i1>.
  // Don't check if the types themselves are equal because that excludes
  // vector floating-point selects.
  const unsigned CondEltBits = CondVT.getScalarSizeInBits();
  if (CondEltBits != VT.getScalarSizeInBits())
    return SDValue();

  // If neither "true is all-ones" nor "false is all-zeros" holds, see whether
  // inverting a single-use, already-promoted SETCC condition gets us there.
  bool TrueIsOnes = ISD::isBuildVectorAllOnes(TrueV.getNode());
  if (!TrueIsOnes && !FalseIsZeros && Cond.hasOneUse() &&
      // The selector will be produced by CMPP*/PCMP*.
      Cond.getOpcode() == ISD::SETCC &&
      // The SETCC has already been promoted.
      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
          CondVT) {
    bool FalseIsOnes = ISD::isBuildVectorAllOnes(FalseV.getNode());
    if (TrueIsZeros || FalseIsOnes) {
      ISD::CondCode OldCC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
      bool IsIntCompare = Cond.getOperand(0).getValueType().isInteger();
      ISD::CondCode InvCC = ISD::getSetCCInverse(OldCC, IsIntCompare);
      Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
                          InvCC);
      std::swap(TrueV, FalseV);
      TrueIsOnes = FalseIsOnes;
      FalseIsZeros = TrueIsZeros;
    }
  }

  // Cond value must be 'sign splat' to be converted to a logical op.
  if (DAG.ComputeNumSignBits(Cond) != CondEltBits)
    return SDValue();

  // vselect Cond, 111..., 000... -> Cond
  if (TrueIsOnes && FalseIsZeros)
    return DAG.getBitcast(VT, Cond);

  // Once legalization has started the condition type must be legal.
  if (!(DCI.isBeforeLegalize() || TLI.isTypeLegal(CondVT)))
    return SDValue();

  // Builds bitcast<VT>(LogicOpc(Cond, bitcast<CondVT>(Arm))).
  auto ApplyMask = [&](unsigned LogicOpc, SDValue Arm) -> SDValue {
    SDValue CastArm = DAG.getBitcast(CondVT, Arm);
    SDValue Logic = DAG.getNode(LogicOpc, DL, CondVT, Cond, CastArm);
    return DAG.getBitcast(VT, Logic);
  };

  // vselect Cond, 111..., X -> or Cond, X
  if (TrueIsOnes)
    return ApplyMask(ISD::OR, FalseV);

  // vselect Cond, X, 000... -> and Cond, X
  if (FalseIsZeros)
    return ApplyMask(ISD::AND, TrueV);

  if (!TrueIsZeros)
    return SDValue();

  // vselect Cond, 000..., X -> andn Cond, X
  // The ANDNP node is built on a vector of i64 of the same total width.
  MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64);
  SDValue CastCond = DAG.getBitcast(AndNVT, Cond);
  SDValue CastFalse = DAG.getBitcast(AndNVT, FalseV);
  SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastFalse);
  return DAG.getBitcast(VT, AndN);
}
/// When both arms of a 256-bit vector select are concatenations, split the
/// select and concatenate the pieces, which removes a wide vector instruction:
///   vselect Cond, (concat T0, T1), (concat F0, F1) -->
///   concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  const unsigned Opcode = N->getOpcode();
  const bool IsSelectLike =
      Opcode == X86ISD::BLENDV || Opcode == ISD::VSELECT;
  if (!IsSelectLike)
    return SDValue();

  // TODO: Split 512-bit vectors too?
  EVT VT = N->getValueType(0);
  if (!VT.is256BitVector())
    return SDValue();

  // TODO: Split as long as any 2 of the 3 operands are concatenated?
  SDValue Cond = N->getOperand(0);
  SDValue TVal = N->getOperand(1);
  SDValue FVal = N->getOperand(2);

  // Both arms must be single-use ...
  if (!TVal.hasOneUse())
    return SDValue();
  if (!FVal.hasOneUse())
    return SDValue();

  // ... and both must be recognised as concatenations.
  SmallVector<SDValue, 4> CatOpsT, CatOpsF;
  if (!collectConcatOps(TVal.getNode(), CatOpsT))
    return SDValue();
  if (!collectConcatOps(FVal.getNode(), CatOpsF))
    return SDValue();

  // Re-create the same select opcode on each piece, typed after the true arm.
  auto BuildNarrowSelect = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
                                    ArrayRef<SDValue> Ops) {
    return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
  };
  return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
                          BuildNarrowSelect, /*CheckBWI*/ false);
}
/// Fold a scalar select between two integer constants into arithmetic on the
/// zero-extended i1 condition:
///   select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
/// Only done when |TC - FC| is a power of 2, or is 3, 5 or 9 for i32/i64.
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
  SDValue Cond = N->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  SDLoc DL(N);

  // Both arms must be integer constants.
  auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
  auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
  if (!(TrueC && FalseC))
    return SDValue();

  // Don't do this for crazy integer types.
  EVT VT = N->getValueType(0);
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  // We're going to use the condition bit in math or logic ops. We could allow
  // this with a wider condition value (post-legalization it becomes an i8),
  // but if nothing is creating selects that late, it doesn't matter.
  if (Cond.getValueType() != MVT::i1)
    return SDValue();

  // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
  // 3, 5, or 9 with i32/i64, so those get transformed too.
  // TODO: For constants that overflow or do not differ by power-of-2 or small
  // multiplier, convert to 'and' + 'add'.
  const APInt &TrueVal = TrueC->getAPIntValue();
  const APInt &FalseVal = FalseC->getAPIntValue();
  bool Overflowed;
  APInt Diff = TrueVal.ssub_ov(FalseVal, Overflowed);
  if (Overflowed)
    return SDValue();

  APInt AbsDiff = Diff.abs();
  bool IsCheapMultiplier =
      AbsDiff.isPowerOf2() ||
      ((VT == MVT::i32 || VT == MVT::i64) &&
       (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9));
  if (!IsCheapMultiplier)
    return SDValue();

  // We need a positive multiplier constant for shift/LEA codegen. The 'not'
  // of the condition can usually be folded into a compare predicate, but even
  // without that, the sequence should be cheaper than a CMOV alternative.
  // After a flip the original true constant becomes the base that is added.
  ConstantSDNode *BaseC = FalseC;
  if (TrueVal.slt(FalseVal)) {
    Cond = DAG.getNOT(DL, Cond, MVT::i1);
    BaseC = TrueC;
  }

  // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
  SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);

  // A multiply by one would be a no-op, so skip it.
  if (!AbsDiff.isOneValue())
    Result = DAG.getNode(ISD::MUL, DL, VT, Result,
                         DAG.getConstant(AbsDiff, DL, VT));

  // Likewise there is no need to add a zero base.
  if (!BaseC->isNullValue())
    Result = DAG.getNode(ISD::ADD, DL, VT, Result, SDValue(BaseC, 0));

  return Result;
}
/// If this is a *dynamic* select (non-constant condition) and we can match
/// this node with one of the variable blend instructions, restructure the
/// condition so that blends can use the high (sign) bit of each element.
/// This function will also call SimplifyDemandedBits on already created
/// BLENDV to perform additional simplifications.
///
/// \param N         ISD::VSELECT or X86ISD::BLENDV node; operand 0 is the
///                  condition.
/// \param DAG       DAG in which replacement BLENDV nodes are created.
/// \param DCI       Combiner state, used for the legalization phase, the
///                  worklist and for committing the simplification.
/// \param Subtarget Used to check which blend forms are available.
/// \returns \p N itself when the condition was simplified (all VSELECT users
///          of the condition have then been replaced by BLENDV nodes), or an
///          empty SDValue when nothing was changed.
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue Cond = N->getOperand(0);
// Only VSELECT/BLENDV nodes whose condition is not a build vector of
// constants are handled.
if ((N->getOpcode() != ISD::VSELECT &&
N->getOpcode() != X86ISD::BLENDV) ||
ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
return SDValue();
// Don't optimize before the condition has been transformed to a legal type
// and don't ever optimize vector selects that map to AVX512 mask-registers.
unsigned BitWidth = Cond.getScalarValueSizeInBits();
if (BitWidth < 8 || BitWidth > 64)
return SDValue();
// We can only handle the cases where VSELECT is directly legal on the
// subtarget. We custom lower VSELECT nodes with constant conditions and
// this makes it hard to see whether a dynamic VSELECT will correctly
// lower, so we both check the operation's status and explicitly handle the
// cases where a *dynamic* blend will fail even though a constant-condition
// blend could be custom lowered.
// FIXME: We should find a better way to handle this class of problems.
// Potentially, we should combine constant-condition vselect nodes
// pre-legalization into shuffles and not mark as many types as custom
// lowered.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = N->getValueType(0);
if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
return SDValue();
// FIXME: We don't support i16-element blends currently. We could and
// should support them by making *all* the bits in the condition be set
// rather than just the high bit and using an i8-element blend.
if (VT.getVectorElementType() == MVT::i16)
return SDValue();
// Dynamic blending was only available from SSE4.1 onward.
if (VT.is128BitVector() && !Subtarget.hasSSE41())
return SDValue();
// Byte blends are only available in AVX2
if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
return SDValue();
// There are no 512-bit blend instructions that use sign bits.
if (VT.is512BitVector())
return SDValue();
// Every user of the condition must be a VSELECT/BLENDV that uses it as its
// condition operand (operand 0); otherwise give up.
// TODO: Add other opcodes eventually lowered into BLEND.
for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
UI != UE; ++UI)
if ((UI->getOpcode() != ISD::VSELECT &&
UI->getOpcode() != X86ISD::BLENDV) ||
UI.getOperandNo() != 0)
return SDValue();
// Only the sign bit of each condition element is demanded.
APInt DemandedMask(APInt::getSignMask(BitWidth));
KnownBits Known;
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
!DCI.isBeforeLegalizeOps());
// Nothing to do when the condition cannot be simplified under that mask.
if (!TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0, true))
return SDValue();
// If we changed the computation somewhere in the DAG, this change will
// affect all users of Cond. Update all the nodes so that we do not use
// the generic VSELECT anymore. Otherwise, we may perform wrong
// optimizations as we messed with the actual expectation for the vector
// boolean values.
for (SDNode *U : Cond->uses()) {
// Users that are already BLENDV need no rewrite.
if (U->getOpcode() == X86ISD::BLENDV)
continue;
SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
Cond, U->getOperand(1), U->getOperand(2));
DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
DCI.AddToWorklist(U);
}
// Apply the condition simplification recorded in TLO.
DCI.CommitTargetLoweringOpt(TLO);
return SDValue(N, 0);
}
/// Do target-specific dag combines on SELECT and VSELECT nodes.
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
SDValue Cond = N->getOperand(0);
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
// Try simplification again because we use this function to optimize
// BLENDV nodes that are not handled by the generic combiner.
if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
return V;
EVT VT = LHS.getValueType();
EVT CondVT = Cond.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Convert vselects with constant condition into shuffles.
if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
DCI.isBeforeLegalizeOps()) {
SmallVector<int, 64> Mask;
if (createShuffleMaskFromVSELECT(Mask, Cond))
return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
}
// If we have SSE[12] support, try to form min/max nodes. SSE min/max
// instructions match the semantics of the common C idiom x<y?x:y but not
// x<=y?x:y, because of how they handle negative zero (which can be
// ignored in unsafe-math mode).
// We also try to create v2f32 min/max nodes, which we later widen to v4f32.
if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
VT != MVT::f80 && VT != MVT::f128 &&
(TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
(Subtarget.hasSSE2() ||
(Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
unsigned Opcode = 0;
// Check for x CC y ? x : y.
if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
DAG.isEqualTo(RHS, Cond.getOperand(1))) {
switch (CC) {
default: break;
case ISD::SETULT:
// Converting this to a min would handle NaNs incorrectly, and swapping
// the operands would cause it to handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
if (!DAG.getTarget().Options.UnsafeFPMath &&
!(DAG.isKnownNeverZeroFloat(LHS) ||
DAG.isKnownNeverZeroFloat(RHS)))
break;
std::swap(LHS, RHS);
}
Opcode = X86ISD::FMIN;
break;
case ISD::SETOLE:
// Converting this to a min would handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.getTarget().Options.UnsafeFPMath &&
!DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
break;
Opcode = X86ISD::FMIN;
break;
case ISD::SETULE:
// Converting this to a min would handle both negative zeros and NaNs
// incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
LLVM_FALLTHROUGH;
case ISD::SETOLT:
case ISD::SETLT:
case ISD::SETLE:
Opcode = X86ISD::FMIN;
break;
case ISD::SETOGE:
// Converting this to a max would handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.getTarget().Options.UnsafeFPMath &&
!DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
break;
Opcode = X86ISD::FMAX;
break;
case ISD::SETUGT:
// Converting this to a max would handle NaNs incorrectly, and swapping
// the operands would cause it to handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
if (!DAG.getTarget().Options.UnsafeFPMath &&
!(DAG.isKnownNeverZeroFloat(LHS) ||
DAG.isKnownNeverZeroFloat(RHS)))
break;
std::swap(LHS, RHS);
}
Opcode = X86ISD::FMAX;
break;
case ISD::SETUGE:
// Converting this to a max would handle both negative zeros and NaNs
// incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
LLVM_FALLTHROUGH;
case ISD::SETOGT:
case ISD::SETGT:
case ISD::SETGE:
Opcode = X86ISD::FMAX;
break;
}
// Check for x CC y ? y : x -- a min/max with reversed arms.
} else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
DAG.isEqualTo(RHS, Cond.getOperand(0))) {
switch (CC) {
default: break;
case ISD::SETOGE:
// Converting this to a min would handle comparisons between positive
// and negative zero incorrectly, and swapping the operands would
// cause it to handle NaNs incorrectly.
if (!DAG.getTarget().Options.UnsafeFPMath &&
!(DAG.isKnownNeverZeroFloat(LHS) ||
DAG.isKnownNeverZeroFloat(RHS))) {
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
break;
std::swap(LHS, RHS);
}
Opcode = X86ISD::FMIN;
break;
case ISD::SETUGT:
// Converting this to a min would handle NaNs incorrectly.
if (!DAG.getTarget().Options.UnsafeFPMath &&
(!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
break;
Opcode = X86ISD::FMIN;
break;
case ISD::SETUGE:
// Converting this to a min would handle both negative zeros and NaNs
// incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
LLVM_FALLTHROUGH;
case ISD::SETOGT:
case ISD::SETGT:
case ISD::SETGE:
Opcode = X86ISD::FMIN;
break;
case ISD::SETULT:
// Converting this to a max would handle NaNs incorrectly.
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
break;
Opcode = X86ISD::FMAX;
break;
case ISD::SETOLE:
// Converting this to a max would handle comparisons between positive
// and negative zero incorrectly, and swapping the operands would
// cause it to handle NaNs incorrectly.
if (!DAG.getTarget().Options.UnsafeFPMath &&
!DAG.isKnownNeverZeroFloat(LHS) &&
!DAG.isKnownNeverZeroFloat(RHS)) {
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
break;
std::swap(LHS, RHS);
}
Opcode = X86ISD::FMAX;
break;
case ISD::SETULE:
// Converting this to a max would handle both negative zeros and NaNs
// incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
LLVM_FALLTHROUGH;
case ISD::SETOLT:
case ISD::SETLT:
case ISD::SETLE:
Opcode = X86ISD::FMAX;
break;
}
}
if (Opcode)
return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
}
// Some mask scalar intrinsics rely on checking if only one bit is set
// and implement it in C code like this:
// A[0] = (U & 1) ? A[0] : W[0];
// This creates some redundant instructions that break pattern matching.
// fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
SDValue AndNode = Cond.getOperand(0);
if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
isNullConstant(Cond.getOperand(1)) &&
isOneConstant(AndNode.getOperand(1))) {
// LHS and RHS swapped due to
// setcc outputting 1 when AND resulted in 0 and vice versa.
AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
}
}
// v16i8 (select v16i1, v16i8, v16i8) does not have a proper
// lowering on KNL. In this case we convert it to
// v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
// The same situation all vectors of i8 and i16 without BWI.
// Make sure we extend these even before type legalization gets a chance to
// split wide vectors.
// Since SKX these selects have a proper lowering.
if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
CondVT.getVectorElementType() == MVT::i1 &&
(ExperimentalVectorWideningLegalization ||
VT.getVectorNumElements() > 4) &&
(VT.getVectorElementType() == MVT::i8 ||
VT.getVectorElementType() == MVT::i16)) {
Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
}
// AVX512 - Extend select with zero to merge with target shuffle.
// select(mask, extract_subvector(shuffle(x)), zero) -->
// extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
// TODO - support non target shuffles as well.
if (Subtarget.hasAVX512() && CondVT.isVector() &&
CondVT.getVectorElementType() == MVT::i1) {
auto SelectableOp = [&TLI](SDValue Op) {
return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
isTargetShuffle(Op.getOperand(0).getOpcode()) &&
isNullConstant(Op.getOperand(1)) &&
TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
Op.hasOneUse() && Op.getOperand(0).hasOneUse();
};
bool SelectableLHS = SelectableOp(LHS);
bool SelectableRHS = SelectableOp(RHS);
bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
: RHS.getOperand(0).getValueType();
unsigned NumSrcElts = SrcVT.getVectorNumElements();
EVT SrcCondVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumSrcElts);
LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
VT.getSizeInBits());
RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
VT.getSizeInBits());
Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
DAG.getUNDEF(SrcCondVT), Cond,
DAG.getIntPtrConstant(0, DL));
SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
}
}
if (SDValue V = combineSelectOfTwoConstants(N, DAG))
return V;
// Canonicalize max and min:
// (x > y) ? x : y -> (x >= y) ? x : y
// (x < y) ? x : y -> (x <= y) ? x : y
// This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
// the need for an extra compare
// against zero. e.g.
// (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
// subl %esi, %edi
// testl %edi, %edi
// movl $0, %eax
// cmovgl %edi, %eax
// =>
// xorl %eax, %eax
// subl %esi, %edi
// cmovsl %eax, %edi
if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
DAG.isEqualTo(RHS, Cond.getOperand(1))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
switch (CC) {
default: break;
case ISD::SETLT:
case ISD::SETGT: {
ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
Cond.getOperand(0), Cond.getOperand(1), NewCC);
return DAG.getSelect(DL, VT, Cond, LHS, RHS);
}
}
}
// Match VSELECTs into subs with unsigned saturation.
if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
// psubus is available in SSE2 for i8 and i16 vectors.
Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 &&
isPowerOf2_32(VT.getVectorNumElements()) &&
(VT.getVectorElementType() == MVT::i8 ||
VT.getVectorElementType() == MVT::i16)) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
// Check if one of the arms of the VSELECT is a zero vector. If it's on the
// left side invert the predicate to simplify logic below.
SDValue Other;
if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
Other = RHS;
CC = ISD::getSetCCInverse(CC, true);
} else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
Other = LHS;
}
if (Other.getNode() && Other->getNumOperands() == 2 &&
Other->getOperand(0) == Cond.getOperand(0)) {
SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
SDValue CondRHS = Cond->getOperand(1);
// Look for a general sub with unsigned saturation first.
// x >= y ? x-y : 0 --> subus x, y
// x > y ? x-y : 0 --> subus x, y
if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
Other->getOpcode() == ISD::SUB && OpRHS == CondRHS)
return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS)) {
if (isa<BuildVectorSDNode>(CondRHS)) {
// If the RHS is a constant we have to reverse the const
// canonicalization.
// x > C-1 ? x+-C : 0 --> subus x, C
auto MatchUSUBSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
return (!Op && !Cond) ||
(Op && Cond &&
Cond->getAPIntValue() == (-Op->getAPIntValue() - 1));
};
if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT,
/*AllowUndefs*/ true)) {
OpRHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
OpRHS);
return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
}
// Another special case: If C was a sign bit, the sub has been
// canonicalized into a xor.
// FIXME: Would it be better to use computeKnownBits to determine
// whether it's safe to decanonicalize the xor?
// x s< 0 ? x^C : 0 --> subus x, C
if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
if (CC == ISD::SETLT && Other.getOpcode() == ISD::XOR &&
ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
OpRHSConst->getAPIntValue().isSignMask()) {
// Note that we have to rebuild the RHS constant here to ensure we
// don't rely on particular values of undef lanes.
OpRHS = DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT);
return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
}
}
}
}
}
}
// Match VSELECTs into add with unsigned saturation.
if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
// paddus is available in SSE2 for i8 and i16 vectors.
Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 &&
isPowerOf2_32(VT.getVectorNumElements()) &&
(VT.getVectorElementType() == MVT::i8 ||
VT.getVectorElementType() == MVT::i16)) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
SDValue CondLHS = Cond->getOperand(0);
SDValue CondRHS = Cond->getOperand(1);
// Check if one of the arms of the VSELECT is vector with all bits set.
// If it's on the left side invert the predicate to simplify logic below.
SDValue Other;
if (ISD::isBuildVectorAllOnes(LHS.getNode())) {
Other = RHS;
CC = ISD::getSetCCInverse(CC, true);
} else if (ISD::isBuildVectorAllOnes(RHS.getNode())) {
Other = LHS;
}
if (Other.getNode() && Other.getOpcode() == ISD::ADD) {
SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1);
// Canonicalize condition operands.
if (CC == ISD::SETUGE) {
std::swap(CondLHS, CondRHS);
CC = ISD::SETULE;
}
// We can test against either of the addition operands.
// x <= x+y ? x+y : ~0 --> addus x, y
// x+y >= x ? x+y : ~0 --> addus x, y
if (CC == ISD::SETULE && Other == CondRHS &&
(OpLHS == CondLHS || OpRHS == CondLHS))
return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
if (isa<BuildVectorSDNode>(OpRHS) && isa<BuildVectorSDNode>(CondRHS) &&
CondLHS == OpLHS) {
// If the RHS is a constant we have to reverse the const
// canonicalization.
// x > ~C ? x+C : ~0 --> addus x, C
auto MatchUADDSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
return Cond->getAPIntValue() == ~Op->getAPIntValue();
};
if (CC == ISD::SETULE &&
ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUADDSAT))
return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
}
}
}
// Early exit check
if (!TLI.isTypeLegal(VT))
return SDValue();
if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
return V;
if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
return V;
if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
return V;
// Custom action for SELECT MMX
if (VT == MVT::x86mmx) {
LHS = DAG.getBitcast(MVT::i64, LHS);
RHS = DAG.getBitcast(MVT::i64, RHS);
SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
return DAG.getBitcast(VT, newSelect);
}
return SDValue();
}
/// Combine:
///   (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
/// to:
///   (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
/// Note that this is only legal for some op/cc combinations.
///
/// \param Cmp  The flags-producing node: an X86ISD::CMP, or an X86ISD::SUB
///             whose arithmetic result (value 0) is unused.
/// \param CC   The condition code the user applies to \p Cmp. It is rewritten
///             in place when the fold needs a different condition.
/// \returns The node produced by lowerAtomicArithWithLOCK on success, or an
///          empty SDValue when the pattern does not match (in which case
///          \p CC is left untouched).
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
                                       SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget) {
  // This combine only operates on CMP-like nodes.
  if (!(Cmp.getOpcode() == X86ISD::CMP ||
        (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
    return SDValue();

  // Can't replace the cmp if it has more uses than the one we're looking at.
  // FIXME: We would like to be able to handle this, but would need to make sure
  // all uses were updated.
  if (!Cmp.hasOneUse())
    return SDValue();

  // This only applies to variations of the common case:
  //   (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
  //   (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
  //   (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
  //   (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
  // Using the proper condcodes (see below), overflow is checked for.
  // FIXME: We can generalize both constraints:
  // - XOR/OR/AND (if they were made to survive AtomicExpand)
  // - LHS != 1
  // if the result is compared.
  SDValue CmpLHS = Cmp.getOperand(0);
  SDValue CmpRHS = Cmp.getOperand(1);

  // The value result of the atomic op is replaced with undef below, so the
  // compare must be its only user.
  if (!CmpLHS.hasOneUse())
    return SDValue();

  unsigned Opc = CmpLHS.getOpcode();
  if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
    return SDValue();

  SDValue OpRHS = CmpLHS.getOperand(2);
  auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
  if (!OpRHSC)
    return SDValue();

  // Normalize to "old value + Addend" regardless of the atomic opcode.
  APInt Addend = OpRHSC->getAPIntValue();
  if (Opc == ISD::ATOMIC_LOAD_SUB)
    Addend = -Addend;

  auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
  if (!CmpRHSC)
    return SDValue();

  APInt Comparison = CmpRHSC->getAPIntValue();

  // If the addend is the negation of the comparison value, then we can do
  // a full comparison by emitting the atomic arithmetic as a locked sub.
  if (Comparison == -Addend) {
    // The CC is fine, but we need to rewrite the LHS of the comparison as an
    // atomic sub.
    auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
    auto AtomicSub = DAG.getAtomic(
        ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),
        /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
        /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),
        AN->getMemOperand());
    auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
    DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
                                  DAG.getUNDEF(CmpLHS.getValueType()));
    DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
    return LockOp;
  }

  // We can handle comparisons with zero in a number of cases by manipulating
  // the CC used.
  if (!Comparison.isNullValue())
    return SDValue();

  // The decrement case must be tested with isAllOnesValue(): comparing an
  // APInt against the integer literal -1 goes through operator==(uint64_t),
  // which zero-extends the APInt and therefore only matches a 64-bit addend.
  // That silently disabled the "sub x, 1" folds for i8/i16/i32 atomics.
  const bool IsIncrement = Addend == 1;
  const bool IsDecrement = Addend.isAllOnesValue();

  if (CC == X86::COND_S && IsIncrement)
    CC = X86::COND_LE;
  else if (CC == X86::COND_NS && IsIncrement)
    CC = X86::COND_G;
  else if (CC == X86::COND_G && IsDecrement)
    CC = X86::COND_GE;
  else if (CC == X86::COND_LE && IsDecrement)
    CC = X86::COND_L;
  else
    return SDValue();

  SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
  DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
                                DAG.getUNDEF(CmpLHS.getValueType()));
  DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
  return LockOp;
}
// Recognize a compare of a materialized boolean against 0 or 1 and look
// through it to the EFLAGS value the boolean was computed from. The boolean
// may come from X86ISD::SETCC, X86ISD::SETCC_CARRY, or an X86ISD::CMOV that
// selects between the constants 0 and 1.
//
// Simplify the following patterns:
//   (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
//   (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
// to (Op EFLAGS Cond)
//
//   (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
//   (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
// to (Op EFLAGS !Cond)
//
// where Op could be BRCOND or CMOV.
//
// On success the underlying flags value is returned and CC is updated to the
// condition to apply to it; otherwise an empty SDValue is returned and CC is
// left unchanged.
static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
  // Only CMP nodes, or SUB nodes whose arithmetic result is unused, qualify.
  const unsigned CmpOpc = Cmp.getOpcode();
  const bool IsFlagsOnlySub =
      CmpOpc == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0);
  if (CmpOpc != X86ISD::CMP && !IsFlagsOnlySub)
    return SDValue();

  // The compare must be consumed as an equality / inequality test.
  const bool IsEqualityTest = CC == X86::COND_E;
  if (!IsEqualityTest && CC != X86::COND_NE)
    return SDValue();

  // One compare operand has to be a constant; the other one is the candidate
  // boolean. The first operand is preferred when it is a constant.
  SDValue SetCC;
  const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cmp.getOperand(0));
  if (C) {
    SetCC = Cmp.getOperand(1);
  } else {
    C = dyn_cast<ConstantSDNode>(Cmp.getOperand(1));
    if (!C)
      return SDValue();
    SetCC = Cmp.getOperand(0);
  }

  // Only comparisons against 0 or 1 are boolean tests.
  const uint64_t TestedValue = C->getZExtValue();
  if (TestedValue > 1)
    return SDValue();
  const bool checkAgainstTrue = TestedValue == 1;

  // "== 0" and "!= 1" both test for the negated boolean.
  bool needOppositeCond = IsEqualityTest != checkAgainstTrue;

  // Peel off (zext $x), (trunc $x) and (and $x, 1) wrappers.
  bool truncatedToBoolWithAnd = false;
  for (;;) {
    const unsigned WrapperOpc = SetCC.getOpcode();
    if (WrapperOpc == ISD::ZERO_EXTEND || WrapperOpc == ISD::TRUNCATE) {
      SetCC = SetCC.getOperand(0);
      continue;
    }
    if (WrapperOpc != ISD::AND)
      break;

    // For an AND, one side must be the constant 1; continue with the other.
    SDValue Masked;
    if (isOneConstant(SetCC.getOperand(1)))
      Masked = SetCC.getOperand(0);
    else if (isOneConstant(SetCC.getOperand(0)))
      Masked = SetCC.getOperand(1);
    else
      break;
    SetCC = Masked;
    truncatedToBoolWithAnd = true;
  }

  const unsigned BoolOpc = SetCC.getOpcode();

  if (BoolOpc == X86ISD::SETCC || BoolOpc == X86ISD::SETCC_CARRY) {
    if (BoolOpc == X86ISD::SETCC_CARRY) {
      // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe
      // to simplify it if the result of SETCC_CARRY is not canonicalized to 0
      // or 1, i.e. it's a comparison against true but the result of
      // SETCC_CARRY is not truncated to i1 using 'and'.
      if (checkAgainstTrue && !truncatedToBoolWithAnd)
        return SDValue();
      assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
             "Invalid use of SETCC_CARRY!");
    }
    // Set the condition code or opposite one if necessary.
    CC = X86::CondCode(SetCC.getConstantOperandVal(0));
    if (needOppositeCond)
      CC = X86::GetOppositeBranchCondition(CC);
    return SetCC.getOperand(1);
  }

  if (BoolOpc != X86ISD::CMOV)
    return SDValue();

  // CMOV: check whether the false/true values are the canonical 0 and 1.
  ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
  ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));

  // The true value always has to be a constant.
  if (!TVal)
    return SDValue();

  // Value the CMOV yields when its condition is false; treated as 0 for the
  // rdrand/rdseed special case below.
  uint64_t FalseBit = 0;
  if (FVal) {
    FalseBit = FVal->getZExtValue();
    if (FalseBit > 1)
      return SDValue();
  } else {
    SDValue Op = SetCC.getOperand(0);
    // Skip 'zext' or 'trunc' node.
    if (Op.getOpcode() == ISD::ZERO_EXTEND ||
        Op.getOpcode() == ISD::TRUNCATE)
      Op = Op.getOperand(0);
    // A special case for rdrand/rdseed, where 0 is set if false cond is
    // found.
    const bool IsRandomValue = (Op.getOpcode() == X86ISD::RDRAND ||
                                Op.getOpcode() == X86ISD::RDSEED) &&
                               Op.getResNo() == 0;
    if (!IsRandomValue)
      return SDValue();
  }

  // The true value must be the boolean complement of the false value.
  if (TVal->getZExtValue() != (FalseBit ^ 1))
    return SDValue();

  // If the false value is 1, the CMOV computes the negated condition.
  if (FalseBit == 1)
    needOppositeCond = !needOppositeCond;

  CC = X86::CondCode(SetCC.getConstantOperandVal(2));
  if (needOppositeCond)
    CC = X86::GetOppositeBranchCondition(CC);
  return SetCC.getOperand(3);
}
/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
/// Match:
///   (X86or (X86setcc) (X86setcc))
///   (X86cmp (and (X86setcc) (X86setcc)), 0)
///
/// \param Cond   The condition to inspect; a compare against zero is looked
///               through first.
/// \param CC0    [out] Condition code of the first SETCC.
/// \param CC1    [out] Condition code of the second SETCC.
/// \param Flags  [out] The EFLAGS value shared by both SETCCs.
/// \param isAnd  [out] True when the logic node is an AND, false for an OR.
/// \returns true when the pattern matched. CC0, CC1 and Flags are only
///          written on success.
static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
                                           X86::CondCode &CC1, SDValue &Flags,
                                           bool &isAnd) {
  // Step over "cmp X, 0" to reach the logic operation being tested.
  if (Cond->getOpcode() == X86ISD::CMP) {
    if (!isNullConstant(Cond->getOperand(1)))
      return false;
    Cond = Cond->getOperand(0);
  }

  // Both the generic and the flag-producing X86 forms are accepted.
  const unsigned LogicOpc = Cond->getOpcode();
  const bool MatchedAnd = LogicOpc == ISD::AND || LogicOpc == X86ISD::AND;
  const bool MatchedOr = LogicOpc == ISD::OR || LogicOpc == X86ISD::OR;
  isAnd = MatchedAnd;
  if (!MatchedAnd && !MatchedOr)
    return false;

  // Make sure we have SETCC nodes, using the same flags value.
  SDValue FirstSetCC = Cond->getOperand(0);
  SDValue SecondSetCC = Cond->getOperand(1);
  if (FirstSetCC.getOpcode() != X86ISD::SETCC)
    return false;
  if (SecondSetCC.getOpcode() != X86ISD::SETCC)
    return false;
  if (FirstSetCC->getOperand(1) != SecondSetCC->getOperand(1))
    return false;

  CC0 = static_cast<X86::CondCode>(FirstSetCC->getConstantOperandVal(0));
  CC1 = static_cast<X86::CondCode>(SecondSetCC->getConstantOperandVal(0));
  Flags = FirstSetCC->getOperand(1);
  return true;
}
// When legalizing carry, we create carries via add X, -1
// If that comes from an actual carry, via setcc, we use the
// carry directly.
//
// Returns the flags value to use in place of EFLAGS, or an empty SDValue when
// EFLAGS does not have the expected shape.
static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
  // Only "add X, -1" is of interest.
  if (EFLAGS.getOpcode() != X86ISD::ADD ||
      !isAllOnesConstant(EFLAGS.getOperand(1)))
    return SDValue();

  // Look through truncates, extensions and "and X, 1" to find the node that
  // produced the carry bit.
  auto IsLookThroughNode = [](SDValue V) {
    switch (V.getOpcode()) {
    case ISD::TRUNCATE:
    case ISD::ZERO_EXTEND:
    case ISD::SIGN_EXTEND:
    case ISD::ANY_EXTEND:
      return true;
    case ISD::AND:
      return isOneConstant(V.getOperand(1));
    default:
      return false;
    }
  };
  SDValue Carry = EFLAGS.getOperand(0);
  while (IsLookThroughNode(Carry))
    Carry = Carry.getOperand(0);

  const unsigned CarryOpc = Carry.getOpcode();
  if (CarryOpc != X86ISD::SETCC && CarryOpc != X86ISD::SETCC_CARRY)
    return SDValue();

  // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
  const uint64_t CarryCC = Carry.getConstantOperandVal(0);
  SDValue CarryFlags = Carry.getOperand(1);

  // The bit already is the carry flag of CarryFlags.
  if (CarryCC == X86::COND_B)
    return CarryFlags;

  if (CarryCC == X86::COND_A) {
    // Try to convert COND_A into COND_B in an attempt to facilitate
    // materializing "setb reg".
    //
    // Do not flip "e > c", where "c" is a constant, because Cmp
    // instruction cannot take an immediate as its first operand.
    //
    const bool CanCommuteSub =
        CarryFlags.getOpcode() == X86ISD::SUB &&
        CarryFlags.getNode()->hasOneUse() &&
        CarryFlags.getValueType().isInteger() &&
        !isa<ConstantSDNode>(CarryFlags.getOperand(1));
    if (!CanCommuteSub)
      return SDValue();

    SDValue Swapped =
        DAG.getNode(X86ISD::SUB, SDLoc(CarryFlags), CarryFlags->getVTList(),
                    CarryFlags.getOperand(1), CarryFlags.getOperand(0));
    return SDValue(Swapped.getNode(), CarryFlags.getResNo());
  }

  // If this is a check of the z flag of an add with 1, switch to the
  // C flag.
  if (CarryCC == X86::COND_E && CarryFlags.getOpcode() == X86ISD::ADD &&
      isOneConstant(CarryFlags.getOperand(1)))
    return CarryFlags;

  return SDValue();
}
/// Optimize an EFLAGS definition used according to the condition code \p CC
/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
/// uses of chain values.
///
/// The individual simplifications are tried in a fixed order: the carry
/// look-through (only for COND_B), then the boolean-test fold, and finally the
/// atomic-arithmetic flag reuse. The first one that succeeds wins.
///
/// \returns the replacement flags value, or an empty SDValue if nothing
///          applied.
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
                                  SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  if (CC == X86::COND_B) {
    SDValue CarryFlags = combineCarryThroughADD(EFLAGS, DAG);
    if (CarryFlags.getNode())
      return CarryFlags;
  }

  SDValue BoolTestFlags = checkBoolTestSetCCCombine(EFLAGS, CC);
  if (BoolTestFlags.getNode())
    return BoolTestFlags;

  return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
}
/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
///
/// Operand 0 is the value produced when the condition is false, operand 1 the
/// value produced when it is true, operand 2 the X86 condition code constant
/// and operand 3 the flags-producing node. The folds below are tried in
/// order; the first that matches returns the replacement value, and an empty
/// SDValue is returned when none applies.
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
SDValue FalseOp = N->getOperand(0);
SDValue TrueOp = N->getOperand(1);
X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
SDValue Cond = N->getOperand(3);
// cmov X, X, ?, ? --> X
if (TrueOp == FalseOp)
return TrueOp;
// Try to simplify the EFLAGS and condition code operands.
// We can't always do this as FCMOV only supports a subset of X86 cond.
// Note that combineSetCCEFLAGS takes CC by reference, so CC may have been
// rewritten even when the f80 check below rejects the new flags.
if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
Flags};
return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
}
}
// If this is a select between two integer constants, try to do some
// optimizations. Note that the operands are ordered the opposite of SELECT
// operands.
if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
// Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
// larger than FalseC (the false value).
if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
CC = X86::GetOppositeBranchCondition(CC);
std::swap(TrueC, FalseC);
std::swap(TrueOp, FalseOp);
}
// Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
// This is efficient for any integer data type (including i8/i16) and
// shift amount.
if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
Cond = getSETCC(CC, Cond, DL, DAG);
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
unsigned ShAmt = TrueC->getAPIntValue().logBase2();
Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
DAG.getConstant(ShAmt, DL, MVT::i8));
return Cond;
}
// Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
// for any integer data type, including i8/i16.
if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
Cond = getSETCC(CC, Cond, DL, DAG);
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
FalseC->getValueType(0), Cond);
Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
SDValue(FalseC, 0));
return Cond;
}
// Optimize cases that will turn into an LEA instruction. This requires
// an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
// 64-bit difference of the two constants, truncated to 32 bits for i32.
uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
bool isFastMultiplier = false;
if (Diff < 10) {
switch ((unsigned char)Diff) {
default: break;
case 1: // result = add base, cond
case 2: // result = lea base( , cond*2)
case 3: // result = lea base(cond, cond*2)
case 4: // result = lea base( , cond*4)
case 5: // result = lea base(cond, cond*4)
case 8: // result = lea base( , cond*8)
case 9: // result = lea base(cond, cond*8)
isFastMultiplier = true;
break;
}
}
if (isFastMultiplier) {
// NOTE: this APInt intentionally shadows the uint64_t Diff above; it
// carries the difference at the constants' own bit width.
APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
Cond = getSETCC(CC, Cond, DL ,DAG);
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
Cond);
// Scale the condition by the difference.
if (Diff != 1)
Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
DAG.getConstant(Diff, DL, Cond.getValueType()));
// Add the base if non-zero.
if (FalseC->getAPIntValue() != 0)
Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
SDValue(FalseC, 0));
return Cond;
}
}
}
}
// Handle these cases:
// (select (x != c), e, c) -> (select (x != c), e, x),
// (select (x == c), c, e) -> (select (x == c), x, e)
// where the c is an integer constant, and the "select" is the combination
// of CMOV and CMP.
//
// The rationale for this change is that the conditional-move from a constant
// needs two instructions, however, conditional-move from a register needs
// only one instruction.
//
// CAVEAT: By replacing a constant with a symbolic value, it may obscure
// some instruction-combining opportunities. This opt needs to be
// postponed as late as possible.
//
if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
// the DCI.xxxx conditions are provided to postpone the optimization as
// late as possible.
ConstantSDNode *CmpAgainst = nullptr;
if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
(CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
!isa<ConstantSDNode>(Cond.getOperand(0))) {
// Rewrite the COND_NE form as COND_E so only one shape is handled below.
if (CC == X86::COND_NE &&
CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
CC = X86::GetOppositeBranchCondition(CC);
std::swap(TrueOp, FalseOp);
}
if (CC == X86::COND_E &&
CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
SDValue Ops[] = { FalseOp, Cond.getOperand(0),
DAG.getConstant(CC, DL, MVT::i8), Cond };
return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
}
}
}
// Fold and/or of setcc's to double CMOV:
// (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
// (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
//
// This combine lets us generate:
// cmovcc1 (jcc1 if we don't have CMOV)
// cmovcc2 (same)
// instead of:
// setcc1
// setcc2
// and/or
// cmovne (jne if we don't have CMOV)
// When we can't use the CMOV instruction, it might increase branch
// mispredicts.
// When we can use CMOV, or when there is no mispredict, this improves
// throughput and reduces register pressure.
//
if (CC == X86::COND_NE) {
SDValue Flags;
X86::CondCode CC0, CC1;
bool isAndSetCC;
if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
// The AND form is handled by swapping the arms and inverting both
// conditions, matching the second pattern in the comment above.
if (isAndSetCC) {
std::swap(FalseOp, TrueOp);
CC0 = X86::GetOppositeBranchCondition(CC0);
CC1 = X86::GetOppositeBranchCondition(CC1);
}
SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
Flags};
SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
return CMOV;
}
}
// Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
// (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
// Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
// (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
if ((CC == X86::COND_NE || CC == X86::COND_E) &&
Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
SDValue Add = TrueOp;
SDValue Const = FalseOp;
// Canonicalize the condition code for easier matching and output.
if (CC == X86::COND_E)
std::swap(Add, Const);
// We might have replaced the constant in the cmov with the LHS of the
// compare. If so change it to the RHS of the compare.
if (Const == Cond.getOperand(0))
Const = Cond.getOperand(1);
// Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
(Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
EVT VT = N->getValueType(0);
// This should constant fold.
SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
// The new CMOV always uses COND_NE because Add/Const were canonicalized
// above.
SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
DAG.getConstant(X86::COND_NE, DL, MVT::i8),
Cond);
return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
}
}
return SDValue();
}
/// Different mul shrinking modes.
enum ShrinkMode {
  MULS8,  ///< Operands fit in signed 8 bits.
  MULU8,  ///< Operands fit in unsigned 8 bits.
  MULS16, ///< Operands fit in signed 16 bits.
  MULU16  ///< Operands fit in unsigned 16 bits.
};
/// Decide whether the 32-bit-element multiply \p N can be performed on
/// narrower elements, based on the number of known sign bits of its operands.
///
/// \param N     The multiply node; both of its operands are inspected.
/// \param DAG   Used to query sign-bit information for the operands.
/// \param Mode  [out] The selected shrinking mode. Only written when the
///              function returns true.
/// \returns true if one of the shrinking modes applies.
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
  // Only multiplies with 32-bit scalar elements are candidates.
  if (N->getOperand(0).getValueType().getScalarSizeInBits() != 32)
    return false;

  assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");

  SDValue LHS = N->getOperand(0);
  const unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
  const bool LHSSignBitZero = DAG.SignBitIsZero(LHS);

  SDValue RHS = N->getOperand(1);
  const unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
  const bool RHSSignBitZero = DAG.SignBitIsZero(RHS);

  // The operand with the fewest known sign bits limits how far we can shrink.
  const unsigned MinSignBits = std::min(LHSSignBits, RHSSignBits);
  const bool AllPositive = LHSSignBitZero && RHSSignBitZero;

  // When ranges are from -128 ~ 127, use MULS8 mode.
  if (MinSignBits >= 25) {
    Mode = MULS8;
    return true;
  }
  // When ranges are from 0 ~ 255, use MULU8 mode.
  if (AllPositive && MinSignBits >= 24) {
    Mode = MULU8;
    return true;
  }
  // When ranges are from -32768 ~ 32767, use MULS16 mode.
  if (MinSignBits >= 17) {
    Mode = MULS16;
    return true;
  }
  // When ranges are from 0 ~ 65535, use MULU16 mode.
  if (AllPositive && MinSignBits >= 16) {
    Mode = MULU16;
    return true;
  }
  return false;
}
/// When the operands of vector mul are extended from smaller size values,
/// like i8 and i16, the type of mul may be shrunk to generate more
/// efficient code. Two typical patterns are handled:
/// Pattern1:
/// %2 = sext/zext <N x i8> %1 to <N x i32>
/// %4 = sext/zext <N x i8> %3 to <N x i32>
/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
/// %5 = mul <N x i32> %2, %4
///
/// Pattern2:
/// %2 = zext/sext <N x i16> %1 to <N x i32>
/// %4 = zext/sext <N x i16> %3 to <N x i32>
/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
/// %5 = mul <N x i32> %2, %4
///
/// There are four mul shrinking modes:
/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
/// -128 to 128, and the scalar value range of %4 is also -128 to 128,
/// generate pmullw+sext32 for it (MULS8 mode).
/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
/// generate pmullw+zext32 for it (MULU8 mode).
/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
/// generate pmullw+pmulhw for it (MULS16 mode).
/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
/// generate pmullw+pmulhuw for it (MULU16 mode).
static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Check for legality
// pmullw/pmulhw are not supported by SSE.
if (!Subtarget.hasSSE2())
return SDValue();
// Check for profitability
// pmulld is supported since SSE41. It is better to use pmulld
// instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
// the expansion.
bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
return SDValue();
ShrinkMode Mode;
if (!canReduceVMulWidth(N, DAG, Mode))
return SDValue();
SDLoc DL(N);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getOperand(0).getValueType();
unsigned NumElts = VT.getVectorNumElements();
if ((NumElts % 2) != 0)
return SDValue();
unsigned RegSize = 128;
MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
// Shrink the operands of mul.
SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
if (ExperimentalVectorWideningLegalization ||
NumElts >= OpsVT.getVectorNumElements()) {
// Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
// lower part is needed.
SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
if (Mode == MULU8 || Mode == MULS8)
return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
DL, VT, MulLo);
MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
// Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
// the higher part is also needed.
SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
ReducedVT, NewN0, NewN1);
// Repack the lower part and higher part result of mul into a wider
// result.
// Generate shuffle functioning as punpcklwd.
SmallVector<int, 16> ShuffleMask(NumElts);
for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
ShuffleMask[2 * i] = i;
ShuffleMask[2 * i + 1] = i + NumElts;
}
SDValue ResLo =
DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
ResLo = DAG.getBitcast(ResVT, ResLo);
// Generate shuffle functioning as punpckhwd.
for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
ShuffleMask[2 * i] = i + NumElts / 2;
ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
}
SDValue ResHi =
DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
ResHi = DAG.getBitcast(ResVT, ResHi);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
}
// When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
// to legalize the mul explicitly because implicit legalization for type
// <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
// instructions which will not exist when we explicitly legalize it by
// extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
// <4 x i16> undef).
//
// Legalize the operands of mul.
// FIXME: We may be able to handle non-concatenated vectors by insertion.
unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
if ((RegSize % ReducedSizeInBits) != 0)
return SDValue();
SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
DAG.getUNDEF(ReducedVT));
Ops[0] = NewN0;
NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
Ops[0] = NewN1;
NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
if (Mode == MULU8 || Mode == MULS8) {
// Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
// part is needed.
SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
// convert the type of mul result to VT.
MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
: ISD::SIGN_EXTEND_VECTOR_INREG,
DL, ResVT, Mul);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
DAG.getIntPtrConstant(0, DL));
}
// Generate the lower and higher part of mul: pmulhw/pmulhuw. For
// MULU16/MULS16, both parts are needed.
SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
OpsVT, NewN0, NewN1);
// Repack the lower part and higher part result of mul into a wider
// result. Make sure the type of mul result is VT.
MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi);
Res = DAG.getBitcast(ResVT, Res);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
DAG.getIntPtrConstant(0, DL));
}
/// Expand a multiply of N's first operand (x) by \p MulAmt into a short
/// sequence of MUL_IMM by 3/5/9, shifts and adds/subs, for a fixed list of
/// constants, or into two shifts and an add when \p MulAmt is the sum of two
/// powers of two whose smaller term is 2, 4 or 8.
///
/// Returns the expanded value, or an empty SDValue if \p MulAmt is not one of
/// the handled constants.
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
                                 EVT VT, const SDLoc &DL) {
  // Builds ((x * Mult) << Shift) + x, or ((x * Mult) << Shift) - x when isAdd
  // is false.
  auto ScaleShiftAdjust = [&](int Mult, int Shift, bool isAdd) {
    SDValue Scaled = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
                                 DAG.getConstant(Mult, DL, VT));
    SDValue Shifted = DAG.getNode(ISD::SHL, DL, VT, Scaled,
                                  DAG.getConstant(Shift, DL, MVT::i8));
    return DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Shifted,
                       N->getOperand(0));
  };

  // Builds ((x * Mul1) * Mul2) + x, or ((x * Mul1) * Mul2) - x when isAdd is
  // false.
  auto ScaleScaleAdjust = [&](int Mul1, int Mul2, bool isAdd) {
    SDValue First = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
                                DAG.getConstant(Mul1, DL, VT));
    SDValue Second = DAG.getNode(X86ISD::MUL_IMM, DL, VT, First,
                                 DAG.getConstant(Mul2, DL, VT));
    return DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Second,
                       N->getOperand(0));
  };

  switch (MulAmt) {
  // Built on x * 3.
  case 13: // add ((shl (mul x, 3), 2), x)
    return ScaleShiftAdjust(3, 2, /*isAdd*/ true);
  case 23: // sub ((shl (mul x, 3), 3), x)
    return ScaleShiftAdjust(3, 3, /*isAdd*/ false);

  // Built on x * 5.
  case 11: // add ((shl (mul x, 5), 1), x)
    return ScaleShiftAdjust(5, 1, /*isAdd*/ true);
  case 21: // add ((shl (mul x, 5), 2), x)
    return ScaleShiftAdjust(5, 2, /*isAdd*/ true);
  case 41: // add ((shl (mul x, 5), 3), x)
    return ScaleShiftAdjust(5, 3, /*isAdd*/ true);
  case 22: { // add (add ((shl (mul x, 5), 2), x), x)
    SDValue Times21 = ScaleShiftAdjust(5, 2, /*isAdd*/ true);
    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), Times21);
  }

  // Built on x * 9.
  case 19: // add ((shl (mul x, 9), 1), x)
    return ScaleShiftAdjust(9, 1, /*isAdd*/ true);
  case 37: // add ((shl (mul x, 9), 2), x)
    return ScaleShiftAdjust(9, 2, /*isAdd*/ true);
  case 73: // add ((shl (mul x, 9), 3), x)
    return ScaleShiftAdjust(9, 3, /*isAdd*/ true);

  // Two chained MUL_IMMs.
  case 26: // add ((mul (mul x, 5), 5), x)
    return ScaleScaleAdjust(5, 5, /*isAdd*/ true);
  case 28: // add ((mul (mul x, 9), 3), x)
    return ScaleScaleAdjust(9, 3, /*isAdd*/ true);
  case 29: { // add (add ((mul (mul x, 9), 3), x), x)
    SDValue Times28 = ScaleScaleAdjust(9, 3, /*isAdd*/ true);
    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), Times28);
  }

  default:
    break;
  }

  // Another trick: a power of 2 plus 2/4/8 can be done with a shift followed
  // by a single LEA. Clearing the lowest set bit must leave a single bit (so
  // MulAmt is a sum of two powers of two), and the lowest set bit must be at
  // position 1, 2 or 3.
  // TODO: We can do this even without LEA at a cost of two shifts and an add.
  uint64_t HighTerm = MulAmt & (MulAmt - 1);
  if (!isPowerOf2_64(HighTerm))
    return SDValue();

  unsigned ScaleShift = countTrailingZeros(MulAmt);
  if (ScaleShift < 1 || ScaleShift >= 4)
    return SDValue();

  unsigned ShiftAmt = Log2_64(HighTerm);
  SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                               DAG.getConstant(ShiftAmt, DL, MVT::i8));
  SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                               DAG.getConstant(ScaleShift, DL, MVT::i8));
  return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
}
// Turn a vXi32 multiply into X86ISD::VPMADDWD when the upper 17 bits of every
// element of both operands are known to be zero. The operands are bitcast to
// a vXi16 type with twice as many elements before being handed to the node.
// PMADDWD is always at least as quick as PMULLD, except on KNL.
static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
  if (!Subtarget.hasSSE2() || Subtarget.isPMADDWDSlow())
    return SDValue();

  // Only vXi32 vectors are supported.
  EVT VT = N->getValueType(0);
  if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
    return SDValue();

  // The vXi16 type must be legal (this covers AVX512 without BWI), unless
  // this is a v2i32 that is going to be widened.
  MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
  bool WidenedV2I32 =
      ExperimentalVectorWideningLegalization && VT == MVT::v2i32;
  if (!WidenedV2I32 && !DAG.getTargetLoweringInfo().isTypeLegal(WVT))
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Without SSE4.1, a two-step zero extension (from 8 bits or fewer) is
  // better served by reducing the vmul width instead.
  auto IsZExtFromI8OrLess = [](SDValue V) {
    return V.getOpcode() == ISD::ZERO_EXTEND &&
           V.getOperand(0).getScalarValueSizeInBits() <= 8;
  };
  if (!Subtarget.hasSSE41() && IsZExtFromI8OrLess(N0) &&
      IsZExtFromI8OrLess(N1))
    return SDValue();

  // Both operands need their top 17 bits known zero.
  APInt Mask17 = APInt::getHighBitsSet(32, 17);
  bool UpperBitsClear =
      DAG.MaskedValueIsZero(N1, Mask17) && DAG.MaskedValueIsZero(N0, Mask17);
  if (!UpperBitsClear)
    return SDValue();

  // Let SplitOpsAndApply deal with AVX splitting; the result element count is
  // derived from the width of whatever pieces it passes in.
  auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                           ArrayRef<SDValue> Ops) {
    unsigned NumI32Elts = Ops[0].getValueSizeInBits() / 32;
    MVT OpVT = MVT::getVectorVT(MVT::i32, NumI32Elts);
    return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
  };
  return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
                          { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
                          PMADDWDBuilder);
}
/// Turn a vXi64 multiply into X86ISD::PMULDQ or X86ISD::PMULUDQ when only the
/// low 32 bits of each operand element carry information: PMULDQ (needs
/// SSE4.1) when both operands have more than 32 sign bits, PMULUDQ when the
/// upper 32 bits of both operands are known zero.
///
/// Returns an empty SDValue when neither applies.
static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  if (!Subtarget.hasSSE2())
    return SDValue();

  // Only vXi64 vectors with a power-of-two element count of at least 2.
  EVT VT = N->getValueType(0);
  if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
      VT.getVectorNumElements() < 2 ||
      !isPowerOf2_32(VT.getVectorNumElements()))
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Emits the given target multiply through SplitOpsAndApply so that wide
  // vectors get split as needed.
  auto EmitSplitMul = [&](unsigned Opc) {
    auto Builder = [Opc](SelectionDAG &DAG, const SDLoc &DL,
                         ArrayRef<SDValue> Ops) {
      return DAG.getNode(Opc, DL, Ops[0].getValueType(), Ops);
    };
    return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 }, Builder,
                            /*CheckBWI*/false);
  };

  // MULDQ returns the 64-bit result of the signed multiplication of the lower
  // 32-bits. We can lower with this if the sign bits stretch that far.
  if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
      DAG.ComputeNumSignBits(N1) > 32)
    return EmitSplitMul(X86ISD::PMULDQ);

  // If the upper bits are zero we can use a single pmuludq.
  APInt UpperHalf = APInt::getHighBitsSet(64, 32);
  if (DAG.MaskedValueIsZero(N0, UpperHalf) &&
      DAG.MaskedValueIsZero(N1, UpperHalf))
    return EmitSplitMul(X86ISD::PMULUDQ);

  return SDValue();
}
/// Optimize a single multiply with constant into two operations in order to
/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
///
/// Before the scalar constant handling, the vector-specific combines
/// (PMADDWD, PMULDQ/PMULUDQ and, before legalization, width reduction) are
/// tried first. The scalar path only handles i32/i64 multiplies by a
/// non-power-of-two constant, after legalization, when not optimizing for
/// minimum size, and when MulConstantOptimization is enabled.
///
/// Returns the replacement value, or an empty SDValue if nothing applied.
static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
// Vector combines take priority over the scalar constant expansion.
if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
return V;
if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
return V;
// Before legalization, a vector multiply either gets its width reduced or is
// left alone; none of the scalar logic below applies to it.
if (DCI.isBeforeLegalize() && VT.isVector())
return reduceVMULWidth(N, DAG, Subtarget);
if (!MulConstantOptimization)
return SDValue();
// An imul is usually smaller than the alternative sequence.
if (DAG.getMachineFunction().getFunction().hasMinSize())
return SDValue();
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
return SDValue();
if (VT != MVT::i64 && VT != MVT::i32)
return SDValue();
// The multiplier must be a constant that is not a power of two.
ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!C)
return SDValue();
if (isPowerOf2_64(C->getZExtValue()))
return SDValue();
// Work with the magnitude of the multiplier and remember the sign so the
// result can be negated at the end.
int64_t SignMulAmt = C->getSExtValue();
assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
SDLoc DL(N);
// +/-3, +/-5 and +/-9 map to a single MUL_IMM, plus a negate if needed.
if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
DAG.getConstant(AbsMulAmt, DL, VT));
if (SignMulAmt < 0)
NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
NewMul);
return NewMul;
}
// Try to split the magnitude into MulAmt1 * MulAmt2 with MulAmt1 in
// {9, 5, 3}, preferring the largest factor. Both stay 0 if none divides it.
uint64_t MulAmt1 = 0;
uint64_t MulAmt2 = 0;
if ((AbsMulAmt % 9) == 0) {
MulAmt1 = 9;
MulAmt2 = AbsMulAmt / 9;
} else if ((AbsMulAmt % 5) == 0) {
MulAmt1 = 5;
MulAmt2 = AbsMulAmt / 5;
} else if ((AbsMulAmt % 3) == 0) {
MulAmt1 = 3;
MulAmt2 = AbsMulAmt / 3;
}
SDValue NewMul;
// For negative multiply amounts, only allow MulAmt2 to be a power of 2.
if (MulAmt2 &&
(isPowerOf2_64(MulAmt2) ||
(SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
if (isPowerOf2_64(MulAmt2) &&
!(SignMulAmt >= 0 && N->hasOneUse() &&
N->use_begin()->getOpcode() == ISD::ADD))
// If second multiplier is pow2, issue it first. We want the multiply by
// 3, 5, or 9 to be folded into the addressing mode unless the lone use
// is an add. Only do this for positive multiply amounts since the
// negate would prevent it from being used as an address mode anyway.
std::swap(MulAmt1, MulAmt2);
// First factor: a shift if it is a power of two, otherwise a MUL_IMM.
if (isPowerOf2_64(MulAmt1))
NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
else
NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
DAG.getConstant(MulAmt1, DL, VT));
// Second factor, applied to the result of the first.
if (isPowerOf2_64(MulAmt2))
NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
else
NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
DAG.getConstant(MulAmt2, DL, VT));
// Negate the result.
if (SignMulAmt < 0)
NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
NewMul);
} else if (!Subtarget.slowLEA())
// No usable two-factor split: try the table of special constants. Note that
// the zero-extended (not absolute) multiplier is passed here.
NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
// Last resort: multipliers within 1 or 2 of a power of two become a shift
// plus one or two adds/subs.
if (!NewMul) {
assert(C->getZExtValue() != 0 &&
C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
"Both cases that could cause potential overflows should have "
"already been handled.");
if (isPowerOf2_64(AbsMulAmt - 1)) {
// (mul x, 2^N + 1) => (add (shl x, N), x)
NewMul = DAG.getNode(
ISD::ADD, DL, VT, N->getOperand(0),
DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
MVT::i8)));
// To negate, subtract the number from zero
if (SignMulAmt < 0)
NewMul = DAG.getNode(ISD::SUB, DL, VT,
DAG.getConstant(0, DL, VT), NewMul);
} else if (isPowerOf2_64(AbsMulAmt + 1)) {
// (mul x, 2^N - 1) => (sub (shl x, N), x)
NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(AbsMulAmt + 1),
DL, MVT::i8));
// To negate, reverse the operands of the subtract.
if (SignMulAmt < 0)
NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
else
NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
} else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
// (mul x, 2^N + 2) => (add (add (shl x, N), x), x)
NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(AbsMulAmt - 2),
DL, MVT::i8));
NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
} else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
// (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x)
NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(AbsMulAmt + 2),
DL, MVT::i8));
NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
}
}
// May still be empty if no expansion matched.
return NewMul;
}
/// Combine an ISD::SHL node. Two folds are attempted:
///   - scalar: (shl (and setcc_c, c1), c2) -> (and setcc_c, (c1 << c2))
///   - vector: (shl V, splat 1)            -> (add V, V)
/// Returns an empty SDValue when neither applies.
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  EVT VT = N0.getValueType();

  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
  // since the result of setcc_c is all zero's or all ones.
  bool IsScalarShlOfMaskedValue =
      VT.isInteger() && !VT.isVector() && N1C &&
      N0.getOpcode() == ISD::AND &&
      N0.getOperand(1).getOpcode() == ISD::Constant;
  if (IsScalarShlOfMaskedValue) {
    SDValue N00 = N0.getOperand(0);
    APInt Mask = N0.getConstantOperandAPInt(1);
    Mask <<= N1C->getAPIntValue();

    // Bit-widening nodes wrapping a setcc_c can be handled too, provided the
    // shifted mask is checked so that semantics are preserved. The transform
    // is not safe if c1 << c2 exceeds the bitwidth of the underlying setcc_c
    // when the setcc_c was zero (or any) extended. For example:
    //   zext(setcc_c)                 -> i32 0x0000FFFF
    //   c1                            -> i32 0x0000FFFF
    //   c2                            -> i32 0x00000001
    //   (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
    //   (and setcc_c, (c1 << c2))     -> i32 0x0000FFFE
    bool MaskOK = false;
    switch (N00.getOpcode()) {
    case X86ISD::SETCC_CARRY:
      MaskOK = true;
      break;
    case ISD::SIGN_EXTEND:
      MaskOK = N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY;
      break;
    case ISD::ZERO_EXTEND:
    case ISD::ANY_EXTEND:
      if (N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)
        MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
      break;
    default:
      break;
    }

    if (MaskOK && Mask != 0) {
      SDLoc DL(N);
      SDValue ShiftedMask = DAG.getConstant(Mask, DL, VT);
      return DAG.getNode(ISD::AND, DL, VT, N00, ShiftedMask);
    }
  }

  // Hardware support for vector shifts is sparse which makes us scalarize the
  // vector operations in many cases. Also, on sandybridge ADD is faster than
  // shl.
  // (shl V, 1) -> add V,V
  auto *ShAmtBV = dyn_cast<BuildVectorSDNode>(N1);
  if (!ShAmtBV)
    return SDValue();
  auto *ShAmtSplat = ShAmtBV->getConstantSplatNode();
  if (!ShAmtSplat)
    return SDValue();

  assert(N0.getValueType().isVector() && "Invalid vector shift type");
  // Every lane is shifted by one; express that as an ADD of the value with
  // itself, since in many cases there is no hardware support for the shift.
  if (ShAmtSplat->getAPIntValue() == 1)
    return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);

  return SDValue();
}
/// Combine an ISD::SRA node of the form (sra (shl a, ShlConst), SarConst),
/// where ShlConst leaves exactly an i8, i16 or i32 in the top of the value
/// (i.e. ShlConst is one of [56,48,32,24,16]).
///
/// The pair is rewritten around a sign-extend-in-register of `a`:
///   - SarConst == ShlConst: (sext_inreg a)
///   - SarConst <  ShlConst: (shl (sext_inreg a), ShlConst - SarConst)
///   - SarConst >  ShlConst: (sra (sext_inreg a), SarConst - ShlConst)
///
/// sexts in X86 are MOVs. The MOVs have the same code size as the SHIFTs they
/// replace (only SHIFT by 1 has lower code size), but have 2 advantages:
///   1. MOVs can write to a register that differs from source
///   2. MOVs accept memory operands
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
  SDValue Shl = N->getOperand(0);
  SDValue SarAmt = N->getOperand(1);
  EVT VT = Shl.getValueType();
  unsigned Size = VT.getSizeInBits();

  // Scalar only, both shift amounts constant, and the shl has no other user.
  bool IsCandidate = !VT.isVector() && SarAmt.getOpcode() == ISD::Constant &&
                     Shl.getOpcode() == ISD::SHL && Shl.hasOneUse() &&
                     Shl.getOperand(1).getOpcode() == ISD::Constant;
  if (!IsCandidate)
    return SDValue();

  SDValue Src = Shl.getOperand(0);
  SDValue ShlAmt = Shl.getOperand(1);
  APInt ShlConst = (cast<ConstantSDNode>(ShlAmt))->getAPIntValue();
  APInt SarConst = (cast<ConstantSDNode>(SarAmt))->getAPIntValue();
  EVT AmtVT = SarAmt.getValueType();

  if (SarConst.isNegative())
    return SDValue();

  for (MVT NarrowVT : { MVT::i8, MVT::i16, MVT::i32 }) {
    unsigned NarrowBits = NarrowVT.getSizeInBits();
    // Skip types that are not narrower than VT (no sext exists for them).
    if (NarrowBits >= Size)
      continue;
    // Skip unless the shl discards exactly the bits above the narrow type.
    unsigned DiscardedBits = Size - NarrowBits;
    if (ShlConst != DiscardedBits)
      continue;

    SDLoc DL(N);
    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Src,
                              DAG.getValueType(NarrowVT));

    // What is left of the right shift once the sign extension has been
    // accounted for; negative means a left shift remains.
    APInt Residual = SarConst - DiscardedBits;
    if (Residual == 0)
      return Ext;
    if (Residual.isNegative())
      return DAG.getNode(ISD::SHL, DL, VT, Ext,
                         DAG.getConstant(-Residual, DL, AmtVT));
    return DAG.getNode(ISD::SRA, DL, VT, Ext,
                       DAG.getConstant(Residual, DL, AmtVT));
  }
  return SDValue();
}
/// Combine an ISD::SRL node: rewrite (srl (and X, AndC), ShiftC) as
/// (and (srl X, ShiftC), (AndC >> ShiftC)) when that shrinks the mask
/// constant to fit in 8 bits or in 32 bits.
///
/// Only runs in the last DAG combine, since it can interfere with other
/// combines. Returns an empty SDValue when the rewrite does not apply.
static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
                                        TargetLowering::DAGCombinerInfo &DCI) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();

  if (!DCI.isAfterLegalizeDAG())
    return SDValue();

  // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
  // TODO: This is a generic DAG combine that became an x86-only combine to
  // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
  // and-not ('andn').
  bool IsSingleUseAnd = N0.getOpcode() == ISD::AND && N0.hasOneUse();
  if (!IsSingleUseAnd)
    return SDValue();

  auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
  auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
  if (!(ShiftC && AndC))
    return SDValue();

  APInt OldMask = AndC->getAPIntValue();

  // A low-bit mask of 8, 16, 32, ... ones can be matched by a zero extend;
  // leave those alone.
  if (OldMask.isMask()) {
    unsigned NumOnes = OldMask.countTrailingOnes();
    if (NumOnes >= 8 && isPowerOf2_32(NumOnes))
      return SDValue();
  }

  // If the constant mask drops to 8 bits or to 32 bits, the transform should
  // reduce code size. It may also enable secondary transforms from improved
  // known-bits analysis or instruction selection.
  APInt NewMask = OldMask.lshr(ShiftC->getAPIntValue());
  unsigned OldBits = OldMask.getMinSignedBits();
  unsigned NewBits = NewMask.getMinSignedBits();
  bool NowFitsIn8 = OldBits > 8 && NewBits <= 8;
  bool NowFitsIn32 = OldBits > 32 && NewBits <= 32;
  if (!NowFitsIn8 && !NowFitsIn32)
    return SDValue();

  // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
  SDLoc DL(N);
  SDValue NewMaskNode = DAG.getConstant(NewMask, DL, VT);
  SDValue ShiftedX = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
  return DAG.getNode(ISD::AND, DL, VT, ShiftedX, NewMaskNode);
}
/// Combine an X86ISD::PACKSS / X86ISD::PACKUS node. In order, this tries to:
///   1. Constant-fold the pack when both inputs are constants (each input
///      must be undef or used only by this node), applying signed (PACKSS)
///      or unsigned (PACKUS) saturation per element, 128-bit lane by lane.
///   2. With AVX512, turn a v16i8 pack of (truncate v8i32), undef into one
///      larger truncate when the truncated value already fits the
///      destination element.
///   3. Combine the pack as a target shuffle.
/// Returns an empty SDValue when none of these apply.
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget &Subtarget) {
  unsigned Opcode = N->getOpcode();
  assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
         "Unexpected pack opcode");

  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  // A pack halves the element width: sources are twice as wide as the result.
  unsigned DstBitsPerElt = VT.getScalarSizeInBits();
  unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
  assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
         N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
         "Unexpected PACKSS/PACKUS input type");

  bool IsSigned = (X86ISD::PACKSS == Opcode);

  // Constant Folding.
  APInt UndefElts0, UndefElts1;
  SmallVector<APInt, 32> EltBits0, EltBits1;
  if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
      (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
      getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
      getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
    unsigned NumLanes = VT.getSizeInBits() / 128;
    unsigned NumDstElts = VT.getVectorNumElements();
    unsigned NumSrcElts = NumDstElts / 2;
    unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
    unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;

    APInt Undefs(NumDstElts, 0);
    SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
    for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
      for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
        // Within each lane, the first half of the result comes from N0 and
        // the second half from N1.
        unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
        auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
        auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);

        if (UndefElts[SrcIdx]) {
          Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
          continue;
        }

        APInt &Val = EltBits[SrcIdx];
        if (IsSigned) {
          // PACKSS: Truncate signed value with signed saturation.
          // Source values less than dst minint are saturated to minint.
          // Source values greater than dst maxint are saturated to maxint.
          if (Val.isSignedIntN(DstBitsPerElt))
            Val = Val.trunc(DstBitsPerElt);
          else if (Val.isNegative())
            Val = APInt::getSignedMinValue(DstBitsPerElt);
          else
            Val = APInt::getSignedMaxValue(DstBitsPerElt);
        } else {
          // PACKUS: Truncate signed value with unsigned saturation.
          // Source values less than zero are saturated to zero.
          // Source values greater than dst maxuint are saturated to maxuint.
          if (Val.isIntN(DstBitsPerElt))
            Val = Val.trunc(DstBitsPerElt);
          else if (Val.isNegative())
            Val = APInt::getNullValue(DstBitsPerElt);
          else
            Val = APInt::getAllOnesValue(DstBitsPerElt);
        }
        Bits[Lane * NumDstEltsPerLane + Elt] = Val;
      }
    }

    return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, DL);
  }

  // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
  // truncate to create a larger truncate.
  if (Subtarget.hasAVX512() &&
      N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
      N0.getOperand(0).getValueType() == MVT::v8i32) {
    // The pack must not saturate: the i16 elements need more than 8 sign bits
    // (PACKSS) or a known-zero upper byte (PACKUS).
    if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
        (!IsSigned &&
         DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
      if (Subtarget.hasVLX())
        return DAG.getNode(X86ISD::VTRUNC, DL, VT, N0.getOperand(0));

      // Widen input to v16i32 so we can truncate that.
      SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i32,
                                   N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
      return DAG.getNode(ISD::TRUNCATE, DL, VT, Concat);
    }
  }

  // Attempt to combine as shuffle.
  SDValue Op(N, 0);
  if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
    return Res;

  return SDValue();
}
/// Combine an X86ISD::VSHL / VSRA / VSRL node (vector shift whose amount is
/// given by a vector operand):
///   - shifting an all-zeros build vector yields zero;
///   - a constant shift amount is turned into the shift-by-constant form;
///   - otherwise demanded-vector-elements simplification is run on the node.
static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const X86Subtarget &Subtarget) {
  const unsigned Opcode = N->getOpcode();
  assert((X86ISD::VSHL == Opcode || X86ISD::VSRA == Opcode ||
          X86ISD::VSRL == Opcode) &&
         "Unexpected shift opcode");

  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  SDValue Src = N->getOperand(0);
  SDValue Amt = N->getOperand(1);

  // Shift zero -> zero.
  if (ISD::isBuildVectorAllZeros(Src.getNode()))
    return DAG.getConstant(0, DL, VT);

  // Detect constant shift amounts; the amount is read from the first 64-bit
  // element of the amount operand.
  APInt AmtUndefElts;
  SmallVector<APInt, 32> AmtEltBits;
  if (getTargetConstantBitsFromNode(Amt, 64, AmtUndefElts, AmtEltBits, true,
                                    false)) {
    unsigned X86Opc = getTargetVShiftUniformOpcode(Opcode, false);
    uint64_t ShiftAmt = AmtEltBits[0].getZExtValue();
    return getTargetVShiftByConstNode(X86Opc, DL, VT.getSimpleVT(), Src,
                                      ShiftAmt, DAG);
  }

  // Otherwise see whether demanding every result element simplifies anything.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  APInt KnownUndef, KnownZero;
  APInt AllElts = APInt::getAllOnesValue(VT.getVectorNumElements());
  bool Simplified = TLI.SimplifyDemandedVectorElts(SDValue(N, 0), AllElts,
                                                   KnownUndef, KnownZero, DCI);
  return Simplified ? SDValue(N, 0) : SDValue();
}
/// Combine an X86ISD::VSHLI / VSRAI / VSRLI node (vector shift by an i8
/// immediate). Handles out-of-range and zero shift amounts, shifts of zero,
/// nested VSRAI, whole-byte logical shifts (as shuffles), constant folding,
/// and finally demanded-bits simplification.
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const X86Subtarget &Subtarget) {
  const unsigned Opcode = N->getOpcode();
  assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
          X86ISD::VSRLI == Opcode) &&
         "Unexpected shift opcode");
  const bool IsLogical =
      Opcode == X86ISD::VSHLI || Opcode == X86ISD::VSRLI;

  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  SDValue Src = N->getOperand(0);
  SDValue Amt = N->getOperand(1);
  unsigned BitsPerElt = VT.getScalarSizeInBits();
  assert(VT == Src.getValueType() && (BitsPerElt % 8) == 0 &&
         "Unexpected value type");
  assert(Amt.getValueType() == MVT::i8 && "Unexpected shift amount type");

  // Out of range logical bit shifts are guaranteed to be zero.
  // Out of range arithmetic bit shifts splat the sign bit.
  unsigned ShiftVal = cast<ConstantSDNode>(Amt)->getZExtValue();
  if (ShiftVal >= BitsPerElt) {
    if (IsLogical)
      return DAG.getConstant(0, DL, VT);
    ShiftVal = BitsPerElt - 1;
  }

  // Shift N0 by zero -> N0.
  if (ShiftVal == 0)
    return Src;

  // Shift zero -> zero.
  if (ISD::isBuildVectorAllZeros(Src.getNode()))
    return DAG.getConstant(0, DL, VT);

  // Fold (VSRAI (VSRAI X, C1), C2) --> (VSRAI X, (C1 + C2)) with (C1 + C2)
  // clamped to (BitsPerElt - 1).
  if (Opcode == X86ISD::VSRAI && Src.getOpcode() == X86ISD::VSRAI) {
    unsigned InnerShift =
        cast<ConstantSDNode>(Src.getOperand(1))->getZExtValue();
    unsigned Total = ShiftVal + InnerShift;
    unsigned Clamped = Total >= BitsPerElt ? BitsPerElt - 1 : Total;
    return DAG.getNode(X86ISD::VSRAI, DL, VT, Src.getOperand(0),
                       DAG.getConstant(Clamped, DL, MVT::i8));
  }

  // We can decode 'whole byte' logical bit shifts as shuffles.
  if (IsLogical && (ShiftVal % 8) == 0) {
    if (SDValue Res = combineX86ShufflesRecursively(SDValue(N, 0), DAG,
                                                    Subtarget))
      return Res;
  }

  // Constant Folding: apply the shift to each constant element directly.
  APInt UndefElts;
  SmallVector<APInt, 32> FoldedElts;
  if (N->isOnlyUserOf(Src.getNode()) &&
      getTargetConstantBitsFromNode(Src, BitsPerElt, UndefElts, FoldedElts)) {
    assert(FoldedElts.size() == VT.getVectorNumElements() &&
           "Unexpected shift value type");
    for (APInt &Elt : FoldedElts) {
      switch (Opcode) {
      case X86ISD::VSHLI:
        Elt <<= ShiftVal;
        break;
      case X86ISD::VSRAI:
        Elt.ashrInPlace(ShiftVal);
        break;
      default:
        Elt.lshrInPlace(ShiftVal);
        break;
      }
    }
    return getConstVector(FoldedElts, UndefElts, VT.getSimpleVT(), DAG, DL);
  }

  // Last, run demanded-bits simplification with every element bit demanded.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (TLI.SimplifyDemandedBits(SDValue(N, 0),
                               APInt::getAllOnesValue(BitsPerElt), DCI))
    return SDValue(N, 0);

  return SDValue();
}
/// Combine an X86ISD::PINSRB (v16i8) or X86ISD::PINSRW (v8i16) node: first
/// run demanded-bits simplification with all element bits demanded, then try
/// to fold the insertion into a shuffle.
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) ||
          (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16)) &&
         "Unexpected vector insertion");

  SDValue Self(N, 0);
  APInt AllEltBits = APInt::getAllOnesValue(VT.getScalarSizeInBits());
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (TLI.SimplifyDemandedBits(Self, AllEltBits, DCI))
    return Self;

  // Attempt to combine PINSRB/PINSRW patterns to a shuffle; an empty result
  // is passed straight through to the caller.
  return combineX86ShufflesRecursively(Self, DAG, Subtarget);
}
/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
/// OR -> CMPNEQSS.
///
/// The rewrite is only done for f32/f64 compares, when no user of \p N wants
/// flags, and when the two condition codes are the (E, NP) or (NE, P) pair.
/// Returns the replacement value, or an empty SDValue otherwise.
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
// Filled in by isAndOrOfSetCCs; not read afterwards in this function.
unsigned opcode;
// SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
// we're requiring SSE2 for both.
if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// Operand 1 of each setcc is the compare it tests; operand 0 (read further
// down) is its condition code.
SDValue CMP0 = N0.getOperand(1);
SDValue CMP1 = N1.getOperand(1);
SDLoc DL(N);
// The SETCCs should both refer to the same CMP.
if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
return SDValue();
SDValue CMP00 = CMP0->getOperand(0);
SDValue CMP01 = CMP0->getOperand(1);
EVT VT = CMP00.getValueType();
if (VT == MVT::f32 || VT == MVT::f64) {
bool ExpectingFlags = false;
// Check for any users that want flags:
// Anything other than a register copy or an integer extension (including
// opcodes not listed, via the default label) counts as wanting flags.
for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
!ExpectingFlags && UI != UE; ++UI)
switch (UI->getOpcode()) {
default:
case ISD::BR_CC:
case ISD::BRCOND:
case ISD::SELECT:
ExpectingFlags = true;
break;
case ISD::CopyToReg:
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
break;
}
if (!ExpectingFlags) {
enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
// Canonicalize so that the E/NE condition, if any, ends up in cc0.
if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
X86::CondCode tmp = cc0;
cc0 = cc1;
cc1 = tmp;
}
if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
(cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
// FIXME: need symbolic constants for these magic numbers.
// See X86ATTInstPrinter.cpp:printSSECC().
// 0 is used for the COND_E form and 4 for the COND_NE form.
unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
if (Subtarget.hasAVX512()) {
// AVX512: compare into a v1i1 mask, then widen it to an integer.
SDValue FSetCC =
DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
DAG.getConstant(x86cc, DL, MVT::i8));
// Need to fill with zeros to ensure the bitcast will produce zeroes
// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
DAG.getConstant(0, DL, MVT::v16i1),
FSetCC, DAG.getIntPtrConstant(0, DL));
return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
N->getSimpleValueType(0));
}
// Otherwise: FSETCC produces its result in the FP type of the compare
// operands; per the comment below it is all ones or all zeroes.
SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
CMP00.getValueType(), CMP00, CMP01,
DAG.getConstant(x86cc, DL,
MVT::i8));
bool is64BitFP = (CMP00.getValueType() == MVT::f64);
MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
if (is64BitFP && !Subtarget.is64Bit()) {
// On a 32-bit target, we cannot bitcast the 64-bit float to a
// 64-bit integer, since that's not a legal type. Since
// OnesOrZeroesF is all ones or all zeroes, we don't need all the
// bits, but can do this little dance to extract the lowest 32 bits
// and work with those going forward.
SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
OnesOrZeroesF);
SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
Vector32, DAG.getIntPtrConstant(0, DL));
IntVT = MVT::i32;
}
// Reinterpret the FP mask as an integer, keep only bit 0, and narrow the
// result to i8.
SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
DAG.getConstant(1, DL, IntVT));
SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
ANDed);
return OneBitOfTruth;
}
}
}
}
return SDValue();
}
// If V (looked at through bitcasts) is a bitwise NOT, return the value being
// inverted; otherwise return an empty SDValue. Recognized forms:
//   (xor X, -1)                          -> X
//   extract_subvector(xor X, -1)         -> extract_subvector(X)
//   concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y)
// The extract/concat forms build new nodes in DAG.
static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
  V = peekThroughBitcasts(V);
  const unsigned Opc = V.getOpcode();

  // Direct XOR with an all-ones build vector.
  if (Opc == ISD::XOR && ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
    return V.getOperand(0);

  // NOT hidden below a subvector extraction. Only looked through when the
  // extract index is zero or the source vector has a single use.
  if (Opc == ISD::EXTRACT_SUBVECTOR) {
    SDValue Src = V.getOperand(0);
    SDValue Idx = V.getOperand(1);
    if (isNullConstant(Idx) || Src.hasOneUse()) {
      if (SDValue Not = IsNOT(Src, DAG)) {
        Not = DAG.getBitcast(Src.getValueType(), Not);
        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not),
                           V.getValueType(), Not, Idx);
      }
    }
  }

  // Concatenation in which every piece is itself a NOT.
  SmallVector<SDValue, 2> Pieces;
  if (!collectConcatOps(V.getNode(), Pieces))
    return SDValue();
  for (SDValue &Piece : Pieces) {
    SDValue Inner = IsNOT(Piece, DAG);
    if (!Inner)
      return SDValue();
    Piece = DAG.getBitcast(Piece.getValueType(), Inner);
  }
  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), Pieces);
}
/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
/// Either operand of the AND may be the inverted one; the left operand is
/// tried first. Only 128/256/512-bit vector types are handled.
static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == ISD::AND);

  MVT VT = N->getSimpleValueType(0);
  const bool IsWideVector =
      VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector();
  if (!IsWideVector)
    return SDValue();

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // Find which side is the NOT; the other side becomes the plain operand.
  SDValue Inverted = IsNOT(LHS, DAG);
  SDValue Plain = RHS;
  if (!Inverted) {
    Inverted = IsNOT(RHS, DAG);
    Plain = LHS;
  }
  if (!Inverted)
    return SDValue();

  // IsNOT may hand back a value of a different type, so cast both operands.
  SDValue X = DAG.getBitcast(VT, Inverted);
  SDValue Y = DAG.getBitcast(VT, Plain);
  return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
}
// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
// register. In most cases we actually compare or select YMM-sized registers
// and mixing the two types creates horrible code. This method optimizes
// some of the transition sequences.
// Even with AVX-512 this is still useful for removing casts around logical
// operations on vXi1 mask types.
//
// N is an ANY/ZERO/SIGN_EXTEND of a narrow XOR/AND/OR whose left operand is a
// truncate from the wide type and whose right operand is either such a
// truncate or a constant build vector. The logic op is redone in the wide type
// and the extension is re-expressed on the wide result.
static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  assert(VT.isVector() && "Expected vector type");
  assert((N->getOpcode() == ISD::ANY_EXTEND ||
          N->getOpcode() == ISD::ZERO_EXTEND ||
          N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");

  SDValue Narrow = N->getOperand(0);
  EVT NarrowVT = Narrow.getValueType();

  const unsigned LogicOpc = Narrow->getOpcode();
  switch (LogicOpc) {
  case ISD::XOR:
  case ISD::AND:
  case ISD::OR:
    break;
  default:
    return SDValue();
  }

  SDValue LHS = Narrow->getOperand(0);
  SDValue RHS = Narrow->getOperand(1);
  SDLoc DL(Narrow);

  // The left side has to be a trunc from the wide result type.
  if (LHS.getOpcode() != ISD::TRUNCATE)
    return SDValue();
  if (LHS.getOperand(0).getValueType() != VT)
    return SDValue();

  // The right side has to be a 'trunc' or a constant vector.
  const bool RHSIsTrunc = RHS.getOpcode() == ISD::TRUNCATE &&
                          RHS.getOperand(0).getValueType() == VT;
  if (!RHSIsTrunc && !ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
    return SDValue();

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isOperationLegalOrPromote(LogicOpc, VT))
    return SDValue();

  // Inputs of the new wide operation: strip the truncates, or widen the
  // constant vector.
  SDValue WideLHS = LHS.getOperand(0);
  SDValue WideRHS = RHSIsTrunc ? RHS.getOperand(0)
                               : DAG.getNode(ISD::ZERO_EXTEND, DL, VT, RHS);

  // Generate the wide operation.
  SDValue Op = DAG.getNode(LogicOpc, DL, VT, WideLHS, WideRHS);

  // Reproduce the original extension on the wide value.
  switch (N->getOpcode()) {
  default: llvm_unreachable("Unexpected opcode");
  case ISD::ANY_EXTEND:
    return Op;
  case ISD::ZERO_EXTEND:
    return DAG.getZeroExtendInReg(Op, DL, NarrowVT.getScalarType());
  case ISD::SIGN_EXTEND:
    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op,
                       DAG.getValueType(NarrowVT));
  }
}
/// If both input operands of a logic op are being cast from floating point
/// types, try to convert this into a floating point logic node to avoid
/// unnecessary moves from SSE to integer registers.
///
/// Both bitcast sources must have the same type, and that type must be f32
/// (with SSE1) or f64 (with SSE2). N must be an AND, OR or XOR once those
/// checks have passed.
static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue Cast0 = N->getOperand(0);
  SDValue Cast1 = N->getOperand(1);
  SDLoc DL(N);

  const bool BothBitcasts =
      Cast0.getOpcode() == ISD::BITCAST && Cast1.getOpcode() == ISD::BITCAST;
  if (!BothBitcasts)
    return SDValue();

  SDValue Src0 = Cast0.getOperand(0);
  SDValue Src1 = Cast1.getOperand(0);
  EVT SrcVT = Src0.getValueType();

  // Ensure that both types are the same and are legal scalar fp types.
  if (SrcVT != Src1.getValueType())
    return SDValue();
  const bool LegalF32 = Subtarget.hasSSE1() && SrcVT == MVT::f32;
  const bool LegalF64 = Subtarget.hasSSE2() && SrcVT == MVT::f64;
  if (!LegalF32 && !LegalF64)
    return SDValue();

  // Map the integer logic opcode to its X86 FP counterpart.
  unsigned FPOpcode;
  switch (N->getOpcode()) {
  default: llvm_unreachable("Unexpected input node for FP logic conversion");
  case ISD::AND: FPOpcode = X86ISD::FAND; break;
  case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
  case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
  }

  return DAG.getBitcast(VT, DAG.getNode(FPOpcode, DL, SrcVT, Src0, Src1));
}
/// If this is a zero/all-bits result that is bitwise-anded with a low-bits
/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
/// with a shift-right to eliminate loading the vector constant mask value.
static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  SDValue Val = peekThroughBitcasts(N->getOperand(0));
  SDValue MaskOp = peekThroughBitcasts(N->getOperand(1));

  EVT ValVT = Val.getValueType();
  if (ValVT != MaskOp.getValueType() || !ValVT.isSimple() ||
      !ValVT.isInteger())
    return SDValue();

  // The second operand must be a constant splat of a low-bits mask.
  APInt SplatVal;
  if (!ISD::isConstantSplatVector(MaskOp.getNode(), SplatVal))
    return SDValue();
  if (!SplatVal.isMask())
    return SDValue();

  // Don't prevent creation of ANDN.
  if (isBitwiseNot(Val))
    return SDValue();

  if (!SupportedVectorShiftWithImm(ValVT.getSimpleVT(), Subtarget, ISD::SRL))
    return SDValue();

  // Every element of Val must consist purely of sign bits, i.e. be either
  // zero or all-ones.
  const unsigned EltBits = ValVT.getScalarSizeInBits();
  if (DAG.ComputeNumSignBits(Val) != EltBits)
    return SDValue();

  // Shifting right by (width - mask length) leaves exactly the masked bits.
  SDLoc DL(N);
  const unsigned MaskLen = SplatVal.countTrailingOnes();
  SDValue Amt = DAG.getConstant(EltBits - MaskLen, DL, MVT::i8);
  SDValue Shifted = DAG.getNode(X86ISD::VSRLI, DL, ValVT, Val, Amt);
  return DAG.getBitcast(N->getValueType(0), Shifted);
}
// Get the index node from the lowered DAG of a GEP IR instruction with one
// indexing dimension.
// The expected address shape is (add (shl Index, ...), ...); Index is
// returned, or an empty SDValue if the load is indexed or the address does not
// have that shape.
static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
  if (Ld->isIndexed())
    return SDValue();

  SDValue Addr = Ld->getBasePtr();
  if (Addr.getOpcode() == ISD::ADD) {
    SDValue Scaled = Addr.getOperand(0);
    if (Scaled.getOpcode() == ISD::SHL)
      return Scaled.getOperand(0);
  }
  return SDValue();
}
// Return true if BZHI can be used for scalar integer type VT: the subtarget
// must have BMI2, and VT must be 32 bits wide, or 64 bits wide on a 64-bit
// subtarget.
static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
  if (!Subtarget.hasBMI2() || !VT.isScalarInteger())
    return false;

  const auto Bits = VT.getSizeInBits();
  if (Bits == 32)
    return true;
  if (Bits == 64)
    return Subtarget.is64Bit();
  return false;
}
// This function recognizes cases where X86 bzhi instruction can replace an
// 'and-load' sequence.
// In case of loading integer value from an array of constants which is defined
// as follows:
//
// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
//
// then applying a bitwise and on the result with another input.
// It's equivalent to performing bzhi (zero high bits) on the input, with the
// same index of the load.
//
// Either operand of the AND may be the table load. Both are tried in turn, and
// an operand that does not fit the pattern never prevents the other one from
// being examined. Returns an empty SDValue if neither operand matches or the
// subtarget has no BZHI for the node's type.
static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  MVT VT = Node->getSimpleValueType(0);
  SDLoc dl(Node);

  // Check if subtarget has BZHI instruction for the node's type
  if (!hasBZHI(Subtarget, VT))
    return SDValue();

  // Try matching the pattern for both operands.
  for (unsigned i = 0; i < 2; i++) {
    SDValue N = Node->getOperand(i);
    LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
    // Skip this operand if it is not a load instruction; the other operand
    // may still be the table load.
    if (!Ld)
      continue;

    // The load must be traceable to an IR value...
    const Value *MemOp = Ld->getMemOperand()->getValue();
    if (!MemOp)
      continue;

    // ...which is a GEP into a constant global with a known initializer.
    const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp);
    if (!GEP)
      continue;
    GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0));
    if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer())
      continue;

    // The initializer must be an integer array whose element width equals the
    // node's width and which has no more elements than that width.
    Constant *Init = GV->getInitializer();
    Type *Ty = Init->getType();
    if (!isa<ConstantDataArray>(Init) ||
        !Ty->getArrayElementType()->isIntegerTy() ||
        Ty->getArrayElementType()->getScalarSizeInBits() !=
            VT.getSizeInBits() ||
        Ty->getArrayNumElements() >
            Ty->getArrayElementType()->getScalarSizeInBits())
      continue;

    // Check if the array's constant elements are suitable to our case:
    // element j must be the mask with the low j bits set.
    uint64_t ArrayElementCount = Ty->getArrayNumElements();
    bool ConstantsMatch = true;
    for (uint64_t j = 0; j < ArrayElementCount; j++) {
      ConstantInt *Elem = dyn_cast<ConstantInt>(Init->getAggregateElement(j));
      if (!Elem ||
          Elem->getZExtValue() != ((static_cast<uint64_t>(1) << j) - 1)) {
        ConstantsMatch = false;
        break;
      }
    }
    if (!ConstantsMatch)
      continue;

    // Get the Node which indexes into the array.
    SDValue Index = getIndexFromUnindexedLoad(Ld);
    if (!Index)
      continue;

    // Do the transformation (For 32-bit type):
    // -> (and (load arr[idx]), inp)
    // <- (and (srl 0xFFFFFFFF, (sub 32, idx)), inp)
    //    that will be replaced with one bzhi instruction.
    SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
    SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);

    Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
    SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
    Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);

    SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
    SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);

    return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
  }
  return SDValue();
}
// Look for (and (ctpop X), 1) which is the IR form of __builtin_parity.
// Turn it into series of XORs and a setnp.
// Nothing is done when CTPOP is already legal for a legal result type.
static SDValue combineParity(SDNode *N, SelectionDAG &DAG,
                             const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);

  // We only support 64-bit and 32-bit. 64-bit requires special handling
  // unless the 64-bit popcnt instruction is legal.
  const bool Is64 = VT == MVT::i64;
  if (!Is64 && VT != MVT::i32)
    return SDValue();

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (TLI.isTypeLegal(VT) && TLI.isOperationLegal(ISD::CTPOP, VT))
    return SDValue();

  // LHS needs to be a single use CTPOP.
  SDValue Popcnt = N->getOperand(0);
  if (Popcnt.getOpcode() != ISD::CTPOP || !Popcnt.hasOneUse())
    return SDValue();

  // RHS needs to be 1.
  if (!isOneConstant(N->getOperand(1)))
    return SDValue();

  SDLoc DL(N);
  SDValue X = Popcnt.getOperand(0);

  // If this is 64-bit, its always best to xor the two 32-bit pieces together
  // even if we have popcnt.
  if (Is64) {
    SDValue Upper = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
                                DAG.getNode(ISD::SRL, DL, VT, X,
                                            DAG.getConstant(32, DL, MVT::i8)));
    SDValue Lower = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
    X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lower, Upper);
    // Generate a 32-bit parity idiom. This will bring us back here if we need
    // to expand it too.
    SDValue Parity32 = DAG.getNode(ISD::AND, DL, MVT::i32,
                                   DAG.getNode(ISD::CTPOP, DL, MVT::i32, X),
                                   DAG.getConstant(1, DL, MVT::i32));
    return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Parity32);
  }
  assert(VT == MVT::i32 && "Unexpected VT!");

  // Xor the high and low 16-bits together using a 32-bit operation.
  SDValue Upper16 =
      DAG.getNode(ISD::SRL, DL, VT, X, DAG.getConstant(16, DL, MVT::i8));
  X = DAG.getNode(ISD::XOR, DL, VT, X, Upper16);

  // Finally xor the low 2 bytes together and use a 8-bit flag setting xor.
  // This should allow an h-reg to be used to save a shift.
  // FIXME: We only get an h-reg in 32-bit mode.
  SDValue Byte1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
                              DAG.getNode(ISD::SRL, DL, VT, X,
                                          DAG.getConstant(8, DL, MVT::i8)));
  SDValue Byte0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
  SDVTList XorVTs = DAG.getVTList(MVT::i8, MVT::i32);
  // Result #1 of the X86ISD::XOR node is the flags value.
  SDValue Flags = DAG.getNode(X86ISD::XOR, DL, XorVTs, Byte0, Byte1).getValue(1);

  // Copy the inverse of the parity flag into a register with setcc.
  SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);

  // Zero extend to original type.
  return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), Setnp);
}
/// Target DAG combine entry point for ISD::AND nodes. Tries a fixed sequence
/// of folds and returns the first replacement found, or an empty SDValue if
/// none applies. The folds after the isBeforeLegalizeOps() check only run once
/// operation legalization has started.
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);

  // If this is SSE1 only convert to FAND to avoid scalarization.
  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
    return DAG.getBitcast(
        MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
                                DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
                                DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
  }

  // Use a 32-bit and+zext if upper bits known zero.
  if (VT == MVT::i64 && Subtarget.is64Bit() &&
      !isa<ConstantSDNode>(N->getOperand(1))) {
    APInt Upper32 = APInt::getHighBitsSet(64, 32);
    if (DAG.MaskedValueIsZero(N->getOperand(1), Upper32) ||
        DAG.MaskedValueIsZero(N->getOperand(0), Upper32)) {
      SDLoc dl(N);
      SDValue Lo0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0));
      SDValue Lo1 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1));
      return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
                         DAG.getNode(ISD::AND, dl, MVT::i32, Lo0, Lo1));
    }
  }

  // This must be done before legalization has expanded the ctpop.
  if (SDValue Parity = combineParity(N, DAG, Subtarget))
    return Parity;

  // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
  // TODO: Support multiple SrcOps.
  if (VT == MVT::i1) {
    SmallVector<SDValue, 2> SrcOps;
    if (matchBitOpReduction(SDValue(N, 0), ISD::AND, SrcOps) &&
        SrcOps.size() == 1) {
      SDLoc dl(N);
      unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
      EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
      // The reduction is true exactly when every bit of the mask is set.
      if (SDValue Mask =
              combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget)) {
        APInt AllBits = APInt::getAllOnesValue(NumElts);
        return DAG.getSetCC(dl, MVT::i1, Mask,
                            DAG.getConstant(AllBits, dl, MaskVT), ISD::SETEQ);
      }
    }
  }

  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue Res = combineCompareEqual(N, DAG, DCI, Subtarget))
    return Res;

  if (SDValue Res = convertIntLogicToFPLogic(N, DAG, Subtarget))
    return Res;

  if (SDValue Res = combineANDXORWithAllOnesIntoANDNP(N, DAG))
    return Res;

  if (SDValue Res = combineAndMaskToShift(N, DAG, Subtarget))
    return Res;

  if (SDValue Res = combineAndLoadToBZHI(N, DAG, Subtarget))
    return Res;

  const bool ByteSizedElts = (VT.getScalarSizeInBits() % 8) == 0;

  // Attempt to recursively combine a bitmask AND with shuffles.
  if (VT.isVector() && ByteSizedElts) {
    SDValue Op(N, 0);
    if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
      return Res;
  }

  // Attempt to combine a scalar bitmask AND with an extracted shuffle.
  SDValue Extract = N->getOperand(0);
  if (!ByteSizedElts || Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      !isa<ConstantSDNode>(Extract.getOperand(1)))
    return SDValue();

  SDValue BitMask = N->getOperand(1);
  SDValue SrcVec = Extract.getOperand(0);
  EVT SrcVecVT = SrcVec.getValueType();

  // Check that the constant bitmask masks whole bytes.
  APInt UndefElts;
  SmallVector<APInt, 64> EltBits;
  const bool MasksWholeBytes =
      VT == SrcVecVT.getScalarType() &&
      Extract->isOnlyUserOf(SrcVec.getNode()) &&
      getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
      llvm::all_of(EltBits, [](const APInt &M) {
        return M.isNullValue() || M.isAllOnesValue();
      });
  if (!MasksWholeBytes)
    return SDValue();

  unsigned NumElts = SrcVecVT.getVectorNumElements();
  unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
  unsigned Idx = Extract.getConstantOperandVal(1);

  // Create a root shuffle mask from the byte mask and the extracted index.
  // Only the bytes of the extracted element are defined: each is either kept
  // in place or zeroed.
  SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
  for (unsigned Byte = 0; Byte != Scale; ++Byte) {
    if (UndefElts[Byte])
      continue;
    int VecIdx = Scale * Idx + Byte;
    ShuffleMask[VecIdx] =
        EltBits[Byte].isNullValue() ? SM_SentinelZero : VecIdx;
  }

  if (SDValue Shuffle = combineX86ShufflesRecursively(
          {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 2,
          /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
                       Extract.getOperand(1));

  return SDValue();
}
// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
// C and ~C must be constants that are exact bitwise complements of each other
// byte for byte, with no undef bytes.
static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");

  EVT VT = N->getValueType(0);
  if (!VT.isVector())
    return SDValue();
  if ((VT.getScalarSizeInBits() % 8) != 0)
    return SDValue();

  SDValue And0 = peekThroughBitcasts(N->getOperand(0));
  SDValue And1 = peekThroughBitcasts(N->getOperand(1));
  if (And0.getOpcode() != ISD::AND || And1.getOpcode() != ISD::AND)
    return SDValue();

  // On XOP we'll lower to PCMOV so accept one use, otherwise only
  // do this if either mask has multiple uses already.
  if (!Subtarget.hasXOP() && And0.getOperand(1).hasOneUse() &&
      And1.getOperand(1).hasOneUse())
    return SDValue();

  // Attempt to extract constant byte masks.
  APInt Undefs0, Undefs1;
  SmallVector<APInt, 32> Bytes0, Bytes1;
  if (!getTargetConstantBitsFromNode(And0.getOperand(1), 8, Undefs0, Bytes0,
                                     false, false) ||
      !getTargetConstantBitsFromNode(And1.getOperand(1), 8, Undefs1, Bytes1,
                                     false, false))
    return SDValue();

  const unsigned NumBytes = Bytes0.size();
  for (unsigned Byte = 0; Byte != NumBytes; ++Byte) {
    // TODO - add UNDEF elts support.
    if (Undefs0[Byte] || Undefs1[Byte])
      return SDValue();
    if (Bytes0[Byte] != ~Bytes1[Byte])
      return SDValue();
  }

  // Keep the first AND untouched and express the second one as an ANDNP of
  // the first mask.
  SDLoc DL(N);
  SDValue MaskC = DAG.getBitcast(VT, And0.getOperand(1));
  SDValue Other = DAG.getBitcast(VT, And1.getOperand(0));
  SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, VT, MaskC, Other);
  return DAG.getNode(ISD::OR, DL, VT, N->getOperand(0), AndN);
}
// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
// Concretely, N must be an OR of an AND and an X86ISD::ANDNP (in either order)
// that share the same mask operand. On success Mask is the ANDNP's first
// operand, X its second operand and Y the non-mask operand of the AND.
// Mask and X may already have been overwritten when false is returned.
static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
  if (N->getOpcode() != ISD::OR)
    return false;

  SDValue AndOp = N->getOperand(0);
  SDValue AndnOp = N->getOperand(1);

  // Canonicalize AND to LHS.
  if (AndnOp.getOpcode() == ISD::AND)
    std::swap(AndOp, AndnOp);

  // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
  if (AndOp.getOpcode() != ISD::AND || AndnOp.getOpcode() != X86ISD::ANDNP)
    return false;

  Mask = AndnOp.getOperand(0);
  X = AndnOp.getOperand(1);

  // Check to see if the mask appeared in both the AND and ANDNP.
  const bool MaskIsFirst = AndOp.getOperand(0) == Mask;
  if (!MaskIsFirst && AndOp.getOperand(1) != Mask)
    return false;
  Y = AndOp.getOperand(MaskIsFirst ? 1 : 0);

  // TODO: Attempt to match against AND(XOR(-1,M),Y) as well, waiting for
  // ANDNP combine allows other combines to happen that prevent matching.
  return true;
}
// Try to match:
//   (or (and (M, (sub 0, X)), (pandn M, X)))
// which is a special case of vselect:
//   (vselect M, (sub 0, X), X)
// Per:
// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
// We know that, if fNegate is 0 or 1:
//   (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
//
// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
//   ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
//   (      M ? -X : X) == ((X ^     M   ) + (M & 1))
// This lets us transform our vselect to:
//   (add (xor X, M), (and M, 1))
// And further to:
//   (sub (xor X, M), M)
//
// X and Y are the two blended values; one must be the negation (sub 0, V) of
// the other. The result is bitcast to VT. Returns an empty SDValue when the
// operand types differ from the mask type, SUB is not legal for the mask type,
// or neither operand is the negation of the other.
static SDValue combineLogicBlendIntoConditionalNegate(
    EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
    SelectionDAG &DAG, const X86Subtarget &Subtarget) {
  EVT MaskVT = Mask.getValueType();
  assert(MaskVT.isInteger() &&
         DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
         "Mask must be zero/all-bits");

  if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
    return SDValue();
  if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
    return SDValue();

  // True when Node computes (sub zero-vector, Val).
  auto IsNegationOf = [](SDNode *Node, SDValue Val) {
    return Node->getOpcode() == ISD::SUB && Node->getOperand(1) == Val &&
           ISD::isBuildVectorAllZeros(Node->getOperand(0).getNode());
  };

  // V is the value that is conditionally negated.
  SDValue V;
  if (IsNegationOf(Y.getNode(), X))
    V = X;
  else if (IsNegationOf(X.getNode(), Y))
    V = Y;
  else
    return SDValue();

  SDValue Xored = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);

  // If the negate was on the false side of the select, then
  // the operands of the SUB need to be swapped. PR 27251.
  // This is because the pattern being matched above is
  // (vselect M, (sub (0, X), X)  -> (sub (xor X, M), M)
  // but if the pattern matched was
  // (vselect M, X, (sub (0, X))), that is really negation of the pattern
  // above, -(vselect M, (sub 0, X), X), and therefore the replacement
  // pattern also needs to be a negation of the replacement pattern above.
  // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
  // sub accomplishes the negation of the replacement pattern.
  SDValue Minuend = Xored;
  SDValue Subtrahend = Mask;
  if (V == Y) {
    Minuend = Mask;
    Subtrahend = Xored;
  }

  SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, Minuend, Subtrahend);
  return DAG.getBitcast(VT, Res);
}
// Try to fold:
//   (or (and (m, y), (pandn m, x)))
// into a byte-wise select that picks y where m is set and x elsewhere:
//   (vselect m, y, x)
// As a special case, try to fold:
//   (or (and (m, (sub 0, x)), (pandn m, x)))
// into:
//   (sub (xor X, M), M)
// The mask must be an integer vector whose elements are all-zeros or
// all-ones. The plain select form additionally requires SSE4.1.
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
                                            const X86Subtarget &Subtarget) {
  assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");

  EVT VT = N->getValueType(0);
  const bool Ok128 = VT.is128BitVector() && Subtarget.hasSSE2();
  const bool Ok256 = VT.is256BitVector() && Subtarget.hasInt256();
  if (!Ok128 && !Ok256)
    return SDValue();

  SDValue X, Y, Mask;
  if (!matchLogicBlend(N, X, Y, Mask))
    return SDValue();

  // Look through any bitcasts on the mask and on both blended values.
  Mask = peekThroughBitcasts(Mask);
  X = peekThroughBitcasts(X);
  Y = peekThroughBitcasts(Y);

  EVT MaskVT = Mask.getValueType();
  unsigned EltBits = MaskVT.getScalarSizeInBits();

  // TODO: Attempt to handle floating point cases as well?
  if (!MaskVT.isInteger())
    return SDValue();
  if (DAG.ComputeNumSignBits(Mask) != EltBits)
    return SDValue();

  SDLoc DL(N);

  // Attempt to combine to conditional negate: (sub (xor X, M), M)
  if (SDValue Negate = combineLogicBlendIntoConditionalNegate(
          VT, Mask, X, Y, DL, DAG, Subtarget))
    return Negate;

  // PBLENDVB is only available on SSE 4.1.
  if (!Subtarget.hasSSE41())
    return SDValue();

  // Perform the select on bytes, then cast back to the original type.
  MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
  X = DAG.getBitcast(BlendVT, X);
  Y = DAG.getBitcast(BlendVT, Y);
  Mask = DAG.getBitcast(BlendVT, Mask);
  SDValue Blend = DAG.getSelect(DL, BlendVT, Mask, Y, X);
  return DAG.getBitcast(VT, Blend);
}
// Helper function for combineOrCmpEqZeroToCtlzSrl
// Transforms:
//   seteq(cmp x, 0)
// into:
//   srl(ctlz x), log2(bitsize(x))
// and zero-extends or truncates the result to ExtTy.
// Input pattern is checked by caller.
static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
                                          SelectionDAG &DAG) {
  SDLoc dl(Op);
  SDValue Compared = Op.getOperand(1).getOperand(0);
  EVT CmpVT = Compared.getValueType();

  SDValue LeadingZeros = DAG.getNode(ISD::CTLZ, dl, CmpVT, Compared);

  // The result of the shift is true or false, and on X86, the 32-bit
  // encoding of shr and lzcnt is more desirable.
  SDValue Count32 = DAG.getZExtOrTrunc(LeadingZeros, dl, MVT::i32);
  unsigned ShiftAmt = Log2_32(CmpVT.getSizeInBits());
  SDValue IsZero = DAG.getNode(ISD::SRL, dl, MVT::i32, Count32,
                               DAG.getConstant(ShiftAmt, dl, MVT::i8));
  return DAG.getZExtOrTrunc(IsZero, dl, ExtTy);
}
// Try to transform:
//   zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
// into:
//   srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
// Will also attempt to match more generic cases, eg:
//   zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
// Only applies if the target supports the FastLZCNT feature.
//
// N is the zero-extend node. Every OR and SETCC in the chain must have a
// single use.
static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
                                           TargetLowering::DAGCombinerInfo &DCI,
                                           const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
    return SDValue();

  auto IsSingleUseOr = [](SDValue V) {
    return V->getOpcode() == ISD::OR && V->hasOneUse();
  };

  // Check the zero extend is extending to 32-bit or more. The code generated by
  // srl(ctlz) for 16-bit or less variants of the pattern would require extra
  // instructions to clear the upper bits.
  if (!N->hasOneUse())
    return SDValue();
  if (!N->getSimpleValueType(0).bitsGE(MVT::i32))
    return SDValue();
  if (!IsSingleUseOr(N->getOperand(0)))
    return SDValue();

  // Check the node matches: setcc(eq, cmp 0)
  // NOTE(review): the final bitsGE test looks at the value type of the CMP
  // node itself rather than the type of the compared operand - confirm that
  // this is intended.
  auto IsCmpEqZero = [](SDValue V) {
    return V->getOpcode() == X86ISD::SETCC && V->hasOneUse() &&
           X86::CondCode(V->getConstantOperandVal(0)) == X86::COND_E &&
           V->getOperand(1).getOpcode() == X86ISD::CMP &&
           isNullConstant(V->getOperand(1).getOperand(1)) &&
           V->getOperand(1).getValueType().bitsGE(MVT::i32);
  };

  SDNode *OR = N->getOperand(0).getNode();
  SDValue LHS = OR->getOperand(0);
  SDValue RHS = OR->getOperand(1);

  // Walk down the chain, saving nodes matching or(or, setcc(eq, cmp 0)).
  SmallVector<SDNode *, 2> OuterOrs;
  while ((IsSingleUseOr(LHS) && IsCmpEqZero(RHS)) ||
         (IsSingleUseOr(RHS) && IsCmpEqZero(LHS))) {
    OuterOrs.push_back(OR);
    OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
    LHS = OR->getOperand(0);
    RHS = OR->getOperand(1);
  }

  // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
  if (!IsCmpEqZero(LHS) || !IsCmpEqZero(RHS) || !IsSingleUseOr(SDValue(OR, 0)))
    return SDValue();

  // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
  // to
  // or(srl(ctlz),srl(ctlz)).
  // The dag combiner can then fold it into:
  // srl(or(ctlz, ctlz)).
  EVT VT = OR->getValueType(0);
  SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
  if (!NewLHS)
    return SDValue();
  SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
  if (!NewRHS)
    return SDValue();
  SDValue Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
  if (!Ret)
    return SDValue();

  // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern,
  // working back outwards from the innermost saved OR.
  while (!OuterOrs.empty()) {
    OR = OuterOrs.pop_back_val();
    LHS = OR->getOperand(0);
    RHS = OR->getOperand(1);
    // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
    if (RHS->getOpcode() == ISD::OR)
      std::swap(LHS, RHS);
    NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
    if (!NewRHS)
      return SDValue();
    Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
  }

  if (!Ret)
    return Ret;
  return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
}
/// Target DAG combine entry point for ISD::OR nodes. Tries a fixed sequence of
/// folds and returns the first replacement found, or an empty SDValue if none
/// applies. The last and largest part matches an OR of opposite shifts whose
/// amounts are complementary and turns it into a funnel shift (FSHL/FSHR).
static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
// If this is SSE1 only convert to FOR to avoid scalarization.
if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
return DAG.getBitcast(MVT::v4i32,
DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
DAG.getBitcast(MVT::v4f32, N0),
DAG.getBitcast(MVT::v4f32, N1)));
}
// Everything below only runs once operation legalization has started.
if (DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
return R;
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
return FPLogic;
if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
return R;
if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
return R;
// Attempt to recursively combine an OR of shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
}
// The funnel-shift fold below only handles scalar i16/i32/i64.
if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
return SDValue();
// fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
unsigned Bits = VT.getScalarSizeInBits();
// SHLD/SHRD instructions have lower register pressure, but on some
// platforms they have higher latency than the equivalent
// series of shifts/or that would otherwise be generated.
// Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
// have higher latencies and we are not optimizing for size.
if (!OptForSize && Subtarget.isSHLDSlow())
return SDValue();
// Canonicalize so that N0 is the SHL and N1 is the SRL.
if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
std::swap(N0, N1);
if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
return SDValue();
// Both shifts are replaced, so neither may have another user.
if (!N0.hasOneUse() || !N1.hasOneUse())
return SDValue();
// Only i8 shift amounts are handled.
SDValue ShAmt0 = N0.getOperand(1);
if (ShAmt0.getValueType() != MVT::i8)
return SDValue();
SDValue ShAmt1 = N1.getOperand(1);
if (ShAmt1.getValueType() != MVT::i8)
return SDValue();
// Peek through any modulo shift masks.
// ShMsk0/ShMsk1 remember the (and Amt, Bits-1) node that was stripped, if any.
SDValue ShMsk0;
if (ShAmt0.getOpcode() == ISD::AND &&
isa<ConstantSDNode>(ShAmt0.getOperand(1)) &&
ShAmt0.getConstantOperandAPInt(1) == (Bits - 1)) {
ShMsk0 = ShAmt0;
ShAmt0 = ShAmt0.getOperand(0);
}
SDValue ShMsk1;
if (ShAmt1.getOpcode() == ISD::AND &&
isa<ConstantSDNode>(ShAmt1.getOperand(1)) &&
ShAmt1.getConstantOperandAPInt(1) == (Bits - 1)) {
ShMsk1 = ShAmt1;
ShAmt1 = ShAmt1.getOperand(0);
}
// Also look through a truncate of either shift amount.
if (ShAmt0.getOpcode() == ISD::TRUNCATE)
ShAmt0 = ShAmt0.getOperand(0);
if (ShAmt1.getOpcode() == ISD::TRUNCATE)
ShAmt1 = ShAmt1.getOperand(0);
SDLoc DL(N);
unsigned Opc = ISD::FSHL;
SDValue Op0 = N0.getOperand(0);
SDValue Op1 = N1.getOperand(0);
// If the SHL amount is the SUB/XOR (complementary) form, switch to FSHR and
// swap the roles of the two sides so that ShAmt1 always holds the
// complementary amount in the matching code below.
if (ShAmt0.getOpcode() == ISD::SUB || ShAmt0.getOpcode() == ISD::XOR) {
Opc = ISD::FSHR;
std::swap(Op0, Op1);
std::swap(ShAmt0, ShAmt1);
std::swap(ShMsk0, ShMsk1);
}
// Build the funnel shift node. Opc is captured by value here, after the
// possible switch to FSHR above. For FSHR the operands are swapped back, and
// the amount is always truncated to i8.
auto GetFunnelShift = [&DAG, &DL, VT, Opc](SDValue Op0, SDValue Op1,
SDValue Amt) {
if (Opc == ISD::FSHR)
std::swap(Op0, Op1);
return DAG.getNode(Opc, DL, VT, Op0, Op1,
DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Amt));
};
// OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> FSHL( X, Y, C )
// OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> FSHR( Y, X, C )
// OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHL( X, Y, C )
// OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHR( Y, X, C )
// OR( SHL( X, AND( C, 31 ) ), SRL( Y, AND( 0 - C, 31 ) ) ) -> FSHL( X, Y, C )
// OR( SRL( X, AND( C, 31 ) ), SHL( Y, AND( 0 - C, 31 ) ) ) -> FSHR( Y, X, C )
if (ShAmt1.getOpcode() == ISD::SUB) {
// Complementary amount of the form (Const - Amt).
SDValue Sum = ShAmt1.getOperand(0);
if (auto *SumC = dyn_cast<ConstantSDNode>(Sum)) {
SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
// Strip a modulo mask and/or truncate from the subtracted amount.
if (ShAmt1Op1.getOpcode() == ISD::AND &&
isa<ConstantSDNode>(ShAmt1Op1.getOperand(1)) &&
ShAmt1Op1.getConstantOperandAPInt(1) == (Bits - 1)) {
ShMsk1 = ShAmt1Op1;
ShAmt1Op1 = ShAmt1Op1.getOperand(0);
}
if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
ShAmt1Op1 = ShAmt1Op1.getOperand(0);
// Accept (Bits - Amt), or (0 - Amt) when a modulo mask was seen on this
// side, provided Amt is the other shift's amount.
if ((SumC->getAPIntValue() == Bits ||
(SumC->getAPIntValue() == 0 && ShMsk1)) &&
ShAmt1Op1 == ShAmt0)
return GetFunnelShift(Op0, Op1, ShAmt0);
}
} else if (auto *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
// Two constant amounts that add up to the bit width.
auto *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
return GetFunnelShift(Op0, Op1, ShAmt0);
} else if (ShAmt1.getOpcode() == ISD::XOR) {
// Complementary amount of the form (Amt ^ (Bits - 1)), paired with an extra
// shift by one of the other value (see the XOR patterns listed above).
SDValue Mask = ShAmt1.getOperand(1);
if (auto *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
unsigned InnerShift = (ISD::FSHL == Opc ? ISD::SRL : ISD::SHL);
SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
ShAmt1Op0 = ShAmt1Op0.getOperand(0);
if (MaskC->getSExtValue() == (Bits - 1) &&
(ShAmt1Op0 == ShAmt0 || ShAmt1Op0 == ShMsk0)) {
if (Op1.getOpcode() == InnerShift &&
isa<ConstantSDNode>(Op1.getOperand(1)) &&
Op1.getConstantOperandAPInt(1) == 1) {
return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0);
}
// Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
Op1.getOperand(0) == Op1.getOperand(1)) {
return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0);
}
}
}
}
return SDValue();
}
/// Rewrite a scalar sign-bit test of the shape
///   XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
/// as the comparison
///   SETGT(X, -1)
/// Returns an empty SDValue when \p N does not have that shape.
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
  // The transform is only considered worthwhile for i8 and i1 results.
  EVT ResVT = N->getValueType(0);
  if (!(ResVT == MVT::i8 || ResVT == MVT::i1))
    return SDValue();

  // The xor has to flip a single-use truncate against the constant one.
  SDValue Trunc = N->getOperand(0);
  SDValue FlipBit = N->getOperand(1);
  if (Trunc.getOpcode() != ISD::TRUNCATE || !Trunc.hasOneUse() ||
      !isOneConstant(FlipBit))
    return SDValue();

  // SetCC on x86 zero extends, so the truncated value must come from a
  // logical right shift (with no other users).
  SDValue Srl = Trunc.getOperand(0);
  if (Srl.getOpcode() != ISD::SRL || !Srl.hasOneUse())
    return SDValue();

  // The value being shifted must be i16, i32 or i64 ...
  EVT SrlVT = Srl.getValueType();
  bool IsSupportedWidth =
      SrlVT == MVT::i16 || SrlVT == MVT::i32 || SrlVT == MVT::i64;
  if (!IsSupportedWidth)
    return SDValue();

  // ... and the shift amount must be the constant that isolates the sign bit.
  if (!isa<ConstantSDNode>(Srl.getOperand(1)))
    return SDValue();
  if (Srl.getConstantOperandAPInt(1) != (SrlVT.getSizeInBits() - 1))
    return SDValue();

  // Emit "X > -1". SETGE against 0 would also work, but SETGT is the
  // canonical looking form and matches up with what TranslateX86CC expects.
  SDLoc DL(N);
  SDValue X = Srl.getOperand(0);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT CCVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
                                    ResVT);
  SDValue MinusOne = DAG.getConstant(-1, DL, X.getValueType());
  SDValue Cmp = DAG.getSetCC(DL, CCVT, X, MinusOne, ISD::SETGT);

  // Widen the comparison result when the setcc type differs from the xor's.
  return CCVT == ResVT ? Cmp
                       : DAG.getNode(ISD::ZERO_EXTEND, DL, ResVT, Cmp);
}
/// Rewrite a vector sign-bit test of the shape
///   xor (sra X, elt_size(X)-1), -1
/// as
///   pcmpgt X, -1
///
/// Call this before type legalization because the pattern may not persist
/// after that. Returns an empty SDValue when nothing was matched.
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  if (!VT.isSimple())
    return SDValue();

  // Decide, per vector type, whether the subtarget feature this fold is
  // gated on is available.
  bool HasCompare = false;
  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::v16i8:
  case MVT::v8i16:
  case MVT::v4i32:
    HasCompare = Subtarget.hasSSE2();
    break;
  case MVT::v2i64:
    HasCompare = Subtarget.hasSSE42();
    break;
  case MVT::v32i8:
  case MVT::v16i16:
  case MVT::v8i32:
  case MVT::v4i64:
    HasCompare = Subtarget.hasAVX2();
    break;
  default:
    break;
  }
  if (!HasCompare)
    return SDValue();

  // Operand 0 must be a single-use arithmetic shift right and operand 1 must
  // be all-ones, i.e. the xor is really a 'not'.
  SDValue Sra = N->getOperand(0);
  SDValue AllOnes = N->getOperand(1);
  if (Sra.getOpcode() != ISD::SRA)
    return SDValue();
  if (!Sra.hasOneUse() || !ISD::isBuildVectorAllOnes(AllOnes.getNode()))
    return SDValue();

  // The shift has to smear the sign bit across every element.
  auto *Amt = isConstOrConstSplat(Sra.getOperand(1), /*AllowUndefs*/ true);
  if (!Amt)
    return SDValue();
  if (Amt->getAPIntValue() != (Sra.getScalarValueSizeInBits() - 1))
    return SDValue();

  // Compare greater-than against -1 rather than greater-or-equal against 0:
  // SSE/AVX don't have the latter instruction.
  return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Sra.getOperand(0), AllOnes);
}
/// Check if truncation with saturation from type \p SrcVT to \p DstVT
/// is valid for the given \p Subtarget.
static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
                                        const X86Subtarget &Subtarget) {
  // FIXME: Scalar type may be supported if we move it to vector register.
  if (!Subtarget.hasAVX512() || !SrcVT.isVector())
    return false;

  // Only i8, i16 and i32 destination elements are accepted.
  EVT SrcEltVT = SrcVT.getScalarType();
  EVT DstEltVT = DstVT.getScalarType();
  bool DstEltOk =
      DstEltVT == MVT::i8 || DstEltVT == MVT::i16 || DstEltVT == MVT::i32;
  if (!DstEltOk)
    return false;

  // Sources that are not 512 bits wide additionally require VLX.
  if (!SrcVT.is512BitVector() && !Subtarget.hasVLX())
    return false;

  // Source elements narrower than 32 bits additionally require BWI.
  return SrcEltVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
}
/// Recognize the input of a truncate that saturates as unsigned.
///
/// Form 1: (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type)
///   -> returns x.
///
/// Form 2: (truncate (smin (smax (x, C1), C2)) to dest_type)
///   with C1 >= 0 and C2 the unsigned max of the destination type, or
///         (truncate (smax (smin (x, C2), C1)) to dest_type)
///   with C1 >= 0, C2 the unsigned max of the destination type and C1 <= C2.
///   Both are equivalent to
///         (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)))
///   -> returns the smax(x, C1) value (a new SMAX node is built for the
///      second spelling).
///
/// Returns an empty SDValue when none of the forms match. \p VT is the
/// destination type of the truncate.
static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
                                 const SDLoc &DL) {
  EVT InVT = In.getValueType();

  // Saturation with truncation: the source must be wider than the result.
  assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
         "Unexpected types for truncate operation");

  const unsigned DstBits = VT.getScalarSizeInBits();

  // If V is `Opcode(X, splat-constant)`, store the splat value in Limit and
  // return X; otherwise return an empty SDValue.
  auto PeelMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
    if (V.getOpcode() != Opcode)
      return SDValue();
    if (!ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
      return SDValue();
    return V.getOperand(0);
  };

  APInt LowerBound, UpperBound;

  // umin(x, C2): C2 must be UINT32_MAX / UINT16_MAX / UINT8_MAX according to
  // the element size of the destination type.
  if (SDValue X = PeelMinMax(In, ISD::UMIN, UpperBound)) {
    if (UpperBound.isMask(DstBits))
      return X;
  }

  // smin(smax(x, C1), C2): hand back the inner smax.
  if (SDValue Inner = PeelMinMax(In, ISD::SMIN, UpperBound)) {
    if (PeelMinMax(Inner, ISD::SMAX, LowerBound)) {
      if (LowerBound.isNonNegative() && UpperBound.isMask(DstBits))
        return Inner;
    }
  }

  // smax(smin(x, C2), C1): rebuild it as smax(x, C1).
  if (SDValue Inner = PeelMinMax(In, ISD::SMAX, LowerBound)) {
    if (SDValue X = PeelMinMax(Inner, ISD::SMIN, UpperBound)) {
      if (LowerBound.isNonNegative() && UpperBound.isMask(DstBits) &&
          UpperBound.uge(LowerBound)) {
        return DAG.getNode(ISD::SMAX, DL, InVT, X, In.getOperand(1));
      }
    }
  }

  return SDValue();
}
/// Recognize the input of a truncate that saturates as signed:
///   (truncate (smin ((smax (x, signed_min_of_dest_type)),
///                    signed_max_of_dest_type)) to dest_type)
/// or, with the clamps nested the other way round:
///   (truncate (smax ((smin (x, signed_max_of_dest_type)),
///                    signed_min_of_dest_type)) to dest_type).
/// When \p MatchPackUS is set the clamp range matched is
/// [0, unsigned_max_of_dest_type] instead.
/// Returns the value being clamped, or an empty SDValue if nothing matched.
static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
  unsigned NumDstBits = VT.getScalarSizeInBits();
  unsigned NumSrcBits = In.getScalarValueSizeInBits();
  assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");

  // Clamp bounds, expressed at the width of the source elements.
  APInt UpperBound =
      MatchPackUS ? APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits)
                  : APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
  APInt LowerBound =
      MatchPackUS ? APInt(NumSrcBits, 0)
                  : APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);

  // If V is `Opcode(X, splat(Bound))` return X, else an empty SDValue.
  auto StripClamp = [](SDValue V, unsigned Opcode,
                       const APInt &Bound) -> SDValue {
    APInt Splat;
    bool Matches =
        V.getOpcode() == Opcode &&
        ISD::isConstantSplatVector(V.getOperand(1).getNode(), Splat) &&
        Splat == Bound;
    return Matches ? V.getOperand(0) : SDValue();
  };

  // smin(smax(x, lo), hi)
  if (SDValue Inner = StripClamp(In, ISD::SMIN, UpperBound)) {
    if (SDValue X = StripClamp(Inner, ISD::SMAX, LowerBound))
      return X;
  }

  // smax(smin(x, hi), lo)
  if (SDValue Inner = StripClamp(In, ISD::SMAX, LowerBound)) {
    if (SDValue X = StripClamp(Inner, ISD::SMIN, UpperBound))
      return X;
  }

  return SDValue();
}
/// Detect a truncation-with-signed-saturation pattern whose types allow the
/// use of a VPMOVSS* instruction on AVX512.
/// Returns the source value to be truncated, or an empty SDValue if the input
/// type is not legal, the type pair is not valid for the subtarget, or the
/// pattern was not matched.
static SDValue detectAVX512SSatPattern(SDValue In, EVT VT,
                                       const X86Subtarget &Subtarget,
                                       const TargetLowering &TLI) {
  EVT InVT = In.getValueType();
  bool Usable = TLI.isTypeLegal(InVT) &&
                isSATValidOnAVX512Subtarget(InVT, VT, Subtarget);
  return Usable ? detectSSatPattern(In, VT) : SDValue();
}
/// Detect a truncation-with-unsigned-saturation pattern:
///   (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type)
/// whose types allow the use of a VPMOVUS* instruction on AVX512.
/// Returns the source value to be truncated, or an empty SDValue if the input
/// type is not legal, the type pair is not valid for the subtarget, or the
/// pattern was not matched.
static SDValue detectAVX512USatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
                                       const SDLoc &DL,
                                       const X86Subtarget &Subtarget,
                                       const TargetLowering &TLI) {
  EVT InVT = In.getValueType();
  bool Usable = TLI.isTypeLegal(InVT) &&
                isSATValidOnAVX512Subtarget(InVT, VT, Subtarget);
  return Usable ? detectUSatPattern(In, VT, DAG, DL) : SDValue();
}
/// Try to turn a truncate of \p In to \p VT whose input is a saturating clamp
/// into a saturating truncate node (X86ISD::VTRUNCS / X86ISD::VTRUNCUS) or,
/// without AVX512, into PACKSS / PACKUS based truncation.
/// Returns an empty SDValue if no saturation pattern applies.
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
                                      SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT InVT = In.getValueType();
  EVT DstEltVT = VT.getScalarType();
  EVT SrcEltVT = InVT.getScalarType();

  // AVX512 saturating truncates; both types have to be legal.
  if (TLI.isTypeLegal(InVT) && TLI.isTypeLegal(VT) &&
      isSATValidOnAVX512Subtarget(InVT, VT, Subtarget)) {
    if (auto Signed = detectSSatPattern(In, VT))
      return DAG.getNode(X86ISD::VTRUNCS, DL, VT, Signed);
    if (auto Unsigned = detectUSatPattern(In, VT, DAG, DL))
      return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Unsigned);
  }

  // PACK based lowering: power-of-2 element count, no AVX512, i16/i32
  // elements truncated to i8/i16.
  bool PackCandidate =
      VT.isVector() && isPowerOf2_32(VT.getVectorNumElements()) &&
      !Subtarget.hasAVX512() &&
      (DstEltVT == MVT::i8 || DstEltVT == MVT::i16) &&
      (SrcEltVT == MVT::i16 || SrcEltVT == MVT::i32);
  if (!PackCandidate)
    return SDValue();

  // Clamp to [0, unsigned max of destination element].
  if (auto Unsigned = detectSSatPattern(In, VT, true)) {
    if (DstEltVT == MVT::i8 && SrcEltVT == MVT::i32) {
      // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
      EVT MidVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
                                   VT.getVectorNumElements());
      SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, Unsigned, DL,
                                           DAG, Subtarget);
      if (Mid)
        return truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
                                      Subtarget);
    } else if (DstEltVT == MVT::i8 || Subtarget.hasSSE41()) {
      // A single PACKUS step; i16 destinations are gated on SSE4.1.
      return truncateVectorWithPACK(X86ISD::PACKUS, VT, Unsigned, DL, DAG,
                                    Subtarget);
    }
  }

  // Clamp to the signed range of the destination element.
  if (auto Signed = detectSSatPattern(In, VT))
    return truncateVectorWithPACK(X86ISD::PACKSS, VT, Signed, DL, DAG,
                                  Subtarget);

  return SDValue();
}
/// This function detects the AVG pattern between vectors of unsigned i8/i16,
/// which is c = (a + b + 1) / 2, and replace this operation with the efficient
/// X86ISD::AVG instruction.
///
/// \p In is the value being truncated and \p VT is the truncated (result)
/// type. Returns the replacement value built through SplitOpsAndApply, or an
/// empty SDValue when the pattern is not recognized.
static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
const SDLoc &DL) {
if (!VT.isVector())
return SDValue();
EVT InVT = In.getValueType();
unsigned NumElems = VT.getVectorNumElements();
EVT ScalarVT = VT.getVectorElementType();
// Only i8/i16 element vectors with a power-of-2 element count (at least 2)
// are handled.
if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
NumElems >= 2 && isPowerOf2_32(NumElems)))
return SDValue();
// InScalarVT is the intermediate type in AVG pattern and it should be greater
// than the original input type (i8/i16).
EVT InScalarVT = InVT.getVectorElementType();
if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
return SDValue();
if (!Subtarget.hasSSE2())
return SDValue();
// Detect the following pattern:
//
// %1 = zext <N x i8> %a to <N x i32>
// %2 = zext <N x i8> %b to <N x i32>
// %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
// %4 = add nuw nsw <N x i32> %3, %2
// %5 = lshr <N x i32> %4, <i32 1 x N>
// %6 = trunc <N x i32> %5 to <N x i8>
//
// In AVX512, the last instruction can also be a trunc store.
if (In.getOpcode() != ISD::SRL)
return SDValue();
// A lambda checking the given SDValue is a constant vector and each element
// is in the range [Min, Max]. Any operand that is not a ConstantSDNode
// makes the check fail.
auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
if (!BV || !BV->isConstant())
return false;
for (SDValue Op : V->ops()) {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
if (!C)
return false;
const APInt &Val = C->getAPIntValue();
if (Val.ult(Min) || Val.ugt(Max))
return false;
}
return true;
};
// Check if each element of the vector is logically right-shifted by one
// (In is an SRL and its shift amount must be a constant vector of ones).
auto LHS = In.getOperand(0);
auto RHS = In.getOperand(1);
if (!IsConstVectorInRange(RHS, 1, 1))
return SDValue();
if (LHS.getOpcode() != ISD::ADD)
return SDValue();
// Detect a pattern of a + b + 1 where the order doesn't matter.
SDValue Operands[3];
Operands[0] = LHS.getOperand(0);
Operands[1] = LHS.getOperand(1);
// Callback handed to SplitOpsAndApply: emits one X86ISD::AVG node for the
// operands it is given.
auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
};
// Take care of the case when one of the operands is a constant vector whose
// elements are in the range [1, 256] (i8 result) or [1, 65536] (i16 result),
// and the other operand is a zero-extension from VT.
if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
Operands[0].getOperand(0).getValueType() == VT) {
// The pattern is detected. Subtract one from the constant vector, then
// demote it and emit X86ISD::AVG instruction.
SDValue VecOnes = DAG.getConstant(1, DL, InVT);
Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
return SplitOpsAndApply(DAG, Subtarget, DL, VT,
{ Operands[0].getOperand(0), Operands[1] },
AVGBuilder);
}
// Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)).
// Match the or case only if its 'add-like' - can be replaced by an add.
// On success the two addends are returned through Op0 and Op1.
auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
if (ISD::ADD == V.getOpcode()) {
Op0 = V.getOperand(0);
Op1 = V.getOperand(1);
return true;
}
if (ISD::ZERO_EXTEND != V.getOpcode())
return false;
V = V.getOperand(0);
// The OR must already have type VT and its operands must share no set bits.
if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
!DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
return false;
Op0 = V.getOperand(0);
Op1 = V.getOperand(1);
return true;
};
SDValue Op0, Op1;
// Arrange for Operands[0] to hold the operand that is NOT the inner
// add-like node; the inner node's two addends then fill Operands[2] and
// Operands[1].
if (FindAddLike(Operands[0], Op0, Op1))
std::swap(Operands[0], Operands[1]);
else if (!FindAddLike(Operands[1], Op0, Op1))
return SDValue();
Operands[2] = Op0;
Operands[1] = Op1;
// Now we have three operands of two additions. Check that one of them is a
// constant vector with ones, and the other two can be promoted from i8/i16.
for (int i = 0; i < 3; ++i) {
if (!IsConstVectorInRange(Operands[i], 1, 1))
continue;
// Move the vector of ones into slot 2 so slots 0 and 1 hold a and b.
std::swap(Operands[i], Operands[2]);
// Check if Operands[0] and Operands[1] are results of type promotion.
// Operands that already have type VT are used as they are.
for (int j = 0; j < 2; ++j)
if (Operands[j].getValueType() != VT) {
if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
Operands[j].getOperand(0).getValueType() != VT)
return SDValue();
Operands[j] = Operands[j].getOperand(0);
}
// The pattern is detected, emit X86ISD::AVG instruction(s).
return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Operands[0], Operands[1]},
AVGBuilder);
}
return SDValue();
}
/// DAG combine for load nodes. Two rewrites are attempted:
///  - split a non-extending 256-bit vector load into two half-width loads
///    (after op legalization has started) when it is an aligned non-temporal
///    load on a target without Int256, or when the access is allowed but not
///    reported as fast;
///  - before legalization, and without AVX512, load a vXi1 vector as an
///    integer of the same bit count and bitcast it back.
/// Returns the result of DCI.CombineTo, or an empty SDValue if nothing
/// applied.
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  LoadSDNode *Ld = cast<LoadSDNode>(N);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDLoc dl(Ld);
  EVT RegVT = Ld->getValueType(0);
  EVT MemVT = Ld->getMemoryVT();
  ISD::LoadExtType Ext = Ld->getExtensionType();
  unsigned Alignment = Ld->getAlignment();
  const bool IsPlainLoad = Ext == ISD::NON_EXTLOAD;

  // For chips with slow 32-byte unaligned loads, break the 32-byte operation
  // into two 16-byte operations. Also split non-temporal aligned loads on
  // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
  if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() && IsPlainLoad) {
    bool ShouldSplit =
        Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16;
    if (!ShouldSplit) {
      bool Fast;
      ShouldSplit =
          TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
                                 *Ld->getMemOperand(), &Fast) &&
          !Fast;
    }

    if (ShouldSplit) {
      unsigned NumElems = RegVT.getVectorNumElements();
      if (NumElems < 2)
        return SDValue();

      // Byte offset of the upper half; also bounds its alignment.
      const unsigned HalfBytes = 16;
      SDValue LoPtr = Ld->getBasePtr();
      SDValue HiPtr = DAG.getMemBasePlusOffset(LoPtr, HalfBytes, dl);
      EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
                                    NumElems / 2);

      SDValue LoLoad =
          DAG.getLoad(HalfVT, dl, Ld->getChain(), LoPtr, Ld->getPointerInfo(),
                      Alignment, Ld->getMemOperand()->getFlags());
      SDValue HiLoad =
          DAG.getLoad(HalfVT, dl, Ld->getChain(), HiPtr,
                      Ld->getPointerInfo().getWithOffset(HalfBytes),
                      MinAlign(Alignment, HalfBytes),
                      Ld->getMemOperand()->getFlags());

      // Join the two output chains and glue the halves back together.
      SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                                  LoLoad.getValue(1), HiLoad.getValue(1));
      SDValue Joined =
          DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, LoLoad, HiLoad);
      return DCI.CombineTo(N, Joined, Chain, true);
    }
  }

  // Bool vector load - attempt to cast to an integer, as we have good
  // (vXiY *ext(vXi1 bitcast(iX))) handling.
  if (!IsPlainLoad || Subtarget.hasAVX512() || !RegVT.isVector() ||
      RegVT.getScalarType() != MVT::i1 || !DCI.isBeforeLegalize())
    return SDValue();

  unsigned NumElts = RegVT.getVectorNumElements();
  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
  if (!TLI.isTypeLegal(IntVT))
    return SDValue();

  SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
                                Ld->getPointerInfo(), Alignment,
                                Ld->getMemOperand()->getFlags());
  SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
  return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
}
/// If \p V is a build vector of i1 constants in which exactly one element is
/// true, return the operand index of that element; otherwise return -1.
/// Undef elements are skipped; any other non-constant element yields -1.
static int getOneTrueElt(SDValue V) {
  // This needs to be a build vector of booleans.
  // TODO: Checking for the i1 type matches the IR definition for the mask,
  // but the mask check could be loosened to i8 or other types. That might
  // also require checking more than 'allOnesValue'; eg, the x86 HW
  // instructions only require that the MSB is set for each mask element.
  // The ISD::MSTORE comments/definition do not specify how the mask operand
  // is formatted.
  auto *BV = dyn_cast<BuildVectorSDNode>(V);
  if (!BV)
    return -1;
  EVT MaskVT = BV->getValueType(0);
  if (MaskVT.getVectorElementType() != MVT::i1)
    return -1;

  int Found = -1;
  for (unsigned Idx = 0, End = MaskVT.getVectorNumElements(); Idx != End;
       ++Idx) {
    SDValue Elt = BV->getOperand(Idx);
    if (Elt.isUndef())
      continue;

    auto *C = dyn_cast<ConstantSDNode>(Elt);
    if (!C)
      return -1;
    if (!C->getAPIntValue().isAllOnesValue())
      continue;

    // A second true element means the mask does not qualify.
    if (Found >= 0)
      return -1;
    Found = Idx;
  }
  return Found;
}
/// Given a masked memory load/store operation, return true if its mask has
/// exactly one bit set. In that case also produce:
///   \p Addr      - the memory address of the scalar element to load/store,
///   \p Index     - the vector index to insert/extract that scalar element,
///   \p Alignment - the alignment for the scalar memory access.
/// The output parameters are left untouched when false is returned.
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
                                         SelectionDAG &DAG, SDValue &Addr,
                                         SDValue &Index, unsigned &Alignment) {
  const int Lane = getOneTrueElt(MaskedOp->getMask());
  if (Lane < 0)
    return false;

  SDLoc DL(MaskedOp);
  EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
  unsigned EltBytes = EltVT.getStoreSize();

  // Address of the selected element: the base pointer, advanced by the
  // element's byte offset unless it is element zero.
  Addr = MaskedOp->getBasePtr();
  if (Lane != 0) {
    unsigned ByteOffset = Lane * EltBytes;
    Addr = DAG.getMemBasePlusOffset(Addr, ByteOffset, DL);
  }

  Index = DAG.getIntPtrConstant(Lane, DL);
  Alignment = MinAlign(MaskedOp->getAlignment(), EltBytes);
  return true;
}
/// If exactly one element of the mask is set for a non-extending masked load,
/// it is a scalar load and vector insert.
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
/// mask have already been optimized in IR, so we don't bother with those here.
///
/// Returns the result of DCI.CombineTo (the masked load replaced by the
/// insert plus the scalar load's chain), or an empty SDValue if the mask does
/// not have exactly one true element.
static SDValue
reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI) {
  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
  // However, some target hooks may need to be added to know when the transform
  // is profitable. Endianness would also have to be considered.
  SDValue Addr, VecIndex;
  unsigned Alignment;
  if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
    return SDValue();

  SDLoc DL(ML);
  EVT VT = ML->getValueType(0);
  EVT EltVT = VT.getVectorElementType();

  // Addr points at the selected element, not at the start of the vector, so
  // the pointer info attached to the scalar load has to be advanced by the
  // same number of bytes. VecIndex is created with getIntPtrConstant, so it is
  // a constant node holding the element index.
  uint64_t Offset =
      cast<ConstantSDNode>(VecIndex)->getZExtValue() *
      ML->getMemoryVT().getVectorElementType().getStoreSize();

  // Load the one scalar element that is specified by the mask using the
  // appropriate offset from the base pointer.
  SDValue Load =
      DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
                  ML->getPointerInfo().getWithOffset(Offset), Alignment,
                  ML->getMemOperand()->getFlags());

  // Insert the loaded element into the appropriate place in the vector.
  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
                               ML->getPassThru(), Load, VecIndex);
  return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
}
/// Simplify a masked load whose mask is a build vector of constants.
///  - If both the first and the last element are loaded, use a full vector
///    load followed by a select against the pass-through value.
///  - Otherwise, unless the pass-through is undef or all-zeros, re-issue the
///    masked load with an undef pass-through and select against the original
///    pass-through.
/// Returns the result of DCI.CombineTo, or an empty SDValue if nothing
/// applied.
static SDValue
combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
                              TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Mask = ML->getMask();
  if (!ISD::isBuildVectorOfConstantSDNodes(Mask.getNode()))
    return SDValue();

  SDLoc DL(ML);
  EVT VT = ML->getValueType(0);
  SDValue PassThru = ML->getPassThru();

  // If we are loading the first and last elements of a vector, it is safe and
  // always faster to load the whole vector. Replace the masked load with a
  // vector load and select.
  auto *MaskBV = cast<BuildVectorSDNode>(Mask);
  unsigned LastIdx = VT.getVectorNumElements() - 1;
  bool TouchesFirst = !isNullConstant(MaskBV->getOperand(0));
  bool TouchesLast = !isNullConstant(MaskBV->getOperand(LastIdx));
  if (TouchesFirst && TouchesLast) {
    SDValue FullLoad = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
                                   ML->getMemOperand());
    SDValue Select = DAG.getSelect(DL, VT, Mask, FullLoad, PassThru);
    return DCI.CombineTo(ML, Select, FullLoad.getValue(1), true);
  }

  // Convert a masked load with a constant mask into a masked load and a select.
  // This allows the select operation to use a faster kind of select instruction
  // (for example, vblendvps -> vblendps).
  // Don't try this if the pass-through operand is already undefined. That would
  // cause an infinite loop because that's what we're about to create.
  // An all-zeros pass-through is left alone as well.
  if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
    return SDValue();

  // The new masked load has an undef pass-through operand. The select uses the
  // original pass-through operand.
  SDValue UndefLoad = DAG.getMaskedLoad(VT, DL, ML->getChain(),
                                        ML->getBasePtr(), Mask,
                                        DAG.getUNDEF(VT), ML->getMemoryVT(),
                                        ML->getMemOperand(),
                                        ML->getExtensionType());
  SDValue Select = DAG.getSelect(DL, VT, Mask, UndefLoad, PassThru);
  return DCI.CombineTo(ML, Select, UndefLoad.getValue(1), true);
}
/// DAG combine for masked loads.
///
/// Expanding loads are left untouched. Non-extending loads are first offered
/// to reduceMaskedLoadToScalarLoad and, without AVX512, to
/// combineMaskedLoadConstantMask. Loads with extension type ISD::EXTLOAD are
/// then rewritten as a non-extending masked load of a vector that has the
/// memory element type and the same total bit width as the result, with
/// shuffles moving the pass-through, mask and loaded elements between the
/// narrow and wide layouts. Every other extension type yields an empty
/// SDValue.
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
// TODO: Expanding load with constant mask may be optimized as well.
if (Mld->isExpandingLoad())
return SDValue();
if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
return ScalarLoad;
// TODO: Do some AVX512 subsets benefit from this transform?
if (!Subtarget.hasAVX512())
if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
return Blend;
}
// Only any-extending loads are rewritten below.
if (Mld->getExtensionType() != ISD::EXTLOAD)
return SDValue();
// Resolve extending loads.
EVT VT = Mld->getValueType(0);
unsigned NumElems = VT.getVectorNumElements();
EVT LdVT = Mld->getMemoryVT();
SDLoc dl(Mld);
assert(LdVT != VT && "Cannot extend to the same type");
unsigned ToSz = VT.getScalarSizeInBits();
unsigned FromSz = LdVT.getScalarSizeInBits();
// From/To sizes and ElemCount must be pow of two.
assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
"Unexpected size for extending masked load");
// Number of memory-sized elements that fit in one result element.
unsigned SizeRatio = ToSz / FromSz;
assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
// Create a type on which we perform the shuffle.
EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
LdVT.getScalarType(), NumElems*SizeRatio);
assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
// Convert PassThru value.
SDValue WidePassThru = DAG.getBitcast(WideVecVT, Mld->getPassThru());
if (!Mld->getPassThru().isUndef()) {
// Gather narrow element i * SizeRatio of every original element into the
// first NumElems lanes; the remaining lanes stay undef (-1).
SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i] = i * SizeRatio;
// Can't shuffle using an illegal type.
assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
"WideVecVT should be legal");
WidePassThru = DAG.getVectorShuffle(WideVecVT, dl, WidePassThru,
DAG.getUNDEF(WideVecVT), ShuffleVec);
}
// Prepare the new mask.
SDValue NewMask;
SDValue Mask = Mld->getMask();
if (Mask.getValueType() == VT) {
// Mask and original value have the same type.
NewMask = DAG.getBitcast(WideVecVT, Mask);
SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i] = i * SizeRatio;
// Index NumElems * SizeRatio selects the first element of the second
// shuffle operand (the zero vector), so the extra lanes are masked off.
for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
ShuffleVec[i] = NumElems * SizeRatio;
NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
DAG.getConstant(0, dl, WideVecVT),
ShuffleVec);
} else {
// i1 mask: widen it by concatenating zero vectors after the original mask.
assert(Mask.getValueType().getVectorElementType() == MVT::i1);
unsigned WidenNumElts = NumElems*SizeRatio;
unsigned MaskNumElts = VT.getVectorNumElements();
EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
WidenNumElts);
unsigned NumConcat = WidenNumElts / MaskNumElts;
SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
Ops[0] = Mask;
NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
}
// Non-extending masked load of the wide vector; the memory type and memory
// operand are carried over from the original load.
SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
Mld->getBasePtr(), NewMask, WidePassThru,
Mld->getMemoryVT(), Mld->getMemOperand(),
ISD::NON_EXTLOAD);
SDValue SlicedVec = DAG.getBitcast(WideVecVT, WideLd);
// Spread loaded lane i to lane i * SizeRatio; the lanes in between are
// undef (-1).
SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i * SizeRatio] = i;
// Can't shuffle using an illegal type.
assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
"WideVecVT should be legal");
SlicedVec = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
DAG.getUNDEF(WideVecVT), ShuffleVec);
SlicedVec = DAG.getBitcast(VT, SlicedVec);
// Replace the original node's value and chain results.
return DCI.CombineTo(N, SlicedVec, WideLd.getValue(1), true);
}
/// If exactly one element of the mask is set for a non-truncating masked store,
/// it is a vector extract and scalar store.
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
/// mask have already been optimized in IR, so we don't bother with those here.
///
/// Returns the scalar store, or an empty SDValue if the mask does not have
/// exactly one true element.
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
                                              SelectionDAG &DAG) {
  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
  // However, some target hooks may need to be added to know when the transform
  // is profitable. Endianness would also have to be considered.
  SDValue Addr, VecIndex;
  unsigned Alignment;
  if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
    return SDValue();

  // Extract the one scalar element that is actually being stored.
  SDLoc DL(MS);
  EVT VT = MS->getValue().getValueType();
  EVT EltVT = VT.getVectorElementType();
  SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
                                MS->getValue(), VecIndex);

  // Addr points at the selected element, not at the start of the vector, so
  // the pointer info attached to the scalar store has to be advanced by the
  // same number of bytes. VecIndex is created with getIntPtrConstant, so it is
  // a constant node holding the element index.
  uint64_t Offset =
      cast<ConstantSDNode>(VecIndex)->getZExtValue() *
      MS->getMemoryVT().getVectorElementType().getStoreSize();

  // Store that element at the appropriate offset from the base pointer.
  return DAG.getStore(MS->getChain(), DL, Extract, Addr,
                      MS->getPointerInfo().getWithOffset(Offset), Alignment,
                      MS->getMemOperand()->getFlags());
}
/// DAG combine for masked stores.
///
/// Compressing stores are left untouched. For non-truncating stores this
/// tries, in order: reducing a one-element mask to a scalar store,
/// simplifying a non-i1 mask given that only its sign bits are demanded, and
/// folding a single-use TRUNCATE of the stored value into the store when the
/// corresponding truncating store is legal. Truncating stores that are not
/// legal are rewritten as a masked store of a shuffled vector that has the
/// memory element type and the same total bit width as the stored value.
/// Returns an empty SDValue when nothing changed.
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
if (Mst->isCompressingStore())
return SDValue();
EVT VT = Mst->getValue().getValueType();
EVT StVT = Mst->getMemoryVT();
SDLoc dl(Mst);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!Mst->isTruncatingStore()) {
if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
return ScalarStore;
// If the mask value has been legalized to a non-boolean vector, try to
// simplify ops leading up to it. We only demand the MSB of each lane.
SDValue Mask = Mst->getMask();
if (Mask.getScalarValueSizeInBits() != 1) {
// NOTE(review): the sign mask is sized from the stored value's element
// width, not the mask's - presumably the two always match here; confirm.
APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits()));
// Returning N itself reports that the node was updated in place.
if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
return SDValue(N, 0);
}
// TODO: AVX512 targets should also be able to simplify something like the
// pattern above, but that pattern will be different. It will either need to
// match setcc more generally or match PCMPGTM later (in tablegen?).
SDValue Value = Mst->getValue();
// Fold a single-use truncate of the stored value into the store itself.
// NOTE(review): the trailing 'true' presumably marks the new store as
// truncating - confirm against SelectionDAG::getMaskedStore.
if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
Mst->getMemoryVT())) {
return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
Mst->getBasePtr(), Mask,
Mst->getMemoryVT(), Mst->getMemOperand(), true);
}
return SDValue();
}
// Resolve truncating stores.
unsigned NumElems = VT.getVectorNumElements();
assert(StVT != VT && "Cannot truncate to the same type");
unsigned FromSz = VT.getScalarSizeInBits();
unsigned ToSz = StVT.getScalarSizeInBits();
// The truncating store is legal in some cases. For example
// vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
// are designated for truncate store.
// In this case we don't need any further transformations.
if (TLI.isTruncStoreLegal(VT, StVT))
return SDValue();
// From/To sizes and ElemCount must be pow of two.
assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
"Unexpected size for truncating masked store");
// We are going to use the original vector elt for storing.
// Accumulated smaller vector elements must be a multiple of the store size.
assert (((NumElems * FromSz) % ToSz) == 0 &&
"Unexpected ratio for truncating masked store");
// Number of memory-sized elements that fit in one stored-value element.
unsigned SizeRatio = FromSz / ToSz;
assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
// Create a type on which we perform the shuffle.
EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
StVT.getScalarType(), NumElems*SizeRatio);
assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
// Gather narrow element i * SizeRatio of every original element into the
// first NumElems lanes; the remaining lanes stay undef (-1).
SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i] = i * SizeRatio;
// Can't shuffle using an illegal type.
assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
"WideVecVT should be legal");
SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
DAG.getUNDEF(WideVecVT),
ShuffleVec);
// Build a mask that matches the layout of TruncatedVal.
SDValue NewMask;
SDValue Mask = Mst->getMask();
if (Mask.getValueType() == VT) {
// Mask and original value have the same type.
NewMask = DAG.getBitcast(WideVecVT, Mask);
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i] = i * SizeRatio;
// Index NumElems*SizeRatio selects the first element of the second shuffle
// operand (the zero vector), so the extra lanes are masked off.
for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
ShuffleVec[i] = NumElems*SizeRatio;
NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
DAG.getConstant(0, dl, WideVecVT),
ShuffleVec);
} else {
// i1 mask: widen it by concatenating zero vectors after the original mask.
assert(Mask.getValueType().getVectorElementType() == MVT::i1);
unsigned WidenNumElts = NumElems*SizeRatio;
unsigned MaskNumElts = VT.getVectorNumElements();
EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
WidenNumElts);
unsigned NumConcat = WidenNumElts / MaskNumElts;
SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
Ops[0] = Mask;
NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
}
// Emit the replacement masked store of the shuffled value with the widened
// mask; the trailing argument is 'false' here, unlike the fold above.
return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
Mst->getBasePtr(), NewMask, StVT,
Mst->getMemOperand(), false);
}
/// Target-specific DAG combines for a store node.
///
/// \p N must be a StoreSDNode. The transformations below are tried in order
/// and the first one that applies returns its replacement node; an empty
/// SDValue is returned when nothing applies:
///  - vXi1 stores are turned into integer stores, widened to v8i1, or (for
///    constant build vectors) folded to scalar integer stores.
///  - Slow 256-bit stores and under-aligned non-temporal vector stores are
///    split or scalarized.
///  - Truncating vector stores are matched against AVG / saturating-truncate
///    patterns, or lowered to a shuffle followed by wide scalar stores.
///  - 64-bit load->store pairs (integer vector types, or i64 on 32-bit
///    targets) are rewritten as GPR / f64 / 2 x i32 load-store pairs.
///  - An i64 store of an extracted vector element on a 32-bit target is
///    rewritten as an f64 store.
static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
                            TargetLowering::DAGCombinerInfo &DCI,
                            const X86Subtarget &Subtarget) {
  StoreSDNode *St = cast<StoreSDNode>(N);
  EVT VT = St->getValue().getValueType();
  EVT StVT = St->getMemoryVT();
  SDLoc dl(St);
  unsigned Alignment = St->getAlignment();
  SDValue StoredVal = St->getOperand(1);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // Convert a store of vXi1 into a store of iX and a bitcast.
  if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
      VT.getVectorElementType() == MVT::i1) {
    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
    StoredVal = DAG.getBitcast(NewVT, StoredVal);
    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
                        St->getPointerInfo(), St->getAlignment(),
                        St->getMemOperand()->getFlags());
  }

  // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
  // This will avoid a copy to k-register.
  if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
      StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
      StoredVal.getOperand(0).getValueType() == MVT::i8) {
    return DAG.getStore(St->getChain(), dl, StoredVal.getOperand(0),
                        St->getBasePtr(), St->getPointerInfo(),
                        St->getAlignment(), St->getMemOperand()->getFlags());
  }

  // Widen v2i1/v4i1 stores to v8i1.
  if ((VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
      Subtarget.hasAVX512()) {
    unsigned NumConcats = 8 / VT.getVectorNumElements();
    SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(VT));
    Ops[0] = StoredVal;
    StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
                        St->getPointerInfo(), St->getAlignment(),
                        St->getMemOperand()->getFlags());
  }

  // Turn vXi1 stores of constants into a scalar store.
  if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
       VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
      ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
    // If its a v64i1 store without 64-bit support, we need two stores.
    if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
      SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
                                      StoredVal->ops().slice(0, 32));
      Lo = combinevXi1ConstantToInteger(Lo, DAG);
      SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
                                      StoredVal->ops().slice(32, 32));
      Hi = combinevXi1ConstantToInteger(Hi, DAG);

      SDValue Ptr0 = St->getBasePtr();
      SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 4, dl);

      SDValue Ch0 =
          DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
                       Alignment, St->getMemOperand()->getFlags());
      SDValue Ch1 =
          DAG.getStore(St->getChain(), dl, Hi, Ptr1,
                       St->getPointerInfo().getWithOffset(4),
                       MinAlign(Alignment, 4U),
                       St->getMemOperand()->getFlags());
      return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
    }

    StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
                        St->getPointerInfo(), St->getAlignment(),
                        St->getMemOperand()->getFlags());
  }

  // If we are saving a concatenation of two XMM registers and 32-byte stores
  // are slow, such as on Sandy Bridge, perform two 16-byte stores.
  bool Fast;
  if (VT.is256BitVector() && StVT == VT &&
      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
                             *St->getMemOperand(), &Fast) &&
      !Fast) {
    unsigned NumElems = VT.getVectorNumElements();
    if (NumElems < 2)
      return SDValue();

    return splitVectorStore(St, DAG);
  }

  // Split under-aligned vector non-temporal stores.
  if (St->isNonTemporal() && StVT == VT && Alignment < VT.getStoreSize()) {
    // ZMM/YMM nt-stores - either it can be stored as a series of shorter
    // vectors or the legalizer can scalarize it to use MOVNTI.
    if (VT.is256BitVector() || VT.is512BitVector()) {
      unsigned NumElems = VT.getVectorNumElements();
      if (NumElems < 2)
        return SDValue();
      return splitVectorStore(St, DAG);
    }

    // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
    // to use MOVNTI.
    if (VT.is128BitVector() && Subtarget.hasSSE2()) {
      MVT NTVT = Subtarget.hasSSE4A()
                     ? MVT::v2f64
                     : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
      return scalarizeVectorStore(St, NTVT, DAG);
    }
  }

  // Try to optimize v16i16->v16i8 truncating stores when BWI is not
  // supported, but avx512f is by extending to v16i32 and truncating.
  if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
      St->getValue().getOpcode() == ISD::TRUNCATE &&
      St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
      TLI.isTruncStoreLegalOrCustom(MVT::v16i32, MVT::v16i8) &&
      !DCI.isBeforeLegalizeOps()) {
    SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue());
    return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
                             MVT::v16i8, St->getMemOperand());
  }

  // Optimize trunc store (of multiple scalars) to shuffle and store.
  // First, pack all of the elements in one place. Next, store to memory
  // in fewer chunks.
  if (St->isTruncatingStore() && VT.isVector()) {
    // Check if we can detect an AVG pattern from the truncation. If yes,
    // replace the trunc store by a normal store with the result of X86ISD::AVG
    // instruction.
    if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
                                       Subtarget, dl))
      return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
                          St->getPointerInfo(), St->getAlignment(),
                          St->getMemOperand()->getFlags());

    if (SDValue Val =
            detectAVX512SSatPattern(St->getValue(), St->getMemoryVT(), Subtarget,
                                    TLI))
      return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
                             dl, Val, St->getBasePtr(),
                             St->getMemoryVT(), St->getMemOperand(), DAG);
    if (SDValue Val = detectAVX512USatPattern(St->getValue(), St->getMemoryVT(),
                                              DAG, dl, Subtarget, TLI))
      return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
                             dl, Val, St->getBasePtr(),
                             St->getMemoryVT(), St->getMemOperand(), DAG);

    unsigned NumElems = VT.getVectorNumElements();
    assert(StVT != VT && "Cannot truncate to the same type");
    unsigned FromSz = VT.getScalarSizeInBits();
    unsigned ToSz = StVT.getScalarSizeInBits();

    // The truncating store is legal in some cases. For example
    // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
    // are designated for truncate store.
    // In this case we don't need any further transformations.
    if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
      return SDValue();

    // From, To sizes and ElemCount must be pow of two
    if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
    // We are going to use the original vector elt for storing.
    // Accumulated smaller vector elements must be a multiple of the store size.
    if (0 != (NumElems * FromSz) % ToSz) return SDValue();

    unsigned SizeRatio = FromSz / ToSz;

    assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());

    // Create a type on which we perform the shuffle
    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
            StVT.getScalarType(), NumElems*SizeRatio);

    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

    SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;

    // Can't shuffle using an illegal type.
    if (!TLI.isTypeLegal(WideVecVT))
      return SDValue();

    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
                                         DAG.getUNDEF(WideVecVT),
                                         ShuffleVec);
    // At this point all of the data is stored at the bottom of the
    // register. We now need to save it to mem.

    // Find the largest store unit
    MVT StoreType = MVT::i8;
    for (MVT Tp : MVT::integer_valuetypes()) {
      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
        StoreType = Tp;
    }

    // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
    if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
        (64 <= NumElems * ToSz))
      StoreType = MVT::f64;

    // Bitcast the original vector into a vector of store-size units
    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
            StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
    SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
    SmallVector<SDValue, 8> Chains;
    SDValue Ptr = St->getBasePtr();

    // Perform one or more big stores into memory. Each store after the first
    // lands StoreBytes further into the original location, so its pointer info
    // must carry that offset and its alignment can be no better than what the
    // base alignment and the offset jointly guarantee.
    unsigned StoreBytes = StoreType.getStoreSize();
    for (unsigned i = 0, e = (ToSz * NumElems) / StoreType.getSizeInBits();
         i != e; ++i) {
      unsigned Offset = i * StoreBytes;
      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                                   StoreType, ShuffWide,
                                   DAG.getIntPtrConstant(i, dl));
      SDValue Ch =
          DAG.getStore(St->getChain(), dl, SubVec, Ptr,
                       St->getPointerInfo().getWithOffset(Offset),
                       MinAlign(Alignment, Offset),
                       St->getMemOperand()->getFlags());
      Ptr = DAG.getMemBasePlusOffset(Ptr, StoreBytes, dl);
      Chains.push_back(Ch);
    }

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
  }

  // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
  // the FP state in cases where an emms may be missing.
  // A preferable solution to the general problem is to figure out the right
  // places to insert EMMS.  This qualifies as a quick hack.

  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
  if (VT.getSizeInBits() != 64)
    return SDValue();

  const Function &F = DAG.getMachineFunction().getFunction();
  bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
  bool F64IsLegal =
      !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
  if (((VT.isVector() && !VT.isFloatingPoint()) ||
       (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
      isa<LoadSDNode>(St->getValue()) &&
      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
      St->getChain().hasOneUse() && !St->isVolatile()) {
    LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());

    if (!ISD::isNormalLoad(Ld))
      return SDValue();

    // If this is not the MMX case, i.e. we are just turning i64 load/store
    // into f64 load/store, avoid the transformation if there are multiple
    // uses of the loaded value.
    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
      return SDValue();

    SDLoc LdDL(Ld);
    SDLoc StDL(N);
    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
    // pair instead.
    if (Subtarget.is64Bit() || F64IsLegal) {
      MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
                                  Ld->getMemOperand());

      // Make sure new load is placed in same chain order.
      DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
      return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
                          St->getMemOperand());
    }

    // Otherwise, lower to two pairs of 32-bit loads / stores.
    SDValue LoAddr = Ld->getBasePtr();
    SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);

    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
                               Ld->getPointerInfo(), Ld->getAlignment(),
                               Ld->getMemOperand()->getFlags());
    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
                               Ld->getPointerInfo().getWithOffset(4),
                               MinAlign(Ld->getAlignment(), 4),
                               Ld->getMemOperand()->getFlags());
    // Make sure new loads are placed in same chain order.
    DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
    DAG.makeEquivalentMemoryOrdering(Ld, HiLd);

    LoAddr = St->getBasePtr();
    HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);

    SDValue LoSt =
        DAG.getStore(St->getChain(), StDL, LoLd, LoAddr, St->getPointerInfo(),
                     St->getAlignment(), St->getMemOperand()->getFlags());
    SDValue HiSt = DAG.getStore(St->getChain(), StDL, HiLd, HiAddr,
                                St->getPointerInfo().getWithOffset(4),
                                MinAlign(St->getAlignment(), 4),
                                St->getMemOperand()->getFlags());
    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
  }

  // This is similar to the above case, but here we handle a scalar 64-bit
  // integer store that is extracted from a vector on a 32-bit target.
  // If we have SSE2, then we can treat it like a floating-point double
  // to get past legalization. The execution dependencies fixup pass will
  // choose the optimal machine instruction for the store if this really is
  // an integer or v2f32 rather than an f64.
  if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
      St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    SDValue OldExtract = St->getOperand(1);
    SDValue ExtOp0 = OldExtract.getOperand(0);
    unsigned VecSize = ExtOp0.getValueSizeInBits();
    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
    SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
    SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                                     BitCast, OldExtract.getOperand(1));
    return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
                        St->getPointerInfo(), St->getAlignment(),
                        St->getMemOperand()->getFlags());
  }

  return SDValue();
}
/// Return 'true' if this vector operation is "horizontal"
/// and return the operands for the horizontal operation in LHS and RHS.  A
/// horizontal operation performs the binary operation on successive elements
/// of its first operand, then on successive elements of its second operand,
/// returning the resulting values in a vector.  For example, if
///   A = < float a0, float a1, float a2, float a3 >
/// and
///   B = < float b0, float b1, float b2, float b3 >
/// then the result of doing a horizontal operation on A and B is
///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
/// A horizontal-op B, for some already available A and B, and if so then LHS is
/// set to A, RHS to B, and the routine returns 'true'.
///
/// LHS and RHS are only written on success; when 'false' is returned they still
/// hold the values the caller passed in.
///
/// \param IsCommutative  when true, the pair (a1 op a0) is accepted as well as
///                       (a0 op a1).
static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
                              const X86Subtarget &Subtarget,
                              bool IsCommutative) {
  // If either operand is undef, bail out. The binop should be simplified.
  if (LHS.isUndef() || RHS.isUndef())
    return false;

  // Look for the following pattern:
  //   A = < float a0, float a1, float a2, float a3 >
  //   B = < float b0, float b1, float b2, float b3 >
  // and
  //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
  //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
  // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
  // which is A horizontal-op B.

  MVT VT = LHS.getSimpleValueType();
  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Unsupported vector type for horizontal add/sub");
  unsigned NumElts = VT.getVectorNumElements();

  // Decompose Op into two shuffle inputs (N0, N1) and a mask. ShuffleMask is
  // left empty when Op is not recognised as a shuffle.
  // TODO - can we make a general helper method that does all of this for us?
  auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
                        SmallVectorImpl<int> &ShuffleMask) {
    if (Op.getOpcode() == ISD::VECTOR_SHUFFLE) {
      if (!Op.getOperand(0).isUndef())
        N0 = Op.getOperand(0);
      if (!Op.getOperand(1).isUndef())
        N1 = Op.getOperand(1);
      ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
      ShuffleMask.append(Mask.begin(), Mask.end());
      return;
    }

    // Look through an extract of the low subvector of a 256-bit vector.
    bool UseSubVector = false;
    if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
        Op.getOperand(0).getValueType().is256BitVector() &&
        llvm::isNullConstant(Op.getOperand(1))) {
      Op = Op.getOperand(0);
      UseSubVector = true;
    }

    bool IsUnary;
    SmallVector<SDValue, 2> SrcOps;
    SmallVector<int, 16> SrcShuffleMask;
    SDValue BC = peekThroughBitcasts(Op);
    if (isTargetShuffle(BC.getOpcode()) &&
        getTargetShuffleMask(BC.getNode(), BC.getSimpleValueType(), false,
                             SrcOps, SrcShuffleMask, IsUnary)) {
      if (!UseSubVector && SrcShuffleMask.size() == NumElts &&
          SrcOps.size() <= 2) {
        N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
        N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
        ShuffleMask.append(SrcShuffleMask.begin(), SrcShuffleMask.end());
      }
      if (UseSubVector && (SrcShuffleMask.size() == (NumElts * 2)) &&
          SrcOps.size() == 1) {
        N0 = extract128BitVector(SrcOps[0], 0, DAG, SDLoc(Op));
        N1 = extract128BitVector(SrcOps[0], NumElts, DAG, SDLoc(Op));
        ArrayRef<int> Mask = ArrayRef<int>(SrcShuffleMask).slice(0, NumElts);
        ShuffleMask.append(Mask.begin(), Mask.end());
      }
    }
  };

  // View LHS in the form
  //   LHS = VECTOR_SHUFFLE A, B, LMask
  // If LHS is not a shuffle, then pretend it is the identity shuffle:
  //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
  // NOTE: A default initialized SDValue represents an UNDEF of type VT.
  SDValue A, B;
  SmallVector<int, 16> LMask;
  GetShuffle(LHS, A, B, LMask);

  // Likewise, view RHS in the form
  //   RHS = VECTOR_SHUFFLE C, D, RMask
  SDValue C, D;
  SmallVector<int, 16> RMask;
  GetShuffle(RHS, C, D, RMask);

  // At least one of the operands should be a vector shuffle.
  unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
  if (NumShuffles == 0)
    return false;

  if (LMask.empty()) {
    A = LHS;
    for (unsigned i = 0; i != NumElts; ++i)
      LMask.push_back(i);
  }

  if (RMask.empty()) {
    C = RHS;
    for (unsigned i = 0; i != NumElts; ++i)
      RMask.push_back(i);
  }

  // If A and B occur in reverse order in RHS, then canonicalize by commuting
  // RHS operands and shuffle mask.
  if (A != C) {
    std::swap(C, D);
    ShuffleVectorSDNode::commuteMask(RMask);
  }
  // Check that the shuffles are both shuffling the same vectors.
  if (!(A == C && B == D))
    return false;

  // LHS and RHS are now:
  //   LHS = shuffle A, B, LMask
  //   RHS = shuffle A, B, RMask
  // Check that the masks correspond to performing a horizontal operation.
  // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
  // so we just repeat the inner loop if this is a 256-bit op.
  unsigned Num128BitChunks = VT.getSizeInBits() / 128;
  unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
  assert((NumEltsPer128BitChunk % 2 == 0) &&
         "Vector type should have an even number of elements in each lane");
  for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
    for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
      // Ignore undefined components.
      int LIdx = LMask[i + j], RIdx = RMask[i + j];
      if (LIdx < 0 || RIdx < 0 ||
          (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
          (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
        continue;

      // The  low half of the 128-bit result must choose from A.
      // The high half of the 128-bit result must choose from B,
      // unless B is undef. In that case, we are always choosing from A.
      unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
      unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0;

      // Check that successive elements are being operated on. If not, this is
      // not a horizontal operation.
      int Index = 2 * (i % NumEltsPer64BitChunk) + NumElts * Src + j;
      if (!(LIdx == Index && RIdx == Index + 1) &&
          !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
        return false;
    }
  }

  // Compute the new operands into temporaries so that the caller's LHS/RHS are
  // not clobbered if the profitability check below rejects the transform.
  SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
  SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.

  if (!shouldUseHorizontalOp(NewLHS == NewRHS && NumShuffles < 2, DAG,
                             Subtarget))
    return false;

  LHS = DAG.getBitcast(VT, NewLHS);
  RHS = DAG.getBitcast(VT, NewRHS);
  return true;
}
/// Target-specific DAG combine for floating-point add/sub nodes: try to turn
/// an FADD/FSUB of two suitable shuffles into X86ISD::FHADD / X86ISD::FHSUB.
/// Returns an empty SDValue when no horizontal form is found.
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  const unsigned Opcode = N->getOpcode();
  const bool IsFadd = Opcode == ISD::FADD;
  assert((IsFadd || Opcode == ISD::FSUB) && "Wrong opcode");

  EVT VT = N->getValueType(0);

  // Horizontal ops exist for 128-bit FP vectors with SSE3 and for 256-bit FP
  // vectors with AVX.
  const bool Is128BitCandidate =
      Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64);
  const bool Is256BitCandidate =
      Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64);
  if (!Is128BitCandidate && !Is256BitCandidate)
    return SDValue();

  // Try to synthesize horizontal add/sub from adds/subs of shuffles. Only
  // FADD is commutative.
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  if (!isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsFadd))
    return SDValue();

  auto HorizOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
  return DAG.getNode(HorizOpcode, SDLoc(N), VT, Op0, Op1);
}
/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
/// the codegen.
/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
///
/// \p N must be an ISD::TRUNCATE node. Only single-use vector sources with an
/// AND/XOR/OR/MUL/ADD/SUB opcode are considered; an empty SDValue is returned
/// otherwise.
/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
/// anything that is guaranteed to be transformed by DAGCombiner.
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
                                          const X86Subtarget &Subtarget,
                                          const SDLoc &DL) {
  assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
  SDValue Src = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // Don't combine if the operation has other uses.
  if (!Src.hasOneUse())
    return SDValue();

  // Only support vector truncation for now.
  // TODO: i64 scalar math would benefit as well.
  if (!VT.isVector())
    return SDValue();

  const unsigned SrcOpcode = Src.getOpcode();
  EVT SrcVT = Src.getValueType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // An operand truncates for free when it is an extend from a type no wider
  // than the truncated element type (the truncate folds into the extend), or
  // when it is a build vector of constants that will constant fold.
  // NOTE: We don't peek through bitcasts here because there is currently
  // no support for constant folding truncate+bitcast+vector_of_constants. So
  // we'd just end up with a truncate on both operands which would get turned
  // back into (truncate (binop)), causing an infinite loop.
  auto IsFreeTruncation = [VT](SDValue Op) {
    unsigned Opc = Op.getOpcode();
    bool IsExtend = Opc == ISD::ANY_EXTEND || Opc == ISD::SIGN_EXTEND ||
                    Opc == ISD::ZERO_EXTEND;
    if (IsExtend &&
        Op.getOperand(0).getScalarValueSizeInBits() <=
            VT.getScalarSizeInBits())
      return true;
    return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
  };

  // Rebuild the source operation on truncated operands.
  auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
    SDValue NarrowLHS = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
    SDValue NarrowRHS = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
    return DAG.getNode(SrcOpcode, DL, VT, NarrowLHS, NarrowRHS);
  };

  // In most cases it's only worth pre-truncating if we're only facing the cost
  // of one truncation, i.e. if one of the inputs will constant fold or the
  // input is repeated. The switch checks per-opcode legality; the operand
  // test follows it.
  bool RequireBothFree = false;
  switch (SrcOpcode) {
  default:
    return SDValue();

  case ISD::AND:
  case ISD::XOR:
  case ISD::OR:
    if (!TLI.isOperationLegalOrPromote(SrcOpcode, VT))
      return SDValue();
    break;

  case ISD::MUL:
    // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) -
    // it's better to truncate if we have the chance.
    if (SrcVT.getScalarType() == MVT::i64 &&
        TLI.isOperationLegal(SrcOpcode, VT) &&
        !TLI.isOperationLegal(SrcOpcode, SrcVT))
      return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
    LLVM_FALLTHROUGH;
  case ISD::ADD:
    if (!TLI.isOperationLegal(SrcOpcode, VT))
      return SDValue();
    break;

  case ISD::SUB:
    // TODO: ISD::SUB We are conservative and require both sides to be freely
    // truncatable to avoid interfering with combineSubToSubus.
    if (!TLI.isOperationLegal(SrcOpcode, VT))
      return SDValue();
    RequireBothFree = true;
    break;
  }

  SDValue Op0 = Src.getOperand(0);
  SDValue Op1 = Src.getOperand(1);

  bool Profitable = Op0 == Op1;
  if (!Profitable) {
    if (RequireBothFree)
      Profitable = IsFreeTruncation(Op0) && IsFreeTruncation(Op1);
    else
      Profitable = IsFreeTruncation(Op0) || IsFreeTruncation(Op1);
  }

  if (!Profitable)
    return SDValue();
  return TruncateArithmetic(Op0, Op1);
}
/// Truncate the operand of \p N with an ISD::AND mask followed by
/// X86ISD::PACKUS.
/// e.g. trunc <8 x i32> X to <8 x i16> -->
///    MaskX = X & 0xffff (clear high bits to prevent saturation)
///    packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
                                                 const X86Subtarget &Subtarget,
                                                 SelectionDAG &DAG) {
  SDValue Src = N->getOperand(0);
  EVT SrcVT = Src.getValueType();
  EVT DstVT = N->getValueType(0);

  // Keep only the low bits that survive the truncation.
  unsigned SrcEltBits = SrcVT.getScalarSizeInBits();
  unsigned DstEltBits = DstVT.getScalarSizeInBits();
  APInt LowBits = APInt::getLowBitsSet(SrcEltBits, DstEltBits);
  SDValue MaskVec = DAG.getConstant(LowBits, DL, SrcVT);
  SDValue Masked = DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskVec);

  return truncateVectorWithPACK(X86ISD::PACKUS, DstVT, Masked, DL, DAG,
                                Subtarget);
}
/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
/// The operand of \p N is first sign-extended in-register from the result
/// type, and the result is then handed to truncateVectorWithPACK.
static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
                                                 const X86Subtarget &Subtarget,
                                                 SelectionDAG &DAG) {
  SDValue Src = N->getOperand(0);
  EVT SrcVT = Src.getValueType();
  EVT DstVT = N->getValueType(0);

  SDValue DstTypeOp = DAG.getValueType(DstVT);
  SDValue SExtInReg =
      DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SrcVT, Src, DstTypeOp);

  return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, SExtInReg, DL, DAG,
                                Subtarget);
}
/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
/// legalization the truncation will be translated into a BUILD_VECTOR with each
/// element that is extracted from a vector and then truncated, and it is
/// difficult to do this optimization based on them.
///
/// Returns an empty SDValue when the truncation is not handled here.
static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget) {
  EVT OutVT = N->getValueType(0);
  if (!OutVT.isVector())
    return SDValue();

  SDValue In = N->getOperand(0);
  EVT InVT = In.getValueType();
  if (!InVT.isSimple())
    return SDValue();

  unsigned NumElems = OutVT.getVectorNumElements();

  // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
  // SSE2, and we need to take care of it specially.
  // AVX512 provides vpmovdb.
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
    return SDValue();

  EVT OutSVT = OutVT.getVectorElementType();
  EVT InSVT = InVT.getVectorElementType();

  // Only i32/i64 -> i8/i16 element truncations of at least 8 elements (and a
  // power-of-two element count) are handled.
  const bool InEltOk = InSVT == MVT::i32 || InSVT == MVT::i64;
  const bool OutEltOk = OutSVT == MVT::i8 || OutSVT == MVT::i16;
  if (!InEltOk || !OutEltOk)
    return SDValue();
  if (!isPowerOf2_32(NumElems) || NumElems < 8)
    return SDValue();

  // SSSE3's pshufb results in less instructions in the cases below.
  if (Subtarget.hasSSSE3() && NumElems == 8) {
    const bool ToI8FromNonI64 = OutSVT == MVT::i8 && InSVT != MVT::i64;
    const bool I32ToI16 = InSVT == MVT::i32 && OutSVT == MVT::i16;
    if (ToI8FromNonI64 || I32ToI16)
      return SDValue();
  }

  SDLoc DL(N);
  // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
  // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
  // truncate 2 x v4i32 to v8i16.
  if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
    return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);

  if (InSVT != MVT::i32)
    return SDValue();
  return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
}
/// This function transforms vector truncation of 'extended sign-bits' or
/// 'extended zero-bits' values.
/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
///
/// Returns an empty SDValue when neither PACKUS nor PACKSS is applicable.
static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
                                               SelectionDAG &DAG,
                                               const X86Subtarget &Subtarget) {
  // Requires SSE2 but AVX512 has fast truncate.
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
    return SDValue();

  EVT ResultVT = N->getValueType(0);
  if (!ResultVT.isVector() || !ResultVT.isSimple())
    return SDValue();

  SDValue In = N->getOperand(0);
  EVT SourceVT = In.getValueType();
  if (!SourceVT.isSimple())
    return SDValue();

  MVT VT = ResultVT.getSimpleVT();
  MVT SVT = VT.getScalarType();
  MVT InVT = SourceVT.getSimpleVT();
  MVT InSVT = InVT.getScalarType();

  // Check we have a truncation suited for PACKSS/PACKUS.
  if (!VT.is128BitVector() && !VT.is256BitVector())
    return SDValue();

  const bool OutEltOk =
      SVT == MVT::i8 || SVT == MVT::i16 || SVT == MVT::i32;
  const bool InEltOk =
      InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64;
  if (!OutEltOk || !InEltOk)
    return SDValue();

  unsigned InEltBits = InSVT.getSizeInBits();
  unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
  unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;

  // Use PACKUS if the input has zero-bits that extend all the way to the
  // packed/truncated value. e.g. masks, zext_in_reg, etc.
  KnownBits Known = DAG.computeKnownBits(In);
  if (Known.countMinLeadingZeros() >= (InEltBits - NumPackedZeroBits))
    return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);

  // Use PACKSS if the input has sign-bits that extend all the way to the
  // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
  if (DAG.ComputeNumSignBits(In) > (InEltBits - NumPackedSignBits))
    return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);

  return SDValue();
}
// Try to form a MULHU or MULHS node by looking for
// (trunc (srl (mul ext, ext), 16))
// where \p Src is the (srl ...) being truncated and \p VT is the truncated
// type. Returns an empty SDValue if the pattern does not match.
// TODO: This is X86 specific because we want to be able to handle wide types
// before type legalization. But we can only do it if the vector will be
// legalized via widening/splitting. Type legalization can't handle promotion
// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
// combiner.
static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
                            SelectionDAG &DAG, const X86Subtarget &Subtarget) {
  // First instruction should be a right shift of a multiply.
  if (Src.getOpcode() != ISD::SRL)
    return SDValue();
  SDValue Mul = Src.getOperand(0);
  if (Mul.getOpcode() != ISD::MUL)
    return SDValue();

  if (!Subtarget.hasSSE2())
    return SDValue();

  // Only handle vXi16 types that are at least 128-bits unless they will be
  // widened.
  if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
    return SDValue();
  if (!ExperimentalVectorWideningLegalization && VT.getVectorNumElements() < 8)
    return SDValue();

  // Input type should be vXi32.
  EVT InVT = Src.getValueType();
  if (InVT.getVectorElementType() != MVT::i32)
    return SDValue();

  // Need a shift by 16.
  APInt SplatShift;
  if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), SplatShift) ||
      SplatShift != 16)
    return SDValue();

  // Both multiplicands must be the same kind of extend (sign or zero).
  SDValue MulLHS = Mul.getOperand(0);
  SDValue MulRHS = Mul.getOperand(1);
  const unsigned ExtOpc = MulLHS.getOpcode();
  const bool IsSigned = ExtOpc == ISD::SIGN_EXTEND;
  if (!IsSigned && ExtOpc != ISD::ZERO_EXTEND)
    return SDValue();
  if (MulRHS.getOpcode() != ExtOpc)
    return SDValue();

  // Peek through the extends.
  SDValue NarrowLHS = MulLHS.getOperand(0);
  SDValue NarrowRHS = MulRHS.getOperand(0);

  // Ensure the input types match.
  if (NarrowLHS.getValueType() != VT || NarrowRHS.getValueType() != VT)
    return SDValue();

  unsigned MulhOpc = IsSigned ? ISD::MULHS : ISD::MULHU;
  return DAG.getNode(MulhOpc, DL, VT, NarrowLHS, NarrowRHS);
}
// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
// from one vector with signed bytes from another vector, adds together
// adjacent pairs of 16-bit products, and saturates the result before
// truncating to 16-bits.
//
// Which looks something like this:
// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
const SDLoc &DL) {
if (!VT.isVector() || !Subtarget.hasSSSE3())
return SDValue();
unsigned NumElems = VT.getVectorNumElements();
EVT ScalarVT = VT.getVectorElementType();
if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
return SDValue();
SDValue SSatVal = detectSSatPattern(In, VT);
if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
return SDValue();
// Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
// of multiplies from even/odd elements.
SDValue N0 = SSatVal.getOperand(0);
SDValue N1 = SSatVal.getOperand(1);
if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
return SDValue();
SDValue N00 = N0.getOperand(0);
SDValue N01 = N0.getOperand(1);
SDValue N10 = N1.getOperand(0);
SDValue N11 = N1.getOperand(1);
// TODO: Handle constant vectors and use knownbits/computenumsignbits?
// Canonicalize zero_extend to LHS.
if (N01.getOpcode() == ISD::ZERO_EXTEND)
std::swap(N00, N01);
if (N11.getOpcode() == ISD::ZERO_EXTEND)
std::swap(N10, N11);
// Ensure we have a zero_extend and a sign_extend.
if (N00.getOpcode() != ISD::ZERO_EXTEND ||
N01.getOpcode() != ISD::SIGN_EXTEND ||
N10.getOpcode() != ISD::ZERO_EXTEND ||
N11.getOpcode() != ISD::SIGN_EXTEND)
return SDValue();
// Peek through the extends.
N00 = N00.getOperand(0);
N01 = N01.getOperand(0);
N10 = N10.getOperand(0);
N11 = N11.getOperand(0);
// Ensure the extend is from vXi8.
if (N00.getValueType().getVectorElementType() != MVT::i8 ||
N01.getValueType().getVectorElementType() != MVT::i8 ||
N10.getValueType().getVectorElementType() != MVT::i8 ||
N11.getValueType().getVectorElementType() != MVT::i8)
return SDValue();
// All inputs should be build_vectors.
if (N00.getOpcode() != ISD::BUILD_VECTOR ||
N01.getOpcode() != ISD::BUILD_VECTOR ||
N10.getOpcode() != ISD::BUILD_VECTOR ||
N11.getOpcode() != ISD::BUILD_VECTOR)
return SDValue();
// N00/N10 are zero extended. N01/N11 are sign extended.
// For each element, we need to ensure we have an odd element from one vector
// multiplied by the odd element of another vector and the even element from
// one of the same vectors being multiplied by the even element from the
// other vector. So we need to make sure for each element i, this operator
// is being performed:
// A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
SDValue ZExtIn, SExtIn;
for (unsigned i = 0; i != NumElems; ++i) {
SDValue N00Elt = N00.getOperand(i);
SDValue N01Elt = N01.getOperand(i);
SDValue N10Elt = N10.getOperand(i);
SDValue N11Elt = N11.getOperand(i);
// TODO: Be more tolerant to undefs.
if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
return SDValue();
unsigned IdxN00 = ConstN00Elt->getZExtValue();
unsigned IdxN01 = ConstN01Elt->getZExtValue();
unsigned IdxN10 = ConstN10Elt->getZExtValue();
unsigned IdxN11 = ConstN11Elt->getZExtValue();
// Add is commutative so indices can be reordered.
if (IdxN00 > IdxN10) {
std::swap(IdxN00, IdxN10);
std::swap(IdxN01, IdxN11);
}
// N0 indices be the even element. N1 indices must be the next odd element.
if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
return SDValue();
SDValue N00In = N00Elt.getOperand(0);
SDValue N01In = N01Elt.getOperand(0);
SDValue N10In = N10Elt.getOperand(0);
SDValue N11In = N11Elt.getOperand(0);
// First time we find an input capture it.
if (!ZExtIn) {
ZExtIn = N00In;
SExtIn = N01In;
}
if (ZExtIn != N00In || SExtIn != N01In ||
ZExtIn != N10In || SExtIn != N11In)
return SDValue();
}
auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
// Shrink by adding truncate nodes and let DAGCombine fold with the
// sources.
EVT InVT = Ops[0].getValueType();
assert(InVT.getScalarType() == MVT::i8 &&
"Unexpected scalar element type");
assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
InVT.getVectorNumElements() / 2);
return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
};
return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
PMADDBuilder);
}
/// Target-specific DAG combine entry point for the truncate node \p N.
///
/// The folds below are attempted one after another, in the order listed, and
/// the first one that produces a value is returned.
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  SDValue Src = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // Prefer truncating the inputs of an arithmetic op over truncating its
  // result.
  if (SDValue PreTrunc = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
    return PreTrunc;

  // Averaging pattern.
  if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
    return Avg;

  // PMADD pattern.
  if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
    return PMAdd;

  // Truncation combined with signed/unsigned saturation.
  if (SDValue Sat = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
    return Sat;

  // PMULHUW/PMULHW for vXi16.
  if (SDValue MulH = combinePMULH(Src, VT, DL, DAG, Subtarget))
    return MulH;

  // An i32 result taken from a bitcast whose own source is an x86mmx value
  // can be produced directly with MMX_MOVD2W.
  if (VT == MVT::i32 && Src.getOpcode() == ISD::BITCAST) {
    SDValue MMXSrc = Src.getOperand(0);
    if (MMXSrc.getValueType() == MVT::x86mmx)
      return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, MMXSrc);
  }

  // Truncate extended sign/zero bits with PACKSS/PACKUS.
  if (SDValue Pack = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
    return Pack;

  // Last resort: the generic vector truncation combine.
  return combineVectorTruncation(N, DAG, Subtarget);
}
/// If node \p N flips the sign of a floating-point value, return the value
/// being negated; otherwise return an empty SDValue.
///
/// A negation can show up in several shapes: FNEG(x), FXOR(x, 0x80000000) or
/// FSUB(0, x). AVX512F does not have FXOR, so FNEG is lowered as
/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))); bitcasts are
/// therefore looked through. A shuffle (with undef second input) or an
/// insert-into-undef of a negated value is also recognized, and the
/// corresponding shuffle/insert of the un-negated value is returned.
static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) {
  if (N->getOpcode() == ISD::FNEG)
    return N->getOperand(0);

  const unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
  SDValue Src = peekThroughBitcasts(SDValue(N, 0));
  EVT SrcVT = Src->getValueType(0);

  // Looking through the bitcasts must not have changed the element size.
  if (SrcVT.getScalarSizeInBits() != ScalarSize)
    return SDValue();

  if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(Src.getNode())) {
    // VECTOR_SHUFFLE(VEC1, UNDEF) negates to VECTOR_SHUFFLE(-VEC1, UNDEF),
    // whatever the mask is. A defined second input is not handled.
    if (!Shuf->getOperand(1).isUndef())
      return SDValue();
    SDValue NegVec = isFNEG(DAG, Shuf->getOperand(0).getNode());
    // FIXME: Can we do better than requiring an exact type match?
    if (!NegVec || NegVec.getValueType() != SrcVT)
      return SDValue();
    return DAG.getVectorShuffle(SrcVT, SDLoc(Shuf), NegVec,
                                DAG.getUNDEF(SrcVT), Shuf->getMask());
  }

  const unsigned SrcOpc = Src.getOpcode();
  switch (SrcOpc) {
  case ISD::INSERT_VECTOR_ELT: {
    // INSERT_VECTOR_ELT(UNDEF, V, INDEX) negates to
    // INSERT_VECTOR_ELT(UNDEF, -V, INDEX).
    SDValue BaseVec = Src.getOperand(0);
    if (!BaseVec.isUndef())
      return SDValue();
    SDValue NegElt = isFNEG(DAG, Src.getOperand(1).getNode());
    // FIXME: same exact-type restriction as the shuffle case.
    if (!NegElt || NegElt.getValueType() != SrcVT.getVectorElementType())
      return SDValue();
    return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Src), SrcVT, BaseVec,
                       NegElt, Src.getOperand(2));
  }
  case X86ISD::FXOR:
  case ISD::XOR:
  case ISD::FSUB:
    break;
  default:
    return SDValue();
  }

  // XOR/FXOR carry the candidate sign-bit mask in operand 1, whereas for FSUB
  // the constant to inspect is operand 0, so the roles are swapped there.
  const bool IsFSub = SrcOpc == ISD::FSUB;
  SDValue MaskOp = Src.getOperand(IsFSub ? 0 : 1);
  SDValue ValueOp = Src.getOperand(IsFSub ? 1 : 0);

  // Pull out the constant bits per element; wholly-undef elements are allowed
  // and ignored, partially-undef ones are not.
  APInt UndefElts;
  SmallVector<APInt, 16> EltBits;
  if (!getTargetConstantBitsFromNode(MaskOp, ScalarSize, UndefElts, EltBits,
                                     /* AllowWholeUndefs */ true,
                                     /* AllowPartialUndefs */ false))
    return SDValue();

  // Every defined element has to be exactly the sign-bit mask.
  for (unsigned I = 0, E = EltBits.size(); I < E; I++)
    if (!UndefElts[I] && !EltBits[I].isSignMask())
      return SDValue();

  return peekThroughBitcasts(ValueOp);
}
/// Target-specific DAG combines for floating-point negations.
///
/// \p N is accepted when isFNEG() recognizes it as a negation. On FMA-capable
/// subtargets the negation is folded into an FMUL or an existing FMA-family
/// node; the result is bitcast back to the type of \p N. Returns an empty
/// SDValue when nothing applies.
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
                           const X86Subtarget &Subtarget) {
  const EVT ResultVT = N->getValueType(0);
  SDValue Negated = isFNEG(DAG, N);
  if (!Negated)
    return SDValue();

  const EVT ArgVT = Negated.getValueType();
  const EVT ArgEltVT = ArgVT.getScalarType();
  SDLoc DL(N);

  // Illegal types are left for the legalizer to expand.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(ArgVT))
    return SDValue();

  // Negating an FMUL on an FMA target: emit FNMSUB(A, B, 0) so that no
  // sign-mask constant is needed. Requires no-signed-zeros on the multiply.
  // FIXME: Check rounding control flags as well once it becomes available.
  const bool IsF32OrF64 = ArgEltVT == MVT::f32 || ArgEltVT == MVT::f64;
  if (Negated.getOpcode() == ISD::FMUL && IsF32OrF64 &&
      Negated->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
    SDValue Zero = DAG.getConstantFP(0.0, DL, ArgVT);
    SDValue FNMSub = DAG.getNode(X86ISD::FNMSUB, DL, ArgVT,
                                 Negated.getOperand(0), Negated.getOperand(1),
                                 Zero);
    return DAG.getBitcast(ResultVT, FNMSub);
  }

  // Negating a single-use FMA-family node: switch to the opcode that already
  // includes the extra negation.
  if (!Negated.hasOneUse() || !Subtarget.hasAnyFMA())
    return SDValue();

  unsigned NegatedOpc = 0;
  switch (Negated.getOpcode()) {
  case ISD::FMA:
    NegatedOpc = X86ISD::FNMSUB;
    break;
  case X86ISD::FMSUB:
    NegatedOpc = X86ISD::FNMADD;
    break;
  case X86ISD::FNMADD:
    NegatedOpc = X86ISD::FMSUB;
    break;
  case X86ISD::FNMSUB:
    NegatedOpc = ISD::FMA;
    break;
  case X86ISD::FMADD_RND:
    NegatedOpc = X86ISD::FNMSUB_RND;
    break;
  case X86ISD::FMSUB_RND:
    NegatedOpc = X86ISD::FNMADD_RND;
    break;
  case X86ISD::FNMADD_RND:
    NegatedOpc = X86ISD::FMSUB_RND;
    break;
  case X86ISD::FNMSUB_RND:
    NegatedOpc = X86ISD::FMADD_RND;
    break;
  default:
    // Scalar intrinsic nodes are not handled here: negating them would only
    // invert one element rather than the whole vector. A negation of just the
    // lower element could be handled separately.
    return SDValue();
  }

  return DAG.getBitcast(ResultVT,
                        DAG.getNode(NegatedOpc, DL, ArgVT, Negated->ops()));
}
/// Rewrite a vector X86 FP logic node (FOR/FXOR/FAND/FANDN) as the matching
/// integer logic op on bitcast operands, bitcasting the result back to the
/// FP type. Only done for vector types when SSE2 is available; otherwise an
/// empty SDValue is returned.
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
  MVT VT = N->getSimpleValueType(0);

  // The integer opcodes are only usable with integer vector types available.
  if (!(VT.isVector() && Subtarget.hasSSE2()))
    return SDValue();

  SDLoc dl(N);

  // Integer vector type with the same element width and overall size.
  const unsigned EltBits = VT.getScalarSizeInBits();
  MVT IntEltVT = MVT::getIntegerVT(EltBits);
  MVT IntVecVT = MVT::getVectorVT(IntEltVT, VT.getSizeInBits() / EltBits);

  SDValue IntLHS = DAG.getBitcast(IntVecVT, N->getOperand(0));
  SDValue IntRHS = DAG.getBitcast(IntVecVT, N->getOperand(1));

  unsigned IntOpcode;
  switch (N->getOpcode()) {
  case X86ISD::FAND:
    IntOpcode = ISD::AND;
    break;
  case X86ISD::FANDN:
    IntOpcode = X86ISD::ANDNP;
    break;
  case X86ISD::FOR:
    IntOpcode = ISD::OR;
    break;
  case X86ISD::FXOR:
    IntOpcode = ISD::XOR;
    break;
  default:
    llvm_unreachable("Unexpected FP logic op");
  }

  return DAG.getBitcast(VT,
                        DAG.getNode(IntOpcode, dl, IntVecVT, IntLHS, IntRHS));
}
/// Fold xor(setcc cond, val), 1 --> setcc (inverted(cond), val).
///
/// Returns an empty SDValue unless \p N is an ISD::XOR whose right operand is
/// the constant 1 and whose left operand is an X86ISD::SETCC.
static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
  if (N->getOpcode() != ISD::XOR)
    return SDValue();

  SDValue SetCC = N->getOperand(0);
  auto *One = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!One || One->getZExtValue() != 1)
    return SDValue();
  if (SetCC->getOpcode() != X86ISD::SETCC)
    return SDValue();

  // Operand 0 of the SETCC holds the condition code, operand 1 the value it
  // tests.
  X86::CondCode OldCC = X86::CondCode(SetCC->getConstantOperandVal(0));
  X86::CondCode NewCC = X86::GetOppositeBranchCondition(OldCC);
  SDLoc DL(N);
  return getSETCC(NewCC, SetCC->getOperand(1), DL, DAG);
}
/// Target-specific DAG combines for the xor node \p N.
///
/// The SSE1-only FXOR rewrite and the vector xor/shift-to-compare fold run at
/// any stage; the remaining folds are skipped before operation legalization.
static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
  // With SSE1 but not SSE2, a v4i32 xor would be scalarized; express it as an
  // FXOR on v4f32 instead.
  const bool SSE1Only = Subtarget.hasSSE1() && !Subtarget.hasSSE2();
  if (SSE1Only && N->getValueType(0) == MVT::v4i32) {
    return DAG.getBitcast(
        MVT::v4i32, DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
                                DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
                                DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
  }

  if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
    return Cmp;

  // Everything below waits until operations have been legalized.
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue Inverted = foldXor1SetCC(N, DAG))
    return Inverted;

  if (SDValue ShiftCmp = foldXorTruncShiftIntoCmp(N, DAG))
    return ShiftCmp;

  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
    return FPLogic;

  // Finally, the xor may be a floating-point negation in disguise.
  return combineFneg(N, DAG, Subtarget);
}
/// Target-specific DAG combine for X86ISD::BEXTR.
///
/// Only the low 16 bits of the control operand (operand 1) are treated as
/// demanded: a constant control is masked down to them, and a non-constant
/// one is handed to SimplifyDemandedBits with that mask.
static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
                            TargetLowering::DAGCombinerInfo &DCI,
                            const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue Src = N->getOperand(0);
  SDValue Control = N->getOperand(1);

  // TODO - Constant Folding.
  if (auto *ControlC = dyn_cast<ConstantSDNode>(Control)) {
    // Drop any bits of a constant control above bit 15.
    // NOTE: SimplifyDemandedBits won't do this for constants.
    const APInt &Full = ControlC->getAPIntValue();
    APInt Low16 = Full & 0xFFFF;
    if (Low16 != Full) {
      SDLoc DL(N);
      return DAG.getNode(X86ISD::BEXTR, DL, VT, Src,
                         DAG.getConstant(Low16, DL, VT));
    }
  }

  // Let demanded-bits analysis simplify the control operand.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  APInt DemandedMask = APInt::getLowBitsSet(VT.getSizeInBits(), 16);
  if (TLI.SimplifyDemandedBits(Control, DemandedMask, DCI))
    return SDValue(N, 0);

  return SDValue();
}
/// Return true if \p V is a null FP constant or an all-zeros build vector.
static bool isNullFPScalarOrVectorConst(SDValue V) {
  if (isNullFPConstant(V))
    return true;
  return ISD::isBuildVectorAllZeros(V.getNode());
}
/// If \p V is a scalar FP zero or a vector FP zero (possibly with undefined
/// elements), return a zero constant that can be used to fold \p V away;
/// otherwise return an empty SDValue.
///
/// For vectors a fresh zero vector is built, so the result has no undefined
/// elements even when the input does. That makes it a safe replacement
/// operand for operations (eg, bitwise-and) where an undef should not
/// propagate.
static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  if (!isNullFPScalarOrVectorConst(V))
    return SDValue();

  // A scalar zero can be reused directly.
  if (!V.getValueType().isVector())
    return V;

  return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
}
/// Fold an FAND where one operand is FXOR(X, all-ones) into an FANDN.
///
/// Applies to f32 with SSE1, f64 with SSE2, and v4f32 on SSE1-only targets.
/// Other vector types are handled in combineANDXORWithAllOnesIntoANDNP().
static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDLoc DL(N);

  const bool IsF32 = VT == MVT::f32 && Subtarget.hasSSE1();
  const bool IsF64 = VT == MVT::f64 && Subtarget.hasSSE2();
  const bool IsSSE1V4F32 =
      VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2();
  if (!IsF32 && !IsF64 && !IsSSE1V4F32)
    return SDValue();

  // True when Val is an all-ones build vector, or a scalar FP constant whose
  // value is all-ones.
  auto IsAllOnesFP = [](SDValue Val) {
    if (Val.getSimpleValueType().isVector())
      return ISD::isBuildVectorAllOnes(Val.getNode());
    auto *CFP = dyn_cast<ConstantFPSDNode>(Val);
    return CFP && CFP->getConstantFPValue()->isAllOnesValue();
  };

  // fand (fxor X, -1), Y --> fandn X, Y
  if (LHS.getOpcode() == X86ISD::FXOR && IsAllOnesFP(LHS.getOperand(1)))
    return DAG.getNode(X86ISD::FANDN, DL, VT, LHS.getOperand(0), RHS);

  // fand X, (fxor Y, -1) --> fandn Y, X
  if (RHS.getOpcode() == X86ISD::FXOR && IsAllOnesFP(RHS.getOperand(1)))
    return DAG.getNode(X86ISD::FANDN, DL, VT, RHS.getOperand(0), LHS);

  return SDValue();
}
/// Target-specific DAG combines for X86ISD::FAND nodes.
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
                           const X86Subtarget &Subtarget) {
  // FAND(0.0, x) -> 0.0 and FAND(x, 0.0) -> 0.0; operand 0 is checked first.
  for (unsigned OpIdx = 0; OpIdx != 2; ++OpIdx)
    if (SDValue Zero =
            getNullFPConstForNullVal(N->getOperand(OpIdx), DAG, Subtarget))
      return Zero;

  // FAND with an inverted operand becomes FANDN.
  if (SDValue AndN = combineFAndFNotToFAndn(N, DAG, Subtarget))
    return AndN;

  // Otherwise try the integer form of the operation.
  return lowerX86FPLogicOp(N, DAG, Subtarget);
}
/// Target-specific DAG combines for X86ISD::FANDN nodes.
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // FANDN(0.0, x) -> x
  if (isNullFPScalarOrVectorConst(LHS))
    return RHS;

  // FANDN(x, 0.0) -> 0.0
  if (SDValue Zero = getNullFPConstForNullVal(RHS, DAG, Subtarget))
    return Zero;

  // Otherwise try the integer form of the operation.
  return lowerX86FPLogicOp(N, DAG, Subtarget);
}
/// Target-specific DAG combines shared by X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // F[X]OR(0.0, x) -> x
  if (isNullFPScalarOrVectorConst(LHS))
    return RHS;

  // F[X]OR(x, 0.0) -> x
  if (isNullFPScalarOrVectorConst(RHS))
    return LHS;

  // The node may be a floating-point negation that can be folded.
  if (SDValue Neg = combineFneg(N, DAG, Subtarget))
    return Neg;

  // Otherwise try the integer form of the operation.
  return lowerX86FPLogicOp(N, DAG, Subtarget);
}
/// Target-specific DAG combines for X86ISD::FMIN and X86ISD::FMAX nodes.
///
/// With UnsafeFPMath enabled, the node is replaced by its commutative
/// counterpart (FMINC/FMAXC). Otherwise nothing is done.
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);

  // The rewrite is only performed in unsafe-math mode.
  if (!DAG.getTarget().Options.UnsafeFPMath)
    return SDValue();

  // Select the commutative variant of the opcode.
  unsigned CommutativeOpc = 0;
  switch (N->getOpcode()) {
  case X86ISD::FMAX:
    CommutativeOpc = X86ISD::FMAXC;
    break;
  case X86ISD::FMIN:
    CommutativeOpc = X86ISD::FMINC;
    break;
  default:
    llvm_unreachable("unknown opcode");
  }

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  return DAG.getNode(CommutativeOpc, SDLoc(N), N->getValueType(0), LHS, RHS);
}
/// Lower an fminnum/fmaxnum node to X86ISD::FMIN/FMAX, adding a NaN-selecting
/// compare+select when neither operand is known to be non-NaN.
///
/// Applies to f32 with SSE1, f64 with SSE2 and legal vector types, and never
/// with soft-float. Returns an empty SDValue when no lowering is done.
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  if (Subtarget.useSoftFloat())
    return SDValue();

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT VT = N->getValueType(0);
  const bool IsSSEScalar = (Subtarget.hasSSE1() && VT == MVT::f32) ||
                           (Subtarget.hasSSE2() && VT == MVT::f64);
  if (!IsSSEScalar && !(VT.isVector() && TLI.isTypeLegal(VT)))
    return SDValue();

  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  const unsigned X86Opc =
      N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;

  // When NaNs need not be respected this maps directly onto the x86 min/max
  // instruction. The same operand order is also right when the second
  // operand is known never to be NaN.
  if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs() ||
      DAG.isKnownNeverNaN(RHS))
    return DAG.getNode(X86Opc, DL, VT, LHS, RHS, N->getFlags());

  // If only the first operand is known non-NaN, swap so that it becomes the
  // second operand of the native instruction.
  if (DAG.isKnownNeverNaN(LHS))
    return DAG.getNode(X86Opc, DL, VT, RHS, LHS, N->getFlags());

  // Respecting NaN inputs costs at least 3 instructions. For a scalar in a
  // min-size function, a library call is preferred.
  if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
    return SDValue();

  EVT CCVT =
      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  // Required results for the four NaN/number input combinations
  // (Op0 = operand 0, Op1 = operand 1):
  //
  //                 Op1
  //             Num     NaN
  //          ----------------
  //     Num  | Max  |  Op0  |
  // Op0      ----------------
  //     NaN  | Op1  |  NaN  |
  //          ----------------
  //
  // The SSE FP min/max instructions were designed to implement
  //   Min = Op1 < Op0 ? Op1 : Op0
  //   Max = Op1 > Op0 ? Op1 : Op0
  // and pass through their second source operand whenever either input is a
  // NaN. The node below is built with the operands swapped, so operand 0 is
  // that second source and is what comes through on a NaN.
  SDValue MinOrMax = DAG.getNode(X86Opc, DL, VT, RHS, LHS);
  SDValue LHSIsNaN = DAG.getSetCC(DL, CCVT, LHS, LHS, ISD::SETUO);

  // If operand 0 is a NaN pick operand 1, otherwise the min/max result. When
  // both are NaN, the NaN in operand 1 is the result.
  return DAG.getSelect(DL, VT, LHSIsNaN, RHS, MinOrMax);
}
/// Target-specific DAG combine for X86 vector int-to-fp conversion nodes.
///
/// First lets demanded-elements analysis simplify the node. Then, if the
/// result has fewer elements than the input and the input is a single-use,
/// non-volatile normal load, the full-width load is replaced by a
/// VZEXT_LOAD that only reads the bits that are converted.
static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI) {
  EVT VT = N->getValueType(0);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  APInt KnownUndef, KnownZero;
  APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
  if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
                                     KnownZero, DCI))
    return SDValue(N, 0);

  // Only a full vector load of which not all elements are needed qualifies.
  SDValue In = N->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  if (VT.getVectorNumElements() >= InVT.getVectorNumElements() ||
      !ISD::isNormalLoad(In.getNode()) || !In.hasOneUse())
    return SDValue();

  assert(InVT.is128BitVector() && "Expected 128-bit input vector");
  LoadSDNode *LN = cast<LoadSDNode>(In);

  // Volatile loads must not be narrowed.
  if (LN->isVolatile())
    return SDValue();

  SDLoc dl(N);

  // The loaded memory type is one integer as wide as all converted elements;
  // the load result is a 128-bit vector of such integers.
  const unsigned NumBits =
      InVT.getScalarSizeInBits() * VT.getVectorNumElements();
  MVT MemVT = MVT::getIntegerVT(NumBits);
  MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
  SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
  SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
  SDValue VZLoad = DAG.getMemIntrinsicNode(
      X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT, LN->getPointerInfo(),
      LN->getAlignment(), LN->getMemOperand()->getFlags());

  // Redo the conversion on the narrowed load, then move the users of the old
  // load's chain result over to the new load's chain.
  SDValue Convert =
      DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
  DCI.CombineTo(N, Convert);
  DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
  return SDValue(N, 0);
}
/// Target-specific DAG combine for X86 vector fp-to-int conversion nodes
/// (CVTP2I / CVTTP2I family).
///
/// If the result has fewer elements than the input and the input is a
/// single-use, non-volatile normal load, the full-width load is replaced by a
/// VZEXT_LOAD that only reads the bits that are converted.
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
                                     TargetLowering::DAGCombinerInfo &DCI) {
  EVT VT = N->getValueType(0);

  // Only a full vector load of which not all elements are needed qualifies.
  SDValue In = N->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  if (VT.getVectorNumElements() >= InVT.getVectorNumElements() ||
      !ISD::isNormalLoad(In.getNode()) || !In.hasOneUse())
    return SDValue();

  assert(InVT.is128BitVector() && "Expected 128-bit input vector");
  LoadSDNode *LN = cast<LoadSDNode>(In);

  // Volatile loads must not be narrowed.
  if (LN->isVolatile())
    return SDValue();

  SDLoc dl(N);

  // The loaded memory type is one FP value as wide as all converted elements;
  // the load result is a 128-bit vector of such values.
  const unsigned NumBits =
      InVT.getScalarSizeInBits() * VT.getVectorNumElements();
  MVT MemVT = MVT::getFloatingPointVT(NumBits);
  MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
  SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
  SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
  SDValue VZLoad = DAG.getMemIntrinsicNode(
      X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT, LN->getPointerInfo(),
      LN->getAlignment(), LN->getMemOperand()->getFlags());

  // Redo the conversion on the narrowed load, then move the users of the old
  // load's chain result over to the new load's chain.
  SDValue Convert =
      DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
  DCI.CombineTo(N, Convert);
  DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
  return SDValue(N, 0);
}
/// Target-specific DAG combines for X86ISD::ANDNP nodes.
static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
                            TargetLowering::DAGCombinerInfo &DCI,
                            const X86Subtarget &Subtarget) {
  MVT VT = N->getSimpleValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // ANDNP(0, x) -> x
  if (ISD::isBuildVectorAllZeros(LHS.getNode()))
    return RHS;

  // ANDNP(x, 0) -> 0
  if (ISD::isBuildVectorAllZeros(RHS.getNode()))
    return DAG.getConstant(0, SDLoc(N), VT);

  // An already-inverted first operand cancels the implicit NOT, leaving a
  // plain AND.
  if (SDValue Not = IsNOT(LHS, DAG))
    return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not), RHS);

  // For vectors with byte-multiple element sizes, try to recursively combine
  // the bitmask ANDNP with shuffles.
  const bool ByteSizedElts = VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0;
  if (ByteSizedElts) {
    SDValue Op(N, 0);
    if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
      return Res;
  }

  return SDValue();
}
/// Target-specific DAG combine for X86ISD::BT.
///
/// BT ignores the high bits of its bit-index operand, so only the low
/// log2(width) bits of operand 1 are demanded. If that lets the index be
/// simplified, a new BT using the simplified index is returned.
static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
                         TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Src = N->getOperand(0);
  SDValue BitIdx = N->getOperand(1);

  const unsigned IdxWidth = BitIdx.getValueSizeInBits();
  APInt LowIdxBits = APInt::getLowBitsSet(IdxWidth, Log2_32(IdxWidth));

  SDValue NarrowIdx = DAG.GetDemandedBits(BitIdx, LowIdxBits);
  if (!NarrowIdx)
    return SDValue();

  return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, Src, NarrowIdx);
}
// Try to fold a sext_in_reg of a cmov of two constants by applying the
// extension to the constants themselves. A single-use any_extend/truncate
// between the sext_in_reg and the cmov is looked through. Only extensions
// from i8 or i16 are handled.
static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);

  EVT DstVT = N->getValueType(0);
  SDValue Src = N->getOperand(0);
  SDValue FromVTOp = N->getOperand(1);

  EVT FromVT = cast<VTSDNode>(FromVTOp)->getVT();
  if (FromVT != MVT::i8 && FromVT != MVT::i16)
    return SDValue();

  // Step over a single-use any_extend / truncate, remembering it so that it
  // can be re-applied to the constants.
  SDValue WidthChange;
  const unsigned SrcOpc = Src.getOpcode();
  if ((SrcOpc == ISD::ANY_EXTEND || SrcOpc == ISD::TRUNCATE) &&
      Src.hasOneUse()) {
    WidthChange = Src;
    Src = Src.getOperand(0);
  }

  // What remains has to be a single-use cmov.
  if (Src.getOpcode() != X86ISD::CMOV || !Src.hasOneUse())
    return SDValue();

  // Both selected values must be constants.
  SDValue C0 = Src.getOperand(0);
  SDValue C1 = Src.getOperand(1);
  if (!isa<ConstantSDNode>(C0.getNode()) || !isa<ConstantSDNode>(C1.getNode()))
    return SDValue();

  SDLoc DL(N);

  // Re-apply the any_extend/truncate that was looked through to each
  // constant.
  if (WidthChange) {
    unsigned WidthChangeOpc = WidthChange.getOpcode();
    C0 = DAG.getNode(WidthChangeOpc, DL, DstVT, C0);
    C1 = DAG.getNode(WidthChangeOpc, DL, DstVT, C1);
  }

  // Apply the sext_in_reg to each constant.
  C0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, C0, FromVTOp);
  C1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, C1, FromVTOp);

  // i16 CMOVs are not wanted: do the cmov in i32 and truncate afterwards.
  EVT CMovVT = DstVT;
  if (DstVT == MVT::i16) {
    CMovVT = MVT::i32;
    C0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, C0);
    C1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, C1);
  }

  // Rebuild the cmov with the new constants, keeping operands 2 and 3 of the
  // original.
  SDValue NewCMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, C0, C1,
                                Src.getOperand(2), Src.getOperand(3));
  if (CMovVT == DstVT)
    return NewCMov;

  return DAG.getNode(ISD::TRUNCATE, DL, DstVT, NewCMov);
}
/// Target-specific DAG combines for ISD::SIGN_EXTEND_INREG nodes.
static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);

  // First try pushing the extension into a cmov of constants.
  if (SDValue V = combineSextInRegCmov(N, DAG))
    return V;

  EVT VT = N->getValueType(0);
  SDValue Src = N->getOperand(0);
  SDValue FromVTOp = N->getOperand(1);
  EVT FromVT = cast<VTSDNode>(FromVTOp)->getVT();
  SDLoc dl(N);

  // SIGN_EXTEND_INREG to v4i64 is expensive on both SSE and AVX2 since there
  // is no sign-extending shift right for vectors with 64-bit elements. So:
  //   (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
  //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
  const unsigned SrcOpc = Src.getOpcode();
  if (VT != MVT::v4i64 ||
      (SrcOpc != ISD::ANY_EXTEND && SrcOpc != ISD::SIGN_EXTEND))
    return SDValue();

  SDValue Narrow = Src.getOperand(0);

  // An EXTLOAD has a better solution on AVX2: it may be replaced with an
  // X86ISD::VSEXT node, so leave non-normal loads alone there.
  if (Narrow.getOpcode() == ISD::LOAD && Subtarget.hasInt256() &&
      !ISD::isNormalLoad(Narrow.getNode()))
    return SDValue();

  if (Narrow.getValueType() != MVT::v4i32 || FromVT.getSizeInBits() >= 128)
    return SDValue();

  SDValue NarrowSExt =
      DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, Narrow, FromVTOp);
  return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, NarrowSExt);
}
/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
///
/// Hoisting a sign/zero extension above a non-wrapping 'add' with a constant
/// operand exposes opportunities to combine math ops, use an LEA, or use a
/// complex addressing mode, which can eliminate extend, add, and shift
/// instructions. Only extensions to i64 whose result feeds an 'add' or 'shl'
/// are transformed.
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
  const unsigned ExtOpc = Ext->getOpcode();
  const bool IsSExt = ExtOpc == ISD::SIGN_EXTEND;
  if (!IsSExt && ExtOpc != ISD::ZERO_EXTEND)
    return SDValue();

  // TODO: This should be valid for other integer types.
  EVT VT = Ext->getValueType(0);
  if (VT != MVT::i64)
    return SDValue();

  SDValue Add = Ext->getOperand(0);
  if (Add.getOpcode() != ISD::ADD)
    return SDValue();

  // A 'sext' needs an 'add nsw' underneath; a 'zext' needs an 'add nuw'.
  const bool NSW = Add->getFlags().hasNoSignedWrap();
  const bool NUW = Add->getFlags().hasNoUnsignedWrap();
  const bool HasMatchingNoWrap = IsSExt ? NSW : NUW;
  if (!HasMatchingNoWrap)
    return SDValue();

  // Requiring a constant operand keeps the instruction count from growing,
  // since the constant is extended for free below. The constant can also end
  // up as the displacement field of an LEA.
  auto *AddC = dyn_cast<ConstantSDNode>(Add.getOperand(1));
  if (!AddC)
    return SDValue();

  // Widening the 'add' is pointless unless it can be merged with another
  // 'add' or a 'shl' that uses the extended value.
  // TODO: It may be profitable to generate simpler LEA instructions in place
  // of single 'add' instructions, but the cost model for selecting an LEA
  // currently has a high threshold.
  bool FeedsAddOrShl = false;
  for (auto *User : Ext->uses()) {
    const unsigned UserOpc = User->getOpcode();
    if (UserOpc == ISD::ADD || UserOpc == ISD::SHL) {
      FeedsAddOrShl = true;
      break;
    }
  }
  if (!FeedsAddOrShl)
    return SDValue();

  // All checks passed: extend the non-constant operand and the constant
  // separately, then add them in the wide type.
  int64_t AddConstant = IsSExt ? AddC->getSExtValue() : AddC->getZExtValue();
  SDValue NarrowOp = Add.getOperand(0);
  SDValue WideOp = DAG.getNode(ExtOpc, SDLoc(Ext), VT, NarrowOp);
  SDValue WideC = DAG.getConstant(AddConstant, SDLoc(Add), VT);

  // The wide add inherits the no-wrap flags of the narrow one; it cannot
  // wrap because both operands were extended from the narrower type.
  SDNodeFlags Flags;
  Flags.setNoSignedWrap(NSW);
  Flags.setNoUnsignedWrap(NUW);
  return DAG.getNode(ISD::ADD, SDLoc(Add), VT, WideOp, WideC, Flags);
}
// When an {ANY,SIGN,ZERO}_EXTEND is applied to a single-use CMOV whose two
// value operands are constants, promote the CMOV itself instead of extending
// its result. This can be beneficial because:
//  1) X86TargetLowering::EmitLoweredSelect can later merge two (or more)
//     pseudo-CMOVs only when they are adjacent, and removing the extension
//     code that follows a CMOV helps with that.
//  2) Promoting constant CMOV arguments is free, so the
//     {ANY,SIGN,ZERO}_EXTEND simply disappears.
//  3) A 16-bit CMOV encodes in 4 bytes and a 32-bit CMOV in 3 bytes, so the
//     promotion also helps code size. (A 64-bit CMOV is 4 bytes, which is why
//     32-bit => 64-bit promotion is not done.)
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
  SDValue CMov = Extend->getOperand(0);
  if (CMov.getOpcode() != X86ISD::CMOV || !CMov.hasOneUse())
    return SDValue();

  const unsigned ExtOpc = Extend->getOpcode();
  EVT DstVT = Extend->getValueType(0);
  EVT SrcVT = CMov.getValueType();
  SDLoc DL(Extend);

  // Both selected values must be constants.
  SDValue C0 = CMov.getOperand(0);
  SDValue C1 = CMov.getOperand(1);
  if (!isa<ConstantSDNode>(C0.getNode()) || !isa<ConstantSDNode>(C1.getNode()))
    return SDValue();

  // The destination has to be i32 or i64.
  if (DstVT != MVT::i32 && DstVT != MVT::i64)
    return SDValue();

  // The source has to be i16, or i32 for a sign_extend. Zext/aext from i32
  // are free and so not worth handling.
  const bool IsSExtFromI32 = ExtOpc == ISD::SIGN_EXTEND && SrcVT == MVT::i32;
  if (SrcVT != MVT::i16 && !IsSExtFromI32)
    return SDValue();

  // A non-sign extension to i64 only needs the CMOV in i32; a free zero
  // extend finishes the job afterwards.
  const bool StopAtI32 = DstVT == MVT::i64 && ExtOpc != ISD::SIGN_EXTEND;
  EVT CMovVT = StopAtI32 ? EVT(MVT::i32) : DstVT;

  C0 = DAG.getNode(ExtOpc, DL, CMovVT, C0);
  C1 = DAG.getNode(ExtOpc, DL, CMovVT, C1);
  SDValue Wide = DAG.getNode(X86ISD::CMOV, DL, CMovVT, C0, C1,
                             CMov.getOperand(2), CMov.getOperand(3));

  // Complete the extension if the CMOV was built in a narrower type.
  if (CMovVT == DstVT)
    return Wide;
  return DAG.getNode(ExtOpc, DL, DstVT, Wide);
}
// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
// This is more or less the reverse of combineBitcastvxi1.
//
// N is expected to be a SIGN_EXTEND, ZERO_EXTEND or ANY_EXTEND; any other
// opcode is rejected. The combine only runs before operation legalization,
// and only on subtargets that have SSE2 but not AVX512. On success the
// replacement vector value is returned; otherwise an empty SDValue.
//
// The replacement is built in three steps:
//  1. broadcast the scalar integer so that every result element holds the
//     scalar bits that contain "its" bool,
//  2. AND each element with a single-bit mask selecting that bool,
//  3. compare the masked value with the mask for equality and sign-extend
//     the compare result (followed by a logical shift right for the
//     non-sign-extend opcodes).
static SDValue
combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
unsigned Opcode = N->getOpcode();
if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
Opcode != ISD::ANY_EXTEND)
return SDValue();
if (!DCI.isBeforeLegalizeOps())
return SDValue();
if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
return SDValue();
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT SVT = VT.getScalarType();
EVT InSVT = N0.getValueType().getScalarType();
unsigned EltSizeInBits = SVT.getSizeInBits();
// Input type must be extending a bool vector (bit-casted from a scalar
// integer) to legal integer types.
if (!VT.isVector())
return SDValue();
if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
return SDValue();
if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
return SDValue();
// N00 is the value that was bitcast to the bool vector; it must be a scalar
// integer.
SDValue N00 = N0.getOperand(0);
EVT SclVT = N0.getOperand(0).getValueType();
if (!SclVT.isScalarInteger())
return SDValue();
SDLoc DL(N);
SDValue Vec;
SmallVector<int, 32> ShuffleMask;
unsigned NumElts = VT.getVectorNumElements();
// One scalar bit per vector element.
assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
// Broadcast the scalar integer to the vector elements.
if (NumElts > EltSizeInBits) {
// If the scalar integer is greater than the vector element size, then we
// must split it down into sub-sections for broadcasting. For example:
// i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
// i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
unsigned Scale = NumElts / EltSizeInBits;
EVT BroadcastVT =
EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
Vec = DAG.getBitcast(VT, Vec);
// Sub-section i of the scalar is repeated EltSizeInBits times, giving
// Scale * EltSizeInBits == NumElts mask entries in total.
for (unsigned i = 0; i != Scale; ++i)
ShuffleMask.append(EltSizeInBits, i);
} else {
// For smaller scalar integers, we can simply any-extend it to the vector
// element size (we don't care about the upper bits) and broadcast it to all
// elements.
SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
// Splat element 0 into every lane.
ShuffleMask.append(NumElts, 0);
}
Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
// Now, mask the relevant bit in each element.
SmallVector<SDValue, 32> Bits;
for (unsigned i = 0; i != NumElts; ++i) {
// Element i tests bit (i % EltSizeInBits) of the value broadcast into it.
int BitIdx = (i % EltSizeInBits);
APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
Bits.push_back(DAG.getConstant(Bit, DL, SVT));
}
SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
// Compare against the bitmask and extend the result.
EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
// For SEXT, this is now done, otherwise shift the result down for
// zero-extension.
// Note that ANY_EXTEND takes the same shift path as ZERO_EXTEND.
if (Opcode == ISD::SIGN_EXTEND)
return Vec;
return DAG.getNode(ISD::SRL, DL, VT, Vec,
DAG.getConstant(EltSizeInBits - 1, DL, VT));
}
/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
/// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating
/// with UNDEFs) of the input to vectors of the same size as the target type
/// which then extends the lowest elements.
///
/// \param N the extend node; anything other than ISD::SIGN_EXTEND or
/// ISD::ZERO_EXTEND is rejected.
/// \param DAG the DAG in which replacement nodes are created.
/// \param DCI only used to check that operation legalization has not run yet.
/// \param Subtarget queried for SSE2 / SSE4.1 / AVX / AVX512 availability.
/// \returns the replacement value, or an empty SDValue when no combine applies.
static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
// This combine is disabled entirely under the experimental widening
// legalization flag.
if (ExperimentalVectorWideningLegalization)
return SDValue();
unsigned Opcode = N->getOpcode();
// TODO - add ANY_EXTEND support.
if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
return SDValue();
if (!DCI.isBeforeLegalizeOps())
return SDValue();
if (!Subtarget.hasSSE2())
return SDValue();
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT SVT = VT.getScalarType();
EVT InVT = N0.getValueType();
EVT InSVT = InVT.getScalarType();
// FIXME: Generic DAGCombiner previously had a bug that would cause a
// sign_extend of setcc to sometimes return the original node and tricked it
// into thinking CombineTo was used which prevented the target combines from
// running.
// Bail out early here to avoid regressions like this
// (v4i32 (sext (v4i1 (setcc (v4i16)))))
// Becomes
// (v4i32 (sext_invec (v8i16 (concat (v4i16 (setcc (v4i16))), undef))))
// Type legalized to
// (v4i32 (sext_invec (v8i16 (trunc_invec (v4i32 (setcc (v4i32)))))))
// Leading to a packssdw+pmovsxwd
// We could write a DAG combine to fix this, but really we shouldn't be
// creating sext_invec that's forcing v8i16 into the DAG.
if (N0.getOpcode() == ISD::SETCC)
return SDValue();
// Input type must be a vector and we must be extending legal integer types.
if (!VT.isVector() || VT.getVectorNumElements() < 2)
return SDValue();
if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
return SDValue();
if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
return SDValue();
// If the input/output types are both legal then we have at least AVX1 and
// we will be able to use SIGN_EXTEND/ZERO_EXTEND directly.
if (DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
DAG.getTargetLoweringInfo().isTypeLegal(InVT))
return SDValue();
SDLoc DL(N);
// Widen N to a vector of Size bits with the same element type by
// concatenating it with UNDEF vectors; N becomes the lowest subvector.
auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
EVT SrcVT = N.getValueType();
EVT DstVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(),
Size / SrcVT.getScalarSizeInBits());
SmallVector<SDValue, 8> Opnds(Size / SrcVT.getSizeInBits(),
DAG.getUNDEF(SrcVT));
Opnds[0] = N;
return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Opnds);
};
// If target-size is less than 128-bits, extend to a type that would extend
// to 128 bits, extend that and extract the original target vector.
if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
unsigned Scale = 128 / VT.getSizeInBits();
EVT ExVT =
EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
// The input is widened by the same factor as the result so that both
// vectors keep the same number of elements.
SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
DAG.getIntPtrConstant(0, DL));
}
// If target-size is 128-bits (or 256-bits on AVX target), then convert to
// ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
// Also use this if we don't have SSE41 to allow the legalizer to do its job.
if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
(VT.is256BitVector() && Subtarget.hasAVX()) ||
(VT.is512BitVector() && Subtarget.useAVX512Regs())) {
SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);
return DAG.getNode(Opcode, DL, VT, ExOp);
}
// Split the result into SplitSize-bit pieces: each piece extracts the
// matching run of input elements, widens it with UNDEFs to SplitSize bits,
// extends it in-register, and the pieces are concatenated back into VT.
auto SplitAndExtendInReg = [&](unsigned SplitSize) {
unsigned NumVecs = VT.getSizeInBits() / SplitSize;
unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
unsigned IROpc = getOpcode_EXTEND_VECTOR_INREG(Opcode);
SmallVector<SDValue, 8> Opnds;
for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
DAG.getIntPtrConstant(Offset, DL));
SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
SrcVec = DAG.getNode(IROpc, DL, SubVT, SrcVec);
Opnds.push_back(SrcVec);
}
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
};
// On pre-AVX targets, split into 128-bit nodes of
// ISD::*_EXTEND_VECTOR_INREG.
if (!Subtarget.hasAVX() && !(VT.getSizeInBits() % 128))
return SplitAndExtendInReg(128);
// On pre-AVX512 targets, split into 256-bit nodes of
// ISD::*_EXTEND_VECTOR_INREG.
if (!Subtarget.useAVX512Regs() && !(VT.getSizeInBits() % 256))
return SplitAndExtendInReg(256);
return SDValue();
}
// Try to fold (sext/zext (setcc)) into a single setcc that directly produces
// the extended vector type. Returns an empty SDValue if the fold does not
// apply.
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  SDValue Cmp = N->getOperand(0);
  EVT ResVT = N->getValueType(0);

  // This fold is restricted to vector extends of a setcc on AVX512 targets.
  if (!Subtarget.hasAVX512())
    return SDValue();
  if (!ResVT.isVector())
    return SDValue();
  if (Cmp.getOpcode() != ISD::SETCC)
    return SDValue();

  // The result element type must be one of the legal scalar types.
  EVT EltVT = ResVT.getVectorElementType();
  bool IsLegalElt = EltVT == MVT::i8 || EltVT == MVT::i16 ||
                    EltVT == MVT::i32 || EltVT == MVT::i64 ||
                    EltVT == MVT::f32 || EltVT == MVT::f64;
  if (!IsLegalElt)
    return SDValue();

  // Results wider than 256 bits are not handled here.
  unsigned ResBits = ResVT.getSizeInBits();
  if (ResBits > 256)
    return SDValue();

  // PCMPEQ/PCMPGT are the only integer compares available, so unsigned
  // predicates are rejected.
  ISD::CondCode Pred = cast<CondCodeSDNode>(Cmp.getOperand(2))->get();
  if (ISD::isUnsignedIntSetCC(Pred))
    return SDValue();

  // The setcc operands must be exactly as wide as the extended result so
  // that the extension is fully consumed by the compare.
  EVT CmpOpIntVT =
      Cmp.getOperand(0).getValueType().changeVectorElementTypeToInteger();
  if (CmpOpIntVT.getSizeInBits() != ResBits)
    return SDValue();

  SDLoc dl(N);
  SDValue WideCmp =
      DAG.getSetCC(dl, ResVT, Cmp.getOperand(0), Cmp.getOperand(1), Pred);
  if (N->getOpcode() != ISD::ZERO_EXTEND)
    return WideCmp;

  // For a zero-extend, keep only the bits of the original setcc element type.
  return DAG.getZeroExtendInReg(WideCmp, dl,
                                Cmp.getValueType().getScalarType());
}
/// Target-specific combine for sign-extend nodes. Tries each of the
/// sub-combines below in turn and returns the first replacement found, or an
/// empty SDValue if none applies.
static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  SDValue Src = N->getOperand(0);
  EVT DstVT = N->getValueType(0);

  // The CMOV combine is attempted regardless of the legalization phase.
  if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
    return NewCMov;

  // Everything else only runs before operation legalization.
  if (!DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue ExtCmp = combineExtSetcc(N, DAG, Subtarget))
    return ExtCmp;

  // Invert and sign-extend a boolean is the same as zero-extend and subtract
  // 1 because 0 becomes -1 and 1 becomes 0. The subtract is efficiently
  // lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1.
  // sext (xor Bool, -1) --> sub (zext Bool), 1
  bool IsInvertedBool = Src.getValueType() == MVT::i1 &&
                        Src.getOpcode() == ISD::XOR &&
                        isAllOnesConstant(Src.getOperand(1)) &&
                        Src.hasOneUse();
  if (IsInvertedBool) {
    SDLoc DL(N);
    SDValue Wide = DAG.getNode(ISD::ZERO_EXTEND, DL, DstVT, Src.getOperand(0));
    SDValue One = DAG.getConstant(1, DL, DstVT);
    return DAG.getNode(ISD::SUB, DL, DstVT, Wide, One);
  }

  if (SDValue InReg = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
    return InReg;

  if (SDValue BoolInReg =
          combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
    return BoolInReg;

  if (DstVT.isVector()) {
    if (SDValue Promoted = PromoteMaskArithmetic(N, DAG, Subtarget))
      return Promoted;
  }

  // Last attempt; yields an empty SDValue when it does not apply.
  return promoteExtBeforeAdd(N, DAG, Subtarget);
}
/// Map an FMA-family opcode to the opcode obtained by negating the product
/// term (\p NegMul) and/or the accumulator term (\p NegAcc). When both flags
/// are set, the product negation is applied first and the accumulator
/// negation is applied to its result. Opcodes outside the handled set hit
/// llvm_unreachable.
static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc) {
  // Toggle the sign of the multiplication result.
  auto ToggleMul = [](unsigned Opc) -> unsigned {
    switch (Opc) {
    case ISD::FMA:           return X86ISD::FNMADD;
    case X86ISD::FMADD_RND:  return X86ISD::FNMADD_RND;
    case X86ISD::FMSUB:      return X86ISD::FNMSUB;
    case X86ISD::FMSUB_RND:  return X86ISD::FNMSUB_RND;
    case X86ISD::FNMADD:     return ISD::FMA;
    case X86ISD::FNMADD_RND: return X86ISD::FMADD_RND;
    case X86ISD::FNMSUB:     return X86ISD::FMSUB;
    case X86ISD::FNMSUB_RND: return X86ISD::FMSUB_RND;
    default:
      llvm_unreachable("Unexpected opcode");
    }
  };

  // Toggle the sign of the accumulator operand.
  auto ToggleAcc = [](unsigned Opc) -> unsigned {
    switch (Opc) {
    case ISD::FMA:           return X86ISD::FMSUB;
    case X86ISD::FMADD_RND:  return X86ISD::FMSUB_RND;
    case X86ISD::FMSUB:      return ISD::FMA;
    case X86ISD::FMSUB_RND:  return X86ISD::FMADD_RND;
    case X86ISD::FNMADD:     return X86ISD::FNMSUB;
    case X86ISD::FNMADD_RND: return X86ISD::FNMSUB_RND;
    case X86ISD::FNMSUB:     return X86ISD::FNMADD;
    case X86ISD::FNMSUB_RND: return X86ISD::FNMADD_RND;
    default:
      llvm_unreachable("Unexpected opcode");
    }
  };

  unsigned Result = Opcode;
  if (NegMul)
    Result = ToggleMul(Result);
  if (NegAcc)
    Result = ToggleAcc(Result);
  return Result;
}
/// Fold negations of the operands of an FMA-family node into the opcode.
/// Each of the three operands that is an FNEG (or an extract of element 0
/// from an FNEG) has the negation stripped, and the opcode is adjusted through
/// negateFMAOpcode to compensate. Returns an empty SDValue if the type is not
/// handled or no operand was negated.
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);

  // Let legalize expand this if it isn't a legal type yet.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isTypeLegal(VT))
    return SDValue();

  // Only f32/f64 elements on targets with some form of FMA are handled.
  EVT EltVT = VT.getScalarType();
  bool IsF32OrF64 = EltVT == MVT::f32 || EltVT == MVT::f64;
  if (!IsF32OrF64 || !Subtarget.hasAnyFMA())
    return SDValue();

  SDValue MulLHS = N->getOperand(0);
  SDValue MulRHS = N->getOperand(1);
  SDValue Acc = N->getOperand(2);

  // If Op is a negated value, replace it in place with the un-negated value
  // and report true; otherwise leave it untouched and report false.
  auto StripNegation = [&DAG](SDValue &Op) -> bool {
    if (SDValue Inner = isFNEG(DAG, Op.getNode())) {
      Op = DAG.getBitcast(Op.getValueType(), Inner);
      return true;
    }

    // Look through an extract of element 0: if the vector comes from an
    // FNEG, build a new extract from the FNEG input instead.
    if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        !isNullConstant(Op.getOperand(1)))
      return false;

    SDValue Vec = Op.getOperand(0);
    SDValue Inner = isFNEG(DAG, Vec.getNode());
    if (!Inner)
      return false;

    Inner = DAG.getBitcast(Vec.getValueType(), Inner);
    Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op), Op.getValueType(),
                     Inner, Op.getOperand(1));
    return true;
  };

  // Do not convert the passthru input of scalar intrinsics.
  // FIXME: We could allow negations of the lower element only.
  bool NegLHS = StripNegation(MulLHS);
  bool NegRHS = StripNegation(MulRHS);
  bool NegAcc = StripNegation(Acc);
  if (!(NegLHS || NegRHS || NegAcc))
    return SDValue();

  // Negating both multiplicands cancels out, hence the inequality test.
  unsigned NewOpcode =
      negateFMAOpcode(N->getOpcode(), NegLHS != NegRHS, NegAcc);

  SDLoc dl(N);
  // Four-operand forms carry one extra trailing operand; forward it as is.
  if (N->getNumOperands() == 4)
    return DAG.getNode(NewOpcode, dl, VT, MulLHS, MulRHS, Acc,
                       N->getOperand(3));
  return DAG.getNode(NewOpcode, dl, VT, MulLHS, MulRHS, Acc);
}
// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C), and the mirrored
// FMSUBADD -> FMADDSUB form, including the _RND variants. Returns an empty
// SDValue when the third operand is not a negation of a value of the result
// type.
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);

  // FIXME: Should we bitcast instead of bailing out on a type mismatch?
  SDValue Unnegated = isFNEG(DAG, N->getOperand(2).getNode());
  if (!Unnegated || Unnegated.getValueType() != VT)
    return SDValue();

  // Swap the add/sub pattern to absorb the negation.
  unsigned FlippedOpc;
  switch (N->getOpcode()) {
  case X86ISD::FMADDSUB:
    FlippedOpc = X86ISD::FMSUBADD;
    break;
  case X86ISD::FMADDSUB_RND:
    FlippedOpc = X86ISD::FMSUBADD_RND;
    break;
  case X86ISD::FMSUBADD:
    FlippedOpc = X86ISD::FMADDSUB;
    break;
  case X86ISD::FMSUBADD_RND:
    FlippedOpc = X86ISD::FMADDSUB_RND;
    break;
  default:
    llvm_unreachable("Unexpected opcode!");
  }

  SDLoc dl(N);
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  // Four-operand forms carry one extra trailing operand; forward it as is.
  if (N->getNumOperands() == 4)
    return DAG.getNode(FlippedOpc, dl, VT, Op0, Op1, Unnegated,
                       N->getOperand(3));
  return DAG.getNode(FlippedOpc, dl, VT, Op0, Op1, Unnegated);
}
/// Target-specific combine for zero-extend nodes. Tries each of the
/// sub-combines below in turn and returns the first replacement found, or an
/// empty SDValue if none applies.
static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  SDLoc dl(N);
  SDValue Src = N->getOperand(0);
  EVT DstVT = N->getValueType(0);
  unsigned SrcOpc = Src.getOpcode();

  // Re-create a SETCC_CARRY directly in the destination type and mask it
  // down to its low bit.
  auto WidenCarry = [&](SDValue Carry) {
    SDValue Wide = DAG.getNode(X86ISD::SETCC_CARRY, dl, DstVT,
                               Carry.getOperand(0), Carry.getOperand(1));
    return DAG.getNode(ISD::AND, dl, DstVT, Wide,
                       DAG.getConstant(1, dl, DstVT));
  };

  // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
  // (and (i32 x86isd::setcc_carry), 1)
  // and likewise when the carry is narrowed by a truncate instead of an AND.
  // This eliminates the zext. This transformation is necessary because
  // ISD::SETCC is always legalized to i8.
  if ((SrcOpc == ISD::AND || SrcOpc == ISD::TRUNCATE) && Src.hasOneUse() &&
      Src.getOperand(0).hasOneUse()) {
    SDValue Inner = Src.getOperand(0);
    if (Inner.getOpcode() == X86ISD::SETCC_CARRY) {
      // For the AND form, a mask other than 1 ends the whole combine.
      if (SrcOpc == ISD::AND && !isOneConstant(Src.getOperand(1)))
        return SDValue();
      return WidenCarry(Inner);
    }
  }

  if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
    return NewCMov;

  if (DCI.isBeforeLegalizeOps()) {
    if (SDValue ExtCmp = combineExtSetcc(N, DAG, Subtarget))
      return ExtCmp;
  }

  if (SDValue InReg = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
    return InReg;

  if (SDValue BoolInReg =
          combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
    return BoolInReg;

  if (DstVT.isVector()) {
    if (SDValue Promoted = PromoteMaskArithmetic(N, DAG, Subtarget))
      return Promoted;
  }

  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
    return NewAdd;

  if (SDValue Ctlz = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
    return Ctlz;

  // A zext of a 128-bit PACKUS whose inputs already have zero upper halves in
  // every element is just the concatenation of those inputs.
  // TODO: Combine with any target/faux shuffle.
  if (SrcOpc != X86ISD::PACKUS || Src.getValueSizeInBits() != 128)
    return SDValue();

  SDValue PackLo = Src.getOperand(0);
  SDValue PackHi = Src.getOperand(1);
  unsigned NumSrcEltBits = PackLo.getScalarValueSizeInBits();
  if (DstVT.getScalarSizeInBits() != NumSrcEltBits)
    return SDValue();

  unsigned NumSrcElts = PackLo.getValueType().getVectorNumElements();
  APInt UpperHalf = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
  bool LoIsClean = PackLo.isUndef() || DAG.MaskedValueIsZero(PackLo, UpperHalf);
  if (!LoIsClean)
    return SDValue();
  bool HiIsClean = PackHi.isUndef() || DAG.MaskedValueIsZero(PackHi, UpperHalf);
  if (!HiIsClean)
    return SDValue();

  return concatSubVectors(PackLo, PackHi, DstVT, NumSrcElts * 2, DAG, dl, 128);
}
/// Try to lower an equality comparison of 128-bit or wider scalar integers to
/// vector compare instructions before type legalization would split it into
/// chunks. \p SetCC must use SETEQ or SETNE. Returns an empty SDValue when the
/// operands are unsuitable or the subtarget lacks a matching vector width.
static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
                                               const X86Subtarget &Subtarget) {
  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
  assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");

  // We're looking for an oversized integer equality comparison.
  SDValue LHS = SetCC->getOperand(0);
  SDValue RHS = SetCC->getOperand(1);
  EVT OpVT = LHS.getValueType();
  unsigned OpSize = OpVT.getSizeInBits();
  if (!OpVT.isScalarInteger() || OpSize < 128)
    return SDValue();

  // Ignore a comparison with zero because that gets special treatment in
  // EmitTest(). But make an exception for the special case of a pair of
  // logically-combined vector-sized operands compared to zero. This pattern
  // may be generated by the memcmp expansion pass with oversized integer
  // compares (see PR33325).
  const bool RHSIsZero = isNullConstant(RHS);
  const bool IsOrXorXorCCZero = RHSIsZero && LHS.getOpcode() == ISD::OR &&
                                LHS.getOperand(0).getOpcode() == ISD::XOR &&
                                LHS.getOperand(1).getOpcode() == ISD::XOR;
  if (RHSIsZero && !IsOrXorXorCCZero)
    return SDValue();

  // Don't perform this combine if constructing the vector will be expensive:
  // constants, values that are already vectors, and loads are cheap.
  auto IsVectorBitCastCheap = [](SDValue V) {
    V = peekThroughBitcasts(V);
    return isa<ConstantSDNode>(V) || V.getValueType().isVector() ||
           V.getOpcode() == ISD::LOAD;
  };
  if (!IsOrXorXorCCZero &&
      !(IsVectorBitCastCheap(LHS) && IsVectorBitCastCheap(RHS)))
    return SDValue();

  // TODO: Use PXOR + PTEST for SSE4.1 or later?
  EVT VT = SetCC->getValueType(0);
  SDLoc DL(SetCC);

  // Choose the vector type for the operand width, provided the subtarget
  // supports it.
  EVT VecVT;
  bool HasVectorCompare = false;
  switch (OpSize) {
  case 128:
    HasVectorCompare = Subtarget.hasSSE2();
    VecVT = MVT::v16i8;
    break;
  case 256:
    HasVectorCompare = Subtarget.hasAVX2();
    VecVT = MVT::v32i8;
    break;
  case 512:
    HasVectorCompare = Subtarget.useAVX512Regs();
    VecVT = MVT::v16i32;
    break;
  default:
    break;
  }
  if (!HasVectorCompare)
    return SDValue();

  // 512-bit compares produce a v16i1 mask; narrower ones produce a vector of
  // the operand type.
  EVT CmpVT = VecVT;
  if (OpSize == 512)
    CmpVT = MVT::v16i1;

  SDValue Cmp;
  if (!IsOrXorXorCCZero) {
    SDValue VecLHS = DAG.getBitcast(VecVT, LHS);
    SDValue VecRHS = DAG.getBitcast(VecVT, RHS);
    Cmp = DAG.getSetCC(DL, CmpVT, VecLHS, VecRHS, ISD::SETEQ);
  } else {
    // This is a bitwise-combined equality comparison of 2 pairs of vectors:
    // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
    // Use 2 vector equality compares and 'and' the results before doing a
    // MOVMSK.
    SDValue Xor0 = LHS.getOperand(0);
    SDValue Xor1 = LHS.getOperand(1);
    SDValue A = DAG.getBitcast(VecVT, Xor0.getOperand(0));
    SDValue B = DAG.getBitcast(VecVT, Xor0.getOperand(1));
    SDValue C = DAG.getBitcast(VecVT, Xor1.getOperand(0));
    SDValue D = DAG.getBitcast(VecVT, Xor1.getOperand(1));
    SDValue EqAB = DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
    SDValue EqCD = DAG.getSetCC(DL, CmpVT, C, D, ISD::SETEQ);
    Cmp = DAG.getNode(ISD::AND, DL, CmpVT, EqAB, EqCD);
  }

  // For 512-bits we want to emit a setcc that will lower to kortest.
  if (OpSize == 512) {
    SDValue MaskBits = DAG.getBitcast(MVT::i16, Cmp);
    SDValue AllSet = DAG.getConstant(0xFFFF, DL, MVT::i16);
    return DAG.getSetCC(DL, VT, MaskBits, AllSet, CC);
  }

  // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
  // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
  // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
  // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
  // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
  SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
  SDValue FFFFs =
      DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL, MVT::i32);
  return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
}
/// Target-specific combine for ISD::SETCC nodes.
///
/// Handles, in order:
///  - folding a negation on either side of an equality compare into an add
///    compared against zero,
///  - oversized scalar integer equality via vector compares,
///  - vXi1 compares of a sign-extended vXi1 value against an all-zeros vector,
///  - pre-promoting vXi8/vXi16 compares on AVX512 targets without BWI,
///  - early lowering of v4f32 compares on SSE1-only targets.
///
/// \returns the replacement value, or an empty SDValue if nothing applies.
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = N->getValueType(0);
  EVT OpVT = LHS.getValueType();
  SDLoc DL(N);

  if (CC == ISD::SETNE || CC == ISD::SETEQ) {
    // 0-x == y --> x+y == 0
    // 0-x != y --> x+y != 0
    if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
        LHS.hasOneUse()) {
      SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
      return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
    }
    // x == 0-y --> x+y == 0
    // x != 0-y --> x+y != 0
    if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
        RHS.hasOneUse()) {
      SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
      return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
    }

    if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
      return V;
  }

  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
      (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
    // Work on temporaries: if this pattern does not match, the combines below
    // rebuild the compare from LHS/RHS together with the node's original
    // condition-code operand, so LHS/RHS/CC must keep their original order.
    SDValue Op0 = LHS;
    SDValue Op1 = RHS;
    ISD::CondCode TmpCC = CC;

    // Put build_vectors on the right.
    if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
      std::swap(Op0, Op1);
      TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
    }

    bool IsSEXT0 =
        (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
        (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
    bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());

    if (IsSEXT0 && IsVZero1) {
      assert(VT == Op0.getOperand(0).getValueType() &&
             "Uexpected operand type");
      // Each lane of the sign-extended boolean is 0 or -1, so the result of
      // comparing it with zero is known or is the boolean (possibly inverted).
      if (TmpCC == ISD::SETGT)
        return DAG.getConstant(0, DL, VT);
      if (TmpCC == ISD::SETLE)
        return DAG.getConstant(1, DL, VT);
      if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
        return DAG.getNOT(DL, Op0.getOperand(0), VT);

      assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
             "Unexpected condition code!");
      return Op0.getOperand(0);
    }
  }

  // If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
  // pre-promote its result type since vXi1 vectors don't get promoted
  // during type legalization.
  // NOTE: The element count check is to ignore operand types that need to
  // go through type promotion to a 128-bit vector.
  if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
      VT.getVectorElementType() == MVT::i1 &&
      (ExperimentalVectorWideningLegalization ||
       VT.getVectorNumElements() > 4) &&
      (OpVT.getVectorElementType() == MVT::i8 ||
       OpVT.getVectorElementType() == MVT::i16)) {
    SDValue Setcc = DAG.getNode(ISD::SETCC, DL, OpVT, LHS, RHS,
                                N->getOperand(2));
    return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
  }

  // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
  // to avoid scalarization via legalization because v4i32 is not a legal type.
  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
      LHS.getValueType() == MVT::v4f32)
    return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);

  return SDValue();
}
/// Combine for X86ISD::MOVMSK nodes: constant-fold build vectors of constants,
/// look through element-width-preserving bitcasts, hoist a NOT out of the
/// source, and finally try to simplify the node through its demanded bits.
/// Returns an empty SDValue if nothing changed.
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Vec = N->getOperand(0);
  MVT VecVT = Vec.getSimpleValueType();
  MVT ResVT = N->getSimpleValueType(0);
  unsigned ResBits = ResVT.getScalarSizeInBits();
  unsigned NumVecElts = VecVT.getVectorNumElements();

  // Perform constant folding: bit i of the result is set when element i is a
  // negative constant; undef elements contribute a zero bit.
  if (ISD::isBuildVectorOfConstantSDNodes(Vec.getNode())) {
    assert(ResVT == MVT::i32 && "Unexpected result type");
    APInt SignBits(32, 0);
    unsigned NumOps = Vec.getNumOperands();
    for (unsigned i = 0; i < NumOps; ++i) {
      if (Vec.getOperand(i).isUndef())
        continue;
      if (Vec.getConstantOperandAPInt(i).isNegative())
        SignBits.setBit(i);
    }
    return DAG.getConstant(SignBits, SDLoc(N), ResVT);
  }

  // Look through int->fp bitcasts that don't change the element width.
  if (Vec.getOpcode() == ISD::BITCAST) {
    SDValue Inner = Vec.getOperand(0);
    if (Inner.getScalarValueSizeInBits() == VecVT.getScalarSizeInBits())
      return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), ResVT, Inner);
  }

  // Fold movmsk(not(x)) -> not(movmsk) to improve folding of movmsk results
  // with scalar comparisons. Only the low NumVecElts bits are inverted.
  if (SDValue Inverted = IsNOT(Vec, DAG)) {
    SDLoc DL(N);
    APInt LaneMask = APInt::getLowBitsSet(ResBits, NumVecElts);
    Inverted = DAG.getBitcast(VecVT, Inverted);
    SDValue Msk = DAG.getNode(X86ISD::MOVMSK, DL, ResVT, Inverted);
    return DAG.getNode(ISD::XOR, DL, ResVT, Msk,
                       DAG.getConstant(LaneMask, DL, ResVT));
  }

  // Simplify the inputs, demanding every bit of the result.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  APInt AllBits(APInt::getAllOnesValue(ResBits));
  if (!TLI.SimplifyDemandedBits(SDValue(N, 0), AllBits, DCI))
    return SDValue();
  return SDValue(N, 0);
}
/// Combine for gather/scatter nodes. Before operation legalization this
/// simplifies the index operand (operand 4): it drops redundant extensions
/// and forces the element type to i32 or i64. Without AVX512 it also
/// simplifies the mask operand (operand 2), of which only the sign bit is
/// demanded.
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const X86Subtarget &Subtarget) {
  SDLoc DL(N);

  // Replace the index operand of N. If N was updated in place, re-queue N
  // and, when given, the old index node: that node has one fewer user now and
  // may need to be removed.
  auto ReplaceIndex = [&](SDValue NewIndex, SDNode *OldIndexNode) {
    SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
    Ops[4] = NewIndex;
    SDNode *Updated = DAG.UpdateNodeOperands(N, Ops);
    if (Updated == N) {
      if (OldIndexNode)
        DCI.AddToWorklist(OldIndexNode);
      DCI.AddToWorklist(N);
    }
    return SDValue(Updated, 0);
  };

  if (DCI.isBeforeLegalizeOps()) {
    SDValue Index = N->getOperand(4);
    unsigned IndexBits = Index.getScalarValueSizeInBits();

    // Remove any sign extends from 32 or smaller to larger than 32.
    // Only do this before LegalizeOps in case we need the sign extend for
    // legalization.
    if (Index.getOpcode() == ISD::SIGN_EXTEND && IndexBits > 32 &&
        Index.getOperand(0).getScalarValueSizeInBits() <= 32)
      return ReplaceIndex(Index.getOperand(0), Index.getNode());

    // Make sure the index is either i32 or i64.
    if (IndexBits != 32 && IndexBits != 64) {
      MVT EltVT = IndexBits > 32 ? MVT::i64 : MVT::i32;
      EVT IndexVT = EVT::getVectorVT(
          *DAG.getContext(), EltVT, Index.getValueType().getVectorNumElements());
      return ReplaceIndex(DAG.getSExtOrTrunc(Index, DL, IndexVT), nullptr);
    }

    // Try to remove zero extends from 32->64 if we know the sign bit of
    // the input is zero.
    if (Index.getOpcode() == ISD::ZERO_EXTEND && IndexBits == 64 &&
        Index.getOperand(0).getScalarValueSizeInBits() == 32 &&
        DAG.SignBitIsZero(Index.getOperand(0)))
      return ReplaceIndex(Index.getOperand(0), Index.getNode());
  }

  // With AVX2 we only demand the upper bit of the mask.
  if (Subtarget.hasAVX512())
    return SDValue();

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDValue Mask = N->getOperand(2);
  APInt SignBitOnly(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
  if (TLI.SimplifyDemandedBits(Mask, SignBitOnly, DCI))
    return SDValue(N, 0);

  return SDValue();
}
// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
// If the flags/condition-code pair can be simplified, emit a new SETCC on the
// simplified flags; otherwise return an empty SDValue.
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  X86::CondCode CC = static_cast<X86::CondCode>(N->getConstantOperandVal(0));
  SDValue EFLAGS = N->getOperand(1);

  // CC is handed to the helper as an lvalue and read again afterwards.
  // NOTE(review): presumably the helper may rewrite it - confirm.
  SDValue NewFlags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget);
  if (!NewFlags)
    return SDValue();

  return getSETCC(CC, NewFlags, DL, DAG);
}
/// Optimize branch condition evaluation: if the EFLAGS/condition-code pair of
/// the branch can be simplified, rebuild the X86ISD::BRCOND on the simplified
/// flags. Returns an empty SDValue otherwise.
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
                             const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  SDValue EFLAGS = N->getOperand(3);
  X86::CondCode CC = static_cast<X86::CondCode>(N->getConstantOperandVal(2));

  // Try to simplify the EFLAGS and condition code operands.
  SDValue NewFlags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget);
  if (!NewFlags)
    return SDValue();

  // The remaining operands are deliberately read from N only now:
  // combineSetCCEFLAGS can RAUW them under us, so no references to them are
  // kept across the call.
  SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
  return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
                     N->getOperand(1), Cond, NewFlags);
}
/// Take advantage of vector comparisons producing 0 or -1 in each lane to
/// apply a unary operation to the constant mask instead of the masked value:
///
///   UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
///     AND(VECTOR_CMP(x,y), constant2)
///   constant2 = UNARYOP(constant)
///
/// Returns an empty SDValue when \p N does not match this shape.
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
                                                  SelectionDAG &DAG) {
  // Bail out unless this is a vector operation whose operand is a bitwise AND
  // of a SETCC and whose result has the same total size as that AND.
  EVT VT = N->getValueType(0);
  if (!VT.isVector())
    return SDValue();

  SDValue Masked = N->getOperand(0);
  if (Masked.getOpcode() != ISD::AND)
    return SDValue();

  SDValue Cmp = Masked.getOperand(0);
  if (Cmp.getOpcode() != ISD::SETCC)
    return SDValue();

  if (VT.getSizeInBits() != Masked.getValueSizeInBits())
    return SDValue();

  // The other AND operand must be a constant build vector. We could make the
  // transformation for non-constant splats as well, but it's unclear that
  // would be a benefit as it would not eliminate any operations, just perform
  // one more step in scalar code before moving to the vector unit.
  auto *MaskBV = dyn_cast<BuildVectorSDNode>(Masked.getOperand(1));
  if (!MaskBV || !MaskBV->isConstant())
    return SDValue();

  // Everything checks out. Apply the unary op to the constant itself.
  SDLoc DL(N);
  EVT IntVT = MaskBV->getValueType(0);
  SDValue Folded = DAG.getNode(N->getOpcode(), DL, VT, SDValue(MaskBV, 0));

  // The AND node needs bitcasts to/from an integer vector type around it.
  SDValue NewMask = DAG.getBitcast(IntVT, Folded);
  SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Cmp, NewMask);
  return DAG.getBitcast(VT, NewAnd);
}
/// When converting to floating-point, try to replace a scalar truncate of an
/// extracted vector element 0 with an extract from a bitcast of the vector:
///
///   inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
///
/// This tries to keep the sequence on XMM registers rather than moving
/// between vector and GPRs. Returns an empty SDValue if the pattern does not
/// match.
static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
  // TODO: This is currently only used by combineSIntToFP, but it is generalized
  // to allow being called by any similar cast opcode.
  // TODO: Consider merging this into lowering: vectorizeExtractedCast().
  SDValue Narrowed = N->getOperand(0);
  if (Narrowed.getOpcode() != ISD::TRUNCATE || !Narrowed.hasOneUse())
    return SDValue();

  SDValue Extract = Narrowed.getOperand(0);
  if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !Extract.hasOneUse())
    return SDValue();
  if (!isNullConstant(Extract.getOperand(1)))
    return SDValue();

  // The extracted element must be a whole multiple of the truncated width so
  // the source vector can be re-viewed as a vector of the narrow type.
  EVT NarrowVT = Narrowed.getValueType();
  unsigned NarrowBits = NarrowVT.getSizeInBits();
  unsigned WideBits = Extract.getValueType().getSizeInBits();
  if (WideBits % NarrowBits != 0)
    return SDValue();

  SDValue SrcVec = Extract.getOperand(0);
  unsigned NumNarrowElts = SrcVec.getValueType().getSizeInBits() / NarrowBits;
  EVT NarrowVecVT =
      EVT::getVectorVT(*DAG.getContext(), NarrowVT, NumNarrowElts);
  SDValue NarrowVec = DAG.getBitcast(NarrowVecVT, SrcVec);

  SDLoc DL(N);
  SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, NarrowVT,
                                   NarrowVec, Extract.getOperand(1));
  return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtract);
}
/// Combine for unsigned-integer-to-FP conversions: rewrite them as signed
/// conversions whenever that is known to give the same result.
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  SDValue Src = N->getOperand(0);
  EVT DstVT = N->getValueType(0);
  EVT SrcVT = Src.getValueType();

  // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
  // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
  // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
  bool IsNarrowVector = SrcVT.isVector() && SrcVT.getScalarSizeInBits() < 32;
  if (IsNarrowVector) {
    SDLoc dl(N);
    EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                  SrcVT.getVectorNumElements());
    SDValue Widened = DAG.getNode(ISD::ZERO_EXTEND, dl, WideVT, Src);

    // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
    return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Widened);
  }

  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
  // the optimization here.
  if (!DAG.SignBitIsZero(Src))
    return SDValue();
  return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), DstVT, Src);
}
/// Combine for signed-integer-to-FP conversions. Tries, in order: folding the
/// conversion into a constant mask, widening narrow vector inputs to i32,
/// truncating wide inputs whose upper bits are all sign bits, using an x87
/// FILD for i64 loads on 32-bit targets, and replacing a truncated vector
/// extract with a bitcast.
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  // First try to optimize away the conversion entirely when it's
  // conditionally from a constant. Vectors only.
  if (SDValue Folded = combineVectorCompareAndMaskUnaryOp(N, DAG))
    return Folded;

  // Now move on to more general possibilities.
  SDValue Src = N->getOperand(0);
  EVT DstVT = N->getValueType(0);
  EVT SrcVT = Src.getValueType();
  unsigned SrcEltBits = SrcVT.getScalarSizeInBits();

  // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
  // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
  // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
  if (SrcVT.isVector() && SrcEltBits < 32) {
    SDLoc dl(N);
    EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                  SrcVT.getVectorNumElements());
    SDValue Widened = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, Src);
    return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Widened);
  }

  // Without AVX512DQ we only support i64 to float scalar conversion. For both
  // vectors and scalars, see if we know that the upper bits are all the sign
  // bit, in which case we can truncate the input to i32 and convert from that.
  if (SrcEltBits > 32 && !Subtarget.hasDQI()) {
    unsigned NumSignBits = DAG.ComputeNumSignBits(Src);
    if (NumSignBits >= (SrcEltBits - 31)) {
      EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), 32);
      if (SrcVT.isVector())
        NarrowVT = EVT::getVectorVT(*DAG.getContext(), NarrowVT,
                                    SrcVT.getVectorNumElements());
      SDLoc dl(N);
      SDValue Narrowed = DAG.getNode(ISD::TRUNCATE, dl, NarrowVT, Src);
      return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Narrowed);
    }
  }

  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
  // a 32-bit target where SSE doesn't support i64->FP operations.
  bool CanUseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
  if (CanUseX87 && Src.getOpcode() == ISD::LOAD) {
    LoadSDNode *Ld = cast<LoadSDNode>(Src.getNode());
    EVT LdVT = Ld->getValueType(0);

    // This transformation is not supported if the result type is f16 or f128.
    if (DstVT == MVT::f16 || DstVT == MVT::f128)
      return SDValue();

    // If we have AVX512DQ we can use packed conversion instructions unless
    // the VT is f80.
    if (Subtarget.hasDQI() && DstVT != MVT::f80)
      return SDValue();

    // Only a simple, single-use, non-extending scalar i64 load on a 32-bit
    // target is turned into a FILD.
    bool IsSimpleI64Load = !Ld->isVolatile() && !DstVT.isVector() &&
                           ISD::isNON_EXTLoad(Src.getNode()) &&
                           Src.hasOneUse() && !Subtarget.is64Bit() &&
                           LdVT == MVT::i64;
    if (IsSimpleI64Load) {
      SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
          SDValue(N, 0), LdVT, Ld->getChain(), Src, DAG);
      // Users of the load's chain result are redirected to the FILD's chain.
      DAG.ReplaceAllUsesOfValueWith(Src.getValue(1), FILDChain.getValue(1));
      return FILDChain;
    }
  }

  // Last attempt; yields an empty SDValue when it does not apply.
  return combineToFPTruncExtElt(N, DAG);
}
/// Conservatively decide whether any user of the node producing \p Flags
/// depends on more than the result of an equality/sign test: returns true if a
/// user is of an unrecognised kind, or if it tests one of the unsigned
/// (A/AE/B/BE), overflow (O/NO) or signed-ordering (G/GE/L/LE) condition codes.
static bool needCarryOrOverflowFlag(SDValue Flags) {
  assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");

  for (SDNode *FlagUser : Flags->uses()) {
    // Find which operand of the user holds the condition code.
    unsigned CondOpIdx;
    switch (FlagUser->getOpcode()) {
    case X86ISD::SETCC:
    case X86ISD::SETCC_CARRY:
      CondOpIdx = 0;
      break;
    case X86ISD::BRCOND:
    case X86ISD::CMOV:
      CondOpIdx = 2;
      break;
    default:
      // Unknown flag consumer: assume the worst.
      return true;
    }

    const auto Cond =
        static_cast<X86::CondCode>(FlagUser->getConstantOperandVal(CondOpIdx));
    switch (Cond) {
    case X86::COND_A:
    case X86::COND_AE:
    case X86::COND_B:
    case X86::COND_BE:
    case X86::COND_O:
    case X86::COND_NO:
    case X86::COND_G:
    case X86::COND_GE:
    case X86::COND_L:
    case X86::COND_LE:
      return true;
    default:
      break;
    }
  }

  return false;
}
/// Returns true only if every user of the node producing \p Flags is a
/// SETCC/SETCC_CARRY/BRCOND/CMOV whose condition code is COND_E or COND_NE.
/// Any other kind of user makes the answer a conservative false.
static bool onlyZeroFlagUsed(SDValue Flags) {
  assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");

  for (SDNode *FlagUser : Flags->uses()) {
    // Find which operand of the user holds the condition code.
    unsigned CondOpIdx;
    switch (FlagUser->getOpcode()) {
    case X86ISD::SETCC:
    case X86ISD::SETCC_CARRY:
      CondOpIdx = 0;
      break;
    case X86ISD::BRCOND:
    case X86ISD::CMOV:
      CondOpIdx = 2;
      break;
    default:
      // Unknown flag consumer: be conservative.
      return false;
    }

    const auto Cond =
        static_cast<X86::CondCode>(FlagUser->getConstantOperandVal(CondOpIdx));
    const bool IsEqualityTest = Cond == X86::COND_E || Cond == X86::COND_NE;
    if (!IsEqualityTest)
      return false;
  }

  return true;
}
/// Combine an X86ISD::CMP whose right-hand side is zero. Two rewrites are
/// attempted:
///  - a single-use constant SRL/SHL whose compare only feeds equality tests
///    becomes an AND with the equivalent bit mask;
///  - a compare of a single-use truncate of a single-use AND/OR/XOR/ADD/SUB is
///    replaced by the narrow X86-specific op, reusing its flag result.
/// Returns an empty SDValue when neither applies.
static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
  // Nothing to do unless this is a compare against zero (a "test").
  if (!isNullConstant(N->getOperand(1)))
    return SDValue();

  // TODO: Maybe we should try promoting compares that only use the zero flag
  // first if we can prove the upper bits with computeKnownBits?
  SDLoc dl(N);
  SDValue Op = N->getOperand(0);
  EVT VT = Op.getValueType();
  const unsigned SrcOpc = Op.getOpcode();

  // A constant logical shift that is only tested for zero/non-zero is
  // equivalent to masking the bits that survive the shift. The AND form can
  // later be selected as a TEST instruction.
  const bool IsOneUseConstShift =
      (SrcOpc == ISD::SRL || SrcOpc == ISD::SHL) && Op.hasOneUse() &&
      isa<ConstantSDNode>(Op.getOperand(1));
  if (IsOneUseConstShift && onlyZeroFlagUsed(SDValue(N, 0))) {
    unsigned BitWidth = VT.getSizeInBits();
    const APInt &ShAmt = Op.getConstantOperandAPInt(1);
    // Shift amounts >= the bit width are undefined; leave those alone.
    if (ShAmt.ult(BitWidth)) {
      unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
      APInt Mask = SrcOpc == ISD::SRL
                       ? APInt::getHighBitsSet(BitWidth, MaskBits)
                       : APInt::getLowBitsSet(BitWidth, MaskBits);
      // Only proceed when the mask is representable as a signed 32-bit value.
      if (Mask.isSignedIntN(32)) {
        SDValue Masked = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
                                     DAG.getConstant(Mask, dl, VT));
        return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Masked,
                           DAG.getConstant(0, dl, VT));
      }
    }
  }

  // The remaining rewrite needs a truncate that has no other users...
  if (Op.getOpcode() != ISD::TRUNCATE || !Op.hasOneUse())
    return SDValue();

  // ...fed by an arithmetic/logic op that also has no other users.
  SDValue WideOp = Op.getOperand(0);
  if (!WideOp.hasOneUse())
    return SDValue();

  unsigned NewOpc;
  switch (WideOp.getOpcode()) {
  case ISD::AND:
    // AND with an immediate already has dedicated isel handling that produces
    // test instructions, so leave it untouched.
    if (isa<ConstantSDNode>(WideOp.getOperand(1)))
      return SDValue();
    NewOpc = X86ISD::AND;
    break;
  case ISD::OR:
    NewOpc = X86ISD::OR;
    break;
  case ISD::XOR:
    NewOpc = X86ISD::XOR;
    break;
  case ISD::ADD:
  case ISD::SUB:
    // Narrowing changes carry/overflow, so it is only valid when no user of
    // the compare looks at those flags.
    if (needCarryOrOverflowFlag(SDValue(N, 0)))
      return SDValue();
    NewOpc = WideOp.getOpcode() == ISD::ADD ? X86ISD::ADD : X86ISD::SUB;
    break;
  default:
    return SDValue();
  }

  // Narrow both inputs to the truncated type.
  SDValue NarrowLHS = DAG.getNode(ISD::TRUNCATE, dl, VT, WideOp.getOperand(0));
  SDValue NarrowRHS = DAG.getNode(ISD::TRUNCATE, dl, VT, WideOp.getOperand(1));

  // Emit the X86-specific opcode so generic DAG combines do not undo this.
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  SDValue Narrow = DAG.getNode(NewOpc, dl, VTs, NarrowLHS, NarrowRHS);

  // AND keeps an explicit CMP so the test pattern can still be matched.
  if (NewOpc == X86ISD::AND)
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Narrow,
                       DAG.getConstant(0, dl, VT));

  // Everything else hands back the flag result of the narrow op.
  return Narrow.getValue(1);
}
/// Combine an X86ISD::ADD / X86ISD::SUB node (value result 0, flag result 1).
/// If the flag result is unused the node is turned back into a generic
/// ISD::ADD/ISD::SUB. Otherwise any already-existing generic node with the
/// same operands is redirected to reuse this node's value result.
static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
                                TargetLowering::DAGCombinerInfo &DCI) {
  const bool IsAdd = N->getOpcode() == X86ISD::ADD;
  assert((IsAdd || N->getOpcode() == X86ISD::SUB) &&
         "Expected X86ISD::ADD or X86ISD::SUB");

  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  MVT VT = LHS.getSimpleValueType();
  const unsigned GenericOpc = IsAdd ? ISD::ADD : ISD::SUB;

  // Nobody reads the flags: a plain ADD/SUB is enough. The flag result is
  // replaced by a dummy zero constant.
  if (!N->hasAnyUseOfValue(1)) {
    SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
    return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
  }

  // If a generic node computing GenericOpc(A, B) already exists, replace it
  // with this node's value (negated first when requested).
  auto ReuseForGeneric = [&](SDValue A, SDValue B, bool Negate) {
    SDValue Ops[] = {A, B};
    SDNode *Generic = DAG.getNodeIfExists(
        GenericOpc, DAG.getVTList(N->getValueType(0)), Ops);
    if (!Generic)
      return;

    SDValue Replacement(N, 0);
    if (Negate)
      Replacement = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
                                Replacement);
    DCI.CombineTo(Generic, Replacement);
  };

  ReuseForGeneric(LHS, RHS, /*Negate=*/false);
  // With swapped operands a subtraction yields the negated result.
  ReuseForGeneric(RHS, LHS, /*Negate=*/!IsAdd);

  return SDValue();
}
/// Combine an X86ISD::SBB node (operands: LHS, RHS, carry-in flags).
/// Returns the replacement node, or an empty SDValue if nothing applies.
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue CarryIn = N->getOperand(2);

  // If combineCarryThroughADD finds a replacement for the carry input, rebuild
  // the SBB on top of it.
  if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
    MVT VT = N->getSimpleValueType(0);
    return DAG.getNode(X86ISD::SBB, SDLoc(N), DAG.getVTList(VT, MVT::i32), LHS,
                       RHS, Flags);
  }

  // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
  // iff the flag result is dead.
  if (LHS.getOpcode() != ISD::SUB || !isNullConstant(RHS) ||
      N->hasAnyUseOfValue(1))
    return SDValue();

  return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
                     LHS.getOperand(1), CarryIn);
}
// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
//
// Returns the replacement for N, or an empty SDValue if nothing applies.
static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI) {
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue CarryIn = N->getOperand(2);

  // ADC 0, 0, Carry cannot overflow and simply yields the incoming carry bit
  // (0 or 1), i.e. SETCC_CARRY & 1. There is no good way to replace an EFLAGS
  // use, so this is only done while the flag result is dead.
  const bool BothZero = X86::isZeroNode(LHS) && X86::isZeroNode(RHS);
  if (BothZero && SDValue(N, 1).use_empty()) {
    SDLoc DL(N);
    EVT VT = N->getValueType(0);
    SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
    SDValue CondB = DAG.getConstant(X86::COND_B, DL, MVT::i8);
    SDValue SetCarry =
        DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, CondB, CarryIn);
    SDValue Res1 =
        DAG.getNode(ISD::AND, DL, VT, SetCarry, DAG.getConstant(1, DL, VT));
    return DCI.CombineTo(N, Res1, CarryOut);
  }

  // If combineCarryThroughADD finds a replacement for the carry input, rebuild
  // the ADC on top of it.
  if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
    MVT VT = N->getSimpleValueType(0);
    return DAG.getNode(X86ISD::ADC, SDLoc(N), DAG.getVTList(VT, MVT::i32), LHS,
                       RHS, Flags);
  }

  return SDValue();
}
/// If this is an add or subtract where one operand is produced by a cmp+setcc,
/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
/// with CMP+{ADC, SBB}.
///
/// \p N is treated as a subtract when its opcode is ISD::SUB and as an add
/// otherwise. Returns the replacement value, or an empty SDValue when no
/// pattern matched.
static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
bool IsSub = N->getOpcode() == ISD::SUB;
SDValue X = N->getOperand(0);
SDValue Y = N->getOperand(1);
// If this is an add, canonicalize a zext operand to the RHS.
// TODO: Incomplete? What if both sides are zexts?
if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
Y.getOpcode() != ISD::ZERO_EXTEND)
std::swap(X, Y);
// Look through a one-use zext.
bool PeekedThroughZext = false;
if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
Y = Y.getOperand(0);
PeekedThroughZext = true;
}
// If this is an add, canonicalize a setcc operand to the RHS.
// TODO: Incomplete? What if both sides are setcc?
// TODO: Should we allow peeking through a zext of the other operand?
if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
Y.getOpcode() != X86ISD::SETCC)
std::swap(X, Y);
// From here on Y must be a single-use X86ISD::SETCC; X is the other operand.
if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
return SDValue();
SDLoc DL(N);
EVT VT = N->getValueType(0);
// Operand 0 of the SETCC is its condition code; operand 1 is the node that
// produces the flags it tests.
X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
// If X is -1 or 0, then we have an opportunity to avoid constants required in
// the general case below.
auto *ConstantX = dyn_cast<ConstantSDNode>(X);
if (ConstantX) {
if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
(IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
// This is a complicated way to get -1 or 0 from the carry flag:
// -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
// 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
DAG.getConstant(X86::COND_B, DL, MVT::i8),
Y.getOperand(1));
}
// Same idea for SETBE/SETA, which need the compare operands swapped first.
// Only done when the flags come from a single-use integer X86ISD::SUB whose
// second operand is not a constant.
if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
(IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
SDValue EFLAGS = Y->getOperand(1);
if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
EFLAGS.getValueType().isInteger() &&
!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
// Swap the operands of a SUB, and we have the same pattern as above.
// -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
// 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
SDValue NewSub = DAG.getNode(
X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
EFLAGS.getOperand(1), EFLAGS.getOperand(0));
// Pick the same result number from the new SUB that EFLAGS referred to.
SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
DAG.getConstant(X86::COND_B, DL, MVT::i8),
NewEFLAGS);
}
}
}
if (CC == X86::COND_B) {
// X + SETB Z --> adc X, 0
// X - SETB Z --> sbb X, 0
return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
DAG.getVTList(VT, MVT::i32), X,
DAG.getConstant(0, DL, VT), Y.getOperand(1));
}
if (CC == X86::COND_A) {
SDValue EFLAGS = Y->getOperand(1);
// Try to convert COND_A into COND_B in an attempt to facilitate
// materializing "setb reg".
//
// Do not flip "e > c", where "c" is a constant, because Cmp instruction
// cannot take an immediate as its first operand.
//
if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
EFLAGS.getValueType().isInteger() &&
!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
EFLAGS.getNode()->getVTList(),
EFLAGS.getOperand(1), EFLAGS.getOperand(0));
SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
DAG.getVTList(VT, MVT::i32), X,
DAG.getConstant(0, DL, VT), NewEFLAGS);
}
}
// The remaining folds only handle SETE/SETNE of a single-use X86ISD::CMP that
// compares an integer value Z against zero.
if (CC != X86::COND_E && CC != X86::COND_NE)
return SDValue();
SDValue Cmp = Y.getOperand(1);
if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
!X86::isZeroNode(Cmp.getOperand(1)) ||
!Cmp.getOperand(0).getValueType().isInteger())
return SDValue();
SDValue Z = Cmp.getOperand(0);
EVT ZVT = Z.getValueType();
// If X is -1 or 0, then we have an opportunity to avoid constants required in
// the general case below.
if (ConstantX) {
// 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
// fake operands:
// 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
// -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
(!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
SDValue Zero = DAG.getConstant(0, DL, ZVT);
SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
// Result 1 of the X86ISD::SUB is its flag output.
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
DAG.getConstant(X86::COND_B, DL, MVT::i8),
SDValue(Neg.getNode(), 1));
}
// cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
// with fake operands:
// 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
// -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
(!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
SDValue One = DAG.getConstant(1, DL, ZVT);
SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
}
}
// General case: X is arbitrary, so fold the setcc into the carry input of an
// ADC/SBB.
// (cmp Z, 1) sets the carry flag if Z is 0.
SDValue One = DAG.getConstant(1, DL, ZVT);
SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
// Add the flags type for ADC/SBB nodes.
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
// X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
// X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
if (CC == X86::COND_NE)
return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
DAG.getConstant(-1ULL, DL, VT), Cmp1);
// X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
// X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
DAG.getConstant(0, DL, VT), Cmp1);
}
/// For a vector add (called from combineAdd when the node carries the
/// vector-reduction flag), replace multiply operands that canReduceVMulWidth
/// accepts (in any mode other than MULU16) by X86ISD::VPMADDWD of their
/// operands truncated to i16. The half-width result is padded with zeros back
/// to the add's type. Returns an empty SDValue if the pattern does not match.
static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  if (!Subtarget.hasSSE2())
    return SDValue();

  // Bail out on scalars and on vectors with fewer than 8 elements.
  EVT VT = N->getValueType(0);
  if (!VT.isVector() || VT.getVectorNumElements() < 8)
    return SDValue();

  // Arrange for the multiply (if any) to be the first operand.
  SDValue MulOp = N->getOperand(0);
  SDValue OtherOp = N->getOperand(1);
  if (MulOp.getOpcode() != ISD::MUL)
    std::swap(MulOp, OtherOp);
  if (MulOp.getOpcode() != ISD::MUL)
    return SDValue();

  ShrinkMode Mode;
  if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
    return SDValue();

  SDLoc DL(N);
  const unsigned NumElts = VT.getVectorNumElements();
  EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
  // The madd result has half as many (i32) elements as the original vector.
  EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);

  auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                           ArrayRef<SDValue> Ops) {
    MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
    return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
  };

  auto BuildPMADDWD = [&](SDValue Mul) {
    // Narrow both factors to i16 elements.
    SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, Mul.getOperand(0));
    SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, Mul.getOperand(1));

    SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 },
                                    PMADDWDBuilder);
    // Pad the upper half with zeros so the result has the add's type again.
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd,
                       DAG.getConstant(0, DL, MAddVT));
  };

  MulOp = BuildPMADDWD(MulOp);

  // The other operand may be a reducible multiply as well.
  if (OtherOp.getOpcode() == ISD::MUL &&
      canReduceVMulWidth(OtherOp.getNode(), DAG, Mode) && Mode != MULU16)
    OtherOp = BuildPMADDWD(OtherOp);

  return DAG.getNode(ISD::ADD, DL, VT, MulOp, OtherOp);
}
/// For a vXi32 add (called from combineAdd when the node carries the
/// vector-reduction flag), replace ISD::ABS operands that detectZextAbsDiff
/// recognises by a PSADBW-based sum of absolute differences. Returns an empty
/// SDValue if the pattern does not match.
static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  if (!Subtarget.hasSSE2())
    return SDValue();

  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  SDValue AbsOp = N->getOperand(0);
  SDValue OtherOp = N->getOperand(1);

  // TODO: There's nothing special about i32, any integer type above i16 should
  // work just as well.
  const bool IsSimpleI32Vector =
      VT.isVector() && VT.isSimple() && VT.getVectorElementType() == MVT::i32;
  if (!IsSimpleI32Vector)
    return SDValue();

  // Widest register the subtarget offers for this transform.
  const unsigned RegSize =
      Subtarget.useBWIRegs() ? 512 : (Subtarget.hasAVX() ? 256 : 128);

  // We only handle v16i32 for SSE2 / v32i32 for AVX / v64i32 for AVX512.
  // TODO: We should be able to handle larger vectors by splitting them before
  // feeding them into several SADs, and then reducing over those.
  if (VT.getSizeInBits() / 4 > RegSize)
    return SDValue();

  // N is a reduction add, so one operand is the accumulator phi; the other
  // must be an ABS for this to be a SAD. Put the ABS first.
  if (AbsOp.getOpcode() != ISD::ABS)
    std::swap(AbsOp, OtherOp);
  if (AbsOp.getOpcode() != ISD::ABS)
    return SDValue();

  // Build the PSADBW for one pair of inputs and reshape it to the add's type.
  // The SAD result has fewer elements than its inputs, so only part of the
  // reduction vector is updated.
  auto BuildPSADBW = [&](SDValue A, SDValue B) {
    SDValue Sad = createPSADBW(DAG, A, B, DL, Subtarget);

    // PSADBW yields a vector of i64; reinterpret it as i32 elements.
    MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
    const unsigned AddBits = VT.getSizeInBits();
    const unsigned SadBits = ResVT.getSizeInBits();

    // Reduction vector narrower than the PSADBW result: truncate. The high i32
    // of each i64 is zero anyway.
    if (AddBits < SadBits)
      return DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);

    Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
    if (AddBits == SadBits)
      return Sad;

    // Reduction vector wider: place the result in the low part of a zero
    // vector so the widths match.
    SDValue Zero = DAG.getConstant(0, DL, VT);
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad,
                       DAG.getIntPtrConstant(0, DL));
  };

  // The ABS must wrap an abs-diff pattern that detectZextAbsDiff recognises.
  SDValue SadOp0, SadOp1;
  if (!detectZextAbsDiff(AbsOp, SadOp0, SadOp1))
    return SDValue();
  AbsOp = BuildPSADBW(SadOp0, SadOp1);

  // The other operand may be a SAD as well.
  if (OtherOp.getOpcode() == ISD::ABS &&
      detectZextAbsDiff(OtherOp, SadOp0, SadOp1))
    OtherOp = BuildPSADBW(SadOp0, SadOp1);

  return DAG.getNode(ISD::ADD, DL, VT, AbsOp, OtherOp);
}
/// Rewrite a vector increment/decrement so that it uses an all-ones constant
/// instead of a splat of 1:
///   add X, <1, 1...> --> sub X, <-1, -1...>
///   sub X, <1, 1...> --> add X, <-1, -1...>
/// An all-ones vector can be materialized with a pcmpeq instruction, which is
/// commonly recognized as an idiom (no register dependency), so it is
/// better/smaller than loading a splat 1 constant.
static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
  const unsigned Opc = N->getOpcode();
  assert((Opc == ISD::ADD || Opc == ISD::SUB) &&
         "Unexpected opcode for increment/decrement transform");

  // Pseudo-legality check: getOnesVector() only accepts 128/256/512-bit
  // vectors, so for anything else wait until after legalization.
  EVT VT = N->getValueType(0);
  const bool IsSupportedWidth =
      VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector();
  if (!IsSupportedWidth)
    return SDValue();

  // The second operand must be a constant splat of 1.
  APInt SplatVal;
  if (!isConstantSplat(N->getOperand(1), SplatVal) || !SplatVal.isOneValue())
    return SDValue();

  SDLoc DL(N);
  SDValue AllOnesVec = getOnesVector(VT, DAG, DL);
  const unsigned FlippedOpc = Opc == ISD::ADD ? ISD::SUB : ISD::ADD;
  return DAG.getNode(FlippedOpc, DL, VT, N->getOperand(0), AllOnesVec);
}
/// Try to match an add of two BUILD_VECTORs (\p Op0, \p Op1) whose elements
/// are extracts from one vector MUL, such that each result element is the sum
/// of two adjacent MUL elements, and turn it into X86ISD::VPMADDWD on the MUL's
/// operands truncated to i16. \p VT is the type of the add. Returns an empty
/// SDValue if the pattern does not match.
static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
const SDLoc &DL, EVT VT,
const X86Subtarget &Subtarget) {
// Example of pattern we try to detect:
// t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
//(add (build_vector (extract_elt t, 0),
// (extract_elt t, 2),
// (extract_elt t, 4),
// (extract_elt t, 6)),
// (build_vector (extract_elt t, 1),
// (extract_elt t, 3),
// (extract_elt t, 5),
// (extract_elt t, 7)))
if (!Subtarget.hasSSE2())
return SDValue();
if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
Op1.getOpcode() != ISD::BUILD_VECTOR)
return SDValue();
// Only i32 vectors with a power-of-two element count of at least 4 are
// handled.
if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
VT.getVectorNumElements() < 4 ||
!isPowerOf2_32(VT.getVectorNumElements()))
return SDValue();
// Check if one of Op0,Op1 is of the form:
// (build_vector (extract_elt Mul, 0),
// (extract_elt Mul, 2),
// (extract_elt Mul, 4),
// ...
// the other is of the form:
// (build_vector (extract_elt Mul, 1),
// (extract_elt Mul, 3),
// (extract_elt Mul, 5),
// ...
// and identify Mul.
SDValue Mul;
// Two result elements (i and i + 1) are examined per iteration.
for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
// TODO: Be more tolerant to undefs.
if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
// All four extract indices must be compile-time constants.
auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
if (!Const0L || !Const1L || !Const0H || !Const1H)
return SDValue();
unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
// Commutativity of mul allows factors of a product to reorder.
if (Idx0L > Idx1L)
std::swap(Idx0L, Idx1L);
if (Idx0H > Idx1H)
std::swap(Idx0H, Idx1H);
// Commutativity of add allows pairs of factors to reorder.
// NOTE(review): this swap also accepts the index pairs of result elements
// i and i + 1 appearing in exchanged order - confirm that is intended.
if (Idx0L > Idx0H) {
std::swap(Idx0L, Idx0H);
std::swap(Idx1L, Idx1H);
}
// After canonicalization the four indices must be 2i, 2i+1, 2i+2, 2i+3.
if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
Idx1H != 2 * i + 3)
return SDValue();
if (!Mul) {
// First time an extract_elt's source vector is visited. Must be a MUL
// with 2X number of vector elements than the BUILD_VECTOR.
// Both extracts must be from same MUL.
Mul = Op0L->getOperand(0);
if (Mul->getOpcode() != ISD::MUL ||
Mul.getValueType().getVectorNumElements() != 2 * e)
return SDValue();
}
// Check that the extract is from the same MUL previously seen.
if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
return SDValue();
}
// Check if the Mul source can be safely shrunk.
ShrinkMode Mode;
if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) || Mode == MULU16)
return SDValue();
// Builds one VPMADDWD from a pair of i32-element operands; invoked through
// SplitOpsAndApply below.
auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
// Shrink by adding truncate nodes and let DAGCombine fold with the
// sources.
EVT InVT = Ops[0].getValueType();
assert(InVT.getScalarType() == MVT::i32 &&
"Unexpected scalar element type");
assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
// The result has half as many i32 elements as the inputs.
EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
InVT.getVectorNumElements() / 2);
EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
InVT.getVectorNumElements());
return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[0]),
DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[1]));
};
return SplitOpsAndApply(DAG, Subtarget, DL, VT,
{ Mul.getOperand(0), Mul.getOperand(1) },
PMADDBuilder);
}
// Attempt to turn this pattern into PMADDWD.
// (add (mul (sext (build_vector)), (sext (build_vector))),
//      (mul (sext (build_vector)), (sext (build_vector))))
// where, for result element i, the build_vectors extract elements 2*i and
// 2*i+1 of the same two source vectors A and B, so that the add computes
//   A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
//
// N0 and N1 are the operands of the add and VT is its type. Returns an empty
// SDValue if the pattern does not match.
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
                              const SDLoc &DL, EVT VT,
                              const X86Subtarget &Subtarget) {
  if (!Subtarget.hasSSE2())
    return SDValue();

  if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
    return SDValue();

  // Only i32 vectors with a power-of-two element count of at least 4 are
  // handled.
  if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
      VT.getVectorNumElements() < 4 ||
      !isPowerOf2_32(VT.getVectorNumElements()))
    return SDValue();

  SDValue N00 = N0.getOperand(0);
  SDValue N01 = N0.getOperand(1);
  SDValue N10 = N1.getOperand(0);
  SDValue N11 = N1.getOperand(1);

  // All inputs need to be sign extends.
  // TODO: Support ZERO_EXTEND from known positive?
  if (N00.getOpcode() != ISD::SIGN_EXTEND ||
      N01.getOpcode() != ISD::SIGN_EXTEND ||
      N10.getOpcode() != ISD::SIGN_EXTEND ||
      N11.getOpcode() != ISD::SIGN_EXTEND)
    return SDValue();

  // Peek through the extends.
  N00 = N00.getOperand(0);
  N01 = N01.getOperand(0);
  N10 = N10.getOperand(0);
  N11 = N11.getOperand(0);

  // Must be extending from vXi16.
  EVT InVT = N00.getValueType();
  if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
      N10.getValueType() != InVT || N11.getValueType() != InVT)
    return SDValue();

  // All inputs should be build_vectors.
  if (N00.getOpcode() != ISD::BUILD_VECTOR ||
      N01.getOpcode() != ISD::BUILD_VECTOR ||
      N10.getOpcode() != ISD::BUILD_VECTOR ||
      N11.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();

  // For each element, we need to ensure we have an odd element from one vector
  // multiplied by the odd element of another vector and the even element from
  // one of the same vectors being multiplied by the even element from the
  // other vector. So we need to make sure for each element i, this operator
  // is being performed:
  //  A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
  SDValue In0, In1;
  for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
    SDValue N00Elt = N00.getOperand(i);
    SDValue N01Elt = N01.getOperand(i);
    SDValue N10Elt = N10.getOperand(i);
    SDValue N11Elt = N11.getOperand(i);
    // TODO: Be more tolerant to undefs.
    if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    // All four extract indices must be compile-time constants.
    auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
    auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
    auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
    auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
    if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
      return SDValue();
    unsigned IdxN00 = ConstN00Elt->getZExtValue();
    unsigned IdxN01 = ConstN01Elt->getZExtValue();
    unsigned IdxN10 = ConstN10Elt->getZExtValue();
    unsigned IdxN11 = ConstN11Elt->getZExtValue();

    // Add is commutative so indices can be reordered.
    if (IdxN00 > IdxN10) {
      std::swap(IdxN00, IdxN10);
      std::swap(IdxN01, IdxN11);
    }
    // N0 indices must be the even element. N1 indices must be the next odd
    // element.
    if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
        IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
      return SDValue();

    SDValue N00In = N00Elt.getOperand(0);
    SDValue N01In = N01Elt.getOperand(0);
    SDValue N10In = N10Elt.getOperand(0);
    SDValue N11In = N11Elt.getOperand(0);
    // First time we find an input capture it.
    if (!In0) {
      In0 = N00In;
      In1 = N01In;
    }
    // Mul is commutative so the input vectors can be in any order.
    // Canonicalize to make the compares easier.
    if (In0 != N00In)
      std::swap(N00In, N01In);
    if (In0 != N10In)
      std::swap(N10In, N11In);
    if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
      return SDValue();
  }

  // In0/In1 are fed to VPMADDWD unchanged, so they must be vXi16 vectors with
  // exactly twice as many elements as the result. The checks above only
  // constrain the build_vectors: an EXTRACT_VECTOR_ELT may read from a vector
  // of a different length, or from narrower elements with an implicitly
  // extended result. In those cases a node of the wrong type would be built,
  // so give up instead.
  EVT SrcVT = In0.getValueType();
  if (SrcVT != In1.getValueType() ||
      SrcVT.getVectorElementType() != MVT::i16 ||
      SrcVT.getVectorNumElements() != 2 * VT.getVectorNumElements())
    return SDValue();

  // Builds one VPMADDWD from a pair of i16-element operands; invoked through
  // SplitOpsAndApply below.
  auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                         ArrayRef<SDValue> Ops) {
    EVT OpVT = Ops[0].getValueType();
    assert(OpVT.getScalarType() == MVT::i16 &&
           "Unexpected scalar element type");
    assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
    // The result has half as many (i32) elements as the i16 inputs.
    EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                 OpVT.getVectorNumElements() / 2);
    return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
  };
  return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
                          PMADDBuilder);
}
/// Target combine for ISD::ADD. Tries, in order: the loop SAD and loop MADD
/// reduction patterns (only for adds flagged as vector reductions), the two
/// PMADDWD matchers, horizontal-add synthesis, the vector increment rewrite,
/// and finally the ADC/SBB conversion.
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  if (N->getFlags().hasVectorReduction()) {
    if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
      return Sad;
    if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
      return MAdd;
  }

  EVT VT = N->getValueType(0);
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
    return MAdd;
  if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
    return MAdd;

  // Try to synthesize horizontal adds from adds of shuffles. Op0/Op1 are
  // passed as lvalues to isHorizontalBinOp and are used as the HADD inputs
  // afterwards.
  const bool IsHAddType = VT == MVT::v8i16 || VT == MVT::v4i32 ||
                          VT == MVT::v16i16 || VT == MVT::v8i32;
  if (IsHAddType && Subtarget.hasSSSE3() &&
      isHorizontalBinOp(Op0, Op1, DAG, Subtarget, true)) {
    auto HADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                          ArrayRef<SDValue> Ops) {
      return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops);
    };
    return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
                            HADDBuilder);
  }

  if (SDValue IncDec = combineIncDecVector(N, DAG))
    return IncDec;

  return combineAddOrSubToADCOrSBB(N, DAG);
}
/// Try to turn `umax(a, b) - b` or `a - umin(a, b)` into an unsigned
/// saturating subtract (ISD::USUBSAT) of a and b. For v8i32/v16i32/v8i64 the
/// operation is performed on a narrower element type, which requires enough
/// known leading zero bits in the left operand. Returns an empty SDValue if the
/// pattern does not match.
static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  EVT VT = N->getValueType(0);

  // PSUBUS is supported, starting from SSE2, but truncation for v8i32
  // is only worth it with SSSE3 (PSHUFB).
  const bool IsSupported =
      (Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
      (Subtarget.hasSSSE3() && (VT == MVT::v8i32 || VT == MVT::v8i64)) ||
      (Subtarget.hasAVX() && (VT == MVT::v32i8 || VT == MVT::v16i16)) ||
      (Subtarget.useBWIRegs() && (VT == MVT::v64i8 || VT == MVT::v32i16 ||
                                  VT == MVT::v16i32 || VT == MVT::v8i64));
  if (!IsSupported)
    return SDValue();

  // Try to find umax(a,b) - b or a - umin(a,b) patterns
  // they may be converted to subus(a,b).
  // TODO: Need to add IR canonicalization for this code.
  SDValue SubusLHS, SubusRHS;
  if (Op0.getOpcode() == ISD::UMAX) {
    SDValue MaxLHS = Op0.getOperand(0);
    SDValue MaxRHS = Op0.getOperand(1);
    // One of the umax operands has to be the value being subtracted.
    if (MaxLHS != Op1 && MaxRHS != Op1)
      return SDValue();
    SubusLHS = (MaxLHS == Op1) ? MaxRHS : MaxLHS;
    SubusRHS = Op1;
  } else if (Op1.getOpcode() == ISD::UMIN) {
    SDValue MinLHS = Op1.getOperand(0);
    SDValue MinRHS = Op1.getOperand(1);
    // One of the umin operands has to be the value subtracted from.
    if (MinLHS != Op0 && MinRHS != Op0)
      return SDValue();
    SubusLHS = Op0;
    SubusRHS = (MinLHS == Op0) ? MinRHS : MinLHS;
  } else {
    return SDValue();
  }

  auto USUBSATBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                           ArrayRef<SDValue> Ops) {
    return DAG.getNode(ISD::USUBSAT, DL, Ops[0].getValueType(), Ops);
  };

  // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
  // special preprocessing in some cases. Every other type is emitted directly.
  const bool NeedsShrinking =
      VT == MVT::v8i32 || VT == MVT::v16i32 || VT == MVT::v8i64;
  if (!NeedsShrinking)
    return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
                            { SubusLHS, SubusRHS }, USUBSATBuilder);

  // The preprocessing is only valid if the left operand was zero extended
  // from 16 bits: the top 16 bits must be known zero for 32-bit elements, the
  // top 48 bits for 64-bit elements.
  KnownBits Known = DAG.computeKnownBits(SubusLHS);
  unsigned NumZeros = Known.countMinLeadingZeros();
  const unsigned RequiredZeros = (VT == MVT::v8i64) ? 48 : 16;
  if (NumZeros < RequiredZeros)
    return SDValue();

  EVT ExtType = SubusLHS.getValueType();
  EVT ShrinkedType = MVT::v8i16;
  if (VT != MVT::v8i32 && VT != MVT::v8i64)
    ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;

  // SubusLHS fits in the narrow type, so clamp SubusRHS to the narrow range
  // first: SubusRHS = umin(0xFFF.., SubusRHS).
  SDValue SaturationConst =
      DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
                                           ShrinkedType.getScalarSizeInBits()),
                      SDLoc(SubusLHS), ExtType);
  SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
                             SaturationConst);

  SDValue NewSubusLHS =
      DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
  SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
  SDValue Psubus =
      SplitOpsAndApply(DAG, Subtarget, SDLoc(N), ShrinkedType,
                       { NewSubusLHS, NewSubusRHS }, USUBSATBuilder);

  // Zero extend the result, it may be used somewhere as 32 bit,
  // if not zext and following trunc will shrink.
  return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
}
/// Target combine for ISD::SUB. Tries, in order: folding a constant LHS into
/// a preceding XOR-with-constant, horizontal-sub synthesis, the vector
/// decrement rewrite, the PSUBUS conversion, and finally the ADC/SBB
/// conversion.
static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // X86 can't encode an immediate LHS of a sub. If the RHS is a one-use XOR
  // with a constant, invert that constant and add one to the LHS constant:
  // X-Y -> X+~Y+1, saving one register.
  auto *LHSConst = dyn_cast<ConstantSDNode>(Op0);
  if (LHSConst && Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
      isa<ConstantSDNode>(Op1.getOperand(1))) {
    const APInt &XorC = Op1.getConstantOperandAPInt(1);
    EVT ConstVT = Op0.getValueType();
    SDValue NewXor =
        DAG.getNode(ISD::XOR, SDLoc(Op1), ConstVT, Op1.getOperand(0),
                    DAG.getConstant(~XorC, SDLoc(Op1), ConstVT));
    return DAG.getNode(
        ISD::ADD, SDLoc(N), ConstVT, NewXor,
        DAG.getConstant(LHSConst->getAPIntValue() + 1, SDLoc(N), ConstVT));
  }

  // Try to synthesize horizontal subs from subs of shuffles. Op0/Op1 are
  // passed as lvalues to isHorizontalBinOp and are used as the HSUB inputs
  // afterwards.
  EVT VT = N->getValueType(0);
  const bool IsHSubType = VT == MVT::v8i16 || VT == MVT::v4i32 ||
                          VT == MVT::v16i16 || VT == MVT::v8i32;
  if (IsHSubType && Subtarget.hasSSSE3() &&
      isHorizontalBinOp(Op0, Op1, DAG, Subtarget, false)) {
    auto HSUBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                          ArrayRef<SDValue> Ops) {
      return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops);
    };
    return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
                            HSUBBuilder);
  }

  if (SDValue IncDec = combineIncDecVector(N, DAG))
    return IncDec;

  // Try to create PSUBUS if SUB's argument is max/min
  if (SDValue Subus = combineSubToSubus(N, DAG, Subtarget))
    return Subus;

  return combineAddOrSubToADCOrSBB(N, DAG);
}
/// Fold X86ISD::PCMPEQ / X86ISD::PCMPGT when both operands are the same value:
/// x == x becomes the constant -1 and x > x becomes the constant 0.
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  MVT VT = N->getSimpleValueType(0);
  SDLoc DL(N);

  // Nothing to fold unless a value is compared against itself.
  if (N->getOperand(0) != N->getOperand(1))
    return SDValue();

  switch (N->getOpcode()) {
  case X86ISD::PCMPEQ:
    return DAG.getConstant(-1, DL, VT);
  case X86ISD::PCMPGT:
    return DAG.getConstant(0, DL, VT);
  default:
    return SDValue();
  }
}
/// Helper that combines an array of subvector ops as if they were the operands
/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
///
/// Folds are tried in this order:
///  - all ops undef -> undef of type \p VT;
///  - all ops all-zeros build vectors -> zero vector;
///  - first op is a (possibly bitcast) load -> try to merge into a single load;
///  - all ops identical -> wider broadcast where one of the patterns matches;
///  - all ops share an opcode -> concatenate the sources and apply the
///    operation once at the wider width (selected shuffles and PACKUS);
///  - two ops with an all-zeros upper op -> insert into a zero vector.
///
/// \p DCI is not referenced in this body.
/// \returns the replacement value, or an empty SDValue if no fold applies.
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
ArrayRef<SDValue> Ops, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
// Trivial results: every piece undef, or every piece zero.
if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
return DAG.getUNDEF(VT);
if (llvm::all_of(Ops, [](SDValue Op) {
return ISD::isBuildVectorAllZeros(Op.getNode());
}))
return getZeroVector(VT, Subtarget, DAG, DL);
SDValue Op0 = Ops[0];
// Fold subvector loads into one.
// If needed, look through bitcasts to get to the load.
if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
bool Fast;
const X86TargetLowering *TLI = Subtarget.getTargetLowering();
// Only attempt the merge when a VT-sized access through the first load's
// memory operand is both allowed and reported as fast.
if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
*FirstLd->getMemOperand(), &Fast) &&
Fast) {
if (SDValue Ld =
EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
return Ld;
}
}
// Repeated subvectors.
if (llvm::all_of(Ops, [Op0](SDValue Op) { return Op == Op0; })) {
// If this broadcast/subv_broadcast is inserted into both halves, use a
// larger broadcast/subv_broadcast.
if (Op0.getOpcode() == X86ISD::VBROADCAST ||
Op0.getOpcode() == X86ISD::SUBV_BROADCAST)
return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
// concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
(Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0))))
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
Op0.getOperand(0),
DAG.getIntPtrConstant(0, DL)));
// concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
(Subtarget.hasAVX2() ||
(VT.getScalarSizeInBits() >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
Op0.getOperand(0).getValueType() == VT.getScalarType())
return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
}
// Same all-equal test as the "repeated subvectors" check above; the result is
// only consulted by the shuffle cases below, which skip splats.
bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
// Repeated opcode.
// TODO - combineX86ShufflesRecursively should handle shuffle concatenation
// but it currently struggles with different vector widths.
if (llvm::all_of(Ops, [Op0](SDValue Op) {
return Op.getOpcode() == Op0.getOpcode();
})) {
unsigned NumOps = Ops.size();
switch (Op0.getOpcode()) {
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
case X86ISD::PSHUFD:
// Two shuffles with the same immediate operand: concatenate their sources
// and shuffle once at 256 bits with the same opcode and immediate.
if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
SmallVector<SDValue, 2> Src;
for (unsigned i = 0; i != NumOps; ++i)
Src.push_back(Ops[i].getOperand(0));
return DAG.getNode(Op0.getOpcode(), DL, VT,
DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
Op0.getOperand(1));
}
// Otherwise fall into the VPERMILPI handling, which only accepts
// v8f32/v8i32 results.
LLVM_FALLTHROUGH;
case X86ISD::VPERMILPI:
// TODO - add support for vXf64/vXi64 shuffles.
if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) &&
Subtarget.hasAVX() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
// Work in the float domain: bitcast each source to v4f32, concatenate,
// apply VPERMILPI with the shared immediate and cast back to VT.
SmallVector<SDValue, 2> Src;
for (unsigned i = 0; i != NumOps; ++i)
Src.push_back(DAG.getBitcast(MVT::v4f32, Ops[i].getOperand(0)));
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f32, Src);
Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,
Op0.getOperand(1));
return DAG.getBitcast(VT, Res);
}
break;
case X86ISD::PACKUS:
if (NumOps == 2 && VT.is256BitVector() && Subtarget.hasInt256()) {
// Concatenate the first operands and the second operands separately, then
// emit a single pack of the two wider vectors.
SmallVector<SDValue, 2> LHS, RHS;
for (unsigned i = 0; i != NumOps; ++i) {
LHS.push_back(Ops[i].getOperand(0));
RHS.push_back(Ops[i].getOperand(1));
}
MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
// Widened source type: same scalar type, NumOps times as many elements.
SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
NumOps * SrcVT.getVectorNumElements());
return DAG.getNode(Op0.getOpcode(), DL, VT,
DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, LHS),
DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS));
}
break;
}
}
// If we're inserting all zeros into the upper half, change this to
// an insert into an all zeros vector. We will match this to a move
// with implicit upper bit zeroing during isel.
if (Ops.size() == 2 && ISD::isBuildVectorAllZeros(Ops[1].getNode()))
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
getZeroVector(VT, Subtarget, DAG, DL), Ops[0],
DAG.getIntPtrConstant(0, DL));
return SDValue();
}
/// DAG combine for ISD::CONCAT_VECTORS. Forwards the node's operands to
/// combineConcatVectorOps when AVX is available and both the result type and
/// the type of the first operand are legal. Vectors of i1 are left untouched.
static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  EVT SrcVT = N->getOperand(0).getValueType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // Mask (i1) vectors are not handled here.
  if (VT.getVectorElementType() == MVT::i1)
    return SDValue();

  if (!Subtarget.hasAVX() || !TLI.isTypeLegal(VT) || !TLI.isTypeLegal(SrcVT))
    return SDValue();

  SmallVector<SDValue, 4> SubVecs(N->op_begin(), N->op_end());
  return combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), SubVecs, DAG, DCI,
                                Subtarget);
}
/// DAG combine for ISD::INSERT_SUBVECTOR (operands: base vector, subvector,
/// constant insertion index). Nothing is done before op legalization.
///
/// Folds are tried in this order:
///  - undef into undef -> undef; undef/zero into undef/zero -> zero vector;
///  - inserts into an all-zeros base that can skip an intermediate
///    insert/extract through another all-zeros vector;
///  - (non-i1 only) insert of a non-zero-index extract from a same-typed
///    vector -> vector shuffle;
///  - concat-style patterns via combineConcatVectorOps;
///  - inserting the upper half over a single-use lower-half insert whose base
///    is not undef -> rebuild the lower insert on an undef base;
///  - broadcast inserted at a non-zero index into undef -> wider broadcast.
///
/// \returns the replacement value, or an empty SDValue if no fold applies.
static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  MVT OpVT = N->getSimpleValueType(0);

  bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;

  SDLoc dl(N);
  SDValue Vec = N->getOperand(0);
  SDValue SubVec = N->getOperand(1);

  uint64_t IdxVal = N->getConstantOperandVal(2);
  MVT SubVecVT = SubVec.getSimpleValueType();

  if (Vec.isUndef() && SubVec.isUndef())
    return DAG.getUNDEF(OpVT);

  // Inserting undefs/zeros into zeros/undefs is a zero vector.
  if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
      (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
    return getZeroVector(OpVT, Subtarget, DAG, dl);

  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
    // If we're inserting into a zero vector and then into a larger zero vector,
    // just insert into the larger zero vector directly.
    if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
        ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
      uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
      return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
                         getZeroVector(OpVT, Subtarget, DAG, dl),
                         SubVec.getOperand(1),
                         DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
    }

    // If we're inserting into a zero vector and our input was extracted from an
    // insert into a zero vector of the same type and the extraction was at
    // least as large as the original insertion. Just insert the original
    // subvector into a zero vector.
    if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
        SubVec.getConstantOperandAPInt(1) == 0 &&
        SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
      SDValue Ins = SubVec.getOperand(0);
      if (Ins.getConstantOperandAPInt(2) == 0 &&
          ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
          Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits())
        return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
                           getZeroVector(OpVT, Subtarget, DAG, dl),
                           Ins.getOperand(1), N->getOperand(2));
    }
  }

  // Stop here if this is an i1 vector.
  if (IsI1Vector)
    return SDValue();

  // If this is an insert of an extract, combine to a shuffle. Don't do this
  // if the insert or extract can be represented with a subregister operation.
  if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
      SubVec.getOperand(0).getSimpleValueType() == OpVT &&
      (IdxVal != 0 || !Vec.isUndef())) {
    int ExtIdxVal = SubVec.getConstantOperandVal(1);
    if (ExtIdxVal != 0) {
      int VecNumElts = OpVT.getVectorNumElements();
      int SubVecNumElts = SubVecVT.getVectorNumElements();
      SmallVector<int, 64> Mask(VecNumElts);
      // First create an identity shuffle mask.
      for (int i = 0; i != VecNumElts; ++i)
        Mask[i] = i;
      // Now insert the extracted portion. Indices >= VecNumElts select from
      // the second shuffle input, i.e. the vector that was extracted from.
      for (int i = 0; i != SubVecNumElts; ++i)
        Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;

      return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
    }
  }

  // Match concat_vector style patterns.
  SmallVector<SDValue, 2> SubVectorOps;
  if (collectConcatOps(N, SubVectorOps))
    if (SDValue Fold =
            combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
      return Fold;

  // If we are inserting into both halves of the vector, the starting vector
  // should be undef. If it isn't, make it so. Only do this if the early insert
  // has no other uses.
  // The earlier insert must itself be half-width (same size as SubVec), so
  // that the two inserts together cover the whole vector.
  // TODO: Should this be a generic DAG combine?
  // TODO: Why doesn't SimplifyDemandedVectorElts catch this?
  if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
      Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
      OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2 &&
      isNullConstant(Vec.getOperand(2)) && !Vec.getOperand(0).isUndef() &&
      Vec.getOperand(1).getValueSizeInBits() == SubVecVT.getSizeInBits() &&
      Vec.hasOneUse()) {
    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
                      Vec.getOperand(1), Vec.getOperand(2));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec,
                       N->getOperand(2));
  }

  // If this is a broadcast insert into an upper undef, use a larger broadcast.
  if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
    return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));

  return SDValue();
}
/// Narrow a vector select that feeds a 128-bit subvector extraction.
///
/// When \p Ext extracts from a (possibly bitcast) VSELECT whose condition can
/// be decomposed by collectConcatOps, the select is rebuilt on 128-bit pieces
/// of its three operands instead. This is a common pattern for AVX1 integer
/// code because 256-bit selects may be legal, but there is almost no integer
/// math/logic available for 256-bit.
///
/// Must only be called with legal types; the simple-value-type queries below
/// assert otherwise.
static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
  SDValue Sel = peekThroughBitcasts(Ext->getOperand(0));
  if (Sel.getOpcode() != ISD::VSELECT)
    return SDValue();

  SmallVector<SDValue, 4> CondPieces;
  if (!collectConcatOps(Sel.getOperand(0).getNode(), CondPieces))
    return SDValue();

  // Simple value types are assumed from here on (legal operations/types only).
  // TODO: This can be extended to handle extraction to 256-bits.
  MVT VT = Ext->getSimpleValueType(0);
  if (!VT.is128BitVector())
    return SDValue();

  MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
  const bool IsWideCond =
      SelCondVT.is256BitVector() || SelCondVT.is512BitVector();
  if (!IsWideCond)
    return SDValue();

  MVT WideVT = Ext->getOperand(0).getSimpleValueType();
  MVT SelVT = Sel.getSimpleValueType();
  assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
         "Unexpected vector type with legal operations");

  // Translate the extraction index from units of the extract operand's
  // elements into units of the select's elements.
  unsigned SelElts = SelVT.getVectorNumElements();
  unsigned CastedElts = WideVT.getVectorNumElements();
  unsigned ExtIdx = Ext->getConstantOperandVal(1);
  if (SelElts % CastedElts == 0) {
    // Select elements are the same width or narrower: scale the index up.
    ExtIdx *= (SelElts / CastedElts);
  } else if (CastedElts % SelElts == 0) {
    // Select elements are wider: the index must fall on a select-element
    // boundary, then it is scaled down.
    unsigned IndexDivisor = CastedElts / SelElts;
    if (ExtIdx % IndexDivisor != 0)
      return SDValue();
    ExtIdx /= IndexDivisor;
  } else {
    llvm_unreachable("Element count of simple vector types are not divisible?");
  }

  unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
  MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(),
                                     SelElts / NarrowingFactor);

  // Extract the matching 128-bit slice of each operand, select on those and
  // cast the result to the type the extract produced.
  SDLoc DL(Ext);
  SDValue NarrowCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
  SDValue NarrowTrue = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
  SDValue NarrowFalse = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
  SDValue NarrowSel =
      DAG.getSelect(DL, NarrowSelVT, NarrowCond, NarrowTrue, NarrowFalse);
  return DAG.getBitcast(VT, NarrowSel);
}
/// DAG combine for ISD::EXTRACT_SUBVECTOR (operands: source vector, constant
/// index).
///
/// Returns an empty SDValue immediately if the result type is not simple.
/// The AVX1-only "and + not(concat)" split is attempted at any stage; every
/// other fold runs only once op legalization has started. Those folds are, in
/// order: narrowing an extracted vector select, extracts from all-zeros /
/// all-ones / BUILD_VECTOR sources, moving a vector bitcast after the extract,
/// shrinking a single-use broadcast, and (for index 0 with a single-use
/// source) narrowing conversions, extensions and 256-bit selects.
static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
// For AVX1 only, if we are extracting from a 256-bit and+not (which will
// eventually get combined/lowered into ANDNP) with a concatenated operand,
// split the 'and' into 128-bit ops to avoid the concatenate and extract.
// We let generic combining take over from there to simplify the
// insert/extract and 'not'.
// This pattern emerges during AVX1 legalization. We handle it before lowering
// to avoid complications like splitting constant vector loads.
// Capture the original wide type in the likely case that we need to bitcast
// back to this type.
if (!N->getValueType(0).isSimple())
return SDValue();
MVT VT = N->getSimpleValueType(0);
EVT WideVecVT = N->getOperand(0).getValueType();
SDValue WideVec = peekThroughBitcasts(N->getOperand(0));
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
TLI.isTypeLegal(WideVecVT) &&
WideVecVT.getSizeInBits() == 256 && WideVec.getOpcode() == ISD::AND) {
// True if V (looking through bitcasts) is a bitwise-not whose operand
// (again looking through bitcasts) is a CONCAT_VECTORS.
auto isConcatenatedNot = [] (SDValue V) {
V = peekThroughBitcasts(V);
if (!isBitwiseNot(V))
return false;
SDValue NotOp = V->getOperand(0);
return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
};
if (isConcatenatedNot(WideVec.getOperand(0)) ||
isConcatenatedNot(WideVec.getOperand(1))) {
// extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
SDValue Concat = split256IntArith(WideVec, DAG);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
DAG.getBitcast(WideVecVT, Concat), N->getOperand(1));
}
}
// Everything below is skipped until op legalization has started.
if (DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue V = narrowExtractedVectorSelect(N, DAG))
return V;
SDValue InVec = N->getOperand(0);
unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
// Any subvector of an all-zeros vector is all zeros.
if (ISD::isBuildVectorAllZeros(InVec.getNode()))
return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
// Any subvector of an all-ones vector is all ones; i1 vectors are built
// through getConstant rather than getOnesVector.
if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
if (VT.getScalarType() == MVT::i1)
return DAG.getConstant(1, SDLoc(N), VT);
return getOnesVector(VT, DAG, SDLoc(N));
}
// Extracting from a BUILD_VECTOR: build a smaller vector from the
// corresponding slice of its operands.
if (InVec.getOpcode() == ISD::BUILD_VECTOR)
return DAG.getBuildVector(
VT, SDLoc(N),
InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));
// Try to move vector bitcast after extract_subv by scaling extraction index:
// extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index')
// TODO: Move this to DAGCombiner::visitEXTRACT_SUBVECTOR
if (InVec.getOpcode() == ISD::BITCAST &&
InVec.getOperand(0).getValueType().isVector()) {
SDValue SrcOp = InVec.getOperand(0);
EVT SrcVT = SrcOp.getValueType();
unsigned SrcNumElts = SrcVT.getVectorNumElements();
unsigned DestNumElts = InVec.getValueType().getVectorNumElements();
// Only handled when the bitcast result has a whole multiple of the source's
// element count, and both the extracted element count and the index divide
// evenly by that ratio.
if ((DestNumElts % SrcNumElts) == 0) {
unsigned DestSrcRatio = DestNumElts / SrcNumElts;
if ((VT.getVectorNumElements() % DestSrcRatio) == 0) {
unsigned NewExtNumElts = VT.getVectorNumElements() / DestSrcRatio;
EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(),
SrcVT.getScalarType(), NewExtNumElts);
if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 &&
TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio;
SDLoc DL(N);
SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL);
SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
SrcOp, NewIndex);
return DAG.getBitcast(VT, NewExtract);
}
}
}
}
// If we're extracting from a broadcast then we're better off just
// broadcasting to the smaller type directly, assuming this is the only use.
// As its a broadcast we don't care about the extraction index.
if (InVec.getOpcode() == X86ISD::VBROADCAST && InVec.hasOneUse() &&
InVec.getOperand(0).getValueSizeInBits() <= VT.getSizeInBits())
return DAG.getNode(X86ISD::VBROADCAST, SDLoc(N), VT, InVec.getOperand(0));
// If we're extracting the lowest subvector and we're the only user,
// we may be able to perform this with a smaller vector width.
if (IdxVal == 0 && InVec.hasOneUse()) {
unsigned InOpcode = InVec.getOpcode();
if (VT == MVT::v2f64 && InVec.getValueType() == MVT::v4f64) {
// v2f64 CVTDQ2PD(v4i32).
if (InOpcode == ISD::SINT_TO_FP &&
InVec.getOperand(0).getValueType() == MVT::v4i32) {
return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
}
// v2f64 CVTUDQ2PD(v4i32).
if (InOpcode == ISD::UINT_TO_FP &&
InVec.getOperand(0).getValueType() == MVT::v4i32) {
return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
}
// v2f64 CVTPS2PD(v4f32).
if (InOpcode == ISD::FP_EXTEND &&
InVec.getOperand(0).getValueType() == MVT::v4f32) {
return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
}
}
// A 128-bit extract of an extension of a 128-bit source becomes the
// opcode returned by getOpcode_EXTEND_VECTOR_INREG applied directly to that
// source.
if ((InOpcode == ISD::ANY_EXTEND ||
InOpcode == ISD::ANY_EXTEND_VECTOR_INREG ||
InOpcode == ISD::ZERO_EXTEND ||
InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
InOpcode == ISD::SIGN_EXTEND ||
InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
VT.is128BitVector() &&
InVec.getOperand(0).getSimpleValueType().is128BitVector()) {
unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode);
return DAG.getNode(ExtOp, SDLoc(N), VT, InVec.getOperand(0));
}
// A select whose three operands are all 256-bit: select on the low
// 128 bits of each operand instead.
if (InOpcode == ISD::VSELECT &&
InVec.getOperand(0).getValueType().is256BitVector() &&
InVec.getOperand(1).getValueType().is256BitVector() &&
InVec.getOperand(2).getValueType().is256BitVector()) {
SDLoc DL(N);
SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
}
}
return SDValue();
}
/// DAG combine for ISD::SCALAR_TO_VECTOR. Handles two v1i1 patterns and one
/// v2i64 narrowing pattern; returns an empty SDValue otherwise.
static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  SDValue Src = N->getOperand(0);
  SDLoc DL(N);

  const bool IsV1I1 = VT == MVT::v1i1;

  // v1i1 scalar_to_vector (and X, 1) -> v1i1 scalar_to_vector X.
  // Shows up frequently in masked scalar intrinsic code and in floating point
  // select lowering with AVX512.
  // TODO: SimplifyDemandedBits instead?
  if (IsV1I1 && Src.getOpcode() == ISD::AND && Src.hasOneUse()) {
    auto *MaskC = dyn_cast<ConstantSDNode>(Src.getOperand(1));
    if (MaskC && MaskC->getAPIntValue().isOneValue())
      return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1,
                         Src.getOperand(0));
  }

  // v1i1 scalar_to_vector (extract_vector_elt vXi1 V, 0)
  //   -> extract_subvector V, 0.
  if (IsV1I1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
      Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1) {
    auto *IdxC = dyn_cast<ConstantSDNode>(Src.getOperand(1));
    if (IdxC && IdxC->isNullValue())
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
                         Src.getOperand(1));
  }

  // v2i64 scalar_to_vector (any_extend X), with X at most 32 bits wide, does
  // not need the upper bits: build a v4i32 from X and bitcast it.
  // TODO: Move to DAGCombine?
  if (VT == MVT::v2i64 && Src.getOpcode() == ISD::ANY_EXTEND &&
      Src.getValueType() == MVT::i64 && Src.hasOneUse() &&
      Src.getOperand(0).getScalarValueSizeInBits() <= 32) {
    SDValue Narrow = DAG.getAnyExtOrTrunc(Src.getOperand(0), DL, MVT::i32);
    SDValue NarrowVec =
        DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Narrow);
    return DAG.getBitcast(VT, NarrowVec);
  }

  return SDValue();
}
// Simplify X86ISD::PMULDQ / X86ISD::PMULUDQ nodes: move a constant operand to
// the right, fold a multiply by zero, and strip operations that only affect
// bits the multiply does not read.
static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI) {
  const unsigned Opc = N->getOpcode();
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // Keep constants on the right-hand side.
  if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
      !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
    return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), RHS, LHS);

  // x * 0 == 0; the zero operand already has the right type.
  if (ISD::isBuildVectorAllZeros(RHS.getNode()))
    return RHS;

  // Only the low 32 bits of each 64-bit element are demanded from the
  // operands; look through anything that does not affect them.
  APInt LowHalfMask = APInt::getLowBitsSet(64, 32);
  SDValue SimplerLHS = DAG.GetDemandedBits(LHS, LowHalfMask);
  SDValue SimplerRHS = DAG.GetDemandedBits(RHS, LowHalfMask);
  if (SimplerLHS || SimplerRHS) {
    SDValue NewLHS = SimplerLHS ? SimplerLHS : LHS;
    SDValue NewRHS = SimplerRHS ? SimplerRHS : RHS;
    return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), NewLHS, NewRHS);
  }

  // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDValue Self(N, 0);
  if (TLI.SimplifyDemandedBits(Self, APInt::getAllOnesValue(64), DCI))
    return Self;

  return SDValue();
}
/// DAG combine for the *_EXTEND_VECTOR_INREG opcodes. Tries to fold a
/// single-use, non-volatile load operand into an extending load, to collapse
/// two nested extends of the same kind, and (with AVX, for the non-sign-extend
/// opcodes) to re-express the node through the recursive shuffle combiner.
static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const X86Subtarget &Subtarget) {
  const unsigned Opcode = N->getOpcode();
  EVT VT = N->getValueType(0);
  SDValue In = N->getOperand(0);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // Try to merge vector loads and extend_inreg to an extload.
  if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
      In.hasOneUse()) {
    auto *SrcLoad = cast<LoadSDNode>(In);
    if (!SrcLoad->isVolatile()) {
      MVT EltVT = In.getSimpleValueType().getVectorElementType();

      // Sign extension needs a sign-extending load; every other opcode
      // handled here uses a zero-extending load.
      ISD::LoadExtType ExtKind = ISD::ZEXTLOAD;
      if (Opcode == ISD::SIGN_EXTEND_VECTOR_INREG)
        ExtKind = ISD::SEXTLOAD;

      // Memory type: source element type, result element count.
      EVT MemVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
                                   VT.getVectorNumElements());
      if (TLI.isLoadExtLegal(ExtKind, VT, MemVT)) {
        SDValue ExtLoad = DAG.getExtLoad(
            ExtKind, SDLoc(N), VT, SrcLoad->getChain(), SrcLoad->getBasePtr(),
            SrcLoad->getPointerInfo(), MemVT, SrcLoad->getAlignment(),
            SrcLoad->getMemOperand()->getFlags());
        // Redirect users of the old load's chain result to the new load.
        DAG.ReplaceAllUsesOfValueWith(SDValue(SrcLoad, 1),
                                      ExtLoad.getValue(1));
        return ExtLoad;
      }
    }
  }

  // Disabling for widening legalization for now. We can enable if we find a
  // case that needs it. Otherwise it can be deleted when we switch to
  // widening legalization.
  if (ExperimentalVectorWideningLegalization)
    return SDValue();

  // (ext_invec (ext_invec X)) -> (ext_invec X)
  if (In.getOpcode() == Opcode && TLI.isTypeLegal(VT) &&
      TLI.isTypeLegal(In.getOperand(0).getValueType()))
    return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0));

  // Attempt to combine as a shuffle.
  // TODO: SSE41 support
  const bool CanTryShuffle =
      Subtarget.hasAVX() && Opcode != ISD::SIGN_EXTEND_VECTOR_INREG &&
      TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType());
  if (!CanTryShuffle)
    return SDValue();

  SDValue Op(N, 0);
  return combineX86ShufflesRecursively(Op, DAG, Subtarget);
}
/// Target hook entry point for X86-specific DAG combines.
///
/// Dispatches on the opcode of \p N to the matching static combine helper and
/// returns whatever that helper returns. Opcodes without a case fall through
/// to the end and yield an empty SDValue. Helpers receive the SelectionDAG
/// taken from \p DCI, and \p DCI / Subtarget only when their signatures ask
/// for them.
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
switch (N->getOpcode()) {
default: break;
case ISD::SCALAR_TO_VECTOR:
return combineScalarToVector(N, DAG);
case ISD::EXTRACT_VECTOR_ELT:
case X86ISD::PEXTRW:
case X86ISD::PEXTRB:
return combineExtractVectorElt(N, DAG, DCI, Subtarget);
case ISD::CONCAT_VECTORS:
return combineConcatVectors(N, DAG, DCI, Subtarget);
case ISD::INSERT_SUBVECTOR:
return combineInsertSubvector(N, DAG, DCI, Subtarget);
case ISD::EXTRACT_SUBVECTOR:
return combineExtractSubvector(N, DAG, DCI, Subtarget);
case ISD::VSELECT:
case ISD::SELECT:
case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
case X86ISD::CMP: return combineCMP(N, DAG);
case ISD::ADD: return combineAdd(N, DAG, Subtarget);
case ISD::SUB: return combineSub(N, DAG, Subtarget);
case X86ISD::ADD:
case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
case X86ISD::SBB: return combineSBB(N, DAG);
case X86ISD::ADC: return combineADC(N, DAG, DCI);
case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
case ISD::SHL: return combineShiftLeft(N, DAG);
case ISD::SRA: return combineShiftRightArithmetic(N, DAG);
case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI);
case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
case X86ISD::BEXTR: return combineBEXTR(N, DAG, DCI, Subtarget);
case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
case ISD::FADD:
case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
// FXOR and FOR share one helper.
case X86ISD::FXOR:
case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
case X86ISD::FMIN:
case X86ISD::FMAX: return combineFMinFMax(N, DAG);
case ISD::FMINNUM:
case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
case X86ISD::CVTSI2P:
case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
case X86ISD::CVTP2SI:
case X86ISD::CVTP2UI:
case X86ISD::CVTTP2SI:
case X86ISD::CVTTP2UI: return combineCVTP2I_CVTTP2I(N, DAG, DCI);
case X86ISD::BT: return combineBT(N, DAG, DCI);
// ANY_EXTEND is routed through the zero-extend helper.
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
case ISD::ANY_EXTEND_VECTOR_INREG:
case ISD::SIGN_EXTEND_VECTOR_INREG:
case ISD::ZERO_EXTEND_VECTOR_INREG: return combineExtInVec(N, DAG, DCI,
Subtarget);
case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
case X86ISD::PACKSS:
case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
// Vector shifts: the VSHL/VSRA/VSRL group and the VSHLI/VSRAI/VSRLI group
// have separate helpers.
case X86ISD::VSHL:
case X86ISD::VSRA:
case X86ISD::VSRL:
return combineVectorShiftVar(N, DAG, DCI, Subtarget);
case X86ISD::VSHLI:
case X86ISD::VSRAI:
case X86ISD::VSRLI:
return combineVectorShiftImm(N, DAG, DCI, Subtarget);
case X86ISD::PINSRB:
case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
case X86ISD::SHUFP: // Handle all target specific shuffles
case X86ISD::INSERTPS:
case X86ISD::EXTRQI:
case X86ISD::INSERTQI:
case X86ISD::PALIGNR:
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
case X86ISD::BLENDI:
case X86ISD::UNPCKH:
case X86ISD::UNPCKL:
case X86ISD::MOVHLPS:
case X86ISD::MOVLHPS:
case X86ISD::PSHUFB:
case X86ISD::PSHUFD:
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
case X86ISD::MOVSHDUP:
case X86ISD::MOVSLDUP:
case X86ISD::MOVDDUP:
case X86ISD::MOVSS:
case X86ISD::MOVSD:
case X86ISD::VBROADCAST:
case X86ISD::VPPERM:
case X86ISD::VPERMI:
case X86ISD::VPERMV:
case X86ISD::VPERMV3:
case X86ISD::VPERMIL2:
case X86ISD::VPERMILPI:
case X86ISD::VPERMILPV:
case X86ISD::VPERM2X128:
case X86ISD::SHUF128:
case X86ISD::VZEXT_MOVL:
case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
// Fused multiply-add family, including the generic ISD::FMA node.
case X86ISD::FMADD_RND:
case X86ISD::FMSUB:
case X86ISD::FMSUB_RND:
case X86ISD::FNMADD:
case X86ISD::FNMADD_RND:
case X86ISD::FNMSUB:
case X86ISD::FNMSUB_RND:
case ISD::FMA: return combineFMA(N, DAG, Subtarget);
case X86ISD::FMADDSUB_RND:
case X86ISD::FMSUBADD_RND:
case X86ISD::FMADDSUB:
case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, Subtarget);
case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI);
// Both the target-specific and the generic gather/scatter nodes are handled
// by the same helper.
case X86ISD::MGATHER:
case X86ISD::MSCATTER:
case ISD::MGATHER:
case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget);
case X86ISD::PCMPEQ:
case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
case X86ISD::PMULDQ:
case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI);
}
// No combine registered for this opcode.
return SDValue();
}
/// Return true if performing operation \p Opc in type \p VT is considered
/// desirable. Illegal types never are; among legal types, vXi8 shifts-left,
/// i8 multiply/shift-left and a fixed list of i16 operations are rejected.
bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
  if (!isTypeLegal(VT))
    return false;

  const bool IsShl = Opc == ISD::SHL;

  // There are no vXi8 shifts.
  if (IsShl && VT.isVector() && VT.getVectorElementType() == MVT::i8)
    return false;

  // TODO: Almost no 8-bit ops are desirable because they have no actual
  // size/speed advantages vs. 32-bit ops, but they do have a major
  // potential disadvantage by causing partial register stalls.
  //
  // An 8-bit multiply/shl is probably no cheaper than the 32-bit form, and the
  // 32-bit form has specializations that turn it into LEA or other ops.
  // See also the comment in "IsDesirableToPromoteOp", which additionally
  // checks for a constant operand to the multiply.
  if (VT == MVT::i8 && (IsShl || Opc == ISD::MUL))
    return false;

  // Every remaining legal type other than i16 is desirable.
  if (VT != MVT::i16)
    return true;

  // i16 instruction encodings are longer and some i16 instructions are slow,
  // so the operations listed here are not desirable in i16.
  switch (Opc) {
  case ISD::LOAD:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:
  case ISD::SUB:
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    return false;
  default:
    return true;
  }
}
/// Expand an indirect jump-table branch. When the module carries the
/// "cf-protection-branch" flag an X86ISD::NT_BRIND node is emitted; otherwise
/// the generic TargetLowering expansion is used.
SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
                                                  SDValue Value, SDValue Addr,
                                                  SelectionDAG &DAG) const {
  const Module *M = DAG.getMachineFunction().getMMI().getModule();

  // Without control-flow branch protection there is nothing target specific
  // to do.
  if (!M->getModuleFlag("cf-protection-branch"))
    return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);

  // With control-flow branch protection enabled the indirect branch needs a
  // notrack prefix. It is represented by an NT_BRIND node, which instruction
  // selection turns into a jmp with the NoTrack prefix.
  return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
}
/// Decide whether \p Op should be promoted to a wider type.
///
/// Only i16 operations and i8 multiplies by a constant are candidates. For
/// those, promotion is declined when the opcode is not one of the handled
/// extend/shift/arithmetic/logic opcodes, or when promoting could block
/// folding a load (or a load/op/store sequence) into the instruction.
///
/// \param Op   the operation under consideration.
/// \param PVT  set to MVT::i32 when the function returns true; left untouched
///             otherwise.
/// \returns true if \p Op should be promoted.
bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
EVT VT = Op.getValueType();
bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
isa<ConstantSDNode>(Op.getOperand(1));
// i16 is legal, but undesirable since i16 instruction encodings are longer
// and some i16 instructions are slow.
// 8-bit multiply-by-constant can usually be expanded to something cheaper
// using LEA and/or other ALU ops.
if (VT != MVT::i16 && !Is8BitMulByConstant)
return false;
// True when Op's only user is a normal store whose base pointer equals the
// base pointer of Load, i.e. load / op / store on one address.
// Load is cast to LoadSDNode unconditionally; every call below guards it with
// MayFoldLoad first.
auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
if (!Op.hasOneUse())
return false;
SDNode *User = *Op->use_begin();
if (!ISD::isNormalStore(User))
return false;
auto *Ld = cast<LoadSDNode>(Load);
auto *St = cast<StoreSDNode>(User);
return Ld->getBasePtr() == St->getBasePtr();
};
// Atomic counterpart: Load is a single-use ATOMIC_LOAD, Op's only user is an
// ATOMIC_STORE, and both use the same base pointer.
auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
return false;
if (!Op.hasOneUse())
return false;
SDNode *User = *Op->use_begin();
if (User->getOpcode() != ISD::ATOMIC_STORE)
return false;
auto *Ld = cast<AtomicSDNode>(Load);
auto *St = cast<AtomicSDNode>(User);
return Ld->getBasePtr() == St->getBasePtr();
};
// Set for the commutative opcodes below; SUB is reached with it still false.
bool Commute = false;
switch (Op.getOpcode()) {
default: return false;
// Extends are always promoted.
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
break;
case ISD::SHL:
case ISD::SRA:
case ISD::SRL: {
SDValue N0 = Op.getOperand(0);
// Look out for (store (shl (load), x)).
if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
return false;
break;
}
case ISD::ADD:
case ISD::MUL:
case ISD::AND:
case ISD::OR:
case ISD::XOR:
Commute = true;
LLVM_FALLTHROUGH;
case ISD::SUB: {
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
// Avoid disabling potential load folding opportunities.
// A foldable load as the second operand blocks promotion unless the op is
// commutative, the first operand is a constant, and the op is either a MUL
// or not a load/op/store on one address.
if (MayFoldLoad(N1) &&
(!Commute || !isa<ConstantSDNode>(N0) ||
(Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
return false;
// A foldable load as the first operand blocks promotion when the op is
// commutative with a non-constant second operand, or when a non-MUL op
// forms a load/op/store on one address.
if (MayFoldLoad(N0) &&
((Commute && !isa<ConstantSDNode>(N1)) ||
(Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
return false;
// Likewise keep atomic load/op/store sequences unpromoted.
if (IsFoldableAtomicRMW(N0, Op) ||
(Commute && IsFoldableAtomicRMW(N1, Op)))
return false;
}
}
// Promote to 32 bits.
PVT = MVT::i32;
return true;
}
/// Return true when a build_vector should be combined into a
/// shuffle + truncate for the (already legal) mask \p ShuffleMask on \p SrcVT.
bool X86TargetLowering::isDesirableToCombineBuildVectorToShuffleTruncate(
    ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {
  assert(SrcVT.getVectorNumElements() == ShuffleMask.size() &&
         "Element count mismatch");
  assert(
      Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) &&
      "Shuffle Mask expected to be legal");

  // For 32-bit elements VPERMD is better than shuffle+truncate.
  // TODO: After we improve lowerBuildVector, add exception for VPERMW.
  const bool Has32BitElts = SrcVT.getScalarSizeInBits() == 32;
  if (Has32BitElts || !Subtarget.hasAVX2())
    return false;

  // Only masks that stay within their 128-bit lane are accepted.
  return !is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask);
}
//===----------------------------------------------------------------------===//
// X86 Inline Assembly Support
//===----------------------------------------------------------------------===//
// Return true when S consists of exactly the given Pieces, in order, where
// consecutive pieces are separated by at least one space or tab. Leading
// blanks before the first piece and trailing blanks after the last one are
// ignored.
static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
  const StringRef Blanks(" \t");

  // Drop leading whitespace.
  StringRef Rest = S.substr(S.find_first_not_of(Blanks));

  for (StringRef Token : Pieces) {
    if (!Rest.startswith(Token))
      return false;
    Rest = Rest.substr(Token.size());

    // A non-blank character right after the token means the token was only a
    // prefix of a longer word.
    const StringRef::size_type FirstNonBlank = Rest.find_first_not_of(Blanks);
    if (FirstNonBlank == 0)
      return false;
    Rest = Rest.substr(FirstNonBlank);
  }

  // Nothing but the pieces may be present.
  return Rest.empty();
}
static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
if (AsmPieces.size() == 3)
return true;
else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
return true;
}
}
return false;
}
/// Recognise a handful of byte-swap idioms written as inline assembly and
/// hand the call to IntrinsicLowering::LowerToByteSwap. Returns false when
/// the inline asm is not one of the recognised patterns; otherwise returns
/// whatever LowerToByteSwap returns.
bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
  const std::string &AsmStr = IA->getAsmString();

  // Only integer results whose width is a multiple of 16 bits are considered.
  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
  if (!Ty || Ty->getBitWidth() % 16 != 0)
    return false;
  const unsigned BitWidth = Ty->getBitWidth();

  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, ";\n");
  const auto NumStatements = AsmPieces.size();

  // The constraint string must begin with "=r,0,".
  auto HasInOutRegPrefix = [IA]() {
    return IA->getConstraintString().compare(0, 5, "=r,0,") == 0;
  };

  // Check that what follows the "=r,0," prefix is just the flag clobbers.
  // Note: this reuses (and overwrites) AsmPieces as scratch storage.
  auto RestIsFlagClobbers = [&AsmPieces, IA]() {
    AsmPieces.clear();
    StringRef ConstraintsStr = IA->getConstraintString();
    SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
    array_pod_sort(AsmPieces.begin(), AsmPieces.end());
    return clobbersFlagRegisters(AsmPieces);
  };

  if (NumStatements == 1) {
    // FIXME: this should verify that we are targeting a 486 or better.  If
    // not, we will turn this bswap into something that will be lowered to
    // logical ops instead of emitting the bswap asm.  For now, we don't
    // support 486 or lower so don't worry about this.
    // bswap $0
    auto IsBSwapOf = [&AsmPieces](const char *Operand) {
      return matchAsm(AsmPieces[0], {"bswap", Operand}) ||
             matchAsm(AsmPieces[0], {"bswapl", Operand}) ||
             matchAsm(AsmPieces[0], {"bswapq", Operand});
    };
    if (IsBSwapOf("$0") || IsBSwapOf("${0:q}")) {
      // No need to check constraints, nothing other than the equivalent of
      // "=r,0" would be valid here.
      return IntrinsicLowering::LowerToByteSwap(CI);
    }

    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
    if (BitWidth == 16 && HasInOutRegPrefix() &&
        (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
         matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
      if (RestIsFlagClobbers())
        return IntrinsicLowering::LowerToByteSwap(CI);
    }
    return false;
  }

  if (NumStatements == 3) {
    // rorw $$8, ${0:w}; rorl $$16, $0; rorw $$8, ${0:w} on a 32-bit value.
    if (BitWidth == 32 && HasInOutRegPrefix() &&
        matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
        matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
        matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
      if (RestIsFlagClobbers())
        return IntrinsicLowering::LowerToByteSwap(CI);
    }

    if (BitWidth == 64) {
      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
      auto HasSingleCode = [&Constraints](unsigned Idx, const char *Code) {
        return Constraints[Idx].Codes.size() == 1 &&
               Constraints[Idx].Codes[0] == Code;
      };
      if (Constraints.size() >= 2 && HasSingleCode(0, "A") &&
          HasSingleCode(1, "0")) {
        // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
        if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
            matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
            matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
          return IntrinsicLowering::LowerToByteSwap(CI);
      }
    }
  }

  return false;
}
/// Translate a flag-output inline-asm constraint of the form "{@cc<cond>}"
/// into the matching X86 condition code.
///
/// \param Constraint  The complete constraint string, braces included.
/// \returns The condition code for a recognised "{@cc<cond>}" string, or
///          X86::COND_INVALID for anything else.
///
/// Several spellings are synonyms and map to the same code (for example
/// "c", "b" and "nae" all yield COND_B; "e" and "z" both yield COND_E).
static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
  return StringSwitch<X86::CondCode>(Constraint)
      .Case("{@cca}", X86::COND_A)
      .Case("{@ccae}", X86::COND_AE)
      .Case("{@ccb}", X86::COND_B)
      .Case("{@ccbe}", X86::COND_BE)
      .Case("{@ccc}", X86::COND_B)
      .Case("{@cce}", X86::COND_E)
      .Case("{@ccz}", X86::COND_E)
      .Case("{@ccg}", X86::COND_G)
      .Case("{@ccge}", X86::COND_GE)
      .Case("{@ccl}", X86::COND_L)
      .Case("{@ccle}", X86::COND_LE)
      .Case("{@ccna}", X86::COND_BE)
      .Case("{@ccnae}", X86::COND_B)
      .Case("{@ccnb}", X86::COND_AE)
      .Case("{@ccnbe}", X86::COND_A)
      .Case("{@ccnc}", X86::COND_AE)
      .Case("{@ccne}", X86::COND_NE)
      .Case("{@ccnz}", X86::COND_NE)
      .Case("{@ccng}", X86::COND_LE)
      .Case("{@ccnge}", X86::COND_L)
      .Case("{@ccnl}", X86::COND_GE)
      .Case("{@ccnle}", X86::COND_G)
      .Case("{@ccno}", X86::COND_NO)
      // "np" is "not parity"; it must not share COND_P with "{@ccp}".
      .Case("{@ccnp}", X86::COND_NP)
      .Case("{@ccns}", X86::COND_NS)
      .Case("{@cco}", X86::COND_O)
      .Case("{@ccp}", X86::COND_P)
      .Case("{@ccs}", X86::COND_S)
      .Default(X86::COND_INVALID);
}
/// Given a constraint letter, return the type of constraint for this target.
///
/// One-character constraints are classified by the first switch below and
/// two-character constraints starting with 'Y' by the nested switch. A
/// constraint of any other length that parseConstraintCode() recognises is
/// reported as C_Other. Whatever is not classified here is forwarded to
/// TargetLowering::getConstraintType().
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
// Letters reported as a class of registers.
case 'R':
case 'q':
case 'Q':
case 'f':
case 't':
case 'u':
case 'y':
case 'x':
case 'v':
case 'Y':
case 'l':
case 'k': // AVX512 masking registers.
return C_RegisterClass;
// Letters reported as a register.
case 'a':
case 'b':
case 'c':
case 'd':
case 'S':
case 'D':
case 'A':
return C_Register;
case 'I':
case 'J':
case 'K':
- case 'L':
- case 'M':
case 'N':
case 'G':
+ case 'L':
+ case 'M':
+ return C_Immediate;
case 'C':
case 'e':
case 'Z':
return C_Other;
default:
break;
}
}
else if (Constraint.size() == 2) {
switch (Constraint[0]) {
default:
break;
// Two-letter constraints of the form "Y?".
case 'Y':
switch (Constraint[1]) {
default:
break;
case 'z':
case '0':
return C_Register;
case 'i':
case 'm':
case 'k':
case 't':
case '2':
return C_RegisterClass;
}
}
} else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
return C_Other;
// Not classified above: defer to the target-independent implementation.
return TargetLowering::getConstraintType(Constraint);
}
/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
///
/// \param info        Operand description; info.CallOperandVal supplies the
///                    operand value and its type. It is also forwarded to the
///                    base implementation for letters not listed here.
/// \param constraint  Constraint string; dispatch is on its first character
///                    (plus the second character for two-letter 'Y' forms).
/// \returns CW_Default when there is no operand value. Otherwise the weight
///          starts as CW_Invalid and is upgraded by the matching case.
TargetLowering::ConstraintWeight
X86TargetLowering::getSingleConstraintMatchWeight(
AsmOperandInfo &info, const char *constraint) const {
ConstraintWeight weight = CW_Invalid;
Value *CallOperandVal = info.CallOperandVal;
// If we don't have a value, we can't do a match,
// but allow it at the lowest weight.
if (!CallOperandVal)
return CW_Default;
Type *type = CallOperandVal->getType();
// Look at the constraint type.
switch (*constraint) {
default:
weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
// NOTE(review): control falls through into the integer-register cases, so
// for an integer operand the generic weight computed above is replaced by
// CW_SpecificReg - confirm this is intended.
LLVM_FALLTHROUGH;
case 'R':
case 'q':
case 'Q':
case 'a':
case 'b':
case 'c':
case 'd':
case 'S':
case 'D':
case 'A':
if (CallOperandVal->getType()->isIntegerTy())
weight = CW_SpecificReg;
break;
// These letters only match floating-point operands.
case 'f':
case 't':
case 'u':
if (type->isFloatingPointTy())
weight = CW_SpecificReg;
break;
// Matches an MMX-typed operand when the subtarget has MMX.
case 'y':
if (type->isX86_MMXTy() && Subtarget.hasMMX())
weight = CW_SpecificReg;
break;
case 'Y': {
unsigned Size = StringRef(constraint).size();
// Pick 'i' as the next char as 'Yi' and 'Y' are synonymous, when matching 'Y'
char NextChar = Size == 2 ? constraint[1] : 'i';
// Constraints longer than two characters are not matched here; weight is
// still CW_Invalid at this point.
if (Size > 2)
break;
switch (NextChar) {
default:
return CW_Invalid;
// XMM0
case 'z':
case '0':
if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1())
return CW_SpecificReg;
return CW_Invalid;
// Conditional OpMask regs (AVX512)
case 'k':
if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
return CW_Register;
return CW_Invalid;
// Any MMX reg
case 'm':
// NOTE(review): weight has not been assigned since its CW_Invalid
// initialisation, so both paths of this case return CW_Invalid - confirm
// whether a register weight was intended here.
if (type->isX86_MMXTy() && Subtarget.hasMMX())
return weight;
return CW_Invalid;
// Any SSE reg when ISA >= SSE2, same as 'Y'
case 'i':
case 't':
case '2':
if (!Subtarget.hasSSE2())
return CW_Invalid;
break;
}
// Fall through (handle "Y" constraint).
LLVM_FALLTHROUGH;
}
case 'v':
// 512-bit operands need AVX512; narrower ones are handled by the 'x' case
// this falls through to.
if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
weight = CW_Register;
LLVM_FALLTHROUGH;
case 'x':
// 128-bit operands need SSE1, 256-bit operands need AVX.
if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
weight = CW_Register;
break;
case 'k':
// Enable conditional vector operations using %k<#> registers.
if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
weight = CW_Register;
break;
// 'I': constant integer with zero-extended value <= 31.
case 'I':
if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
if (C->getZExtValue() <= 31)
weight = CW_Constant;
}
break;
// 'J': constant integer with zero-extended value <= 63.
case 'J':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if (C->getZExtValue() <= 63)
weight = CW_Constant;
}
break;
// 'K': constant integer whose sign-extended value fits in [-0x80, 0x7f].
case 'K':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
weight = CW_Constant;
}
break;
// 'L': constant integer equal to 0xff or 0xffff.
case 'L':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
weight = CW_Constant;
}
break;
// 'M': constant integer with zero-extended value <= 3.
case 'M':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if (C->getZExtValue() <= 3)
weight = CW_Constant;
}
break;
// 'N': constant integer with zero-extended value <= 0xff.
case 'N':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if (C->getZExtValue() <= 0xff)
weight = CW_Constant;
}
break;
// 'G' / 'C': any floating-point constant.
case 'G':
case 'C':
if (isa<ConstantFP>(CallOperandVal)) {
weight = CW_Constant;
}
break;
// 'e': constant integer whose sign-extended value fits in 32 signed bits.
case 'e':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if ((C->getSExtValue() >= -0x80000000LL) &&
(C->getSExtValue() <= 0x7fffffffLL))
weight = CW_Constant;
}
break;
// 'Z': constant integer whose zero-extended value fits in 32 unsigned bits.
case 'Z':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if (C->getZExtValue() <= 0xffffffff)
weight = CW_Constant;
}
break;
}
return weight;
}
/// Replace an 'X' constraint (which matches anything) with a more specific
/// constraint chosen from the type of the corresponding operand.
///
/// Floating-point operands become "Y" when SSE2 is available and "x" when
/// only SSE1 is available. Every other case is delegated to
/// TargetLowering::LowerXConstraint().
const char *X86TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
  if (!ConstraintVT.isFloatingPoint())
    return TargetLowering::LowerXConstraint(ConstraintVT);

  // Prefer the SSE register constraints for FP values when SSE is present.
  if (Subtarget.hasSSE2())
    return "Y";
  if (Subtarget.hasSSE1())
    return "x";

  // No SSE at all: let the generic code pick the constraint.
  return TargetLowering::LowerXConstraint(ConstraintVT);
}
// Lower a "{@cc<cond>}" flag output of an inline asm by reading EFLAGS and
// materialising the requested condition with a setcc.
//
// Returns an empty SDValue when the constraint is not a flag output. Chain
// and Flag are updated in place: Flag always becomes the EFLAGS copy, Chain
// is only advanced when the copy was glued to an incoming Flag.
SDValue X86TargetLowering::LowerAsmOutputForConstraint(
    SDValue &Chain, SDValue &Flag, SDLoc DL, const AsmOperandInfo &OpInfo,
    SelectionDAG &DAG) const {
  const X86::CondCode CondToTest = parseConstraintCode(OpInfo.ConstraintCode);
  if (CondToTest == X86::COND_INVALID)
    return SDValue();

  // The output must be a scalar integer of at least 8 bits.
  const auto &ResultVT = OpInfo.ConstraintVT;
  const bool IsScalarInt = !ResultVT.isVector() && ResultVT.isInteger();
  if (!IsScalarInt || ResultVT.getSizeInBits() < 8)
    report_fatal_error("Flag output operand is of invalid type");

  // Read EFLAGS. The chain is only updated when the copy is glued.
  if (!Flag.getNode()) {
    Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
  } else {
    Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag);
    Chain = Flag.getValue(1);
  }

  // Turn the condition into a value and zero-extend it to the output type.
  SDValue SetCC = getSETCC(CondToTest, Flag, DL, DAG);
  return DAG.getNode(ISD::ZERO_EXTEND, DL, ResultVT, SetCC);
}
/// Lower the specified operand into the Ops vector.
/// If it is invalid, don't add anything to Ops.
///
/// \param Op          The operand to lower.
/// \param Constraint  The constraint string; only single-character
///                    constraints are handled, longer ones add nothing.
/// \param Ops         Receives the lowered operand on success.
/// \param DAG         Used to build the target constant nodes.
///
/// Each immediate-letter case either sets Result and breaks out of the switch
/// (success) or returns without touching Ops (operand rejected). Letters not
/// listed, and the 'i' case when Op is not a constant, end up in
/// TargetLowering::LowerAsmOperandForConstraint().
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
std::string &Constraint,
std::vector<SDValue>&Ops,
SelectionDAG &DAG) const {
SDValue Result;
// Only support length 1 constraints for now.
if (Constraint.length() > 1) return;
char ConstraintLetter = Constraint[0];
switch (ConstraintLetter) {
default: break;
// 'I': constant with zero-extended value <= 31.
case 'I':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 31) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
// 'J': constant with zero-extended value <= 63.
case 'J':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 63) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
// 'K': constant whose sign-extended value fits in a signed 8-bit integer.
case 'K':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (isInt<8>(C->getSExtValue())) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
// 'L': constant equal to 0xff or 0xffff, or 0xffffffff on 64-bit subtargets.
case 'L':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
(Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
// NOTE(review): this case builds the constant from getSExtValue() while
// the neighbouring cases use getZExtValue() - confirm this is intended.
Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
// 'M': constant with zero-extended value <= 3.
case 'M':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 3) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
// 'N': constant with zero-extended value <= 255.
case 'N':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 255) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
// 'O': constant with zero-extended value <= 127.
case 'O':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 127) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
case 'e': {
// 32-bit signed value
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
C->getSExtValue())) {
// Widen to 64 bits here to get it sign extended.
Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
break;
}
// FIXME gcc accepts some relocatable values here too, but only in certain
// memory models; it's complicated.
}
return;
}
case 'Z': {
// 32-bit unsigned value
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
C->getZExtValue())) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
// FIXME gcc accepts some relocatable values here too, but only in certain
// memory models; it's complicated.
return;
}
case 'i': {
// Literal immediates are always ok.
if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
// 1-bit constants are extended according to the target's boolean contents;
// every other constant is sign-extended. The result is always an i64
// target constant.
bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
BooleanContent BCont = getBooleanContents(MVT::i64);
ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
: ISD::SIGN_EXTEND;
int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
: CST->getSExtValue();
Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
break;
}
// In any sort of PIC mode addresses need to be computed at runtime by
// adding in a register or some sort of table lookup. These can't
// be used as immediates.
if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
return;
// If we are in non-pic codegen mode, we allow the address of a global (with
// an optional displacement) to be used with 'i'.
if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
// If we require an extra load to get this address, as in PIC mode, we
// can't accept it.
if (isGlobalStubReference(
Subtarget.classifyGlobalReference(GA->getGlobal())))
return;
// Result is still empty here, so the generic implementation below handles
// the operand.
break;
}
}
if (Result.getNode()) {
Ops.push_back(Result);
return;
}
return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
/// Check if \p RC is a general purpose register class.
/// I.e., GR* or one of their variant.
static bool isGRClass(const TargetRegisterClass &RC) {
return RC.hasSuperClassEq(&X86::GR8RegClass) ||
RC.hasSuperClassEq(&X86::GR16RegClass) ||
RC.hasSuperClassEq(&X86::GR32RegClass) ||
RC.hasSuperClassEq(&X86::GR64RegClass) ||
RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
}
/// Check if \p RC is a vector register class.
/// I.e., FR* / VR* or one of their variant.
static bool isFRClass(const TargetRegisterClass &RC) {
return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
RC.hasSuperClassEq(&X86::FR64XRegClass) ||
RC.hasSuperClassEq(&X86::VR128XRegClass) ||
RC.hasSuperClassEq(&X86::VR256XRegClass) ||
RC.hasSuperClassEq(&X86::VR512RegClass);
}
/// Check if \p RC is a mask register class.
/// I.e., VK* or one of their variant.
static bool isVKClass(const TargetRegisterClass &RC) {
return RC.hasSuperClassEq(&X86::VK1RegClass) ||
RC.hasSuperClassEq(&X86::VK2RegClass) ||
RC.hasSuperClassEq(&X86::VK4RegClass) ||
RC.hasSuperClassEq(&X86::VK8RegClass) ||
RC.hasSuperClassEq(&X86::VK16RegClass) ||
RC.hasSuperClassEq(&X86::VK32RegClass) ||
RC.hasSuperClassEq(&X86::VK64RegClass);
}
/// Map an inline-asm register constraint to a (physical register, register
/// class) pair for a value of type \p VT.
///
/// The lookup proceeds in stages:
///  1. one-letter GCC constraints and two-letter "Y?" constraints are mapped
///     directly to a register class (and, for 'A' and "Yz"/"Y0", to a
///     specific register);
///  2. flag-output constraints recognised by parseConstraintCode() get
///     GR32;
///  3. everything else goes through the generic TargetLowering lookup, whose
///     answer is then checked against the subtarget (64-bit-only and
///     AVX512-only registers) and adjusted so that the register / class
///     matches \p VT.
///
/// \param TRI         Register info used for encoding and type-legality
///                    queries.
/// \param Constraint  The constraint string.
/// \param VT          Type of the operand; MVT::Other is used for clobbers.
/// \returns The chosen pair. A first member of 0 together with a class
///          presumably means "any register of that class" (TODO confirm
///          against callers); (0, nullptr) is returned on the error paths.
std::pair<unsigned, const TargetRegisterClass *>
X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
MVT VT) const {
// First, see if this is a constraint that directly corresponds to an LLVM
// register class.
if (Constraint.size() == 1) {
// GCC Constraint Letters
switch (Constraint[0]) {
default: break;
// 'A' means [ER]AX + [ER]DX.
case 'A':
if (Subtarget.is64Bit())
return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
"Expecting 64, 32 or 16 bit subtarget");
return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
// TODO: Slight differences here in allocation order and leaving
// RIP in the class. Do they matter any more here than they do
// in the normal allocation?
case 'k':
// Mask registers: i1/i8/i16 need AVX512, i32/i64 need BWI.
if (Subtarget.hasAVX512()) {
if (VT == MVT::i1)
return std::make_pair(0U, &X86::VK1RegClass);
if (VT == MVT::i8)
return std::make_pair(0U, &X86::VK8RegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::VK16RegClass);
}
if (Subtarget.hasBWI()) {
if (VT == MVT::i32)
return std::make_pair(0U, &X86::VK32RegClass);
if (VT == MVT::i64)
return std::make_pair(0U, &X86::VK64RegClass);
}
break;
case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
if (Subtarget.is64Bit()) {
if (VT == MVT::i32 || VT == MVT::f32)
return std::make_pair(0U, &X86::GR32RegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16RegClass);
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8RegClass);
if (VT == MVT::i64 || VT == MVT::f64)
return std::make_pair(0U, &X86::GR64RegClass);
break;
}
LLVM_FALLTHROUGH;
// 32-bit fallthrough
case 'Q': // Q_REGS
if (VT == MVT::i32 || VT == MVT::f32)
return std::make_pair(0U, &X86::GR32_ABCDRegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16_ABCDRegClass);
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
if (VT == MVT::i64)
return std::make_pair(0U, &X86::GR64_ABCDRegClass);
break;
case 'r': // GENERAL_REGS
case 'l': // INDEX_REGS
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8RegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16RegClass);
if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
return std::make_pair(0U, &X86::GR32RegClass);
return std::make_pair(0U, &X86::GR64RegClass);
case 'R': // LEGACY_REGS
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8_NOREXRegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16_NOREXRegClass);
if (VT == MVT::i32 || !Subtarget.is64Bit())
return std::make_pair(0U, &X86::GR32_NOREXRegClass);
return std::make_pair(0U, &X86::GR64_NOREXRegClass);
case 'f': // FP Stack registers.
// If SSE is enabled for this VT, use f80 to ensure the isel moves the
// value to the correct fpstack register class.
if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
return std::make_pair(0U, &X86::RFP32RegClass);
if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
return std::make_pair(0U, &X86::RFP64RegClass);
return std::make_pair(0U, &X86::RFP80RegClass);
case 'y': // MMX_REGS if MMX allowed.
if (!Subtarget.hasMMX()) break;
return std::make_pair(0U, &X86::VR64RegClass);
case 'Y': // SSE_REGS if SSE2 allowed
if (!Subtarget.hasSSE2()) break;
LLVM_FALLTHROUGH;
case 'v':
case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
if (!Subtarget.hasSSE1()) break;
// Only the 'v' constraint may select the extended (…X) register classes,
// and for the sub-512-bit types only when VLX is available.
bool VConstraint = (Constraint[0] == 'v');
switch (VT.SimpleTy) {
default: break;
// Scalar SSE types.
case MVT::f32:
case MVT::i32:
if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::FR32XRegClass);
return std::make_pair(0U, &X86::FR32RegClass);
case MVT::f64:
case MVT::i64:
if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::FR64XRegClass);
return std::make_pair(0U, &X86::FR64RegClass);
// TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
// Vector types.
case MVT::v16i8:
case MVT::v8i16:
case MVT::v4i32:
case MVT::v2i64:
case MVT::v4f32:
case MVT::v2f64:
if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::VR128XRegClass);
return std::make_pair(0U, &X86::VR128RegClass);
// AVX types.
case MVT::v32i8:
case MVT::v16i16:
case MVT::v8i32:
case MVT::v4i64:
case MVT::v8f32:
case MVT::v4f64:
if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::VR256XRegClass);
if (Subtarget.hasAVX())
return std::make_pair(0U, &X86::VR256RegClass);
break;
// 512-bit types require AVX512.
case MVT::v8f64:
case MVT::v16f32:
case MVT::v16i32:
case MVT::v8i64:
if (!Subtarget.hasAVX512()) break;
if (VConstraint)
return std::make_pair(0U, &X86::VR512RegClass);
return std::make_pair(0U, &X86::VR512_0_15RegClass);
}
break;
}
} else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
switch (Constraint[1]) {
default:
break;
// "Yi", "Yt" and "Y2" are handled exactly like plain "Y".
case 'i':
case 't':
case '2':
return getRegForInlineAsmConstraint(TRI, "Y", VT);
case 'm':
if (!Subtarget.hasMMX()) break;
return std::make_pair(0U, &X86::VR64RegClass);
// "Yz" / "Y0" name XMM0 specifically.
case 'z':
case '0':
if (!Subtarget.hasSSE1()) break;
return std::make_pair(X86::XMM0, &X86::VR128RegClass);
case 'k':
// This register class doesn't allocate k0 for masked vector operation.
if (Subtarget.hasAVX512()) {
if (VT == MVT::i1)
return std::make_pair(0U, &X86::VK1WMRegClass);
if (VT == MVT::i8)
return std::make_pair(0U, &X86::VK8WMRegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::VK16WMRegClass);
}
if (Subtarget.hasBWI()) {
if (VT == MVT::i32)
return std::make_pair(0U, &X86::VK32WMRegClass);
if (VT == MVT::i64)
return std::make_pair(0U, &X86::VK64WMRegClass);
}
break;
}
}
// Flag-output constraints ("{@cc<cond>}") are given a GR32 register.
if (parseConstraintCode(Constraint) != X86::COND_INVALID)
return std::make_pair(0U, &X86::GR32RegClass);
// Use the default implementation in TargetLowering to convert the register
// constraint into a member of a register class.
std::pair<unsigned, const TargetRegisterClass*> Res;
Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
// Not found as a standard register?
if (!Res.second) {
// Map st(0) -> st(7) -> ST0
if (Constraint.size() == 7 && Constraint[0] == '{' &&
tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
Constraint[3] == '(' &&
(Constraint[4] >= '0' && Constraint[4] <= '7') &&
Constraint[5] == ')' && Constraint[6] == '}') {
// st(7) is not allocatable and thus not a member of RFP80. Return
// singleton class in cases where we have a reference to it.
if (Constraint[4] == '7')
return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
return std::make_pair(X86::FP0 + Constraint[4] - '0',
&X86::RFP80RegClass);
}
// GCC allows "st(0)" to be called just plain "st".
if (StringRef("{st}").equals_lower(Constraint))
return std::make_pair(X86::FP0, &X86::RFP80RegClass);
// flags -> EFLAGS
if (StringRef("{flags}").equals_lower(Constraint))
return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
// dirflag -> DF
if (StringRef("{dirflag}").equals_lower(Constraint))
return std::make_pair(X86::DF, &X86::DFCCRRegClass);
// fpsr -> FPSW
if (StringRef("{fpsr}").equals_lower(Constraint))
return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
// Nothing matched: hand back the (empty) generic result.
return Res;
}
// Make sure it isn't a register that requires 64-bit mode.
if (!Subtarget.is64Bit() &&
(isFRClass(*Res.second) || isGRClass(*Res.second)) &&
TRI->getEncodingValue(Res.first) >= 8) {
// Register requires REX prefix, but we're in 32-bit mode.
return std::make_pair(0, nullptr);
}
// Make sure it isn't a register that requires AVX512.
if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
TRI->getEncodingValue(Res.first) & 0x10) {
// Register requires EVEX prefix.
return std::make_pair(0, nullptr);
}
// Otherwise, check to see if this is a register class of the wrong value
// type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
// turn into {ax},{dx}.
// MVT::Other is used to specify clobber names.
if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
return Res; // Correct type already, nothing to do.
// Get a matching integer of the correct size. i.e. "ax" with MVT::i32 should
// return "eax". This should even work for things like getting 64bit integer
// registers when given an f64 type.
const TargetRegisterClass *Class = Res.second;
// The generic code will match the first register class that contains the
// given register. Thus, based on the ordering of the tablegened file,
// the "plain" GR classes might not come first.
// Therefore, use a helper method.
if (isGRClass(*Class)) {
unsigned Size = VT.getSizeInBits();
// 1-bit values are given an 8-bit register.
if (Size == 1) Size = 8;
unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
if (DestReg > 0) {
bool is64Bit = Subtarget.is64Bit();
// Outside 64-bit mode the NOREX classes are used, and no class exists for
// 64-bit values (handled by the register-pair switch below).
const TargetRegisterClass *RC =
Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
: Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
: Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
: Size == 64 ? (is64Bit ? &X86::GR64RegClass : nullptr)
: nullptr;
if (Size == 64 && !is64Bit) {
// Model GCC's behavior here and select a fixed pair of 32-bit
// registers.
switch (DestReg) {
case X86::RAX:
return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
case X86::RDX:
return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
case X86::RCX:
return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
case X86::RBX:
return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
case X86::RSI:
return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
case X86::RDI:
return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
case X86::RBP:
return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
default:
return std::make_pair(0, nullptr);
}
}
if (RC && RC->contains(DestReg))
return std::make_pair(DestReg, RC);
// The resized register is not in the expected class: keep the generic
// result unchanged.
return Res;
}
// No register found/type mismatch.
return std::make_pair(0, nullptr);
} else if (isFRClass(*Class)) {
// Handle references to XMM physical registers that got mapped into the
// wrong class. This can happen with constraints like {xmm0} where the
// target independent register mapper will just pick the first match it can
// find, ignoring the required type.
// TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
if (VT == MVT::f32 || VT == MVT::i32)
Res.second = &X86::FR32XRegClass;
else if (VT == MVT::f64 || VT == MVT::i64)
Res.second = &X86::FR64XRegClass;
else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
Res.second = &X86::VR128XRegClass;
else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
Res.second = &X86::VR256XRegClass;
else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
Res.second = &X86::VR512RegClass;
else {
// Type mismatch and not a clobber: Return an error;
Res.first = 0;
Res.second = nullptr;
}
} else if (isVKClass(*Class)) {
// Pick the mask register class that matches the integer width of VT.
if (VT == MVT::i1)
Res.second = &X86::VK1RegClass;
else if (VT == MVT::i8)
Res.second = &X86::VK8RegClass;
else if (VT == MVT::i16)
Res.second = &X86::VK16RegClass;
else if (VT == MVT::i32)
Res.second = &X86::VK32RegClass;
else if (VT == MVT::i64)
Res.second = &X86::VK64RegClass;
else {
// Type mismatch and not a clobber: Return an error;
Res.first = 0;
Res.second = nullptr;
}
}
return Res;
}
/// Return the cost of the scaling factor used by addressing mode \p AM for an
/// access of type \p Ty in address space \p AS: 0 when no scaled register is
/// used, 1 when one is, and -1 when the addressing mode is not legal.
int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
                                            const AddrMode &AM, Type *Ty,
                                            unsigned AS) const {
  // Scaling factors are not free at all.
  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
  // will take 2 allocations in the out of order engine instead of 1
  // for plain addressing mode, i.e. inst (reg1).
  // E.g.,
  // vaddps (%rsi,%rdx), %ymm0, %ymm1
  // Requires two allocations (one for the load, one for the computation)
  // whereas:
  // vaddps (%rsi), %ymm0, %ymm1
  // Requires just 1 allocation, i.e., freeing allocations for other operations
  // and having less micro operations to execute.
  //
  // For some X86 architectures, this is even worse because for instance for
  // stores, the complex addressing mode forces the instruction to use the
  // "load" ports instead of the dedicated "store" port.
  // E.g., on Haswell:
  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
  if (!isLegalAddressingMode(DL, AM, Ty, AS))
    return -1;

  // Scale represents reg2 * scale, thus account for 1
  // as soon as we use a second register.
  const bool UsesScaledReg = AM.Scale != 0;
  return UsesScaledReg;
}
/// Report whether an integer division of type \p VT should be treated as
/// cheap for a function with attributes \p Attr. That is only the case for
/// scalar divisions in functions marked MinSize.
bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
  // The exception to the size argument below is vector division. Since x86
  // doesn't have vector integer division, leaving the division as-is is a
  // loss even in terms of size, because it will have to be scalarized, while
  // the alternative code sequence can be performed in vector form.
  if (VT.isVector())
    return false;

  // Integer division on x86 is expensive. However, when aggressively
  // optimizing for code size, we prefer to use a div instruction, as it is
  // usually smaller than the alternative sequence.
  return Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
}
/// Mark the function containing \p Entry as using split callee-saved
/// registers. Nothing is done on non-64-bit subtargets.
void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  if (Subtarget.is64Bit()) {
    // Update IsSplitCSR in X86MachineFunctionInfo.
    auto *FuncInfo = Entry->getParent()->getInfo<X86MachineFunctionInfo>();
    FuncInfo->setIsSplitCSR(true);
  }
}
/// For every callee-saved register that is preserved via copy, insert a copy
/// into a fresh virtual register at the top of \p Entry and a copy back into
/// the physical register before the first terminator of each block in
/// \p Exits.
void X86TargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  auto *MF = Entry->getParent();
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();

  // Zero-terminated list of registers to handle; may be absent.
  const MCPhysReg *Cursor = TRI->getCalleeSavedRegsViaCopy(MF);
  if (!Cursor)
    return;

  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo &RegInfo = MF->getRegInfo();
  MachineBasicBlock::iterator InsertPt = Entry->begin();

  while (*Cursor) {
    const MCPhysReg CSR = *Cursor;

    // Only 64-bit general purpose registers are expected here.
    if (!X86::GR64RegClass.contains(CSR))
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    const TargetRegisterClass *RC = &X86::GR64RegClass;

    unsigned SavedVR = RegInfo.createVirtualRegister(RC);

    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(MF->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(CSR);
    BuildMI(*Entry, InsertPt, DebugLoc(), TII->get(TargetOpcode::COPY),
            SavedVR)
        .addReg(CSR);

    // Insert the copy-back instructions right before the terminator.
    for (auto *ExitBB : Exits)
      BuildMI(*ExitBB, ExitBB->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), CSR)
          .addReg(SavedVR);

    ++Cursor;
  }
}
/// Reports swifterror support, which this target ties to the subtarget being
/// 64-bit.
bool X86TargetLowering::supportSwiftError() const {
  const bool Is64Bit = Subtarget.is64Bit();
  return Is64Bit;
}
/// Selects the symbol that should be called to probe the stack for \p MF.
/// \returns the symbol name, or an empty string when no stack probe applies.
StringRef
X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
  const auto &F = MF.getFunction();

  // An explicit "probe-stack" attribute always wins; its value is the symbol.
  if (F.hasFnAttribute("probe-stack"))
    return F.getFnAttribute("probe-stack").getValueAsString();

  // Outside of Windows the platform ABI generally has no stack probes, so none
  // are emitted. The function can also opt out via "no-stack-arg-probe".
  const bool IsProbingTarget =
      Subtarget.isOSWindows() && !Subtarget.isTargetMachO();
  if (!IsProbingTarget || F.hasFnAttribute("no-stack-arg-probe"))
    return "";

  // The Windows ABI requires a probe; the symbol depends on the bitness and on
  // whether this is a Cygwin/MinGW target.
  const bool IsCygMing = Subtarget.isTargetCygMing();
  if (Subtarget.is64Bit())
    return IsCygMing ? "___chkstk_ms" : "__chkstk";
  return IsCygMing ? "_alloca" : "_chkstk";
}
Index: vendor/llvm/dist-release_90/lib/Transforms/InstCombine/InstCombineCompares.cpp
===================================================================
--- vendor/llvm/dist-release_90/lib/Transforms/InstCombine/InstCombineCompares.cpp (revision 351302)
+++ vendor/llvm/dist-release_90/lib/Transforms/InstCombine/InstCombineCompares.cpp (revision 351303)
@@ -1,5743 +1,5752 @@
//===- InstCombineCompares.cpp --------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the visitICmp and visitFCmp functions.
//
//===----------------------------------------------------------------------===//
#include "InstCombineInternal.h"
#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;
using namespace PatternMatch;
#define DEBUG_TYPE "instcombine"
// Statistic counter NumSel: how many times is a select replaced by one of its
// operands? It is declared here with the description "Number of select opts".
// NOTE(review): no increment site is visible in this part of the file.
STATISTIC(NumSel, "Number of select opts");
/// Stores In1 + In2 into \p Result, using signed addition when \p IsSigned is
/// set and unsigned addition otherwise.
/// \returns true if the addition overflowed for this type.
static bool addWithOverflow(APInt &Result, const APInt &In1,
                            const APInt &In2, bool IsSigned = false) {
  bool DidOverflow;
  Result = IsSigned ? In1.sadd_ov(In2, DidOverflow)
                    : In1.uadd_ov(In2, DidOverflow);
  return DidOverflow;
}
/// Stores In1 - In2 into \p Result, using signed subtraction when \p IsSigned
/// is set and unsigned subtraction otherwise.
/// \returns true if the subtraction overflowed for this type.
static bool subWithOverflow(APInt &Result, const APInt &In1,
                            const APInt &In2, bool IsSigned = false) {
  bool DidOverflow;
  Result = IsSigned ? In1.ssub_ov(In2, DidOverflow)
                    : In1.usub_ov(In2, DidOverflow);
  return DidOverflow;
}
/// Returns true if at least one user of the given icmp instruction is a
/// branch instruction.
static bool hasBranchUse(ICmpInst &I) {
  bool FoundBranch = false;
  // Walk the user list and stop at the first branch.
  for (auto UI = I.user_begin(), UE = I.user_end(); UI != UE && !FoundBranch;
       ++UI)
    FoundBranch = isa<BranchInst>(*UI);
  return FoundBranch;
}
/// Given an exploded icmp (predicate plus constant right-hand side), decide
/// whether the comparison only inspects the sign bit of the left-hand side.
///
/// For the predicates handled here, \p TrueIfSigned is set to whether the
/// comparison is true when the sign bit of the input is set; it is written
/// even when the function then returns false. For any other predicate it is
/// left untouched.
/// \returns true if the comparison is a pure sign-bit check.
static bool isSignBitCheck(ICmpInst::Predicate Pred, const APInt &RHS,
                           bool &TrueIfSigned) {
  // LHS s< 0
  if (Pred == ICmpInst::ICMP_SLT) {
    TrueIfSigned = true;
    return RHS.isNullValue();
  }
  // LHS s<= -1
  if (Pred == ICmpInst::ICMP_SLE) {
    TrueIfSigned = true;
    return RHS.isAllOnesValue();
  }
  // LHS s> -1
  if (Pred == ICmpInst::ICMP_SGT) {
    TrueIfSigned = false;
    return RHS.isAllOnesValue();
  }
  // LHS u> (high-bit-mask - 1)
  if (Pred == ICmpInst::ICMP_UGT) {
    TrueIfSigned = true;
    return RHS.isMaxSignedValue();
  }
  // LHS u>= high-bit-mask (2^7, 2^15, 2^31, etc)
  if (Pred == ICmpInst::ICMP_UGE) {
    TrueIfSigned = true;
    return RHS.isSignMask();
  }
  return false;
}
/// Returns true if the exploded icmp can be expressed as a signed comparison
/// against zero. When that requires a different predicate, \p Pred is updated
/// to it (s< 1 becomes s<= 0, s> -1 becomes s>= 0); otherwise \p Pred is left
/// unchanged. The signedness of the comparison is preserved.
/// TODO: Refactor with decomposeBitTestICmp()?
static bool isSignTest(ICmpInst::Predicate &Pred, const APInt &C) {
  // Only signed predicates qualify.
  if (!ICmpInst::isSigned(Pred))
    return false;

  // Already compared against zero: nothing to rewrite.
  if (C.isNullValue())
    return ICmpInst::isRelational(Pred);

  // The "is one" test takes precedence over the "all ones" test, which matters
  // when both hold for the same constant.
  const bool IsOne = C.isOneValue();
  if (IsOne && Pred == ICmpInst::ICMP_SLT) {
    Pred = ICmpInst::ICMP_SLE;
    return true;
  }
  if (!IsOne && C.isAllOnesValue() && Pred == ICmpInst::ICMP_SGT) {
    Pred = ICmpInst::ICMP_SGE;
    return true;
  }
  return false;
}
/// Given a signed integer type and a set of known zero and one bits, compute
/// the maximum and minimum values that could have the specified known zero and
/// known one bits, returning them in Min/Max.
/// TODO: Move to method on KnownBits struct?
static void computeSignedMinMaxValuesFromKnownBits(const KnownBits &Known,
APInt &Min, APInt &Max) {
assert(Known.getBitWidth() == Min.getBitWidth() &&
Known.getBitWidth() == Max.getBitWidth() &&
"KnownZero, KnownOne and Min, Max must have equal bitwidth.");
APInt UnknownBits = ~(Known.Zero|Known.One);
// The minimum value is when all unknown bits are zeros, EXCEPT for the sign
// bit if it is unknown.
Min = Known.One;
Max = Known.One|UnknownBits;
if (UnknownBits.isNegative()) { // Sign bit is unknown
Min.setSignBit();
Max.clearSignBit();
}
}
/// Given an unsigned integer type and a set of known zero and one bits, compute
/// the maximum and minimum values that could have the specified known zero and
/// known one bits, returning them in Min/Max.
/// TODO: Move to method on KnownBits struct?
static void computeUnsignedMinMaxValuesFromKnownBits(const KnownBits &Known,
APInt &Min, APInt &Max) {
assert(Known.getBitWidth() == Min.getBitWidth() &&
Known.getBitWidth() == Max.getBitWidth() &&
"Ty, KnownZero, KnownOne and Min, Max must have equal bitwidth.");
APInt UnknownBits = ~(Known.Zero|Known.One);
// The minimum value is when the unknown bits are all zeros.
Min = Known.One;
// The maximum value is when the unknown bits are all ones.
Max = Known.One|UnknownBits;
}
/// This is called when we see this pattern:
///   cmp pred (load (gep GV, ...)), cmpcst
/// where GV is a global variable with a constant initializer. Try to simplify
/// this into some simple computation that does not need the load. For example
/// we can optimize "icmp eq (load (gep "foo", 0, i)), 0" into "icmp eq i, 3".
///
/// If AndCst is non-null, then the loaded value is masked with that constant
/// before doing the comparison. This handles cases like "A[i]&4 == 0".
///
/// \param GEP    Address computation of the load; only the shape
///               "gep GV, 0, i {, constant indices}" with a non-constant i is
///               handled.
/// \param GV     Global whose initializer is inspected; it must be a
///               ConstantArray or ConstantDataArray.
/// \param ICI    The comparison being folded; operand 1 is used as the
///               constant to compare each element against.
/// \param AndCst Optional mask applied to each element before comparing.
/// \returns the instruction that replaces the comparison, or null if none of
///          the supported patterns matched.
Instruction *InstCombiner::foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
                                                        GlobalVariable *GV,
                                                        CmpInst &ICI,
                                                        ConstantInt *AndCst) {
  Constant *Init = GV->getInitializer();
  if (!isa<ConstantArray>(Init) && !isa<ConstantDataArray>(Init))
    return nullptr;

  uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
  // Don't blow up on huge arrays.
  if (ArrayElementCount > MaxArraySizeForCombine)
    return nullptr;

  // There are many forms of this optimization we can handle, for now, just do
  // the simple index into a single-dimensional array.
  //
  // Require: GEP GV, 0, i {{, constant indices}}
  if (GEP->getNumOperands() < 3 ||
      !isa<ConstantInt>(GEP->getOperand(1)) ||
      !cast<ConstantInt>(GEP->getOperand(1))->isZero() ||
      isa<Constant>(GEP->getOperand(2)))
    return nullptr;

  // Check that indices after the variable are constants and in-range for the
  // type they index. Collect the indices. This is typically for arrays of
  // structs.
  SmallVector<unsigned, 4> LaterIndices;

  Type *EltTy = Init->getType()->getArrayElementType();
  for (unsigned i = 3, e = GEP->getNumOperands(); i != e; ++i) {
    ConstantInt *Idx = dyn_cast<ConstantInt>(GEP->getOperand(i));
    if (!Idx) return nullptr; // Variable index.

    uint64_t IdxVal = Idx->getZExtValue();
    if ((unsigned)IdxVal != IdxVal) return nullptr; // Too large array index.

    if (StructType *STy = dyn_cast<StructType>(EltTy))
      EltTy = STy->getElementType(IdxVal);
    else if (ArrayType *ATy = dyn_cast<ArrayType>(EltTy)) {
      if (IdxVal >= ATy->getNumElements()) return nullptr;
      EltTy = ATy->getElementType();
    } else {
      return nullptr; // Unknown type.
    }

    LaterIndices.push_back(IdxVal);
  }

  enum { Overdefined = -3, Undefined = -2 };

  // Variables for our state machines.

  // FirstTrueElement/SecondTrueElement - Used to emit a comparison of the form
  // "i == 47 | i == 87", where 47 is the first index the condition is true for,
  // and 87 is the second (and last) index. FirstTrueElement is -2 when
  // undefined, otherwise set to the first true element. SecondTrueElement is
  // -2 when undefined, -3 when overdefined and >= 0 when that index is true.
  int FirstTrueElement = Undefined, SecondTrueElement = Undefined;

  // FirstFalseElement/SecondFalseElement - Used to emit a comparison of the
  // form "i != 47 & i != 87". Same state transitions as for true elements.
  int FirstFalseElement = Undefined, SecondFalseElement = Undefined;

  /// TrueRangeEnd/FalseRangeEnd - In conjunction with First*Element, these
  /// define a state machine that triggers for ranges of values that the index
  /// is true or false for. This triggers on things like "abbbbc"[i] == 'b'.
  /// This is -2 when undefined, -3 when overdefined, and otherwise the last
  /// index in the range (inclusive). We use -2 for undefined here because we
  /// use relative comparisons and don't want 0-1 to match -1.
  int TrueRangeEnd = Undefined, FalseRangeEnd = Undefined;

  // MagicBitvector - This is a magic bitvector where we set a bit if the
  // comparison is true for element 'i'. If there are 64 elements or less in
  // the array, this will fully represent all the comparison results.
  uint64_t MagicBitvector = 0;

  // Scan the array and see if one of our patterns matches.
  Constant *CompareRHS = cast<Constant>(ICI.getOperand(1));
  for (unsigned i = 0, e = ArrayElementCount; i != e; ++i) {
    Constant *Elt = Init->getAggregateElement(i);
    if (!Elt) return nullptr;

    // If this is indexing an array of structures, get the structure element.
    if (!LaterIndices.empty())
      Elt = ConstantExpr::getExtractValue(Elt, LaterIndices);

    // If the element is masked, handle it.
    if (AndCst) Elt = ConstantExpr::getAnd(Elt, AndCst);

    // Find out if the comparison would be true or false for the i'th element.
    Constant *C = ConstantFoldCompareInstOperands(ICI.getPredicate(), Elt,
                                                  CompareRHS, DL, &TLI);
    // If the result is undef for this element, ignore it.
    if (isa<UndefValue>(C)) {
      // Extend range state machines to cover this element in case there is an
      // undef in the middle of the range.
      if (TrueRangeEnd == (int)i-1)
        TrueRangeEnd = i;
      if (FalseRangeEnd == (int)i-1)
        FalseRangeEnd = i;
      continue;
    }

    // If we can't compute the result for any of the elements, we have to give
    // up evaluating the entire conditional.
    if (!isa<ConstantInt>(C)) return nullptr;

    // Otherwise, we know if the comparison is true or false for this element,
    // update our state machines.
    bool IsTrueForElt = !cast<ConstantInt>(C)->isZero();

    // State machine for single/double/range index comparison.
    if (IsTrueForElt) {
      // Update the TrueElement state machine.
      if (FirstTrueElement == Undefined)
        FirstTrueElement = TrueRangeEnd = i; // First true element.
      else {
        // Update double-compare state machine.
        if (SecondTrueElement == Undefined)
          SecondTrueElement = i;
        else
          SecondTrueElement = Overdefined;

        // Update range state machine.
        if (TrueRangeEnd == (int)i-1)
          TrueRangeEnd = i;
        else
          TrueRangeEnd = Overdefined;
      }
    } else {
      // Update the FalseElement state machine.
      if (FirstFalseElement == Undefined)
        FirstFalseElement = FalseRangeEnd = i; // First false element.
      else {
        // Update double-compare state machine.
        if (SecondFalseElement == Undefined)
          SecondFalseElement = i;
        else
          SecondFalseElement = Overdefined;

        // Update range state machine.
        if (FalseRangeEnd == (int)i-1)
          FalseRangeEnd = i;
        else
          FalseRangeEnd = Overdefined;
      }
    }

    // If this element is in range, update our magic bitvector.
    if (i < 64 && IsTrueForElt)
      MagicBitvector |= 1ULL << i;

    // If all of our states become overdefined, bail out early. Only check
    // every 8 elements, and only once we are past the 64 elements the magic
    // bitvector can represent. This is only really useful for really huge
    // arrays.
    if ((i & 7) == 0 && i >= 64 && SecondTrueElement == Overdefined &&
        SecondFalseElement == Overdefined && TrueRangeEnd == Overdefined &&
        FalseRangeEnd == Overdefined)
      return nullptr;
  }

  // Now that we've scanned the entire array, emit our new comparison(s). We
  // order the state machines in complexity of the generated code.
  Value *Idx = GEP->getOperand(2);

  // If the index is larger than the pointer size of the target, truncate the
  // index down like the GEP would do implicitly. We don't have to do this for
  // an inbounds GEP because the index can't be out of range.
  if (!GEP->isInBounds()) {
    Type *IntPtrTy = DL.getIntPtrType(GEP->getType());
    unsigned PtrSize = IntPtrTy->getIntegerBitWidth();
    if (Idx->getType()->getPrimitiveSizeInBits() > PtrSize)
      Idx = Builder.CreateTrunc(Idx, IntPtrTy);
  }

  // If the comparison is only true for one or two elements, emit direct
  // comparisons.
  if (SecondTrueElement != Overdefined) {
    // None true -> false.
    if (FirstTrueElement == Undefined)
      return replaceInstUsesWith(ICI, Builder.getFalse());

    Value *FirstTrueIdx = ConstantInt::get(Idx->getType(), FirstTrueElement);

    // True for one element -> 'i == 47'.
    if (SecondTrueElement == Undefined)
      return new ICmpInst(ICmpInst::ICMP_EQ, Idx, FirstTrueIdx);

    // True for two elements -> 'i == 47 | i == 72'.
    Value *C1 = Builder.CreateICmpEQ(Idx, FirstTrueIdx);
    Value *SecondTrueIdx = ConstantInt::get(Idx->getType(), SecondTrueElement);
    Value *C2 = Builder.CreateICmpEQ(Idx, SecondTrueIdx);
    return BinaryOperator::CreateOr(C1, C2);
  }

  // If the comparison is only false for one or two elements, emit direct
  // comparisons.
  if (SecondFalseElement != Overdefined) {
    // None false -> true.
    if (FirstFalseElement == Undefined)
      return replaceInstUsesWith(ICI, Builder.getTrue());

    Value *FirstFalseIdx = ConstantInt::get(Idx->getType(), FirstFalseElement);

    // False for one element -> 'i != 47'.
    if (SecondFalseElement == Undefined)
      return new ICmpInst(ICmpInst::ICMP_NE, Idx, FirstFalseIdx);

    // False for two elements -> 'i != 47 & i != 72'.
    Value *C1 = Builder.CreateICmpNE(Idx, FirstFalseIdx);
    Value *SecondFalseIdx = ConstantInt::get(Idx->getType(),SecondFalseElement);
    Value *C2 = Builder.CreateICmpNE(Idx, SecondFalseIdx);
    return BinaryOperator::CreateAnd(C1, C2);
  }

  // If the comparison can be replaced with a range comparison for the elements
  // where it is true, emit the range check.
  if (TrueRangeEnd != Overdefined) {
    assert(TrueRangeEnd != FirstTrueElement && "Should emit single compare");

    // Generate (i-FirstTrue) <u (TrueRangeEnd-FirstTrue+1).
    if (FirstTrueElement) {
      // The offset is negative, so it must be sign-extended (not
      // zero-extended) if the index type is wider than 64 bits.
      Value *Offs = ConstantInt::get(Idx->getType(), -FirstTrueElement,
                                     /*isSigned=*/true);
      Idx = Builder.CreateAdd(Idx, Offs);
    }

    Value *End = ConstantInt::get(Idx->getType(),
                                  TrueRangeEnd-FirstTrueElement+1);
    return new ICmpInst(ICmpInst::ICMP_ULT, Idx, End);
  }

  // False range check.
  if (FalseRangeEnd != Overdefined) {
    assert(FalseRangeEnd != FirstFalseElement && "Should emit single compare");
    // Generate (i-FirstFalse) >u (FalseRangeEnd-FirstFalse).
    if (FirstFalseElement) {
      // As above: the negative offset has to be sign-extended for wide index
      // types.
      Value *Offs = ConstantInt::get(Idx->getType(), -FirstFalseElement,
                                     /*isSigned=*/true);
      Idx = Builder.CreateAdd(Idx, Offs);
    }

    Value *End = ConstantInt::get(Idx->getType(),
                                  FalseRangeEnd-FirstFalseElement);
    return new ICmpInst(ICmpInst::ICMP_UGT, Idx, End);
  }

  // The magic bitvector only records the results for elements 0..63, so it
  // captures the entire comparison state only for arrays of at most 64
  // elements. For larger arrays, using it would silently treat every element
  // past index 63 as "false".
  if (ArrayElementCount > 64)
    return nullptr;

  // If a magic bitvector captures the entire comparison state
  // of this load, replace it with computation that does:
  //   ((magic_cst >> i) & 1) != 0
  {
    Type *Ty = nullptr;

    // Look for an appropriate type:
    // - The type of Idx if the magic fits
    // - The smallest fitting legal type
    if (ArrayElementCount <= Idx->getType()->getIntegerBitWidth())
      Ty = Idx->getType();
    else
      Ty = DL.getSmallestLegalIntType(Init->getContext(), ArrayElementCount);

    if (Ty) {
      Value *V = Builder.CreateIntCast(Idx, Ty, false);
      V = Builder.CreateLShr(ConstantInt::get(Ty, MagicBitvector), V);
      V = Builder.CreateAnd(ConstantInt::get(Ty, 1), V);
      return new ICmpInst(ICmpInst::ICMP_NE, V, ConstantInt::get(Ty, 0));
    }
  }

  return nullptr;
}
/// Return a value that can be used to compare the *offset* implied by a GEP to
/// zero. For example, if we have &A[i], we want to return 'i' for
/// "icmp ne i, 0". Note that, in general, indices can be complex, and scales
/// are involved. The above expression would also be legal to codegen as
/// "icmp ne (i*4), 0" (assuming A is a pointer to i32).
/// This latter form is less amenable to optimization though, and we are allowed
/// to generate the first by knowing that pointer arithmetic doesn't overflow.
///
/// \param GEP The GEP (instruction or constant expression) to analyze.
/// \param IC  Combiner whose Builder is used to emit any needed casts/adds.
/// \param DL  Data layout used for struct offsets, type sizes and the
///            pointer-sized integer type.
/// \returns the value to compare against zero, or null if we can't emit an
///          optimized form for this expression (all indices constant, more
///          than one variable index, a zero-sized variable-index element, or a
///          constant offset that is not a multiple of the variable scale).
static Value *evaluateGEPOffsetExpression(User *GEP, InstCombiner &IC,
                                          const DataLayout &DL) {
  gep_type_iterator GTI = gep_type_begin(GEP);

  // Check to see if this gep only has a single variable index. If so, and if
  // any constant indices are a multiple of its scale, then we can compute this
  // in terms of the scale of the variable index. For example, if the GEP
  // implies an offset of "12 + i*4", then we can codegen this as "3 + i",
  // because the expression will cross zero at the same point.
  unsigned i, e = GEP->getNumOperands();
  int64_t Offset = 0;

  // Adds the byte offset contributed by the constant index CI, which must be
  // the index GTI currently points at, to Offset.
  auto AccumulateConstantIndex = [&](ConstantInt *CI) {
    if (CI->isZero())
      return;

    // Handle a struct index, which adds its field offset to the pointer.
    if (StructType *STy = GTI.getStructTypeOrNull()) {
      Offset += DL.getStructLayout(STy)->getElementOffset(CI->getZExtValue());
    } else {
      uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType());
      Offset += Size*CI->getSExtValue();
    }
  };

  // Compute the aggregate offset of the leading constant indices and stop at
  // the first variable index.
  for (i = 1; i != e; ++i, ++GTI) {
    ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i));
    if (!CI)
      break; // Found our variable index.
    AccumulateConstantIndex(CI);
  }

  // If there are no variable indices, we must have a constant offset, just
  // evaluate it the general way.
  if (i == e) return nullptr;

  Value *VariableIdx = GEP->getOperand(i);
  // Determine the scale factor of the variable element. For example, this is
  // 4 if the variable index is into an array of i32.
  uint64_t VariableScale = DL.getTypeAllocSize(GTI.getIndexedType());

  // A zero-sized element makes the variable index irrelevant to the offset:
  // returning the index would be wrong, and the divisibility check below would
  // divide by zero. Let the caller compute the offset the general way.
  if (VariableScale == 0)
    return nullptr;

  // Verify that there are no other variable indices. If so, emit the hard way.
  for (++i, ++GTI; i != e; ++i, ++GTI) {
    ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i));
    if (!CI) return nullptr;

    // Compute the aggregate offset of constant indices.
    AccumulateConstantIndex(CI);
  }

  // Okay, we know we have a single variable index, which must be a
  // pointer/array/vector index. If there is no offset, life is simple, return
  // the index.
  Type *IntPtrTy = DL.getIntPtrType(GEP->getOperand(0)->getType());
  unsigned IntPtrWidth = IntPtrTy->getIntegerBitWidth();
  if (Offset == 0) {
    // Cast to intptrty in case a truncation occurs. If an extension is needed,
    // we don't need to bother extending: the extension won't affect where the
    // computation crosses zero.
    if (VariableIdx->getType()->getPrimitiveSizeInBits() > IntPtrWidth) {
      VariableIdx = IC.Builder.CreateTrunc(VariableIdx, IntPtrTy);
    }
    return VariableIdx;
  }

  // Otherwise, there is an index. The computation we will do will be modulo
  // the pointer size.
  Offset = SignExtend64(Offset, IntPtrWidth);
  VariableScale = SignExtend64(VariableScale, IntPtrWidth);

  // To do this transformation, any constant index must be a multiple of the
  // variable scale factor. For example, we can evaluate "12 + 4*i" as "3 + i",
  // but we can't evaluate "10 + 3*i" in terms of i. Check that the offset is a
  // multiple of the variable scale.
  int64_t NewOffs = Offset / (int64_t)VariableScale;
  if (Offset != NewOffs*(int64_t)VariableScale)
    return nullptr;

  // Okay, we can do this evaluation. Start by converting the index to intptr.
  if (VariableIdx->getType() != IntPtrTy)
    VariableIdx = IC.Builder.CreateIntCast(VariableIdx, IntPtrTy,
                                           true /*Signed*/);
  Constant *OffsetVal = ConstantInt::get(IntPtrTy, NewOffs);
  return IC.Builder.CreateAdd(VariableIdx, OffsetVal, "offset");
}
/// Returns true if we can rewrite Start as a GEP with pointer Base
/// and some integer offset. The nodes that need to be re-written
/// for this transformation will be added to Explored.
///
/// Only no-op inttoptr/ptrtoint casts, single-index inbounds GEPs that keep
/// the type of \p Start, and PHI nodes are looked through; anything else that
/// is reached (other than \p Base) makes this return false. The search also
/// gives up once \p Explored holds 100 values.
static bool canRewriteGEPAsOffset(Value *Start, Value *Base,
                                  const DataLayout &DL,
                                  SetVector<Value *> &Explored) {
  SmallVector<Value *, 16> WorkList(1, Start);
  Explored.insert(Base);

  // The following traversal gives us an order which can be used
  // when doing the final transformation. Since in the final
  // transformation we create the PHI replacement instructions first,
  // we don't have to get them in any particular order.
  //
  // However, for other instructions we will have to traverse the
  // operands of an instruction first, which means that we have to
  // do a post-order traversal.
  while (!WorkList.empty()) {
    SetVector<PHINode *> PHIs;

    while (!WorkList.empty()) {
      if (Explored.size() >= 100)
        return false;

      Value *V = WorkList.back();

      if (Explored.count(V) != 0) {
        WorkList.pop_back();
        continue;
      }

      if (!isa<IntToPtrInst>(V) && !isa<PtrToIntInst>(V) &&
          !isa<GetElementPtrInst>(V) && !isa<PHINode>(V))
        // We've found some value that we can't explore which is different from
        // the base. Therefore we can't do this transformation.
        return false;

      if (isa<IntToPtrInst>(V) || isa<PtrToIntInst>(V)) {
        // The isa<> checks above guarantee this is a cast instruction, so use
        // the asserting cast<> rather than an unchecked dyn_cast<>.
        auto *CI = cast<CastInst>(V);
        if (!CI->isNoopCast(DL))
          return false;

        if (Explored.count(CI->getOperand(0)) == 0)
          WorkList.push_back(CI->getOperand(0));
      }

      if (auto *GEP = dyn_cast<GEPOperator>(V)) {
        // We're limiting the GEP to having one index. This will preserve
        // the original pointer type. We could handle more cases in the
        // future.
        if (GEP->getNumIndices() != 1 || !GEP->isInBounds() ||
            GEP->getType() != Start->getType())
          return false;

        if (Explored.count(GEP->getOperand(0)) == 0)
          WorkList.push_back(GEP->getOperand(0));
      }

      // If nothing was pushed for V, all of its operands have been handled.
      if (WorkList.back() == V) {
        WorkList.pop_back();
        // We've finished visiting this node, mark it as such.
        Explored.insert(V);
      }

      if (auto *PN = dyn_cast<PHINode>(V)) {
        // We cannot transform PHIs on unsplittable basic blocks.
        if (isa<CatchSwitchInst>(PN->getParent()->getTerminator()))
          return false;
        Explored.insert(PN);
        PHIs.insert(PN);
      }
    }

    // Explore the PHI nodes further.
    for (auto *PN : PHIs)
      for (Value *Op : PN->incoming_values())
        if (Explored.count(Op) == 0)
          WorkList.push_back(Op);
  }

  // Make sure that we can do this. Since we can't insert GEPs in a basic
  // block before a PHI node, we can't easily do this transformation if
  // we have PHI node users of transformed instructions.
  //
  // NOTE(review): this iterates uses() into a `Value *`. That conversion
  // appears to yield the used value (Val itself) rather than the user, which
  // would make this loop unable to ever return false. Confirm whether users()
  // was intended before relying on this check.
  for (Value *Val : Explored) {
    for (Value *Use : Val->uses()) {

      auto *PHI = dyn_cast<PHINode>(Use);
      auto *Inst = dyn_cast<Instruction>(Val);

      if (Inst == Base || Inst == PHI || !Inst || !PHI ||
          Explored.count(PHI) == 0)
        continue;

      if (PHI->getParent() == Inst->getParent())
        return false;
    }
  }
  return true;
}
// Positions Builder where a replacement instruction for V can be added (if
// that is possible):
//  - PHI node: the first insertion point of its block;
//  - other instruction: at the instruction itself when Before is set,
//    otherwise at the instruction following it;
//  - argument: the first insertion point of the function's entry block;
//  - constant: the insertion point is left unchanged.
static void setInsertionPoint(IRBuilder<> &Builder, Value *V,
                              bool Before = true) {
  if (auto *Inst = dyn_cast<Instruction>(V)) {
    Instruction *InsertAt;
    if (isa<PHINode>(Inst))
      InsertAt = &*Inst->getParent()->getFirstInsertionPt();
    else if (Before)
      InsertAt = Inst;
    else
      InsertAt = &*std::next(Inst->getIterator());
    Builder.SetInsertPoint(InsertAt);
    return;
  }

  if (auto *Arg = dyn_cast<Argument>(V)) {
    // Arguments have no position of their own; use the entry block.
    Builder.SetInsertPoint(
        &*Arg->getParent()->getEntryBlock().getFirstInsertionPt());
    return;
  }

  // Anything else has to be a constant, which needs no insertion point.
  assert(isa<Constant>(V) && "Setting insertion point for unknown value!");
}
/// Returns a re-written value of Start as an indexed GEP using Base as a
/// pointer.
///
/// Every value in \p Explored other than \p Base gets an integer "index"
/// counterpart (tracked in a local map), a new inbounds GEP off \p Base is
/// emitted for it, and all uses of the original value are redirected to that
/// GEP. The value returned is the index counterpart of \p Start.
static Value *rewriteGEPAsOffset(Value *Start, Value *Base,
const DataLayout &DL,
SetVector<Value *> &Explored) {
// Perform all the substitutions. This is a bit tricky because we can
// have cycles in our use-def chains.
// 1. Create the PHI nodes without any incoming values.
// 2. Create all the other values.
// 3. Add the edges for the PHI nodes.
// 4. Emit GEPs to get the original pointers.
// 5. Remove the original instructions.
// NOTE(review): step 5 is not visible in this function; the originals only
// have their uses replaced here. Presumably they are cleaned up elsewhere.
// Integer type used for all the index values; its width is the index size
// the data layout reports for Start's type.
Type *IndexType = IntegerType::get(
Base->getContext(), DL.getIndexTypeSizeInBits(Start->getType()));
// Maps an original value to its index replacement. Base is at index zero.
DenseMap<Value *, Value *> NewInsts;
NewInsts[Base] = ConstantInt::getNullValue(IndexType);
// Create the new PHI nodes, without adding any incoming values.
for (Value *Val : Explored) {
if (Val == Base)
continue;
// Create empty phi nodes. This avoids cyclic dependencies when creating
// the remaining instructions.
if (auto *PHI = dyn_cast<PHINode>(Val))
NewInsts[PHI] = PHINode::Create(IndexType, PHI->getNumIncomingValues(),
PHI->getName() + ".idx", PHI);
}
IRBuilder<> Builder(Base->getContext());
// Create all the other instructions.
for (Value *Val : Explored) {
// Skip values that already have a replacement (Base and the PHIs).
if (NewInsts.find(Val) != NewInsts.end())
continue;
// A cast simply shares the index value of its operand.
if (auto *CI = dyn_cast<CastInst>(Val)) {
// Don't get rid of the intermediate variable here; the store can grow
// the map which will invalidate the reference to the input value.
Value *V = NewInsts[CI->getOperand(0)];
NewInsts[CI] = V;
continue;
}
if (auto *GEP = dyn_cast<GEPOperator>(Val)) {
// Use the rewritten index if there is one, otherwise the GEP's own index.
// Note that operator[] inserts a null entry for an index that was not in
// the map yet.
Value *Index = NewInsts[GEP->getOperand(1)] ? NewInsts[GEP->getOperand(1)]
: GEP->getOperand(1);
setInsertionPoint(Builder, GEP);
// Indices might need to be sign extended. GEPs will magically do
// this, but we need to do it ourselves here.
if (Index->getType()->getScalarSizeInBits() !=
NewInsts[GEP->getOperand(0)]->getType()->getScalarSizeInBits()) {
Index = Builder.CreateSExtOrTrunc(
Index, NewInsts[GEP->getOperand(0)]->getType(),
GEP->getOperand(0)->getName() + ".sext");
}
// The new index is (index of the GEP's pointer operand) + Index; the add
// is skipped when the pointer operand's index is the constant zero.
auto *Op = NewInsts[GEP->getOperand(0)];
if (isa<ConstantInt>(Op) && cast<ConstantInt>(Op)->isZero())
NewInsts[GEP] = Index;
else
NewInsts[GEP] = Builder.CreateNSWAdd(
Op, Index, GEP->getOperand(0)->getName() + ".add");
continue;
}
if (isa<PHINode>(Val))
continue;
llvm_unreachable("Unexpected instruction type");
}
// Add the incoming values to the PHI nodes.
for (Value *Val : Explored) {
if (Val == Base)
continue;
// All the instructions have been created, we can now add edges to the
// phi nodes.
if (auto *PHI = dyn_cast<PHINode>(Val)) {
PHINode *NewPhi = static_cast<PHINode *>(NewInsts[PHI]);
for (unsigned I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
// Use the index replacement of the incoming value when one exists,
// otherwise keep the original incoming value.
Value *NewIncoming = PHI->getIncomingValue(I);
if (NewInsts.find(NewIncoming) != NewInsts.end())
NewIncoming = NewInsts[NewIncoming];
NewPhi->addIncoming(NewIncoming, PHI->getIncomingBlock(I));
}
}
}
// Re-materialize each explored value as a GEP off Base (plus a cast back to
// the original type when that is not a pointer) and redirect its uses.
for (Value *Val : Explored) {
if (Val == Base)
continue;
// Depending on the type, for external users we have to emit
// a GEP or a GEP + ptrtoint.
setInsertionPoint(Builder, Val, false);
// If required, create an inttoptr instruction for Base.
Value *NewBase = Base;
if (!Base->getType()->isPointerTy())
NewBase = Builder.CreateBitOrPointerCast(Base, Start->getType(),
Start->getName() + "to.ptr");
Value *GEP = Builder.CreateInBoundsGEP(
Start->getType()->getPointerElementType(), NewBase,
makeArrayRef(NewInsts[Val]), Val->getName() + ".ptr");
if (!Val->getType()->isPointerTy()) {
Value *Cast = Builder.CreatePointerCast(GEP, Val->getType(),
Val->getName() + ".conv");
GEP = Cast;
}
Val->replaceAllUsesWith(GEP);
}
// The caller gets the index (offset from Base) that corresponds to Start.
return NewInsts[Start];
}
/// Peels single-constant-index inbounds GEPs and no-op inttoptr/ptrtoint
/// casts off \p V in order to express it as a constant indexed GEP.
/// \returns a pair holding the remaining pointer and the accumulated constant
///          index.
static std::pair<Value *, Value *>
getAsConstantIndexedAddress(Value *V, const DataLayout &DL) {
  Type *IndexType = IntegerType::get(V->getContext(),
                                     DL.getIndexTypeSizeInBits(V->getType()));

  Constant *Index = ConstantInt::getNullValue(IndexType);
  for (;;) {
    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
      // Only inbounds GEPs are accepted, to exclude the possibility of
      // overflow; they must also have exactly one, constant, index.
      const bool CanPeel = GEP->isInBounds() &&
                           GEP->hasAllConstantIndices() &&
                           GEP->getNumIndices() == 1 &&
                           GEP->getType() == V->getType();
      if (!CanPeel)
        break;

      Constant *GEPIndex = static_cast<Constant *>(GEP->getOperand(1));
      Index = ConstantExpr::getAdd(
          Index, ConstantExpr::getSExtOrBitCast(GEPIndex, IndexType));
      V = GEP->getOperand(0);
      continue;
    }

    if (auto *ToPtr = dyn_cast<IntToPtrInst>(V)) {
      if (ToPtr->isNoopCast(DL)) {
        V = ToPtr->getOperand(0);
        continue;
      }
      break;
    }

    if (auto *ToInt = dyn_cast<PtrToIntInst>(V)) {
      if (ToInt->isNoopCast(DL)) {
        V = ToInt->getOperand(0);
        continue;
      }
      break;
    }

    // Nothing else can be looked through.
    break;
  }
  return {V, Index};
}
/// Converts (CMP GEPLHS, RHS) into a comparison of offsets from a common base
/// pointer, if this change would make RHS a constant. PHIs, GEPs and casts are
/// looked through in order to determine a common base between GEPLHS and RHS.
/// \returns the new comparison, or null if the rewrite is not possible.
static Instruction *transformToIndexedCompare(GEPOperator *GEPLHS, Value *RHS,
                                              ICmpInst::Predicate Cond,
                                              const DataLayout &DL) {
  // The left side must have only constant indices, and both sides must have
  // the same pointer type.
  const bool Eligible = GEPLHS->hasAllConstantIndices() &&
                        GEPLHS->getType() == RHS->getType();
  if (!Eligible)
    return nullptr;

  auto BaseAndIndex = getAsConstantIndexedAddress(GEPLHS, DL);
  Value *PtrBase = BaseAndIndex.first;
  Value *Index = BaseAndIndex.second;

  // The set of nodes that will take part in this transformation.
  SetVector<Value *> Nodes;
  if (!canRewriteGEPAsOffset(RHS, PtrBase, DL, Nodes))
    return nullptr;

  // At this point the comparison can be seen as
  //   (gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2)
  // Only inbounds GEPs were looked through, so neither side can overflow and
  // the comparison reduces to
  //   OFFSET1 cmp OFFSET2
  //
  // rewriteGEPAsOffset replaces RHS and all of its uses with a re-written GEP
  // that has PtrBase as its pointer base, and hands back the offset. Index is
  // the offset of the LHS from that same base, so the offsets are compared
  // instead of the pointers.
  Value *NewRHS = rewriteGEPAsOffset(RHS, PtrBase, DL, Nodes);

  const ICmpInst::Predicate SignedCond = ICmpInst::getSignedPredicate(Cond);
  return new ICmpInst(SignedCond, Index, NewRHS);
}
/// Fold comparisons between a GEP instruction and something else. At this point
/// we know that the GEP is on the LHS of the comparison.
///
/// \param GEPLHS The GEP operator on the left-hand side of the comparison.
/// \param RHS The right-hand operand. Pointer casts are stripped from it unless
/// it is itself a GetElementPtrInst.
/// \param Cond The comparison predicate. Signed predicates are never folded.
/// \param I The compare instruction being simplified.
/// \returns A newly created instruction, the result of replaceInstUsesWith(),
/// whatever transformToIndexedCompare() yields as a last resort, or nullptr
/// for signed predicates.
Instruction *InstCombiner::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
ICmpInst::Predicate Cond,
Instruction &I) {
// Don't transform signed compares of GEPs into index compares. Even if the
// GEP is inbounds, the final add of the base pointer can have signed overflow
// and would change the result of the icmp.
// e.g. "&foo[0] <s &foo[1]" can't be folded to "true" because "foo" could be
// the maximum signed value for the pointer type.
if (ICmpInst::isSigned(Cond))
return nullptr;
// Look through bitcasts and addrspacecasts. We do not however want to remove
// 0 GEPs.
if (!isa<GetElementPtrInst>(RHS))
RHS = RHS->stripPointerCasts();
Value *PtrBase = GEPLHS->getOperand(0);
// Case 1: the GEP is compared against its own base pointer.
if (PtrBase == RHS && GEPLHS->isInBounds()) {
// ((gep Ptr, OFFSET) cmp Ptr) ---> (OFFSET cmp 0).
// This transformation (ignoring the base and scales) is valid because we
// know pointers can't overflow since the gep is inbounds. See if we can
// output an optimized form.
Value *Offset = evaluateGEPOffsetExpression(GEPLHS, *this, DL);
// If not, synthesize the offset the hard way.
if (!Offset)
Offset = EmitGEPOffset(GEPLHS);
// The offset is compared as a signed quantity even though Cond is unsigned.
return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Offset,
Constant::getNullValue(Offset->getType()));
} else if (GEPOperator *GEPRHS = dyn_cast<GEPOperator>(RHS)) {
// Case 2: both sides of the comparison are GEPs.
// If the base pointers are different, but the indices are the same, just
// compare the base pointer.
if (PtrBase != GEPRHS->getOperand(0)) {
// The indices can only be "the same" if both GEPs have the same operand
// count and the same base pointer type.
bool IndicesTheSame = GEPLHS->getNumOperands()==GEPRHS->getNumOperands();
IndicesTheSame &= GEPLHS->getOperand(0)->getType() ==
GEPRHS->getOperand(0)->getType();
if (IndicesTheSame)
for (unsigned i = 1, e = GEPLHS->getNumOperands(); i != e; ++i)
if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) {
IndicesTheSame = false;
break;
}
// If all indices are the same, just compare the base pointers.
// This is only done when comparing the bases produces the same result type
// as the original compare.
Type *BaseType = GEPLHS->getOperand(0)->getType();
if (IndicesTheSame && CmpInst::makeCmpResultType(BaseType) == I.getType())
return new ICmpInst(Cond, GEPLHS->getOperand(0), GEPRHS->getOperand(0));
// If we're comparing GEPs with two base pointers that only differ in type
// and both GEPs have only constant indices or just one use, then fold
// the compare with the adjusted indices.
if (GEPLHS->isInBounds() && GEPRHS->isInBounds() &&
(GEPLHS->hasAllConstantIndices() || GEPLHS->hasOneUse()) &&
(GEPRHS->hasAllConstantIndices() || GEPRHS->hasOneUse()) &&
PtrBase->stripPointerCasts() ==
GEPRHS->getOperand(0)->stripPointerCasts()) {
Value *LOffset = EmitGEPOffset(GEPLHS);
Value *ROffset = EmitGEPOffset(GEPRHS);
// If we looked through an addrspacecast between different sized address
// spaces, the LHS and RHS pointers are different sized
// integers. Truncate to the smaller one.
Type *LHSIndexTy = LOffset->getType();
Type *RHSIndexTy = ROffset->getType();
if (LHSIndexTy != RHSIndexTy) {
if (LHSIndexTy->getPrimitiveSizeInBits() <
RHSIndexTy->getPrimitiveSizeInBits()) {
ROffset = Builder.CreateTrunc(ROffset, LHSIndexTy);
} else
LOffset = Builder.CreateTrunc(LOffset, RHSIndexTy);
}
// The new compare is emitted through the builder, so the original
// instruction is replaced rather than returned as a fresh instruction.
Value *Cmp = Builder.CreateICmp(ICmpInst::getSignedPredicate(Cond),
LOffset, ROffset);
return replaceInstUsesWith(I, Cmp);
}
// Otherwise, the base pointers are different and the indices are
// different. Try convert this to an indexed compare by looking through
// PHIs/casts.
return transformToIndexedCompare(GEPLHS, RHS, Cond, DL);
}
// From here on both GEPs share the same base pointer operand.
// If one of the GEPs has all zero indices, recurse.
// The operands trade places, so the predicate has to be swapped as well.
if (GEPLHS->hasAllZeroIndices())
return foldGEPICmp(GEPRHS, GEPLHS->getOperand(0),
ICmpInst::getSwappedPredicate(Cond), I);
// If the other GEP has all zero indices, recurse.
if (GEPRHS->hasAllZeroIndices())
return foldGEPICmp(GEPLHS, GEPRHS->getOperand(0), Cond, I);
bool GEPsInBounds = GEPLHS->isInBounds() && GEPRHS->isInBounds();
if (GEPLHS->getNumOperands() == GEPRHS->getNumOperands()) {
// If the GEPs only differ by one index, compare it.
unsigned NumDifferences = 0; // Keep track of # differences.
unsigned DiffOperand = 0; // The operand that differs.
for (unsigned i = 1, e = GEPRHS->getNumOperands(); i != e; ++i)
if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) {
if (GEPLHS->getOperand(i)->getType()->getPrimitiveSizeInBits() !=
GEPRHS->getOperand(i)->getType()->getPrimitiveSizeInBits()) {
// Irreconcilable differences.
NumDifferences = 2;
break;
} else {
// Stop scanning as soon as a second difference is seen; the counter is
// then 2 and neither of the cases below applies.
if (NumDifferences++) break;
DiffOperand = i;
}
}
if (NumDifferences == 0) // SAME GEP?
return replaceInstUsesWith(I, // No comparison is needed here.
ConstantInt::get(I.getType(), ICmpInst::isTrueWhenEqual(Cond)));
else if (NumDifferences == 1 && GEPsInBounds) {
Value *LHSV = GEPLHS->getOperand(DiffOperand);
Value *RHSV = GEPRHS->getOperand(DiffOperand);
// Make sure we do a signed comparison here.
return new ICmpInst(ICmpInst::getSignedPredicate(Cond), LHSV, RHSV);
}
}
// Only lower this if the icmp is the only user of the GEP or if we expect
// the result to fold to a constant!
if (GEPsInBounds && (isa<ConstantExpr>(GEPLHS) || GEPLHS->hasOneUse()) &&
(isa<ConstantExpr>(GEPRHS) || GEPRHS->hasOneUse())) {
// ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2) ---> (OFFSET1 cmp OFFSET2)
Value *L = EmitGEPOffset(GEPLHS);
Value *R = EmitGEPOffset(GEPRHS);
return new ICmpInst(ICmpInst::getSignedPredicate(Cond), L, R);
}
}
// Try convert this to an indexed compare by looking through PHIs/casts as a
// last resort.
return transformToIndexedCompare(GEPLHS, RHS, Cond, DL);
}
/// Try to fold an equality comparison between \p Alloca and \p Other to the
/// constant "not equal" answer.
///
/// The fold only happens when a bounded walk over the transitive uses of the
/// alloca (through bitcasts, GEPs, PHIs and selects) finds no use that lets the
/// address escape and finds no more than one icmp. Returns nullptr whenever the
/// walk runs out of budget or meets a use it does not understand.
Instruction *InstCombiner::foldAllocaCmp(ICmpInst &ICI,
                                         const AllocaInst *Alloca,
                                         const Value *Other) {
  assert(ICI.isEquality() && "Cannot fold non-equality comparison.");

  // Folding away a comparison between an alloca and a pointer that is not
  // based on it (an argument, say) is tempting, but "cannot alias" does not
  // mean "cannot compare equal".
  //
  // LLVM does not say where allocas get their memory, though. As long as the
  // alloca does not escape, nobody can guess its address, so we may act as if
  // every such guess is wrong.
  //
  // The walk below verifies that the alloca does not escape and that it feeds
  // only a single comparison (the current instruction). Requiring a single
  // comparison keeps all folds against this alloca trivially consistent and
  // avoids erroneously folding a comparison of the pointer with itself.
  unsigned Budget = 32; // Break cycles and bound to constant-time.
  SmallVector<const Use *, 32> Pending;

  // Queue every use of Producer; report failure once the budget is exhausted.
  auto EnqueueUses = [&](const Value *Producer) {
    for (const Use &Candidate : Producer->uses()) {
      if (Pending.size() >= Budget)
        return false;
      Pending.push_back(&Candidate);
    }
    return true;
  };

  if (!EnqueueUses(Alloca))
    return nullptr;

  unsigned CmpCount = 0;
  while (!Pending.empty()) {
    assert(Pending.size() <= Budget);
    const Use *CurUse = Pending.pop_back_val();
    const Value *UserV = CurUse->getUser();
    --Budget;

    const bool ForwardsPointer =
        isa<BitCastInst>(UserV) || isa<GetElementPtrInst>(UserV) ||
        isa<PHINode>(UserV) || isa<SelectInst>(UserV);

    if (!ForwardsPointer) {
      // Loading from the pointer doesn't escape it.
      if (isa<LoadInst>(UserV))
        continue;

      if (const auto *Store = dyn_cast<StoreInst>(UserV)) {
        // Storing *to* the pointer is fine, but storing the pointer escapes it.
        if (Store->getValueOperand() == CurUse->get())
          return nullptr;
        continue;
      }

      if (isa<ICmpInst>(UserV)) {
        if (CmpCount != 0)
          return nullptr; // Found more than one cmp.
        ++CmpCount;
        continue;
      }

      const auto *Intrin = dyn_cast<IntrinsicInst>(UserV);
      if (!Intrin)
        return nullptr;

      switch (Intrin->getIntrinsicID()) {
      // These intrinsics don't escape or compare the pointer. Memset is safe
      // because we don't allow ptrtoint. Memcpy and memmove are safe because
      // we don't allow stores, so src cannot point to V.
      case Intrinsic::lifetime_start:
      case Intrinsic::lifetime_end:
      case Intrinsic::memcpy:
      case Intrinsic::memmove:
      case Intrinsic::memset:
        continue;
      default:
        return nullptr;
      }
    }

    // The user merely forwards the pointer, so its own uses must be tracked.
    if (!EnqueueUses(UserV))
      return nullptr;
  }

  Type *ResultTy = CmpInst::makeCmpResultType(Other->getType());
  const bool FoldedValue = !CmpInst::isTrueWhenEqual(ICI.getPredicate());
  return replaceInstUsesWith(ICI, ConstantInt::get(ResultTy, FoldedValue));
}
/// Fold "icmp pred (X+C), X" into a compare of X against a single constant.
///
/// \p C must be non-zero. X+C can then never equal X, so "(X+C <= X)" is the
/// same as "(X+C < X)", and likewise for every other "or equal" predicate.
/// Only the relational predicates are expected; the last case asserts that the
/// predicate is sgt/sge.
Instruction *InstCombiner::foldICmpAddOpConst(Value *X, const APInt &C,
                                              ICmpInst::Predicate Pred) {
  assert(!!C && "C should not be zero!");

  Type *OperandTy = X->getType();
  const unsigned Width = C.getBitWidth();

  // (X+1) <u X        --> X >u (MAXUINT-1)        --> X == 255
  // (X+2) <u X        --> X >u (MAXUINT-2)        --> X > 253
  // (X+MAXUINT) <u X  --> X >u (MAXUINT-MAXUINT)  --> X != 0
  const bool IsUnsignedLess =
      Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_ULE;
  if (IsUnsignedLess) {
    Constant *Bound = ConstantInt::get(OperandTy, APInt::getMaxValue(Width) - C);
    return new ICmpInst(ICmpInst::ICMP_UGT, X, Bound);
  }

  // (X+1) >u X        --> X <u (0-1)        --> X != 255
  // (X+2) >u X        --> X <u (0-2)        --> X <u 254
  // (X+MAXUINT) >u X  --> X <u (0-MAXUINT)  --> X <u 1  --> X == 0
  const bool IsUnsignedGreater =
      Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE;
  if (IsUnsignedGreater) {
    Constant *Bound = ConstantInt::get(OperandTy, -C);
    return new ICmpInst(ICmpInst::ICMP_ULT, X, Bound);
  }

  const APInt SignedMax = APInt::getSignedMaxValue(Width);

  // (X+ 1) <s X       --> X >s (MAXSINT-1)          --> X == 127
  // (X+ 2) <s X       --> X >s (MAXSINT-2)          --> X >s 125
  // (X+MAXSINT) <s X  --> X >s (MAXSINT-MAXSINT)    --> X >s 0
  // (X+MINSINT) <s X  --> X >s (MAXSINT-MINSINT)    --> X >s -1
  // (X+ -2) <s X      --> X >s (MAXSINT- -2)        --> X >s 126
  // (X+ -1) <s X      --> X >s (MAXSINT- -1)        --> X != 127
  const bool IsSignedLess =
      Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE;
  if (IsSignedLess) {
    Constant *Bound = ConstantInt::get(OperandTy, SignedMax - C);
    return new ICmpInst(ICmpInst::ICMP_SGT, X, Bound);
  }

  // (X+ 1) >s X       --> X <s (MAXSINT-(1-1))       --> X != 127
  // (X+ 2) >s X       --> X <s (MAXSINT-(2-1))       --> X <s 126
  // (X+MAXSINT) >s X  --> X <s (MAXSINT-(MAXSINT-1)) --> X <s 1
  // (X+MINSINT) >s X  --> X <s (MAXSINT-(MINSINT-1)) --> X <s -2
  // (X+ -2) >s X      --> X <s (MAXSINT-(-2-1))      --> X <s -126
  // (X+ -1) >s X      --> X <s (MAXSINT-(-1-1))      --> X == -128
  assert(Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE);
  Constant *Bound = ConstantInt::get(OperandTy, SignedMax - (C - 1));
  return new ICmpInst(ICmpInst::ICMP_SLT, X, Bound);
}
/// Handle "(icmp eq/ne (ashr/lshr AP2, A), AP1)" ->
///        (icmp eq/ne A, Log2(AP2/AP1)) ->
///        (icmp eq/ne A, Log2(AP2) - Log2(AP1)).
///
/// \p I must be an equality compare whose first operand is the shift. The
/// folds are written for "eq"; for "ne" every produced predicate is inverted.
/// Returns nullptr for the cases that are deliberately left alone.
Instruction *InstCombiner::foldICmpShrConstConst(ICmpInst &I, Value *A,
                                                 const APInt &AP1,
                                                 const APInt &AP2) {
  assert(I.isEquality() && "Cannot fold icmp gt/lt");

  // Don't bother doing any work for cases which InstSimplify handles.
  if (AP2.isNullValue())
    return nullptr;

  const bool IsNE = I.getPredicate() == CmpInst::ICMP_NE;
  Type *ShAmtTy = A->getType();

  // Build "icmp P A, RHS", flipping P when the original compare was "ne".
  auto MakeCmp = [IsNE, A](CmpInst::Predicate P, Constant *RHS) {
    return new ICmpInst(IsNE ? CmpInst::getInversePredicate(P) : P, A, RHS);
  };

  const bool LHSIsAShr = isa<AShrOperator>(I.getOperand(0));
  if (LHSIsAShr &&
      (AP2.isAllOnesValue() || AP2.isNegative() != AP1.isNegative() ||
       AP2.sgt(AP1)))
    return nullptr;

  // 'A' must be large enough to shift out the highest set bit.
  if (AP1.isNullValue())
    return MakeCmp(CmpInst::ICMP_UGT, ConstantInt::get(ShAmtTy, AP2.logBase2()));

  if (AP1 == AP2)
    return MakeCmp(CmpInst::ICMP_EQ, ConstantInt::getNullValue(ShAmtTy));

  // Candidate shift amount: the difference in leading sign bits for a negative
  // ashr, otherwise the difference in leading zeros.
  const int Shift =
      (LHSIsAShr && AP1.isNegative())
          ? AP1.countLeadingOnes() - AP2.countLeadingOnes()
          : AP1.countLeadingZeros() - AP2.countLeadingZeros();

  if (Shift > 0) {
    if (LHSIsAShr && AP1 == AP2.ashr(Shift)) {
      // There are multiple solutions if we are comparing against -1 and the LHS
      // of the ashr is not a power of two.
      const bool ManySolutions = AP1.isAllOnesValue() && !AP2.isPowerOf2();
      return MakeCmp(ManySolutions ? CmpInst::ICMP_UGE : CmpInst::ICMP_EQ,
                     ConstantInt::get(ShAmtTy, Shift));
    }
    if (AP1 == AP2.lshr(Shift))
      return MakeCmp(CmpInst::ICMP_EQ, ConstantInt::get(ShAmtTy, Shift));
  }

  // Shifting const2 will never be equal to const1.
  // FIXME: This should always be handled by InstSimplify?
  return replaceInstUsesWith(I, ConstantInt::get(I.getType(), IsNE));
}
/// Handle "(icmp eq/ne (shl AP2, A), AP1)" ->
///        (icmp eq/ne A, TrailingZeros(AP1) - TrailingZeros(AP2)).
///
/// \p I must be an equality compare. The folds are written for "eq"; for "ne"
/// every produced predicate is inverted. Returns nullptr when AP2 is zero.
Instruction *InstCombiner::foldICmpShlConstConst(ICmpInst &I, Value *A,
                                                 const APInt &AP1,
                                                 const APInt &AP2) {
  assert(I.isEquality() && "Cannot fold icmp gt/lt");

  // Don't bother doing any work for cases which InstSimplify handles.
  if (AP2.isNullValue())
    return nullptr;

  const bool IsNE = I.getPredicate() == CmpInst::ICMP_NE;
  Type *ShAmtTy = A->getType();

  // Build "icmp P A, RHS", flipping P when the original compare was "ne".
  auto MakeCmp = [IsNE, A](CmpInst::Predicate P, Constant *RHS) {
    return new ICmpInst(IsNE ? CmpInst::getInversePredicate(P) : P, A, RHS);
  };

  const unsigned LowZerosOfAP2 = AP2.countTrailingZeros();

  // Comparing against zero: A is compared (uge) against the bit width minus
  // the number of trailing zeros of AP2.
  if (AP1.isNullValue() && LowZerosOfAP2 != 0)
    return MakeCmp(CmpInst::ICMP_UGE,
                   ConstantInt::get(ShAmtTy,
                                    AP2.getBitWidth() - LowZerosOfAP2));

  if (AP1 == AP2)
    return MakeCmp(CmpInst::ICMP_EQ, ConstantInt::getNullValue(ShAmtTy));

  // Get the distance between the lowest bits that are set.
  const int Shift = AP1.countTrailingZeros() - LowZerosOfAP2;
  if (Shift > 0 && AP2.shl(Shift) == AP1)
    return MakeCmp(CmpInst::ICMP_EQ, ConstantInt::get(ShAmtTy, Shift));

  // Shifting const2 will never be equal to const1.
  // FIXME: This should always be handled by InstSimplify?
  return replaceInstUsesWith(I, ConstantInt::get(I.getType(), IsNE));
}
/// The caller has matched a pattern of the form:
///   I = icmp ugt (add (add A, B), CI2), CI1
/// When this is really the idiom
///   sum = a + b
///   if (sum+128 >u 255)
/// the whole thing is rewritten to use llvm.sadd.with.overflow.i8 (or the
/// i16/i32 variant). Returns the instruction that replaces \p I, or nullptr if
/// the pattern does not qualify.
///
static Instruction *processUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B,
                                          ConstantInt *CI2, ConstantInt *CI1,
                                          InstCombiner &IC) {
  // The goal is an llvm.sadd.with.overflow. That requires replacing the
  // original add with a narrower one and dropping the add-with-constant used
  // by the range check; if the latter cannot be eliminated the transform is
  // not profitable. It can only be eliminated when the compare is its sole
  // user.
  auto *BiasedAdd = cast<Instruction>(I.getOperand(0));
  if (!BiasedAdd->hasOneUse())
    return nullptr;

  // If CI2 is 2^7, 2^15, 2^31, then it might be an sadd.with.overflow.
  const APInt &Bias = CI2->getValue();
  if (!Bias.isPowerOf2())
    return nullptr;
  const unsigned BiasLog2 = Bias.countTrailingZeros();
  if (BiasLog2 != 7 && BiasLog2 != 15 && BiasLog2 != 31)
    return nullptr;

  // The width of the new add formed is 1 more than the bias.
  const unsigned NarrowWidth = BiasLog2 + 1;

  // Check to see that CI1 is an all-ones value with NarrowWidth bits.
  const unsigned WideWidth = CI1->getBitWidth();
  if (WideWidth == NarrowWidth ||
      CI1->getValue() != APInt::getLowBitsSet(WideWidth, NarrowWidth))
    return nullptr;

  // This is only really a signed overflow check if the inputs have been
  // sign-extended; check for that condition. For example, if CI2 is 2^31 and
  // the operands of the add are 64 bits wide, we need at least 33 sign bits.
  const unsigned NeededSignBits = WideWidth - NarrowWidth + 1;
  if (IC.ComputeNumSignBits(A, 0, &I) < NeededSignBits ||
      IC.ComputeNumSignBits(B, 0, &I) < NeededSignBits)
    return nullptr;

  // In order to replace the original add with a narrower
  // llvm.sadd.with.overflow, the only uses allowed are the add-with-constant
  // and truncates that discard the high bits of the add. Verify that this is
  // the case.
  auto *InnerAdd = cast<Instruction>(BiasedAdd->getOperand(0));
  for (User *AddUser : InnerAdd->users()) {
    if (AddUser == BiasedAdd)
      continue;

    // Only accept truncates for now. We would really like a nice recursive
    // predicate like SimplifyDemandedBits, but which goes downwards the use-def
    // chain to see which bits of a value are actually demanded. If the
    // original add had another add which was then immediately truncated, we
    // could still do the transformation.
    auto *Trunc = dyn_cast<TruncInst>(AddUser);
    if (!Trunc)
      return nullptr;
    if (Trunc->getType()->getPrimitiveSizeInBits() > NarrowWidth)
      return nullptr;
  }

  // If the pattern matches, truncate the inputs to the narrower type and
  // use the sadd_with_overflow intrinsic to efficiently compute both the
  // result and the overflow bit.
  Type *NarrowTy = IntegerType::get(InnerAdd->getContext(), NarrowWidth);
  Function *SAddFn = Intrinsic::getDeclaration(
      I.getModule(), Intrinsic::sadd_with_overflow, NarrowTy);

  InstCombiner::BuilderTy &Builder = IC.Builder;

  // Put the new code above the original add, in case there are any uses of the
  // add between the add and the compare.
  Builder.SetInsertPoint(InnerAdd);

  Value *NarrowA = Builder.CreateTrunc(A, NarrowTy, A->getName() + ".trunc");
  Value *NarrowB = Builder.CreateTrunc(B, NarrowTy, B->getName() + ".trunc");
  CallInst *SAddCall = Builder.CreateCall(SAddFn, {NarrowA, NarrowB}, "sadd");
  Value *NarrowSum = Builder.CreateExtractValue(SAddCall, 0, "sadd.result");
  Value *WideSum = Builder.CreateZExt(NarrowSum, InnerAdd->getType());

  // The inner add was the result of the narrow add, zero extended to the
  // wider type. Replace it with the result computed by the intrinsic.
  IC.replaceInstUsesWith(*InnerAdd, WideSum);

  // The original icmp gets replaced with the overflow value.
  return ExtractValueInst::Create(SAddCall, 1, "sadd.overflow");
}
/// Handle "icmp pred X, 0".
///
/// Two folds are attempted when the second operand of \p Cmp is zero:
///   * (icmp sgt smin(PosA, B), 0) -> (icmp sgt B, 0)
///   * (icmp eq/ne (urem X, Y), 0) -> (icmp eq/ne X, 0), when X has at most one
///     bit set and Y has at least two bits set.
/// Returns nullptr if neither applies.
Instruction *InstCombiner::foldICmpWithZero(ICmpInst &Cmp) {
  Value *LHS = Cmp.getOperand(0);
  Value *Zero = Cmp.getOperand(1);
  if (!match(Zero, m_Zero()))
    return nullptr;

  const CmpInst::Predicate Pred = Cmp.getPredicate();

  // (icmp sgt smin(PosA, B) 0) -> (icmp sgt B 0)
  if (Pred == ICmpInst::ICMP_SGT) {
    Value *MinA, *MinB;
    if (matchSelectPattern(LHS, MinA, MinB).Flavor == SPF_SMIN) {
      // Whichever side is known positive cannot decide the result; compare the
      // other side instead.
      if (isKnownPositive(MinA, DL, 0, &AC, &Cmp, &DT))
        return new ICmpInst(Pred, MinB, Zero);
      if (isKnownPositive(MinB, DL, 0, &AC, &Cmp, &DT))
        return new ICmpInst(Pred, MinA, Zero);
    }
  }

  // Given:
  //   icmp eq/ne (urem %x, %y), 0
  // Iff %x has 0 or 1 bits set, and %y has at least 2 bits set, omit 'urem':
  //   icmp eq/ne %x, 0
  if (!ICmpInst::isEquality(Pred))
    return nullptr;

  Value *Dividend, *Divisor;
  if (!match(LHS, m_URem(m_Value(Dividend), m_Value(Divisor))))
    return nullptr;

  KnownBits DividendBits = computeKnownBits(Dividend, 0, &Cmp);
  KnownBits DivisorBits = computeKnownBits(Divisor, 0, &Cmp);
  if (DividendBits.countMaxPopulation() != 1 ||
      DivisorBits.countMinPopulation() < 2)
    return nullptr;

  return new ICmpInst(Pred, Dividend, Zero);
}
/// Fold icmp Pred X, C.
/// TODO: This code structure does not make sense. The saturating add fold
/// should be moved to some other helper and extended as noted below (it is also
/// possible that code has been made unnecessary - do we canonicalize IR to
/// overflow/saturating intrinsics or not?).
Instruction *InstCombiner::foldICmpWithConstant(ICmpInst &Cmp) {
  // The only pattern recognised here is a common idiom from overflow-safe
  // integer arithmetic: the source adds in a wider type and then checks for
  // overflow by comparing against INT_MIN and INT_MAX. It is simplified by
  // using the sadd_with_overflow intrinsic.
  //
  // TODO: This could probably be generalized to handle other overflow-safe
  // operations if we worked out the formulas to compute the appropriate magic
  // constants.
  //
  // sum = a + b
  // if (sum+128 >u 255)  ...  -> llvm.sadd.with.overflow.i8
  if (Cmp.getPredicate() != ICmpInst::ICMP_UGT)
    return nullptr;

  // Cmp = icmp ugt (add (add A, B), Bias), Limit
  ConstantInt *Limit;
  if (!match(Cmp.getOperand(1), m_ConstantInt(Limit)))
    return nullptr;

  Value *A, *B;
  ConstantInt *Bias;
  if (!match(Cmp.getOperand(0),
             m_Add(m_Add(m_Value(A), m_Value(B)), m_ConstantInt(Bias))))
    return nullptr;

  return processUGT_ADDCST_ADD(Cmp, A, B, Bias, Limit, *this);
}
/// Canonicalize icmp instructions based on dominating conditions.
///
/// Dominance is checked in the cheapest possible way: the block holding \p Cmp
/// must have a single predecessor that ends in a conditional branch. The
/// compare is folded to a constant when the branch condition implies its
/// result, or rewritten as an eq/ne against one constant when the value ranges
/// allow it. Returns nullptr otherwise.
Instruction *InstCombiner::foldICmpWithDominatingICmp(ICmpInst &Cmp) {
  // This is a cheap/incomplete check for dominance - just match a single
  // predecessor with a conditional branch.
  BasicBlock *CmpBB = Cmp.getParent();
  BasicBlock *PredBB = CmpBB->getSinglePredecessor();
  if (!PredBB)
    return nullptr;

  Value *BranchCond;
  BasicBlock *TakenBB, *NotTakenBB;
  if (!match(PredBB->getTerminator(),
             m_Br(m_Value(BranchCond), TakenBB, NotTakenBB)))
    return nullptr;

  assert((TakenBB == CmpBB || NotTakenBB == CmpBB) &&
         "Predecessor block does not point to successor?");

  // The branch should get simplified. Don't bother simplifying this condition.
  if (TakenBB == NotTakenBB)
    return nullptr;

  const bool ReachedOnTrue = TakenBB == CmpBB;

  // Try to simplify this compare to T/F based on the dominating condition.
  if (Optional<bool> Implied =
          isImpliedCondition(BranchCond, &Cmp, DL, ReachedOnTrue))
    return replaceInstUsesWith(Cmp, ConstantInt::get(Cmp.getType(), *Implied));

  const CmpInst::Predicate Pred = Cmp.getPredicate();
  Value *X = Cmp.getOperand(0), *Y = Cmp.getOperand(1);
  ICmpInst::Predicate DomPred;
  const APInt *C, *DomC;
  if (!match(BranchCond, m_ICmp(DomPred, m_Specific(X), m_APInt(DomC))) ||
      !match(Y, m_APInt(C)))
    return nullptr;

  // We have 2 compares of a variable with constants. Calculate the constant
  // ranges of those compares to see if we can transform the 2nd compare:
  // DomBB:
  //   DomCond = icmp DomPred X, DomC
  //   br DomCond, CmpBB, FalseBB
  // CmpBB:
  //   Cmp = icmp Pred X, C
  ConstantRange CmpRange = ConstantRange::makeAllowedICmpRegion(Pred, *C);

  // On the false edge the dominating condition is known not to hold, so its
  // inverse predicate describes the values X can take.
  const ICmpInst::Predicate EdgePred =
      ReachedOnTrue ? DomPred : CmpInst::getInversePredicate(DomPred);
  ConstantRange EdgeRange = ConstantRange::makeExactICmpRegion(EdgePred, *DomC);

  ConstantRange Overlap = EdgeRange.intersectWith(CmpRange);
  ConstantRange Remainder = EdgeRange.difference(CmpRange);
  if (Overlap.isEmptySet())
    return replaceInstUsesWith(Cmp, Builder.getFalse());
  if (Remainder.isEmptySet())
    return replaceInstUsesWith(Cmp, Builder.getTrue());

  if (Cmp.isEquality())
    return nullptr;

  // Canonicalizing a sign bit comparison that gets used in a branch,
  // pessimizes codegen by generating branch on zero instruction instead
  // of a test and branch. So we avoid canonicalizing in such situations
  // because test and branch instruction has better branch displacement
  // than compare and branch instruction.
  bool UnusedBit;
  if (isSignBitCheck(Pred, *C, UnusedBit) && hasBranchUse(Cmp))
    return nullptr;

  if (const APInt *OnlyAllowed = Overlap.getSingleElement())
    return new ICmpInst(ICmpInst::ICMP_EQ, X, Builder.getInt(*OnlyAllowed));
  if (const APInt *OnlyExcluded = Remainder.getSingleElement())
    return new ICmpInst(ICmpInst::ICMP_NE, X, Builder.getInt(*OnlyExcluded));

  return nullptr;
}
/// Fold icmp (trunc X), C.
///
/// Handles the signum special case "icmp slt trunc(signum(V)), 1" and equality
/// compares of a single-use trunc whose discarded high bits are all known.
/// Returns nullptr if neither fold applies.
Instruction *InstCombiner::foldICmpTruncConstant(ICmpInst &Cmp,
                                                 TruncInst *Trunc,
                                                 const APInt &C) {
  const ICmpInst::Predicate Pred = Cmp.getPredicate();
  Value *X = Trunc->getOperand(0);

  // icmp slt trunc(signum(V)) 1 --> icmp slt V, 1
  if (Pred == ICmpInst::ICMP_SLT && C.isOneValue() && C.getBitWidth() > 1) {
    Value *V = nullptr;
    if (match(X, m_Signum(m_Value(V))))
      return new ICmpInst(ICmpInst::ICMP_SLT, V,
                          ConstantInt::get(V->getType(), 1));
  }

  if (!Cmp.isEquality() || !Trunc->hasOneUse())
    return nullptr;

  // Simplify icmp eq (trunc x to i8), 42 -> icmp eq x, 42|highbits if all
  // of the high bits truncated out of x are known.
  const unsigned NarrowBits = Trunc->getType()->getScalarSizeInBits();
  const unsigned WideBits = X->getType()->getScalarSizeInBits();
  const unsigned DroppedBits = WideBits - NarrowBits;

  KnownBits Known = computeKnownBits(X, 0, &Cmp);

  // The transform needs every dropped high bit to be known (zero or one).
  const APInt KnownMask = Known.Zero | Known.One;
  if (KnownMask.countLeadingOnes() < DroppedBits)
    return nullptr;

  // Pull in the high bits from known-ones set.
  APInt WideC = C.zext(WideBits);
  WideC |= Known.One & APInt::getHighBitsSet(WideBits, DroppedBits);
  return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), WideC));
}
/// Fold icmp (xor X, Y), C.
///
/// Nothing is done unless Y is a constant (matched via m_APInt). The folds
/// cover sign-bit tests, xor with the sign mask or its complement, and a few
/// unsigned compares against mask-like constants. One fold rewrites \p Cmp in
/// place and returns it; the others return a new compare. Returns nullptr if
/// no fold applies.
Instruction *InstCombiner::foldICmpXorConstant(ICmpInst &Cmp,
                                               BinaryOperator *Xor,
                                               const APInt &C) {
  Value *X = Xor->getOperand(0);
  Value *XorRHS = Xor->getOperand(1);
  const APInt *Mask;
  if (!match(XorRHS, m_APInt(Mask)))
    return nullptr;

  const ICmpInst::Predicate Pred = Cmp.getPredicate();
  Type *Ty = X->getType();

  // If this is a comparison that tests the signbit (X < 0) or (x > -1),
  // fold the xor.
  bool TrueIfSigned = false;
  if (isSignBitCheck(Pred, C, TrueIfSigned)) {
    // If the sign bit of the xor constant is not set, there is no change to
    // the operation, just stop using the Xor.
    if (!Mask->isNegative()) {
      Cmp.setOperand(0, X);
      Worklist.Add(Xor);
      return &Cmp;
    }

    // Emit the opposite comparison.
    if (TrueIfSigned)
      return new ICmpInst(ICmpInst::ICMP_SGT, X,
                          ConstantInt::getAllOnesValue(Ty));
    return new ICmpInst(ICmpInst::ICMP_SLT, X, ConstantInt::getNullValue(Ty));
  }

  if (Xor->hasOneUse() && !Cmp.isEquality()) {
    // Signed compares become unsigned and vice versa.
    auto WithFlippedSignedness = [&Cmp] {
      return Cmp.isSigned() ? Cmp.getUnsignedPredicate()
                            : Cmp.getSignedPredicate();
    };

    // (icmp u/s (xor X SignMask), C) -> (icmp s/u X, (xor C SignMask))
    if (Mask->isSignMask())
      return new ICmpInst(WithFlippedSignedness(), X,
                          ConstantInt::get(Ty, C ^ *Mask));

    // (icmp u/s (xor X ~SignMask), C) -> (icmp s/u X, (xor C ~SignMask))
    // Here the predicate is additionally swapped.
    if (Mask->isMaxSignedValue())
      return new ICmpInst(Cmp.getSwappedPredicate(WithFlippedSignedness()), X,
                          ConstantInt::get(Ty, C ^ *Mask));
  }

  // Mask constant magic can eliminate an 'xor' with unsigned compares.
  if (Pred == ICmpInst::ICMP_UGT && (C + 1).isPowerOf2()) {
    // (xor X, ~C) >u C --> X <u ~C (when C+1 is a power of 2)
    if (*Mask == ~C)
      return new ICmpInst(ICmpInst::ICMP_ULT, X, XorRHS);
    // (xor X, C) >u C --> X >u C (when C+1 is a power of 2)
    if (*Mask == C)
      return new ICmpInst(ICmpInst::ICMP_UGT, X, XorRHS);
  }

  if (Pred == ICmpInst::ICMP_ULT) {
    // (xor X, -C) <u C --> X >u ~C (when C is a power of 2)
    // (xor X, C) <u C --> X >u ~C (when -C is a power of 2)
    const bool Foldable = (*Mask == -C && C.isPowerOf2()) ||
                          (*Mask == C && (-C).isPowerOf2());
    if (Foldable)
      return new ICmpInst(ICmpInst::ICMP_UGT, X, ConstantInt::get(Ty, ~C));
  }

  return nullptr;
}
/// Fold icmp (and (sh X, Y), C2), C1.
///
/// \param Cmp The compare being simplified; it may be modified in place.
/// \param And The 'and' feeding the compare; its operands may be rewritten.
/// \param C1 The constant the 'and' result is compared against.
/// \param C2 The constant mask operand of the 'and'.
/// \returns \p Cmp itself when it was rewritten in place, the result of
/// replaceInstUsesWith() when the compare folds to a constant, or nullptr when
/// the first operand of \p And is not a shift or no fold applies.
Instruction *InstCombiner::foldICmpAndShift(ICmpInst &Cmp, BinaryOperator *And,
const APInt &C1, const APInt &C2) {
BinaryOperator *Shift = dyn_cast<BinaryOperator>(And->getOperand(0));
if (!Shift || !Shift->isShift())
return nullptr;
// If this is: (X >> C3) & C2 != C1 (where any shift and any compare could
// exist), turn it into (X & (C2 << C3)) != (C1 << C3). This happens a LOT in
// code produced by the clang front-end, for bitfield access.
// This seemingly simple opportunity to fold away a shift turns out to be
// rather complicated. See PR17827 for details.
unsigned ShiftOpcode = Shift->getOpcode();
bool IsShl = ShiftOpcode == Instruction::Shl;
const APInt *C3;
// First attempt: the shift amount is a constant (C3).
if (match(Shift->getOperand(1), m_APInt(C3))) {
bool CanFold = false;
if (ShiftOpcode == Instruction::Shl) {
// For a left shift, we can fold if the comparison is not signed. We can
// also fold a signed comparison if the mask value and comparison value
// are not negative. These constraints may not be obvious, but we can
// prove that they are correct using an SMT solver.
if (!Cmp.isSigned() || (!C2.isNegative() && !C1.isNegative()))
CanFold = true;
} else {
bool IsAshr = ShiftOpcode == Instruction::AShr;
// For a logical right shift, we can fold if the comparison is not signed.
// We can also fold a signed comparison if the shifted mask value and the
// shifted comparison value are not negative. These constraints may not be
// obvious, but we can prove that they are correct using an SMT solver.
// For an arithmetic shift right we can do the same, if we ensure
// the And doesn't use any bits being shifted in. Normally these would
// be turned into lshr by SimplifyDemandedBits, but not if there is an
// additional user.
if (!IsAshr || (C2.shl(*C3).lshr(*C3) == C2)) {
if (!Cmp.isSigned() ||
(!C2.shl(*C3).isNegative() && !C1.shl(*C3).isNegative()))
CanFold = true;
}
}
if (CanFold) {
// Undo the shift on the compare constant, then redo it: if the round trip
// does not give back C1, some compared bits were shifted out.
APInt NewCst = IsShl ? C1.lshr(*C3) : C1.shl(*C3);
APInt SameAsC1 = IsShl ? NewCst.shl(*C3) : NewCst.lshr(*C3);
// Check to see if we are shifting out any of the bits being compared.
if (SameAsC1 != C1) {
// If we shifted bits out, the fold is not going to work out. As a
// special case, check to see if this means that the result is always
// true or false now.
if (Cmp.getPredicate() == ICmpInst::ICMP_EQ)
return replaceInstUsesWith(Cmp, ConstantInt::getFalse(Cmp.getType()));
if (Cmp.getPredicate() == ICmpInst::ICMP_NE)
return replaceInstUsesWith(Cmp, ConstantInt::getTrue(Cmp.getType()));
} else {
// Rewrite in place: the compare gets the un-shifted constant, and the
// 'and' gets the un-shifted mask and reads X directly.
Cmp.setOperand(1, ConstantInt::get(And->getType(), NewCst));
APInt NewAndCst = IsShl ? C2.lshr(*C3) : C2.shl(*C3);
And->setOperand(1, ConstantInt::get(And->getType(), NewAndCst));
And->setOperand(0, Shift->getOperand(0));
Worklist.Add(Shift); // Shift is dead.
return &Cmp;
}
}
}
// Turn ((X >> Y) & C2) == 0 into (X & (C2 << Y)) == 0. The latter is
// preferable because it allows the C2 << Y expression to be hoisted out of a
// loop if Y is invariant and X is not.
// This applies only to a single-use, non-arithmetic shift of a non-constant
// value in an equality compare against zero.
if (Shift->hasOneUse() && C1.isNullValue() && Cmp.isEquality() &&
!Shift->isArithmeticShift() && !isa<Constant>(Shift->getOperand(0))) {
// Compute C2 << Y (or C2 >>u Y when the original shift was a shl).
Value *NewShift =
IsShl ? Builder.CreateLShr(And->getOperand(1), Shift->getOperand(1))
: Builder.CreateShl(And->getOperand(1), Shift->getOperand(1));
// Compute X & (C2 << Y).
Value *NewAnd = Builder.CreateAnd(Shift->getOperand(0), NewShift);
Cmp.setOperand(0, NewAnd);
return &Cmp;
}
return nullptr;
}
/// Fold icmp (and X, C2), C1.
///
/// \param Cmp The compare being simplified; it may be modified in place.
/// \param And The 'and' feeding the compare.
/// \param C1 The constant the 'and' result is compared against.
/// \returns A replacement instruction, \p Cmp itself when it was rewritten in
/// place, or nullptr when no fold applies. Apart from the vector "and X, 1"
/// case at the top, nothing is folded unless the second operand of \p And is a
/// constant (matched via m_APInt) and \p And has a single use.
Instruction *InstCombiner::foldICmpAndConstConst(ICmpInst &Cmp,
BinaryOperator *And,
const APInt &C1) {
bool isICMP_NE = Cmp.getPredicate() == ICmpInst::ICMP_NE;
// For vectors: icmp ne (and X, 1), 0 --> trunc X to N x i1
// TODO: We canonicalize to the longer form for scalars because we have
// better analysis/folds for icmp, and codegen may be better with icmp.
if (isICMP_NE && Cmp.getType()->isVectorTy() && C1.isNullValue() &&
match(And->getOperand(1), m_One()))
return new TruncInst(And->getOperand(0), Cmp.getType());
const APInt *C2;
Value *X;
if (!match(And, m_And(m_Value(X), m_APInt(C2))))
return nullptr;
// Don't perform the following transforms if the AND has multiple uses
if (!And->hasOneUse())
return nullptr;
// Equality compares against zero.
if (Cmp.isEquality() && C1.isNullValue()) {
// Restrict this fold to single-use 'and' (PR10267).
// Replace (and X, (1 << size(X)-1) != 0) with X s< 0
if (C2->isSignMask()) {
Constant *Zero = Constant::getNullValue(X->getType());
auto NewPred = isICMP_NE ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_SGE;
return new ICmpInst(NewPred, X, Zero);
}
// Restrict this fold only for single-use 'and' (PR10267).
// ((%x & C) == 0) --> %x u< (-C) iff (-C) is power of two.
// (~C2 + 1 is the two's complement negation of C2.)
if ((~(*C2) + 1).isPowerOf2()) {
Constant *NegBOC =
ConstantExpr::getNeg(cast<Constant>(And->getOperand(1)));
auto NewPred = isICMP_NE ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT;
return new ICmpInst(NewPred, X, NegBOC);
}
}
// If the LHS is an 'and' of a truncate and we can widen the and/compare to
// the input width without changing the value produced, eliminate the cast:
//
// icmp (and (trunc W), C2), C1 -> icmp (and W, C2'), C1'
//
// We can do this transformation if the constants do not have their sign bits
// set or if it is an equality comparison. Extending a relational comparison
// when we're checking the sign bit would not work.
Value *W;
if (match(And->getOperand(0), m_OneUse(m_Trunc(m_Value(W)))) &&
(Cmp.isEquality() || (!C1.isNegative() && !C2->isNegative()))) {
// TODO: Is this a good transform for vectors? Wider types may reduce
// throughput. Should this transform be limited (even for scalars) by using
// shouldChangeType()?
if (!Cmp.getType()->isVectorTy()) {
Type *WideType = W->getType();
unsigned WideScalarBits = WideType->getScalarSizeInBits();
// Both constants are zero-extended to the width of W.
Constant *ZextC1 = ConstantInt::get(WideType, C1.zext(WideScalarBits));
Constant *ZextC2 = ConstantInt::get(WideType, C2->zext(WideScalarBits));
Value *NewAnd = Builder.CreateAnd(W, ZextC2, And->getName());
return new ICmpInst(Cmp.getPredicate(), NewAnd, ZextC1);
}
}
// Try the folds that look through a shift feeding the 'and'.
if (Instruction *I = foldICmpAndShift(Cmp, And, C1, *C2))
return I;
// (icmp pred (and (or (lshr A, B), A), 1), 0) -->
// (icmp pred (and A, (or (shl 1, B), 1)), 0)
//
// iff pred isn't signed
if (!Cmp.isSigned() && C1.isNullValue() && And->getOperand(0)->hasOneUse() &&
match(And->getOperand(1), m_One())) {
Constant *One = cast<Constant>(And->getOperand(1));
Value *Or = And->getOperand(0);
Value *A, *B, *LShr;
if (match(Or, m_Or(m_Value(LShr), m_Value(A))) &&
match(LShr, m_LShr(m_Specific(A), m_Value(B)))) {
// Count how many of the existing instructions have a single use; this
// count gates the rewrite below.
// NOTE: both And (early return above) and Or (enclosing condition) are
// already known to have a single use at this point.
unsigned UsesRemoved = 0;
if (And->hasOneUse())
++UsesRemoved;
if (Or->hasOneUse())
++UsesRemoved;
if (LShr->hasOneUse())
++UsesRemoved;
// Compute A & ((1 << B) | 1)
Value *NewOr = nullptr;
if (auto *C = dyn_cast<Constant>(B)) {
// With a constant B the new mask is built from constant expressions.
if (UsesRemoved >= 1)
NewOr = ConstantExpr::getOr(ConstantExpr::getNUWShl(One, C), One);
} else {
// A variable B needs a new shl and a new or, so all three counted
// instructions must have a single use.
if (UsesRemoved >= 3)
NewOr = Builder.CreateOr(Builder.CreateShl(One, B, LShr->getName(),
/*HasNUW=*/true),
One, Or->getName());
}
if (NewOr) {
Value *NewAnd = Builder.CreateAnd(A, NewOr, And->getName());
Cmp.setOperand(0, NewAnd);
return &Cmp;
}
}
}
return nullptr;
}
/// Fold icmp (and X, Y), C.
///
/// The constant-mask folds in foldICmpAndConstConst() are tried first. After
/// that come a compare of a masked load from a constant global, and two
/// equality-only folds that turn the mask test into an unsigned range check or
/// into a sign test on a truncated value. Returns nullptr if nothing applies.
Instruction *InstCombiner::foldICmpAndConstant(ICmpInst &Cmp,
                                               BinaryOperator *And,
                                               const APInt &C) {
  if (Instruction *Folded = foldICmpAndConstConst(Cmp, And, C))
    return Folded;

  // TODO: These all require that Y is constant too, so refactor with the above.

  Value *X = And->getOperand(0);
  Value *Y = And->getOperand(1);

  // Try to optimize things like "A[i] & 42 == 0" to index computations.
  // This needs a non-volatile load through a GEP whose base is a constant
  // global with a definitive initializer, masked by a ConstantInt.
  auto *Load = dyn_cast<LoadInst>(X);
  auto *GEP = Load ? dyn_cast<GetElementPtrInst>(Load->getOperand(0)) : nullptr;
  auto *Global = GEP ? dyn_cast<GlobalVariable>(GEP->getOperand(0)) : nullptr;
  if (Global && Global->isConstant() && Global->hasDefinitiveInitializer() &&
      !Load->isVolatile()) {
    if (auto *MaskCI = dyn_cast<ConstantInt>(Y))
      if (Instruction *Res =
              foldCmpLoadFromIndexedGlobal(GEP, Global, Cmp, MaskCI))
        return Res;
  }

  if (!Cmp.isEquality())
    return nullptr;

  const bool IsEQ = Cmp.getPredicate() == CmpInst::ICMP_EQ;

  // X & -C == -C -> X >  u ~C
  // X & -C != -C -> X <= u ~C
  //   iff C is a power of 2
  if (Cmp.getOperand(1) == Y && (-C).isPowerOf2()) {
    auto RangePred = IsEQ ? CmpInst::ICMP_UGT : CmpInst::ICMP_ULE;
    return new ICmpInst(RangePred, X,
                        SubOne(cast<Constant>(Cmp.getOperand(1))));
  }

  // (X & C2) == 0 -> (trunc X) >= 0
  // (X & C2) != 0 -> (trunc X) <  0
  //   iff C2 is a power of 2 and it masks the sign bit of a legal integer type.
  const APInt *C2;
  if (!And->hasOneUse() || !C.isNullValue() || !match(Y, m_APInt(C2)))
    return nullptr;

  // exactLogBase2() yields -1 when C2 is not a power of 2.
  const int32_t BitIndex = C2->exactLogBase2();
  if (BitIndex == -1 || !DL.isLegalInteger(BitIndex + 1))
    return nullptr;

  Type *NarrowTy = IntegerType::get(Cmp.getContext(), BitIndex + 1);
  if (And->getType()->isVectorTy())
    NarrowTy =
        VectorType::get(NarrowTy, And->getType()->getVectorNumElements());

  Value *Narrowed = Builder.CreateTrunc(X, NarrowTy);
  auto SignPred = IsEQ ? CmpInst::ICMP_SGE : CmpInst::ICMP_SLT;
  return new ICmpInst(SignPred, Narrowed, Constant::getNullValue(NarrowTy));
}
/// Fold icmp (or X, Y), C.
///
/// Folds attempted, in order:
///  - icmp slt (signum V), 1              --> icmp slt V, 1
///  - X | C ==/!= C, C a low-bit mask     --> X <=u / >u C
///  - X | C ==/!= C, 'or' has one use     --> (X & ~C) ==/!= 0
///  - (ptrtoint P | ptrtoint Q) ==/!= 0   --> null checks on P and Q
///    (only when neither ptrtoint truncates the pointer)
///  - ((X1 ^ X2) | (X3 ^ X4)) ==/!= 0     --> a pair of (in)equality compares
///
/// \param Cmp the compare being simplified.
/// \param Or  the 'or' feeding operand 0 of \p Cmp.
/// \param C   the constant on the right-hand side of \p Cmp.
/// \returns a replacement instruction, or nullptr when nothing applies.
Instruction *InstCombiner::foldICmpOrConstant(ICmpInst &Cmp, BinaryOperator *Or,
                                              const APInt &C) {
  ICmpInst::Predicate Pred = Cmp.getPredicate();
  if (C.isOneValue()) {
    // icmp slt signum(V) 1 --> icmp slt V, 1
    Value *V = nullptr;
    if (Pred == ICmpInst::ICMP_SLT && match(Or, m_Signum(m_Value(V))))
      return new ICmpInst(ICmpInst::ICMP_SLT, V,
                          ConstantInt::get(V->getType(), 1));
  }

  Value *OrOp0 = Or->getOperand(0), *OrOp1 = Or->getOperand(1);
  if (Cmp.isEquality() && Cmp.getOperand(1) == OrOp1) {
    // X | C == C --> X <=u C
    // X | C != C --> X >u C
    //   iff C+1 is a power of 2 (C is a bitmask of the low bits)
    if ((C + 1).isPowerOf2()) {
      Pred = (Pred == CmpInst::ICMP_EQ) ? CmpInst::ICMP_ULE : CmpInst::ICMP_UGT;
      return new ICmpInst(Pred, OrOp0, OrOp1);
    }
    // More general: are all bits outside of a mask constant set or not set?
    // X | C == C --> (X & ~C) == 0
    // X | C != C --> (X & ~C) != 0
    if (Or->hasOneUse()) {
      Value *A = Builder.CreateAnd(OrOp0, ~C);
      return new ICmpInst(Pred, A, ConstantInt::getNullValue(OrOp0->getType()));
    }
  }

  if (!Cmp.isEquality() || !C.isNullValue() || !Or->hasOneUse())
    return nullptr;

  Value *P, *Q;
  if (match(Or, m_Or(m_PtrToInt(m_Value(P)), m_PtrToInt(m_Value(Q))))) {
    // A ptrtoint to a narrower integer drops the high pointer bits, so a zero
    // result no longer proves the pointer is null. Only rewrite to null checks
    // when the integer type is at least as wide as both pointers (a widening
    // ptrtoint zero-extends, which keeps "== 0" equivalent to "== null").
    unsigned IntBits = Or->getType()->getScalarSizeInBits();
    if (DL.getPointerTypeSizeInBits(P->getType()) <= IntBits &&
        DL.getPointerTypeSizeInBits(Q->getType()) <= IntBits) {
      // Simplify icmp eq (or (ptrtoint P), (ptrtoint Q)), 0
      // -> and (icmp eq P, null), (icmp eq Q, null).
      Value *CmpP =
          Builder.CreateICmp(Pred, P, ConstantInt::getNullValue(P->getType()));
      Value *CmpQ =
          Builder.CreateICmp(Pred, Q, ConstantInt::getNullValue(Q->getType()));
      auto BOpc = Pred == CmpInst::ICMP_EQ ? Instruction::And : Instruction::Or;
      return BinaryOperator::Create(BOpc, CmpP, CmpQ);
    }
  }

  // Are we using xors to bitwise check for a pair of (in)equalities? Convert to
  // a shorter form that has more potential to be folded even further.
  Value *X1, *X2, *X3, *X4;
  if (match(OrOp0, m_OneUse(m_Xor(m_Value(X1), m_Value(X2)))) &&
      match(OrOp1, m_OneUse(m_Xor(m_Value(X3), m_Value(X4))))) {
    // ((X1 ^ X2) | (X3 ^ X4)) == 0 --> (X1 == X2) && (X3 == X4)
    // ((X1 ^ X2) | (X3 ^ X4)) != 0 --> (X1 != X2) || (X3 != X4)
    Value *Cmp12 = Builder.CreateICmp(Pred, X1, X2);
    Value *Cmp34 = Builder.CreateICmp(Pred, X3, X4);
    auto BOpc = Pred == CmpInst::ICMP_EQ ? Instruction::And : Instruction::Or;
    return BinaryOperator::Create(BOpc, Cmp12, Cmp34);
  }

  return nullptr;
}
/// Fold icmp (mul X, Y), C.
///
/// Only handles a sign test of an nsw multiply by a constant: the sign of the
/// product is decided by the sign of X (flipped when the factor is negative),
/// so the multiply can be bypassed.
///
/// \returns a replacement compare of X against zero, or nullptr.
Instruction *InstCombiner::foldICmpMulConstant(ICmpInst &Cmp,
                                               BinaryOperator *Mul,
                                               const APInt &C) {
  const APInt *Factor;
  if (!match(Mul->getOperand(1), m_APInt(Factor)))
    return nullptr;

  // If this is a test of the sign bit and the multiply is sign-preserving with
  // a constant operand, use the multiply LHS operand instead.
  ICmpInst::Predicate Pred = Cmp.getPredicate();
  if (!isSignTest(Pred, C) || !Mul->hasNoSignedWrap())
    return nullptr;

  // A negative factor reverses the ordering.
  if (Factor->isNegative())
    Pred = ICmpInst::getSwappedPredicate(Pred);

  Value *Zero = Constant::getNullValue(Mul->getType());
  return new ICmpInst(Pred, Mul->getOperand(0), Zero);
}
/// Fold icmp (shl 1, Y), C.
///
/// Rewrites a compare of a single shifted bit into a compare of the shift
/// amount Y itself.
///
/// \param Cmp the compare being simplified.
/// \param Shl the instruction feeding operand 0 of \p Cmp; must match
///            (shl 1, Y) for any fold to happen.
/// \param C   the constant on the right-hand side of \p Cmp.
/// \returns a replacement compare on Y, or nullptr.
static Instruction *foldICmpShlOne(ICmpInst &Cmp, Instruction *Shl,
                                   const APInt &C) {
  Value *Y;
  if (!match(Shl, m_Shl(m_One(), m_Value(Y))))
    return nullptr;

  Type *ShiftType = Shl->getType();
  unsigned TypeBits = C.getBitWidth();
  bool CIsPowerOf2 = C.isPowerOf2();
  ICmpInst::Predicate Pred = Cmp.getPredicate();

  if (Cmp.isUnsigned()) {
    // (1 << Y) pred C -> Y pred Log2(C)
    if (!CIsPowerOf2) {
      // C lies strictly between two powers of two, so the strict and
      // non-strict forms coincide:
      // (1 << Y) <  30 -> Y <= 4
      // (1 << Y) <= 30 -> Y <= 4
      // (1 << Y) >= 30 -> Y >  4
      // (1 << Y) >  30 -> Y >  4
      switch (Pred) {
      case ICmpInst::ICMP_ULT:
        Pred = ICmpInst::ICMP_ULE;
        break;
      case ICmpInst::ICMP_UGE:
        Pred = ICmpInst::ICMP_UGT;
        break;
      default:
        break;
      }
    }

    // (1 << Y) >= 2147483648 -> Y >= 31 -> Y == 31
    // (1 << Y) <  2147483648 -> Y <  31 -> Y != 31
    unsigned CLog2 = C.logBase2();
    if (CLog2 == TypeBits - 1) {
      switch (Pred) {
      case ICmpInst::ICMP_UGE:
        Pred = ICmpInst::ICMP_EQ;
        break;
      case ICmpInst::ICMP_ULT:
        Pred = ICmpInst::ICMP_NE;
        break;
      default:
        break;
      }
    }
    return new ICmpInst(Pred, Y, ConstantInt::get(ShiftType, CLog2));
  }

  if (Cmp.isSigned()) {
    // The result is negative only when the one bit lands in the sign bit.
    Constant *BitWidthMinusOne = ConstantInt::get(ShiftType, TypeBits - 1);
    auto CompareWithTopBit = [&](ICmpInst::Predicate P) {
      return new ICmpInst(P, Y, BitWidthMinusOne);
    };

    if (C.isAllOnesValue()) {
      // (1 << Y) <= -1 -> Y == 31
      if (Pred == ICmpInst::ICMP_SLE)
        return CompareWithTopBit(ICmpInst::ICMP_EQ);
      // (1 << Y) >  -1 -> Y != 31
      if (Pred == ICmpInst::ICMP_SGT)
        return CompareWithTopBit(ICmpInst::ICMP_NE);
    } else if (C.isNullValue()) {
      // (1 << Y) <  0 -> Y == 31
      // (1 << Y) <= 0 -> Y == 31
      if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE)
        return CompareWithTopBit(ICmpInst::ICMP_EQ);
      // (1 << Y) >= 0 -> Y != 31
      // (1 << Y) >  0 -> Y != 31
      if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE)
        return CompareWithTopBit(ICmpInst::ICMP_NE);
    }
    return nullptr;
  }

  // Equality against a power of two: compare the shift amount with its log.
  if (Cmp.isEquality() && CIsPowerOf2)
    return new ICmpInst(Pred, Y, ConstantInt::get(ShiftType, C.logBase2()));

  return nullptr;
}
/// Fold icmp (shl X, Y), C.
///
/// \param Cmp the compare being simplified.
/// \param Shl the 'shl' feeding operand 0 of \p Cmp.
/// \param C   the constant on the right-hand side of \p Cmp.
/// \returns a replacement instruction, or nullptr when nothing applies.
///
/// The folds are tried in a fixed order and the first one that applies wins:
///  1. equality with a constant shifted value -> foldICmpShlConstConst
///  2. non-constant shift amount              -> foldICmpShlOne
///  3. nsw shift: move the shift onto the constant with an arithmetic shift
///  4. nuw shift: move the shift onto the constant with a logical shift
///  5. equality: replace the shift with a mask of the surviving low bits
///  6. sign-bit check: test the single bit that is shifted into the sign bit
///  7. unsigned inequality against (a neighbor of) a power of two: mask test
///  8. trailing zeros in C cover the shift: compare a truncated X instead
Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp,
BinaryOperator *Shl,
const APInt &C) {
const APInt *ShiftVal;
if (Cmp.isEquality() && match(Shl->getOperand(0), m_APInt(ShiftVal)))
return foldICmpShlConstConst(Cmp, Shl->getOperand(1), C, *ShiftVal);
const APInt *ShiftAmt;
// With a variable shift amount, only the (shl 1, Y) pattern is handled.
if (!match(Shl->getOperand(1), m_APInt(ShiftAmt)))
return foldICmpShlOne(Cmp, Shl, C);
// Check that the shift amount is in range. If not, don't perform undefined
// shifts. When the shift is visited, it will be simplified.
unsigned TypeBits = C.getBitWidth();
if (ShiftAmt->uge(TypeBits))
return nullptr;
ICmpInst::Predicate Pred = Cmp.getPredicate();
Value *X = Shl->getOperand(0);
Type *ShType = Shl->getType();
// NSW guarantees that we are only shifting out sign bits from the high bits,
// so we can ASHR the compare constant without needing a mask and eliminate
// the shift.
if (Shl->hasNoSignedWrap()) {
if (Pred == ICmpInst::ICMP_SGT) {
// icmp Pred (shl nsw X, ShiftAmt), C --> icmp Pred X, (C >>s ShiftAmt)
APInt ShiftedC = C.ashr(*ShiftAmt);
return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
}
// For eq/ne the constant must survive the round trip through the shift,
// i.e. its low ShiftAmt bits must be zero.
if ((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) &&
C.ashr(*ShiftAmt).shl(*ShiftAmt) == C) {
APInt ShiftedC = C.ashr(*ShiftAmt);
return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
}
if (Pred == ICmpInst::ICMP_SLT) {
// SLE is the same as above, but SLE is canonicalized to SLT, so convert:
// (X << S) <=s C is equiv to X <=s (C >> S) for all C
// (X << S) <s (C + 1) is equiv to X <s (C >> S) + 1 if C <s SMAX
// (X << S) <s C is equiv to X <s ((C - 1) >> S) + 1 if C >s SMIN
assert(!C.isMinSignedValue() && "Unexpected icmp slt");
APInt ShiftedC = (C - 1).ashr(*ShiftAmt) + 1;
return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
}
// If this is a signed comparison to 0 and the shift is sign preserving,
// use the shift LHS operand instead; isSignTest may change 'Pred', so only
// do that if we're sure to not continue on in this function.
if (isSignTest(Pred, C))
return new ICmpInst(Pred, X, Constant::getNullValue(ShType));
}
// NUW guarantees that we are only shifting out zero bits from the high bits,
// so we can LSHR the compare constant without needing a mask and eliminate
// the shift.
if (Shl->hasNoUnsignedWrap()) {
if (Pred == ICmpInst::ICMP_UGT) {
// icmp Pred (shl nuw X, ShiftAmt), C --> icmp Pred X, (C >>u ShiftAmt)
APInt ShiftedC = C.lshr(*ShiftAmt);
return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
}
// As in the nsw case: eq/ne needs the low ShiftAmt bits of C to be zero.
if ((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) &&
C.lshr(*ShiftAmt).shl(*ShiftAmt) == C) {
APInt ShiftedC = C.lshr(*ShiftAmt);
return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
}
if (Pred == ICmpInst::ICMP_ULT) {
// ULE is the same as above, but ULE is canonicalized to ULT, so convert:
// (X << S) <=u C is equiv to X <=u (C >> S) for all C
// (X << S) <u (C + 1) is equiv to X <u (C >> S) + 1 if C <u ~0u
// (X << S) <u C is equiv to X <u ((C - 1) >> S) + 1 if C >u 0
assert(C.ugt(0) && "ult 0 should have been eliminated");
APInt ShiftedC = (C - 1).lshr(*ShiftAmt) + 1;
return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
}
}
if (Cmp.isEquality() && Shl->hasOneUse()) {
// Strength-reduce the shift into an 'and'.
// The mask keeps the low (TypeBits - ShiftAmt) bits of X, i.e. exactly the
// bits that are not shifted out; the constant is shifted right to match.
Constant *Mask = ConstantInt::get(
ShType,
APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt->getZExtValue()));
Value *And = Builder.CreateAnd(X, Mask, Shl->getName() + ".mask");
Constant *LShrC = ConstantInt::get(ShType, C.lshr(*ShiftAmt));
return new ICmpInst(Pred, And, LShrC);
}
// Otherwise, if this is a comparison of the sign bit, simplify to and/test.
bool TrueIfSigned = false;
if (Shl->hasOneUse() && isSignBitCheck(Pred, C, TrueIfSigned)) {
// (X << 31) <s 0 --> (X & 1) != 0
// The mask selects the one bit of X that the shift moves into the sign bit.
Constant *Mask = ConstantInt::get(
ShType,
APInt::getOneBitSet(TypeBits, TypeBits - ShiftAmt->getZExtValue() - 1));
Value *And = Builder.CreateAnd(X, Mask, Shl->getName() + ".mask");
return new ICmpInst(TrueIfSigned ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ,
And, Constant::getNullValue(ShType));
}
// Simplify 'shl' inequality test into 'and' equality test.
if (Cmp.isUnsigned() && Shl->hasOneUse()) {
// (X l<< C2) u<=/u> C1 iff C1+1 is power of two -> X & (~C1 l>> C2) ==/!= 0
if ((C + 1).isPowerOf2() &&
(Pred == ICmpInst::ICMP_ULE || Pred == ICmpInst::ICMP_UGT)) {
Value *And = Builder.CreateAnd(X, (~C).lshr(ShiftAmt->getZExtValue()));
return new ICmpInst(Pred == ICmpInst::ICMP_ULE ? ICmpInst::ICMP_EQ
: ICmpInst::ICMP_NE,
And, Constant::getNullValue(ShType));
}
// (X l<< C2) u</u>= C1 iff C1 is power of two -> X & (-C1 l>> C2) ==/!= 0
// (the code spells -C1 as ~(C1 - 1)).
if (C.isPowerOf2() &&
(Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_UGE)) {
Value *And =
Builder.CreateAnd(X, (~(C - 1)).lshr(ShiftAmt->getZExtValue()));
return new ICmpInst(Pred == ICmpInst::ICMP_ULT ? ICmpInst::ICMP_EQ
: ICmpInst::ICMP_NE,
And, Constant::getNullValue(ShType));
}
}
// Transform (icmp pred iM (shl iM %v, N), C)
// -> (icmp pred i(M-N) (trunc %v iM to i(M-N)), (trunc (C>>N))
// Transform the shl to a trunc if (trunc (C>>N)) has no loss and M-N.
// This enables us to get rid of the shift in favor of a trunc that may be
// free on the target. It has the additional benefit of comparing to a
// smaller constant that may be more target-friendly.
unsigned Amt = ShiftAmt->getLimitedValue(TypeBits - 1);
// Requires: a one-use shift, a non-zero shift amount, at least Amt trailing
// zero bits in C, and a legal integer type of width TypeBits - Amt.
if (Shl->hasOneUse() && Amt != 0 && C.countTrailingZeros() >= Amt &&
DL.isLegalInteger(TypeBits - Amt)) {
Type *TruncTy = IntegerType::get(Cmp.getContext(), TypeBits - Amt);
if (ShType->isVectorTy())
TruncTy = VectorType::get(TruncTy, ShType->getVectorNumElements());
Constant *NewC =
ConstantInt::get(TruncTy, C.ashr(*ShiftAmt).trunc(TypeBits - Amt));
return new ICmpInst(Pred, Builder.CreateTrunc(X, TruncTy), NewC);
}
return nullptr;
}
/// Fold icmp ({al}shr X, Y), C.
///
/// \param Cmp the compare being simplified.
/// \param Shr the 'ashr' or 'lshr' feeding operand 0 of \p Cmp.
/// \param C   the constant on the right-hand side of \p Cmp.
/// \returns a replacement instruction, or nullptr when nothing applies.
///
/// The folds are tried in a fixed order:
///  1. eq/ne of a one-use exact shift against 0 -> compare X directly
///  2. equality with a constant shifted value   -> foldICmpShrConstConst
///  3. relational compares with a constant shift amount: move the shift onto
///     the constant (shifting it left), when that loses no bits
///  4. eq/ne with a constant shift amount: compare the unshifted value
///     (exact shift) or a high-bit mask of it (one-use shift)
Instruction *InstCombiner::foldICmpShrConstant(ICmpInst &Cmp,
BinaryOperator *Shr,
const APInt &C) {
// An exact shr only shifts out zero bits, so:
// icmp eq/ne (shr X, Y), 0 --> icmp eq/ne X, 0
Value *X = Shr->getOperand(0);
CmpInst::Predicate Pred = Cmp.getPredicate();
if (Cmp.isEquality() && Shr->isExact() && Shr->hasOneUse() &&
C.isNullValue())
return new ICmpInst(Pred, X, Cmp.getOperand(1));
const APInt *ShiftVal;
if (Cmp.isEquality() && match(Shr->getOperand(0), m_APInt(ShiftVal)))
return foldICmpShrConstConst(Cmp, Shr->getOperand(1), C, *ShiftVal);
// Everything below needs a constant shift amount.
const APInt *ShiftAmt;
if (!match(Shr->getOperand(1), m_APInt(ShiftAmt)))
return nullptr;
// Check that the shift amount is in range. If not, don't perform undefined
// shifts. When the shift is visited it will be simplified.
// A shift amount of zero is rejected here as well.
unsigned TypeBits = C.getBitWidth();
unsigned ShAmtVal = ShiftAmt->getLimitedValue(TypeBits);
if (ShAmtVal >= TypeBits || ShAmtVal == 0)
return nullptr;
bool IsAShr = Shr->getOpcode() == Instruction::AShr;
bool IsExact = Shr->isExact();
Type *ShrTy = Shr->getType();
// TODO: If we could guarantee that InstSimplify would handle all of the
// constant-value-based preconditions in the folds below, then we could assert
// those conditions rather than checking them. This is difficult because of
// undef/poison (PR34838).
if (IsAShr) {
if (Pred == CmpInst::ICMP_SLT || (Pred == CmpInst::ICMP_SGT && IsExact)) {
// icmp slt (ashr X, ShAmtC), C --> icmp slt X, (C << ShAmtC)
// icmp sgt (ashr exact X, ShAmtC), C --> icmp sgt X, (C << ShAmtC)
APInt ShiftedC = C.shl(ShAmtVal);
// Only valid if shifting C left and back (arithmetically) restores C.
if (ShiftedC.ashr(ShAmtVal) == C)
return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC));
}
if (Pred == CmpInst::ICMP_SGT) {
// icmp sgt (ashr X, ShAmtC), C --> icmp sgt X, ((C + 1) << ShAmtC) - 1
APInt ShiftedC = (C + 1).shl(ShAmtVal) - 1;
// Bail out when C + 1 would wrap, when the shifted C + 1 is the minimum
// signed value, or when (C + 1) << ShAmtC does not shift back to C + 1.
if (!C.isMaxSignedValue() && !(C + 1).shl(ShAmtVal).isMinSignedValue() &&
(ShiftedC + 1).ashr(ShAmtVal) == (C + 1))
return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC));
}
} else {
if (Pred == CmpInst::ICMP_ULT || (Pred == CmpInst::ICMP_UGT && IsExact)) {
// icmp ult (lshr X, ShAmtC), C --> icmp ult X, (C << ShAmtC)
// icmp ugt (lshr exact X, ShAmtC), C --> icmp ugt X, (C << ShAmtC)
APInt ShiftedC = C.shl(ShAmtVal);
// Only valid if shifting C left and back (logically) restores C.
if (ShiftedC.lshr(ShAmtVal) == C)
return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC));
}
if (Pred == CmpInst::ICMP_UGT) {
// icmp ugt (lshr X, ShAmtC), C --> icmp ugt X, ((C + 1) << ShAmtC) - 1
APInt ShiftedC = (C + 1).shl(ShAmtVal) - 1;
if ((ShiftedC + 1).lshr(ShAmtVal) == (C + 1))
return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC));
}
}
if (!Cmp.isEquality())
return nullptr;
// Handle equality comparisons of shift-by-constant.
// If the comparison constant changes with the shift, the comparison cannot
// succeed (bits of the comparison constant cannot match the shifted value).
// This should be known by InstSimplify and already be folded to true/false.
assert(((IsAShr && C.shl(ShAmtVal).ashr(ShAmtVal) == C) ||
(!IsAShr && C.shl(ShAmtVal).lshr(ShAmtVal) == C)) &&
"Expected icmp+shr simplify did not occur.");
// If the bits shifted out are known zero, compare the unshifted value:
// (X & 4) >> 1 == 2 --> (X & 4) == 4.
if (Shr->isExact())
return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, C << ShAmtVal));
if (Shr->hasOneUse()) {
// Canonicalize the shift into an 'and':
// icmp eq/ne (shr X, ShAmt), C --> icmp eq/ne (and X, HiMask), (C << ShAmt)
// HiMask keeps the top (TypeBits - ShAmtVal) bits, i.e. the bits of X that
// survive the right shift.
APInt Val(APInt::getHighBitsSet(TypeBits, TypeBits - ShAmtVal));
Constant *Mask = ConstantInt::get(ShrTy, Val);
Value *And = Builder.CreateAnd(X, Mask, Shr->getName() + ".mask");
return new ICmpInst(Pred, And, ConstantInt::get(ShrTy, C << ShAmtVal));
}
return nullptr;
}
/// Fold icmp (udiv X, Y), C.
///
/// Handles an unsigned division whose dividend is a constant C2: a ugt/ult
/// compare of the quotient becomes a compare of the divisor Y against a
/// precomputed bound.
///
/// \returns a replacement compare on Y, or nullptr.
Instruction *InstCombiner::foldICmpUDivConstant(ICmpInst &Cmp,
                                                BinaryOperator *UDiv,
                                                const APInt &C) {
  const APInt *C2;
  if (!match(UDiv->getOperand(0), m_APInt(C2)))
    return nullptr;

  assert(*C2 != 0 && "udiv 0, X should have been simplified already.");

  Value *Y = UDiv->getOperand(1);
  Type *DivisorTy = Y->getType();

  switch (Cmp.getPredicate()) {
  case ICmpInst::ICMP_UGT: {
    // (icmp ugt (udiv C2, Y), C) -> (icmp ule Y, C2/(C+1))
    assert(!C.isMaxValue() &&
           "icmp ugt X, UINT_MAX should have been simplified already.");
    Constant *Bound = ConstantInt::get(DivisorTy, C2->udiv(C + 1));
    return new ICmpInst(ICmpInst::ICMP_ULE, Y, Bound);
  }
  case ICmpInst::ICMP_ULT: {
    // (icmp ult (udiv C2, Y), C) -> (icmp ugt Y, C2/C)
    assert(C != 0 && "icmp ult X, 0 should have been simplified already.");
    Constant *Bound = ConstantInt::get(DivisorTy, C2->udiv(C));
    return new ICmpInst(ICmpInst::ICMP_UGT, Y, Bound);
  }
  default:
    return nullptr;
  }
}
/// Fold icmp ({su}div X, Y), C.
///
/// Turns a compare of a division by a constant C2 into a range check on the
/// dividend X, so the division no longer has to be computed.
///
/// \param Cmp the compare being simplified.
/// \param Div the 'sdiv' or 'udiv' feeding operand 0 of \p Cmp.
/// \param C   the constant on the right-hand side of \p Cmp.
/// \returns a replacement instruction (a compare, or the result of
///          replaceInstUsesWith), or nullptr when the fold is not attempted.
Instruction *InstCombiner::foldICmpDivConstant(ICmpInst &Cmp,
BinaryOperator *Div,
const APInt &C) {
// Fold: icmp pred ([us]div X, C2), C -> range test
// Fold this div into the comparison, producing a range check.
// Determine, based on the divide type, what the range is being
// checked. If there is an overflow on the low or high side, remember
// it, otherwise compute the range [low, hi) bounding the new value.
// See: InsertRangeTest above for the kinds of replacements possible.
const APInt *C2;
if (!match(Div->getOperand(1), m_APInt(C2)))
return nullptr;
// FIXME: If the operand types don't match the type of the divide
// then don't attempt this transform. The code below doesn't have the
// logic to deal with a signed divide and an unsigned compare (and
// vice versa). This is because (x /s C2) <s C produces different
// results than (x /s C2) <u C or (x /u C2) <s C or even
// (x /u C2) <u C. Simply casting the operands and result won't
// work. :( The if statement below tests that condition and bails
// if it finds it.
bool DivIsSigned = Div->getOpcode() == Instruction::SDiv;
if (!Cmp.isEquality() && DivIsSigned != Cmp.isSigned())
return nullptr;
// The ProdOV computation fails on divide by 0 and divide by -1. Cases with
// INT_MIN will also fail if the divisor is 1. Although folds of all these
// division-by-constant cases should be present, we can not assert that they
// have happened before we reach this icmp instruction.
if (C2->isNullValue() || C2->isOneValue() ||
(DivIsSigned && C2->isAllOnesValue()))
return nullptr;
// Compute Prod = C * C2. We are essentially solving an equation of
// form X / C2 = C. We solve for X by multiplying C2 and C.
// By solving for X, we can turn this into a range check instead of computing
// a divide.
APInt Prod = C * *C2;
// Determine if the product overflows by seeing if the product is not equal to
// the divide. Make sure we do the same kind of divide as in the LHS
// instruction that we're folding.
bool ProdOV = (DivIsSigned ? Prod.sdiv(*C2) : Prod.udiv(*C2)) != C;
ICmpInst::Predicate Pred = Cmp.getPredicate();
// If the division is known to be exact, then there is no remainder from the
// divide, so the covered range size is unit, otherwise it is the divisor.
APInt RangeSize = Div->isExact() ? APInt(C2->getBitWidth(), 1) : *C2;
// Figure out the interval that is being checked. For example, a comparison
// like "X /u 5 == 0" is really checking that X is in the interval [0, 5).
// Compute this interval based on the constants involved and the signedness of
// the compare/divide. This computes a half-open interval, keeping track of
// whether either value in the interval overflows. After analysis each
// overflow variable is set to 0 if its corresponding bound variable is valid,
// -1 if overflowed off the bottom end, or +1 if overflowed off the top end.
int LoOverflow = 0, HiOverflow = 0;
APInt LoBound, HiBound;
if (!DivIsSigned) { // udiv
// e.g. X/5 op 3 --> [15, 20)
LoBound = Prod;
HiOverflow = LoOverflow = ProdOV;
if (!HiOverflow) {
// If this is not an exact divide, then many values in the range collapse
// to the same result value.
HiOverflow = addWithOverflow(HiBound, LoBound, RangeSize, false);
}
} else if (C2->isStrictlyPositive()) { // Divisor is > 0.
if (C.isNullValue()) { // (X / pos) op 0
// Can't overflow. e.g. X/2 op 0 --> [-1, 2)
LoBound = -(RangeSize - 1);
HiBound = RangeSize;
} else if (C.isStrictlyPositive()) { // (X / pos) op pos
LoBound = Prod; // e.g. X/5 op 3 --> [15, 20)
HiOverflow = LoOverflow = ProdOV;
if (!HiOverflow)
HiOverflow = addWithOverflow(HiBound, Prod, RangeSize, true);
} else { // (X / pos) op neg
// e.g. X/5 op -3 --> [-15-4, -15+1) --> [-19, -14)
HiBound = Prod + 1;
LoOverflow = HiOverflow = ProdOV ? -1 : 0;
if (!LoOverflow) {
APInt DivNeg = -RangeSize;
LoOverflow = addWithOverflow(LoBound, HiBound, DivNeg, true) ? -1 : 0;
}
}
} else if (C2->isNegative()) { // Divisor is < 0.
// For an exact divide RangeSize was set to 1 above; flip it to -1 so it has
// the divisor's sign, as the non-exact RangeSize (the divisor) already does.
if (Div->isExact())
RangeSize.negate();
if (C.isNullValue()) { // (X / neg) op 0
// e.g. X/-5 op 0 --> [-4, 5)
LoBound = RangeSize + 1;
HiBound = -RangeSize;
if (HiBound == *C2) { // -INTMIN = INTMIN
HiOverflow = 1; // [INTMIN+1, overflow)
HiBound = APInt(); // e.g. X/INTMIN = 0 --> X > INTMIN
}
} else if (C.isStrictlyPositive()) { // (X / neg) op pos
// e.g. X/-5 op 3 --> [-19, -14)
HiBound = Prod + 1;
HiOverflow = LoOverflow = ProdOV ? -1 : 0;
if (!LoOverflow)
LoOverflow = addWithOverflow(LoBound, HiBound, RangeSize, true) ? -1:0;
} else { // (X / neg) op neg
LoBound = Prod; // e.g. X/-5 op -3 --> [15, 20)
LoOverflow = HiOverflow = ProdOV;
if (!HiOverflow)
HiOverflow = subWithOverflow(HiBound, Prod, RangeSize, true);
}
// Dividing by a negative swaps the condition. LT <-> GT
Pred = ICmpInst::getSwappedPredicate(Pred);
}
// Emit the replacement for the (possibly swapped) predicate, using the
// overflow flags to drop whichever side of the range cannot be represented.
Value *X = Div->getOperand(0);
switch (Pred) {
default: llvm_unreachable("Unhandled icmp opcode!");
case ICmpInst::ICMP_EQ:
// Quotient == C  <=>  X in [LoBound, HiBound).
if (LoOverflow && HiOverflow)
return replaceInstUsesWith(Cmp, Builder.getFalse());
if (HiOverflow)
return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE :
ICmpInst::ICMP_UGE, X,
ConstantInt::get(Div->getType(), LoBound));
if (LoOverflow)
return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT :
ICmpInst::ICMP_ULT, X,
ConstantInt::get(Div->getType(), HiBound));
return replaceInstUsesWith(
Cmp, insertRangeTest(X, LoBound, HiBound, DivIsSigned, true));
case ICmpInst::ICMP_NE:
// Quotient != C  <=>  X outside [LoBound, HiBound).
if (LoOverflow && HiOverflow)
return replaceInstUsesWith(Cmp, Builder.getTrue());
if (HiOverflow)
return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT :
ICmpInst::ICMP_ULT, X,
ConstantInt::get(Div->getType(), LoBound));
if (LoOverflow)
return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE :
ICmpInst::ICMP_UGE, X,
ConstantInt::get(Div->getType(), HiBound));
return replaceInstUsesWith(Cmp,
insertRangeTest(X, LoBound, HiBound,
DivIsSigned, false));
case ICmpInst::ICMP_ULT:
case ICmpInst::ICMP_SLT:
if (LoOverflow == +1) // Low bound is greater than input range.
return replaceInstUsesWith(Cmp, Builder.getTrue());
if (LoOverflow == -1) // Low bound is less than input range.
return replaceInstUsesWith(Cmp, Builder.getFalse());
return new ICmpInst(Pred, X, ConstantInt::get(Div->getType(), LoBound));
case ICmpInst::ICMP_UGT:
case ICmpInst::ICMP_SGT:
if (HiOverflow == +1) // High bound greater than input range.
return replaceInstUsesWith(Cmp, Builder.getFalse());
if (HiOverflow == -1) // High bound less than input range.
return replaceInstUsesWith(Cmp, Builder.getTrue());
// HiBound is exclusive, so "greater than the range" is X >= HiBound.
if (Pred == ICmpInst::ICMP_UGT)
return new ICmpInst(ICmpInst::ICMP_UGE, X,
ConstantInt::get(Div->getType(), HiBound));
return new ICmpInst(ICmpInst::ICMP_SGE, X,
ConstantInt::get(Div->getType(), HiBound));
}
return nullptr;
}
/// Fold icmp (sub X, Y), C.
///
/// \param Cmp the compare being simplified.
/// \param Sub the 'sub' feeding operand 0 of \p Cmp.
/// \param C   the constant on the right-hand side of \p Cmp.
/// \returns a replacement compare, or nullptr when nothing applies.
Instruction *InstCombiner::foldICmpSubConstant(ICmpInst &Cmp,
                                               BinaryOperator *Sub,
                                               const APInt &C) {
  Value *X = Sub->getOperand(0);
  Value *Y = Sub->getOperand(1);
  const ICmpInst::Predicate Pred = Cmp.getPredicate();

  // Several folds below need the minuend to be a constant C2.
  const APInt *C2;
  const bool MinuendIsConst = match(X, m_APInt(C2));

  // (icmp P (sub nuw|nsw C2, Y), C) -> (icmp swap(P) Y, C2-C)
  // The no-wrap flag must agree with the signedness of the compare, and the
  // constant subtraction C2 - C itself must not overflow.
  if (MinuendIsConst) {
    const bool WrapFlagMatchesCmp =
        (Cmp.isUnsigned() && Sub->hasNoUnsignedWrap()) ||
        (Cmp.isSigned() && Sub->hasNoSignedWrap());
    APInt SubResult;
    if (WrapFlagMatchesCmp &&
        !subWithOverflow(SubResult, *C2, C, Cmp.isSigned()))
      return new ICmpInst(Cmp.getSwappedPredicate(), Y,
                          ConstantInt::get(Y->getType(), SubResult));
  }

  // The following transforms are only worth it if the only user of the subtract
  // is the icmp.
  if (!Sub->hasOneUse())
    return nullptr;

  if (Sub->hasNoSignedWrap()) {
    if (Pred == ICmpInst::ICMP_SGT) {
      // (icmp sgt (sub nsw X, Y), -1) -> (icmp sge X, Y)
      if (C.isAllOnesValue())
        return new ICmpInst(ICmpInst::ICMP_SGE, X, Y);
      // (icmp sgt (sub nsw X, Y), 0) -> (icmp sgt X, Y)
      if (C.isNullValue())
        return new ICmpInst(ICmpInst::ICMP_SGT, X, Y);
    } else if (Pred == ICmpInst::ICMP_SLT) {
      // (icmp slt (sub nsw X, Y), 0) -> (icmp slt X, Y)
      if (C.isNullValue())
        return new ICmpInst(ICmpInst::ICMP_SLT, X, Y);
      // (icmp slt (sub nsw X, Y), 1) -> (icmp sle X, Y)
      if (C.isOneValue())
        return new ICmpInst(ICmpInst::ICMP_SLE, X, Y);
    }
  }

  if (!MinuendIsConst)
    return nullptr;

  // C2 - Y <u C -> (Y | (C - 1)) == C2
  //   iff (C2 & (C - 1)) == C - 1 and C is a power of 2
  if (Pred == ICmpInst::ICMP_ULT && C.isPowerOf2()) {
    const APInt LowMask = C - 1;
    if ((*C2 & LowMask) == LowMask)
      return new ICmpInst(ICmpInst::ICMP_EQ, Builder.CreateOr(Y, LowMask), X);
  }

  // C2 - Y >u C -> (Y | C) != C2
  //   iff C2 & C == C and C + 1 is a power of 2
  if (Pred == ICmpInst::ICMP_UGT && (C + 1).isPowerOf2() && (*C2 & C) == C)
    return new ICmpInst(ICmpInst::ICMP_NE, Builder.CreateOr(Y, C), X);

  return nullptr;
}
/// Fold icmp (add X, Y), C.
///
/// Only relational compares of a one-use add with a constant addend C2 are
/// handled here.
///
/// \param Cmp the compare being simplified.
/// \param Add the 'add' feeding operand 0 of \p Cmp.
/// \param C   the constant on the right-hand side of \p Cmp.
/// \returns a replacement compare, or nullptr when nothing applies.
Instruction *InstCombiner::foldICmpAddConstant(ICmpInst &Cmp,
                                               BinaryOperator *Add,
                                               const APInt &C) {
  // Equality comparisons are handled elsewhere.
  if (Cmp.isEquality())
    return nullptr;

  // Fold icmp pred (add X, C2), C.
  Value *Y = Add->getOperand(1);
  const APInt *C2;
  if (!match(Y, m_APInt(C2)))
    return nullptr;

  if (!Add->hasOneUse())
    return nullptr;

  Value *X = Add->getOperand(0);
  Type *Ty = Add->getType();
  const CmpInst::Predicate Pred = Cmp.getPredicate();
  const bool IsSignedCmp = Cmp.isSigned();

  // If the add does not wrap, we can always adjust the compare by subtracting
  // the constants. SGE/SLE/UGE/ULE are canonicalized to SGT/SLT/UGT/ULT.
  const bool SignedNoWrap =
      Add->hasNoSignedWrap() &&
      (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLT);
  const bool UnsignedNoWrap =
      Add->hasNoUnsignedWrap() &&
      (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULT);
  if (SignedNoWrap || UnsignedNoWrap) {
    bool Overflow;
    APInt NewC =
        IsSignedCmp ? C.ssub_ov(*C2, Overflow) : C.usub_ov(*C2, Overflow);
    // If there is overflow, the result must be true or false.
    // TODO: Can we assert there is no overflow because InstSimplify always
    // handles those cases?
    if (!Overflow) {
      // icmp Pred (add nsw X, C2), C --> icmp Pred X, (C - C2)
      return new ICmpInst(Pred, X, ConstantInt::get(Ty, NewC));
    }
  }

  // Describe the set of X values satisfying the compare as a constant range;
  // when one end of that range sits at the minimum value of the compare's
  // signedness, a single compare against the other end is enough.
  auto CR = ConstantRange::makeExactICmpRegion(Pred, C).subtract(*C2);
  const APInt &Lower = CR.getLower();
  const APInt &Upper = CR.getUpper();
  if (IsSignedCmp) {
    if (Lower.isSignMask())
      return new ICmpInst(ICmpInst::ICMP_SLT, X, ConstantInt::get(Ty, Upper));
    if (Upper.isSignMask())
      return new ICmpInst(ICmpInst::ICMP_SGE, X, ConstantInt::get(Ty, Lower));
  } else {
    if (Lower.isMinValue())
      return new ICmpInst(ICmpInst::ICMP_ULT, X, ConstantInt::get(Ty, Upper));
    if (Upper.isMinValue())
      return new ICmpInst(ICmpInst::ICMP_UGE, X, ConstantInt::get(Ty, Lower));
  }

  // (X + C2) <u C -> (X & -C) == -C2
  //   iff C2 & (C - 1) == 0
  //       C is a power of 2
  if (Pred == ICmpInst::ICMP_ULT && C.isPowerOf2() && (*C2 & (C - 1)) == 0) {
    Value *Masked = Builder.CreateAnd(X, -C);
    return new ICmpInst(ICmpInst::ICMP_EQ, Masked,
                        ConstantExpr::getNeg(cast<Constant>(Y)));
  }

  // (X + C2) >u C -> (X & ~C) != -C2
  //   iff C2 & C == 0
  //       C + 1 is a power of 2
  if (Pred == ICmpInst::ICMP_UGT && (C + 1).isPowerOf2() && (*C2 & C) == 0) {
    Value *Masked = Builder.CreateAnd(X, ~C);
    return new ICmpInst(ICmpInst::ICMP_NE, Masked,
                        ConstantExpr::getNeg(cast<Constant>(Y)));
  }

  return nullptr;
}
/// Recognize a three-way integer comparison spelled as nested selects:
///
///   select i1 (a == b), i32 Equal,
///          i32 (select i1 (a < b), i32 Less, i32 Greater)
///
/// where Equal, Less and Greater are placeholders for any three constants and
/// the inner compare is a signed less-than of the same two operands.
///
/// \param SI the outer select to inspect.
/// \param [out] LHS, RHS the two values being compared.
/// \param [out] Less, Equal, Greater the constants chosen for each outcome.
/// \returns true if \p SI has this shape. On a false return the output
///          parameters may have been partially written.
bool InstCombiner::matchThreeWayIntCompare(SelectInst *SI, Value *&LHS,
                                           Value *&RHS, ConstantInt *&Less,
                                           ConstantInt *&Equal,
                                           ConstantInt *&Greater) {
  // TODO: Generalize this to work with other comparison idioms or ensure
  // they get canonicalized into this form.

  // Outer select: the true arm is the "equal" constant ...
  if (!match(SI->getTrueValue(), m_ConstantInt(Equal)))
    return false;

  // ... chosen by an equality compare of LHS and RHS.
  ICmpInst::Predicate OuterPred;
  if (!match(SI->getCondition(),
             m_ICmp(OuterPred, m_Value(LHS), m_Value(RHS))))
    return false;
  if (OuterPred != ICmpInst::ICMP_EQ)
    return false;

  // Inner select: a signed less-than of the same operands picks between the
  // "less" and "greater" constants.
  ICmpInst::Predicate InnerPred;
  if (!match(SI->getFalseValue(),
             m_Select(m_ICmp(InnerPred, m_Specific(LHS), m_Specific(RHS)),
                      m_ConstantInt(Less), m_ConstantInt(Greater))))
    return false;

  return InnerPred == ICmpInst::ICMP_SLT;
}
/// Fold icmp (select ...), C where the select is a three-way comparison.
///
/// If we're testing a constant value against the result of a three way
/// comparison, the result can be expressed directly in terms of the original
/// values being compared.
///
/// \param Cmp    the compare being simplified.
/// \param Select the select feeding \p Cmp.
/// \param C      the constant on the right-hand side of \p Cmp; must be
///               non-null.
/// \returns the result of replacing \p Cmp with the new condition, or nullptr
///          when the pattern does not match.
Instruction *InstCombiner::foldICmpSelectConstant(ICmpInst &Cmp,
                                                  SelectInst *Select,
                                                  ConstantInt *C) {
  assert(C && "Cmp RHS should be a constant int!");

  // Note: We could possibly be more aggressive here and remove the hasOneUse
  // test. The original select is really likely to simplify or sink when we
  // remove a test of the result.
  Value *OrigLHS, *OrigRHS;
  ConstantInt *C1LessThan, *C2Equal, *C3GreaterThan;
  if (!Cmp.hasOneUse() ||
      !matchThreeWayIntCompare(Select, OrigLHS, OrigRHS, C1LessThan, C2Equal,
                               C3GreaterThan))
    return nullptr;

  assert(C1LessThan && C2Equal && C3GreaterThan);

  // Evaluate the original predicate against each of the three constants the
  // select can produce.
  auto PredHoldsFor = [&](ConstantInt *Outcome) {
    return ConstantExpr::getCompare(Cmp.getPredicate(), Outcome, C)
        ->isAllOnesValue();
  };
  const bool TrueWhenLessThan = PredHoldsFor(C1LessThan);
  const bool TrueWhenEqual = PredHoldsFor(C2Equal);
  const bool TrueWhenGreaterThan = PredHoldsFor(C3GreaterThan);

  // This generates the new instruction that will replace the original Cmp
  // Instruction. Instead of enumerating the various combinations when
  // TrueWhenLessThan, TrueWhenEqual and TrueWhenGreaterThan are true versus
  // false, we rely on chaining of ORs and future passes of InstCombine to
  // simplify the OR further (i.e. a s< b || a == b becomes a s<= b).
  struct Outcome {
    bool Holds;
    ICmpInst::Predicate Relation;
  };
  const Outcome Outcomes[] = {
      {TrueWhenLessThan, ICmpInst::ICMP_SLT},
      {TrueWhenEqual, ICmpInst::ICMP_EQ},
      {TrueWhenGreaterThan, ICmpInst::ICMP_SGT},
  };

  // When none of the three constants satisfy the predicate for the RHS (C),
  // the entire original Cmp can be simplified to a false.
  Value *Cond = Builder.getFalse();
  for (const Outcome &O : Outcomes) {
    if (!O.Holds)
      continue;
    Value *Relation = Builder.CreateICmp(O.Relation, OrigLHS, OrigRHS);
    Cond = Builder.CreateOr(Cond, Relation);
  }
  return replaceInstUsesWith(Cmp, Cond);
}
/// Fold icmp (bitcast X), Y.
///
/// Handles three families of patterns, in order:
///  - zero-equality / sign-bit checks seen through sitofp/uitofp + bitcast,
///  - pointer-to-pointer bitcasts that can be stripped from both operands,
///  - a bitcast of a splat shufflevector compared against a splat constant.
///
/// \param Cmp     the compare being simplified; nothing is done unless its
///                operand 0 is a BitCastInst.
/// \param Builder used to create any helper instructions.
/// \returns a replacement compare, or nullptr when nothing applies.
static Instruction *foldICmpBitCast(ICmpInst &Cmp,
                                    InstCombiner::BuilderTy &Builder) {
  auto *Bitcast = dyn_cast<BitCastInst>(Cmp.getOperand(0));
  if (!Bitcast)
    return nullptr;

  const ICmpInst::Predicate Pred = Cmp.getPredicate();
  Value *Op1 = Cmp.getOperand(1);
  Value *BCSrcOp = Bitcast->getOperand(0);

  // Make sure the bitcast doesn't change the number of vector elements.
  const bool SameScalarWidth = Bitcast->getSrcTy()->getScalarSizeInBits() ==
                               Bitcast->getDestTy()->getScalarSizeInBits();
  if (SameScalarWidth) {
    Value *X;

    // Zero-equality and sign-bit checks are preserved through sitofp + bitcast.
    if (match(BCSrcOp, m_SIToFP(m_Value(X)))) {
      const bool IsZeroOrSignPred =
          Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE ||
          Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGT;
      // icmp  eq (bitcast (sitofp X)), 0 --> icmp  eq X, 0
      // icmp  ne (bitcast (sitofp X)), 0 --> icmp  ne X, 0
      // icmp slt (bitcast (sitofp X)), 0 --> icmp slt X, 0
      // icmp sgt (bitcast (sitofp X)), 0 --> icmp sgt X, 0
      if (IsZeroOrSignPred && match(Op1, m_Zero()))
        return new ICmpInst(Pred, X, ConstantInt::getNullValue(X->getType()));

      // icmp slt (bitcast (sitofp X)), 1 --> icmp slt X, 1
      if (Pred == ICmpInst::ICMP_SLT && match(Op1, m_One()))
        return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), 1));

      // icmp sgt (bitcast (sitofp X)), -1 --> icmp sgt X, -1
      if (Pred == ICmpInst::ICMP_SGT && match(Op1, m_AllOnes()))
        return new ICmpInst(Pred, X,
                            ConstantInt::getAllOnesValue(X->getType()));
    }

    // Zero-equality checks are preserved through unsigned floating-point casts:
    // icmp eq (bitcast (uitofp X)), 0 --> icmp eq X, 0
    // icmp ne (bitcast (uitofp X)), 0 --> icmp ne X, 0
    if (match(BCSrcOp, m_UIToFP(m_Value(X))) && Cmp.isEquality() &&
        match(Op1, m_Zero()))
      return new ICmpInst(Pred, X, ConstantInt::getNullValue(X->getType()));
  }

  // Test to see if the operands of the icmp are casted versions of other
  // values. If the ptr->ptr cast can be stripped off both arguments, do so.
  const bool RHSIsStrippable = isa<Constant>(Op1) || isa<BitCastInst>(Op1);
  if (Bitcast->getType()->isPointerTy() && RHSIsStrippable) {
    // If operand #1 is a bitcast instruction, it must also be a ptr->ptr cast
    // so eliminate it as well.
    if (auto *RHSCast = dyn_cast<BitCastInst>(Op1))
      Op1 = RHSCast->getOperand(0);

    // Re-cast the RHS to the type of the uncasted LHS.
    Op1 = Builder.CreateBitCast(Op1, BCSrcOp->getType());
    return new ICmpInst(Pred, BCSrcOp, Op1);
  }

  // Folding: icmp <pred> iN X, C
  //  where X = bitcast <M x iK> (shufflevector <M x iK> %vec, undef, SC)) to iN
  //    and C is a splat of a K-bit pattern
  //    and SC is a constant vector = <C', C', C', ..., C'>
  // Into:
  //   %E = extractelement <M x iK> %vec, i32 C'
  //   icmp <pred> iK %E, trunc(C)
  const APInt *C;
  if (!match(Cmp.getOperand(1), m_APInt(C)) ||
      !Bitcast->getType()->isIntegerTy() ||
      !Bitcast->getSrcTy()->isIntOrIntVectorTy())
    return nullptr;

  Value *Vec;
  Constant *Mask;
  if (!match(BCSrcOp,
             m_ShuffleVector(m_Value(Vec), m_Undef(), m_Constant(Mask))))
    return nullptr;

  // Check whether every element of Mask is the same constant.
  auto *Elem = dyn_cast_or_null<ConstantInt>(Mask->getSplatValue());
  if (!Elem)
    return nullptr;

  auto *VecTy = cast<VectorType>(BCSrcOp->getType());
  auto *EltTy = cast<IntegerType>(VecTy->getElementType());
  const unsigned EltBits = EltTy->getBitWidth();

  // C must be a repetition of a single element-sized bit pattern.
  if (!C->isSplat(EltBits))
    return nullptr;

  // Compare the one element the shuffle replicates against that pattern.
  Value *Extract = Builder.CreateExtractElement(Vec, Elem);
  Value *NewC = ConstantInt::get(EltTy, C->trunc(EltBits));
  return new ICmpInst(Pred, Extract, NewC);
}
/// Dispatch folds for an integer comparison with a constant RHS:
///   icmp Pred X, C
/// where X is an instruction and C matches m_APInt.
///
/// The kind of X (binary operator, select, trunc, intrinsic call) selects
/// which specialised helper is tried.
///
/// \returns the replacement produced by the first helper that succeeds, or
///          nullptr if none does.
Instruction *InstCombiner::foldICmpInstWithConstant(ICmpInst &Cmp) {
  const APInt *RHSInt;
  if (!match(Cmp.getOperand(1), m_APInt(RHSInt)))
    return nullptr;

  if (auto *BinOp = dyn_cast<BinaryOperator>(Cmp.getOperand(0))) {
    Instruction *Folded = nullptr;
    switch (BinOp->getOpcode()) {
    case Instruction::Xor:
      Folded = foldICmpXorConstant(Cmp, BinOp, *RHSInt);
      break;
    case Instruction::And:
      Folded = foldICmpAndConstant(Cmp, BinOp, *RHSInt);
      break;
    case Instruction::Or:
      Folded = foldICmpOrConstant(Cmp, BinOp, *RHSInt);
      break;
    case Instruction::Mul:
      Folded = foldICmpMulConstant(Cmp, BinOp, *RHSInt);
      break;
    case Instruction::Shl:
      Folded = foldICmpShlConstant(Cmp, BinOp, *RHSInt);
      break;
    case Instruction::LShr:
    case Instruction::AShr:
      Folded = foldICmpShrConstant(Cmp, BinOp, *RHSInt);
      break;
    case Instruction::UDiv:
      Folded = foldICmpUDivConstant(Cmp, BinOp, *RHSInt);
      if (Folded)
        break;
      // The udiv-specific fold did not fire; try the generic division fold.
      LLVM_FALLTHROUGH;
    case Instruction::SDiv:
      Folded = foldICmpDivConstant(Cmp, BinOp, *RHSInt);
      break;
    case Instruction::Sub:
      Folded = foldICmpSubConstant(Cmp, BinOp, *RHSInt);
      break;
    case Instruction::Add:
      Folded = foldICmpAddConstant(Cmp, BinOp, *RHSInt);
      break;
    default:
      break;
    }
    if (Folded)
      return Folded;

    // TODO: These folds could be refactored to be part of the above calls.
    if (Instruction *EqFold =
            foldICmpBinOpEqualityWithConstant(Cmp, BinOp, *RHSInt))
      return EqFold;
  }

  // Match against CmpInst LHS being instructions other than binary operators.
  if (auto *Sel = dyn_cast<SelectInst>(Cmp.getOperand(0))) {
    // For now, we only support constant integers while folding the
    // ICMP(SELECT)) pattern. We can extend this to support vector of integers
    // similar to the cases handled by binary ops above.
    if (auto *ScalarRHS = dyn_cast<ConstantInt>(Cmp.getOperand(1))) {
      if (Instruction *SelFold = foldICmpSelectConstant(Cmp, Sel, ScalarRHS))
        return SelFold;
    }
  }

  if (auto *Trunc = dyn_cast<TruncInst>(Cmp.getOperand(0))) {
    if (Instruction *TruncFold = foldICmpTruncConstant(Cmp, Trunc, *RHSInt))
      return TruncFold;
  }

  if (auto *Call = dyn_cast<IntrinsicInst>(Cmp.getOperand(0))) {
    if (Instruction *CallFold =
            foldICmpIntrinsicWithConstant(Cmp, Call, *RHSInt))
      return CallFold;
  }

  return nullptr;
}
/// Fold an equality comparison of a binary operator against a constant:
///   icmp eq/ne BO, C
///
/// \param Cmp the comparison; anything other than eq/ne is rejected.
/// \param BO  the binary operator on the left-hand side of \p Cmp.
/// \param C   the integer value of the right-hand side constant.
/// \returns a newly created icmp, or nullptr if no fold applies.
Instruction *InstCombiner::foldICmpBinOpEqualityWithConstant(ICmpInst &Cmp,
                                                             BinaryOperator *BO,
                                                             const APInt &C) {
  // TODO: Some of these folds could work with arbitrary constants, but this
  // function is limited to scalar and vector splat constants.
  if (!Cmp.isEquality())
    return nullptr;

  const ICmpInst::Predicate Pred = Cmp.getPredicate();
  const bool IsNE = Pred == ICmpInst::ICMP_NE;
  Constant *CmpRHS = cast<Constant>(Cmp.getOperand(1));
  Value *Lhs = BO->getOperand(0);
  Value *Rhs = BO->getOperand(1);
  const bool CmpWithZero = C.isNullValue();
  const bool SingleUse = BO->hasOneUse();

  switch (BO->getOpcode()) {
  case Instruction::SRem: {
    // If we have a signed (X % (2^c)) == 0, turn it into an unsigned one.
    if (!CmpWithZero || !SingleUse)
      break;
    const APInt *Divisor;
    if (match(Rhs, m_APInt(Divisor)) && Divisor->sgt(1) &&
        Divisor->isPowerOf2()) {
      Value *URem = Builder.CreateURem(Lhs, Rhs, BO->getName());
      return new ICmpInst(Pred, URem, Constant::getNullValue(BO->getType()));
    }
    break;
  }
  case Instruction::Add: {
    // Replace ((add A, B) != C) with (A != C-B) if B & C are constants.
    const APInt *Addend;
    if (match(Rhs, m_APInt(Addend))) {
      if (SingleUse) {
        Constant *Diff = ConstantExpr::getSub(CmpRHS, cast<Constant>(Rhs));
        return new ICmpInst(Pred, Lhs, Diff);
      }
      break;
    }
    if (!CmpWithZero)
      break;
    // Replace ((add A, B) != 0) with (A != -B) if A or B is
    // efficiently invertible, or if the add has just this one use.
    if (Value *NegRhs = dyn_castNegVal(Rhs))
      return new ICmpInst(Pred, Lhs, NegRhs);
    if (Value *NegLhs = dyn_castNegVal(Lhs))
      return new ICmpInst(Pred, NegLhs, Rhs);
    if (SingleUse) {
      Value *NewNeg = Builder.CreateNeg(Rhs);
      NewNeg->takeName(BO);
      return new ICmpInst(Pred, Lhs, NewNeg);
    }
    break;
  }
  case Instruction::Xor: {
    if (!SingleUse)
      break;
    // With a constant xor operand the two constants can be xor'ed together,
    // eliminating the explicit xor.
    if (auto *XorC = dyn_cast<Constant>(Rhs))
      return new ICmpInst(Pred, Lhs, ConstantExpr::getXor(CmpRHS, XorC));
    // Replace ((xor A, B) != 0) with (A != B)
    if (CmpWithZero)
      return new ICmpInst(Pred, Lhs, Rhs);
    break;
  }
  case Instruction::Sub: {
    if (!SingleUse)
      break;
    const APInt *Minuend;
    if (match(Lhs, m_APInt(Minuend))) {
      // Replace ((sub BOC, B) != C) with (B != BOC-C).
      Constant *Diff = ConstantExpr::getSub(cast<Constant>(Lhs), CmpRHS);
      return new ICmpInst(Pred, Rhs, Diff);
    }
    // Replace ((sub A, B) != 0) with (A != B).
    if (CmpWithZero)
      return new ICmpInst(Pred, Lhs, Rhs);
    break;
  }
  case Instruction::Or: {
    const APInt *OrC;
    if (!match(Rhs, m_APInt(OrC)) || !SingleUse || !CmpRHS->isAllOnesValue())
      break;
    // Comparing if all bits outside of a constant mask are set?
    // Replace (X | C) == -1 with (X & ~C) == ~C.
    // This removes the -1 constant.
    Constant *InvMask = ConstantExpr::getNot(cast<Constant>(Rhs));
    Value *Masked = Builder.CreateAnd(Lhs, InvMask);
    return new ICmpInst(Pred, Masked, InvMask);
  }
  case Instruction::And: {
    const APInt *AndC;
    if (!match(Rhs, m_APInt(AndC)))
      break;
    // If we have ((X & C) == C), turn it into ((X & C) != 0).
    if (C == *AndC && C.isPowerOf2()) {
      const ICmpInst::Predicate Flipped =
          IsNE ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
      return new ICmpInst(Flipped, BO,
                          Constant::getNullValue(CmpRHS->getType()));
    }
    break;
  }
  case Instruction::Mul: {
    if (!CmpWithZero || !BO->hasNoSignedWrap())
      break;
    const APInt *Factor;
    if (match(Rhs, m_APInt(Factor)) && !Factor->isNullValue()) {
      // The trivial case (mul X, 0) is handled by InstSimplify.
      // General case : (mul X, C) != 0 iff X != 0
      //                (mul X, C) == 0 iff X == 0
      return new ICmpInst(Pred, Lhs, Constant::getNullValue(CmpRHS->getType()));
    }
    break;
  }
  case Instruction::UDiv: {
    if (!CmpWithZero)
      break;
    // (icmp eq/ne (udiv A, B), 0) -> (icmp ugt/ule i32 B, A)
    const ICmpInst::Predicate DivPred =
        IsNE ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_UGT;
    return new ICmpInst(DivPred, Rhs, Lhs);
  }
  default:
    break;
  }
  return nullptr;
}
/// Fold an equality icmp whose LHS is an intrinsic call and whose RHS is a
/// constant: icmp eq/ne (intrinsic ...), C.
///
/// Handles bswap, ctlz, cttz and ctpop. On success the comparison is rewritten
/// in place and the intrinsic call is added to the worklist.
///
/// \param Cmp the comparison to rewrite.
/// \param II  the intrinsic call feeding \p Cmp.
/// \param C   the integer value of the constant operand.
/// \returns \p Cmp itself when it was rewritten, otherwise nullptr.
Instruction *InstCombiner::foldICmpEqIntrinsicWithConstant(ICmpInst &Cmp,
                                                           IntrinsicInst *II,
                                                           const APInt &C) {
  Type *Ty = II->getType();
  const unsigned BitWidth = C.getBitWidth();

  // Queue the intrinsic for revisiting, then swap in both new operands.
  auto RewriteOperands = [&](Value *NewLHS, Value *NewRHS) -> Instruction * {
    Worklist.Add(II);
    Cmp.setOperand(0, NewLHS);
    Cmp.setOperand(1, NewRHS);
    return &Cmp;
  };

  const auto IID = II->getIntrinsicID();
  switch (IID) {
  case Intrinsic::bswap:
    // bswap(A) == C  ->  A == bswap(C)
    return RewriteOperands(II->getArgOperand(0),
                           ConstantInt::get(Ty, C.byteSwap()));

  case Intrinsic::ctlz:
  case Intrinsic::cttz: {
    // ctz(A) == bitwidth(A)  ->  A == 0 and likewise for !=
    if (C == BitWidth)
      return RewriteOperands(II->getArgOperand(0),
                             ConstantInt::getNullValue(Ty));

    // ctz(A) == C -> A & Mask1 == Mask2, where Mask2 only has bit C set
    // and Mask1 has bits 0..C set. Similar for ctlz, but for high bits.
    // Limit to one use to ensure we don't increase instruction count.
    const unsigned Num = C.getLimitedValue(BitWidth);
    if (Num == BitWidth || !II->hasOneUse())
      break;

    const bool CountsTrailing = IID == Intrinsic::cttz;
    const unsigned MarkerBit = CountsTrailing ? Num : BitWidth - Num - 1;
    APInt Mask1 = CountsTrailing ? APInt::getLowBitsSet(BitWidth, Num + 1)
                                 : APInt::getHighBitsSet(BitWidth, Num + 1);
    APInt Mask2 = APInt::getOneBitSet(BitWidth, MarkerBit);

    Value *Masked = Builder.CreateAnd(II->getArgOperand(0), Mask1);
    Cmp.setOperand(0, Masked);
    Cmp.setOperand(1, ConstantInt::get(Ty, Mask2));
    Worklist.Add(II);
    return &Cmp;
  }

  case Intrinsic::ctpop: {
    // popcount(A) == 0  ->  A == 0 and likewise for !=
    // popcount(A) == bitwidth(A)  ->  A == -1 and likewise for !=
    const bool IsZero = C.isNullValue();
    if (!IsZero && !(C == BitWidth))
      break;
    Constant *NewRHS = nullptr;
    if (IsZero)
      NewRHS = Constant::getNullValue(Ty);
    else
      NewRHS = Constant::getAllOnesValue(Ty);
    return RewriteOperands(II->getArgOperand(0), NewRHS);
  }

  default:
    break;
  }

  return nullptr;
}
/// Fold an icmp whose LHS is an intrinsic call and whose RHS is a constant:
///   icmp Pred II, C.
///
/// Equality predicates are forwarded to foldICmpEqIntrinsicWithConstant. For
/// the remaining predicates only unsigned range checks on ctlz/cttz are
/// handled here.
///
/// \returns a replacement instruction, or nullptr if no fold applies.
Instruction *InstCombiner::foldICmpIntrinsicWithConstant(ICmpInst &Cmp,
                                                         IntrinsicInst *II,
                                                         const APInt &C) {
  if (Cmp.isEquality())
    return foldICmpEqIntrinsicWithConstant(Cmp, II, C);

  Type *Ty = II->getType();
  const unsigned BitWidth = C.getBitWidth();
  const ICmpInst::Predicate Pred = Cmp.getPredicate();
  const bool IsUGT = Pred == ICmpInst::ICMP_UGT;
  const bool IsULT = Pred == ICmpInst::ICMP_ULT;

  switch (II->getIntrinsicID()) {
  case Intrinsic::ctlz: {
    // ctlz(0bXXXXXXXX) > 3 -> 0bXXXXXXXX < 0b00010000
    if (IsUGT && C.ult(BitWidth)) {
      const unsigned Num = C.getLimitedValue();
      const APInt UpperBound =
          APInt::getOneBitSet(BitWidth, BitWidth - Num - 1);
      Value *Arg = II->getArgOperand(0);
      return CmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_ULT, Arg,
                             ConstantInt::get(Ty, UpperBound));
    }

    // ctlz(0bXXXXXXXX) < 3 -> 0bXXXXXXXX > 0b00011111
    if (IsULT && C.uge(1) && C.ule(BitWidth)) {
      const unsigned Num = C.getLimitedValue();
      const APInt LowerBound = APInt::getLowBitsSet(BitWidth, BitWidth - Num);
      Value *Arg = II->getArgOperand(0);
      return CmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_UGT, Arg,
                             ConstantInt::get(Ty, LowerBound));
    }
    break;
  }

  case Intrinsic::cttz: {
    // Limit to one use to ensure we don't increase instruction count.
    if (!II->hasOneUse())
      return nullptr;

    // cttz(0bXXXXXXXX) > 3 -> 0bXXXXXXXX & 0b00001111 == 0
    if (IsUGT && C.ult(BitWidth)) {
      const APInt LowMask =
          APInt::getLowBitsSet(BitWidth, C.getLimitedValue() + 1);
      Value *Masked = Builder.CreateAnd(II->getArgOperand(0), LowMask);
      return CmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_EQ, Masked,
                             ConstantInt::getNullValue(Ty));
    }

    // cttz(0bXXXXXXXX) < 3 -> 0bXXXXXXXX & 0b00000111 != 0
    if (IsULT && C.uge(1) && C.ule(BitWidth)) {
      const APInt LowMask = APInt::getLowBitsSet(BitWidth, C.getLimitedValue());
      Value *Masked = Builder.CreateAnd(II->getArgOperand(0), LowMask);
      return CmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_NE, Masked,
                             ConstantInt::getNullValue(Ty));
    }
    break;
  }

  default:
    break;
  }

  return nullptr;
}
/// Handle icmp with constant (but not simple integer constant) RHS.
///
/// The fold attempted depends on the opcode of the LHS instruction:
/// GetElementPtr, PHI, Select, IntToPtr and Load are handled; every other
/// opcode falls out of the switch.
///
/// \param I the comparison to simplify.
/// \returns a replacement instruction, or nullptr when the operands do not
///          have the required shape or no fold applies.
Instruction *InstCombiner::foldICmpInstWithConstantNotInt(ICmpInst &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
Constant *RHSC = dyn_cast<Constant>(Op1);
Instruction *LHSI = dyn_cast<Instruction>(Op0);
// Both a constant RHS and an instruction LHS are required.
if (!RHSC || !LHSI)
return nullptr;
switch (LHSI->getOpcode()) {
case Instruction::GetElementPtr:
// icmp pred GEP (P, int 0, int 0, int 0), null -> icmp pred P, null
if (RHSC->isNullValue() &&
cast<GetElementPtrInst>(LHSI)->hasAllZeroIndices())
return new ICmpInst(
I.getPredicate(), LHSI->getOperand(0),
Constant::getNullValue(LHSI->getOperand(0)->getType()));
break;
case Instruction::PHI:
// Only fold icmp into the PHI if the phi and icmp are in the same
// block. If in the same block, we're encouraging jump threading. If
// not, we are just pessimizing the code by making an i1 phi.
if (LHSI->getParent() == I.getParent())
if (Instruction *NV = foldOpIntoPhi(I, cast<PHINode>(LHSI)))
return NV;
break;
case Instruction::Select: {
// If either operand of the select is a constant, we can fold the
// comparison into the select arms, which will cause one to be
// constant folded and the select turned into a bitwise or.
// Note: this Op1 shadows the function-level Op1 declared above. Within this
// case Op1/Op2 hold the comparison results for the select's true/false arms.
Value *Op1 = nullptr, *Op2 = nullptr;
ConstantInt *CI = nullptr;
if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(1))) {
Op1 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC);
CI = dyn_cast<ConstantInt>(Op1);
}
// When the false arm is a constant as well, this assignment overwrites the
// CI computed from the true arm.
if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(2))) {
Op2 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC);
CI = dyn_cast<ConstantInt>(Op2);
}
// We only want to perform this transformation if it will not lead to
// additional code. This is true if either both sides of the select
// fold to a constant (in which case the icmp is replaced with a select
// which will usually simplify) or this is the only user of the
// select (in which case we are trading a select+icmp for a simpler
// select+icmp) or all uses of the select can be replaced based on
// dominance information ("Global cases").
bool Transform = false;
if (Op1 && Op2)
Transform = true;
else if (Op1 || Op2) {
// Local case
if (LHSI->hasOneUse())
Transform = true;
// Global cases
else if (CI && !CI->isZero())
// When Op1 is constant try replacing select with second operand.
// Otherwise Op2 is constant and try replacing select with first
// operand.
Transform =
replacedSelectWithOperand(cast<SelectInst>(LHSI), &I, Op1 ? 2 : 1);
}
if (Transform) {
// Materialize the comparison for whichever arm was not constant-folded.
if (!Op1)
Op1 = Builder.CreateICmp(I.getPredicate(), LHSI->getOperand(1), RHSC,
I.getName());
if (!Op2)
Op2 = Builder.CreateICmp(I.getPredicate(), LHSI->getOperand(2), RHSC,
I.getName());
// Reuse the original select condition with the two compared arms.
return SelectInst::Create(LHSI->getOperand(0), Op1, Op2);
}
break;
}
case Instruction::IntToPtr:
// icmp pred inttoptr(X), null -> icmp pred X, 0
// Only done when X already has the pointer-sized integer type that DL reports
// for the pointer type.
if (RHSC->isNullValue() &&
DL.getIntPtrType(RHSC->getType()) == LHSI->getOperand(0)->getType())
return new ICmpInst(
I.getPredicate(), LHSI->getOperand(0),
Constant::getNullValue(LHSI->getOperand(0)->getType()));
break;
case Instruction::Load:
// Try to optimize things like "A[i] > 4" to index computations.
// Requires a non-volatile load through a GEP whose base is a constant global
// with a definitive initializer.
if (GetElementPtrInst *GEP =
dyn_cast<GetElementPtrInst>(LHSI->getOperand(0))) {
if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
!cast<LoadInst>(LHSI)->isVolatile())
if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, I))
return Res;
}
break;
}
// No case produced a replacement (or the opcode is not handled).
return nullptr;
}
/// Some comparisons can be simplified.
/// In this case, we are looking for comparisons that look like
/// a check for a lossy truncation.
/// Folds:
/// icmp SrcPred (x & Mask), x to icmp DstPred x, Mask
/// Where Mask is some pattern that produces all-ones in low bits:
/// (-1 >> y)
/// ((-1 << y) >> y) <- non-canonical, has extra uses
/// ~(-1 << y)
/// ((1 << y) + (-1)) <- non-canonical, has extra uses
/// The Mask can be a constant, too.
/// For some predicates, the operands are commutative.
/// For others, x can only be on a specific side.
///
/// \param I       the comparison to inspect.
/// \param Builder used to create the replacement comparison.
/// \returns the value produced by Builder.CreateICmp for the new comparison,
///          or nullptr if the pattern does not match or the fold is rejected.
static Value *foldICmpWithLowBitMaskedVal(ICmpInst &I,
InstCombiner::BuilderTy &Builder) {
ICmpInst::Predicate SrcPred;
Value *X, *M, *Y;
// The four variable-mask shapes listed in the header comment.
auto m_VariableMask = m_CombineOr(
m_CombineOr(m_Not(m_Shl(m_AllOnes(), m_Value())),
m_Add(m_Shl(m_One(), m_Value()), m_AllOnes())),
m_CombineOr(m_LShr(m_AllOnes(), m_Value()),
m_LShr(m_Shl(m_AllOnes(), m_Value(Y)), m_Deferred(Y))));
// A mask is either one of the variable shapes or a constant low-bit mask.
auto m_Mask = m_CombineOr(m_VariableMask, m_LowBitMask());
// M captures the mask hand of the 'and', X the other hand; the remaining
// icmp operand must be that same X. Both the 'and' and the icmp are matched
// commutatively.
if (!match(&I, m_c_ICmp(SrcPred,
m_c_And(m_CombineAnd(m_Mask, m_Value(M)), m_Value(X)),
m_Deferred(X))))
return nullptr;
ICmpInst::Predicate DstPred;
switch (SrcPred) {
case ICmpInst::Predicate::ICMP_EQ:
// x & (-1 >> y) == x -> x u<= (-1 >> y)
DstPred = ICmpInst::Predicate::ICMP_ULE;
break;
case ICmpInst::Predicate::ICMP_NE:
// x & (-1 >> y) != x -> x u> (-1 >> y)
DstPred = ICmpInst::Predicate::ICMP_UGT;
break;
case ICmpInst::Predicate::ICMP_UGT:
// x u> x & (-1 >> y) -> x u> (-1 >> y)
assert(X == I.getOperand(0) && "instsimplify took care of commut. variant");
DstPred = ICmpInst::Predicate::ICMP_UGT;
break;
case ICmpInst::Predicate::ICMP_UGE:
// x & (-1 >> y) u>= x -> x u<= (-1 >> y)
assert(X == I.getOperand(1) && "instsimplify took care of commut. variant");
DstPred = ICmpInst::Predicate::ICMP_ULE;
break;
case ICmpInst::Predicate::ICMP_ULT:
// x & (-1 >> y) u< x -> x u> (-1 >> y)
assert(X == I.getOperand(1) && "instsimplify took care of commut. variant");
DstPred = ICmpInst::Predicate::ICMP_UGT;
break;
case ICmpInst::Predicate::ICMP_ULE:
// x u<= x & (-1 >> y) -> x u<= (-1 >> y)
assert(X == I.getOperand(0) && "instsimplify took care of commut. variant");
DstPred = ICmpInst::Predicate::ICMP_ULE;
break;
// The signed predicates below are folded only when X sits on the stated side
// of the comparison and the mask M is a constant matching m_NonNegative.
case ICmpInst::Predicate::ICMP_SGT:
// x s> x & (-1 >> y) -> x s> (-1 >> y)
if (X != I.getOperand(0)) // X must be on LHS of comparison!
return nullptr; // Ignore the other case.
if (!match(M, m_Constant())) // Can not do this fold with non-constant.
return nullptr;
if (!match(M, m_NonNegative())) // Must not have any -1 vector elements.
return nullptr;
DstPred = ICmpInst::Predicate::ICMP_SGT;
break;
case ICmpInst::Predicate::ICMP_SGE:
// x & (-1 >> y) s>= x -> x s<= (-1 >> y)
if (X != I.getOperand(1)) // X must be on RHS of comparison!
return nullptr; // Ignore the other case.
if (!match(M, m_Constant())) // Can not do this fold with non-constant.
return nullptr;
if (!match(M, m_NonNegative())) // Must not have any -1 vector elements.
return nullptr;
DstPred = ICmpInst::Predicate::ICMP_SLE;
break;
case ICmpInst::Predicate::ICMP_SLT:
// x & (-1 >> y) s< x -> x s> (-1 >> y)
if (X != I.getOperand(1)) // X must be on RHS of comparison!
return nullptr; // Ignore the other case.
if (!match(M, m_Constant())) // Can not do this fold with non-constant.
return nullptr;
if (!match(M, m_NonNegative())) // Must not have any -1 vector elements.
return nullptr;
DstPred = ICmpInst::Predicate::ICMP_SGT;
break;
case ICmpInst::Predicate::ICMP_SLE:
// x s<= x & (-1 >> y) -> x s<= (-1 >> y)
if (X != I.getOperand(0)) // X must be on LHS of comparison!
return nullptr; // Ignore the other case.
if (!match(M, m_Constant())) // Can not do this fold with non-constant.
return nullptr;
if (!match(M, m_NonNegative())) // Must not have any -1 vector elements.
return nullptr;
DstPred = ICmpInst::Predicate::ICMP_SLE;
break;
default:
llvm_unreachable("All possible folds are handled.");
}
// Compare x directly against the mask using the translated predicate.
return Builder.CreateICmp(DstPred, X, M);
}
/// Recognise an equality test for a lossy signed truncation and rewrite it
/// as an add followed by an unsigned range check.
///
/// Folds (MaskedBits is a constant):
///   ((%x << MaskedBits) a>> MaskedBits) SrcPred %x
/// Into:
///   (add %x, (1 << (KeptBits-1))) DstPred (1 << KeptBits)
/// Where KeptBits = bitwidth(%x) - MaskedBits
///
/// \param I       the comparison to inspect.
/// \param Builder used to create the add and the new comparison.
/// \returns the value of the new comparison, or nullptr if the pattern does
///          not match.
static Value *
foldICmpWithTruncSignExtendedVal(ICmpInst &I,
                                 InstCombiner::BuilderTy &Builder) {
  ICmpInst::Predicate SrcPred;
  Value *X;
  const APInt *ShlAmt, *AShrAmt; // FIXME: non-splats, potentially with undef.

  // We are ok with 'shl' having multiple uses, but 'ashr' must be one-use.
  const bool Matched =
      match(&I, m_c_ICmp(SrcPred,
                         m_OneUse(m_AShr(m_Shl(m_Value(X), m_APInt(ShlAmt)),
                                         m_APInt(AShrAmt))),
                         m_Deferred(X)));
  if (!Matched)
    return nullptr;

  // Potential handling of non-splats: for each element:
  //  * if both are undef, replace with constant 0.
  //    Because (1<<0) is OK and is 1, and ((1<<0)>>1) is also OK and is 0.
  //  * if both are not undef, and are different, bailout.
  //  * else, only one is undef, then pick the non-undef one.

  // The shift amount must be equal.
  if (*ShlAmt != *AShrAmt)
    return nullptr;
  const APInt &MaskedBits = *ShlAmt;
  assert(MaskedBits != 0 && "shift by zero should be folded away already.");

  // Only eq/ne are translated; every other predicate is left alone.
  // FIXME: are more folds possible?
  ICmpInst::Predicate DstPred;
  if (SrcPred == ICmpInst::Predicate::ICMP_EQ) {
    // ((%x << MaskedBits) a>> MaskedBits) == %x
    //   =>
    // (add %x, (1 << (KeptBits-1))) u< (1 << KeptBits)
    DstPred = ICmpInst::Predicate::ICMP_ULT;
  } else if (SrcPred == ICmpInst::Predicate::ICMP_NE) {
    // ((%x << MaskedBits) a>> MaskedBits) != %x
    //   =>
    // (add %x, (1 << (KeptBits-1))) u>= (1 << KeptBits)
    DstPred = ICmpInst::Predicate::ICMP_UGE;
  } else {
    return nullptr;
  }

  Type *XType = X->getType();
  const unsigned XBitWidth = XType->getScalarSizeInBits();
  const APInt BitWidth(XBitWidth, XBitWidth);
  assert(BitWidth.ugt(MaskedBits) && "shifts should leave some bits untouched");

  // KeptBits = bitwidth(%x) - MaskedBits
  const APInt KeptBits = BitWidth - MaskedBits;
  assert(KeptBits.ugt(0) && KeptBits.ult(BitWidth) && "unreachable");

  // ICmpCst = (1 << KeptBits)
  const APInt ICmpCst = APInt(XBitWidth, 1).shl(KeptBits);
  assert(ICmpCst.isPowerOf2());

  // AddCst = (1 << (KeptBits-1))
  const APInt AddCst = ICmpCst.lshr(1);
  assert(AddCst.ult(ICmpCst) && AddCst.isPowerOf2());

  // Biased = add %x, AddCst
  Value *Biased = Builder.CreateAdd(X, ConstantInt::get(XType, AddCst));
  // Result = Biased DstPred ICmpCst
  return Builder.CreateICmp(DstPred, Biased, ConstantInt::get(XType, ICmpCst));
}
// Given pattern:
// icmp eq/ne (and ((x shift Q), (y oppositeshift K))), 0
// we should move shifts to the same hand of 'and', i.e. rewrite as
// icmp eq/ne (and (x shift (Q+K)), y), 0 iff (Q+K) u< bitwidth(x)
// We are only interested in opposite logical shifts here.
// If we can, we want to end up creating 'lshr' shift.
//
// Parameters:
//   I       - the comparison to inspect.
//   SQ      - simplification query; re-anchored on I when folding Q+K.
//   Builder - used to create the combined shift, the 'and' and the new icmp.
// Returns the value of the new comparison, or nullptr if the pattern does not
// match or the combined shift amount cannot be shown to be in range.
static Value *
foldShiftIntoShiftInAnotherHandOfAndInICmp(ICmpInst &I, const SimplifyQuery SQ,
InstCombiner::BuilderTy &Builder) {
// Only a single-use LHS compared for (in)equality against zero qualifies.
if (!I.isEquality() || !match(I.getOperand(1), m_Zero()) ||
!I.getOperand(0)->hasOneUse())
return nullptr;
auto m_AnyLogicalShift = m_LogicalShift(m_Value(), m_Value());
auto m_AnyLShr = m_LShr(m_Value(), m_Value());
// Look for an 'and' of two (opposite) logical shifts.
// Pick the single-use shift as XShift.
- Value *XShift, *YShift;
+ Instruction *XShift, *YShift;
if (!match(I.getOperand(0),
- m_c_And(m_OneUse(m_CombineAnd(m_AnyLogicalShift, m_Value(XShift))),
- m_CombineAnd(m_AnyLogicalShift, m_Value(YShift)))))
+ m_c_And(m_CombineAnd(m_AnyLogicalShift, m_Instruction(XShift)),
+ m_CombineAnd(m_AnyLogicalShift, m_Instruction(YShift)))))
return nullptr;
- // If YShift is a single-use 'lshr', swap the shifts around.
- if (match(YShift, m_OneUse(m_AnyLShr)))
+ // If YShift is a 'lshr', swap the shifts around.
+ if (match(YShift, m_AnyLShr))
std::swap(XShift, YShift);
// The shifts must be in opposite directions.
- Instruction::BinaryOps XShiftOpcode =
- cast<BinaryOperator>(XShift)->getOpcode();
- if (XShiftOpcode == cast<BinaryOperator>(YShift)->getOpcode())
+ auto XShiftOpcode = XShift->getOpcode();
+ if (XShiftOpcode == YShift->getOpcode())
return nullptr; // Do not care about same-direction shifts here.
Value *X, *XShAmt, *Y, *YShAmt;
match(XShift, m_BinOp(m_Value(X), m_Value(XShAmt)));
match(YShift, m_BinOp(m_Value(Y), m_Value(YShAmt)));
+
+ // If one of the values being shifted is a constant, then we will end with
+ // and+icmp, and shift instr will be constant-folded. If they are not,
+ // however, we will need to ensure that we won't increase instruction count.
+ if (!isa<Constant>(X) && !isa<Constant>(Y)) {
+ // At least one of the hands of the 'and' should be one-use shift.
+ if (!match(I.getOperand(0),
+ m_c_And(m_OneUse(m_AnyLogicalShift), m_Value())))
+ return nullptr;
+ }
// Can we fold (XShAmt+YShAmt) ?
// The sum must simplify to an existing value; otherwise give up.
Value *NewShAmt = SimplifyBinOp(Instruction::BinaryOps::Add, XShAmt, YShAmt,
SQ.getWithInstruction(&I));
if (!NewShAmt)
return nullptr;
// Is the new shift amount smaller than the bit width?
// FIXME: could also rely on ConstantRange.
unsigned BitWidth = X->getType()->getScalarSizeInBits();
if (!match(NewShAmt, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT,
APInt(BitWidth, BitWidth))))
return nullptr;
// All good, we can do this fold. The shift is the same that was for X.
Value *T0 = XShiftOpcode == Instruction::BinaryOps::LShr
? Builder.CreateLShr(X, NewShAmt)
: Builder.CreateShl(X, NewShAmt);
// Y is now used unshifted.
Value *T1 = Builder.CreateAnd(T0, Y);
return Builder.CreateICmp(I.getPredicate(), T1,
Constant::getNullValue(X->getType()));
}
/// Try to fold icmp (binop), X or icmp X, (binop).
/// TODO: A large part of this logic is duplicated in InstSimplify's
/// simplifyICmpWithBinOp(). We should be able to share that and avoid the code
/// duplication.
Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
// Special logic for binary operators.
BinaryOperator *BO0 = dyn_cast<BinaryOperator>(Op0);
BinaryOperator *BO1 = dyn_cast<BinaryOperator>(Op1);
if (!BO0 && !BO1)
return nullptr;
const CmpInst::Predicate Pred = I.getPredicate();
Value *X;
// Convert add-with-unsigned-overflow comparisons into a 'not' with compare.
// (Op1 + X) <u Op1 --> ~Op1 <u X
// Op0 >u (Op0 + X) --> X >u ~Op0
if (match(Op0, m_OneUse(m_c_Add(m_Specific(Op1), m_Value(X)))) &&
Pred == ICmpInst::ICMP_ULT)
return new ICmpInst(Pred, Builder.CreateNot(Op1), X);
if (match(Op1, m_OneUse(m_c_Add(m_Specific(Op0), m_Value(X)))) &&
Pred == ICmpInst::ICMP_UGT)
return new ICmpInst(Pred, X, Builder.CreateNot(Op0));
bool NoOp0WrapProblem = false, NoOp1WrapProblem = false;
if (BO0 && isa<OverflowingBinaryOperator>(BO0))
NoOp0WrapProblem =
ICmpInst::isEquality(Pred) ||
(CmpInst::isUnsigned(Pred) && BO0->hasNoUnsignedWrap()) ||
(CmpInst::isSigned(Pred) && BO0->hasNoSignedWrap());
if (BO1 && isa<OverflowingBinaryOperator>(BO1))
NoOp1WrapProblem =
ICmpInst::isEquality(Pred) ||
(CmpInst::isUnsigned(Pred) && BO1->hasNoUnsignedWrap()) ||
(CmpInst::isSigned(Pred) && BO1->hasNoSignedWrap());
// Analyze the case when either Op0 or Op1 is an add instruction.
// Op0 = A + B (or A and B are null); Op1 = C + D (or C and D are null).
Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr;
if (BO0 && BO0->getOpcode() == Instruction::Add) {
A = BO0->getOperand(0);
B = BO0->getOperand(1);
}
if (BO1 && BO1->getOpcode() == Instruction::Add) {
C = BO1->getOperand(0);
D = BO1->getOperand(1);
}
// icmp (X+Y), X -> icmp Y, 0 for equalities or if there is no overflow.
if ((A == Op1 || B == Op1) && NoOp0WrapProblem)
return new ICmpInst(Pred, A == Op1 ? B : A,
Constant::getNullValue(Op1->getType()));
// icmp X, (X+Y) -> icmp 0, Y for equalities or if there is no overflow.
if ((C == Op0 || D == Op0) && NoOp1WrapProblem)
return new ICmpInst(Pred, Constant::getNullValue(Op0->getType()),
C == Op0 ? D : C);
// icmp (X+Y), (X+Z) -> icmp Y, Z for equalities or if there is no overflow.
if (A && C && (A == C || A == D || B == C || B == D) && NoOp0WrapProblem &&
NoOp1WrapProblem &&
// Try not to increase register pressure.
BO0->hasOneUse() && BO1->hasOneUse()) {
// Determine Y and Z in the form icmp (X+Y), (X+Z).
Value *Y, *Z;
if (A == C) {
// C + B == C + D -> B == D
Y = B;
Z = D;
} else if (A == D) {
// D + B == C + D -> B == C
Y = B;
Z = C;
} else if (B == C) {
// A + C == C + D -> A == D
Y = A;
Z = D;
} else {
assert(B == D);
// A + D == C + D -> A == C
Y = A;
Z = C;
}
return new ICmpInst(Pred, Y, Z);
}
// icmp slt (X + -1), Y -> icmp sle X, Y
if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SLT &&
match(B, m_AllOnes()))
return new ICmpInst(CmpInst::ICMP_SLE, A, Op1);
// icmp sge (X + -1), Y -> icmp sgt X, Y
if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SGE &&
match(B, m_AllOnes()))
return new ICmpInst(CmpInst::ICMP_SGT, A, Op1);
// icmp sle (X + 1), Y -> icmp slt X, Y
if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SLE && match(B, m_One()))
return new ICmpInst(CmpInst::ICMP_SLT, A, Op1);
// icmp sgt (X + 1), Y -> icmp sge X, Y
if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SGT && match(B, m_One()))
return new ICmpInst(CmpInst::ICMP_SGE, A, Op1);
// icmp sgt X, (Y + -1) -> icmp sge X, Y
if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGT &&
match(D, m_AllOnes()))
return new ICmpInst(CmpInst::ICMP_SGE, Op0, C);
// icmp sle X, (Y + -1) -> icmp slt X, Y
if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLE &&
match(D, m_AllOnes()))
return new ICmpInst(CmpInst::ICMP_SLT, Op0, C);
// icmp sge X, (Y + 1) -> icmp sgt X, Y
if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGE && match(D, m_One()))
return new ICmpInst(CmpInst::ICMP_SGT, Op0, C);
// icmp slt X, (Y + 1) -> icmp sle X, Y
if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLT && match(D, m_One()))
return new ICmpInst(CmpInst::ICMP_SLE, Op0, C);
// TODO: The subtraction-related identities shown below also hold, but
// canonicalization from (X -nuw 1) to (X + -1) means that the combinations
// wouldn't happen even if they were implemented.
//
// icmp ult (X - 1), Y -> icmp ule X, Y
// icmp uge (X - 1), Y -> icmp ugt X, Y
// icmp ugt X, (Y - 1) -> icmp uge X, Y
// icmp ule X, (Y - 1) -> icmp ult X, Y
// icmp ule (X + 1), Y -> icmp ult X, Y
if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_ULE && match(B, m_One()))
return new ICmpInst(CmpInst::ICMP_ULT, A, Op1);
// icmp ugt (X + 1), Y -> icmp uge X, Y
if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_UGT && match(B, m_One()))
return new ICmpInst(CmpInst::ICMP_UGE, A, Op1);
// icmp uge X, (Y + 1) -> icmp ugt X, Y
if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_UGE && match(D, m_One()))
return new ICmpInst(CmpInst::ICMP_UGT, Op0, C);
// icmp ult X, (Y + 1) -> icmp ule X, Y
if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_ULT && match(D, m_One()))
return new ICmpInst(CmpInst::ICMP_ULE, Op0, C);
// if C1 has greater magnitude than C2:
// icmp (X + C1), (Y + C2) -> icmp (X + C3), Y
// s.t. C3 = C1 - C2
//
// if C2 has greater magnitude than C1:
// icmp (X + C1), (Y + C2) -> icmp X, (Y + C3)
// s.t. C3 = C2 - C1
if (A && C && NoOp0WrapProblem && NoOp1WrapProblem &&
(BO0->hasOneUse() || BO1->hasOneUse()) && !I.isUnsigned())
if (ConstantInt *C1 = dyn_cast<ConstantInt>(B))
if (ConstantInt *C2 = dyn_cast<ConstantInt>(D)) {
const APInt &AP1 = C1->getValue();
const APInt &AP2 = C2->getValue();
if (AP1.isNegative() == AP2.isNegative()) {
APInt AP1Abs = C1->getValue().abs();
APInt AP2Abs = C2->getValue().abs();
if (AP1Abs.uge(AP2Abs)) {
ConstantInt *C3 = Builder.getInt(AP1 - AP2);
Value *NewAdd = Builder.CreateNSWAdd(A, C3);
return new ICmpInst(Pred, NewAdd, C);
} else {
ConstantInt *C3 = Builder.getInt(AP2 - AP1);
Value *NewAdd = Builder.CreateNSWAdd(C, C3);
return new ICmpInst(Pred, A, NewAdd);
}
}
}
// Analyze the case when either Op0 or Op1 is a sub instruction.
// Op0 = A - B (or A and B are null); Op1 = C - D (or C and D are null).
A = nullptr;
B = nullptr;
C = nullptr;
D = nullptr;
if (BO0 && BO0->getOpcode() == Instruction::Sub) {
A = BO0->getOperand(0);
B = BO0->getOperand(1);
}
if (BO1 && BO1->getOpcode() == Instruction::Sub) {
C = BO1->getOperand(0);
D = BO1->getOperand(1);
}
// icmp (X-Y), X -> icmp 0, Y for equalities or if there is no overflow.
if (A == Op1 && NoOp0WrapProblem)
return new ICmpInst(Pred, Constant::getNullValue(Op1->getType()), B);
// icmp X, (X-Y) -> icmp Y, 0 for equalities or if there is no overflow.
if (C == Op0 && NoOp1WrapProblem)
return new ICmpInst(Pred, D, Constant::getNullValue(Op0->getType()));
// (A - B) >u A --> A <u B
if (A == Op1 && Pred == ICmpInst::ICMP_UGT)
return new ICmpInst(ICmpInst::ICMP_ULT, A, B);
// C <u (C - D) --> C <u D
if (C == Op0 && Pred == ICmpInst::ICMP_ULT)
return new ICmpInst(ICmpInst::ICMP_ULT, C, D);
// icmp (Y-X), (Z-X) -> icmp Y, Z for equalities or if there is no overflow.
if (B && D && B == D && NoOp0WrapProblem && NoOp1WrapProblem &&
// Try not to increase register pressure.
BO0->hasOneUse() && BO1->hasOneUse())
return new ICmpInst(Pred, A, C);
// icmp (X-Y), (X-Z) -> icmp Z, Y for equalities or if there is no overflow.
if (A && C && A == C && NoOp0WrapProblem && NoOp1WrapProblem &&
// Try not to increase register pressure.
BO0->hasOneUse() && BO1->hasOneUse())
return new ICmpInst(Pred, D, B);
// icmp (0-X) < cst --> x > -cst
if (NoOp0WrapProblem && ICmpInst::isSigned(Pred)) {
Value *X;
if (match(BO0, m_Neg(m_Value(X))))
if (Constant *RHSC = dyn_cast<Constant>(Op1))
if (RHSC->isNotMinSignedValue())
return new ICmpInst(I.getSwappedPredicate(), X,
ConstantExpr::getNeg(RHSC));
}
BinaryOperator *SRem = nullptr;
// icmp (srem X, Y), Y
if (BO0 && BO0->getOpcode() == Instruction::SRem && Op1 == BO0->getOperand(1))
SRem = BO0;
// icmp Y, (srem X, Y)
else if (BO1 && BO1->getOpcode() == Instruction::SRem &&
Op0 == BO1->getOperand(1))
SRem = BO1;
if (SRem) {
// We don't check hasOneUse to avoid increasing register pressure because
// the value we use is the same value this instruction was already using.
switch (SRem == BO0 ? ICmpInst::getSwappedPredicate(Pred) : Pred) {
default:
break;
case ICmpInst::ICMP_EQ:
return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
case ICmpInst::ICMP_NE:
return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
case ICmpInst::ICMP_SGT:
case ICmpInst::ICMP_SGE:
return new ICmpInst(ICmpInst::ICMP_SGT, SRem->getOperand(1),
Constant::getAllOnesValue(SRem->getType()));
case ICmpInst::ICMP_SLT:
case ICmpInst::ICMP_SLE:
return new ICmpInst(ICmpInst::ICMP_SLT, SRem->getOperand(1),
Constant::getNullValue(SRem->getType()));
}
}
if (BO0 && BO1 && BO0->getOpcode() == BO1->getOpcode() && BO0->hasOneUse() &&
BO1->hasOneUse() && BO0->getOperand(1) == BO1->getOperand(1)) {
switch (BO0->getOpcode()) {
default:
break;
case Instruction::Add:
case Instruction::Sub:
case Instruction::Xor: {
if (I.isEquality()) // a+x icmp eq/ne b+x --> a icmp b
return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
const APInt *C;
if (match(BO0->getOperand(1), m_APInt(C))) {
// icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b
if (C->isSignMask()) {
ICmpInst::Predicate NewPred =
I.isSigned() ? I.getUnsignedPredicate() : I.getSignedPredicate();
return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0));
}
// icmp u/s (a ^ maxsignval), (b ^ maxsignval) --> icmp s/u' a, b
if (BO0->getOpcode() == Instruction::Xor && C->isMaxSignedValue()) {
ICmpInst::Predicate NewPred =
I.isSigned() ? I.getUnsignedPredicate() : I.getSignedPredicate();
NewPred = I.getSwappedPredicate(NewPred);
return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0));
}
}
break;
}
case Instruction::Mul: {
if (!I.isEquality())
break;
const APInt *C;
if (match(BO0->getOperand(1), m_APInt(C)) && !C->isNullValue() &&
!C->isOneValue()) {
// icmp eq/ne (X * C), (Y * C) --> icmp (X & Mask), (Y & Mask)
// Mask = -1 >> count-trailing-zeros(C).
if (unsigned TZs = C->countTrailingZeros()) {
Constant *Mask = ConstantInt::get(
BO0->getType(),
APInt::getLowBitsSet(C->getBitWidth(), C->getBitWidth() - TZs));
Value *And1 = Builder.CreateAnd(BO0->getOperand(0), Mask);
Value *And2 = Builder.CreateAnd(BO1->getOperand(0), Mask);
return new ICmpInst(Pred, And1, And2);
}
// If there are no trailing zeros in the multiplier, just eliminate
// the multiplies (no masking is needed):
// icmp eq/ne (X * C), (Y * C) --> icmp eq/ne X, Y
return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
}
break;
}
case Instruction::UDiv:
case Instruction::LShr:
if (I.isSigned() || !BO0->isExact() || !BO1->isExact())
break;
return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
case Instruction::SDiv:
if (!I.isEquality() || !BO0->isExact() || !BO1->isExact())
break;
return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
case Instruction::AShr:
if (!BO0->isExact() || !BO1->isExact())
break;
return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
case Instruction::Shl: {
bool NUW = BO0->hasNoUnsignedWrap() && BO1->hasNoUnsignedWrap();
bool NSW = BO0->hasNoSignedWrap() && BO1->hasNoSignedWrap();
if (!NUW && !NSW)
break;
if (!NSW && I.isSigned())
break;
return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
}
}
}
if (BO0) {
// Transform A & (L - 1) `ult` L --> L != 0
auto LSubOne = m_Add(m_Specific(Op1), m_AllOnes());
auto BitwiseAnd = m_c_And(m_Value(), LSubOne);
if (match(BO0, BitwiseAnd) && Pred == ICmpInst::ICMP_ULT) {
auto *Zero = Constant::getNullValue(BO0->getType());
return new ICmpInst(ICmpInst::ICMP_NE, Op1, Zero);
}
}
if (Value *V = foldICmpWithLowBitMaskedVal(I, Builder))
return replaceInstUsesWith(I, V);
if (Value *V = foldICmpWithTruncSignExtendedVal(I, Builder))
return replaceInstUsesWith(I, V);
if (Value *V = foldShiftIntoShiftInAnotherHandOfAndInICmp(I, SQ, Builder))
return replaceInstUsesWith(I, V);
return nullptr;
}
/// Fold a compare of a min/max against one of the min/max's own operands:
///   icmp Pred min|max(X, Y), X
///
/// The min/max may be either operand of \p Cmp. Returns a new compare of X and
/// Y, or nullptr when no min/max pattern is present or the predicate is not
/// one of those folded here.
static Instruction *foldICmpWithMinMax(ICmpInst &Cmp) {
  Value *Op0 = Cmp.getOperand(0);
  Value *X = Cmp.getOperand(1);
  ICmpInst::Predicate Pred = Cmp.getPredicate();

  // When the min/max is the RHS operand, commute the compare so that the
  // min/max ends up in Op0 and the shared operand in X.
  const bool MinMaxIsRHS = match(X, m_c_SMin(m_Specific(Op0), m_Value())) ||
                           match(X, m_c_SMax(m_Specific(Op0), m_Value())) ||
                           match(X, m_c_UMin(m_Specific(Op0), m_Value())) ||
                           match(X, m_c_UMax(m_Specific(Op0), m_Value()));
  if (MinMaxIsRHS) {
    Pred = Cmp.getSwappedPredicate();
    std::swap(Op0, X);
  }

  Value *Y;

  if (match(Op0, m_c_SMin(m_Specific(X), m_Value(Y)))) {
    switch (Pred) {
    case CmpInst::ICMP_EQ:  // smin(X, Y) == X  --> X s<= Y
    case CmpInst::ICMP_SGE: // smin(X, Y) s>= X --> X s<= Y
      return new ICmpInst(ICmpInst::ICMP_SLE, X, Y);
    case CmpInst::ICMP_NE:  // smin(X, Y) != X  --> X s> Y
    case CmpInst::ICMP_SLT: // smin(X, Y) s< X  --> X s> Y
      return new ICmpInst(ICmpInst::ICMP_SGT, X, Y);
    default:
      // These cases should be handled in InstSimplify:
      //   smin(X, Y) s<= X --> true
      //   smin(X, Y) s> X  --> false
      return nullptr;
    }
  }

  if (match(Op0, m_c_SMax(m_Specific(X), m_Value(Y)))) {
    switch (Pred) {
    case CmpInst::ICMP_EQ:  // smax(X, Y) == X  --> X s>= Y
    case CmpInst::ICMP_SLE: // smax(X, Y) s<= X --> X s>= Y
      return new ICmpInst(ICmpInst::ICMP_SGE, X, Y);
    case CmpInst::ICMP_NE:  // smax(X, Y) != X  --> X s< Y
    case CmpInst::ICMP_SGT: // smax(X, Y) s> X  --> X s< Y
      return new ICmpInst(ICmpInst::ICMP_SLT, X, Y);
    default:
      // These cases should be handled in InstSimplify:
      //   smax(X, Y) s>= X --> true
      //   smax(X, Y) s< X  --> false
      return nullptr;
    }
  }

  if (match(Op0, m_c_UMin(m_Specific(X), m_Value(Y)))) {
    switch (Pred) {
    case CmpInst::ICMP_EQ:  // umin(X, Y) == X  --> X u<= Y
    case CmpInst::ICMP_UGE: // umin(X, Y) u>= X --> X u<= Y
      return new ICmpInst(ICmpInst::ICMP_ULE, X, Y);
    case CmpInst::ICMP_NE:  // umin(X, Y) != X  --> X u> Y
    case CmpInst::ICMP_ULT: // umin(X, Y) u< X  --> X u> Y
      return new ICmpInst(ICmpInst::ICMP_UGT, X, Y);
    default:
      // These cases should be handled in InstSimplify:
      //   umin(X, Y) u<= X --> true
      //   umin(X, Y) u> X  --> false
      return nullptr;
    }
  }

  if (match(Op0, m_c_UMax(m_Specific(X), m_Value(Y)))) {
    switch (Pred) {
    case CmpInst::ICMP_EQ:  // umax(X, Y) == X  --> X u>= Y
    case CmpInst::ICMP_ULE: // umax(X, Y) u<= X --> X u>= Y
      return new ICmpInst(ICmpInst::ICMP_UGE, X, Y);
    case CmpInst::ICMP_NE:  // umax(X, Y) != X  --> X u< Y
    case CmpInst::ICMP_UGT: // umax(X, Y) u> X  --> X u< Y
      return new ICmpInst(ICmpInst::ICMP_ULT, X, Y);
    default:
      // These cases should be handled in InstSimplify:
      //   umax(X, Y) u>= X --> true
      //   umax(X, Y) u< X  --> false
      return nullptr;
    }
  }

  return nullptr;
}
/// Try to fold an equality comparison (icmp eq/ne) by matching the shapes of
/// its two operands: xor, and, zext vs. low-bit mask, equal-amount shifts,
/// trunc of lshr, bswap/bitreverse pairs and power-of-2-or-zero idioms.
///
/// \param I The compare to fold.
/// \returns A newly created compare to replace \p I, \p I itself when it was
/// rewritten in place (the "(X&Z) == (Y&Z)" case), or nullptr when \p I is not
/// an equality compare or no pattern matched.
Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) {
if (!I.isEquality())
return nullptr;
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
const CmpInst::Predicate Pred = I.getPredicate();
// Scratch bindings filled in by the match() calls below. A failed match may
// leave them partially written, so each is only read after a successful match
// (or after being explicitly reset).
Value *A, *B, *C, *D;
if (match(Op0, m_Xor(m_Value(A), m_Value(B)))) {
if (A == Op1 || B == Op1) { // (A^B) == A -> B == 0
Value *OtherVal = A == Op1 ? B : A;
return new ICmpInst(Pred, OtherVal, Constant::getNullValue(A->getType()));
}
// Both operands are xors.
if (match(Op1, m_Xor(m_Value(C), m_Value(D)))) {
// A^c1 == C^c2 --> A == C^(c1^c2)
// Requires the RHS xor to have a single use.
ConstantInt *C1, *C2;
if (match(B, m_ConstantInt(C1)) && match(D, m_ConstantInt(C2)) &&
Op1->hasOneUse()) {
Constant *NC = Builder.getInt(C1->getValue() ^ C2->getValue());
Value *Xor = Builder.CreateXor(C, NC);
return new ICmpInst(Pred, A, Xor);
}
// A^B == A^D -> B == D
// All four pairings of a shared operand are checked; the shared operand
// cancels and the remaining two are compared.
if (A == C)
return new ICmpInst(Pred, B, D);
if (A == D)
return new ICmpInst(Pred, B, C);
if (B == C)
return new ICmpInst(Pred, A, D);
if (B == D)
return new ICmpInst(Pred, A, C);
}
}
if (match(Op1, m_Xor(m_Value(A), m_Value(B))) && (A == Op0 || B == Op0)) {
// A == (A^B) -> B == 0
Value *OtherVal = A == Op0 ? B : A;
return new ICmpInst(Pred, OtherVal, Constant::getNullValue(A->getType()));
}
// (X&Z) == (Y&Z) -> (X^Y) & Z == 0
// Both 'and' instructions must have a single use.
if (match(Op0, m_OneUse(m_And(m_Value(A), m_Value(B)))) &&
match(Op1, m_OneUse(m_And(m_Value(C), m_Value(D))))) {
// Find the operand shared by both ands (Z); X and Y are the other two.
// X stays null when the ands have no operand in common.
Value *X = nullptr, *Y = nullptr, *Z = nullptr;
if (A == C) {
X = B;
Y = D;
Z = A;
} else if (A == D) {
X = B;
Y = C;
Z = A;
} else if (B == C) {
X = A;
Y = D;
Z = B;
} else if (B == D) {
X = A;
Y = C;
Z = B;
}
if (X) { // Build (X^Y) & Z
Op1 = Builder.CreateXor(X, Y);
Op1 = Builder.CreateAnd(Op1, Z);
// Rewrite I in place: compare the masked xor against zero and hand I back
// to the caller instead of creating a new compare.
I.setOperand(0, Op1);
I.setOperand(1, Constant::getNullValue(Op1->getType()));
return &I;
}
}
// Transform (zext A) == (B & (1<<X)-1) --> A == (trunc B)
// and (B & (1<<X)-1) == (zext A) --> A == (trunc B)
// In both forms the zext operand must have a single use.
ConstantInt *Cst1;
if ((Op0->hasOneUse() && match(Op0, m_ZExt(m_Value(A))) &&
match(Op1, m_And(m_Value(B), m_ConstantInt(Cst1)))) ||
(Op1->hasOneUse() && match(Op0, m_And(m_Value(B), m_ConstantInt(Cst1))) &&
match(Op1, m_ZExt(m_Value(A))))) {
// The mask plus one must be a power of two whose log2 equals the bit width
// of A's (scalar integer) type, i.e. the mask keeps exactly the bits that a
// truncation to A's type would keep.
APInt Pow2 = Cst1->getValue() + 1;
if (Pow2.isPowerOf2() && isa<IntegerType>(A->getType()) &&
Pow2.logBase2() == cast<IntegerType>(A->getType())->getBitWidth())
return new ICmpInst(Pred, A, Builder.CreateTrunc(B, A->getType()));
}
// (A >> C) == (B >> C) --> (A^B) u< (1 << C)
// For lshr and ashr pairs.
// Both shifts must be of the same kind, shift by the same constant and have
// a single use.
if ((match(Op0, m_OneUse(m_LShr(m_Value(A), m_ConstantInt(Cst1)))) &&
match(Op1, m_OneUse(m_LShr(m_Value(B), m_Specific(Cst1))))) ||
(match(Op0, m_OneUse(m_AShr(m_Value(A), m_ConstantInt(Cst1)))) &&
match(Op1, m_OneUse(m_AShr(m_Value(B), m_Specific(Cst1)))))) {
unsigned TypeBits = Cst1->getBitWidth();
// getLimitedValue caps the result at TypeBits, so an out-of-range shift
// amount fails the 'ShAmt < TypeBits' check below.
unsigned ShAmt = (unsigned)Cst1->getLimitedValue(TypeBits);
if (ShAmt < TypeBits && ShAmt != 0) {
// eq --> (A^B) u< (1 << C); ne --> (A^B) u>= (1 << C)
ICmpInst::Predicate NewPred =
Pred == ICmpInst::ICMP_NE ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT;
Value *Xor = Builder.CreateXor(A, B, I.getName() + ".unshifted");
APInt CmpVal = APInt::getOneBitSet(TypeBits, ShAmt);
return new ICmpInst(NewPred, Xor, Builder.getInt(CmpVal));
}
}
// (A << C) == (B << C) --> ((A^B) & (~0U >> C)) == 0
if (match(Op0, m_OneUse(m_Shl(m_Value(A), m_ConstantInt(Cst1)))) &&
match(Op1, m_OneUse(m_Shl(m_Value(B), m_Specific(Cst1))))) {
unsigned TypeBits = Cst1->getBitWidth();
unsigned ShAmt = (unsigned)Cst1->getLimitedValue(TypeBits);
if (ShAmt < TypeBits && ShAmt != 0) {
Value *Xor = Builder.CreateXor(A, B, I.getName() + ".unshifted");
// Keep only the low (TypeBits - ShAmt) bits: the bits that survive the
// left shift.
APInt AndVal = APInt::getLowBitsSet(TypeBits, TypeBits - ShAmt);
Value *And = Builder.CreateAnd(Xor, Builder.getInt(AndVal),
I.getName() + ".mask");
return new ICmpInst(Pred, And, Constant::getNullValue(Cst1->getType()));
}
}
// Transform "icmp eq (trunc (lshr(X, cst1)), cst" to
// "icmp (and X, mask), cst"
uint64_t ShAmt = 0;
if (Op0->hasOneUse() &&
match(Op0, m_Trunc(m_OneUse(m_LShr(m_Value(A), m_ConstantInt(ShAmt))))) &&
match(Op1, m_ConstantInt(Cst1)) &&
// Only do this when A has multiple uses. This is most important to do
// when it exposes other optimizations.
!A->hasOneUse()) {
unsigned ASize = cast<IntegerType>(A->getType())->getPrimitiveSizeInBits();
if (ShAmt < ASize) {
// The mask covers as many bits as the truncated type has, moved up to
// the position they occupy in A before the shift.
APInt MaskV =
APInt::getLowBitsSet(ASize, Op0->getType()->getPrimitiveSizeInBits());
MaskV <<= ShAmt;
// Widen the compared constant to A's width and move it to the same
// position.
APInt CmpV = Cst1->getValue().zext(ASize);
CmpV <<= ShAmt;
Value *Mask = Builder.CreateAnd(A, Builder.getInt(MaskV));
return new ICmpInst(Pred, Mask, Builder.getInt(CmpV));
}
}
// If both operands are byte-swapped or bit-reversed, just compare the
// original values.
// TODO: Move this to a function similar to foldICmpIntrinsicWithConstant()
// and handle more intrinsics.
if ((match(Op0, m_BSwap(m_Value(A))) && match(Op1, m_BSwap(m_Value(B)))) ||
(match(Op0, m_BitReverse(m_Value(A))) &&
match(Op1, m_BitReverse(m_Value(B)))))
return new ICmpInst(Pred, A, B);
// Canonicalize checking for a power-of-2-or-zero value:
// (A & (A-1)) == 0 --> ctpop(A) < 2 (two commuted variants)
// ((A-1) & A) != 0 --> ctpop(A) > 1 (two commuted variants)
// From here on A doubles as the "pattern found" flag: it is reset to null
// unless this first form matched completely.
if (!match(Op0, m_OneUse(m_c_And(m_Add(m_Value(A), m_AllOnes()),
m_Deferred(A)))) ||
!match(Op1, m_ZeroInt()))
A = nullptr;
// (A & -A) == A --> ctpop(A) < 2 (four commuted variants)
// (-A & A) != A --> ctpop(A) > 1 (four commuted variants)
if (match(Op0, m_OneUse(m_c_And(m_Neg(m_Specific(Op1)), m_Specific(Op1)))))
A = Op1;
else if (match(Op1,
m_OneUse(m_c_And(m_Neg(m_Specific(Op0)), m_Specific(Op0)))))
A = Op0;
if (A) {
// eq --> ctpop(A) u< 2; ne --> ctpop(A) u> 1
Type *Ty = A->getType();
CallInst *CtPop = Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, A);
return Pred == ICmpInst::ICMP_EQ
? new ICmpInst(ICmpInst::ICMP_ULT, CtPop, ConstantInt::get(Ty, 2))
: new ICmpInst(ICmpInst::ICMP_UGT, CtPop, ConstantInt::get(Ty, 1));
}
return nullptr;
}
/// Fold icmp (cast x to y), (cast/cst).
///
/// Apart from a ptrtoint special case, only extending casts (zext/sext) on the
/// LHS are handled so far. The LHS operand of \p ICmp must be a cast
/// instruction. Returns the replacement instruction or nullptr.
Instruction *InstCombiner::foldICmpWithCastAndCast(ICmpInst &ICmp) {
  const CastInst *LHSCI = cast<CastInst>(ICmp.getOperand(0));
  Value *LHSCIOp = LHSCI->getOperand(0);
  Type *SrcTy = LHSCIOp->getType();
  Type *DestTy = LHSCI->getType();
  Value *RHS = ICmp.getOperand(1);

  // True when the integer type has exactly as many bits as the pointer type
  // (compared element-wise for vectors).
  auto IntMatchesPointerWidth = [&](Type *PtrTy, Type *IntTy) -> bool {
    if (auto *PtrVecTy = dyn_cast<VectorType>(PtrTy)) {
      PtrTy = PtrVecTy->getElementType();
      IntTy = cast<VectorType>(IntTy)->getElementType();
    }
    return DL.getPointerTypeSizeInBits(PtrTy) == IntTy->getIntegerBitWidth();
  };

  // Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the
  // integer type is the same size as the pointer type.
  if (LHSCI->getOpcode() == Instruction::PtrToInt &&
      IntMatchesPointerWidth(SrcTy, DestTy)) {
    Value *NewRHS = nullptr;
    if (auto *RHSP2I = dyn_cast<PtrToIntOperator>(RHS)) {
      Value *RHSPtr = RHSP2I->getOperand(0);
      // Only pointers in the same address space can be compared directly.
      if (RHSPtr->getType()->getPointerAddressSpace() ==
          SrcTy->getPointerAddressSpace()) {
        // If the pointer types don't match, insert a bitcast.
        NewRHS = RHSPtr->getType() == SrcTy
                     ? RHSPtr
                     : Builder.CreateBitCast(RHSPtr, SrcTy);
      }
    } else if (auto *RHSC = dyn_cast<Constant>(RHS)) {
      NewRHS = ConstantExpr::getIntToPtr(RHSC, SrcTy);
    }

    if (NewRHS)
      return new ICmpInst(ICmp.getPredicate(), LHSCIOp, NewRHS);
  }

  // Everything below is limited to extension casts; enforce that.
  const bool IsZExt = LHSCI->getOpcode() == Instruction::ZExt;
  const bool IsSExt = LHSCI->getOpcode() == Instruction::SExt;
  if (!IsZExt && !IsSExt)
    return nullptr;

  const bool IsSignedCmp = ICmp.isSigned();

  // Predicate to use once the extension has been stripped: equality compares,
  // and signed compares of sign-extended values, keep their predicate; the
  // other three combinations all become an unsigned comparison.
  auto NarrowedPredicate = [&]() -> ICmpInst::Predicate {
    if (ICmp.isEquality() || (IsSignedCmp && IsSExt))
      return ICmp.getPredicate();
    return ICmp.getUnsignedPredicate();
  };

  if (auto *RHSCI = dyn_cast<CastInst>(RHS)) {
    Value *RHSCIOp = RHSCI->getOperand(0);
    // Both casts must extend from the same type, and with the same
    // signedness (a sext paired with a zext cannot be handled).
    if (RHSCIOp->getType() != LHSCIOp->getType() ||
        RHSCI->getOpcode() != LHSCI->getOpcode())
      return nullptr;

    return new ICmpInst(NarrowedPredicate(), LHSCIOp, RHSCIOp);
  }

  // If we aren't dealing with a constant on the RHS, exit early.
  auto *C = dyn_cast<Constant>(RHS);
  if (!C)
    return nullptr;

  // Round-trip the constant: truncate it to SrcTy, then extend it back to
  // DestTy with the same kind of cast as the LHS.
  Constant *Truncated = ConstantExpr::getTrunc(C, SrcTy);
  Constant *RoundTripped =
      ConstantExpr::getCast(LHSCI->getOpcode(), Truncated, DestTy);

  // The constant survived the round trip, so it is representable in the
  // narrow type and the compare can be done there.
  if (RoundTripped == C)
    return new ICmpInst(NarrowedPredicate(), LHSCIOp, Truncated);

  // The re-extended constant changed, partly changed (in the case of a vector),
  // or could not be determined to be equal (in the case of a constant
  // expression), so the constant cannot be represented in the shorter type.
  // Consequently, we cannot emit a simple comparison.
  // All the cases that fold to true or false will have already been handled
  // by SimplifyICmpInst, so only deal with the tricky case.
  if (IsSignedCmp || !IsSExt || !isa<ConstantInt>(C))
    return nullptr;

  // Evaluate the comparison for LT (we invert for GT below). LE and GE cases
  // should have been folded away previously and not enter in here.

  // We're performing an unsigned comp with a sign extended value.
  // This is true if the input is >= 0. [aka >s -1]
  Constant *AllOnes = Constant::getAllOnesValue(SrcTy);
  Value *IsNonNegative =
      Builder.CreateICmpSGT(LHSCIOp, AllOnes, ICmp.getName());

  // Finally, return the value computed.
  if (ICmp.getPredicate() == ICmpInst::ICMP_ULT)
    return replaceInstUsesWith(ICmp, IsNonNegative);

  assert(ICmp.getPredicate() == ICmpInst::ICMP_UGT && "ICmp should be folded!");
  return BinaryOperator::CreateNot(IsNonNegative);
}
/// Return true if \p RHS is the value that leaves the left operand of
/// \p BinaryOp unchanged: zero for add/sub, one for mul. Any other opcode is
/// unsupported.
static bool isNeutralValue(Instruction::BinaryOps BinaryOp, Value *RHS) {
  const bool IsAddOrSub =
      BinaryOp == Instruction::Add || BinaryOp == Instruction::Sub;
  if (IsAddOrSub)
    return match(RHS, m_Zero());
  if (BinaryOp == Instruction::Mul)
    return match(RHS, m_One());
  llvm_unreachable("Unsupported binary op");
}
/// Dispatch to the signed or unsigned overflow query that corresponds to
/// \p BinaryOp (add, sub or mul) applied to \p LHS and \p RHS, evaluated in
/// the context of \p CxtI. Any other opcode is unsupported.
OverflowResult InstCombiner::computeOverflow(
    Instruction::BinaryOps BinaryOp, bool IsSigned,
    Value *LHS, Value *RHS, Instruction *CxtI) const {
  switch (BinaryOp) {
  case Instruction::Add:
    return IsSigned ? computeOverflowForSignedAdd(LHS, RHS, CxtI)
                    : computeOverflowForUnsignedAdd(LHS, RHS, CxtI);
  case Instruction::Sub:
    return IsSigned ? computeOverflowForSignedSub(LHS, RHS, CxtI)
                    : computeOverflowForUnsignedSub(LHS, RHS, CxtI);
  case Instruction::Mul:
    return IsSigned ? computeOverflowForSignedMul(LHS, RHS, CxtI)
                    : computeOverflowForUnsignedMul(LHS, RHS, CxtI);
  default:
    llvm_unreachable("Unsupported binary op");
  }
}
/// Try to resolve an overflow check on "LHS BinaryOp RHS" statically.
///
/// On success, \p Result receives the value of the operation and \p Overflow
/// the constant overflow bit, and true is returned. Returns false, leaving
/// both outputs untouched, when the operation may or may not overflow.
bool InstCombiner::OptimizeOverflowCheck(
    Instruction::BinaryOps BinaryOp, bool IsSigned, Value *LHS, Value *RHS,
    Instruction &OrigI, Value *&Result, Constant *&Overflow) {
  // For a commutative operation, move a lone constant operand to the right.
  const bool OnlyLHSIsConstant = isa<Constant>(LHS) && !isa<Constant>(RHS);
  if (OnlyLHSIsConstant && OrigI.isCommutative())
    std::swap(LHS, RHS);

  // If the overflow check was an add followed by a compare, the insertion point
  // may be pointing to the compare. We want to insert the new instructions
  // before the add in case there are uses of the add between the add and the
  // compare.
  Builder.SetInsertPoint(&OrigI);

  // Operating with the neutral element yields LHS itself and never overflows.
  if (isNeutralValue(BinaryOp, RHS)) {
    Overflow = Builder.getFalse();
    Result = LHS;
    return true;
  }

  // Emit the plain binary operation under OrigI's name and record the known
  // overflow bit.
  auto EmitPlainBinOp = [&](Constant *OverflowBit) {
    Result = Builder.CreateBinOp(BinaryOp, LHS, RHS);
    Result->takeName(&OrigI);
    Overflow = OverflowBit;
  };

  switch (computeOverflow(BinaryOp, IsSigned, LHS, RHS, &OrigI)) {
  case OverflowResult::MayOverflow:
    return false;

  case OverflowResult::AlwaysOverflowsLow:
  case OverflowResult::AlwaysOverflowsHigh:
    EmitPlainBinOp(Builder.getTrue());
    return true;

  case OverflowResult::NeverOverflows:
    EmitPlainBinOp(Builder.getFalse());
    // The operation is known not to wrap, so tag the new instruction (if the
    // builder produced one) with the matching no-wrap flag.
    if (auto *NewInst = dyn_cast<Instruction>(Result)) {
      if (IsSigned)
        NewInst->setHasNoSignedWrap();
      else
        NewInst->setHasNoUnsignedWrap();
    }
    return true;
  }

  llvm_unreachable("Unexpected overflow result");
}
/// Recognize and process idiom involving test for multiplication
/// overflow.
///
/// The caller has matched a pattern of the form:
/// I = cmp u (mul(zext A, zext B), V
/// The function checks if this is a test for overflow and if so replaces
/// multiplication with call to 'mul.with.overflow' intrinsic.
///
/// \param I Compare instruction.
/// \param MulVal Result of 'mul' instruction. It is one of the arguments of
/// the compare instruction. Must be of integer type (for any other type the
/// function gives up and returns NULL).
/// \param OtherVal The other argument of compare instruction.
/// \param IC The combiner whose builder and worklist are used.
/// \returns Instruction which must replace the compare instruction, NULL if no
/// replacement required.
static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal,
Value *OtherVal, InstCombiner &IC) {
// Don't bother doing this transformation for pointers, don't do it for
// vectors.
if (!isa<IntegerType>(MulVal->getType()))
return nullptr;
assert(I.getOperand(0) == MulVal || I.getOperand(1) == MulVal);
assert(I.getOperand(0) == OtherVal || I.getOperand(1) == OtherVal);
// A multiply that is not an instruction (e.g. a constant expression) is not
// handled.
auto *MulInstr = dyn_cast<Instruction>(MulVal);
if (!MulInstr)
return nullptr;
assert(MulInstr->getOpcode() == Instruction::Mul);
auto *LHS = cast<ZExtOperator>(MulInstr->getOperand(0)),
*RHS = cast<ZExtOperator>(MulInstr->getOperand(1));
assert(LHS->getOpcode() == Instruction::ZExt);
assert(RHS->getOpcode() == Instruction::ZExt);
Value *A = LHS->getOperand(0), *B = RHS->getOperand(0);
// Calculate type and width of the result produced by mul.with.overflow.
// This is the wider of the two pre-extension operand types.
Type *TyA = A->getType(), *TyB = B->getType();
unsigned WidthA = TyA->getPrimitiveSizeInBits(),
WidthB = TyB->getPrimitiveSizeInBits();
unsigned MulWidth;
Type *MulType;
if (WidthB > WidthA) {
MulWidth = WidthB;
MulType = TyB;
} else {
MulWidth = WidthA;
MulType = TyA;
}
// In order to replace the original mul with a narrower mul.with.overflow,
// all uses must ignore upper bits of the product. The number of used low
// bits must be not greater than the width of mul.with.overflow.
if (MulVal->hasNUsesOrMore(2))
for (User *U : MulVal->users()) {
// The compare itself is the use being replaced.
if (U == &I)
continue;
if (TruncInst *TI = dyn_cast<TruncInst>(U)) {
// Check if truncation ignores bits above MulWidth.
unsigned TruncWidth = TI->getType()->getPrimitiveSizeInBits();
if (TruncWidth > MulWidth)
return nullptr;
} else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(U)) {
// Check if AND ignores bits above MulWidth.
if (BO->getOpcode() != Instruction::And)
return nullptr;
if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) {
// The number of significant bits in the mask must not exceed MulWidth.
const APInt &CVal = CI->getValue();
if (CVal.getBitWidth() - CVal.countLeadingZeros() > MulWidth)
return nullptr;
} else {
// In this case we could have the operand of the binary operation
// being defined in another block, and performing the replacement
// could break the dominance relation.
return nullptr;
}
} else {
// Other uses prohibit this transformation.
return nullptr;
}
}
// Recognize patterns
// Each case either breaks out of the switch (overflow check recognized) or
// returns nullptr.
switch (I.getPredicate()) {
case ICmpInst::ICMP_EQ:
case ICmpInst::ICMP_NE:
// Recognize pattern:
// mulval = mul(zext A, zext B)
// cmp eq/neq mulval, zext trunc mulval
// NOTE(review): only the width of the trunc is checked here, not that its
// operand is mulval.
if (ZExtInst *Zext = dyn_cast<ZExtInst>(OtherVal))
if (Zext->hasOneUse()) {
Value *ZextArg = Zext->getOperand(0);
if (TruncInst *Trunc = dyn_cast<TruncInst>(ZextArg))
if (Trunc->getType()->getPrimitiveSizeInBits() == MulWidth)
break; // Recognized
}
// Recognize pattern:
// mulval = mul(zext A, zext B)
// cmp eq/neq mulval, and(mulval, mask), mask selects low MulWidth bits.
ConstantInt *CI;
Value *ValToMask;
if (match(OtherVal, m_And(m_Value(ValToMask), m_ConstantInt(CI)))) {
if (ValToMask != MulVal)
return nullptr;
// mask + 1 must equal 2^MulWidth, i.e. the mask is exactly the low
// MulWidth bits.
const APInt &CVal = CI->getValue() + 1;
if (CVal.isPowerOf2()) {
unsigned MaskWidth = CVal.logBase2();
if (MaskWidth == MulWidth)
break; // Recognized
}
}
return nullptr;
case ICmpInst::ICMP_UGT:
// Recognize pattern:
// mulval = mul(zext A, zext B)
// cmp ugt mulval, max
// where max is the largest unsigned value representable in MulWidth bits.
if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) {
APInt MaxVal = APInt::getMaxValue(MulWidth);
MaxVal = MaxVal.zext(CI->getBitWidth());
if (MaxVal.eq(CI->getValue()))
break; // Recognized
}
return nullptr;
case ICmpInst::ICMP_UGE:
// Recognize pattern:
// mulval = mul(zext A, zext B)
// cmp uge mulval, max+1
// where max+1 is 2^MulWidth (only bit number MulWidth set).
if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) {
APInt MaxVal = APInt::getOneBitSet(CI->getBitWidth(), MulWidth);
if (MaxVal.eq(CI->getValue()))
break; // Recognized
}
return nullptr;
case ICmpInst::ICMP_ULE:
// Recognize pattern:
// mulval = mul(zext A, zext B)
// cmp ule mulval, max
if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) {
APInt MaxVal = APInt::getMaxValue(MulWidth);
MaxVal = MaxVal.zext(CI->getBitWidth());
if (MaxVal.eq(CI->getValue()))
break; // Recognized
}
return nullptr;
case ICmpInst::ICMP_ULT:
// Recognize pattern:
// mulval = mul(zext A, zext B)
// cmp ult mulval, max + 1
if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) {
APInt MaxVal = APInt::getOneBitSet(CI->getBitWidth(), MulWidth);
if (MaxVal.eq(CI->getValue()))
break; // Recognized
}
return nullptr;
default:
return nullptr;
}
// The pattern was recognized. New instructions are created immediately
// before the original multiply.
InstCombiner::BuilderTy &Builder = IC.Builder;
Builder.SetInsertPoint(MulInstr);
// Replace: mul(zext A, zext B) --> mul.with.overflow(A, B)
// Only the narrower of the two operands needs to be extended to MulType.
Value *MulA = A, *MulB = B;
if (WidthA < MulWidth)
MulA = Builder.CreateZExt(A, MulType);
if (WidthB < MulWidth)
MulB = Builder.CreateZExt(B, MulType);
Function *F = Intrinsic::getDeclaration(
I.getModule(), Intrinsic::umul_with_overflow, MulType);
CallInst *Call = Builder.CreateCall(F, {MulA, MulB}, "umul");
// Queue the original multiply for another visit by the combiner.
IC.Worklist.Add(MulInstr);
// If there are uses of mul result other than the comparison, we know that
// they are truncation or binary AND. Change them to use result of
// mul.with.overflow and adjust properly mask/size.
if (MulVal->hasNUsesOrMore(2)) {
Value *Mul = Builder.CreateExtractValue(Call, 0, "umul.value");
// The iterator is advanced before U is processed because the setOperand
// call below can drop U from MulVal's user list.
for (auto UI = MulVal->user_begin(), UE = MulVal->user_end(); UI != UE;) {
User *U = *UI++;
// Skip the compare and its other operand (which may itself use MulVal, as
// in the and-mask pattern above).
if (U == &I || U == OtherVal)
continue;
if (TruncInst *TI = dyn_cast<TruncInst>(U)) {
// A truncation to exactly MulWidth bits is the narrow product itself; a
// narrower truncation is re-pointed at the narrow product.
if (TI->getType()->getPrimitiveSizeInBits() == MulWidth)
IC.replaceInstUsesWith(*TI, Mul);
else
TI->setOperand(0, Mul);
} else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(U)) {
assert(BO->getOpcode() == Instruction::And);
// Replace (mul & mask) --> zext (mul.with.overflow & short_mask)
ConstantInt *CI = cast<ConstantInt>(BO->getOperand(1));
APInt ShortMask = CI->getValue().trunc(MulWidth);
Value *ShortAnd = Builder.CreateAnd(Mul, ShortMask);
Instruction *Zext =
cast<Instruction>(Builder.CreateZExt(ShortAnd, BO->getType()));
IC.Worklist.Add(Zext);
IC.replaceInstUsesWith(*BO, Zext);
} else {
llvm_unreachable("Unexpected Binary operation");
}
IC.Worklist.Add(cast<Instruction>(U));
}
}
if (isa<Instruction>(OtherVal))
IC.Worklist.Add(cast<Instruction>(OtherVal));
// The original icmp gets replaced with the overflow value, maybe inverted
// depending on predicate.
// The overflow bit is used directly for: ne; ugt/uge with the multiply as
// operand 0; ult/ule with the multiply as operand 1. All other combinations
// use its negation.
bool Inverse = false;
switch (I.getPredicate()) {
case ICmpInst::ICMP_NE:
break;
case ICmpInst::ICMP_EQ:
Inverse = true;
break;
case ICmpInst::ICMP_UGT:
case ICmpInst::ICMP_UGE:
if (I.getOperand(0) == MulVal)
break;
Inverse = true;
break;
case ICmpInst::ICMP_ULT:
case ICmpInst::ICMP_ULE:
if (I.getOperand(1) == MulVal)
break;
Inverse = true;
break;
default:
llvm_unreachable("Unexpected predicate");
}
if (Inverse) {
Value *Res = Builder.CreateExtractValue(Call, 1);
return BinaryOperator::CreateNot(Res);
}
return ExtractValueInst::Create(Call, 1);
}
/// Compute the mask of LHS bits that are actually demanded by compare \p I.
///
/// When the RHS is a constant, some LHS bits may not influence the result of
/// the comparison; the returned \p BitWidth-wide mask has a bit set for every
/// bit that IS demanded.
static APInt getDemandedBitsLHSMask(ICmpInst &I, unsigned BitWidth) {
  const ICmpInst::Predicate Pred = I.getPredicate();
  const APInt *CmpC;

  if (match(I.getOperand(1), m_APInt(CmpC))) {
    // A sign bit comparison only demands the sign bit.
    bool IgnoredTrueIfSigned;
    if (isSignBitCheck(Pred, *CmpC, IgnoredTrueIfSigned))
      return APInt::getSignMask(BitWidth);

    // For a UGT comparison, we don't care about any bits that
    // correspond to the trailing ones of the comparand. The value of these
    // bits doesn't impact the outcome of the comparison, because any value
    // greater than the RHS must differ in a bit higher than these due to carry.
    if (Pred == ICmpInst::ICMP_UGT)
      return APInt::getBitsSetFrom(BitWidth, CmpC->countTrailingOnes());

    // Similarly, for a ULT comparison, we don't care about the trailing zeros.
    // Any value less than the RHS must differ in a higher bit because of
    // carries.
    if (Pred == ICmpInst::ICMP_ULT)
      return APInt::getBitsSetFrom(BitWidth, CmpC->countTrailingZeros());
  }

  // A normal comparison (or a non-constant RHS) demands all bits.
  return APInt::getAllOnesValue(BitWidth);
}
/// Decide whether the operands \p Op0 and \p Op1 of an ICmpInst should be
/// swapped.
///
/// The decision looks at how often the two values already appear together as
/// operands of a subtract among the users of \p Op0, and in which order.
/// Several architectures use the same instruction for both subtract and cmp,
/// so it is better if the operand order of the two agrees.
/// \return true if Op0 and Op1 should be swapped.
static bool swapMayExposeCSEOpportunities(const Value *Op0, const Value *Op1) {
  // Pointer values cannot appear directly in a subtract, so skip them.
  // FIXME: we may want to go through inttoptrs or bitcasts.
  if (Op0->getType()->isPointerTy())
    return false;

  // A subtract with the operands in reverse order speaks for swapping; one
  // that already has them in the compare's order speaks against it.
  unsigned ReversedSubs = 0; // sub Op1, Op0
  unsigned MatchingSubs = 0; // sub Op0, Op1
  for (const User *U : Op0->users()) {
    if (match(U, m_Sub(m_Specific(Op1), m_Specific(Op0))))
      ++ReversedSubs;
    else if (match(U, m_Sub(m_Specific(Op0), m_Specific(Op1))))
      ++MatchingSubs;
  }
  return ReversedSubs > MatchingSubs;
}
/// Check that one use is in the same block as the definition and all
/// other uses are in blocks dominated by a given block.
///
/// \param DI Definition
/// \param UI Use
/// \param DB Block that must dominate all uses of \p DI outside
/// the parent block
/// \return true when \p UI is the only use of \p DI in the parent block
/// and all other uses of \p DI are in blocks dominated by \p DB.
///
bool InstCombiner::dominatesAllUses(const Instruction *DI,
const Instruction *UI,
const BasicBlock *DB) const {
assert(DI && UI && "Instruction not defined\n");
// Ignore incomplete definitions.
if (!DI->getParent())
return false;
// DI and UI must be in the same block.
if (DI->getParent() != UI->getParent())
return false;
// Protect from self-referencing blocks.
if (DI->getParent() == DB)
return false;
for (const User *U : DI->users()) {
auto *Usr = cast<Instruction>(U);
if (Usr != UI && !DT.dominates(DB, Usr->getParent()))
return false;
}
return true;
}
/// Return true when the instruction sequence within a block is select-cmp-br.
static bool isChainSelectCmpBranch(const SelectInst *SI) {
const BasicBlock *BB = SI->getParent();
if (!BB)
return false;
auto *BI = dyn_cast_or_null<BranchInst>(BB->getTerminator());
if (!BI || BI->getNumSuccessors() != 2)
return false;
auto *IC = dyn_cast<ICmpInst>(BI->getCondition());
if (!IC || (IC->getOperand(0) != SI && IC->getOperand(1) != SI))
return false;
return true;
}
/// Returns true when a select result has been replaced by one of the select's
/// operands in a select-icmp sequence, which will eventually allow the select
/// to be eliminated.
///
/// \param SI    Select instruction
/// \param Icmp  Compare instruction
/// \param SIOpd Operand that replaces the select (must be 1 or 2)
///
/// Notes:
/// - The replacement is global and requires dominator information
/// - The caller is responsible for the actual replacement (this function
///   itself only rewrites the uses of the select outside the select's block)
///
/// Example:
///
/// entry:
///  %4 = select i1 %3, %C* %0, %C* null
///  %5 = icmp eq %C* %4, null
///  br i1 %5, label %9, label %7
///  ...
///  ; <label>:7                                       ; preds = %entry
///  %8 = getelementptr inbounds %C* %4, i64 0, i32 0
///  ...
///
/// can be transformed to
///
///  %5 = icmp eq %C* %0, null
///  %6 = select i1 %3, i1 %5, i1 true
///  br i1 %6, label %9, label %7
///  ...
///  ; <label>:7                                       ; preds = %entry
///  %8 = getelementptr inbounds %C* %0, i64 0, i32 0  // replace by %0!
///
/// The same applies when the first operand of the select is a constant and/or
/// the compare is for not equal rather than equal.
///
/// NOTE: The function is only called when the select and compare constants
/// are equal, the optimization can work only for EQ predicates. This is not a
/// major restriction since a NE compare should be 'normalized' to an equal
/// compare, which usually happens in the combiner and test case
/// select-cmp-br.ll checks for it.
bool InstCombiner::replacedSelectWithOperand(SelectInst *SI,
                                             const ICmpInst *Icmp,
                                             const unsigned SIOpd) {
  assert((SIOpd == 1 || SIOpd == 2) && "Invalid select operand!");

  // Only a select-cmp-br chain with an 'eq' compare qualifies.
  if (!isChainSelectCmpBranch(SI) ||
      Icmp->getPredicate() != ICmpInst::ICMP_EQ)
    return false;

  BasicBlock *SelectBB = SI->getParent();
  BasicBlock *Succ = SelectBB->getTerminator()->getSuccessor(1);

  // The check for the single predecessor is not the best that can be
  // done. But it protects efficiently against cases like when SI's
  // home block has two successors, Succ and Succ1, and Succ1 predecessor
  // of Succ. Then SI can't be replaced by SIOpd because the use that gets
  // replaced can be reached on either path. So the uniqueness check
  // guarantees that the path all uses of SI (outside SI's parent) are on
  // is disjoint from all other paths out of SI. But that information
  // is more expensive to compute, and the trade-off here is in favor
  // of compile-time. It should also be noticed that we check for a single
  // predecessor and not only uniqueness. This to handle the situation when
  // Succ and Succ1 points to the same basic block.
  if (!Succ->getSinglePredecessor() || !dominatesAllUses(SI, Icmp, Succ))
    return false;

  NumSel++;
  SI->replaceUsesOutsideBlock(SI->getOperand(SIOpd), SelectBB);
  return true;
}
/// Try to fold the comparison based on range information we can get by checking
/// whether bits are known to be zero or one in the inputs.
///
/// \param I The integer compare to simplify.
/// \returns &I when SimplifyDemandedBits reports that it changed one of the
/// operands, a replacement (a new ICmpInst or the result of
/// replaceInstUsesWith) when the known-bits ranges allow a fold, and nullptr
/// when nothing applies.
Instruction *InstCombiner::foldICmpUsingKnownBits(ICmpInst &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
Type *Ty = Op0->getType();
ICmpInst::Predicate Pred = I.getPredicate();
// Get scalar or pointer size.
unsigned BitWidth = Ty->isIntOrIntVectorTy()
? Ty->getScalarSizeInBits()
: DL.getIndexTypeSizeInBits(Ty->getScalarType());
// A zero width gives us nothing to reason about.
if (!BitWidth)
return nullptr;
KnownBits Op0Known(BitWidth);
KnownBits Op1Known(BitWidth);
// Compute the known bits of both operands. The LHS uses a demanded-bits mask
// derived from the compare itself; the RHS demands every bit.
if (SimplifyDemandedBits(&I, 0,
getDemandedBitsLHSMask(I, BitWidth),
Op0Known, 0))
return &I;
if (SimplifyDemandedBits(&I, 1, APInt::getAllOnesValue(BitWidth),
Op1Known, 0))
return &I;
// Given the known and unknown bits, compute a range that the LHS could be
// in. Compute the Min, Max and RHS values based on the known bits. For the
// EQ and NE we use unsigned values.
APInt Op0Min(BitWidth, 0), Op0Max(BitWidth, 0);
APInt Op1Min(BitWidth, 0), Op1Max(BitWidth, 0);
if (I.isSigned()) {
computeSignedMinMaxValuesFromKnownBits(Op0Known, Op0Min, Op0Max);
computeSignedMinMaxValuesFromKnownBits(Op1Known, Op1Min, Op1Max);
} else {
computeUnsignedMinMaxValuesFromKnownBits(Op0Known, Op0Min, Op0Max);
computeUnsignedMinMaxValuesFromKnownBits(Op1Known, Op1Min, Op1Max);
}
// If Min and Max are known to be the same, then SimplifyDemandedBits figured
// out that the LHS or RHS is a constant. Constant fold this now, so that
// code below can assume that Min != Max.
if (!isa<Constant>(Op0) && Op0Min == Op0Max)
return new ICmpInst(Pred, ConstantExpr::getIntegerValue(Ty, Op0Min), Op1);
if (!isa<Constant>(Op1) && Op1Min == Op1Max)
return new ICmpInst(Pred, Op0, ConstantExpr::getIntegerValue(Ty, Op1Min));
// Based on the range information we know about the LHS, see if we can
// simplify this comparison. For example, (x&4) < 8 is always true.
switch (Pred) {
default:
llvm_unreachable("Unknown icmp opcode!");
case ICmpInst::ICMP_EQ:
case ICmpInst::ICMP_NE: {
// Disjoint unsigned ranges can never be equal.
if (Op0Max.ult(Op1Min) || Op0Min.ugt(Op1Max)) {
return Pred == CmpInst::ICMP_EQ
? replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()))
: replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
}
// If all bits are known zero except for one, then we know at most one bit
// is set. If the comparison is against zero, then this is a check to see if
// *that* bit is set.
APInt Op0KnownZeroInverted = ~Op0Known.Zero;
if (Op1Known.isZero()) {
// If the LHS is an AND with the same constant, look through it.
Value *LHS = nullptr;
const APInt *LHSC;
if (!match(Op0, m_And(m_Value(LHS), m_APInt(LHSC))) ||
*LHSC != Op0KnownZeroInverted)
LHS = Op0;
Value *X;
if (match(LHS, m_Shl(m_One(), m_Value(X)))) {
APInt ValToCheck = Op0KnownZeroInverted;
Type *XTy = X->getType();
if (ValToCheck.isPowerOf2()) {
// ((1 << X) & 8) == 0 -> X != 3
// ((1 << X) & 8) != 0 -> X == 3
auto *CmpC = ConstantInt::get(XTy, ValToCheck.countTrailingZeros());
auto NewPred = ICmpInst::getInversePredicate(Pred);
return new ICmpInst(NewPred, X, CmpC);
} else if ((++ValToCheck).isPowerOf2()) {
// Note: ValToCheck has been incremented by the condition above, so
// it is now the power of two just above the low-bit mask.
// ((1 << X) & 7) == 0 -> X >= 3
// ((1 << X) & 7) != 0 -> X < 3
auto *CmpC = ConstantInt::get(XTy, ValToCheck.countTrailingZeros());
auto NewPred =
Pred == CmpInst::ICMP_EQ ? CmpInst::ICMP_UGE : CmpInst::ICMP_ULT;
return new ICmpInst(NewPred, X, CmpC);
}
}
// Check if the LHS is 8 >>u x and the result is a power of 2 like 1.
const APInt *CI;
if (Op0KnownZeroInverted.isOneValue() &&
match(LHS, m_LShr(m_Power2(CI), m_Value(X)))) {
// ((8 >>u X) & 1) == 0 -> X != 3
// ((8 >>u X) & 1) != 0 -> X == 3
unsigned CmpVal = CI->countTrailingZeros();
auto NewPred = ICmpInst::getInversePredicate(Pred);
return new ICmpInst(NewPred, X, ConstantInt::get(X->getType(), CmpVal));
}
}
break;
}
case ICmpInst::ICMP_ULT: {
if (Op0Max.ult(Op1Min)) // A <u B -> true if max(A) < min(B)
return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
if (Op0Min.uge(Op1Max)) // A <u B -> false if min(A) >= max(B)
return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
if (Op1Min == Op0Max) // A <u B -> A != B if max(A) == min(B)
return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
const APInt *CmpC;
if (match(Op1, m_APInt(CmpC))) {
// A <u C -> A == C-1 if min(A)+1 == C
if (*CmpC == Op0Min + 1)
return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
ConstantInt::get(Op1->getType(), *CmpC - 1));
// X <u C --> X == 0, if the number of known zero bits in the bottom of X
// is at least ceil(log2(C)).
if (Op0Known.countMinTrailingZeros() >= CmpC->ceilLogBase2())
return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
Constant::getNullValue(Op1->getType()));
}
break;
}
case ICmpInst::ICMP_UGT: {
if (Op0Min.ugt(Op1Max)) // A >u B -> true if min(A) > max(B)
return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
if (Op0Max.ule(Op1Min)) // A >u B -> false if max(A) <= min(B)
return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
if (Op1Max == Op0Min) // A >u B -> A != B if min(A) == max(B)
return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
const APInt *CmpC;
if (match(Op1, m_APInt(CmpC))) {
// A >u C -> A == C+1 if max(A)-1 == C
if (*CmpC == Op0Max - 1)
return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
ConstantInt::get(Op1->getType(), *CmpC + 1));
// X >u C --> X != 0, if the number of known zero bits in the bottom of X
// is at least the number of active bits of C.
if (Op0Known.countMinTrailingZeros() >= CmpC->getActiveBits())
return new ICmpInst(ICmpInst::ICMP_NE, Op0,
Constant::getNullValue(Op1->getType()));
}
break;
}
case ICmpInst::ICMP_SLT: {
if (Op0Max.slt(Op1Min)) // A <s B -> true if max(A) < min(B)
return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
if (Op0Min.sge(Op1Max)) // A <s B -> false if min(A) >= max(B)
return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
if (Op1Min == Op0Max) // A <s B -> A != B if max(A) == min(B)
return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
const APInt *CmpC;
if (match(Op1, m_APInt(CmpC))) {
if (*CmpC == Op0Min + 1) // A <s C -> A == C-1 if min(A)+1 == C
return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
ConstantInt::get(Op1->getType(), *CmpC - 1));
}
break;
}
case ICmpInst::ICMP_SGT: {
if (Op0Min.sgt(Op1Max)) // A >s B -> true if min(A) > max(B)
return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
if (Op0Max.sle(Op1Min)) // A >s B -> false if max(A) <= min(B)
return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
if (Op1Max == Op0Min) // A >s B -> A != B if min(A) == max(B)
return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
const APInt *CmpC;
if (match(Op1, m_APInt(CmpC))) {
if (*CmpC == Op0Max - 1) // A >s C -> A == C+1 if max(A)-1 == C
return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
ConstantInt::get(Op1->getType(), *CmpC + 1));
}
break;
}
// The non-strict predicates below assert that a ConstantInt RHS was already
// rewritten to the strict form before this function runs.
case ICmpInst::ICMP_SGE:
assert(!isa<ConstantInt>(Op1) && "ICMP_SGE with ConstantInt not folded!");
if (Op0Min.sge(Op1Max)) // A >=s B -> true if min(A) >= max(B)
return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
if (Op0Max.slt(Op1Min)) // A >=s B -> false if max(A) < min(B)
return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
if (Op1Min == Op0Max) // A >=s B -> A == B if max(A) == min(B)
return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1);
break;
case ICmpInst::ICMP_SLE:
assert(!isa<ConstantInt>(Op1) && "ICMP_SLE with ConstantInt not folded!");
if (Op0Max.sle(Op1Min)) // A <=s B -> true if max(A) <= min(B)
return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
if (Op0Min.sgt(Op1Max)) // A <=s B -> false if min(A) > max(B)
return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
if (Op1Max == Op0Min) // A <=s B -> A == B if min(A) == max(B)
return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1);
break;
case ICmpInst::ICMP_UGE:
assert(!isa<ConstantInt>(Op1) && "ICMP_UGE with ConstantInt not folded!");
if (Op0Min.uge(Op1Max)) // A >=u B -> true if min(A) >= max(B)
return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
if (Op0Max.ult(Op1Min)) // A >=u B -> false if max(A) < min(B)
return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
if (Op1Min == Op0Max) // A >=u B -> A == B if max(A) == min(B)
return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1);
break;
case ICmpInst::ICMP_ULE:
assert(!isa<ConstantInt>(Op1) && "ICMP_ULE with ConstantInt not folded!");
if (Op0Max.ule(Op1Min)) // A <=u B -> true if max(A) <= min(B)
return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
if (Op0Min.ugt(Op1Max)) // A <=u B -> false if min(A) > max(B)
return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
if (Op1Max == Op0Min) // A <=u B -> A == B if min(A) == max(B)
return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1);
break;
}
// Turn a signed comparison into an unsigned one if both operands are known to
// have the same sign (sign bit known zero in both, or known one in both).
if (I.isSigned() &&
((Op0Known.Zero.isNegative() && Op1Known.Zero.isNegative()) ||
(Op0Known.One.isNegative() && Op1Known.One.isNegative())))
return new ICmpInst(I.getUnsignedPredicate(), Op0, Op1);
return nullptr;
}
/// Rewrite a non-strict integer compare against a constant (icmp le / icmp ge)
/// as the equivalent strict compare (icmp lt / icmp gt) against the constant
/// adjusted by one. This transform allows them to be folded in visitICmpInst.
///
/// Returns the new compare, or nullptr if the predicate is not a non-strict
/// one, the RHS is not a suitable constant, or adjusting the constant could
/// wrap.
static ICmpInst *canonicalizeCmpWithConstant(ICmpInst &I) {
  // Classify the predicate; everything but SLE/ULE/SGE/UGE is left alone.
  bool IsLE;
  switch (I.getPredicate()) {
  case ICmpInst::ICMP_SLE:
  case ICmpInst::ICMP_ULE:
    IsLE = true;
    break;
  case ICmpInst::ICMP_SGE:
  case ICmpInst::ICMP_UGE:
    IsLE = false;
    break;
  default:
    return nullptr;
  }

  Value *Op0 = I.getOperand(0);
  Value *Op1 = I.getOperand(1);
  auto *Op1C = dyn_cast<Constant>(Op1);
  if (!Op1C)
    return nullptr;

  Type *Op1Type = Op1->getType();
  bool IsSigned = I.isSigned();

  // True for a constant that cannot be incremented (LE) or decremented (GE)
  // without wrapping around.
  auto IsEdgeValue = [IsLE, IsSigned](const ConstantInt *C) {
    return IsLE ? C->isMaxValue(IsSigned) : C->isMinValue(IsSigned);
  };

  // Make sure the constant can be safely adjusted. For scalars,
  // SimplifyICmpInst has already dealt with the edge cases, so they are only
  // asserted on. For vectors the edge cases have to be handled here.
  auto *CI = dyn_cast<ConstantInt>(Op1C);
  if (CI) {
    // A <= MAX -> TRUE ; A >= MIN -> TRUE
    assert(IsLE ? !CI->isMaxValue(IsSigned) : !CI->isMinValue(IsSigned));
  } else if (Op1Type->isVectorTy()) {
    // TODO? If the edge cases for vectors were guaranteed to be handled as
    // they are for scalars, the min/max checks could go away. Doing that would
    // require insertelement/shufflevector to replace the edge values.
    for (unsigned Idx = 0, NumElts = Op1Type->getVectorNumElements();
         Idx != NumElts; ++Idx) {
      Constant *Elt = Op1C->getAggregateElement(Idx);
      if (!Elt)
        return nullptr;
      // Undef lanes place no constraint on the transform.
      if (isa<UndefValue>(Elt))
        continue;
      // Give up if the lane is not a plain integer constant, or if it is
      // known to be the min/max value.
      auto *EltC = dyn_cast<ConstantInt>(Elt);
      if (!EltC || IsEdgeValue(EltC))
        return nullptr;
    }
  } else {
    // ConstantExpr?
    return nullptr;
  }

  // Adjust the constant by one and pick the strict predicate:
  //   ULE -> ULT ; UGE -> UGT ; SLE -> SLT ; SGE -> SGT
  Constant *OneOrNegOne = ConstantInt::get(Op1Type, IsLE ? 1 : -1, true);
  CmpInst::Predicate NewPred;
  if (IsLE)
    NewPred = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
  else
    NewPred = IsSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
  return new ICmpInst(NewPred, Op0, ConstantExpr::getAdd(Op1C, OneOrNegOne));
}
/// An integer compare of boolean (i1 or vector of i1) values can always be
/// expressed with bitwise operations; build that bitwise form.
static Instruction *canonicalizeICmpBool(ICmpInst &I,
                                         InstCombiner::BuilderTy &Builder) {
  Value *A = I.getOperand(0), *B = I.getOperand(1);
  assert(A->getType()->isIntOrIntVectorTy(1) && "Bools only");
  const ICmpInst::Predicate Pred = I.getPredicate();

  // A boolean compared to true/false can be simplified to Op0/true/false in
  // 14 out of the 20 (10 predicates * 2 constants) possible combinations.
  // The cases InstSimplify does not handle are always 'not' of Op0.
  if (match(B, m_Zero())) {
    // A == 0, A <=u 0 and A >=s 0 are all !A.
    if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_ULE ||
        Pred == CmpInst::ICMP_SGE)
      return BinaryOperator::CreateNot(A);
    llvm_unreachable("ICmp i1 X, C not simplified as expected.");
  }
  if (match(B, m_One())) {
    // A != 1, A <u 1 and A >s -1 are all !A.
    if (Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_ULT ||
        Pred == CmpInst::ICMP_SGT)
      return BinaryOperator::CreateNot(A);
    llvm_unreachable("ICmp i1 X, C not simplified as expected.");
  }

  // General case. A "greater" predicate is the corresponding "less" predicate
  // with the operands exchanged, and on i1 that yields the same expression as
  // the opposite-signedness "less" predicate, so the cases pair up.
  switch (Pred) {
  default:
    llvm_unreachable("Invalid icmp instruction!");
  case ICmpInst::ICMP_EQ:
    // icmp eq i1 A, B -> ~(A ^ B)
    return BinaryOperator::CreateNot(Builder.CreateXor(A, B));
  case ICmpInst::ICMP_NE:
    // icmp ne i1 A, B -> A ^ B
    return BinaryOperator::CreateXor(A, B);
  case ICmpInst::ICMP_ULT:
  case ICmpInst::ICMP_SGT:
    // icmp ult i1 A, B -> ~A & B
    // icmp sgt i1 A, B == icmp slt i1 B, A -> ~A & B
    return BinaryOperator::CreateAnd(Builder.CreateNot(A), B);
  case ICmpInst::ICMP_SLT:
  case ICmpInst::ICMP_UGT:
    // icmp slt i1 A, B -> A & ~B
    // icmp ugt i1 A, B == icmp ult i1 B, A -> ~B & A
    return BinaryOperator::CreateAnd(Builder.CreateNot(B), A);
  case ICmpInst::ICMP_ULE:
  case ICmpInst::ICMP_SGE:
    // icmp ule i1 A, B -> ~A | B
    // icmp sge i1 A, B == icmp sle i1 B, A -> ~A | B
    return BinaryOperator::CreateOr(Builder.CreateNot(A), B);
  case ICmpInst::ICMP_SLE:
  case ICmpInst::ICMP_UGE:
    // icmp sle i1 A, B -> A | ~B
    // icmp uge i1 A, B == icmp ule i1 B, A -> ~B | A
    return BinaryOperator::CreateOr(Builder.CreateNot(B), A);
  }
}
// Fold a comparison of X against a "high bit" mask built from a shift by Y.
// Patterns of the form:
//   (1 << Y) u<= X  or  ~(-1 << Y) u<  X  or  ((1 << Y)+(-1)) u<  X
//   (1 << Y) u>  X  or  ~(-1 << Y) u>= X  or  ((1 << Y)+(-1)) u>= X
// become, respectively:
//   (X l>> Y) != 0
//   (X l>> Y) == 0
static Instruction *foldICmpWithHighBitMask(ICmpInst &Cmp,
                                            InstCombiner::BuilderTy &Builder) {
  ICmpInst::Predicate Pred;
  Value *X, *Y;

  // With the mask on the left and X on the right, these are the predicates
  // that turn into a '!= 0' and an '== 0' test of the shifted value. They
  // depend on which mask form was matched.
  ICmpInst::Predicate PredForNE, PredForEQ;
  if (match(&Cmp,
            m_c_ICmp(Pred, m_OneUse(m_Shl(m_One(), m_Value(Y))), m_Value(X)))) {
    // Mask is (1 << Y).
    PredForNE = ICmpInst::ICMP_ULE;
    PredForEQ = ICmpInst::ICMP_UGT;
  } else if (match(&Cmp, m_c_ICmp(Pred,
                                  m_OneUse(m_CombineOr(
                                      m_Not(m_Shl(m_AllOnes(), m_Value(Y))),
                                      m_Add(m_Shl(m_One(), m_Value(Y)),
                                            m_AllOnes()))),
                                  m_Value(X)))) {
    // Mask is ~(-1 << Y) or (1 << Y) + (-1). The 'add' variant is not
    // canonical (the 'not' variant is); it only shows up here because it has
    // extra uses and therefore could not be canonicalized.
    PredForNE = ICmpInst::ICMP_ULT;
    PredForEQ = ICmpInst::ICMP_UGE;
  } else {
    return nullptr;
  }

  // We want X to be the icmp's second operand, so swap predicate if it isn't.
  if (Cmp.getOperand(0) == X)
    Pred = Cmp.getSwappedPredicate();

  ICmpInst::Predicate NewPred;
  if (Pred == PredForNE)
    NewPred = ICmpInst::ICMP_NE;
  else if (Pred == PredForEQ)
    NewPred = ICmpInst::ICMP_EQ;
  else
    return nullptr;

  Value *HighBits = Builder.CreateLShr(X, Y, X->getName() + ".highbits");
  Constant *Zero = Constant::getNullValue(HighBits->getType());
  return CmpInst::Create(Instruction::ICmp, NewPred, HighBits, Zero);
}
/// Sink a compare through matching shuffles: when both compare operands are
/// single-source shuffles (second shuffle input undef) that use the same
/// mask, do the compare on the unshuffled vectors and shuffle the result.
/// Returns the new shuffle, or nullptr if the pattern does not apply.
static Instruction *foldVectorCmp(CmpInst &Cmp,
                                  InstCombiner::BuilderTy &Builder) {
  Value *LHS = Cmp.getOperand(0), *RHS = Cmp.getOperand(1);
  Value *V1, *V2;
  Constant *M;

  // Both sides must shuffle one vector of the same type with the same mask,
  // and at least one of the shuffles must have no other user.
  const bool IsSameMaskShufflePair =
      match(LHS, m_ShuffleVector(m_Value(V1), m_Undef(), m_Constant(M))) &&
      match(RHS, m_ShuffleVector(m_Value(V2), m_Undef(), m_Specific(M))) &&
      V1->getType() == V2->getType() &&
      (LHS->hasOneUse() || RHS->hasOneUse());
  if (!IsSameMaskShufflePair)
    return nullptr;

  // cmp (shuffle V1, M), (shuffle V2, M) --> shuffle (cmp V1, V2), M
  CmpInst::Predicate P = Cmp.getPredicate();
  Value *NewCmp;
  if (isa<ICmpInst>(Cmp))
    NewCmp = Builder.CreateICmp(P, V1, V2);
  else
    NewCmp = Builder.CreateFCmp(P, V1, V2);
  return new ShuffleVectorInst(NewCmp, UndefValue::get(NewCmp->getType()), M);
}
/// Main entry point for combining an integer compare. Tries a fixed sequence
/// of folds and returns the first replacement found. Returns &I if only the
/// operand order was changed, and nullptr if nothing changed at all.
///
/// The order of the folds matters: the min/max guard in the middle of the
/// function deliberately blocks every fold that follows it.
Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
bool Changed = false;
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
unsigned Op0Cplxity = getComplexity(Op0);
unsigned Op1Cplxity = getComplexity(Op1);
/// Orders the operands of the compare so that they are listed from most
/// complex to least complex. This puts constants before unary operators,
/// before binary operators.
/// Concretely: the operands are swapped when operand 0 has the lower
/// getComplexity() rank, or on a tie when swapMayExposeCSEOpportunities says
/// so.
if (Op0Cplxity < Op1Cplxity ||
(Op0Cplxity == Op1Cplxity && swapMayExposeCSEOpportunities(Op0, Op1))) {
I.swapOperands();
std::swap(Op0, Op1);
Changed = true;
}
if (Value *V = SimplifyICmpInst(I.getPredicate(), Op0, Op1,
SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
// Comparing -val or val with non-zero is the same as just comparing val
// ie, abs(val) != 0 -> val != 0
// The pattern matched here is a select between V and its negation (in either
// arm order) compared 'ne' against zero.
if (I.getPredicate() == ICmpInst::ICMP_NE && match(Op1, m_Zero())) {
Value *Cond, *SelectTrue, *SelectFalse;
if (match(Op0, m_Select(m_Value(Cond), m_Value(SelectTrue),
m_Value(SelectFalse)))) {
if (Value *V = dyn_castNegVal(SelectTrue)) {
if (V == SelectFalse)
return CmpInst::Create(Instruction::ICmp, I.getPredicate(), V, Op1);
}
else if (Value *V = dyn_castNegVal(SelectFalse)) {
if (V == SelectTrue)
return CmpInst::Create(Instruction::ICmp, I.getPredicate(), V, Op1);
}
}
}
// Compares of i1 (or vectors of i1) become bitwise logic.
if (Op0->getType()->isIntOrIntVectorTy(1))
if (Instruction *Res = canonicalizeICmpBool(I, Builder))
return Res;
if (ICmpInst *NewICmp = canonicalizeCmpWithConstant(I))
return NewICmp;
if (Instruction *Res = foldICmpWithConstant(I))
return Res;
if (Instruction *Res = foldICmpWithDominatingICmp(I))
return Res;
if (Instruction *Res = foldICmpUsingKnownBits(I))
return Res;
// Test if the ICmpInst instruction is used exclusively by a select as
// part of a minimum or maximum operation. If so, refrain from doing
// any other folding. This helps out other analyses which understand
// non-obfuscated minimum and maximum idioms, such as ScalarEvolution
// and CodeGen. And in this case, at least one of the comparison
// operands has at least one user besides the compare (the select),
// which would often largely negate the benefit of folding anyway.
//
// Do the same for the other patterns recognized by matchSelectPattern.
// Note: this returns nullptr even if the operands were swapped above.
if (I.hasOneUse())
if (SelectInst *SI = dyn_cast<SelectInst>(I.user_back())) {
Value *A, *B;
SelectPatternResult SPR = matchSelectPattern(SI, A, B);
if (SPR.Flavor != SPF_UNKNOWN)
return nullptr;
}
// Do this after checking for min/max to prevent infinite looping.
if (Instruction *Res = foldICmpWithZero(I))
return Res;
// FIXME: We only do this after checking for min/max to prevent infinite
// looping caused by a reverse canonicalization of these patterns for min/max.
// FIXME: The organization of folds is a mess. These would naturally go into
// canonicalizeCmpWithConstant(), but we can't move all of the above folds
// down here after the min/max restriction.
ICmpInst::Predicate Pred = I.getPredicate();
const APInt *C;
if (match(Op1, m_APInt(C))) {
// For i32: x >u 2147483647 -> x <s 0 -> true if sign bit set
if (Pred == ICmpInst::ICMP_UGT && C->isMaxSignedValue()) {
Constant *Zero = Constant::getNullValue(Op0->getType());
return new ICmpInst(ICmpInst::ICMP_SLT, Op0, Zero);
}
// For i32: x <u 2147483648 -> x >s -1 -> true if sign bit clear
if (Pred == ICmpInst::ICMP_ULT && C->isMinSignedValue()) {
Constant *AllOnes = Constant::getAllOnesValue(Op0->getType());
return new ICmpInst(ICmpInst::ICMP_SGT, Op0, AllOnes);
}
}
if (Instruction *Res = foldICmpInstWithConstant(I))
return Res;
if (Instruction *Res = foldICmpInstWithConstantNotInt(I))
return Res;
// If we can optimize a 'icmp GEP, P' or 'icmp P, GEP', do so now.
// When the GEP is on the right, the predicate is swapped so that the helper
// always sees the GEP as the left-hand side.
if (GEPOperator *GEP = dyn_cast<GEPOperator>(Op0))
if (Instruction *NI = foldGEPICmp(GEP, Op1, I.getPredicate(), I))
return NI;
if (GEPOperator *GEP = dyn_cast<GEPOperator>(Op1))
if (Instruction *NI = foldGEPICmp(GEP, Op0,
ICmpInst::getSwappedPredicate(I.getPredicate()), I))
return NI;
// Try to optimize equality comparisons against alloca-based pointers.
if (Op0->getType()->isPointerTy() && I.isEquality()) {
assert(Op1->getType()->isPointerTy() && "Comparing pointer with non-pointer?");
if (auto *Alloca = dyn_cast<AllocaInst>(GetUnderlyingObject(Op0, DL)))
if (Instruction *New = foldAllocaCmp(I, Alloca, Op1))
return New;
if (auto *Alloca = dyn_cast<AllocaInst>(GetUnderlyingObject(Op1, DL)))
if (Instruction *New = foldAllocaCmp(I, Alloca, Op0))
return New;
}
if (Instruction *Res = foldICmpBitCast(I, Builder))
return Res;
if (isa<CastInst>(Op0)) {
// Handle the special case of: icmp (cast bool to X), <cst>
// This comes up when you have code like
// int X = A < B;
// if (X) ...
// For generality, we handle any zero-extension of any operand comparison
// with a constant or another cast from the same type.
if (isa<Constant>(Op1) || isa<CastInst>(Op1))
if (Instruction *R = foldICmpWithCastAndCast(I))
return R;
}
if (Instruction *Res = foldICmpBinOp(I))
return Res;
if (Instruction *Res = foldICmpWithMinMax(I))
return Res;
{
Value *A, *B;
// Transform (A & ~B) == 0 --> (A & B) != 0
// and (A & ~B) != 0 --> (A & B) == 0
// if A is a power of 2.
if (match(Op0, m_And(m_Value(A), m_Not(m_Value(B)))) &&
match(Op1, m_Zero()) &&
isKnownToBeAPowerOfTwo(A, false, 0, &I) && I.isEquality())
return new ICmpInst(I.getInversePredicate(), Builder.CreateAnd(A, B),
Op1);
// ~X < ~Y --> Y < X
// ~X < C --> X > ~C
if (match(Op0, m_Not(m_Value(A)))) {
if (match(Op1, m_Not(m_Value(B))))
return new ICmpInst(I.getPredicate(), B, A);
const APInt *C;
if (match(Op1, m_APInt(C)))
return new ICmpInst(I.getSwappedPredicate(), A,
ConstantInt::get(Op1->getType(), ~(*C)));
}
// Recognize an unsigned-add overflow check (scalar integers only) and, if
// it can be optimized, replace both the add and this compare.
Instruction *AddI = nullptr;
if (match(&I, m_UAddWithOverflow(m_Value(A), m_Value(B),
m_Instruction(AddI))) &&
isa<IntegerType>(A->getType())) {
Value *Result;
Constant *Overflow;
if (OptimizeOverflowCheck(Instruction::Add, /*Signed*/false, A, B,
*AddI, Result, Overflow)) {
replaceInstUsesWith(*AddI, Result);
return replaceInstUsesWith(I, Overflow);
}
}
// (zext a) * (zext b) --> llvm.umul.with.overflow.
// The multiply may be on either side of the compare.
if (match(Op0, m_Mul(m_ZExt(m_Value(A)), m_ZExt(m_Value(B))))) {
if (Instruction *R = processUMulZExtIdiom(I, Op0, Op1, *this))
return R;
}
if (match(Op1, m_Mul(m_ZExt(m_Value(A)), m_ZExt(m_Value(B))))) {
if (Instruction *R = processUMulZExtIdiom(I, Op1, Op0, *this))
return R;
}
}
if (Instruction *Res = foldICmpEquality(I))
return Res;
// The 'cmpxchg' instruction returns an aggregate containing the old value and
// an i1 which indicates whether or not we successfully did the swap.
//
// Replace comparisons between the old value and the expected value with the
// indicator that 'cmpxchg' returns.
//
// N.B. This transform is only valid when the 'cmpxchg' is not permitted to
// spuriously fail. In those cases, the old value may equal the expected
// value but it is possible for the swap to not occur.
if (I.getPredicate() == ICmpInst::ICMP_EQ)
if (auto *EVI = dyn_cast<ExtractValueInst>(Op0))
if (auto *ACXI = dyn_cast<AtomicCmpXchgInst>(EVI->getAggregateOperand()))
if (EVI->getIndices()[0] == 0 && ACXI->getCompareOperand() == Op1 &&
!ACXI->isWeak())
return ExtractValueInst::Create(ACXI, 1);
{
Value *X;
const APInt *C;
// icmp X+Cst, X
if (match(Op0, m_Add(m_Value(X), m_APInt(C))) && Op1 == X)
return foldICmpAddOpConst(X, *C, I.getPredicate());
// icmp X, X+Cst
if (match(Op1, m_Add(m_Value(X), m_APInt(C))) && Op0 == X)
return foldICmpAddOpConst(X, *C, I.getSwappedPredicate());
}
if (Instruction *Res = foldICmpWithHighBitMask(I, Builder))
return Res;
if (I.getType()->isVectorTy())
if (Instruction *Res = foldVectorCmp(I, Builder))
return Res;
// No fold applied; report the compare itself if the operands were swapped.
return Changed ? &I : nullptr;
}
/// Fold fcmp ([us]itofp x, cst) if possible.
///
/// \param I    The floating-point compare being visited.
/// \param LHSI The uitofp/sitofp instruction feeding the compare; its operand
///             0 is the integer being converted.
/// \param RHSC The constant the converted value is compared against.
/// \returns A replacement for \p I (an integer compare, or the result of
/// replacing \p I with a constant true/false), or nullptr if \p RHSC is not a
/// ConstantFP or the conversion could lose information that affects the
/// comparison.
Instruction *InstCombiner::foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI,
                                                Constant *RHSC) {
  auto *RHSCFP = dyn_cast<ConstantFP>(RHSC);
  if (!RHSCFP)
    return nullptr;
  const APFloat &RHS = RHSCFP->getValueAPF();

  // Get the width of the mantissa.  We don't want to hack on conversions that
  // might lose information from the integer, e.g. "i64 -> float"
  int MantissaWidth = LHSI->getType()->getFPMantissaWidth();
  if (MantissaWidth == -1) return nullptr;  // Unknown.

  IntegerType *IntTy = cast<IntegerType>(LHSI->getOperand(0)->getType());

  bool LHSUnsigned = isa<UIToFPInst>(LHSI);

  if (I.isEquality()) {
    FCmpInst::Predicate P = I.getPredicate();
    bool IsExact = false;
    APSInt RHSCvt(IntTy->getBitWidth(), LHSUnsigned);
    RHS.convertToInteger(RHSCvt, APFloat::rmNearestTiesToEven, &IsExact);

    // If the floating point constant isn't an integer value, we know if we will
    // ever compare equal / not equal to it.
    if (!IsExact) {
      // TODO: Can never be -0.0 and other non-representable values
      APFloat RHSRoundInt(RHS);
      RHSRoundInt.roundToIntegral(APFloat::rmNearestTiesToEven);
      if (RHS.compare(RHSRoundInt) != APFloat::cmpEqual) {
        if (P == FCmpInst::FCMP_OEQ || P == FCmpInst::FCMP_UEQ)
          return replaceInstUsesWith(I, Builder.getFalse());

        assert(P == FCmpInst::FCMP_ONE || P == FCmpInst::FCMP_UNE);
        return replaceInstUsesWith(I, Builder.getTrue());
      }
    }

    // TODO: If the constant is exactly representable, is it always OK to do
    // equality compares as integer?
  }

  // Check to see that the input is converted from an integer type that is small
  // enough that preserves all bits.  TODO: check here for "known" sign bits.
  // This would allow us to handle (sitofp (x >>s 62) to float) if x is i64 f.e.
  unsigned InputSize = IntTy->getScalarSizeInBits();

  // Following test does NOT adjust InputSize downwards for signed inputs,
  // because the most negative value still requires all the mantissa bits
  // to distinguish it from one less than that value.
  if ((int)InputSize > MantissaWidth) {
    // Conversion would lose accuracy. Check if loss can impact comparison.
    int Exp = ilogb(RHS);
    if (Exp == APFloat::IEK_Inf) {
      int MaxExponent = ilogb(APFloat::getLargest(RHS.getSemantics()));
      if (MaxExponent < (int)InputSize - !LHSUnsigned)
        // Conversion could create infinity.
        return nullptr;
    } else {
      // Note that if RHS is zero or NaN, then Exp is negative
      // and first condition is trivially false.
      if (MantissaWidth <= Exp && Exp <= (int)InputSize - !LHSUnsigned)
        // Conversion could affect comparison.
        return nullptr;
    }
  }

  // Otherwise, we can potentially simplify the comparison.  We know that it
  // will always come through as an integer value and we know the constant is
  // not a NAN (it would have been previously simplified).
  assert(!RHS.isNaN() && "NaN comparison not already folded!");

  // Map the FP predicate onto the integer predicate with the signedness of the
  // conversion. The ordered and unordered forms map identically because
  // neither side can be a NaN here.
  ICmpInst::Predicate Pred;
  switch (I.getPredicate()) {
  default: llvm_unreachable("Unexpected predicate!");
  case FCmpInst::FCMP_UEQ:
  case FCmpInst::FCMP_OEQ:
    Pred = ICmpInst::ICMP_EQ;
    break;
  case FCmpInst::FCMP_UGT:
  case FCmpInst::FCMP_OGT:
    Pred = LHSUnsigned ? ICmpInst::ICMP_UGT : ICmpInst::ICMP_SGT;
    break;
  case FCmpInst::FCMP_UGE:
  case FCmpInst::FCMP_OGE:
    Pred = LHSUnsigned ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_SGE;
    break;
  case FCmpInst::FCMP_ULT:
  case FCmpInst::FCMP_OLT:
    Pred = LHSUnsigned ? ICmpInst::ICMP_ULT : ICmpInst::ICMP_SLT;
    break;
  case FCmpInst::FCMP_ULE:
  case FCmpInst::FCMP_OLE:
    Pred = LHSUnsigned ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_SLE;
    break;
  case FCmpInst::FCMP_UNE:
  case FCmpInst::FCMP_ONE:
    Pred = ICmpInst::ICMP_NE;
    break;
  case FCmpInst::FCMP_ORD:
    return replaceInstUsesWith(I, Builder.getTrue());
  case FCmpInst::FCMP_UNO:
    return replaceInstUsesWith(I, Builder.getFalse());
  }

  // Now we know that the APFloat is a normal number, zero or inf.

  // See if the FP constant is too large for the integer.  For example,
  // comparing an i8 to 300.0.
  unsigned IntWidth = IntTy->getScalarSizeInBits();

  if (!LHSUnsigned) {
    // If the RHS value is > SignedMax, fold the comparison.  This handles +INF
    // and large values.
    APFloat SMax(RHS.getSemantics());
    SMax.convertFromAPInt(APInt::getSignedMaxValue(IntWidth), true,
                          APFloat::rmNearestTiesToEven);
    if (SMax.compare(RHS) == APFloat::cmpLessThan) {  // smax < 13123.0
      if (Pred == ICmpInst::ICMP_NE  || Pred == ICmpInst::ICMP_SLT ||
          Pred == ICmpInst::ICMP_SLE)
        return replaceInstUsesWith(I, Builder.getTrue());
      return replaceInstUsesWith(I, Builder.getFalse());
    }
  } else {
    // If the RHS value is > UnsignedMax, fold the comparison. This handles
    // +INF and large values.
    APFloat UMax(RHS.getSemantics());
    UMax.convertFromAPInt(APInt::getMaxValue(IntWidth), false,
                          APFloat::rmNearestTiesToEven);
    if (UMax.compare(RHS) == APFloat::cmpLessThan) {  // umax < 13123.0
      if (Pred == ICmpInst::ICMP_NE  || Pred == ICmpInst::ICMP_ULT ||
          Pred == ICmpInst::ICMP_ULE)
        return replaceInstUsesWith(I, Builder.getTrue());
      return replaceInstUsesWith(I, Builder.getFalse());
    }
  }

  if (!LHSUnsigned) {
    // See if the RHS value is < SignedMin.
    APFloat SMin(RHS.getSemantics());
    SMin.convertFromAPInt(APInt::getSignedMinValue(IntWidth), true,
                          APFloat::rmNearestTiesToEven);
    if (SMin.compare(RHS) == APFloat::cmpGreaterThan) { // smin > 12312.0
      if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SGT ||
          Pred == ICmpInst::ICMP_SGE)
        return replaceInstUsesWith(I, Builder.getTrue());
      return replaceInstUsesWith(I, Builder.getFalse());
    }
  } else {
    // See if the RHS value is < UnsignedMin. The minimum is converted as an
    // unsigned value, matching the UMax conversion above.
    APFloat UMin(RHS.getSemantics());
    UMin.convertFromAPInt(APInt::getMinValue(IntWidth), false,
                          APFloat::rmNearestTiesToEven);
    if (UMin.compare(RHS) == APFloat::cmpGreaterThan) { // umin > 12312.0
      if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_UGT ||
          Pred == ICmpInst::ICMP_UGE)
        return replaceInstUsesWith(I, Builder.getTrue());
      return replaceInstUsesWith(I, Builder.getFalse());
    }
  }

  // Okay, now we know that the FP constant fits in the range [SMIN, SMAX] or
  // [0, UMAX], but it may still be fractional.  See if it is fractional by
  // casting the FP value to the integer value and back, checking for equality.
  // Don't do this for zero, because -0.0 is not fractional.
  Constant *RHSInt = LHSUnsigned
    ? ConstantExpr::getFPToUI(RHSC, IntTy)
    : ConstantExpr::getFPToSI(RHSC, IntTy);
  if (!RHS.isZero()) {
    bool Equal = LHSUnsigned
      ? ConstantExpr::getUIToFP(RHSInt, RHSC->getType()) == RHSC
      : ConstantExpr::getSIToFP(RHSInt, RHSC->getType()) == RHSC;
    if (!Equal) {
      // If we had a comparison against a fractional value, we have to adjust
      // the compare predicate and sometimes the value.  RHSC is rounded towards
      // zero at this point.
      switch (Pred) {
      default: llvm_unreachable("Unexpected integer comparison!");
      case ICmpInst::ICMP_NE:  // (float)int != 4.4   --> true
        return replaceInstUsesWith(I, Builder.getTrue());
      case ICmpInst::ICMP_EQ:  // (float)int == 4.4   --> false
        return replaceInstUsesWith(I, Builder.getFalse());
      case ICmpInst::ICMP_ULE:
        // (float)int <= 4.4   --> int <= 4
        // (float)int <= -4.4  --> false
        if (RHS.isNegative())
          return replaceInstUsesWith(I, Builder.getFalse());
        break;
      case ICmpInst::ICMP_SLE:
        // (float)int <= 4.4   --> int <= 4
        // (float)int <= -4.4  --> int < -4
        if (RHS.isNegative())
          Pred = ICmpInst::ICMP_SLT;
        break;
      case ICmpInst::ICMP_ULT:
        // (float)int < -4.4   --> false
        // (float)int < 4.4    --> int <= 4
        if (RHS.isNegative())
          return replaceInstUsesWith(I, Builder.getFalse());
        Pred = ICmpInst::ICMP_ULE;
        break;
      case ICmpInst::ICMP_SLT:
        // (float)int < -4.4   --> int < -4
        // (float)int < 4.4    --> int <= 4
        if (!RHS.isNegative())
          Pred = ICmpInst::ICMP_SLE;
        break;
      case ICmpInst::ICMP_UGT:
        // (float)int > 4.4    --> int > 4
        // (float)int > -4.4   --> true
        if (RHS.isNegative())
          return replaceInstUsesWith(I, Builder.getTrue());
        break;
      case ICmpInst::ICMP_SGT:
        // (float)int > 4.4    --> int > 4
        // (float)int > -4.4   --> int >= -4
        if (RHS.isNegative())
          Pred = ICmpInst::ICMP_SGE;
        break;
      case ICmpInst::ICMP_UGE:
        // (float)int >= -4.4   --> true
        // (float)int >= 4.4    --> int > 4
        if (RHS.isNegative())
          return replaceInstUsesWith(I, Builder.getTrue());
        Pred = ICmpInst::ICMP_UGT;
        break;
      case ICmpInst::ICMP_SGE:
        // (float)int >= -4.4   --> int >= -4
        // (float)int >= 4.4    --> int > 4
        if (!RHS.isNegative())
          Pred = ICmpInst::ICMP_SGT;
        break;
      }
    }
  }

  // Lower this FP comparison into an appropriate integer version of the
  // comparison.
  return new ICmpInst(Pred, LHSI->getOperand(0), RHSInt);
}
/// Fold (C / X) < 0.0 --> X < 0.0 if possible. Swap predicate if necessary.
///
/// \p I is the compare, \p LHSI is its fdiv operand and \p RHSC is its constant
/// operand. Returns a new compare on the divisor, or nullptr when the fold
/// does not apply.
static Instruction *foldFCmpReciprocalAndZero(FCmpInst &I, Instruction *LHSI,
                                              Constant *RHSC) {
  // With a non-zero constant dividend C and infinities excluded, comparing
  // (C / X) against 0.0 only tests the sign bit of X:
  //   (C / X) < 0.0 --> X < 0.0   (C positive)
  //   (C / X) < 0.0 --> X > 0.0   (C negative, predicate swapped)
  //
  // Reasoning: scale both sides of (C / X) < 0.0 by X * X / C.
  //  - X cannot be zero, otherwise the quotient would violate 'ninf'.
  //  - C is non-zero, and its sign fixes the sign of X * X / C, which in turn
  //    decides whether the predicate has to be swapped.
  // So X * X / C is a non-zero scale factor and the rewrite is valid.

  // Only the four ordered relational predicates are handled.
  switch (I.getPredicate()) {
  case FCmpInst::FCMP_OGT:
  case FCmpInst::FCMP_OLT:
  case FCmpInst::FCMP_OGE:
  case FCmpInst::FCMP_OLE:
    break;
  default:
    return nullptr;
  }

  // The right-hand side has to be a (positive or negative) zero.
  if (!match(RHSC, m_AnyZeroFP()))
    return nullptr;

  // Both the division and the compare must carry the 'ninf' fast-math flag.
  if (!(LHSI->hasNoInfs() && I.hasNoInfs()))
    return nullptr;

  // The dividend must be a constant other than zero (see the reasoning above).
  const APFloat *Dividend;
  if (!match(LHSI->getOperand(0), m_APFloat(Dividend)))
    return nullptr;
  if (Dividend->isZero())
    return nullptr;

  // A negative dividend flips the direction of the comparison.
  FCmpInst::Predicate NewPred =
      Dividend->isNegative() ? I.getSwappedPredicate() : I.getPredicate();
  return new FCmpInst(NewPred, LHSI->getOperand(1), RHSC, "", &I);
}
/// Optimize fabs(X) compared with zero.
///
/// Matches `fcmp pred (fabs X), +0.0` and rewrites \p I in place so that it
/// compares X directly. Returns \p I on success, or nullptr if the pattern or
/// the predicate is not handled.
static Instruction *foldFabsWithFcmpZero(FCmpInst &I) {
  Value *Src;
  const bool IsFabsVsPosZero =
      match(I.getOperand(0), m_Intrinsic<Intrinsic::fabs>(m_Value(Src))) &&
      match(I.getOperand(1), m_PosZeroFP());
  if (!IsFabsVsPosZero)
    return nullptr;

  // Pick the predicate to use once the fabs() has been stripped.
  FCmpInst::Predicate NewPred;
  switch (I.getPredicate()) {
  case FCmpInst::FCMP_UGE:
  case FCmpInst::FCMP_OLT:
    // fabs(X) >= 0.0 --> true
    // fabs(X) <  0.0 --> false
    llvm_unreachable("fcmp should have simplified");

  case FCmpInst::FCMP_OGT:
    // fabs(X) > 0.0 --> X != 0.0
    NewPred = FCmpInst::FCMP_ONE;
    break;

  case FCmpInst::FCMP_UGT:
    // fabs(X) u> 0.0 --> X u!= 0.0
    NewPred = FCmpInst::FCMP_UNE;
    break;

  case FCmpInst::FCMP_OLE:
    // fabs(X) <= 0.0 --> X == 0.0
    NewPred = FCmpInst::FCMP_OEQ;
    break;

  case FCmpInst::FCMP_ULE:
    // fabs(X) u<= 0.0 --> X u== 0.0
    NewPred = FCmpInst::FCMP_UEQ;
    break;

  case FCmpInst::FCMP_OGE:
    // fabs(X) >= 0.0 --> !isnan(X)
    assert(!I.hasNoNaNs() && "fcmp should have simplified");
    NewPred = FCmpInst::FCMP_ORD;
    break;

  case FCmpInst::FCMP_ULT:
    // fabs(X) u< 0.0 --> isnan(X)
    assert(!I.hasNoNaNs() && "fcmp should have simplified");
    NewPred = FCmpInst::FCMP_UNO;
    break;

  case FCmpInst::FCMP_OEQ:
  case FCmpInst::FCMP_UEQ:
  case FCmpInst::FCMP_ONE:
  case FCmpInst::FCMP_UNE:
  case FCmpInst::FCMP_ORD:
  case FCmpInst::FCMP_UNO:
    // fabs() only affects the sign, so these predicates can look through it:
    //   fabs(X) == 0.0   --> X == 0.0
    //   fabs(X) != 0.0   --> X != 0.0
    //   isnan(fabs(X))   --> isnan(X)
    //   !isnan(fabs(X))  --> !isnan(X)
    NewPred = I.getPredicate();
    break;

  default:
    return nullptr;
  }

  // Update the existing compare in place: new predicate, fabs() operand
  // replaced by its source value.
  I.setPredicate(NewPred);
  I.setOperand(0, Src);
  return &I;
}
/// Visit an fcmp instruction and try a sequence of canonicalizations and folds.
///
/// Returns the instruction produced by the first fold that applies (a newly
/// created instruction, the result of replaceInstUsesWith, or &I when the
/// compare was updated in place). If no fold applies, returns &I when the
/// operands were swapped and nullptr otherwise. Note that the min/max early
/// exit below returns nullptr unconditionally.
Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
bool Changed = false;
/// Orders the operands of the compare so that they are listed from most
/// complex to least complex. This puts constants before unary operators,
/// before binary operators.
if (getComplexity(I.getOperand(0)) < getComplexity(I.getOperand(1))) {
I.swapOperands();
Changed = true;
}
// Read the predicate and operands only after the possible swap above.
const CmpInst::Predicate Pred = I.getPredicate();
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
// If the whole compare simplifies to an existing value, use that directly.
if (Value *V = SimplifyFCmpInst(Pred, Op0, Op1, I.getFastMathFlags(),
SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
// Simplify 'fcmp pred X, X'
Type *OpType = Op0->getType();
assert(OpType == Op1->getType() && "fcmp with different-typed operands?");
if (Op0 == Op1) {
switch (Pred) {
default: break;
case FCmpInst::FCMP_UNO: // True if unordered: isnan(X) | isnan(Y)
case FCmpInst::FCMP_ULT: // True if unordered or less than
case FCmpInst::FCMP_UGT: // True if unordered or greater than
case FCmpInst::FCMP_UNE: // True if unordered or not equal
// Canonicalize these to be 'fcmp uno %X, 0.0'.
I.setPredicate(FCmpInst::FCMP_UNO);
I.setOperand(1, Constant::getNullValue(OpType));
return &I;
case FCmpInst::FCMP_ORD: // True if ordered (no nans)
case FCmpInst::FCMP_OEQ: // True if ordered and equal
case FCmpInst::FCMP_OGE: // True if ordered and greater than or equal
case FCmpInst::FCMP_OLE: // True if ordered and less than or equal
// Canonicalize these to be 'fcmp ord %X, 0.0'.
I.setPredicate(FCmpInst::FCMP_ORD);
I.setOperand(1, Constant::getNullValue(OpType));
return &I;
}
}
// If we're just checking for a NaN (ORD/UNO) and have a non-NaN operand,
// then canonicalize the operand to 0.0.
// The !m_PosZeroFP() guards skip operands that are already +0.0, so this
// rewrite is not reported again for an already-canonical compare.
if (Pred == CmpInst::FCMP_ORD || Pred == CmpInst::FCMP_UNO) {
if (!match(Op0, m_PosZeroFP()) && isKnownNeverNaN(Op0, &TLI)) {
I.setOperand(0, ConstantFP::getNullValue(OpType));
return &I;
}
if (!match(Op1, m_PosZeroFP()) && isKnownNeverNaN(Op1, &TLI)) {
I.setOperand(1, ConstantFP::getNullValue(OpType));
return &I;
}
}
// fcmp pred (fneg X), (fneg Y) -> fcmp swap(pred) X, Y
Value *X, *Y;
if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_FNeg(m_Value(Y))))
return new FCmpInst(I.getSwappedPredicate(), X, Y, "", &I);
// Test if the FCmpInst instruction is used exclusively by a select as
// part of a minimum or maximum operation. If so, refrain from doing
// any other folding. This helps out other analyses which understand
// non-obfuscated minimum and maximum idioms, such as ScalarEvolution
// and CodeGen. And in this case, at least one of the comparison
// operands has at least one user besides the compare (the select),
// which would often largely negate the benefit of folding anyway.
if (I.hasOneUse())
if (SelectInst *SI = dyn_cast<SelectInst>(I.user_back())) {
Value *A, *B;
SelectPatternResult SPR = matchSelectPattern(SI, A, B);
if (SPR.Flavor != SPF_UNKNOWN)
return nullptr;
}
// The sign of 0.0 is ignored by fcmp, so canonicalize to +0.0:
// fcmp Pred X, -0.0 --> fcmp Pred X, 0.0
if (match(Op1, m_AnyZeroFP()) && !match(Op1, m_PosZeroFP())) {
I.setOperand(1, ConstantFP::getNullValue(OpType));
return &I;
}
// Handle fcmp with instruction LHS and constant RHS.
Instruction *LHSI;
Constant *RHSC;
if (match(Op0, m_Instruction(LHSI)) && match(Op1, m_Constant(RHSC))) {
// Dispatch on the kind of instruction feeding the left-hand side. Each case
// falls out of the switch when its helper declines the fold.
switch (LHSI->getOpcode()) {
case Instruction::PHI:
// Only fold fcmp into the PHI if the phi and fcmp are in the same
// block. If in the same block, we're encouraging jump threading. If
// not, we are just pessimizing the code by making an i1 phi.
if (LHSI->getParent() == I.getParent())
if (Instruction *NV = foldOpIntoPhi(I, cast<PHINode>(LHSI)))
return NV;
break;
case Instruction::SIToFP:
case Instruction::UIToFP:
if (Instruction *NV = foldFCmpIntToFPConst(I, LHSI, RHSC))
return NV;
break;
case Instruction::FDiv:
if (Instruction *NV = foldFCmpReciprocalAndZero(I, LHSI, RHSC))
return NV;
break;
case Instruction::Load:
// Only consider non-volatile loads whose address is a GEP directly into a
// constant global variable with a definitive initializer.
if (auto *GEP = dyn_cast<GetElementPtrInst>(LHSI->getOperand(0)))
if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
!cast<LoadInst>(LHSI)->isVolatile())
if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, I))
return Res;
break;
}
}
if (Instruction *R = foldFabsWithFcmpZero(I))
return R;
if (match(Op0, m_FNeg(m_Value(X)))) {
// fcmp pred (fneg X), C --> fcmp swap(pred) X, -C
Constant *C;
if (match(Op1, m_Constant(C))) {
Constant *NegC = ConstantExpr::getFNeg(C);
return new FCmpInst(I.getSwappedPredicate(), X, NegC, "", &I);
}
}
if (match(Op0, m_FPExt(m_Value(X)))) {
// fcmp (fpext X), (fpext Y) -> fcmp X, Y
if (match(Op1, m_FPExt(m_Value(Y))) && X->getType() == Y->getType())
return new FCmpInst(Pred, X, Y, "", &I);
// fcmp (fpext X), C -> fcmp X, (fptrunc C) if fptrunc is lossless
const APFloat *C;
if (match(Op1, m_APFloat(C))) {
// Convert a copy of the constant to the semantics of X's scalar type and
// record whether that conversion lost information.
const fltSemantics &FPSem =
X->getType()->getScalarType()->getFltSemantics();
bool Lossy;
APFloat TruncC = *C;
TruncC.convert(FPSem, APFloat::rmNearestTiesToEven, &Lossy);
// Avoid lossy conversions and denormals.
// Zero is a special case that's OK to convert.
// Fabs is TruncC with its sign cleared, so its magnitude can be compared
// against the smallest normalized value of the narrow type.
APFloat Fabs = TruncC;
Fabs.clearSign();
if (!Lossy &&
((Fabs.compare(APFloat::getSmallestNormalized(FPSem)) !=
APFloat::cmpLessThan) || Fabs.isZero())) {
Constant *NewC = ConstantFP::get(X->getType(), TruncC);
return new FCmpInst(Pred, X, NewC, "", &I);
}
}
}
// Finally, try the vector-specific compare folds.
if (I.getType()->isVectorTy())
if (Instruction *Res = foldVectorCmp(I, Builder))
return Res;
// No fold applied; still report the compare if its operands were swapped.
return Changed ? &I : nullptr;
}
Index: vendor/llvm/dist-release_90/lib/Transforms/Scalar/DivRemPairs.cpp
===================================================================
--- vendor/llvm/dist-release_90/lib/Transforms/Scalar/DivRemPairs.cpp (revision 351302)
+++ vendor/llvm/dist-release_90/lib/Transforms/Scalar/DivRemPairs.cpp (revision 351303)
@@ -1,216 +1,278 @@
//===- DivRemPairs.cpp - Hoist/decompose division and remainder -*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass hoists and/or decomposes integer division and remainder
// instructions to enable CFG improvements and better codegen.
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/DivRemPairs.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/Pass.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BypassSlowDivision.h"
+
using namespace llvm;
// Identifier for this pass; the same string is used as the pass name in the
// INITIALIZE_PASS registration further down in this file.
#define DEBUG_TYPE "div-rem-pairs"
// Counters updated by this pass as it finds and transforms div/rem pairs.
STATISTIC(NumPairs, "Number of div/rem pairs");
STATISTIC(NumHoisted, "Number of instructions hoisted");
STATISTIC(NumDecomposed, "Number of instructions decomposed");
// Debug counter consulted through DebugCounter::shouldExecute(DRPCounter)
// before each pair is transformed.
DEBUG_COUNTER(DRPCounter, "div-rem-pairs-transform",
"Controls transformations in div-rem-pairs pass");
-/// Find matching pairs of integer div/rem ops (they have the same numerator,
-/// denominator, and signedness). If they exist in different basic blocks, bring
-/// them together by hoisting or replace the common division operation that is
-/// implicit in the remainder:
-/// X % Y <--> X - ((X / Y) * Y).
-///
-/// We can largely ignore the normal safety and cost constraints on speculation
-/// of these ops when we find a matching pair. This is because we are already
-/// guaranteed that any exceptions and most cost are already incurred by the
-/// first member of the pair.
-///
-/// Note: This transform could be an oddball enhancement to EarlyCSE, GVN, or
-/// SimplifyCFG, but it's split off on its own because it's different enough
-/// that it doesn't quite match the stated objectives of those passes.
-static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI,
- const DominatorTree &DT) {
- bool Changed = false;
+/// A thin wrapper to store two values that we matched as div-rem pair.
+/// We want this extra indirection to avoid dealing with RAUW'ing the map keys.
+struct DivRemPairWorklistEntry {
+ /// The actual udiv/sdiv instruction. Source of truth.
+ AssertingVH<Instruction> DivInst;
+ /// The instruction that we have matched as a remainder instruction.
+ /// Should only be used as Value, don't introspect it.
+ AssertingVH<Instruction> RemInst;
+
+ DivRemPairWorklistEntry(Instruction *DivInst_, Instruction *RemInst_)
+ : DivInst(DivInst_), RemInst(RemInst_) {
+ assert((DivInst->getOpcode() == Instruction::UDiv ||
+ DivInst->getOpcode() == Instruction::SDiv) &&
+ "Not a division.");
+ assert(DivInst->getType() == RemInst->getType() && "Types should match.");
+ // We can't check anything else about remainder instruction,
+ // it's not strictly required to be a urem/srem.
+ }
+
+ /// The type for this pair, identical for both the div and rem.
+ Type *getType() const { return DivInst->getType(); }
+
+ /// Is this pair signed or unsigned?
+ bool isSigned() const { return DivInst->getOpcode() == Instruction::SDiv; }
+
+ /// In this pair, what are the dividend and divisor?
+ Value *getDividend() const { return DivInst->getOperand(0); }
+ Value *getDivisor() const { return DivInst->getOperand(1); }
+};
+using DivRemWorklistTy = SmallVector<DivRemPairWorklistEntry, 4>;
+
+/// Find matching pairs of integer div/rem ops (they have the same numerator,
+/// denominator, and signedness). Place those pairs into a worklist for further
+/// processing. This indirection is needed because we have to use AssertingVH<>
+/// because we will be doing RAUW, and if one of the rem instructions we change
+/// happens to be an input to another div/rem in the maps, we'd have problems.
+static DivRemWorklistTy getWorklist(Function &F) {
// Insert all divide and remainder instructions into maps keyed by their
// operands and opcode (signed or unsigned).
DenseMap<DivRemMapKey, Instruction *> DivMap;
// Use a MapVector for RemMap so that instructions are moved/inserted in a
// deterministic order.
MapVector<DivRemMapKey, Instruction *> RemMap;
for (auto &BB : F) {
for (auto &I : BB) {
if (I.getOpcode() == Instruction::SDiv)
DivMap[DivRemMapKey(true, I.getOperand(0), I.getOperand(1))] = &I;
else if (I.getOpcode() == Instruction::UDiv)
DivMap[DivRemMapKey(false, I.getOperand(0), I.getOperand(1))] = &I;
else if (I.getOpcode() == Instruction::SRem)
RemMap[DivRemMapKey(true, I.getOperand(0), I.getOperand(1))] = &I;
else if (I.getOpcode() == Instruction::URem)
RemMap[DivRemMapKey(false, I.getOperand(0), I.getOperand(1))] = &I;
}
}
+ // We'll accumulate the matching pairs of div-rem instructions here.
+ DivRemWorklistTy Worklist;
+
// We can iterate over either map because we are only looking for matched
// pairs. Choose remainders for efficiency because they are usually even more
// rare than division.
for (auto &RemPair : RemMap) {
// Find the matching division instruction from the division map.
Instruction *DivInst = DivMap[RemPair.first];
if (!DivInst)
continue;
- // We have a matching pair of div/rem instructions. If one dominates the
- // other, hoist and/or replace one.
+ // We have a matching pair of div/rem instructions.
NumPairs++;
Instruction *RemInst = RemPair.second;
- bool IsSigned = DivInst->getOpcode() == Instruction::SDiv;
- bool HasDivRemOp = TTI.hasDivRemOp(DivInst->getType(), IsSigned);
+ // Place it in the worklist.
+ Worklist.emplace_back(DivInst, RemInst);
+ }
+
+ return Worklist;
+}
+
+/// Find matching pairs of integer div/rem ops (they have the same numerator,
+/// denominator, and signedness). If they exist in different basic blocks, bring
+/// them together by hoisting or replace the common division operation that is
+/// implicit in the remainder:
+/// X % Y <--> X - ((X / Y) * Y).
+///
+/// We can largely ignore the normal safety and cost constraints on speculation
+/// of these ops when we find a matching pair. This is because we are already
+/// guaranteed that any exceptions and most cost are already incurred by the
+/// first member of the pair.
+///
+/// Note: This transform could be an oddball enhancement to EarlyCSE, GVN, or
+/// SimplifyCFG, but it's split off on its own because it's different enough
+/// that it doesn't quite match the stated objectives of those passes.
+static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI,
+ const DominatorTree &DT) {
+ bool Changed = false;
+
+ // Get the matching pairs of div-rem instructions. We want this extra
+ // indirection to avoid dealing with having to RAUW the keys of the maps.
+ DivRemWorklistTy Worklist = getWorklist(F);
+
+ // Process each entry in the worklist.
+ for (DivRemPairWorklistEntry &E : Worklist) {
+ bool HasDivRemOp = TTI.hasDivRemOp(E.getType(), E.isSigned());
+
+ auto &DivInst = E.DivInst;
+ auto &RemInst = E.RemInst;
+
// If the target supports div+rem and the instructions are in the same block
// already, there's nothing to do. The backend should handle this. If the
// target does not support div+rem, then we will decompose the rem.
if (HasDivRemOp && RemInst->getParent() == DivInst->getParent())
continue;
bool DivDominates = DT.dominates(DivInst, RemInst);
if (!DivDominates && !DT.dominates(RemInst, DivInst))
continue;
if (!DebugCounter::shouldExecute(DRPCounter))
continue;
if (HasDivRemOp) {
// The target has a single div/rem operation. Hoist the lower instruction
// to make the matched pair visible to the backend.
if (DivDominates)
RemInst->moveAfter(DivInst);
else
DivInst->moveAfter(RemInst);
NumHoisted++;
} else {
// The target does not have a single div/rem operation. Decompose the
// remainder calculation as:
// X % Y --> X - ((X / Y) * Y).
- Value *X = RemInst->getOperand(0);
- Value *Y = RemInst->getOperand(1);
+ Value *X = E.getDividend();
+ Value *Y = E.getDivisor();
Instruction *Mul = BinaryOperator::CreateMul(DivInst, Y);
Instruction *Sub = BinaryOperator::CreateSub(X, Mul);
// If the remainder dominates, then hoist the division up to that block:
//
// bb1:
// %rem = srem %x, %y
// bb2:
// %div = sdiv %x, %y
// -->
// bb1:
// %div = sdiv %x, %y
// %mul = mul %div, %y
// %rem = sub %x, %mul
//
// If the division dominates, it's already in the right place. The mul+sub
// will be in a different block because we don't assume that they are
// cheap to speculatively execute:
//
// bb1:
// %div = sdiv %x, %y
// bb2:
// %rem = srem %x, %y
// -->
// bb1:
// %div = sdiv %x, %y
// bb2:
// %mul = mul %div, %y
// %rem = sub %x, %mul
//
// If the div and rem are in the same block, we do the same transform,
// but any code movement would be within the same block.
if (!DivDominates)
DivInst->moveBefore(RemInst);
Mul->insertAfter(RemInst);
Sub->insertAfter(Mul);
// Now kill the explicit remainder. We have replaced it with:
// (sub X, (mul (div X, Y), Y)
- RemInst->replaceAllUsesWith(Sub);
- RemInst->eraseFromParent();
+ Sub->setName(RemInst->getName() + ".decomposed");
+ Instruction *OrigRemInst = RemInst;
+ // Update AssertingVH<> with new instruction so it doesn't assert.
+ RemInst = Sub;
+ // And replace the original instruction with the new one.
+ OrigRemInst->replaceAllUsesWith(Sub);
+ OrigRemInst->eraseFromParent();
NumDecomposed++;
}
Changed = true;
}
return Changed;
}
// Pass manager boilerplate below here.
namespace {
/// Legacy pass manager wrapper that runs optimizeDivRem on each function.
struct DivRemPairsLegacyPass : public FunctionPass {
  static char ID;

  /// Constructs the pass and runs its registry initializer.
  DivRemPairsLegacyPass() : FunctionPass(ID) {
    initializeDivRemPairsLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  /// Declares the analyses this pass requires and the ones it preserves.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Required inputs: dominator tree and target transform info.
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();

    // Preserved: the CFG, the dominator tree and GlobalsAA.
    AU.setPreservesCFG();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();

    FunctionPass::getAnalysisUsage(AU);
  }

  /// Runs the div/rem pairing on \p F unless skipFunction says to skip it.
  /// Returns the result of optimizeDivRem, or false when the function is
  /// skipped.
  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    auto &TargetInfo = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto &DomTree = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    return optimizeDivRem(F, TargetInfo, DomTree);
  }
};
-}
+} // namespace
// Definition of the static pass ID declared in DivRemPairsLegacyPass.
char DivRemPairsLegacyPass::ID = 0;
// Register the legacy pass under the name "div-rem-pairs" and declare its
// dependency on DominatorTreeWrapperPass.
INITIALIZE_PASS_BEGIN(DivRemPairsLegacyPass, "div-rem-pairs",
"Hoist/decompose integer division and remainder", false,
false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(DivRemPairsLegacyPass, "div-rem-pairs",
"Hoist/decompose integer division and remainder", false,
false)
/// Factory for the legacy pass manager: returns a newly allocated
/// DivRemPairsLegacyPass.
FunctionPass *llvm::createDivRemPairsPass() {
  FunctionPass *LegacyPass = new DivRemPairsLegacyPass();
  return LegacyPass;
}
/// New pass manager entry point: runs optimizeDivRem on \p F using the
/// TargetIRAnalysis and DominatorTreeAnalysis results from \p FAM.
///
/// Returns PreservedAnalyses::all() when nothing changed; otherwise reports
/// the CFG analyses and GlobalsAA as preserved.
PreservedAnalyses DivRemPairsPass::run(Function &F,
                                       FunctionAnalysisManager &FAM) {
  TargetTransformInfo &TargetInfo = FAM.getResult<TargetIRAnalysis>(F);
  DominatorTree &DomTree = FAM.getResult<DominatorTreeAnalysis>(F);

  const bool MadeChange = optimizeDivRem(F, TargetInfo, DomTree);
  if (!MadeChange)
    return PreservedAnalyses::all();

  // TODO: This pass just hoists/replaces math ops - all analyses are preserved?
  PreservedAnalyses Preserved;
  Preserved.preserveSet<CFGAnalyses>();
  Preserved.preserve<GlobalsAA>();
  return Preserved;
}
Index: vendor/llvm/dist-release_90/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
===================================================================
--- vendor/llvm/dist-release_90/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp (revision 351302)
+++ vendor/llvm/dist-release_90/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp (revision 351303)
@@ -1,827 +1,829 @@
//===- SpeculateAroundPHIs.cpp --------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/SpeculateAroundPHIs.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
using namespace llvm;
// Identifier string for this pass.
#define DEBUG_TYPE "spec-phis"
// Counters declared for this pass; each description string states what is
// counted.
STATISTIC(NumPHIsSpeculated, "Number of PHI nodes we speculated around");
STATISTIC(NumEdgesSplit,
"Number of critical edges which were split for speculation");
STATISTIC(NumSpeculatedInstructions,
"Number of instructions we speculated around the PHI nodes");
STATISTIC(NumNewRedundantInstructions,
"Number of new, redundant instructions inserted");
/// Check whether speculating the users of a PHI node around the PHI
/// will be safe.
///
/// This checks both that all of the users are safe and also that all of their
/// operands are either recursively safe or already available along an incoming
/// edge to the PHI.
///
/// This routine caches both all the safe nodes explored in `PotentialSpecSet`
/// and the chain of nodes that definitively reach any unsafe node in
/// `UnsafeSet`. By preserving these between repeated calls to this routine for
/// PHIs in the same basic block, the exploration here can be reused. However,
/// these caches must not be reused for PHIs in a different basic block as they
/// reflect what is available along incoming edges.
///
/// Returns true if every user of \p PN and every transitively visited operand
/// passed the checks; returns false at the first unsafe user or operand found.
/// \p DT is used to test whether an operand's block dominates the PHI's block.
static bool
isSafeToSpeculatePHIUsers(PHINode &PN, DominatorTree &DT,
SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
SmallPtrSetImpl<Instruction *> &UnsafeSet) {
auto *PhiBB = PN.getParent();
// Instructions pushed onto the DFS so far, across all users of this PHI.
SmallPtrSet<Instruction *, 4> Visited;
// Explicit DFS stack: each entry is an instruction plus the iterator to the
// next operand of it that still has to be examined.
SmallVector<std::pair<Instruction *, User::value_op_iterator>, 16> DFSStack;
// Walk each user of the PHI node.
for (Use &U : PN.uses()) {
auto *UI = cast<Instruction>(U.getUser());
// Ensure the use post-dominates the PHI node. This ensures that, in the
// absence of unwinding, the use will actually be reached.
// FIXME: We use a blunt hammer of requiring them to be in the same basic
// block. We should consider using actual post-dominance here in the
// future.
if (UI->getParent() != PhiBB) {
LLVM_DEBUG(dbgs() << " Unsafe: use in a different BB: " << *UI << "\n");
return false;
}
// Reject call sites that are convergent or that cannot be duplicated.
if (auto CS = ImmutableCallSite(UI)) {
if (CS.isConvergent() || CS.cannotDuplicate()) {
LLVM_DEBUG(dbgs() << " Unsafe: convergent "
"callsite cannot de duplicated: " << *UI << '\n');
return false;
}
}
// FIXME: This check is much too conservative. We're not going to move these
// instructions onto new dynamic paths through the program unless there is
// a call instruction between the use and the PHI node. And memory isn't
// changing unless there is a store in that same sequence. We should
// probably change this to do at least a limited scan of the intervening
// instructions and allow handling stores in easily proven safe cases.
if (mayBeMemoryDependent(*UI)) {
LLVM_DEBUG(dbgs() << " Unsafe: can't speculate use: " << *UI << "\n");
return false;
}
// Now do a depth-first search of everything these users depend on to make
// sure they are transitively safe. This is a depth-first search, but we
// check nodes in preorder to minimize the amount of checking.
Visited.insert(UI);
DFSStack.push_back({UI, UI->value_op_begin()});
do {
// From here on UI is reused as the current node of the DFS; it is
// overwritten with whatever entry is popped from the stack.
User::value_op_iterator OpIt;
std::tie(UI, OpIt) = DFSStack.pop_back_val();
while (OpIt != UI->value_op_end()) {
auto *OpI = dyn_cast<Instruction>(*OpIt);
// Increment to the next operand for whenever we continue.
++OpIt;
// No need to visit non-instructions, which can't form dependencies.
if (!OpI)
continue;
// Now do the main pre-order checks that this operand is a viable
// dependency of something we want to speculate.
// First do a few checks for instructions that won't require
// speculation at all because they are trivially available on the
// incoming edge (either through dominance or through an incoming value
// to a PHI).
//
// The cases in the current block will be trivially dominated by the
// edge.
auto *ParentBB = OpI->getParent();
if (ParentBB == PhiBB) {
if (isa<PHINode>(OpI)) {
// We can trivially map through phi nodes in the same block.
continue;
}
} else if (DT.dominates(ParentBB, PhiBB)) {
// Instructions from dominating blocks are already available.
continue;
}
// Once we know that we're considering speculating the operand, check
// if we've already explored this subgraph and found it to be safe.
if (PotentialSpecSet.count(OpI))
continue;
// If we've already explored this subgraph and found it unsafe, bail.
// If when we directly test whether this is safe it fails, bail.
// Reaching this point with ParentBB != PhiBB means the operand lives in
// another block that does not dominate PhiBB (dominating blocks were
// skipped above), so it is treated as unsafe as well.
if (UnsafeSet.count(OpI) || ParentBB != PhiBB ||
mayBeMemoryDependent(*OpI)) {
LLVM_DEBUG(dbgs() << " Unsafe: can't speculate transitive use: "
<< *OpI << "\n");
// Record the stack of instructions which reach this node as unsafe
// so we prune subsequent searches.
UnsafeSet.insert(OpI);
for (auto &StackPair : DFSStack) {
Instruction *I = StackPair.first;
UnsafeSet.insert(I);
}
return false;
}
// Skip any operands we're already recursively checking.
if (!Visited.insert(OpI).second)
continue;
// Push onto the stack and descend. We can directly continue this
// loop when ascending.
// OpIt has already been advanced past OpI, so when this entry is popped
// again the parent resumes with its next operand.
DFSStack.push_back({UI, OpIt});
UI = OpI;
OpIt = OpI->value_op_begin();
}
// This node and all its operands are safe. Go ahead and cache that for
// reuse later.
PotentialSpecSet.insert(UI);
// Continue with the next node on the stack.
} while (!DFSStack.empty());
}
#ifndef NDEBUG
// Every visited operand should have been marked as safe for speculation at
// this point. Verify this and return success.
for (auto *I : Visited)
assert(PotentialSpecSet.count(I) &&
"Failed to mark a visited instruction as safe!");
#endif
return true;
}
/// Check whether, in isolation, a given PHI node is both safe and profitable
/// to speculate users around.
///
/// This handles checking whether there are any constant operands to a PHI
/// which could represent a useful speculation candidate, whether the users of
/// the PHI are safe to speculate including all their transitive dependencies,
/// and whether after speculation there will be some cost savings (profit) to
/// folding the operands into the users of the PHI node. Returns true if both
/// safe and profitable with relevant cost savings updated in the map and with
/// an update to the `PotentialSpecSet`. Returns false if either safety or
/// profitability are absent. Some new entries may be made to the
/// `PotentialSpecSet` even when this routine returns false, but they remain
/// conservatively correct.
///
/// The profitability check here is a local one, but it checks this in an
/// interesting way. Beyond checking that the total cost of materializing the
/// constants will be less than the cost of folding them into their users, it
/// also checks that no one incoming constant will have a higher cost when
/// folded into its users rather than materialized. This higher cost could
/// result in a dynamic *path* that is more expensive even when the total cost
/// is lower. Currently, all of the interesting cases where this optimization
/// should fire are ones where it is a no-loss operation in this sense. If we
/// ever want to be more aggressive here, we would need to balance the
/// different incoming edges' cost by looking at their respective
/// probabilities.
static bool isSafeAndProfitableToSpeculateAroundPHI(
PHINode &PN, SmallDenseMap<PHINode *, int, 16> &CostSavingsMap,
SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
SmallPtrSetImpl<Instruction *> &UnsafeSet, DominatorTree &DT,
TargetTransformInfo &TTI) {
// First see whether there is any cost savings to speculating around this
// PHI, and build up a map of the constant inputs to how many times they
// occur.
bool NonFreeMat = false;
struct CostsAndCount {
int MatCost = TargetTransformInfo::TCC_Free;
int FoldedCost = TargetTransformInfo::TCC_Free;
int Count = 0;
};
SmallDenseMap<ConstantInt *, CostsAndCount, 16> CostsAndCounts;
SmallPtrSet<BasicBlock *, 16> IncomingConstantBlocks;
for (int i : llvm::seq<int>(0, PN.getNumIncomingValues())) {
auto *IncomingC = dyn_cast<ConstantInt>(PN.getIncomingValue(i));
if (!IncomingC)
continue;
// Only visit each incoming edge with a constant input once.
if (!IncomingConstantBlocks.insert(PN.getIncomingBlock(i)).second)
continue;
auto InsertResult = CostsAndCounts.insert({IncomingC, {}});
// Count how many edges share a given incoming costant.
++InsertResult.first->second.Count;
// Only compute the cost the first time we see a particular constant.
if (!InsertResult.second)
continue;
int &MatCost = InsertResult.first->second.MatCost;
MatCost = TTI.getIntImmCost(IncomingC->getValue(), IncomingC->getType());
NonFreeMat |= MatCost != TTI.TCC_Free;
}
if (!NonFreeMat) {
LLVM_DEBUG(dbgs() << " Free: " << PN << "\n");
// No profit in free materialization.
return false;
}
// Now check that the uses of this PHI can actually be speculated,
// otherwise we'll still have to materialize the PHI value.
if (!isSafeToSpeculatePHIUsers(PN, DT, PotentialSpecSet, UnsafeSet)) {
LLVM_DEBUG(dbgs() << " Unsafe PHI: " << PN << "\n");
return false;
}
// Compute how much (if any) savings are available by speculating around this
// PHI.
for (Use &U : PN.uses()) {
auto *UserI = cast<Instruction>(U.getUser());
// Now check whether there is any savings to folding the incoming constants
// into this use.
unsigned Idx = U.getOperandNo();
// If we have a binary operator that is commutative, an actual constant
// operand would end up on the RHS, so pretend the use of the PHI is on the
// RHS.
//
// Technically, this is a bit weird if *both* operands are PHIs we're
// speculating. But if that is the case, giving an "optimistic" cost isn't
// a bad thing because after speculation it will constant fold. And
// moreover, such cases should likely have been constant folded already by
// some other pass, so we shouldn't worry about "modeling" them terribly
// accurately here. Similarly, if the other operand is a constant, it still
// seems fine to be "optimistic" in our cost modeling, because when the
// incoming operand from the PHI node is also a constant, we will end up
// constant folding.
if (UserI->isBinaryOp() && UserI->isCommutative() && Idx != 1)
// Assume we will commute the constant to the RHS to be canonical.
Idx = 1;
// Get the intrinsic ID if this user is an intrinsic.
Intrinsic::ID IID = Intrinsic::not_intrinsic;
if (auto *UserII = dyn_cast<IntrinsicInst>(UserI))
IID = UserII->getIntrinsicID();
for (auto &IncomingConstantAndCostsAndCount : CostsAndCounts) {
ConstantInt *IncomingC = IncomingConstantAndCostsAndCount.first;
int MatCost = IncomingConstantAndCostsAndCount.second.MatCost;
int &FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost;
if (IID)
FoldedCost += TTI.getIntImmCost(IID, Idx, IncomingC->getValue(),
IncomingC->getType());
else
FoldedCost +=
TTI.getIntImmCost(UserI->getOpcode(), Idx, IncomingC->getValue(),
IncomingC->getType());
// If we accumulate more folded cost for this incoming constant than
// materialized cost, then we'll regress any edge with this constant so
// just bail. We're only interested in cases where folding the incoming
// constants is at least break-even on all paths.
if (FoldedCost > MatCost) {
LLVM_DEBUG(dbgs() << " Not profitable to fold imm: " << *IncomingC
<< "\n"
" Materializing cost: "
<< MatCost
<< "\n"
" Accumulated folded cost: "
<< FoldedCost << "\n");
return false;
}
}
}
// Compute the total cost savings afforded by this PHI node.
int TotalMatCost = TTI.TCC_Free, TotalFoldedCost = TTI.TCC_Free;
for (auto IncomingConstantAndCostsAndCount : CostsAndCounts) {
int MatCost = IncomingConstantAndCostsAndCount.second.MatCost;
int FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost;
int Count = IncomingConstantAndCostsAndCount.second.Count;
TotalMatCost += MatCost * Count;
TotalFoldedCost += FoldedCost * Count;
}
assert(TotalFoldedCost <= TotalMatCost && "If each constant's folded cost is "
"less that its materialized cost, "
"the sum must be as well.");
LLVM_DEBUG(dbgs() << " Cost savings " << (TotalMatCost - TotalFoldedCost)
<< ": " << PN << "\n");
CostSavingsMap[&PN] = TotalMatCost - TotalFoldedCost;
return true;
}
/// Walk the operand graph above the users of a group of PHI nodes and invoke
/// a callback on every not-yet-visited instruction, in post-order.
///
/// Each direct user of each PHI in \p PNs acts as a root. From a root the walk
/// climbs "up" through instruction operands depth-first, so an instruction is
/// handed to \p Visit only after all of its unvisited instruction operands
/// have been handed over. The PHIs are expected to live in one basic block so
/// that a single walk covers their collective users without repeating any
/// subgraph.
///
/// \param PNs       The PHI nodes whose users seed the walk.
/// \param IsVisited Cheap, idempotent predicate used to prune the walk; both
///                  callers in this file make it report true for anything
///                  previously passed to \p Visit.
/// \param Visit     Callback run exactly once per reached instruction.
template <typename IsVisitedT, typename VisitT>
static void visitPHIUsersAndDepsInPostOrder(ArrayRef<PHINode *> PNs,
                                            IsVisitedT IsVisited,
                                            VisitT Visit) {
  // A suspended instruction together with the next operand to examine.
  using Frame = std::pair<Instruction *, User::value_op_iterator>;
  SmallVector<Frame, 16> PendingFrames;

  for (auto *PN : PNs) {
    for (Use &PHIUse : PN->uses()) {
      auto *RootI = cast<Instruction>(PHIUse.getUser());
      // Roots that were reached through an earlier root need no new walk.
      if (IsVisited(RootI))
        continue;

      // Seed the stack with this root; it drains completely before we move
      // on to the next use.
      PendingFrames.push_back({RootI, RootI->value_op_begin()});
      while (!PendingFrames.empty()) {
        Frame Top = PendingFrames.pop_back_val();
        Instruction *CurI = Top.first;
        User::value_op_iterator NextOp = Top.second;

        while (NextOp != CurI->value_op_end()) {
          auto *OperandI = dyn_cast<Instruction>(*NextOp);
          // Step past this operand first so that a suspended frame resumes
          // at the following one.
          ++NextOp;
          // Only unvisited instructions are descended into; everything else
          // (non-instructions, already handled nodes) is skipped.
          if (OperandI && !IsVisited(OperandI)) {
            PendingFrames.push_back({CurI, NextOp});
            CurI = OperandI;
            NextOp = OperandI->value_op_begin();
          }
        }

        // Every operand has been dealt with, so this node is next in
        // post-order.
        assert(!IsVisited(CurI) && "Should not have already visited a node!");
        Visit(CurI);
      }
    }
  }
}
/// Find profitable PHIs to speculate.
///
/// For a PHI node to be profitable, we need the cost of speculating its users
/// (and their dependencies) to not exceed the savings of folding the PHI's
/// constant operands into the speculated users.
///
/// Computing this is surprisingly challenging. Because users of two different
/// PHI nodes can depend on each other or on common other instructions, it may
/// be profitable to speculate two PHI nodes together even though neither one
/// in isolation is profitable. The straightforward way to find all the
/// profitable PHIs would be to check each combination of PHIs' cost, but this
/// is exponential in complexity.
///
/// Even if we assume that we only care about cases where we can consider each
/// PHI node in isolation (rather than considering cases where none are
/// profitable in isolation but some subset are profitable as a set), we still
/// have a challenge. The obvious way to find all individually profitable PHIs
/// is to iterate until reaching a fixed point, but this will be quadratic in
/// complexity. =/
///
/// This code currently uses a linear-to-compute order for a greedy approach.
/// It won't find cases where a set of PHIs must be considered together, but it
/// handles most cases of order dependence without quadratic iteration. The
/// specific order used is the post-order across the operand DAG. When the last
/// user of a PHI is visited in this postorder walk, we check it for
/// profitability.
///
/// There is an orthogonal extra complexity to all of this: computing the cost
/// itself can easily become a linear computation making everything again (at
/// best) quadratic. Using a postorder over the operand graph makes it
/// particularly easy to avoid this through dynamic programming. As we do the
/// postorder walk, we build the transitive cost of that subgraph. It is also
/// straightforward to then update these costs when we mark a PHI for
/// speculation so that subsequent PHIs don't re-pay the cost of already
/// speculated instructions.
///
/// \param PNs              Candidate PHI nodes whose users seed the walk.
/// \param CostSavingsMap   Savings per PHI. It is read with an unchecked
///                         `find(PN)->second`, so it must contain an entry
///                         for every PHI in \p PNs.
/// \param PotentialSpecSet Instructions that are candidates for speculation;
///                         anything outside this set is treated as already
///                         visited and contributes no cost.
/// \param NumPreds         Number of predecessors; the summed user cost of a
///                         PHI is scaled by `NumPreds - 1`.
/// \param DT               Not referenced by the body of this function.
/// \param TTI              Supplies the per-instruction cost through
///                         `getUserCost`.
/// \returns The PHIs accepted for speculation, in the order they were
///          accepted during the walk.
static SmallVector<PHINode *, 16>
findProfitablePHIs(ArrayRef<PHINode *> PNs,
const SmallDenseMap<PHINode *, int, 16> &CostSavingsMap,
const SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
int NumPreds, DominatorTree &DT, TargetTransformInfo &TTI) {
SmallVector<PHINode *, 16> SpecPNs;
// First, establish a reverse mapping from immediate users of the PHI nodes
// to the nodes themselves, and count how many users each PHI node has in
// a way we can update while processing them.
SmallDenseMap<Instruction *, TinyPtrVector<PHINode *>, 16> UserToPNMap;
SmallDenseMap<PHINode *, int, 16> PNUserCountMap;
// Scratch set reused for every PHI; it de-duplicates users so that an
// instruction using the same PHI through several operands is counted once.
SmallPtrSet<Instruction *, 16> UserSet;
for (auto *PN : PNs) {
assert(UserSet.empty() && "Must start with an empty user set!");
for (Use &U : PN->uses())
UserSet.insert(cast<Instruction>(U.getUser()));
PNUserCountMap[PN] = UserSet.size();
for (auto *UI : UserSet)
UserToPNMap.insert({UI, {}}).first->second.push_back(PN);
UserSet.clear();
}
// Now do a DFS across the operand graph of the users, computing cost as we
// go and when all costs for a given PHI are known, checking that PHI for
// profitability.
// Memoized transitive speculation cost per visited instruction. Entries are
// reset to zero once the instruction is covered by an accepted PHI.
SmallDenseMap<Instruction *, int, 16> SpecCostMap;
visitPHIUsersAndDepsInPostOrder(
PNs,
/*IsVisited*/
[&](Instruction *I) {
// We consider anything that isn't potentially speculated to be
// "visited" as it is already handled. Similarly, anything that *is*
// potentially speculated but for which we have an entry in our cost
// map, we're done.
return !PotentialSpecSet.count(I) || SpecCostMap.count(I);
},
/*Visit*/
[&](Instruction *I) {
// We've fully visited the operands, so sum their cost with this node
// and update the cost map.
int Cost = TTI.TCC_Free;
for (Value *OpV : I->operand_values())
if (auto *OpI = dyn_cast<Instruction>(OpV)) {
// Operands without a map entry were never visited (not speculation
// candidates) and therefore add nothing.
auto CostMapIt = SpecCostMap.find(OpI);
if (CostMapIt != SpecCostMap.end())
Cost += CostMapIt->second;
}
Cost += TTI.getUserCost(I);
bool Inserted = SpecCostMap.insert({I, Cost}).second;
(void)Inserted;
assert(Inserted && "Must not re-insert a cost during the DFS!");
// Now check if this node had a corresponding PHI node using it. If so,
// we need to decrement the outstanding user count for it.
auto UserPNsIt = UserToPNMap.find(I);
if (UserPNsIt == UserToPNMap.end())
return;
auto &UserPNs = UserPNsIt->second;
// The predicate has a side effect: it decrements each PHI's outstanding
// user count. PHIs that still have unvisited users are moved to the front;
// those whose count just reached zero end up in [UserPNsSplitIt, end()).
auto UserPNsSplitIt = std::stable_partition(
UserPNs.begin(), UserPNs.end(), [&](PHINode *UserPN) {
int &PNUserCount = PNUserCountMap.find(UserPN)->second;
assert(
PNUserCount > 0 &&
"Should never re-visit a PN after its user count hits zero!");
--PNUserCount;
return PNUserCount != 0;
});
// FIXME: Rather than one at a time, we should sum the savings as the
// cost will be completely shared.
SmallVector<Instruction *, 16> SpecWorklist;
for (auto *PN : llvm::make_range(UserPNsSplitIt, UserPNs.end())) {
int SpecCost = TTI.TCC_Free;
// NOTE(review): this sums once per *use*, so a user that references PN
// through more than one operand contributes its cost once per operand --
// confirm this is intended.
for (Use &U : PN->uses())
SpecCost +=
SpecCostMap.find(cast<Instruction>(U.getUser()))->second;
// speculatePHIs() clones every speculated instruction into each
// predecessor and erases the original, so the net number of extra copies
// is one fewer than the number of predecessors.
SpecCost *= (NumPreds - 1);
// When the user count of a PHI node hits zero, we should check its
// profitability. If profitable, we should mark it for speculation
// and zero out the cost of everything it depends on.
int CostSavings = CostSavingsMap.find(PN)->second;
if (SpecCost > CostSavings) {
LLVM_DEBUG(dbgs() << " Not profitable, speculation cost: " << *PN
<< "\n"
" Cost savings: "
<< CostSavings
<< "\n"
" Speculation cost: "
<< SpecCost << "\n");
continue;
}
// We're going to speculate this user-associated PHI. Copy it out and
// add its users to the worklist to update their cost.
SpecPNs.push_back(PN);
for (Use &U : PN->uses()) {
auto *UI = cast<Instruction>(U.getUser());
auto CostMapIt = SpecCostMap.find(UI);
if (CostMapIt->second == 0)
continue;
// Zero out this cost entry to avoid duplicates.
CostMapIt->second = 0;
SpecWorklist.push_back(UI);
}
}
// Now walk all the operands of the users in the worklist transitively
// to zero out all the memoized costs.
while (!SpecWorklist.empty()) {
Instruction *SpecI = SpecWorklist.pop_back_val();
assert(SpecCostMap.find(SpecI)->second == 0 &&
"Didn't zero out a cost!");
// Walk the operands recursively to zero out their cost as well.
for (auto *OpV : SpecI->operand_values()) {
auto *OpI = dyn_cast<Instruction>(OpV);
if (!OpI)
continue;
// A zero entry means the operand is free or was already processed; either
// way there is nothing further to propagate through it.
auto CostMapIt = SpecCostMap.find(OpI);
if (CostMapIt == SpecCostMap.end() || CostMapIt->second == 0)
continue;
CostMapIt->second = 0;
SpecWorklist.push_back(OpI);
}
}
});
return SpecPNs;
}
/// Speculate users around a set of PHI nodes.
///
/// This routine does the actual speculation around a set of PHI nodes where we
/// have determined this to be both safe and profitable.
///
/// This routine handles any splitting of critical edges necessary to create
/// a safe block to speculate into as well as cloning the instructions and
/// rewriting all uses.
///
/// \param SpecPNs          PHIs to speculate around. `SpecPNs[0]` is read
///                         unconditionally, so the list must be non-empty.
///                         Every PHI in it is erased before returning.
/// \param PotentialSpecSet Candidate instructions for speculation; only
///                         queried (via `count`) to bound the operand walk.
/// \param PredSet          Predecessor blocks to speculate into; the edge
///                         from each one to the PHIs' block is split when it
///                         is critical.
/// \param DT               Dominator tree handed to the critical edge
///                         splitting options.
static void speculatePHIs(ArrayRef<PHINode *> SpecPNs,
SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
SmallSetVector<BasicBlock *, 16> &PredSet,
DominatorTree &DT) {
LLVM_DEBUG(dbgs() << " Speculating around " << SpecPNs.size() << " PHIs!\n");
NumPHIsSpeculated += SpecPNs.size();
// Split any critical edges so that we have a block to hoist into.
auto *ParentBB = SpecPNs[0]->getParent();
// Blocks that will actually receive the clones, in PredSet iteration order.
// The index into this vector is the "predecessor index" used below.
SmallVector<BasicBlock *, 16> SpecPreds;
SpecPreds.reserve(PredSet.size());
for (auto *PredBB : PredSet) {
auto *NewPredBB = SplitCriticalEdge(
PredBB, ParentBB,
CriticalEdgeSplittingOptions(&DT).setMergeIdenticalEdges());
if (NewPredBB) {
++NumEdgesSplit;
LLVM_DEBUG(dbgs() << " Split critical edge from: " << PredBB->getName()
<< "\n");
SpecPreds.push_back(NewPredBB);
} else {
// A null result is taken to mean the edge was not critical; the asserts
// below check that the predecessor is then directly usable.
assert(PredBB->getSingleSuccessor() == ParentBB &&
"We need a non-critical predecessor to speculate into.");
assert(!isa<InvokeInst>(PredBB->getTerminator()) &&
"Cannot have a non-critical invoke!");
// Already non-critical, use existing pred.
SpecPreds.push_back(PredBB);
}
}
// Collect the instructions to speculate. SpecList is in post-order over the
// operand graph, so every instruction appears after the speculated
// instructions it depends on.
SmallPtrSet<Instruction *, 16> SpecSet;
SmallVector<Instruction *, 16> SpecList;
visitPHIUsersAndDepsInPostOrder(SpecPNs,
/*IsVisited*/
[&](Instruction *I) {
// This is visited if we don't need to
// speculate it or we already have
// speculated it.
return !PotentialSpecSet.count(I) ||
SpecSet.count(I);
},
/*Visit*/
[&](Instruction *I) {
// All operands scheduled, schedule this
// node.
SpecSet.insert(I);
SpecList.push_back(I);
});
// One clone per (instruction, predecessor) pair; since the originals are
// erased afterwards, all but one copy of each count as new redundancy.
int NumSpecInsts = SpecList.size() * SpecPreds.size();
int NumRedundantInsts = NumSpecInsts - SpecList.size();
LLVM_DEBUG(dbgs() << " Inserting " << NumSpecInsts
<< " speculated instructions, " << NumRedundantInsts
<< " redundancies\n");
NumSpeculatedInstructions += NumSpecInsts;
NumNewRedundantInstructions += NumRedundantInsts;
// Each predecessor is numbered by its index in `SpecPreds`, so for each
// instruction we speculate, the speculated instruction is stored in that
// index of the vector associated with the original instruction. We also
// store the incoming values for each predecessor from any PHIs used.
SmallDenseMap<Instruction *, SmallVector<Value *, 2>, 16> SpeculatedValueMap;
// Inject the synthetic mappings to rewrite PHIs to the appropriate incoming
// value. This handles both the PHIs we are speculating around and any other
// PHIs that happen to be used.
for (auto *OrigI : SpecList)
for (auto *OpV : OrigI->operand_values()) {
auto *OpPN = dyn_cast<PHINode>(OpV);
// Only PHIs of the block being speculated around are remapped.
if (!OpPN || OpPN->getParent() != ParentBB)
continue;
auto InsertResult = SpeculatedValueMap.insert({OpPN, {}});
// Each PHI only needs its per-predecessor values computed once.
if (!InsertResult.second)
continue;
auto &SpeculatedVals = InsertResult.first->second;
// Populating our structure for mapping is particularly annoying because
// finding an incoming value for a particular predecessor block in a PHI
// node is a linear time operation! To avoid quadratic behavior, we build
// a map for this PHI node's incoming values and then translate it into
// the more compact representation used below.
SmallDenseMap<BasicBlock *, Value *, 16> IncomingValueMap;
for (int i : llvm::seq<int>(0, OpPN->getNumIncomingValues()))
IncomingValueMap[OpPN->getIncomingBlock(i)] = OpPN->getIncomingValue(i);
// The lookup is unchecked: every block in SpecPreds is expected to be an
// incoming block of the PHI at this point.
for (auto *PredBB : SpecPreds)
SpeculatedVals.push_back(IncomingValueMap.find(PredBB)->second);
}
// Speculate into each predecessor.
for (int PredIdx : llvm::seq<int>(0, SpecPreds.size())) {
auto *PredBB = SpecPreds[PredIdx];
assert(PredBB->getSingleSuccessor() == ParentBB &&
"We need a non-critical predecessor to speculate into.");
for (auto *OrigI : SpecList) {
auto *NewI = OrigI->clone();
NewI->setName(Twine(OrigI->getName()) + "." + Twine(PredIdx));
NewI->insertBefore(PredBB->getTerminator());
// Rewrite all the operands to the previously speculated instructions.
// Because we're walking in-order, the defs must precede the uses and we
// should already have these mappings.
for (Use &U : NewI->operands()) {
auto *OpI = dyn_cast<Instruction>(U.get());
if (!OpI)
continue;
// Operands with no mapping are neither speculated instructions nor PHIs of
// ParentBB and are left as they are.
auto MapIt = SpeculatedValueMap.find(OpI);
if (MapIt == SpeculatedValueMap.end())
continue;
const auto &SpeculatedVals = MapIt->second;
assert(SpeculatedVals[PredIdx] &&
"Must have a speculated value for this predecessor!");
assert(SpeculatedVals[PredIdx]->getType() == OpI->getType() &&
"Speculated value has the wrong type!");
// Rewrite the use to this predecessor's speculated instruction.
U.set(SpeculatedVals[PredIdx]);
}
// Commute instructions which now have a constant in the LHS but not the
// RHS.
if (NewI->isBinaryOp() && NewI->isCommutative() &&
isa<Constant>(NewI->getOperand(0)) &&
!isa<Constant>(NewI->getOperand(1)))
NewI->getOperandUse(0).swap(NewI->getOperandUse(1));
// Predecessors are processed in index order, so appending keeps the clone
// for predecessor PredIdx at position PredIdx (checked by the assert).
SpeculatedValueMap[OrigI].push_back(NewI);
assert(SpeculatedValueMap[OrigI][PredIdx] == NewI &&
"Mismatched speculated instruction index!");
}
}
// Walk the speculated instruction list and if they have uses, insert a PHI
// for them from the speculated versions, and replace the uses with the PHI.
// Then erase the instructions as they have been fully speculated. The walk
// needs to be in reverse so that we don't think there are users when we'll
// actually eventually remove them later.
// New PHIs are created at the position of the first speculated PHI.
IRBuilder<> IRB(SpecPNs[0]);
for (auto *OrigI : llvm::reverse(SpecList)) {
// Check if we need a PHI for any remaining users and if so, insert it.
if (!OrigI->use_empty()) {
auto *SpecIPN = IRB.CreatePHI(OrigI->getType(), SpecPreds.size(),
Twine(OrigI->getName()) + ".phi");
// Add the incoming values we speculated.
auto &SpeculatedVals = SpeculatedValueMap.find(OrigI)->second;
for (int PredIdx : llvm::seq<int>(0, SpecPreds.size()))
SpecIPN->addIncoming(SpeculatedVals[PredIdx], SpecPreds[PredIdx]);
// And replace the uses with the PHI node.
OrigI->replaceAllUsesWith(SpecIPN);
}
// It is important to immediately erase this so that it stops using other
// instructions. This avoids inserting needless PHIs of them.
OrigI->eraseFromParent();
}
// All of the uses of the speculated phi nodes should be removed at this
// point, so erase them.
for (auto *SpecPN : SpecPNs) {
assert(SpecPN->use_empty() && "All users should have been speculated!");
SpecPN->eraseFromParent();
}
}
/// Try to speculate around a series of PHIs from a single basic block.
///
/// This routine checks whether any of these PHIs are profitable to speculate
/// users around. If safe and profitable, it does the speculation. It returns
/// true when at least some speculation occurs.
///
/// \param PNs PHI nodes of one block. The vector is filtered in place: PHIs
///            that are not safe and profitable to speculate around are
///            removed from it.
/// \param DT  Dominator tree forwarded to the safety, profitability and
///            speculation helpers.
/// \param TTI Target cost information forwarded to the profitability helpers.
/// \returns true only if speculatePHIs() was run; false on every early exit.
static bool tryToSpeculatePHIs(SmallVectorImpl<PHINode *> &PNs,
DominatorTree &DT, TargetTransformInfo &TTI) {
LLVM_DEBUG(dbgs() << "Evaluating phi nodes for speculation:\n");
// Savings in cost from speculating around a PHI node.
SmallDenseMap<PHINode *, int, 16> CostSavingsMap;
// Remember the set of instructions that are candidates for speculation so
// that we can quickly walk things within that space. This prunes out
// instructions already available along edges, etc.
SmallPtrSet<Instruction *, 16> PotentialSpecSet;
// Remember the set of instructions that are (transitively) unsafe to
// speculate into the incoming edges of this basic block. This avoids
// recomputing them for each PHI node we check. This set is specific to this
// block though as things are pruned out of it based on what is available
// along incoming edges.
SmallPtrSet<Instruction *, 16> UnsafeSet;
// For each PHI node in this block, check whether there are immediate folding
// opportunities from speculation, and whether that speculation will be
// valid. This determines the set of safe PHIs to speculate.
PNs.erase(llvm::remove_if(PNs,
[&](PHINode *PN) {
return !isSafeAndProfitableToSpeculateAroundPHI(
*PN, CostSavingsMap, PotentialSpecSet,
UnsafeSet, DT, TTI);
}),
PNs.end());
// If no PHIs were profitable, skip.
if (PNs.empty()) {
LLVM_DEBUG(dbgs() << " No safe and profitable PHIs found!\n");
return false;
}
// We need to know how much speculation will cost which is determined by how
// many incoming edges will need a copy of each speculated instruction.
// The predecessors are taken from the first remaining PHI; duplicates
// (several edges from the same block) are collapsed by the set.
SmallSetVector<BasicBlock *, 16> PredSet;
for (auto *PredBB : PNs[0]->blocks()) {
if (!PredSet.insert(PredBB))
continue;
// We cannot speculate when a predecessor is an indirect branch.
// FIXME: We also can't reliably create a non-critical edge block for
// speculation if the predecessor is an invoke. This doesn't seem
// fundamental and we should probably be splitting critical edges
// differently.
- if (isa<IndirectBrInst>(PredBB->getTerminator()) ||
- isa<InvokeInst>(PredBB->getTerminator())) {
+ const auto *TermInst = PredBB->getTerminator();
+ if (isa<IndirectBrInst>(TermInst) ||
+ isa<InvokeInst>(TermInst) ||
+ isa<CallBrInst>(TermInst)) {
LLVM_DEBUG(dbgs() << " Invalid: predecessor terminator: "
<< PredBB->getName() << "\n");
return false;
}
}
// With fewer than two distinct predecessors there is nothing to gain.
if (PredSet.size() < 2) {
LLVM_DEBUG(dbgs() << " Unimportant: phi with only one predecessor\n");
return false;
}
// Narrow the safe candidates down to those whose speculation cost does not
// exceed their savings.
SmallVector<PHINode *, 16> SpecPNs = findProfitablePHIs(
PNs, CostSavingsMap, PotentialSpecSet, PredSet.size(), DT, TTI);
if (SpecPNs.empty())
// Nothing to do.
return false;
speculatePHIs(SpecPNs, PotentialSpecSet, PredSet, DT);
return true;
}
/// New pass manager entry point.
///
/// Visits the blocks of \p F in reverse post-order, gathers the run of PHI
/// nodes at the top of each block and attempts to speculate around them.
/// Reports all analyses as preserved when nothing was changed, and none
/// otherwise.
PreservedAnalyses SpeculateAroundPHIsPass::run(Function &F,
                                               FunctionAnalysisManager &AM) {
  DominatorTree &DomTree = AM.getResult<DominatorTreeAnalysis>(F);
  TargetTransformInfo &TargetInfo = AM.getResult<TargetIRAnalysis>(F);

  bool AnySpeculated = false;
  for (BasicBlock *Block : ReversePostOrderTraversal<Function *>(&F)) {
    // PHI nodes are grouped at the start of a block, so stop at the first
    // instruction that is not one.
    SmallVector<PHINode *, 16> LeadingPHIs;
    for (auto InstIt = Block->begin();; ++InstIt) {
      auto *PN = dyn_cast<PHINode>(&*InstIt);
      if (!PN)
        break;
      LeadingPHIs.push_back(PN);
    }

    // Blocks without PHIs offer nothing to speculate around.
    if (!LeadingPHIs.empty() &&
        tryToSpeculatePHIs(LeadingPHIs, DomTree, TargetInfo))
      AnySpeculated = true;
  }

  if (AnySpeculated)
    // A default-constructed set preserves nothing.
    return PreservedAnalyses();
  return PreservedAnalyses::all();
}

File Metadata

Mime Type
application/octet-stream
Expires
Tue, May 21, 10:57 AM (1 d, 23 h)
Storage Engine
chunks
Storage Format
Chunks
Storage Handle
ZjXWA_agS6ZF
Default Alt Text
(7 MB)

Event Timeline