Index: stable/11/contrib/llvm-project/clang/lib/Basic/Targets/X86.cpp =================================================================== --- stable/11/contrib/llvm-project/clang/lib/Basic/Targets/X86.cpp (revision 366451) +++ stable/11/contrib/llvm-project/clang/lib/Basic/Targets/X86.cpp (revision 366452) @@ -1,1905 +1,1911 @@ //===--- X86.cpp - Implement X86 target feature support -------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements X86 TargetInfo objects. // //===----------------------------------------------------------------------===// #include "X86.h" #include "clang/Basic/Builtins.h" #include "clang/Basic/Diagnostic.h" #include "clang/Basic/TargetBuiltins.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/TargetParser.h" namespace clang { namespace targets { const Builtin::Info BuiltinInfoX86[] = { #define BUILTIN(ID, TYPE, ATTRS) \ {#ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr}, #define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) \ {#ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, FEATURE}, #define TARGET_HEADER_BUILTIN(ID, TYPE, ATTRS, HEADER, LANGS, FEATURE) \ {#ID, TYPE, ATTRS, HEADER, LANGS, FEATURE}, #include "clang/Basic/BuiltinsX86.def" #define BUILTIN(ID, TYPE, ATTRS) \ {#ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr}, #define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) \ {#ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, FEATURE}, #define TARGET_HEADER_BUILTIN(ID, TYPE, ATTRS, HEADER, LANGS, FEATURE) \ {#ID, TYPE, ATTRS, HEADER, LANGS, FEATURE}, #include "clang/Basic/BuiltinsX86_64.def" }; static const char *const GCCRegNames[] = { "ax", "dx", "cx", "bx", "si", "di", "bp", "sp", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)", "argp", "flags", "fpcr", "fpsr", "dirflag", "frame", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "xmm16", "xmm17", "xmm18", "xmm19", "xmm20", "xmm21", "xmm22", "xmm23", "xmm24", "xmm25", "xmm26", "xmm27", "xmm28", "xmm29", "xmm30", "xmm31", "ymm16", "ymm17", "ymm18", "ymm19", "ymm20", "ymm21", "ymm22", "ymm23", "ymm24", "ymm25", "ymm26", "ymm27", "ymm28", "ymm29", "ymm30", "ymm31", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", "k0", "k1", "k2", "k3", "k4", "k5", "k6", "k7", "cr0", "cr2", "cr3", "cr4", "cr8", "dr0", "dr1", "dr2", "dr3", "dr6", "dr7", "bnd0", "bnd1", "bnd2", "bnd3", }; const TargetInfo::AddlRegName AddlRegNames[] = { {{"al", "ah", "eax", "rax"}, 0}, {{"bl", "bh", "ebx", "rbx"}, 3}, {{"cl", "ch", "ecx", "rcx"}, 2}, {{"dl", "dh", "edx", "rdx"}, 1}, {{"esi", "rsi"}, 4}, {{"edi", "rdi"}, 5}, {{"esp", "rsp"}, 7}, {{"ebp", "rbp"}, 6}, {{"r8d", "r8w", "r8b"}, 38}, {{"r9d", "r9w", "r9b"}, 39}, {{"r10d", "r10w", "r10b"}, 40}, {{"r11d", "r11w", "r11b"}, 41}, {{"r12d", "r12w", "r12b"}, 42}, {{"r13d", "r13w", "r13b"}, 43}, {{"r14d", "r14w", "r14b"}, 44}, {{"r15d", "r15w", "r15b"}, 45}, }; } // namespace targets } // namespace clang using namespace clang; using namespace clang::targets; bool X86TargetInfo::setFPMath(StringRef Name) { if (Name == "387") { FPMath = FP_387; return true; } if (Name == "sse") { FPMath = FP_SSE; return true; } return false; } bool X86TargetInfo::initFeatureMap( llvm::StringMap &Features, DiagnosticsEngine &Diags, StringRef CPU, const std::vector &FeaturesVec) const { // FIXME: This *really* should not be here. // X86_64 always has SSE2. if (getTriple().getArch() == llvm::Triple::x86_64) setFeatureEnabledImpl(Features, "sse2", true); const CPUKind Kind = getCPUKind(CPU); // Enable X87 for all X86 processors but Lakemont. if (Kind != CK_Lakemont) setFeatureEnabledImpl(Features, "x87", true); // Enable cmpxchg8 for i586 and greater CPUs. Include generic for backwards // compatibility. if (Kind >= CK_i586 || Kind == CK_Generic) setFeatureEnabledImpl(Features, "cx8", true); switch (Kind) { case CK_Generic: case CK_i386: case CK_i486: case CK_i586: case CK_Pentium: case CK_PentiumPro: case CK_i686: case CK_Lakemont: break; case CK_Cooperlake: // CPX inherits all CLX features plus AVX512BF16 setFeatureEnabledImpl(Features, "avx512bf16", true); LLVM_FALLTHROUGH; case CK_Cascadelake: // CLX inherits all SKX features plus AVX512VNNI setFeatureEnabledImpl(Features, "avx512vnni", true); LLVM_FALLTHROUGH; case CK_SkylakeServer: setFeatureEnabledImpl(Features, "avx512f", true); setFeatureEnabledImpl(Features, "avx512cd", true); setFeatureEnabledImpl(Features, "avx512dq", true); setFeatureEnabledImpl(Features, "avx512bw", true); setFeatureEnabledImpl(Features, "avx512vl", true); setFeatureEnabledImpl(Features, "clwb", true); setFeatureEnabledImpl(Features, "pku", true); // SkylakeServer cores inherits all SKL features, except SGX goto SkylakeCommon; case CK_Tigerlake: setFeatureEnabledImpl(Features, "avx512vp2intersect", true); setFeatureEnabledImpl(Features, "movdiri", true); setFeatureEnabledImpl(Features, "movdir64b", true); setFeatureEnabledImpl(Features, "shstk", true); // Tigerlake cores inherits IcelakeClient, except pconfig and wbnoinvd goto IcelakeCommon; case CK_IcelakeServer: setFeatureEnabledImpl(Features, "pconfig", true); setFeatureEnabledImpl(Features, "wbnoinvd", true); LLVM_FALLTHROUGH; case CK_IcelakeClient: IcelakeCommon: setFeatureEnabledImpl(Features, "vaes", true); setFeatureEnabledImpl(Features, "gfni", true); setFeatureEnabledImpl(Features, "vpclmulqdq", true); setFeatureEnabledImpl(Features, "avx512bitalg", true); setFeatureEnabledImpl(Features, "avx512vbmi2", true); setFeatureEnabledImpl(Features, "avx512vnni", true); setFeatureEnabledImpl(Features, "avx512vpopcntdq", true); setFeatureEnabledImpl(Features, "rdpid", true); setFeatureEnabledImpl(Features, "clwb", true); LLVM_FALLTHROUGH; case CK_Cannonlake: setFeatureEnabledImpl(Features, "avx512f", true); setFeatureEnabledImpl(Features, "avx512cd", true); setFeatureEnabledImpl(Features, "avx512dq", true); setFeatureEnabledImpl(Features, "avx512bw", true); setFeatureEnabledImpl(Features, "avx512vl", true); setFeatureEnabledImpl(Features, "avx512ifma", true); setFeatureEnabledImpl(Features, "avx512vbmi", true); setFeatureEnabledImpl(Features, "pku", true); setFeatureEnabledImpl(Features, "sha", true); LLVM_FALLTHROUGH; case CK_SkylakeClient: setFeatureEnabledImpl(Features, "sgx", true); // SkylakeServer cores inherits all SKL features, except SGX SkylakeCommon: setFeatureEnabledImpl(Features, "xsavec", true); setFeatureEnabledImpl(Features, "xsaves", true); setFeatureEnabledImpl(Features, "clflushopt", true); setFeatureEnabledImpl(Features, "aes", true); LLVM_FALLTHROUGH; case CK_Broadwell: setFeatureEnabledImpl(Features, "rdseed", true); setFeatureEnabledImpl(Features, "adx", true); setFeatureEnabledImpl(Features, "prfchw", true); LLVM_FALLTHROUGH; case CK_Haswell: setFeatureEnabledImpl(Features, "avx2", true); setFeatureEnabledImpl(Features, "lzcnt", true); setFeatureEnabledImpl(Features, "bmi", true); setFeatureEnabledImpl(Features, "bmi2", true); setFeatureEnabledImpl(Features, "fma", true); setFeatureEnabledImpl(Features, "invpcid", true); setFeatureEnabledImpl(Features, "movbe", true); LLVM_FALLTHROUGH; case CK_IvyBridge: setFeatureEnabledImpl(Features, "rdrnd", true); setFeatureEnabledImpl(Features, "f16c", true); setFeatureEnabledImpl(Features, "fsgsbase", true); LLVM_FALLTHROUGH; case CK_SandyBridge: setFeatureEnabledImpl(Features, "avx", true); setFeatureEnabledImpl(Features, "xsave", true); setFeatureEnabledImpl(Features, "xsaveopt", true); LLVM_FALLTHROUGH; case CK_Westmere: setFeatureEnabledImpl(Features, "pclmul", true); LLVM_FALLTHROUGH; case CK_Nehalem: setFeatureEnabledImpl(Features, "sse4.2", true); LLVM_FALLTHROUGH; case CK_Penryn: setFeatureEnabledImpl(Features, "sse4.1", true); LLVM_FALLTHROUGH; case CK_Core2: setFeatureEnabledImpl(Features, "ssse3", true); setFeatureEnabledImpl(Features, "sahf", true); LLVM_FALLTHROUGH; case CK_Nocona: setFeatureEnabledImpl(Features, "cx16", true); LLVM_FALLTHROUGH; case CK_Yonah: case CK_Prescott: setFeatureEnabledImpl(Features, "sse3", true); LLVM_FALLTHROUGH; case CK_PentiumM: case CK_Pentium4: case CK_x86_64: setFeatureEnabledImpl(Features, "sse2", true); LLVM_FALLTHROUGH; case CK_Pentium3: case CK_C3_2: setFeatureEnabledImpl(Features, "sse", true); LLVM_FALLTHROUGH; case CK_Pentium2: setFeatureEnabledImpl(Features, "fxsr", true); LLVM_FALLTHROUGH; case CK_PentiumMMX: case CK_K6: case CK_WinChipC6: setFeatureEnabledImpl(Features, "mmx", true); break; case CK_Tremont: setFeatureEnabledImpl(Features, "cldemote", true); setFeatureEnabledImpl(Features, "movdiri", true); setFeatureEnabledImpl(Features, "movdir64b", true); setFeatureEnabledImpl(Features, "gfni", true); setFeatureEnabledImpl(Features, "waitpkg", true); LLVM_FALLTHROUGH; case CK_GoldmontPlus: setFeatureEnabledImpl(Features, "ptwrite", true); setFeatureEnabledImpl(Features, "rdpid", true); setFeatureEnabledImpl(Features, "sgx", true); LLVM_FALLTHROUGH; case CK_Goldmont: setFeatureEnabledImpl(Features, "sha", true); setFeatureEnabledImpl(Features, "rdseed", true); setFeatureEnabledImpl(Features, "xsave", true); setFeatureEnabledImpl(Features, "xsaveopt", true); setFeatureEnabledImpl(Features, "xsavec", true); setFeatureEnabledImpl(Features, "xsaves", true); setFeatureEnabledImpl(Features, "clflushopt", true); setFeatureEnabledImpl(Features, "fsgsbase", true); setFeatureEnabledImpl(Features, "aes", true); LLVM_FALLTHROUGH; case CK_Silvermont: setFeatureEnabledImpl(Features, "rdrnd", true); setFeatureEnabledImpl(Features, "pclmul", true); setFeatureEnabledImpl(Features, "sse4.2", true); setFeatureEnabledImpl(Features, "prfchw", true); LLVM_FALLTHROUGH; case CK_Bonnell: setFeatureEnabledImpl(Features, "movbe", true); setFeatureEnabledImpl(Features, "ssse3", true); setFeatureEnabledImpl(Features, "fxsr", true); setFeatureEnabledImpl(Features, "cx16", true); setFeatureEnabledImpl(Features, "sahf", true); setFeatureEnabledImpl(Features, "mmx", true); break; case CK_KNM: // TODO: Add avx5124fmaps/avx5124vnniw. setFeatureEnabledImpl(Features, "avx512vpopcntdq", true); LLVM_FALLTHROUGH; case CK_KNL: setFeatureEnabledImpl(Features, "avx512f", true); setFeatureEnabledImpl(Features, "avx512cd", true); setFeatureEnabledImpl(Features, "avx512er", true); setFeatureEnabledImpl(Features, "avx512pf", true); setFeatureEnabledImpl(Features, "prfchw", true); setFeatureEnabledImpl(Features, "prefetchwt1", true); setFeatureEnabledImpl(Features, "fxsr", true); setFeatureEnabledImpl(Features, "rdseed", true); setFeatureEnabledImpl(Features, "adx", true); setFeatureEnabledImpl(Features, "lzcnt", true); setFeatureEnabledImpl(Features, "bmi", true); setFeatureEnabledImpl(Features, "bmi2", true); setFeatureEnabledImpl(Features, "fma", true); setFeatureEnabledImpl(Features, "rdrnd", true); setFeatureEnabledImpl(Features, "f16c", true); setFeatureEnabledImpl(Features, "fsgsbase", true); setFeatureEnabledImpl(Features, "aes", true); setFeatureEnabledImpl(Features, "pclmul", true); setFeatureEnabledImpl(Features, "cx16", true); setFeatureEnabledImpl(Features, "xsaveopt", true); setFeatureEnabledImpl(Features, "xsave", true); setFeatureEnabledImpl(Features, "movbe", true); setFeatureEnabledImpl(Features, "sahf", true); setFeatureEnabledImpl(Features, "mmx", true); break; case CK_K6_2: case CK_K6_3: case CK_WinChip2: case CK_C3: setFeatureEnabledImpl(Features, "3dnow", true); break; case CK_AMDFAM10: setFeatureEnabledImpl(Features, "sse4a", true); setFeatureEnabledImpl(Features, "lzcnt", true); setFeatureEnabledImpl(Features, "popcnt", true); setFeatureEnabledImpl(Features, "sahf", true); LLVM_FALLTHROUGH; case CK_K8SSE3: setFeatureEnabledImpl(Features, "sse3", true); LLVM_FALLTHROUGH; case CK_K8: setFeatureEnabledImpl(Features, "sse2", true); LLVM_FALLTHROUGH; case CK_AthlonXP: setFeatureEnabledImpl(Features, "sse", true); setFeatureEnabledImpl(Features, "fxsr", true); LLVM_FALLTHROUGH; case CK_Athlon: case CK_Geode: setFeatureEnabledImpl(Features, "3dnowa", true); break; case CK_BTVER2: setFeatureEnabledImpl(Features, "avx", true); setFeatureEnabledImpl(Features, "aes", true); setFeatureEnabledImpl(Features, "pclmul", true); setFeatureEnabledImpl(Features, "bmi", true); setFeatureEnabledImpl(Features, "f16c", true); setFeatureEnabledImpl(Features, "xsaveopt", true); setFeatureEnabledImpl(Features, "movbe", true); LLVM_FALLTHROUGH; case CK_BTVER1: setFeatureEnabledImpl(Features, "ssse3", true); setFeatureEnabledImpl(Features, "sse4a", true); setFeatureEnabledImpl(Features, "lzcnt", true); setFeatureEnabledImpl(Features, "popcnt", true); setFeatureEnabledImpl(Features, "prfchw", true); setFeatureEnabledImpl(Features, "cx16", true); setFeatureEnabledImpl(Features, "fxsr", true); setFeatureEnabledImpl(Features, "sahf", true); setFeatureEnabledImpl(Features, "mmx", true); break; case CK_ZNVER2: setFeatureEnabledImpl(Features, "clwb", true); setFeatureEnabledImpl(Features, "rdpid", true); setFeatureEnabledImpl(Features, "wbnoinvd", true); LLVM_FALLTHROUGH; case CK_ZNVER1: setFeatureEnabledImpl(Features, "adx", true); setFeatureEnabledImpl(Features, "aes", true); setFeatureEnabledImpl(Features, "avx2", true); setFeatureEnabledImpl(Features, "bmi", true); setFeatureEnabledImpl(Features, "bmi2", true); setFeatureEnabledImpl(Features, "clflushopt", true); setFeatureEnabledImpl(Features, "clzero", true); setFeatureEnabledImpl(Features, "cx16", true); setFeatureEnabledImpl(Features, "f16c", true); setFeatureEnabledImpl(Features, "fma", true); setFeatureEnabledImpl(Features, "fsgsbase", true); setFeatureEnabledImpl(Features, "fxsr", true); setFeatureEnabledImpl(Features, "lzcnt", true); setFeatureEnabledImpl(Features, "mmx", true); setFeatureEnabledImpl(Features, "mwaitx", true); setFeatureEnabledImpl(Features, "movbe", true); setFeatureEnabledImpl(Features, "pclmul", true); setFeatureEnabledImpl(Features, "popcnt", true); setFeatureEnabledImpl(Features, "prfchw", true); setFeatureEnabledImpl(Features, "rdrnd", true); setFeatureEnabledImpl(Features, "rdseed", true); setFeatureEnabledImpl(Features, "sahf", true); setFeatureEnabledImpl(Features, "sha", true); setFeatureEnabledImpl(Features, "sse4a", true); setFeatureEnabledImpl(Features, "xsave", true); setFeatureEnabledImpl(Features, "xsavec", true); setFeatureEnabledImpl(Features, "xsaveopt", true); setFeatureEnabledImpl(Features, "xsaves", true); break; case CK_BDVER4: setFeatureEnabledImpl(Features, "avx2", true); setFeatureEnabledImpl(Features, "bmi2", true); setFeatureEnabledImpl(Features, "mwaitx", true); LLVM_FALLTHROUGH; case CK_BDVER3: setFeatureEnabledImpl(Features, "fsgsbase", true); setFeatureEnabledImpl(Features, "xsaveopt", true); LLVM_FALLTHROUGH; case CK_BDVER2: setFeatureEnabledImpl(Features, "bmi", true); setFeatureEnabledImpl(Features, "fma", true); setFeatureEnabledImpl(Features, "f16c", true); setFeatureEnabledImpl(Features, "tbm", true); LLVM_FALLTHROUGH; case CK_BDVER1: // xop implies avx, sse4a and fma4. setFeatureEnabledImpl(Features, "xop", true); setFeatureEnabledImpl(Features, "lwp", true); setFeatureEnabledImpl(Features, "lzcnt", true); setFeatureEnabledImpl(Features, "aes", true); setFeatureEnabledImpl(Features, "pclmul", true); setFeatureEnabledImpl(Features, "prfchw", true); setFeatureEnabledImpl(Features, "cx16", true); setFeatureEnabledImpl(Features, "fxsr", true); setFeatureEnabledImpl(Features, "xsave", true); setFeatureEnabledImpl(Features, "sahf", true); setFeatureEnabledImpl(Features, "mmx", true); break; } if (!TargetInfo::initFeatureMap(Features, Diags, CPU, FeaturesVec)) return false; // Can't do this earlier because we need to be able to explicitly enable // or disable these features and the things that they depend upon. // Enable popcnt if sse4.2 is enabled and popcnt is not explicitly disabled. auto I = Features.find("sse4.2"); if (I != Features.end() && I->getValue() && llvm::find(FeaturesVec, "-popcnt") == FeaturesVec.end()) Features["popcnt"] = true; // Enable prfchw if 3DNow! is enabled and prfchw is not explicitly disabled. I = Features.find("3dnow"); if (I != Features.end() && I->getValue() && llvm::find(FeaturesVec, "-prfchw") == FeaturesVec.end()) Features["prfchw"] = true; // Additionally, if SSE is enabled and mmx is not explicitly disabled, // then enable MMX. I = Features.find("sse"); if (I != Features.end() && I->getValue() && llvm::find(FeaturesVec, "-mmx") == FeaturesVec.end()) Features["mmx"] = true; return true; } void X86TargetInfo::setSSELevel(llvm::StringMap &Features, X86SSEEnum Level, bool Enabled) { if (Enabled) { switch (Level) { case AVX512F: Features["avx512f"] = true; Features["fma"] = true; Features["f16c"] = true; LLVM_FALLTHROUGH; case AVX2: Features["avx2"] = true; LLVM_FALLTHROUGH; case AVX: Features["avx"] = true; Features["xsave"] = true; LLVM_FALLTHROUGH; case SSE42: Features["sse4.2"] = true; LLVM_FALLTHROUGH; case SSE41: Features["sse4.1"] = true; LLVM_FALLTHROUGH; case SSSE3: Features["ssse3"] = true; LLVM_FALLTHROUGH; case SSE3: Features["sse3"] = true; LLVM_FALLTHROUGH; case SSE2: Features["sse2"] = true; LLVM_FALLTHROUGH; case SSE1: Features["sse"] = true; LLVM_FALLTHROUGH; case NoSSE: break; } return; } switch (Level) { case NoSSE: case SSE1: Features["sse"] = false; LLVM_FALLTHROUGH; case SSE2: Features["sse2"] = Features["pclmul"] = Features["aes"] = false; Features["sha"] = Features["gfni"] = false; LLVM_FALLTHROUGH; case SSE3: Features["sse3"] = false; setXOPLevel(Features, NoXOP, false); LLVM_FALLTHROUGH; case SSSE3: Features["ssse3"] = false; LLVM_FALLTHROUGH; case SSE41: Features["sse4.1"] = false; LLVM_FALLTHROUGH; case SSE42: Features["sse4.2"] = false; LLVM_FALLTHROUGH; case AVX: Features["fma"] = Features["avx"] = Features["f16c"] = false; Features["xsave"] = Features["xsaveopt"] = Features["vaes"] = false; Features["vpclmulqdq"] = false; setXOPLevel(Features, FMA4, false); LLVM_FALLTHROUGH; case AVX2: Features["avx2"] = false; LLVM_FALLTHROUGH; case AVX512F: Features["avx512f"] = Features["avx512cd"] = Features["avx512er"] = false; Features["avx512pf"] = Features["avx512dq"] = Features["avx512bw"] = false; Features["avx512vl"] = Features["avx512vbmi"] = false; Features["avx512ifma"] = Features["avx512vpopcntdq"] = false; Features["avx512bitalg"] = Features["avx512vnni"] = false; Features["avx512vbmi2"] = Features["avx512bf16"] = false; Features["avx512vp2intersect"] = false; break; } } void X86TargetInfo::setMMXLevel(llvm::StringMap &Features, MMX3DNowEnum Level, bool Enabled) { if (Enabled) { switch (Level) { case AMD3DNowAthlon: Features["3dnowa"] = true; LLVM_FALLTHROUGH; case AMD3DNow: Features["3dnow"] = true; LLVM_FALLTHROUGH; case MMX: Features["mmx"] = true; LLVM_FALLTHROUGH; case NoMMX3DNow: break; } return; } switch (Level) { case NoMMX3DNow: case MMX: Features["mmx"] = false; LLVM_FALLTHROUGH; case AMD3DNow: Features["3dnow"] = false; LLVM_FALLTHROUGH; case AMD3DNowAthlon: Features["3dnowa"] = false; break; } } void X86TargetInfo::setXOPLevel(llvm::StringMap &Features, XOPEnum Level, bool Enabled) { if (Enabled) { switch (Level) { case XOP: Features["xop"] = true; LLVM_FALLTHROUGH; case FMA4: Features["fma4"] = true; setSSELevel(Features, AVX, true); LLVM_FALLTHROUGH; case SSE4A: Features["sse4a"] = true; setSSELevel(Features, SSE3, true); LLVM_FALLTHROUGH; case NoXOP: break; } return; } switch (Level) { case NoXOP: case SSE4A: Features["sse4a"] = false; LLVM_FALLTHROUGH; case FMA4: Features["fma4"] = false; LLVM_FALLTHROUGH; case XOP: Features["xop"] = false; break; } } void X86TargetInfo::setFeatureEnabledImpl(llvm::StringMap &Features, StringRef Name, bool Enabled) { // This is a bit of a hack to deal with the sse4 target feature when used // as part of the target attribute. We handle sse4 correctly everywhere // else. See below for more information on how we handle the sse4 options. if (Name != "sse4") Features[Name] = Enabled; if (Name == "mmx") { setMMXLevel(Features, MMX, Enabled); } else if (Name == "sse") { setSSELevel(Features, SSE1, Enabled); } else if (Name == "sse2") { setSSELevel(Features, SSE2, Enabled); } else if (Name == "sse3") { setSSELevel(Features, SSE3, Enabled); } else if (Name == "ssse3") { setSSELevel(Features, SSSE3, Enabled); } else if (Name == "sse4.2") { setSSELevel(Features, SSE42, Enabled); } else if (Name == "sse4.1") { setSSELevel(Features, SSE41, Enabled); } else if (Name == "3dnow") { setMMXLevel(Features, AMD3DNow, Enabled); } else if (Name == "3dnowa") { setMMXLevel(Features, AMD3DNowAthlon, Enabled); } else if (Name == "aes") { if (Enabled) setSSELevel(Features, SSE2, Enabled); else Features["vaes"] = false; } else if (Name == "vaes") { if (Enabled) { setSSELevel(Features, AVX, Enabled); Features["aes"] = true; } } else if (Name == "pclmul") { if (Enabled) setSSELevel(Features, SSE2, Enabled); else Features["vpclmulqdq"] = false; } else if (Name == "vpclmulqdq") { if (Enabled) { setSSELevel(Features, AVX, Enabled); Features["pclmul"] = true; } } else if (Name == "gfni") { if (Enabled) setSSELevel(Features, SSE2, Enabled); } else if (Name == "avx") { setSSELevel(Features, AVX, Enabled); } else if (Name == "avx2") { setSSELevel(Features, AVX2, Enabled); } else if (Name == "avx512f") { setSSELevel(Features, AVX512F, Enabled); } else if (Name.startswith("avx512")) { if (Enabled) setSSELevel(Features, AVX512F, Enabled); // Enable BWI instruction if certain features are being enabled. if ((Name == "avx512vbmi" || Name == "avx512vbmi2" || Name == "avx512bitalg" || Name == "avx512bf16") && Enabled) Features["avx512bw"] = true; // Also disable some features if BWI is being disabled. if (Name == "avx512bw" && !Enabled) { Features["avx512vbmi"] = false; Features["avx512vbmi2"] = false; Features["avx512bitalg"] = false; Features["avx512bf16"] = false; } } else if (Name == "fma") { if (Enabled) setSSELevel(Features, AVX, Enabled); else setSSELevel(Features, AVX512F, Enabled); } else if (Name == "fma4") { setXOPLevel(Features, FMA4, Enabled); } else if (Name == "xop") { setXOPLevel(Features, XOP, Enabled); } else if (Name == "sse4a") { setXOPLevel(Features, SSE4A, Enabled); } else if (Name == "f16c") { if (Enabled) setSSELevel(Features, AVX, Enabled); else setSSELevel(Features, AVX512F, Enabled); } else if (Name == "sha") { if (Enabled) setSSELevel(Features, SSE2, Enabled); } else if (Name == "sse4") { // We can get here via the __target__ attribute since that's not controlled // via the -msse4/-mno-sse4 command line alias. Handle this the same way // here - turn on the sse4.2 if enabled, turn off the sse4.1 level if // disabled. if (Enabled) setSSELevel(Features, SSE42, Enabled); else setSSELevel(Features, SSE41, Enabled); } else if (Name == "xsave") { if (!Enabled) Features["xsaveopt"] = false; } else if (Name == "xsaveopt" || Name == "xsavec" || Name == "xsaves") { if (Enabled) Features["xsave"] = true; } } /// handleTargetFeatures - Perform initialization based on the user /// configured set of features. bool X86TargetInfo::handleTargetFeatures(std::vector &Features, DiagnosticsEngine &Diags) { for (const auto &Feature : Features) { if (Feature[0] != '+') continue; if (Feature == "+aes") { HasAES = true; } else if (Feature == "+vaes") { HasVAES = true; } else if (Feature == "+pclmul") { HasPCLMUL = true; } else if (Feature == "+vpclmulqdq") { HasVPCLMULQDQ = true; } else if (Feature == "+lzcnt") { HasLZCNT = true; } else if (Feature == "+rdrnd") { HasRDRND = true; } else if (Feature == "+fsgsbase") { HasFSGSBASE = true; } else if (Feature == "+bmi") { HasBMI = true; } else if (Feature == "+bmi2") { HasBMI2 = true; } else if (Feature == "+popcnt") { HasPOPCNT = true; } else if (Feature == "+rtm") { HasRTM = true; } else if (Feature == "+prfchw") { HasPRFCHW = true; } else if (Feature == "+rdseed") { HasRDSEED = true; } else if (Feature == "+adx") { HasADX = true; } else if (Feature == "+tbm") { HasTBM = true; } else if (Feature == "+lwp") { HasLWP = true; } else if (Feature == "+fma") { HasFMA = true; } else if (Feature == "+f16c") { HasF16C = true; } else if (Feature == "+gfni") { HasGFNI = true; } else if (Feature == "+avx512cd") { HasAVX512CD = true; } else if (Feature == "+avx512vpopcntdq") { HasAVX512VPOPCNTDQ = true; } else if (Feature == "+avx512vnni") { HasAVX512VNNI = true; } else if (Feature == "+avx512bf16") { HasAVX512BF16 = true; } else if (Feature == "+avx512er") { HasAVX512ER = true; } else if (Feature == "+avx512pf") { HasAVX512PF = true; } else if (Feature == "+avx512dq") { HasAVX512DQ = true; } else if (Feature == "+avx512bitalg") { HasAVX512BITALG = true; } else if (Feature == "+avx512bw") { HasAVX512BW = true; } else if (Feature == "+avx512vl") { HasAVX512VL = true; } else if (Feature == "+avx512vbmi") { HasAVX512VBMI = true; } else if (Feature == "+avx512vbmi2") { HasAVX512VBMI2 = true; } else if (Feature == "+avx512ifma") { HasAVX512IFMA = true; } else if (Feature == "+avx512vp2intersect") { HasAVX512VP2INTERSECT = true; } else if (Feature == "+sha") { HasSHA = true; } else if (Feature == "+shstk") { HasSHSTK = true; } else if (Feature == "+movbe") { HasMOVBE = true; } else if (Feature == "+sgx") { HasSGX = true; } else if (Feature == "+cx8") { HasCX8 = true; } else if (Feature == "+cx16") { HasCX16 = true; } else if (Feature == "+fxsr") { HasFXSR = true; } else if (Feature == "+xsave") { HasXSAVE = true; } else if (Feature == "+xsaveopt") { HasXSAVEOPT = true; } else if (Feature == "+xsavec") { HasXSAVEC = true; } else if (Feature == "+xsaves") { HasXSAVES = true; } else if (Feature == "+mwaitx") { HasMWAITX = true; } else if (Feature == "+pku") { HasPKU = true; } else if (Feature == "+clflushopt") { HasCLFLUSHOPT = true; } else if (Feature == "+clwb") { HasCLWB = true; } else if (Feature == "+wbnoinvd") { HasWBNOINVD = true; } else if (Feature == "+prefetchwt1") { HasPREFETCHWT1 = true; } else if (Feature == "+clzero") { HasCLZERO = true; } else if (Feature == "+cldemote") { HasCLDEMOTE = true; } else if (Feature == "+rdpid") { HasRDPID = true; } else if (Feature == "+retpoline-external-thunk") { HasRetpolineExternalThunk = true; } else if (Feature == "+sahf") { HasLAHFSAHF = true; } else if (Feature == "+waitpkg") { HasWAITPKG = true; } else if (Feature == "+movdiri") { HasMOVDIRI = true; } else if (Feature == "+movdir64b") { HasMOVDIR64B = true; } else if (Feature == "+pconfig") { HasPCONFIG = true; } else if (Feature == "+ptwrite") { HasPTWRITE = true; } else if (Feature == "+invpcid") { HasINVPCID = true; } else if (Feature == "+enqcmd") { HasENQCMD = true; } X86SSEEnum Level = llvm::StringSwitch(Feature) .Case("+avx512f", AVX512F) .Case("+avx2", AVX2) .Case("+avx", AVX) .Case("+sse4.2", SSE42) .Case("+sse4.1", SSE41) .Case("+ssse3", SSSE3) .Case("+sse3", SSE3) .Case("+sse2", SSE2) .Case("+sse", SSE1) .Default(NoSSE); SSELevel = std::max(SSELevel, Level); MMX3DNowEnum ThreeDNowLevel = llvm::StringSwitch(Feature) .Case("+3dnowa", AMD3DNowAthlon) .Case("+3dnow", AMD3DNow) .Case("+mmx", MMX) .Default(NoMMX3DNow); MMX3DNowLevel = std::max(MMX3DNowLevel, ThreeDNowLevel); XOPEnum XLevel = llvm::StringSwitch(Feature) .Case("+xop", XOP) .Case("+fma4", FMA4) .Case("+sse4a", SSE4A) .Default(NoXOP); XOPLevel = std::max(XOPLevel, XLevel); } // LLVM doesn't have a separate switch for fpmath, so only accept it if it // matches the selected sse level. if ((FPMath == FP_SSE && SSELevel < SSE1) || (FPMath == FP_387 && SSELevel >= SSE1)) { Diags.Report(diag::err_target_unsupported_fpmath) << (FPMath == FP_SSE ? "sse" : "387"); return false; } SimdDefaultAlign = hasFeature("avx512f") ? 512 : hasFeature("avx") ? 256 : 128; return true; } /// X86TargetInfo::getTargetDefines - Return the set of the X86-specific macro /// definitions for this particular subtarget. void X86TargetInfo::getTargetDefines(const LangOptions &Opts, MacroBuilder &Builder) const { // Inline assembly supports X86 flag outputs. Builder.defineMacro("__GCC_ASM_FLAG_OUTPUTS__"); std::string CodeModel = getTargetOpts().CodeModel; if (CodeModel == "default") CodeModel = "small"; Builder.defineMacro("__code_model_" + CodeModel + "_"); // Target identification. if (getTriple().getArch() == llvm::Triple::x86_64) { Builder.defineMacro("__amd64__"); Builder.defineMacro("__amd64"); Builder.defineMacro("__x86_64"); Builder.defineMacro("__x86_64__"); if (getTriple().getArchName() == "x86_64h") { Builder.defineMacro("__x86_64h"); Builder.defineMacro("__x86_64h__"); } } else { DefineStd(Builder, "i386", Opts); } Builder.defineMacro("__SEG_GS"); Builder.defineMacro("__SEG_FS"); Builder.defineMacro("__seg_gs", "__attribute__((address_space(256)))"); Builder.defineMacro("__seg_fs", "__attribute__((address_space(257)))"); // Subtarget options. // FIXME: We are hard-coding the tune parameters based on the CPU, but they // truly should be based on -mtune options. switch (CPU) { case CK_Generic: break; case CK_i386: // The rest are coming from the i386 define above. Builder.defineMacro("__tune_i386__"); break; case CK_i486: case CK_WinChipC6: case CK_WinChip2: case CK_C3: defineCPUMacros(Builder, "i486"); break; case CK_PentiumMMX: Builder.defineMacro("__pentium_mmx__"); Builder.defineMacro("__tune_pentium_mmx__"); LLVM_FALLTHROUGH; case CK_i586: case CK_Pentium: defineCPUMacros(Builder, "i586"); defineCPUMacros(Builder, "pentium"); break; case CK_Pentium3: case CK_PentiumM: Builder.defineMacro("__tune_pentium3__"); LLVM_FALLTHROUGH; case CK_Pentium2: case CK_C3_2: Builder.defineMacro("__tune_pentium2__"); LLVM_FALLTHROUGH; case CK_PentiumPro: case CK_i686: defineCPUMacros(Builder, "i686"); defineCPUMacros(Builder, "pentiumpro"); break; case CK_Pentium4: defineCPUMacros(Builder, "pentium4"); break; case CK_Yonah: case CK_Prescott: case CK_Nocona: defineCPUMacros(Builder, "nocona"); break; case CK_Core2: case CK_Penryn: defineCPUMacros(Builder, "core2"); break; case CK_Bonnell: defineCPUMacros(Builder, "atom"); break; case CK_Silvermont: defineCPUMacros(Builder, "slm"); break; case CK_Goldmont: defineCPUMacros(Builder, "goldmont"); break; case CK_GoldmontPlus: defineCPUMacros(Builder, "goldmont_plus"); break; case CK_Tremont: defineCPUMacros(Builder, "tremont"); break; case CK_Nehalem: case CK_Westmere: case CK_SandyBridge: case CK_IvyBridge: case CK_Haswell: case CK_Broadwell: case CK_SkylakeClient: case CK_SkylakeServer: case CK_Cascadelake: case CK_Cooperlake: case CK_Cannonlake: case CK_IcelakeClient: case CK_IcelakeServer: case CK_Tigerlake: // FIXME: Historically, we defined this legacy name, it would be nice to // remove it at some point. We've never exposed fine-grained names for // recent primary x86 CPUs, and we should keep it that way. defineCPUMacros(Builder, "corei7"); break; case CK_KNL: defineCPUMacros(Builder, "knl"); break; case CK_KNM: break; case CK_Lakemont: defineCPUMacros(Builder, "i586", /*Tuning*/false); defineCPUMacros(Builder, "pentium", /*Tuning*/false); Builder.defineMacro("__tune_lakemont__"); break; case CK_K6_2: Builder.defineMacro("__k6_2__"); Builder.defineMacro("__tune_k6_2__"); LLVM_FALLTHROUGH; case CK_K6_3: if (CPU != CK_K6_2) { // In case of fallthrough // FIXME: GCC may be enabling these in cases where some other k6 // architecture is specified but -m3dnow is explicitly provided. The // exact semantics need to be determined and emulated here. Builder.defineMacro("__k6_3__"); Builder.defineMacro("__tune_k6_3__"); } LLVM_FALLTHROUGH; case CK_K6: defineCPUMacros(Builder, "k6"); break; case CK_Athlon: case CK_AthlonXP: defineCPUMacros(Builder, "athlon"); if (SSELevel != NoSSE) { Builder.defineMacro("__athlon_sse__"); Builder.defineMacro("__tune_athlon_sse__"); } break; case CK_K8: case CK_K8SSE3: case CK_x86_64: defineCPUMacros(Builder, "k8"); break; case CK_AMDFAM10: defineCPUMacros(Builder, "amdfam10"); break; case CK_BTVER1: defineCPUMacros(Builder, "btver1"); break; case CK_BTVER2: defineCPUMacros(Builder, "btver2"); break; case CK_BDVER1: defineCPUMacros(Builder, "bdver1"); break; case CK_BDVER2: defineCPUMacros(Builder, "bdver2"); break; case CK_BDVER3: defineCPUMacros(Builder, "bdver3"); break; case CK_BDVER4: defineCPUMacros(Builder, "bdver4"); break; case CK_ZNVER1: defineCPUMacros(Builder, "znver1"); break; case CK_ZNVER2: defineCPUMacros(Builder, "znver2"); break; case CK_Geode: defineCPUMacros(Builder, "geode"); break; } // Target properties. Builder.defineMacro("__REGISTER_PREFIX__", ""); // Define __NO_MATH_INLINES on linux/x86 so that we don't get inline // functions in glibc header files that use FP Stack inline asm which the // backend can't deal with (PR879). Builder.defineMacro("__NO_MATH_INLINES"); if (HasAES) Builder.defineMacro("__AES__"); if (HasVAES) Builder.defineMacro("__VAES__"); if (HasPCLMUL) Builder.defineMacro("__PCLMUL__"); if (HasVPCLMULQDQ) Builder.defineMacro("__VPCLMULQDQ__"); if (HasLZCNT) Builder.defineMacro("__LZCNT__"); if (HasRDRND) Builder.defineMacro("__RDRND__"); if (HasFSGSBASE) Builder.defineMacro("__FSGSBASE__"); if (HasBMI) Builder.defineMacro("__BMI__"); if (HasBMI2) Builder.defineMacro("__BMI2__"); if (HasPOPCNT) Builder.defineMacro("__POPCNT__"); if (HasRTM) Builder.defineMacro("__RTM__"); if (HasPRFCHW) Builder.defineMacro("__PRFCHW__"); if (HasRDSEED) Builder.defineMacro("__RDSEED__"); if (HasADX) Builder.defineMacro("__ADX__"); if (HasTBM) Builder.defineMacro("__TBM__"); if (HasLWP) Builder.defineMacro("__LWP__"); if (HasMWAITX) Builder.defineMacro("__MWAITX__"); if (HasMOVBE) Builder.defineMacro("__MOVBE__"); switch (XOPLevel) { case XOP: Builder.defineMacro("__XOP__"); LLVM_FALLTHROUGH; case FMA4: Builder.defineMacro("__FMA4__"); LLVM_FALLTHROUGH; case SSE4A: Builder.defineMacro("__SSE4A__"); LLVM_FALLTHROUGH; case NoXOP: break; } if (HasFMA) Builder.defineMacro("__FMA__"); if (HasF16C) Builder.defineMacro("__F16C__"); if (HasGFNI) Builder.defineMacro("__GFNI__"); if (HasAVX512CD) Builder.defineMacro("__AVX512CD__"); if (HasAVX512VPOPCNTDQ) Builder.defineMacro("__AVX512VPOPCNTDQ__"); if (HasAVX512VNNI) Builder.defineMacro("__AVX512VNNI__"); if (HasAVX512BF16) Builder.defineMacro("__AVX512BF16__"); if (HasAVX512ER) Builder.defineMacro("__AVX512ER__"); if (HasAVX512PF) Builder.defineMacro("__AVX512PF__"); if (HasAVX512DQ) Builder.defineMacro("__AVX512DQ__"); if (HasAVX512BITALG) Builder.defineMacro("__AVX512BITALG__"); if (HasAVX512BW) Builder.defineMacro("__AVX512BW__"); if (HasAVX512VL) Builder.defineMacro("__AVX512VL__"); if (HasAVX512VBMI) Builder.defineMacro("__AVX512VBMI__"); if (HasAVX512VBMI2) Builder.defineMacro("__AVX512VBMI2__"); if (HasAVX512IFMA) Builder.defineMacro("__AVX512IFMA__"); if (HasAVX512VP2INTERSECT) Builder.defineMacro("__AVX512VP2INTERSECT__"); if (HasSHA) Builder.defineMacro("__SHA__"); if (HasFXSR) Builder.defineMacro("__FXSR__"); if (HasXSAVE) Builder.defineMacro("__XSAVE__"); if (HasXSAVEOPT) Builder.defineMacro("__XSAVEOPT__"); if (HasXSAVEC) Builder.defineMacro("__XSAVEC__"); if (HasXSAVES) Builder.defineMacro("__XSAVES__"); if (HasPKU) Builder.defineMacro("__PKU__"); if (HasCLFLUSHOPT) Builder.defineMacro("__CLFLUSHOPT__"); if (HasCLWB) Builder.defineMacro("__CLWB__"); if (HasWBNOINVD) Builder.defineMacro("__WBNOINVD__"); if (HasSHSTK) Builder.defineMacro("__SHSTK__"); if (HasSGX) Builder.defineMacro("__SGX__"); if (HasPREFETCHWT1) Builder.defineMacro("__PREFETCHWT1__"); if (HasCLZERO) Builder.defineMacro("__CLZERO__"); if (HasRDPID) Builder.defineMacro("__RDPID__"); if (HasCLDEMOTE) Builder.defineMacro("__CLDEMOTE__"); if (HasWAITPKG) Builder.defineMacro("__WAITPKG__"); if (HasMOVDIRI) Builder.defineMacro("__MOVDIRI__"); if (HasMOVDIR64B) Builder.defineMacro("__MOVDIR64B__"); if (HasPCONFIG) Builder.defineMacro("__PCONFIG__"); if (HasPTWRITE) Builder.defineMacro("__PTWRITE__"); if (HasINVPCID) Builder.defineMacro("__INVPCID__"); if (HasENQCMD) Builder.defineMacro("__ENQCMD__"); // Each case falls through to the previous one here. switch (SSELevel) { case AVX512F: Builder.defineMacro("__AVX512F__"); LLVM_FALLTHROUGH; case AVX2: Builder.defineMacro("__AVX2__"); LLVM_FALLTHROUGH; case AVX: Builder.defineMacro("__AVX__"); LLVM_FALLTHROUGH; case SSE42: Builder.defineMacro("__SSE4_2__"); LLVM_FALLTHROUGH; case SSE41: Builder.defineMacro("__SSE4_1__"); LLVM_FALLTHROUGH; case SSSE3: Builder.defineMacro("__SSSE3__"); LLVM_FALLTHROUGH; case SSE3: Builder.defineMacro("__SSE3__"); LLVM_FALLTHROUGH; case SSE2: Builder.defineMacro("__SSE2__"); Builder.defineMacro("__SSE2_MATH__"); // -mfp-math=sse always implied. LLVM_FALLTHROUGH; case SSE1: Builder.defineMacro("__SSE__"); Builder.defineMacro("__SSE_MATH__"); // -mfp-math=sse always implied. LLVM_FALLTHROUGH; case NoSSE: break; } if (Opts.MicrosoftExt && getTriple().getArch() == llvm::Triple::x86) { switch (SSELevel) { case AVX512F: case AVX2: case AVX: case SSE42: case SSE41: case SSSE3: case SSE3: case SSE2: Builder.defineMacro("_M_IX86_FP", Twine(2)); break; case SSE1: Builder.defineMacro("_M_IX86_FP", Twine(1)); break; default: Builder.defineMacro("_M_IX86_FP", Twine(0)); break; } } // Each case falls through to the previous one here. switch (MMX3DNowLevel) { case AMD3DNowAthlon: Builder.defineMacro("__3dNOW_A__"); LLVM_FALLTHROUGH; case AMD3DNow: Builder.defineMacro("__3dNOW__"); LLVM_FALLTHROUGH; case MMX: Builder.defineMacro("__MMX__"); LLVM_FALLTHROUGH; case NoMMX3DNow: break; } if (CPU >= CK_i486 || CPU == CK_Generic) { Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1"); Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2"); Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4"); } if (HasCX8) Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8"); if (HasCX16 && getTriple().getArch() == llvm::Triple::x86_64) Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_16"); if (HasFloat128) Builder.defineMacro("__SIZEOF_FLOAT128__", "16"); } bool X86TargetInfo::isValidFeatureName(StringRef Name) const { return llvm::StringSwitch(Name) .Case("3dnow", true) .Case("3dnowa", true) .Case("adx", true) .Case("aes", true) .Case("avx", true) .Case("avx2", true) .Case("avx512f", true) .Case("avx512cd", true) .Case("avx512vpopcntdq", true) .Case("avx512vnni", true) .Case("avx512bf16", true) .Case("avx512er", true) .Case("avx512pf", true) .Case("avx512dq", true) .Case("avx512bitalg", true) .Case("avx512bw", true) .Case("avx512vl", true) .Case("avx512vbmi", true) .Case("avx512vbmi2", true) .Case("avx512ifma", true) .Case("avx512vp2intersect", true) .Case("bmi", true) .Case("bmi2", true) .Case("cldemote", true) .Case("clflushopt", true) .Case("clwb", true) .Case("clzero", true) .Case("cx16", true) .Case("enqcmd", true) .Case("f16c", true) .Case("fma", true) .Case("fma4", true) .Case("fsgsbase", true) .Case("fxsr", true) .Case("gfni", true) .Case("invpcid", true) .Case("lwp", true) .Case("lzcnt", true) .Case("mmx", true) .Case("movbe", true) .Case("movdiri", true) .Case("movdir64b", true) .Case("mwaitx", true) .Case("pclmul", true) .Case("pconfig", true) .Case("pku", true) .Case("popcnt", true) .Case("prefetchwt1", true) .Case("prfchw", true) .Case("ptwrite", true) .Case("rdpid", true) .Case("rdrnd", true) .Case("rdseed", true) .Case("rtm", true) .Case("sahf", true) .Case("sgx", true) .Case("sha", true) .Case("shstk", true) .Case("sse", true) .Case("sse2", true) .Case("sse3", true) .Case("ssse3", true) .Case("sse4", true) .Case("sse4.1", true) .Case("sse4.2", true) .Case("sse4a", true) .Case("tbm", true) .Case("vaes", true) .Case("vpclmulqdq", true) .Case("wbnoinvd", true) .Case("waitpkg", true) .Case("x87", true) .Case("xop", true) .Case("xsave", true) .Case("xsavec", true) .Case("xsaves", true) .Case("xsaveopt", true) .Default(false); } bool X86TargetInfo::hasFeature(StringRef Feature) const { return llvm::StringSwitch(Feature) .Case("adx", HasADX) .Case("aes", HasAES) .Case("avx", SSELevel >= AVX) .Case("avx2", SSELevel >= AVX2) .Case("avx512f", SSELevel >= AVX512F) .Case("avx512cd", HasAVX512CD) .Case("avx512vpopcntdq", HasAVX512VPOPCNTDQ) .Case("avx512vnni", HasAVX512VNNI) .Case("avx512bf16", HasAVX512BF16) .Case("avx512er", HasAVX512ER) .Case("avx512pf", HasAVX512PF) .Case("avx512dq", HasAVX512DQ) .Case("avx512bitalg", HasAVX512BITALG) .Case("avx512bw", HasAVX512BW) .Case("avx512vl", HasAVX512VL) .Case("avx512vbmi", HasAVX512VBMI) .Case("avx512vbmi2", HasAVX512VBMI2) .Case("avx512ifma", HasAVX512IFMA) .Case("avx512vp2intersect", HasAVX512VP2INTERSECT) .Case("bmi", HasBMI) .Case("bmi2", HasBMI2) .Case("cldemote", HasCLDEMOTE) .Case("clflushopt", HasCLFLUSHOPT) .Case("clwb", HasCLWB) .Case("clzero", HasCLZERO) .Case("cx8", HasCX8) .Case("cx16", HasCX16) .Case("enqcmd", HasENQCMD) .Case("f16c", HasF16C) .Case("fma", HasFMA) .Case("fma4", XOPLevel >= FMA4) .Case("fsgsbase", HasFSGSBASE) .Case("fxsr", HasFXSR) .Case("gfni", HasGFNI) .Case("invpcid", HasINVPCID) .Case("lwp", HasLWP) .Case("lzcnt", HasLZCNT) .Case("mm3dnow", MMX3DNowLevel >= AMD3DNow) .Case("mm3dnowa", MMX3DNowLevel >= AMD3DNowAthlon) .Case("mmx", MMX3DNowLevel >= MMX) .Case("movbe", HasMOVBE) .Case("movdiri", HasMOVDIRI) .Case("movdir64b", HasMOVDIR64B) .Case("mwaitx", HasMWAITX) .Case("pclmul", HasPCLMUL) .Case("pconfig", HasPCONFIG) .Case("pku", HasPKU) .Case("popcnt", HasPOPCNT) .Case("prefetchwt1", HasPREFETCHWT1) .Case("prfchw", HasPRFCHW) .Case("ptwrite", HasPTWRITE) .Case("rdpid", HasRDPID) .Case("rdrnd", HasRDRND) .Case("rdseed", HasRDSEED) .Case("retpoline-external-thunk", HasRetpolineExternalThunk) .Case("rtm", HasRTM) .Case("sahf", HasLAHFSAHF) .Case("sgx", HasSGX) .Case("sha", HasSHA) .Case("shstk", HasSHSTK) .Case("sse", SSELevel >= SSE1) .Case("sse2", SSELevel >= SSE2) .Case("sse3", SSELevel >= SSE3) .Case("ssse3", SSELevel >= SSSE3) .Case("sse4.1", SSELevel >= SSE41) .Case("sse4.2", SSELevel >= SSE42) .Case("sse4a", XOPLevel >= SSE4A) .Case("tbm", HasTBM) .Case("vaes", HasVAES) .Case("vpclmulqdq", HasVPCLMULQDQ) .Case("wbnoinvd", HasWBNOINVD) .Case("waitpkg", HasWAITPKG) .Case("x86", true) .Case("x86_32", getTriple().getArch() == llvm::Triple::x86) .Case("x86_64", getTriple().getArch() == llvm::Triple::x86_64) .Case("xop", XOPLevel >= XOP) .Case("xsave", HasXSAVE) .Case("xsavec", HasXSAVEC) .Case("xsaves", HasXSAVES) .Case("xsaveopt", HasXSAVEOPT) .Default(false); } // We can't use a generic validation scheme for the features accepted here // versus subtarget features accepted in the target attribute because the // bitfield structure that's initialized in the runtime only supports the // below currently rather than the full range of subtarget features. (See // X86TargetInfo::hasFeature for a somewhat comprehensive list). bool X86TargetInfo::validateCpuSupports(StringRef FeatureStr) const { return llvm::StringSwitch(FeatureStr) #define X86_FEATURE_COMPAT(VAL, ENUM, STR) .Case(STR, true) #include "llvm/Support/X86TargetParser.def" .Default(false); } static llvm::X86::ProcessorFeatures getFeature(StringRef Name) { return llvm::StringSwitch(Name) #define X86_FEATURE_COMPAT(VAL, ENUM, STR) .Case(STR, llvm::X86::ENUM) #include "llvm/Support/X86TargetParser.def" ; // Note, this function should only be used after ensuring the value is // correct, so it asserts if the value is out of range. } static unsigned getFeaturePriority(llvm::X86::ProcessorFeatures Feat) { enum class FeatPriority { #define FEATURE(FEAT) FEAT, #include "clang/Basic/X86Target.def" }; switch (Feat) { #define FEATURE(FEAT) \ case llvm::X86::FEAT: \ return static_cast(FeatPriority::FEAT); #include "clang/Basic/X86Target.def" default: llvm_unreachable("No Feature Priority for non-CPUSupports Features"); } } unsigned X86TargetInfo::multiVersionSortPriority(StringRef Name) const { // Valid CPUs have a 'key feature' that compares just better than its key // feature. CPUKind Kind = getCPUKind(Name); if (Kind != CK_Generic) { switch (Kind) { default: llvm_unreachable( "CPU Type without a key feature used in 'target' attribute"); #define PROC_WITH_FEAT(ENUM, STR, IS64, KEY_FEAT) \ case CK_##ENUM: \ return (getFeaturePriority(llvm::X86::KEY_FEAT) << 1) + 1; #include "clang/Basic/X86Target.def" } } // Now we know we have a feature, so get its priority and shift it a few so // that we have sufficient room for the CPUs (above). return getFeaturePriority(getFeature(Name)) << 1; } bool X86TargetInfo::validateCPUSpecificCPUDispatch(StringRef Name) const { return llvm::StringSwitch(Name) #define CPU_SPECIFIC(NAME, MANGLING, FEATURES) .Case(NAME, true) #define CPU_SPECIFIC_ALIAS(NEW_NAME, NAME) .Case(NEW_NAME, true) #include "clang/Basic/X86Target.def" .Default(false); } static StringRef CPUSpecificCPUDispatchNameDealias(StringRef Name) { return llvm::StringSwitch(Name) #define CPU_SPECIFIC_ALIAS(NEW_NAME, NAME) .Case(NEW_NAME, NAME) #include "clang/Basic/X86Target.def" .Default(Name); } char X86TargetInfo::CPUSpecificManglingCharacter(StringRef Name) const { return llvm::StringSwitch(CPUSpecificCPUDispatchNameDealias(Name)) #define CPU_SPECIFIC(NAME, MANGLING, FEATURES) .Case(NAME, MANGLING) #include "clang/Basic/X86Target.def" .Default(0); } void X86TargetInfo::getCPUSpecificCPUDispatchFeatures( StringRef Name, llvm::SmallVectorImpl &Features) const { StringRef WholeList = llvm::StringSwitch(CPUSpecificCPUDispatchNameDealias(Name)) #define CPU_SPECIFIC(NAME, MANGLING, FEATURES) .Case(NAME, FEATURES) #include "clang/Basic/X86Target.def" .Default(""); WholeList.split(Features, ',', /*MaxSplit=*/-1, /*KeepEmpty=*/false); } // We can't use a generic validation scheme for the cpus accepted here // versus subtarget cpus accepted in the target attribute because the // variables intitialized by the runtime only support the below currently // rather than the full range of cpus. bool X86TargetInfo::validateCpuIs(StringRef FeatureStr) const { return llvm::StringSwitch(FeatureStr) #define X86_VENDOR(ENUM, STRING) .Case(STRING, true) #define X86_CPU_TYPE_COMPAT_WITH_ALIAS(ARCHNAME, ENUM, STR, ALIAS) \ .Cases(STR, ALIAS, true) #define X86_CPU_TYPE_COMPAT(ARCHNAME, ENUM, STR) .Case(STR, true) #define X86_CPU_SUBTYPE_COMPAT(ARCHNAME, ENUM, STR) .Case(STR, true) #include "llvm/Support/X86TargetParser.def" .Default(false); } static unsigned matchAsmCCConstraint(const char *&Name) { auto RV = llvm::StringSwitch(Name) .Case("@cca", 4) .Case("@ccae", 5) .Case("@ccb", 4) .Case("@ccbe", 5) .Case("@ccc", 4) .Case("@cce", 4) .Case("@ccz", 4) .Case("@ccg", 4) .Case("@ccge", 5) .Case("@ccl", 4) .Case("@ccle", 5) .Case("@ccna", 5) .Case("@ccnae", 6) .Case("@ccnb", 5) .Case("@ccnbe", 6) .Case("@ccnc", 5) .Case("@ccne", 5) .Case("@ccnz", 5) .Case("@ccng", 5) .Case("@ccnge", 6) .Case("@ccnl", 5) .Case("@ccnle", 6) .Case("@ccno", 5) .Case("@ccnp", 5) .Case("@ccns", 5) .Case("@cco", 4) .Case("@ccp", 4) .Case("@ccs", 4) .Default(0); return RV; } bool X86TargetInfo::validateAsmConstraint( const char *&Name, TargetInfo::ConstraintInfo &Info) const { switch (*Name) { default: return false; // Constant constraints. case 'e': // 32-bit signed integer constant for use with sign-extending x86_64 // instructions. case 'Z': // 32-bit unsigned integer constant for use with zero-extending // x86_64 instructions. case 's': Info.setRequiresImmediate(); return true; case 'I': Info.setRequiresImmediate(0, 31); return true; case 'J': Info.setRequiresImmediate(0, 63); return true; case 'K': Info.setRequiresImmediate(-128, 127); return true; case 'L': Info.setRequiresImmediate({int(0xff), int(0xffff), int(0xffffffff)}); return true; case 'M': Info.setRequiresImmediate(0, 3); return true; case 'N': Info.setRequiresImmediate(0, 255); return true; case 'O': Info.setRequiresImmediate(0, 127); return true; // Register constraints. case 'Y': // 'Y' is the first character for several 2-character constraints. // Shift the pointer to the second character of the constraint. Name++; switch (*Name) { default: return false; case 'z': case '0': // First SSE register. case '2': case 't': // Any SSE register, when SSE2 is enabled. case 'i': // Any SSE register, when SSE2 and inter-unit moves enabled. case 'm': // Any MMX register, when inter-unit moves enabled. case 'k': // AVX512 arch mask registers: k1-k7. Info.setAllowsRegister(); return true; } case 'f': // Any x87 floating point stack register. // Constraint 'f' cannot be used for output operands. if (Info.ConstraintStr[0] == '=') return false; Info.setAllowsRegister(); return true; case 'a': // eax. case 'b': // ebx. case 'c': // ecx. case 'd': // edx. case 'S': // esi. case 'D': // edi. case 'A': // edx:eax. case 't': // Top of floating point stack. case 'u': // Second from top of floating point stack. case 'q': // Any register accessible as [r]l: a, b, c, and d. case 'y': // Any MMX register. case 'v': // Any {X,Y,Z}MM register (Arch & context dependent) case 'x': // Any SSE register. case 'k': // Any AVX512 mask register (same as Yk, additionally allows k0 // for intermideate k reg operations). case 'Q': // Any register accessible as [r]h: a, b, c, and d. case 'R': // "Legacy" registers: ax, bx, cx, dx, di, si, sp, bp. case 'l': // "Index" registers: any general register that can be used as an // index in a base+index memory access. Info.setAllowsRegister(); return true; // Floating point constant constraints. case 'C': // SSE floating point constant. case 'G': // x87 floating point constant. return true; case '@': // CC condition changes. if (auto Len = matchAsmCCConstraint(Name)) { Name += Len - 1; Info.setAllowsRegister(); return true; } return false; } } bool X86TargetInfo::validateOutputSize(const llvm::StringMap &FeatureMap, StringRef Constraint, unsigned Size) const { // Strip off constraint modifiers. while (Constraint[0] == '=' || Constraint[0] == '+' || Constraint[0] == '&') Constraint = Constraint.substr(1); return validateOperandSize(FeatureMap, Constraint, Size); } bool X86TargetInfo::validateInputSize(const llvm::StringMap &FeatureMap, StringRef Constraint, unsigned Size) const { return validateOperandSize(FeatureMap, Constraint, Size); } bool X86TargetInfo::validateOperandSize(const llvm::StringMap &FeatureMap, StringRef Constraint, unsigned Size) const { switch (Constraint[0]) { default: break; case 'k': // Registers k0-k7 (AVX512) size limit is 64 bit. case 'y': return Size <= 64; case 'f': case 't': case 'u': return Size <= 128; case 'Y': // 'Y' is the first character for several 2-character constraints. switch (Constraint[1]) { default: return false; case 'm': // 'Ym' is synonymous with 'y'. case 'k': return Size <= 64; case 'z': case '0': - // XMM0 - if (FeatureMap.lookup("sse")) + // XMM0/YMM/ZMM0 + if (FeatureMap.lookup("avx512f")) + // ZMM0 can be used if target supports AVX512F. + return Size <= 512U; + else if (FeatureMap.lookup("avx")) + // YMM0 can be used if target supports AVX. + return Size <= 256U; + else if (FeatureMap.lookup("sse")) return Size <= 128U; return false; case 'i': case 't': case '2': // 'Yi','Yt','Y2' are synonymous with 'x' when SSE2 is enabled. if (SSELevel < SSE2) return false; break; } LLVM_FALLTHROUGH; case 'v': case 'x': if (FeatureMap.lookup("avx512f")) // 512-bit zmm registers can be used if target supports AVX512F. return Size <= 512U; else if (FeatureMap.lookup("avx")) // 256-bit ymm registers can be used if target supports AVX. return Size <= 256U; return Size <= 128U; } return true; } std::string X86TargetInfo::convertConstraint(const char *&Constraint) const { switch (*Constraint) { case '@': if (auto Len = matchAsmCCConstraint(Constraint)) { std::string Converted = "{" + std::string(Constraint, Len) + "}"; Constraint += Len - 1; return Converted; } return std::string(1, *Constraint); case 'a': return std::string("{ax}"); case 'b': return std::string("{bx}"); case 'c': return std::string("{cx}"); case 'd': return std::string("{dx}"); case 'S': return std::string("{si}"); case 'D': return std::string("{di}"); case 'p': // address return std::string("im"); case 't': // top of floating point stack. return std::string("{st}"); case 'u': // second from top of floating point stack. return std::string("{st(1)}"); // second from top of floating point stack. case 'Y': switch (Constraint[1]) { default: // Break from inner switch and fall through (copy single char), // continue parsing after copying the current constraint into // the return string. break; case 'k': case 'm': case 'i': case 't': case 'z': case '0': case '2': // "^" hints llvm that this is a 2 letter constraint. // "Constraint++" is used to promote the string iterator // to the next constraint. return std::string("^") + std::string(Constraint++, 2); } LLVM_FALLTHROUGH; default: return std::string(1, *Constraint); } } bool X86TargetInfo::checkCPUKind(CPUKind Kind) const { // Perform any per-CPU checks necessary to determine if this CPU is // acceptable. switch (Kind) { case CK_Generic: // No processor selected! return false; #define PROC(ENUM, STRING, IS64BIT) \ case CK_##ENUM: \ return IS64BIT || getTriple().getArch() == llvm::Triple::x86; #include "clang/Basic/X86Target.def" } llvm_unreachable("Unhandled CPU kind"); } void X86TargetInfo::fillValidCPUList(SmallVectorImpl &Values) const { #define PROC(ENUM, STRING, IS64BIT) \ if (IS64BIT || getTriple().getArch() == llvm::Triple::x86) \ Values.emplace_back(STRING); // For aliases we need to lookup the CPUKind to check get the 64-bit ness. #define PROC_ALIAS(ENUM, ALIAS) \ if (checkCPUKind(CK_##ENUM)) \ Values.emplace_back(ALIAS); #include "clang/Basic/X86Target.def" } X86TargetInfo::CPUKind X86TargetInfo::getCPUKind(StringRef CPU) const { return llvm::StringSwitch(CPU) #define PROC(ENUM, STRING, IS64BIT) .Case(STRING, CK_##ENUM) #define PROC_ALIAS(ENUM, ALIAS) .Case(ALIAS, CK_##ENUM) #include "clang/Basic/X86Target.def" .Default(CK_Generic); } ArrayRef X86TargetInfo::getGCCRegNames() const { return llvm::makeArrayRef(GCCRegNames); } ArrayRef X86TargetInfo::getGCCAddlRegNames() const { return llvm::makeArrayRef(AddlRegNames); } ArrayRef X86_32TargetInfo::getTargetBuiltins() const { return llvm::makeArrayRef(BuiltinInfoX86, clang::X86::LastX86CommonBuiltin - Builtin::FirstTSBuiltin + 1); } ArrayRef X86_64TargetInfo::getTargetBuiltins() const { return llvm::makeArrayRef(BuiltinInfoX86, X86::LastTSBuiltin - Builtin::FirstTSBuiltin); } Index: stable/11/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- stable/11/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp (revision 366451) +++ stable/11/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp (revision 366452) @@ -1,47336 +1,47375 @@ //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file defines the interfaces that X86 uses to lower LLVM code into a // selection DAG. // //===----------------------------------------------------------------------===// #include "X86ISelLowering.h" #include "Utils/X86ShuffleDecode.h" #include "X86CallingConv.h" #include "X86FrameLowering.h" #include "X86InstrBuilder.h" #include "X86IntrinsicsInfo.h" #include "X86MachineFunctionInfo.h" #include "X86TargetMachine.h" #include "X86TargetObjectFile.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetOptions.h" #include #include #include #include using namespace llvm; #define DEBUG_TYPE "x86-isel" STATISTIC(NumTailCalls, "Number of tail calls"); static cl::opt ExperimentalPrefLoopAlignment( "x86-experimental-pref-loop-alignment", cl::init(4), cl::desc( "Sets the preferable loop alignment for experiments (as log2 bytes)" "(the last x86-experimental-pref-loop-alignment bits" " of the loop header PC will be 0)."), cl::Hidden); // Added in 10.0. static cl::opt EnableOldKNLABI( "x86-enable-old-knl-abi", cl::init(false), cl::desc("Enables passing v32i16 and v64i8 in 2 YMM registers instead of " "one ZMM register on AVX512F, but not AVX512BW targets."), cl::Hidden); static cl::opt MulConstantOptimization( "mul-constant-optimization", cl::init(true), cl::desc("Replace 'mul x, Const' with more effective instructions like " "SHIFT, LEA, etc."), cl::Hidden); static cl::opt ExperimentalUnorderedISEL( "x86-experimental-unordered-atomic-isel", cl::init(false), cl::desc("Use LoadSDNode and StoreSDNode instead of " "AtomicSDNode for unordered atomic loads and " "stores respectively."), cl::Hidden); /// Call this when the user attempts to do something unsupported, like /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike /// report_fatal_error, so calling code should attempt to recover without /// crashing. static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl, const char *Msg) { MachineFunction &MF = DAG.getMachineFunction(); DAG.getContext()->diagnose( DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc())); } X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI) : TargetLowering(TM), Subtarget(STI) { bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87(); X86ScalarSSEf64 = Subtarget.hasSSE2(); X86ScalarSSEf32 = Subtarget.hasSSE1(); MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0)); // Set up the TargetLowering object. // X86 is weird. It always uses i8 for shift amounts and setcc results. setBooleanContents(ZeroOrOneBooleanContent); // X86-SSE is even stranger. It uses -1 or 0 for vector masks. setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); // For 64-bit, since we have so many registers, use the ILP scheduler. // For 32-bit, use the register pressure specific scheduling. // For Atom, always use ILP scheduling. if (Subtarget.isAtom()) setSchedulingPreference(Sched::ILP); else if (Subtarget.is64Bit()) setSchedulingPreference(Sched::ILP); else setSchedulingPreference(Sched::RegPressure); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); // Bypass expensive divides and use cheaper ones. if (TM.getOptLevel() >= CodeGenOpt::Default) { if (Subtarget.hasSlowDivide32()) addBypassSlowDiv(32, 8); if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit()) addBypassSlowDiv(64, 32); } if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) { // Setup Windows compiler runtime calls. setLibcallName(RTLIB::SDIV_I64, "_alldiv"); setLibcallName(RTLIB::UDIV_I64, "_aulldiv"); setLibcallName(RTLIB::SREM_I64, "_allrem"); setLibcallName(RTLIB::UREM_I64, "_aullrem"); setLibcallName(RTLIB::MUL_I64, "_allmul"); setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall); } if (Subtarget.getTargetTriple().isOSMSVCRT()) { // MSVCRT doesn't have powi; fall back to pow setLibcallName(RTLIB::POWI_F32, nullptr); setLibcallName(RTLIB::POWI_F64, nullptr); } // If we don't have cmpxchg8b(meaing this is a 386/486), limit atomic size to // 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b. // FIXME: Should we be limitting the atomic size on other configs? Default is // 1024. if (!Subtarget.hasCmpxchg8b()) setMaxAtomicSizeInBitsSupported(32); // Set up the register classes. addRegisterClass(MVT::i8, &X86::GR8RegClass); addRegisterClass(MVT::i16, &X86::GR16RegClass); addRegisterClass(MVT::i32, &X86::GR32RegClass); if (Subtarget.is64Bit()) addRegisterClass(MVT::i64, &X86::GR64RegClass); for (MVT VT : MVT::integer_valuetypes()) setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); // We don't accept any truncstore of integer registers. setTruncStoreAction(MVT::i64, MVT::i32, Expand); setTruncStoreAction(MVT::i64, MVT::i16, Expand); setTruncStoreAction(MVT::i64, MVT::i8 , Expand); setTruncStoreAction(MVT::i32, MVT::i16, Expand); setTruncStoreAction(MVT::i32, MVT::i8 , Expand); setTruncStoreAction(MVT::i16, MVT::i8, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); // SETOEQ and SETUNE require checking two conditions. setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); // Integer absolute. if (Subtarget.hasCMov()) { setOperationAction(ISD::ABS , MVT::i16 , Custom); setOperationAction(ISD::ABS , MVT::i32 , Custom); } setOperationAction(ISD::ABS , MVT::i64 , Custom); // Funnel shifts. for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) { setOperationAction(ShiftOp , MVT::i16 , Custom); setOperationAction(ShiftOp , MVT::i32 , Custom); if (Subtarget.is64Bit()) setOperationAction(ShiftOp , MVT::i64 , Custom); } if (!Subtarget.useSoftFloat()) { // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this // operation. setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote); setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote); // We have an algorithm for SSE2, and we turn this into a 64-bit // FILD or VCVTUSI2SS/SD for other targets. setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom); // We have an algorithm for SSE2->double, and we turn this into a // 64-bit FILD followed by conditional FADD for other targets. setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom); // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have // this operation. setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote); // SSE has no i16 to fp conversion, only i32. We promote in the handler // to allow f80 to use i16 and f64 to use i16 with sse1 only setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom); // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom); // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 // are Legal, f80 is custom lowered. setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom); // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have // this operation. setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote); // FIXME: This doesn't generate invalid exception when it should. PR44019. setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote); setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom); // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 // are Legal, f80 is custom lowered. setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom); // Handle FP_TO_UINT by promoting the destination to a larger signed // conversion. setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote); // FIXME: This doesn't generate invalid exception when it should. PR44019. setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote); setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote); // FIXME: This doesn't generate invalid exception when it should. PR44019. setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom); } // Handle address space casts between mixed sized pointers. setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom); setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom); // TODO: when we have SSE, these could be more efficient, by using movd/movq. if (!X86ScalarSSEf64) { setOperationAction(ISD::BITCAST , MVT::f32 , Expand); setOperationAction(ISD::BITCAST , MVT::i32 , Expand); if (Subtarget.is64Bit()) { setOperationAction(ISD::BITCAST , MVT::f64 , Expand); // Without SSE, i64->f64 goes through memory. setOperationAction(ISD::BITCAST , MVT::i64 , Expand); } } else if (!Subtarget.is64Bit()) setOperationAction(ISD::BITCAST , MVT::i64 , Custom); // Scalar integer divide and remainder are lowered to use operations that // produce two results, to match the available instructions. This exposes // the two-result form to trivial CSE, which is able to combine x/y and x%y // into a single instruction. // // Scalar integer multiply-high is also lowered to use two-result // operations, to match the available instructions. However, plain multiply // (low) operations are left as Legal, as there are single-result // instructions for this in x86. Using the two-result multiply instructions // when both high and low results are needed must be arranged by dagcombine. for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); } setOperationAction(ISD::BR_JT , MVT::Other, Expand); setOperationAction(ISD::BRCOND , MVT::Other, Custom); for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128, MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::BR_CC, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); } if (Subtarget.is64Bit()) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); setOperationAction(ISD::FREM , MVT::f32 , Expand); setOperationAction(ISD::FREM , MVT::f64 , Expand); setOperationAction(ISD::FREM , MVT::f80 , Expand); setOperationAction(ISD::FREM , MVT::f128 , Expand); setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); // Promote the i8 variants and force them on up to i32 which has a shorter // encoding. setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32); setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32); if (!Subtarget.hasBMI()) { setOperationAction(ISD::CTTZ , MVT::i16 , Custom); setOperationAction(ISD::CTTZ , MVT::i32 , Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal); if (Subtarget.is64Bit()) { setOperationAction(ISD::CTTZ , MVT::i64 , Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal); } } if (Subtarget.hasLZCNT()) { // When promoting the i8 variants, force them to i32 for a shorter // encoding. setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32); setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32); } else { setOperationAction(ISD::CTLZ , MVT::i8 , Custom); setOperationAction(ISD::CTLZ , MVT::i16 , Custom); setOperationAction(ISD::CTLZ , MVT::i32 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom); if (Subtarget.is64Bit()) { setOperationAction(ISD::CTLZ , MVT::i64 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); } } // Special handling for half-precision floating point conversions. // If we don't have F16C support, then lower half float conversions // into library calls. if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) { setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); } // There's never any support for operations beyond MVT::f32. setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand); setOperationAction(ISD::FP16_TO_FP, MVT::f128, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f128, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand); setTruncStoreAction(MVT::f32, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f16, Expand); setTruncStoreAction(MVT::f80, MVT::f16, Expand); setTruncStoreAction(MVT::f128, MVT::f16, Expand); if (Subtarget.hasPOPCNT()) { setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32); } else { setOperationAction(ISD::CTPOP , MVT::i8 , Expand); setOperationAction(ISD::CTPOP , MVT::i16 , Expand); setOperationAction(ISD::CTPOP , MVT::i32 , Expand); if (Subtarget.is64Bit()) setOperationAction(ISD::CTPOP , MVT::i64 , Expand); else setOperationAction(ISD::CTPOP , MVT::i64 , Custom); } setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); if (!Subtarget.hasMOVBE()) setOperationAction(ISD::BSWAP , MVT::i16 , Expand); // X86 wants to expand cmov itself. for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) { setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); } for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); } // Custom action for SELECT MMX and expand action for SELECT_CC MMX setOperationAction(ISD::SELECT, MVT::x86mmx, Custom); setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand); setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since // LLVM/Clang supports zero-cost DWARF and SEH exception handling. setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); if (TM.Options.ExceptionModel == ExceptionHandling::SjLj) setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume"); // Darwin ABI issue. for (auto VT : { MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::ConstantPool , VT, Custom); setOperationAction(ISD::JumpTable , VT, Custom); setOperationAction(ISD::GlobalAddress , VT, Custom); setOperationAction(ISD::GlobalTLSAddress, VT, Custom); setOperationAction(ISD::ExternalSymbol , VT, Custom); setOperationAction(ISD::BlockAddress , VT, Custom); } // 64-bit shl, sra, srl (iff 32-bit x86) for (auto VT : { MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::SHL_PARTS, VT, Custom); setOperationAction(ISD::SRA_PARTS, VT, Custom); setOperationAction(ISD::SRL_PARTS, VT, Custom); } if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow()) setOperationAction(ISD::PREFETCH , MVT::Other, Legal); setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); // Expand certain atomics for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom); setOperationAction(ISD::ATOMIC_STORE, VT, Custom); } if (!Subtarget.is64Bit()) setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom); if (Subtarget.hasCmpxchg16b()) { setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom); } // FIXME - use subtarget debug flags if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() && !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() && TM.Options.ExceptionModel != ExceptionHandling::SjLj) { setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); } setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom); setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom); setOperationAction(ISD::TRAP, MVT::Other, Legal); setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); setOperationAction(ISD::VAEND , MVT::Other, Expand); bool Is64Bit = Subtarget.is64Bit(); setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand); setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand); setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom); // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering. setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom); setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom); if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) { // f32 and f64 use SSE. // Set up the FP register classes. addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass); addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass); // Disable f32->f64 extload as we can only generate this in one instruction // under optsize. So its easier to pattern match (fpext (load)) for that // case instead of needing to emit 2 instructions for extload in the // non-optsize case. setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); for (auto VT : { MVT::f32, MVT::f64 }) { // Use ANDPD to simulate FABS. setOperationAction(ISD::FABS, VT, Custom); // Use XORP to simulate FNEG. setOperationAction(ISD::FNEG, VT, Custom); // Use ANDPD and ORPD to simulate FCOPYSIGN. setOperationAction(ISD::FCOPYSIGN, VT, Custom); // These might be better off as horizontal vector ops. setOperationAction(ISD::FADD, VT, Custom); setOperationAction(ISD::FSUB, VT, Custom); // We don't support sin/cos/fmod setOperationAction(ISD::FSIN , VT, Expand); setOperationAction(ISD::FCOS , VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); } // Lower this to MOVMSK plus an AND. setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); } else if (!useSoftFloat() && X86ScalarSSEf32 && (UseX87 || Is64Bit)) { // Use SSE for f32, x87 for f64. // Set up the FP register classes. addRegisterClass(MVT::f32, &X86::FR32RegClass); if (UseX87) addRegisterClass(MVT::f64, &X86::RFP64RegClass); // Use ANDPS to simulate FABS. setOperationAction(ISD::FABS , MVT::f32, Custom); // Use XORP to simulate FNEG. setOperationAction(ISD::FNEG , MVT::f32, Custom); if (UseX87) setOperationAction(ISD::UNDEF, MVT::f64, Expand); // Use ANDPS and ORPS to simulate FCOPYSIGN. if (UseX87) setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); // We don't support sin/cos/fmod setOperationAction(ISD::FSIN , MVT::f32, Expand); setOperationAction(ISD::FCOS , MVT::f32, Expand); setOperationAction(ISD::FSINCOS, MVT::f32, Expand); if (UseX87) { // Always expand sin/cos functions even though x87 has an instruction. setOperationAction(ISD::FSIN, MVT::f64, Expand); setOperationAction(ISD::FCOS, MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f64, Expand); } } else if (UseX87) { // f32 and f64 in x87. // Set up the FP register classes. addRegisterClass(MVT::f64, &X86::RFP64RegClass); addRegisterClass(MVT::f32, &X86::RFP32RegClass); for (auto VT : { MVT::f32, MVT::f64 }) { setOperationAction(ISD::UNDEF, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Expand); // Always expand sin/cos functions even though x87 has an instruction. setOperationAction(ISD::FSIN , VT, Expand); setOperationAction(ISD::FCOS , VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); } } // Expand FP32 immediates into loads from the stack, save special cases. if (isTypeLegal(MVT::f32)) { if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) { addLegalFPImmediate(APFloat(+0.0f)); // FLD0 addLegalFPImmediate(APFloat(+1.0f)); // FLD1 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS } else // SSE immediates. addLegalFPImmediate(APFloat(+0.0f)); // xorps } // Expand FP64 immediates into loads from the stack, save special cases. if (isTypeLegal(MVT::f64)) { if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) { addLegalFPImmediate(APFloat(+0.0)); // FLD0 addLegalFPImmediate(APFloat(+1.0)); // FLD1 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS } else // SSE immediates. addLegalFPImmediate(APFloat(+0.0)); // xorpd } // Handle constrained floating-point operations of scalar. setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal); setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal); setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal); setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal); // We don't support FMA. setOperationAction(ISD::FMA, MVT::f64, Expand); setOperationAction(ISD::FMA, MVT::f32, Expand); // f80 always uses X87. if (UseX87) { addRegisterClass(MVT::f80, &X86::RFP80RegClass); setOperationAction(ISD::UNDEF, MVT::f80, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); { APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended()); addLegalFPImmediate(TmpFlt); // FLD0 TmpFlt.changeSign(); addLegalFPImmediate(TmpFlt); // FLD0/FCHS bool ignored; APFloat TmpFlt2(+1.0); TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven, &ignored); addLegalFPImmediate(TmpFlt2); // FLD1 TmpFlt2.changeSign(); addLegalFPImmediate(TmpFlt2); // FLD1/FCHS } // Always expand sin/cos functions even though x87 has an instruction. setOperationAction(ISD::FSIN , MVT::f80, Expand); setOperationAction(ISD::FCOS , MVT::f80, Expand); setOperationAction(ISD::FSINCOS, MVT::f80, Expand); setOperationAction(ISD::FFLOOR, MVT::f80, Expand); setOperationAction(ISD::FCEIL, MVT::f80, Expand); setOperationAction(ISD::FTRUNC, MVT::f80, Expand); setOperationAction(ISD::FRINT, MVT::f80, Expand); setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand); setOperationAction(ISD::FMA, MVT::f80, Expand); setOperationAction(ISD::LROUND, MVT::f80, Expand); setOperationAction(ISD::LLROUND, MVT::f80, Expand); setOperationAction(ISD::LRINT, MVT::f80, Expand); setOperationAction(ISD::LLRINT, MVT::f80, Expand); // Handle constrained floating-point operations of scalar. setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal); setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal); setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal); setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal); setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal); // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten // as Custom. setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal); } // f128 uses xmm registers, but most operations require libcalls. if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) { addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps setOperationAction(ISD::FADD, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall); setOperationAction(ISD::FSUB, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall); setOperationAction(ISD::FDIV, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall); setOperationAction(ISD::FMUL, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall); setOperationAction(ISD::FMA, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall); setOperationAction(ISD::FABS, MVT::f128, Custom); setOperationAction(ISD::FNEG, MVT::f128, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom); setOperationAction(ISD::FSIN, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall); setOperationAction(ISD::FCOS, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall); setOperationAction(ISD::FSINCOS, MVT::f128, LibCall); // No STRICT_FSINCOS setOperationAction(ISD::FSQRT, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall); setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom); // We need to custom handle any FP_ROUND with an f128 input, but // LegalizeDAG uses the result type to know when to run a custom handler. // So we have to list all legal floating point result types here. if (isTypeLegal(MVT::f32)) { setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom); } if (isTypeLegal(MVT::f64)) { setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom); } if (isTypeLegal(MVT::f80)) { setOperationAction(ISD::FP_ROUND, MVT::f80, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom); } setOperationAction(ISD::SETCC, MVT::f128, Custom); setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand); setTruncStoreAction(MVT::f128, MVT::f32, Expand); setTruncStoreAction(MVT::f128, MVT::f64, Expand); setTruncStoreAction(MVT::f128, MVT::f80, Expand); } // Always use a library call for pow. setOperationAction(ISD::FPOW , MVT::f32 , Expand); setOperationAction(ISD::FPOW , MVT::f64 , Expand); setOperationAction(ISD::FPOW , MVT::f80 , Expand); setOperationAction(ISD::FPOW , MVT::f128 , Expand); setOperationAction(ISD::FLOG, MVT::f80, Expand); setOperationAction(ISD::FLOG2, MVT::f80, Expand); setOperationAction(ISD::FLOG10, MVT::f80, Expand); setOperationAction(ISD::FEXP, MVT::f80, Expand); setOperationAction(ISD::FEXP2, MVT::f80, Expand); setOperationAction(ISD::FMINNUM, MVT::f80, Expand); setOperationAction(ISD::FMAXNUM, MVT::f80, Expand); // Some FP actions are always expanded for vector types. for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v2f64, MVT::v4f64, MVT::v8f64 }) { setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::FLOG, VT, Expand); setOperationAction(ISD::FLOG2, VT, Expand); setOperationAction(ISD::FLOG10, VT, Expand); setOperationAction(ISD::FEXP, VT, Expand); setOperationAction(ISD::FEXP2, VT, Expand); } // First set operation action for all vector types to either promote // (for widening) or expand (for scalarization). Then we will selectively // turn on ones that can be effectively codegen'd. for (MVT VT : MVT::fixedlen_vector_valuetypes()) { setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand); setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand); setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand); setOperationAction(ISD::FMA, VT, Expand); setOperationAction(ISD::FFLOOR, VT, Expand); setOperationAction(ISD::FCEIL, VT, Expand); setOperationAction(ISD::FTRUNC, VT, Expand); setOperationAction(ISD::FRINT, VT, Expand); setOperationAction(ISD::FNEARBYINT, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); setOperationAction(ISD::CTPOP, VT, Expand); setOperationAction(ISD::CTTZ, VT, Expand); setOperationAction(ISD::CTLZ, VT, Expand); setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); setOperationAction(ISD::SETCC, VT, Expand); setOperationAction(ISD::FP_TO_UINT, VT, Expand); setOperationAction(ISD::FP_TO_SINT, VT, Expand); setOperationAction(ISD::UINT_TO_FP, VT, Expand); setOperationAction(ISD::SINT_TO_FP, VT, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand); setOperationAction(ISD::TRUNCATE, VT, Expand); setOperationAction(ISD::SIGN_EXTEND, VT, Expand); setOperationAction(ISD::ZERO_EXTEND, VT, Expand); setOperationAction(ISD::ANY_EXTEND, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { setTruncStoreAction(InnerVT, VT, Expand); setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand); setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand); // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like // types, we have to deal with them whether we ask for Expansion or not. // Setting Expand causes its own optimisation problems though, so leave // them legal. if (VT.getVectorElementType() == MVT::i1) setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are // split/scalarized right now. if (VT.getVectorElementType() == MVT::f16) setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); } } // FIXME: In order to prevent SSE instructions being expanded to MMX ones // with -msoft-float, disable use of MMX as well. if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) { addRegisterClass(MVT::x86mmx, &X86::VR64RegClass); // No operations on x86mmx supported, everything uses intrinsics. } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) { addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); setOperationAction(ISD::FNEG, MVT::v4f32, Custom); setOperationAction(ISD::FABS, MVT::v4f32, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::SELECT, MVT::v4f32, Custom); setOperationAction(ISD::LOAD, MVT::v2f32, Custom); setOperationAction(ISD::STORE, MVT::v2f32, Custom); setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal); setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal); } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM // registers cannot be used even for integer operations. addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16, MVT::v2i32 }) { setOperationAction(ISD::SDIV, VT, Custom); setOperationAction(ISD::SREM, VT, Custom); setOperationAction(ISD::UDIV, VT, Custom); setOperationAction(ISD::UREM, VT, Custom); } setOperationAction(ISD::MUL, MVT::v2i8, Custom); setOperationAction(ISD::MUL, MVT::v4i8, Custom); setOperationAction(ISD::MUL, MVT::v8i8, Custom); setOperationAction(ISD::MUL, MVT::v16i8, Custom); setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); setOperationAction(ISD::MULHU, MVT::v4i32, Custom); setOperationAction(ISD::MULHS, MVT::v4i32, Custom); setOperationAction(ISD::MULHU, MVT::v16i8, Custom); setOperationAction(ISD::MULHS, MVT::v16i8, Custom); setOperationAction(ISD::MULHU, MVT::v8i16, Legal); setOperationAction(ISD::MULHS, MVT::v8i16, Legal); setOperationAction(ISD::MUL, MVT::v8i16, Legal); setOperationAction(ISD::FNEG, MVT::v2f64, Custom); setOperationAction(ISD::FABS, MVT::v2f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom); for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom); setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom); setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom); setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom); } setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal); setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal); setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal); setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal); setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal); setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal); setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal); setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal); setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom); setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom); setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom); setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::ABS, VT, Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. setCondCodeAction(ISD::SETLT, VT, Custom); setCondCodeAction(ISD::SETLE, VT, Custom); } for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } for (auto VT : { MVT::v2f64, MVT::v2i64 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); if (VT == MVT::v2i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } // Custom lower v2i64 and v2f64 selects. setOperationAction(ISD::SELECT, MVT::v2f64, Custom); setOperationAction(ISD::SELECT, MVT::v2i64, Custom); setOperationAction(ISD::SELECT, MVT::v4i32, Custom); setOperationAction(ISD::SELECT, MVT::v8i16, Custom); setOperationAction(ISD::SELECT, MVT::v16i8, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom); // Custom legalize these to avoid over promotion or custom promotion. for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) { setOperationAction(ISD::FP_TO_SINT, VT, Custom); setOperationAction(ISD::FP_TO_UINT, VT, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom); } setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom); // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion. setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom); setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom); setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom); // We want to legalize this to an f64 load rather than an i64 load on // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for // store. setOperationAction(ISD::LOAD, MVT::v2i32, Custom); setOperationAction(ISD::LOAD, MVT::v4i16, Custom); setOperationAction(ISD::LOAD, MVT::v8i8, Custom); setOperationAction(ISD::STORE, MVT::v2i32, Custom); setOperationAction(ISD::STORE, MVT::v4i16, Custom); setOperationAction(ISD::STORE, MVT::v8i8, Custom); setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); setOperationAction(ISD::BITCAST, MVT::v8i8, Custom); if (!Subtarget.hasAVX512()) setOperationAction(ISD::BITCAST, MVT::v16i1, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom); // In the customized shift lowering, the legal v4i32/v2i64 cases // in AVX2 will be recognized. for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); } setOperationAction(ISD::ROTL, MVT::v4i32, Custom); setOperationAction(ISD::ROTL, MVT::v8i16, Custom); // With AVX512, expanding (and promoting the shifts) is better. if (!Subtarget.hasAVX512()) setOperationAction(ISD::ROTL, MVT::v16i8, Custom); setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal); setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal); setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal); } if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) { setOperationAction(ISD::ABS, MVT::v16i8, Legal); setOperationAction(ISD::ABS, MVT::v8i16, Legal); setOperationAction(ISD::ABS, MVT::v4i32, Legal); setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom); setOperationAction(ISD::CTLZ, MVT::v16i8, Custom); setOperationAction(ISD::CTLZ, MVT::v8i16, Custom); setOperationAction(ISD::CTLZ, MVT::v4i32, Custom); setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); // These might be better off as horizontal vector ops. setOperationAction(ISD::ADD, MVT::i16, Custom); setOperationAction(ISD::ADD, MVT::i32, Custom); setOperationAction(ISD::SUB, MVT::i16, Custom); setOperationAction(ISD::SUB, MVT::i32, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) { for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) { setOperationAction(ISD::FFLOOR, RoundedTy, Legal); setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal); setOperationAction(ISD::FCEIL, RoundedTy, Legal); setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal); setOperationAction(ISD::FTRUNC, RoundedTy, Legal); setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal); setOperationAction(ISD::FRINT, RoundedTy, Legal); setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal); setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal); setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal); } setOperationAction(ISD::SMAX, MVT::v16i8, Legal); setOperationAction(ISD::SMAX, MVT::v4i32, Legal); setOperationAction(ISD::UMAX, MVT::v8i16, Legal); setOperationAction(ISD::UMAX, MVT::v4i32, Legal); setOperationAction(ISD::SMIN, MVT::v16i8, Legal); setOperationAction(ISD::SMIN, MVT::v4i32, Legal); setOperationAction(ISD::UMIN, MVT::v8i16, Legal); setOperationAction(ISD::UMIN, MVT::v4i32, Legal); // FIXME: Do we need to handle scalar-to-vector here? setOperationAction(ISD::MUL, MVT::v4i32, Legal); // We directly match byte blends in the backend as they match the VSELECT // condition form. setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); // SSE41 brings specific instructions for doing vector sign extend even in // cases where we don't have SRA. for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal); } // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal); } // i8 vectors are custom because the source register and source // source memory operand types are not the same width. setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) { // We need to scalarize v4i64->v432 uint_to_fp using cvtsi2ss, but we can // do the pre and post work in the vector domain. setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom); // We need to mark SINT_TO_FP as Custom even though we want to expand it // so that DAG combine doesn't try to turn it into uint_to_fp. setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom); } } if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) { for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) setOperationAction(ISD::ROTL, VT, Custom); // XOP can efficiently perform BITREVERSE with VPPERM. for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) setOperationAction(ISD::BITREVERSE, VT, Custom); for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) setOperationAction(ISD::BITREVERSE, VT, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) { bool HasInt256 = Subtarget.hasInt256(); addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); for (auto VT : { MVT::v8f32, MVT::v4f64 }) { setOperationAction(ISD::FFLOOR, VT, Legal); setOperationAction(ISD::STRICT_FFLOOR, VT, Legal); setOperationAction(ISD::FCEIL, VT, Legal); setOperationAction(ISD::STRICT_FCEIL, VT, Legal); setOperationAction(ISD::FTRUNC, VT, Legal); setOperationAction(ISD::STRICT_FTRUNC, VT, Legal); setOperationAction(ISD::FRINT, VT, Legal); setOperationAction(ISD::STRICT_FRINT, VT, Legal); setOperationAction(ISD::FNEARBYINT, VT, Legal); setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal); setOperationAction(ISD::FNEG, VT, Custom); setOperationAction(ISD::FABS, VT, Custom); setOperationAction(ISD::FCOPYSIGN, VT, Custom); } // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted // even though v8i16 is a legal type. setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32); setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32); setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32); setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal); setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal); setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal); setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal); setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal); if (!Subtarget.hasAVX512()) setOperationAction(ISD::BITCAST, MVT::v32i1, Custom); // In the customized shift lowering, the legal v8i32/v4i64 cases // in AVX2 will be recognized. for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); } // These types need custom splitting if their input is a 128-bit vector. setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ROTL, MVT::v8i32, Custom); setOperationAction(ISD::ROTL, MVT::v16i16, Custom); // With BWI, expanding (and promoting the shifts) is the better. if (!Subtarget.hasBWI()) setOperationAction(ISD::ROTL, MVT::v32i8, Custom); setOperationAction(ISD::SELECT, MVT::v4f64, Custom); setOperationAction(ISD::SELECT, MVT::v4i64, Custom); setOperationAction(ISD::SELECT, MVT::v8i32, Custom); setOperationAction(ISD::SELECT, MVT::v16i16, Custom); setOperationAction(ISD::SELECT, MVT::v32i8, Custom); setOperationAction(ISD::SELECT, MVT::v8f32, Custom); for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::SIGN_EXTEND, VT, Custom); setOperationAction(ISD::ZERO_EXTEND, VT, Custom); setOperationAction(ISD::ANY_EXTEND, VT, Custom); } setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom); setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom); for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTLZ, VT, Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. setCondCodeAction(ISD::SETLT, VT, Custom); setCondCodeAction(ISD::SETLE, VT, Custom); } if (Subtarget.hasAnyFMA()) { for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) { setOperationAction(ISD::FMA, VT, Legal); setOperationAction(ISD::STRICT_FMA, VT, Legal); } } for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom); } setOperationAction(ISD::MUL, MVT::v4i64, Custom); setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom); setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::MUL, MVT::v32i8, Custom); setOperationAction(ISD::MULHU, MVT::v8i32, Custom); setOperationAction(ISD::MULHS, MVT::v8i32, Custom); setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::MULHU, MVT::v32i8, Custom); setOperationAction(ISD::MULHS, MVT::v32i8, Custom); setOperationAction(ISD::ABS, MVT::v4i64, Custom); setOperationAction(ISD::SMAX, MVT::v4i64, Custom); setOperationAction(ISD::UMAX, MVT::v4i64, Custom); setOperationAction(ISD::SMIN, MVT::v4i64, Custom); setOperationAction(ISD::UMIN, MVT::v4i64, Custom); setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom); setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom); setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom); setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom); setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom); for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) { setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom); } for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom); } if (HasInt256) { // The custom lowering for UINT_TO_FP for v8i32 becomes interesting // when we have a 256bit-wide blend with immediate. setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom); // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal); setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal); setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal); setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal); setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal); } } for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) { setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::MSTORE, VT, Legal); } // Extract subvector is special because the value type // (result) is 128-bit but the source is 256-bit wide. for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v4f32, MVT::v2f64 }) { setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); } // Custom lower several nodes for 256-bit types. for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, MVT::v8f32, MVT::v4f64 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::STORE, VT, Custom); } if (HasInt256) { setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); // Custom legalize 2x32 to get a little better code. setOperationAction(ISD::MGATHER, MVT::v2f32, Custom); setOperationAction(ISD::MGATHER, MVT::v2i32, Custom); for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) setOperationAction(ISD::MGATHER, VT, Custom); } } // This block controls legalization of the mask vector sizes that are // available with AVX512. 512-bit vectors are in a separate block controlled // by useAVX512Regs. if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { addRegisterClass(MVT::v1i1, &X86::VK1RegClass); addRegisterClass(MVT::v2i1, &X86::VK2RegClass); addRegisterClass(MVT::v4i1, &X86::VK4RegClass); addRegisterClass(MVT::v8i1, &X86::VK8RegClass); addRegisterClass(MVT::v16i1, &X86::VK16RegClass); setOperationAction(ISD::SELECT, MVT::v1i1, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom); setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32); setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32); setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32); setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32); setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32); setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32); setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom); // There is no byte sized k-register load or store without AVX512DQ. if (!Subtarget.hasDQI()) { setOperationAction(ISD::LOAD, MVT::v1i1, Custom); setOperationAction(ISD::LOAD, MVT::v2i1, Custom); setOperationAction(ISD::LOAD, MVT::v4i1, Custom); setOperationAction(ISD::LOAD, MVT::v8i1, Custom); setOperationAction(ISD::STORE, MVT::v1i1, Custom); setOperationAction(ISD::STORE, MVT::v2i1, Custom); setOperationAction(ISD::STORE, MVT::v4i1, Custom); setOperationAction(ISD::STORE, MVT::v8i1, Custom); } // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors. for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SIGN_EXTEND, VT, Custom); setOperationAction(ISD::ZERO_EXTEND, VT, Custom); setOperationAction(ISD::ANY_EXTEND, VT, Custom); } for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) { setOperationAction(ISD::ADD, VT, Custom); setOperationAction(ISD::SUB, VT, Custom); setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::TRUNCATE, VT, Custom); setOperationAction(ISD::UADDSAT, VT, Custom); setOperationAction(ISD::SADDSAT, VT, Custom); setOperationAction(ISD::USUBSAT, VT, Custom); setOperationAction(ISD::SSUBSAT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Expand); } for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); } // This block controls legalization for 512-bit operations with 32/64 bit // elements. 512-bits can be disabled based on prefer-vector-width and // required-vector-width function attributes. if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) { addRegisterClass(MVT::v16i32, &X86::VR512RegClass); addRegisterClass(MVT::v16f32, &X86::VR512RegClass); addRegisterClass(MVT::v8i64, &X86::VR512RegClass); addRegisterClass(MVT::v8f64, &X86::VR512RegClass); for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) { setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal); setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal); setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal); setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal); setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal); } for (MVT VT : { MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::FNEG, VT, Custom); setOperationAction(ISD::FABS, VT, Custom); setOperationAction(ISD::FMA, VT, Legal); setOperationAction(ISD::STRICT_FMA, VT, Legal); setOperationAction(ISD::FCOPYSIGN, VT, Custom); } for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) { setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32); setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32); setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32); setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32); } setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal); setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal); setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal); setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal); setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal); setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal); setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal); // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE // to 512-bit rather than use the AVX2 instructions so that we can use // k-masks. if (!Subtarget.hasVLX()) { for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) { setOperationAction(ISD::MLOAD, VT, Custom); setOperationAction(ISD::MSTORE, VT, Custom); } } setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); // Need to custom widen this if we don't have AVX512BW. setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom); for (auto VT : { MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::FFLOOR, VT, Legal); setOperationAction(ISD::STRICT_FFLOOR, VT, Legal); setOperationAction(ISD::FCEIL, VT, Legal); setOperationAction(ISD::STRICT_FCEIL, VT, Legal); setOperationAction(ISD::FTRUNC, VT, Legal); setOperationAction(ISD::STRICT_FTRUNC, VT, Legal); setOperationAction(ISD::FRINT, VT, Legal); setOperationAction(ISD::STRICT_FRINT, VT, Legal); setOperationAction(ISD::FNEARBYINT, VT, Legal); setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal); setOperationAction(ISD::SELECT, VT, Custom); } // Without BWI we need to use custom lowering to handle MVT::v64i8 input. for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v64i8}) { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom); } setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom); setOperationAction(ISD::MUL, MVT::v8i64, Custom); setOperationAction(ISD::MUL, MVT::v16i32, Legal); setOperationAction(ISD::MULHU, MVT::v16i32, Custom); setOperationAction(ISD::MULHS, MVT::v16i32, Custom); for (auto VT : { MVT::v16i32, MVT::v8i64 }) { setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); setOperationAction(ISD::SMIN, VT, Legal); setOperationAction(ISD::UMIN, VT, Legal); setOperationAction(ISD::ABS, VT, Legal); setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. setCondCodeAction(ISD::SETLT, VT, Custom); setCondCodeAction(ISD::SETLE, VT, Custom); } if (Subtarget.hasDQI()) { setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal); setOperationAction(ISD::MUL, MVT::v8i64, Legal); } if (Subtarget.hasCDI()) { // NonVLX sub-targets extend 128/256 vectors to use the 512 version. for (auto VT : { MVT::v16i32, MVT::v8i64} ) { setOperationAction(ISD::CTLZ, VT, Legal); } } // Subtarget.hasCDI() if (Subtarget.hasVPOPCNTDQ()) { for (auto VT : { MVT::v16i32, MVT::v8i64 }) setOperationAction(ISD::CTPOP, VT, Legal); } // Extract subvector is special because the value type // (result) is 256-bit but the source is 512-bit wide. // 128-bit was made Legal under AVX1. for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, MVT::v8f32, MVT::v4f64 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); } if (!Subtarget.hasBWI()) { // Need to custom split v32i16/v64i8 bitcasts. setOperationAction(ISD::BITCAST, MVT::v32i16, Custom); setOperationAction(ISD::BITCAST, MVT::v64i8, Custom); // Better to split these into two 256-bit ops. setOperationAction(ISD::BITREVERSE, MVT::v8i64, Custom); setOperationAction(ISD::BITREVERSE, MVT::v16i32, Custom); } if (Subtarget.hasVBMI2()) { for (auto VT : { MVT::v16i32, MVT::v8i64 }) { setOperationAction(ISD::FSHL, VT, Custom); setOperationAction(ISD::FSHR, VT, Custom); } } }// has AVX-512 // This block controls legalization for operations that don't have // pre-AVX512 equivalents. Without VLX we use 512-bit operations for // narrower widths. if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { // These operations are handled on non-VLX by artificially widening in // isel patterns. setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Subtarget.hasVLX() ? Legal : Custom); for (auto VT : { MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); setOperationAction(ISD::SMIN, VT, Legal); setOperationAction(ISD::UMIN, VT, Legal); setOperationAction(ISD::ABS, VT, Legal); } for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); } // Custom legalize 2x32 to get a little better code. setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom); setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom); for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) setOperationAction(ISD::MSCATTER, VT, Custom); if (Subtarget.hasDQI()) { for (auto VT : { MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::SINT_TO_FP, VT, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::UINT_TO_FP, VT, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, VT, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, VT, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::FP_TO_SINT, VT, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::FP_TO_UINT, VT, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::MUL, VT, Legal); } } if (Subtarget.hasCDI()) { for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::CTLZ, VT, Legal); } } // Subtarget.hasCDI() if (Subtarget.hasVPOPCNTDQ()) { for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) setOperationAction(ISD::CTPOP, VT, Legal); } } // This block control legalization of v32i1/v64i1 which are available with // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with // useBWIRegs. if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) { addRegisterClass(MVT::v32i1, &X86::VK32RegClass); addRegisterClass(MVT::v64i1, &X86::VK64RegClass); for (auto VT : { MVT::v32i1, MVT::v64i1 }) { setOperationAction(ISD::ADD, VT, Custom); setOperationAction(ISD::SUB, VT, Custom); setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::UADDSAT, VT, Custom); setOperationAction(ISD::SADDSAT, VT, Custom); setOperationAction(ISD::USUBSAT, VT, Custom); setOperationAction(ISD::SSUBSAT, VT, Custom); setOperationAction(ISD::TRUNCATE, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); } setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); for (auto VT : { MVT::v16i1, MVT::v32i1 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); // Extends from v32i1 masks to 256-bit vectors. setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom); } // This block controls legalization for v32i16 and v64i8. 512-bits can be // disabled based on prefer-vector-width and required-vector-width function // attributes. if (!Subtarget.useSoftFloat() && Subtarget.useBWIRegs()) { addRegisterClass(MVT::v32i16, &X86::VR512RegClass); addRegisterClass(MVT::v64i8, &X86::VR512RegClass); // Extends from v64i1 masks to 512-bit vectors. setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::MUL, MVT::v32i16, Legal); setOperationAction(ISD::MUL, MVT::v64i8, Custom); setOperationAction(ISD::MULHS, MVT::v32i16, Legal); setOperationAction(ISD::MULHU, MVT::v32i16, Legal); setOperationAction(ISD::MULHS, MVT::v64i8, Custom); setOperationAction(ISD::MULHU, MVT::v64i8, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v32i16, Custom); setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); for (auto VT : { MVT::v64i8, MVT::v32i16 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::ABS, VT, Legal); setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTLZ, VT, Custom); setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); setOperationAction(ISD::SMIN, VT, Legal); setOperationAction(ISD::UMIN, VT, Legal); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::UADDSAT, VT, Legal); setOperationAction(ISD::SADDSAT, VT, Legal); setOperationAction(ISD::USUBSAT, VT, Legal); setOperationAction(ISD::SSUBSAT, VT, Legal); setOperationAction(ISD::SELECT, VT, Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. setCondCodeAction(ISD::SETLT, VT, Custom); setCondCodeAction(ISD::SETLE, VT, Custom); } for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) { setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal); } if (Subtarget.hasBITALG()) { for (auto VT : { MVT::v64i8, MVT::v32i16 }) setOperationAction(ISD::CTPOP, VT, Legal); } if (Subtarget.hasVBMI2()) { setOperationAction(ISD::FSHL, MVT::v32i16, Custom); setOperationAction(ISD::FSHR, MVT::v32i16, Custom); } } if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) { for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) { setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom); } // These operations are handled on non-VLX by artificially widening in // isel patterns. // TODO: Custom widen in lowering on non-VLX and drop the isel patterns? if (Subtarget.hasBITALG()) { for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 }) setOperationAction(ISD::CTPOP, VT, Legal); } } if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) { setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal); setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal); setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal); setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal); setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal); setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal); setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal); setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal); setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); if (Subtarget.hasDQI()) { // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion. // v2f32 UINT_TO_FP is already custom under SSE2. assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) && isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) && "Unexpected operation action!"); // v2i64 FP_TO_S/UINT(v2f32) custom conversion. setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom); } if (Subtarget.hasBWI()) { setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal); setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); } if (Subtarget.hasVBMI2()) { // TODO: Make these legal even without VLX? for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::FSHL, VT, Custom); setOperationAction(ISD::FSHR, VT, Custom); } } setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom); } // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); if (!Subtarget.is64Bit()) { setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); } // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't // handle type legalization for these operations here. // // FIXME: We really should do custom legalization for addition and // subtraction on x86-32 once PR3203 is fixed. We really can't do much better // than generic legalization for 64-bit multiplication-with-overflow, though. for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; // Add/Sub/Mul with overflow operations are custom lowered. setOperationAction(ISD::SADDO, VT, Custom); setOperationAction(ISD::UADDO, VT, Custom); setOperationAction(ISD::SSUBO, VT, Custom); setOperationAction(ISD::USUBO, VT, Custom); setOperationAction(ISD::SMULO, VT, Custom); setOperationAction(ISD::UMULO, VT, Custom); // Support carry in as value rather than glue. setOperationAction(ISD::ADDCARRY, VT, Custom); setOperationAction(ISD::SUBCARRY, VT, Custom); setOperationAction(ISD::SETCCCARRY, VT, Custom); } if (!Subtarget.is64Bit()) { // These libcalls are not available in 32-bit. setLibcallName(RTLIB::SHL_I128, nullptr); setLibcallName(RTLIB::SRL_I128, nullptr); setLibcallName(RTLIB::SRA_I128, nullptr); setLibcallName(RTLIB::MUL_I128, nullptr); } // Combine sin / cos into _sincos_stret if it is available. if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { setOperationAction(ISD::FSINCOS, MVT::f64, Custom); setOperationAction(ISD::FSINCOS, MVT::f32, Custom); } if (Subtarget.isTargetWin64()) { setOperationAction(ISD::SDIV, MVT::i128, Custom); setOperationAction(ISD::UDIV, MVT::i128, Custom); setOperationAction(ISD::SREM, MVT::i128, Custom); setOperationAction(ISD::UREM, MVT::i128, Custom); setOperationAction(ISD::SDIVREM, MVT::i128, Custom); setOperationAction(ISD::UDIVREM, MVT::i128, Custom); } // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)` // is. We should promote the value to 64-bits to solve this. // This is what the CRT headers do - `fmodf` is an inline header // function casting to f64 and calling `fmod`. if (Subtarget.is32Bit() && (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium())) for (ISD::NodeType Op : {ISD::FCEIL, ISD::STRICT_FCEIL, ISD::FCOS, ISD::STRICT_FCOS, ISD::FEXP, ISD::STRICT_FEXP, ISD::FFLOOR, ISD::STRICT_FFLOOR, ISD::FREM, ISD::STRICT_FREM, ISD::FLOG, ISD::STRICT_FLOG, ISD::FLOG10, ISD::STRICT_FLOG10, ISD::FPOW, ISD::STRICT_FPOW, ISD::FSIN, ISD::STRICT_FSIN}) if (isOperationExpand(Op, MVT::f32)) setOperationAction(Op, MVT::f32, Promote); // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); setTargetDAGCombine(ISD::CONCAT_VECTORS); setTargetDAGCombine(ISD::INSERT_SUBVECTOR); setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR); setTargetDAGCombine(ISD::BITCAST); setTargetDAGCombine(ISD::VSELECT); setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::SRA); setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FNEG); setTargetDAGCombine(ISD::FMA); setTargetDAGCombine(ISD::FMINNUM); setTargetDAGCombine(ISD::FMAXNUM); setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::MLOAD); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::MSTORE); setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG); setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG); setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::STRICT_SINT_TO_FP); setTargetDAGCombine(ISD::STRICT_UINT_TO_FP); setTargetDAGCombine(ISD::SETCC); setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::XOR); setTargetDAGCombine(ISD::MSCATTER); setTargetDAGCombine(ISD::MGATHER); computeRegisterProperties(Subtarget.getRegisterInfo()); MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores MaxStoresPerMemcpyOptSize = 4; MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores MaxStoresPerMemmoveOptSize = 4; // TODO: These control memcmp expansion in CGP and could be raised higher, but // that needs to benchmarked and balanced with the potential use of vector // load/store types (PR33329, PR33914). MaxLoadsPerMemcmp = 2; MaxLoadsPerMemcmpOptSize = 2; // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4). setPrefLoopAlignment(Align(1ULL << ExperimentalPrefLoopAlignment)); // An out-of-order CPU can speculatively execute past a predictable branch, // but a conditional move could be stalled by an expensive earlier operation. PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder(); EnableExtLdPromotion = true; setPrefFunctionAlignment(Align(16)); verifyIntrinsicTables(); // Default to having -disable-strictnode-mutation on IsStrictFPEnabled = true; } // This has so far only been implemented for 64-bit MachO. bool X86TargetLowering::useLoadStackGuardNode() const { return Subtarget.isTargetMachO() && Subtarget.is64Bit(); } bool X86TargetLowering::useStackGuardXorFP() const { // Currently only MSVC CRTs XOR the frame pointer into the stack guard value. return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO(); } SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const { EVT PtrTy = getPointerTy(DAG.getDataLayout()); unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP; MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val); return SDValue(Node, 0); } TargetLoweringBase::LegalizeTypeAction X86TargetLowering::getPreferredVectorAction(MVT VT) const { if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return TypeSplitVector; if (VT.getVectorNumElements() != 1 && VT.getVectorElementType() != MVT::i1) return TypeWidenVector; return TargetLoweringBase::getPreferredVectorAction(VT); } MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { // v32i1 vectors should be promoted to v32i8 to match avx2. if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return MVT::v32i8; // Break wide or odd vXi1 vectors into scalars to match avx2 behavior. if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512() && (!isPowerOf2_32(VT.getVectorNumElements()) || (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) || (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) return MVT::i8; // Split v64i1 vectors if we don't have v64i8 available. if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() && CC != CallingConv::X86_RegCall) return MVT::v32i1; // FIXME: Should we just make these types legal and custom split operations? if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI && Subtarget.useAVX512Regs() && !Subtarget.hasBWI()) return MVT::v16i32; return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); } unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { // v32i1 vectors should be promoted to v32i8 to match avx2. if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return 1; // Break wide or odd vXi1 vectors into scalars to match avx2 behavior. if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512() && (!isPowerOf2_32(VT.getVectorNumElements()) || (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) || (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) return VT.getVectorNumElements(); // Split v64i1 vectors if we don't have v64i8 available. if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() && CC != CallingConv::X86_RegCall) return 2; // FIXME: Should we just make these types legal and custom split operations? if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI && Subtarget.useAVX512Regs() && !Subtarget.hasBWI()) return 1; return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); } unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv( LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const { // Break wide or odd vXi1 vectors into scalars to match avx2 behavior. if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512() && (!isPowerOf2_32(VT.getVectorNumElements()) || (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) || (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) { RegisterVT = MVT::i8; IntermediateVT = MVT::i1; NumIntermediates = VT.getVectorNumElements(); return NumIntermediates; } // Split v64i1 vectors if we don't have v64i8 available. if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() && CC != CallingConv::X86_RegCall) { RegisterVT = MVT::v32i1; IntermediateVT = MVT::v32i1; NumIntermediates = 2; return 2; } return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT); } EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext& Context, EVT VT) const { if (!VT.isVector()) return MVT::i8; if (Subtarget.hasAVX512()) { const unsigned NumElts = VT.getVectorNumElements(); // Figure out what this type will be legalized to. EVT LegalVT = VT; while (getTypeAction(Context, LegalVT) != TypeLegal) LegalVT = getTypeToTransformTo(Context, LegalVT); // If we got a 512-bit vector then we'll definitely have a vXi1 compare. if (LegalVT.getSimpleVT().is512BitVector()) return EVT::getVectorVT(Context, MVT::i1, NumElts); if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) { // If we legalized to less than a 512-bit vector, then we will use a vXi1 // compare for vXi32/vXi64 for sure. If we have BWI we will also support // vXi16/vXi8. MVT EltVT = LegalVT.getSimpleVT().getVectorElementType(); if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32) return EVT::getVectorVT(Context, MVT::i1, NumElts); } } return VT.changeVectorElementTypeToInteger(); } /// Helper for getByValTypeAlignment to determine /// the desired ByVal argument alignment. static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { if (MaxAlign == 16) return; if (VectorType *VTy = dyn_cast(Ty)) { if (VTy->getBitWidth() == 128) MaxAlign = 16; } else if (ArrayType *ATy = dyn_cast(Ty)) { unsigned EltAlign = 0; getMaxByValAlign(ATy->getElementType(), EltAlign); if (EltAlign > MaxAlign) MaxAlign = EltAlign; } else if (StructType *STy = dyn_cast(Ty)) { for (auto *EltTy : STy->elements()) { unsigned EltAlign = 0; getMaxByValAlign(EltTy, EltAlign); if (EltAlign > MaxAlign) MaxAlign = EltAlign; if (MaxAlign == 16) break; } } } /// Return the desired alignment for ByVal aggregate /// function arguments in the caller parameter area. For X86, aggregates /// that contain SSE vectors are placed at 16-byte boundaries while the rest /// are at 4-byte boundaries. unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty, const DataLayout &DL) const { if (Subtarget.is64Bit()) { // Max of 8 and alignment of type. unsigned TyAlign = DL.getABITypeAlignment(Ty); if (TyAlign > 8) return TyAlign; return 8; } unsigned Align = 4; if (Subtarget.hasSSE1()) getMaxByValAlign(Ty, Align); return Align; } /// Returns the target specific optimal type for load /// and store operations as a result of memset, memcpy, and memmove /// lowering. If DstAlign is zero that means it's safe to destination /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it /// means there isn't a need to check it against alignment requirement, /// probably because the source does not need to be loaded. If 'IsMemset' is /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy /// source is constant so it does not need to be loaded. /// It returns EVT::Other if the type should be determined using generic /// target-independent logic. /// For vector ops we check that the overall size isn't larger than our /// preferred vector width. EVT X86TargetLowering::getOptimalMemOpType( uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, const AttributeList &FuncAttributes) const { if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) { if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() || ((DstAlign == 0 || DstAlign >= 16) && (SrcAlign == 0 || SrcAlign >= 16)))) { // FIXME: Check if unaligned 64-byte accesses are slow. if (Size >= 64 && Subtarget.hasAVX512() && (Subtarget.getPreferVectorWidth() >= 512)) { return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32; } // FIXME: Check if unaligned 32-byte accesses are slow. if (Size >= 32 && Subtarget.hasAVX() && (Subtarget.getPreferVectorWidth() >= 256)) { // Although this isn't a well-supported type for AVX1, we'll let // legalization and shuffle lowering produce the optimal codegen. If we // choose an optimal type with a vector element larger than a byte, // getMemsetStores() may create an intermediate splat (using an integer // multiply) before we splat as a vector. return MVT::v32i8; } if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128)) return MVT::v16i8; // TODO: Can SSE1 handle a byte vector? // If we have SSE1 registers we should be able to use them. if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) && (Subtarget.getPreferVectorWidth() >= 128)) return MVT::v4f32; } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) { // Do not use f64 to lower memcpy if source is string constant. It's // better to use i32 to avoid the loads. // Also, do not use f64 to lower memset unless this is a memset of zeros. // The gymnastics of splatting a byte value into an XMM register and then // only using 8-byte stores (because this is a CPU with slow unaligned // 16-byte accesses) makes that a loser. return MVT::f64; } } // This is a compromise. If we reach here, unaligned accesses may be slow on // this target. However, creating smaller, aligned accesses could be even // slower and would certainly be a lot more code. if (Subtarget.is64Bit() && Size >= 8) return MVT::i64; return MVT::i32; } bool X86TargetLowering::isSafeMemOpType(MVT VT) const { if (VT == MVT::f32) return X86ScalarSSEf32; else if (VT == MVT::f64) return X86ScalarSSEf64; return true; } bool X86TargetLowering::allowsMisalignedMemoryAccesses( EVT VT, unsigned, unsigned Align, MachineMemOperand::Flags Flags, bool *Fast) const { if (Fast) { switch (VT.getSizeInBits()) { default: // 8-byte and under are always assumed to be fast. *Fast = true; break; case 128: *Fast = !Subtarget.isUnalignedMem16Slow(); break; case 256: *Fast = !Subtarget.isUnalignedMem32Slow(); break; // TODO: What about AVX-512 (512-bit) accesses? } } // NonTemporal vector memory ops must be aligned. if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) { // NT loads can only be vector aligned, so if its less aligned than the // minimum vector size (which we can split the vector down to), we might as // well use a regular unaligned vector load. // We don't have any NT loads pre-SSE41. if (!!(Flags & MachineMemOperand::MOLoad)) return (Align < 16 || !Subtarget.hasSSE41()); return false; } // Misaligned accesses of any size are always allowed. return true; } /// Return the entry encoding for a jump table in the /// current function. The returned value is a member of the /// MachineJumpTableInfo::JTEntryKind enum. unsigned X86TargetLowering::getJumpTableEncoding() const { // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF // symbol. if (isPositionIndependent() && Subtarget.isPICStyleGOT()) return MachineJumpTableInfo::EK_Custom32; // Otherwise, use the normal jump table encoding heuristics. return TargetLowering::getJumpTableEncoding(); } bool X86TargetLowering::useSoftFloat() const { return Subtarget.useSoftFloat(); } void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC, ArgListTy &Args) const { // Only relabel X86-32 for C / Stdcall CCs. if (Subtarget.is64Bit()) return; if (CC != CallingConv::C && CC != CallingConv::X86_StdCall) return; unsigned ParamRegs = 0; if (auto *M = MF->getFunction().getParent()) ParamRegs = M->getNumberRegisterParameters(); // Mark the first N int arguments as having reg for (unsigned Idx = 0; Idx < Args.size(); Idx++) { Type *T = Args[Idx].Ty; if (T->isIntOrPtrTy()) if (MF->getDataLayout().getTypeAllocSize(T) <= 8) { unsigned numRegs = 1; if (MF->getDataLayout().getTypeAllocSize(T) > 4) numRegs = 2; if (ParamRegs < numRegs) return; ParamRegs -= numRegs; Args[Idx].IsInReg = true; } } } const MCExpr * X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB, unsigned uid,MCContext &Ctx) const{ assert(isPositionIndependent() && Subtarget.isPICStyleGOT()); // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF // entries. return MCSymbolRefExpr::create(MBB->getSymbol(), MCSymbolRefExpr::VK_GOTOFF, Ctx); } /// Returns relocation base for the given PIC jumptable. SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG) const { if (!Subtarget.is64Bit()) // This doesn't have SDLoc associated with it, but is not really the // same as a Register. return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy(DAG.getDataLayout())); return Table; } /// This returns the relocation base for the given PIC jumptable, /// the same as getPICJumpTableRelocBase, but as an MCExpr. const MCExpr *X86TargetLowering:: getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, MCContext &Ctx) const { // X86-64 uses RIP relative addressing based on the jump table label. if (Subtarget.isPICStyleRIPRel()) return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); // Otherwise, the reference is relative to the PIC base. return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx); } std::pair X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const { const TargetRegisterClass *RRC = nullptr; uint8_t Cost = 1; switch (VT.SimpleTy) { default: return TargetLowering::findRepresentativeClass(TRI, VT); case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass; break; case MVT::x86mmx: RRC = &X86::VR64RegClass; break; case MVT::f32: case MVT::f64: case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: case MVT::v4f32: case MVT::v2f64: case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: case MVT::v4f64: case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64: case MVT::v16f32: case MVT::v8f64: RRC = &X86::VR128XRegClass; break; } return std::make_pair(RRC, Cost); } unsigned X86TargetLowering::getAddressSpace() const { if (Subtarget.is64Bit()) return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257; return 256; } static bool hasStackGuardSlotTLS(const Triple &TargetTriple) { return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() || (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17)); } static Constant* SegmentOffset(IRBuilder<> &IRB, unsigned Offset, unsigned AddressSpace) { return ConstantExpr::getIntToPtr( ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset), Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace)); } Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { // glibc, bionic, and Fuchsia have a special slot for the stack guard in // tcbhead_t; use it instead of the usual global variable (see // sysdeps/{i386,x86_64}/nptl/tls.h) if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) { if (Subtarget.isTargetFuchsia()) { // defines ZX_TLS_STACK_GUARD_OFFSET with this value. return SegmentOffset(IRB, 0x10, getAddressSpace()); } else { // %fs:0x28, unless we're using a Kernel code model, in which case // it's %gs:0x28. gs:0x14 on i386. unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14; return SegmentOffset(IRB, Offset, getAddressSpace()); } } return TargetLowering::getIRStackGuard(IRB); } void X86TargetLowering::insertSSPDeclarations(Module &M) const { // MSVC CRT provides functionalities for stack protection. if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) { // MSVC CRT has a global variable holding security cookie. M.getOrInsertGlobal("__security_cookie", Type::getInt8PtrTy(M.getContext())); // MSVC CRT has a function to validate security cookie. FunctionCallee SecurityCheckCookie = M.getOrInsertFunction( "__security_check_cookie", Type::getVoidTy(M.getContext()), Type::getInt8PtrTy(M.getContext())); if (Function *F = dyn_cast(SecurityCheckCookie.getCallee())) { F->setCallingConv(CallingConv::X86_FastCall); F->addAttribute(1, Attribute::AttrKind::InReg); } return; } // glibc, bionic, and Fuchsia have a special slot for the stack guard. if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) return; TargetLowering::insertSSPDeclarations(M); } Value *X86TargetLowering::getSDagStackGuard(const Module &M) const { // MSVC CRT has a global variable holding security cookie. if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) { return M.getGlobalVariable("__security_cookie"); } return TargetLowering::getSDagStackGuard(M); } Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const { // MSVC CRT has a function to validate security cookie. if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) { return M.getFunction("__security_check_cookie"); } return TargetLowering::getSSPStackGuardCheck(M); } Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { if (Subtarget.getTargetTriple().isOSContiki()) return getDefaultSafeStackPointerLocation(IRB, false); // Android provides a fixed TLS slot for the SafeStack pointer. See the // definition of TLS_SLOT_SAFESTACK in // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h if (Subtarget.isTargetAndroid()) { // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs: // %gs:0x24 on i386 unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24; return SegmentOffset(IRB, Offset, getAddressSpace()); } // Fuchsia is similar. if (Subtarget.isTargetFuchsia()) { // defines ZX_TLS_UNSAFE_SP_OFFSET with this value. return SegmentOffset(IRB, 0x18, getAddressSpace()); } return TargetLowering::getSafeStackPointerLocation(IRB); } bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { assert(SrcAS != DestAS && "Expected different address spaces!"); const TargetMachine &TM = getTargetMachine(); if (TM.getPointerSize(SrcAS) != TM.getPointerSize(DestAS)) return false; return SrcAS < 256 && DestAS < 256; } //===----------------------------------------------------------------------===// // Return Value Calling Convention Implementation //===----------------------------------------------------------------------===// bool X86TargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl &Outs, LLVMContext &Context) const { SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC_X86); } const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const { static const MCPhysReg ScratchRegs[] = { X86::R11, 0 }; return ScratchRegs; } /// Lowers masks values (v*i1) to the local register values /// \returns DAG node after lowering to register type static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc, const SDLoc &Dl, SelectionDAG &DAG) { EVT ValVT = ValArg.getValueType(); if (ValVT == MVT::v1i1) return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg, DAG.getIntPtrConstant(0, Dl)); if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) || (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) { // Two stage lowering might be required // bitcast: v8i1 -> i8 / v16i1 -> i16 // anyextend: i8 -> i32 / i16 -> i32 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16; SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg); if (ValLoc == MVT::i32) ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy); return ValToCopy; } if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) || (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) { // One stage lowering is required // bitcast: v32i1 -> i32 / v64i1 -> i64 return DAG.getBitcast(ValLoc, ValArg); } return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg); } /// Breaks v64i1 value into two registers and adds the new node to the DAG static void Passv64i1ArgInRegs( const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg, SmallVectorImpl> &RegsToPass, CCValAssign &VA, CCValAssign &NextVA, const X86Subtarget &Subtarget) { assert(Subtarget.hasBWI() && "Expected AVX512BW target!"); assert(Subtarget.is32Bit() && "Expecting 32 bit target"); assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value"); assert(VA.isRegLoc() && NextVA.isRegLoc() && "The value should reside in two registers"); // Before splitting the value we cast it to i64 Arg = DAG.getBitcast(MVT::i64, Arg); // Splitting the value into two i32 types SDValue Lo, Hi; Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg, DAG.getConstant(0, Dl, MVT::i32)); Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg, DAG.getConstant(1, Dl, MVT::i32)); // Attach the two i32 types into corresponding registers RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo)); RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi)); } SDValue X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &dl, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); // In some cases we need to disable registers from the default CSR list. // For example, when they are used for argument passing. bool ShouldDisableCalleeSavedRegister = CallConv == CallingConv::X86_RegCall || MF.getFunction().hasFnAttribute("no_caller_saved_registers"); if (CallConv == CallingConv::X86_INTR && !Outs.empty()) report_fatal_error("X86 interrupts may not return any value"); SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); SDValue Flag; SmallVector RetOps; RetOps.push_back(Chain); // Operand #0 = Chain (updated below) // Operand #1 = Bytes To Pop RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl, MVT::i32)); // Copy the result values into the output registers. for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E; ++I, ++OutsIndex) { CCValAssign &VA = RVLocs[I]; assert(VA.isRegLoc() && "Can only return in registers!"); // Add the register to the CalleeSaveDisableRegs list. if (ShouldDisableCalleeSavedRegister) MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg()); SDValue ValToCopy = OutVals[OutsIndex]; EVT ValVT = ValToCopy.getValueType(); // Promote values to the appropriate types. if (VA.getLocInfo() == CCValAssign::SExt) ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); else if (VA.getLocInfo() == CCValAssign::ZExt) ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); else if (VA.getLocInfo() == CCValAssign::AExt) { if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1) ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG); else ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); } else if (VA.getLocInfo() == CCValAssign::BCvt) ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy); assert(VA.getLocInfo() != CCValAssign::FPExt && "Unexpected FP-extend for return value."); // Report an error if we have attempted to return a value via an XMM // register and SSE was disabled. if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) { errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. } else if (!Subtarget.hasSSE2() && X86::FR64XRegClass.contains(VA.getLocReg()) && ValVT == MVT::f64) { // When returning a double via an XMM register, report an error if SSE2 is // not enabled. errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled"); VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. } // Returns in ST0/ST1 are handled specially: these are pushed as operands to // the RET instruction and handled by the FP Stackifier. if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) { // If this is a copy from an xmm register to ST(0), use an FPExtend to // change the value to the FP stack register class. if (isScalarFPTypeInSSEReg(VA.getValVT())) ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); RetOps.push_back(ValToCopy); // Don't emit a copytoreg. continue; } // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 // which is returned in RAX / RDX. if (Subtarget.is64Bit()) { if (ValVT == MVT::x86mmx) { if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy); ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, ValToCopy); // If we don't have SSE2 available, convert to v4f32 so the generated // register is legal. if (!Subtarget.hasSSE2()) ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy); } } } SmallVector, 8> RegsToPass; if (VA.needsCustom()) { assert(VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"); Passv64i1ArgInRegs(dl, DAG, ValToCopy, RegsToPass, VA, RVLocs[++I], Subtarget); assert(2 == RegsToPass.size() && "Expecting two registers after Pass64BitArgInRegs"); // Add the second register to the CalleeSaveDisableRegs list. if (ShouldDisableCalleeSavedRegister) MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg()); } else { RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy)); } // Add nodes to the DAG and add the values into the RetOps list for (auto &Reg : RegsToPass) { Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType())); } } // Swift calling convention does not require we copy the sret argument // into %rax/%eax for the return, and SRetReturnReg is not set for Swift. // All x86 ABIs require that for returning structs by value we copy // the sret argument into %rax/%eax (depending on ABI) for the return. // We saved the argument into a virtual register in the entry block, // so now we copy the value out and into %rax/%eax. // // Checking Function.hasStructRetAttr() here is insufficient because the IR // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is // false, then an sret argument may be implicitly inserted in the SelDAG. In // either case FuncInfo->setSRetReturnReg() will have been called. if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) { // When we have both sret and another return value, we should use the // original Chain stored in RetOps[0], instead of the current Chain updated // in the above loop. If we only have sret, RetOps[0] equals to Chain. // For the case of sret and another return value, we have // Chain_0 at the function entry // Chain_1 = getCopyToReg(Chain_0) in the above loop // If we use Chain_1 in getCopyFromReg, we will have // Val = getCopyFromReg(Chain_1) // Chain_2 = getCopyToReg(Chain_1, Val) from below // getCopyToReg(Chain_0) will be glued together with // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be // in Unit B, and we will have cyclic dependency between Unit A and Unit B: // Data dependency from Unit B to Unit A due to usage of Val in // getCopyToReg(Chain_1, Val) // Chain dependency from Unit A to Unit B // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg. SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg, getPointerTy(MF.getDataLayout())); unsigned RetValReg = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ? X86::RAX : X86::EAX; Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag); Flag = Chain.getValue(1); // RAX/EAX now acts like a return value. RetOps.push_back( DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); // Add the returned register to the CalleeSaveDisableRegs list. if (ShouldDisableCalleeSavedRegister) MF.getRegInfo().disableCalleeSavedRegister(RetValReg); } const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); if (I) { for (; *I; ++I) { if (X86::GR64RegClass.contains(*I)) RetOps.push_back(DAG.getRegister(*I, MVT::i64)); else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); } } RetOps[0] = Chain; // Update chain. // Add the flag if we have it. if (Flag.getNode()) RetOps.push_back(Flag); X86ISD::NodeType opcode = X86ISD::RET_FLAG; if (CallConv == CallingConv::X86_INTR) opcode = X86ISD::IRET; return DAG.getNode(opcode, dl, MVT::Other, RetOps); } bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0)) return false; SDValue TCChain = Chain; SDNode *Copy = *N->use_begin(); if (Copy->getOpcode() == ISD::CopyToReg) { // If the copy has a glue operand, we conservatively assume it isn't safe to // perform a tail call. if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) return false; TCChain = Copy->getOperand(0); } else if (Copy->getOpcode() != ISD::FP_EXTEND) return false; bool HasRet = false; for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); UI != UE; ++UI) { if (UI->getOpcode() != X86ISD::RET_FLAG) return false; // If we are returning more than one value, we can definitely // not make a tail call see PR19530 if (UI->getNumOperands() > 4) return false; if (UI->getNumOperands() == 4 && UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue) return false; HasRet = true; } if (!HasRet) return false; Chain = TCChain; return true; } EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const { MVT ReturnMVT = MVT::i32; bool Darwin = Subtarget.getTargetTriple().isOSDarwin(); if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) { // The ABI does not require i1, i8 or i16 to be extended. // // On Darwin, there is code in the wild relying on Clang's old behaviour of // always extending i8/i16 return values, so keep doing that for now. // (PR26665). ReturnMVT = MVT::i8; } EVT MinVT = getRegisterType(Context, ReturnMVT); return VT.bitsLT(MinVT) ? MinVT : VT; } /// Reads two 32 bit registers and creates a 64 bit mask value. /// \param VA The current 32 bit value that need to be assigned. /// \param NextVA The next 32 bit value that need to be assigned. /// \param Root The parent DAG node. /// \param [in,out] InFlag Represents SDvalue in the parent DAG node for /// glue purposes. In the case the DAG is already using /// physical register instead of virtual, we should glue /// our new SDValue to InFlag SDvalue. /// \return a new SDvalue of size 64bit. static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA, SDValue &Root, SelectionDAG &DAG, const SDLoc &Dl, const X86Subtarget &Subtarget, SDValue *InFlag = nullptr) { assert((Subtarget.hasBWI()) && "Expected AVX512BW target!"); assert(Subtarget.is32Bit() && "Expecting 32 bit target"); assert(VA.getValVT() == MVT::v64i1 && "Expecting first location of 64 bit width type"); assert(NextVA.getValVT() == VA.getValVT() && "The locations should have the same type"); assert(VA.isRegLoc() && NextVA.isRegLoc() && "The values should reside in two registers"); SDValue Lo, Hi; SDValue ArgValueLo, ArgValueHi; MachineFunction &MF = DAG.getMachineFunction(); const TargetRegisterClass *RC = &X86::GR32RegClass; // Read a 32 bit value from the registers. if (nullptr == InFlag) { // When no physical register is present, // create an intermediate virtual register. unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32); Reg = MF.addLiveIn(NextVA.getLocReg(), RC); ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32); } else { // When a physical register is available read the value from it and glue // the reads together. ArgValueLo = DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag); *InFlag = ArgValueLo.getValue(2); ArgValueHi = DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag); *InFlag = ArgValueHi.getValue(2); } // Convert the i32 type into v32i1 type. Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo); // Convert the i32 type into v32i1 type. Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi); // Concatenate the two values together. return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi); } /// The function will lower a register of various sizes (8/16/32/64) /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1) /// \returns a DAG node contains the operand after lowering to mask type. static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT, const EVT &ValLoc, const SDLoc &Dl, SelectionDAG &DAG) { SDValue ValReturned = ValArg; if (ValVT == MVT::v1i1) return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned); if (ValVT == MVT::v64i1) { // In 32 bit machine, this case is handled by getv64i1Argument assert(ValLoc == MVT::i64 && "Expecting only i64 locations"); // In 64 bit machine, There is no need to truncate the value only bitcast } else { MVT maskLen; switch (ValVT.getSimpleVT().SimpleTy) { case MVT::v8i1: maskLen = MVT::i8; break; case MVT::v16i1: maskLen = MVT::i16; break; case MVT::v32i1: maskLen = MVT::i32; break; default: llvm_unreachable("Expecting a vector of i1 types"); } ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned); } return DAG.getBitcast(ValVT, ValReturned); } /// Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. /// SDValue X86TargetLowering::LowerCallResult( SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals, uint32_t *RegMask) const { const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); // Assign locations to each value returned by this call. SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); // Copy all of the result registers out of their specified physreg. for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E; ++I, ++InsIndex) { CCValAssign &VA = RVLocs[I]; EVT CopyVT = VA.getLocVT(); // In some calling conventions we need to remove the used registers // from the register mask. if (RegMask) { for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true); SubRegs.isValid(); ++SubRegs) RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32)); } // Report an error if there was an attempt to return FP values via XMM // registers. if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) { errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); if (VA.getLocReg() == X86::XMM1) VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts. else VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. } else if (!Subtarget.hasSSE2() && X86::FR64XRegClass.contains(VA.getLocReg()) && CopyVT == MVT::f64) { errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled"); if (VA.getLocReg() == X86::XMM1) VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts. else VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. } // If we prefer to use the value in xmm registers, copy it out as f80 and // use a truncate to move it from fp stack reg to xmm reg. bool RoundAfterCopy = false; if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) && isScalarFPTypeInSSEReg(VA.getValVT())) { if (!Subtarget.hasX87()) report_fatal_error("X87 register return with X87 disabled"); CopyVT = MVT::f80; RoundAfterCopy = (CopyVT != VA.getLocVT()); } SDValue Val; if (VA.needsCustom()) { assert(VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"); Val = getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag); } else { Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag) .getValue(1); Val = Chain.getValue(0); InFlag = Chain.getValue(2); } if (RoundAfterCopy) Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, // This truncation won't change the value. DAG.getIntPtrConstant(1, dl)); if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) { if (VA.getValVT().isVector() && ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) || (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) { // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG); } else Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); } if (VA.getLocInfo() == CCValAssign::BCvt) Val = DAG.getBitcast(VA.getValVT(), Val); InVals.push_back(Val); } return Chain; } //===----------------------------------------------------------------------===// // C & StdCall & Fast Calling Convention implementation //===----------------------------------------------------------------------===// // StdCall calling convention seems to be standard for many Windows' API // routines and around. It differs from C calling convention just a little: // callee should clean up the stack, not caller. Symbols should be also // decorated in some fancy way :) It doesn't support any vector arguments. // For info on fast calling convention see Fast Calling Convention (tail call) // implementation LowerX86_32FastCCCallTo. /// CallIsStructReturn - Determines whether a call uses struct return /// semantics. enum StructReturnType { NotStructReturn, RegStructReturn, StackStructReturn }; static StructReturnType callIsStructReturn(ArrayRef Outs, bool IsMCU) { if (Outs.empty()) return NotStructReturn; const ISD::ArgFlagsTy &Flags = Outs[0].Flags; if (!Flags.isSRet()) return NotStructReturn; if (Flags.isInReg() || IsMCU) return RegStructReturn; return StackStructReturn; } /// Determines whether a function uses struct return semantics. static StructReturnType argsAreStructReturn(ArrayRef Ins, bool IsMCU) { if (Ins.empty()) return NotStructReturn; const ISD::ArgFlagsTy &Flags = Ins[0].Flags; if (!Flags.isSRet()) return NotStructReturn; if (Flags.isInReg() || IsMCU) return RegStructReturn; return StackStructReturn; } /// Make a copy of an aggregate at address specified by "Src" to address /// "Dst" with size and alignment information specified by the specific /// parameter attribute. The copy will be passed as a byval function parameter. static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, const SDLoc &dl) { SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32); return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), /*isVolatile*/false, /*AlwaysInline=*/true, /*isTailCall*/false, MachinePointerInfo(), MachinePointerInfo()); } /// Return true if the calling convention is one that we can guarantee TCO for. static bool canGuaranteeTCO(CallingConv::ID CC) { return (CC == CallingConv::Fast || CC == CallingConv::GHC || CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE || CC == CallingConv::HHVM || CC == CallingConv::Tail); } /// Return true if we might ever do TCO for calls with this calling convention. static bool mayTailCallThisCC(CallingConv::ID CC) { switch (CC) { // C calling conventions: case CallingConv::C: case CallingConv::Win64: case CallingConv::X86_64_SysV: // Callee pop conventions: case CallingConv::X86_ThisCall: case CallingConv::X86_StdCall: case CallingConv::X86_VectorCall: case CallingConv::X86_FastCall: // Swift: case CallingConv::Swift: return true; default: return canGuaranteeTCO(CC); } } /// Return true if the function is being made into a tailcall target by /// changing its ABI. static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) { return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) || CC == CallingConv::Tail; } bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { if (!CI->isTailCall()) return false; ImmutableCallSite CS(CI); CallingConv::ID CalleeCC = CS.getCallingConv(); if (!mayTailCallThisCC(CalleeCC)) return false; return true; } SDValue X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, MachineFrameInfo &MFI, unsigned i) const { // Create the nodes corresponding to a load from this parameter slot. ISD::ArgFlagsTy Flags = Ins[i].Flags; bool AlwaysUseMutable = shouldGuaranteeTCO( CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt); bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); EVT ValVT; MVT PtrVT = getPointerTy(DAG.getDataLayout()); // If value is passed by pointer we have address passed instead of the value // itself. No need to extend if the mask value and location share the same // absolute size. bool ExtendedInMem = VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 && VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits(); if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem) ValVT = VA.getLocVT(); else ValVT = VA.getValVT(); // FIXME: For now, all byval parameter objects are marked mutable. This can be // changed with more analysis. // In case of tail call optimization mark all arguments mutable. Since they // could be overwritten by lowering of arguments in case of a tail call. if (Flags.isByVal()) { unsigned Bytes = Flags.getByValSize(); if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. // FIXME: For now, all byval parameter objects are marked as aliasing. This // can be improved with deeper analysis. int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable, /*isAliased=*/true); return DAG.getFrameIndex(FI, PtrVT); } // This is an argument in memory. We might be able to perform copy elision. // If the argument is passed directly in memory without any extension, then we // can perform copy elision. Large vector types, for example, may be passed // indirectly by pointer. if (Flags.isCopyElisionCandidate() && VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem) { EVT ArgVT = Ins[i].ArgVT; SDValue PartAddr; if (Ins[i].PartOffset == 0) { // If this is a one-part value or the first part of a multi-part value, // create a stack object for the entire argument value type and return a // load from our portion of it. This assumes that if the first part of an // argument is in memory, the rest will also be in memory. int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(), /*IsImmutable=*/false); PartAddr = DAG.getFrameIndex(FI, PtrVT); return DAG.getLoad( ValVT, dl, Chain, PartAddr, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); } else { // This is not the first piece of an argument in memory. See if there is // already a fixed stack object including this offset. If so, assume it // was created by the PartOffset == 0 branch above and create a load from // the appropriate offset into it. int64_t PartBegin = VA.getLocMemOffset(); int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8; int FI = MFI.getObjectIndexBegin(); for (; MFI.isFixedObjectIndex(FI); ++FI) { int64_t ObjBegin = MFI.getObjectOffset(FI); int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI); if (ObjBegin <= PartBegin && PartEnd <= ObjEnd) break; } if (MFI.isFixedObjectIndex(FI)) { SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT), DAG.getIntPtrConstant(Ins[i].PartOffset, dl)); return DAG.getLoad( ValVT, dl, Chain, Addr, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI, Ins[i].PartOffset)); } } } int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8, VA.getLocMemOffset(), isImmutable); // Set SExt or ZExt flag. if (VA.getLocInfo() == CCValAssign::ZExt) { MFI.setObjectZExt(FI, true); } else if (VA.getLocInfo() == CCValAssign::SExt) { MFI.setObjectSExt(FI, true); } SDValue FIN = DAG.getFrameIndex(FI, PtrVT); SDValue Val = DAG.getLoad( ValVT, dl, Chain, FIN, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); return ExtendedInMem ? (VA.getValVT().isVector() ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val) : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val)) : Val; } // FIXME: Get this from tablegen. static ArrayRef get64BitArgumentGPRs(CallingConv::ID CallConv, const X86Subtarget &Subtarget) { assert(Subtarget.is64Bit()); if (Subtarget.isCallingConvWin64(CallConv)) { static const MCPhysReg GPR64ArgRegsWin64[] = { X86::RCX, X86::RDX, X86::R8, X86::R9 }; return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64)); } static const MCPhysReg GPR64ArgRegs64Bit[] = { X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 }; return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit)); } // FIXME: Get this from tablegen. static ArrayRef get64BitArgumentXMMs(MachineFunction &MF, CallingConv::ID CallConv, const X86Subtarget &Subtarget) { assert(Subtarget.is64Bit()); if (Subtarget.isCallingConvWin64(CallConv)) { // The XMM registers which might contain var arg parameters are shadowed // in their paired GPR. So we only need to save the GPR to their home // slots. // TODO: __vectorcall will change this. return None; } const Function &F = MF.getFunction(); bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat); bool isSoftFloat = Subtarget.useSoftFloat(); assert(!(isSoftFloat && NoImplicitFloatOps) && "SSE register cannot be used when SSE is disabled!"); if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1()) // Kernel mode asks for SSE to be disabled, so there are no XMM argument // registers. return None; static const MCPhysReg XMMArgRegs64Bit[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit)); } #ifndef NDEBUG static bool isSortedByValueNo(ArrayRef ArgLocs) { return std::is_sorted(ArgLocs.begin(), ArgLocs.end(), [](const CCValAssign &A, const CCValAssign &B) -> bool { return A.getValNo() < B.getValNo(); }); } #endif SDValue X86TargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); const Function &F = MF.getFunction(); if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() && F.getName() == "main") FuncInfo->setForceFramePointer(true); MachineFrameInfo &MFI = MF.getFrameInfo(); bool Is64Bit = Subtarget.is64Bit(); bool IsWin64 = Subtarget.isCallingConvWin64(CallConv); assert( !(isVarArg && canGuaranteeTCO(CallConv)) && "Var args not supported with calling conv' regcall, fastcc, ghc or hipe"); // Assign locations to all of the incoming arguments. SmallVector ArgLocs; CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64. if (IsWin64) CCInfo.AllocateStack(32, 8); CCInfo.AnalyzeArguments(Ins, CC_X86); // In vectorcall calling convention a second pass is required for the HVA // types. if (CallingConv::X86_VectorCall == CallConv) { CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86); } // The next loop assumes that the locations are in the same order of the // input arguments. assert(isSortedByValueNo(ArgLocs) && "Argument Location list must be sorted before lowering"); SDValue ArgValue; for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E; ++I, ++InsIndex) { assert(InsIndex < Ins.size() && "Invalid Ins index"); CCValAssign &VA = ArgLocs[I]; if (VA.isRegLoc()) { EVT RegVT = VA.getLocVT(); if (VA.needsCustom()) { assert( VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"); // v64i1 values, in regcall calling convention, that are // compiled to 32 bit arch, are split up into two registers. ArgValue = getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget); } else { const TargetRegisterClass *RC; if (RegVT == MVT::i8) RC = &X86::GR8RegClass; else if (RegVT == MVT::i16) RC = &X86::GR16RegClass; else if (RegVT == MVT::i32) RC = &X86::GR32RegClass; else if (Is64Bit && RegVT == MVT::i64) RC = &X86::GR64RegClass; else if (RegVT == MVT::f32) RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass; else if (RegVT == MVT::f64) RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass; else if (RegVT == MVT::f80) RC = &X86::RFP80RegClass; else if (RegVT == MVT::f128) RC = &X86::VR128RegClass; else if (RegVT.is512BitVector()) RC = &X86::VR512RegClass; else if (RegVT.is256BitVector()) RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass; else if (RegVT.is128BitVector()) RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass; else if (RegVT == MVT::x86mmx) RC = &X86::VR64RegClass; else if (RegVT == MVT::v1i1) RC = &X86::VK1RegClass; else if (RegVT == MVT::v8i1) RC = &X86::VK8RegClass; else if (RegVT == MVT::v16i1) RC = &X86::VK16RegClass; else if (RegVT == MVT::v32i1) RC = &X86::VK32RegClass; else if (RegVT == MVT::v64i1) RC = &X86::VK64RegClass; else llvm_unreachable("Unknown argument type!"); unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); } // If this is an 8 or 16-bit value, it is really passed promoted to 32 // bits. Insert an assert[sz]ext to capture this, then truncate to the // right size. if (VA.getLocInfo() == CCValAssign::SExt) ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, DAG.getValueType(VA.getValVT())); else if (VA.getLocInfo() == CCValAssign::ZExt) ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, DAG.getValueType(VA.getValVT())); else if (VA.getLocInfo() == CCValAssign::BCvt) ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue); if (VA.isExtInLoc()) { // Handle MMX values passed in XMM regs. if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1) ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue); else if (VA.getValVT().isVector() && VA.getValVT().getScalarType() == MVT::i1 && ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) || (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) { // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG); } else ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); } } else { assert(VA.isMemLoc()); ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex); } // If value is passed via pointer - do a load. if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal()) ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo()); InVals.push_back(ArgValue); } for (unsigned I = 0, E = Ins.size(); I != E; ++I) { // Swift calling convention does not require we copy the sret argument // into %rax/%eax for the return. We don't set SRetReturnReg for Swift. if (CallConv == CallingConv::Swift) continue; // All x86 ABIs require that for returning structs by value we copy the // sret argument into %rax/%eax (depending on ABI) for the return. Save // the argument into a virtual register so that we can access it from the // return points. if (Ins[I].Flags.isSRet()) { unsigned Reg = FuncInfo->getSRetReturnReg(); if (!Reg) { MVT PtrTy = getPointerTy(DAG.getDataLayout()); Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); FuncInfo->setSRetReturnReg(Reg); } SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]); Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); break; } } unsigned StackSize = CCInfo.getNextStackOffset(); // Align stack specially for tail calls. if (shouldGuaranteeTCO(CallConv, MF.getTarget().Options.GuaranteedTailCallOpt)) StackSize = GetAlignedArgumentStackSize(StackSize, DAG); // If the function takes variable number of arguments, make a frame index for // the start of the first vararg value... for expansion of llvm.va_start. We // can skip this if there are no va_start calls. if (MFI.hasVAStart() && (Is64Bit || (CallConv != CallingConv::X86_FastCall && CallConv != CallingConv::X86_ThisCall))) { FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true)); } // Figure out if XMM registers are in use. assert(!(Subtarget.useSoftFloat() && F.hasFnAttribute(Attribute::NoImplicitFloat)) && "SSE register cannot be used when SSE is disabled!"); // 64-bit calling conventions support varargs and register parameters, so we // have to do extra work to spill them in the prologue. if (Is64Bit && isVarArg && MFI.hasVAStart()) { // Find the first unallocated argument registers. ArrayRef ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget); ArrayRef ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget); unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs); unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs); assert(!(NumXMMRegs && !Subtarget.hasSSE1()) && "SSE register cannot be used when SSE is disabled!"); // Gather all the live in physical registers. SmallVector LiveGPRs; SmallVector LiveXMMRegs; SDValue ALVal; for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) { unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass); LiveGPRs.push_back( DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64)); } if (!ArgXMMs.empty()) { unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8); for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) { unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass); LiveXMMRegs.push_back( DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32)); } } if (IsWin64) { // Get to the caller-allocated home save location. Add 8 to account // for the return address. int HomeOffset = TFI.getOffsetOfLocalArea() + 8; FuncInfo->setRegSaveFrameIndex( MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); // Fixup to set vararg frame on shadow area (4 x i64). if (NumIntRegs < 4) FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); } else { // For X86-64, if there are vararg parameters that are passed via // registers, then we must store them to their spots on the stack so // they may be loaded by dereferencing the result of va_next. FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject( ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false)); } // Store the integer parameter registers. SmallVector MemOps; SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), getPointerTy(DAG.getDataLayout())); unsigned Offset = FuncInfo->getVarArgsGPOffset(); for (SDValue Val : LiveGPRs) { SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), RSFIN, DAG.getIntPtrConstant(Offset, dl)); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo::getFixedStack( DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(), Offset)); MemOps.push_back(Store); Offset += 8; } if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) { // Now store the XMM (fp + vector) parameter registers. SmallVector SaveXMMOps; SaveXMMOps.push_back(Chain); SaveXMMOps.push_back(ALVal); SaveXMMOps.push_back(DAG.getIntPtrConstant( FuncInfo->getRegSaveFrameIndex(), dl)); SaveXMMOps.push_back(DAG.getIntPtrConstant( FuncInfo->getVarArgsFPOffset(), dl)); SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(), LiveXMMRegs.end()); MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, MVT::Other, SaveXMMOps)); } if (!MemOps.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); } if (isVarArg && MFI.hasMustTailInVarArgFunc()) { // Find the largest legal vector type. MVT VecVT = MVT::Other; // FIXME: Only some x86_32 calling conventions support AVX512. if (Subtarget.useAVX512Regs() && (Is64Bit || (CallConv == CallingConv::X86_VectorCall || CallConv == CallingConv::Intel_OCL_BI))) VecVT = MVT::v16f32; else if (Subtarget.hasAVX()) VecVT = MVT::v8f32; else if (Subtarget.hasSSE2()) VecVT = MVT::v4f32; // We forward some GPRs and some vector types. SmallVector RegParmTypes; MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32; RegParmTypes.push_back(IntVT); if (VecVT != MVT::Other) RegParmTypes.push_back(VecVT); // Compute the set of forwarded registers. The rest are scratch. SmallVectorImpl &Forwards = FuncInfo->getForwardedMustTailRegParms(); CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86); // Forward AL for SysV x86_64 targets, since it is used for varargs. if (Is64Bit && !IsWin64 && !CCInfo.isAllocated(X86::AL)) { unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass); Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8)); } // Copy all forwards from physical to virtual registers. for (ForwardedRegister &FR : Forwards) { // FIXME: Can we use a less constrained schedule? SDValue RegVal = DAG.getCopyFromReg(Chain, dl, FR.VReg, FR.VT); FR.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(FR.VT)); Chain = DAG.getCopyToReg(Chain, dl, FR.VReg, RegVal); } } // Some CCs need callee pop. if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, MF.getTarget().Options.GuaranteedTailCallOpt)) { FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) { // X86 interrupts must pop the error code (and the alignment padding) if // present. FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4); } else { FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. // If this is an sret function, the return should pop the hidden pointer. if (!Is64Bit && !canGuaranteeTCO(CallConv) && !Subtarget.getTargetTriple().isOSMSVCRT() && argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn) FuncInfo->setBytesToPopOnReturn(4); } if (!Is64Bit) { // RegSaveFrameIndex is X86-64 only. FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); if (CallConv == CallingConv::X86_FastCall || CallConv == CallingConv::X86_ThisCall) // fastcc functions can't have varargs. FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); } FuncInfo->setArgumentStackSize(StackSize); if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) { EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn()); if (Personality == EHPersonality::CoreCLR) { assert(Is64Bit); // TODO: Add a mechanism to frame lowering that will allow us to indicate // that we'd prefer this slot be allocated towards the bottom of the frame // (i.e. near the stack pointer after allocating the frame). Every // funclet needs a copy of this slot in its (mostly empty) frame, and the // offset from the bottom of this and each funclet's frame must be the // same, so the size of funclets' (mostly empty) frames is dictated by // how far this slot is from the bottom (since they allocate just enough // space to accommodate holding this slot at the correct offset). int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false); EHInfo->PSPSymFrameIdx = PSPSymFI; } } if (CallConv == CallingConv::X86_RegCall || F.hasFnAttribute("no_caller_saved_registers")) { MachineRegisterInfo &MRI = MF.getRegInfo(); for (std::pair Pair : MRI.liveins()) MRI.disableCalleeSavedRegister(Pair.first); } return Chain; } SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, ISD::ArgFlagsTy Flags) const { unsigned LocMemOffset = VA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr, PtrOff); if (Flags.isByVal()) return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); return DAG.getStore( Chain, dl, Arg, PtrOff, MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset)); } /// Emit a load of return address if tail call /// optimization is performed and it is required. SDValue X86TargetLowering::EmitTailCallLoadRetAddr( SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall, bool Is64Bit, int FPDiff, const SDLoc &dl) const { // Adjust the Return address stack slot. EVT VT = getPointerTy(DAG.getDataLayout()); OutRetAddr = getReturnAddressFrameIndex(DAG); // Load the "old" Return address. OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo()); return SDValue(OutRetAddr.getNode(), 1); } /// Emit a store of the return address if tail call /// optimization is performed and it is required (FPDiff!=0). static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT, unsigned SlotSize, int FPDiff, const SDLoc &dl) { // Store the return address to the appropriate stack slot. if (!FPDiff) return Chain; // Calculate the new stack slot for the return address. int NewReturnAddrFI = MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize, false); SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, MachinePointerInfo::getFixedStack( DAG.getMachineFunction(), NewReturnAddrFI)); return Chain; } /// Returns a vector_shuffle mask for an movs{s|d}, movd /// operation of specified width. static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); SmallVector Mask; Mask.push_back(NumElems); for (unsigned i = 1; i != NumElems; ++i) Mask.push_back(i); return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); } SDValue X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { SelectionDAG &DAG = CLI.DAG; SDLoc &dl = CLI.DL; SmallVectorImpl &Outs = CLI.Outs; SmallVectorImpl &OutVals = CLI.OutVals; SmallVectorImpl &Ins = CLI.Ins; SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; CallingConv::ID CallConv = CLI.CallConv; bool &isTailCall = CLI.IsTailCall; bool isVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); bool Is64Bit = Subtarget.is64Bit(); bool IsWin64 = Subtarget.isCallingConvWin64(CallConv); StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU()); bool IsSibcall = false; bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt || CallConv == CallingConv::Tail; X86MachineFunctionInfo *X86Info = MF.getInfo(); const auto *CI = dyn_cast_or_null(CLI.CS.getInstruction()); const Function *Fn = CI ? CI->getCalledFunction() : nullptr; bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) || (Fn && Fn->hasFnAttribute("no_caller_saved_registers")); const auto *II = dyn_cast_or_null(CLI.CS.getInstruction()); bool HasNoCfCheck = (CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck()); const Module *M = MF.getMMI().getModule(); Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch"); MachineFunction::CallSiteInfo CSInfo; if (CallConv == CallingConv::X86_INTR) report_fatal_error("X86 interrupts may not be called directly"); if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO) { // If we are using a GOT, disable tail calls to external symbols with // default visibility. Tail calling such a symbol requires using a GOT // relocation, which forces early binding of the symbol. This breaks code // that require lazy function symbol resolution. Using musttail or // GuaranteedTailCallOpt will override this. GlobalAddressSDNode *G = dyn_cast(Callee); if (!G || (!G->getGlobal()->hasLocalLinkage() && G->getGlobal()->hasDefaultVisibility())) isTailCall = false; } bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall(); if (IsMustTail) { // Force this to be a tail call. The verifier rules are enough to ensure // that we can lower this successfully without moving the return address // around. isTailCall = true; } else if (isTailCall) { // Check if it's really possible to do a tail call. isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, SR != NotStructReturn, MF.getFunction().hasStructRetAttr(), CLI.RetTy, Outs, OutVals, Ins, DAG); // Sibcalls are automatically detected tailcalls which do not require // ABI changes. if (!IsGuaranteeTCO && isTailCall) IsSibcall = true; if (isTailCall) ++NumTailCalls; } assert(!(isVarArg && canGuaranteeTCO(CallConv)) && "Var args not supported with calling convention fastcc, ghc or hipe"); // Analyze operands of the call, assigning locations to each operand. SmallVector ArgLocs; CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64. if (IsWin64) CCInfo.AllocateStack(32, 8); CCInfo.AnalyzeArguments(Outs, CC_X86); // In vectorcall calling convention a second pass is required for the HVA // types. if (CallingConv::X86_VectorCall == CallConv) { CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86); } // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); if (IsSibcall) // This is a sibcall. The memory operands are available in caller's // own caller's stack. NumBytes = 0; else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv)) NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); int FPDiff = 0; if (isTailCall && !IsSibcall && !IsMustTail) { // Lower arguments at fp - stackoffset + fpdiff. unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn(); FPDiff = NumBytesCallerPushed - NumBytes; // Set the delta of movement of the returnaddr stackslot. // But only set if delta is greater than previous delta. if (FPDiff < X86Info->getTCReturnAddrDelta()) X86Info->setTCReturnAddrDelta(FPDiff); } unsigned NumBytesToPush = NumBytes; unsigned NumBytesToPop = NumBytes; // If we have an inalloca argument, all stack space has already been allocated // for us and be right at the top of the stack. We don't support multiple // arguments passed in memory when using inalloca. if (!Outs.empty() && Outs.back().Flags.isInAlloca()) { NumBytesToPush = 0; if (!ArgLocs.back().isMemLoc()) report_fatal_error("cannot use inalloca attribute on a register " "parameter"); if (ArgLocs.back().getLocMemOffset() != 0) report_fatal_error("any parameter with the inalloca attribute must be " "the only memory argument"); } if (!IsSibcall && !IsMustTail) Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush, NumBytes - NumBytesToPush, dl); SDValue RetAddrFrIdx; // Load return address for tail calls. if (isTailCall && FPDiff) Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, Is64Bit, FPDiff, dl); SmallVector, 8> RegsToPass; SmallVector MemOpChains; SDValue StackPtr; // The next loop assumes that the locations are in the same order of the // input arguments. assert(isSortedByValueNo(ArgLocs) && "Argument Location list must be sorted before lowering"); // Walk the register/memloc assignments, inserting copies/loads. In the case // of tail call optimization arguments are handle later. const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E; ++I, ++OutIndex) { assert(OutIndex < Outs.size() && "Invalid Out index"); // Skip inalloca arguments, they have already been written. ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags; if (Flags.isInAlloca()) continue; CCValAssign &VA = ArgLocs[I]; EVT RegVT = VA.getLocVT(); SDValue Arg = OutVals[OutIndex]; bool isByVal = Flags.isByVal(); // Promote the value if needed. switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::SExt: Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); break; case CCValAssign::ZExt: Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); break; case CCValAssign::AExt: if (Arg.getValueType().isVector() && Arg.getValueType().getVectorElementType() == MVT::i1) Arg = lowerMasksToReg(Arg, RegVT, dl, DAG); else if (RegVT.is128BitVector()) { // Special case: passing MMX values in XMM registers. Arg = DAG.getBitcast(MVT::i64, Arg); Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); } else Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); break; case CCValAssign::BCvt: Arg = DAG.getBitcast(RegVT, Arg); break; case CCValAssign::Indirect: { if (isByVal) { // Memcpy the argument to a temporary stack slot to prevent // the caller from seeing any modifications the callee may make // as guaranteed by the `byval` attribute. int FrameIdx = MF.getFrameInfo().CreateStackObject( Flags.getByValSize(), std::max(16, (int)Flags.getByValAlign()), false); SDValue StackSlot = DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout())); Chain = CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl); // From now on treat this as a regular pointer Arg = StackSlot; isByVal = false; } else { // Store the argument. SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); int FI = cast(SpillSlot)->getIndex(); Chain = DAG.getStore( Chain, dl, Arg, SpillSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); Arg = SpillSlot; } break; } } if (VA.needsCustom()) { assert(VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"); // Split v64i1 value into two registers Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget); } else if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); const TargetOptions &Options = DAG.getTarget().Options; if (Options.EnableDebugEntryValues) CSInfo.emplace_back(VA.getLocReg(), I); if (isVarArg && IsWin64) { // Win64 ABI requires argument XMM reg to be copied to the corresponding // shadow reg if callee is a varargs function. unsigned ShadowReg = 0; switch (VA.getLocReg()) { case X86::XMM0: ShadowReg = X86::RCX; break; case X86::XMM1: ShadowReg = X86::RDX; break; case X86::XMM2: ShadowReg = X86::R8; break; case X86::XMM3: ShadowReg = X86::R9; break; } if (ShadowReg) RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); } } else if (!IsSibcall && (!isTailCall || isByVal)) { assert(VA.isMemLoc()); if (!StackPtr.getNode()) StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), getPointerTy(DAG.getDataLayout())); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, dl, DAG, VA, Flags)); } } if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); if (Subtarget.isPICStyleGOT()) { // ELF / PIC requires GOT in the EBX register before function calls via PLT // GOT pointer. if (!isTailCall) { RegsToPass.push_back(std::make_pair( unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy(DAG.getDataLayout())))); } else { // If we are tail calling and generating PIC/GOT style code load the // address of the callee into ECX. The value in ecx is used as target of // the tail jump. This is done to circumvent the ebx/callee-saved problem // for tail calls on PIC/GOT architectures. Normally we would just put the // address of GOT into ebx and then call target@PLT. But for tail calls // ebx would be restored (since ebx is callee saved) before jumping to the // target@PLT. // Note: The actual moving to ECX is done further down. GlobalAddressSDNode *G = dyn_cast(Callee); if (G && !G->getGlobal()->hasLocalLinkage() && G->getGlobal()->hasDefaultVisibility()) Callee = LowerGlobalAddress(Callee, DAG); else if (isa(Callee)) Callee = LowerExternalSymbol(Callee, DAG); } } if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) { // From AMD64 ABI document: // For calls that may call functions that use varargs or stdargs // (prototype-less calls or calls to functions containing ellipsis (...) in // the declaration) %al is used as hidden argument to specify the number // of SSE registers used. The contents of %al do not need to match exactly // the number of registers, but must be an ubound on the number of SSE // registers used and is in the range 0 - 8 inclusive. // Count the number of XMM registers allocated. static const MCPhysReg XMMArgRegs[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs); assert((Subtarget.hasSSE1() || !NumXMMRegs) && "SSE registers cannot be used when SSE is disabled"); RegsToPass.push_back(std::make_pair(unsigned(X86::AL), DAG.getConstant(NumXMMRegs, dl, MVT::i8))); } if (isVarArg && IsMustTail) { const auto &Forwards = X86Info->getForwardedMustTailRegParms(); for (const auto &F : Forwards) { SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); } } // For tail calls lower the arguments to the 'real' stack slots. Sibcalls // don't need this because the eligibility check rejects calls that require // shuffling arguments passed in memory. if (!IsSibcall && isTailCall) { // Force all the incoming stack arguments to be loaded from the stack // before any new outgoing arguments are stored to the stack, because the // outgoing stack slots may alias the incoming argument stack slots, and // the alias isn't otherwise explicit. This is slightly more conservative // than necessary, because it means that each store effectively depends // on every argument instead of just those arguments it would clobber. SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); SmallVector MemOpChains2; SDValue FIN; int FI = 0; for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E; ++I, ++OutsIndex) { CCValAssign &VA = ArgLocs[I]; if (VA.isRegLoc()) { if (VA.needsCustom()) { assert((CallConv == CallingConv::X86_RegCall) && "Expecting custom case only in regcall calling convention"); // This means that we are in special case where one argument was // passed through two register locations - Skip the next location ++I; } continue; } assert(VA.isMemLoc()); SDValue Arg = OutVals[OutsIndex]; ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags; // Skip inalloca arguments. They don't require any work. if (Flags.isInAlloca()) continue; // Create frame index. int32_t Offset = VA.getLocMemOffset()+FPDiff; uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true); FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); if (Flags.isByVal()) { // Copy relative to framepointer. SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl); if (!StackPtr.getNode()) StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), getPointerTy(DAG.getDataLayout())); Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr, Source); MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, ArgChain, Flags, DAG, dl)); } else { // Store relative to framepointer. MemOpChains2.push_back(DAG.getStore( ArgChain, dl, Arg, FIN, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI))); } } if (!MemOpChains2.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2); // Store the return address to the appropriate stack slot. Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, getPointerTy(DAG.getDataLayout()), RegInfo->getSlotSize(), FPDiff, dl); } // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into registers. SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, RegsToPass[i].second, InFlag); InFlag = Chain.getValue(1); } if (DAG.getTarget().getCodeModel() == CodeModel::Large) { assert(Is64Bit && "Large code model is only legal in 64-bit mode."); // In the 64-bit large code model, we have to make all calls // through a register, since the call instruction's 32-bit // pc-relative offset may not be large enough to hold the whole // address. } else if (Callee->getOpcode() == ISD::GlobalAddress || Callee->getOpcode() == ISD::ExternalSymbol) { // Lower direct calls to global addresses and external symbols. Setting // ForCall to true here has the effect of removing WrapperRIP when possible // to allow direct calls to be selected without first materializing the // address into a register. Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true); } else if (Subtarget.isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) { // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee); } // Returns a chain & a flag for retval copy to use. SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); SmallVector Ops; if (!IsSibcall && isTailCall && !IsMustTail) { Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytesToPop, dl, true), DAG.getIntPtrConstant(0, dl, true), InFlag, dl); InFlag = Chain.getValue(1); } Ops.push_back(Chain); Ops.push_back(Callee); if (isTailCall) Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32)); // Add argument registers to the end of the list so that they are known live // into the call. for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) Ops.push_back(DAG.getRegister(RegsToPass[i].first, RegsToPass[i].second.getValueType())); // Add a register mask operand representing the call-preserved registers. // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we // set X86_INTR calling convention because it has the same CSR mask // (same preserved registers). const uint32_t *Mask = RegInfo->getCallPreservedMask( MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv); assert(Mask && "Missing call preserved mask for calling convention"); // If this is an invoke in a 32-bit function using a funclet-based // personality, assume the function clobbers all registers. If an exception // is thrown, the runtime will not restore CSRs. // FIXME: Model this more precisely so that we can register allocate across // the normal edge and spill and fill across the exceptional edge. if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) { const Function &CallerFn = MF.getFunction(); EHPersonality Pers = CallerFn.hasPersonalityFn() ? classifyEHPersonality(CallerFn.getPersonalityFn()) : EHPersonality::Unknown; if (isFuncletEHPersonality(Pers)) Mask = RegInfo->getNoPreservedMask(); } // Define a new register mask from the existing mask. uint32_t *RegMask = nullptr; // In some calling conventions we need to remove the used physical registers // from the reg mask. if (CallConv == CallingConv::X86_RegCall || HasNCSR) { const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); // Allocate a new Reg Mask and copy Mask. RegMask = MF.allocateRegMask(); unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs()); memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize); // Make sure all sub registers of the argument registers are reset // in the RegMask. for (auto const &RegPair : RegsToPass) for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true); SubRegs.isValid(); ++SubRegs) RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32)); // Create the RegMask Operand according to our updated mask. Ops.push_back(DAG.getRegisterMask(RegMask)); } else { // Create the RegMask Operand according to the static mask. Ops.push_back(DAG.getRegisterMask(Mask)); } if (InFlag.getNode()) Ops.push_back(InFlag); if (isTailCall) { // We used to do: //// If this is the first return lowered for this function, add the regs //// to the liveout set for the function. // This isn't right, although it's probably harmless on x86; liveouts // should be computed from returns not tail calls. Consider a void // function making a tail call to a function returning int. MF.getFrameInfo().setHasTailCall(); SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops); DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); return Ret; } if (HasNoCfCheck && IsCFProtectionSupported) { Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops); } else { Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops); } InFlag = Chain.getValue(1); DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); // Save heapallocsite metadata. if (CLI.CS) if (MDNode *HeapAlloc = CLI.CS->getMetadata("heapallocsite")) DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc); // Create the CALLSEQ_END node. unsigned NumBytesForCalleeToPop; if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, DAG.getTarget().Options.GuaranteedTailCallOpt)) NumBytesForCalleeToPop = NumBytes; // Callee pops everything else if (!Is64Bit && !canGuaranteeTCO(CallConv) && !Subtarget.getTargetTriple().isOSMSVCRT() && SR == StackStructReturn) // If this is a call to a struct-return function, the callee // pops the hidden struct pointer, so we have to push it back. // This is common for Darwin/X86, Linux & Mingw32 targets. // For MSVC Win32 targets, the caller pops the hidden struct pointer. NumBytesForCalleeToPop = 4; else NumBytesForCalleeToPop = 0; // Callee pops nothing. if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) { // No need to reset the stack after the call if the call doesn't return. To // make the MI verify, we'll pretend the callee does it for us. NumBytesForCalleeToPop = NumBytes; } // Returns a flag for retval copy to use. if (!IsSibcall) { Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytesToPop, dl, true), DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl, true), InFlag, dl); InFlag = Chain.getValue(1); } // Handle result values, copying them out of physregs into vregs that we // return. return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG, InVals, RegMask); } //===----------------------------------------------------------------------===// // Fast Calling Convention (tail call) implementation //===----------------------------------------------------------------------===// // Like std call, callee cleans arguments, convention except that ECX is // reserved for storing the tail called function address. Only 2 registers are // free for argument passing (inreg). Tail call optimization is performed // provided: // * tailcallopt is enabled // * caller/callee are fastcc // On X86_64 architecture with GOT-style position independent code only local // (within module) calls are supported at the moment. // To keep the stack aligned according to platform abi the function // GetAlignedArgumentStackSize ensures that argument delta is always multiples // of stack alignment. (Dynamic linkers need this - darwin's dyld for example) // If a tail called function callee has more arguments than the caller the // caller needs to make sure that there is room to move the RETADDR to. This is // achieved by reserving an area the size of the argument delta right after the // original RETADDR, but before the saved framepointer or the spilled registers // e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4) // stack layout: // arg1 // arg2 // RETADDR // [ new RETADDR // move area ] // (possible EBP) // ESI // EDI // local1 .. /// Make the stack size align e.g 16n + 12 aligned for a 16-byte align /// requirement. unsigned X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize, SelectionDAG &DAG) const { const Align StackAlignment(Subtarget.getFrameLowering()->getStackAlignment()); const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize(); assert(StackSize % SlotSize == 0 && "StackSize must be a multiple of SlotSize"); return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize; } /// Return true if the given stack call argument is already available in the /// same position (relatively) of the caller's incoming argument stack. static bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, MachineFrameInfo &MFI, const MachineRegisterInfo *MRI, const X86InstrInfo *TII, const CCValAssign &VA) { unsigned Bytes = Arg.getValueSizeInBits() / 8; for (;;) { // Look through nodes that don't alter the bits of the incoming value. unsigned Op = Arg.getOpcode(); if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) { Arg = Arg.getOperand(0); continue; } if (Op == ISD::TRUNCATE) { const SDValue &TruncInput = Arg.getOperand(0); if (TruncInput.getOpcode() == ISD::AssertZext && cast(TruncInput.getOperand(1))->getVT() == Arg.getValueType()) { Arg = TruncInput.getOperand(0); continue; } } break; } int FI = INT_MAX; if (Arg.getOpcode() == ISD::CopyFromReg) { unsigned VR = cast(Arg.getOperand(1))->getReg(); if (!Register::isVirtualRegister(VR)) return false; MachineInstr *Def = MRI->getVRegDef(VR); if (!Def) return false; if (!Flags.isByVal()) { if (!TII->isLoadFromStackSlot(*Def, FI)) return false; } else { unsigned Opcode = Def->getOpcode(); if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) && Def->getOperand(1).isFI()) { FI = Def->getOperand(1).getIndex(); Bytes = Flags.getByValSize(); } else return false; } } else if (LoadSDNode *Ld = dyn_cast(Arg)) { if (Flags.isByVal()) // ByVal argument is passed in as a pointer but it's now being // dereferenced. e.g. // define @foo(%struct.X* %A) { // tail call @bar(%struct.X* byval %A) // } return false; SDValue Ptr = Ld->getBasePtr(); FrameIndexSDNode *FINode = dyn_cast(Ptr); if (!FINode) return false; FI = FINode->getIndex(); } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { FrameIndexSDNode *FINode = cast(Arg); FI = FINode->getIndex(); Bytes = Flags.getByValSize(); } else return false; assert(FI != INT_MAX); if (!MFI.isFixedObjectIndex(FI)) return false; if (Offset != MFI.getObjectOffset(FI)) return false; // If this is not byval, check that the argument stack object is immutable. // inalloca and argument copy elision can create mutable argument stack // objects. Byval objects can be mutated, but a byval call intends to pass the // mutated memory. if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI)) return false; if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) { // If the argument location is wider than the argument type, check that any // extension flags match. if (Flags.isZExt() != MFI.isObjectZExt(FI) || Flags.isSExt() != MFI.isObjectSExt(FI)) { return false; } } return Bytes == MFI.getObjectSize(FI); } /// Check whether the call is eligible for tail call optimization. Targets /// that want to do tail call optimization should implement this function. bool X86TargetLowering::IsEligibleForTailCallOptimization( SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, SelectionDAG &DAG) const { if (!mayTailCallThisCC(CalleeCC)) return false; // If -tailcallopt is specified, make fastcc functions tail-callable. MachineFunction &MF = DAG.getMachineFunction(); const Function &CallerF = MF.getFunction(); // If the function return type is x86_fp80 and the callee return type is not, // then the FP_EXTEND of the call result is not a nop. It's not safe to // perform a tailcall optimization here. if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty()) return false; CallingConv::ID CallerCC = CallerF.getCallingConv(); bool CCMatch = CallerCC == CalleeCC; bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC); bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC); bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt || CalleeCC == CallingConv::Tail; // Win64 functions have extra shadow space for argument homing. Don't do the // sibcall if the caller and callee have mismatched expectations for this // space. if (IsCalleeWin64 != IsCallerWin64) return false; if (IsGuaranteeTCO) { if (canGuaranteeTCO(CalleeCC) && CCMatch) return true; return false; } // Look for obvious safe cases to perform tail call optimization that do not // require ABI changes. This is what gcc calls sibcall. // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to // emit a special epilogue. const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); if (RegInfo->needsStackRealignment(MF)) return false; // Also avoid sibcall optimization if either caller or callee uses struct // return semantics. if (isCalleeStructRet || isCallerStructRet) return false; // Do not sibcall optimize vararg calls unless all arguments are passed via // registers. LLVMContext &C = *DAG.getContext(); if (isVarArg && !Outs.empty()) { // Optimizing for varargs on Win64 is unlikely to be safe without // additional testing. if (IsCalleeWin64 || IsCallerWin64) return false; SmallVector ArgLocs; CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); CCInfo.AnalyzeCallOperands(Outs, CC_X86); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) if (!ArgLocs[i].isRegLoc()) return false; } // If the call result is in ST0 / ST1, it needs to be popped off the x87 // stack. Therefore, if it's not used by the call it is not safe to optimize // this into a sibcall. bool Unused = false; for (unsigned i = 0, e = Ins.size(); i != e; ++i) { if (!Ins[i].Used) { Unused = true; break; } } if (Unused) { SmallVector RVLocs; CCState CCInfo(CalleeCC, false, MF, RVLocs, C); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { CCValAssign &VA = RVLocs[i]; if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) return false; } } // Check that the call results are passed in the same way. if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, RetCC_X86, RetCC_X86)) return false; // The callee has to preserve all registers the caller needs to preserve. const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); if (!CCMatch) { const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) return false; } unsigned StackArgsSize = 0; // If the callee takes no arguments then go on to check the results of the // call. if (!Outs.empty()) { // Check if stack adjustment is needed. For now, do not do this if any // argument is passed on the stack. SmallVector ArgLocs; CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); // Allocate shadow area for Win64 if (IsCalleeWin64) CCInfo.AllocateStack(32, 8); CCInfo.AnalyzeCallOperands(Outs, CC_X86); StackArgsSize = CCInfo.getNextStackOffset(); if (CCInfo.getNextStackOffset()) { // Check if the arguments are already laid out in the right way as // the caller's fixed stack objects. MachineFrameInfo &MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; if (VA.getLocInfo() == CCValAssign::Indirect) return false; if (!VA.isRegLoc()) { if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, MFI, MRI, TII, VA)) return false; } } } bool PositionIndependent = isPositionIndependent(); // If the tailcall address may be in a register, then make sure it's // possible to register allocate for it. In 32-bit, the call address can // only target EAX, EDX, or ECX since the tail call must be scheduled after // callee-saved registers are restored. These happen to be the same // registers used to pass 'inreg' arguments so watch out for those. if (!Subtarget.is64Bit() && ((!isa(Callee) && !isa(Callee)) || PositionIndependent)) { unsigned NumInRegs = 0; // In PIC we need an extra register to formulate the address computation // for the callee. unsigned MaxInRegs = PositionIndependent ? 2 : 3; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; if (!VA.isRegLoc()) continue; Register Reg = VA.getLocReg(); switch (Reg) { default: break; case X86::EAX: case X86::EDX: case X86::ECX: if (++NumInRegs == MaxInRegs) return false; break; } } } const MachineRegisterInfo &MRI = MF.getRegInfo(); if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) return false; } bool CalleeWillPop = X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg, MF.getTarget().Options.GuaranteedTailCallOpt); if (unsigned BytesToPop = MF.getInfo()->getBytesToPopOnReturn()) { // If we have bytes to pop, the callee must pop them. bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize; if (!CalleePopMatches) return false; } else if (CalleeWillPop && StackArgsSize > 0) { // If we don't have bytes to pop, make sure the callee doesn't pop any. return false; } return true; } FastISel * X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const { return X86::createFastISel(funcInfo, libInfo); } //===----------------------------------------------------------------------===// // Other Lowering Hooks //===----------------------------------------------------------------------===// static bool MayFoldLoad(SDValue Op) { return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); } static bool MayFoldIntoStore(SDValue Op) { return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); } static bool MayFoldIntoZeroExtend(SDValue Op) { if (Op.hasOneUse()) { unsigned Opcode = Op.getNode()->use_begin()->getOpcode(); return (ISD::ZERO_EXTEND == Opcode); } return false; } static bool isTargetShuffle(unsigned Opcode) { switch(Opcode) { default: return false; case X86ISD::BLENDI: case X86ISD::PSHUFB: case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::SHUFP: case X86ISD::INSERTPS: case X86ISD::EXTRQI: case X86ISD::INSERTQI: case X86ISD::PALIGNR: case X86ISD::VSHLDQ: case X86ISD::VSRLDQ: case X86ISD::MOVLHPS: case X86ISD::MOVHLPS: case X86ISD::MOVSHDUP: case X86ISD::MOVSLDUP: case X86ISD::MOVDDUP: case X86ISD::MOVSS: case X86ISD::MOVSD: case X86ISD::UNPCKL: case X86ISD::UNPCKH: case X86ISD::VBROADCAST: case X86ISD::VPERMILPI: case X86ISD::VPERMILPV: case X86ISD::VPERM2X128: case X86ISD::SHUF128: case X86ISD::VPERMIL2: case X86ISD::VPERMI: case X86ISD::VPPERM: case X86ISD::VPERMV: case X86ISD::VPERMV3: case X86ISD::VZEXT_MOVL: return true; } } static bool isTargetShuffleVariableMask(unsigned Opcode) { switch (Opcode) { default: return false; // Target Shuffles. case X86ISD::PSHUFB: case X86ISD::VPERMILPV: case X86ISD::VPERMIL2: case X86ISD::VPPERM: case X86ISD::VPERMV: case X86ISD::VPERMV3: return true; // 'Faux' Target Shuffles. case ISD::OR: case ISD::AND: case X86ISD::ANDNP: return true; } } SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); int ReturnAddrIndex = FuncInfo->getRAIndex(); if (ReturnAddrIndex == 0) { // Set up a frame object for the return address. unsigned SlotSize = RegInfo->getSlotSize(); ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize, -(int64_t)SlotSize, false); FuncInfo->setRAIndex(ReturnAddrIndex); } return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout())); } bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement) { // Offset should fit into 32 bit immediate field. if (!isInt<32>(Offset)) return false; // If we don't have a symbolic displacement - we don't have any extra // restrictions. if (!hasSymbolicDisplacement) return true; // FIXME: Some tweaks might be needed for medium code model. if (M != CodeModel::Small && M != CodeModel::Kernel) return false; // For small code model we assume that latest object is 16MB before end of 31 // bits boundary. We may also accept pretty large negative constants knowing // that all objects are in the positive half of address space. if (M == CodeModel::Small && Offset < 16*1024*1024) return true; // For kernel code model we know that all object resist in the negative half // of 32bits address space. We may not accept negative offsets, since they may // be just off and we may accept pretty large positive ones. if (M == CodeModel::Kernel && Offset >= 0) return true; return false; } /// Determines whether the callee is required to pop its own arguments. /// Callee pop is necessary to support tail calls. bool X86::isCalleePop(CallingConv::ID CallingConv, bool is64Bit, bool IsVarArg, bool GuaranteeTCO) { // If GuaranteeTCO is true, we force some calls to be callee pop so that we // can guarantee TCO. if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO)) return true; switch (CallingConv) { default: return false; case CallingConv::X86_StdCall: case CallingConv::X86_FastCall: case CallingConv::X86_ThisCall: case CallingConv::X86_VectorCall: return !is64Bit; } } /// Return true if the condition is an signed comparison operation. static bool isX86CCSigned(unsigned X86CC) { switch (X86CC) { default: llvm_unreachable("Invalid integer condition!"); case X86::COND_E: case X86::COND_NE: case X86::COND_B: case X86::COND_A: case X86::COND_BE: case X86::COND_AE: return false; case X86::COND_G: case X86::COND_GE: case X86::COND_L: case X86::COND_LE: return true; } } static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) { switch (SetCCOpcode) { default: llvm_unreachable("Invalid integer condition!"); case ISD::SETEQ: return X86::COND_E; case ISD::SETGT: return X86::COND_G; case ISD::SETGE: return X86::COND_GE; case ISD::SETLT: return X86::COND_L; case ISD::SETLE: return X86::COND_LE; case ISD::SETNE: return X86::COND_NE; case ISD::SETULT: return X86::COND_B; case ISD::SETUGT: return X86::COND_A; case ISD::SETULE: return X86::COND_BE; case ISD::SETUGE: return X86::COND_AE; } } /// Do a one-to-one translation of a ISD::CondCode to the X86-specific /// condition code, returning the condition code and the LHS/RHS of the /// comparison to make. static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { if (!isFP) { if (ConstantSDNode *RHSC = dyn_cast(RHS)) { if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { // X > -1 -> X == 0, jump !sign. RHS = DAG.getConstant(0, DL, RHS.getValueType()); return X86::COND_NS; } if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { // X < 0 -> X == 0, jump on sign. return X86::COND_S; } if (SetCCOpcode == ISD::SETGE && RHSC->isNullValue()) { // X >= 0 -> X == 0, jump on !sign. return X86::COND_NS; } if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) { // X < 1 -> X <= 0 RHS = DAG.getConstant(0, DL, RHS.getValueType()); return X86::COND_LE; } } return TranslateIntegerX86CC(SetCCOpcode); } // First determine if it is required or is profitable to flip the operands. // If LHS is a foldable load, but RHS is not, flip the condition. if (ISD::isNON_EXTLoad(LHS.getNode()) && !ISD::isNON_EXTLoad(RHS.getNode())) { SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); std::swap(LHS, RHS); } switch (SetCCOpcode) { default: break; case ISD::SETOLT: case ISD::SETOLE: case ISD::SETUGT: case ISD::SETUGE: std::swap(LHS, RHS); break; } // On a floating point condition, the flags are set as follows: // ZF PF CF op // 0 | 0 | 0 | X > Y // 0 | 0 | 1 | X < Y // 1 | 0 | 0 | X == Y // 1 | 1 | 1 | unordered switch (SetCCOpcode) { default: llvm_unreachable("Condcode should be pre-legalized away"); case ISD::SETUEQ: case ISD::SETEQ: return X86::COND_E; case ISD::SETOLT: // flipped case ISD::SETOGT: case ISD::SETGT: return X86::COND_A; case ISD::SETOLE: // flipped case ISD::SETOGE: case ISD::SETGE: return X86::COND_AE; case ISD::SETUGT: // flipped case ISD::SETULT: case ISD::SETLT: return X86::COND_B; case ISD::SETUGE: // flipped case ISD::SETULE: case ISD::SETLE: return X86::COND_BE; case ISD::SETONE: case ISD::SETNE: return X86::COND_NE; case ISD::SETUO: return X86::COND_P; case ISD::SETO: return X86::COND_NP; case ISD::SETOEQ: case ISD::SETUNE: return X86::COND_INVALID; } } /// Is there a floating point cmov for the specific X86 condition code? /// Current x86 isa includes the following FP cmov instructions: /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. static bool hasFPCMov(unsigned X86CC) { switch (X86CC) { default: return false; case X86::COND_B: case X86::COND_BE: case X86::COND_E: case X86::COND_P: case X86::COND_A: case X86::COND_AE: case X86::COND_NE: case X86::COND_NP: return true; } } bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const { const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic); if (!IntrData) return false; Info.flags = MachineMemOperand::MONone; Info.offset = 0; switch (IntrData->Type) { case TRUNCATE_TO_MEM_VI8: case TRUNCATE_TO_MEM_VI16: case TRUNCATE_TO_MEM_VI32: { Info.opc = ISD::INTRINSIC_VOID; Info.ptrVal = I.getArgOperand(0); MVT VT = MVT::getVT(I.getArgOperand(1)->getType()); MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE; if (IntrData->Type == TRUNCATE_TO_MEM_VI8) ScalarVT = MVT::i8; else if (IntrData->Type == TRUNCATE_TO_MEM_VI16) ScalarVT = MVT::i16; else if (IntrData->Type == TRUNCATE_TO_MEM_VI32) ScalarVT = MVT::i32; Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements()); Info.align = Align::None(); Info.flags |= MachineMemOperand::MOStore; break; } case GATHER: case GATHER_AVX2: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.ptrVal = nullptr; MVT DataVT = MVT::getVT(I.getType()); MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType()); unsigned NumElts = std::min(DataVT.getVectorNumElements(), IndexVT.getVectorNumElements()); Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts); Info.align = Align::None(); Info.flags |= MachineMemOperand::MOLoad; break; } case SCATTER: { Info.opc = ISD::INTRINSIC_VOID; Info.ptrVal = nullptr; MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType()); MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType()); unsigned NumElts = std::min(DataVT.getVectorNumElements(), IndexVT.getVectorNumElements()); Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts); Info.align = Align::None(); Info.flags |= MachineMemOperand::MOStore; break; } default: return false; } return true; } /// Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const { for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) return true; } return false; } bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const { assert(cast(Load)->isSimple() && "illegal to narrow"); // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF // relocation target a movq or addq instruction: don't let the load shrink. SDValue BasePtr = cast(Load)->getBasePtr(); if (BasePtr.getOpcode() == X86ISD::WrapperRIP) if (const auto *GA = dyn_cast(BasePtr.getOperand(0))) return GA->getTargetFlags() != X86II::MO_GOTTPOFF; // If this is an (1) AVX vector load with (2) multiple uses and (3) all of // those uses are extracted directly into a store, then the extract + store // can be store-folded. Therefore, it's probably not worth splitting the load. EVT VT = Load->getValueType(0); if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) { for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) { // Skip uses of the chain value. Result 0 of the node is the load value. if (UI.getUse().getResNo() != 0) continue; // If this use is not an extract + store, it's probably worth splitting. if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() || UI->use_begin()->getOpcode() != ISD::STORE) return true; } // All non-chain uses are extract + store. return false; } return true; } /// Returns true if it is beneficial to convert a load of a constant /// to just the constant itself. bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); if (BitSize == 0 || BitSize > 64) return false; return true; } bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const { // If we are using XMM registers in the ABI and the condition of the select is // a floating-point compare and we have blendv or conditional move, then it is // cheaper to select instead of doing a cross-register move and creating a // load that depends on the compare result. bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128; return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX(); } bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const { // TODO: It might be a win to ease or lift this restriction, but the generic // folds in DAGCombiner conflict with vector folds for an AVX512 target. if (VT.isVector() && Subtarget.hasAVX512()) return false; return true; } bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT, SDValue C) const { // TODO: We handle scalars using custom code, but generic combining could make // that unnecessary. APInt MulC; if (!ISD::isConstantSplatVector(C.getNode(), MulC)) return false; // Find the type this will be legalized too. Otherwise we might prematurely // convert this to shl+add/sub and then still have to type legalize those ops. // Another choice would be to defer the decision for illegal types until // after type legalization. But constant splat vectors of i64 can't make it // through type legalization on 32-bit targets so we would need to special // case vXi64. while (getTypeAction(Context, VT) != TypeLegal) VT = getTypeToTransformTo(Context, VT); // If vector multiply is legal, assume that's faster than shl + add/sub. // TODO: Multiply is a complex op with higher latency and lower throughput in // most implementations, so this check could be loosened based on type // and/or a CPU attribute. if (isOperationLegal(ISD::MUL, VT)) return false; // shl+add, shl+sub, shl+add+neg return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() || (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2(); } bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const { if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) return false; // Mask vectors support all subregister combinations and operations that // extract half of vector. if (ResVT.getVectorElementType() == MVT::i1) return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) && (Index == ResVT.getVectorNumElements())); return (Index % ResVT.getVectorNumElements()) == 0; } bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const { unsigned Opc = VecOp.getOpcode(); // Assume target opcodes can't be scalarized. // TODO - do we have any exceptions? if (Opc >= ISD::BUILTIN_OP_END) return false; // If the vector op is not supported, try to convert to scalar. EVT VecVT = VecOp.getValueType(); if (!isOperationLegalOrCustomOrPromote(Opc, VecVT)) return true; // If the vector op is supported, but the scalar op is not, the transform may // not be worthwhile. EVT ScalarVT = VecVT.getScalarType(); return isOperationLegalOrCustomOrPromote(Opc, ScalarVT); } bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT) const { // TODO: Allow vectors? if (VT.isVector()) return false; return VT.isSimple() || !isOperationExpand(Opcode, VT); } bool X86TargetLowering::isCheapToSpeculateCttz() const { // Speculate cttz only if we can directly use TZCNT. return Subtarget.hasBMI(); } bool X86TargetLowering::isCheapToSpeculateCtlz() const { // Speculate ctlz only if we can directly use LZCNT. return Subtarget.hasLZCNT(); } bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const { if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() && BitcastVT.getVectorElementType() == MVT::i1) return false; if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8) return false; // If both types are legal vectors, it's always ok to convert them. if (LoadVT.isVector() && BitcastVT.isVector() && isTypeLegal(LoadVT) && isTypeLegal(BitcastVT)) return true; return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO); } bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const SelectionDAG &DAG) const { // Do not merge to float value size (128 bytes) if no implicit // float attribute is set. bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute( Attribute::NoImplicitFloat); if (NoFloat) { unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32; return (MemVT.getSizeInBits() <= MaxIntSize); } // Make sure we don't merge greater than our preferred vector // width. if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth()) return false; return true; } bool X86TargetLowering::isCtlzFast() const { return Subtarget.hasFastLZCNT(); } bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial( const Instruction &AndI) const { return true; } bool X86TargetLowering::hasAndNotCompare(SDValue Y) const { EVT VT = Y.getValueType(); if (VT.isVector()) return false; if (!Subtarget.hasBMI()) return false; // There are only 32-bit and 64-bit forms for 'andn'. if (VT != MVT::i32 && VT != MVT::i64) return false; return !isa(Y); } bool X86TargetLowering::hasAndNot(SDValue Y) const { EVT VT = Y.getValueType(); if (!VT.isVector()) return hasAndNotCompare(Y); // Vector. if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128) return false; if (VT == MVT::v4i32) return true; return Subtarget.hasSSE2(); } bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const { return X.getValueType().isScalarInteger(); // 'bt' } bool X86TargetLowering:: shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const { // Does baseline recommend not to perform the fold by default? if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG)) return false; // For scalars this transform is always beneficial. if (X.getValueType().isScalarInteger()) return true; // If all the shift amounts are identical, then transform is beneficial even // with rudimentary SSE2 shifts. if (DAG.isSplatValue(Y, /*AllowUndefs=*/true)) return true; // If we have AVX2 with it's powerful shift operations, then it's also good. if (Subtarget.hasAVX2()) return true; // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'. return NewShiftOpcode == ISD::SHL; } bool X86TargetLowering::shouldFoldConstantShiftPairToMask( const SDNode *N, CombineLevel Level) const { assert(((N->getOpcode() == ISD::SHL && N->getOperand(0).getOpcode() == ISD::SRL) || (N->getOpcode() == ISD::SRL && N->getOperand(0).getOpcode() == ISD::SHL)) && "Expected shift-shift mask"); EVT VT = N->getValueType(0); if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) || (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) { // Only fold if the shift values are equal - so it folds to AND. // TODO - we should fold if either is a non-uniform vector but we don't do // the fold for non-splats yet. return N->getOperand(1) == N->getOperand(0).getOperand(1); } return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level); } bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const { EVT VT = Y.getValueType(); // For vectors, we don't have a preference, but we probably want a mask. if (VT.isVector()) return false; // 64-bit shifts on 32-bit targets produce really bad bloated code. if (VT == MVT::i64 && !Subtarget.is64Bit()) return false; return true; } bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const { if (DAG.getMachineFunction().getFunction().hasMinSize() && !Subtarget.isOSWindows()) return false; return true; } bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const { // Any legal vector type can be splatted more efficiently than // loading/spilling from memory. return isTypeLegal(VT); } MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const { MVT VT = MVT::getIntegerVT(NumBits); if (isTypeLegal(VT)) return VT; // PMOVMSKB can handle this. if (NumBits == 128 && isTypeLegal(MVT::v16i8)) return MVT::v16i8; // VPMOVMSKB can handle this. if (NumBits == 256 && isTypeLegal(MVT::v32i8)) return MVT::v32i8; // TODO: Allow 64-bit type for 32-bit target. // TODO: 512-bit types should be allowed, but make sure that those // cases are handled in combineVectorSizedSetCCEquality(). return MVT::INVALID_SIMPLE_VALUE_TYPE; } /// Val is the undef sentinel value or equal to the specified value. static bool isUndefOrEqual(int Val, int CmpVal) { return ((Val == SM_SentinelUndef) || (Val == CmpVal)); } /// Val is either the undef or zero sentinel value. static bool isUndefOrZero(int Val) { return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero)); } /// Return true if every element in Mask, beginning from position Pos and ending /// in Pos+Size is the undef sentinel value. static bool isUndefInRange(ArrayRef Mask, unsigned Pos, unsigned Size) { return llvm::all_of(Mask.slice(Pos, Size), [](int M) { return M == SM_SentinelUndef; }); } /// Return true if the mask creates a vector whose lower half is undefined. static bool isUndefLowerHalf(ArrayRef Mask) { unsigned NumElts = Mask.size(); return isUndefInRange(Mask, 0, NumElts / 2); } /// Return true if the mask creates a vector whose upper half is undefined. static bool isUndefUpperHalf(ArrayRef Mask) { unsigned NumElts = Mask.size(); return isUndefInRange(Mask, NumElts / 2, NumElts / 2); } /// Return true if Val falls within the specified range (L, H]. static bool isInRange(int Val, int Low, int Hi) { return (Val >= Low && Val < Hi); } /// Return true if the value of any element in Mask falls within the specified /// range (L, H]. static bool isAnyInRange(ArrayRef Mask, int Low, int Hi) { return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); }); } /// Return true if Val is undef or if its value falls within the /// specified range (L, H]. static bool isUndefOrInRange(int Val, int Low, int Hi) { return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi); } /// Return true if every element in Mask is undef or if its value /// falls within the specified range (L, H]. static bool isUndefOrInRange(ArrayRef Mask, int Low, int Hi) { return llvm::all_of( Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); }); } /// Return true if Val is undef, zero or if its value falls within the /// specified range (L, H]. static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) { return isUndefOrZero(Val) || isInRange(Val, Low, Hi); } /// Return true if every element in Mask is undef, zero or if its value /// falls within the specified range (L, H]. static bool isUndefOrZeroOrInRange(ArrayRef Mask, int Low, int Hi) { return llvm::all_of( Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); }); } /// Return true if every element in Mask, beginning /// from position Pos and ending in Pos + Size, falls within the specified /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef. static bool isSequentialOrUndefInRange(ArrayRef Mask, unsigned Pos, unsigned Size, int Low, int Step = 1) { for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step) if (!isUndefOrEqual(Mask[i], Low)) return false; return true; } /// Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size, falls within the specified /// sequential range (Low, Low+Size], or is undef or is zero. static bool isSequentialOrUndefOrZeroInRange(ArrayRef Mask, unsigned Pos, unsigned Size, int Low, int Step = 1) { for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step) if (!isUndefOrZero(Mask[i]) && Mask[i] != Low) return false; return true; } /// Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size is undef or is zero. static bool isUndefOrZeroInRange(ArrayRef Mask, unsigned Pos, unsigned Size) { return llvm::all_of(Mask.slice(Pos, Size), [](int M) { return isUndefOrZero(M); }); } /// Helper function to test whether a shuffle mask could be /// simplified by widening the elements being shuffled. /// /// Appends the mask for wider elements in WidenedMask if valid. Otherwise /// leaves it in an unspecified state. /// /// NOTE: This must handle normal vector shuffle masks and *target* vector /// shuffle masks. The latter have the special property of a '-2' representing /// a zero-ed lane of a vector. static bool canWidenShuffleElements(ArrayRef Mask, SmallVectorImpl &WidenedMask) { WidenedMask.assign(Mask.size() / 2, 0); for (int i = 0, Size = Mask.size(); i < Size; i += 2) { int M0 = Mask[i]; int M1 = Mask[i + 1]; // If both elements are undef, its trivial. if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) { WidenedMask[i / 2] = SM_SentinelUndef; continue; } // Check for an undef mask and a mask value properly aligned to fit with // a pair of values. If we find such a case, use the non-undef mask's value. if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) { WidenedMask[i / 2] = M1 / 2; continue; } if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) { WidenedMask[i / 2] = M0 / 2; continue; } // When zeroing, we need to spread the zeroing across both lanes to widen. if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) { if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) && (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) { WidenedMask[i / 2] = SM_SentinelZero; continue; } return false; } // Finally check if the two mask values are adjacent and aligned with // a pair. if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) { WidenedMask[i / 2] = M0 / 2; continue; } // Otherwise we can't safely widen the elements used in this shuffle. return false; } assert(WidenedMask.size() == Mask.size() / 2 && "Incorrect size of mask after widening the elements!"); return true; } static bool canWidenShuffleElements(ArrayRef Mask, const APInt &Zeroable, bool V2IsZero, SmallVectorImpl &WidenedMask) { // Create an alternative mask with info about zeroable elements. // Here we do not set undef elements as zeroable. SmallVector ZeroableMask(Mask.begin(), Mask.end()); if (V2IsZero) { assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!"); for (int i = 0, Size = Mask.size(); i != Size; ++i) if (Mask[i] != SM_SentinelUndef && Zeroable[i]) ZeroableMask[i] = SM_SentinelZero; } return canWidenShuffleElements(ZeroableMask, WidenedMask); } static bool canWidenShuffleElements(ArrayRef Mask) { SmallVector WidenedMask; return canWidenShuffleElements(Mask, WidenedMask); } /// Returns true if Elt is a constant zero or a floating point constant +0.0. bool X86::isZeroNode(SDValue Elt) { return isNullConstant(Elt) || isNullFPConstant(Elt); } // Build a vector of constants. // Use an UNDEF node if MaskElt == -1. // Split 64-bit constants in the 32-bit mode. static SDValue getConstVector(ArrayRef Values, MVT VT, SelectionDAG &DAG, const SDLoc &dl, bool IsMask = false) { SmallVector Ops; bool Split = false; MVT ConstVecVT = VT; unsigned NumElts = VT.getVectorNumElements(); bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64); if (!In64BitMode && VT.getVectorElementType() == MVT::i64) { ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2); Split = true; } MVT EltVT = ConstVecVT.getVectorElementType(); for (unsigned i = 0; i < NumElts; ++i) { bool IsUndef = Values[i] < 0 && IsMask; SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) : DAG.getConstant(Values[i], dl, EltVT); Ops.push_back(OpNode); if (Split) Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) : DAG.getConstant(0, dl, EltVT)); } SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops); if (Split) ConstsNode = DAG.getBitcast(VT, ConstsNode); return ConstsNode; } static SDValue getConstVector(ArrayRef Bits, APInt &Undefs, MVT VT, SelectionDAG &DAG, const SDLoc &dl) { assert(Bits.size() == Undefs.getBitWidth() && "Unequal constant and undef arrays"); SmallVector Ops; bool Split = false; MVT ConstVecVT = VT; unsigned NumElts = VT.getVectorNumElements(); bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64); if (!In64BitMode && VT.getVectorElementType() == MVT::i64) { ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2); Split = true; } MVT EltVT = ConstVecVT.getVectorElementType(); for (unsigned i = 0, e = Bits.size(); i != e; ++i) { if (Undefs[i]) { Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT)); continue; } const APInt &V = Bits[i]; assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes"); if (Split) { Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT)); Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT)); } else if (EltVT == MVT::f32) { APFloat FV(APFloat::IEEEsingle(), V); Ops.push_back(DAG.getConstantFP(FV, dl, EltVT)); } else if (EltVT == MVT::f64) { APFloat FV(APFloat::IEEEdouble(), V); Ops.push_back(DAG.getConstantFP(FV, dl, EltVT)); } else { Ops.push_back(DAG.getConstant(V, dl, EltVT)); } } SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops); return DAG.getBitcast(VT, ConstsNode); } /// Returns a vector of specified type with all zero elements. static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl) { assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() || VT.getVectorElementType() == MVT::i1) && "Unexpected vector type"); // Try to build SSE/AVX zero vectors as bitcasted to their dest // type. This ensures they get CSE'd. But if the integer type is not // available, use a floating-point +0.0 instead. SDValue Vec; if (!Subtarget.hasSSE2() && VT.is128BitVector()) { Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32); } else if (VT.isFloatingPoint()) { Vec = DAG.getConstantFP(+0.0, dl, VT); } else if (VT.getVectorElementType() == MVT::i1) { assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) && "Unexpected vector type"); Vec = DAG.getConstant(0, dl, VT); } else { unsigned Num32BitElts = VT.getSizeInBits() / 32; Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts)); } return DAG.getBitcast(VT, Vec); } static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl, unsigned vectorWidth) { EVT VT = Vec.getValueType(); EVT ElVT = VT.getVectorElementType(); unsigned Factor = VT.getSizeInBits()/vectorWidth; EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, VT.getVectorNumElements()/Factor); // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits(); assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); // This is the index of the first element of the vectorWidth-bit chunk // we want. Since ElemsPerChunk is a power of 2 just need to clear bits. IdxVal &= ~(ElemsPerChunk - 1); // If the input is a buildvector just emit a smaller one. if (Vec.getOpcode() == ISD::BUILD_VECTOR) return DAG.getBuildVector(ResultVT, dl, Vec->ops().slice(IdxVal, ElemsPerChunk)); SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); } /// Generate a DAG to grab 128-bits from a vector > 128 bits. This /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4 /// instructions or a simple subregister reference. Idx is an index in the /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes /// lowering EXTRACT_VECTOR_ELT operations easier. static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl) { assert((Vec.getValueType().is256BitVector() || Vec.getValueType().is512BitVector()) && "Unexpected vector size!"); return extractSubVector(Vec, IdxVal, DAG, dl, 128); } /// Generate a DAG to grab 256-bits from a 512-bit vector. static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl) { assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!"); return extractSubVector(Vec, IdxVal, DAG, dl, 256); } static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl, unsigned vectorWidth) { assert((vectorWidth == 128 || vectorWidth == 256) && "Unsupported vector width"); // Inserting UNDEF is Result if (Vec.isUndef()) return Result; EVT VT = Vec.getValueType(); EVT ElVT = VT.getVectorElementType(); EVT ResultVT = Result.getValueType(); // Insert the relevant vectorWidth bits. unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits(); assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); // This is the index of the first element of the vectorWidth-bit chunk // we want. Since ElemsPerChunk is a power of 2 just need to clear bits. IdxVal &= ~(ElemsPerChunk - 1); SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); } /// Generate a DAG to put 128-bits into a vector > 128 bits. This /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a /// simple superregister reference. Idx is an index in the 128 bits /// we want. It need not be aligned to a 128-bit boundary. That makes /// lowering INSERT_VECTOR_ELT operations easier. static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl) { assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!"); return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128); } /// Widen a vector to a larger size with the same scalar type, with the new /// elements either zero or undef. static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl) { assert(Vec.getValueSizeInBits() < VT.getSizeInBits() && Vec.getValueType().getScalarType() == VT.getScalarType() && "Unsupported vector widening type"); SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl) : DAG.getUNDEF(VT); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec, DAG.getIntPtrConstant(0, dl)); } /// Widen a vector to a larger size with the same scalar type, with the new /// elements either zero or undef. static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl, unsigned WideSizeInBits) { assert(Vec.getValueSizeInBits() < WideSizeInBits && (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 && "Unsupported vector widening type"); unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits(); MVT SVT = Vec.getSimpleValueType().getScalarType(); MVT VT = MVT::getVectorVT(SVT, WideNumElts); return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl); } // Helper function to collect subvector ops that are concated together, // either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series. // The subvectors in Ops are guaranteed to be the same type. static bool collectConcatOps(SDNode *N, SmallVectorImpl &Ops) { assert(Ops.empty() && "Expected an empty ops vector"); if (N->getOpcode() == ISD::CONCAT_VECTORS) { Ops.append(N->op_begin(), N->op_end()); return true; } if (N->getOpcode() == ISD::INSERT_SUBVECTOR && isa(N->getOperand(2))) { SDValue Src = N->getOperand(0); SDValue Sub = N->getOperand(1); const APInt &Idx = N->getConstantOperandAPInt(2); EVT VT = Src.getValueType(); EVT SubVT = Sub.getValueType(); // TODO - Handle more general insert_subvector chains. if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) && Idx == (VT.getVectorNumElements() / 2) && Src.getOpcode() == ISD::INSERT_SUBVECTOR && Src.getOperand(1).getValueType() == SubVT && isNullConstant(Src.getOperand(2))) { Ops.push_back(Src.getOperand(1)); Ops.push_back(Sub); return true; } } return false; } // Helper for splitting operands of an operation to legal target size and // apply a function on each part. // Useful for operations that are available on SSE2 in 128-bit, on AVX2 in // 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for // deciding if/how to split Ops. Ops elements do *not* have to be of type VT. // The argument Builder is a function that will be applied on each split part: // SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef) template SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL, EVT VT, ArrayRef Ops, F Builder, bool CheckBWI = true) { assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2"); unsigned NumSubs = 1; if ((CheckBWI && Subtarget.useBWIRegs()) || (!CheckBWI && Subtarget.useAVX512Regs())) { if (VT.getSizeInBits() > 512) { NumSubs = VT.getSizeInBits() / 512; assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size"); } } else if (Subtarget.hasAVX2()) { if (VT.getSizeInBits() > 256) { NumSubs = VT.getSizeInBits() / 256; assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size"); } } else { if (VT.getSizeInBits() > 128) { NumSubs = VT.getSizeInBits() / 128; assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size"); } } if (NumSubs == 1) return Builder(DAG, DL, Ops); SmallVector Subs; for (unsigned i = 0; i != NumSubs; ++i) { SmallVector SubOps; for (SDValue Op : Ops) { EVT OpVT = Op.getValueType(); unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs; unsigned SizeSub = OpVT.getSizeInBits() / NumSubs; SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub)); } Subs.push_back(Builder(DAG, DL, SubOps)); } return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs); } /// Insert i1-subvector to i1-vector. static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc dl(Op); SDValue Vec = Op.getOperand(0); SDValue SubVec = Op.getOperand(1); SDValue Idx = Op.getOperand(2); if (!isa(Idx)) return SDValue(); // Inserting undef is a nop. We can just return the original vector. if (SubVec.isUndef()) return Vec; unsigned IdxVal = cast(Idx)->getZExtValue(); if (IdxVal == 0 && Vec.isUndef()) // the operation is legal return Op; MVT OpVT = Op.getSimpleValueType(); unsigned NumElems = OpVT.getVectorNumElements(); SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); // Extend to natively supported kshift. MVT WideOpVT = OpVT; if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1; // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts // if necessary. if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) { // May need to promote to a legal type. Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, DAG.getConstant(0, dl, WideOpVT), SubVec, Idx); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); } MVT SubVecVT = SubVec.getSimpleValueType(); unsigned SubVecNumElems = SubVecVT.getVectorNumElements(); assert(IdxVal + SubVecNumElems <= NumElems && IdxVal % SubVecVT.getSizeInBits() == 0 && "Unexpected index value in INSERT_SUBVECTOR"); SDValue Undef = DAG.getUNDEF(WideOpVT); if (IdxVal == 0) { // Zero lower bits of the Vec SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8); Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); // Merge them together, SubVec should be zero extended. SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, DAG.getConstant(0, dl, WideOpVT), SubVec, ZeroIdx); Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); } SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, SubVec, ZeroIdx); if (Vec.isUndef()) { assert(IdxVal != 0 && "Unexpected index"); SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, DAG.getTargetConstant(IdxVal, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx); } if (ISD::isBuildVectorAllZeros(Vec.getNode())) { assert(IdxVal != 0 && "Unexpected index"); NumElems = WideOpVT.getVectorNumElements(); unsigned ShiftLeft = NumElems - SubVecNumElems; unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, DAG.getTargetConstant(ShiftLeft, dl, MVT::i8)); if (ShiftRight != 0) SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec, DAG.getTargetConstant(ShiftRight, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx); } // Simple case when we put subvector in the upper part if (IdxVal + SubVecNumElems == NumElems) { SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, DAG.getTargetConstant(IdxVal, dl, MVT::i8)); if (SubVecNumElems * 2 == NumElems) { // Special case, use legal zero extending insert_subvector. This allows // isel to opimitize when bits are known zero. Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx); Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, DAG.getConstant(0, dl, WideOpVT), Vec, ZeroIdx); } else { // Otherwise use explicit shifts to zero the bits. Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); NumElems = WideOpVT.getVectorNumElements(); SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8); Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); } Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); } // Inserting into the middle is more complicated. NumElems = WideOpVT.getVectorNumElements(); // Widen the vector if needed. Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); unsigned ShiftLeft = NumElems - SubVecNumElems; unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; // Do an optimization for the the most frequently used types. if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) { APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems); Mask0.flipAllBits(); SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems)); SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0); Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0); SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, DAG.getTargetConstant(ShiftLeft, dl, MVT::i8)); SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec, DAG.getTargetConstant(ShiftRight, dl, MVT::i8)); Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec); // Reduce to original width if needed. return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); } // Clear the upper bits of the subvector and move it to its insert position. SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, DAG.getTargetConstant(ShiftLeft, dl, MVT::i8)); SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec, DAG.getTargetConstant(ShiftRight, dl, MVT::i8)); // Isolate the bits below the insertion point. unsigned LowShift = NumElems - IdxVal; SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, DAG.getTargetConstant(LowShift, dl, MVT::i8)); Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low, DAG.getTargetConstant(LowShift, dl, MVT::i8)); // Isolate the bits after the last inserted bit. unsigned HighShift = IdxVal + SubVecNumElems; SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, DAG.getTargetConstant(HighShift, dl, MVT::i8)); High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High, DAG.getTargetConstant(HighShift, dl, MVT::i8)); // Now OR all 3 pieces together. Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High); SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec); // Reduce to original width if needed. return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx); } static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, const SDLoc &dl) { assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch"); EVT SubVT = V1.getValueType(); EVT SubSVT = SubVT.getScalarType(); unsigned SubNumElts = SubVT.getVectorNumElements(); unsigned SubVectorWidth = SubVT.getSizeInBits(); EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts); SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth); return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth); } /// Returns a vector of specified type with all bits set. /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>. /// Then bitcast to their original type, ensuring they get CSE'd. static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) { assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Expected a 128/256/512-bit vector type"); APInt Ones = APInt::getAllOnesValue(32); unsigned NumElts = VT.getSizeInBits() / 32; SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts)); return DAG.getBitcast(VT, Vec); } // Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode. static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) { switch (Opcode) { case ISD::ANY_EXTEND: case ISD::ANY_EXTEND_VECTOR_INREG: return ISD::ANY_EXTEND_VECTOR_INREG; case ISD::ZERO_EXTEND: case ISD::ZERO_EXTEND_VECTOR_INREG: return ISD::ZERO_EXTEND_VECTOR_INREG; case ISD::SIGN_EXTEND: case ISD::SIGN_EXTEND_VECTOR_INREG: return ISD::SIGN_EXTEND_VECTOR_INREG; } llvm_unreachable("Unknown opcode"); } static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue In, SelectionDAG &DAG) { EVT InVT = In.getValueType(); assert(VT.isVector() && InVT.isVector() && "Expected vector VTs."); assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode || ISD::ZERO_EXTEND == Opcode) && "Unknown extension opcode"); // For 256-bit vectors, we only need the lower (128-bit) input half. // For 512-bit vectors, we only need the lower input half or quarter. if (InVT.getSizeInBits() > 128) { assert(VT.getSizeInBits() == InVT.getSizeInBits() && "Expected VTs to be the same size!"); unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits(); In = extractSubVector(In, 0, DAG, DL, std::max(128U, (unsigned)VT.getSizeInBits() / Scale)); InVT = In.getValueType(); } if (VT.getVectorNumElements() != InVT.getVectorNumElements()) Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode); return DAG.getNode(Opcode, DL, VT, In); } // Match (xor X, -1) -> X. // Match extract_subvector(xor X, -1) -> extract_subvector(X). // Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y). static SDValue IsNOT(SDValue V, SelectionDAG &DAG) { V = peekThroughBitcasts(V); if (V.getOpcode() == ISD::XOR && ISD::isBuildVectorAllOnes(V.getOperand(1).getNode())) return V.getOperand(0); if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR && (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) { if (SDValue Not = IsNOT(V.getOperand(0), DAG)) { Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(), Not, V.getOperand(1)); } } SmallVector CatOps; if (collectConcatOps(V.getNode(), CatOps)) { for (SDValue &CatOp : CatOps) { SDValue NotCat = IsNOT(CatOp, DAG); if (!NotCat) return SDValue(); CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat); } return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps); } return SDValue(); } /// Returns a vector_shuffle node for an unpackl operation. static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1, SDValue V2) { SmallVector Mask; createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false); return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); } /// Returns a vector_shuffle node for an unpackh operation. static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1, SDValue V2) { SmallVector Mask; createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false); return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); } /// Return a vector_shuffle of the specified vector of zero or undef vector. /// This produces a shuffle where the low element of V2 is swizzled into the /// zero/undef vector, landing at element Idx. /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = V2.getSimpleValueType(); SDValue V1 = IsZero ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT); int NumElems = VT.getVectorNumElements(); SmallVector MaskVec(NumElems); for (int i = 0; i != NumElems; ++i) // If this is the insertion idx, put the low elt of V2 here. MaskVec[i] = (i == Idx) ? NumElems : i; return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec); } static const Constant *getTargetConstantFromNode(LoadSDNode *Load) { if (!Load || !ISD::isNormalLoad(Load)) return nullptr; SDValue Ptr = Load->getBasePtr(); if (Ptr->getOpcode() == X86ISD::Wrapper || Ptr->getOpcode() == X86ISD::WrapperRIP) Ptr = Ptr->getOperand(0); auto *CNode = dyn_cast(Ptr); if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0) return nullptr; return CNode->getConstVal(); } static const Constant *getTargetConstantFromNode(SDValue Op) { Op = peekThroughBitcasts(Op); return getTargetConstantFromNode(dyn_cast(Op)); } const Constant * X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const { assert(LD && "Unexpected null LoadSDNode"); return getTargetConstantFromNode(LD); } // Extract raw constant bits from constant pools. static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, SmallVectorImpl &EltBits, bool AllowWholeUndefs = true, bool AllowPartialUndefs = true) { assert(EltBits.empty() && "Expected an empty EltBits vector"); Op = peekThroughBitcasts(Op); EVT VT = Op.getValueType(); unsigned SizeInBits = VT.getSizeInBits(); assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!"); unsigned NumElts = SizeInBits / EltSizeInBits; // Bitcast a source array of element bits to the target size. auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef SrcEltBits) { unsigned NumSrcElts = UndefSrcElts.getBitWidth(); unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth(); assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits && "Constant bit sizes don't match"); // Don't split if we don't allow undef bits. bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs; if (UndefSrcElts.getBoolValue() && !AllowUndefs) return false; // If we're already the right size, don't bother bitcasting. if (NumSrcElts == NumElts) { UndefElts = UndefSrcElts; EltBits.assign(SrcEltBits.begin(), SrcEltBits.end()); return true; } // Extract all the undef/constant element data and pack into single bitsets. APInt UndefBits(SizeInBits, 0); APInt MaskBits(SizeInBits, 0); for (unsigned i = 0; i != NumSrcElts; ++i) { unsigned BitOffset = i * SrcEltSizeInBits; if (UndefSrcElts[i]) UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits); MaskBits.insertBits(SrcEltBits[i], BitOffset); } // Split the undef/constant single bitset data into the target elements. UndefElts = APInt(NumElts, 0); EltBits.resize(NumElts, APInt(EltSizeInBits, 0)); for (unsigned i = 0; i != NumElts; ++i) { unsigned BitOffset = i * EltSizeInBits; APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset); // Only treat an element as UNDEF if all bits are UNDEF. if (UndefEltBits.isAllOnesValue()) { if (!AllowWholeUndefs) return false; UndefElts.setBit(i); continue; } // If only some bits are UNDEF then treat them as zero (or bail if not // supported). if (UndefEltBits.getBoolValue() && !AllowPartialUndefs) return false; EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset); } return true; }; // Collect constant bits and insert into mask/undef bit masks. auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs, unsigned UndefBitIndex) { if (!Cst) return false; if (isa(Cst)) { Undefs.setBit(UndefBitIndex); return true; } if (auto *CInt = dyn_cast(Cst)) { Mask = CInt->getValue(); return true; } if (auto *CFP = dyn_cast(Cst)) { Mask = CFP->getValueAPF().bitcastToAPInt(); return true; } return false; }; // Handle UNDEFs. if (Op.isUndef()) { APInt UndefSrcElts = APInt::getAllOnesValue(NumElts); SmallVector SrcEltBits(NumElts, APInt(EltSizeInBits, 0)); return CastBitData(UndefSrcElts, SrcEltBits); } // Extract scalar constant bits. if (auto *Cst = dyn_cast(Op)) { APInt UndefSrcElts = APInt::getNullValue(1); SmallVector SrcEltBits(1, Cst->getAPIntValue()); return CastBitData(UndefSrcElts, SrcEltBits); } if (auto *Cst = dyn_cast(Op)) { APInt UndefSrcElts = APInt::getNullValue(1); APInt RawBits = Cst->getValueAPF().bitcastToAPInt(); SmallVector SrcEltBits(1, RawBits); return CastBitData(UndefSrcElts, SrcEltBits); } // Extract constant bits from build vector. if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; APInt UndefSrcElts(NumSrcElts, 0); SmallVector SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0)); for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) { const SDValue &Src = Op.getOperand(i); if (Src.isUndef()) { UndefSrcElts.setBit(i); continue; } auto *Cst = cast(Src); SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits); } return CastBitData(UndefSrcElts, SrcEltBits); } if (ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) { unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; APInt UndefSrcElts(NumSrcElts, 0); SmallVector SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0)); for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) { const SDValue &Src = Op.getOperand(i); if (Src.isUndef()) { UndefSrcElts.setBit(i); continue; } auto *Cst = cast(Src); APInt RawBits = Cst->getValueAPF().bitcastToAPInt(); SrcEltBits[i] = RawBits.zextOrTrunc(SrcEltSizeInBits); } return CastBitData(UndefSrcElts, SrcEltBits); } // Extract constant bits from constant pool vector. if (auto *Cst = getTargetConstantFromNode(Op)) { Type *CstTy = Cst->getType(); unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits(); if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0) return false; unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits(); unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; APInt UndefSrcElts(NumSrcElts, 0); SmallVector SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0)); for (unsigned i = 0; i != NumSrcElts; ++i) if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i], UndefSrcElts, i)) return false; return CastBitData(UndefSrcElts, SrcEltBits); } // Extract constant bits from a broadcasted constant pool scalar. if (Op.getOpcode() == X86ISD::VBROADCAST && EltSizeInBits <= VT.getScalarSizeInBits()) { if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) { unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits(); unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; APInt UndefSrcElts(NumSrcElts, 0); SmallVector SrcEltBits(1, APInt(SrcEltSizeInBits, 0)); if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) { if (UndefSrcElts[0]) UndefSrcElts.setBits(0, NumSrcElts); SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]); return CastBitData(UndefSrcElts, SrcEltBits); } } } if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD && EltSizeInBits <= VT.getScalarSizeInBits()) { auto *MemIntr = cast(Op); if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits()) return false; SDValue Ptr = MemIntr->getBasePtr(); if (Ptr->getOpcode() == X86ISD::Wrapper || Ptr->getOpcode() == X86ISD::WrapperRIP) Ptr = Ptr->getOperand(0); auto *CNode = dyn_cast(Ptr); if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0) return false; if (const Constant *C = CNode->getConstVal()) { unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits(); unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; APInt UndefSrcElts(NumSrcElts, 0); SmallVector SrcEltBits(1, APInt(SrcEltSizeInBits, 0)); if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) { if (UndefSrcElts[0]) UndefSrcElts.setBits(0, NumSrcElts); SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]); return CastBitData(UndefSrcElts, SrcEltBits); } } } // Extract constant bits from a subvector broadcast. if (Op.getOpcode() == X86ISD::SUBV_BROADCAST) { SmallVector SubEltBits; if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts, SubEltBits, AllowWholeUndefs, AllowPartialUndefs)) { UndefElts = APInt::getSplat(NumElts, UndefElts); while (EltBits.size() < NumElts) EltBits.append(SubEltBits.begin(), SubEltBits.end()); return true; } } // Extract a rematerialized scalar constant insertion. if (Op.getOpcode() == X86ISD::VZEXT_MOVL && Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && isa(Op.getOperand(0).getOperand(0))) { unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; APInt UndefSrcElts(NumSrcElts, 0); SmallVector SrcEltBits; auto *CN = cast(Op.getOperand(0).getOperand(0)); SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits)); SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0)); return CastBitData(UndefSrcElts, SrcEltBits); } // Insert constant bits from a base and sub vector sources. if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && isa(Op.getOperand(2))) { // TODO - support insert_subvector through bitcasts. if (EltSizeInBits != VT.getScalarSizeInBits()) return false; APInt UndefSubElts; SmallVector EltSubBits; if (getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits, UndefSubElts, EltSubBits, AllowWholeUndefs, AllowPartialUndefs) && getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts, EltBits, AllowWholeUndefs, AllowPartialUndefs)) { unsigned BaseIdx = Op.getConstantOperandVal(2); UndefElts.insertBits(UndefSubElts, BaseIdx); for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i) EltBits[BaseIdx + i] = EltSubBits[i]; return true; } } // Extract constant bits from a subvector's source. if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR && isa(Op.getOperand(1))) { // TODO - support extract_subvector through bitcasts. if (EltSizeInBits != VT.getScalarSizeInBits()) return false; if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts, EltBits, AllowWholeUndefs, AllowPartialUndefs)) { EVT SrcVT = Op.getOperand(0).getValueType(); unsigned NumSrcElts = SrcVT.getVectorNumElements(); unsigned NumSubElts = VT.getVectorNumElements(); unsigned BaseIdx = Op.getConstantOperandVal(1); UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx); if ((BaseIdx + NumSubElts) != NumSrcElts) EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end()); if (BaseIdx != 0) EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx); return true; } } // Extract constant bits from shuffle node sources. if (auto *SVN = dyn_cast(Op)) { // TODO - support shuffle through bitcasts. if (EltSizeInBits != VT.getScalarSizeInBits()) return false; ArrayRef Mask = SVN->getMask(); if ((!AllowWholeUndefs || !AllowPartialUndefs) && llvm::any_of(Mask, [](int M) { return M < 0; })) return false; APInt UndefElts0, UndefElts1; SmallVector EltBits0, EltBits1; if (isAnyInRange(Mask, 0, NumElts) && !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts0, EltBits0, AllowWholeUndefs, AllowPartialUndefs)) return false; if (isAnyInRange(Mask, NumElts, 2 * NumElts) && !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits, UndefElts1, EltBits1, AllowWholeUndefs, AllowPartialUndefs)) return false; UndefElts = APInt::getNullValue(NumElts); for (int i = 0; i != (int)NumElts; ++i) { int M = Mask[i]; if (M < 0) { UndefElts.setBit(i); EltBits.push_back(APInt::getNullValue(EltSizeInBits)); } else if (M < (int)NumElts) { if (UndefElts0[M]) UndefElts.setBit(i); EltBits.push_back(EltBits0[M]); } else { if (UndefElts1[M - NumElts]) UndefElts.setBit(i); EltBits.push_back(EltBits1[M - NumElts]); } } return true; } return false; } namespace llvm { namespace X86 { bool isConstantSplat(SDValue Op, APInt &SplatVal) { APInt UndefElts; SmallVector EltBits; if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits, true, false)) { int SplatIndex = -1; for (int i = 0, e = EltBits.size(); i != e; ++i) { if (UndefElts[i]) continue; if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) { SplatIndex = -1; break; } SplatIndex = i; } if (0 <= SplatIndex) { SplatVal = EltBits[SplatIndex]; return true; } } return false; } } // namespace X86 } // namespace llvm static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl &RawMask, APInt &UndefElts) { // Extract the raw target constant bits. SmallVector EltBits; if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts, EltBits, /* AllowWholeUndefs */ true, /* AllowPartialUndefs */ false)) return false; // Insert the extracted elements into the mask. for (APInt Elt : EltBits) RawMask.push_back(Elt.getZExtValue()); return true; } /// Create a shuffle mask that matches the PACKSS/PACKUS truncation. /// Note: This ignores saturation, so inputs must be checked first. static void createPackShuffleMask(MVT VT, SmallVectorImpl &Mask, bool Unary) { assert(Mask.empty() && "Expected an empty shuffle mask vector"); unsigned NumElts = VT.getVectorNumElements(); unsigned NumLanes = VT.getSizeInBits() / 128; unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits(); unsigned Offset = Unary ? 0 : NumElts; for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2) Mask.push_back(Elt + (Lane * NumEltsPerLane)); for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2) Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset); } } // Split the demanded elts of a PACKSS/PACKUS node between its operands. static void getPackDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS) { int NumLanes = VT.getSizeInBits() / 128; int NumElts = DemandedElts.getBitWidth(); int NumInnerElts = NumElts / 2; int NumEltsPerLane = NumElts / NumLanes; int NumInnerEltsPerLane = NumInnerElts / NumLanes; DemandedLHS = APInt::getNullValue(NumInnerElts); DemandedRHS = APInt::getNullValue(NumInnerElts); // Map DemandedElts to the packed operands. for (int Lane = 0; Lane != NumLanes; ++Lane) { for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) { int OuterIdx = (Lane * NumEltsPerLane) + Elt; int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt; if (DemandedElts[OuterIdx]) DemandedLHS.setBit(InnerIdx); if (DemandedElts[OuterIdx + NumInnerEltsPerLane]) DemandedRHS.setBit(InnerIdx); } } } // Split the demanded elts of a HADD/HSUB node between its operands. static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS) { int NumLanes = VT.getSizeInBits() / 128; int NumElts = DemandedElts.getBitWidth(); int NumEltsPerLane = NumElts / NumLanes; int HalfEltsPerLane = NumEltsPerLane / 2; DemandedLHS = APInt::getNullValue(NumElts); DemandedRHS = APInt::getNullValue(NumElts); // Map DemandedElts to the horizontal operands. for (int Idx = 0; Idx != NumElts; ++Idx) { if (!DemandedElts[Idx]) continue; int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane; int LocalIdx = Idx % NumEltsPerLane; if (LocalIdx < HalfEltsPerLane) { DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0); DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1); } else { LocalIdx -= HalfEltsPerLane; DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0); DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1); } } } /// Calculates the shuffle mask corresponding to the target-specific opcode. /// If the mask could be calculated, returns it in \p Mask, returns the shuffle /// operands in \p Ops, and returns true. /// Sets \p IsUnary to true if only one source is used. Note that this will set /// IsUnary for shuffles which use a single input multiple times, and in those /// cases it will adjust the mask to only have indices within that single input. /// It is an error to call this with non-empty Mask/Ops vectors. static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, SmallVectorImpl &Ops, SmallVectorImpl &Mask, bool &IsUnary) { unsigned NumElems = VT.getVectorNumElements(); unsigned MaskEltSize = VT.getScalarSizeInBits(); SmallVector RawMask; APInt RawUndefs; SDValue ImmN; assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector"); assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector"); IsUnary = false; bool IsFakeUnary = false; switch (N->getOpcode()) { case X86ISD::BLENDI: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodeBLENDMask(NumElems, cast(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::SHUFP: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodeSHUFPMask(NumElems, MaskEltSize, cast(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::INSERTPS: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodeINSERTPSMask(cast(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::EXTRQI: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); if (isa(N->getOperand(1)) && isa(N->getOperand(2))) { int BitLen = N->getConstantOperandVal(1); int BitIdx = N->getConstantOperandVal(2); DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask); IsUnary = true; } break; case X86ISD::INSERTQI: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); if (isa(N->getOperand(2)) && isa(N->getOperand(3))) { int BitLen = N->getConstantOperandVal(2); int BitIdx = N->getConstantOperandVal(3); DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); } break; case X86ISD::UNPCKH: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeUNPCKHMask(NumElems, MaskEltSize, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::UNPCKL: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeUNPCKLMask(NumElems, MaskEltSize, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVHLPS: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeMOVHLPSMask(NumElems, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVLHPS: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeMOVLHPSMask(NumElems, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::PALIGNR: assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodePALIGNRMask(NumElems, cast(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); Ops.push_back(N->getOperand(1)); Ops.push_back(N->getOperand(0)); break; case X86ISD::VSHLDQ: assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodePSLLDQMask(NumElems, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::VSRLDQ: assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodePSRLDQMask(NumElems, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::PSHUFD: case X86ISD::VPERMILPI: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodePSHUFMask(NumElems, MaskEltSize, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::PSHUFHW: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodePSHUFHWMask(NumElems, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::PSHUFLW: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodePSHUFLWMask(NumElems, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::VZEXT_MOVL: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); DecodeZeroMoveLowMask(NumElems, Mask); IsUnary = true; break; case X86ISD::VBROADCAST: { SDValue N0 = N->getOperand(0); // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so, // add the pre-extracted value to the Ops vector. if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR && N0.getOperand(0).getValueType() == VT && N0.getConstantOperandVal(1) == 0) Ops.push_back(N0.getOperand(0)); // We only decode broadcasts of same-sized vectors, unless the broadcast // came from an extract from the original width. If we found one, we // pushed it the Ops vector above. if (N0.getValueType() == VT || !Ops.empty()) { DecodeVectorBroadcast(NumElems, Mask); IsUnary = true; break; } return false; } case X86ISD::VPERMILPV: { assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); IsUnary = true; SDValue MaskNode = N->getOperand(1); if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask, RawUndefs)) { DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask); break; } return false; } case X86ISD::PSHUFB: { assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); IsUnary = true; SDValue MaskNode = N->getOperand(1); if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) { DecodePSHUFBMask(RawMask, RawUndefs, Mask); break; } return false; } case X86ISD::VPERMI: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodeVPERMMask(NumElems, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::MOVSS: case X86ISD::MOVSD: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask); break; case X86ISD::VPERM2X128: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodeVPERM2X128Mask(NumElems, cast(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::SHUF128: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, cast(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVSLDUP: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); DecodeMOVSLDUPMask(NumElems, Mask); IsUnary = true; break; case X86ISD::MOVSHDUP: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); DecodeMOVSHDUPMask(NumElems, Mask); IsUnary = true; break; case X86ISD::MOVDDUP: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); DecodeMOVDDUPMask(NumElems, Mask); IsUnary = true; break; case X86ISD::VPERMIL2: { assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); SDValue MaskNode = N->getOperand(2); SDValue CtrlNode = N->getOperand(3); if (ConstantSDNode *CtrlOp = dyn_cast(CtrlNode)) { unsigned CtrlImm = CtrlOp->getZExtValue(); if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask, RawUndefs)) { DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs, Mask); break; } } return false; } case X86ISD::VPPERM: { assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); SDValue MaskNode = N->getOperand(2); if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) { DecodeVPPERMMask(RawMask, RawUndefs, Mask); break; } return false; } case X86ISD::VPERMV: { assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); IsUnary = true; // Unlike most shuffle nodes, VPERMV's mask operand is operand 0. Ops.push_back(N->getOperand(1)); SDValue MaskNode = N->getOperand(0); if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask, RawUndefs)) { DecodeVPERMVMask(RawMask, RawUndefs, Mask); break; } return false; } case X86ISD::VPERMV3: { assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(2).getValueType() == VT && "Unexpected value type"); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2); // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one. Ops.push_back(N->getOperand(0)); Ops.push_back(N->getOperand(2)); SDValue MaskNode = N->getOperand(1); if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask, RawUndefs)) { DecodeVPERMV3Mask(RawMask, RawUndefs, Mask); break; } return false; } default: llvm_unreachable("unknown target shuffle node"); } // Empty mask indicates the decode failed. if (Mask.empty()) return false; // Check if we're getting a shuffle mask with zero'd elements. if (!AllowSentinelZero) if (any_of(Mask, [](int M) { return M == SM_SentinelZero; })) return false; // If we have a fake unary shuffle, the shuffle mask is spread across two // inputs that are actually the same node. Re-map the mask to always point // into the first input. if (IsFakeUnary) for (int &M : Mask) if (M >= (int)Mask.size()) M -= Mask.size(); // If we didn't already add operands in the opcode-specific code, default to // adding 1 or 2 operands starting at 0. if (Ops.empty()) { Ops.push_back(N->getOperand(0)); if (!IsUnary || IsFakeUnary) Ops.push_back(N->getOperand(1)); } return true; } /// Compute whether each element of a shuffle is zeroable. /// /// A "zeroable" vector shuffle element is one which can be lowered to zero. /// Either it is an undef element in the shuffle mask, the element of the input /// referenced is undef, or the element of the input referenced is known to be /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle /// as many lanes with this technique as possible to simplify the remaining /// shuffle. static void computeZeroableShuffleElements(ArrayRef Mask, SDValue V1, SDValue V2, APInt &KnownUndef, APInt &KnownZero) { int Size = Mask.size(); KnownUndef = KnownZero = APInt::getNullValue(Size); V1 = peekThroughBitcasts(V1); V2 = peekThroughBitcasts(V2); bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); int VectorSizeInBits = V1.getValueSizeInBits(); int ScalarSizeInBits = VectorSizeInBits / Size; assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size"); for (int i = 0; i < Size; ++i) { int M = Mask[i]; // Handle the easy cases. if (M < 0) { KnownUndef.setBit(i); continue; } if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { KnownZero.setBit(i); continue; } // Determine shuffle input and normalize the mask. SDValue V = M < Size ? V1 : V2; M %= Size; // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements. if (V.getOpcode() != ISD::BUILD_VECTOR) continue; // If the BUILD_VECTOR has fewer elements then the bitcasted portion of // the (larger) source element must be UNDEF/ZERO. if ((Size % V.getNumOperands()) == 0) { int Scale = Size / V->getNumOperands(); SDValue Op = V.getOperand(M / Scale); if (Op.isUndef()) KnownUndef.setBit(i); if (X86::isZeroNode(Op)) KnownZero.setBit(i); else if (ConstantSDNode *Cst = dyn_cast(Op)) { APInt Val = Cst->getAPIntValue(); Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits); if (Val == 0) KnownZero.setBit(i); } else if (ConstantFPSDNode *Cst = dyn_cast(Op)) { APInt Val = Cst->getValueAPF().bitcastToAPInt(); Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits); if (Val == 0) KnownZero.setBit(i); } continue; } // If the BUILD_VECTOR has more elements then all the (smaller) source // elements must be UNDEF or ZERO. if ((V.getNumOperands() % Size) == 0) { int Scale = V->getNumOperands() / Size; bool AllUndef = true; bool AllZero = true; for (int j = 0; j < Scale; ++j) { SDValue Op = V.getOperand((M * Scale) + j); AllUndef &= Op.isUndef(); AllZero &= X86::isZeroNode(Op); } if (AllUndef) KnownUndef.setBit(i); if (AllZero) KnownZero.setBit(i); continue; } } } /// Decode a target shuffle mask and inputs and see if any values are /// known to be undef or zero from their inputs. /// Returns true if the target shuffle mask was decoded. /// FIXME: Merge this with computeZeroableShuffleElements? static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl &Mask, SmallVectorImpl &Ops, APInt &KnownUndef, APInt &KnownZero) { bool IsUnary; if (!isTargetShuffle(N.getOpcode())) return false; MVT VT = N.getSimpleValueType(); if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary)) return false; int Size = Mask.size(); SDValue V1 = Ops[0]; SDValue V2 = IsUnary ? V1 : Ops[1]; KnownUndef = KnownZero = APInt::getNullValue(Size); V1 = peekThroughBitcasts(V1); V2 = peekThroughBitcasts(V2); assert((VT.getSizeInBits() % Size) == 0 && "Illegal split of shuffle value type"); unsigned EltSizeInBits = VT.getSizeInBits() / Size; // Extract known constant input data. APInt UndefSrcElts[2]; SmallVector SrcEltBits[2]; bool IsSrcConstant[2] = { getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0], SrcEltBits[0], true, false), getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1], SrcEltBits[1], true, false)}; for (int i = 0; i < Size; ++i) { int M = Mask[i]; // Already decoded as SM_SentinelZero / SM_SentinelUndef. if (M < 0) { assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!"); if (SM_SentinelUndef == M) KnownUndef.setBit(i); if (SM_SentinelZero == M) KnownZero.setBit(i); continue; } // Determine shuffle input and normalize the mask. unsigned SrcIdx = M / Size; SDValue V = M < Size ? V1 : V2; M %= Size; // We are referencing an UNDEF input. if (V.isUndef()) { KnownUndef.setBit(i); continue; } // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF. // TODO: We currently only set UNDEF for integer types - floats use the same // registers as vectors and many of the scalar folded loads rely on the // SCALAR_TO_VECTOR pattern. if (V.getOpcode() == ISD::SCALAR_TO_VECTOR && (Size % V.getValueType().getVectorNumElements()) == 0) { int Scale = Size / V.getValueType().getVectorNumElements(); int Idx = M / Scale; if (Idx != 0 && !VT.isFloatingPoint()) KnownUndef.setBit(i); else if (Idx == 0 && X86::isZeroNode(V.getOperand(0))) KnownZero.setBit(i); continue; } // Attempt to extract from the source's constant bits. if (IsSrcConstant[SrcIdx]) { if (UndefSrcElts[SrcIdx][M]) KnownUndef.setBit(i); else if (SrcEltBits[SrcIdx][M] == 0) KnownZero.setBit(i); } } assert(VT.getVectorNumElements() == (unsigned)Size && "Different mask size from vector size!"); return true; } // Replace target shuffle mask elements with known undef/zero sentinels. static void resolveTargetShuffleFromZeroables(SmallVectorImpl &Mask, const APInt &KnownUndef, const APInt &KnownZero, bool ResolveKnownZeros= true) { unsigned NumElts = Mask.size(); assert(KnownUndef.getBitWidth() == NumElts && KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch"); for (unsigned i = 0; i != NumElts; ++i) { if (KnownUndef[i]) Mask[i] = SM_SentinelUndef; else if (ResolveKnownZeros && KnownZero[i]) Mask[i] = SM_SentinelZero; } } // Extract target shuffle mask sentinel elements to known undef/zero bitmasks. static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl &Mask, APInt &KnownUndef, APInt &KnownZero) { unsigned NumElts = Mask.size(); KnownUndef = KnownZero = APInt::getNullValue(NumElts); for (unsigned i = 0; i != NumElts; ++i) { int M = Mask[i]; if (SM_SentinelUndef == M) KnownUndef.setBit(i); if (SM_SentinelZero == M) KnownZero.setBit(i); } } // Forward declaration (for getFauxShuffleMask recursive check). // TODO: Use DemandedElts variant. static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl &Inputs, SmallVectorImpl &Mask, SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts); // Attempt to decode ops that could be represented as a shuffle mask. // The decoded shuffle mask may contain a different number of elements to the // destination value type. static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, SmallVectorImpl &Mask, SmallVectorImpl &Ops, SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts) { Mask.clear(); Ops.clear(); MVT VT = N.getSimpleValueType(); unsigned NumElts = VT.getVectorNumElements(); unsigned NumSizeInBits = VT.getSizeInBits(); unsigned NumBitsPerElt = VT.getScalarSizeInBits(); if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0) return false; assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size"); unsigned Opcode = N.getOpcode(); switch (Opcode) { case ISD::VECTOR_SHUFFLE: { // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here. ArrayRef ShuffleMask = cast(N)->getMask(); if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) { Mask.append(ShuffleMask.begin(), ShuffleMask.end()); Ops.push_back(N.getOperand(0)); Ops.push_back(N.getOperand(1)); return true; } return false; } case ISD::AND: case X86ISD::ANDNP: { // Attempt to decode as a per-byte mask. APInt UndefElts; SmallVector EltBits; SDValue N0 = N.getOperand(0); SDValue N1 = N.getOperand(1); bool IsAndN = (X86ISD::ANDNP == Opcode); uint64_t ZeroMask = IsAndN ? 255 : 0; if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits)) return false; for (int i = 0, e = (int)EltBits.size(); i != e; ++i) { if (UndefElts[i]) { Mask.push_back(SM_SentinelUndef); continue; } const APInt &ByteBits = EltBits[i]; if (ByteBits != 0 && ByteBits != 255) return false; Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i); } Ops.push_back(IsAndN ? N1 : N0); return true; } case ISD::OR: { // Inspect each operand at the byte level. We can merge these into a // blend shuffle mask if for each byte at least one is masked out (zero). KnownBits Known0 = DAG.computeKnownBits(N.getOperand(0), DemandedElts, Depth + 1); KnownBits Known1 = DAG.computeKnownBits(N.getOperand(1), DemandedElts, Depth + 1); if (Known0.One.isNullValue() && Known1.One.isNullValue()) { bool IsByteMask = true; unsigned NumSizeInBytes = NumSizeInBits / 8; unsigned NumBytesPerElt = NumBitsPerElt / 8; APInt ZeroMask = APInt::getNullValue(NumBytesPerElt); APInt SelectMask = APInt::getNullValue(NumBytesPerElt); for (unsigned i = 0; i != NumBytesPerElt && IsByteMask; ++i) { unsigned LHS = Known0.Zero.extractBits(8, i * 8).getZExtValue(); unsigned RHS = Known1.Zero.extractBits(8, i * 8).getZExtValue(); if (LHS == 255 && RHS == 0) SelectMask.setBit(i); else if (LHS == 255 && RHS == 255) ZeroMask.setBit(i); else if (!(LHS == 0 && RHS == 255)) IsByteMask = false; } if (IsByteMask) { for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt) { for (unsigned j = 0; j != NumBytesPerElt; ++j) { unsigned Ofs = (SelectMask[j] ? NumSizeInBytes : 0); int Idx = (ZeroMask[j] ? (int)SM_SentinelZero : (i + j + Ofs)); Mask.push_back(Idx); } } Ops.push_back(N.getOperand(0)); Ops.push_back(N.getOperand(1)); return true; } } // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other // is a valid shuffle index. SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0)); SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1)); if (!N0.getValueType().isVector() || !N1.getValueType().isVector()) return false; SmallVector SrcMask0, SrcMask1; SmallVector SrcInputs0, SrcInputs1; if (!getTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG, Depth + 1, true) || !getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1, true)) return false; size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size()); SmallVector Mask0, Mask1; scaleShuffleMask(MaskSize / SrcMask0.size(), SrcMask0, Mask0); scaleShuffleMask(MaskSize / SrcMask1.size(), SrcMask1, Mask1); for (size_t i = 0; i != MaskSize; ++i) { if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef) Mask.push_back(SM_SentinelUndef); else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero) Mask.push_back(SM_SentinelZero); else if (Mask1[i] == SM_SentinelZero) Mask.push_back(Mask0[i]); else if (Mask0[i] == SM_SentinelZero) Mask.push_back(Mask1[i] + (int)(MaskSize * SrcInputs0.size())); else return false; } Ops.append(SrcInputs0.begin(), SrcInputs0.end()); Ops.append(SrcInputs1.begin(), SrcInputs1.end()); return true; } case ISD::INSERT_SUBVECTOR: { SDValue Src = N.getOperand(0); SDValue Sub = N.getOperand(1); EVT SubVT = Sub.getValueType(); unsigned NumSubElts = SubVT.getVectorNumElements(); if (!isa(N.getOperand(2)) || !N->isOnlyUserOf(Sub.getNode())) return false; uint64_t InsertIdx = N.getConstantOperandVal(2); // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)). if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR && Sub.getOperand(0).getValueType() == VT && isa(Sub.getOperand(1))) { uint64_t ExtractIdx = Sub.getConstantOperandVal(1); for (int i = 0; i != (int)NumElts; ++i) Mask.push_back(i); for (int i = 0; i != (int)NumSubElts; ++i) Mask[InsertIdx + i] = NumElts + ExtractIdx + i; Ops.push_back(Src); Ops.push_back(Sub.getOperand(0)); return true; } // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)). SmallVector SubMask; SmallVector SubInputs; if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs, SubMask, DAG, Depth + 1, ResolveKnownElts)) return false; if (SubMask.size() != NumSubElts) { assert(((SubMask.size() % NumSubElts) == 0 || (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale"); if ((NumSubElts % SubMask.size()) == 0) { int Scale = NumSubElts / SubMask.size(); SmallVector ScaledSubMask; scaleShuffleMask(Scale, SubMask, ScaledSubMask); SubMask = ScaledSubMask; } else { int Scale = SubMask.size() / NumSubElts; NumSubElts = SubMask.size(); NumElts *= Scale; InsertIdx *= Scale; } } Ops.push_back(Src); for (SDValue &SubInput : SubInputs) { EVT SubSVT = SubInput.getValueType().getScalarType(); EVT AltVT = EVT::getVectorVT(*DAG.getContext(), SubSVT, NumSizeInBits / SubSVT.getSizeInBits()); Ops.push_back(DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), AltVT, DAG.getUNDEF(AltVT), SubInput, DAG.getIntPtrConstant(0, SDLoc(N)))); } for (int i = 0; i != (int)NumElts; ++i) Mask.push_back(i); for (int i = 0; i != (int)NumSubElts; ++i) { int M = SubMask[i]; if (0 <= M) { int InputIdx = M / NumSubElts; M = (NumElts * (1 + InputIdx)) + (M % NumSubElts); } Mask[i + InsertIdx] = M; } return true; } case ISD::SCALAR_TO_VECTOR: { // Match against a scalar_to_vector of an extract from a vector, // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar. SDValue N0 = N.getOperand(0); SDValue SrcExtract; if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && N0.getOperand(0).getValueType() == VT) || (N0.getOpcode() == X86ISD::PEXTRW && N0.getOperand(0).getValueType() == MVT::v8i16) || (N0.getOpcode() == X86ISD::PEXTRB && N0.getOperand(0).getValueType() == MVT::v16i8)) { SrcExtract = N0; } if (!SrcExtract || !isa(SrcExtract.getOperand(1))) return false; SDValue SrcVec = SrcExtract.getOperand(0); EVT SrcVT = SrcVec.getValueType(); unsigned NumSrcElts = SrcVT.getVectorNumElements(); unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1; unsigned SrcIdx = SrcExtract.getConstantOperandVal(1); if (NumSrcElts <= SrcIdx) return false; Ops.push_back(SrcVec); Mask.push_back(SrcIdx); Mask.append(NumZeros, SM_SentinelZero); Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef); return true; } case X86ISD::PINSRB: case X86ISD::PINSRW: { SDValue InVec = N.getOperand(0); SDValue InScl = N.getOperand(1); SDValue InIndex = N.getOperand(2); if (!isa(InIndex) || cast(InIndex)->getAPIntValue().uge(NumElts)) return false; uint64_t InIdx = N.getConstantOperandVal(2); // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern. if (X86::isZeroNode(InScl)) { Ops.push_back(InVec); for (unsigned i = 0; i != NumElts; ++i) Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i); return true; } // Attempt to recognise a PINSR*(PEXTR*) shuffle pattern. // TODO: Expand this to support INSERT_VECTOR_ELT/etc. unsigned ExOp = (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW); if (InScl.getOpcode() != ExOp) return false; SDValue ExVec = InScl.getOperand(0); SDValue ExIndex = InScl.getOperand(1); if (!isa(ExIndex) || cast(ExIndex)->getAPIntValue().uge(NumElts)) return false; uint64_t ExIdx = InScl.getConstantOperandVal(1); Ops.push_back(InVec); Ops.push_back(ExVec); for (unsigned i = 0; i != NumElts; ++i) Mask.push_back(i == InIdx ? NumElts + ExIdx : i); return true; } case X86ISD::PACKSS: case X86ISD::PACKUS: { SDValue N0 = N.getOperand(0); SDValue N1 = N.getOperand(1); assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) && N1.getValueType().getVectorNumElements() == (NumElts / 2) && "Unexpected input value type"); APInt EltsLHS, EltsRHS; getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS); // If we know input saturation won't happen we can treat this // as a truncation shuffle. if (Opcode == X86ISD::PACKSS) { if ((!N0.isUndef() && DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) || (!N1.isUndef() && DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt)) return false; } else { APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt); if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) || (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1))) return false; } bool IsUnary = (N0 == N1); Ops.push_back(N0); if (!IsUnary) Ops.push_back(N1); createPackShuffleMask(VT, Mask, IsUnary); return true; } case X86ISD::VSHLI: case X86ISD::VSRLI: { uint64_t ShiftVal = N.getConstantOperandVal(1); // Out of range bit shifts are guaranteed to be zero. if (NumBitsPerElt <= ShiftVal) { Mask.append(NumElts, SM_SentinelZero); return true; } // We can only decode 'whole byte' bit shifts as shuffles. if ((ShiftVal % 8) != 0) break; uint64_t ByteShift = ShiftVal / 8; unsigned NumBytes = NumSizeInBits / 8; unsigned NumBytesPerElt = NumBitsPerElt / 8; Ops.push_back(N.getOperand(0)); // Clear mask to all zeros and insert the shifted byte indices. Mask.append(NumBytes, SM_SentinelZero); if (X86ISD::VSHLI == Opcode) { for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt) for (unsigned j = ByteShift; j != NumBytesPerElt; ++j) Mask[i + j] = i + j - ByteShift; } else { for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt) for (unsigned j = ByteShift; j != NumBytesPerElt; ++j) Mask[i + j - ByteShift] = i + j; } return true; } case X86ISD::VBROADCAST: { SDValue Src = N.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); if (!SrcVT.isVector()) return false; if (NumSizeInBits != SrcVT.getSizeInBits()) { assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 && "Illegal broadcast type"); SrcVT = MVT::getVectorVT(SrcVT.getScalarType(), NumSizeInBits / SrcVT.getScalarSizeInBits()); Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT, DAG.getUNDEF(SrcVT), Src, DAG.getIntPtrConstant(0, SDLoc(N))); } Ops.push_back(Src); Mask.append(NumElts, 0); return true; } case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND_VECTOR_INREG: case ISD::ANY_EXTEND_VECTOR_INREG: { SDValue Src = N.getOperand(0); EVT SrcVT = Src.getValueType(); // Extended source must be a simple vector. if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 || (SrcVT.getScalarSizeInBits() % 8) != 0) return false; unsigned NumSrcBitsPerElt = SrcVT.getScalarSizeInBits(); bool IsAnyExtend = (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode); DecodeZeroExtendMask(NumSrcBitsPerElt, NumBitsPerElt, NumElts, IsAnyExtend, Mask); if (NumSizeInBits != SrcVT.getSizeInBits()) { assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 && "Illegal zero-extension type"); SrcVT = MVT::getVectorVT(SrcVT.getSimpleVT().getScalarType(), NumSizeInBits / NumSrcBitsPerElt); Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT, DAG.getUNDEF(SrcVT), Src, DAG.getIntPtrConstant(0, SDLoc(N))); } Ops.push_back(Src); return true; } } return false; } /// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask. static void resolveTargetShuffleInputsAndMask(SmallVectorImpl &Inputs, SmallVectorImpl &Mask) { int MaskWidth = Mask.size(); SmallVector UsedInputs; for (int i = 0, e = Inputs.size(); i < e; ++i) { int lo = UsedInputs.size() * MaskWidth; int hi = lo + MaskWidth; // Strip UNDEF input usage. if (Inputs[i].isUndef()) for (int &M : Mask) if ((lo <= M) && (M < hi)) M = SM_SentinelUndef; // Check for unused inputs. if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) { for (int &M : Mask) if (lo <= M) M -= MaskWidth; continue; } // Check for repeated inputs. bool IsRepeat = false; for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) { if (UsedInputs[j] != Inputs[i]) continue; for (int &M : Mask) if (lo <= M) M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth); IsRepeat = true; break; } if (IsRepeat) continue; UsedInputs.push_back(Inputs[i]); } Inputs = UsedInputs; } /// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs /// and then sets the SM_SentinelUndef and SM_SentinelZero values. /// Returns true if the target shuffle mask was decoded. static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, SmallVectorImpl &Inputs, SmallVectorImpl &Mask, APInt &KnownUndef, APInt &KnownZero, SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts) { EVT VT = Op.getValueType(); if (!VT.isSimple() || !VT.isVector()) return false; if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) { if (ResolveKnownElts) resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero); return true; } if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth, ResolveKnownElts)) { resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero); return true; } return false; } static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl &Inputs, SmallVectorImpl &Mask, SelectionDAG &DAG, unsigned Depth = 0, bool ResolveKnownElts = true) { EVT VT = Op.getValueType(); if (!VT.isSimple() || !VT.isVector()) return false; APInt KnownUndef, KnownZero; unsigned NumElts = Op.getValueType().getVectorNumElements(); APInt DemandedElts = APInt::getAllOnesValue(NumElts); return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef, KnownZero, DAG, Depth, ResolveKnownElts); } /// Returns the scalar element that will make up the ith /// element of the result of the vector shuffle. static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, unsigned Depth) { if (Depth == 6) return SDValue(); // Limit search depth. SDValue V = SDValue(N, 0); EVT VT = V.getValueType(); unsigned Opcode = V.getOpcode(); // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. if (const ShuffleVectorSDNode *SV = dyn_cast(N)) { int Elt = SV->getMaskElt(Index); if (Elt < 0) return DAG.getUNDEF(VT.getVectorElementType()); unsigned NumElems = VT.getVectorNumElements(); SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1); return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); } // Recurse into target specific vector shuffles to find scalars. if (isTargetShuffle(Opcode)) { MVT ShufVT = V.getSimpleValueType(); MVT ShufSVT = ShufVT.getVectorElementType(); int NumElems = (int)ShufVT.getVectorNumElements(); SmallVector ShuffleMask; SmallVector ShuffleOps; bool IsUnary; if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary)) return SDValue(); int Elt = ShuffleMask[Index]; if (Elt == SM_SentinelZero) return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT) : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT); if (Elt == SM_SentinelUndef) return DAG.getUNDEF(ShufSVT); assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range"); SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1]; return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); } // Recurse into insert_subvector base/sub vector to find scalars. if (Opcode == ISD::INSERT_SUBVECTOR && isa(N->getOperand(2))) { SDValue Vec = N->getOperand(0); SDValue Sub = N->getOperand(1); EVT SubVT = Sub.getValueType(); unsigned NumSubElts = SubVT.getVectorNumElements(); uint64_t SubIdx = N->getConstantOperandVal(2); if (SubIdx <= Index && Index < (SubIdx + NumSubElts)) return getShuffleScalarElt(Sub.getNode(), Index - SubIdx, DAG, Depth + 1); return getShuffleScalarElt(Vec.getNode(), Index, DAG, Depth + 1); } // Recurse into extract_subvector src vector to find scalars. if (Opcode == ISD::EXTRACT_SUBVECTOR && isa(N->getOperand(1))) { SDValue Src = N->getOperand(0); uint64_t SrcIdx = N->getConstantOperandVal(1); return getShuffleScalarElt(Src.getNode(), Index + SrcIdx, DAG, Depth + 1); } // Actual nodes that may contain scalar elements if (Opcode == ISD::BITCAST) { V = V.getOperand(0); EVT SrcVT = V.getValueType(); unsigned NumElems = VT.getVectorNumElements(); if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems) return SDValue(); } if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) return (Index == 0) ? V.getOperand(0) : DAG.getUNDEF(VT.getVectorElementType()); if (V.getOpcode() == ISD::BUILD_VECTOR) return V.getOperand(Index); return SDValue(); } // Use PINSRB/PINSRW/PINSRD to create a build vector. static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); unsigned NumElts = VT.getVectorNumElements(); assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) || ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) && "Illegal vector insertion"); SDLoc dl(Op); SDValue V; bool First = true; for (unsigned i = 0; i < NumElts; ++i) { bool IsNonZero = (NonZeros & (1 << i)) != 0; if (!IsNonZero) continue; // If the build vector contains zeros or our first insertion is not the // first index then insert into zero vector to break any register // dependency else use SCALAR_TO_VECTOR. if (First) { First = false; if (NumZero || 0 != i) V = getZeroVector(VT, Subtarget, DAG, dl); else { assert(0 == i && "Expected insertion into zero-index"); V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32); V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V); V = DAG.getBitcast(VT, V); continue; } } V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); } return V; } /// Custom lower build_vector of v16i8. static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (NumNonZero > 8 && !Subtarget.hasSSE41()) return SDValue(); // SSE4.1 - use PINSRB to insert each byte directly. if (Subtarget.hasSSE41()) return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG, Subtarget); SDLoc dl(Op); SDValue V; // Pre-SSE4.1 - merge byte pairs and insert with PINSRW. for (unsigned i = 0; i < 16; i += 2) { bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; bool NextIsNonZero = (NonZeros & (1 << (i + 1))) != 0; if (!ThisIsNonZero && !NextIsNonZero) continue; // FIXME: Investigate combining the first 4 bytes as a i32 instead. SDValue Elt; if (ThisIsNonZero) { if (NumZero || NextIsNonZero) Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32); else Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32); } if (NextIsNonZero) { SDValue NextElt = Op.getOperand(i + 1); if (i == 0 && NumZero) NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32); else NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32); NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt, DAG.getConstant(8, dl, MVT::i8)); if (ThisIsNonZero) Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt); else Elt = NextElt; } // If our first insertion is not the first index then insert into zero // vector to break any register dependency else use SCALAR_TO_VECTOR. if (!V) { if (i != 0) V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); else { V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt); V = DAG.getBitcast(MVT::v8i16, V); continue; } } Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt); V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt, DAG.getIntPtrConstant(i / 2, dl)); } return DAG.getBitcast(MVT::v16i8, V); } /// Custom lower build_vector of v8i16. static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (NumNonZero > 4 && !Subtarget.hasSSE41()) return SDValue(); // Use PINSRW to insert each byte directly. return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG, Subtarget); } /// Custom lower build_vector of v4i32 or v4f32. static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // If this is a splat of a pair of elements, use MOVDDUP (unless the target // has XOP; in that case defer lowering to potentially use VPERMIL2PS). // Because we're creating a less complicated build vector here, we may enable // further folding of the MOVDDUP via shuffle transforms. if (Subtarget.hasSSE3() && !Subtarget.hasXOP() && Op.getOperand(0) == Op.getOperand(2) && Op.getOperand(1) == Op.getOperand(3) && Op.getOperand(0) != Op.getOperand(1)) { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); // Create a new build vector with the first 2 elements followed by undef // padding, bitcast to v2f64, duplicate, and bitcast back. SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1), DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) }; SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops)); SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV); return DAG.getBitcast(VT, Dup); } // Find all zeroable elements. std::bitset<4> Zeroable, Undefs; for (int i = 0; i < 4; ++i) { SDValue Elt = Op.getOperand(i); Undefs[i] = Elt.isUndef(); Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt)); } assert(Zeroable.size() - Zeroable.count() > 1 && "We expect at least two non-zero elements!"); // We only know how to deal with build_vector nodes where elements are either // zeroable or extract_vector_elt with constant index. SDValue FirstNonZero; unsigned FirstNonZeroIdx; for (unsigned i = 0; i < 4; ++i) { if (Zeroable[i]) continue; SDValue Elt = Op.getOperand(i); if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa(Elt.getOperand(1))) return SDValue(); // Make sure that this node is extracting from a 128-bit vector. MVT VT = Elt.getOperand(0).getSimpleValueType(); if (!VT.is128BitVector()) return SDValue(); if (!FirstNonZero.getNode()) { FirstNonZero = Elt; FirstNonZeroIdx = i; } } assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!"); SDValue V1 = FirstNonZero.getOperand(0); MVT VT = V1.getSimpleValueType(); // See if this build_vector can be lowered as a blend with zero. SDValue Elt; unsigned EltMaskIdx, EltIdx; int Mask[4]; for (EltIdx = 0; EltIdx < 4; ++EltIdx) { if (Zeroable[EltIdx]) { // The zero vector will be on the right hand side. Mask[EltIdx] = EltIdx+4; continue; } Elt = Op->getOperand(EltIdx); // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index. EltMaskIdx = Elt.getConstantOperandVal(1); if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx) break; Mask[EltIdx] = EltIdx; } if (EltIdx == 4) { // Let the shuffle legalizer deal with blend operations. SDValue VZeroOrUndef = (Zeroable == Undefs) ? DAG.getUNDEF(VT) : getZeroVector(VT, Subtarget, DAG, SDLoc(Op)); if (V1.getSimpleValueType() != VT) V1 = DAG.getBitcast(VT, V1); return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask); } // See if we can lower this build_vector to a INSERTPS. if (!Subtarget.hasSSE41()) return SDValue(); SDValue V2 = Elt.getOperand(0); if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx) V1 = SDValue(); bool CanFold = true; for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) { if (Zeroable[i]) continue; SDValue Current = Op->getOperand(i); SDValue SrcVector = Current->getOperand(0); if (!V1.getNode()) V1 = SrcVector; CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i); } if (!CanFold) return SDValue(); assert(V1.getNode() && "Expected at least two non-zero elements!"); if (V1.getSimpleValueType() != MVT::v4f32) V1 = DAG.getBitcast(MVT::v4f32, V1); if (V2.getSimpleValueType() != MVT::v4f32) V2 = DAG.getBitcast(MVT::v4f32, V2); // Ok, we can emit an INSERTPS instruction. unsigned ZMask = Zeroable.to_ulong(); unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask; assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); SDLoc DL(Op); SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, DAG.getIntPtrConstant(InsertPSMask, DL, true)); return DAG.getBitcast(VT, Result); } /// Return a vector logical shift node. static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, const SDLoc &dl) { assert(VT.is128BitVector() && "Unknown type for VShift"); MVT ShVT = MVT::v16i8; unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; SrcOp = DAG.getBitcast(ShVT, SrcOp); assert(NumBits % 8 == 0 && "Only support byte sized shifts"); SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8); return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal)); } static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, SelectionDAG &DAG) { // Check if the scalar load can be widened into a vector load. And if // the address is "base + cst" see if the cst can be "absorbed" into // the shuffle mask. if (LoadSDNode *LD = dyn_cast(SrcOp)) { SDValue Ptr = LD->getBasePtr(); if (!ISD::isNormalLoad(LD) || !LD->isSimple()) return SDValue(); EVT PVT = LD->getValueType(0); if (PVT != MVT::i32 && PVT != MVT::f32) return SDValue(); int FI = -1; int64_t Offset = 0; if (FrameIndexSDNode *FINode = dyn_cast(Ptr)) { FI = FINode->getIndex(); Offset = 0; } else if (DAG.isBaseWithConstantOffset(Ptr) && isa(Ptr.getOperand(0))) { FI = cast(Ptr.getOperand(0))->getIndex(); Offset = Ptr.getConstantOperandVal(1); Ptr = Ptr.getOperand(0); } else { return SDValue(); } // FIXME: 256-bit vector instructions don't require a strict alignment, // improve this code to support it better. unsigned RequiredAlign = VT.getSizeInBits()/8; SDValue Chain = LD->getChain(); // Make sure the stack object alignment is at least 16 or 32. MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) { if (MFI.isFixedObjectIndex(FI)) { // Can't change the alignment. FIXME: It's possible to compute // the exact stack offset and reference FI + adjust offset instead. // If someone *really* cares about this. That's the way to implement it. return SDValue(); } else { MFI.setObjectAlignment(FI, RequiredAlign); } } // (Offset % 16 or 32) must be multiple of 4. Then address is then // Ptr + (Offset & ~15). if (Offset < 0) return SDValue(); if ((Offset % RequiredAlign) & 3) return SDValue(); int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1); if (StartOffset) { SDLoc DL(Ptr); Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, DAG.getConstant(StartOffset, DL, Ptr.getValueType())); } int EltNo = (Offset - StartOffset) >> 2; unsigned NumElems = VT.getVectorNumElements(); EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo().getWithOffset(StartOffset)); SmallVector Mask(NumElems, EltNo); return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask); } return SDValue(); } // Recurse to find a LoadSDNode source and the accumulated ByteOffest. static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) { if (ISD::isNON_EXTLoad(Elt.getNode())) { auto *BaseLd = cast(Elt); if (!BaseLd->isSimple()) return false; Ld = BaseLd; ByteOffset = 0; return true; } switch (Elt.getOpcode()) { case ISD::BITCAST: case ISD::TRUNCATE: case ISD::SCALAR_TO_VECTOR: return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset); case ISD::SRL: if (isa(Elt.getOperand(1))) { uint64_t Idx = Elt.getConstantOperandVal(1); if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) { ByteOffset += Idx / 8; return true; } } break; case ISD::EXTRACT_VECTOR_ELT: if (isa(Elt.getOperand(1))) { SDValue Src = Elt.getOperand(0); unsigned SrcSizeInBits = Src.getScalarValueSizeInBits(); unsigned DstSizeInBits = Elt.getScalarValueSizeInBits(); if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 && findEltLoadSrc(Src, Ld, ByteOffset)) { uint64_t Idx = Elt.getConstantOperandVal(1); ByteOffset += Idx * (SrcSizeInBits / 8); return true; } } break; } return false; } /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the /// elements can be replaced by a single large load which has the same value as /// a build_vector or insert_subvector whose loaded operands are 'Elts'. /// /// Example: -> zextload a static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool isAfterLegalize) { if ((VT.getScalarSizeInBits() % 8) != 0) return SDValue(); unsigned NumElems = Elts.size(); int LastLoadedElt = -1; APInt LoadMask = APInt::getNullValue(NumElems); APInt ZeroMask = APInt::getNullValue(NumElems); APInt UndefMask = APInt::getNullValue(NumElems); SmallVector Loads(NumElems, nullptr); SmallVector ByteOffsets(NumElems, 0); // For each element in the initializer, see if we've found a load, zero or an // undef. for (unsigned i = 0; i < NumElems; ++i) { SDValue Elt = peekThroughBitcasts(Elts[i]); if (!Elt.getNode()) return SDValue(); if (Elt.isUndef()) { UndefMask.setBit(i); continue; } if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) { ZeroMask.setBit(i); continue; } // Each loaded element must be the correct fractional portion of the // requested vector load. unsigned EltSizeInBits = Elt.getValueSizeInBits(); if ((NumElems * EltSizeInBits) != VT.getSizeInBits()) return SDValue(); if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0) return SDValue(); unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0); if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits) return SDValue(); LoadMask.setBit(i); LastLoadedElt = i; } assert((ZeroMask.countPopulation() + UndefMask.countPopulation() + LoadMask.countPopulation()) == NumElems && "Incomplete element masks"); // Handle Special Cases - all undef or undef/zero. if (UndefMask.countPopulation() == NumElems) return DAG.getUNDEF(VT); // FIXME: Should we return this as a BUILD_VECTOR instead? if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems) return VT.isInteger() ? DAG.getConstant(0, DL, VT) : DAG.getConstantFP(0.0, DL, VT); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); int FirstLoadedElt = LoadMask.countTrailingZeros(); SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]); EVT EltBaseVT = EltBase.getValueType(); assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() && "Register/Memory size mismatch"); LoadSDNode *LDBase = Loads[FirstLoadedElt]; assert(LDBase && "Did not find base load for merging consecutive loads"); unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits(); unsigned BaseSizeInBytes = BaseSizeInBits / 8; int LoadSizeInBits = (1 + LastLoadedElt - FirstLoadedElt) * BaseSizeInBits; assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected"); // TODO: Support offsetting the base load. if (ByteOffsets[FirstLoadedElt] != 0) return SDValue(); // Check to see if the element's load is consecutive to the base load // or offset from a previous (already checked) load. auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) { LoadSDNode *Ld = Loads[EltIdx]; int64_t ByteOffset = ByteOffsets[EltIdx]; if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) { int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes); return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] && Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0); } return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes, EltIdx - FirstLoadedElt); }; // Consecutive loads can contain UNDEFS but not ZERO elements. // Consecutive loads with UNDEFs and ZEROs elements require a // an additional shuffle stage to clear the ZERO elements. bool IsConsecutiveLoad = true; bool IsConsecutiveLoadWithZeros = true; for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) { if (LoadMask[i]) { if (!CheckConsecutiveLoad(LDBase, i)) { IsConsecutiveLoad = false; IsConsecutiveLoadWithZeros = false; break; } } else if (ZeroMask[i]) { IsConsecutiveLoad = false; } } auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) { auto MMOFlags = LDBase->getMemOperand()->getFlags(); assert(LDBase->isSimple() && "Cannot merge volatile or atomic loads."); SDValue NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags); for (auto *LD : Loads) if (LD) DAG.makeEquivalentMemoryOrdering(LD, NewLd); return NewLd; }; // Check if the base load is entirely dereferenceable. bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable( VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout()); // LOAD - all consecutive load/undefs (must start/end with a load or be // entirely dereferenceable). If we have found an entire vector of loads and // undefs, then return a large load of the entire vector width starting at the // base pointer. If the vector contains zeros, then attempt to shuffle those // elements. if (FirstLoadedElt == 0 && (LastLoadedElt == (int)(NumElems - 1) || IsDereferenceable) && (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) { if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT)) return SDValue(); // Don't create 256-bit non-temporal aligned loads without AVX2 as these // will lower to regular temporal loads and use the cache. if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 && VT.is256BitVector() && !Subtarget.hasInt256()) return SDValue(); if (NumElems == 1) return DAG.getBitcast(VT, Elts[FirstLoadedElt]); if (!ZeroMask) return CreateLoad(VT, LDBase); // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded // vector and a zero vector to clear out the zero elements. if (!isAfterLegalize && VT.isVector()) { unsigned NumMaskElts = VT.getVectorNumElements(); if ((NumMaskElts % NumElems) == 0) { unsigned Scale = NumMaskElts / NumElems; SmallVector ClearMask(NumMaskElts, -1); for (unsigned i = 0; i < NumElems; ++i) { if (UndefMask[i]) continue; int Offset = ZeroMask[i] ? NumMaskElts : 0; for (unsigned j = 0; j != Scale; ++j) ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset; } SDValue V = CreateLoad(VT, LDBase); SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT) : DAG.getConstantFP(0.0, DL, VT); return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask); } } } // If the upper half of a ymm/zmm load is undef then just load the lower half. if (VT.is256BitVector() || VT.is512BitVector()) { unsigned HalfNumElems = NumElems / 2; if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) { EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems); SDValue HalfLD = EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL, DAG, Subtarget, isAfterLegalize); if (HalfLD) return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), HalfLD, DAG.getIntPtrConstant(0, DL)); } } // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs. if (IsConsecutiveLoad && FirstLoadedElt == 0 && (LoadSizeInBits == 32 || LoadSizeInBits == 64) && ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) { MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits) : MVT::getIntegerVT(LoadSizeInBits); MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits); if (TLI.isTypeLegal(VecVT)) { SDVTList Tys = DAG.getVTList(VecVT, MVT::Other); SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(), LDBase->getAlignment(), MachineMemOperand::MOLoad); for (auto *LD : Loads) if (LD) DAG.makeEquivalentMemoryOrdering(LD, ResNode); return DAG.getBitcast(VT, ResNode); } } // BROADCAST - match the smallest possible repetition pattern, load that // scalar/subvector element and then broadcast to the entire vector. if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() && (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) { for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) { unsigned RepeatSize = SubElems * BaseSizeInBits; unsigned ScalarSize = std::min(RepeatSize, 64u); if (!Subtarget.hasAVX2() && ScalarSize < 32) continue; bool Match = true; SmallVector RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT)); for (unsigned i = 0; i != NumElems && Match; ++i) { if (!LoadMask[i]) continue; SDValue Elt = peekThroughBitcasts(Elts[i]); if (RepeatedLoads[i % SubElems].isUndef()) RepeatedLoads[i % SubElems] = Elt; else Match &= (RepeatedLoads[i % SubElems] == Elt); } // We must have loads at both ends of the repetition. Match &= !RepeatedLoads.front().isUndef(); Match &= !RepeatedLoads.back().isUndef(); if (!Match) continue; EVT RepeatVT = VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64)) ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize) : EVT::getFloatingPointVT(ScalarSize); if (RepeatSize > ScalarSize) RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT, RepeatSize / ScalarSize); EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(), VT.getSizeInBits() / ScalarSize); if (TLI.isTypeLegal(BroadcastVT)) { if (SDValue RepeatLoad = EltsFromConsecutiveLoads( RepeatVT, RepeatedLoads, DL, DAG, Subtarget, isAfterLegalize)) { unsigned Opcode = RepeatSize > ScalarSize ? X86ISD::SUBV_BROADCAST : X86ISD::VBROADCAST; SDValue Broadcast = DAG.getNode(Opcode, DL, BroadcastVT, RepeatLoad); return DAG.getBitcast(VT, Broadcast); } } } } return SDValue(); } // Combine a vector ops (shuffles etc.) that is equal to build_vector load1, // load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses // are consecutive, non-overlapping, and in the right order. static SDValue combineToConsecutiveLoads(EVT VT, SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool isAfterLegalize) { SmallVector Elts; for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) { Elts.push_back(Elt); continue; } return SDValue(); } assert(Elts.size() == VT.getVectorNumElements()); return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget, isAfterLegalize); } static Constant *getConstantVector(MVT VT, const APInt &SplatValue, unsigned SplatBitSize, LLVMContext &C) { unsigned ScalarSize = VT.getScalarSizeInBits(); unsigned NumElm = SplatBitSize / ScalarSize; SmallVector ConstantVec; for (unsigned i = 0; i < NumElm; i++) { APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i); Constant *Const; if (VT.isFloatingPoint()) { if (ScalarSize == 32) { Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val)); } else { assert(ScalarSize == 64 && "Unsupported floating point scalar size"); Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val)); } } else Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val); ConstantVec.push_back(Const); } return ConstantVector::get(ArrayRef(ConstantVec)); } static bool isFoldableUseOfShuffle(SDNode *N) { for (auto *U : N->uses()) { unsigned Opc = U->getOpcode(); // VPERMV/VPERMV3 shuffles can never fold their index operands. if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N) return false; if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N) return false; if (isTargetShuffle(Opc)) return true; if (Opc == ISD::BITCAST) // Ignore bitcasts return isFoldableUseOfShuffle(U); if (N->hasOneUse()) return true; } return false; } // Check if the current node of build vector is a zero extended vector. // // If so, return the value extended. // // For example: (0,0,0,a,0,0,0,a,0,0,0,a,0,0,0,a) returns a. // // NumElt - return the number of zero extended identical values. // // EltType - return the type of the value include the zero extend. static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op, unsigned &NumElt, MVT &EltType) { SDValue ExtValue = Op->getOperand(0); unsigned NumElts = Op->getNumOperands(); unsigned Delta = NumElts; for (unsigned i = 1; i < NumElts; i++) { if (Op->getOperand(i) == ExtValue) { Delta = i; break; } if (!(Op->getOperand(i).isUndef() || isNullConstant(Op->getOperand(i)))) return SDValue(); } if (!isPowerOf2_32(Delta) || Delta == 1) return SDValue(); for (unsigned i = Delta; i < NumElts; i++) { if (i % Delta == 0) { if (Op->getOperand(i) != ExtValue) return SDValue(); } else if (!(isNullConstant(Op->getOperand(i)) || Op->getOperand(i).isUndef())) return SDValue(); } unsigned EltSize = Op->getSimpleValueType(0).getScalarSizeInBits(); unsigned ExtVTSize = EltSize * Delta; EltType = MVT::getIntegerVT(ExtVTSize); NumElt = NumElts / Delta; return ExtValue; } /// Attempt to use the vbroadcast instruction to generate a splat value /// from a splat BUILD_VECTOR which uses: /// a. A single scalar load, or a constant. /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>). /// /// The VBROADCAST node is returned when a pattern is found, /// or SDValue() otherwise. static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // VBROADCAST requires AVX. // TODO: Splats could be generated for non-AVX CPUs using SSE // instructions, but there's less potential gain for only 128-bit vectors. if (!Subtarget.hasAVX()) return SDValue(); MVT VT = BVOp->getSimpleValueType(0); SDLoc dl(BVOp); assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Unsupported vector type for broadcast."); BitVector UndefElements; SDValue Ld = BVOp->getSplatValue(&UndefElements); // Attempt to use VBROADCASTM // From this paterrn: // a. t0 = (zext_i64 (bitcast_i8 v2i1 X)) // b. t1 = (build_vector t0 t0) // // Create (VBROADCASTM v2i1 X) if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) { MVT EltType = VT.getScalarType(); unsigned NumElts = VT.getVectorNumElements(); SDValue BOperand; SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType); if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) || (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND && Ld.getOperand(0).getOpcode() == ISD::BITCAST)) { if (ZeroExtended) BOperand = ZeroExtended.getOperand(0); else BOperand = Ld.getOperand(0).getOperand(0); MVT MaskVT = BOperand.getSimpleValueType(); if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d SDValue Brdcst = DAG.getNode(X86ISD::VBROADCASTM, dl, MVT::getVectorVT(EltType, NumElts), BOperand); return DAG.getBitcast(VT, Brdcst); } } } unsigned NumElts = VT.getVectorNumElements(); unsigned NumUndefElts = UndefElements.count(); if (!Ld || (NumElts - NumUndefElts) <= 1) { APInt SplatValue, Undef; unsigned SplatBitSize; bool HasUndef; // Check if this is a repeated constant pattern suitable for broadcasting. if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) && SplatBitSize > VT.getScalarSizeInBits() && SplatBitSize < VT.getSizeInBits()) { // Avoid replacing with broadcast when it's a use of a shuffle // instruction to preserve the present custom lowering of shuffles. if (isFoldableUseOfShuffle(BVOp)) return SDValue(); // replace BUILD_VECTOR with broadcast of the repeated constants. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); LLVMContext *Ctx = DAG.getContext(); MVT PVT = TLI.getPointerTy(DAG.getDataLayout()); if (Subtarget.hasAVX()) { if (SplatBitSize <= 64 && Subtarget.hasAVX2() && !(SplatBitSize == 64 && Subtarget.is32Bit())) { // Splatted value can fit in one INTEGER constant in constant pool. // Load the constant and broadcast it. MVT CVT = MVT::getIntegerVT(SplatBitSize); Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize); Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue); SDValue CP = DAG.getConstantPool(C, PVT); unsigned Repeat = VT.getSizeInBits() / SplatBitSize; unsigned Alignment = cast(CP)->getAlignment(); Ld = DAG.getLoad( CVT, dl, DAG.getEntryNode(), CP, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment); SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl, MVT::getVectorVT(CVT, Repeat), Ld); return DAG.getBitcast(VT, Brdcst); } else if (SplatBitSize == 32 || SplatBitSize == 64) { // Splatted value can fit in one FLOAT constant in constant pool. // Load the constant and broadcast it. // AVX have support for 32 and 64 bit broadcast for floats only. // No 64bit integer in 32bit subtarget. MVT CVT = MVT::getFloatingPointVT(SplatBitSize); // Lower the splat via APFloat directly, to avoid any conversion. Constant *C = SplatBitSize == 32 ? ConstantFP::get(*Ctx, APFloat(APFloat::IEEEsingle(), SplatValue)) : ConstantFP::get(*Ctx, APFloat(APFloat::IEEEdouble(), SplatValue)); SDValue CP = DAG.getConstantPool(C, PVT); unsigned Repeat = VT.getSizeInBits() / SplatBitSize; unsigned Alignment = cast(CP)->getAlignment(); Ld = DAG.getLoad( CVT, dl, DAG.getEntryNode(), CP, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment); SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl, MVT::getVectorVT(CVT, Repeat), Ld); return DAG.getBitcast(VT, Brdcst); } else if (SplatBitSize > 64) { // Load the vector of constants and broadcast it. MVT CVT = VT.getScalarType(); Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx); SDValue VCP = DAG.getConstantPool(VecC, PVT); unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits(); unsigned Alignment = cast(VCP)->getAlignment(); Ld = DAG.getLoad( MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment); SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld); return DAG.getBitcast(VT, Brdcst); } } } // If we are moving a scalar into a vector (Ld must be set and all elements // but 1 are undef) and that operation is not obviously supported by // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast. // That's better than general shuffling and may eliminate a load to GPR and // move from scalar to vector register. if (!Ld || NumElts - NumUndefElts != 1) return SDValue(); unsigned ScalarSize = Ld.getValueSizeInBits(); if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64))) return SDValue(); } bool ConstSplatVal = (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP); // Make sure that all of the users of a non-constant load are from the // BUILD_VECTOR node. if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode())) return SDValue(); unsigned ScalarSize = Ld.getValueSizeInBits(); bool IsGE256 = (VT.getSizeInBits() >= 256); // When optimizing for size, generate up to 5 extra bytes for a broadcast // instruction to save 8 or more bytes of constant pool data. // TODO: If multiple splats are generated to load the same constant, // it may be detrimental to overall size. There needs to be a way to detect // that condition to know if this is truly a size win. bool OptForSize = DAG.shouldOptForSize(); // Handle broadcasting a single constant scalar from the constant pool // into a vector. // On Sandybridge (no AVX2), it is still better to load a constant vector // from the constant pool and not to broadcast it from a scalar. // But override that restriction when optimizing for size. // TODO: Check if splatting is recommended for other AVX-capable CPUs. if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) { EVT CVT = Ld.getValueType(); assert(!CVT.isVector() && "Must not broadcast a vector type"); // Splat f32, i32, v4f64, v4i64 in all cases with AVX2. // For size optimization, also splat v2f64 and v2i64, and for size opt // with AVX2, also splat i8 and i16. // With pattern matching, the VBROADCAST node may become a VMOVDDUP. if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) { const Constant *C = nullptr; if (ConstantSDNode *CI = dyn_cast(Ld)) C = CI->getConstantIntValue(); else if (ConstantFPSDNode *CF = dyn_cast(Ld)) C = CF->getConstantFPValue(); assert(C && "Invalid constant type"); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); unsigned Alignment = cast(CP)->getAlignment(); Ld = DAG.getLoad( CVT, dl, DAG.getEntryNode(), CP, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment); return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); } } bool IsLoad = ISD::isNormalLoad(Ld.getNode()); // Handle AVX2 in-register broadcasts. if (!IsLoad && Subtarget.hasInt256() && (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The scalar source must be a normal load. if (!IsLoad) return SDValue(); if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || (Subtarget.hasVLX() && ScalarSize == 64)) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The integer check is needed for the 64-bit into 128-bit so it doesn't match // double since there is no vbroadcastsd xmm if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) { if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); } // Unsupported broadcast. return SDValue(); } /// For an EXTRACT_VECTOR_ELT with a constant index return the real /// underlying vector and index. /// /// Modifies \p ExtractedFromVec to the real vector and returns the real /// index. static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx) { int Idx = cast(ExtIdx)->getZExtValue(); if (!isa(ExtractedFromVec)) return Idx; // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already // lowered this: // (extract_vector_elt (v8f32 %1), Constant<6>) // to: // (extract_vector_elt (vector_shuffle<2,u,u,u> // (extract_subvector (v8f32 %0), Constant<4>), // undef) // Constant<0>) // In this case the vector is the extract_subvector expression and the index // is 2, as specified by the shuffle. ShuffleVectorSDNode *SVOp = cast(ExtractedFromVec); SDValue ShuffleVec = SVOp->getOperand(0); MVT ShuffleVecVT = ShuffleVec.getSimpleValueType(); assert(ShuffleVecVT.getVectorElementType() == ExtractedFromVec.getSimpleValueType().getVectorElementType()); int ShuffleIdx = SVOp->getMaskElt(Idx); if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) { ExtractedFromVec = ShuffleVec; return ShuffleIdx; } return Idx; } static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); // Skip if insert_vec_elt is not supported. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT)) return SDValue(); SDLoc DL(Op); unsigned NumElems = Op.getNumOperands(); SDValue VecIn1; SDValue VecIn2; SmallVector InsertIndices; SmallVector Mask(NumElems, -1); for (unsigned i = 0; i != NumElems; ++i) { unsigned Opc = Op.getOperand(i).getOpcode(); if (Opc == ISD::UNDEF) continue; if (Opc != ISD::EXTRACT_VECTOR_ELT) { // Quit if more than 1 elements need inserting. if (InsertIndices.size() > 1) return SDValue(); InsertIndices.push_back(i); continue; } SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0); SDValue ExtIdx = Op.getOperand(i).getOperand(1); // Quit if non-constant index. if (!isa(ExtIdx)) return SDValue(); int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx); // Quit if extracted from vector of different type. if (ExtractedFromVec.getValueType() != VT) return SDValue(); if (!VecIn1.getNode()) VecIn1 = ExtractedFromVec; else if (VecIn1 != ExtractedFromVec) { if (!VecIn2.getNode()) VecIn2 = ExtractedFromVec; else if (VecIn2 != ExtractedFromVec) // Quit if more than 2 vectors to shuffle return SDValue(); } if (ExtractedFromVec == VecIn1) Mask[i] = Idx; else if (ExtractedFromVec == VecIn2) Mask[i] = Idx + NumElems; } if (!VecIn1.getNode()) return SDValue(); VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT); SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask); for (unsigned Idx : InsertIndices) NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx), DAG.getIntPtrConstant(Idx, DL)); return NV; } static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) { assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) && Op.getScalarValueSizeInBits() == 1 && "Can not convert non-constant vector"); uint64_t Immediate = 0; for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { SDValue In = Op.getOperand(idx); if (!In.isUndef()) Immediate |= (cast(In)->getZExtValue() & 0x1) << idx; } SDLoc dl(Op); MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8)); return DAG.getConstant(Immediate, dl, VT); } // Lower BUILD_VECTOR operation for v8i1 and v16i1 types. static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); assert((VT.getVectorElementType() == MVT::i1) && "Unexpected type in LowerBUILD_VECTORvXi1!"); SDLoc dl(Op); if (ISD::isBuildVectorAllZeros(Op.getNode()) || ISD::isBuildVectorAllOnes(Op.getNode())) return Op; uint64_t Immediate = 0; SmallVector NonConstIdx; bool IsSplat = true; bool HasConstElts = false; int SplatIdx = -1; for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { SDValue In = Op.getOperand(idx); if (In.isUndef()) continue; if (!isa(In)) NonConstIdx.push_back(idx); else { Immediate |= (cast(In)->getZExtValue() & 0x1) << idx; HasConstElts = true; } if (SplatIdx < 0) SplatIdx = idx; else if (In != Op.getOperand(SplatIdx)) IsSplat = false; } // for splat use " (select i1 splat_elt, all-ones, all-zeroes)" if (IsSplat) { // The build_vector allows the scalar element to be larger than the vector // element type. We need to mask it to use as a condition unless we know // the upper bits are zero. // FIXME: Use computeKnownBits instead of checking specific opcode? SDValue Cond = Op.getOperand(SplatIdx); assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!"); if (Cond.getOpcode() != ISD::SETCC) Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond, DAG.getConstant(1, dl, MVT::i8)); return DAG.getSelect(dl, VT, Cond, DAG.getConstant(1, dl, VT), DAG.getConstant(0, dl, VT)); } // insert elements one by one SDValue DstVec; if (HasConstElts) { if (VT == MVT::v64i1 && !Subtarget.is64Bit()) { SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32); SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32); ImmL = DAG.getBitcast(MVT::v32i1, ImmL); ImmH = DAG.getBitcast(MVT::v32i1, ImmH); DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH); } else { MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U)); SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT); MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1; DstVec = DAG.getBitcast(VecVT, Imm); DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec, DAG.getIntPtrConstant(0, dl)); } } else DstVec = DAG.getUNDEF(VT); for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) { unsigned InsertIdx = NonConstIdx[i]; DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, Op.getOperand(InsertIdx), DAG.getIntPtrConstant(InsertIdx, dl)); } return DstVec; } /// This is a helper function of LowerToHorizontalOp(). /// This function checks that the build_vector \p N in input implements a /// 128-bit partial horizontal operation on a 256-bit vector, but that operation /// may not match the layout of an x86 256-bit horizontal instruction. /// In other words, if this returns true, then some extraction/insertion will /// be required to produce a valid horizontal instruction. /// /// Parameter \p Opcode defines the kind of horizontal operation to match. /// For example, if \p Opcode is equal to ISD::ADD, then this function /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode /// is equal to ISD::SUB, then this function checks if this is a horizontal /// arithmetic sub. /// /// This function only analyzes elements of \p N whose indices are /// in range [BaseIdx, LastIdx). /// /// TODO: This function was originally used to match both real and fake partial /// horizontal operations, but the index-matching logic is incorrect for that. /// See the corrected implementation in isHopBuildVector(). Can we reduce this /// code because it is only used for partial h-op matching now? static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode, SelectionDAG &DAG, unsigned BaseIdx, unsigned LastIdx, SDValue &V0, SDValue &V1) { EVT VT = N->getValueType(0); assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops"); assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!"); assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx && "Invalid Vector in input!"); bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD); bool CanFold = true; unsigned ExpectedVExtractIdx = BaseIdx; unsigned NumElts = LastIdx - BaseIdx; V0 = DAG.getUNDEF(VT); V1 = DAG.getUNDEF(VT); // Check if N implements a horizontal binop. for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) { SDValue Op = N->getOperand(i + BaseIdx); // Skip UNDEFs. if (Op->isUndef()) { // Update the expected vector extract index. if (i * 2 == NumElts) ExpectedVExtractIdx = BaseIdx; ExpectedVExtractIdx += 2; continue; } CanFold = Op->getOpcode() == Opcode && Op->hasOneUse(); if (!CanFold) break; SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); // Try to match the following pattern: // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1)) CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Op0.getOperand(0) == Op1.getOperand(0) && isa(Op0.getOperand(1)) && isa(Op1.getOperand(1))); if (!CanFold) break; unsigned I0 = cast(Op0.getOperand(1))->getZExtValue(); unsigned I1 = cast(Op1.getOperand(1))->getZExtValue(); if (i * 2 < NumElts) { if (V0.isUndef()) { V0 = Op0.getOperand(0); if (V0.getValueType() != VT) return false; } } else { if (V1.isUndef()) { V1 = Op0.getOperand(0); if (V1.getValueType() != VT) return false; } if (i * 2 == NumElts) ExpectedVExtractIdx = BaseIdx; } SDValue Expected = (i * 2 < NumElts) ? V0 : V1; if (I0 == ExpectedVExtractIdx) CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected; else if (IsCommutable && I1 == ExpectedVExtractIdx) { // Try to match the following dag sequence: // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I)) CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected; } else CanFold = false; ExpectedVExtractIdx += 2; } return CanFold; } /// Emit a sequence of two 128-bit horizontal add/sub followed by /// a concat_vector. /// /// This is a helper function of LowerToHorizontalOp(). /// This function expects two 256-bit vectors called V0 and V1. /// At first, each vector is split into two separate 128-bit vectors. /// Then, the resulting 128-bit vectors are used to implement two /// horizontal binary operations. /// /// The kind of horizontal binary operation is defined by \p X86Opcode. /// /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to /// the two new horizontal binop. /// When Mode is set, the first horizontal binop dag node would take as input /// the lower 128-bit of V0 and the upper 128-bit of V0. The second /// horizontal binop dag node would take as input the lower 128-bit of V1 /// and the upper 128-bit of V1. /// Example: /// HADD V0_LO, V0_HI /// HADD V1_LO, V1_HI /// /// Otherwise, the first horizontal binop dag node takes as input the lower /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1. /// Example: /// HADD V0_LO, V1_LO /// HADD V0_HI, V1_HI /// /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to /// the upper 128-bits of the result. static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, const SDLoc &DL, SelectionDAG &DAG, unsigned X86Opcode, bool Mode, bool isUndefLO, bool isUndefHI) { MVT VT = V0.getSimpleValueType(); assert(VT.is256BitVector() && VT == V1.getSimpleValueType() && "Invalid nodes in input!"); unsigned NumElts = VT.getVectorNumElements(); SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL); SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL); SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL); SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL); MVT NewVT = V0_LO.getSimpleValueType(); SDValue LO = DAG.getUNDEF(NewVT); SDValue HI = DAG.getUNDEF(NewVT); if (Mode) { // Don't emit a horizontal binop if the result is expected to be UNDEF. if (!isUndefLO && !V0->isUndef()) LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI); if (!isUndefHI && !V1->isUndef()) HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI); } else { // Don't emit a horizontal binop if the result is expected to be UNDEF. if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef())) LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO); if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef())) HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI); } return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI); } /// Returns true iff \p BV builds a vector with the result equivalent to /// the result of ADDSUB/SUBADD operation. /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 /// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters /// \p Opnd0 and \p Opnd1. static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, unsigned &NumExtracts, bool &IsSubAdd) { MVT VT = BV->getSimpleValueType(0); if (!Subtarget.hasSSE3() || !VT.isFloatingPoint()) return false; unsigned NumElts = VT.getVectorNumElements(); SDValue InVec0 = DAG.getUNDEF(VT); SDValue InVec1 = DAG.getUNDEF(VT); NumExtracts = 0; // Odd-numbered elements in the input build vector are obtained from // adding/subtracting two integer/float elements. // Even-numbered elements in the input build vector are obtained from // subtracting/adding two integer/float elements. unsigned Opc[2] = {0, 0}; for (unsigned i = 0, e = NumElts; i != e; ++i) { SDValue Op = BV->getOperand(i); // Skip 'undef' values. unsigned Opcode = Op.getOpcode(); if (Opcode == ISD::UNDEF) continue; // Early exit if we found an unexpected opcode. if (Opcode != ISD::FADD && Opcode != ISD::FSUB) return false; SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); // Try to match the following pattern: // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i)) // Early exit if we cannot match that sequence. if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa(Op0.getOperand(1)) || !isa(Op1.getOperand(1)) || Op0.getOperand(1) != Op1.getOperand(1)) return false; unsigned I0 = cast(Op0.getOperand(1))->getZExtValue(); if (I0 != i) return false; // We found a valid add/sub node, make sure its the same opcode as previous // elements for this parity. if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode) return false; Opc[i % 2] = Opcode; // Update InVec0 and InVec1. if (InVec0.isUndef()) { InVec0 = Op0.getOperand(0); if (InVec0.getSimpleValueType() != VT) return false; } if (InVec1.isUndef()) { InVec1 = Op1.getOperand(0); if (InVec1.getSimpleValueType() != VT) return false; } // Make sure that operands in input to each add/sub node always // come from a same pair of vectors. if (InVec0 != Op0.getOperand(0)) { if (Opcode == ISD::FSUB) return false; // FADD is commutable. Try to commute the operands // and then test again. std::swap(Op0, Op1); if (InVec0 != Op0.getOperand(0)) return false; } if (InVec1 != Op1.getOperand(0)) return false; // Increment the number of extractions done. ++NumExtracts; } // Ensure we have found an opcode for both parities and that they are // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the // inputs are undef. if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] || InVec0.isUndef() || InVec1.isUndef()) return false; IsSubAdd = Opc[0] == ISD::FADD; Opnd0 = InVec0; Opnd1 = InVec1; return true; } /// Returns true if is possible to fold MUL and an idiom that has already been /// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into /// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the /// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2. /// /// Prior to calling this function it should be known that there is some /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called /// before replacement of such SDNode with ADDSUB operation. Thus the number /// of \p Opnd0 uses is expected to be equal to 2. /// For example, this function may be called for the following IR: /// %AB = fmul fast <2 x double> %A, %B /// %Sub = fsub fast <2 x double> %AB, %C /// %Add = fadd fast <2 x double> %AB, %C /// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add, /// <2 x i32> /// There is a def for %Addsub here, which potentially can be replaced by /// X86ISD::ADDSUB operation: /// %Addsub = X86ISD::ADDSUB %AB, %C /// and such ADDSUB can further be replaced with FMADDSUB: /// %Addsub = FMADDSUB %A, %B, %C. /// /// The main reason why this method is called before the replacement of the /// recognized ADDSUB idiom with ADDSUB operation is that such replacement /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit /// FMADDSUB is. static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, unsigned ExpectedUses) { if (Opnd0.getOpcode() != ISD::FMUL || !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA()) return false; // FIXME: These checks must match the similar ones in // DAGCombiner::visitFADDForFMACombine. It would be good to have one // function that would answer if it is Ok to fuse MUL + ADD to FMADD // or MUL + ADDSUB to FMADDSUB. const TargetOptions &Options = DAG.getTarget().Options; bool AllowFusion = (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath); if (!AllowFusion) return false; Opnd2 = Opnd1; Opnd1 = Opnd0.getOperand(1); Opnd0 = Opnd0.getOperand(0); return true; } /// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or /// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or /// X86ISD::FMSUBADD node. static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue Opnd0, Opnd1; unsigned NumExtracts; bool IsSubAdd; if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts, IsSubAdd)) return SDValue(); MVT VT = BV->getSimpleValueType(0); SDLoc DL(BV); // Try to generate X86ISD::FMADDSUB node here. SDValue Opnd2; if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) { unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB; return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2); } // We only support ADDSUB. if (IsSubAdd) return SDValue(); // Do not generate X86ISD::ADDSUB node for 512-bit types even though // the ADDSUB idiom has been successfully recognized. There are no known // X86 targets with 512-bit ADDSUB instructions! // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom // recognition. if (VT.is512BitVector()) return SDValue(); return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1); } static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG, unsigned &HOpcode, SDValue &V0, SDValue &V1) { // Initialize outputs to known values. MVT VT = BV->getSimpleValueType(0); HOpcode = ISD::DELETED_NODE; V0 = DAG.getUNDEF(VT); V1 = DAG.getUNDEF(VT); // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit // half of the result is calculated independently from the 128-bit halves of // the inputs, so that makes the index-checking logic below more complicated. unsigned NumElts = VT.getVectorNumElements(); unsigned GenericOpcode = ISD::DELETED_NODE; unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1; unsigned NumEltsIn128Bits = NumElts / Num128BitChunks; unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2; for (unsigned i = 0; i != Num128BitChunks; ++i) { for (unsigned j = 0; j != NumEltsIn128Bits; ++j) { // Ignore undef elements. SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j); if (Op.isUndef()) continue; // If there's an opcode mismatch, we're done. if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode) return false; // Initialize horizontal opcode. if (HOpcode == ISD::DELETED_NODE) { GenericOpcode = Op.getOpcode(); switch (GenericOpcode) { case ISD::ADD: HOpcode = X86ISD::HADD; break; case ISD::SUB: HOpcode = X86ISD::HSUB; break; case ISD::FADD: HOpcode = X86ISD::FHADD; break; case ISD::FSUB: HOpcode = X86ISD::FHSUB; break; default: return false; } } SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Op0.getOperand(0) != Op1.getOperand(0) || !isa(Op0.getOperand(1)) || !isa(Op1.getOperand(1)) || !Op.hasOneUse()) return false; // The source vector is chosen based on which 64-bit half of the // destination vector is being calculated. if (j < NumEltsIn64Bits) { if (V0.isUndef()) V0 = Op0.getOperand(0); } else { if (V1.isUndef()) V1 = Op0.getOperand(0); } SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1; if (SourceVec != Op0.getOperand(0)) return false; // op (extract_vector_elt A, I), (extract_vector_elt A, I+1) unsigned ExtIndex0 = Op0.getConstantOperandVal(1); unsigned ExtIndex1 = Op1.getConstantOperandVal(1); unsigned ExpectedIndex = i * NumEltsIn128Bits + (j % NumEltsIn64Bits) * 2; if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1) continue; // If this is not a commutative op, this does not match. if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD) return false; // Addition is commutative, so try swapping the extract indexes. // op (extract_vector_elt A, I+1), (extract_vector_elt A, I) if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1) continue; // Extract indexes do not match horizontal requirement. return false; } } // We matched. Opcode and operands are returned by reference as arguments. return true; } static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG, unsigned HOpcode, SDValue V0, SDValue V1) { // If either input vector is not the same size as the build vector, // extract/insert the low bits to the correct size. // This is free (examples: zmm --> xmm, xmm --> ymm). MVT VT = BV->getSimpleValueType(0); unsigned Width = VT.getSizeInBits(); if (V0.getValueSizeInBits() > Width) V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width); else if (V0.getValueSizeInBits() < Width) V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width); if (V1.getValueSizeInBits() > Width) V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width); else if (V1.getValueSizeInBits() < Width) V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width); unsigned NumElts = VT.getVectorNumElements(); APInt DemandedElts = APInt::getAllOnesValue(NumElts); for (unsigned i = 0; i != NumElts; ++i) if (BV->getOperand(i).isUndef()) DemandedElts.clearBit(i); // If we don't need the upper xmm, then perform as a xmm hop. unsigned HalfNumElts = NumElts / 2; if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) { MVT HalfVT = VT.getHalfNumVectorElementsVT(); V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128); V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128); SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1); return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256); } return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1); } /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible. static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // We need at least 2 non-undef elements to make this worthwhile by default. unsigned NumNonUndefs = count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); }); if (NumNonUndefs < 2) return SDValue(); // There are 4 sets of horizontal math operations distinguished by type: // int/FP at 128-bit/256-bit. Each type was introduced with a different // subtarget feature. Try to match those "native" patterns first. MVT VT = BV->getSimpleValueType(0); if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) || ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) || ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) || ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) { unsigned HOpcode; SDValue V0, V1; if (isHopBuildVector(BV, DAG, HOpcode, V0, V1)) return getHopForBuildVector(BV, DAG, HOpcode, V0, V1); } // Try harder to match 256-bit ops by using extract/concat. if (!Subtarget.hasAVX() || !VT.is256BitVector()) return SDValue(); // Count the number of UNDEF operands in the build_vector in input. unsigned NumElts = VT.getVectorNumElements(); unsigned Half = NumElts / 2; unsigned NumUndefsLO = 0; unsigned NumUndefsHI = 0; for (unsigned i = 0, e = Half; i != e; ++i) if (BV->getOperand(i)->isUndef()) NumUndefsLO++; for (unsigned i = Half, e = NumElts; i != e; ++i) if (BV->getOperand(i)->isUndef()) NumUndefsHI++; SDLoc DL(BV); SDValue InVec0, InVec1; if (VT == MVT::v8i32 || VT == MVT::v16i16) { SDValue InVec2, InVec3; unsigned X86Opcode; bool CanFold = true; if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) && isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) && ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) && ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) X86Opcode = X86ISD::HADD; else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) && isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) && ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) && ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) X86Opcode = X86ISD::HSUB; else CanFold = false; if (CanFold) { // Do not try to expand this build_vector into a pair of horizontal // add/sub if we can emit a pair of scalar add/sub. if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) return SDValue(); // Convert this build_vector into a pair of horizontal binops followed by // a concat vector. We must adjust the outputs from the partial horizontal // matching calls above to account for undefined vector halves. SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0; SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1; assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?"); bool isUndefLO = NumUndefsLO == Half; bool isUndefHI = NumUndefsHI == Half; return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO, isUndefHI); } } if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 || VT == MVT::v16i16) { unsigned X86Opcode; if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::HADD; else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::HSUB; else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::FHADD; else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::FHSUB; else return SDValue(); // Don't try to expand this build_vector into a pair of horizontal add/sub // if we can simply emit a pair of scalar add/sub. if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) return SDValue(); // Convert this build_vector into two horizontal add/sub followed by // a concat vector. bool isUndefLO = NumUndefsLO == Half; bool isUndefHI = NumUndefsHI == Half; return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true, isUndefLO, isUndefHI); } return SDValue(); } /// If a BUILD_VECTOR's source elements all apply the same bit operation and /// one of their operands is constant, lower to a pair of BUILD_VECTOR and /// just apply the bit to the vectors. /// NOTE: Its not in our interest to start make a general purpose vectorizer /// from this, but enough scalar bit operations are created from the later /// legalization + scalarization stages to need basic support. static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, SelectionDAG &DAG) { SDLoc DL(Op); MVT VT = Op->getSimpleValueType(0); unsigned NumElems = VT.getVectorNumElements(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Check that all elements have the same opcode. // TODO: Should we allow UNDEFS and if so how many? unsigned Opcode = Op->getOperand(0).getOpcode(); for (unsigned i = 1; i < NumElems; ++i) if (Opcode != Op->getOperand(i).getOpcode()) return SDValue(); // TODO: We may be able to add support for other Ops (ADD/SUB + shifts). bool IsShift = false; switch (Opcode) { default: return SDValue(); case ISD::SHL: case ISD::SRL: case ISD::SRA: IsShift = true; break; case ISD::AND: case ISD::XOR: case ISD::OR: // Don't do this if the buildvector is a splat - we'd replace one // constant with an entire vector. if (Op->getSplatValue()) return SDValue(); if (!TLI.isOperationLegalOrPromote(Opcode, VT)) return SDValue(); break; } SmallVector LHSElts, RHSElts; for (SDValue Elt : Op->ops()) { SDValue LHS = Elt.getOperand(0); SDValue RHS = Elt.getOperand(1); // We expect the canonicalized RHS operand to be the constant. if (!isa(RHS)) return SDValue(); // Extend shift amounts. if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) { if (!IsShift) return SDValue(); RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType()); } LHSElts.push_back(LHS); RHSElts.push_back(RHS); } // Limit to shifts by uniform immediates. // TODO: Only accept vXi8/vXi64 special cases? // TODO: Permit non-uniform XOP/AVX2/MULLO cases? if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; })) return SDValue(); SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts); SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts); return DAG.getNode(Opcode, DL, VT, LHS, RHS); } /// Create a vector constant without a load. SSE/AVX provide the bare minimum /// functionality to do this, so it's all zeros, all ones, or some derivation /// that is cheap to calculate. static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); // Vectors containing all zeros can be matched by pxor and xorps. if (ISD::isBuildVectorAllZeros(Op.getNode())) return Op; // Vectors containing all ones can be matched by pcmpeqd on 128-bit width // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use // vpcmpeqd on 256-bit vectors. if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) { if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) return Op; return getOnesVector(VT, DAG, DL); } return SDValue(); } /// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute /// from a vector of source values and a vector of extraction indices. /// The vectors might be manipulated to match the type of the permute op. static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT ShuffleVT = VT; EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger(); unsigned NumElts = VT.getVectorNumElements(); unsigned SizeInBits = VT.getSizeInBits(); // Adjust IndicesVec to match VT size. assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts && "Illegal variable permute mask size"); if (IndicesVec.getValueType().getVectorNumElements() > NumElts) IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec), NumElts * VT.getScalarSizeInBits()); IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT); // Handle SrcVec that don't match VT type. if (SrcVec.getValueSizeInBits() != SizeInBits) { if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) { // Handle larger SrcVec by treating it as a larger permute. unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits; VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts); IndicesVT = EVT(VT).changeVectorElementTypeToInteger(); IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false, Subtarget, DAG, SDLoc(IndicesVec)); return extractSubVector( createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget), 0, DAG, DL, SizeInBits); } else if (SrcVec.getValueSizeInBits() < SizeInBits) { // Widen smaller SrcVec to match VT. SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec)); } else return SDValue(); } auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) { assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale"); EVT SrcVT = Idx.getValueType(); unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale; uint64_t IndexScale = 0; uint64_t IndexOffset = 0; // If we're scaling a smaller permute op, then we need to repeat the // indices, scaling and offsetting them as well. // e.g. v4i32 -> v16i8 (Scale = 4) // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4) // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0) for (uint64_t i = 0; i != Scale; ++i) { IndexScale |= Scale << (i * NumDstBits); IndexOffset |= i << (i * NumDstBits); } Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx, DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT)); Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx, DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT)); return Idx; }; unsigned Opcode = 0; switch (VT.SimpleTy) { default: break; case MVT::v16i8: if (Subtarget.hasSSSE3()) Opcode = X86ISD::PSHUFB; break; case MVT::v8i16: if (Subtarget.hasVLX() && Subtarget.hasBWI()) Opcode = X86ISD::VPERMV; else if (Subtarget.hasSSSE3()) { Opcode = X86ISD::PSHUFB; ShuffleVT = MVT::v16i8; } break; case MVT::v4f32: case MVT::v4i32: if (Subtarget.hasAVX()) { Opcode = X86ISD::VPERMILPV; ShuffleVT = MVT::v4f32; } else if (Subtarget.hasSSSE3()) { Opcode = X86ISD::PSHUFB; ShuffleVT = MVT::v16i8; } break; case MVT::v2f64: case MVT::v2i64: if (Subtarget.hasAVX()) { // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec. IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec); Opcode = X86ISD::VPERMILPV; ShuffleVT = MVT::v2f64; } else if (Subtarget.hasSSE41()) { // SSE41 can compare v2i64 - select between indices 0 and 1. return DAG.getSelectCC( DL, IndicesVec, getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL), DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}), DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}), ISD::CondCode::SETEQ); } break; case MVT::v32i8: if (Subtarget.hasVLX() && Subtarget.hasVBMI()) Opcode = X86ISD::VPERMV; else if (Subtarget.hasXOP()) { SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL); SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL); SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL); SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL); return DAG.getNode( ISD::CONCAT_VECTORS, DL, VT, DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx), DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx)); } else if (Subtarget.hasAVX()) { SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL); SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL); SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo); SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi); auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { // Permute Lo and Hi and then select based on index range. // This works as SHUFB uses bits[3:0] to permute elements and we don't // care about the bit[7] as its just an index vector. SDValue Idx = Ops[2]; EVT VT = Idx.getValueType(); return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT), DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx), DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx), ISD::CondCode::SETGT); }; SDValue Ops[] = {LoLo, HiHi, IndicesVec}; return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops, PSHUFBBuilder); } break; case MVT::v16i16: if (Subtarget.hasVLX() && Subtarget.hasBWI()) Opcode = X86ISD::VPERMV; else if (Subtarget.hasAVX()) { // Scale to v32i8 and perform as v32i8. IndicesVec = ScaleIndices(IndicesVec, 2); return DAG.getBitcast( VT, createVariablePermute( MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec), DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget)); } break; case MVT::v8f32: case MVT::v8i32: if (Subtarget.hasAVX2()) Opcode = X86ISD::VPERMV; else if (Subtarget.hasAVX()) { SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec); SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec, {0, 1, 2, 3, 0, 1, 2, 3}); SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec, {4, 5, 6, 7, 4, 5, 6, 7}); if (Subtarget.hasXOP()) return DAG.getBitcast( VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi, IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8))); // Permute Lo and Hi and then select based on index range. // This works as VPERMILPS only uses index bits[0:1] to permute elements. SDValue Res = DAG.getSelectCC( DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32), DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec), DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec), ISD::CondCode::SETGT); return DAG.getBitcast(VT, Res); } break; case MVT::v4i64: case MVT::v4f64: if (Subtarget.hasAVX512()) { if (!Subtarget.hasVLX()) { MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8); SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec)); IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget, DAG, SDLoc(IndicesVec)); SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL, DAG, Subtarget); return extract256BitVector(Res, 0, DAG, DL); } Opcode = X86ISD::VPERMV; } else if (Subtarget.hasAVX()) { SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec); SDValue LoLo = DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1}); SDValue HiHi = DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3}); // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec. IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec); if (Subtarget.hasXOP()) return DAG.getBitcast( VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi, IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8))); // Permute Lo and Hi and then select based on index range. // This works as VPERMILPD only uses index bit[1] to permute elements. SDValue Res = DAG.getSelectCC( DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64), DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec), DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec), ISD::CondCode::SETGT); return DAG.getBitcast(VT, Res); } break; case MVT::v64i8: if (Subtarget.hasVBMI()) Opcode = X86ISD::VPERMV; break; case MVT::v32i16: if (Subtarget.hasBWI()) Opcode = X86ISD::VPERMV; break; case MVT::v16f32: case MVT::v16i32: case MVT::v8f64: case MVT::v8i64: if (Subtarget.hasAVX512()) Opcode = X86ISD::VPERMV; break; } if (!Opcode) return SDValue(); assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) && (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 && "Illegal variable permute shuffle type"); uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits(); if (Scale > 1) IndicesVec = ScaleIndices(IndicesVec, Scale); EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger(); IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec); SrcVec = DAG.getBitcast(ShuffleVT, SrcVec); SDValue Res = Opcode == X86ISD::VPERMV ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec) : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec); return DAG.getBitcast(VT, Res); } // Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be // reasoned to be a permutation of a vector by indices in a non-constant vector. // (build_vector (extract_elt V, (extract_elt I, 0)), // (extract_elt V, (extract_elt I, 1)), // ... // -> // (vpermv I, V) // // TODO: Handle undefs // TODO: Utilize pshufb and zero mask blending to support more efficient // construction of vectors with constant-0 elements. static SDValue LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue SrcVec, IndicesVec; // Check for a match of the permute source vector and permute index elements. // This is done by checking that the i-th build_vector operand is of the form: // (extract_elt SrcVec, (extract_elt IndicesVec, i)). for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) { SDValue Op = V.getOperand(Idx); if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); // If this is the first extract encountered in V, set the source vector, // otherwise verify the extract is from the previously defined source // vector. if (!SrcVec) SrcVec = Op.getOperand(0); else if (SrcVec != Op.getOperand(0)) return SDValue(); SDValue ExtractedIndex = Op->getOperand(1); // Peek through extends. if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND || ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND) ExtractedIndex = ExtractedIndex.getOperand(0); if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); // If this is the first extract from the index vector candidate, set the // indices vector, otherwise verify the extract is from the previously // defined indices vector. if (!IndicesVec) IndicesVec = ExtractedIndex.getOperand(0); else if (IndicesVec != ExtractedIndex.getOperand(0)) return SDValue(); auto *PermIdx = dyn_cast(ExtractedIndex.getOperand(1)); if (!PermIdx || PermIdx->getAPIntValue() != Idx) return SDValue(); } SDLoc DL(V); MVT VT = V.getSimpleValueType(); return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget); } SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); unsigned NumElems = Op.getNumOperands(); // Generate vectors for predicate vectors. if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget); if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget)) return VectorConstant; BuildVectorSDNode *BV = cast(Op.getNode()); if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG)) return AddSub; if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG)) return HorizontalOp; if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG)) return Broadcast; if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG)) return BitOp; unsigned EVTBits = EltVT.getSizeInBits(); unsigned NumZero = 0; unsigned NumNonZero = 0; uint64_t NonZeros = 0; bool IsAllConstants = true; SmallSet Values; unsigned NumConstants = NumElems; for (unsigned i = 0; i < NumElems; ++i) { SDValue Elt = Op.getOperand(i); if (Elt.isUndef()) continue; Values.insert(Elt); if (!isa(Elt) && !isa(Elt)) { IsAllConstants = false; NumConstants--; } if (X86::isZeroNode(Elt)) NumZero++; else { assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range. NonZeros |= ((uint64_t)1 << i); NumNonZero++; } } // All undef vector. Return an UNDEF. All zero vectors were handled above. if (NumNonZero == 0) return DAG.getUNDEF(VT); // If we are inserting one variable into a vector of non-zero constants, try // to avoid loading each constant element as a scalar. Load the constants as a // vector and then insert the variable scalar element. If insertion is not // supported, fall back to a shuffle to get the scalar blended with the // constants. Insertion into a zero vector is handled as a special-case // somewhere below here. if (NumConstants == NumElems - 1 && NumNonZero != 1 && (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) || isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) { // Create an all-constant vector. The variable element in the old // build vector is replaced by undef in the constant vector. Save the // variable scalar element and its index for use in the insertelement. LLVMContext &Context = *DAG.getContext(); Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context); SmallVector ConstVecOps(NumElems, UndefValue::get(EltType)); SDValue VarElt; SDValue InsIndex; for (unsigned i = 0; i != NumElems; ++i) { SDValue Elt = Op.getOperand(i); if (auto *C = dyn_cast(Elt)) ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue()); else if (auto *C = dyn_cast(Elt)) ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF()); else if (!Elt.isUndef()) { assert(!VarElt.getNode() && !InsIndex.getNode() && "Expected one variable element in this vector"); VarElt = Elt; InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout())); } } Constant *CV = ConstantVector::get(ConstVecOps); SDValue DAGConstVec = DAG.getConstantPool(CV, VT); // The constants we just created may not be legal (eg, floating point). We // must lower the vector right here because we can not guarantee that we'll // legalize it before loading it. This is also why we could not just create // a new build vector here. If the build vector contains illegal constants, // it could get split back up into a series of insert elements. // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD. SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG); MachineFunction &MF = DAG.getMachineFunction(); MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF); SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI); unsigned InsertC = cast(InsIndex)->getZExtValue(); unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits(); if (InsertC < NumEltsInLow128Bits) return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex); // There's no good way to insert into the high elements of a >128-bit // vector, so use shuffles to avoid an extract/insert sequence. assert(VT.getSizeInBits() > 128 && "Invalid insertion index?"); assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector"); SmallVector ShuffleMask; unsigned NumElts = VT.getVectorNumElements(); for (unsigned i = 0; i != NumElts; ++i) ShuffleMask.push_back(i == InsertC ? NumElts : i); SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt); return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask); } // Special case for single non-zero, non-undef, element. if (NumNonZero == 1) { unsigned Idx = countTrailingZeros(NonZeros); SDValue Item = Op.getOperand(Idx); // If we have a constant or non-constant insertion into the low element of // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into // the rest of the elements. This will be matched as movd/movq/movss/movsd // depending on what the source datatype is. if (Idx == 0) { if (NumZero == 0) return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit())) { assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Expected an SSE value type!"); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); } // We can't directly insert an i8 or i16 into a vector, so zero extend // it to i32 first. if (EltVT == MVT::i16 || EltVT == MVT::i8) { Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item); Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); return DAG.getBitcast(VT, Item); } } // Is it a vector logical left shift? if (NumElems == 2 && Idx == 1 && X86::isZeroNode(Op.getOperand(0)) && !X86::isZeroNode(Op.getOperand(1))) { unsigned NumBits = VT.getSizeInBits(); return getVShift(true, VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(1)), NumBits/2, DAG, *this, dl); } if (IsAllConstants) // Otherwise, it's better to do a constpool load. return SDValue(); // Otherwise, if this is a vector with i32 or f32 elements, and the element // is a non-constant being inserted into an element other than the low one, // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka // movd/movss) to move this into the low element, then shuffle it into // place. if (EVTBits == 32) { Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG); } } // Splat is obviously ok. Let legalizer expand it to a shuffle. if (Values.size() == 1) { if (EVTBits == 32) { // Instead of a shuffle like this: // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> // Check if it's possible to issue this instead. // shuffle (vload ptr)), undef, <1, 1, 1, 1> unsigned Idx = countTrailingZeros(NonZeros); SDValue Item = Op.getOperand(Idx); if (Op.getNode()->isOnlyUserOf(Item.getNode())) return LowerAsSplatVectorLoad(Item, VT, dl, DAG); } return SDValue(); } // A vector full of immediates; various special cases are already // handled, so this is best done with a single constant-pool load. if (IsAllConstants) return SDValue(); if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget)) return V; // See if we can use a vector load to get all of the elements. { SmallVector Ops(Op->op_begin(), Op->op_begin() + NumElems); if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false)) return LD; } // If this is a splat of pairs of 32-bit elements, we can use a narrower // build_vector and broadcast it. // TODO: We could probably generalize this more. if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) { SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1), DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) }; auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef Ops) { // Make sure all the even/odd operands match. for (unsigned i = 2; i != NumElems; ++i) if (Ops[i % 2] != Op.getOperand(i)) return false; return true; }; if (CanSplat(Op, NumElems, Ops)) { MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64; MVT NarrowVT = MVT::getVectorVT(EltVT, 4); // Create a new build vector and cast to v2i64/v2f64. SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2), DAG.getBuildVector(NarrowVT, dl, Ops)); // Broadcast from v2i64/v2f64 and cast to final VT. MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems/2); return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT, NewBV)); } } // For AVX-length vectors, build the individual 128-bit pieces and use // shuffles to put them in place. if (VT.getSizeInBits() > 128) { MVT HVT = MVT::getVectorVT(EltVT, NumElems/2); // Build both the lower and upper subvector. SDValue Lower = DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2)); SDValue Upper = DAG.getBuildVector( HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2)); // Recreate the wider vector with the lower and upper part. return concatSubVectors(Lower, Upper, DAG, dl); } // Let legalizer expand 2-wide build_vectors. if (EVTBits == 64) { if (NumNonZero == 1) { // One half is zero or undef. unsigned Idx = countTrailingZeros(NonZeros); SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(Idx)); return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); } return SDValue(); } // If element VT is < 32 bits, convert it to inserts into a zero vector. if (EVTBits == 8 && NumElems == 16) if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, DAG, Subtarget)) return V; if (EVTBits == 16 && NumElems == 8) if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, DAG, Subtarget)) return V; // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS if (EVTBits == 32 && NumElems == 4) if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget)) return V; // If element VT is == 32 bits, turn it into a number of shuffles. if (NumElems == 4 && NumZero > 0) { SmallVector Ops(NumElems); for (unsigned i = 0; i < 4; ++i) { bool isZero = !(NonZeros & (1ULL << i)); if (isZero) Ops[i] = getZeroVector(VT, Subtarget, DAG, dl); else Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); } for (unsigned i = 0; i < 2; ++i) { switch ((NonZeros >> (i*2)) & 0x3) { default: llvm_unreachable("Unexpected NonZero count"); case 0: Ops[i] = Ops[i*2]; // Must be a zero vector. break; case 1: Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]); break; case 2: Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]); break; case 3: Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]); break; } } bool Reverse1 = (NonZeros & 0x3) == 2; bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2; int MaskVec[] = { Reverse1 ? 1 : 0, Reverse1 ? 0 : 1, static_cast(Reverse2 ? NumElems+1 : NumElems), static_cast(Reverse2 ? NumElems : NumElems+1) }; return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec); } assert(Values.size() > 1 && "Expected non-undef and non-splat vector"); // Check for a build vector from mostly shuffle plus few inserting. if (SDValue Sh = buildFromShuffleMostly(Op, DAG)) return Sh; // For SSE 4.1, use insertps to put the high elements into the low element. if (Subtarget.hasSSE41()) { SDValue Result; if (!Op.getOperand(0).isUndef()) Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); else Result = DAG.getUNDEF(VT); for (unsigned i = 1; i < NumElems; ++i) { if (Op.getOperand(i).isUndef()) continue; Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); } return Result; } // Otherwise, expand into a number of unpckl*, start by extending each of // our (non-undef) elements to the full vector width with the element in the // bottom slot of the vector (which generates no code for SSE). SmallVector Ops(NumElems); for (unsigned i = 0; i < NumElems; ++i) { if (!Op.getOperand(i).isUndef()) Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); else Ops[i] = DAG.getUNDEF(VT); } // Next, we iteratively mix elements, e.g. for v4f32: // Step 1: unpcklps 0, 1 ==> X: // : unpcklps 2, 3 ==> Y: // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0> for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) { // Generate scaled UNPCKL shuffle mask. SmallVector Mask; for(unsigned i = 0; i != Scale; ++i) Mask.push_back(i); for (unsigned i = 0; i != Scale; ++i) Mask.push_back(NumElems+i); Mask.append(NumElems - Mask.size(), SM_SentinelUndef); for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i) Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask); } return Ops[0]; } // 256-bit AVX can use the vinsertf128 instruction // to create 256-bit vectors from two other 128-bit ones. // TODO: Detect subvector broadcast here instead of DAG combine? static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc dl(Op); MVT ResVT = Op.getSimpleValueType(); assert((ResVT.is256BitVector() || ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide"); unsigned NumOperands = Op.getNumOperands(); unsigned NumZero = 0; unsigned NumNonZero = 0; unsigned NonZeros = 0; for (unsigned i = 0; i != NumOperands; ++i) { SDValue SubVec = Op.getOperand(i); if (SubVec.isUndef()) continue; if (ISD::isBuildVectorAllZeros(SubVec.getNode())) ++NumZero; else { assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range. NonZeros |= 1 << i; ++NumNonZero; } } // If we have more than 2 non-zeros, build each half separately. if (NumNonZero > 2) { MVT HalfVT = ResVT.getHalfNumVectorElementsVT(); ArrayRef Ops = Op->ops(); SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops.slice(0, NumOperands/2)); SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops.slice(NumOperands/2)); return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); } // Otherwise, build it up through insert_subvectors. SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl) : DAG.getUNDEF(ResVT); MVT SubVT = Op.getOperand(0).getSimpleValueType(); unsigned NumSubElems = SubVT.getVectorNumElements(); for (unsigned i = 0; i != NumOperands; ++i) { if ((NonZeros & (1 << i)) == 0) continue; Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(i), DAG.getIntPtrConstant(i * NumSubElems, dl)); } return Vec; } // Returns true if the given node is a type promotion (by concatenating i1 // zeros) of the result of a node that already zeros all upper bits of // k-register. // TODO: Merge this with LowerAVXCONCAT_VECTORS? static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG & DAG) { SDLoc dl(Op); MVT ResVT = Op.getSimpleValueType(); unsigned NumOperands = Op.getNumOperands(); assert(NumOperands > 1 && isPowerOf2_32(NumOperands) && "Unexpected number of operands in CONCAT_VECTORS"); uint64_t Zeros = 0; uint64_t NonZeros = 0; for (unsigned i = 0; i != NumOperands; ++i) { SDValue SubVec = Op.getOperand(i); if (SubVec.isUndef()) continue; assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range. if (ISD::isBuildVectorAllZeros(SubVec.getNode())) Zeros |= (uint64_t)1 << i; else NonZeros |= (uint64_t)1 << i; } unsigned NumElems = ResVT.getVectorNumElements(); // If we are inserting non-zero vector and there are zeros in LSBs and undef // in the MSBs we need to emit a KSHIFTL. The generic lowering to // insert_subvector will give us two kshifts. if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros && Log2_64(NonZeros) != NumOperands - 1) { MVT ShiftVT = ResVT; if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1; unsigned Idx = Log2_64(NonZeros); SDValue SubVec = Op.getOperand(Idx); unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements(); SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT, DAG.getUNDEF(ShiftVT), SubVec, DAG.getIntPtrConstant(0, dl)); Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec, DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op, DAG.getIntPtrConstant(0, dl)); } // If there are zero or one non-zeros we can handle this very simply. if (NonZeros == 0 || isPowerOf2_64(NonZeros)) { SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT); if (!NonZeros) return Vec; unsigned Idx = Log2_64(NonZeros); SDValue SubVec = Op.getOperand(Idx); unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements(); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec, DAG.getIntPtrConstant(Idx * SubVecNumElts, dl)); } if (NumOperands > 2) { MVT HalfVT = ResVT.getHalfNumVectorElementsVT(); ArrayRef Ops = Op->ops(); SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops.slice(0, NumOperands/2)); SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops.slice(NumOperands/2)); return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); } assert(countPopulation(NonZeros) == 2 && "Simple cases not handled?"); if (ResVT.getVectorNumElements() >= 16) return Op; // The operation is legal with KUNPCK SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT), Op.getOperand(0), DAG.getIntPtrConstant(0, dl)); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1), DAG.getIntPtrConstant(NumElems/2, dl)); } static SDValue LowerCONCAT_VECTORS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); if (VT.getVectorElementType() == MVT::i1) return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG); assert((VT.is256BitVector() && Op.getNumOperands() == 2) || (VT.is512BitVector() && (Op.getNumOperands() == 2 || Op.getNumOperands() == 4))); // AVX can use the vinsertf128 instruction to create 256-bit vectors // from two other 128-bit ones. // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget); } //===----------------------------------------------------------------------===// // Vector shuffle lowering // // This is an experimental code path for lowering vector shuffles on x86. It is // designed to handle arbitrary vector shuffles and blends, gracefully // degrading performance as necessary. It works hard to recognize idiomatic // shuffles and lower them to optimal instruction patterns without leaving // a framework that allows reasonably efficient handling of all vector shuffle // patterns. //===----------------------------------------------------------------------===// /// Tiny helper function to identify a no-op mask. /// /// This is a somewhat boring predicate function. It checks whether the mask /// array input, which is assumed to be a single-input shuffle mask of the kind /// used by the X86 shuffle instructions (not a fully general /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an /// in-place shuffle are 'no-op's. static bool isNoopShuffleMask(ArrayRef Mask) { for (int i = 0, Size = Mask.size(); i < Size; ++i) { assert(Mask[i] >= -1 && "Out of bound mask element!"); if (Mask[i] >= 0 && Mask[i] != i) return false; } return true; } /// Test whether there are elements crossing LaneSizeInBits lanes in this /// shuffle mask. /// /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations /// and we routinely test for these. static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef Mask) { assert(LaneSizeInBits && ScalarSizeInBits && (LaneSizeInBits % ScalarSizeInBits) == 0 && "Illegal shuffle lane size"); int LaneSize = LaneSizeInBits / ScalarSizeInBits; int Size = Mask.size(); for (int i = 0; i < Size; ++i) if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) return true; return false; } /// Test whether there are elements crossing 128-bit lanes in this /// shuffle mask. static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef Mask) { return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask); } /// Test whether a shuffle mask is equivalent within each sub-lane. /// /// This checks a shuffle mask to see if it is performing the same /// lane-relative shuffle in each sub-lane. This trivially implies /// that it is also not lane-crossing. It may however involve a blend from the /// same lane of a second vector. /// /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is /// non-trivial to compute in the face of undef lanes. The representation is /// suitable for use with existing 128-bit shuffles as entries from the second /// vector have been remapped to [LaneSize, 2*LaneSize). static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef Mask, SmallVectorImpl &RepeatedMask) { auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits(); RepeatedMask.assign(LaneSize, -1); int Size = Mask.size(); for (int i = 0; i < Size; ++i) { assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0); if (Mask[i] < 0) continue; if ((Mask[i] % Size) / LaneSize != i / LaneSize) // This entry crosses lanes, so there is no way to model this shuffle. return false; // Ok, handle the in-lane shuffles by detecting if and when they repeat. // Adjust second vector indices to start at LaneSize instead of Size. int LocalM = Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize; if (RepeatedMask[i % LaneSize] < 0) // This is the first non-undef entry in this slot of a 128-bit lane. RepeatedMask[i % LaneSize] = LocalM; else if (RepeatedMask[i % LaneSize] != LocalM) // Found a mismatch with the repeated mask. return false; } return true; } /// Test whether a shuffle mask is equivalent within each 128-bit lane. static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef Mask, SmallVectorImpl &RepeatedMask) { return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask); } static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef Mask) { SmallVector RepeatedMask; return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask); } /// Test whether a shuffle mask is equivalent within each 256-bit lane. static bool is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef Mask, SmallVectorImpl &RepeatedMask) { return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask); } /// Test whether a target shuffle mask is equivalent within each sub-lane. /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero. static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef Mask, SmallVectorImpl &RepeatedMask) { int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits(); RepeatedMask.assign(LaneSize, SM_SentinelUndef); int Size = Mask.size(); for (int i = 0; i < Size; ++i) { assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0)); if (Mask[i] == SM_SentinelUndef) continue; if (Mask[i] == SM_SentinelZero) { if (!isUndefOrZero(RepeatedMask[i % LaneSize])) return false; RepeatedMask[i % LaneSize] = SM_SentinelZero; continue; } if ((Mask[i] % Size) / LaneSize != i / LaneSize) // This entry crosses lanes, so there is no way to model this shuffle. return false; // Ok, handle the in-lane shuffles by detecting if and when they repeat. // Adjust second vector indices to start at LaneSize instead of Size. int LocalM = Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize; if (RepeatedMask[i % LaneSize] == SM_SentinelUndef) // This is the first non-undef entry in this slot of a 128-bit lane. RepeatedMask[i % LaneSize] = LocalM; else if (RepeatedMask[i % LaneSize] != LocalM) // Found a mismatch with the repeated mask. return false; } return true; } /// Checks whether a shuffle mask is equivalent to an explicit list of /// arguments. /// /// This is a fast way to test a shuffle mask against a fixed pattern: /// /// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... } /// /// It returns true if the mask is exactly as wide as the argument list, and /// each element of the mask is either -1 (signifying undef) or the value given /// in the argument. static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef Mask, ArrayRef ExpectedMask) { if (Mask.size() != ExpectedMask.size()) return false; int Size = Mask.size(); // If the values are build vectors, we can look through them to find // equivalent inputs that make the shuffles equivalent. auto *BV1 = dyn_cast(V1); auto *BV2 = dyn_cast(V2); for (int i = 0; i < Size; ++i) { assert(Mask[i] >= -1 && "Out of bound mask element!"); if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) { auto *MaskBV = Mask[i] < Size ? BV1 : BV2; auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2; if (!MaskBV || !ExpectedBV || MaskBV->getOperand(Mask[i] % Size) != ExpectedBV->getOperand(ExpectedMask[i] % Size)) return false; } } return true; } /// Checks whether a target shuffle mask is equivalent to an explicit pattern. /// /// The masks must be exactly the same width. /// /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding /// value in ExpectedMask is always accepted. Otherwise the indices must match. /// /// SM_SentinelZero is accepted as a valid negative index but must match in /// both. static bool isTargetShuffleEquivalent(ArrayRef Mask, ArrayRef ExpectedMask, SDValue V1 = SDValue(), SDValue V2 = SDValue()) { int Size = Mask.size(); if (Size != (int)ExpectedMask.size()) return false; assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) && "Illegal target shuffle mask"); // Check for out-of-range target shuffle mask indices. if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size)) return false; // If the values are build vectors, we can look through them to find // equivalent inputs that make the shuffles equivalent. auto *BV1 = dyn_cast_or_null(V1); auto *BV2 = dyn_cast_or_null(V2); BV1 = ((BV1 && Size != (int)BV1->getNumOperands()) ? nullptr : BV1); BV2 = ((BV2 && Size != (int)BV2->getNumOperands()) ? nullptr : BV2); for (int i = 0; i < Size; ++i) { if (Mask[i] == SM_SentinelUndef || Mask[i] == ExpectedMask[i]) continue; if (0 <= Mask[i] && 0 <= ExpectedMask[i]) { auto *MaskBV = Mask[i] < Size ? BV1 : BV2; auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2; if (MaskBV && ExpectedBV && MaskBV->getOperand(Mask[i] % Size) == ExpectedBV->getOperand(ExpectedMask[i] % Size)) continue; } // TODO - handle SM_Sentinel equivalences. return false; } return true; } // Attempt to create a shuffle mask from a VSELECT condition mask. static bool createShuffleMaskFromVSELECT(SmallVectorImpl &Mask, SDValue Cond) { if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) return false; unsigned Size = Cond.getValueType().getVectorNumElements(); Mask.resize(Size, SM_SentinelUndef); for (int i = 0; i != (int)Size; ++i) { SDValue CondElt = Cond.getOperand(i); Mask[i] = i; // Arbitrarily choose from the 2nd operand if the select condition element // is undef. // TODO: Can we do better by matching patterns such as even/odd? if (CondElt.isUndef() || isNullConstant(CondElt)) Mask[i] += Size; } return true; } // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd // instructions. static bool isUnpackWdShuffleMask(ArrayRef Mask, MVT VT) { if (VT != MVT::v8i32 && VT != MVT::v8f32) return false; SmallVector Unpcklwd; createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true, /* Unary = */ false); SmallVector Unpckhwd; createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false, /* Unary = */ false); bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) || isTargetShuffleEquivalent(Mask, Unpckhwd)); return IsUnpackwdMask; } static bool is128BitUnpackShuffleMask(ArrayRef Mask) { // Create 128-bit vector type based on mask size. MVT EltVT = MVT::getIntegerVT(128 / Mask.size()); MVT VT = MVT::getVectorVT(EltVT, Mask.size()); // We can't assume a canonical shuffle mask, so try the commuted version too. SmallVector CommutedMask(Mask.begin(), Mask.end()); ShuffleVectorSDNode::commuteMask(CommutedMask); // Match any of unary/binary or low/high. for (unsigned i = 0; i != 4; ++i) { SmallVector UnpackMask; createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2); if (isTargetShuffleEquivalent(Mask, UnpackMask) || isTargetShuffleEquivalent(CommutedMask, UnpackMask)) return true; } return false; } /// Return true if a shuffle mask chooses elements identically in its top and /// bottom halves. For example, any splat mask has the same top and bottom /// halves. If an element is undefined in only one half of the mask, the halves /// are not considered identical. static bool hasIdenticalHalvesShuffleMask(ArrayRef Mask) { assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask"); unsigned HalfSize = Mask.size() / 2; for (unsigned i = 0; i != HalfSize; ++i) { if (Mask[i] != Mask[i + HalfSize]) return false; } return true; } /// Get a 4-lane 8-bit shuffle immediate for a mask. /// /// This helper function produces an 8-bit shuffle immediate corresponding to /// the ubiquitous shuffle encoding scheme used in x86 instructions for /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for /// example. /// /// NB: We rely heavily on "undef" masks preserving the input lane. static unsigned getV4X86ShuffleImm(ArrayRef Mask) { assert(Mask.size() == 4 && "Only 4-lane shuffle masks"); assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!"); assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!"); assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!"); assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!"); unsigned Imm = 0; Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0; Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2; Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4; Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6; return Imm; } static SDValue getV4X86ShuffleImm8ForMask(ArrayRef Mask, const SDLoc &DL, SelectionDAG &DAG) { return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8); } // The Shuffle result is as follow: // 0*a[0]0*a[1]...0*a[n] , n >=0 where a[] elements in a ascending order. // Each Zeroable's element correspond to a particular Mask's element. // As described in computeZeroableShuffleElements function. // // The function looks for a sub-mask that the nonzero elements are in // increasing order. If such sub-mask exist. The function returns true. static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef Mask, const EVT &VectorType, bool &IsZeroSideLeft) { int NextElement = -1; // Check if the Mask's nonzero elements are in increasing order. for (int i = 0, e = Mask.size(); i < e; i++) { // Checks if the mask's zeros elements are built from only zeros. assert(Mask[i] >= -1 && "Out of bound mask element!"); if (Mask[i] < 0) return false; if (Zeroable[i]) continue; // Find the lowest non zero element if (NextElement < 0) { NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0; IsZeroSideLeft = NextElement != 0; } // Exit if the mask's non zero elements are not in increasing order. if (NextElement != Mask[i]) return false; NextElement++; } return true; } /// Try to lower a shuffle with a single PSHUFB of V1 or V2. static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int Size = Mask.size(); int LaneSize = 128 / VT.getScalarSizeInBits(); const int NumBytes = VT.getSizeInBits() / 8; const int NumEltBytes = VT.getScalarSizeInBits() / 8; assert((Subtarget.hasSSSE3() && VT.is128BitVector()) || (Subtarget.hasAVX2() && VT.is256BitVector()) || (Subtarget.hasBWI() && VT.is512BitVector())); SmallVector PSHUFBMask(NumBytes); // Sign bit set in i8 mask means zero element. SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8); SDValue V; for (int i = 0; i < NumBytes; ++i) { int M = Mask[i / NumEltBytes]; if (M < 0) { PSHUFBMask[i] = DAG.getUNDEF(MVT::i8); continue; } if (Zeroable[i / NumEltBytes]) { PSHUFBMask[i] = ZeroMask; continue; } // We can only use a single input of V1 or V2. SDValue SrcV = (M >= Size ? V2 : V1); if (V && V != SrcV) return SDValue(); V = SrcV; M %= Size; // PSHUFB can't cross lanes, ensure this doesn't happen. if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize)) return SDValue(); M = M % LaneSize; M = M * NumEltBytes + (i % NumEltBytes); PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8); } assert(V && "Failed to find a source input"); MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes); return DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V), DAG.getBuildVector(I8VT, DL, PSHUFBMask))); } static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl); // X86 has dedicated shuffle that can be lowered to VEXPAND static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT, const APInt &Zeroable, ArrayRef Mask, SDValue &V1, SDValue &V2, SelectionDAG &DAG, const X86Subtarget &Subtarget) { bool IsLeftZeroSide = true; if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(), IsLeftZeroSide)) return SDValue(); unsigned VEXPANDMask = (~Zeroable).getZExtValue(); MVT IntegerType = MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType); unsigned NumElts = VT.getVectorNumElements(); assert((NumElts == 4 || NumElts == 8 || NumElts == 16) && "Unexpected number of vector elements"); SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts), Subtarget, DAG, DL); SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL); SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1; return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask); } static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, unsigned &UnpackOpcode, bool IsUnary, ArrayRef TargetMask, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { int NumElts = VT.getVectorNumElements(); bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true; for (int i = 0; i != NumElts; i += 2) { int M1 = TargetMask[i + 0]; int M2 = TargetMask[i + 1]; Undef1 &= (SM_SentinelUndef == M1); Undef2 &= (SM_SentinelUndef == M2); Zero1 &= isUndefOrZero(M1); Zero2 &= isUndefOrZero(M2); } assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) && "Zeroable shuffle detected"); // Attempt to match the target mask against the unpack lo/hi mask patterns. SmallVector Unpckl, Unpckh; createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary); if (isTargetShuffleEquivalent(TargetMask, Unpckl)) { UnpackOpcode = X86ISD::UNPCKL; V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2)); V1 = (Undef1 ? DAG.getUNDEF(VT) : V1); return true; } createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary); if (isTargetShuffleEquivalent(TargetMask, Unpckh)) { UnpackOpcode = X86ISD::UNPCKH; V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2)); V1 = (Undef1 ? DAG.getUNDEF(VT) : V1); return true; } // If an unary shuffle, attempt to match as an unpack lo/hi with zero. if (IsUnary && (Zero1 || Zero2)) { // Don't bother if we can blend instead. if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) && isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0)) return false; bool MatchLo = true, MatchHi = true; for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) { int M = TargetMask[i]; // Ignore if the input is known to be zero or the index is undef. if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) || (M == SM_SentinelUndef)) continue; MatchLo &= (M == Unpckl[i]); MatchHi &= (M == Unpckh[i]); } if (MatchLo || MatchHi) { UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH; V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1; V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1; return true; } } // If a binary shuffle, commute and try again. if (!IsUnary) { ShuffleVectorSDNode::commuteMask(Unpckl); if (isTargetShuffleEquivalent(TargetMask, Unpckl)) { UnpackOpcode = X86ISD::UNPCKL; std::swap(V1, V2); return true; } ShuffleVectorSDNode::commuteMask(Unpckh); if (isTargetShuffleEquivalent(TargetMask, Unpckh)) { UnpackOpcode = X86ISD::UNPCKH; std::swap(V1, V2); return true; } } return false; } // X86 has dedicated unpack instructions that can handle specific blend // operations: UNPCKH and UNPCKL. static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { SmallVector Unpckl; createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false); if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2); SmallVector Unpckh; createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false); if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2); // Commute and try again. ShuffleVectorSDNode::commuteMask(Unpckl); if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1); ShuffleVectorSDNode::commuteMask(Unpckh); if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1); return SDValue(); } static bool matchShuffleAsVPMOV(ArrayRef Mask, bool SwappedOps, int Delta) { int Size = (int)Mask.size(); int Split = Size / Delta; int TruncatedVectorStart = SwappedOps ? Size : 0; // Match for mask starting with e.g.: <8, 10, 12, 14,... or <0, 2, 4, 6,... if (!isSequentialOrUndefInRange(Mask, 0, Split, TruncatedVectorStart, Delta)) return false; // The rest of the mask should not refer to the truncated vector's elements. if (isAnyInRange(Mask.slice(Split, Size - Split), TruncatedVectorStart, TruncatedVectorStart + Size)) return false; return true; } // Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction. // // An example is the following: // // t0: ch = EntryToken // t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0 // t25: v4i32 = truncate t2 // t41: v8i16 = bitcast t25 // t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16, // Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0> // t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21 // t18: v2i64 = bitcast t51 // // Without avx512vl, this is lowered to: // // vpmovqd %zmm0, %ymm0 // vpshufb {{.*#+}} xmm0 = // xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero // // But when avx512vl is available, one can just use a single vpmovdw // instruction. static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef Mask, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (VT != MVT::v16i8 && VT != MVT::v8i16) return SDValue(); if (Mask.size() != VT.getVectorNumElements()) return SDValue(); bool SwappedOps = false; if (!ISD::isBuildVectorAllZeros(V2.getNode())) { if (!ISD::isBuildVectorAllZeros(V1.getNode())) return SDValue(); std::swap(V1, V2); SwappedOps = true; } // Look for: // // bitcast (truncate <8 x i32> %vec to <8 x i16>) to <16 x i8> // bitcast (truncate <4 x i64> %vec to <4 x i32>) to <8 x i16> // // and similar ones. if (V1.getOpcode() != ISD::BITCAST) return SDValue(); if (V1.getOperand(0).getOpcode() != ISD::TRUNCATE) return SDValue(); SDValue Src = V1.getOperand(0).getOperand(0); MVT SrcVT = Src.getSimpleValueType(); // The vptrunc** instructions truncating 128 bit and 256 bit vectors // are only available with avx512vl. if (!SrcVT.is512BitVector() && !Subtarget.hasVLX()) return SDValue(); // Down Convert Word to Byte is only available with avx512bw. The case with // 256-bit output doesn't contain a shuffle and is therefore not handled here. if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 && !Subtarget.hasBWI()) return SDValue(); // The first half/quarter of the mask should refer to every second/fourth // element of the vector truncated and bitcasted. if (!matchShuffleAsVPMOV(Mask, SwappedOps, 2) && !matchShuffleAsVPMOV(Mask, SwappedOps, 4)) return SDValue(); return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src); } // X86 has dedicated pack instructions that can handle specific truncation // operations: PACKSS and PACKUS. static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, unsigned &PackOpcode, ArrayRef TargetMask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { unsigned NumElts = VT.getVectorNumElements(); unsigned BitSize = VT.getScalarSizeInBits(); MVT PackSVT = MVT::getIntegerVT(BitSize * 2); MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2); auto MatchPACK = [&](SDValue N1, SDValue N2) { SDValue VV1 = DAG.getBitcast(PackVT, N1); SDValue VV2 = DAG.getBitcast(PackVT, N2); if (Subtarget.hasSSE41() || PackSVT == MVT::i16) { APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize); if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) && (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) { V1 = VV1; V2 = VV2; SrcVT = PackVT; PackOpcode = X86ISD::PACKUS; return true; } } if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) && (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) { V1 = VV1; V2 = VV2; SrcVT = PackVT; PackOpcode = X86ISD::PACKSS; return true; } return false; }; // Try binary shuffle. SmallVector BinaryMask; createPackShuffleMask(VT, BinaryMask, false); if (isTargetShuffleEquivalent(TargetMask, BinaryMask, V1, V2)) if (MatchPACK(V1, V2)) return true; // Try unary shuffle. SmallVector UnaryMask; createPackShuffleMask(VT, UnaryMask, true); if (isTargetShuffleEquivalent(TargetMask, UnaryMask, V1)) if (MatchPACK(V1, V1)) return true; return false; } static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT PackVT; unsigned PackOpcode; if (matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG, Subtarget)) return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1), DAG.getBitcast(PackVT, V2)); return SDValue(); } /// Try to emit a bitmask instruction for a shuffle. /// /// This handles cases where we can model a blend exactly as a bitmask due to /// one of the inputs being zeroable. static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT MaskVT = VT; MVT EltVT = VT.getVectorElementType(); SDValue Zero, AllOnes; // Use f64 if i64 isn't legal. if (EltVT == MVT::i64 && !Subtarget.is64Bit()) { EltVT = MVT::f64; MaskVT = MVT::getVectorVT(EltVT, Mask.size()); } MVT LogicVT = VT; if (EltVT == MVT::f32 || EltVT == MVT::f64) { Zero = DAG.getConstantFP(0.0, DL, EltVT); AllOnes = DAG.getConstantFP( APFloat::getAllOnesValue(EltVT.getSizeInBits(), true), DL, EltVT); LogicVT = MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size()); } else { Zero = DAG.getConstant(0, DL, EltVT); AllOnes = DAG.getAllOnesConstant(DL, EltVT); } SmallVector VMaskOps(Mask.size(), Zero); SDValue V; for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Zeroable[i]) continue; if (Mask[i] % Size != i) return SDValue(); // Not a blend. if (!V) V = Mask[i] < Size ? V1 : V2; else if (V != (Mask[i] < Size ? V1 : V2)) return SDValue(); // Can only let one input through the mask. VMaskOps[i] = AllOnes; } if (!V) return SDValue(); // No non-zeroable elements! SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps); VMask = DAG.getBitcast(LogicVT, VMask); V = DAG.getBitcast(LogicVT, V); SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask); return DAG.getBitcast(VT, And); } /// Try to emit a blend instruction for a shuffle using bit math. /// /// This is used as a fallback approach when first class blend instructions are /// unavailable. Currently it is only suitable for integer vectors, but could /// be generalized for floating point vectors if desirable. static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { assert(VT.isInteger() && "Only supports integer vector types!"); MVT EltVT = VT.getVectorElementType(); SDValue Zero = DAG.getConstant(0, DL, EltVT); SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT); SmallVector MaskOps; for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size) return SDValue(); // Shuffled input! MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero); } SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps); V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask); V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2); return DAG.getNode(ISD::OR, DL, VT, V1, V2); } static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG); static bool matchShuffleAsBlend(SDValue V1, SDValue V2, MutableArrayRef Mask, const APInt &Zeroable, bool &ForceV1Zero, bool &ForceV2Zero, uint64_t &BlendMask) { bool V1IsZeroOrUndef = V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode()); bool V2IsZeroOrUndef = V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode()); BlendMask = 0; ForceV1Zero = false, ForceV2Zero = false; assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask"); // Attempt to generate the binary blend mask. If an input is zero then // we can use any lane. for (int i = 0, Size = Mask.size(); i < Size; ++i) { int M = Mask[i]; if (M == SM_SentinelUndef) continue; if (M == i) continue; if (M == i + Size) { BlendMask |= 1ull << i; continue; } if (Zeroable[i]) { if (V1IsZeroOrUndef) { ForceV1Zero = true; Mask[i] = i; continue; } if (V2IsZeroOrUndef) { ForceV2Zero = true; BlendMask |= 1ull << i; Mask[i] = i + Size; continue; } } return false; } return true; } static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size, int Scale) { uint64_t ScaledMask = 0; for (int i = 0; i != Size; ++i) if (BlendMask & (1ull << i)) ScaledMask |= ((1ull << Scale) - 1) << (i * Scale); return ScaledMask; } /// Try to emit a blend instruction for a shuffle. /// /// This doesn't do any checks for the availability of instructions for blending /// these values. It relies on the availability of the X86ISD::BLENDI pattern to /// be matched in the backend with the type given. What it does check for is /// that the shuffle mask is a blend, or convertible into a blend with zero. static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Original, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { uint64_t BlendMask = 0; bool ForceV1Zero = false, ForceV2Zero = false; SmallVector Mask(Original.begin(), Original.end()); if (!matchShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero, BlendMask)) return SDValue(); // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs. if (ForceV1Zero) V1 = getZeroVector(VT, Subtarget, DAG, DL); if (ForceV2Zero) V2 = getZeroVector(VT, Subtarget, DAG, DL); switch (VT.SimpleTy) { case MVT::v4i64: case MVT::v8i32: assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!"); LLVM_FALLTHROUGH; case MVT::v4f64: case MVT::v8f32: assert(Subtarget.hasAVX() && "256-bit float blends require AVX!"); LLVM_FALLTHROUGH; case MVT::v2f64: case MVT::v2i64: case MVT::v4f32: case MVT::v4i32: case MVT::v8i16: assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!"); return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, DAG.getTargetConstant(BlendMask, DL, MVT::i8)); case MVT::v16i16: { assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!"); SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { // We can lower these with PBLENDW which is mirrored across 128-bit lanes. assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!"); BlendMask = 0; for (int i = 0; i < 8; ++i) if (RepeatedMask[i] >= 8) BlendMask |= 1ull << i; return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, DAG.getTargetConstant(BlendMask, DL, MVT::i8)); } // Use PBLENDW for lower/upper lanes and then blend lanes. // TODO - we should allow 2 PBLENDW here and leave shuffle combine to // merge to VSELECT where useful. uint64_t LoMask = BlendMask & 0xFF; uint64_t HiMask = (BlendMask >> 8) & 0xFF; if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) { SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, DAG.getTargetConstant(LoMask, DL, MVT::i8)); SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, DAG.getTargetConstant(HiMask, DL, MVT::i8)); return DAG.getVectorShuffle( MVT::v16i16, DL, Lo, Hi, {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31}); } LLVM_FALLTHROUGH; } case MVT::v32i8: assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!"); LLVM_FALLTHROUGH; case MVT::v16i8: { assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!"); // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB. if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Masked; if (Subtarget.hasBWI() && Subtarget.hasVLX()) { MVT IntegerType = MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType); return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG); } // Scale the blend by the number of bytes per element. int Scale = VT.getScalarSizeInBits() / 8; // This form of blend is always done on bytes. Compute the byte vector // type. MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); // x86 allows load folding with blendvb from the 2nd source operand. But // we are still using LLVM select here (see comment below), so that's V1. // If V2 can be load-folded and V1 cannot be load-folded, then commute to // allow that load-folding possibility. if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) { ShuffleVectorSDNode::commuteMask(Mask); std::swap(V1, V2); } // Compute the VSELECT mask. Note that VSELECT is really confusing in the // mix of LLVM's code generator and the x86 backend. We tell the code // generator that boolean values in the elements of an x86 vector register // are -1 for true and 0 for false. We then use the LLVM semantics of 'true' // mapping a select to operand #1, and 'false' mapping to operand #2. The // reality in x86 is that vector masks (pre-AVX-512) use only the high bit // of the element (the remaining are ignored) and 0 in that high bit would // mean operand #1 while 1 in the high bit would mean operand #2. So while // the LLVM model for boolean values in vector elements gets the relevant // bit set, it is set backwards and over constrained relative to x86's // actual model. SmallVector VSELECTMask; for (int i = 0, Size = Mask.size(); i < Size; ++i) for (int j = 0; j < Scale; ++j) VSELECTMask.push_back( Mask[i] < 0 ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8)); V1 = DAG.getBitcast(BlendVT, V1); V2 = DAG.getBitcast(BlendVT, V2); return DAG.getBitcast( VT, DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2)); } case MVT::v16f32: case MVT::v8f64: case MVT::v8i64: case MVT::v16i32: case MVT::v32i16: case MVT::v64i8: { // Attempt to lower to a bitmask if we can. Only if not optimizing for size. bool OptForSize = DAG.shouldOptForSize(); if (!OptForSize) { if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Masked; } // Otherwise load an immediate into a GPR, cast to k-register, and use a // masked move. MVT IntegerType = MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType); return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG); } default: llvm_unreachable("Not a supported integer vector type!"); } } /// Try to lower as a blend of elements from two inputs followed by /// a single-input permutation. /// /// This matches the pattern where we can blend elements from two inputs and /// then reduce the shuffle to a single-input permutation. static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG, bool ImmBlends = false) { // We build up the blend mask while checking whether a blend is a viable way // to reduce the shuffle. SmallVector BlendMask(Mask.size(), -1); SmallVector PermuteMask(Mask.size(), -1); for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Mask[i] < 0) continue; assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds."); if (BlendMask[Mask[i] % Size] < 0) BlendMask[Mask[i] % Size] = Mask[i]; else if (BlendMask[Mask[i] % Size] != Mask[i]) return SDValue(); // Can't blend in the needed input! PermuteMask[i] = Mask[i] % Size; } // If only immediate blends, then bail if the blend mask can't be widened to // i16. unsigned EltSize = VT.getScalarSizeInBits(); if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask)) return SDValue(); SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask); } /// Try to lower as an unpack of elements from two inputs followed by /// a single-input permutation. /// /// This matches the pattern where we can unpack elements from two inputs and /// then reduce the shuffle to a single-input (wider) permutation. static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { int NumElts = Mask.size(); int NumLanes = VT.getSizeInBits() / 128; int NumLaneElts = NumElts / NumLanes; int NumHalfLaneElts = NumLaneElts / 2; bool MatchLo = true, MatchHi = true; SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)}; // Determine UNPCKL/UNPCKH type and operand order. for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) { for (int Elt = 0; Elt != NumLaneElts; ++Elt) { int M = Mask[Lane + Elt]; if (M < 0) continue; SDValue &Op = Ops[Elt & 1]; if (M < NumElts && (Op.isUndef() || Op == V1)) Op = V1; else if (NumElts <= M && (Op.isUndef() || Op == V2)) Op = V2; else return SDValue(); int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts; MatchLo &= isUndefOrInRange(M, Lo, Mid) || isUndefOrInRange(M, NumElts + Lo, NumElts + Mid); MatchHi &= isUndefOrInRange(M, Mid, Hi) || isUndefOrInRange(M, NumElts + Mid, NumElts + Hi); if (!MatchLo && !MatchHi) return SDValue(); } } assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI"); // Now check that each pair of elts come from the same unpack pair // and set the permute mask based on each pair. // TODO - Investigate cases where we permute individual elements. SmallVector PermuteMask(NumElts, -1); for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) { for (int Elt = 0; Elt != NumLaneElts; Elt += 2) { int M0 = Mask[Lane + Elt + 0]; int M1 = Mask[Lane + Elt + 1]; if (0 <= M0 && 0 <= M1 && (M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts)) return SDValue(); if (0 <= M0) PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts)); if (0 <= M1) PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1; } } unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH; SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops); return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask); } /// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then /// permuting the elements of the result in place. static SDValue lowerShuffleAsByteRotateAndPermute( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) || (VT.is256BitVector() && !Subtarget.hasAVX2()) || (VT.is512BitVector() && !Subtarget.hasBWI())) return SDValue(); // We don't currently support lane crossing permutes. if (is128BitLaneCrossingShuffleMask(VT, Mask)) return SDValue(); int Scale = VT.getScalarSizeInBits() / 8; int NumLanes = VT.getSizeInBits() / 128; int NumElts = VT.getVectorNumElements(); int NumEltsPerLane = NumElts / NumLanes; // Determine range of mask elts. bool Blend1 = true; bool Blend2 = true; std::pair Range1 = std::make_pair(INT_MAX, INT_MIN); std::pair Range2 = std::make_pair(INT_MAX, INT_MIN); for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) { for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) { int M = Mask[Lane + Elt]; if (M < 0) continue; if (M < NumElts) { Blend1 &= (M == (Lane + Elt)); assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask"); M = M % NumEltsPerLane; Range1.first = std::min(Range1.first, M); Range1.second = std::max(Range1.second, M); } else { M -= NumElts; Blend2 &= (M == (Lane + Elt)); assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask"); M = M % NumEltsPerLane; Range2.first = std::min(Range2.first, M); Range2.second = std::max(Range2.second, M); } } } // Bail if we don't need both elements. // TODO - it might be worth doing this for unary shuffles if the permute // can be widened. if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) || !(0 <= Range2.first && Range2.second < NumEltsPerLane)) return SDValue(); if (VT.getSizeInBits() > 128 && (Blend1 || Blend2)) return SDValue(); // Rotate the 2 ops so we can access both ranges, then permute the result. auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) { MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); SDValue Rotate = DAG.getBitcast( VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi), DAG.getBitcast(ByteVT, Lo), DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8))); SmallVector PermMask(NumElts, SM_SentinelUndef); for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) { for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) { int M = Mask[Lane + Elt]; if (M < 0) continue; if (M < NumElts) PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane); else PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane); } } return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask); }; // Check if the ranges are small enough to rotate from either direction. if (Range2.second < Range1.first) return RotateAndPermute(V1, V2, Range1.first, 0); if (Range1.second < Range2.first) return RotateAndPermute(V2, V1, Range2.first, NumElts); return SDValue(); } /// Generic routine to decompose a shuffle and blend into independent /// blends and permutes. /// /// This matches the extremely common pattern for handling combined /// shuffle+blend operations on newer X86 ISAs where we have very fast blend /// operations. It will try to pick the best arrangement of shuffles and /// blends. static SDValue lowerShuffleAsDecomposedShuffleBlend( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // Shuffle the input elements into the desired positions in V1 and V2 and // blend them together. SmallVector V1Mask(Mask.size(), -1); SmallVector V2Mask(Mask.size(), -1); SmallVector BlendMask(Mask.size(), -1); for (int i = 0, Size = Mask.size(); i < Size; ++i) if (Mask[i] >= 0 && Mask[i] < Size) { V1Mask[i] = Mask[i]; BlendMask[i] = i; } else if (Mask[i] >= Size) { V2Mask[i] = Mask[i] - Size; BlendMask[i] = i + Size; } // Try to lower with the simpler initial blend/unpack/rotate strategies unless // one of the input shuffles would be a no-op. We prefer to shuffle inputs as // the shuffle may be able to fold with a load or other benefit. However, when // we'll have to do 2x as many shuffles in order to achieve this, a 2-input // pre-shuffle first is a better strategy. if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) { // Only prefer immediate blends to unpack/rotate. if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG, true)) return BlendPerm; if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG)) return UnpackPerm; if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute( DL, VT, V1, V2, Mask, Subtarget, DAG)) return RotatePerm; // Unpack/rotate failed - try again with variable blends. if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG)) return BlendPerm; } V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); } /// Try to lower a vector shuffle as a rotation. /// /// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512. static int matchShuffleAsRotate(SDValue &V1, SDValue &V2, ArrayRef Mask) { int NumElts = Mask.size(); // We need to detect various ways of spelling a rotation: // [11, 12, 13, 14, 15, 0, 1, 2] // [-1, 12, 13, 14, -1, -1, 1, -1] // [-1, -1, -1, -1, -1, -1, 1, 2] // [ 3, 4, 5, 6, 7, 8, 9, 10] // [-1, 4, 5, 6, -1, -1, 9, -1] // [-1, 4, 5, 6, -1, -1, -1, -1] int Rotation = 0; SDValue Lo, Hi; for (int i = 0; i < NumElts; ++i) { int M = Mask[i]; assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) && "Unexpected mask index."); if (M < 0) continue; // Determine where a rotated vector would have started. int StartIdx = i - (M % NumElts); if (StartIdx == 0) // The identity rotation isn't interesting, stop. return -1; // If we found the tail of a vector the rotation must be the missing // front. If we found the head of a vector, it must be how much of the // head. int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx; if (Rotation == 0) Rotation = CandidateRotation; else if (Rotation != CandidateRotation) // The rotations don't match, so we can't match this mask. return -1; // Compute which value this mask is pointing at. SDValue MaskV = M < NumElts ? V1 : V2; // Compute which of the two target values this index should be assigned // to. This reflects whether the high elements are remaining or the low // elements are remaining. SDValue &TargetV = StartIdx < 0 ? Hi : Lo; // Either set up this value if we've not encountered it before, or check // that it remains consistent. if (!TargetV) TargetV = MaskV; else if (TargetV != MaskV) // This may be a rotation, but it pulls from the inputs in some // unsupported interleaving. return -1; } // Check that we successfully analyzed the mask, and normalize the results. assert(Rotation != 0 && "Failed to locate a viable rotation!"); assert((Lo || Hi) && "Failed to find a rotated input vector!"); if (!Lo) Lo = Hi; else if (!Hi) Hi = Lo; V1 = Lo; V2 = Hi; return Rotation; } /// Try to lower a vector shuffle as a byte rotation. /// /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will /// try to generically lower a vector shuffle through such an pattern. It /// does not check for the profitability of lowering either as PALIGNR or /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form. /// This matches shuffle vectors that look like: /// /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2] /// /// Essentially it concatenates V1 and V2, shifts right by some number of /// elements, and takes the low elements as the result. Note that while this is /// specified as a *right shift* because x86 is little-endian, it is a *left /// rotate* of the vector lanes. static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, ArrayRef Mask) { // Don't accept any shuffles with zero elements. if (any_of(Mask, [](int M) { return M == SM_SentinelZero; })) return -1; // PALIGNR works on 128-bit lanes. SmallVector RepeatedMask; if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask)) return -1; int Rotation = matchShuffleAsRotate(V1, V2, RepeatedMask); if (Rotation <= 0) return -1; // PALIGNR rotates bytes, so we need to scale the // rotation based on how many bytes are in the vector lane. int NumElts = RepeatedMask.size(); int Scale = 16 / NumElts; return Rotation * Scale; } static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); SDValue Lo = V1, Hi = V2; int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask); if (ByteRotation <= 0) return SDValue(); // Cast the inputs to i8 vector of correct length to match PALIGNR or // PSLLDQ/PSRLDQ. MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); Lo = DAG.getBitcast(ByteVT, Lo); Hi = DAG.getBitcast(ByteVT, Hi); // SSSE3 targets can use the palignr instruction. if (Subtarget.hasSSSE3()) { assert((!VT.is512BitVector() || Subtarget.hasBWI()) && "512-bit PALIGNR requires BWI instructions"); return DAG.getBitcast( VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi, DAG.getTargetConstant(ByteRotation, DL, MVT::i8))); } assert(VT.is128BitVector() && "Rotate-based lowering only supports 128-bit lowering!"); assert(Mask.size() <= 16 && "Can shuffle at most 16 bytes in a 128-bit vector!"); assert(ByteVT == MVT::v16i8 && "SSE2 rotate lowering only needed for v16i8!"); // Default SSE2 implementation int LoByteShift = 16 - ByteRotation; int HiByteShift = ByteRotation; SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo, DAG.getTargetConstant(LoByteShift, DL, MVT::i8)); SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi, DAG.getTargetConstant(HiByteShift, DL, MVT::i8)); return DAG.getBitcast(VT, DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift)); } /// Try to lower a vector shuffle as a dword/qword rotation. /// /// AVX512 has a VALIGND/VALIGNQ instructions that will do an arbitrary /// rotation of the concatenation of two vectors; This routine will /// try to generically lower a vector shuffle through such an pattern. /// /// Essentially it concatenates V1 and V2, shifts right by some number of /// elements, and takes the low elements as the result. Note that while this is /// specified as a *right shift* because x86 is little-endian, it is a *left /// rotate* of the vector lanes. static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) && "Only 32-bit and 64-bit elements are supported!"); // 128/256-bit vectors are only supported with VLX. assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector())) && "VLX required for 128/256-bit vectors"); SDValue Lo = V1, Hi = V2; int Rotation = matchShuffleAsRotate(Lo, Hi, Mask); if (Rotation <= 0) return SDValue(); return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi, DAG.getTargetConstant(Rotation, DL, MVT::i8)); } /// Try to lower a vector shuffle as a byte shift sequence. static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); assert(VT.is128BitVector() && "Only 128-bit vectors supported"); // We need a shuffle that has zeros at one/both ends and a sequential // shuffle from one source within. unsigned ZeroLo = Zeroable.countTrailingOnes(); unsigned ZeroHi = Zeroable.countLeadingOnes(); if (!ZeroLo && !ZeroHi) return SDValue(); unsigned NumElts = Mask.size(); unsigned Len = NumElts - (ZeroLo + ZeroHi); if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo])) return SDValue(); unsigned Scale = VT.getScalarSizeInBits() / 8; ArrayRef StubMask = Mask.slice(ZeroLo, Len); if (!isUndefOrInRange(StubMask, 0, NumElts) && !isUndefOrInRange(StubMask, NumElts, 2 * NumElts)) return SDValue(); SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2; Res = DAG.getBitcast(MVT::v16i8, Res); // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an // inner sequential set of elements, possibly offset: // 01234567 --> zzzzzz01 --> 1zzzzzzz // 01234567 --> 4567zzzz --> zzzzz456 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz if (ZeroLo == 0) { unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts); Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, DAG.getTargetConstant(Scale * Shift, DL, MVT::i8)); Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res, DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8)); } else if (ZeroHi == 0) { unsigned Shift = Mask[ZeroLo] % NumElts; Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res, DAG.getTargetConstant(Scale * Shift, DL, MVT::i8)); Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8)); } else if (!Subtarget.hasSSSE3()) { // If we don't have PSHUFB then its worth avoiding an AND constant mask // by performing 3 byte shifts. Shuffle combining can kick in above that. // TODO: There may be some cases where VSH{LR}DQ+PAND is still better. unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts); Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, DAG.getTargetConstant(Scale * Shift, DL, MVT::i8)); Shift += Mask[ZeroLo] % NumElts; Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res, DAG.getTargetConstant(Scale * Shift, DL, MVT::i8)); Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8)); } else return SDValue(); return DAG.getBitcast(VT, Res); } /// Try to lower a vector shuffle as a bit shift (shifts in zeros). /// /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function /// matches elements from one of the input vectors shuffled to the left or /// right with zeroable elements 'shifted in'. It handles both the strictly /// bit-wise element shifts and the byte shift across an entire 128-bit double /// quad word lane. /// /// PSHL : (little-endian) left bit shift. /// [ zz, 0, zz, 2 ] /// [ -1, 4, zz, -1 ] /// PSRL : (little-endian) right bit shift. /// [ 1, zz, 3, zz] /// [ -1, -1, 7, zz] /// PSLLDQ : (little-endian) left byte shift /// [ zz, 0, 1, 2, 3, 4, 5, 6] /// [ zz, zz, -1, -1, 2, 3, 4, -1] /// [ zz, zz, zz, zz, zz, zz, -1, 1] /// PSRLDQ : (little-endian) right byte shift /// [ 5, 6, 7, zz, zz, zz, zz, zz] /// [ -1, 5, 6, 7, zz, zz, zz, zz] /// [ 1, 2, -1, -1, -1, -1, zz, zz] static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, unsigned ScalarSizeInBits, ArrayRef Mask, int MaskOffset, const APInt &Zeroable, const X86Subtarget &Subtarget) { int Size = Mask.size(); unsigned SizeInBits = Size * ScalarSizeInBits; auto CheckZeros = [&](int Shift, int Scale, bool Left) { for (int i = 0; i < Size; i += Scale) for (int j = 0; j < Shift; ++j) if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))]) return false; return true; }; auto MatchShift = [&](int Shift, int Scale, bool Left) { for (int i = 0; i != Size; i += Scale) { unsigned Pos = Left ? i + Shift : i; unsigned Low = Left ? i : i + Shift; unsigned Len = Scale - Shift; if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset)) return -1; } int ShiftEltBits = ScalarSizeInBits * Scale; bool ByteShift = ShiftEltBits > 64; Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI) : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI); int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1); // Normalize the scale for byte shifts to still produce an i64 element // type. Scale = ByteShift ? Scale / 2 : Scale; // We need to round trip through the appropriate type for the shift. MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale); ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8) : MVT::getVectorVT(ShiftSVT, Size / Scale); return (int)ShiftAmt; }; // SSE/AVX supports logical shifts up to 64-bit integers - so we can just // keep doubling the size of the integer elements up to that. We can // then shift the elements of the integer vector by whole multiples of // their width within the elements of the larger integer vector. Test each // multiple to see if we can find a match with the moved element indices // and that the shifted in elements are all zeroable. unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128); for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2) for (int Shift = 1; Shift != Scale; ++Shift) for (bool Left : {true, false}) if (CheckZeros(Shift, Scale, Left)) { int ShiftAmt = MatchShift(Shift, Scale, Left); if (0 < ShiftAmt) return ShiftAmt; } // no match return -1; } static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int Size = Mask.size(); assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); MVT ShiftVT; SDValue V = V1; unsigned Opcode; // Try to match shuffle against V1 shift. int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget); // If V1 failed, try to match shuffle against V2 shift. if (ShiftAmt < 0) { ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, Size, Zeroable, Subtarget); V = V2; } if (ShiftAmt < 0) return SDValue(); assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) && "Illegal integer vector type"); V = DAG.getBitcast(ShiftVT, V); V = DAG.getNode(Opcode, DL, ShiftVT, V, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8)); return DAG.getBitcast(VT, V); } // EXTRQ: Extract Len elements from lower half of source, starting at Idx. // Remainder of lower half result is zero and upper half is all undef. static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef Mask, uint64_t &BitLen, uint64_t &BitIdx, const APInt &Zeroable) { int Size = Mask.size(); int HalfSize = Size / 2; assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask"); // Upper half must be undefined. if (!isUndefUpperHalf(Mask)) return false; // Determine the extraction length from the part of the // lower half that isn't zeroable. int Len = HalfSize; for (; Len > 0; --Len) if (!Zeroable[Len - 1]) break; assert(Len > 0 && "Zeroable shuffle mask"); // Attempt to match first Len sequential elements from the lower half. SDValue Src; int Idx = -1; for (int i = 0; i != Len; ++i) { int M = Mask[i]; if (M == SM_SentinelUndef) continue; SDValue &V = (M < Size ? V1 : V2); M = M % Size; // The extracted elements must start at a valid index and all mask // elements must be in the lower half. if (i > M || M >= HalfSize) return false; if (Idx < 0 || (Src == V && Idx == (M - i))) { Src = V; Idx = M - i; continue; } return false; } if (!Src || Idx < 0) return false; assert((Idx + Len) <= HalfSize && "Illegal extraction mask"); BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; V1 = Src; return true; } // INSERTQ: Extract lowest Len elements from lower half of second source and // insert over first source, starting at Idx. // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... } static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef Mask, uint64_t &BitLen, uint64_t &BitIdx) { int Size = Mask.size(); int HalfSize = Size / 2; assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); // Upper half must be undefined. if (!isUndefUpperHalf(Mask)) return false; for (int Idx = 0; Idx != HalfSize; ++Idx) { SDValue Base; // Attempt to match first source from mask before insertion point. if (isUndefInRange(Mask, 0, Idx)) { /* EMPTY */ } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) { Base = V1; } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) { Base = V2; } else { continue; } // Extend the extraction length looking to match both the insertion of // the second source and the remaining elements of the first. for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) { SDValue Insert; int Len = Hi - Idx; // Match insertion. if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) { Insert = V1; } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) { Insert = V2; } else { continue; } // Match the remaining elements of the lower half. if (isUndefInRange(Mask, Hi, HalfSize - Hi)) { /* EMPTY */ } else if ((!Base || (Base == V1)) && isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) { Base = V1; } else if ((!Base || (Base == V2)) && isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Size + Hi)) { Base = V2; } else { continue; } BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; V1 = Base; V2 = Insert; return true; } } return false; } /// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ. static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, SelectionDAG &DAG) { uint64_t BitLen, BitIdx; if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1, DAG.getTargetConstant(BitLen, DL, MVT::i8), DAG.getTargetConstant(BitIdx, DL, MVT::i8)); if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx)) return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT), V2 ? V2 : DAG.getUNDEF(VT), DAG.getTargetConstant(BitLen, DL, MVT::i8), DAG.getTargetConstant(BitIdx, DL, MVT::i8)); return SDValue(); } /// Lower a vector shuffle as a zero or any extension. /// /// Given a specific number of elements, element bit width, and extension /// stride, produce either a zero or any extension based on the available /// features of the subtarget. The extended elements are consecutive and /// begin and can start from an offsetted element index in the input; to /// avoid excess shuffling the offset must either being in the bottom lane /// or at the start of a higher lane. All extended elements must be from /// the same lane. static SDValue lowerShuffleAsSpecificZeroOrAnyExtend( const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Scale > 1 && "Need a scale to extend."); int EltBits = VT.getScalarSizeInBits(); int NumElements = VT.getVectorNumElements(); int NumEltsPerLane = 128 / EltBits; int OffsetLane = Offset / NumEltsPerLane; assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && "Only 8, 16, and 32 bit elements can be extended."); assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits."); assert(0 <= Offset && "Extension offset must be positive."); assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) && "Extension offset must be in the first lane or start an upper lane."); // Check that an index is in same lane as the base offset. auto SafeOffset = [&](int Idx) { return OffsetLane == (Idx / NumEltsPerLane); }; // Shift along an input so that the offset base moves to the first element. auto ShuffleOffset = [&](SDValue V) { if (!Offset) return V; SmallVector ShMask((unsigned)NumElements, -1); for (int i = 0; i * Scale < NumElements; ++i) { int SrcIdx = i + Offset; ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1; } return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask); }; // Found a valid a/zext mask! Try various lowering strategies based on the // input type and available ISA extensions. if (Subtarget.hasSSE41()) { // Not worth offsetting 128-bit vectors if scale == 2, a pattern using // PUNPCK will catch this in a later shuffle match. if (Offset && Scale == 2 && VT.is128BitVector()) return SDValue(); MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), NumElements / Scale); InputV = ShuffleOffset(InputV); InputV = getExtendInVec(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND, DL, ExtVT, InputV, DAG); return DAG.getBitcast(VT, InputV); } assert(VT.is128BitVector() && "Only 128-bit vectors can be extended."); // For any extends we can cheat for larger element sizes and use shuffle // instructions that can fold with a load and/or copy. if (AnyExt && EltBits == 32) { int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1, -1}; return DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, InputV), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); } if (AnyExt && EltBits == 16 && Scale > 2) { int PSHUFDMask[4] = {Offset / 2, -1, SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1}; InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, InputV), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)); int PSHUFWMask[4] = {1, -1, -1, -1}; unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW; return DAG.getBitcast( VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, InputV), getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG))); } // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes // to 64-bits. if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) { assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!"); assert(VT.is128BitVector() && "Unexpected vector width!"); int LoIdx = Offset * EltBits; SDValue Lo = DAG.getBitcast( MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, DAG.getTargetConstant(EltBits, DL, MVT::i8), DAG.getTargetConstant(LoIdx, DL, MVT::i8))); if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1)) return DAG.getBitcast(VT, Lo); int HiIdx = (Offset + 1) * EltBits; SDValue Hi = DAG.getBitcast( MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, DAG.getTargetConstant(EltBits, DL, MVT::i8), DAG.getTargetConstant(HiIdx, DL, MVT::i8))); return DAG.getBitcast(VT, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi)); } // If this would require more than 2 unpack instructions to expand, use // pshufb when available. We can only use more than 2 unpack instructions // when zero extending i8 elements which also makes it easier to use pshufb. if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) { assert(NumElements == 16 && "Unexpected byte vector width!"); SDValue PSHUFBMask[16]; for (int i = 0; i < 16; ++i) { int Idx = Offset + (i / Scale); if ((i % Scale == 0 && SafeOffset(Idx))) { PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8); continue; } PSHUFBMask[i] = AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8); } InputV = DAG.getBitcast(MVT::v16i8, InputV); return DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV, DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask))); } // If we are extending from an offset, ensure we start on a boundary that // we can unpack from. int AlignToUnpack = Offset % (NumElements / Scale); if (AlignToUnpack) { SmallVector ShMask((unsigned)NumElements, -1); for (int i = AlignToUnpack; i < NumElements; ++i) ShMask[i - AlignToUnpack] = i; InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask); Offset -= AlignToUnpack; } // Otherwise emit a sequence of unpacks. do { unsigned UnpackLoHi = X86ISD::UNPCKL; if (Offset >= (NumElements / 2)) { UnpackLoHi = X86ISD::UNPCKH; Offset -= (NumElements / 2); } MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements); SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT) : getZeroVector(InputVT, Subtarget, DAG, DL); InputV = DAG.getBitcast(InputVT, InputV); InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext); Scale /= 2; EltBits *= 2; NumElements /= 2; } while (Scale > 1); return DAG.getBitcast(VT, InputV); } /// Try to lower a vector shuffle as a zero extension on any microarch. /// /// This routine will try to do everything in its power to cleverly lower /// a shuffle which happens to match the pattern of a zero extend. It doesn't /// check for the profitability of this lowering, it tries to aggressively /// match this pattern. It will use all of the micro-architectural details it /// can to emit an efficient lowering. It handles both blends with all-zero /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to /// masking out later). /// /// The reason we have dedicated lowering for zext-style shuffles is that they /// are both incredibly common and often quite performance sensitive. static SDValue lowerShuffleAsZeroOrAnyExtend( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int Bits = VT.getSizeInBits(); int NumLanes = Bits / 128; int NumElements = VT.getVectorNumElements(); int NumEltsPerLane = NumElements / NumLanes; assert(VT.getScalarSizeInBits() <= 32 && "Exceeds 32-bit integer zero extension limit"); assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size"); // Define a helper function to check a particular ext-scale and lower to it if // valid. auto Lower = [&](int Scale) -> SDValue { SDValue InputV; bool AnyExt = true; int Offset = 0; int Matches = 0; for (int i = 0; i < NumElements; ++i) { int M = Mask[i]; if (M < 0) continue; // Valid anywhere but doesn't tell us anything. if (i % Scale != 0) { // Each of the extended elements need to be zeroable. if (!Zeroable[i]) return SDValue(); // We no longer are in the anyext case. AnyExt = false; continue; } // Each of the base elements needs to be consecutive indices into the // same input vector. SDValue V = M < NumElements ? V1 : V2; M = M % NumElements; if (!InputV) { InputV = V; Offset = M - (i / Scale); } else if (InputV != V) return SDValue(); // Flip-flopping inputs. // Offset must start in the lowest 128-bit lane or at the start of an // upper lane. // FIXME: Is it ever worth allowing a negative base offset? if (!((0 <= Offset && Offset < NumEltsPerLane) || (Offset % NumEltsPerLane) == 0)) return SDValue(); // If we are offsetting, all referenced entries must come from the same // lane. if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane)) return SDValue(); if ((M % NumElements) != (Offset + (i / Scale))) return SDValue(); // Non-consecutive strided elements. Matches++; } // If we fail to find an input, we have a zero-shuffle which should always // have already been handled. // FIXME: Maybe handle this here in case during blending we end up with one? if (!InputV) return SDValue(); // If we are offsetting, don't extend if we only match a single input, we // can always do better by using a basic PSHUF or PUNPCK. if (Offset != 0 && Matches < 2) return SDValue(); return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG); }; // The widest scale possible for extending is to a 64-bit integer. assert(Bits % 64 == 0 && "The number of bits in a vector must be divisible by 64 on x86!"); int NumExtElements = Bits / 64; // Each iteration, try extending the elements half as much, but into twice as // many elements. for (; NumExtElements < NumElements; NumExtElements *= 2) { assert(NumElements % NumExtElements == 0 && "The input vector size must be divisible by the extended size."); if (SDValue V = Lower(NumElements / NumExtElements)) return V; } // General extends failed, but 128-bit vectors may be able to use MOVQ. if (Bits != 128) return SDValue(); // Returns one of the source operands if the shuffle can be reduced to a // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits. auto CanZExtLowHalf = [&]() { for (int i = NumElements / 2; i != NumElements; ++i) if (!Zeroable[i]) return SDValue(); if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0)) return V1; if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements)) return V2; return SDValue(); }; if (SDValue V = CanZExtLowHalf()) { V = DAG.getBitcast(MVT::v2i64, V); V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V); return DAG.getBitcast(VT, V); } // No viable ext lowering found. return SDValue(); } /// Try to get a scalar value for a specific element of a vector. /// /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar. static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG) { MVT VT = V.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); V = peekThroughBitcasts(V); // If the bitcasts shift the element size, we can't extract an equivalent // element from it. MVT NewVT = V.getSimpleValueType(); if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) return SDValue(); if (V.getOpcode() == ISD::BUILD_VECTOR || (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) { // Ensure the scalar operand is the same size as the destination. // FIXME: Add support for scalar truncation where possible. SDValue S = V.getOperand(Idx); if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits()) return DAG.getBitcast(EltVT, S); } return SDValue(); } /// Helper to test for a load that can be folded with x86 shuffles. /// /// This is particularly important because the set of instructions varies /// significantly based on whether the operand is a load or not. static bool isShuffleFoldableLoad(SDValue V) { V = peekThroughBitcasts(V); return ISD::isNON_EXTLoad(V.getNode()); } /// Try to lower insertion of a single element into a zero vector. /// /// This is a common pattern that we have especially efficient patterns to lower /// across all subtarget feature sets. static SDValue lowerShuffleAsElementInsertion( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT ExtVT = VT; MVT EltVT = VT.getVectorElementType(); int V2Index = find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) - Mask.begin(); bool IsV1Zeroable = true; for (int i = 0, Size = Mask.size(); i < Size; ++i) if (i != V2Index && !Zeroable[i]) { IsV1Zeroable = false; break; } // Check for a single input from a SCALAR_TO_VECTOR node. // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and // all the smarts here sunk into that routine. However, the current // lowering of BUILD_VECTOR makes that nearly impossible until the old // vector shuffle lowering is dead. SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(), DAG); if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) { // We need to zext the scalar if it is smaller than an i32. V2S = DAG.getBitcast(EltVT, V2S); if (EltVT == MVT::i8 || EltVT == MVT::i16) { // Using zext to expand a narrow element won't work for non-zero // insertions. if (!IsV1Zeroable) return SDValue(); // Zero-extend directly to i32. ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32); V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S); } V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S); } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 || EltVT == MVT::i16) { // Either not inserting from the low element of the input or the input // element size is too small to use VZEXT_MOVL to clear the high bits. return SDValue(); } if (!IsV1Zeroable) { // If V1 can't be treated as a zero vector we have fewer options to lower // this. We can't support integer vectors or non-zero targets cheaply, and // the V1 elements can't be permuted in any way. assert(VT == ExtVT && "Cannot change extended type when non-zeroable!"); if (!VT.isFloatingPoint() || V2Index != 0) return SDValue(); SmallVector V1Mask(Mask.begin(), Mask.end()); V1Mask[V2Index] = -1; if (!isNoopShuffleMask(V1Mask)) return SDValue(); if (!VT.is128BitVector()) return SDValue(); // Otherwise, use MOVSD or MOVSS. assert((EltVT == MVT::f32 || EltVT == MVT::f64) && "Only two types of floating point element types to handle!"); return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL, ExtVT, V1, V2); } // This lowering only works for the low element with floating point vectors. if (VT.isFloatingPoint() && V2Index != 0) return SDValue(); V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2); if (ExtVT != VT) V2 = DAG.getBitcast(VT, V2); if (V2Index != 0) { // If we have 4 or fewer lanes we can cheaply shuffle the element into // the desired position. Otherwise it is more efficient to do a vector // shift left. We know that we can do a vector shift left because all // the inputs are zero. if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) { SmallVector V2Shuffle(Mask.size(), 1); V2Shuffle[V2Index] = 0; V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle); } else { V2 = DAG.getBitcast(MVT::v16i8, V2); V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2, DAG.getTargetConstant( V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8)); V2 = DAG.getBitcast(VT, V2); } } return V2; } /// Try to lower broadcast of a single - truncated - integer element, /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements. /// /// This assumes we have AVX2. static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0, int BroadcastIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.hasAVX2() && "We can only lower integer broadcasts with AVX2!"); EVT EltVT = VT.getVectorElementType(); EVT V0VT = V0.getValueType(); assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!"); assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!"); EVT V0EltVT = V0VT.getVectorElementType(); if (!V0EltVT.isInteger()) return SDValue(); const unsigned EltSize = EltVT.getSizeInBits(); const unsigned V0EltSize = V0EltVT.getSizeInBits(); // This is only a truncation if the original element type is larger. if (V0EltSize <= EltSize) return SDValue(); assert(((V0EltSize % EltSize) == 0) && "Scalar type sizes must all be powers of 2 on x86!"); const unsigned V0Opc = V0.getOpcode(); const unsigned Scale = V0EltSize / EltSize; const unsigned V0BroadcastIdx = BroadcastIdx / Scale; if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) && V0Opc != ISD::BUILD_VECTOR) return SDValue(); SDValue Scalar = V0.getOperand(V0BroadcastIdx); // If we're extracting non-least-significant bits, shift so we can truncate. // Hopefully, we can fold away the trunc/srl/load into the broadcast. // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd. if (const int OffsetIdx = BroadcastIdx % Scale) Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar, DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8)); return DAG.getNode(X86ISD::VBROADCAST, DL, VT, DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar)); } /// Test whether this can be lowered with a single SHUFPS instruction. /// /// This is used to disable more specialized lowerings when the shufps lowering /// will happen to be efficient. static bool isSingleSHUFPSMask(ArrayRef Mask) { // This routine only handles 128-bit shufps. assert(Mask.size() == 4 && "Unsupported mask size!"); assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!"); assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!"); assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!"); assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!"); // To lower with a single SHUFPS we need to have the low half and high half // each requiring a single input. if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4)) return false; if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4)) return false; return true; } /// If we are extracting two 128-bit halves of a vector and shuffling the /// result, match that to a 256-bit AVX2 vperm* instruction to avoid a /// multi-shuffle lowering. static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, SDValue N1, ArrayRef Mask, SelectionDAG &DAG) { EVT VT = N0.getValueType(); assert((VT.is128BitVector() && (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) && "VPERM* family of shuffles requires 32-bit or 64-bit elements"); // Check that both sources are extracts of the same source vector. if (!N0.hasOneUse() || !N1.hasOneUse() || N0.getOpcode() != ISD::EXTRACT_SUBVECTOR || N1.getOpcode() != ISD::EXTRACT_SUBVECTOR || N0.getOperand(0) != N1.getOperand(0)) return SDValue(); SDValue WideVec = N0.getOperand(0); EVT WideVT = WideVec.getValueType(); if (!WideVT.is256BitVector() || !isa(N0.getOperand(1)) || !isa(N1.getOperand(1))) return SDValue(); // Match extracts of each half of the wide source vector. Commute the shuffle // if the extract of the low half is N1. unsigned NumElts = VT.getVectorNumElements(); SmallVector NewMask(Mask.begin(), Mask.end()); const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1); const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1); if (ExtIndex1 == 0 && ExtIndex0 == NumElts) ShuffleVectorSDNode::commuteMask(NewMask); else if (ExtIndex0 != 0 || ExtIndex1 != NumElts) return SDValue(); // Final bailout: if the mask is simple, we are better off using an extract // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps // because that avoids a constant load from memory. if (NumElts == 4 && (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask))) return SDValue(); // Extend the shuffle mask with undef elements. NewMask.append(NumElts, -1); // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT), NewMask); // This is free: ymm -> xmm. return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf, DAG.getIntPtrConstant(0, DL)); } /// Try to lower broadcast of a single element. /// /// For convenience, this code also bundles all of the subtarget feature set /// filtering. While a little annoying to re-dispatch on type here, there isn't /// a convenient way to factor it out. static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) || (Subtarget.hasAVX() && VT.isFloatingPoint()) || (Subtarget.hasAVX2() && VT.isInteger()))) return SDValue(); // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise // we can only broadcast from a register with AVX2. unsigned NumElts = Mask.size(); unsigned NumEltBits = VT.getScalarSizeInBits(); unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2()) ? X86ISD::MOVDDUP : X86ISD::VBROADCAST; bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2(); // Check that the mask is a broadcast. int BroadcastIdx = -1; for (int i = 0; i != (int)NumElts; ++i) { SmallVector BroadcastMask(NumElts, i); if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) { BroadcastIdx = i; break; } } if (BroadcastIdx < 0) return SDValue(); assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with " "a sorted mask where the broadcast " "comes from V1."); // Go up the chain of (vector) values to find a scalar load that we can // combine with the broadcast. int BitOffset = BroadcastIdx * NumEltBits; SDValue V = V1; for (;;) { switch (V.getOpcode()) { case ISD::BITCAST: { V = V.getOperand(0); continue; } case ISD::CONCAT_VECTORS: { int OpBitWidth = V.getOperand(0).getValueSizeInBits(); int OpIdx = BitOffset / OpBitWidth; V = V.getOperand(OpIdx); BitOffset %= OpBitWidth; continue; } case ISD::INSERT_SUBVECTOR: { SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1); auto ConstantIdx = dyn_cast(V.getOperand(2)); if (!ConstantIdx) break; int EltBitWidth = VOuter.getScalarValueSizeInBits(); int Idx = (int)ConstantIdx->getZExtValue(); int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements(); int BeginOffset = Idx * EltBitWidth; int EndOffset = BeginOffset + NumSubElts * EltBitWidth; if (BeginOffset <= BitOffset && BitOffset < EndOffset) { BitOffset -= BeginOffset; V = VInner; } else { V = VOuter; } continue; } } break; } assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset"); BroadcastIdx = BitOffset / NumEltBits; // Do we need to bitcast the source to retrieve the original broadcast index? bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits; // Check if this is a broadcast of a scalar. We special case lowering // for scalars so that we can more effectively fold with loads. // If the original value has a larger element type than the shuffle, the // broadcast element is in essence truncated. Make that explicit to ease // folding. if (BitCastSrc && VT.isInteger()) if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast( DL, VT, V, BroadcastIdx, Subtarget, DAG)) return TruncBroadcast; MVT BroadcastVT = VT; // Also check the simpler case, where we can directly reuse the scalar. if (!BitCastSrc && ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) || (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) { V = V.getOperand(BroadcastIdx); // If we can't broadcast from a register, check that the input is a load. if (!BroadcastFromReg && !isShuffleFoldableLoad(V)) return SDValue(); } else if (MayFoldLoad(V) && cast(V)->isSimple()) { // 32-bit targets need to load i64 as a f64 and then bitcast the result. if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) { BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements()); Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2()) ? X86ISD::MOVDDUP : Opcode; } // If we are broadcasting a load that is only used by the shuffle // then we can reduce the vector load to the broadcasted scalar load. LoadSDNode *Ld = cast(V); SDValue BaseAddr = Ld->getOperand(1); EVT SVT = BroadcastVT.getScalarType(); unsigned Offset = BroadcastIdx * SVT.getStoreSize(); assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset"); SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL); V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr, DAG.getMachineFunction().getMachineMemOperand( Ld->getMemOperand(), Offset, SVT.getStoreSize())); DAG.makeEquivalentMemoryOrdering(Ld, V); } else if (!BroadcastFromReg) { // We can't broadcast from a vector register. return SDValue(); } else if (BitOffset != 0) { // We can only broadcast from the zero-element of a vector register, // but it can be advantageous to broadcast from the zero-element of a // subvector. if (!VT.is256BitVector() && !VT.is512BitVector()) return SDValue(); // VPERMQ/VPERMPD can perform the cross-lane shuffle directly. if (VT == MVT::v4f64 || VT == MVT::v4i64) return SDValue(); // Only broadcast the zero-element of a 128-bit subvector. if ((BitOffset % 128) != 0) return SDValue(); assert((BitOffset % V.getScalarValueSizeInBits()) == 0 && "Unexpected bit-offset"); assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) && "Unexpected vector size"); unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits(); V = extract128BitVector(V, ExtractIdx, DAG, DL); } if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, DAG.getBitcast(MVT::f64, V)); // Bitcast back to the same scalar type as BroadcastVT. if (V.getValueType().getScalarType() != BroadcastVT.getScalarType()) { assert(NumEltBits == BroadcastVT.getScalarSizeInBits() && "Unexpected vector element size"); MVT ExtVT; if (V.getValueType().isVector()) { unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits; ExtVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts); } else { ExtVT = BroadcastVT.getScalarType(); } V = DAG.getBitcast(ExtVT, V); } // 32-bit targets need to load i64 as a f64 and then bitcast the result. if (!Subtarget.is64Bit() && V.getValueType() == MVT::i64) { V = DAG.getBitcast(MVT::f64, V); unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements(); BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts); } // We only support broadcasting from 128-bit vectors to minimize the // number of patterns we need to deal with in isel. So extract down to // 128-bits, removing as many bitcasts as possible. if (V.getValueSizeInBits() > 128) { MVT ExtVT = V.getSimpleValueType().getScalarType(); ExtVT = MVT::getVectorVT(ExtVT, 128 / ExtVT.getScalarSizeInBits()); V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL); V = DAG.getBitcast(ExtVT, V); } return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V)); } // Check for whether we can use INSERTPS to perform the shuffle. We only use // INSERTPS when the V1 elements are already in the correct locations // because otherwise we can just always use two SHUFPS instructions which // are much smaller to encode than a SHUFPS and an INSERTPS. We can also // perform INSERTPS if a single V1 element is out of place and all V2 // elements are zeroable. static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, const APInt &Zeroable, ArrayRef Mask, SelectionDAG &DAG) { assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!"); assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); // Attempt to match INSERTPS with one element from VA or VB being // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask // are updated. auto matchAsInsertPS = [&](SDValue VA, SDValue VB, ArrayRef CandidateMask) { unsigned ZMask = 0; int VADstIndex = -1; int VBDstIndex = -1; bool VAUsedInPlace = false; for (int i = 0; i < 4; ++i) { // Synthesize a zero mask from the zeroable elements (includes undefs). if (Zeroable[i]) { ZMask |= 1 << i; continue; } // Flag if we use any VA inputs in place. if (i == CandidateMask[i]) { VAUsedInPlace = true; continue; } // We can only insert a single non-zeroable element. if (VADstIndex >= 0 || VBDstIndex >= 0) return false; if (CandidateMask[i] < 4) { // VA input out of place for insertion. VADstIndex = i; } else { // VB input for insertion. VBDstIndex = i; } } // Don't bother if we have no (non-zeroable) element for insertion. if (VADstIndex < 0 && VBDstIndex < 0) return false; // Determine element insertion src/dst indices. The src index is from the // start of the inserted vector, not the start of the concatenated vector. unsigned VBSrcIndex = 0; if (VADstIndex >= 0) { // If we have a VA input out of place, we use VA as the V2 element // insertion and don't use the original V2 at all. VBSrcIndex = CandidateMask[VADstIndex]; VBDstIndex = VADstIndex; VB = VA; } else { VBSrcIndex = CandidateMask[VBDstIndex] - 4; } // If no V1 inputs are used in place, then the result is created only from // the zero mask and the V2 insertion - so remove V1 dependency. if (!VAUsedInPlace) VA = DAG.getUNDEF(MVT::v4f32); // Update V1, V2 and InsertPSMask accordingly. V1 = VA; V2 = VB; // Insert the V2 element into the desired position. InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask; assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); return true; }; if (matchAsInsertPS(V1, V2, Mask)) return true; // Commute and try again. SmallVector CommutedMask(Mask.begin(), Mask.end()); ShuffleVectorSDNode::commuteMask(CommutedMask); if (matchAsInsertPS(V2, V1, CommutedMask)) return true; return false; } static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); // Attempt to match the insertps pattern. unsigned InsertPSMask; if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG)) return SDValue(); // Insert the V2 element into the desired position. return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, DAG.getTargetConstant(InsertPSMask, DL, MVT::i8)); } /// Try to lower a shuffle as a permute of the inputs followed by an /// UNPCK instruction. /// /// This specifically targets cases where we end up with alternating between /// the two inputs, and so can permute them into something that feeds a single /// UNPCK instruction. Note that this routine only targets integer vectors /// because for floating point vectors we have a generalized SHUFPS lowering /// strategy that handles everything that doesn't *exactly* match an unpack, /// making this clever lowering unnecessary. static SDValue lowerShuffleAsPermuteAndUnpack( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(!VT.isFloatingPoint() && "This routine only supports integer vectors."); assert(VT.is128BitVector() && "This routine only works on 128-bit vectors."); assert(!V2.isUndef() && "This routine should only be used when blending two inputs."); assert(Mask.size() >= 2 && "Single element masks are invalid."); int Size = Mask.size(); int NumLoInputs = count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; }); int NumHiInputs = count_if(Mask, [Size](int M) { return M % Size >= Size / 2; }); bool UnpackLo = NumLoInputs >= NumHiInputs; auto TryUnpack = [&](int ScalarSize, int Scale) { SmallVector V1Mask((unsigned)Size, -1); SmallVector V2Mask((unsigned)Size, -1); for (int i = 0; i < Size; ++i) { if (Mask[i] < 0) continue; // Each element of the unpack contains Scale elements from this mask. int UnpackIdx = i / Scale; // We only handle the case where V1 feeds the first slots of the unpack. // We rely on canonicalization to ensure this is the case. if ((UnpackIdx % 2 == 0) != (Mask[i] < Size)) return SDValue(); // Setup the mask for this input. The indexing is tricky as we have to // handle the unpack stride. SmallVectorImpl &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask; VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] = Mask[i] % Size; } // If we will have to shuffle both inputs to use the unpack, check whether // we can just unpack first and shuffle the result. If so, skip this unpack. if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) return SDValue(); // Shuffle the inputs into place. V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); // Cast the inputs to the type we will use to unpack them. MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale); V1 = DAG.getBitcast(UnpackVT, V1); V2 = DAG.getBitcast(UnpackVT, V2); // Unpack the inputs and cast the result back to the desired type. return DAG.getBitcast( VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, UnpackVT, V1, V2)); }; // We try each unpack from the largest to the smallest to try and find one // that fits this mask. int OrigScalarSize = VT.getScalarSizeInBits(); for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize)) return Unpack; // If we're shuffling with a zero vector then we're better off not doing // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements. if (ISD::isBuildVectorAllZeros(V1.getNode()) || ISD::isBuildVectorAllZeros(V2.getNode())) return SDValue(); // If none of the unpack-rooted lowerings worked (or were profitable) try an // initial unpack. if (NumLoInputs == 0 || NumHiInputs == 0) { assert((NumLoInputs > 0 || NumHiInputs > 0) && "We have to have *some* inputs!"); int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0; // FIXME: We could consider the total complexity of the permute of each // possible unpacking. Or at the least we should consider how many // half-crossings are created. // FIXME: We could consider commuting the unpacks. SmallVector PermMask((unsigned)Size, -1); for (int i = 0; i < Size; ++i) { if (Mask[i] < 0) continue; assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!"); PermMask[i] = 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1); } return DAG.getVectorShuffle( VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT, V1, V2), DAG.getUNDEF(VT), PermMask); } return SDValue(); } /// Handle lowering of 2-lane 64-bit floating point shuffles. /// /// This is the basis function for the 2-lane 64-bit shuffles as we have full /// support for floating point shuffles but not integer shuffles. These /// instructions will incur a domain crossing penalty on some chips though so /// it is better to avoid lowering through this for integer vectors where /// possible. static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); if (V2.isUndef()) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Straight shuffle of a single input vector. Simulate this by using the // single input as both of the "inputs" to this instruction.. unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1); if (Subtarget.hasAVX()) { // If we have AVX, we can use VPERMILPS which will allow folding a load // into the shuffle. return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1, DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8)); } return DAG.getNode( X86ISD::SHUFP, DL, MVT::v2f64, Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1, Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1, DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8)); } assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!"); assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!"); assert(Mask[0] < 2 && "We sort V1 to be the first input."); assert(Mask[1] >= 2 && "We sort V2 to be the second input."); if (Subtarget.hasAVX2()) if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG)) return Extract; // When loading a scalar and then shuffling it into a vector we can often do // the insertion cheaply. if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Try inverting the insertion since for v2 masks it is easy to do and we // can't reliably sort the mask one way or the other. int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), Mask[1] < 0 ? -1 : (Mask[1] ^ 2)}; if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG)) return Insertion; // Try to use one of the special instruction patterns to handle two common // blend patterns if a zero-blend above didn't work. if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) || isShuffleEquivalent(V1, V2, Mask, {1, 3})) if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG)) // We can either use a special instruction to load over the low double or // to move just the low double. return DAG.getNode( X86ISD::MOVSD, DL, MVT::v2f64, V2, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S)); if (Subtarget.hasSSE41()) if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG)) return V; unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2, DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8)); } /// Handle lowering of 2-lane 64-bit integer shuffles. /// /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by /// the integer unit to minimize domain crossing penalties. However, for blends /// it falls back to the floating point shuffle operation with appropriate bit /// casting. static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); if (V2.isUndef()) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Straight shuffle of a single input vector. For everything from SSE2 // onward this has a single fast instruction with no scary immediates. // We have to map the mask as it is actually a v4i32 shuffle instruction. V1 = DAG.getBitcast(MVT::v4i32, V1); int WidenedMask[4] = { std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1, std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1}; return DAG.getBitcast( MVT::v2i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG))); } assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!"); assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!"); assert(Mask[0] < 2 && "We sort V1 to be the first input."); assert(Mask[1] >= 2 && "We sort V2 to be the second input."); if (Subtarget.hasAVX2()) if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG)) return Extract; // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // When loading a scalar and then shuffling it into a vector we can often do // the insertion cheaply. if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Try inverting the insertion since for v2 masks it is easy to do and we // can't reliably sort the mask one way or the other. int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2}; if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG)) return Insertion; // We have different paths for blend lowering, but they all must use the // *exact* same predicate. bool IsBlendSupported = Subtarget.hasSSE41(); if (IsBlendSupported) if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG)) return V; // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. if (Subtarget.hasSSSE3()) { if (Subtarget.hasVLX()) if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; } // If we have direct support for blends, we should lower by decomposing into // a permute. That will be faster than the domain cross. if (IsBlendSupported) return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG); // We implement this with SHUFPD which is pretty lame because it will likely // incur 2 cycles of stall for integer vectors on Nehalem and older chips. // However, all the alternatives are still more cycles and newer chips don't // have this problem. It would be really nice if x86 had better shuffles here. V1 = DAG.getBitcast(MVT::v2f64, V1); V2 = DAG.getBitcast(MVT::v2f64, V2); return DAG.getBitcast(MVT::v2i64, DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask)); } /// Lower a vector shuffle using the SHUFPS instruction. /// /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS. /// It makes no assumptions about whether this is the *best* lowering, it simply /// uses it. static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { SDValue LowV = V1, HighV = V2; int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]}; int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); if (NumV2Elements == 1) { int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin(); // Compute the index adjacent to V2Index and in the same half by toggling // the low bit. int V2AdjIndex = V2Index ^ 1; if (Mask[V2AdjIndex] < 0) { // Handles all the cases where we have a single V2 element and an undef. // This will only ever happen in the high lanes because we commute the // vector otherwise. if (V2Index < 2) std::swap(LowV, HighV); NewMask[V2Index] -= 4; } else { // Handle the case where the V2 element ends up adjacent to a V1 element. // To make this work, blend them together as the first step. int V1Index = V2AdjIndex; int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0}; V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1, getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG)); // Now proceed to reconstruct the final blend as we have the necessary // high or low half formed. if (V2Index < 2) { LowV = V2; HighV = V1; } else { HighV = V2; } NewMask[V1Index] = 2; // We put the V1 element in V2[2]. NewMask[V2Index] = 0; // We shifted the V2 element into V2[0]. } } else if (NumV2Elements == 2) { if (Mask[0] < 4 && Mask[1] < 4) { // Handle the easy case where we have V1 in the low lanes and V2 in the // high lanes. NewMask[2] -= 4; NewMask[3] -= 4; } else if (Mask[2] < 4 && Mask[3] < 4) { // We also handle the reversed case because this utility may get called // when we detect a SHUFPS pattern but can't easily commute the shuffle to // arrange things in the right direction. NewMask[0] -= 4; NewMask[1] -= 4; HighV = V1; LowV = V2; } else { // We have a mixture of V1 and V2 in both low and high lanes. Rather than // trying to place elements directly, just blend them and set up the final // shuffle to place them. // The first two blend mask elements are for V1, the second two are for // V2. int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1], Mask[2] < 4 ? Mask[2] : Mask[3], (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4, (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4}; V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG)); // Now we do a normal shuffle of V1 by giving V1 as both operands to // a blend. LowV = HighV = V1; NewMask[0] = Mask[0] < 4 ? 0 : 2; NewMask[1] = Mask[0] < 4 ? 2 : 0; NewMask[2] = Mask[2] < 4 ? 1 : 3; NewMask[3] = Mask[2] < 4 ? 3 : 1; } } return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV, getV4X86ShuffleImm8ForMask(NewMask, DL, DAG)); } /// Lower 4-lane 32-bit floating point shuffles. /// /// Uses instructions exclusively from the floating point unit to minimize /// domain crossing penalties, as these are sufficient to implement all v4f32 /// shuffles. static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); if (NumV2Elements == 0) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Use even/odd duplicate instructions for masks that match their pattern. if (Subtarget.hasSSE3()) { if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2})) return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1); if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3})) return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1); } if (Subtarget.hasAVX()) { // If we have AVX, we can use VPERMILPS which will allow folding a load // into the shuffle. return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid // in SSE1 because otherwise they are widened to v2f64 and never get here. if (!Subtarget.hasSSE2()) { if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1})) return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1); if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3})) return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1); } // Otherwise, use a straight shuffle of a single input vector. We pass the // input vector to both operands to simulate this with a SHUFPS. return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } if (Subtarget.hasAVX2()) if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG)) return Extract; // There are special ways we can lower some single-element blends. However, we // have custom ways we can lower more complex single-element blends below that // we defer to if both this and BLENDPS fail to match, so restrict this to // when the V2 input is targeting element 0 of the mask -- that is the fast // case here. if (NumV2Elements == 1 && Mask[0] >= 4) if (SDValue V = lowerShuffleAsElementInsertion( DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; if (Subtarget.hasSSE41()) { if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Use INSERTPS if we can complete the shuffle efficiently. if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG)) return V; if (!isSingleSHUFPSMask(Mask)) if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1, V2, Mask, DAG)) return BlendPerm; } // Use low/high mov instructions. These are only valid in SSE1 because // otherwise they are widened to v2f64 and never get here. if (!Subtarget.hasSSE2()) { if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2); if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7})) return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1); } // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG)) return V; // Otherwise fall back to a SHUFPS lowering strategy. return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG); } /// Lower 4-lane i32 vector shuffles. /// /// We try to handle these with integer-domain shuffles where we can, but for /// blends we use the floating point domain blend instructions. static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); if (NumV2Elements == 0) { // Try to use broadcast unless the mask only has one non-undef element. if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) { if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Broadcast; } // Straight shuffle of a single input vector. For everything from SSE2 // onward this has a single fast instruction with no scary immediates. // We coerce the shuffle pattern to be compatible with UNPCK instructions // but we aren't actually going to use the UNPCK instruction because doing // so prevents folding a load into this instruction or making a copy. const int UnpackLoMask[] = {0, 0, 1, 1}; const int UnpackHiMask[] = {2, 2, 3, 3}; if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1})) Mask = UnpackLoMask; else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3})) Mask = UnpackHiMask; return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } if (Subtarget.hasAVX2()) if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG)) return Extract; // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // There are special ways we can lower some single-element blends. if (NumV2Elements == 1) if (SDValue V = lowerShuffleAsElementInsertion( DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; // We have different paths for blend lowering, but they all must use the // *exact* same predicate. bool IsBlendSupported = Subtarget.hasSSE41(); if (IsBlendSupported) if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG)) return V; // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. if (Subtarget.hasSSSE3()) { if (Subtarget.hasVLX()) if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; } // Assume that a single SHUFPS is faster than an alternative sequence of // multiple instructions (even if the CPU has a domain penalty). // If some CPU is harmed by the domain switch, we can fix it in a later pass. if (!isSingleSHUFPSMask(Mask)) { // If we have direct support for blends, we should lower by decomposing into // a permute. That will be faster than the domain cross. if (IsBlendSupported) return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG); // Try to lower by permuting the inputs into an unpack instruction. if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Unpack; } // We implement this with SHUFPS because it can blend from two vectors. // Because we're going to eventually use SHUFPS, we use SHUFPS even to build // up the inputs, bypassing domain shift penalties that we would incur if we // directly used PSHUFD on Nehalem and older. For newer chips, this isn't // relevant. SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1); SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2); SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask); return DAG.getBitcast(MVT::v4i32, ShufPS); } /// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 /// shuffle lowering, and the most complex part. /// /// The lowering strategy is to try to form pairs of input lanes which are /// targeted at the same half of the final vector, and then use a dword shuffle /// to place them onto the right half, and finally unpack the paired lanes into /// their final position. /// /// The exact breakdown of how to form these dword pairs and align them on the /// correct sides is really tricky. See the comments within the function for /// more of the details. /// /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16 /// vector, form the analogous 128-bit 8-element Mask. static SDValue lowerV8I16GeneralSingleInputShuffle( const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!"); MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); assert(Mask.size() == 8 && "Shuffle mask length doesn't match!"); MutableArrayRef LoMask = Mask.slice(0, 4); MutableArrayRef HiMask = Mask.slice(4, 4); // Attempt to directly match PSHUFLW or PSHUFHW. if (isUndefOrInRange(LoMask, 0, 4) && isSequentialOrUndefInRange(HiMask, 0, 4, 4)) { return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, getV4X86ShuffleImm8ForMask(LoMask, DL, DAG)); } if (isUndefOrInRange(HiMask, 4, 8) && isSequentialOrUndefInRange(LoMask, 0, 4, 0)) { for (int i = 0; i != 4; ++i) HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4)); return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, getV4X86ShuffleImm8ForMask(HiMask, DL, DAG)); } SmallVector LoInputs; copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; }); array_pod_sort(LoInputs.begin(), LoInputs.end()); LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end()); SmallVector HiInputs; copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; }); array_pod_sort(HiInputs.begin(), HiInputs.end()); HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin(); int NumHToL = LoInputs.size() - NumLToL; int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin(); int NumHToH = HiInputs.size() - NumLToH; MutableArrayRef LToLInputs(LoInputs.data(), NumLToL); MutableArrayRef LToHInputs(HiInputs.data(), NumLToH); MutableArrayRef HToLInputs(LoInputs.data() + NumLToL, NumHToL); MutableArrayRef HToHInputs(HiInputs.data() + NumLToH, NumHToH); // If we are shuffling values from one half - check how many different DWORD // pairs we need to create. If only 1 or 2 then we can perform this as a // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below. auto ShuffleDWordPairs = [&](ArrayRef PSHUFHalfMask, ArrayRef PSHUFDMask, unsigned ShufWOp) { V = DAG.getNode(ShufWOp, DL, VT, V, getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG)); V = DAG.getBitcast(PSHUFDVT, V); V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V, getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)); return DAG.getBitcast(VT, V); }; if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) { int PSHUFDMask[4] = { -1, -1, -1, -1 }; SmallVector, 4> DWordPairs; int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2); // Collect the different DWORD pairs. for (int DWord = 0; DWord != 4; ++DWord) { int M0 = Mask[2 * DWord + 0]; int M1 = Mask[2 * DWord + 1]; M0 = (M0 >= 0 ? M0 % 4 : M0); M1 = (M1 >= 0 ? M1 % 4 : M1); if (M0 < 0 && M1 < 0) continue; bool Match = false; for (int j = 0, e = DWordPairs.size(); j < e; ++j) { auto &DWordPair = DWordPairs[j]; if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) && (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) { DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first); DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second); PSHUFDMask[DWord] = DOffset + j; Match = true; break; } } if (!Match) { PSHUFDMask[DWord] = DOffset + DWordPairs.size(); DWordPairs.push_back(std::make_pair(M0, M1)); } } if (DWordPairs.size() <= 2) { DWordPairs.resize(2, std::make_pair(-1, -1)); int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second, DWordPairs[1].first, DWordPairs[1].second}; if ((NumHToL + NumHToH) == 0) return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW); if ((NumLToL + NumLToH) == 0) return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW); } } // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all // such inputs we can swap two of the dwords across the half mark and end up // with <=2 inputs to each half in each half. Once there, we can fall through // to the generic code below. For example: // // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5] // // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half // and an existing 2-into-2 on the other half. In this case we may have to // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or // 1-into-3 which could cause us to cycle endlessly fixing each side in turn. // Fortunately, we don't have to handle anything but a 2-into-2 pattern // because any other situation (including a 3-into-1 or 1-into-3 in the other // half than the one we target for fixing) will be fixed when we re-enter this // path. We will also combine away any sequence of PSHUFD instructions that // result into a single instruction. Here is an example of the tricky case: // // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3] // // This now has a 1-into-3 in the high half! Instead, we do two shuffles: // // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h] // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6] // // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h] // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6] // // The result is fine to be handled by the generic logic. auto balanceSides = [&](ArrayRef AToAInputs, ArrayRef BToAInputs, ArrayRef BToBInputs, ArrayRef AToBInputs, int AOffset, int BOffset) { assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) && "Must call this with A having 3 or 1 inputs from the A half."); assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) && "Must call this with B having 1 or 3 inputs from the B half."); assert(AToAInputs.size() + BToAInputs.size() == 4 && "Must call this with either 3:1 or 1:3 inputs (summing to 4)."); bool ThreeAInputs = AToAInputs.size() == 3; // Compute the index of dword with only one word among the three inputs in // a half by taking the sum of the half with three inputs and subtracting // the sum of the actual three inputs. The difference is the remaining // slot. int ADWord = 0, BDWord = 0; int &TripleDWord = ThreeAInputs ? ADWord : BDWord; int &OneInputDWord = ThreeAInputs ? BDWord : ADWord; int TripleInputOffset = ThreeAInputs ? AOffset : BOffset; ArrayRef TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs; int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0]; int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset); int TripleNonInputIdx = TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0); TripleDWord = TripleNonInputIdx / 2; // We use xor with one to compute the adjacent DWord to whichever one the // OneInput is in. OneInputDWord = (OneInput / 2) ^ 1; // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA // and BToA inputs. If there is also such a problem with the BToB and AToB // inputs, we don't try to fix it necessarily -- we'll recurse and see it in // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it // is essential that we don't *create* a 3<-1 as then we might oscillate. if (BToBInputs.size() == 2 && AToBInputs.size() == 2) { // Compute how many inputs will be flipped by swapping these DWords. We // need // to balance this to ensure we don't form a 3-1 shuffle in the other // half. int NumFlippedAToBInputs = std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) + std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1); int NumFlippedBToBInputs = std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) + std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1); if ((NumFlippedAToBInputs == 1 && (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) || (NumFlippedBToBInputs == 1 && (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) { // We choose whether to fix the A half or B half based on whether that // half has zero flipped inputs. At zero, we may not be able to fix it // with that half. We also bias towards fixing the B half because that // will more commonly be the high half, and we have to bias one way. auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord, ArrayRef Inputs) { int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot. bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1); // Determine whether the free index is in the flipped dword or the // unflipped dword based on where the pinned index is. We use this bit // in an xor to conditionally select the adjacent dword. int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord)); bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx); if (IsFixIdxInput == IsFixFreeIdxInput) FixFreeIdx += 1; IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx); assert(IsFixIdxInput != IsFixFreeIdxInput && "We need to be changing the number of flipped inputs!"); int PSHUFHalfMask[] = {0, 1, 2, 3}; std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]); V = DAG.getNode( FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL, MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V, getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG)); for (int &M : Mask) if (M >= 0 && M == FixIdx) M = FixFreeIdx; else if (M >= 0 && M == FixFreeIdx) M = FixIdx; }; if (NumFlippedBToBInputs != 0) { int BPinnedIdx = BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput; FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs); } else { assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!"); int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput; FixFlippedInputs(APinnedIdx, ADWord, AToBInputs); } } } int PSHUFDMask[] = {0, 1, 2, 3}; PSHUFDMask[ADWord] = BDWord; PSHUFDMask[BDWord] = ADWord; V = DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); // Adjust the mask to match the new locations of A and B. for (int &M : Mask) if (M >= 0 && M/2 == ADWord) M = 2 * BDWord + M % 2; else if (M >= 0 && M/2 == BDWord) M = 2 * ADWord + M % 2; // Recurse back into this routine to re-compute state now that this isn't // a 3 and 1 problem. return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG); }; if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3)) return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4); if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3)) return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0); // At this point there are at most two inputs to the low and high halves from // each half. That means the inputs can always be grouped into dwords and // those dwords can then be moved to the correct half with a dword shuffle. // We use at most one low and one high word shuffle to collect these paired // inputs into dwords, and finally a dword shuffle to place them. int PSHUFLMask[4] = {-1, -1, -1, -1}; int PSHUFHMask[4] = {-1, -1, -1, -1}; int PSHUFDMask[4] = {-1, -1, -1, -1}; // First fix the masks for all the inputs that are staying in their // original halves. This will then dictate the targets of the cross-half // shuffles. auto fixInPlaceInputs = [&PSHUFDMask](ArrayRef InPlaceInputs, ArrayRef IncomingInputs, MutableArrayRef SourceHalfMask, MutableArrayRef HalfMask, int HalfOffset) { if (InPlaceInputs.empty()) return; if (InPlaceInputs.size() == 1) { SourceHalfMask[InPlaceInputs[0] - HalfOffset] = InPlaceInputs[0] - HalfOffset; PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2; return; } if (IncomingInputs.empty()) { // Just fix all of the in place inputs. for (int Input : InPlaceInputs) { SourceHalfMask[Input - HalfOffset] = Input - HalfOffset; PSHUFDMask[Input / 2] = Input / 2; } return; } assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!"); SourceHalfMask[InPlaceInputs[0] - HalfOffset] = InPlaceInputs[0] - HalfOffset; // Put the second input next to the first so that they are packed into // a dword. We find the adjacent index by toggling the low bit. int AdjIndex = InPlaceInputs[0] ^ 1; SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset; std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex); PSHUFDMask[AdjIndex / 2] = AdjIndex / 2; }; fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0); fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4); // Now gather the cross-half inputs and place them into a free dword of // their target half. // FIXME: This operation could almost certainly be simplified dramatically to // look more like the 3-1 fixing operation. auto moveInputsToRightHalf = [&PSHUFDMask]( MutableArrayRef IncomingInputs, ArrayRef ExistingInputs, MutableArrayRef SourceHalfMask, MutableArrayRef HalfMask, MutableArrayRef FinalSourceHalfMask, int SourceOffset, int DestOffset) { auto isWordClobbered = [](ArrayRef SourceHalfMask, int Word) { return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word; }; auto isDWordClobbered = [&isWordClobbered](ArrayRef SourceHalfMask, int Word) { int LowWord = Word & ~1; int HighWord = Word | 1; return isWordClobbered(SourceHalfMask, LowWord) || isWordClobbered(SourceHalfMask, HighWord); }; if (IncomingInputs.empty()) return; if (ExistingInputs.empty()) { // Map any dwords with inputs from them into the right half. for (int Input : IncomingInputs) { // If the source half mask maps over the inputs, turn those into // swaps and use the swapped lane. if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) { if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) { SourceHalfMask[SourceHalfMask[Input - SourceOffset]] = Input - SourceOffset; // We have to swap the uses in our half mask in one sweep. for (int &M : HalfMask) if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset) M = Input; else if (M == Input) M = SourceHalfMask[Input - SourceOffset] + SourceOffset; } else { assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == Input - SourceOffset && "Previous placement doesn't match!"); } // Note that this correctly re-maps both when we do a swap and when // we observe the other side of the swap above. We rely on that to // avoid swapping the members of the input list directly. Input = SourceHalfMask[Input - SourceOffset] + SourceOffset; } // Map the input's dword into the correct half. if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0) PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2; else assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == Input / 2 && "Previous placement doesn't match!"); } // And just directly shift any other-half mask elements to be same-half // as we will have mirrored the dword containing the element into the // same position within that half. for (int &M : HalfMask) if (M >= SourceOffset && M < SourceOffset + 4) { M = M - SourceOffset + DestOffset; assert(M >= 0 && "This should never wrap below zero!"); } return; } // Ensure we have the input in a viable dword of its current half. This // is particularly tricky because the original position may be clobbered // by inputs being moved and *staying* in that half. if (IncomingInputs.size() == 1) { if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) + SourceOffset; SourceHalfMask[InputFixed - SourceOffset] = IncomingInputs[0] - SourceOffset; std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0], InputFixed); IncomingInputs[0] = InputFixed; } } else if (IncomingInputs.size() == 2) { if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 || isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { // We have two non-adjacent or clobbered inputs we need to extract from // the source half. To do this, we need to map them into some adjacent // dword slot in the source mask. int InputsFixed[2] = {IncomingInputs[0] - SourceOffset, IncomingInputs[1] - SourceOffset}; // If there is a free slot in the source half mask adjacent to one of // the inputs, place the other input in it. We use (Index XOR 1) to // compute an adjacent index. if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) && SourceHalfMask[InputsFixed[0] ^ 1] < 0) { SourceHalfMask[InputsFixed[0]] = InputsFixed[0]; SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; InputsFixed[1] = InputsFixed[0] ^ 1; } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) && SourceHalfMask[InputsFixed[1] ^ 1] < 0) { SourceHalfMask[InputsFixed[1]] = InputsFixed[1]; SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0]; InputsFixed[0] = InputsFixed[1] ^ 1; } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 && SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) { // The two inputs are in the same DWord but it is clobbered and the // adjacent DWord isn't used at all. Move both inputs to the free // slot. SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0]; SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1]; InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1); InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1; } else { // The only way we hit this point is if there is no clobbering // (because there are no off-half inputs to this half) and there is no // free slot adjacent to one of the inputs. In this case, we have to // swap an input with a non-input. for (int i = 0; i < 4; ++i) assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) && "We can't handle any clobbers here!"); assert(InputsFixed[1] != (InputsFixed[0] ^ 1) && "Cannot have adjacent inputs here!"); SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1; // We also have to update the final source mask in this case because // it may need to undo the above swap. for (int &M : FinalSourceHalfMask) if (M == (InputsFixed[0] ^ 1) + SourceOffset) M = InputsFixed[1] + SourceOffset; else if (M == InputsFixed[1] + SourceOffset) M = (InputsFixed[0] ^ 1) + SourceOffset; InputsFixed[1] = InputsFixed[0] ^ 1; } // Point everything at the fixed inputs. for (int &M : HalfMask) if (M == IncomingInputs[0]) M = InputsFixed[0] + SourceOffset; else if (M == IncomingInputs[1]) M = InputsFixed[1] + SourceOffset; IncomingInputs[0] = InputsFixed[0] + SourceOffset; IncomingInputs[1] = InputsFixed[1] + SourceOffset; } } else { llvm_unreachable("Unhandled input size!"); } // Now hoist the DWord down to the right half. int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2; assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free"); PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2; for (int &M : HalfMask) for (int Input : IncomingInputs) if (M == Input) M = FreeDWord * 2 + Input % 2; }; moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask, /*SourceOffset*/ 4, /*DestOffset*/ 0); moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask, /*SourceOffset*/ 0, /*DestOffset*/ 4); // Now enact all the shuffles we've computed to move the inputs into their // target half. if (!isNoopShuffleMask(PSHUFLMask)) V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG)); if (!isNoopShuffleMask(PSHUFHMask)) V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG)); if (!isNoopShuffleMask(PSHUFDMask)) V = DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); // At this point, each half should contain all its inputs, and we can then // just shuffle them into their final position. assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 && "Failed to lift all the high half inputs to the low mask!"); assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 && "Failed to lift all the low half inputs to the high mask!"); // Do a half shuffle for the low mask. if (!isNoopShuffleMask(LoMask)) V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, getV4X86ShuffleImm8ForMask(LoMask, DL, DAG)); // Do a half shuffle with the high mask after shifting its values down. for (int &M : HiMask) if (M >= 0) M -= 4; if (!isNoopShuffleMask(HiMask)) V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, getV4X86ShuffleImm8ForMask(HiMask, DL, DAG)); return V; } /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the /// blend if only one input is used. static SDValue lowerShuffleAsBlendOfPSHUFBs( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) { assert(!is128BitLaneCrossingShuffleMask(VT, Mask) && "Lane crossing shuffle masks not supported"); int NumBytes = VT.getSizeInBits() / 8; int Size = Mask.size(); int Scale = NumBytes / Size; SmallVector V1Mask(NumBytes, DAG.getUNDEF(MVT::i8)); SmallVector V2Mask(NumBytes, DAG.getUNDEF(MVT::i8)); V1InUse = false; V2InUse = false; for (int i = 0; i < NumBytes; ++i) { int M = Mask[i / Scale]; if (M < 0) continue; const int ZeroMask = 0x80; int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask; int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale; if (Zeroable[i / Scale]) V1Idx = V2Idx = ZeroMask; V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8); V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8); V1InUse |= (ZeroMask != V1Idx); V2InUse |= (ZeroMask != V2Idx); } MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes); if (V1InUse) V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1), DAG.getBuildVector(ShufVT, DL, V1Mask)); if (V2InUse) V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2), DAG.getBuildVector(ShufVT, DL, V2Mask)); // If we need shuffled inputs from both, blend the two. SDValue V; if (V1InUse && V2InUse) V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2); else V = V1InUse ? V1 : V2; // Cast the result back to the correct type. return DAG.getBitcast(VT, V); } /// Generic lowering of 8-lane i16 shuffles. /// /// This handles both single-input shuffles and combined shuffle/blends with /// two inputs. The single input shuffles are immediately delegated to /// a dedicated lowering routine. /// /// The blends are lowered in one of three fundamental ways. If there are few /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle /// of the input is significantly cheaper when lowered as an interleaving of /// the two inputs, try to interleave them. Otherwise, blend the low and high /// halves of the inputs separately (making them have relatively few inputs) /// and then concatenate them. static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; }); if (NumV2Inputs == 0) { // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable, Subtarget, DAG)) return Shift; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG, Subtarget)) return V; // Try to use byte rotation instructions. if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask, Subtarget, DAG)) return Rotate; // Make a copy of the mask so it can be modified. SmallVector MutableMask(Mask.begin(), Mask.end()); return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask, Subtarget, DAG); } assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) && "All single-input shuffles should be canonicalized to be V1-input " "shuffles."); // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // See if we can use SSE4A Extraction / Insertion. if (Subtarget.hasSSE4A()) if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, Zeroable, DAG)) return V; // There are special ways we can lower some single-element blends. if (NumV2Inputs == 1) if (SDValue V = lowerShuffleAsElementInsertion( DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; // We have different paths for blend lowering, but they all must use the // *exact* same predicate. bool IsBlendSupported = Subtarget.hasSSE41(); if (IsBlendSupported) if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG, Subtarget)) return V; // Try to use byte rotation instructions. if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue BitBlend = lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG)) return BitBlend; // Try to use byte shift instructions to mask. if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; // Try to lower by permuting the inputs into an unpack instruction. if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) return Unpack; // If we can't directly blend but can use PSHUFB, that will be better as it // can both shuffle and set up the inefficient blend. if (!IsBlendSupported && Subtarget.hasSSSE3()) { bool V1InUse, V2InUse; return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse); } // We can always bit-blend if we have to so the fallback strategy is to // decompose into single-input permutes and blends. return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG); } /// Check whether a compaction lowering can be done by dropping even /// elements and compute how many times even elements must be dropped. /// /// This handles shuffles which take every Nth element where N is a power of /// two. Example shuffle masks: /// /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24 /// /// Any of these lanes can of course be undef. /// /// This routine only supports N <= 3. /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here /// for larger N. /// /// \returns N above, or the number of times even elements must be dropped if /// there is such a number. Otherwise returns zero. static int canLowerByDroppingEvenElements(ArrayRef Mask, bool IsSingleInput) { // The modulus for the shuffle vector entries is based on whether this is // a single input or not. int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2); assert(isPowerOf2_32((uint32_t)ShuffleModulus) && "We should only be called with masks with a power-of-2 size!"); uint64_t ModMask = (uint64_t)ShuffleModulus - 1; // We track whether the input is viable for all power-of-2 strides 2^1, 2^2, // and 2^3 simultaneously. This is because we may have ambiguity with // partially undef inputs. bool ViableForN[3] = {true, true, true}; for (int i = 0, e = Mask.size(); i < e; ++i) { // Ignore undef lanes, we'll optimistically collapse them to the pattern we // want. if (Mask[i] < 0) continue; bool IsAnyViable = false; for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) if (ViableForN[j]) { uint64_t N = j + 1; // The shuffle mask must be equal to (i * 2^N) % M. if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask)) IsAnyViable = true; else ViableForN[j] = false; } // Early exit if we exhaust the possible powers of two. if (!IsAnyViable) break; } for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) if (ViableForN[j]) return j + 1; // Return 0 as there is no viable power of two. return 0; } static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements()); SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true); if (V2.isUndef()) return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1); return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2); } /// Generic lowering of v16i8 shuffles. /// /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to /// detect any complexity reducing interleaving. If that doesn't help, it uses /// UNPCK to spread the i8 elements across two i16-element vectors, and uses /// the existing lowering for v8i16 blends on each half, finally PACK-ing them /// back together. static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Use dedicated pack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG, Subtarget)) return V; // Try to use a zext lowering. if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // See if we can use SSE4A Extraction / Insertion. if (Subtarget.hasSSE4A()) if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG)) return V; int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; }); // For single-input shuffles, there are some nicer lowering tricks we can use. if (NumV2Elements == 0) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return Broadcast; if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) return V; // Check whether we can widen this to an i16 shuffle by duplicating bytes. // Notably, this handles splat and partial-splat shuffles more efficiently. // However, it only makes sense if the pre-duplication shuffle simplifies // things significantly. Currently, this means we need to be able to // express the pre-duplication shuffle as an i16 shuffle. // // FIXME: We should check for other patterns which can be widened into an // i16 shuffle as well. auto canWidenViaDuplication = [](ArrayRef Mask) { for (int i = 0; i < 16; i += 2) if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1]) return false; return true; }; auto tryToWidenViaDuplication = [&]() -> SDValue { if (!canWidenViaDuplication(Mask)) return SDValue(); SmallVector LoInputs; copy_if(Mask, std::back_inserter(LoInputs), [](int M) { return M >= 0 && M < 8; }); array_pod_sort(LoInputs.begin(), LoInputs.end()); LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end()); SmallVector HiInputs; copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; }); array_pod_sort(HiInputs.begin(), HiInputs.end()); HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); bool TargetLo = LoInputs.size() >= HiInputs.size(); ArrayRef InPlaceInputs = TargetLo ? LoInputs : HiInputs; ArrayRef MovingInputs = TargetLo ? HiInputs : LoInputs; int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1}; SmallDenseMap LaneMap; for (int I : InPlaceInputs) { PreDupI16Shuffle[I/2] = I/2; LaneMap[I] = I; } int j = TargetLo ? 0 : 4, je = j + 4; for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) { // Check if j is already a shuffle of this input. This happens when // there are two adjacent bytes after we move the low one. if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) { // If we haven't yet mapped the input, search for a slot into which // we can map it. while (j < je && PreDupI16Shuffle[j] >= 0) ++j; if (j == je) // We can't place the inputs into a single half with a simple i16 shuffle, so bail. return SDValue(); // Map this input with the i16 shuffle. PreDupI16Shuffle[j] = MovingInputs[i] / 2; } // Update the lane map based on the mapping we ended up with. LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2; } V1 = DAG.getBitcast( MVT::v16i8, DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1), DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle)); // Unpack the bytes to form the i16s that will be shuffled into place. bool EvenInUse = false, OddInUse = false; for (int i = 0; i < 16; i += 2) { EvenInUse |= (Mask[i + 0] >= 0); OddInUse |= (Mask[i + 1] >= 0); if (EvenInUse && OddInUse) break; } V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8), OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8)); int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; for (int i = 0; i < 16; ++i) if (Mask[i] >= 0) { int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8); assert(MappedMask < 8 && "Invalid v8 shuffle mask!"); if (PostDupI16Shuffle[i / 2] < 0) PostDupI16Shuffle[i / 2] = MappedMask; else assert(PostDupI16Shuffle[i / 2] == MappedMask && "Conflicting entries in the original shuffle!"); } return DAG.getBitcast( MVT::v16i8, DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1), DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle)); }; if (SDValue V = tryToWidenViaDuplication()) return V; } if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) return V; // Try to use byte shift instructions to mask. if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly // with PSHUFB. It is important to do this before we attempt to generate any // blends but after all of the single-input lowerings. If the single input // lowerings can find an instruction sequence that is faster than a PSHUFB, we // want to preserve that and we can DAG combine any longer sequences into // a PSHUFB in the end. But once we start blending from multiple inputs, // the complexity of DAG combining bad patterns back into PSHUFB is too high, // and there are *very* few patterns that would actually be faster than the // PSHUFB approach because of its ability to zero lanes. // // FIXME: The only exceptions to the above are blends which are exact // interleavings with direct instructions supporting them. We currently don't // handle those well here. if (Subtarget.hasSSSE3()) { bool V1InUse = false; bool V2InUse = false; SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs( DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse); // If both V1 and V2 are in use and we can use a direct blend or an unpack, // do so. This avoids using them to handle blends-with-zero which is // important as a single pshufb is significantly faster for that. if (V1InUse && V2InUse) { if (Subtarget.hasSSE41()) if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // We can use an unpack to do the blending rather than an or in some // cases. Even though the or may be (very minorly) more efficient, we // preference this lowering because there are common cases where part of // the complexity of the shuffles goes away when we do the final blend as // an unpack. // FIXME: It might be worth trying to detect if the unpack-feeding // shuffles will both be pshufb, in which case we shouldn't bother with // this. if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack( DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return Unpack; // If we have VBMI we can use one VPERM instead of multiple PSHUFBs. if (Subtarget.hasVBMI() && Subtarget.hasVLX()) return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG); // Use PALIGNR+Permute if possible - permute might become PSHUFB but the // PALIGNR will be cheaper than the second PSHUFB+OR. if (SDValue V = lowerShuffleAsByteRotateAndPermute( DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return V; } return PSHUFB; } // There are special ways we can lower some single-element blends. if (NumV2Elements == 1) if (SDValue V = lowerShuffleAsElementInsertion( DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG)) return Blend; // Check whether a compaction lowering can be done. This handles shuffles // which take every Nth element for some even N. See the helper function for // details. // // We special case these as they can be particularly efficiently handled with // the PACKUSB instruction on x86 and they show up in common patterns of // rearranging bytes to truncate wide elements. bool IsSingleInput = V2.isUndef(); if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) { // NumEvenDrops is the power of two stride of the elements. Another way of // thinking about it is that we need to drop the even elements this many // times to get the original input. // First we need to zero all the dropped bytes. assert(NumEvenDrops <= 3 && "No support for dropping even elements more than 3 times."); SmallVector ByteClearOps(16, DAG.getConstant(0, DL, MVT::i8)); for (unsigned i = 0; i != 16; i += 1 << NumEvenDrops) ByteClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i8); SDValue ByteClearMask = DAG.getBuildVector(MVT::v16i8, DL, ByteClearOps); V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask); if (!IsSingleInput) V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask); // Now pack things back together. V1 = DAG.getBitcast(MVT::v8i16, V1); V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2); SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2); for (int i = 1; i < NumEvenDrops; ++i) { Result = DAG.getBitcast(MVT::v8i16, Result); Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result); } return Result; } // Handle multi-input cases by blending single-input shuffles. if (NumV2Elements > 0) return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG); // The fallback path for single-input shuffles widens this into two v8i16 // vectors with unpacks, shuffles those, and then pulls them back together // with a pack. SDValue V = V1; std::array LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}}; std::array HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}}; for (int i = 0; i < 16; ++i) if (Mask[i] >= 0) (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i]; SDValue VLoHalf, VHiHalf; // Check if any of the odd lanes in the v16i8 are used. If not, we can mask // them out and avoid using UNPCK{L,H} to extract the elements of V as // i16s. if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) && none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) { // Use a mask to drop the high bytes. VLoHalf = DAG.getBitcast(MVT::v8i16, V); VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf, DAG.getConstant(0x00FF, DL, MVT::v8i16)); // This will be a single vector shuffle instead of a blend so nuke VHiHalf. VHiHalf = DAG.getUNDEF(MVT::v8i16); // Squash the masks to point directly into VLoHalf. for (int &M : LoBlendMask) if (M >= 0) M /= 2; for (int &M : HiBlendMask) if (M >= 0) M /= 2; } else { // Otherwise just unpack the low half of V into VLoHalf and the high half into // VHiHalf so that we can blend them as i16s. SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL); VLoHalf = DAG.getBitcast( MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero)); VHiHalf = DAG.getBitcast( MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero)); } SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask); SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask); return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV); } /// Dispatching routine to lower various 128-bit x86 vector shuffles. /// /// This routine breaks down the specific type of 128-bit shuffle and /// dispatches to the lowering routines accordingly. static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { switch (VT.SimpleTy) { case MVT::v2i64: return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v2f64: return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v4i32: return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v4f32: return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8i16: return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16i8: return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); default: llvm_unreachable("Unimplemented!"); } } /// Generic routine to split vector shuffle into half-sized shuffles. /// /// This routine just extracts two subvectors, shuffles them independently, and /// then concatenates them back together. This should work effectively with all /// AVX vector shuffle types. static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { assert(VT.getSizeInBits() >= 256 && "Only for 256-bit or wider vector shuffles!"); assert(V1.getSimpleValueType() == VT && "Bad operand type!"); assert(V2.getSimpleValueType() == VT && "Bad operand type!"); ArrayRef LoMask = Mask.slice(0, Mask.size() / 2); ArrayRef HiMask = Mask.slice(Mask.size() / 2); int NumElements = VT.getVectorNumElements(); int SplitNumElements = NumElements / 2; MVT ScalarVT = VT.getVectorElementType(); MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2); // Rather than splitting build-vectors, just build two narrower build // vectors. This helps shuffling with splats and zeros. auto SplitVector = [&](SDValue V) { V = peekThroughBitcasts(V); MVT OrigVT = V.getSimpleValueType(); int OrigNumElements = OrigVT.getVectorNumElements(); int OrigSplitNumElements = OrigNumElements / 2; MVT OrigScalarVT = OrigVT.getVectorElementType(); MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2); SDValue LoV, HiV; auto *BV = dyn_cast(V); if (!BV) { LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V, DAG.getIntPtrConstant(0, DL)); HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V, DAG.getIntPtrConstant(OrigSplitNumElements, DL)); } else { SmallVector LoOps, HiOps; for (int i = 0; i < OrigSplitNumElements; ++i) { LoOps.push_back(BV->getOperand(i)); HiOps.push_back(BV->getOperand(i + OrigSplitNumElements)); } LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps); HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps); } return std::make_pair(DAG.getBitcast(SplitVT, LoV), DAG.getBitcast(SplitVT, HiV)); }; SDValue LoV1, HiV1, LoV2, HiV2; std::tie(LoV1, HiV1) = SplitVector(V1); std::tie(LoV2, HiV2) = SplitVector(V2); // Now create two 4-way blends of these half-width vectors. auto HalfBlend = [&](ArrayRef HalfMask) { bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false; SmallVector V1BlendMask((unsigned)SplitNumElements, -1); SmallVector V2BlendMask((unsigned)SplitNumElements, -1); SmallVector BlendMask((unsigned)SplitNumElements, -1); for (int i = 0; i < SplitNumElements; ++i) { int M = HalfMask[i]; if (M >= NumElements) { if (M >= NumElements + SplitNumElements) UseHiV2 = true; else UseLoV2 = true; V2BlendMask[i] = M - NumElements; BlendMask[i] = SplitNumElements + i; } else if (M >= 0) { if (M >= SplitNumElements) UseHiV1 = true; else UseLoV1 = true; V1BlendMask[i] = M; BlendMask[i] = i; } } // Because the lowering happens after all combining takes place, we need to // manually combine these blend masks as much as possible so that we create // a minimal number of high-level vector shuffle nodes. // First try just blending the halves of V1 or V2. if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2) return DAG.getUNDEF(SplitVT); if (!UseLoV2 && !UseHiV2) return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); if (!UseLoV1 && !UseHiV1) return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); SDValue V1Blend, V2Blend; if (UseLoV1 && UseHiV1) { V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); } else { // We only use half of V1 so map the usage down into the final blend mask. V1Blend = UseLoV1 ? LoV1 : HiV1; for (int i = 0; i < SplitNumElements; ++i) if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements) BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements); } if (UseLoV2 && UseHiV2) { V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); } else { // We only use half of V2 so map the usage down into the final blend mask. V2Blend = UseLoV2 ? LoV2 : HiV2; for (int i = 0; i < SplitNumElements; ++i) if (BlendMask[i] >= SplitNumElements) BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0); } return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask); }; SDValue Lo = HalfBlend(LoMask); SDValue Hi = HalfBlend(HiMask); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); } /// Either split a vector in halves or decompose the shuffles and the /// blend. /// /// This is provided as a good fallback for many lowerings of non-single-input /// shuffles with more than one 128-bit lane. In those cases, we want to select /// between splitting the shuffle into 128-bit components and stitching those /// back together vs. extracting the single-input shuffles and blending those /// results. static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(!V2.isUndef() && "This routine must not be used to lower single-input " "shuffles as it could then recurse on itself."); int Size = Mask.size(); // If this can be modeled as a broadcast of two elements followed by a blend, // prefer that lowering. This is especially important because broadcasts can // often fold with memory operands. auto DoBothBroadcast = [&] { int V1BroadcastIdx = -1, V2BroadcastIdx = -1; for (int M : Mask) if (M >= Size) { if (V2BroadcastIdx < 0) V2BroadcastIdx = M - Size; else if (M - Size != V2BroadcastIdx) return false; } else if (M >= 0) { if (V1BroadcastIdx < 0) V1BroadcastIdx = M; else if (M != V1BroadcastIdx) return false; } return true; }; if (DoBothBroadcast()) return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, Subtarget, DAG); // If the inputs all stem from a single 128-bit lane of each input, then we // split them rather than blending because the split will decompose to // unusually few instructions. int LaneCount = VT.getSizeInBits() / 128; int LaneSize = Size / LaneCount; SmallBitVector LaneInputs[2]; LaneInputs[0].resize(LaneCount, false); LaneInputs[1].resize(LaneCount, false); for (int i = 0; i < Size; ++i) if (Mask[i] >= 0) LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true; if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1) return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); // Otherwise, just fall back to decomposed shuffles and a blend. This requires // that the decomposed single-input shuffles don't end up here. return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, Subtarget, DAG); } // Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)). // TODO: Extend to support v8f32 (+ 512-bit shuffles). static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { assert(VT == MVT::v4f64 && "Only for v4f64 shuffles"); int LHSMask[4] = {-1, -1, -1, -1}; int RHSMask[4] = {-1, -1, -1, -1}; unsigned SHUFPMask = 0; // As SHUFPD uses a single LHS/RHS element per lane, we can always // perform the shuffle once the lanes have been shuffled in place. for (int i = 0; i != 4; ++i) { int M = Mask[i]; if (M < 0) continue; int LaneBase = i & ~1; auto &LaneMask = (i & 1) ? RHSMask : LHSMask; LaneMask[LaneBase + (M & 1)] = M; SHUFPMask |= (M & 1) << i; } SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask); SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask); return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS, DAG.getTargetConstant(SHUFPMask, DL, MVT::i8)); } /// Lower a vector shuffle crossing multiple 128-bit lanes as /// a lane permutation followed by a per-lane permutation. /// /// This is mainly for cases where we can have non-repeating permutes /// in each lane. /// /// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask, /// we should investigate merging them. static SDValue lowerShuffleAsLanePermuteAndPermute( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { int NumElts = VT.getVectorNumElements(); int NumLanes = VT.getSizeInBits() / 128; int NumEltsPerLane = NumElts / NumLanes; SmallVector SrcLaneMask(NumLanes, SM_SentinelUndef); SmallVector PermMask(NumElts, SM_SentinelUndef); for (int i = 0; i != NumElts; ++i) { int M = Mask[i]; if (M < 0) continue; // Ensure that each lane comes from a single source lane. int SrcLane = M / NumEltsPerLane; int DstLane = i / NumEltsPerLane; if (!isUndefOrEqual(SrcLaneMask[DstLane], SrcLane)) return SDValue(); SrcLaneMask[DstLane] = SrcLane; PermMask[i] = (DstLane * NumEltsPerLane) + (M % NumEltsPerLane); } // Make sure we set all elements of the lane mask, to avoid undef propagation. SmallVector LaneMask(NumElts, SM_SentinelUndef); for (int DstLane = 0; DstLane != NumLanes; ++DstLane) { int SrcLane = SrcLaneMask[DstLane]; if (0 <= SrcLane) for (int j = 0; j != NumEltsPerLane; ++j) { LaneMask[(DstLane * NumEltsPerLane) + j] = (SrcLane * NumEltsPerLane) + j; } } // If we're only shuffling a single lowest lane and the rest are identity // then don't bother. // TODO - isShuffleMaskInputInPlace could be extended to something like this. int NumIdentityLanes = 0; bool OnlyShuffleLowestLane = true; for (int i = 0; i != NumLanes; ++i) { if (isSequentialOrUndefInRange(PermMask, i * NumEltsPerLane, NumEltsPerLane, i * NumEltsPerLane)) NumIdentityLanes++; else if (SrcLaneMask[i] != 0 && SrcLaneMask[i] != NumLanes) OnlyShuffleLowestLane = false; } if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1)) return SDValue(); SDValue LanePermute = DAG.getVectorShuffle(VT, DL, V1, V2, LaneMask); return DAG.getVectorShuffle(VT, DL, LanePermute, DAG.getUNDEF(VT), PermMask); } /// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one /// source with a lane permutation. /// /// This lowering strategy results in four instructions in the worst case for a /// single-input cross lane shuffle which is lower than any other fully general /// cross-lane shuffle strategy I'm aware of. Special cases for each particular /// shuffle pattern should be handled prior to trying this lowering. static SDValue lowerShuffleAsLanePermuteAndShuffle( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // FIXME: This should probably be generalized for 512-bit vectors as well. assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!"); int Size = Mask.size(); int LaneSize = Size / 2; // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)). // Only do this if the elements aren't all from the lower lane, // otherwise we're (probably) better off doing a split. if (VT == MVT::v4f64 && !all_of(Mask, [LaneSize](int M) { return M < LaneSize; })) if (SDValue V = lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG)) return V; // If there are only inputs from one 128-bit lane, splitting will in fact be // less expensive. The flags track whether the given lane contains an element // that crosses to another lane. if (!Subtarget.hasAVX2()) { bool LaneCrossing[2] = {false, false}; for (int i = 0; i < Size; ++i) if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize)) LaneCrossing[(Mask[i] % Size) / LaneSize] = true; if (!LaneCrossing[0] || !LaneCrossing[1]) return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); } else { bool LaneUsed[2] = {false, false}; for (int i = 0; i < Size; ++i) if (Mask[i] >= 0) LaneUsed[(Mask[i] % Size) / LaneSize] = true; if (!LaneUsed[0] || !LaneUsed[1]) return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); } // TODO - we could support shuffling V2 in the Flipped input. assert(V2.isUndef() && "This last part of this routine only works on single input shuffles"); SmallVector InLaneMask(Mask.begin(), Mask.end()); for (int i = 0; i < Size; ++i) { int &M = InLaneMask[i]; if (M < 0) continue; if (((M % Size) / LaneSize) != (i / LaneSize)) M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size; } assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) && "In-lane shuffle mask expected"); // Flip the lanes, and shuffle the results which should now be in-lane. MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64; SDValue Flipped = DAG.getBitcast(PVT, V1); Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1}); Flipped = DAG.getBitcast(VT, Flipped); return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask); } /// Handle lowering 2-lane 128-bit shuffles. static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding. if (Subtarget.hasAVX2() && V2.isUndef()) return SDValue(); bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode()); SmallVector WidenedMask; if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask)) return SDValue(); bool IsLowZero = (Zeroable & 0x3) == 0x3; bool IsHighZero = (Zeroable & 0xc) == 0xc; // Try to use an insert into a zero vector. if (WidenedMask[0] == 0 && IsHighZero) { MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2); SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, getZeroVector(VT, Subtarget, DAG, DL), LoV, DAG.getIntPtrConstant(0, DL)); } // TODO: If minimizing size and one of the inputs is a zero vector and the // the zero vector has only one use, we could use a VPERM2X128 to save the // instruction bytes needed to explicitly generate the zero vector. // Blends are faster and handle all the non-lane-crossing cases. if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // If either input operand is a zero vector, use VPERM2X128 because its mask // allows us to replace the zero input with an implicit zero. if (!IsLowZero && !IsHighZero) { // Check for patterns which can be matched with a single insert of a 128-bit // subvector. bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}); if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) { // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise, // this will likely become vinsertf128 which can't fold a 256-bit memop. if (!isa(peekThroughBitcasts(V1))) { MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2); SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2, DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec, DAG.getIntPtrConstant(2, DL)); } } // Try to use SHUF128 if possible. if (Subtarget.hasVLX()) { if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) { unsigned PermMask = ((WidenedMask[0] % 2) << 0) | ((WidenedMask[1] % 2) << 1); return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2, DAG.getTargetConstant(PermMask, DL, MVT::i8)); } } } // Otherwise form a 128-bit permutation. After accounting for undefs, // convert the 64-bit shuffle mask selection values into 128-bit // selection bits by dividing the indexes by 2 and shifting into positions // defined by a vperm2*128 instruction's immediate control byte. // The immediate permute control byte looks like this: // [1:0] - select 128 bits from sources for low half of destination // [2] - ignore // [3] - zero low half of destination // [5:4] - select 128 bits from sources for high half of destination // [6] - ignore // [7] - zero high half of destination assert((WidenedMask[0] >= 0 || IsLowZero) && (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?"); unsigned PermMask = 0; PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0); PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4); // Check the immediate mask and replace unused sources with undef. if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00) V1 = DAG.getUNDEF(VT); if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20) V2 = DAG.getUNDEF(VT); return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2, DAG.getTargetConstant(PermMask, DL, MVT::i8)); } /// Lower a vector shuffle by first fixing the 128-bit lanes and then /// shuffling each lane. /// /// This attempts to create a repeated lane shuffle where each lane uses one /// or two of the lanes of the inputs. The lanes of the input vectors are /// shuffled in one or two independent shuffles to get the lanes into the /// position needed by the final shuffle. static SDValue lowerShuffleAsLanePermuteAndRepeatedMask( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(!V2.isUndef() && "This is only useful with multiple inputs."); if (is128BitLaneRepeatedShuffleMask(VT, Mask)) return SDValue(); int NumElts = Mask.size(); int NumLanes = VT.getSizeInBits() / 128; int NumLaneElts = 128 / VT.getScalarSizeInBits(); SmallVector RepeatMask(NumLaneElts, -1); SmallVector, 2> LaneSrcs(NumLanes, {{-1, -1}}); // First pass will try to fill in the RepeatMask from lanes that need two // sources. for (int Lane = 0; Lane != NumLanes; ++Lane) { int Srcs[2] = {-1, -1}; SmallVector InLaneMask(NumLaneElts, -1); for (int i = 0; i != NumLaneElts; ++i) { int M = Mask[(Lane * NumLaneElts) + i]; if (M < 0) continue; // Determine which of the possible input lanes (NumLanes from each source) // this element comes from. Assign that as one of the sources for this // lane. We can assign up to 2 sources for this lane. If we run out // sources we can't do anything. int LaneSrc = M / NumLaneElts; int Src; if (Srcs[0] < 0 || Srcs[0] == LaneSrc) Src = 0; else if (Srcs[1] < 0 || Srcs[1] == LaneSrc) Src = 1; else return SDValue(); Srcs[Src] = LaneSrc; InLaneMask[i] = (M % NumLaneElts) + Src * NumElts; } // If this lane has two sources, see if it fits with the repeat mask so far. if (Srcs[1] < 0) continue; LaneSrcs[Lane][0] = Srcs[0]; LaneSrcs[Lane][1] = Srcs[1]; auto MatchMasks = [](ArrayRef M1, ArrayRef M2) { assert(M1.size() == M2.size() && "Unexpected mask size"); for (int i = 0, e = M1.size(); i != e; ++i) if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i]) return false; return true; }; auto MergeMasks = [](ArrayRef Mask, MutableArrayRef MergedMask) { assert(Mask.size() == MergedMask.size() && "Unexpected mask size"); for (int i = 0, e = MergedMask.size(); i != e; ++i) { int M = Mask[i]; if (M < 0) continue; assert((MergedMask[i] < 0 || MergedMask[i] == M) && "Unexpected mask element"); MergedMask[i] = M; } }; if (MatchMasks(InLaneMask, RepeatMask)) { // Merge this lane mask into the final repeat mask. MergeMasks(InLaneMask, RepeatMask); continue; } // Didn't find a match. Swap the operands and try again. std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]); ShuffleVectorSDNode::commuteMask(InLaneMask); if (MatchMasks(InLaneMask, RepeatMask)) { // Merge this lane mask into the final repeat mask. MergeMasks(InLaneMask, RepeatMask); continue; } // Couldn't find a match with the operands in either order. return SDValue(); } // Now handle any lanes with only one source. for (int Lane = 0; Lane != NumLanes; ++Lane) { // If this lane has already been processed, skip it. if (LaneSrcs[Lane][0] >= 0) continue; for (int i = 0; i != NumLaneElts; ++i) { int M = Mask[(Lane * NumLaneElts) + i]; if (M < 0) continue; // If RepeatMask isn't defined yet we can define it ourself. if (RepeatMask[i] < 0) RepeatMask[i] = M % NumLaneElts; if (RepeatMask[i] < NumElts) { if (RepeatMask[i] != M % NumLaneElts) return SDValue(); LaneSrcs[Lane][0] = M / NumLaneElts; } else { if (RepeatMask[i] != ((M % NumLaneElts) + NumElts)) return SDValue(); LaneSrcs[Lane][1] = M / NumLaneElts; } } if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0) return SDValue(); } SmallVector NewMask(NumElts, -1); for (int Lane = 0; Lane != NumLanes; ++Lane) { int Src = LaneSrcs[Lane][0]; for (int i = 0; i != NumLaneElts; ++i) { int M = -1; if (Src >= 0) M = Src * NumLaneElts + i; NewMask[Lane * NumLaneElts + i] = M; } } SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); // Ensure we didn't get back the shuffle we started with. // FIXME: This is a hack to make up for some splat handling code in // getVectorShuffle. if (isa(NewV1) && cast(NewV1)->getMask() == Mask) return SDValue(); for (int Lane = 0; Lane != NumLanes; ++Lane) { int Src = LaneSrcs[Lane][1]; for (int i = 0; i != NumLaneElts; ++i) { int M = -1; if (Src >= 0) M = Src * NumLaneElts + i; NewMask[Lane * NumLaneElts + i] = M; } } SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); // Ensure we didn't get back the shuffle we started with. // FIXME: This is a hack to make up for some splat handling code in // getVectorShuffle. if (isa(NewV2) && cast(NewV2)->getMask() == Mask) return SDValue(); for (int i = 0; i != NumElts; ++i) { NewMask[i] = RepeatMask[i % NumLaneElts]; if (NewMask[i] < 0) continue; NewMask[i] += (i / NumLaneElts) * NumLaneElts; } return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask); } /// If the input shuffle mask results in a vector that is undefined in all upper /// or lower half elements and that mask accesses only 2 halves of the /// shuffle's operands, return true. A mask of half the width with mask indexes /// adjusted to access the extracted halves of the original shuffle operands is /// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or /// lower half of each input operand is accessed. static bool getHalfShuffleMask(ArrayRef Mask, MutableArrayRef HalfMask, int &HalfIdx1, int &HalfIdx2) { assert((Mask.size() == HalfMask.size() * 2) && "Expected input mask to be twice as long as output"); // Exactly one half of the result must be undef to allow narrowing. bool UndefLower = isUndefLowerHalf(Mask); bool UndefUpper = isUndefUpperHalf(Mask); if (UndefLower == UndefUpper) return false; unsigned HalfNumElts = HalfMask.size(); unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0; HalfIdx1 = -1; HalfIdx2 = -1; for (unsigned i = 0; i != HalfNumElts; ++i) { int M = Mask[i + MaskIndexOffset]; if (M < 0) { HalfMask[i] = M; continue; } // Determine which of the 4 half vectors this element is from. // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2. int HalfIdx = M / HalfNumElts; // Determine the element index into its half vector source. int HalfElt = M % HalfNumElts; // We can shuffle with up to 2 half vectors, set the new 'half' // shuffle mask accordingly. if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) { HalfMask[i] = HalfElt; HalfIdx1 = HalfIdx; continue; } if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) { HalfMask[i] = HalfElt + HalfNumElts; HalfIdx2 = HalfIdx; continue; } // Too many half vectors referenced. return false; } return true; } /// Given the output values from getHalfShuffleMask(), create a half width /// shuffle of extracted vectors followed by an insert back to full width. static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef HalfMask, int HalfIdx1, int HalfIdx2, bool UndefLower, SelectionDAG &DAG, bool UseConcat = false) { assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?"); assert(V1.getValueType().isSimple() && "Expecting only simple types"); MVT VT = V1.getSimpleValueType(); MVT HalfVT = VT.getHalfNumVectorElementsVT(); unsigned HalfNumElts = HalfVT.getVectorNumElements(); auto getHalfVector = [&](int HalfIdx) { if (HalfIdx < 0) return DAG.getUNDEF(HalfVT); SDValue V = (HalfIdx < 2 ? V1 : V2); HalfIdx = (HalfIdx % 2) * HalfNumElts; return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V, DAG.getIntPtrConstant(HalfIdx, DL)); }; // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset SDValue Half1 = getHalfVector(HalfIdx1); SDValue Half2 = getHalfVector(HalfIdx2); SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask); if (UseConcat) { SDValue Op0 = V; SDValue Op1 = DAG.getUNDEF(HalfVT); if (UndefLower) std::swap(Op0, Op1); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1); } unsigned Offset = UndefLower ? HalfNumElts : 0; return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, DAG.getIntPtrConstant(Offset, DL)); } /// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF. /// This allows for fast cases such as subvector extraction/insertion /// or shuffling smaller vector types which can lower more efficiently. static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert((VT.is256BitVector() || VT.is512BitVector()) && "Expected 256-bit or 512-bit vector"); bool UndefLower = isUndefLowerHalf(Mask); if (!UndefLower && !isUndefUpperHalf(Mask)) return SDValue(); assert((!UndefLower || !isUndefUpperHalf(Mask)) && "Completely undef shuffle mask should have been simplified already"); // Upper half is undef and lower half is whole upper subvector. // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> MVT HalfVT = VT.getHalfNumVectorElementsVT(); unsigned HalfNumElts = HalfVT.getVectorNumElements(); if (!UndefLower && isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) { SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, DAG.getIntPtrConstant(HalfNumElts, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, DAG.getIntPtrConstant(0, DL)); } // Lower half is undef and upper half is whole lower subvector. // e.g. vector_shuffle or if (UndefLower && isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) { SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, DAG.getIntPtrConstant(HalfNumElts, DL)); } int HalfIdx1, HalfIdx2; SmallVector HalfMask(HalfNumElts); if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2)) return SDValue(); assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length"); // Only shuffle the halves of the inputs when useful. unsigned NumLowerHalves = (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2); unsigned NumUpperHalves = (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3); assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed"); // Determine the larger pattern of undef/halves, then decide if it's worth // splitting the shuffle based on subtarget capabilities and types. unsigned EltWidth = VT.getVectorElementType().getSizeInBits(); if (!UndefLower) { // XXXXuuuu: no insert is needed. // Always extract lowers when setting lower - these are all free subreg ops. if (NumUpperHalves == 0) return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2, UndefLower, DAG); if (NumUpperHalves == 1) { // AVX2 has efficient 32/64-bit element cross-lane shuffles. if (Subtarget.hasAVX2()) { // extract128 + vunpckhps/vshufps, is better than vblend + vpermps. if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() && !is128BitUnpackShuffleMask(HalfMask) && (!isSingleSHUFPSMask(HalfMask) || Subtarget.hasFastVariableShuffle())) return SDValue(); // If this is a unary shuffle (assume that the 2nd operand is // canonicalized to undef), then we can use vpermpd. Otherwise, we // are better off extracting the upper half of 1 operand and using a // narrow shuffle. if (EltWidth == 64 && V2.isUndef()) return SDValue(); } // AVX512 has efficient cross-lane shuffles for all legal 512-bit types. if (Subtarget.hasAVX512() && VT.is512BitVector()) return SDValue(); // Extract + narrow shuffle is better than the wide alternative. return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2, UndefLower, DAG); } // Don't extract both uppers, instead shuffle and then extract. assert(NumUpperHalves == 2 && "Half vector count went wrong"); return SDValue(); } // UndefLower - uuuuXXXX: an insert to high half is required if we split this. if (NumUpperHalves == 0) { // AVX2 has efficient 64-bit element cross-lane shuffles. // TODO: Refine to account for unary shuffle, splat, and other masks? if (Subtarget.hasAVX2() && EltWidth == 64) return SDValue(); // AVX512 has efficient cross-lane shuffles for all legal 512-bit types. if (Subtarget.hasAVX512() && VT.is512BitVector()) return SDValue(); // Narrow shuffle + insert is better than the wide alternative. return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2, UndefLower, DAG); } // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert. return SDValue(); } /// Test whether the specified input (0 or 1) is in-place blended by the /// given mask. /// /// This returns true if the elements from a particular input are already in the /// slot required by the given mask and require no permutation. static bool isShuffleMaskInputInPlace(int Input, ArrayRef Mask) { assert((Input == 0 || Input == 1) && "Only two inputs to shuffles."); int Size = Mask.size(); for (int i = 0; i < Size; ++i) if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i) return false; return true; } /// Handle case where shuffle sources are coming from the same 128-bit lane and /// every lane can be represented as the same repeating mask - allowing us to /// shuffle the sources with the repeating shuffle and then permute the result /// to the destination lanes. static SDValue lowerShuffleAsRepeatedMaskAndLanePermute( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int NumElts = VT.getVectorNumElements(); int NumLanes = VT.getSizeInBits() / 128; int NumLaneElts = NumElts / NumLanes; // On AVX2 we may be able to just shuffle the lowest elements and then // broadcast the result. if (Subtarget.hasAVX2()) { for (unsigned BroadcastSize : {16, 32, 64}) { if (BroadcastSize <= VT.getScalarSizeInBits()) continue; int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits(); // Attempt to match a repeating pattern every NumBroadcastElts, // accounting for UNDEFs but only references the lowest 128-bit // lane of the inputs. auto FindRepeatingBroadcastMask = [&](SmallVectorImpl &RepeatMask) { for (int i = 0; i != NumElts; i += NumBroadcastElts) for (int j = 0; j != NumBroadcastElts; ++j) { int M = Mask[i + j]; if (M < 0) continue; int &R = RepeatMask[j]; if (0 != ((M % NumElts) / NumLaneElts)) return false; if (0 <= R && R != M) return false; R = M; } return true; }; SmallVector RepeatMask((unsigned)NumElts, -1); if (!FindRepeatingBroadcastMask(RepeatMask)) continue; // Shuffle the (lowest) repeated elements in place for broadcast. SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask); // Shuffle the actual broadcast. SmallVector BroadcastMask((unsigned)NumElts, -1); for (int i = 0; i != NumElts; i += NumBroadcastElts) for (int j = 0; j != NumBroadcastElts; ++j) BroadcastMask[i + j] = j; return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT), BroadcastMask); } } // Bail if the shuffle mask doesn't cross 128-bit lanes. if (!is128BitLaneCrossingShuffleMask(VT, Mask)) return SDValue(); // Bail if we already have a repeated lane shuffle mask. SmallVector RepeatedShuffleMask; if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask)) return SDValue(); // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes. int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1; int NumSubLanes = NumLanes * SubLaneScale; int NumSubLaneElts = NumLaneElts / SubLaneScale; // Check that all the sources are coming from the same lane and see if we can // form a repeating shuffle mask (local to each sub-lane). At the same time, // determine the source sub-lane for each destination sub-lane. int TopSrcSubLane = -1; SmallVector Dst2SrcSubLanes((unsigned)NumSubLanes, -1); SmallVector RepeatedSubLaneMasks[2] = { SmallVector((unsigned)NumSubLaneElts, SM_SentinelUndef), SmallVector((unsigned)NumSubLaneElts, SM_SentinelUndef)}; for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) { // Extract the sub-lane mask, check that it all comes from the same lane // and normalize the mask entries to come from the first lane. int SrcLane = -1; SmallVector SubLaneMask((unsigned)NumSubLaneElts, -1); for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) { int M = Mask[(DstSubLane * NumSubLaneElts) + Elt]; if (M < 0) continue; int Lane = (M % NumElts) / NumLaneElts; if ((0 <= SrcLane) && (SrcLane != Lane)) return SDValue(); SrcLane = Lane; int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts); SubLaneMask[Elt] = LocalM; } // Whole sub-lane is UNDEF. if (SrcLane < 0) continue; // Attempt to match against the candidate repeated sub-lane masks. for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) { auto MatchMasks = [NumSubLaneElts](ArrayRef M1, ArrayRef M2) { for (int i = 0; i != NumSubLaneElts; ++i) { if (M1[i] < 0 || M2[i] < 0) continue; if (M1[i] != M2[i]) return false; } return true; }; auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane]; if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask)) continue; // Merge the sub-lane mask into the matching repeated sub-lane mask. for (int i = 0; i != NumSubLaneElts; ++i) { int M = SubLaneMask[i]; if (M < 0) continue; assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) && "Unexpected mask element"); RepeatedSubLaneMask[i] = M; } // Track the top most source sub-lane - by setting the remaining to UNDEF // we can greatly simplify shuffle matching. int SrcSubLane = (SrcLane * SubLaneScale) + SubLane; TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane); Dst2SrcSubLanes[DstSubLane] = SrcSubLane; break; } // Bail if we failed to find a matching repeated sub-lane mask. if (Dst2SrcSubLanes[DstSubLane] < 0) return SDValue(); } assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes && "Unexpected source lane"); // Create a repeating shuffle mask for the entire vector. SmallVector RepeatedMask((unsigned)NumElts, -1); for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) { int Lane = SubLane / SubLaneScale; auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale]; for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) { int M = RepeatedSubLaneMask[Elt]; if (M < 0) continue; int Idx = (SubLane * NumSubLaneElts) + Elt; RepeatedMask[Idx] = M + (Lane * NumLaneElts); } } SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask); // Shuffle each source sub-lane to its destination. SmallVector SubLaneMask((unsigned)NumElts, -1); for (int i = 0; i != NumElts; i += NumSubLaneElts) { int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts]; if (SrcSubLane < 0) continue; for (int j = 0; j != NumSubLaneElts; ++j) SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts); } return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT), SubLaneMask); } static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, bool &ForceV1Zero, bool &ForceV2Zero, unsigned &ShuffleImm, ArrayRef Mask, const APInt &Zeroable) { int NumElts = VT.getVectorNumElements(); assert(VT.getScalarSizeInBits() == 64 && (NumElts == 2 || NumElts == 4 || NumElts == 8) && "Unexpected data type for VSHUFPD"); assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) && "Illegal shuffle mask"); bool ZeroLane[2] = { true, true }; for (int i = 0; i < NumElts; ++i) ZeroLane[i & 1] &= Zeroable[i]; // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, .. // Mask for V4F64; 0/1, 4/5, 2/3, 6/7.. ShuffleImm = 0; bool ShufpdMask = true; bool CommutableMask = true; for (int i = 0; i < NumElts; ++i) { if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1]) continue; if (Mask[i] < 0) return false; int Val = (i & 6) + NumElts * (i & 1); int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1); if (Mask[i] < Val || Mask[i] > Val + 1) ShufpdMask = false; if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1) CommutableMask = false; ShuffleImm |= (Mask[i] % 2) << i; } if (!ShufpdMask && !CommutableMask) return false; if (!ShufpdMask && CommutableMask) std::swap(V1, V2); ForceV1Zero = ZeroLane[0]; ForceV2Zero = ZeroLane[1]; return true; } static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) && "Unexpected data type for VSHUFPD"); unsigned Immediate = 0; bool ForceV1Zero = false, ForceV2Zero = false; if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate, Mask, Zeroable)) return SDValue(); // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs. if (ForceV1Zero) V1 = getZeroVector(VT, Subtarget, DAG, DL); if (ForceV2Zero) V2 = getZeroVector(VT, Subtarget, DAG, DL); return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, DAG.getTargetConstant(Immediate, DL, MVT::i8)); } // Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed // by zeroable elements in the remaining 24 elements. Turn this into two // vmovqb instructions shuffled together. static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, SelectionDAG &DAG) { assert(VT == MVT::v32i8 && "Unexpected type!"); // The first 8 indices should be every 8th element. if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8)) return SDValue(); // Remaining elements need to be zeroable. if (Zeroable.countLeadingOnes() < (Mask.size() - 8)) return SDValue(); V1 = DAG.getBitcast(MVT::v4i64, V1); V2 = DAG.getBitcast(MVT::v4i64, V2); V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1); V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2); // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in // the upper bits of the result using an unpckldq. SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 }); // Insert the unpckldq into a zero vector to widen to v32i8. return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8, DAG.getConstant(0, DL, MVT::v32i8), Unpack, DAG.getIntPtrConstant(0, DL)); } /// Handle lowering of 4-lane 64-bit floating point shuffles. /// /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 /// isn't available. static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; if (V2.isUndef()) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Use low duplicate instructions for masks that match their pattern. if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2})) return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1); if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) { // Non-half-crossing single input shuffles can be lowered with an // interleaved permutation. unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) | ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3); return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1, DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8)); } // With AVX2 we have direct support for this permutation. if (Subtarget.hasAVX2()) return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return V; // Try to permute the lanes and then use a per-lane permute. if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2, Mask, DAG, Subtarget)) return V; // Otherwise, fall back. return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask, DAG, Subtarget); } // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG)) return V; if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Check if the blend happens to exactly fit that of SHUFPD. if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Op; // If we have lane crossing shuffles AND they don't all come from the lower // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)). // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently // canonicalize to a blend of splat which isn't necessary for this combine. if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) && !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) && (V1.getOpcode() != ISD::BUILD_VECTOR) && (V2.getOpcode() != ISD::BUILD_VECTOR)) if (SDValue Op = lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG)) return Op; // If we have one input in place, then we can permute the other input and // blend the result. if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)) return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG); // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return V; // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. However, if we have AVX2 and either inputs are already in place, // we will be able to shuffle even across lanes the other input in a single // instruction so skip this pattern. if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)))) if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return V; // If we have VLX support, we can use VEXPAND. if (Subtarget.hasVLX()) if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; // If we have AVX2 then we always want to lower with a blend because an v4 we // can fully permute the elements. if (Subtarget.hasAVX2()) return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG); // Otherwise fall back on generic lowering. return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG); } /// Handle lowering of 4-lane 64-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v4i64 shuffling.. static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!"); if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Broadcast; if (V2.isUndef()) { // When the shuffle is mirrored between the 128-bit lanes of the unit, we // can use lower latency instructions that will operate on both lanes. SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) { SmallVector PSHUFDMask; scaleShuffleMask(2, RepeatedMask, PSHUFDMask); return DAG.getBitcast( MVT::v4i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, DAG.getBitcast(MVT::v8i32, V1), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); } // AVX2 provides a direct instruction for permuting a single input across // lanes. return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // If we have VLX support, we can use VALIGN or VEXPAND. if (Subtarget.hasVLX()) { if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; } // Try to use PALIGNR. if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG)) return V; // If we have one input in place, then we can permute the other input and // blend the result. if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)) return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG); // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return V; // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. However, if we have AVX2 and either inputs are already in place, // we will be able to shuffle even across lanes the other input in a single // instruction so skip this pattern. if (!isShuffleMaskInputInPlace(0, Mask) && !isShuffleMaskInputInPlace(1, Mask)) if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Result; // Otherwise fall back on generic blend lowering. return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG); } /// Handle lowering of 8-lane 32-bit floating point shuffles. /// /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2 /// isn't available. static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // If the shuffle mask is repeated in each 128-bit lane, we have many more // options to efficiently lower the shuffle. SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) { assert(RepeatedMask.size() == 4 && "Repeated masks must be half the mask width!"); // Use even/odd duplicate instructions for masks that match their pattern. if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2})) return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1); if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3})) return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1); if (V2.isUndef()) return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG)) return V; // Otherwise, fall back to a SHUFPS sequence. Here it is important that we // have already handled any direct blends. return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG); } // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) return V; // If we have a single input shuffle with different shuffle patterns in the // two 128-bit lanes use the variable mask to VPERMILPS. if (V2.isUndef()) { SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true); if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask); if (Subtarget.hasAVX2()) return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1); // Otherwise, fall back. return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG, Subtarget); } // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) return Result; // If we have VLX support, we can use VEXPAND. if (Subtarget.hasVLX()) if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; // For non-AVX512 if the Mask is of 16bit elements in lane then try to split // since after split we get a more efficient code using vpunpcklwd and // vpunpckhwd instrs than vblend. if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32)) if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) return V; // If we have AVX2 then we always want to lower with a blend because at v8 we // can fully permute the elements. if (Subtarget.hasAVX2()) return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG); // Otherwise fall back on generic lowering. return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG); } /// Handle lowering of 8-lane 32-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v8i32 shuffling.. static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // For non-AVX512 if the Mask is of 16bit elements in lane then try to split // since after split we get a more efficient code than vblend by using // vpunpcklwd and vpunpckhwd instrs. if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() && !Subtarget.hasAVX512()) if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return V; if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // If the shuffle mask is repeated in each 128-bit lane we can use more // efficient instructions that mirror the shuffles across the two 128-bit // lanes. SmallVector RepeatedMask; bool Is128BitLaneRepeatedShuffle = is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask); if (Is128BitLaneRepeatedShuffle) { assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); if (V2.isUndef()) return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG)) return V; } // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // If we have VLX support, we can use VALIGN or EXPAND. if (Subtarget.hasVLX()) { if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; } // Try to use byte rotation instructions. if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return V; // If the shuffle patterns aren't repeated but it is a single input, directly // generate a cross-lane VPERMD instruction. if (V2.isUndef()) { SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true); return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1); } // Assume that a single SHUFPS is faster than an alternative sequence of // multiple instructions (even if the CPU has a domain penalty). // If some CPU is harmed by the domain switch, we can fix it in a later pass. if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) { SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1); SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2); SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, CastV1, CastV2, DAG); return DAG.getBitcast(MVT::v8i32, ShufPS); } // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Result; // Otherwise fall back on generic blend lowering. return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG); } /// Handle lowering of 16-lane 16-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v16i16 shuffling.. static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend( DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return Broadcast; if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG, Subtarget)) return V; // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return V; if (V2.isUndef()) { // There are no generalized cross-lane shuffle operations available on i16 // element types. if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) { if (SDValue V = lowerShuffleAsLanePermuteAndPermute( DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget)) return V; return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget); } SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { // As this is a single-input shuffle, the repeated mask should be // a strictly valid v8i16 mask that we can pass through to the v8i16 // lowering to handle even the v16 case. return lowerV8I16GeneralSingleInputShuffle( DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG); } } if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG)) return PSHUFB; // AVX512BWVL can lower to VPERMW. if (Subtarget.hasBWI() && Subtarget.hasVLX()) return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG); // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return Result; // Try to permute the lanes and then use a per-lane permute. if (SDValue V = lowerShuffleAsLanePermuteAndPermute( DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget)) return V; // Otherwise fall back on generic lowering. return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG); } /// Handle lowering of 32-lane 8-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v32i8 shuffling.. static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return Broadcast; if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG, Subtarget)) return V; // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return V; // There are no generalized cross-lane shuffle operations available on i8 // element types. if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) { if (SDValue V = lowerShuffleAsLanePermuteAndPermute( DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget)) return V; return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget); } if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG)) return PSHUFB; // AVX512VBMIVL can lower to VPERMB. if (Subtarget.hasVBMI() && Subtarget.hasVLX()) return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG); // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return Result; // Try to permute the lanes and then use a per-lane permute. if (SDValue V = lowerShuffleAsLanePermuteAndPermute( DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget)) return V; // Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed // by zeroable elements in the remaining 24 elements. Turn this into two // vmovqb instructions shuffled together. if (Subtarget.hasVLX()) if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2, Mask, Zeroable, DAG)) return V; // Otherwise fall back on generic lowering. return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG); } /// High-level routine to lower various 256-bit x86 vector shuffles. /// /// This routine either breaks down the specific type of a 256-bit x86 vector /// shuffle or splits it into two 128-bit shuffles and fuses the results back /// together based on the available instructions. static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // If we have a single input to the zero element, insert that into V1 if we // can do so cheaply. int NumElts = VT.getVectorNumElements(); int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; }); if (NumV2Elements == 1 && Mask[0] >= NumElts) if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Handle special cases where the lower or upper half is UNDEF. if (SDValue V = lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) return V; // There is a really nice hard cut-over between AVX1 and AVX2 that means we // can check for those subtargets here and avoid much of the subtarget // querying in the per-vector-type lowering routines. With AVX1 we have // essentially *zero* ability to manipulate a 256-bit vector with integer // types. Since we'll use floating point types there eventually, just // immediately cast everything to a float and operate entirely in that domain. if (VT.isInteger() && !Subtarget.hasAVX2()) { int ElementBits = VT.getScalarSizeInBits(); if (ElementBits < 32) { // No floating point type available, if we can't use the bit operations // for masking/blending then decompose into 128-bit vectors. if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG)) return V; return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); } MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits), VT.getVectorNumElements()); V1 = DAG.getBitcast(FpVT, V1); V2 = DAG.getBitcast(FpVT, V2); return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask)); } switch (VT.SimpleTy) { case MVT::v4f64: return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v4i64: return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8f32: return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8i32: return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16i16: return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v32i8: return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); default: llvm_unreachable("Not a valid 256-bit x86 vector type!"); } } /// Try to lower a vector shuffle as a 128-bit shuffles. static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(VT.getScalarSizeInBits() == 64 && "Unexpected element type size for 128bit shuffle."); // To handle 256 bit vector requires VLX and most probably // function lowerV2X128VectorShuffle() is better solution. assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle."); // TODO - use Zeroable like we do for lowerV2X128VectorShuffle? SmallVector WidenedMask; if (!canWidenShuffleElements(Mask, WidenedMask)) return SDValue(); // Try to use an insert into a zero vector. if (WidenedMask[0] == 0 && (Zeroable & 0xf0) == 0xf0 && (WidenedMask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) { unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4; MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts); SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, getZeroVector(VT, Subtarget, DAG, DL), LoV, DAG.getIntPtrConstant(0, DL)); } // Check for patterns which can be matched with a single insert of a 256-bit // subvector. bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 0, 1, 2, 3}); if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 8, 9, 10, 11})) { MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4); SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2, DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec, DAG.getIntPtrConstant(4, DL)); } assert(WidenedMask.size() == 4); // See if this is an insertion of the lower 128-bits of V2 into V1. bool IsInsert = true; int V2Index = -1; for (int i = 0; i < 4; ++i) { assert(WidenedMask[i] >= -1); if (WidenedMask[i] < 0) continue; // Make sure all V1 subvectors are in place. if (WidenedMask[i] < 4) { if (WidenedMask[i] != i) { IsInsert = false; break; } } else { // Make sure we only have a single V2 index and its the lowest 128-bits. if (V2Index >= 0 || WidenedMask[i] != 4) { IsInsert = false; break; } V2Index = i; } } if (IsInsert && V2Index >= 0) { MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2); SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2, DAG.getIntPtrConstant(0, DL)); return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL); } // Try to lower to vshuf64x2/vshuf32x4. SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)}; unsigned PermMask = 0; // Insure elements came from the same Op. for (int i = 0; i < 4; ++i) { assert(WidenedMask[i] >= -1); if (WidenedMask[i] < 0) continue; SDValue Op = WidenedMask[i] >= 4 ? V2 : V1; unsigned OpIndex = i / 2; if (Ops[OpIndex].isUndef()) Ops[OpIndex] = Op; else if (Ops[OpIndex] != Op) return SDValue(); // Convert the 128-bit shuffle mask selection values into 128-bit selection // bits defined by a vshuf64x2 instruction's immediate control byte. PermMask |= (WidenedMask[i] % 4) << (i * 2); } return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1], DAG.getTargetConstant(PermMask, DL, MVT::i8)); } /// Handle lowering of 8-lane 64-bit floating point shuffles. static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); if (V2.isUndef()) { // Use low duplicate instructions for masks that match their pattern. if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6})) return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1); if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) { // Non-half-crossing single input shuffles can be lowered with an // interleaved permutation. unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) | ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) | ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) | ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7); return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1, DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8)); } SmallVector RepeatedMask; if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); } if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1, V2, Subtarget, DAG)) return Shuf128; if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG)) return Unpck; // Check if the blend happens to exactly fit that of SHUFPD. if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Op; if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG); } /// Handle lowering of 16-lane 32-bit floating point shuffles. static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); // If the shuffle mask is repeated in each 128-bit lane, we have many more // options to efficiently lower the shuffle. SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) { assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); // Use even/odd duplicate instructions for masks that match their pattern. if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2})) return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1); if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3})) return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1); if (V2.isUndef()) return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG)) return V; if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Otherwise, fall back to a SHUFPS sequence. return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG); } // If we have a single input shuffle with different shuffle patterns in the // 128-bit lanes and don't lane cross, use variable mask VPERMILPS. if (V2.isUndef() && !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) { SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true); return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask); } // If we have AVX512F support, we can use VEXPAND. if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG); } /// Handle lowering of 8-lane 64-bit integer shuffles. static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); if (V2.isUndef()) { // When the shuffle is mirrored between the 128-bit lanes of the unit, we // can use lower latency instructions that will operate on all four // 128-bit lanes. SmallVector Repeated128Mask; if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) { SmallVector PSHUFDMask; scaleShuffleMask(2, Repeated128Mask, PSHUFDMask); return DAG.getBitcast( MVT::v8i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, DAG.getBitcast(MVT::v16i32, V1), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); } SmallVector Repeated256Mask; if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask)) return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1, getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG)); } if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1, V2, Subtarget, DAG)) return Shuf128; // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use VALIGN. if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v8i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Try to use PALIGNR. if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG)) return Unpck; // If we have AVX512F support, we can use VEXPAND. if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG); } /// Handle lowering of 16-lane 32-bit integer shuffles. static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend( DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // If the shuffle mask is repeated in each 128-bit lane we can use more // efficient instructions that mirror the shuffles across the four 128-bit // lanes. SmallVector RepeatedMask; bool Is128BitLaneRepeatedShuffle = is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask); if (Is128BitLaneRepeatedShuffle) { assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); if (V2.isUndef()) return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG)) return V; } // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use VALIGN. if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Try to use byte rotation instructions. if (Subtarget.hasBWI()) if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Assume that a single SHUFPS is faster than using a permv shuffle. // If some CPU is harmed by the domain switch, we can fix it in a later pass. if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) { SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1); SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2); SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, CastV1, CastV2, DAG); return DAG.getBitcast(MVT::v16i32, ShufPS); } // If we have AVX512F support, we can use VEXPAND. if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG); } /// Handle lowering of 32-lane 16-bit integer shuffles. static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend( DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG)) return V; // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (V2.isUndef()) { SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) { // As this is a single-input shuffle, the repeated mask should be // a strictly valid v8i16 mask that we can pass through to the v8i16 // lowering to handle even the v32 case. return lowerV8I16GeneralSingleInputShuffle( DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG); } } if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2, Zeroable, Subtarget, DAG)) return PSHUFB; return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG); } /// Handle lowering of 64-lane 8-bit integer shuffles. static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!"); assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend( DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG, Subtarget)) return V; // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG)) return PSHUFB; // VBMI can use VPERMV/VPERMV3 byte shuffles. if (Subtarget.hasVBMI()) return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG); // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) return V; if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (!V2.isUndef()) if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) return Result; // FIXME: Implement direct support for this type! return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG); } /// High-level routine to lower various 512-bit x86 vector shuffles. /// /// This routine either breaks down the specific type of a 512-bit x86 vector /// shuffle or splits it into two 256-bit shuffles and fuses the results back /// together based on the available instructions. static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/ basic ISA!"); // If we have a single input to the zero element, insert that into V1 if we // can do so cheaply. int NumElts = Mask.size(); int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; }); if (NumV2Elements == 1 && Mask[0] >= NumElts) if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Handle special cases where the lower or upper half is UNDEF. if (SDValue V = lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) return V; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Dispatch to each element type for lowering. If we don't have support for // specific element type shuffles at 512 bits, immediately split them and // lower them. Each lowering routine of a given type is allowed to assume that // the requisite ISA extensions for that element type are available. switch (VT.SimpleTy) { case MVT::v8f64: return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16f32: return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8i64: return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16i32: return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v32i16: return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v64i8: return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); default: llvm_unreachable("Not a valid 512-bit x86 vector type!"); } } static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef Mask, MVT VT, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // Shuffle should be unary. if (!V2.isUndef()) return SDValue(); int ShiftAmt = -1; int NumElts = Mask.size(); for (int i = 0; i != NumElts; ++i) { int M = Mask[i]; assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) && "Unexpected mask index."); if (M < 0) continue; // The first non-undef element determines our shift amount. if (ShiftAmt < 0) { ShiftAmt = M - i; // Need to be shifting right. if (ShiftAmt <= 0) return SDValue(); } // All non-undef elements must shift by the same amount. if (ShiftAmt != M - i) return SDValue(); } assert(ShiftAmt >= 0 && "All undef?"); // Great we found a shift right. MVT WideVT = VT; if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8) WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1; SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT, DAG.getUNDEF(WideVT), V1, DAG.getIntPtrConstant(0, DL)); Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, DAG.getIntPtrConstant(0, DL)); } // Determine if this shuffle can be implemented with a KSHIFT instruction. // Returns the shift amount if possible or -1 if not. This is a simplified // version of matchShuffleAsShift. static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef Mask, int MaskOffset, const APInt &Zeroable) { int Size = Mask.size(); auto CheckZeros = [&](int Shift, bool Left) { for (int j = 0; j < Shift; ++j) if (!Zeroable[j + (Left ? 0 : (Size - Shift))]) return false; return true; }; auto MatchShift = [&](int Shift, bool Left) { unsigned Pos = Left ? Shift : 0; unsigned Low = Left ? 0 : Shift; unsigned Len = Size - Shift; return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset); }; for (int Shift = 1; Shift != Size; ++Shift) for (bool Left : {true, false}) if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) { Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR; return Shift; } return -1; } // Lower vXi1 vector shuffles. // There is no a dedicated instruction on AVX-512 that shuffles the masks. // The only way to shuffle bits is to sign-extend the mask vector to SIMD // vector, shuffle and then truncate it back. static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/o basic ISA!"); int NumElts = Mask.size(); // Try to recognize shuffles that are just padding a subvector with zeros. int SubvecElts = 0; int Src = -1; for (int i = 0; i != NumElts; ++i) { if (Mask[i] >= 0) { // Grab the source from the first valid mask. All subsequent elements need // to use this same source. if (Src < 0) Src = Mask[i] / NumElts; if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i) break; } ++SubvecElts; } assert(SubvecElts != NumElts && "Identity shuffle?"); // Clip to a power 2. SubvecElts = PowerOf2Floor(SubvecElts); // Make sure the number of zeroable bits in the top at least covers the bits // not covered by the subvector. if ((int)Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) { assert(Src >= 0 && "Expected a source!"); MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts); SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Src == 0 ? V1 : V2, DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getConstant(0, DL, VT), Extract, DAG.getIntPtrConstant(0, DL)); } // Try a simple shift right with undef elements. Later we'll try with zeros. if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget, DAG)) return Shift; // Try to match KSHIFTs. unsigned Offset = 0; for (SDValue V : { V1, V2 }) { unsigned Opcode; int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable); if (ShiftAmt >= 0) { MVT WideVT = VT; if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8) WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1; SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT, DAG.getUNDEF(WideVT), V, DAG.getIntPtrConstant(0, DL)); // Widened right shifts need two shifts to ensure we shift in zeroes. if (Opcode == X86ISD::KSHIFTR && WideVT != VT) { int WideElts = WideVT.getVectorNumElements(); // Shift left to put the original vector in the MSBs of the new size. Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res, DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8)); // Increase the shift amount to account for the left shift. ShiftAmt += WideElts - NumElts; } Res = DAG.getNode(Opcode, DL, WideVT, Res, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, DAG.getIntPtrConstant(0, DL)); } Offset += NumElts; // Increment for next iteration. } MVT ExtVT; switch (VT.SimpleTy) { default: llvm_unreachable("Expected a vector of i1 elements"); case MVT::v2i1: ExtVT = MVT::v2i64; break; case MVT::v4i1: ExtVT = MVT::v4i32; break; case MVT::v8i1: // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit // shuffle. ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64; break; case MVT::v16i1: // Take 512-bit type, unless we are avoiding 512-bit types and have the // 256-bit operation available. ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16; break; case MVT::v32i1: // Take 512-bit type, unless we are avoiding 512-bit types and have the // 256-bit operation available. assert(Subtarget.hasBWI() && "Expected AVX512BW support"); ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8; break; case MVT::v64i1: // Fall back to scalarization. FIXME: We can do better if the shuffle // can be partitioned cleanly. if (!Subtarget.useBWIRegs()) return SDValue(); ExtVT = MVT::v64i8; break; } V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1); V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2); SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask); // i1 was sign extended we can use X86ISD::CVT2MASK. int NumElems = VT.getVectorNumElements(); if ((Subtarget.hasBWI() && (NumElems >= 32)) || (Subtarget.hasDQI() && (NumElems < 32))) return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT), Shuffle, ISD::SETGT); return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle); } /// Helper function that returns true if the shuffle mask should be /// commuted to improve canonicalization. static bool canonicalizeShuffleMaskWithCommute(ArrayRef Mask) { int NumElements = Mask.size(); int NumV1Elements = 0, NumV2Elements = 0; for (int M : Mask) if (M < 0) continue; else if (M < NumElements) ++NumV1Elements; else ++NumV2Elements; // Commute the shuffle as needed such that more elements come from V1 than // V2. This allows us to match the shuffle pattern strictly on how many // elements come from V1 without handling the symmetric cases. if (NumV2Elements > NumV1Elements) return true; assert(NumV1Elements > 0 && "No V1 indices"); if (NumV2Elements == 0) return false; // When the number of V1 and V2 elements are the same, try to minimize the // number of uses of V2 in the low half of the vector. When that is tied, // ensure that the sum of indices for V1 is equal to or lower than the sum // indices for V2. When those are equal, try to ensure that the number of odd // indices for V1 is lower than the number of odd indices for V2. if (NumV1Elements == NumV2Elements) { int LowV1Elements = 0, LowV2Elements = 0; for (int M : Mask.slice(0, NumElements / 2)) if (M >= NumElements) ++LowV2Elements; else if (M >= 0) ++LowV1Elements; if (LowV2Elements > LowV1Elements) return true; if (LowV2Elements == LowV1Elements) { int SumV1Indices = 0, SumV2Indices = 0; for (int i = 0, Size = Mask.size(); i < Size; ++i) if (Mask[i] >= NumElements) SumV2Indices += i; else if (Mask[i] >= 0) SumV1Indices += i; if (SumV2Indices < SumV1Indices) return true; if (SumV2Indices == SumV1Indices) { int NumV1OddIndices = 0, NumV2OddIndices = 0; for (int i = 0, Size = Mask.size(); i < Size; ++i) if (Mask[i] >= NumElements) NumV2OddIndices += i % 2; else if (Mask[i] >= 0) NumV1OddIndices += i % 2; if (NumV2OddIndices < NumV1OddIndices) return true; } } } return false; } /// Top-level lowering for x86 vector shuffles. /// /// This handles decomposition, canonicalization, and lowering of all x86 /// vector shuffles. Most of the specific lowering strategies are encapsulated /// above in helper routines. The canonicalization attempts to widen shuffles /// to involve fewer lanes of wider elements, consolidate symmetric patterns /// s.t. only one of the two inputs needs to be tested, etc. static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef OrigMask = SVOp->getMask(); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); MVT VT = Op.getSimpleValueType(); int NumElements = VT.getVectorNumElements(); SDLoc DL(Op); bool Is1BitVector = (VT.getVectorElementType() == MVT::i1); assert((VT.getSizeInBits() != 64 || Is1BitVector) && "Can't lower MMX shuffles"); bool V1IsUndef = V1.isUndef(); bool V2IsUndef = V2.isUndef(); if (V1IsUndef && V2IsUndef) return DAG.getUNDEF(VT); // When we create a shuffle node we put the UNDEF node to second operand, // but in some cases the first operand may be transformed to UNDEF. // In this case we should just commute the node. if (V1IsUndef) return DAG.getCommutedVectorShuffle(*SVOp); // Check for non-undef masks pointing at an undef vector and make the masks // undef as well. This makes it easier to match the shuffle based solely on // the mask. if (V2IsUndef && any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) { SmallVector NewMask(OrigMask.begin(), OrigMask.end()); for (int &M : NewMask) if (M >= NumElements) M = -1; return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); } // Check for illegal shuffle mask element index values. int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit; assert(llvm::all_of(OrigMask, [&](int M) { return -1 <= M && M < MaskUpperLimit; }) && "Out of bounds shuffle index"); // We actually see shuffles that are entirely re-arrangements of a set of // zero inputs. This mostly happens while decomposing complex shuffles into // simple ones. Directly lower these as a buildvector of zeros. APInt KnownUndef, KnownZero; computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero); APInt Zeroable = KnownUndef | KnownZero; if (Zeroable.isAllOnesValue()) return getZeroVector(VT, Subtarget, DAG, DL); bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode()); // Try to collapse shuffles into using a vector type with fewer elements but // wider element types. We cap this to not form integers or floating point // elements wider than 64 bits, but it might be interesting to form i128 // integers to handle flipping the low and high halves of AVX 256-bit vectors. SmallVector WidenedMask; if (VT.getScalarSizeInBits() < 64 && !Is1BitVector && canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) { // Shuffle mask widening should not interfere with a broadcast opportunity // by obfuscating the operands with bitcasts. // TODO: Avoid lowering directly from this top-level function: make this // a query (canLowerAsBroadcast) and defer lowering to the type-based calls. if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask, Subtarget, DAG)) return Broadcast; MVT NewEltVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2) : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2); int NewNumElts = NumElements / 2; MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts); // Make sure that the new vector type is legal. For example, v2f64 isn't // legal on SSE1. if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) { if (V2IsZero) { // Modify the new Mask to take all zeros from the all-zero vector. // Choose indices that are blend-friendly. bool UsedZeroVector = false; assert(find(WidenedMask, SM_SentinelZero) != WidenedMask.end() && "V2's non-undef elements are used?!"); for (int i = 0; i != NewNumElts; ++i) if (WidenedMask[i] == SM_SentinelZero) { WidenedMask[i] = i + NewNumElts; UsedZeroVector = true; } // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits // some elements to be undef. if (UsedZeroVector) V2 = getZeroVector(NewVT, Subtarget, DAG, DL); } V1 = DAG.getBitcast(NewVT, V1); V2 = DAG.getBitcast(NewVT, V2); return DAG.getBitcast( VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask)); } } // Commute the shuffle if it will improve canonicalization. SmallVector Mask(OrigMask.begin(), OrigMask.end()); if (canonicalizeShuffleMaskWithCommute(Mask)) { ShuffleVectorSDNode::commuteMask(Mask); std::swap(V1, V2); } if (SDValue V = lowerShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget)) return V; // For each vector width, delegate to a specialized lowering routine. if (VT.is128BitVector()) return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); if (VT.is256BitVector()) return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); if (VT.is512BitVector()) return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); if (Is1BitVector) return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); llvm_unreachable("Unimplemented!"); } /// Try to lower a VSELECT instruction to a vector shuffle. static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue Cond = Op.getOperand(0); SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); MVT VT = Op.getSimpleValueType(); // Only non-legal VSELECTs reach this lowering, convert those into generic // shuffles and re-use the shuffle lowering path for blends. SmallVector Mask; if (createShuffleMaskFromVSELECT(Mask, Cond)) return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask); return SDValue(); } SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Cond = Op.getOperand(0); SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); // A vselect where all conditions and data are constants can be optimized into // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR(). if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) && ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) && ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) return SDValue(); // Try to lower this to a blend-style vector shuffle. This can handle all // constant condition cases. if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG)) return BlendOp; // If this VSELECT has a vector if i1 as a mask, it will be directly matched // with patterns on the mask registers on AVX-512. MVT CondVT = Cond.getSimpleValueType(); unsigned CondEltSize = Cond.getScalarValueSizeInBits(); if (CondEltSize == 1) return Op; // Variable blends are only legal from SSE4.1 onward. if (!Subtarget.hasSSE41()) return SDValue(); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); unsigned EltSize = VT.getScalarSizeInBits(); unsigned NumElts = VT.getVectorNumElements(); // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition // into an i1 condition so that we can use the mask-based 512-bit blend // instructions. if (VT.getSizeInBits() == 512) { // Build a mask by testing the condition against zero. MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts); SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond, DAG.getConstant(0, dl, CondVT), ISD::SETNE); // Now return a new VSELECT using the mask. return DAG.getSelect(dl, VT, Mask, LHS, RHS); } // SEXT/TRUNC cases where the mask doesn't match the destination size. if (CondEltSize != EltSize) { // If we don't have a sign splat, rely on the expansion. if (CondEltSize != DAG.ComputeNumSignBits(Cond)) return SDValue(); MVT NewCondSVT = MVT::getIntegerVT(EltSize); MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts); Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT); return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS); } // Only some types will be legal on some subtargets. If we can emit a legal // VSELECT-matching blend, return Op, and but if we need to expand, return // a null value. switch (VT.SimpleTy) { default: // Most of the vector types have blends past SSE4.1. return Op; case MVT::v32i8: // The byte blends for AVX vectors were introduced only in AVX2. if (Subtarget.hasAVX2()) return Op; return SDValue(); case MVT::v8i16: case MVT::v16i16: { // Bitcast everything to the vXi8 type and use a vXi8 vselect. MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2); Cond = DAG.getBitcast(CastVT, Cond); LHS = DAG.getBitcast(CastVT, LHS); RHS = DAG.getBitcast(CastVT, RHS); SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS); return DAG.getBitcast(VT, Select); } } } static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); if (!Op.getOperand(0).getSimpleValueType().is128BitVector()) return SDValue(); if (VT.getSizeInBits() == 8) { SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Op.getOperand(0), Op.getOperand(1)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract); } if (VT == MVT::f32) { // EXTRACTPS outputs to a GPR32 register which will require a movd to copy // the result back to FR32 register. It's only worth matching if the // result has a single use which is a store or a bitcast to i32. And in // the case of a store, it's not worth it if the index is a constant 0, // because a MOVSSmr can be used instead, which is smaller and faster. if (!Op.hasOneUse()) return SDValue(); SDNode *User = *Op.getNode()->use_begin(); if ((User->getOpcode() != ISD::STORE || isNullConstant(Op.getOperand(1))) && (User->getOpcode() != ISD::BITCAST || User->getValueType(0) != MVT::i32)) return SDValue(); SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Op.getOperand(0)), Op.getOperand(1)); return DAG.getBitcast(MVT::f32, Extract); } if (VT == MVT::i32 || VT == MVT::i64) { // ExtractPS/pextrq works with constant index. if (isa(Op.getOperand(1))) return Op; } return SDValue(); } /// Extract one bit from mask vector, like v16i1 or v8i1. /// AVX-512 feature. static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue Vec = Op.getOperand(0); SDLoc dl(Vec); MVT VecVT = Vec.getSimpleValueType(); SDValue Idx = Op.getOperand(1); MVT EltVT = Op.getSimpleValueType(); assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) && "Unexpected vector type in ExtractBitFromMaskVector"); // variable index can't be handled in mask registers, // extend vector to VR512/128 if (!isa(Idx)) { unsigned NumElts = VecVT.getVectorNumElements(); // Extending v8i1/v16i1 to 512-bit get better performance on KNL // than extending to 128/256bit. MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8; MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts); SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec); SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx); return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); } unsigned IdxVal = cast(Idx)->getZExtValue(); if (IdxVal == 0) // the operation is legal return Op; // Extend to natively supported kshift. unsigned NumElems = VecVT.getVectorNumElements(); MVT WideVecVT = VecVT; if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) { WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1; Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT, DAG.getUNDEF(WideVecVT), Vec, DAG.getIntPtrConstant(0, dl)); } // Use kshiftr instruction to move to the lower element. Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec, DAG.getTargetConstant(IdxVal, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, DAG.getIntPtrConstant(0, dl)); } SDValue X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); SDValue Vec = Op.getOperand(0); MVT VecVT = Vec.getSimpleValueType(); SDValue Idx = Op.getOperand(1); if (VecVT.getVectorElementType() == MVT::i1) return ExtractBitFromMaskVector(Op, DAG, Subtarget); if (!isa(Idx)) { // Its more profitable to go through memory (1 cycles throughput) // than using VMOVD + VPERMV/PSHUFB sequence ( 2/3 cycles throughput) // IACA tool was used to get performance estimation // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer) // // example : extractelement <16 x i8> %a, i32 %i // // Block Throughput: 3.00 Cycles // Throughput Bottleneck: Port5 // // | Num Of | Ports pressure in cycles | | // | Uops | 0 - DV | 5 | 6 | 7 | | // --------------------------------------------- // | 1 | | 1.0 | | | CP | vmovd xmm1, edi // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0 // Total Num Of Uops: 4 // // // Block Throughput: 1.00 Cycles // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4 // // | | Ports pressure in cycles | | // |Uops| 1 | 2 - D |3 - D | 4 | 5 | | // --------------------------------------------------------- // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18] // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1] // Total Num Of Uops: 4 return SDValue(); } unsigned IdxVal = cast(Idx)->getZExtValue(); // If this is a 256-bit vector result, first extract the 128-bit vector and // then extract the element from the 128-bit vector. if (VecVT.is256BitVector() || VecVT.is512BitVector()) { // Get the 128-bit vector. Vec = extract128BitVector(Vec, IdxVal, DAG, dl); MVT EltVT = VecVT.getVectorElementType(); unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits(); assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2 // this can be done with a mask. IdxVal &= ElemsPerChunk - 1; return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, DAG.getIntPtrConstant(IdxVal, dl)); } assert(VecVT.is128BitVector() && "Unexpected vector length"); MVT VT = Op.getSimpleValueType(); if (VT.getSizeInBits() == 16) { // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless // we're going to zero extend the register or fold the store (SSE41 only). if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) && !(Subtarget.hasSSE41() && MayFoldIntoStore(Op))) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Vec), Idx)); // Transform it so it match pextrw which produces a 32-bit result. SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Op.getOperand(0), Op.getOperand(1)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract); } if (Subtarget.hasSSE41()) if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG)) return Res; // TODO: We only extract a single element from v16i8, we can probably afford // to be more aggressive here before using the default approach of spilling to // stack. if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) { // Extract either the lowest i32 or any i16, and extract the sub-byte. int DWordIdx = IdxVal / 4; if (DWordIdx == 0) { SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Vec), DAG.getIntPtrConstant(DWordIdx, dl)); int ShiftVal = (IdxVal % 4) * 8; if (ShiftVal != 0) Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res, DAG.getConstant(ShiftVal, dl, MVT::i8)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); } int WordIdx = IdxVal / 2; SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, DAG.getBitcast(MVT::v8i16, Vec), DAG.getIntPtrConstant(WordIdx, dl)); int ShiftVal = (IdxVal % 2) * 8; if (ShiftVal != 0) Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res, DAG.getConstant(ShiftVal, dl, MVT::i8)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); } if (VT.getSizeInBits() == 32) { if (IdxVal == 0) return Op; // SHUFPS the element to the lowest double word, then movss. int Mask[4] = { static_cast(IdxVal), -1, -1, -1 }; Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, DAG.getIntPtrConstant(0, dl)); } if (VT.getSizeInBits() == 64) { // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught // to match extract_elt for f64. if (IdxVal == 0) return Op; // UNPCKHPD the element to the lowest double word, then movsd. // Note if the lower 64 bits of the result of the UNPCKHPD is then stored // to a f64mem, the whole operation is folded into a single MOVHPDmr. int Mask[2] = { 1, -1 }; Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, DAG.getIntPtrConstant(0, dl)); } return SDValue(); } /// Insert one bit to mask vector, like v16i1 or v8i1. /// AVX-512 feature. static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc dl(Op); SDValue Vec = Op.getOperand(0); SDValue Elt = Op.getOperand(1); SDValue Idx = Op.getOperand(2); MVT VecVT = Vec.getSimpleValueType(); if (!isa(Idx)) { // Non constant index. Extend source and destination, // insert element and then truncate the result. unsigned NumElts = VecVT.getVectorNumElements(); MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8; MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts); SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec), DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx); return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp); } // Copy into a k-register, extract to v1i1 and insert_subvector. SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Op.getOperand(2)); } SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); unsigned NumElts = VT.getVectorNumElements(); if (EltVT == MVT::i1) return InsertBitToMaskVector(Op, DAG, Subtarget); SDLoc dl(Op); SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); SDValue N2 = Op.getOperand(2); auto *N2C = dyn_cast(N2); if (!N2C || N2C->getAPIntValue().uge(NumElts)) return SDValue(); uint64_t IdxVal = N2C->getZExtValue(); bool IsZeroElt = X86::isZeroNode(N1); bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1); // If we are inserting a element, see if we can do this more efficiently with // a blend shuffle with a rematerializable vector than a costly integer // insertion. if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() && 16 <= EltVT.getSizeInBits()) { SmallVector BlendMask; for (unsigned i = 0; i != NumElts; ++i) BlendMask.push_back(i == IdxVal ? i + NumElts : i); SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl) : getOnesVector(VT, DAG, dl); return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask); } // If the vector is wider than 128 bits, extract the 128-bit subvector, insert // into that, and then insert the subvector back into the result. if (VT.is256BitVector() || VT.is512BitVector()) { // With a 256-bit vector, we can insert into the zero element efficiently // using a blend if we have AVX or AVX2 and the right data type. if (VT.is256BitVector() && IdxVal == 0) { // TODO: It is worthwhile to cast integer to floating point and back // and incur a domain crossing penalty if that's what we'll end up // doing anyway after extracting to a 128-bit vector. if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) || (Subtarget.hasAVX2() && EltVT == MVT::i32)) { SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1); return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, DAG.getTargetConstant(1, dl, MVT::i8)); } } // Get the desired 128-bit vector chunk. SDValue V = extract128BitVector(N0, IdxVal, DAG, dl); // Insert the element into the desired chunk. unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits(); assert(isPowerOf2_32(NumEltsIn128)); // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo. unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1); V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, DAG.getIntPtrConstant(IdxIn128, dl)); // Insert the changed part back into the bigger vector return insert128BitVector(N0, V, IdxVal, DAG, dl); } assert(VT.is128BitVector() && "Only 128-bit vector types should be left!"); // This will be just movd/movq/movss/movsd. if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode()) && (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 || EltVT == MVT::i64)) { N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1); return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG); } // Transform it so it match pinsr{b,w} which expects a GR32 as its second // argument. SSE41 required for pinsrb. if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) { unsigned Opc; if (VT == MVT::v8i16) { assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW"); Opc = X86ISD::PINSRW; } else { assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector"); assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB"); Opc = X86ISD::PINSRB; } if (N1.getValueType() != MVT::i32) N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); if (N2.getValueType() != MVT::i32) N2 = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(Opc, dl, VT, N0, N1, N2); } if (Subtarget.hasSSE41()) { if (EltVT == MVT::f32) { // Bits [7:6] of the constant are the source select. This will always be // zero here. The DAG Combiner may combine an extract_elt index into // these bits. For example (insert (extract, 3), 2) could be matched by // putting the '3' into bits [7:6] of X86ISD::INSERTPS. // Bits [5:4] of the constant are the destination select. This is the // value of the incoming immediate. // Bits [3:0] of the constant are the zero mask. The DAG Combiner may // combine either bitwise AND or insert of float 0.0 to set these bits. bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize(); if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) { // If this is an insertion of 32-bits into the low 32-bits of // a vector, we prefer to generate a blend with immediate rather // than an insertps. Blends are simpler operations in hardware and so // will always have equal or better performance than insertps. // But if optimizing for size and there's a load folding opportunity, // generate insertps because blendps does not have a 32-bit memory // operand form. N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, DAG.getTargetConstant(1, dl, MVT::i8)); } // Create this as a scalar to vector.. N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8)); } // PINSR* works with constant index. if (EltVT == MVT::i32 || EltVT == MVT::i64) return Op; } return SDValue(); } static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); MVT OpVT = Op.getSimpleValueType(); // It's always cheaper to replace a xor+movd with xorps and simplifies further // combines. if (X86::isZeroNode(Op.getOperand(0))) return getZeroVector(OpVT, Subtarget, DAG, dl); // If this is a 256-bit vector result, first insert into a 128-bit // vector and then insert into the 256-bit vector. if (!OpVT.is128BitVector()) { // Insert into a 128-bit vector. unsigned SizeFactor = OpVT.getSizeInBits() / 128; MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(), OpVT.getVectorNumElements() / SizeFactor); Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0)); // Insert the 128-bit vector. return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl); } assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 && "Expected an SSE type!"); // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen. if (OpVT == MVT::v4i32) return Op; SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); return DAG.getBitcast( OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt)); } // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a // simple superregister reference or explicit instructions to insert // the upper bits of a vector. static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1); return insert1BitVector(Op, DAG, Subtarget); } static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 && "Only vXi1 extract_subvectors need custom lowering"); SDLoc dl(Op); SDValue Vec = Op.getOperand(0); SDValue Idx = Op.getOperand(1); if (!isa(Idx)) return SDValue(); unsigned IdxVal = cast(Idx)->getZExtValue(); if (IdxVal == 0) // the operation is legal return Op; MVT VecVT = Vec.getSimpleValueType(); unsigned NumElems = VecVT.getVectorNumElements(); // Extend to natively supported kshift. MVT WideVecVT = VecVT; if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) { WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1; Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT, DAG.getUNDEF(WideVecVT), Vec, DAG.getIntPtrConstant(0, dl)); } // Shift to the LSB. Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec, DAG.getTargetConstant(IdxVal, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec, DAG.getIntPtrConstant(0, dl)); } // Returns the appropriate wrapper opcode for a global reference. unsigned X86TargetLowering::getGlobalWrapperKind( const GlobalValue *GV, const unsigned char OpFlags) const { // References to absolute symbols are never PC-relative. if (GV && GV->isAbsoluteSymbolRef()) return X86ISD::Wrapper; CodeModel::Model M = getTargetMachine().getCodeModel(); if (Subtarget.isPICStyleRIPRel() && (M == CodeModel::Small || M == CodeModel::Kernel)) return X86ISD::WrapperRIP; // GOTPCREL references must always use RIP. if (OpFlags == X86II::MO_GOTPCREL) return X86ISD::WrapperRIP; return X86ISD::Wrapper; } // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is // one of the above mentioned nodes. It has to be wrapped because otherwise // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only // be used to form addressing mode. These wrapped nodes will be selected // into MOV32ri. SDValue X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { ConstantPoolSDNode *CP = cast(Op); // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetConstantPool( CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag); SDLoc DL(CP); Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (OpFlag) { Result = DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); } return Result; } SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { JumpTableSDNode *JT = cast(Op); // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag); SDLoc DL(JT); Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (OpFlag) Result = DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); return Result; } SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false); } SDValue X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { // Create the TargetBlockAddressAddress node. unsigned char OpFlags = Subtarget.classifyBlockAddressReference(); const BlockAddress *BA = cast(Op)->getBlockAddress(); int64_t Offset = cast(Op)->getOffset(); SDLoc dl(Op); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags); Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (isGlobalRelativeToPICBase(OpFlags)) { Result = DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result); } return Result; } /// Creates target global address or external symbol nodes for calls or /// other uses. SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG, bool ForCall) const { // Unpack the global address or external symbol. const SDLoc &dl = SDLoc(Op); const GlobalValue *GV = nullptr; int64_t Offset = 0; const char *ExternalSym = nullptr; if (const auto *G = dyn_cast(Op)) { GV = G->getGlobal(); Offset = G->getOffset(); } else { const auto *ES = cast(Op); ExternalSym = ES->getSymbol(); } // Calculate some flags for address lowering. const Module &Mod = *DAG.getMachineFunction().getFunction().getParent(); unsigned char OpFlags; if (ForCall) OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod); else OpFlags = Subtarget.classifyGlobalReference(GV, Mod); bool HasPICReg = isGlobalRelativeToPICBase(OpFlags); bool NeedsLoad = isGlobalStubReference(OpFlags); CodeModel::Model M = DAG.getTarget().getCodeModel(); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result; if (GV) { // Create a target global address if this is a global. If possible, fold the // offset into the global address reference. Otherwise, ADD it on later. int64_t GlobalOffset = 0; if (OpFlags == X86II::MO_NO_FLAG && X86::isOffsetSuitableForCodeModel(Offset, M)) { std::swap(GlobalOffset, Offset); } Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags); } else { // If this is not a global address, this must be an external symbol. Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags); } // If this is a direct call, avoid the wrapper if we don't need to do any // loads or adds. This allows SDAG ISel to match direct calls. if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0) return Result; Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (HasPICReg) { Result = DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result); } // For globals that require a load from a stub to get the address, emit the // load. if (NeedsLoad) Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, MachinePointerInfo::getGOT(DAG.getMachineFunction())); // If there was a non-zero offset that we didn't fold, create an explicit // addition for it. if (Offset != 0) Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, DAG.getConstant(Offset, dl, PtrVT)); return Result; } SDValue X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false); } static SDValue GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, unsigned char OperandFlags, bool LocalDynamic = false) { MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); SDLoc dl(GA); SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags); X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR : X86ISD::TLSADDR; if (InFlag) { SDValue Ops[] = { Chain, TGA, *InFlag }; Chain = DAG.getNode(CallType, dl, NodeTys, Ops); } else { SDValue Ops[] = { Chain, TGA }; Chain = DAG.getNode(CallType, dl, NodeTys, Ops); } // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. MFI.setAdjustsStack(true); MFI.setHasCalls(true); SDValue Flag = Chain.getValue(1); return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); } // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT) { SDValue InFlag; SDLoc dl(GA); // ? function entry point might be better SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag); InFlag = Chain.getValue(1); return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); } // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT) { return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX, X86II::MO_TLSGD); } static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool is64Bit) { SDLoc dl(GA); // Get the start address of the TLS block for this module. X86MachineFunctionInfo *MFI = DAG.getMachineFunction() .getInfo(); MFI->incNumLocalDynamicTLSAccesses(); SDValue Base; if (is64Bit) { Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX, X86II::MO_TLSLD, /*LocalDynamic=*/true); } else { SDValue InFlag; SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag); InFlag = Chain.getValue(1); Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSLDM, /*LocalDynamic=*/true); } // Note: the CleanupLocalDynamicTLSPass will remove redundant computations // of Base. // Build x@dtpoff. unsigned char OperandFlags = X86II::MO_DTPOFF; unsigned WrapperKind = X86ISD::Wrapper; SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags); SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); // Add x@dtpoff with the base. return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base); } // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model. static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit, bool isPIC) { SDLoc dl(GA); // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), is64Bit ? 257 : 256)); SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr)); unsigned char OperandFlags = 0; // Most TLS accesses are not RIP relative, even on x86-64. One exception is // initialexec. unsigned WrapperKind = X86ISD::Wrapper; if (model == TLSModel::LocalExec) { OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF; } else if (model == TLSModel::InitialExec) { if (is64Bit) { OperandFlags = X86II::MO_GOTTPOFF; WrapperKind = X86ISD::WrapperRIP; } else { OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF; } } else { llvm_unreachable("Unexpected model"); } // emit "addl x@ntpoff,%eax" (local exec) // or "addl x@indntpoff,%eax" (initial exec) // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic) SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags); SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); if (model == TLSModel::InitialExec) { if (isPIC && !is64Bit) { Offset = DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Offset); } Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, MachinePointerInfo::getGOT(DAG.getMachineFunction())); } // The address of the thread local variable is the add of the thread // pointer with the offset of the variable. return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); } SDValue X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GA = cast(Op); if (DAG.getTarget().useEmulatedTLS()) return LowerToTLSEmulatedModel(GA, DAG); const GlobalValue *GV = GA->getGlobal(); auto PtrVT = getPointerTy(DAG.getDataLayout()); bool PositionIndependent = isPositionIndependent(); if (Subtarget.isTargetELF()) { TLSModel::Model model = DAG.getTarget().getTLSModel(GV); switch (model) { case TLSModel::GeneralDynamic: if (Subtarget.is64Bit()) return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT); return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT); case TLSModel::LocalDynamic: return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit()); case TLSModel::InitialExec: case TLSModel::LocalExec: return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(), PositionIndependent); } llvm_unreachable("Unknown TLS model."); } if (Subtarget.isTargetDarwin()) { // Darwin only has one model of TLS. Lower to that. unsigned char OpFlag = 0; unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ? X86ISD::WrapperRIP : X86ISD::Wrapper; // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. bool PIC32 = PositionIndependent && !Subtarget.is64Bit(); if (PIC32) OpFlag = X86II::MO_TLVP_PIC_BASE; else OpFlag = X86II::MO_TLVP; SDLoc DL(Op); SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, GA->getValueType(0), GA->getOffset(), OpFlag); SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result); // With PIC32, the address is actually $g + Offset. if (PIC32) Offset = DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Offset); // Lowering the machine isd will make sure everything is in the right // location. SDValue Chain = DAG.getEntryNode(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL); SDValue Args[] = { Chain, Offset }; Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true), DAG.getIntPtrConstant(0, DL, true), Chain.getValue(1), DL); // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); MFI.setAdjustsStack(true); // And our return value (tls address) is in the standard call return value // location. unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX; return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1)); } if (Subtarget.isOSWindows()) { // Just use the implicit TLS architecture // Need to generate something similar to: // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage // ; from TEB // mov ecx, dword [rel _tls_index]: Load index (from C runtime) // mov rcx, qword [rdx+rcx*8] // mov eax, .tls$:tlsvar // [rax+rcx] contains the address // Windows 64bit: gs:0x58 // Windows 32bit: fs:__tls_array SDLoc dl(GA); SDValue Chain = DAG.getEntryNode(); // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly // use its literal value of 0x2C. Value *Ptr = Constant::getNullValue(Subtarget.is64Bit() ? Type::getInt8PtrTy(*DAG.getContext(), 256) : Type::getInt32PtrTy(*DAG.getContext(), 257)); SDValue TlsArray = Subtarget.is64Bit() ? DAG.getIntPtrConstant(0x58, dl) : (Subtarget.isTargetWindowsGNU() ? DAG.getIntPtrConstant(0x2C, dl) : DAG.getExternalSymbol("_tls_array", PtrVT)); SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr)); SDValue res; if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) { res = ThreadPointer; } else { // Load the _tls_index variable SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT); if (Subtarget.is64Bit()) IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX, MachinePointerInfo(), MVT::i32); else IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo()); auto &DL = DAG.getDataLayout(); SDValue Scale = DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8); IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale); res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX); } res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo()); // Get the offset of start of .tls section SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), X86II::MO_SECREL); SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA); // The address of the thread local variable is the add of the thread // pointer with the offset of the variable. return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset); } llvm_unreachable("TLS not implemented for this target."); } /// Lower SRA_PARTS and friends, which return two i32 values /// and take a 2 x i32 value to shift plus a shift amount. /// TODO: Can this be moved to general expansion code? static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); MVT VT = Op.getSimpleValueType(); unsigned VTBits = VT.getSizeInBits(); SDLoc dl(Op); bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); // ISD::FSHL and ISD::FSHR have defined overflow behavior but ISD::SHL and // ISD::SRA/L nodes haven't. Insert an AND to be safe, it's optimized away // during isel. SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, DAG.getConstant(VTBits - 1, dl, MVT::i8)); SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, DAG.getConstant(VTBits - 1, dl, MVT::i8)) : DAG.getConstant(0, dl, VT); SDValue Tmp2, Tmp3; if (Op.getOpcode() == ISD::SHL_PARTS) { Tmp2 = DAG.getNode(ISD::FSHL, dl, VT, ShOpHi, ShOpLo, ShAmt); Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt); } else { Tmp2 = DAG.getNode(ISD::FSHR, dl, VT, ShOpHi, ShOpLo, ShAmt); Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt); } // If the shift amount is larger or equal than the width of a part we can't // rely on the results of shld/shrd. Insert a test and select the appropriate // values for large shift amounts. SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, DAG.getConstant(VTBits, dl, MVT::i8)); SDValue Cond = DAG.getSetCC(dl, MVT::i8, AndNode, DAG.getConstant(0, dl, MVT::i8), ISD::SETNE); SDValue Hi, Lo; if (Op.getOpcode() == ISD::SHL_PARTS) { Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2); Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3); } else { Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2); Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3); } return DAG.getMergeValues({ Lo, Hi }, dl); } static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) && "Unexpected funnel shift opcode!"); SDLoc DL(Op); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue Amt = Op.getOperand(2); bool IsFSHR = Op.getOpcode() == ISD::FSHR; if (VT.isVector()) { assert(Subtarget.hasVBMI2() && "Expected VBMI2"); if (IsFSHR) std::swap(Op0, Op1); APInt APIntShiftAmt; if (X86::isConstantSplat(Amt, APIntShiftAmt)) { uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits()); return DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT, Op0, Op1, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8)); } return DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT, Op0, Op1, Amt); } assert((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) && "Unexpected funnel shift type!"); // Expand slow SHLD/SHRD cases if we are not optimizing for size. bool OptForSize = DAG.shouldOptForSize(); if (!OptForSize && Subtarget.isSHLDSlow()) return SDValue(); if (IsFSHR) std::swap(Op0, Op1); // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo. if (VT == MVT::i16) Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, DAG.getConstant(15, DL, Amt.getValueType())); unsigned SHDOp = (IsFSHR ? X86ISD::SHRD : X86ISD::SHLD); return DAG.getNode(SHDOp, DL, VT, Op0, Op1, Amt); } // Try to use a packed vector operation to handle i64 on 32-bit targets when // AVX512DQ is enabled. static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert((Op.getOpcode() == ISD::SINT_TO_FP || Op.getOpcode() == ISD::STRICT_SINT_TO_FP || Op.getOpcode() == ISD::STRICT_UINT_TO_FP || Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!"); bool IsStrict = Op->isStrictFPOpcode(); unsigned OpNo = IsStrict ? 1 : 0; SDValue Src = Op.getOperand(OpNo); MVT SrcVT = Src.getSimpleValueType(); MVT VT = Op.getSimpleValueType(); if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() || (VT != MVT::f32 && VT != MVT::f64)) return SDValue(); // Pack the i64 into a vector, do the operation and extract. // Using 256-bit to ensure result is 128-bits for f32 case. unsigned NumElts = Subtarget.hasVLX() ? 4 : 8; MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts); MVT VecVT = MVT::getVectorVT(VT, NumElts); SDLoc dl(Op); SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src); if (IsStrict) { SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other}, {Op.getOperand(0), InVec}); SDValue Chain = CvtVec.getValue(1); SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec, DAG.getIntPtrConstant(0, dl)); return DAG.getMergeValues({Value, Chain}, dl); } SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec, DAG.getIntPtrConstant(0, dl)); } static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT, const X86Subtarget &Subtarget) { switch (Opcode) { case ISD::SINT_TO_FP: // TODO: Handle wider types with AVX/AVX512. if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32) return false; // CVTDQ2PS or (V)CVTDQ2PD return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64); case ISD::UINT_TO_FP: // TODO: Handle wider types and i64 elements. if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32) return false; // VCVTUDQ2PS or VCVTUDQ2PD return ToVT == MVT::v4f32 || ToVT == MVT::v4f64; default: return false; } } /// Given a scalar cast operation that is extracted from a vector, try to /// vectorize the cast op followed by extraction. This will avoid an expensive /// round-trip between XMM and GPR. static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // TODO: This could be enhanced to handle smaller integer types by peeking // through an extend. SDValue Extract = Cast.getOperand(0); MVT DestVT = Cast.getSimpleValueType(); if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa(Extract.getOperand(1))) return SDValue(); // See if we have a 128-bit vector cast op for this type of cast. SDValue VecOp = Extract.getOperand(0); MVT FromVT = VecOp.getSimpleValueType(); unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits(); MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM); MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM); if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget)) return SDValue(); // If we are extracting from a non-zero element, first shuffle the source // vector to allow extracting from element zero. SDLoc DL(Cast); if (!isNullConstant(Extract.getOperand(1))) { SmallVector Mask(FromVT.getVectorNumElements(), -1); Mask[0] = Extract.getConstantOperandVal(1); VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask); } // If the source vector is wider than 128-bits, extract the low part. Do not // create an unnecessarily wide vector cast op. if (FromVT != Vec128VT) VecOp = extract128BitVector(VecOp, 0, DAG, DL); // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast, DAG.getIntPtrConstant(0, DL)); } static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc DL(Op); bool IsStrict = Op->isStrictFPOpcode(); MVT VT = Op->getSimpleValueType(0); SDValue Src = Op->getOperand(IsStrict ? 1 : 0); if (Subtarget.hasDQI()) { assert(!Subtarget.hasVLX() && "Unexpected features"); assert((Src.getSimpleValueType() == MVT::v2i64 || Src.getSimpleValueType() == MVT::v4i64) && "Unsupported custom type"); // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type. assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) && "Unexpected VT!"); MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64; // Need to concat with zero vector for strict fp to avoid spurious // exceptions. SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64) : DAG.getUNDEF(MVT::v8i64); Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src, DAG.getIntPtrConstant(0, DL)); SDValue Res, Chain; if (IsStrict) { Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other}, {Op->getOperand(0), Src}); Chain = Res.getValue(1); } else { Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src); } Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, DAG.getIntPtrConstant(0, DL)); if (IsStrict) return DAG.getMergeValues({Res, Chain}, DL); return Res; } bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP || Op->getOpcode() == ISD::STRICT_SINT_TO_FP; if (VT != MVT::v4f32 || IsSigned) return SDValue(); SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64); SDValue One = DAG.getConstant(1, DL, MVT::v4i64); SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64, DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One), DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One)); SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT); SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src); SmallVector SignCvts(4); SmallVector Chains(4); for (int i = 0; i != 4; ++i) { SDValue Src = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc, DAG.getIntPtrConstant(i, DL)); if (IsStrict) { SignCvts[i] = DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other}, {Op.getOperand(0), Src}); Chains[i] = SignCvts[i].getValue(1); } else { SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Src); } } SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts); SDValue Slow, Chain; if (IsStrict) { Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other}, {Chain, SignCvt, SignCvt}); Chain = Slow.getValue(1); } else { Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt); } IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg); SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt); if (IsStrict) return DAG.getMergeValues({Cvt, Chain}, DL); return Cvt; } SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { bool IsStrict = Op->isStrictFPOpcode(); unsigned OpNo = IsStrict ? 1 : 0; SDValue Src = Op.getOperand(OpNo); SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode(); MVT SrcVT = Src.getSimpleValueType(); MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget)) return Extract; if (SrcVT.isVector()) { if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) { // Note: Since v2f64 is a legal type. We don't need to zero extend the // source for strict FP. if (IsStrict) return DAG.getNode( X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other}, {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, DAG.getUNDEF(SrcVT))}); return DAG.getNode(X86ISD::CVTSI2P, dl, VT, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, DAG.getUNDEF(SrcVT))); } if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64) return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget); return SDValue(); } assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 && "Unknown SINT_TO_FP to lower!"); bool UseSSEReg = isScalarFPTypeInSSEReg(VT); // These are really Legal; return the operand so the caller accepts it as // Legal. if (SrcVT == MVT::i32 && UseSSEReg) return Op; if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit()) return Op; if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget)) return V; // SSE doesn't have an i16 conversion so we need to promote. if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) { SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src); if (IsStrict) return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other}, {Chain, Ext}); return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext); } if (VT == MVT::f128) return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT)); SDValue ValueToStore = Src; if (SrcVT == MVT::i64 && UseSSEReg && !Subtarget.is64Bit()) // Bitcasting to f64 here allows us to do a single 64-bit store from // an SSE register, avoiding the store forwarding penalty that would come // with two 32-bit stores. ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); unsigned Size = SrcVT.getSizeInBits()/8; MachineFunction &MF = DAG.getMachineFunction(); auto PtrVT = getPointerTy(MF.getDataLayout()); int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); Chain = DAG.getStore( Chain, dl, ValueToStore, StackSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI)); std::pair Tmp = BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); if (IsStrict) return DAG.getMergeValues({Tmp.first, Tmp.second}, dl); return Tmp.first; } std::pair X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot, SelectionDAG &DAG) const { // Build the FILD SDLoc DL(Op); SDVTList Tys; bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); if (useSSE) Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue); else Tys = DAG.getVTList(Op.getValueType(), MVT::Other); unsigned ByteSize = SrcVT.getSizeInBits() / 8; FrameIndexSDNode *FI = dyn_cast(StackSlot); MachineMemOperand *LoadMMO; if (FI) { int SSFI = FI->getIndex(); LoadMMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), MachineMemOperand::MOLoad, ByteSize, ByteSize); } else { LoadMMO = cast(StackSlot)->getMemOperand(); StackSlot = StackSlot.getOperand(1); } SDValue FILDOps[] = {Chain, StackSlot}; SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL, Tys, FILDOps, SrcVT, LoadMMO); Chain = Result.getValue(1); if (useSSE) { SDValue InFlag = Result.getValue(2); // FIXME: Currently the FST is glued to the FILD_FLAG. This // shouldn't be necessary except that RFP cannot be live across // multiple blocks. When stackifier is fixed, they can be uncoupled. MachineFunction &MF = DAG.getMachineFunction(); unsigned SSFISize = Op.getValueSizeInBits() / 8; int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false); auto PtrVT = getPointerTy(MF.getDataLayout()); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); Tys = DAG.getVTList(MVT::Other); SDValue FSTOps[] = {Chain, Result, StackSlot, InFlag}; MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), MachineMemOperand::MOStore, SSFISize, SSFISize); Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, Op.getValueType(), StoreMMO); Result = DAG.getLoad( Op.getValueType(), DL, Chain, StackSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI)); Chain = Result.getValue(1); } return { Result, Chain }; } /// Horizontal vector math instructions may be slower than normal math with /// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch /// implementation, and likely shuffle complexity of the alternate sequence. static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, const X86Subtarget &Subtarget) { bool IsOptimizingSize = DAG.shouldOptForSize(); bool HasFastHOps = Subtarget.hasFastHorizontalOps(); return !IsSingleSource || IsOptimizingSize || HasFastHOps; } /// 64-bit unsigned integer to double expansion. static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // This algorithm is not obvious. Here it is what we're trying to output: /* movq %rax, %xmm0 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U } subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 } #ifdef __SSE3__ haddpd %xmm0, %xmm0 #else pshufd $0x4e, %xmm0, %xmm1 addpd %xmm1, %xmm0 #endif */ bool IsStrict = Op->isStrictFPOpcode(); unsigned OpNo = IsStrict ? 1 : 0; SDLoc dl(Op); LLVMContext *Context = DAG.getContext(); // Build some magic constants. static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 }; Constant *C0 = ConstantDataVector::get(*Context, CV0); auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16); SmallVector CV1; CV1.push_back( ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)))); CV1.push_back( ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(), APInt(64, 0x4530000000000000ULL)))); Constant *C1 = ConstantVector::get(CV1); SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16); // Load the 64-bit value into an XMM register. SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(OpNo)); SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), /* Alignment = */ 16); SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0); SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), /* Alignment = */ 16); SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1); SDValue Sub; SDValue Chain; // TODO: Are there any fast-math-flags to propagate here? if (IsStrict) { Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other}, {Op.getOperand(0), XR2F, CLod1}); Chain = Sub.getValue(1); } else Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); SDValue Result; if (!IsStrict && Subtarget.hasSSE3() && shouldUseHorizontalOp(true, DAG, Subtarget)) { // FIXME: Do we need a STRICT version of FHADD? Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); } else { SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1}); if (IsStrict) { Result = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v2f64, MVT::Other}, {Chain, Shuffle, Sub}); Chain = Result.getValue(1); } else Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub); } Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, DAG.getIntPtrConstant(0, dl)); if (IsStrict) return DAG.getMergeValues({Result, Chain}, dl); return Result; } /// 32-bit unsigned integer to float expansion. static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0; SDLoc dl(Op); // FP constant to bias correct the final result. SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::f64); // Load the 32-bit value into an XMM register. SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo)); // Zero out the upper parts of the register. Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG); // Or the load with the bias. SDValue Or = DAG.getNode( ISD::OR, dl, MVT::v2i64, DAG.getBitcast(MVT::v2i64, Load), DAG.getBitcast(MVT::v2i64, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias))); Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl)); if (Op.getNode()->isStrictFPOpcode()) { // Subtract the bias. // TODO: Are there any fast-math-flags to propagate here? SDValue Chain = Op.getOperand(0); SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other}, {Chain, Or, Bias}); if (Op.getValueType() == Sub.getValueType()) return Sub; // Handle final rounding. std::pair ResultPair = DAG.getStrictFPExtendOrRound( Sub, Sub.getValue(1), dl, Op.getSimpleValueType()); return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl); } // Subtract the bias. // TODO: Are there any fast-math-flags to propagate here? SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); // Handle final rounding. return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType()); } static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL) { if (Op.getSimpleValueType() != MVT::v2f64) return SDValue(); bool IsStrict = Op->isStrictFPOpcode(); SDValue N0 = Op.getOperand(IsStrict ? 1 : 0); assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type"); if (Subtarget.hasAVX512()) { if (!Subtarget.hasVLX()) { // Let generic type legalization widen this. if (!IsStrict) return SDValue(); // Otherwise pad the integer input with 0s and widen the operation. N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0, DAG.getConstant(0, DL, MVT::v2i32)); SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other}, {Op.getOperand(0), N0}); SDValue Chain = Res.getValue(1); Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res, DAG.getIntPtrConstant(0, DL)); return DAG.getMergeValues({Res, Chain}, DL); } // Legalize to v4i32 type. N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0, DAG.getUNDEF(MVT::v2i32)); if (IsStrict) return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other}, {Op.getOperand(0), N0}); return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0); } // Zero extend to 2i64, OR with the floating point representation of 2^52. // This gives us the floating point equivalent of 2^52 + the i32 integer // since double has 52-bits of mantissa. Then subtract 2^52 in floating // point leaving just our i32 integers in double format. SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0); SDValue VBias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL, MVT::v2f64); SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn, DAG.getBitcast(MVT::v2i64, VBias)); Or = DAG.getBitcast(MVT::v2f64, Or); if (IsStrict) return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other}, {Op.getOperand(0), Or, VBias}); return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias); } static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc DL(Op); bool IsStrict = Op->isStrictFPOpcode(); SDValue V = Op->getOperand(IsStrict ? 1 : 0); MVT VecIntVT = V.getSimpleValueType(); assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) && "Unsupported custom type"); if (Subtarget.hasAVX512()) { // With AVX512, but not VLX we need to widen to get a 512-bit result type. assert(!Subtarget.hasVLX() && "Unexpected features"); MVT VT = Op->getSimpleValueType(0); // v8i32->v8f64 is legal with AVX512 so just return it. if (VT == MVT::v8f64) return Op; assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) && "Unexpected VT!"); MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32; MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32; // Need to concat with zero vector for strict fp to avoid spurious // exceptions. SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT); V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V, DAG.getIntPtrConstant(0, DL)); SDValue Res, Chain; if (IsStrict) { Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other}, {Op->getOperand(0), V}); Chain = Res.getValue(1); } else { Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V); } Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, DAG.getIntPtrConstant(0, DL)); if (IsStrict) return DAG.getMergeValues({Res, Chain}, DL); return Res; } if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 && Op->getSimpleValueType(0) == MVT::v4f64) { SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V); Constant *Bias = ConstantFP::get( *DAG.getContext(), APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL))); auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, /*Alignment*/ 8); SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other); SDValue Ops[] = {DAG.getEntryNode(), CPIdx}; SDValue VBias = DAG.getMemIntrinsicNode( X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), /*Alignment*/ 8, MachineMemOperand::MOLoad); SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn, DAG.getBitcast(MVT::v4i64, VBias)); Or = DAG.getBitcast(MVT::v4f64, Or); if (IsStrict) return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other}, {Op.getOperand(0), Or, VBias}); return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias); } // The algorithm is the following: // #ifdef __SSE4_1__ // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), // (uint4) 0x53000000, 0xaa); // #else // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; // uint4 hi = (v >> 16) | (uint4) 0x53000000; // #endif // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); // return (float4) lo + fhi; bool Is128 = VecIntVT == MVT::v4i32; MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32; // If we convert to something else than the supported type, e.g., to v4f64, // abort early. if (VecFloatVT != Op->getSimpleValueType(0)) return SDValue(); // In the #idef/#else code, we have in common: // - The vector of constants: // -- 0x4b000000 // -- 0x53000000 // - A shift: // -- v >> 16 // Create the splat vector for 0x4b000000. SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT); // Create the splat vector for 0x53000000. SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT); // Create the right shift. SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT); SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift); SDValue Low, High; if (Subtarget.hasSSE41()) { MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16; // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow); SDValue VecBitcast = DAG.getBitcast(VecI16VT, V); // Low will be bitcasted right away, so do not bother bitcasting back to its // original type. Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast, VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8)); // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), // (uint4) 0x53000000, 0xaa); SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh); SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift); // High will be bitcasted right away, so do not bother bitcasting back to // its original type. High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast, VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8)); } else { SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT); // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask); Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow); // uint4 hi = (v >> 16) | (uint4) 0x53000000; High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh); } // Create the vector constant for (0x1.0p39f + 0x1.0p23f). SDValue VecCstFSub = DAG.getConstantFP( APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT); // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); // NOTE: By using fsub of a positive constant instead of fadd of a negative // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is // enabled. See PR24512. SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High); // TODO: Are there any fast-math-flags to propagate here? // (float4) lo; SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low); // return (float4) lo + fhi; if (IsStrict) { SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other}, {Op.getOperand(0), HighBitcast, VecCstFSub}); return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other}, {FHigh.getValue(1), LowBitcast, FHigh}); } SDValue FHigh = DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub); return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh); } static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0; SDValue N0 = Op.getOperand(OpNo); MVT SrcVT = N0.getSimpleValueType(); SDLoc dl(Op); switch (SrcVT.SimpleTy) { default: llvm_unreachable("Custom UINT_TO_FP is not supported!"); case MVT::v2i32: return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl); case MVT::v4i32: case MVT::v8i32: return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget); case MVT::v2i64: case MVT::v4i64: return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget); } } SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { bool IsStrict = Op->isStrictFPOpcode(); unsigned OpNo = IsStrict ? 1 : 0; SDValue Src = Op.getOperand(OpNo); SDLoc dl(Op); auto PtrVT = getPointerTy(DAG.getDataLayout()); MVT SrcVT = Src.getSimpleValueType(); MVT DstVT = Op->getSimpleValueType(0); SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode(); if (DstVT == MVT::f128) return LowerF128Call(Op, DAG, RTLIB::getUINTTOFP(SrcVT, DstVT)); if (DstVT.isVector()) return lowerUINT_TO_FP_vec(Op, DAG, Subtarget); if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget)) return Extract; if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) && (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) { // Conversions from unsigned i32 to f32/f64 are legal, // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode. return Op; } // Promote i32 to i64 and use a signed conversion on 64-bit targets. if (SrcVT == MVT::i32 && Subtarget.is64Bit()) { Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src); if (IsStrict) return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other}, {Chain, Src}); return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src); } if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget)) return V; if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) return LowerUINT_TO_FP_i64(Op, DAG, Subtarget); if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80) return LowerUINT_TO_FP_i32(Op, DAG, Subtarget); if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32) return SDValue(); // Make a 64-bit buffer, and use it to build an FILD. SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); if (SrcVT == MVT::i32) { SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl); SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MachinePointerInfo()); SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32), OffsetSlot, MachinePointerInfo()); std::pair Tmp = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); if (IsStrict) return DAG.getMergeValues({Tmp.first, Tmp.second}, dl); return Tmp.first; } assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); SDValue ValueToStore = Src; if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) { // Bitcasting to f64 here allows us to do a single 64-bit store from // an SSE register, avoiding the store forwarding penalty that would come // with two 32-bit stores. ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); } SDValue Store = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MachinePointerInfo()); // For i64 source, we need to add the appropriate power of 2 if the input // was negative. This is the same as the optimization in // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, // we must be careful to do the computation in x87 extended precision, not // in SSE. (The generic code can't know it's OK to do this, or how to.) int SSFI = cast(StackSlot)->getIndex(); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), MachineMemOperand::MOLoad, 8, 8); SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); SDValue Ops[] = { Store, StackSlot }; SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MMO); Chain = Fild.getValue(1); // Check whether the sign bit is set. SDValue SignSet = DAG.getSetCC( dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64), Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT); // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits. APInt FF(64, 0x5F80000000000000ULL); SDValue FudgePtr = DAG.getConstantPool( ConstantInt::get(*DAG.getContext(), FF), PtrVT); // Get a pointer to FF if the sign bit was set, or to 0 otherwise. SDValue Zero = DAG.getIntPtrConstant(0, dl); SDValue Four = DAG.getIntPtrConstant(4, dl); SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero); FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset); // Load the value out, extending it from f32 to f80. SDValue Fudge = DAG.getExtLoad( ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32, /* Alignment = */ 4); Chain = Fudge.getValue(1); // Extend everything to 80 bits to force it to be done on x87. // TODO: Are there any fast-math-flags to propagate here? if (IsStrict) { SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge}); // STRICT_FP_ROUND can't handle equal types. if (DstVT == MVT::f80) return Add; return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other}, {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)}); } SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0, dl)); } // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation // is legal, or has an fp128 or f16 source (which needs to be promoted to f32), // just return an SDValue(). // Otherwise it is assumed to be a conversion from one of f32, f64 or f80 // to i16, i32 or i64, and we lower it to a legal sequence and return the // result. SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, SDValue &Chain) const { bool IsStrict = Op->isStrictFPOpcode(); SDLoc DL(Op); EVT DstTy = Op.getValueType(); SDValue Value = Op.getOperand(IsStrict ? 1 : 0); EVT TheVT = Value.getValueType(); auto PtrVT = getPointerTy(DAG.getDataLayout()); if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) { // f16 must be promoted before using the lowering in this routine. // fp128 does not use this lowering. return SDValue(); } // If using FIST to compute an unsigned i64, we'll need some fixup // to handle values above the maximum signed i64. A FIST is always // used for the 32-bit subtarget, but also for f80 on a 64-bit target. bool UnsignedFixup = !IsSigned && DstTy == MVT::i64; // FIXME: This does not generate an invalid exception if the input does not // fit in i32. PR44019 if (!IsSigned && DstTy != MVT::i64) { // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST. // The low 32 bits of the fist result will have the correct uint32 result. assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); DstTy = MVT::i64; } assert(DstTy.getSimpleVT() <= MVT::i64 && DstTy.getSimpleVT() >= MVT::i16 && "Unknown FP_TO_INT to lower!"); // We lower FP->int64 into FISTP64 followed by a load from a temporary // stack slot. MachineFunction &MF = DAG.getMachineFunction(); unsigned MemSize = DstTy.getStoreSize(); int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode(); SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment. if (UnsignedFixup) { // // Conversion to unsigned i64 is implemented with a select, // depending on whether the source value fits in the range // of a signed i64. Let Thresh be the FP equivalent of // 0x8000000000000000ULL. // // Adjust = (Value < Thresh) ? 0 : 0x80000000; // FltOfs = (Value < Thresh) ? 0 : 0x80000000; // FistSrc = (Value - FltOfs); // Fist-to-mem64 FistSrc // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent // to XOR'ing the high 32 bits with Adjust. // // Being a power of 2, Thresh is exactly representable in all FP formats. // For X87 we'd like to use the smallest FP type for this constant, but // for DAG type consistency we have to match the FP operand type. APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000)); LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK; bool LosesInfo = false; if (TheVT == MVT::f64) // The rounding mode is irrelevant as the conversion should be exact. Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven, &LosesInfo); else if (TheVT == MVT::f80) Status = Thresh.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven, &LosesInfo); assert(Status == APFloat::opOK && !LosesInfo && "FP conversion should have been exact"); SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT); EVT ResVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), TheVT); SDValue Cmp; if (IsStrict) { Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT, Chain, /*IsSignaling*/ true); Chain = Cmp.getValue(1); } else { Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT); } Adjust = DAG.getSelect(DL, MVT::i64, Cmp, DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(APInt::getSignMask(64), DL, MVT::i64)); SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, DAG.getConstantFP(0.0, DL, TheVT), ThreshVal); if (IsStrict) { Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other}, { Chain, Value, FltOfs }); Chain = Value.getValue(1); } else Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs); } MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI); // FIXME This causes a redundant load/store if the SSE-class value is already // in memory, such as if it is on the callstack. if (isScalarFPTypeInSSEReg(TheVT)) { assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI); SDVTList Tys = DAG.getVTList(TheVT, MVT::Other); SDValue Ops[] = { Chain, StackSlot }; unsigned FLDSize = TheVT.getStoreSize(); assert(FLDSize <= MemSize && "Stack slot not big enough"); MachineMemOperand *MMO = MF.getMachineMemOperand( MPI, MachineMemOperand::MOLoad, FLDSize, FLDSize); Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO); Chain = Value.getValue(1); } // Build the FP_TO_INT*_IN_MEM MachineMemOperand *MMO = MF.getMachineMemOperand( MPI, MachineMemOperand::MOStore, MemSize, MemSize); SDValue Ops[] = { Chain, Value, StackSlot }; SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL, DAG.getVTList(MVT::Other), Ops, DstTy, MMO); SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI); Chain = Res.getValue(1); // If we need an unsigned fixup, XOR the result with adjust. if (UnsignedFixup) Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust); return Res; } static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); MVT InVT = In.getSimpleValueType(); SDLoc dl(Op); unsigned Opc = Op.getOpcode(); assert(VT.isVector() && InVT.isVector() && "Expected vector type"); assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) && "Unexpected extension opcode"); assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && "Expected same number of elements"); assert((VT.getVectorElementType() == MVT::i16 || VT.getVectorElementType() == MVT::i32 || VT.getVectorElementType() == MVT::i64) && "Unexpected element type"); assert((InVT.getVectorElementType() == MVT::i8 || InVT.getVectorElementType() == MVT::i16 || InVT.getVectorElementType() == MVT::i32) && "Unexpected element type"); unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc); // Custom legalize v8i8->v8i64 on CPUs without avx512bw. if (InVT == MVT::v8i8) { if (VT != MVT::v8i64) return SDValue(); In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8)); return DAG.getNode(ExtendInVecOpc, dl, VT, In); } if (Subtarget.hasInt256()) return Op; // Optimize vectors in AVX mode: // // v8i16 -> v8i32 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32. // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32. // Concat upper and lower parts. // // v4i32 -> v4i64 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64. // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64. // Concat upper and lower parts. // MVT HalfVT = VT.getHalfNumVectorElementsVT(); SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In); // Short-circuit if we can determine that each 128-bit half is the same value. // Otherwise, this is difficult to match and optimize. if (auto *Shuf = dyn_cast(In)) if (hasIdenticalHalvesShuffleMask(Shuf->getMask())) return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo); SDValue ZeroVec = DAG.getConstant(0, dl, InVT); SDValue Undef = DAG.getUNDEF(InVT); bool NeedZero = Opc == ISD::ZERO_EXTEND; SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef); OpHi = DAG.getBitcast(HalfVT, OpHi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } // Helper to split and extend a v16i1 mask to v16i8 or v16i16. static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In, const SDLoc &dl, SelectionDAG &DAG) { assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT."); SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In, DAG.getIntPtrConstant(0, dl)); SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In, DAG.getIntPtrConstant(8, dl)); Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo); Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi); return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); } static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!"); SDLoc DL(Op); unsigned NumElts = VT.getVectorNumElements(); // For all vectors, but vXi8 we can just emit a sign_extend and a shift. This // avoids a constant pool load. if (VT.getVectorElementType() != MVT::i8) { SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In); return DAG.getNode(ISD::SRL, DL, VT, Extend, DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT)); } // Extend VT if BWI is not supported. MVT ExtVT = VT; if (!Subtarget.hasBWI()) { // If v16i32 is to be avoided, we'll need to split and concatenate. if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG); ExtVT = MVT::getVectorVT(MVT::i32, NumElts); } // Widen to 512-bits if VLX is not supported. MVT WideVT = ExtVT; if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) { NumElts *= 512 / ExtVT.getSizeInBits(); InVT = MVT::getVectorVT(MVT::i1, NumElts); In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT), In, DAG.getIntPtrConstant(0, DL)); WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts); } SDValue One = DAG.getConstant(1, DL, WideVT); SDValue Zero = DAG.getConstant(0, DL, WideVT); SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero); // Truncate if we had to extend above. if (VT != ExtVT) { WideVT = MVT::getVectorVT(MVT::i8, NumElts); SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal); } // Extract back to 128/256-bit if we widened. if (WideVT != VT) SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal, DAG.getIntPtrConstant(0, DL)); return SelectedVal; } static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue In = Op.getOperand(0); MVT SVT = In.getSimpleValueType(); if (SVT.getVectorElementType() == MVT::i1) return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG); assert(Subtarget.hasAVX() && "Expected AVX support"); return LowerAVXExtend(Op, DAG, Subtarget); } /// Helper to recursively truncate vector elements in half with PACKSS/PACKUS. /// It makes use of the fact that vectors with enough leading sign/zero bits /// prevent the PACKSS/PACKUS from saturating the results. /// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates /// within each 128-bit lane. static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) && "Unexpected PACK opcode"); assert(DstVT.isVector() && "VT not a vector?"); // Requires SSE2 but AVX512 has fast vector truncate. if (!Subtarget.hasSSE2()) return SDValue(); EVT SrcVT = In.getValueType(); // No truncation required, we might get here due to recursive calls. if (SrcVT == DstVT) return In; // We only support vector truncation to 64bits or greater from a // 128bits or greater source. unsigned DstSizeInBits = DstVT.getSizeInBits(); unsigned SrcSizeInBits = SrcVT.getSizeInBits(); if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0) return SDValue(); unsigned NumElems = SrcVT.getVectorNumElements(); if (!isPowerOf2_32(NumElems)) return SDValue(); LLVMContext &Ctx = *DAG.getContext(); assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation"); assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation"); EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2); // Pack to the largest type possible: // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB. EVT InVT = MVT::i16, OutVT = MVT::i8; if (SrcVT.getScalarSizeInBits() > 16 && (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) { InVT = MVT::i32; OutVT = MVT::i16; } // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector. if (SrcVT.is128BitVector()) { InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits()); OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits()); In = DAG.getBitcast(InVT, In); SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, In); Res = extractSubVector(Res, 0, DAG, DL, 64); return DAG.getBitcast(DstVT, Res); } // Extract lower/upper subvectors. unsigned NumSubElts = NumElems / 2; SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2); SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2); unsigned SubSizeInBits = SrcSizeInBits / 2; InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits()); OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits()); // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors. if (SrcVT.is256BitVector() && DstVT.is128BitVector()) { Lo = DAG.getBitcast(InVT, Lo); Hi = DAG.getBitcast(InVT, Hi); SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi); return DAG.getBitcast(DstVT, Res); } // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors. // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK). if (SrcVT.is512BitVector() && Subtarget.hasInt256()) { Lo = DAG.getBitcast(InVT, Lo); Hi = DAG.getBitcast(InVT, Hi); SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi); // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)), // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)). // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits. SmallVector Mask; int Scale = 64 / OutVT.getScalarSizeInBits(); scaleShuffleMask(Scale, ArrayRef({ 0, 2, 1, 3 }), Mask); Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask); if (DstVT.is256BitVector()) return DAG.getBitcast(DstVT, Res); // If 512bit -> 128bit truncate another stage. EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems); Res = DAG.getBitcast(PackedVT, Res); return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget); } // Recursively pack lower/upper subvectors, concat result and pack again. assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater"); EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts); Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget); Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget); PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi); return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget); } static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); MVT InVT = In.getSimpleValueType(); assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type."); // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q. unsigned ShiftInx = InVT.getScalarSizeInBits() - 1; if (InVT.getScalarSizeInBits() <= 16) { if (Subtarget.hasBWI()) { // legal, will go to VPMOVB2M, VPMOVW2M if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) { // We need to shift to get the lsb into sign position. // Shift packed bytes not supported natively, bitcast to word MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16); In = DAG.getNode(ISD::SHL, DL, ExtVT, DAG.getBitcast(ExtVT, In), DAG.getConstant(ShiftInx, DL, ExtVT)); In = DAG.getBitcast(InVT, In); } return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT); } // Use TESTD/Q, extended vector to packed dword/qword. assert((InVT.is256BitVector() || InVT.is128BitVector()) && "Unexpected vector type."); unsigned NumElts = InVT.getVectorNumElements(); assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements"); // We need to change to a wider element type that we have support for. // For 8 element vectors this is easy, we either extend to v8i32 or v8i64. // For 16 element vectors we extend to v16i32 unless we are explicitly // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors // we need to split into two 8 element vectors which we can extend to v8i32, // truncate and concat the results. There's an additional complication if // the original type is v16i8. In that case we can't split the v16i8 so // first we pre-extend it to v16i16 which we can split to v8i16, then extend // to v8i32, truncate that to v8i1 and concat the two halves. if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) { if (InVT == MVT::v16i8) { // First we need to sign extend up to 256-bits so we can split that. InVT = MVT::v16i16; In = DAG.getNode(ISD::SIGN_EXTEND, DL, InVT, In); } SDValue Lo = extract128BitVector(In, 0, DAG, DL); SDValue Hi = extract128BitVector(In, 8, DAG, DL); // We're split now, just emit two truncates and a concat. The two // truncates will trigger legalization to come back to this function. Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo); Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); } // We either have 8 elements or we're allowed to use 512-bit vectors. // If we have VLX, we want to use the narrowest vector that can get the // job done so we use vXi32. MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts); MVT ExtVT = MVT::getVectorVT(EltVT, NumElts); In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); InVT = ExtVT; ShiftInx = InVT.getScalarSizeInBits() - 1; } if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) { // We need to shift to get the lsb into sign position. In = DAG.getNode(ISD::SHL, DL, InVT, In, DAG.getConstant(ShiftInx, DL, InVT)); } // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m. if (Subtarget.hasDQI()) return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT); return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE); } SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); MVT InVT = In.getSimpleValueType(); unsigned InNumEltBits = InVT.getScalarSizeInBits(); assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && "Invalid TRUNCATE operation"); // If we're called by the type legalizer, handle a few cases. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isTypeLegal(InVT)) { if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) && VT.is128BitVector()) { assert(Subtarget.hasVLX() && "Unexpected subtarget!"); // The default behavior is to truncate one step, concatenate, and then // truncate the remainder. We'd rather produce two 64-bit results and // concatenate those. SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVector(In, DL); EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo); Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); } // Otherwise let default legalization handle it. return SDValue(); } if (VT.getVectorElementType() == MVT::i1) return LowerTruncateVecI1(Op, DAG, Subtarget); // vpmovqb/w/d, vpmovdb/w, vpmovwb if (Subtarget.hasAVX512()) { // word to byte only under BWI. Otherwise we have to promoted to v16i32 // and then truncate that. But we should only do that if we haven't been // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be // handled by isel patterns. if (InVT != MVT::v16i16 || Subtarget.hasBWI() || Subtarget.canExtendTo512DQ()) return Op; } unsigned NumPackedSignBits = std::min(VT.getScalarSizeInBits(), 16); unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8; // Truncate with PACKUS if we are truncating a vector with leading zero bits // that extend all the way to the packed/truncated value. // Pre-SSE41 we can only use PACKUSWB. KnownBits Known = DAG.computeKnownBits(In); if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) if (SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget)) return V; // Truncate with PACKSS if we are truncating a vector with sign-bits that // extend all the way to the packed/truncated value. if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In)) if (SDValue V = truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget)) return V; // Handle truncation of V256 to V128 using shuffles. assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!"); if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { // On AVX2, v4i64 -> v4i32 becomes VPERMD. if (Subtarget.hasInt256()) { static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1}; In = DAG.getBitcast(MVT::v8i32, In); In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In, DAG.getIntPtrConstant(0, DL)); } SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, DAG.getIntPtrConstant(0, DL)); SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, DAG.getIntPtrConstant(2, DL)); OpLo = DAG.getBitcast(MVT::v4i32, OpLo); OpHi = DAG.getBitcast(MVT::v4i32, OpHi); static const int ShufMask[] = {0, 2, 4, 6}; return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask); } if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) { // On AVX2, v8i32 -> v8i16 becomes PSHUFB. if (Subtarget.hasInt256()) { In = DAG.getBitcast(MVT::v32i8, In); // The PSHUFB mask: static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, 16, 17, 20, 21, 24, 25, 28, 29, -1, -1, -1, -1, -1, -1, -1, -1 }; In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1); In = DAG.getBitcast(MVT::v4i64, In); static const int ShufMask2[] = {0, 2, -1, -1}; In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2); In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, DAG.getIntPtrConstant(0, DL)); return DAG.getBitcast(VT, In); } SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, DAG.getIntPtrConstant(0, DL)); SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, DAG.getIntPtrConstant(4, DL)); OpLo = DAG.getBitcast(MVT::v16i8, OpLo); OpHi = DAG.getBitcast(MVT::v16i8, OpHi); // The PSHUFB mask: static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1}; OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1); OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1); OpLo = DAG.getBitcast(MVT::v4i32, OpLo); OpHi = DAG.getBitcast(MVT::v4i32, OpHi); // The MOVLHPS Mask: static const int ShufMask2[] = {0, 1, 4, 5}; SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2); return DAG.getBitcast(MVT::v8i16, res); } if (VT == MVT::v16i8 && InVT == MVT::v16i16) { // Use an AND to zero uppper bits for PACKUS. In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT)); SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In, DAG.getIntPtrConstant(0, DL)); SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In, DAG.getIntPtrConstant(8, DL)); return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi); } llvm_unreachable("All 256->128 cases should have been handled above!"); } SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { bool IsStrict = Op->isStrictFPOpcode(); bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT || Op.getOpcode() == ISD::STRICT_FP_TO_SINT; MVT VT = Op->getSimpleValueType(0); SDValue Src = Op.getOperand(IsStrict ? 1 : 0); MVT SrcVT = Src.getSimpleValueType(); SDLoc dl(Op); if (VT.isVector()) { if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) { MVT ResVT = MVT::v4i32; MVT TruncVT = MVT::v4i1; unsigned Opc; if (IsStrict) Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI; else Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; if (!IsSigned && !Subtarget.hasVLX()) { assert(Subtarget.useAVX512Regs() && "Unexpected features!"); // Widen to 512-bits. ResVT = MVT::v8i32; TruncVT = MVT::v8i1; Opc = Op.getOpcode(); // Need to concat with zero vector for strict fp to avoid spurious // exceptions. // TODO: Should we just do this for non-strict as well? SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64) : DAG.getUNDEF(MVT::v8f64); Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src, DAG.getIntPtrConstant(0, dl)); } SDValue Res, Chain; if (IsStrict) { Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Op->getOperand(0), Src}); Chain = Res.getValue(1); } else { Res = DAG.getNode(Opc, dl, ResVT, Src); } Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res); Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res, DAG.getIntPtrConstant(0, dl)); if (IsStrict) return DAG.getMergeValues({Res, Chain}, dl); return Res; } // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32. if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) { assert(!IsSigned && "Expected unsigned conversion!"); assert(Subtarget.useAVX512Regs() && "Requires avx512f"); return Op; } // Widen vXi32 fp_to_uint with avx512f to 512-bit source. if ((VT == MVT::v4i32 || VT == MVT::v8i32) && (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32)) { assert(!IsSigned && "Expected unsigned conversion!"); assert(Subtarget.useAVX512Regs() && !Subtarget.hasVLX() && "Unexpected features!"); MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32; MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32; // Need to concat with zero vector for strict fp to avoid spurious // exceptions. // TODO: Should we just do this for non-strict as well? SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT); Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src, DAG.getIntPtrConstant(0, dl)); SDValue Res, Chain; if (IsStrict) { Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other}, {Op->getOperand(0), Src}); Chain = Res.getValue(1); } else { Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src); } Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res, DAG.getIntPtrConstant(0, dl)); if (IsStrict) return DAG.getMergeValues({Res, Chain}, dl); return Res; } // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source. if ((VT == MVT::v2i64 || VT == MVT::v4i64) && (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32)) { assert(Subtarget.useAVX512Regs() && Subtarget.hasDQI() && !Subtarget.hasVLX() && "Unexpected features!"); MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64; // Need to concat with zero vector for strict fp to avoid spurious // exceptions. // TODO: Should we just do this for non-strict as well? SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT); Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src, DAG.getIntPtrConstant(0, dl)); SDValue Res, Chain; if (IsStrict) { Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other}, {Op->getOperand(0), Src}); Chain = Res.getValue(1); } else { Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src); } Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res, DAG.getIntPtrConstant(0, dl)); if (IsStrict) return DAG.getMergeValues({Res, Chain}, dl); return Res; } if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) { assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL"); SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, DAG.getUNDEF(MVT::v2f32)); if (IsStrict) { unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI; return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp}); } unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; return DAG.getNode(Opc, dl, VT, Tmp); } return SDValue(); } assert(!VT.isVector()); bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT); if (!IsSigned && UseSSEReg) { // Conversions from f32/f64 with AVX512 should be legal. if (Subtarget.hasAVX512()) return Op; // Use default expansion for i64. if (VT == MVT::i64) return SDValue(); assert(VT == MVT::i32 && "Unexpected VT!"); // Promote i32 to i64 and use a signed operation on 64-bit targets. // FIXME: This does not generate an invalid exception if the input does not // fit in i32. PR44019 if (Subtarget.is64Bit()) { SDValue Res, Chain; if (IsStrict) { Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i64, MVT::Other}, { Op.getOperand(0), Src }); Chain = Res.getValue(1); } else Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src); Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); if (IsStrict) return DAG.getMergeValues({ Res, Chain }, dl); return Res; } // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can // use fisttp which will be handled later. if (!Subtarget.hasSSE3()) return SDValue(); } // Promote i16 to i32 if we can use a SSE operation or the type is f128. // FIXME: This does not generate an invalid exception if the input does not // fit in i16. PR44019 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) { assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!"); SDValue Res, Chain; if (IsStrict) { Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i32, MVT::Other}, { Op.getOperand(0), Src }); Chain = Res.getValue(1); } else Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src); Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); if (IsStrict) return DAG.getMergeValues({ Res, Chain }, dl); return Res; } // If this is a FP_TO_SINT using SSEReg we're done. if (UseSSEReg && IsSigned) return Op; // fp128 needs to use a libcall. if (SrcVT == MVT::f128) { RTLIB::Libcall LC; if (IsSigned) LC = RTLIB::getFPTOSINT(SrcVT, VT); else LC = RTLIB::getFPTOUINT(SrcVT, VT); SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); MakeLibCallOptions CallOptions; std::pair Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions, SDLoc(Op), Chain); if (IsStrict) return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl); return Tmp.first; } // Fall back to X87. SDValue Chain; if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) { if (IsStrict) return DAG.getMergeValues({V, Chain}, dl); return V; } llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases."); } SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { bool IsStrict = Op->isStrictFPOpcode(); SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(IsStrict ? 1 : 0); MVT SVT = In.getSimpleValueType(); if (VT == MVT::f128) { RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, VT); return LowerF128Call(Op, DAG, LC); } assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT)); if (IsStrict) return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other}, {Op->getOperand(0), Res}); return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res); } SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { bool IsStrict = Op->isStrictFPOpcode(); MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(IsStrict ? 1 : 0); MVT SVT = In.getSimpleValueType(); // It's legal except when f128 is involved if (SVT != MVT::f128) return Op; RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, VT); // FP_ROUND node has a second operand indicating whether it is known to be // precise. That doesn't take part in the LibCall so we can't directly use // LowerF128Call. SDLoc dl(Op); SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); MakeLibCallOptions CallOptions; std::pair Tmp = makeLibCall(DAG, LC, VT, In, CallOptions, dl, Chain); if (IsStrict) return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl); return Tmp.first; } /// Depending on uarch and/or optimizing for size, we might prefer to use a /// vector operation in place of the typical scalar operation. static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // If both operands have other uses, this is probably not profitable. SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); if (!LHS.hasOneUse() && !RHS.hasOneUse()) return Op; // FP horizontal add/sub were added with SSE3. Integer with SSSE3. bool IsFP = Op.getSimpleValueType().isFloatingPoint(); if (IsFP && !Subtarget.hasSSE3()) return Op; if (!IsFP && !Subtarget.hasSSSE3()) return Op; // Extract from a common vector. if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getOperand(0) != RHS.getOperand(0) || !isa(LHS.getOperand(1)) || !isa(RHS.getOperand(1)) || !shouldUseHorizontalOp(true, DAG, Subtarget)) return Op; // Allow commuted 'hadd' ops. // TODO: Allow commuted (f)sub by negating the result of (F)HSUB? unsigned HOpcode; switch (Op.getOpcode()) { case ISD::ADD: HOpcode = X86ISD::HADD; break; case ISD::SUB: HOpcode = X86ISD::HSUB; break; case ISD::FADD: HOpcode = X86ISD::FHADD; break; case ISD::FSUB: HOpcode = X86ISD::FHSUB; break; default: llvm_unreachable("Trying to lower unsupported opcode to horizontal op"); } unsigned LExtIndex = LHS.getConstantOperandVal(1); unsigned RExtIndex = RHS.getConstantOperandVal(1); if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 && (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD)) std::swap(LExtIndex, RExtIndex); if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1)) return Op; SDValue X = LHS.getOperand(0); EVT VecVT = X.getValueType(); unsigned BitWidth = VecVT.getSizeInBits(); unsigned NumLanes = BitWidth / 128; unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes; assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) && "Not expecting illegal vector widths here"); // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit // equivalent, so extract the 256/512-bit source op to 128-bit if we can. SDLoc DL(Op); if (BitWidth == 256 || BitWidth == 512) { unsigned LaneIdx = LExtIndex / NumEltsPerLane; X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL); LExtIndex %= NumEltsPerLane; } // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp, DAG.getIntPtrConstant(LExtIndex / 2, DL)); } /// Depending on uarch and/or optimizing for size, we might prefer to use a /// vector operation in place of the typical scalar operation. SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const { assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) && "Only expecting float/double"); return lowerAddSubToHorizontalOp(Op, DAG, Subtarget); } /// The only differences between FABS and FNEG are the mask and the logic op. /// FNEG also has a folding opportunity for FNEG(FABS(x)). static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) && "Wrong opcode for lowering FABS or FNEG."); bool IsFABS = (Op.getOpcode() == ISD::FABS); // If this is a FABS and it has an FNEG user, bail out to fold the combination // into an FNABS. We'll lower the FABS after that if it is still in use. if (IsFABS) for (SDNode *User : Op->uses()) if (User->getOpcode() == ISD::FNEG) return Op; SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); bool IsF128 = (VT == MVT::f128); assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 || VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) && "Unexpected type in LowerFABSorFNEG"); // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to // decide if we should generate a 16-byte constant mask when we only need 4 or // 8 bytes for the scalar case. // There are no scalar bitwise logical SSE/AVX instructions, so we // generate a 16-byte vector constant and logic op even for the scalar case. // Using a 16-byte mask allows folding the load of the mask with // the logic op, so it can save (~4 bytes) on code size. bool IsFakeVector = !VT.isVector() && !IsF128; MVT LogicVT = VT; if (IsFakeVector) LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; unsigned EltBits = VT.getScalarSizeInBits(); // For FABS, mask is 0x7f...; for FNEG, mask is 0x80... APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits); const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT); SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT); SDValue Op0 = Op.getOperand(0); bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS); unsigned LogicOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR; SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0; if (VT.isVector() || IsF128) return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); // For the scalar case extend to a 128-bit vector, perform the logic op, // and extract the scalar result back out. Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand); SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode, DAG.getIntPtrConstant(0, dl)); } static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { SDValue Mag = Op.getOperand(0); SDValue Sign = Op.getOperand(1); SDLoc dl(Op); // If the sign operand is smaller, extend it first. MVT VT = Op.getSimpleValueType(); if (Sign.getSimpleValueType().bitsLT(VT)) Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign); // And if it is bigger, shrink it first. if (Sign.getSimpleValueType().bitsGT(VT)) Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl)); // At this point the operands and the result should have the same // type, and that won't be f80 since that is not custom lowered. bool IsF128 = (VT == MVT::f128); assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 || VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) && "Unexpected type in LowerFCOPYSIGN"); const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT); // Perform all scalar logic operations as 16-byte vectors because there are no // scalar FP logic instructions in SSE. // TODO: This isn't necessary. If we used scalar types, we might avoid some // unnecessary splats, but we might miss load folding opportunities. Should // this decision be based on OptimizeForSize? bool IsFakeVector = !VT.isVector() && !IsF128; MVT LogicVT = VT; if (IsFakeVector) LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; // The mask constants are automatically splatted for vector types. unsigned EltSizeInBits = VT.getScalarSizeInBits(); SDValue SignMask = DAG.getConstantFP( APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT); SDValue MagMask = DAG.getConstantFP( APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT); // First, clear all bits but the sign bit from the second operand (sign). if (IsFakeVector) Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign); SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask); // Next, clear the sign bit from the first operand (magnitude). // TODO: If we had general constant folding for FP logic ops, this check // wouldn't be necessary. SDValue MagBits; if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) { APFloat APF = Op0CN->getValueAPF(); APF.clearSign(); MagBits = DAG.getConstantFP(APF, dl, LogicVT); } else { // If the magnitude operand wasn't a constant, we need to AND out the sign. if (IsFakeVector) Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag); MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask); } // OR the magnitude value with the sign bit. SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit); return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or, DAG.getIntPtrConstant(0, dl)); } static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { SDValue N0 = Op.getOperand(0); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); MVT OpVT = N0.getSimpleValueType(); assert((OpVT == MVT::f32 || OpVT == MVT::f64) && "Unexpected type for FGETSIGN"); // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1). MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64); SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0); Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res); Res = DAG.getZExtOrTrunc(Res, dl, VT); Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT)); return Res; } /// Helper for creating a X86ISD::SETCC node. static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl, SelectionDAG &DAG) { return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS); } /// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...)) /// style scalarized (associative) reduction patterns. static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp, SmallVectorImpl &SrcOps) { SmallVector Opnds; DenseMap SrcOpMap; EVT VT = MVT::Other; // Recognize a special case where a vector is casted into wide integer to // test all 0s. assert(Op.getOpcode() == unsigned(BinOp) && "Unexpected bit reduction opcode"); Opnds.push_back(Op.getOperand(0)); Opnds.push_back(Op.getOperand(1)); for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) { SmallVectorImpl::const_iterator I = Opnds.begin() + Slot; // BFS traverse all BinOp operands. if (I->getOpcode() == unsigned(BinOp)) { Opnds.push_back(I->getOperand(0)); Opnds.push_back(I->getOperand(1)); // Re-evaluate the number of nodes to be traversed. e += 2; // 2 more nodes (LHS and RHS) are pushed. continue; } // Quit if a non-EXTRACT_VECTOR_ELT if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT) return false; // Quit if without a constant index. SDValue Idx = I->getOperand(1); if (!isa(Idx)) return false; SDValue Src = I->getOperand(0); DenseMap::iterator M = SrcOpMap.find(Src); if (M == SrcOpMap.end()) { VT = Src.getValueType(); // Quit if not the same type. if (SrcOpMap.begin() != SrcOpMap.end() && VT != SrcOpMap.begin()->first.getValueType()) return false; unsigned NumElts = VT.getVectorNumElements(); APInt EltCount = APInt::getNullValue(NumElts); M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first; SrcOps.push_back(Src); } // Quit if element already used. unsigned CIdx = cast(Idx)->getZExtValue(); if (M->second[CIdx]) return false; M->second.setBit(CIdx); } // Quit if not all elements are used. for (DenseMap::const_iterator I = SrcOpMap.begin(), E = SrcOpMap.end(); I != E; ++I) { if (!I->second.isAllOnesValue()) return false; } return true; } // Check whether an OR'd tree is PTEST-able. static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &X86CC) { assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); if (!Subtarget.hasSSE41() || !Op->hasOneUse()) return SDValue(); SmallVector VecIns; if (!matchScalarReduction(Op, ISD::OR, VecIns)) return SDValue(); // Quit if not 128/256-bit vector. EVT VT = VecIns[0].getValueType(); if (!VT.is128BitVector() && !VT.is256BitVector()) return SDValue(); SDLoc DL(Op); MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; // Cast all vectors into TestVT for PTEST. for (unsigned i = 0, e = VecIns.size(); i < e; ++i) VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]); // If more than one full vector is evaluated, OR them first before PTEST. for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) { // Each iteration will OR 2 nodes and append the result until there is only // 1 node left, i.e. the final OR'd value of all vectors. SDValue LHS = VecIns[Slot]; SDValue RHS = VecIns[Slot + 1]; VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS)); } X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, DL, MVT::i8); return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back()); } /// return true if \c Op has a use that doesn't just read flags. static bool hasNonFlagsUse(SDValue Op) { for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE; ++UI) { SDNode *User = *UI; unsigned UOpNo = UI.getOperandNo(); if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { // Look pass truncate. UOpNo = User->use_begin().getOperandNo(); User = *User->use_begin(); } if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC && !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) return true; } return false; } // Transform to an x86-specific ALU node with flags if there is a chance of // using an RMW op or only the flags are used. Otherwise, leave // the node alone and emit a 'cmp' or 'test' instruction. static bool isProfitableToUseFlagOp(SDValue Op) { for (SDNode *U : Op->uses()) if (U->getOpcode() != ISD::CopyToReg && U->getOpcode() != ISD::SETCC && U->getOpcode() != ISD::STORE) return false; return true; } /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent. static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // CF and OF aren't always set the way we want. Determine which // of these we need. bool NeedCF = false; bool NeedOF = false; switch (X86CC) { default: break; case X86::COND_A: case X86::COND_AE: case X86::COND_B: case X86::COND_BE: NeedCF = true; break; case X86::COND_G: case X86::COND_GE: case X86::COND_L: case X86::COND_LE: case X86::COND_O: case X86::COND_NO: { // Check if we really need to set the // Overflow flag. If NoSignedWrap is present // that is not actually needed. switch (Op->getOpcode()) { case ISD::ADD: case ISD::SUB: case ISD::MUL: case ISD::SHL: if (Op.getNode()->getFlags().hasNoSignedWrap()) break; LLVM_FALLTHROUGH; default: NeedOF = true; break; } break; } } // See if we can use the EFLAGS value from the operand instead of // doing a separate TEST. TEST always sets OF and CF to 0, so unless // we prove that the arithmetic won't overflow, we can't use OF or CF. if (Op.getResNo() != 0 || NeedOF || NeedCF) { // Emit a CMP with 0, which is the TEST pattern. return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, dl, Op.getValueType())); } unsigned Opcode = 0; unsigned NumOperands = 0; SDValue ArithOp = Op; // NOTICE: In the code below we use ArithOp to hold the arithmetic operation // which may be the result of a CAST. We use the variable 'Op', which is the // non-casted variable when we check for possible users. switch (ArithOp.getOpcode()) { case ISD::AND: // If the primary 'and' result isn't used, don't bother using X86ISD::AND, // because a TEST instruction will be better. if (!hasNonFlagsUse(Op)) break; LLVM_FALLTHROUGH; case ISD::ADD: case ISD::SUB: case ISD::OR: case ISD::XOR: if (!isProfitableToUseFlagOp(Op)) break; // Otherwise use a regular EFLAGS-setting instruction. switch (ArithOp.getOpcode()) { default: llvm_unreachable("unexpected operator!"); case ISD::ADD: Opcode = X86ISD::ADD; break; case ISD::SUB: Opcode = X86ISD::SUB; break; case ISD::XOR: Opcode = X86ISD::XOR; break; case ISD::AND: Opcode = X86ISD::AND; break; case ISD::OR: Opcode = X86ISD::OR; break; } NumOperands = 2; break; case X86ISD::ADD: case X86ISD::SUB: case X86ISD::OR: case X86ISD::XOR: case X86ISD::AND: return SDValue(Op.getNode(), 1); case ISD::SSUBO: case ISD::USUBO: { // /USUBO/SSUBO will become a X86ISD::SUB and we can use its Z flag. SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0), Op->getOperand(1)).getValue(1); } default: break; } if (Opcode == 0) { // Emit a CMP with 0, which is the TEST pattern. return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, dl, Op.getValueType())); } SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); SmallVector Ops(Op->op_begin(), Op->op_begin() + NumOperands); SDValue New = DAG.getNode(Opcode, dl, VTs, Ops); DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New); return SDValue(New.getNode(), 1); } /// Emit nodes that will be selected as "cmp Op0,Op1", or something /// equivalent. static std::pair EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue Chain, bool IsSignaling) { if (isNullConstant(Op1)) return std::make_pair(EmitTest(Op0, X86CC, dl, DAG, Subtarget), Chain); EVT CmpVT = Op0.getValueType(); if (CmpVT.isFloatingPoint()) { if (Chain) { SDValue Res = DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP, dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1}); return std::make_pair(Res, Res.getValue(1)); } return std::make_pair(DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1), SDValue()); } assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 || CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!"); // Only promote the compare up to I32 if it is a 16 bit operation // with an immediate. 16 bit immediates are to be avoided. if (CmpVT == MVT::i16 && !Subtarget.isAtom() && !DAG.getMachineFunction().getFunction().hasMinSize()) { ConstantSDNode *COp0 = dyn_cast(Op0); ConstantSDNode *COp1 = dyn_cast(Op1); // Don't do this if the immediate can fit in 8-bits. if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) || (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) { unsigned ExtendOp = isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; if (X86CC == X86::COND_E || X86CC == X86::COND_NE) { // For equality comparisons try to use SIGN_EXTEND if the input was // truncate from something with enough sign bits. if (Op0.getOpcode() == ISD::TRUNCATE) { SDValue In = Op0.getOperand(0); unsigned EffBits = In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1; if (EffBits <= 16) ExtendOp = ISD::SIGN_EXTEND; } else if (Op1.getOpcode() == ISD::TRUNCATE) { SDValue In = Op1.getOperand(0); unsigned EffBits = In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1; if (EffBits <= 16) ExtendOp = ISD::SIGN_EXTEND; } } CmpVT = MVT::i32; Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0); Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1); } } // Try to shrink i64 compares if the input has enough zero bits. // FIXME: Do this for non-constant compares for constant on LHS? if (CmpVT == MVT::i64 && isa(Op1) && !isX86CCSigned(X86CC) && Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub. cast(Op1)->getAPIntValue().getActiveBits() <= 32 && DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) { CmpVT = MVT::i32; Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0); Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1); } // Use SUB instead of CMP to enable CSE between SUB and CMP. SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32); SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1); return std::make_pair(Sub.getValue(1), SDValue()); } /// Convert a comparison if required by the subtarget. SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const { // If the subtarget does not support the FUCOMI instruction, floating-point // comparisons have to be converted. bool IsCmp = Cmp.getOpcode() == X86ISD::CMP; bool IsStrictCmp = Cmp.getOpcode() == X86ISD::STRICT_FCMP || Cmp.getOpcode() == X86ISD::STRICT_FCMPS; if (Subtarget.hasCMov() || (!IsCmp && !IsStrictCmp) || !Cmp.getOperand(IsStrictCmp ? 1 : 0).getValueType().isFloatingPoint() || !Cmp.getOperand(IsStrictCmp ? 2 : 1).getValueType().isFloatingPoint()) return Cmp; // The instruction selector will select an FUCOM instruction instead of // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence // build an SDNode sequence that transfers the result from FPSW into EFLAGS: // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86any_fcmp ...)), 8)))) SDLoc dl(Cmp); SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp); SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW); SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW, DAG.getConstant(8, dl, MVT::i8)); SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl); // Some 64-bit targets lack SAHF support, but they do support FCOMI. assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?"); return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); } /// Check if replacement of SQRT with RSQRT should be disabled. bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); // We never want to use both SQRT and RSQRT instructions for the same input. if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op)) return false; if (VT.isVector()) return Subtarget.hasFastVectorFSQRT(); return Subtarget.hasFastScalarFSQRT(); } /// The minimum architected relative accuracy is 2^-12. We need one /// Newton-Raphson step to have a good float result (24 bits of precision). SDValue X86TargetLowering::getSqrtEstimate(SDValue Op, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const { EVT VT = Op.getValueType(); // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps. // It is likely not profitable to do this for f64 because a double-precision // rsqrt estimate with refinement on x86 prior to FMA requires at least 16 // instructions: convert to single, rsqrtss, convert back to double, refine // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA // along with FMA, this could be a throughput win. // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32 // after legalize types. if ((VT == MVT::f32 && Subtarget.hasSSE1()) || (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) || (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) || (VT == MVT::v8f32 && Subtarget.hasAVX()) || (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) { if (RefinementSteps == ReciprocalEstimate::Unspecified) RefinementSteps = 1; UseOneConstNR = false; // There is no FSQRT for 512-bits, but there is RSQRT14. unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT; return DAG.getNode(Opcode, SDLoc(Op), VT, Op); } return SDValue(); } /// The minimum architected relative accuracy is 2^-12. We need one /// Newton-Raphson step to have a good float result (24 bits of precision). SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const { EVT VT = Op.getValueType(); // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps. // It is likely not profitable to do this for f64 because a double-precision // reciprocal estimate with refinement on x86 prior to FMA requires // 15 instructions: convert to single, rcpss, convert back to double, refine // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA // along with FMA, this could be a throughput win. if ((VT == MVT::f32 && Subtarget.hasSSE1()) || (VT == MVT::v4f32 && Subtarget.hasSSE1()) || (VT == MVT::v8f32 && Subtarget.hasAVX()) || (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) { // Enable estimate codegen with 1 refinement step for vector division. // Scalar division estimates are disabled because they break too much // real-world code. These defaults are intended to match GCC behavior. if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified) return SDValue(); if (RefinementSteps == ReciprocalEstimate::Unspecified) RefinementSteps = 1; // There is no FSQRT for 512-bits, but there is RCP14. unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP; return DAG.getNode(Opcode, SDLoc(Op), VT, Op); } return SDValue(); } /// If we have at least two divisions that use the same divisor, convert to /// multiplication by a reciprocal. This may need to be adjusted for a given /// CPU if a division's cost is not at least twice the cost of a multiplication. /// This is because we still need one division to calculate the reciprocal and /// then we need two multiplies by that reciprocal as replacements for the /// original divisions. unsigned X86TargetLowering::combineRepeatedFPDivisors() const { return 2; } SDValue X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl &Created) const { AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); if (isIntDivCheap(N->getValueType(0), Attr)) return SDValue(N,0); // Lower SDIV as SDIV assert((Divisor.isPowerOf2() || (-Divisor).isPowerOf2()) && "Unexpected divisor!"); // Only perform this transform if CMOV is supported otherwise the select // below will become a branch. if (!Subtarget.hasCMov()) return SDValue(); // fold (sdiv X, pow2) EVT VT = N->getValueType(0); // FIXME: Support i8. if (VT != MVT::i16 && VT != MVT::i32 && !(Subtarget.is64Bit() && VT == MVT::i64)) return SDValue(); unsigned Lg2 = Divisor.countTrailingZeros(); // If the divisor is 2 or -2, the default expansion is better. if (Lg2 == 1) return SDValue(); SDLoc DL(N); SDValue N0 = N->getOperand(0); SDValue Zero = DAG.getConstant(0, DL, VT); APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2); SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT); // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right. SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT); SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne); SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0); Created.push_back(Cmp.getNode()); Created.push_back(Add.getNode()); Created.push_back(CMov.getNode()); // Divide by pow2. SDValue SRA = DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8)); // If we're dividing by a positive value, we're done. Otherwise, we must // negate the result. if (Divisor.isNonNegative()) return SRA; Created.push_back(SRA.getNode()); return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA); } /// Result of 'and' is compared against zero. Change to a BT node if possible. /// Returns the BT node and the condition code needed to use it. static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, SDValue &X86CC) { assert(And.getOpcode() == ISD::AND && "Expected AND node!"); SDValue Op0 = And.getOperand(0); SDValue Op1 = And.getOperand(1); if (Op0.getOpcode() == ISD::TRUNCATE) Op0 = Op0.getOperand(0); if (Op1.getOpcode() == ISD::TRUNCATE) Op1 = Op1.getOperand(0); SDValue Src, BitNo; if (Op1.getOpcode() == ISD::SHL) std::swap(Op0, Op1); if (Op0.getOpcode() == ISD::SHL) { if (isOneConstant(Op0.getOperand(0))) { // If we looked past a truncate, check that it's only truncating away // known zeros. unsigned BitWidth = Op0.getValueSizeInBits(); unsigned AndBitWidth = And.getValueSizeInBits(); if (BitWidth > AndBitWidth) { KnownBits Known = DAG.computeKnownBits(Op0); if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth) return SDValue(); } Src = Op1; BitNo = Op0.getOperand(1); } } else if (Op1.getOpcode() == ISD::Constant) { ConstantSDNode *AndRHS = cast(Op1); uint64_t AndRHSVal = AndRHS->getZExtValue(); SDValue AndLHS = Op0; if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) { Src = AndLHS.getOperand(0); BitNo = AndLHS.getOperand(1); } else { // Use BT if the immediate can't be encoded in a TEST instruction or we // are optimizing for size and the immedaite won't fit in a byte. bool OptForSize = DAG.shouldOptForSize(); if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) && isPowerOf2_64(AndRHSVal)) { Src = AndLHS; BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, Src.getValueType()); } } } // No patterns found, give up. if (!Src.getNode()) return SDValue(); // If Src is i8, promote it to i32 with any_extend. There is no i8 BT // instruction. Since the shift amount is in-range-or-undefined, we know // that doing a bittest on the i32 value is ok. We extend to i32 because // the encoding for the i16 version is larger than the i32 version. // Also promote i16 to i32 for performance / code size reason. if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16) Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src); // See if we can use the 32-bit instruction instead of the 64-bit one for a // shorter encoding. Since the former takes the modulo 32 of BitNo and the // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is // known to be zero. if (Src.getValueType() == MVT::i64 && DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32))) Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src); // If the operand types disagree, extend the shift amount to match. Since // BT ignores high bits (like shifts) we can use anyextend. if (Src.getValueType() != BitNo.getValueType()) BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo); X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B, dl, MVT::i8); return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo); } /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask /// CMPs. static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, SDValue &Op1, bool &IsAlwaysSignaling) { unsigned SSECC; bool Swap = false; // SSE Condition code mapping: // 0 - EQ // 1 - LT // 2 - LE // 3 - UNORD // 4 - NEQ // 5 - NLT // 6 - NLE // 7 - ORD switch (SetCCOpcode) { default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETOEQ: case ISD::SETEQ: SSECC = 0; break; case ISD::SETOGT: case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETLT: case ISD::SETOLT: SSECC = 1; break; case ISD::SETOGE: case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETLE: case ISD::SETOLE: SSECC = 2; break; case ISD::SETUO: SSECC = 3; break; case ISD::SETUNE: case ISD::SETNE: SSECC = 4; break; case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETUGE: SSECC = 5; break; case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETUGT: SSECC = 6; break; case ISD::SETO: SSECC = 7; break; case ISD::SETUEQ: SSECC = 8; break; case ISD::SETONE: SSECC = 12; break; } if (Swap) std::swap(Op0, Op1); switch (SetCCOpcode) { default: IsAlwaysSignaling = true; break; case ISD::SETEQ: case ISD::SETOEQ: case ISD::SETUEQ: case ISD::SETNE: case ISD::SETONE: case ISD::SETUNE: case ISD::SETO: case ISD::SETUO: IsAlwaysSignaling = false; break; } return SSECC; } /// Break a VSETCC 256-bit integer VSETCC into two new 128 ones and then /// concatenate the result back. static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC && "Unsupported value type for operation"); unsigned NumElems = VT.getVectorNumElements(); SDLoc dl(Op); SDValue CC = Op.getOperand(2); // Extract the LHS vectors SDValue LHS = Op.getOperand(0); SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl); SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl); // Extract the RHS vectors SDValue RHS = Op.getOperand(1); SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl); SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl); // Issue the operation on the smaller types and concatenate the result back MVT EltVT = VT.getVectorElementType(); MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC), DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC)); } static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue CC = Op.getOperand(2); MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); assert(VT.getVectorElementType() == MVT::i1 && "Cannot set masked compare for this operation"); ISD::CondCode SetCCOpcode = cast(CC)->get(); // Prefer SETGT over SETLT. if (SetCCOpcode == ISD::SETLT) { SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode); std::swap(Op0, Op1); } return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode); } /// Given a buildvector constant, return a new vector constant with each element /// incremented or decremented. If incrementing or decrementing would result in /// unsigned overflow or underflow or this is not a simple vector constant, /// return an empty value. static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) { auto *BV = dyn_cast(V.getNode()); if (!BV) return SDValue(); MVT VT = V.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); unsigned NumElts = VT.getVectorNumElements(); SmallVector NewVecC; SDLoc DL(V); for (unsigned i = 0; i < NumElts; ++i) { auto *Elt = dyn_cast(BV->getOperand(i)); if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT) return SDValue(); // Avoid overflow/underflow. const APInt &EltC = Elt->getAPIntValue(); if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isNullValue())) return SDValue(); NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT)); } return DAG.getBuildVector(VT, DL, NewVecC); } /// As another special case, use PSUBUS[BW] when it's profitable. E.g. for /// Op0 u<= Op1: /// t = psubus Op0, Op1 /// pcmpeq t, <0..0> static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, ISD::CondCode Cond, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG) { if (!Subtarget.hasSSE2()) return SDValue(); MVT VET = VT.getVectorElementType(); if (VET != MVT::i8 && VET != MVT::i16) return SDValue(); switch (Cond) { default: return SDValue(); case ISD::SETULT: { // If the comparison is against a constant we can turn this into a // setule. With psubus, setule does not require a swap. This is // beneficial because the constant in the register is no longer // destructed as the destination so it can be hoisted out of a loop. // Only do this pre-AVX since vpcmp* is no longer destructive. if (Subtarget.hasAVX()) return SDValue(); SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false); if (!ULEOp1) return SDValue(); Op1 = ULEOp1; break; } case ISD::SETUGT: { // If the comparison is against a constant, we can turn this into a setuge. // This is beneficial because materializing a constant 0 for the PCMPEQ is // probably cheaper than XOR+PCMPGT using 2 different vector constants: // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0 SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true); if (!UGEOp1) return SDValue(); Op1 = Op0; Op0 = UGEOp1; break; } // Psubus is better than flip-sign because it requires no inversion. case ISD::SETUGE: std::swap(Op0, Op1); break; case ISD::SETULE: break; } SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1); return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result, DAG.getConstant(0, dl, VT)); } static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC || Op.getOpcode() == ISD::STRICT_FSETCCS; SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0); SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1); SDValue CC = Op.getOperand(IsStrict ? 3 : 2); MVT VT = Op->getSimpleValueType(0); ISD::CondCode Cond = cast(CC)->get(); bool isFP = Op1.getSimpleValueType().isFloatingPoint(); SDLoc dl(Op); if (isFP) { #ifndef NDEBUG MVT EltVT = Op0.getSimpleValueType().getVectorElementType(); assert(EltVT == MVT::f32 || EltVT == MVT::f64); #endif bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); unsigned Opc; if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) { assert(VT.getVectorNumElements() <= 16); Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM; } else { Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP; // The SSE/AVX packed FP comparison nodes are defined with a // floating-point vector result that matches the operand type. This allows // them to work with an SSE1 target (integer vector types are not legal). VT = Op0.getSimpleValueType(); } SDValue Cmp; bool IsAlwaysSignaling; unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling); if (!Subtarget.hasAVX()) { // TODO: We could use following steps to handle a quiet compare with // signaling encodings. // 1. Get ordered masks from a quiet ISD::SETO // 2. Use the masks to mask potential unordered elements in operand A, B // 3. Get the compare results of masked A, B // 4. Calculating final result using the mask and result from 3 // But currently, we just fall back to scalar operations. if (IsStrict && IsAlwaysSignaling && !IsSignaling) return SDValue(); // Insert an extra signaling instruction to raise exception. if (IsStrict && !IsAlwaysSignaling && IsSignaling) { SDValue SignalCmp = DAG.getNode( Opc, dl, {VT, MVT::Other}, {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS // FIXME: It seems we need to update the flags of all new strict nodes. // Otherwise, mayRaiseFPException in MI will return false due to // NoFPExcept = false by default. However, I didn't find it in other // patches. SignalCmp->setFlags(Op->getFlags()); Chain = SignalCmp.getValue(1); } // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE), // emit two comparisons and a logic op to tie them together. if (SSECC >= 8) { // LLVM predicate is SETUEQ or SETONE. unsigned CC0, CC1; unsigned CombineOpc; if (Cond == ISD::SETUEQ) { CC0 = 3; // UNORD CC1 = 0; // EQ CombineOpc = X86ISD::FOR; } else { assert(Cond == ISD::SETONE); CC0 = 7; // ORD CC1 = 4; // NEQ CombineOpc = X86ISD::FAND; } SDValue Cmp0, Cmp1; if (IsStrict) { Cmp0 = DAG.getNode( Opc, dl, {VT, MVT::Other}, {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)}); Cmp1 = DAG.getNode( Opc, dl, {VT, MVT::Other}, {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)}); Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1), Cmp1.getValue(1)); } else { Cmp0 = DAG.getNode( Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)); Cmp1 = DAG.getNode( Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)); } Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1); } else { if (IsStrict) { Cmp = DAG.getNode( Opc, dl, {VT, MVT::Other}, {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)}); Chain = Cmp.getValue(1); } else Cmp = DAG.getNode( Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)); } } else { // Handle all other FP comparisons here. if (IsStrict) { // Make a flip on already signaling CCs before setting bit 4 of AVX CC. SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4; Cmp = DAG.getNode( Opc, dl, {VT, MVT::Other}, {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)}); Chain = Cmp.getValue(1); } else Cmp = DAG.getNode( Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)); } // If this is SSE/AVX CMPP, bitcast the result back to integer to match the // result type of SETCC. The bitcast is expected to be optimized away // during combining/isel. Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp); if (IsStrict) return DAG.getMergeValues({Cmp, Chain}, dl); return Cmp; } assert(!IsStrict && "Strict SETCC only handles FP operands."); MVT VTOp0 = Op0.getSimpleValueType(); (void)VTOp0; assert(VTOp0 == Op1.getSimpleValueType() && "Expected operands with same type!"); assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() && "Invalid number of packed elements for source and destination!"); // The non-AVX512 code below works under the assumption that source and // destination types are the same. assert((Subtarget.hasAVX512() || (VT == VTOp0)) && "Value types for source and destination must be the same!"); // The result is boolean, but operands are int/float if (VT.getVectorElementType() == MVT::i1) { // In AVX-512 architecture setcc returns mask with i1 elements, // But there is no compare instruction for i8 and i16 elements in KNL. assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) && "Unexpected operand type"); return LowerIntVSETCC_AVX512(Op, DAG); } // Lower using XOP integer comparisons. if (VT.is128BitVector() && Subtarget.hasXOP()) { // Translate compare code to XOP PCOM compare mode. unsigned CmpMode = 0; switch (Cond) { default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETULT: case ISD::SETLT: CmpMode = 0x00; break; case ISD::SETULE: case ISD::SETLE: CmpMode = 0x01; break; case ISD::SETUGT: case ISD::SETGT: CmpMode = 0x02; break; case ISD::SETUGE: case ISD::SETGE: CmpMode = 0x03; break; case ISD::SETEQ: CmpMode = 0x04; break; case ISD::SETNE: CmpMode = 0x05; break; } // Are we comparing unsigned or signed integers? unsigned Opc = ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM; return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CmpMode, dl, MVT::i8)); } // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2. // Revert part of the simplifySetCCWithAnd combine, to avoid an invert. if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) { SDValue BC0 = peekThroughBitcasts(Op0); if (BC0.getOpcode() == ISD::AND) { APInt UndefElts; SmallVector EltBits; if (getTargetConstantBitsFromNode(BC0.getOperand(1), VT.getScalarSizeInBits(), UndefElts, EltBits, false, false)) { if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) { Cond = ISD::SETEQ; Op1 = DAG.getBitcast(VT, BC0.getOperand(1)); } } } } // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2. if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND && Op0.getOperand(1) == Op1 && Op0.hasOneUse()) { ConstantSDNode *C1 = isConstOrConstSplat(Op1); if (C1 && C1->getAPIntValue().isPowerOf2()) { unsigned BitWidth = VT.getScalarSizeInBits(); unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1; SDValue Result = Op0.getOperand(0); Result = DAG.getNode(ISD::SHL, dl, VT, Result, DAG.getConstant(ShiftAmt, dl, VT)); Result = DAG.getNode(ISD::SRA, dl, VT, Result, DAG.getConstant(BitWidth - 1, dl, VT)); return Result; } } // Break 256-bit integer vector compare into smaller ones. if (VT.is256BitVector() && !Subtarget.hasInt256()) return Lower256IntVSETCC(Op, DAG); // If this is a SETNE against the signed minimum value, change it to SETGT. // If this is a SETNE against the signed maximum value, change it to SETLT. // which will be swapped to SETGT. // Otherwise we use PCMPEQ+invert. APInt ConstValue; if (Cond == ISD::SETNE && ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) { if (ConstValue.isMinSignedValue()) Cond = ISD::SETGT; else if (ConstValue.isMaxSignedValue()) Cond = ISD::SETLT; } // If both operands are known non-negative, then an unsigned compare is the // same as a signed compare and there's no need to flip signbits. // TODO: We could check for more general simplifications here since we're // computing known bits. bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) && !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1)); // Special case: Use min/max operations for unsigned compares. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (ISD::isUnsignedIntSetCC(Cond) && (FlipSigns || ISD::isTrueWhenEqual(Cond)) && TLI.isOperationLegal(ISD::UMIN, VT)) { // If we have a constant operand, increment/decrement it and change the // condition to avoid an invert. if (Cond == ISD::SETUGT) { // X > C --> X >= (C+1) --> X == umax(X, C+1) if (SDValue UGTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true)) { Op1 = UGTOp1; Cond = ISD::SETUGE; } } if (Cond == ISD::SETULT) { // X < C --> X <= (C-1) --> X == umin(X, C-1) if (SDValue ULTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false)) { Op1 = ULTOp1; Cond = ISD::SETULE; } } bool Invert = false; unsigned Opc; switch (Cond) { default: llvm_unreachable("Unexpected condition code"); case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH; case ISD::SETULE: Opc = ISD::UMIN; break; case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH; case ISD::SETUGE: Opc = ISD::UMAX; break; } SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result); // If the logical-not of the result is required, perform that now. if (Invert) Result = DAG.getNOT(dl, Result, VT); return Result; } // Try to use SUBUS and PCMPEQ. if (SDValue V = LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG)) return V; // We are handling one of the integer comparisons here. Since SSE only has // GT and EQ comparisons for integer, swapping operands and multiple // operations may be required for some comparisons. unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ : X86ISD::PCMPGT; bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT || Cond == ISD::SETGE || Cond == ISD::SETUGE; bool Invert = Cond == ISD::SETNE || (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond)); if (Swap) std::swap(Op0, Op1); // Check that the operation in question is available (most are plain SSE2, // but PCMPGTQ and PCMPEQQ have different requirements). if (VT == MVT::v2i64) { if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) { assert(Subtarget.hasSSE2() && "Don't know how to lower!"); // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle // the odd elements over the even elements. if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) { Op0 = DAG.getConstant(0, dl, MVT::v4i32); Op1 = DAG.getBitcast(MVT::v4i32, Op1); SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1); static const int MaskHi[] = { 1, 1, 3, 3 }; SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi); return DAG.getBitcast(VT, Result); } if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) { Op0 = DAG.getBitcast(MVT::v4i32, Op0); Op1 = DAG.getConstant(-1, dl, MVT::v4i32); SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1); static const int MaskHi[] = { 1, 1, 3, 3 }; SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi); return DAG.getBitcast(VT, Result); } // Since SSE has no unsigned integer comparisons, we need to flip the sign // bits of the inputs before performing those operations. The lower // compare is always unsigned. SDValue SB; if (FlipSigns) { SB = DAG.getConstant(0x8000000080000000ULL, dl, MVT::v2i64); } else { SB = DAG.getConstant(0x0000000080000000ULL, dl, MVT::v2i64); } Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB); Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB); // Cast everything to the right type. Op0 = DAG.getBitcast(MVT::v4i32, Op0); Op1 = DAG.getBitcast(MVT::v4i32, Op1); // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2)) SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1); SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1); // Create masks for only the low parts/high parts of the 64 bit integers. static const int MaskHi[] = { 1, 1, 3, 3 }; static const int MaskLo[] = { 0, 0, 2, 2 }; SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi); SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo); SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi); SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo); Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi); if (Invert) Result = DAG.getNOT(dl, Result, MVT::v4i32); return DAG.getBitcast(VT, Result); } if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) { // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with // pcmpeqd + pshufd + pand. assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!"); // First cast everything to the right type. Op0 = DAG.getBitcast(MVT::v4i32, Op0); Op1 = DAG.getBitcast(MVT::v4i32, Op1); // Do the compare. SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1); // Make sure the lower and upper halves are both all-ones. static const int Mask[] = { 1, 0, 3, 2 }; SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask); Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf); if (Invert) Result = DAG.getNOT(dl, Result, MVT::v4i32); return DAG.getBitcast(VT, Result); } } // Since SSE has no unsigned integer comparisons, we need to flip the sign // bits of the inputs before performing those operations. if (FlipSigns) { MVT EltVT = VT.getVectorElementType(); SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl, VT); Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM); Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM); } SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); // If the logical-not of the result is required, perform that now. if (Invert) Result = DAG.getNOT(dl, Result, VT); return Result; } // Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible. static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue &X86CC) { // Only support equality comparisons. if (CC != ISD::SETEQ && CC != ISD::SETNE) return SDValue(); // Must be a bitcast from vXi1. if (Op0.getOpcode() != ISD::BITCAST) return SDValue(); Op0 = Op0.getOperand(0); MVT VT = Op0.getSimpleValueType(); if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) && !(Subtarget.hasDQI() && VT == MVT::v8i1) && !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))) return SDValue(); X86::CondCode X86Cond; if (isNullConstant(Op1)) { X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE; } else if (isAllOnesConstant(Op1)) { // C flag is set for all ones. X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE; } else return SDValue(); // If the input is an AND, we can combine it's operands into the KTEST. bool KTestable = false; if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1)) KTestable = true; if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)) KTestable = true; if (!isNullConstant(Op1)) KTestable = false; if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) { SDValue LHS = Op0.getOperand(0); SDValue RHS = Op0.getOperand(1); X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8); return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS); } // If the input is an OR, we can combine it's operands into the KORTEST. SDValue LHS = Op0; SDValue RHS = Op0; if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) { LHS = Op0.getOperand(0); RHS = Op0.getOperand(1); } X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8); return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS); } /// Emit flags for the given setcc condition and operands. Also returns the /// corresponding X86 condition code constant in X86CC. SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, SDValue &X86CC, SDValue &Chain, bool IsSignaling) const { // Optimize to BT if possible. // Lower (X & (1 << N)) == 0 to BT(X, N). // Lower ((X >>u N) & 1) != 0 to BT(X, N). // Lower ((X >>s N) & 1) != 0 to BT(X, N). if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CC)) return BT; } // Try to use PTEST for a tree ORs equality compared with 0. // TODO: We could do AND tree with all 1s as well by using the C flag. if (Op0.getOpcode() == ISD::OR && isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { if (SDValue PTEST = LowerVectorAllZeroTest(Op0, CC, Subtarget, DAG, X86CC)) return PTEST; } // Try to lower using KORTEST or KTEST. if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC)) return Test; // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of // these. if ((isOneConstant(Op1) || isNullConstant(Op1)) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // If the input is a setcc, then reuse the input setcc or use a new one with // the inverted condition. if (Op0.getOpcode() == X86ISD::SETCC) { bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1); X86CC = Op0.getOperand(0); if (Invert) { X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8); } return Op0.getOperand(1); } } // Try to use the carry flag from the add in place of an separate CMP for: // (seteq (add X, -1), -1). Similar for setne. if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD && Op0.getOperand(1) == Op1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) { if (isProfitableToUseFlagOp(Op0)) { SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32); SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0), Op0.getOperand(1)); DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New); X86::CondCode CCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8); return SDValue(New.getNode(), 1); } } bool IsFP = Op1.getSimpleValueType().isFloatingPoint(); X86::CondCode CondCode = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG); if (CondCode == X86::COND_INVALID) return SDValue(); std::pair Tmp = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget, Chain, IsSignaling); SDValue EFLAGS = Tmp.first; if (Chain) Chain = Tmp.second; EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8); return EFLAGS; } SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC || Op.getOpcode() == ISD::STRICT_FSETCCS; MVT VT = Op->getSimpleValueType(0); if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG); assert(VT == MVT::i8 && "SetCC type must be 8-bit integer"); SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0); SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1); SDLoc dl(Op); ISD::CondCode CC = cast(Op.getOperand(IsStrict ? 3 : 2))->get(); // Handle f128 first, since one possible outcome is a normal integer // comparison which gets handled by emitFlagsForSetcc. if (Op0.getValueType() == MVT::f128) { softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain, Op.getOpcode() == ISD::STRICT_FSETCCS); // If softenSetCCOperands returned a scalar, use it. if (!Op1.getNode()) { assert(Op0.getValueType() == Op.getValueType() && "Unexpected setcc expansion!"); if (IsStrict) return DAG.getMergeValues({Op0, Chain}, dl); return Op0; } } SDValue X86CC; SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC, Chain, Op.getOpcode() == ISD::STRICT_FSETCCS); if (!EFLAGS) return SDValue(); SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS); if (IsStrict) return DAG.getMergeValues({Res, Chain}, dl); return Res; } SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const { SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); SDValue Carry = Op.getOperand(2); SDValue Cond = Op.getOperand(3); SDLoc DL(Op); assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only."); X86::CondCode CC = TranslateIntegerX86CC(cast(Cond)->get()); // Recreate the carry if needed. EVT CarryVT = Carry.getValueType(); APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits()); Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), Carry, DAG.getConstant(NegOne, DL, CarryVT)); SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1)); return getSETCC(CC, Cmp.getValue(1), DL, DAG); } // This function returns three things: the arithmetic computation itself // (Value), an EFLAGS result (Overflow), and a condition code (Cond). The // flag and the condition code define the case in which the arithmetic // computation overflows. static std::pair getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) { assert(Op.getResNo() == 0 && "Unexpected result number!"); SDValue Value, Overflow; SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); unsigned BaseOp = 0; SDLoc DL(Op); switch (Op.getOpcode()) { default: llvm_unreachable("Unknown ovf instruction!"); case ISD::SADDO: BaseOp = X86ISD::ADD; Cond = X86::COND_O; break; case ISD::UADDO: BaseOp = X86ISD::ADD; Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B; break; case ISD::SSUBO: BaseOp = X86ISD::SUB; Cond = X86::COND_O; break; case ISD::USUBO: BaseOp = X86ISD::SUB; Cond = X86::COND_B; break; case ISD::SMULO: BaseOp = X86ISD::SMUL; Cond = X86::COND_O; break; case ISD::UMULO: BaseOp = X86ISD::UMUL; Cond = X86::COND_O; break; } if (BaseOp) { // Also sets EFLAGS. SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); Overflow = Value.getValue(1); } return std::make_pair(Value, Overflow); } static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { // Lower the "add/sub/mul with overflow" instruction into a regular ins plus // a "setcc" instruction that checks the overflow flag. The "brcond" lowering // looks for this combo and may remove the "setcc" instruction if the "setcc" // has only one use. SDLoc DL(Op); X86::CondCode Cond; SDValue Value, Overflow; std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG); SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG); assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!"); return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC); } /// Return true if opcode is a X86 logical comparison. static bool isX86LogicalCmp(SDValue Op) { unsigned Opc = Op.getOpcode(); if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI || Opc == X86ISD::SAHF) return true; if (Op.getResNo() == 1 && (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC || Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL || Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND)) return true; return false; } static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { if (V.getOpcode() != ISD::TRUNCATE) return false; SDValue VOp0 = V.getOperand(0); unsigned InBits = VOp0.getValueSizeInBits(); unsigned Bits = V.getValueSizeInBits(); return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)); } SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { bool AddTest = true; SDValue Cond = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue Op2 = Op.getOperand(2); SDLoc DL(Op); MVT VT = Op1.getSimpleValueType(); SDValue CC; // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops // are available or VBLENDV if AVX is available. // Otherwise FP cmovs get lowered into a less efficient branch sequence later. if (Cond.getOpcode() == ISD::SETCC && ((Subtarget.hasSSE2() && VT == MVT::f64) || (Subtarget.hasSSE1() && VT == MVT::f32)) && VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) { SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1); bool IsAlwaysSignaling; unsigned SSECC = translateX86FSETCC(cast(Cond.getOperand(2))->get(), CondOp0, CondOp1, IsAlwaysSignaling); if (Subtarget.hasAVX512()) { SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1, DAG.getTargetConstant(SSECC, DL, MVT::i8)); assert(!VT.isVector() && "Not a scalar type?"); return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2); } if (SSECC < 8 || Subtarget.hasAVX()) { SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1, DAG.getTargetConstant(SSECC, DL, MVT::i8)); // If we have AVX, we can use a variable vector select (VBLENDV) instead // of 3 logic instructions for size savings and potentially speed. // Unfortunately, there is no scalar form of VBLENDV. // If either operand is a +0.0 constant, don't try this. We can expect to // optimize away at least one of the logic instructions later in that // case, so that sequence would be faster than a variable blend. // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly // uses XMM0 as the selection register. That may need just as many // instructions as the AND/ANDN/OR sequence due to register moves, so // don't bother. if (Subtarget.hasAVX() && !isNullFPConstant(Op1) && !isNullFPConstant(Op2)) { // Convert to vectors, do a VSELECT, and convert back to scalar. // All of the conversions should be optimized away. MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64; SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1); SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2); SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp); MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64; VCmp = DAG.getBitcast(VCmpVT, VCmp); SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VSel, DAG.getIntPtrConstant(0, DL)); } SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2); SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1); return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And); } } // AVX512 fallback is to lower selects of scalar floats to masked moves. if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) { SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond); return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2); } // For v64i1 without 64-bit support we need to split and rejoin. if (VT == MVT::v64i1 && !Subtarget.is64Bit()) { assert(Subtarget.hasBWI() && "Expected BWI to be legal"); SDValue Op1Lo = extractSubVector(Op1, 0, DAG, DL, 32); SDValue Op2Lo = extractSubVector(Op2, 0, DAG, DL, 32); SDValue Op1Hi = extractSubVector(Op1, 32, DAG, DL, 32); SDValue Op2Hi = extractSubVector(Op2, 32, DAG, DL, 32); SDValue Lo = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Lo, Op2Lo); SDValue Hi = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Hi, Op2Hi); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); } if (VT.isVector() && VT.getVectorElementType() == MVT::i1) { SDValue Op1Scalar; if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode())) Op1Scalar = ConvertI1VectorToInteger(Op1, DAG); else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0)) Op1Scalar = Op1.getOperand(0); SDValue Op2Scalar; if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode())) Op2Scalar = ConvertI1VectorToInteger(Op2, DAG); else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0)) Op2Scalar = Op2.getOperand(0); if (Op1Scalar.getNode() && Op2Scalar.getNode()) { SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond, Op1Scalar, Op2Scalar); if (newSelect.getValueSizeInBits() == VT.getSizeInBits()) return DAG.getBitcast(VT, newSelect); SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec, DAG.getIntPtrConstant(0, DL)); } } if (Cond.getOpcode() == ISD::SETCC) { if (SDValue NewCond = LowerSETCC(Cond, DAG)) { Cond = NewCond; // If the condition was updated, it's possible that the operands of the // select were also updated (for example, EmitTest has a RAUW). Refresh // the local references to the select operands in case they got stale. Op1 = Op.getOperand(1); Op2 = Op.getOperand(2); } } // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y if (Cond.getOpcode() == X86ISD::SETCC && Cond.getOperand(1).getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1).getOperand(1))) { SDValue Cmp = Cond.getOperand(1); unsigned CondCode = Cond.getConstantOperandVal(0); if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2; SDValue CmpOp0 = Cmp.getOperand(0); // Apply further optimizations for special cases // (select (x != 0), -1, 0) -> neg & sbb // (select (x == 0), 0, -1) -> neg & sbb if (isNullConstant(Y) && (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) { SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType()); SDValue CmpZero = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Zero, CmpOp0); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); Zero = DAG.getConstant(0, DL, Op.getValueType()); return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, CmpZero); } Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType())); Cmp = ConvertCmpIfNecessary(Cmp, DAG); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); SDValue Zero = DAG.getConstant(0, DL, Op.getValueType()); SDValue Res = // Res = 0 or -1. DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp); if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E)) Res = DAG.getNOT(DL, Res, Res.getValueType()); if (!isNullConstant(Op2)) Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); return Res; } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E && Cmp.getOperand(0).getOpcode() == ISD::AND && isOneConstant(Cmp.getOperand(0).getOperand(1))) { SDValue CmpOp0 = Cmp.getOperand(0); SDValue Src1, Src2; // true if Op2 is XOR or OR operator and one of its operands // is equal to Op1 // ( a , a op b) || ( b , a op b) auto isOrXorPattern = [&]() { if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) && (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) { Src1 = Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0); Src2 = Op1; return true; } return false; }; if (isOrXorPattern()) { SDValue Neg; unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits(); // we need mask of all zeros or ones with same size of the other // operands. if (CmpSz > VT.getSizeInBits()) Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0); else if (CmpSz < VT.getSizeInBits()) Neg = DAG.getNode(ISD::AND, DL, VT, DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)), DAG.getConstant(1, DL, VT)); else Neg = CmpOp0; SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Neg); // -(and (x, 0x1)) SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y } } } // Look past (and (setcc_carry (cmp ...)), 1). if (Cond.getOpcode() == ISD::AND && Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && isOneConstant(Cond.getOperand(1))) Cond = Cond.getOperand(0); // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. unsigned CondOpcode = Cond.getOpcode(); if (CondOpcode == X86ISD::SETCC || CondOpcode == X86ISD::SETCC_CARRY) { CC = Cond.getOperand(0); SDValue Cmp = Cond.getOperand(1); bool IllegalFPCMov = false; if (VT.isFloatingPoint() && !VT.isVector() && !isScalarFPTypeInSSEReg(VT)) // FPStack? IllegalFPCMov = !hasFPCMov(cast(CC)->getSExtValue()); if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || Cmp.getOpcode() == X86ISD::BT) { // FIXME Cond = Cmp; AddTest = false; } } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) { SDValue Value; X86::CondCode X86Cond; std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG); CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8); AddTest = false; } if (AddTest) { // Look past the truncate if the high bits are known zero. if (isTruncWithZeroHighBitsInput(Cond, DAG)) Cond = Cond.getOperand(0); // We know the result of AND is compared against zero. Try to match // it to BT. if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { SDValue BTCC; if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, BTCC)) { CC = BTCC; Cond = BT; AddTest = false; } } } if (AddTest) { CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8); Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget); } // a < b ? -1 : 0 -> RES = ~setcc_carry // a < b ? 0 : -1 -> RES = setcc_carry // a >= b ? -1 : 0 -> RES = setcc_carry // a >= b ? 0 : -1 -> RES = ~setcc_carry if (Cond.getOpcode() == X86ISD::SUB) { Cond = ConvertCmpIfNecessary(Cond, DAG); unsigned CondCode = cast(CC)->getZExtValue(); if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (isNullConstant(Op1) || isNullConstant(Op2))) { SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond); if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B)) return DAG.getNOT(DL, Res, Res.getValueType()); return Res; } } // X86 doesn't have an i8 cmov. If both operands are the result of a truncate // widen the cmov and push the truncate through. This avoids introducing a new // branch during isel and doesn't add any extensions. if (Op.getValueType() == MVT::i8 && Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) { SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0); if (T1.getValueType() == T2.getValueType() && // Blacklist CopyFromReg to avoid partial register stalls. T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){ SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1, CC, Cond); return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); } } // Or finally, promote i8 cmovs if we have CMOV, // or i16 cmovs if it won't prevent folding a load. // FIXME: we should not limit promotion of i8 case to only when the CMOV is // legal, but EmitLoweredSelect() can not deal with these extensions // being inserted between two CMOV's. (in i16 case too TBN) // https://bugs.llvm.org/show_bug.cgi?id=40974 if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) || (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) && !MayFoldLoad(Op2))) { Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1); Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2); SDValue Ops[] = { Op2, Op1, CC, Cond }; SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops); return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); } // X86ISD::CMOV means set the result (which is operand 1) to the RHS if // condition is true. SDValue Ops[] = { Op2, Op1, CC, Cond }; return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops); } static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!"); MVT VTElt = VT.getVectorElementType(); SDLoc dl(Op); unsigned NumElts = VT.getVectorNumElements(); // Extend VT if the scalar type is i8/i16 and BWI is not supported. MVT ExtVT = VT; if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) { // If v16i32 is to be avoided, we'll need to split and concatenate. if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG); ExtVT = MVT::getVectorVT(MVT::i32, NumElts); } // Widen to 512-bits if VLX is not supported. MVT WideVT = ExtVT; if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) { NumElts *= 512 / ExtVT.getSizeInBits(); InVT = MVT::getVectorVT(MVT::i1, NumElts); In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT), In, DAG.getIntPtrConstant(0, dl)); WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts); } SDValue V; MVT WideEltVT = WideVT.getVectorElementType(); if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) || (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) { V = DAG.getNode(Op.getOpcode(), dl, WideVT, In); } else { SDValue NegOne = DAG.getConstant(-1, dl, WideVT); SDValue Zero = DAG.getConstant(0, dl, WideVT); V = DAG.getSelect(dl, WideVT, In, NegOne, Zero); } // Truncate if we had to extend i16/i8 above. if (VT != ExtVT) { WideVT = MVT::getVectorVT(VTElt, NumElts); V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V); } // Extract back to 128/256-bit if we widened. if (WideVT != VT) V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V, DAG.getIntPtrConstant(0, dl)); return V; } static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); if (InVT.getVectorElementType() == MVT::i1) return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG); assert(Subtarget.hasAVX() && "Expected AVX support"); return LowerAVXExtend(Op, DAG, Subtarget); } // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG. // For sign extend this needs to handle all vector sizes and SSE4.1 and // non-SSE4.1 targets. For zero extend this should only handle inputs of // MVT::v64i8 when BWI is not supported, but AVX512 is. static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue In = Op->getOperand(0); MVT VT = Op->getSimpleValueType(0); MVT InVT = In.getSimpleValueType(); MVT SVT = VT.getVectorElementType(); MVT InSVT = InVT.getVectorElementType(); assert(SVT.getSizeInBits() > InSVT.getSizeInBits()); if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16) return SDValue(); if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8) return SDValue(); if (!(VT.is128BitVector() && Subtarget.hasSSE2()) && !(VT.is256BitVector() && Subtarget.hasAVX()) && !(VT.is512BitVector() && Subtarget.hasAVX512())) return SDValue(); SDLoc dl(Op); unsigned Opc = Op.getOpcode(); unsigned NumElts = VT.getVectorNumElements(); // For 256-bit vectors, we only need the lower (128-bit) half of the input. // For 512-bit vectors, we need 128-bits or 256-bits. if (InVT.getSizeInBits() > 128) { // Input needs to be at least the same number of elements as output, and // at least 128-bits. int InSize = InSVT.getSizeInBits() * NumElts; In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128)); InVT = In.getSimpleValueType(); } // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results, // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still // need to be handled here for 256/512-bit results. if (Subtarget.hasInt256()) { assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension"); if (InVT.getVectorNumElements() != NumElts) return DAG.getNode(Op.getOpcode(), dl, VT, In); // FIXME: Apparently we create inreg operations that could be regular // extends. unsigned ExtOpc = Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; return DAG.getNode(ExtOpc, dl, VT, In); } // pre-AVX2 256-bit extensions need to be split into 128-bit instructions. if (Subtarget.hasAVX()) { assert(VT.is256BitVector() && "256-bit vector expected"); MVT HalfVT = VT.getHalfNumVectorElementsVT(); int HalfNumElts = HalfVT.getVectorNumElements(); unsigned NumSrcElts = InVT.getVectorNumElements(); SmallVector HiMask(NumSrcElts, SM_SentinelUndef); for (int i = 0; i != HalfNumElts; ++i) HiMask[i] = HalfNumElts + i; SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In); SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask); Hi = DAG.getNode(Opc, dl, HalfVT, Hi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); } // We should only get here for sign extend. assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!"); assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs"); // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI. SDValue Curr = In; SDValue SignExt = Curr; // As SRAI is only available on i16/i32 types, we expand only up to i32 // and handle i64 separately. if (InVT != MVT::v4i32) { MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT; unsigned DestWidth = DestVT.getScalarSizeInBits(); unsigned Scale = DestWidth / InSVT.getSizeInBits(); unsigned InNumElts = InVT.getVectorNumElements(); unsigned DestElts = DestVT.getVectorNumElements(); // Build a shuffle mask that takes each input element and places it in the // MSBs of the new element size. SmallVector Mask(InNumElts, SM_SentinelUndef); for (unsigned i = 0; i != DestElts; ++i) Mask[i * Scale + (Scale - 1)] = i; Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask); Curr = DAG.getBitcast(DestVT, Curr); unsigned SignExtShift = DestWidth - InSVT.getSizeInBits(); SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr, DAG.getTargetConstant(SignExtShift, dl, MVT::i8)); } if (VT == MVT::v2i64) { assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT"); SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32); SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT); SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5}); SignExt = DAG.getBitcast(VT, SignExt); } return SignExt; } static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); SDLoc dl(Op); if (InVT.getVectorElementType() == MVT::i1) return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG); assert(VT.isVector() && InVT.isVector() && "Expected vector type"); assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && "Expected same number of elements"); assert((VT.getVectorElementType() == MVT::i16 || VT.getVectorElementType() == MVT::i32 || VT.getVectorElementType() == MVT::i64) && "Unexpected element type"); assert((InVT.getVectorElementType() == MVT::i8 || InVT.getVectorElementType() == MVT::i16 || InVT.getVectorElementType() == MVT::i32) && "Unexpected element type"); // Custom legalize v8i8->v8i64 on CPUs without avx512bw. if (InVT == MVT::v8i8) { if (VT != MVT::v8i64) return SDValue(); In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8)); return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, VT, In); } if (Subtarget.hasInt256()) return Op; // Optimize vectors in AVX mode // Sign extend v8i16 to v8i32 and // v4i32 to v4i64 // // Divide input vector into two parts // for v4i32 the high shuffle mask will be {2, 3, -1, -1} // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 // concat the vectors to original VT MVT HalfVT = VT.getHalfNumVectorElementsVT(); SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In); unsigned NumElems = InVT.getVectorNumElements(); SmallVector ShufMask(NumElems, -1); for (unsigned i = 0; i != NumElems/2; ++i) ShufMask[i] = i + NumElems/2; SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask); OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } /// Change a vector store into a pair of half-size vector stores. static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) { SDValue StoredVal = Store->getValue(); assert((StoredVal.getValueType().is256BitVector() || StoredVal.getValueType().is512BitVector()) && "Expecting 256/512-bit op"); // Splitting volatile memory ops is not allowed unless the operation was not // legal to begin with. Assume the input store is legal (this transform is // only used for targets with AVX). Note: It is possible that we have an // illegal type like v2i128, and so we could allow splitting a volatile store // in that case if that is important. if (!Store->isSimple()) return SDValue(); EVT StoreVT = StoredVal.getValueType(); unsigned NumElems = StoreVT.getVectorNumElements(); unsigned HalfSize = StoredVal.getValueSizeInBits() / 2; unsigned HalfAlign = (128 == HalfSize ? 16 : 32); SDLoc DL(Store); SDValue Value0 = extractSubVector(StoredVal, 0, DAG, DL, HalfSize); SDValue Value1 = extractSubVector(StoredVal, NumElems / 2, DAG, DL, HalfSize); SDValue Ptr0 = Store->getBasePtr(); SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, HalfAlign, DL); unsigned Alignment = Store->getAlignment(); SDValue Ch0 = DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(), Alignment, Store->getMemOperand()->getFlags()); SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1, Store->getPointerInfo().getWithOffset(HalfAlign), MinAlign(Alignment, HalfAlign), Store->getMemOperand()->getFlags()); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1); } /// Scalarize a vector store, bitcasting to TargetVT to determine the scalar /// type. static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, SelectionDAG &DAG) { SDValue StoredVal = Store->getValue(); assert(StoreVT.is128BitVector() && StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op"); StoredVal = DAG.getBitcast(StoreVT, StoredVal); // Splitting volatile memory ops is not allowed unless the operation was not // legal to begin with. We are assuming the input op is legal (this transform // is only used for targets with AVX). if (!Store->isSimple()) return SDValue(); MVT StoreSVT = StoreVT.getScalarType(); unsigned NumElems = StoreVT.getVectorNumElements(); unsigned ScalarSize = StoreSVT.getStoreSize(); unsigned Alignment = Store->getAlignment(); SDLoc DL(Store); SmallVector Stores; for (unsigned i = 0; i != NumElems; ++i) { unsigned Offset = i * ScalarSize; SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(), Offset, DL); SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal, DAG.getIntPtrConstant(i, DL)); SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr, Store->getPointerInfo().getWithOffset(Offset), MinAlign(Alignment, Offset), Store->getMemOperand()->getFlags()); Stores.push_back(Ch); } return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); } static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { StoreSDNode *St = cast(Op.getNode()); SDLoc dl(St); SDValue StoredVal = St->getValue(); // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores. if (StoredVal.getValueType().isVector() && StoredVal.getValueType().getVectorElementType() == MVT::i1) { assert(StoredVal.getValueType().getVectorNumElements() <= 8 && "Unexpected VT"); assert(!St->isTruncatingStore() && "Expected non-truncating store"); assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() && "Expected AVX512F without AVX512DQI"); StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1, DAG.getUNDEF(MVT::v16i1), StoredVal, DAG.getIntPtrConstant(0, dl)); StoredVal = DAG.getBitcast(MVT::i16, StoredVal); StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); } if (St->isTruncatingStore()) return SDValue(); // If this is a 256-bit store of concatenated ops, we are better off splitting // that store into two 128-bit stores. This avoids spurious use of 256-bit ops // and each half can execute independently. Some cores would split the op into // halves anyway, so the concat (vinsertf128) is purely an extra op. MVT StoreVT = StoredVal.getSimpleValueType(); if (StoreVT.is256BitVector()) { SmallVector CatOps; if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps)) return splitVectorStore(St, DAG); return SDValue(); } const TargetLowering &TLI = DAG.getTargetLoweringInfo(); assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 && "Unexpected VT"); assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) == TargetLowering::TypeWidenVector && "Unexpected type action!"); EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT); StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal, DAG.getUNDEF(StoreVT)); if (Subtarget.hasSSE2()) { // Widen the vector, cast to a v2x64 type, extract the single 64-bit element // and store it. MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64; MVT CastVT = MVT::getVectorVT(StVT, 2); StoredVal = DAG.getBitcast(CastVT, StoredVal); StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal, DAG.getIntPtrConstant(0, dl)); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); } assert(Subtarget.hasSSE1() && "Expected SSE"); SDVTList Tys = DAG.getVTList(MVT::Other); SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()}; return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64, St->getMemOperand()); } // Lower vector extended loads using a shuffle. If SSSE3 is not available we // may emit an illegal shuffle but the expansion is still better than scalar // code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise // we'll emit a shuffle and a arithmetic shift. // FIXME: Is the expansion actually better than scalar code? It doesn't seem so. // TODO: It is possible to support ZExt by zeroing the undef values during // the shuffle phase or after the shuffle. static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT RegVT = Op.getSimpleValueType(); assert(RegVT.isVector() && "We only custom lower vector loads."); assert(RegVT.isInteger() && "We only custom lower integer vector loads."); LoadSDNode *Ld = cast(Op.getNode()); SDLoc dl(Ld); // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads. if (RegVT.getVectorElementType() == MVT::i1) { assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load"); assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT"); assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() && "Expected AVX512F without AVX512DQI"); SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->getAlignment(), Ld->getMemOperand()->getFlags()); // Replace chain users with the new chain. assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!"); SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd); Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT, DAG.getBitcast(MVT::v16i1, Val), DAG.getIntPtrConstant(0, dl)); return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl); } return SDValue(); } /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes /// each of which has no other use apart from the AND / OR. static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { Opc = Op.getOpcode(); if (Opc != ISD::OR && Opc != ISD::AND) return false; return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && Op.getOperand(0).hasOneUse() && Op.getOperand(1).getOpcode() == X86ISD::SETCC && Op.getOperand(1).hasOneUse()); } /// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the /// SETCC node has a single use. static bool isXor1OfSetCC(SDValue Op) { if (Op.getOpcode() != ISD::XOR) return false; if (isOneConstant(Op.getOperand(1))) return Op.getOperand(0).getOpcode() == X86ISD::SETCC && Op.getOperand(0).hasOneUse(); return false; } SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { bool addTest = true; SDValue Chain = Op.getOperand(0); SDValue Cond = Op.getOperand(1); SDValue Dest = Op.getOperand(2); SDLoc dl(Op); SDValue CC; bool Inverted = false; if (Cond.getOpcode() == ISD::SETCC) { // Check for setcc([su]{add,sub,mul}o == 0). if (cast(Cond.getOperand(2))->get() == ISD::SETEQ && isNullConstant(Cond.getOperand(1)) && Cond.getOperand(0).getResNo() == 1 && (Cond.getOperand(0).getOpcode() == ISD::SADDO || Cond.getOperand(0).getOpcode() == ISD::UADDO || Cond.getOperand(0).getOpcode() == ISD::SSUBO || Cond.getOperand(0).getOpcode() == ISD::USUBO || Cond.getOperand(0).getOpcode() == ISD::SMULO || Cond.getOperand(0).getOpcode() == ISD::UMULO)) { Inverted = true; Cond = Cond.getOperand(0); } else { if (SDValue NewCond = LowerSETCC(Cond, DAG)) Cond = NewCond; } } #if 0 // FIXME: LowerXALUO doesn't handle these!! else if (Cond.getOpcode() == X86ISD::ADD || Cond.getOpcode() == X86ISD::SUB || Cond.getOpcode() == X86ISD::SMUL || Cond.getOpcode() == X86ISD::UMUL) Cond = LowerXALUO(Cond, DAG); #endif // Look pass (and (setcc_carry (cmp ...)), 1). if (Cond.getOpcode() == ISD::AND && Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && isOneConstant(Cond.getOperand(1))) Cond = Cond.getOperand(0); // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. unsigned CondOpcode = Cond.getOpcode(); if (CondOpcode == X86ISD::SETCC || CondOpcode == X86ISD::SETCC_CARRY) { CC = Cond.getOperand(0); SDValue Cmp = Cond.getOperand(1); unsigned Opc = Cmp.getOpcode(); // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { Cond = Cmp; addTest = false; } else { switch (cast(CC)->getZExtValue()) { default: break; case X86::COND_O: case X86::COND_B: // These can only come from an arithmetic instruction with overflow, // e.g. SADDO, UADDO. Cond = Cond.getOperand(1); addTest = false; break; } } } CondOpcode = Cond.getOpcode(); if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) { SDValue Value; X86::CondCode X86Cond; std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG); if (Inverted) X86Cond = X86::GetOppositeBranchCondition(X86Cond); CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8); addTest = false; } else { unsigned CondOpc; if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { SDValue Cmp = Cond.getOperand(0).getOperand(1); if (CondOpc == ISD::OR) { // Also, recognize the pattern generated by an FCMP_UNE. We can emit // two branches instead of an explicit OR instruction with a // separate test. if (Cmp == Cond.getOperand(1).getOperand(1) && isX86LogicalCmp(Cmp)) { CC = Cond.getOperand(0).getOperand(0); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); CC = Cond.getOperand(1).getOperand(0); Cond = Cmp; addTest = false; } } else { // ISD::AND // Also, recognize the pattern generated by an FCMP_OEQ. We can emit // two branches instead of an explicit AND instruction with a // separate test. However, we only do this if this block doesn't // have a fall-through edge, because this requires an explicit // jmp when the condition is false. if (Cmp == Cond.getOperand(1).getOperand(1) && isX86LogicalCmp(Cmp) && Op.getNode()->hasOneUse()) { X86::CondCode CCode0 = (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); CCode0 = X86::GetOppositeBranchCondition(CCode0); CC = DAG.getTargetConstant(CCode0, dl, MVT::i8); SDNode *User = *Op.getNode()->use_begin(); // Look for an unconditional branch following this conditional branch. // We need this because we need to reverse the successors in order // to implement FCMP_OEQ. if (User->getOpcode() == ISD::BR) { SDValue FalseBB = User->getOperand(1); SDNode *NewBR = DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); assert(NewBR == User); (void)NewBR; Dest = FalseBB; Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); X86::CondCode CCode1 = (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); CCode1 = X86::GetOppositeBranchCondition(CCode1); CC = DAG.getTargetConstant(CCode1, dl, MVT::i8); Cond = Cmp; addTest = false; } } } } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. // It should be transformed during dag combiner except when the condition // is set by a arithmetics with overflow node. X86::CondCode CCode = (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); CC = DAG.getTargetConstant(CCode, dl, MVT::i8); Cond = Cond.getOperand(0).getOperand(1); addTest = false; } else if (Cond.getOpcode() == ISD::SETCC && cast(Cond.getOperand(2))->get() == ISD::SETOEQ) { // For FCMP_OEQ, we can emit // two branches instead of an explicit AND instruction with a // separate test. However, we only do this if this block doesn't // have a fall-through edge, because this requires an explicit // jmp when the condition is false. if (Op.getNode()->hasOneUse()) { SDNode *User = *Op.getNode()->use_begin(); // Look for an unconditional branch following this conditional branch. // We need this because we need to reverse the successors in order // to implement FCMP_OEQ. if (User->getOpcode() == ISD::BR) { SDValue FalseBB = User->getOperand(1); SDNode *NewBR = DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); assert(NewBR == User); (void)NewBR; Dest = FalseBB; SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, Cond.getOperand(0), Cond.getOperand(1)); Cmp = ConvertCmpIfNecessary(Cmp, DAG); CC = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); CC = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8); Cond = Cmp; addTest = false; } } } else if (Cond.getOpcode() == ISD::SETCC && cast(Cond.getOperand(2))->get() == ISD::SETUNE) { // For FCMP_UNE, we can emit // two branches instead of an explicit OR instruction with a // separate test. SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, Cond.getOperand(0), Cond.getOperand(1)); Cmp = ConvertCmpIfNecessary(Cmp, DAG); CC = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); CC = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8); Cond = Cmp; addTest = false; } } if (addTest) { // Look pass the truncate if the high bits are known zero. if (isTruncWithZeroHighBitsInput(Cond, DAG)) Cond = Cond.getOperand(0); // We know the result of AND is compared against zero. Try to match // it to BT. if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { SDValue BTCC; if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, dl, DAG, BTCC)) { CC = BTCC; Cond = BT; addTest = false; } } } if (addTest) { X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE; CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8); Cond = EmitTest(Cond, X86Cond, dl, DAG, Subtarget); } Cond = ConvertCmpIfNecessary(Cond, DAG); return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cond); } // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. // Calls to _alloca are needed to probe the stack when allocating more than 4k // bytes in one go. Touching the stack at 4K increments is necessary to ensure // that the guard pages used by the OS virtual memory manager are allocated in // correct sequence. SDValue X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool SplitStack = MF.shouldSplitStack(); bool EmitStackProbe = !getStackProbeSymbolName(MF).empty(); bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) || SplitStack || EmitStackProbe; SDLoc dl(Op); // Get the inputs. SDNode *Node = Op.getNode(); SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); MaybeAlign Alignment(Op.getConstantOperandVal(2)); EVT VT = Node->getValueType(0); // Chain the dynamic stack allocation so that it doesn't modify the stack // pointer when other instructions are using the stack. Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); bool Is64Bit = Subtarget.is64Bit(); MVT SPTy = getPointerTy(DAG.getDataLayout()); SDValue Result; if (!Lower) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore(); assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and" " not tell us which reg is the stack pointer!"); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); Chain = SP.getValue(1); const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); const Align StackAlign(TFI.getStackAlignment()); Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value if (Alignment && Alignment > StackAlign) Result = DAG.getNode(ISD::AND, dl, VT, Result, DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain } else if (SplitStack) { MachineRegisterInfo &MRI = MF.getRegInfo(); if (Is64Bit) { // The 64 bit implementation of segmented stacks needs to clobber both r10 // r11. This makes it impossible to use it along with nested parameters. const Function &F = MF.getFunction(); for (const auto &A : F.args()) { if (A.hasNestAttr()) report_fatal_error("Cannot use segmented stacks with functions that " "have nested arguments."); } } const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); Register Vreg = MRI.createVirtualRegister(AddrRegClass); Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, DAG.getRegister(Vreg, SPTy)); } else { SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size); MF.getInfo()->setHasWinAlloca(true); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); Register SPReg = RegInfo->getStackRegister(); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); Chain = SP.getValue(1); if (Alignment) { SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP); } Result = SP; } Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), DAG.getIntPtrConstant(0, dl, true), SDValue(), dl); SDValue Ops[2] = {Result, Chain}; return DAG.getMergeValues(Ops, dl); } SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); auto PtrVT = getPointerTy(MF.getDataLayout()); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); const Value *SV = cast(Op.getOperand(2))->getValue(); SDLoc DL(Op); if (!Subtarget.is64Bit() || Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) { // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), MachinePointerInfo(SV)); } // __va_list_tag: // gp_offset (0 - 6 * 8) // fp_offset (48 - 48 + 8 * 16) // overflow_arg_area (point to parameters coming in memory). // reg_save_area SmallVector MemOps; SDValue FIN = Op.getOperand(1); // Store gp_offset SDValue Store = DAG.getStore( Op.getOperand(0), DL, DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN, MachinePointerInfo(SV)); MemOps.push_back(Store); // Store fp_offset FIN = DAG.getMemBasePlusOffset(FIN, 4, DL); Store = DAG.getStore( Op.getOperand(0), DL, DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN, MachinePointerInfo(SV, 4)); MemOps.push_back(Store); // Store ptr to overflow_arg_area FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL)); SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8)); MemOps.push_back(Store); // Store ptr to reg_save_area. FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant( Subtarget.isTarget64BitLP64() ? 8 : 4, DL)); SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT); Store = DAG.getStore( Op.getOperand(0), DL, RSFIN, FIN, MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12)); MemOps.push_back(Store); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget.is64Bit() && "LowerVAARG only handles 64-bit va_arg!"); assert(Op.getNumOperands() == 4); MachineFunction &MF = DAG.getMachineFunction(); if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) // The Win64 ABI uses char* instead of a structure. return DAG.expandVAArg(Op.getNode()); SDValue Chain = Op.getOperand(0); SDValue SrcPtr = Op.getOperand(1); const Value *SV = cast(Op.getOperand(2))->getValue(); unsigned Align = Op.getConstantOperandVal(3); SDLoc dl(Op); EVT ArgVT = Op.getNode()->getValueType(0); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); uint8_t ArgMode; // Decide which area this value should be read from. // TODO: Implement the AMD64 ABI in its entirety. This simple // selection mechanism works only for the basic types. if (ArgVT == MVT::f80) { llvm_unreachable("va_arg for f80 not yet implemented"); } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { ArgMode = 2; // Argument passed in XMM register. Use fp_offset. } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. } else { llvm_unreachable("Unhandled argument type in LowerVAARG"); } if (ArgMode == 2) { // Sanity Check: Make sure using fp_offset makes sense. assert(!Subtarget.useSoftFloat() && !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) && Subtarget.hasSSE1()); } // Insert VAARG_64 node into the DAG // VAARG_64 returns two values: Variable Argument Address, Chain SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32), DAG.getConstant(ArgMode, dl, MVT::i8), DAG.getConstant(Align, dl, MVT::i32)}; SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other); SDValue VAARG = DAG.getMemIntrinsicNode( X86ISD::VAARG_64, dl, VTs, InstOps, MVT::i64, MachinePointerInfo(SV), /*Align=*/0, MachineMemOperand::MOLoad | MachineMemOperand::MOStore); Chain = VAARG.getValue(1); // Load the next argument and return it return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo()); } static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows, // where a va_list is still an i8*. assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!"); if (Subtarget.isCallingConvWin64( DAG.getMachineFunction().getFunction().getCallingConv())) // Probably a Win64 va_copy. return DAG.expandVACopy(Op.getNode()); SDValue Chain = Op.getOperand(0); SDValue DstPtr = Op.getOperand(1); SDValue SrcPtr = Op.getOperand(2); const Value *DstSV = cast(Op.getOperand(3))->getValue(); const Value *SrcSV = cast(Op.getOperand(4))->getValue(); SDLoc DL(Op); return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false, false, false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); } // Helper to get immediate/variable SSE shift opcode from other shift opcodes. static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) { switch (Opc) { case ISD::SHL: case X86ISD::VSHL: case X86ISD::VSHLI: return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI; case ISD::SRL: case X86ISD::VSRL: case X86ISD::VSRLI: return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI; case ISD::SRA: case X86ISD::VSRA: case X86ISD::VSRAI: return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI; } llvm_unreachable("Unknown target vector shift node"); } /// Handle vector element shifts where the shift amount is a constant. /// Takes immediate version of shift as input. static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, uint64_t ShiftAmt, SelectionDAG &DAG) { MVT ElementType = VT.getVectorElementType(); // Bitcast the source vector to the output type, this is mainly necessary for // vXi8/vXi64 shifts. if (VT != SrcOp.getSimpleValueType()) SrcOp = DAG.getBitcast(VT, SrcOp); // Fold this packed shift into its first operand if ShiftAmt is 0. if (ShiftAmt == 0) return SrcOp; // Check for ShiftAmt >= element width if (ShiftAmt >= ElementType.getSizeInBits()) { if (Opc == X86ISD::VSRAI) ShiftAmt = ElementType.getSizeInBits() - 1; else return DAG.getConstant(0, dl, VT); } assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) && "Unknown target vector shift-by-constant node"); // Fold this packed vector shift into a build vector if SrcOp is a // vector of Constants or UNDEFs. if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) { SmallVector Elts; unsigned NumElts = SrcOp->getNumOperands(); switch (Opc) { default: llvm_unreachable("Unknown opcode!"); case X86ISD::VSHLI: for (unsigned i = 0; i != NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); if (CurrentOp->isUndef()) { // Must produce 0s in the correct bits. Elts.push_back(DAG.getConstant(0, dl, ElementType)); continue; } auto *ND = cast(CurrentOp); const APInt &C = ND->getAPIntValue(); Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType)); } break; case X86ISD::VSRLI: for (unsigned i = 0; i != NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); if (CurrentOp->isUndef()) { // Must produce 0s in the correct bits. Elts.push_back(DAG.getConstant(0, dl, ElementType)); continue; } auto *ND = cast(CurrentOp); const APInt &C = ND->getAPIntValue(); Elts.push_back(DAG.getConstant(C.lshr(Shift